On Fri, 14 Jun 2024, Li, Pan2 wrote:

> Hi Richard,
> 
> Here is one PR related to this patch (by git bisect), details as below.
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115458
> 
> I am still trying to narrow down which change caused this failure or any 
> hints here?

It definitely looks like a latent issue being triggered.  Either in LRA
or in how the target presents itself.

Richard.

> Pan
> 
> -----Original Message-----
> From: Richard Biener <rguent...@suse.de> 
> Sent: Wednesday, May 15, 2024 5:39 PM
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] tree-optimization/114589 - remove profile based sink 
> heuristics
> 
> The following removes the profile based heuristic limiting sinking
> and instead uses post-dominators to avoid sinking to places that
> are executed under the same conditions as the earlier location which
> the profile based heuristic should have guaranteed as well.
> 
> To avoid regressing this moves the empty-latch check to cover all
> sink cases.
> 
> It also stream-lines the resulting select_best_block a bit but avoids
> adjusting heuristics more with this change.  gfortran.dg/streamio_9.f90
> starts execute failing with this on x86_64 with -m32 because the
> (float)i * 9.9999...e-7 compute is sunk across a STOP causing it
> to be no longer spilled and thus the compare failing due to excess
> precision.  The patch adds -ffloat-store to avoid this, following
> other similar testcases.
> 
> This change doesn't fix the testcase in the PR on itself.
> 
> Bootstrapped on x86_64-unknown-linux-gnu, re-testing in progress.
> 
>       PR tree-optimization/114589
>       * tree-ssa-sink.cc (select_best_block): Remove profile-based
>       heuristics.  Instead reject sink locations that sink
>         to post-dominators.  Move empty latch check here from
>       statement_sink_location.  Also consider early_bb for the
>       loop depth check.
>       (statement_sink_location): Remove superfluous check.  Remove
>       empty latch check.
>       (pass_sink_code::execute): Compute/release post-dominators.
> 
>       * gfortran.dg/streamio_9.f90: Use -ffloat-store to avoid
>       excess precision when not spilling.
> ---
>  gcc/testsuite/gfortran.dg/streamio_9.f90 |  1 +
>  gcc/tree-ssa-sink.cc                     | 62 ++++++++----------------
>  2 files changed, 20 insertions(+), 43 deletions(-)
> 
> diff --git a/gcc/testsuite/gfortran.dg/streamio_9.f90 
> b/gcc/testsuite/gfortran.dg/streamio_9.f90
> index b6bddb973f8..f29ded6ba54 100644
> --- a/gcc/testsuite/gfortran.dg/streamio_9.f90
> +++ b/gcc/testsuite/gfortran.dg/streamio_9.f90
> @@ -1,4 +1,5 @@
>  ! { dg-do run }
> +! { dg-options "-ffloat-store" }
>  ! PR29053 Stream IO test 9.
>  ! Contributed by Jerry DeLisle <jvdeli...@gcc.gnu.org>.
>  ! Test case derived from that given in PR by Steve Kargl.
> diff --git a/gcc/tree-ssa-sink.cc b/gcc/tree-ssa-sink.cc
> index 2f90acb7ef4..2188b7523c7 100644
> --- a/gcc/tree-ssa-sink.cc
> +++ b/gcc/tree-ssa-sink.cc
> @@ -178,15 +178,7 @@ nearest_common_dominator_of_uses (def_operand_p def_p, 
> bool *debug_stmts)
>  
>     We want the most control dependent block in the shallowest loop nest.
>  
> -   If the resulting block is in a shallower loop nest, then use it.  Else
> -   only use the resulting block if it has significantly lower execution
> -   frequency than EARLY_BB to avoid gratuitous statement movement.  We
> -   consider statements with VOPS more desirable to move.
> -
> -   This pass would obviously benefit from PDO as it utilizes block
> -   frequencies.  It would also benefit from recomputing frequencies
> -   if profile data is not available since frequencies often get out
> -   of sync with reality.  */
> +   If the resulting block is in a shallower loop nest, then use it.  */
>  
>  static basic_block
>  select_best_block (basic_block early_bb,
> @@ -195,18 +187,17 @@ select_best_block (basic_block early_bb,
>  {
>    basic_block best_bb = late_bb;
>    basic_block temp_bb = late_bb;
> -  int threshold;
>  
>    while (temp_bb != early_bb)
>      {
> +      /* Walk up the dominator tree, hopefully we'll find a shallower
> +      loop nest.  */
> +      temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
> +
>        /* If we've moved into a lower loop nest, then that becomes
>        our best block.  */
>        if (bb_loop_depth (temp_bb) < bb_loop_depth (best_bb))
>       best_bb = temp_bb;
> -
> -      /* Walk up the dominator tree, hopefully we'll find a shallower
> -      loop nest.  */
> -      temp_bb = get_immediate_dominator (CDI_DOMINATORS, temp_bb);
>      }
>  
>    /* Placing a statement before a setjmp-like function would be invalid
> @@ -221,6 +212,16 @@ select_best_block (basic_block early_bb,
>    if (bb_loop_depth (best_bb) < bb_loop_depth (early_bb))
>      return best_bb;
>  
> +  /* Do not move stmts to post-dominating places on the same loop depth.  */
> +  if (dominated_by_p (CDI_POST_DOMINATORS, early_bb, best_bb))
> +    return early_bb;
> +
> +  /* If the latch block is empty, don't make it non-empty by sinking
> +     something into it.  */
> +  if (best_bb == early_bb->loop_father->latch
> +      && empty_block_p (best_bb))
> +    return early_bb;
> +
>    /* Avoid turning an unconditional read into a conditional one when we
>       still might want to perform vectorization.  */
>    if (best_bb->loop_father == early_bb->loop_father
> @@ -233,28 +234,7 @@ select_best_block (basic_block early_bb,
>        && !dominated_by_p (CDI_DOMINATORS, best_bb->loop_father->latch, 
> best_bb))
>      return early_bb;
>  
> -  /* Get the sinking threshold.  If the statement to be moved has memory
> -     operands, then increase the threshold by 7% as those are even more
> -     profitable to avoid, clamping at 100%.  */
> -  threshold = param_sink_frequency_threshold;
> -  if (gimple_vuse (stmt) || gimple_vdef (stmt))
> -    {
> -      threshold += 7;
> -      if (threshold > 100)
> -     threshold = 100;
> -    }
> -
> -  /* If BEST_BB is at the same nesting level, then require it to have
> -     significantly lower execution frequency to avoid gratuitous movement.  
> */
> -  if (bb_loop_depth (best_bb) == bb_loop_depth (early_bb)
> -      /* If result of comparsion is unknown, prefer EARLY_BB.
> -      Thus use !(...>=..) rather than (...<...)  */
> -      && !(best_bb->count * 100 >= early_bb->count * threshold))
> -    return best_bb;
> -
> -  /* No better block found, so return EARLY_BB, which happens to be the
> -     statement's original block.  */
> -  return early_bb;
> +  return best_bb;
>  }
>  
>  /* Given a statement (STMT) and the basic block it is currently in (FROMBB),
> @@ -452,13 +432,7 @@ statement_sink_location (gimple *stmt, basic_block 
> frombb,
>      return false;
>    
>    sinkbb = select_best_block (frombb, sinkbb, stmt);
> -  if (!sinkbb || sinkbb == frombb)
> -    return false;
> -
> -  /* If the latch block is empty, don't make it non-empty by sinking
> -     something into it.  */
> -  if (sinkbb == frombb->loop_father->latch
> -      && empty_block_p (sinkbb))
> +  if (sinkbb == frombb)
>      return false;
>  
>    *togsi = gsi_after_labels (sinkbb);
> @@ -822,6 +796,7 @@ pass_sink_code::execute (function *fun)
>    mark_dfs_back_edges (fun);
>    memset (&sink_stats, 0, sizeof (sink_stats));
>    calculate_dominance_info (CDI_DOMINATORS);
> +  calculate_dominance_info (CDI_POST_DOMINATORS);
>  
>    virtual_operand_live vop_live;
>  
> @@ -833,6 +808,7 @@ pass_sink_code::execute (function *fun)
>  
>    statistics_counter_event (fun, "Sunk statements", sink_stats.sunk);
>    statistics_counter_event (fun, "Commoned stores", sink_stats.commoned);
> +  free_dominance_info (CDI_POST_DOMINATORS);
>    remove_fake_exit_edges ();
>    loop_optimizer_finalize ();
>  
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to