Hi,

On 2021/8/16 19:46, Richard Biener wrote:
On Mon, 16 Aug 2021, Xiong Hu Luo wrote:

It seems to me that ALWAYS_EXECUTED_IN is not computed correctly for
nested loops: inn_loop is updated to the inner loop, so it needs to be
restored when exiting from the innermost loop.  With this patch, the
store instruction in the outer loop can also be moved out of the outer
loop by store motion.  Any comments?  Thanks.

gcc/ChangeLog:

        * tree-ssa-loop-im.c (fill_always_executed_in_1): Restore
        inn_loop when exiting from innermost loop.

gcc/testsuite/ChangeLog:

        * gcc.dg/tree-ssa/ssa-lim-19.c: New test.
---
  gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c | 24 ++++++++++++++++++++++
  gcc/tree-ssa-loop-im.c                     |  6 +++++-
  2 files changed, 29 insertions(+), 1 deletion(-)
  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c
new file mode 100644
index 00000000000..097a5ee4a4b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-19.c
@@ -0,0 +1,24 @@
+/* PR/101293 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-lim2-details" } */
+
+struct X { int i; int j; int k;};
+
+void foo(struct X *x, int n, int l)
+{
+  for (int j = 0; j < l; j++)
+    {
+      for (int i = 0; i < n; ++i)
+       {
+         int *p = &x->j;
+         int tem = *p;
+         x->j += tem * i;
+       }
+      int *r = &x->k;
+      int tem2 = *r;
+      x->k += tem2 * j;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "Executing store motion" 2 "lim2" } } */
+
diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index b24bc64f2a7..5ca4738b20e 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -3211,6 +3211,10 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call)
          if (dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
            last = bb;
+         if (inn_loop != loop
+             && flow_loop_nested_p (bb->loop_father, inn_loop))
+           inn_loop = bb->loop_father;
+

The comment says

               /* In a loop that is always entered we may proceed anyway.
                  But record that we entered it and stop once we leave it.  */
               inn_loop = bb->loop_father;

and your change would defeat that early return, no?

The issue is that the search exits too early when iterating the outer
loop.  Take the nested loop below as an example: loop 1 contains bbs
5, 8, 3, 10, 4 and 9, and loop 2 contains bbs 3 and 10.  Currently the
walk breaks at bb 3, since bb 3 doesn't dominate bb 9, the latch of
loop 1.  But both bb 5 and bb 4 actually are ALWAYS_EXECUTED for loop 1,
so if there are store instructions in bb 4 they are never considered by
store motion.


    5<----
    |\   |
    8 \  9
    |  \ |
--->3--->4
|   |
----10


On master, SET_ALWAYS_EXECUTED_IN is only applied to bb 5.  With this
patch, the search continues past bb 3 up to bb 4, where last is updated
to bb 4; the walk then stops once the exit edge of bb 4 is found by
"if (!flow_bb_inside_loop_p (loop, e->dest))".  The loop that follows
then marks bb 4 as ALWAYS_EXECUTED together with its immediate
dominator, bb 5:


     while (1)
        {
          SET_ALWAYS_EXECUTED_IN (last, loop);
          if (last == loop->header)
            break;
          last = get_immediate_dominator (CDI_DOMINATORS, last);
        }

After further discussion with Kewen, we found that the inn_loop variable is
totally useless and could be removed.



          if (bitmap_bit_p (contains_call, bb->index))
            break;
@@ -3238,7 +3242,7 @@ fill_always_executed_in_1 (class loop *loop, sbitmap contains_call)
          if (bb->loop_father->header == bb)
            {
-             if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
+             if (!dominated_by_p (CDI_DOMINATORS, bb->loop_father->latch, bb))
                break;

That's now an always-false condition - a loop's latch is always dominated
by its header.  The condition as written tries to verify whether the
loop is always entered - mind we visit all blocks, not only those
always executed.

Thanks for the catch!  I am afraid this piece of code should be removed,
since it stops the search for potentially ALWAYS_EXECUTED bbs after the
inner loop...


In fact for your testcase the x->j ref is _not_ always executed
since the inner loop is conditional on n > 0.

Yes.  But I want store motion to move x->k (not x->j) out of loop 1 when
l > 0.  Attached is a diff of the lim2 dumps without and with my patch
to show the extra optimization.

x->j is already moved out of loop 2 on master.  If n and l are changed
to constants like 100, master can also do both store motions as
expected: the edge from bb 5 to bb 4 then no longer exists, so bb 4,
bb 3 and bb 5 are all ALWAYS_EXECUTED for loop 1.  (A sketch of that
variant follows the testcase below.)


struct X { int i; int j; int k;};

void foo(struct X *x, int n, int l)
{
 for (int j = 0; j < l; j++) // loop 1
   {
     for (int i = 0; i < n; ++i)  // loop 2
       {
         int *p = &x->j;
         int tem = *p;
         x->j += tem * i;
       }
     int *r = &x->k;
     int tem2 = *r;
     x->k += tem2 * j;
   }
}
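
For reference, a sketch of the constant-trip-count variant mentioned
above (illustrative only; the bound 100 is arbitrary and n/l are simply
left unused):

struct X { int i; int j; int k;};

void foo(struct X *x, int n, int l)
{
  for (int j = 0; j < 100; j++)      // loop 1, now always entered
    {
      for (int i = 0; i < 100; ++i)  // loop 2, now always entered
        {
          int *p = &x->j;
          int tem = *p;
          x->j += tem * i;
        }
      int *r = &x->k;
      int tem2 = *r;
      x->k += tem2 * j;
    }
}

With both trip counts known to be nonzero, bb 5, bb 3 and bb 4 are all
always executed, so master already performs both store motions on this
variant.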


Richard.


--
Thanks,
Xionghu
--- ssa-lim-19.c.137t.lim2      2021-08-16 03:27:04.173709326 -0500
+++ ../debug/ssa-lim-19.c.137t.lim2     2021-08-16 03:26:47.866261053 -0500
@@ -34,141 +34,172 @@
 Basic block 5 (loop 1 -- depth 1):
 
 if (n_11(D) > 0)
   invariant up to level 1, cost 20.
 
 Basic block 8 (loop 1 -- depth 1):
 
 Basic block 3 (loop 2 -- depth 2):
 
 Basic block 4 (loop 1 -- depth 1):
 
 Basic block 9 (loop 1 -- depth 1):
 
 Basic block 10 (loop 2 -- depth 2):
 
+Querying dependency of refs 2 and 1: independent.
+Querying RAW dependencies of ref 2 in loop 2: independent
+Querying RAW dependencies of ref 2 in loop 1: independent
+Querying dependency of refs 2 and 1: independent.
+Querying SM WAR dependencies of ref 2 in loop 2: independent
+Querying SM WAR dependencies of ref 2 in loop 1: independent
+Executing store motion of MEM[(int *)x_12(D) + 8B] from loop 1
 Querying RAW dependencies of ref 1 in loop 2: independent
 Querying SM WAR dependencies of ref 1 in loop 2: independent
 Executing store motion of MEM[(int *)x_12(D) + 4B] from loop 2
 Moving statement
-x__lsm.4 = MEM[(int *)x_12(D) + 4B];
+x__lsm.5 = MEM[(int *)x_12(D) + 4B];
 (cost 0) out of loop 2.
 
+Moving statement
+x__lsm.4 = MEM[(int *)x_12(D) + 8B];
+(cost 0) out of loop 1.
+
 
 Updating SSA:
-creating PHI node in block #3 for x__lsm.4
+creating PHI node in block #5 for x__lsm.4
+creating PHI node in block #3 for x__lsm.5
 Registering new PHI nodes in block #0
 Registering new PHI nodes in block #2
 Registering new PHI nodes in block #7
+Updating SSA information for statement x__lsm.4 = MEM[(int *)x_12(D) + 8B];
 Registering new PHI nodes in block #5
 Registering new PHI nodes in block #8
-Updating SSA information for statement x__lsm.4 = MEM[(int *)x_12(D) + 4B];
+Updating SSA information for statement x__lsm.5 = MEM[(int *)x_12(D) + 4B];
 Registering new PHI nodes in block #3
-Updating SSA information for statement tem_16 = x__lsm.4;
-Updating SSA information for statement x__lsm.4 = _2;
-Registering new PHI nodes in block #11
-Updating SSA information for statement MEM[(int *)x_12(D) + 4B] = x__lsm.4;
+Updating SSA information for statement tem_16 = x__lsm.5;
+Updating SSA information for statement x__lsm.5 = _2;
+Registering new PHI nodes in block #12
+Updating SSA information for statement MEM[(int *)x_12(D) + 4B] = x__lsm.5;
 Registering new PHI nodes in block #10
 Registering new PHI nodes in block #4
-Updating SSA information for statement tem2_13 = MEM[(int *)x_12(D) + 8B];
-Updating SSA information for statement x_12(D)->k = _4;
+Updating SSA information for statement tem2_13 = x__lsm.4;
+Updating SSA information for statement x__lsm.4 = _4;
+Registering new PHI nodes in block #11
+Updating SSA information for statement MEM[(int *)x_12(D) + 8B] = x__lsm.4;
 Registering new PHI nodes in block #9
 Registering new PHI nodes in block #6
 Updating SSA information for statement return;
 
 Symbols to be put in SSA form
-{ D.3324 D.3329 }
+{ D.3324 D.3329 D.3330 }
 Incremental SSA update started at block: 0
-Number of blocks in CFG: 12
-Number of blocks to update: 11 ( 92%)
-Affected blocks: 0 2 3 4 5 6 7 8 9 10 11
+Number of blocks in CFG: 13
+Number of blocks to update: 12 ( 92%)
+Affected blocks: 0 2 3 4 5 6 7 8 9 10 11 12
 
 
-;; Created LCSSA PHI: x__lsm.4_6 = PHI <x__lsm.4_5(3)>
+;; Created LCSSA PHI: x__lsm.5_29 = PHI <x__lsm.5_6(3)>
+;; Created LCSSA PHI: x__lsm.4_30 = PHI <x__lsm.4_21(4)>
 
 Updating SSA:
+Registering new PHI nodes in block #5
+Registering new PHI nodes in block #8
 Registering new PHI nodes in block #3
-Updating SSA information for statement x__lsm.4_5 = _2;
-Registering new PHI nodes in block #11
-Updating SSA information for statement MEM[(int *)x_12(D) + 4B] = x__lsm.4_5;
+Updating SSA information for statement x__lsm.5_6 = _2;
+Registering new PHI nodes in block #12
+Updating SSA information for statement MEM[(int *)x_12(D) + 4B] = x__lsm.5_6;
 Registering new PHI nodes in block #10
+Registering new PHI nodes in block #4
+Updating SSA information for statement x__lsm.4_21 = _4;
+Registering new PHI nodes in block #11
+Updating SSA information for statement MEM[(int *)x_12(D) + 8B] = x__lsm.4_21;
+Registering new PHI nodes in block #9
 
 SSA replacement table
 N_i -> { O_1 ... O_j } means that N_i replaces O_1, ..., O_j
 
-x__lsm.4_6 -> { x__lsm.4_5 }
-Incremental SSA update started at block: 3
-Number of blocks in CFG: 12
-Number of blocks to update: 3 ( 25%)
-Affected blocks: 3 10 11
+x__lsm.5_29 -> { x__lsm.5_6 }
+x__lsm.4_30 -> { x__lsm.4_21 }
+Incremental SSA update started at block: 5
+Number of blocks in CFG: 13
+Number of blocks to update: 7 ( 54%)
+Affected blocks: 3 4 5 9 10 11 12
 
 
 void foo (struct X * x, int n, int l)
 {
+  int x__lsm.5;
   int x__lsm.4;
   int tem;
   int i;
   int tem2;
   int j;
   int _1;
   int _2;
   int _3;
   int _4;
 
   <bb 2> [local count: 14598063]:
   if (l_10(D) > 0)
     goto <bb 7>; [89.00%]
   else
     goto <bb 6>; [11.00%]
 
   <bb 7> [local count: 12992276]:
+  x__lsm.4_5 = MEM[(int *)x_12(D) + 8B];
   goto <bb 5>; [100.00%]
 
   <bb 8> [local count: 105119324]:
-  x__lsm.4_8 = MEM[(int *)x_12(D) + 4B];
+  x__lsm.5_7 = MEM[(int *)x_12(D) + 4B];
 
   <bb 3> [local count: 955630225]:
   # i_24 = PHI <i_18(10), 0(8)>
-  # x__lsm.4_20 = PHI <x__lsm.4_5(10), x__lsm.4_8(8)>
-  tem_16 = x__lsm.4_20;
+  # x__lsm.5_8 = PHI <x__lsm.5_6(10), x__lsm.5_7(8)>
+  tem_16 = x__lsm.5_8;
   _1 = tem_16 * i_24;
   _2 = _1 + tem_16;
-  x__lsm.4_5 = _2;
+  x__lsm.5_6 = _2;
   i_18 = i_24 + 1;
   if (n_11(D) > i_18)
     goto <bb 10>; [89.00%]
   else
-    goto <bb 11>; [11.00%]
+    goto <bb 12>; [11.00%]
 
   <bb 10> [local count: 850510901]:
   goto <bb 3>; [100.00%]
 
-  <bb 11> [local count: 105119324]:
-  # x__lsm.4_6 = PHI <x__lsm.4_5(3)>
-  MEM[(int *)x_12(D) + 4B] = x__lsm.4_6;
+  <bb 12> [local count: 105119324]:
+  # x__lsm.5_29 = PHI <x__lsm.5_6(3)>
+  MEM[(int *)x_12(D) + 4B] = x__lsm.5_29;
 
   <bb 4> [local count: 118111600]:
-  tem2_13 = MEM[(int *)x_12(D) + 8B];
+  tem2_13 = x__lsm.4_20;
   _3 = tem2_13 * j_23;
   _4 = _3 + tem2_13;
-  x_12(D)->k = _4;
+  x__lsm.4_21 = _4;
   j_15 = j_23 + 1;
   if (l_10(D) > j_15)
     goto <bb 9>; [89.00%]
   else
-    goto <bb 6>; [11.00%]
+    goto <bb 11>; [11.00%]
 
   <bb 9> [local count: 105119324]:
 
   <bb 5> [local count: 118111600]:
   # j_23 = PHI <j_15(9), 0(7)>
+  # x__lsm.4_20 = PHI <x__lsm.4_21(9), x__lsm.4_5(7)>
   if (n_11(D) > 0)
     goto <bb 8>; [89.00%]
   else
     goto <bb 4>; [11.00%]
 
+  <bb 11> [local count: 12992276]:
+  # x__lsm.4_30 = PHI <x__lsm.4_21(4)>
+  MEM[(int *)x_12(D) + 8B] = x__lsm.4_30;
+
   <bb 6> [local count: 14598063]:
   return;
 
 }
 
 
