[gcc r15-2097] vect: Optimize order of lane-reducing operations in loop def-use cycles

Feng Xue via Gcc-cvs Wed, 17 Jul 2024 06:58:56 -0700

https://gcc.gnu.org/g:db3c8c9726d0bafbb9f85b6d7027fe83602643e7


commit r15-2097-gdb3c8c9726d0bafbb9f85b6d7027fe83602643e7
Author: Feng Xue <f...@os.amperecomputing.com>
Date:   Wed May 29 17:28:14 2024 +0800

    vect: Optimize order of lane-reducing operations in loop def-use cycles
    
    When transforming multiple lane-reducing operations in a loop reduction 
chain,
    originally, corresponding vectorized statements are generated into def-use
    cycles starting from 0. The def-use cycle with smaller index, would contain
    more statements, which means more instruction dependency. For example:
    
       int sum = 1;
       for (i)
         {
           sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
           sum += w[i];               // widen-sum <vector(16) char>
           sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
           sum += n[i];               // normal <vector(4) int>
         }
    
    Original transformation result:
    
       for (i / 16)
         {
           sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
           sum_v1 = sum_v1;  // copy
           sum_v2 = sum_v2;  // copy
           sum_v3 = sum_v3;  // copy
    
           sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
           sum_v1 = sum_v1;  // copy
           sum_v2 = sum_v2;  // copy
           sum_v3 = sum_v3;  // copy
    
           sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
           sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
           sum_v2 = sum_v2;  // copy
           sum_v3 = sum_v3;  // copy
    
           ...
         }
    
    For a higher instruction parallelism in final vectorized loop, an optimal
    means is to make those effective vector lane-reducing ops be distributed
    evenly among all def-use cycles. Transformed as the below, DOT_PROD,
    WIDEN_SUM and SADs are generated into disparate cycles, instruction
    dependency among them could be eliminated.
    
       for (i / 16)
         {
           sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
           sum_v1 = sum_v1;  // copy
           sum_v2 = sum_v2;  // copy
           sum_v3 = sum_v3;  // copy
    
           sum_v0 = sum_v0;  // copy
           sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
           sum_v2 = sum_v2;  // copy
           sum_v3 = sum_v3;  // copy
    
           sum_v0 = sum_v0;  // copy
           sum_v1 = sum_v1;  // copy
           sum_v2 = SAD (s0_v2[i: 0 ~ 7 ], s1_v2[i: 0 ~ 7 ], sum_v2);
           sum_v3 = SAD (s0_v3[i: 8 ~ 15], s1_v3[i: 8 ~ 15], sum_v3);
    
           ...
         }
    
    2024-03-22 Feng Xue <f...@os.amperecomputing.com>
    
    gcc/
            PR tree-optimization/114440
            * tree-vectorizer.h (struct _stmt_vec_info): Add a new field
            reduc_result_pos.
            * tree-vect-loop.cc (vect_transform_reduction): Generate 
lane-reducing
            statements in an optimized order.

Diff:
---
 gcc/tree-vect-loop.cc | 64 +++++++++++++++++++++++++++++++++++++++++++++------
 gcc/tree-vectorizer.h |  6 +++++
 2 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 1c3dbf4bc71b..d7d628efa60f 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8844,6 +8844,7 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
               sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
               sum += w[i];               // widen-sum <vector(16) char>
               sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
+              sum += n[i];               // normal <vector(4) int>
             }
 
         The vector size is 128-bit，vectorization factor is 16.  Reduction
@@ -8861,19 +8862,27 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
               sum_v2 = sum_v2;  // copy
               sum_v3 = sum_v3;  // copy
 
-              sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
-              sum_v1 = sum_v1;  // copy
+              sum_v0 = sum_v0;  // copy
+              sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
               sum_v2 = sum_v2;  // copy
               sum_v3 = sum_v3;  // copy
 
-              sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
-              sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
-              sum_v2 = sum_v2;  // copy
+              sum_v0 = sum_v0;  // copy
+              sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
+              sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
               sum_v3 = sum_v3;  // copy
+
+              sum_v0 += n_v0[i: 0  ~ 3 ];
+              sum_v1 += n_v1[i: 4  ~ 7 ];
+              sum_v2 += n_v2[i: 8  ~ 11];
+              sum_v3 += n_v3[i: 12 ~ 15];
             }
 
-          sum_v = sum_v0 + sum_v1 + sum_v2 + sum_v3;   // = sum_v0 + sum_v1
-       */
+        Moreover, for a higher instruction parallelism in final vectorized
+        loop, it is considered to make those effective vector lane-reducing
+        ops be distributed evenly among all def-use cycles.  In the above
+        example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
+        cycles, instruction dependency among them could be eliminated.  */
       unsigned effec_ncopies = vec_oprnds[0].length ();
       unsigned total_ncopies = vec_oprnds[reduc_index].length ();
 
@@ -8887,6 +8896,47 @@ vect_transform_reduction (loop_vec_info loop_vinfo,
              vec_oprnds[i].safe_grow_cleared (total_ncopies);
            }
        }
+
+      tree reduc_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
+      gcc_assert (reduc_vectype_in);
+
+      unsigned effec_reduc_ncopies
+       = vect_get_num_copies (loop_vinfo, slp_node, reduc_vectype_in);
+
+      gcc_assert (effec_ncopies <= effec_reduc_ncopies);
+
+      if (effec_ncopies < effec_reduc_ncopies)
+       {
+         /* Find suitable def-use cycles to generate vectorized statements
+            into, and reorder operands based on the selection.  */
+         unsigned curr_pos = reduc_info->reduc_result_pos;
+         unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
+
+         gcc_assert (curr_pos < effec_reduc_ncopies);
+          reduc_info->reduc_result_pos = next_pos;
+
+         if (curr_pos)
+           {
+             unsigned count = effec_reduc_ncopies - effec_ncopies;
+             unsigned start = curr_pos - count;
+
+             if ((int) start < 0)
+               {
+                 count = curr_pos;
+                 start = 0;
+               }
+
+             for (unsigned i = 0; i < op.num_ops - 1; i++)
+               {
+                 for (unsigned j = effec_ncopies; j > start; j--)
+                   {
+                     unsigned k = j - 1;
+                     std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
+                     gcc_assert (!vec_oprnds[i][k]);
+                   }
+               }
+           }
+       }
     }
 
   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index d8be89c37891..df6c8ada2f78 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1402,6 +1402,12 @@ public:
   /* The vector type for performing the actual reduction.  */
   tree reduc_vectype;
 
+  /* For loop reduction with multiple vectorized results (ncopies > 1), a
+     lane-reducing operation participating in it may not use all of those
+     results, this field specifies result index starting from which any
+     following land-reducing operation would be assigned to.  */
+  unsigned int reduc_result_pos;
+
   /* If IS_REDUC_INFO is true and if the vector code is performing
      N scalar reductions in parallel, this variable gives the initial
      scalar values of those N reductions.  */

[gcc r15-2097] vect: Optimize order of lane-reducing operations in loop def-use cycles

Reply via email to