[PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]

Juzhe-Zhong Thu, 04 Jan 2024 17:54:04 -0800

1). We have vashl_optab, vashr_optab and vlshr_optab, which vectorize shifts
with a vector shift amount, that is, vectorization of 'a[i] >> x[i]', where
the shift amount is loop variant.
2). We also have ashl_optab, ashr_optab and lshr_optab, which can vectorize
shifts with a scalar shift amount, that is, vectorization of 'a[i] >> x',
where the shift amount is loop invariant.


In case 2), we don't need to allocate a vector register group for the shift
amount.

Consider the following case:

void
f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x,
   int n)
{
  for (int i = 0; i < n; i++)
    {
      int tmp = b[i] >> x;
      int tmp2 = tmp * b[i];
      c[i] = tmp2 * b[i];
      d[i] = tmp * tmp2 * b[i] >> x;
    }
}

Before this patch we chose LMUL = 4; with this patch we can choose
LMUL = 8:

f:
        ble     a5,zero,.L5
.L3:
        vsetvli a0,a5,e32,m8,ta,ma
        slli    a6,a0,2
        vle32.v v16,0(a1)
        vsra.vx v24,v16,a4
        vmul.vv v8,v24,v16
        vmul.vv v0,v8,v16
        vse32.v v0,0(a2)
        vmul.vv v8,v8,v24
        vmul.vv v8,v8,v16
        vsra.vx v8,v8,a4
        vse32.v v8,0(a3)
        add     a1,a1,a6
        add     a2,a2,a6
        add     a3,a3,a6
        sub     a5,a5,a0
        bne     a5,zero,.L3
.L5:
        ret

Tested on both RV32 and RV64 with no regressions.  OK for trunk?

Note that we will apply the same heuristic to vadd.vx, etc. once the
late-combine pass from Richard Sandiford is committed (we need the
late-combine pass to do the vv->vx transformation for vadd).

gcc/ChangeLog:

        * config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): New 
function.
        (variable_vectorized_p): Teach loop invariant.
        (has_unexpected_spills_p): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c: New test.
        * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c: New test.

---
 gcc/config/riscv/riscv-vector-costs.cc        | 31 +++++++--
 .../costmodel/riscv/rvv/dynamic-lmul4-12.c    | 40 ++++++++++++
 .../costmodel/riscv/rvv/dynamic-lmul8-14.c    | 64 +++++++++++++++++++
 3 files changed, 131 insertions(+), 4 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c
 create mode 100644 
gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c

diff --git a/gcc/config/riscv/riscv-vector-costs.cc 
b/gcc/config/riscv/riscv-vector-costs.cc
index ec8156fbaf8..00b0b4d64b9 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -230,9 +230,24 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2)
   return mode1_size >= mode2_size ? mode1 : mode2;
 }
 
+/* Return true if OP is invariant in LOOP: a GIMPLE invariant, a default
+   definition, or an SSA name whose defining statement is outside LOOP.  */
+
+static bool
+loop_invariant_op_p (class loop *loop, tree op)
+{
+  if (is_gimple_min_invariant (op))
+    return true;
+  if (SSA_NAME_IS_DEFAULT_DEF (op)
+      || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op))))
+    return true;
+  return false;
+}
+
 /* Return true if the variable should be counted into liveness.  */
 static bool
-variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p)
+variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var,
+                      bool lhs_p)
 {
   if (!var)
     return false;
@@ -275,6 +290,10 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, 
bool lhs_p)
                 || !tree_fits_shwi_p (var)
                 || !IN_RANGE (tree_to_shwi (var), -16, 15)
                 || gimple_assign_rhs1 (stmt) != var;
+       case LSHIFT_EXPR:
+       case RSHIFT_EXPR:
+         return gimple_assign_rhs2 (stmt) != var
+                || !loop_invariant_op_p (loop, var);
        default:
          break;
        }
@@ -312,10 +331,12 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, 
bool lhs_p)
    The live range of SSA 2 is [0, 4] in bb 3.  */
 static machine_mode
 compute_local_live_ranges (
+  loop_vec_info loop_vinfo,
   const hash_map<basic_block, vec<stmt_point>> &program_points_per_bb,
   hash_map<basic_block, hash_map<tree, pair>> &live_ranges_per_bb)
 {
   machine_mode biggest_mode = QImode;
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   if (!program_points_per_bb.is_empty ())
     {
       auto_vec<tree> visited_vars;
@@ -339,7 +360,8 @@ compute_local_live_ranges (
              unsigned int point = program_point.point;
              gimple *stmt = program_point.stmt;
              tree lhs = gimple_get_lhs (stmt);
-             if (variable_vectorized_p (program_point.stmt_info, lhs, true))
+             if (variable_vectorized_p (loop, program_point.stmt_info, lhs,
+                                        true))
                {
                  biggest_mode = get_biggest_mode (biggest_mode,
                                                   TYPE_MODE (TREE_TYPE (lhs)));
@@ -356,7 +378,7 @@ compute_local_live_ranges (
              for (i = 0; i < gimple_num_args (stmt); i++)
                {
                  tree var = gimple_arg (stmt, i);
-                 if (variable_vectorized_p (program_point.stmt_info, var,
+                 if (variable_vectorized_p (loop, program_point.stmt_info, var,
                                             false))
                    {
                      biggest_mode
@@ -781,7 +803,8 @@ has_unexpected_spills_p (loop_vec_info loop_vinfo)
   /* Compute local live ranges.  */
   hash_map<basic_block, hash_map<tree, pair>> live_ranges_per_bb;
   machine_mode biggest_mode
-    = compute_local_live_ranges (program_points_per_bb, live_ranges_per_bb);
+    = compute_local_live_ranges (loop_vinfo, program_points_per_bb,
+                                live_ranges_per_bb);
 
   /* Update live ranges according to PHI.  */
   update_local_live_ranges (loop_vinfo, program_points_per_bb,
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c
new file mode 100644
index 00000000000..0cb492e611c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param 
riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */
+
+void
+f (int *restrict a, int *restrict b, int *restrict c, int *restrict d,
+   int *restrict x, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] >> x[i];
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x[i];
+    }
+}
+
+void
+f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d,
+    int *restrict x, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] << x[i];
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x[i];
+    }
+}
+
+/* { dg-final { scan-assembler-times {e32,m4} 2 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-not {e32,m8} } } */
+/* { dg-final { scan-assembler-not {e32,m2} } } */
+/* { dg-final { scan-assembler-not {e32,m1} } } */
+/* { dg-final { scan-assembler-times {ret} 2 } } */
+/* { dg-final { scan-tree-dump-times "Preferring smaller LMUL loop because it 
has unexpected spills" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c
new file mode 100644
index 00000000000..0d42c3b27cb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param 
riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */
+
+void
+f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x,
+   int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] >> x;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x;
+    }
+}
+
+void
+f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x,
+    int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] << x;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> x;
+    }
+}
+
+void
+f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] >> 17;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> 17;
+    }
+}
+
+void
+f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n)
+{
+  for (int i = 0; i < n; i++)
+    {
+      int tmp = b[i] << 17;
+      int tmp2 = tmp * b[i];
+      c[i] = tmp2 * b[i];
+      d[i] = tmp * tmp2 * b[i] >> 17;
+    }
+}
+
+/* { dg-final { scan-assembler-times {e32,m8} 4 } } */
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-assembler-not {jr} } } */
+/* { dg-final { scan-assembler-not {e32,m4} } } */
+/* { dg-final { scan-assembler-not {e32,m2} } } */
+/* { dg-final { scan-assembler-not {e32,m1} } } */
+/* { dg-final { scan-assembler-times {ret} 4 } } */
+/* { dg-final { scan-tree-dump-not "Preferring smaller LMUL loop because it 
has unexpected spills" "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 4 "vect" } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 4 "vect" } } */
-- 
2.36.3

[PATCH] RISC-V: Teach liveness computation loop invariant shift amount[Dynamic LMUL]

Reply via email to