When evaluating dynamic LMUL, we noticed that we can do better for VLA SLP with
duplicate VLA shuffle indices.

Consider the following case:

void
foo (uint16_t *restrict a, uint16_t *restrict b, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 8] = b[i * 8 + 3] + 1;
      a[i * 8 + 1] = b[i * 8 + 6] + 1;
      a[i * 8 + 2] = b[i * 8 + 0] + 1;
      a[i * 8 + 3] = b[i * 8 + 2] + 1;
      a[i * 8 + 4] = b[i * 8 + 1] + 1;
      a[i * 8 + 5] = b[i * 8 + 7] + 1;
      a[i * 8 + 6] = b[i * 8 + 5] + 1;
      a[i * 8 + 7] = b[i * 8 + 4] + 1;
    }
}

We want to generate the following shuffle indices:

{ 3, 6, 0, 2, 1, 7, 5, 4, 11, 14, 8, 10, 9, 15, 13, 12, 19, 22, 16, 18, 17, 23, 
21, 20, ... }
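
That is, the permutation from foo repeats every 8 elements with the group base
added on top.  As a reference, here is a minimal scalar sketch (illustration
only, not part of the patch; gather_index is a hypothetical helper) that
reproduces these indices:

#include <stdio.h>

static const unsigned perm[8] = { 3, 6, 0, 2, 1, 7, 5, 4 };

static unsigned
gather_index (unsigned i)
{
  return perm[i % 8] + (i / 8) * 8;  /* e.g. i = 9 -> 6 + 8 = 14 */
}

int
main (void)
{
  for (unsigned i = 0; i < 24; i++)
    printf ("%u, ", gather_index (i));  /* 3, 6, 0, 2, 1, 7, 5, 4, 11, ... */
  printf ("...\n");
  return 0;
}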

Before this patch, we used a chain of vmseq + vmerge pairs to create such a
vector in the prologue:

https://godbolt.org/z/r919WPabK

foo:
        ble     a2,zero,.L5
        vsetvli a5,zero,e16,m8,ta,ma
        vid.v   v16
        vand.vi v24,v16,7
        vmseq.vi        v0,v24,1
        vmseq.vi        v1,v24,2
        vmv.v.i v8,3
        vmerge.vim      v8,v8,5,v0
        vmv1r.v v0,v1
        vmseq.vi        v2,v24,3
        vmerge.vim      v8,v8,-2,v0
        vmv1r.v v0,v2
        vmseq.vi        v1,v24,4
        csrr    a4,vlenb
        vmerge.vim      v8,v8,-1,v0
        csrr    a6,vlenb
        vmv1r.v v0,v1
        vmseq.vi        v2,v24,5
        slli    a3,a4,2
        vmerge.vim      v8,v8,-3,v0
        slli    a6,a6,2
        vmv1r.v v0,v2
        vmseq.vi        v1,v24,6
        vmerge.vim      v8,v8,2,v0
        addi    a6,a6,-1
        vmv1r.v v0,v1
        slli    a2,a2,3
        slli    a4,a4,3
        neg     a7,a3
        vmseq.vi        v2,v24,7
        vmerge.vim      v8,v8,-1,v0
        vmv.v.x v24,a6
        vmv1r.v v0,v2
        vmerge.vim      v8,v8,-3,v0
        vadd.vv v16,v16,v8
        vand.vv v24,v16,v24
.L3:
        minu    a5,a2,a3
        vsetvli zero,a5,e16,m8,ta,ma
        mv      a6,a2
        vle16.v v16,0(a1)
        vrgather.vv     v8,v16,v24
        vadd.vi v8,v8,1
        vse16.v v8,0(a0)
        add     a1,a1,a4
        add     a0,a0,a4
        add     a2,a2,a7
        bgtu    a6,a3,.L3
.L5:
        ret

After this patch:

foo:
        ble     a2,zero,.L5
        li      a6,-536875008
        slli    a6,a6,4
        addi    a6,a6,3
        csrr    a4,vlenb
        csrr    a5,vlenb
        li      t1,-536850432
        slli    a6,a6,16
        slli    a3,a4,1
        addi    a6,a6,-3
        slli    a5,a5,1
        slli    t1,t1,4
        vsetvli t3,zero,e64,m4,ta,ma
        addi    a5,a5,-1
        vmv.v.x v4,a6
        addi    t1,t1,3
        slli    a2,a2,3
        slli    a4,a4,2
        neg     a7,a3
        vslide1up.vx    v16,v4,t1
        vsetvli a6,zero,e16,m4,ta,ma
        vid.v   v12
        vmv.v.x v4,a5
        vand.vi v20,v12,7
        vrgather.vv     v8,v16,v20
        vadd.vv v12,v12,v8
        vand.vv v12,v12,v4
.L3:
        minu    a5,a2,a3
        vsetvli zero,a5,e16,m4,ta,ma
        mv      a6,a2
        vle16.v v8,0(a1)
        vrgather.vv     v4,v8,v12
        vadd.vi v4,v4,1
        vse16.v v4,0(a0)
        add     a1,a1,a4
        add     a0,a0,a4
        add     a2,a2,a7
        bgtu    a6,a3,.L3
.L5:
        ret

Optimization:
1. Nine fewer dynamic instructions in the prologue (see the sketch below).
2. Fewer vector instructions reduce hardware pipeline consumption.
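
For reference, here is a minimal scalar sketch of the new index construction.
It is not part of the patch: build_index is a hypothetical helper and NELEMS is
a stand-in for the 2 * vlenb e16 elements of one LMUL = 4 register group.  The
per-lane adjustments perm[k] - k are packed into two 64-bit scalars the way
rvv_builder::merge_pattern does, reinterpreted as 16-bit lanes (vmv.v.x +
vslide1up.vx in the generated code), gathered with vid & 7, then added to vid
and masked:

#include <stdint.h>
#include <string.h>

#define NELEMS 32  /* stand-in for 2 * vlenb e16 elements per LMUL = 4 group */

static const int16_t perm[8] = { 3, 6, 0, 2, 1, 7, 5, 4 };

static void
build_index (uint16_t index[NELEMS])
{
  /* Step 1: pack the eight 16-bit adjustments perm[k] - k
     ({ 3, 5, -2, -1, -3, 2, -1, -3 }) into two 64-bit scalars,
     low lane first, as rvv_builder::merge_pattern does.  */
  uint64_t chunk[2] = { 0, 0 };
  for (int k = 0; k < 8; k++)
    {
      uint64_t adj = (uint16_t) (perm[k] - k);
      chunk[k / 4] |= adj << (16 * (k % 4));
    }

  /* Step 2: view the chunks as 16-bit lanes (little-endian, as on RISC-V);
     the generated code materializes them with vmv.v.x + vslide1up.vx.  */
  uint16_t adj16[8];
  memcpy (adj16, chunk, sizeof adj16);

  /* Step 3: index = (vid + vrgather (adj16, vid & 7)) & (NELEMS - 1),
     matching the vid.v / vand.vi / vrgather.vv / vadd.vv / vand.vv
     sequence in the new prologue.  */
  for (int i = 0; i < NELEMS; i++)
    index[i] = (uint16_t) (i + adj16[i & 7]) & (NELEMS - 1);
}

int
main (void)
{
  uint16_t index[NELEMS];
  build_index (index);
  return index[0] == 3 ? 0 : 1;  /* first index is 3, as in the dump above */
}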

gcc/ChangeLog:

        * config/riscv/riscv-v.cc (rvv_builder::merge_pattern): New function.
        (expand_vector_init_trailing_same_elem): Adapt function.
        (expand_const_vector): Ditto.
        (expand_vec_init): Ditto.

gcc/testsuite/ChangeLog:

        * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Adapt test.
        * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto.
        * gcc.dg/vect/costmodel/riscv/rvv/slp-1.c: New test.

---
 gcc/config/riscv/riscv-v.cc                   | 174 +++++++++++++-----
 .../costmodel/riscv/rvv/dynamic-lmul4-6.c     |   7 +-
 .../costmodel/riscv/rvv/dynamic-lmul4-8.c     |   7 +-
 .../gcc.dg/vect/costmodel/riscv/rvv/slp-1.c   |  40 ++++
 4 files changed, 174 insertions(+), 54 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 000d0c2c721..6a2009ffb05 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -420,6 +420,7 @@ public:
 
   bool single_step_npatterns_p () const;
   bool npatterns_all_equal_p () const;
+  void merge_pattern (rtx);
 
   machine_mode new_mode () const { return m_new_mode; }
   scalar_mode inner_mode () const { return m_inner_mode; }
@@ -679,6 +680,50 @@ rvv_builder::npatterns_all_equal_p () const
   return true;
 }
 
+/* Merge patterns to reduce the number of elements we need to process.
+
+   E.g. v = { 0, 1, 2, 3 }, mode = V4SI.
+
+   Since we can use EEW = 64 RVV instructions, transform it into:
+
+   v = { 1 << 32, 3 << 32 | 2 }, mode = V2DI.  */
+void
+rvv_builder::merge_pattern (rtx src)
+{
+  if (this->inner_bits_size () >= GET_MODE_BITSIZE (Pmode))
+    return;
+  unsigned int ele_num = GET_MODE_BITSIZE (Pmode) / this->inner_bits_size ();
+  poly_uint64 nunits = exact_div (this->full_nelts (), ele_num);
+  machine_mode mode;
+  if (!get_vector_mode (Pmode, nunits).exists (&mode))
+    return;
+  this->new_vector (mode, this->npatterns () / ele_num,
+                   this->nelts_per_pattern ());
+  rtx imm = gen_int_mode ((1ULL << this->inner_bits_size ()) - 1, Pmode);
+  for (unsigned int i = 0; i < this->nelts_per_pattern (); i++)
+    {
+      for (unsigned int j = 0; j < this->npatterns (); j++)
+       {
+         rtx val = const0_rtx;
+         for (unsigned int k = 0; k < ele_num; k++)
+           {
+             rtx e
+               = gen_lowpart (Pmode, CONST_VECTOR_ELT (src, k + j * ele_num));
+             e = expand_simple_binop (Pmode, AND, e, imm, NULL_RTX, false,
+                                      OPTAB_DIRECT);
+             e = expand_simple_binop (
+               Pmode, ASHIFT, e,
+               gen_int_mode (this->inner_bits_size () * k, Pmode), NULL_RTX,
+               false, OPTAB_DIRECT);
+             val = expand_simple_binop (Pmode, IOR, e, val, NULL_RTX, false,
+                                        OPTAB_DIRECT);
+           }
+         this->quick_push (val);
+       }
+    }
+  this->finalize ();
+}
+
 static unsigned
 get_sew (machine_mode mode)
 {
@@ -1040,6 +1085,39 @@ expand_vec_series (rtx dest, rtx base, rtx step)
     emit_move_insn (dest, result);
 }
 
+/* Subroutine of expand_vec_init and expand_const_vector to handle case
+   when all trailing elements of builder are same.
+   This works as follows:
+   (a) Use expand_insn interface to broadcast last vector element in TARGET.
+   (b) Insert remaining elements in TARGET using insr.
+
+   ??? The heuristic used is to do above if number of same trailing elements
+   is greater than leading_ndups, loosely based on
+   heuristic from mostly_zeros_p.  May need fine-tuning.  */
+
+static void
+expand_vector_init_trailing_same_elem (rtx target,
+                                      const rtx_vector_builder &builder,
+                                      int start, int nelts_reqd)
+{
+  machine_mode mode = GET_MODE (target);
+
+  rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
+  for (int i = start - 1; i >= 0; i--)
+    {
+      unsigned int unspec
+       = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
+      insn_code icode = code_for_pred_slide (unspec, mode);
+      rtx tmp = gen_reg_rtx (mode);
+      rtx ops[] = {tmp, dup, builder.elt (i)};
+      emit_vlmax_insn (icode, BINARY_OP, ops);
+      /* slide1up needs source and dest to be different REGs.  */
+      dup = tmp;
+    }
+
+  emit_move_insn (target, dup);
+}
+
 static void
 expand_const_vector (rtx target, rtx src)
 {
@@ -1139,6 +1217,43 @@ expand_const_vector (rtx target, rtx src)
          rtx dup = expand_vector_broadcast (builder.new_mode (), ele);
          emit_move_insn (target, gen_lowpart (mode, dup));
        }
+      /* We don't apply this approach for LMUL = 8 since vrgather.vv doesn't
+        allow the destination to overlap any source register, and a VLA
+        repeating vector always needs an extra addition; such a constant would
+        consume 32 registers at LMUL = 8, causing very high register pressure. */
+      else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT && npatterns > 2)
+       {
+         /* We handle the case that we can't find a vector container to hold
+            element bitsize = NPATTERNS * ele_bitsize.
+
+              NPATTERNS = 8, element width = 16
+                v = { 3, 5, -2, -1, -3, 2, -1, -3, ... }
+              Since NPATTERNS * element width = 128, we can't find a container
+              to hold it.
+
+              In this case, we build the pattern in the low NPATTERNS elements
+              and repeat it with vrgather using index = vid & (NPATTERNS - 1).  */
+
+         /* Generate index = { 0, 1, 2, 3, 4, 5, 6, 7, ... }.  */
+         rtx vid = gen_reg_rtx (builder.int_mode ());
+         expand_vec_series (vid, const0_rtx, const1_rtx);
+         rtx index = gen_reg_rtx (builder.int_mode ());
+         rtx range = gen_int_mode (builder.npatterns () - 1,
+                                   GET_MODE_INNER (builder.int_mode ()));
+         rtx and_ops[] = {index, vid, range};
+         emit_vlmax_insn (code_for_pred_scalar (AND, builder.int_mode ()),
+                          BINARY_OP, and_ops);
+
+         /* Create vector = { 3, 5, -2, -1, -3, 2, -1, -3, X, ... , X }.  */
+         builder.merge_pattern (src);
+         rtx init = gen_reg_rtx (builder.mode ());
+         expand_vector_init_trailing_same_elem (init, builder,
+                                                builder.npatterns () - 1,
+                                                builder.npatterns ());
+
+         /* Shuffle vector with index.  */
+         emit_vlmax_gather_insn (target, gen_lowpart (mode, init), index);
+       }
       else
        {
          /* We handle the case that we can't find a vector containter to hold
@@ -2268,47 +2383,6 @@ expand_vector_init_merge_combine_sequence (rtx target,
   emit_vlmax_insn (icode, MERGE_OP, merge_ops);
 }
 
-/* Subroutine of expand_vec_init to handle case
-   when all trailing elements of builder are same.
-   This works as follows:
-   (a) Use expand_insn interface to broadcast last vector element in TARGET.
-   (b) Insert remaining elements in TARGET using insr.
-
-   ??? The heuristic used is to do above if number of same trailing elements
-   is greater than leading_ndups, loosely based on
-   heuristic from mostly_zeros_p.  May need fine-tuning.  */
-
-static bool
-expand_vector_init_trailing_same_elem (rtx target,
-                                      const rtx_vector_builder &builder,
-                                      int nelts_reqd)
-{
-  int leading_ndups = builder.count_dups (0, nelts_reqd - 1, 1);
-  int trailing_ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
-  machine_mode mode = GET_MODE (target);
-
-  if (trailing_ndups > leading_ndups)
-    {
-      rtx dup = expand_vector_broadcast (mode, builder.elt (nelts_reqd - 1));
-      for (int i = nelts_reqd - trailing_ndups - 1; i >= 0; i--)
-       {
-         unsigned int unspec
-           = FLOAT_MODE_P (mode) ? UNSPEC_VFSLIDE1UP : UNSPEC_VSLIDE1UP;
-         insn_code icode = code_for_pred_slide (unspec, mode);
-         rtx tmp = gen_reg_rtx (mode);
-         rtx ops[] = {tmp, dup, builder.elt (i)};
-         emit_vlmax_insn (icode, BINARY_OP, ops);
-         /* slide1up need source and dest to be different REG.  */
-         dup = tmp;
-       }
-
-      emit_move_insn (target, dup);
-      return true;
-    }
-
-  return false;
-}
-
 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
 
 void
@@ -2378,11 +2452,19 @@ expand_vec_init (rtx target, rtx vals)
 
   /* Optimize trailing same elements sequence:
       v = {y, y2, y3, y4, y5, x, x, x, x, x, x, x, x, x, x, x};  */
-  if (!expand_vector_init_trailing_same_elem (target, v, nelts))
-    /* Handle common situation by vslide1down. This function can handle any
-       situation of vec_init<mode>. Only the cases that are not optimized above
-       will fall through here.  */
-    expand_vector_init_insert_elems (target, v, nelts);
+  int leading_ndups = v.count_dups (0, nelts - 1, 1);
+  int trailing_ndups = v.count_dups (nelts - 1, -1, -1);
+  if (trailing_ndups > leading_ndups)
+    {
+      expand_vector_init_trailing_same_elem (target, v, nelts - trailing_ndups,
+                                            nelts);
+      return;
+    }
+
+  /* Handle common situation by vslide1down. This function can handle any
+     situation of vec_init<mode>. Only the cases that are not optimized above
+     will fall through here.  */
+  expand_vector_init_insert_elems (target, v, nelts);
 }
 
 /* Get insn code for corresponding comparison.  */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
index c7b88635923..d3149052b13 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c
@@ -4,7 +4,7 @@
 #include <stdint-gcc.h>
 
 void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
 {
   for (int i = 0; i < n; ++i)
     {
@@ -19,9 +19,8 @@ foo (uint8_t *restrict a, uint8_t *restrict b, int n)
     }
 }
 
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
index 4d9175e86f8..4c428f33a79 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c
@@ -4,7 +4,7 @@
 #include <stdint-gcc.h>
 
 void
-foo (uint8_t *restrict a, uint8_t *restrict b, int n)
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
 {
   for (int i = 0; i < n; ++i)
     {
@@ -28,9 +28,8 @@ foo (uint8_t *restrict a, uint8_t *restrict b, int n)
     }
 }
 
-/* { dg-final { scan-assembler {e8,m4} } } */
-/* { dg-final { scan-assembler-times {csrr} 1 } } */
-/* { dg-final { scan-tree-dump-not "Maximum lmul = 8" "vect" } } */
+/* { dg-final { scan-assembler {e16,m4} } } */
+/* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 2" "vect" } } */
 /* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
new file mode 100644
index 00000000000..23bf59b0cb3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/slp-1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fno-schedule-insns -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+
+/*
+** foo:
+**   ...
+**   vslide1up\.vx\s+v[0-9]+,\s*v[0-9]+,\s*[a-x0-9]+
+**   ...
+**   vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+**   ...
+**   vle16\.v\s+v8,\s*0\([a-x0-9]+\)
+**   ...
+**   vrgather\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+**   ...
+*/
+void
+foo (uint16_t *restrict a, uint16_t *restrict b, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      a[i * 8] = b[i * 8 + 3] + 1;
+      a[i * 8 + 1] = b[i * 8 + 6] + 1;
+      a[i * 8 + 2] = b[i * 8 + 0] + 1;
+      a[i * 8 + 3] = b[i * 8 + 2] + 1;
+      a[i * 8 + 4] = b[i * 8 + 1] + 1;
+      a[i * 8 + 5] = b[i * 8 + 7] + 1;
+      a[i * 8 + 6] = b[i * 8 + 5] + 1;
+      a[i * 8 + 7] = b[i * 8 + 4] + 1;
+    }
+}
+
+/* { dg-final { scan-assembler-times {vslide1up\.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv} 2 } } */
+/* { dg-final { scan-assembler-not {vmv1r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv2r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv4r\.v} } } */
+/* { dg-final { scan-assembler-not {vmv8r\.v} } } */
-- 
2.36.3
