Hi,

With the zvbb extension we can emit a widening shift for scatter/gather
index preparation when we need to multiply the offset by 2 and zero extend it.
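
This is what the new path boils down to (a rough sketch; the function
name, register numbers and exact instruction sequences below are made up
for illustration and depend on the offset/data types and LMUL):

  #include <stdint.h>

  /* 16-bit data indexed by unsigned 16-bit offsets:  the offsets must
     be zero extended to 32 bits and shifted left by 1 (scale 2) before
     the indexed load.  */
  void
  f (uint16_t *restrict y, uint16_t *restrict x, uint16_t *restrict idx)
  {
    for (int i = 0; i < 100; i++)
      y[i] = x[idx[i]];
  }

Instead of preparing the index vector with a separate extension and
shift, e.g.

  vzext.vf2  v8,v16
  vsll.vi    v8,v8,1

we can now combine both steps into a single widening shift

  vwsll.vi   v8,v16,1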

The patch also adds vwsll to the mode_idx attribute and removes the
mode from the shift-count operand of the insn pattern.

Regtested on rv64gcv_zvfh_zvbb.

Regards
 Robin

gcc/ChangeLog:

        * config/riscv/riscv-v.cc (expand_gather_scatter): Use vwsll if
        applicable.
        * config/riscv/vector-crypto.md: Remove mode from vwsll shift
        count operand.
        * config/riscv/vector.md: Add vwsll to mode_idx attribute.

gcc/testsuite/ChangeLog:

        * lib/target-supports.exp: Add zvbb.
        * gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c:
        New test.
---
 gcc/config/riscv/riscv-v.cc                   |  42 +++++--
 gcc/config/riscv/vector-crypto.md             |   4 +-
 gcc/config/riscv/vector.md                    |   4 +-
 .../gather-scatter/gather_load_64-12-zvbb.c   | 113 ++++++++++++++++++
 gcc/testsuite/lib/target-supports.exp         |  48 +++++++-
 5 files changed, 193 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 814c5febabe..8b41b9c7774 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -4016,7 +4016,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
 {
   rtx ptr, vec_offset, vec_reg;
   bool zero_extend_p;
-  int scale_log2;
+  int shift;
   rtx mask = ops[5];
   rtx len = ops[6];
   if (is_load)
@@ -4025,7 +4025,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
       ptr = ops[1];
       vec_offset = ops[2];
       zero_extend_p = INTVAL (ops[3]);
-      scale_log2 = exact_log2 (INTVAL (ops[4]));
+      shift = exact_log2 (INTVAL (ops[4]));
     }
   else
     {
@@ -4033,7 +4033,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
       ptr = ops[0];
       vec_offset = ops[1];
       zero_extend_p = INTVAL (ops[2]);
-      scale_log2 = exact_log2 (INTVAL (ops[3]));
+      shift = exact_log2 (INTVAL (ops[3]));
     }
 
   machine_mode vec_mode = GET_MODE (vec_reg);
@@ -4043,9 +4043,12 @@ expand_gather_scatter (rtx *ops, bool is_load)
   poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
   bool is_vlmax = is_vlmax_len_p (vec_mode, len);
 
+  bool use_widening_shift = false;
+
   /* Extend the offset element to address width.  */
   if (inner_offsize < BITS_PER_WORD)
     {
+      use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
       /* 7.2. Vector Load/Store Addressing Modes.
         If the vector offset elements are narrower than XLEN, they are
         zero-extended to XLEN before adding to the ptr effective address. If
@@ -4054,8 +4057,8 @@ expand_gather_scatter (rtx *ops, bool is_load)
         raise an illegal instruction exception if the EEW is not supported for
         offset elements.
 
-        RVV spec only refers to the scale_log == 0 case.  */
-      if (!zero_extend_p || scale_log2 != 0)
+        RVV spec only refers to the shift == 0 case.  */
+      if (!zero_extend_p || shift)
        {
          if (zero_extend_p)
            inner_idx_mode
@@ -4064,19 +4067,32 @@ expand_gather_scatter (rtx *ops, bool is_load)
            inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
          machine_mode new_idx_mode
            = get_vector_mode (inner_idx_mode, nunits).require ();
-         rtx tmp = gen_reg_rtx (new_idx_mode);
-         emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
-                                     zero_extend_p ? true : false));
-         vec_offset = tmp;
+         if (!use_widening_shift)
+           {
+             rtx tmp = gen_reg_rtx (new_idx_mode);
+             emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
+                                         zero_extend_p ? true : false));
+             vec_offset = tmp;
+           }
          idx_mode = new_idx_mode;
        }
     }
 
-  if (scale_log2 != 0)
+  if (shift)
     {
-      rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
-                             gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
-                             OPTAB_DIRECT);
+      rtx tmp;
+      if (!use_widening_shift)
+       tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
+                           gen_int_mode (shift, Pmode), NULL_RTX, 0,
+                           OPTAB_DIRECT);
+      else
+       {
+         tmp = gen_reg_rtx (idx_mode);
+         insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
+         rtx ops[] = {tmp, vec_offset, const1_rtx};
+         emit_vlmax_insn (icode, BINARY_OP, ops);
+       }
+
       vec_offset = tmp;
     }
 
diff --git a/gcc/config/riscv/vector-crypto.md b/gcc/config/riscv/vector-crypto.md
index 24822e2712c..0ddc2f3f3c6 100755
--- a/gcc/config/riscv/vector-crypto.md
+++ b/gcc/config/riscv/vector-crypto.md
@@ -295,7 +295,7 @@ (define_insn "@pred_vwsll<mode>"
        (ashift:VWEXTI
          (zero_extend:VWEXTI
            (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "vr"))
-         (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand"  "vr"))
+         (match_operand:<V_DOUBLE_TRUNC> 4 "vector_shift_operand"  "vrvk"))
        (match_operand:VWEXTI 2 "vector_merge_operand" "0vu")))]
   "TARGET_ZVBB"
   "vwsll.v%o4\t%0,%3,%4%p1"
@@ -316,7 +316,7 @@ (define_insn "@pred_vwsll<mode>_scalar"
        (ashift:VWEXTI
          (zero_extend:VWEXTI
           (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "   vr,    vr"))
-         (match_operand:<VSUBEL> 4 "pmode_reg_or_uimm5_operand" "   rK,    rK"))
+         (match_operand 4 "pmode_reg_or_uimm5_operand"         "   rK,    rK"))
         (match_operand:VWEXTI 2 "vector_merge_operand"           "   vu,    0")))]
   "TARGET_ZVBB"
   "vwsll.v%o4\t%0,%3,%4%p1"
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 248461302dd..c6a3845dc13 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -750,10 +750,10 @@ (define_attr "mode_idx" ""
               (const_int 1)
 
               (eq_attr "type" "vssegte,vmpop,vmffs")
-              (const_int 2)       
+              (const_int 2)
 
               (eq_attr "type" 
"vstux,vstox,vssegts,vssegtux,vssegtox,vfcvtftoi,vfwcvtitof,vfwcvtftoi,
-                               
vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo")
+                               
vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vwsll")
               (const_int 3)
 
               (eq_attr "type" 
"viwalu,viwmul,viwmuladd,vfwalu,vfwmul,vfwmuladd")
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
new file mode 100644
index 00000000000..11a4031f47b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
@@ -0,0 +1,113 @@
+/* { dg-do compile } */
+/* { dg-add-options "riscv_v" } */
+/* { dg-add-options "riscv_zvbb" } */
+/* { dg-additional-options "-fno-vect-cost-model -fdump-tree-vect-details -mrvv-max-lmul=m4" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_LOOP(DATA_TYPE, INDEX_TYPE)                                      \
+  void __attribute__ ((noinline, noclone))                                    \
+  f_##DATA_TYPE##_##INDEX_TYPE (DATA_TYPE *restrict y, DATA_TYPE *restrict x, \
+                               INDEX_TYPE *restrict index)                    \
+  {                                                                           \
+    for (int i = 0; i < 100; ++i)                                             \
+      {                                                                       \
+       y[i * 2] = x[index[i * 2]] + 1;                                        \
+       y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;                                \
+      }                                                                       \
+  }
+
+TEST_LOOP (int8_t, int8_t)
+TEST_LOOP (uint8_t, int8_t)
+TEST_LOOP (int16_t, int8_t)
+TEST_LOOP (uint16_t, int8_t)
+TEST_LOOP (int32_t, int8_t)
+TEST_LOOP (uint32_t, int8_t)
+TEST_LOOP (int64_t, int8_t)
+TEST_LOOP (uint64_t, int8_t)
+TEST_LOOP (_Float16, int8_t)
+TEST_LOOP (float, int8_t)
+TEST_LOOP (double, int8_t)
+TEST_LOOP (int8_t, int16_t)
+TEST_LOOP (uint8_t, int16_t)
+TEST_LOOP (int16_t, int16_t)
+TEST_LOOP (uint16_t, int16_t)
+TEST_LOOP (int32_t, int16_t)
+TEST_LOOP (uint32_t, int16_t)
+TEST_LOOP (int64_t, int16_t)
+TEST_LOOP (uint64_t, int16_t)
+TEST_LOOP (_Float16, int16_t)
+TEST_LOOP (float, int16_t)
+TEST_LOOP (double, int16_t)
+TEST_LOOP (int8_t, int32_t)
+TEST_LOOP (uint8_t, int32_t)
+TEST_LOOP (int16_t, int32_t)
+TEST_LOOP (uint16_t, int32_t)
+TEST_LOOP (int32_t, int32_t)
+TEST_LOOP (uint32_t, int32_t)
+TEST_LOOP (int64_t, int32_t)
+TEST_LOOP (uint64_t, int32_t)
+TEST_LOOP (_Float16, int32_t)
+TEST_LOOP (float, int32_t)
+TEST_LOOP (double, int32_t)
+TEST_LOOP (int8_t, int64_t)
+TEST_LOOP (uint8_t, int64_t)
+TEST_LOOP (int16_t, int64_t)
+TEST_LOOP (uint16_t, int64_t)
+TEST_LOOP (int32_t, int64_t)
+TEST_LOOP (uint32_t, int64_t)
+TEST_LOOP (int64_t, int64_t)
+TEST_LOOP (uint64_t, int64_t)
+TEST_LOOP (_Float16, int64_t)
+TEST_LOOP (float, int64_t)
+TEST_LOOP (double, int64_t)
+TEST_LOOP (int8_t, uint8_t)
+TEST_LOOP (uint8_t, uint8_t)
+TEST_LOOP (int16_t, uint8_t)
+TEST_LOOP (uint16_t, uint8_t)
+TEST_LOOP (int32_t, uint8_t)
+TEST_LOOP (uint32_t, uint8_t)
+TEST_LOOP (int64_t, uint8_t)
+TEST_LOOP (uint64_t, uint8_t)
+TEST_LOOP (_Float16, uint8_t)
+TEST_LOOP (float, uint8_t)
+TEST_LOOP (double, uint8_t)
+TEST_LOOP (int8_t, uint16_t)
+TEST_LOOP (uint8_t, uint16_t)
+TEST_LOOP (int16_t, uint16_t)
+TEST_LOOP (uint16_t, uint16_t)
+TEST_LOOP (int32_t, uint16_t)
+TEST_LOOP (uint32_t, uint16_t)
+TEST_LOOP (int64_t, uint16_t)
+TEST_LOOP (uint64_t, uint16_t)
+TEST_LOOP (_Float16, uint16_t)
+TEST_LOOP (float, uint16_t)
+TEST_LOOP (double, uint16_t)
+TEST_LOOP (int8_t, uint32_t)
+TEST_LOOP (uint8_t, uint32_t)
+TEST_LOOP (int16_t, uint32_t)
+TEST_LOOP (uint16_t, uint32_t)
+TEST_LOOP (int32_t, uint32_t)
+TEST_LOOP (uint32_t, uint32_t)
+TEST_LOOP (int64_t, uint32_t)
+TEST_LOOP (uint64_t, uint32_t)
+TEST_LOOP (_Float16, uint32_t)
+TEST_LOOP (float, uint32_t)
+TEST_LOOP (double, uint32_t)
+TEST_LOOP (int8_t, uint64_t)
+TEST_LOOP (uint8_t, uint64_t)
+TEST_LOOP (int16_t, uint64_t)
+TEST_LOOP (uint16_t, uint64_t)
+TEST_LOOP (int32_t, uint64_t)
+TEST_LOOP (uint32_t, uint64_t)
+TEST_LOOP (int64_t, uint64_t)
+TEST_LOOP (uint64_t, uint64_t)
+TEST_LOOP (_Float16, uint64_t)
+TEST_LOOP (float, uint64_t)
+TEST_LOOP (double, uint64_t)
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 88 "vect" } } */
+/* { dg-final { scan-tree-dump " \.MASK_LEN_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-assembler "vwsll.vi" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 3a55b2a4159..999e2e974ef 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1965,6 +1965,17 @@ proc check_effective_target_riscv_zbb { } {
     }]
 }
 
+# Return 1 if the target arch supports the Zvbb extension, 0 otherwise.
+# Cache the result.
+
+proc check_effective_target_riscv_zvbb { } {
+    return [check_no_compiler_messages riscv_ext_zvbb assembly {
+       #ifndef __riscv_zvbb
+       #error "Not __riscv_zvbb"
+       #endif
+    }]
+}
+
 # Return 1 if the target arch supports the XTheadVector extension, 0 otherwise.
 # Cache the result.
 
@@ -2053,10 +2064,33 @@ proc check_effective_target_riscv_zvfh_ok { } {
     return 0
 }
 
+proc check_effective_target_riscv_zvbb_ok { } {
+    # If the target already supports zvbb without any added options,
+    # we may assume we can execute just fine.
+    if { [check_effective_target_riscv_zvbb] } {
+       return 1
+    }
+
+    # check if we can execute vector insns with the given hardware or
+    # simulator
+    set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &zvbb]
+    if { [check_runtime ${gcc_march}_exec {
+       int main()
+       {
+           asm ("vsetivli zero,8,e16,m1,ta,ma");
+           asm ("vwsll.vi v8,v16,2" : : : "v8");
+           return 0;
+       } } "-march=${gcc_march}"] } {
+           return 1
+       }
+
+    return 0
+}
+
 proc riscv_get_arch { } {
     set gcc_march ""
     # ??? do we neeed to add more extensions to the list below?
-    foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvfh ztso } {
+    foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvbb zvfh ztso } {
        if { [check_no_compiler_messages  riscv_ext_$ext assembly [string map [list DEF __riscv_$ext] {
                #ifndef DEF
                #error "Not DEF"
@@ -2151,6 +2185,18 @@ proc add_options_for_riscv_zvfh { flags } {
     return "$flags -march=[riscv_get_arch]_zvfh"
 }
 
+proc add_options_for_riscv_zvbb { flags } {
+    if { [lsearch $flags -march=*] >= 0 } {
+       # If there are multiple -march flags, we have to adjust all of them.
+       set flags [regsub -all -- {(?:^|[[:space:]])-march=[[:alnum:]_.]*} $flags &_zvbb ]
+       return [regsub -all -- {((?:^|[[:space:]])-march=[[:alnum:]_.]*_zvbb[[:alnum:]_.]*)_zvbb} $flags \\1 ]
+    }
+    if { [check_effective_target_riscv_zvbb] } {
+       return "$flags"
+    }
+    return "$flags -march=[riscv_get_arch]_zvbb"
+}
+
 # Return 1 if the target OS supports running SSE executables, 0
 # otherwise.  Cache the result.
 
-- 
2.45.0
