https://gcc.gnu.org/g:a424318d32103dde827e8507fa27d24d33407ec9

commit r15-1587-ga424318d32103dde827e8507fa27d24d33407ec9
Author: Sergei Lewis <sle...@rivosinc.com>
Date:   Mon Jun 24 14:20:14 2024 -0600

    [PATCH v2 2/3] RISC-V: setmem for RISCV with V extension
    
    This is primarily Sergei's work, my contributions were limited to
    merging his expander with the one that's on the trunk, allowing
    non-constant value and trivial testsuite adjustments due to option renaming.
    
    I'm doing setmem first because it's the easiest.  The others will follow
    soon enough.
    
    I've tested this in my system, waiting on pre-commit CI to render its
    verdict before moving forward.
    
    gcc/ChangeLog
    
            * config/riscv/riscv-protos.h (riscv_vector::expand_vec_setmem): New
            function declaration.
    
            * config/riscv/riscv-string.cc (riscv_vector::expand_vec_setmem): 
New
            function: this generates an inline vectorised memory set, if and 
only if
            we know the entire operation can be performed in a single vector 
store.
    
            * config/riscv/riscv.md (setmem<mode>): Try 
riscv_vector::expand_vec_setmem
            for constant lengths.  Do not require operand 2 to be a constant.
    
    gcc/testsuite/ChangeLog
    
            * gcc.target/riscv/rvv/base/setmem-1.c: New tests
            * gcc.target/riscv/rvv/base/setmem-2.c: New tests
            * gcc.target/riscv/rvv/base/setmem-3.c: New tests

Diff:
---
 gcc/config/riscv/riscv-protos.h                    |   1 +
 gcc/config/riscv/riscv-string.cc                   |  89 ++++++++++++++++++
 gcc/config/riscv/riscv.md                          |   9 +-
 gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c | 103 +++++++++++++++++++++
 gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c |  51 ++++++++++
 gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c |  69 ++++++++++++++
 6 files changed, 320 insertions(+), 2 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index d6473d0cd85..a3380d4250d 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -678,6 +678,7 @@ void expand_popcount (rtx *);
 void expand_rawmemchr (machine_mode, rtx, rtx, rtx, bool = false);
 bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool);
 void emit_vec_extract (rtx, rtx, rtx);
+bool expand_vec_setmem (rtx, rtx, rtx);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 4702001bd9b..1ddebdcee3f 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1516,4 +1516,93 @@ expand_strcmp (rtx result, rtx src1, rtx src2, rtx 
nbytes,
   return true;
 }
 
+/* Check we are permitted to vectorise a memory operation.
+   If so, return true and populate lmul_out.
+   Otherwise, return false and leave lmul_out unchanged.  */
+static bool
+check_vectorise_memory_operation (rtx length_in, HOST_WIDE_INT &lmul_out)
+{
+  /* If we either can't or have been asked not to vectorise, respect this.  */
+  if (!TARGET_VECTOR)
+    return false;
+  if (!(stringop_strategy & STRATEGY_VECTOR))
+    return false;
+
+  /* If we can't reason about the length, don't vectorise.  */
+  if (!CONST_INT_P (length_in))
+    return false;
+
+  HOST_WIDE_INT length = INTVAL (length_in);
+
+  /* If it's tiny, default operation is likely better; maybe worth
+     considering fractional lmul in the future as well.  */
+  if (length < (TARGET_MIN_VLEN / 8))
+    return false;
+
+  /* If we've been asked to use a specific LMUL,
+     check the operation fits and do that.  */
+  if (rvv_max_lmul != RVV_DYNAMIC)
+    {
+      lmul_out = TARGET_MAX_LMUL;
+      return (length <= ((TARGET_MAX_LMUL * TARGET_MIN_VLEN) / 8));
+    }
+
+  /* Find smallest lmul large enough for entire op.  */
+  HOST_WIDE_INT lmul = 1;
+  while ((lmul <= 8) && (length > ((lmul * TARGET_MIN_VLEN) / 8)))
+    {
+      lmul <<= 1;
+    }
+
+  if (lmul > 8)
+    return false;
+
+  lmul_out = lmul;
+  return true;
+}
+
+/* Used by setmemdi in riscv.md.  */
+bool
+expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
+{
+  HOST_WIDE_INT lmul;
+  /* Check we are able and allowed to vectorise this operation;
+     bail if not.  */
+  if (!check_vectorise_memory_operation (length_in, lmul))
+    return false;
+
+  machine_mode vmode
+      = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul)
+           .require ();
+  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
+  rtx dst = change_address (dst_in, vmode, dst_addr);
+
+  rtx fill_value = gen_reg_rtx (vmode);
+  rtx broadcast_ops[] = { fill_value, fill_value_in };
+
+  /* If the length is exactly vlmax for the selected mode, do that.
+     Otherwise, use a predicated store.  */
+  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_vlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
+                         broadcast_ops);
+      emit_move_insn (dst, fill_value);
+    }
+  else
+    {
+      if (!satisfies_constraint_K (length_in))
+             length_in = force_reg (Pmode, length_in);
+      emit_nonvlmax_insn (code_for_pred_broadcast (vmode), UNARY_OP,
+                         broadcast_ops, length_in);
+      machine_mode mask_mode
+             = riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (vmode))
+                     .require ();
+      rtx mask = CONSTM1_RTX (mask_mode);
+      emit_insn (gen_pred_store (vmode, dst, mask, fill_value, length_in,
+                         get_avl_type_rtx (riscv_vector::NONVLMAX)));
+    }
+
+  return true;
+}
+
 }
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 7a9454de430..78cf83c9252 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2697,11 +2697,16 @@
 
 (define_expand "setmem<mode>"
   [(parallel [(set (match_operand:BLK 0 "memory_operand")
-                  (match_operand:QI 2 "const_int_operand"))
+                  (match_operand:QI 2 "nonmemory_operand"))
              (use (match_operand:P 1 ""))
              (use (match_operand:SI 3 "const_int_operand"))])]
  ""
- {
+{
+  /* If TARGET_VECTOR is false, this routine will return false and we will
+     try scalar expansion.  */
+  if (riscv_vector::expand_vec_setmem (operands[0], operands[1], operands[2]))
+    DONE;
+
   /* If value to set is not zero, use the library routine.  */
   if (operands[2] != const0_rtx)
     FAIL;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
new file mode 100644
index 00000000000..22844ff348c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-1.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=dynamic" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Tiny memsets should use scalar ops.
+** f1:
+**  sb\s+a1,0\(a0\)
+**  ret
+*/
+void *
+f1 (void *a, int const b)
+{
+  return __builtin_memset (a, b, 1);
+}
+
+/* Tiny memsets should use scalar ops.
+** f2:
+**  sb\s+a1,0\(a0\)
+**  sb\s+a1,1\(a0\)
+**  ret
+*/
+void *
+f2 (void *a, int const b)
+{
+  return __builtin_memset (a, b, 2);
+}
+
+/* Tiny memsets should use scalar ops.
+** f3:
+**  sb\s+a1,0\(a0\)
+**  sb\s+a1,1\(a0\)
+**  sb\s+a1,2\(a0\)
+**  ret
+*/
+void *
+f3 (void *a, int const b)
+{
+  return __builtin_memset (a, b, 3);
+}
+
+/* Vectorise+inline minimum vector register width with LMUL=1
+** f4:
+**  (
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
+**  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void *
+f4 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Vectorised code should use smallest lmul known to fit length
+** f5:
+**  (
+**  vsetivli\s+zero,\d+,e8,m2,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m2,ta,ma
+**  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void *
+f5 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES + 1);
+}
+
+/* Vectorise+inline up to LMUL=8
+** f6:
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void *
+f6 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8);
+}
+
+/* Don't vectorise if the move is too large for one operation.
+** f7:
+**  li\s+a2,\d+
+**  tail\s+memset
+*/
+void *
+f7 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
new file mode 100644
index 00000000000..faea442a4bd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Small memsets shouldn't be vectorised.
+** f1:
+**  (
+**  sb\s+a1,0\(a0\)
+**  ...
+**  |
+**  li\s+a2,\d+
+**  tail\s+memset
+**  )
+*/
+void *
+f1 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width using requested lmul.
+** f2:
+**  (
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
+**  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void *
+f2 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Don't vectorise if the move is too large for requested lmul.
+** f3:
+**  li\s+a2,\d+
+**  tail\s+memset
+*/
+void *
+f3 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES + 1);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
new file mode 100644
index 00000000000..25be694d248
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/setmem-3.c
@@ -0,0 +1,69 @@
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Small memsets shouldn't be vectorised.
+** f1:
+**  (
+**  sb\s+a1,0\(a0\)
+**  ...
+**  |
+**  li\s+a2,\d+
+**  tail\s+memset
+**  )
+*/
+void *
+f1 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width using requested lmul.
+** f2:
+**  (
+**  vsetivli\s+zero,\d+,e8,m8,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void *
+f2 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Vectorise+inline operations up to requested lmul.
+** f3:
+**  (
+**  vsetivli\s+zero,\d+,e8,m8,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  )
+**  vmv\.v\.x\s+v\d+,a1
+**  vse8\.v\s+v\d+,0\(a0\)
+**  ret
+*/
+void *
+f3 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8);
+}
+
+/* Don't vectorise if the move is too large for requested lmul.
+** f4:
+**  li\s+a2,\d+
+**  tail\s+memset
+*/
+void *
+f4 (void *a, int const b)
+{
+  return __builtin_memset (a, b, MIN_VECTOR_BYTES * 8 + 1);
+}

Reply via email to