Re: [PATCH] RISC-V: Lower vmv.v.x (avl = 1) into vmv.s.x

2024-01-22 Thread Robin Dapp
LGTM.

Regards
 Robin


[PATCH] RISC-V: Lower vmv.v.x (avl = 1) into vmv.s.x

2024-01-21 Thread Juzhe-Zhong
We noticed that on an AI benchmark GCC shows a 3% performance drop compared with Clang.

This is because Clang/LLVM has a simplification that transforms vmv.v.x (avl = 1)
into vmv.s.x.

vmv.s.x has a more flexible vsetvl demand than vmv.v.x, which gives us
better chances to fuse vsetvl instructions.

Consider the following case:

/* Example from the patch description.  The mask is produced by a splat
   with AVL = 1; that splat is the vmv.v.x which this patch lowers to
   vmv.s.x.  */
void
foo (uint32_t *outputMat, uint32_t *inputMat)
{
  vuint32m1_t matRegIn0 = __riscv_vle32_v_u32m1 (inputMat, 4);
  vuint32m1_t matRegIn1 = __riscv_vle32_v_u32m1 (inputMat + 4, 4);
  vuint32m1_t matRegIn2 = __riscv_vle32_v_u32m1 (inputMat + 8, 4);
  vuint32m1_t matRegIn3 = __riscv_vle32_v_u32m1 (inputMat + 12, 4);

  /* 0xAAAA == 43690, which the assembly materializes as
     li a5,45056 followed by addiw a5,a5,-1366.  */
  vbool32_t oddMask
    = __riscv_vreinterpret_v_u32m1_b32 (__riscv_vmv_v_x_u32m1 (0xAAAA, 1));

  vuint32m1_t smallTransposeMat0
    = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn0, matRegIn1, 1, 4);
  vuint32m1_t smallTransposeMat2
    = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn2, matRegIn3, 1, 4);

  vuint32m1_t outMat0 = __riscv_vslideup_vx_u32m1_tu (smallTransposeMat0,
						      smallTransposeMat2, 2, 4);

  __riscv_vse32_v_u32m1 (outputMat, outMat0, 4);
}

Before this patch:

vsetivli    zero,4,e32,m1,ta,ma
li          a5,45056
addi        a2,a1,16
addi        a3,a1,32
addi        a4,a1,48
vle32.v     v1,0(a1)
vle32.v     v4,0(a2)
vle32.v     v2,0(a3)
vle32.v     v3,0(a4)
addiw       a5,a5,-1366
vsetivli    zero,1,e32,m1,ta,ma
vmv.v.x     v0,a5               ---> Since avl = 1, we can transform it into vmv.s.x
vsetivli    zero,4,e32,m1,tu,mu
vslideup.vi v1,v4,1,v0.t
vslideup.vi v2,v3,1,v0.t
vslideup.vi v1,v2,2
vse32.v     v1,0(a0)
ret

After this patch:

li          a5,45056
addi        a2,a1,16
vsetivli    zero,4,e32,m1,tu,mu
addiw       a5,a5,-1366
vle32.v     v3,0(a2)
addi        a3,a1,32
addi        a4,a1,48
vle32.v     v1,0(a1)
vmv.s.x     v0,a5
vle32.v     v2,0(a3)
vslideup.vi v1,v3,1,v0.t
vle32.v     v3,0(a4)
vslideup.vi v2,v3,1,v0.t
vslideup.vi v1,v2,2
vse32.v     v1,0(a0)
ret

Tested on both RV32 and RV64 with no regressions.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (splat_to_scalar_move_p): New function.
* config/riscv/riscv-v.cc (splat_to_scalar_move_p): Ditto.
* config/riscv/vector.md: Simplify vmv.v.x into vmv.s.x.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/attribute-2.c: New test.
* gcc.target/riscv/rvv/vsetvl/attribute-3.c: New test.

---
 gcc/config/riscv/riscv-protos.h   |  1 +
 gcc/config/riscv/riscv-v.cc   | 12 ++
 gcc/config/riscv/vector.md|  9 -
 .../gcc.target/riscv/rvv/vsetvl/attribute-2.c | 37 +++
 .../gcc.target/riscv/rvv/vsetvl/attribute-3.c | 36 ++
 5 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-3.c

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 7fe26fcd939..b3f0bdb9924 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -708,6 +708,7 @@ bool can_be_broadcasted_p (rtx);
 bool gather_scatter_valid_offset_p (machine_mode);
 HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int);
 bool whole_reg_to_reg_move_p (rtx *, machine_mode, int);
+bool splat_to_scalar_move_p (rtx *);
 }
 
 /* We classify builtin types into two classes:
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 93a1238a5ab..4bacb7fea45 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -5151,4 +5151,16 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, 
int avl_type_index)
   return false;
 }
 
+/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f.  */
+bool
+splat_to_scalar_move_p (rtx *ops)
+{
+  return satisfies_constraint_Wc1 (ops[1])
+&& satisfies_constraint_vu (ops[2])
+&& !MEM_P (ops[3])
+&& satisfies_constraint_c01 (ops[4])
+&& INTVAL (ops[7]) == NONVLMAX
+&& known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3])));
+}
+
 } // namespace riscv_vector
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 307d9a8c952..ab6e099852d 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -1977,8 +1977,15 @@
  (match_operand:V_VLS 2 "vector_merge_operand")))]
   "TARGET_VECTOR"
 {
+  /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f
+ has better chances to do vsetvl fusion in vsetvl pass.  */
+  if (riscv_vector::splat_to_scalar_move_p (operands))
+{
+  operands[1] = riscv_vector::gen_scalar_move_mask (mode);
+  operands[3]