Re: [PATCH V2] RISC-V: Remove earlyclobber for wx/wf instructions.

2023-11-30 Thread Robin Dapp
OK.

Regards
 Robin



[PATCH V2] RISC-V: Remove earlyclobber for wx/wf instructions.

2023-11-30 Thread Juzhe-Zhong
While working on overlap for widening instructions, I realized that we mark
vwadd.wx/vfwadd.wf as earlyclobber, which is incorrect.

According to the RVV ISA, overlap is allowed when:
"The destination EEW equals the source EEW."

vwadd.vx widens the first source operand (i.e. 2 * source EEW = dest EEW) while
vwadd.wx only widens the second/scalar source operand.

Therefore overlap is legal for wx but not for vx.

Before this patch (heavy spilling):

csrr a5,vlenb
slli a5,a5,1
addi a5,a5,64
vfwadd.wf   v2,v14,fs0
add a5,a5,sp
vs2r.v  v2,0(a5)
vl2re32.v   v2,0(a1)
vfwadd.wf   v14,v12,fs0
vfwadd.wf   v12,v10,fs0
vfwadd.wf   v10,v8,fs0
vfwadd.wf   v8,v6,fs0
vfwadd.wf   v6,v4,fs0
vfwadd.wf   v4,v2,fs0
vfwadd.wf   v2,v16,fs0
vfwadd.wf   v16,v18,fs0
vfwadd.wf   v18,v20,fs0
vfwadd.wf   v20,v22,fs0
vfwadd.wf   v22,v24,fs0
vfwadd.wf   v24,v26,fs0
vfwadd.wf   v26,v28,fs0
vfwadd.wf   v28,v30,fs0
vfwadd.wf   v30,v0,fs0
nop
vsetvli zero,zero,e32,m2,ta,ma
csrr a5,vlenb

After this patch (no spilling):

vfwadd.wf   v16,v16,fs0
vfwadd.wf   v14,v14,fs0
vfwadd.wf   v12,v12,fs0
vfwadd.wf   v10,v10,fs0
vfwadd.wf   v8,v8,fs0
vfwadd.wf   v6,v6,fs0
vfwadd.wf   v4,v4,fs0
vfwadd.wf   v2,v2,fs0
vfwadd.wf   v18,v18,fs0
vfwadd.wf   v20,v20,fs0
vfwadd.wf   v22,v22,fs0
vfwadd.wf   v24,v24,fs0
vfwadd.wf   v26,v26,fs0
vfwadd.wf   v28,v28,fs0
vfwadd.wf   v30,v30,fs0
vfwadd.wf   v0,v0,fs0

Confirmed that the codegen above runs successfully on both SPIKE and QEMU.

PR target/112431

gcc/ChangeLog:

* config/riscv/vector.md: Remove earlyclobber for wx/wf instructions.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr112431-19.c: New test.
* gcc.target/riscv/rvv/base/pr112431-20.c: New test.
* gcc.target/riscv/rvv/base/pr112431-21.c: New test.

---
 gcc/config/riscv/vector.md|   4 +-
 .../gcc.target/riscv/rvv/base/pr112431-19.c   | 103 +
 .../gcc.target/riscv/rvv/base/pr112431-20.c   | 103 +
 .../gcc.target/riscv/rvv/base/pr112431-21.c   | 106 ++
 4 files changed, 314 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-19.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-20.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-21.c

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index e5d62c6e58b..b47b9742b62 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3833,7 +3833,7 @@
(set_attr "mode" "")])
 
 (define_insn 
"@pred_single_widen__scalar"
-  [(set (match_operand:VWEXTI 0 "register_operand"  "=&vr,&vr")
+  [(set (match_operand:VWEXTI 0 "register_operand"   "=vr,   
vr")
(if_then_else:VWEXTI
  (unspec:
[(match_operand: 1 "vector_mask_operand"   
"vmWc1,vmWc1")
@@ -7114,7 +7114,7 @@
(symbol_ref "riscv_vector::get_frm_mode (operands[9])"))])
 
 (define_insn "@pred_single_widen__scalar"
-  [(set (match_operand:VWEXTF 0 "register_operand"  "=&vr,  
&vr")
+  [(set (match_operand:VWEXTF 0 "register_operand"   "=vr,   
vr")
(if_then_else:VWEXTF
  (unspec:
[(match_operand: 1 "vector_mask_operand"   
"vmWc1,vmWc1")
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-19.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-19.c
new file mode 100644
index 000..affe1aaf4f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr112431-19.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */
+
+#include "riscv_vector.h"
+
+size_t __attribute__ ((noinline))
+sumation (size_t sum0, size_t sum1, size_t sum2, size_t sum3, size_t sum4,
+ size_t sum5, size_t sum6, size_t sum7, size_t sum8, size_t sum9,
+ size_t sum10, size_t sum11, size_t sum12, size_t sum13, size_t sum14,
+ size_t sum15)
+{
+  return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7 + sum8 + sum9
++ sum10 + sum11 + sum12 + sum13 + sum14 + sum15;
+}
+
+size_t __attribute__ ((noinline))
+foo (short const *buf, size_t len)
+{
+  size_t sum = 0;
+  size_t vl = 4;
+  const short *it = buf;
+  for (int i = 0; i < len; i++)
+{
+  vint16m2_t v0 = __riscv_vle16_v_i16m2 (it, vl);
+  it += vl;
+  vint16m2_t v1 = __riscv_vle16_v_i16m2 (it, vl);
+  it += vl;
+  vint16m2_t v2 = __riscv_vle16_v_i16m2 (it, vl);
+  it += vl;
+  vint16m2_t v3 = __riscv