[PATCH] RISC-V: Support one more overlap for wv instructions

2023-12-18 Thread Juzhe-Zhong
For 'wv' instructions, e.g. vwadd.wv vd,vs2,vs1.

vs2 has the same EEW as vd.
vs1 has a smaller EEW than vd.

So vs2 can overlap with vd, but vs1 can only overlap the highest-numbered
part of vd, and only when the LMUL of vs1 is at least 1.

We already support overlap for vs1 LMUL >= 1.
But I forgot the vs1 LMUL < 1 case: there, vs2 can still overlap vd even
though vs1 cannot overlap vd at all.

Consider the reduction auto-vectorization:

int64_t
reduc_plus_int (int *__restrict a, int n)
{
  int64_t r = 0;
  for (int i = 0; i < n; ++i)
r += a[i];
  return r;
}

Before this patch:

reduc_plus_int:
ble a1,zero,.L4
vsetvli a5,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a1,e32,mf2,tu,ma
slli a4,a5,2
sub a1,a1,a5
vle32.v v2,0(a0)
vmv1r.v v3,v1  >  This should be removed.
add a0,a0,a4
vwadd.wv v1,v3,v2   >  vs2 should be v1
bne a1,zero,.L3
li  a5,0
vsetivli zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs  v1,v1,v2
vmv.x.s a0,v1
ret
.L4:
li  a0,0
ret

After this patch:

reduc_plus_int:
ble a1,zero,.L4
vsetvli a5,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a1,e32,mf2,tu,ma
slli a4,a5,2
sub a1,a1,a5
vle32.v v2,0(a0)
add a0,a0,a4
vwadd.wv v1,v1,v2
bne a1,zero,.L3
li  a5,0
vsetivli zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs  v1,v1,v2
vmv.x.s a0,v1
ret
.L4:
li  a0,0
ret

PR target/112432

gcc/ChangeLog:

* config/riscv/riscv.md (none,W21,W42,W84,W43,W86,W87): Add W0.
(none,W21,W42,W84,W43,W86,W87,W0): Ditto.
* config/riscv/vector.md: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr112432-42.c: New test.

---
 gcc/config/riscv/riscv.md | 14 +++-
 gcc/config/riscv/vector.md| 84 +--
 .../gcc.target/riscv/rvv/base/pr112432-42.c   | 30 +++
 3 files changed, 82 insertions(+), 46 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112432-42.c

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index eed997116b0..ee8b71c22aa 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -503,7 +503,7 @@
 ;; Widening instructions have group-overlap constraints.  Those are only
 ;; valid for certain register-group sizes.  This attribute marks the
 ;; alternatives not matching the required register-group size as disabled.
-(define_attr "group_overlap" "none,W21,W42,W84,W43,W86,W87"
+(define_attr "group_overlap" "none,W21,W42,W84,W43,W86,W87,W0"
   (const_string "none"))
 
 (define_attr "group_overlap_valid" "no,yes"
@@ -524,9 +524,9 @@
 
  ;; According to RVV ISA:
  ;; The destination EEW is greater than the source EEW, the source 
EMUL is at least 1,
-;; and the overlap is in the highest-numbered part of the destination 
register group
-;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, 
v2, or v4 is not).
-;; So the source operand should have LMUL >= 1.
+ ;; and the overlap is in the highest-numbered part of the destination 
register group
+ ;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, 
v2, or v4 is not).
+ ;; So the source operand should have LMUL >= 1.
  (and (eq_attr "group_overlap" "W43")
  (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) 
!= 4
   && riscv_get_v_regno_alignment (GET_MODE 
(operands[3])) >= 1"))
@@ -536,6 +536,12 @@
  (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) 
!= 8
   && riscv_get_v_regno_alignment (GET_MODE 
(operands[3])) >= 1"))
 (const_string "no")
+
+ ;; W21 supports highest-number overlap for source LMUL = 1.
+ ;; For 'wv' variant, we can also allow wide source operand overlaps 
dest operand.
+ (and (eq_attr "group_overlap" "W0")
+ (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) 
> 1"))
+(const_string "no")
 ]
(const_string "yes")))
 
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 7646615b12a..d475e14a823 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3776,48 +3776,48 @@
(set_attr "group_overlap" 
"W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
 
 (define_insn "@pred_single_widen_sub"
-  [(set (match_operand:VWEXTI 0 "register_operand" "=vd, vr, vd, 
vr, vd, vr, vd, vr, vd, vr, vd, vr, ?&vr, ?&vr")
+  [(set (match_operand:VWEXTI 0 "register_operand" "=vd, vr, 
vd, vr, vd, vr, vd, 

Re: [PATCH] RISC-V: Support one more overlap for wv instructions

2023-12-18 Thread juzhe.zh...@rivai.ai
Updated in V2 with more information in the commit log:
https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640863.html 




juzhe.zh...@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-12-18 18:59
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH] RISC-V: Support one more overlap for wv instructions
For 'wv' instructions, e.g. vwadd.wv vd,vs2,vs1.
 
vs2 has the same EEW as vd.
vs1 has a smaller EEW than vd.
 
So vs2 can overlap with vd, but vs1 can only overlap the highest-numbered
part of vd, and only when the LMUL of vs1 is at least 1.
 
We already support overlap for vs1 LMUL >= 1.
But I forgot the vs1 LMUL < 1 case: there, vs2 can still overlap vd even
though vs1 cannot overlap vd at all.
 
Consider the reduction auto-vectorization:
 
int64_t
reduc_plus_int (int *__restrict a, int n)
{
  int64_t r = 0;
  for (int i = 0; i < n; ++i)
r += a[i];
  return r;
}
 
Before this patch:
 
reduc_plus_int:
ble a1,zero,.L4
vsetvli a5,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a1,e32,mf2,tu,ma
slli a4,a5,2
sub a1,a1,a5
vle32.v v2,0(a0)
vmv1r.v v3,v1  >  This should be removed.
add a0,a0,a4
vwadd.wv v1,v3,v2   >  vs2 should be v1
bne a1,zero,.L3
li  a5,0
vsetivli zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs  v1,v1,v2
vmv.x.s a0,v1
ret
.L4:
li  a0,0
ret
 
After this patch:
 
reduc_plus_int:
ble a1,zero,.L4
vsetvli a5,zero,e64,m1,ta,ma
vmv.v.i v1,0
.L3:
vsetvli a5,a1,e32,mf2,tu,ma
slli a4,a5,2
sub a1,a1,a5
vle32.v v2,0(a0)
add a0,a0,a4
vwadd.wv v1,v1,v2
bne a1,zero,.L3
li a5,0
vsetivli zero,1,e64,m1,ta,ma
vmv.s.x v2,a5
vsetvli a5,zero,e64,m1,ta,ma
vredsum.vs v1,v1,v2
vmv.x.s a0,v1
ret
.L4:
li a0,0
ret
 
PR target/112432
 
gcc/ChangeLog:
 
* config/riscv/riscv.md (none,W21,W42,W84,W43,W86,W87): Add W0.
(none,W21,W42,W84,W43,W86,W87,W0): Ditto.
* config/riscv/vector.md: Ditto.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/pr112432-42.c: New test.
 
---
gcc/config/riscv/riscv.md | 14 +++-
gcc/config/riscv/vector.md| 84 +--
.../gcc.target/riscv/rvv/base/pr112432-42.c   | 30 +++
3 files changed, 82 insertions(+), 46 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr112432-42.c
 
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index eed997116b0..ee8b71c22aa 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -503,7 +503,7 @@
;; Widening instructions have group-overlap constraints.  Those are only
;; valid for certain register-group sizes.  This attribute marks the
;; alternatives not matching the required register-group size as disabled.
-(define_attr "group_overlap" "none,W21,W42,W84,W43,W86,W87"
+(define_attr "group_overlap" "none,W21,W42,W84,W43,W86,W87,W0"
   (const_string "none"))
(define_attr "group_overlap_valid" "no,yes"
@@ -524,9 +524,9 @@
  ;; According to RVV ISA:
  ;; The destination EEW is greater than the source EEW, the source 
EMUL is at least 1,
- ;; and the overlap is in the highest-numbered part of the destination 
register group
- ;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, v2, or 
v4 is not).
- ;; So the source operand should have LMUL >= 1.
+ ;; and the overlap is in the highest-numbered part of the destination 
register group
+ ;; (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of v0, 
v2, or v4 is not).
+ ;; So the source operand should have LMUL >= 1.
  (and (eq_attr "group_overlap" "W43")
  (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 4
   && riscv_get_v_regno_alignment (GET_MODE (operands[3])) >= 1"))
@@ -536,6 +536,12 @@
  (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) != 8
   && riscv_get_v_regno_alignment (GET_MODE (operands[3])) >= 1"))
(const_string "no")
+
+ ;; W21 supports highest-number overlap for source LMUL = 1.
+ ;; For 'wv' variant, we can also allow wide source operand overlaps 
dest operand.
+ (and (eq_attr "group_overlap" "W0")
+   (match_test "riscv_get_v_regno_alignment (GET_MODE (operands[0])) > 1"))
+ (const_string "no")
 ]
(const_string "yes")))
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 7646615b12a..d475e14a823 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -3776,48 +3776,48 @@
(set_attr "group_overlap" 
"W21,W21,W21,W21,W42,W42,W42,W42,W84,W84,W84,W84,none,none")])
(define_insn "@pre