Re: Re: [PATCH] RISC-V: Support COND_LEN_* patterns

2023-07-12 Thread 钟居哲
>> Return true if the operation requires a rounding mode operand.  Maybe also
>>call it needs_fp_rounding?
ok

>>What's FMLA?  That's SVE I suppose and ours is fmacc?
Yes, the comment is misleading; I will fix it soon.


juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-12 22:24
To: Juzhe-Zhong; gcc-patches
CC: rdapp.gcc; kito.cheng; kito.cheng; jeffreyalaw
Subject: Re: [PATCH] RISC-V: Support COND_LEN_* patterns
Hi Juzhe,
 
> +/* Return true if the operation is the floating-point operation need FRM.  */
> +static bool
> +need_frm_p (rtx_code code, machine_mode mode)
> +{
> +  if (!FLOAT_MODE_P (mode))
> +return false;
> +  return code != SMIN && code != SMAX;
> +}
 
Return true if the operation requires a rounding mode operand.  Maybe also
call it needs_fp_rounding?
 
> +  if (need_frm_p (code, mode))
> + emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +  else
> + emit_nonvlmax_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +}
 
This feels like we could decide it inside emit_nonvlmax_tu_insn.
Same for without _tu.  But let's keep it like this for now in
order not to stall progress.
 
> +/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
> +   prefer to use the first arithmetic operand as the else value if
> +   the else value doesn't matter, since that exactly matches the SVE
> +   destructive merging form.  For ternary operations we could either
> +   pick the first operand and use FMAD-like instructions or the last
> +   operand and use FMLA-like instructions; the latter seems more
> +   natural.  */
 
What's FMLA?  That's SVE I suppose and ours is fmacc?
 
Apart from that fine from my side, thanks for supporting this.
 
Regards
Robin
 
 


Re: [PATCH] RISC-V: Support COND_LEN_* patterns

2023-07-12 Thread Robin Dapp via Gcc-patches
Hi Juzhe,

> +/* Return true if the operation is the floating-point operation need FRM.  */
> +static bool
> +need_frm_p (rtx_code code, machine_mode mode)
> +{
> +  if (!FLOAT_MODE_P (mode))
> +return false;
> +  return code != SMIN && code != SMAX;
> +}

Return true if the operation requires a rounding mode operand.  Maybe also
call it needs_fp_rounding?

> +  if (need_frm_p (code, mode))
> + emit_nonvlmax_fp_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +  else
> + emit_nonvlmax_tu_insn (icode, RVV_BINOP_MU, ops, len);
> +}

This feels like we could decide it inside emit_nonvlmax_tu_insn.
Same for without _tu.  But let's keep it like this for now in
order not to stall progress.

> +/* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
> +   prefer to use the first arithmetic operand as the else value if
> +   the else value doesn't matter, since that exactly matches the SVE
> +   destructive merging form.  For ternary operations we could either
> +   pick the first operand and use FMAD-like instructions or the last
> +   operand and use FMLA-like instructions; the latter seems more
> +   natural.  */

What's FMLA?  That's SVE I suppose and ours is fmacc?

Apart from that fine from my side, thanks for supporting this.

Regards
 Robin



Re: [PATCH] RISC-V: Support COND_LEN_* patterns

2023-07-12 Thread 钟居哲
The middle-end vectorizer patch is approved and will soon be merged.

The middle-end dependency is resolved.

Ok for trunk?


juzhe.zh...@rivai.ai
 
From: Juzhe-Zhong
Date: 2023-07-12 12:44
To: gcc-patches
CC: kito.cheng; kito.cheng; jeffreyalaw; rdapp.gcc; Juzhe-Zhong
Subject: [PATCH] RISC-V: Support COND_LEN_* patterns
This patch is depending on the following patch on Vectorizer:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624179.html
 
With this patch, we can handle operations that may trap on elements outside the loop.
 
These 2 following cases will be addressed by this patch:
 
1. integer division:
 
  #define TEST_TYPE(TYPE) \
  __attribute__((noipa)) \
  void vrem_##TYPE (TYPE * __restrict dst, TYPE * __restrict a, TYPE * 
__restrict b, int n) \
  { \
for (int i = 0; i < n; i++) \
  dst[i] = a[i] % b[i]; \
  }
  #define TEST_ALL() \
   TEST_TYPE(int8_t) \
  TEST_ALL()
 
  Before this patch:
 
   vrem_int8_t:
ble a3,zero,.L14
csrrt4,vlenb
addiw   a5,a3,-1
addiw   a4,t4,-1
sext.w  t5,a3
bltua5,a4,.L10
csrrt3,vlenb
subwt3,t5,t3
li  a5,0
vsetvli t6,zero,e8,m1,ta,ma
.L4:
add a6,a2,a5
add a7,a0,a5
add t1,a1,a5
mv  a4,a5
add a5,a5,t4
vl1re8.vv2,0(a6)
vl1re8.vv1,0(t1)
sext.w  a6,a5
vrem.vv v1,v1,v2
vs1r.v  v1,0(a7)
bleua6,t3,.L4
csrra5,vlenb
addwa4,a4,a5
sext.w  a5,a4
beq t5,a4,.L16
.L3:
csrra6,vlenb
subwt5,t5,a4
srlia6,a6,1
addiw   t1,t5,-1
addiw   a7,a6,-1
bltut1,a7,.L9
sllia4,a4,32
srlia4,a4,32
add t0,a1,a4
add t6,a2,a4
add a4,a0,a4
vsetvli a7,zero,e8,mf2,ta,ma
sext.w  t3,a6
vle8.v  v1,0(t0)
vle8.v  v2,0(t6)
subwt4,t5,a6
vrem.vv v1,v1,v2
vse8.v  v1,0(a4)
mv  t1,t3
bltut4,t3,.L7
csrrt1,vlenb
add a4,a4,a6
add t0,t0,a6
add t6,t6,a6
sext.w  t1,t1
vle8.v  v1,0(t0)
vle8.v  v2,0(t6)
vrem.vv v1,v1,v2
vse8.v  v1,0(a4)
.L7:
addwa5,t1,a5
beq t5,t1,.L14
.L9:
add a4,a1,a5
add a6,a2,a5
lb  a6,0(a6)
lb  a4,0(a4)
add a7,a0,a5
addia5,a5,1
remwa4,a4,a6
sext.w  a6,a5
sb  a4,0(a7)
bgt a3,a6,.L9
.L14:
ret
.L10:
li  a4,0
li  a5,0
j   .L3
.L16:
ret
 
After this patch:
 
   vrem_int8_t:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e8,m1,tu,ma
vle8.v v1,0(a1)
vle8.v v2,0(a2)
sub a3,a3,a5
vrem.vv v1,v1,v2
vse8.v v1,0(a0)
add a1,a1,a5
add a2,a2,a5
add a0,a0,a5
bne a3,zero,.L3
.L5:
ret
 
2. Floating-point operation **WITHOUT** -ffast-math:
 
#define TEST_TYPE(TYPE) \
__attribute__((noipa)) \
void vadd_##TYPE (TYPE * __restrict dst, TYPE *__restrict a, TYPE 
*__restrict b, int n) \
{ \
  for (int i = 0; i < n; i++) \
dst[i] = a[i] + b[i]; \
}
 
#define TEST_ALL() \
 TEST_TYPE(float) \
 
TEST_ALL()
   
Before this patch:
   
   vadd_float:
ble a3,zero,.L10
csrra4,vlenb
srlit3,a4,2
addiw   a5,a3,-1
addiw   a6,t3,-1
sext.w  t6,a3
bltua5,a6,.L7
subwt5,t6,t3
mv  t1,a1
mv  a7,a2
mv  a6,a0
li  a5,0
vsetvli t4,zero,e32,m1,ta,ma
.L4:
vl1re32.v   v1,0(t1)
vl1re32.v   v2,0(a7)
addwa5,a5,t3
vfadd.vvv1,v1,v2
vs1r.v  v1,0(a6)
add t1,t1,a4
add a7,a7,a4
add a6,a6,a4
bgeut5,a5,.L4
beq t6,a5,.L10
sext.w  a5,a5
.L3:
sllia4,a5,2
.L6:
add a6,a1,a4
add a7,a2,a4
flw fa4,0(a6)
flw fa5,0(a7)
add a6,a0,a4
addiw   a5,a5,1
fadd.s  fa5,fa5,fa4
addia4,a4,4
fsw fa5,0(a6)
bgt a3,a5,.L6
.L10:
ret
.L7:
li  a5,0
j   .L3
 
After this patch:
 
   vadd_float:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e32,m1,tu,ma
slli a4,a5,2
vle32.v v1,0(a1)
vle32.v v2,0(a2)
sub a3,a3,a5
vfadd.vv v1,v1,v2
vse32.v v1,0(a0)
add a1,a1,a4
add a2,a2,a4
add a0,a0,a4
bne a3,zero,.L3
.L5:
ret
  
gcc/ChangeLog:
 
* config/riscv/autovec.md (cond_len_): New pattern.
* config/riscv/riscv-protos.h (enum insn_type): New enum.
(expand_cond_len_binop): New function.
* config/riscv/riscv-v.cc (emit_nonvlmax_tu_insn): Ditto.
(emit_nonvlmax_fp_tu_insn): Ditto.
(need_frm_p): Ditto.
(expand_cond_len_binop): Ditto.
* config

[PATCH] RISC-V: Support COND_LEN_* patterns

2023-07-11 Thread Juzhe-Zhong
This patch is depending on the following patch on Vectorizer:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/624179.html

With this patch, we can handle operations that may trap on elements outside the loop.

These 2 following cases will be addressed by this patch:

1. integer division:

  #define TEST_TYPE(TYPE)   \
  __attribute__((noipa))\
  void vrem_##TYPE (TYPE * __restrict dst, TYPE * __restrict a, TYPE * 
__restrict b, int n) \
  { \
for (int i = 0; i < n; i++) \
  dst[i] = a[i] % b[i]; \
  }
  #define TEST_ALL()\
   TEST_TYPE(int8_t)\
  TEST_ALL()

  Before this patch:

   vrem_int8_t:
ble a3,zero,.L14
csrrt4,vlenb
addiw   a5,a3,-1
addiw   a4,t4,-1
sext.w  t5,a3
bltua5,a4,.L10
csrrt3,vlenb
subwt3,t5,t3
li  a5,0
vsetvli t6,zero,e8,m1,ta,ma
.L4:
add a6,a2,a5
add a7,a0,a5
add t1,a1,a5
mv  a4,a5
add a5,a5,t4
vl1re8.vv2,0(a6)
vl1re8.vv1,0(t1)
sext.w  a6,a5
vrem.vv v1,v1,v2
vs1r.v  v1,0(a7)
bleua6,t3,.L4
csrra5,vlenb
addwa4,a4,a5
sext.w  a5,a4
beq t5,a4,.L16
.L3:
csrra6,vlenb
subwt5,t5,a4
srlia6,a6,1
addiw   t1,t5,-1
addiw   a7,a6,-1
bltut1,a7,.L9
sllia4,a4,32
srlia4,a4,32
add t0,a1,a4
add t6,a2,a4
add a4,a0,a4
vsetvli a7,zero,e8,mf2,ta,ma
sext.w  t3,a6
vle8.v  v1,0(t0)
vle8.v  v2,0(t6)
subwt4,t5,a6
vrem.vv v1,v1,v2
vse8.v  v1,0(a4)
mv  t1,t3
bltut4,t3,.L7
csrrt1,vlenb
add a4,a4,a6
add t0,t0,a6
add t6,t6,a6
sext.w  t1,t1
vle8.v  v1,0(t0)
vle8.v  v2,0(t6)
vrem.vv v1,v1,v2
vse8.v  v1,0(a4)
.L7:
addwa5,t1,a5
beq t5,t1,.L14
.L9:
add a4,a1,a5
add a6,a2,a5
lb  a6,0(a6)
lb  a4,0(a4)
add a7,a0,a5
addia5,a5,1
remwa4,a4,a6
sext.w  a6,a5
sb  a4,0(a7)
bgt a3,a6,.L9
.L14:
ret
.L10:
li  a4,0
li  a5,0
j   .L3
.L16:
ret

After this patch:

   vrem_int8_t:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e8,m1,tu,ma
vle8.v  v1,0(a1)
vle8.v  v2,0(a2)
sub a3,a3,a5
vrem.vv v1,v1,v2
vse8.v  v1,0(a0)
add a1,a1,a5
add a2,a2,a5
add a0,a0,a5
bne a3,zero,.L3
.L5:
ret

2. Floating-point operation **WITHOUT** -ffast-math:
 
#define TEST_TYPE(TYPE) \
__attribute__((noipa))  \
void vadd_##TYPE (TYPE * __restrict dst, TYPE *__restrict a, TYPE 
*__restrict b, int n) \
{   \
  for (int i = 0; i < n; i++)   \
dst[i] = a[i] + b[i];   \
}

#define TEST_ALL()  \
 TEST_TYPE(float)   \

TEST_ALL()
   
Before this patch:
   
   vadd_float:
ble a3,zero,.L10
csrra4,vlenb
srlit3,a4,2
addiw   a5,a3,-1
addiw   a6,t3,-1
sext.w  t6,a3
bltua5,a6,.L7
subwt5,t6,t3
mv  t1,a1
mv  a7,a2
mv  a6,a0
li  a5,0
vsetvli t4,zero,e32,m1,ta,ma
.L4:
vl1re32.v   v1,0(t1)
vl1re32.v   v2,0(a7)
addwa5,a5,t3
vfadd.vvv1,v1,v2
vs1r.v  v1,0(a6)
add t1,t1,a4
add a7,a7,a4
add a6,a6,a4
bgeut5,a5,.L4
beq t6,a5,.L10
sext.w  a5,a5
.L3:
sllia4,a5,2
.L6:
add a6,a1,a4
add a7,a2,a4
flw fa4,0(a6)
flw fa5,0(a7)
add a6,a0,a4
addiw   a5,a5,1
fadd.s  fa5,fa5,fa4
addia4,a4,4
fsw fa5,0(a6)
bgt a3,a5,.L6
.L10:
ret
.L7:
li  a5,0
j   .L3

After this patch:

   vadd_float:
ble a3,zero,.L5
.L3:
vsetvli a5,a3,e32,m1,tu,ma
sllia4,a5,2
vle32.v v1,0(a1)
vle32.v v2,0(a2)
sub a3,a3,a5
vfadd.vvv1,v1,v2
vse32.v v1,0(a0)
add a1,a1,a4
add a2,a2,a4
add a0,a0,a4
bne a3,zero,.L3
.L5:
ret
  
gcc/ChangeLog:

* config/riscv/autovec.md (cond_len_): New pattern.
* config/riscv/riscv-protos.h (enum insn_type