Re: [PATCH] RISC-V: Fix inferior codegen for vse intrinsics.

2023-01-26 Thread Kito Cheng via Gcc-patches
committed, thanks.


[PATCH] RISC-V: Fix inferior codegen for vse intrinsics.

2022-12-29 Thread juzhe . zhong
From: Ju-Zhe Zhong 

Currently we use pred_mov to do the codegen for vse intrinsics. However, it
generates inferior code when testing the AVL model of the VSETVL pass with vse
intrinsics.

Consider the following code:
void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
{
  vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 1), 19);
  __riscv_vse32_v_f32mf2 ((float *)(out + 1), v, 19);
  vbool64_t mask = *(vbool64_t*)mask_in;
  for (int i = 0; i < n; i++)
    {
      vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
      __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);

      vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);

      vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
      __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);

      vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);

      vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
      __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);

      vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
      __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);

      vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
      __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
    }
}

Before this patch:
csrr    t2,vlenb
srli    t2,t2,1
slli    s0,t2,2
vsetvli zero,19,e16,mf2,ta,ma
sub     s0,s0,t2
csrr    t2,vlenb
vle16.v v24,0(a3)
mv      a4,a3
vse16.v v24,0(a1)
srli    t2,t2,1
add     a2,a3,t6
add     s0,s0,sp
vsetvli zero,19,e32,mf2,ta,ma
addi    a3,a3,4
vle32.v v24,0(a3)
vsetvli zero,t0,e32,mf2,ta,ma
vse32.v v24,0(s0)
slli    s0,t2,2
sub     s0,s0,t2
add     s0,s0,sp
vsetvli t0,zero,e32,mf2,ta,ma
vle32.v v24,0(s0)
mv      s0,t2
slli    t2,t2,2
mv      a5,a1
vsetvli zero,19,e32,mf2,ta,ma
addi    a1,a1,4
sub     t2,t2,s0
vse32.v v24,0(a1)
add     t2,t2,sp
vsetvli t0,zero,e32,mf2,ta,ma
addi    t1,a5,796
vle32.v v24,0(t2)
addi    t5,a4,1196
addi    a7,a5,1196
addi    t4,a4,1996
addi    a6,a5,1996
vsetvli zero,13,e32,mf2,ta,ma
add     a4,a4,t3
vse32.v v24,0(t1)
add     a5,a5,t3
vsetvli zero,11,e64,m1,tu,mu
vle64.v v24,0(t5),v0.t
vse64.v v24,0(a7)
vle64.v v24,0(t4),v0.t
vse64.v v24,0(a6)
vle64.v v24,0(a4),v0.t
vse64.v v24,0(a5),v0.t
vsetvli zero,11,e8,mf4,ta,ma
vle8.v  v24,0(a2)
vse8.v  v24,0(a2)
bne     a0,a3,.L8
csrr    t0,vlenb
slli    t1,t0,1
add     sp,sp,t1
lw      s0,12(sp)
addi    sp,sp,16
jr      ra

We are generating redundant spill code.
Here we introduce a dedicated pred_store pattern for vse intrinsics, similar to
maskstore in ARM SVE.
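
For reference, the rough shape of such a dedicated store pattern in vector.md is
sketched below. This is only an illustrative sketch, not the exact text added by
the patch: the iterators (V, <VM>), UNSPEC_VPREDICATE, VL_REGNUM/VTYPE_REGNUM,
the constraints and the insn attributes follow the general conventions of the
RISC-V vector backend. The key point is that the destination is a memory_operand
and masked-off elements simply keep the old memory contents (match_dup 0), so no
merge value needs to live in a register:

;; Illustrative sketch of a dedicated predicated store insn for vse.
;; Masked-off elements keep the prior memory contents (match_dup 0),
;; so no separate merge operand or stack temporary is required.
(define_insn "@pred_store<mode>"
  [(set (match_operand:V 0 "memory_operand" "+m")
        (if_then_else:V
          (unspec:<VM>
            [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
             (match_operand 3 "vector_length_operand" "rK")
             (reg:SI VL_REGNUM)
             (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
          (match_operand:V 2 "register_operand" "vr")
          (match_dup 0)))]
  "TARGET_VECTOR"
  "vse<sew>.v\t%2,%0%p1"
  [(set_attr "type" "vste")
   (set_attr "mode" "<MODE>")])

With the store expressed this way, the value no longer has to be routed through
pred_mov's register/memory alternatives, which appears to be where the
vlenb/sp arithmetic and spills in the "before" code came from.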

After this patch:
vsetvli zero,19,e16,mf2,ta,ma
mv      a5,a4
vle16.v v24,0(a0)
mv      a3,a0
vse16.v 19,0(a4)
addi    t1,a4,796
vsetvli zero,19,e32,mf2,ta,ma
addi    a0,a0,4
addi    a4,a4,4
vle32.v v24,0(a0)
addi    t0,a3,1196
vse32.v 19,0(a4)
addi    a7,a5,1196
addi    t6,a3,1996
addi    a6,a5,1996
add     t5,a3,t4
vsetvli zero,13,e32,mf2,ta,ma
add     a2,a5,t4
vse32.v 13,0(t1)
add     a3,a3,t3
vsetvli zero,11,e64,m1,tu,mu
add     a5,a5,t3
vle64.v v24,0(t0),v0.t
vse64.v 11,0(a7)
vle64.v v24,0(t6),v0.t
vse64.v 11,0(a6)
vle64.v v24,0(t5),v0.t
vse64.v 11,0(a2),v0.t
vsetvli zero,11,e8,mf4,ta,ma
vle8.v  v24,0(a3)
vse8.v  11,0(a5)
bne     a1,a4,.L8
.L6:
ret

gcc/ChangeLog:

* config/riscv/riscv-vector-builtins-bases.cc (class loadstore): Use
pred_store for vse.
* config/riscv/riscv-vector-builtins.cc
(function_expander::add_mem_operand): Refine function.
(function_expander::use_contiguous_load_insn): Adjust new
implementation.
(function_expander::use_contiguous_store_insn): Ditto.
* config/riscv/riscv-vector-builtins.h: Refine function.
* config/riscv/vector.md (@pred_store): New pattern.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/vse-constraint-1.c: New test.

---
 .../riscv/riscv-vector-builtins-bases.cc  |  2 +-
 gcc/config/riscv/riscv-vector-builtins.cc | 22 +
 gcc/config/riscv/riscv-vector-builtins