Re: [PATCH] RISC-V: Fix inferior codegen for vse intrinsics.
committed, thanks. On Thu, Dec 29, 2022 at 11:34 PM wrote: > From: Ju-Zhe Zhong > > Currently we use pred_mov to do the codegen for vse intrinsics. > However, it > generates inferior codegen when I am testing AVL model of VSETVL PASS > using vse intrinsics. > > Consider the following code: > void f2 (int * restrict in, int * restrict out, void * restrict mask_in, > int n) > { > vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 1), 19); > __riscv_vse32_v_f32mf2 ((float *)(out + 1), v, 19); > vbool64_t mask = *(vbool64_t*)mask_in; > for (int i = 0; i < n; i++) > { > vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), > 19); > __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19); > > vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), > 19); > __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19); > > vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t > *)(in + i + 200), 13); > __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13); > > vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + > 300), 11); > __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11); > > vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in > + i + 500), 11); > __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11); > > vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in > + i + 600), 11); > __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11); > > vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), > 11); > __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11); > } > } > > Before this patch: > csrrt2,vlenb > srlit2,t2,1 > sllis0,t2,2 > vsetvli zero,19,e16,mf2,ta,ma > sub s0,s0,t2 > csrrt2,vlenb > vle16.v v24,0(a3) > mv a4,a3 > vse16.v v24,0(a1) > srlit2,t2,1 > add a2,a3,t6 > add s0,s0,sp > vsetvli zero,19,e32,mf2,ta,ma > addia3,a3,4 > vle32.v v24,0(a3) > vsetvli zero,t0,e32,mf2,ta,ma > vse32.v v24,0(s0) > sllis0,t2,2 > sub s0,s0,t2 > add 
s0,s0,sp > vsetvli t0,zero,e32,mf2,ta,ma > vle32.v v24,0(s0) > mv s0,t2 > sllit2,t2,2 > mv a5,a1 > vsetvli zero,19,e32,mf2,ta,ma > addia1,a1,4 > sub t2,t2,s0 > vse32.v v24,0(a1) > add t2,t2,sp > vsetvli t0,zero,e32,mf2,ta,ma > addit1,a5,796 > vle32.v v24,0(t2) > addit5,a4,1196 > addia7,a5,1196 > addit4,a4,1996 > addia6,a5,1996 > vsetvli zero,13,e32,mf2,ta,ma > add a4,a4,t3 > vse32.v v24,0(t1) > add a5,a5,t3 > vsetvli zero,11,e64,m1,tu,mu > vle64.v v24,0(t5),v0.t > vse64.v v24,0(a7) > vle64.v v24,0(t4),v0.t > vse64.v v24,0(a6) > vle64.v v24,0(a4),v0.t > vse64.v v24,0(a5),v0.t > vsetvli zero,11,e8,mf4,ta,ma > vle8.v v24,0(a2) > vse8.v v24,0(a2) > bne a0,a3,.L8 > csrrt0,vlenb > sllit1,t0,1 > add sp,sp,t1 > lw s0,12(sp) > addisp,sp,16 > jr ra > > We are generating redundant spilling codes. > Here we introduce a dedicated pred_store pattern for vse intrinsics like > maskstore in ARM SVE. > > After this patch: > vsetvli zero,19,e16,mf2,ta,ma > mv a5,a4 > vle16.v v24,0(a0) > mv a3,a0 > vse16.v 19,0(a4) > addit1,a4,796 > vsetvli zero,19,e32,mf2,ta,ma > addia0,a0,4 > addia4,a4,4 > vle32.v v24,0(a0) > addit0,a3,1196 > vse32.v 19,0(a4) > addia7,a5,1196 > addit6,a3,1996 > addia6,a5,1996 > add t5,a3,t4 > vsetvli zero,13,e32,mf2,ta,ma > add a2,a5,t4 > vse32.v 13,0(t1) > add a3,a3,t3 > vsetvli zero,11,e64,m1,tu,mu > add a5,a5,t3 > vle64.v v24,0(t0),v0.t > vse64.v 11,0(a7) > vle64.v v24,0(t6),v0.t > vse64.v 11,0(a6) > vle64.v v24,0(t5),v0.t > vse64.v 11,0(a2),v0.t > vsetvli zero,11,e8,mf4,ta,ma > vle8.v v24,0(a3) > vse8.v 11,0(a5) > bne a1,a4,.L8 > .L6: > ret > > gcc/ChangeLog: > > * config/riscv/riscv-vector-builtins-bases.cc (class loadstore): > use pred_store for vse. > * config/riscv/riscv-vector-builtins.cc > (function_expander::add_mem_operand): Refine function. > (function_expander::use_contiguous_load_insn): Adjust new > implementation. > (function_expander::use_contiguous_store_insn): Ditto. > * config/riscv/
[PATCH] RISC-V: Fix inferior codegen for vse intrinsics.
From: Ju-Zhe Zhong Currently we use pred_mov to do the codegen for vse intrinsics. However, it generates inferior codegen when I am testing AVL model of VSETVL PASS using vse intrinsics. Consider the following code: void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n) { vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 1), 19); __riscv_vse32_v_f32mf2 ((float *)(out + 1), v, 19); vbool64_t mask = *(vbool64_t*)mask_in; for (int i = 0; i < n; i++) { vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19); __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19); vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19); __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19); vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13); __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13); vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11); __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11); vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11); __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11); vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11); __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11); vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11); __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11); } } Before this patch: csrrt2,vlenb srlit2,t2,1 sllis0,t2,2 vsetvli zero,19,e16,mf2,ta,ma sub s0,s0,t2 csrrt2,vlenb vle16.v v24,0(a3) mv a4,a3 vse16.v v24,0(a1) srlit2,t2,1 add a2,a3,t6 add s0,s0,sp vsetvli zero,19,e32,mf2,ta,ma addia3,a3,4 vle32.v v24,0(a3) vsetvli zero,t0,e32,mf2,ta,ma vse32.v v24,0(s0) sllis0,t2,2 sub s0,s0,t2 add s0,s0,sp vsetvli t0,zero,e32,mf2,ta,ma vle32.v v24,0(s0) mv s0,t2 sllit2,t2,2 mv a5,a1 vsetvli zero,19,e32,mf2,ta,ma addia1,a1,4 sub t2,t2,s0 vse32.v v24,0(a1) add t2,t2,sp vsetvli 
t0,zero,e32,mf2,ta,ma addit1,a5,796 vle32.v v24,0(t2) addit5,a4,1196 addia7,a5,1196 addit4,a4,1996 addia6,a5,1996 vsetvli zero,13,e32,mf2,ta,ma add a4,a4,t3 vse32.v v24,0(t1) add a5,a5,t3 vsetvli zero,11,e64,m1,tu,mu vle64.v v24,0(t5),v0.t vse64.v v24,0(a7) vle64.v v24,0(t4),v0.t vse64.v v24,0(a6) vle64.v v24,0(a4),v0.t vse64.v v24,0(a5),v0.t vsetvli zero,11,e8,mf4,ta,ma vle8.v v24,0(a2) vse8.v v24,0(a2) bne a0,a3,.L8 csrrt0,vlenb sllit1,t0,1 add sp,sp,t1 lw s0,12(sp) addisp,sp,16 jr ra We are generating redundant spilling codes. Here we introduce a dedicated pred_store pattern for vse intrinsics like maskstore in ARM SVE. After this patch: vsetvli zero,19,e16,mf2,ta,ma mv a5,a4 vle16.v v24,0(a0) mv a3,a0 vse16.v 19,0(a4) addit1,a4,796 vsetvli zero,19,e32,mf2,ta,ma addia0,a0,4 addia4,a4,4 vle32.v v24,0(a0) addit0,a3,1196 vse32.v 19,0(a4) addia7,a5,1196 addit6,a3,1996 addia6,a5,1996 add t5,a3,t4 vsetvli zero,13,e32,mf2,ta,ma add a2,a5,t4 vse32.v 13,0(t1) add a3,a3,t3 vsetvli zero,11,e64,m1,tu,mu add a5,a5,t3 vle64.v v24,0(t0),v0.t vse64.v 11,0(a7) vle64.v v24,0(t6),v0.t vse64.v 11,0(a6) vle64.v v24,0(t5),v0.t vse64.v 11,0(a2),v0.t vsetvli zero,11,e8,mf4,ta,ma vle8.v v24,0(a3) vse8.v 11,0(a5) bne a1,a4,.L8 .L6: ret gcc/ChangeLog: * config/riscv/riscv-vector-builtins-bases.cc (class loadstore): use pred_store for vse. * config/riscv/riscv-vector-builtins.cc (function_expander::add_mem_operand): Refine function. (function_expander::use_contiguous_load_insn): Adjust new implementation. (function_expander::use_contiguous_store_insn): Ditto. * config/riscv/riscv-vector-builtins.h: Refine function. * config/riscv/vector.md (@pred_store): New pattern. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/vse-constraint-1.c: New test. --- .../riscv/riscv-vector-builtins-bases.cc | 2 +- gcc/config/riscv/riscv-vector-builtins.cc | 22 + gcc/config/riscv/riscv-vector-builtins