https://gcc.gnu.org/g:309ee005aa871286c8daccbce7586f82be347440
commit r15-954-g309ee005aa871286c8daccbce7586f82be347440
Author: Robin Dapp <rd...@ventanamicro.com>
Date:   Fri May 10 13:37:03 2024 +0200

    RISC-V: Use widening shift for scatter/gather if applicable.
    
    With the zvbb extension we can emit a widening shift for scatter/gather
    index preparation in case we need to multiply by 2 and zero extend.
    
    The patch also adds vwsll to the mode_idx attribute and removes the mode
    from the shift-count operand of the insn pattern.
    
    gcc/ChangeLog:
    
            * config/riscv/riscv-v.cc (expand_gather_scatter): Use vwsll if
            applicable.
            * config/riscv/vector-crypto.md: Remove mode from vwsll shift-count
            operand.
            * config/riscv/vector.md: Add vwsll to mode_idx attribute.
    
    gcc/testsuite/ChangeLog:
    
            * lib/target-supports.exp: Add zvbb.
            * gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c:
            New test.

Diff:
---
 gcc/config/riscv/riscv-v.cc                        |  42 +++++---
 gcc/config/riscv/vector-crypto.md                  |   4 +-
 gcc/config/riscv/vector.md                         |   4 +-
 .../gather-scatter/gather_load_64-12-zvbb.c        | 113 +++++++++++++++++++++
 gcc/testsuite/lib/target-supports.exp              |  48 ++++++++-
 5 files changed, 193 insertions(+), 18 deletions(-)

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index f105f470495..9428beca268 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -4016,7 +4016,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
 {
   rtx ptr, vec_offset, vec_reg;
   bool zero_extend_p;
-  int scale_log2;
+  int shift;
   rtx mask = ops[5];
   rtx len = ops[6];
   if (is_load)
@@ -4025,7 +4025,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
       ptr = ops[1];
       vec_offset = ops[2];
       zero_extend_p = INTVAL (ops[3]);
-      scale_log2 = exact_log2 (INTVAL (ops[4]));
+      shift = exact_log2 (INTVAL (ops[4]));
     }
   else
     {
@@ -4033,7 +4033,7 @@ expand_gather_scatter (rtx *ops, bool is_load)
       ptr = ops[0];
       vec_offset = ops[1];
       zero_extend_p = INTVAL (ops[2]);
-      scale_log2 = exact_log2 (INTVAL (ops[3]));
+      shift = exact_log2 (INTVAL (ops[3]));
     }
 
   machine_mode vec_mode = GET_MODE (vec_reg);
@@ -4043,9 +4043,12 @@ expand_gather_scatter (rtx *ops, bool is_load)
   poly_int64 nunits = GET_MODE_NUNITS (vec_mode);
   bool is_vlmax = is_vlmax_len_p (vec_mode, len);
 
+  bool use_widening_shift = false;
+
   /* Extend the offset element to address width.  */
   if (inner_offsize < BITS_PER_WORD)
     {
+      use_widening_shift = TARGET_ZVBB && zero_extend_p && shift == 1;
       /* 7.2. Vector Load/Store Addressing Modes.
         If the vector offset elements are narrower than XLEN, they are
         zero-extended to XLEN before adding to the ptr effective address.  If
@@ -4054,8 +4057,8 @@ expand_gather_scatter (rtx *ops, bool is_load)
         raise an illegal instruction exception if the EEW is not supported for
         offset elements.
 
-        RVV spec only refers to the scale_log == 0 case.  */
-      if (!zero_extend_p || scale_log2 != 0)
+        RVV spec only refers to the shift == 0 case.  */
+      if (!zero_extend_p || shift)
        {
          if (zero_extend_p)
            inner_idx_mode
@@ -4064,19 +4067,32 @@ expand_gather_scatter (rtx *ops, bool is_load)
            inner_idx_mode = int_mode_for_size (BITS_PER_WORD, 0).require ();
          machine_mode new_idx_mode
            = get_vector_mode (inner_idx_mode, nunits).require ();
-         rtx tmp = gen_reg_rtx (new_idx_mode);
-         emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
-                                     zero_extend_p ? true : false));
-         vec_offset = tmp;
+         if (!use_widening_shift)
+           {
+             rtx tmp = gen_reg_rtx (new_idx_mode);
+             emit_insn (gen_extend_insn (tmp, vec_offset, new_idx_mode, idx_mode,
+                                         zero_extend_p ? true : false));
+             vec_offset = tmp;
+           }
          idx_mode = new_idx_mode;
        }
     }
 
-  if (scale_log2 != 0)
+  if (shift)
     {
-      rtx tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
-                             gen_int_mode (scale_log2, Pmode), NULL_RTX, 0,
-                             OPTAB_DIRECT);
+      rtx tmp;
+      if (!use_widening_shift)
+       tmp = expand_binop (idx_mode, ashl_optab, vec_offset,
+                           gen_int_mode (shift, Pmode), NULL_RTX, 0,
+                           OPTAB_DIRECT);
+      else
+       {
+         tmp = gen_reg_rtx (idx_mode);
+         insn_code icode = code_for_pred_vwsll_scalar (idx_mode);
+         rtx ops[] = {tmp, vec_offset, const1_rtx};
+         emit_vlmax_insn (icode, BINARY_OP, ops);
+       }
+
       vec_offset = tmp;
     }

diff --git a/gcc/config/riscv/vector-crypto.md b/gcc/config/riscv/vector-crypto.md
index 24822e2712c..0ddc2f3f3c6 100755
--- a/gcc/config/riscv/vector-crypto.md
+++ b/gcc/config/riscv/vector-crypto.md
@@ -295,7 +295,7 @@
            (ashift:VWEXTI
              (zero_extend:VWEXTI
                (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" "vr"))
-             (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand" "vr"))
+             (match_operand:<V_DOUBLE_TRUNC> 4 "vector_shift_operand" "vrvk"))
            (match_operand:VWEXTI 2 "vector_merge_operand" "0vu")))]
   "TARGET_ZVBB"
   "vwsll.v%o4\t%0,%3,%4%p1"
@@ -316,7 +316,7 @@
            (ashift:VWEXTI
              (zero_extend:VWEXTI
                (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand" " vr, vr"))
-             (match_operand:<VSUBEL> 4 "pmode_reg_or_uimm5_operand" " rK, rK"))
+             (match_operand 4 "pmode_reg_or_uimm5_operand" " rK, rK"))
            (match_operand:VWEXTI 2 "vector_merge_operand" " vu, 0")))]
   "TARGET_ZVBB"
   "vwsll.v%o4\t%0,%3,%4%p1"

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index dccf76f0003..69423be6917 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -750,10 +750,10 @@
                (const_int 1)
 
         (eq_attr "type" "vssegte,vmpop,vmffs")
-          (const_int 2)
+        (const_int 2)
 
         (eq_attr "type" "vstux,vstox,vssegts,vssegtux,vssegtox,vfcvtftoi,vfwcvtitof,vfwcvtftoi,
-                         vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo")
+                         vfwcvtftof,vmsfs,vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vwsll")
           (const_int 3)
 
         (eq_attr "type" "viwalu,viwmul,viwmuladd,vfwalu,vfwmul,vfwmuladd")

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
new file mode 100644
index 00000000000..11a4031f47b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/gather-scatter/gather_load_64-12-zvbb.c
@@ -0,0 +1,113 @@
+/* { dg-do compile } */
+/* { dg-add-options "riscv_v" } */
+/* { dg-add-options "riscv_zvbb" } */
+/* { dg-additional-options "-fno-vect-cost-model -fdump-tree-vect-details -mrvv-max-lmul=m4" } */
+
+#include <stdint-gcc.h>
+
+#define TEST_LOOP(DATA_TYPE, INDEX_TYPE)                                      \
+  void __attribute__ ((noinline, noclone))                                    \
+  f_##DATA_TYPE##_##INDEX_TYPE (DATA_TYPE *restrict y, DATA_TYPE *restrict x, \
+                                INDEX_TYPE *restrict index)                   \
+  {                                                                           \
+    for (int i = 0; i < 100; ++i)                                             \
+      {                                                                       \
+        y[i * 2] = x[index[i * 2]] + 1;                                       \
+        y[i * 2 + 1] = x[index[i * 2 + 1]] + 2;                               \
+      }                                                                       \
+  }
+
+TEST_LOOP (int8_t, int8_t)
+TEST_LOOP (uint8_t, int8_t)
+TEST_LOOP (int16_t, int8_t)
+TEST_LOOP (uint16_t, int8_t)
+TEST_LOOP (int32_t, int8_t)
+TEST_LOOP (uint32_t, int8_t)
+TEST_LOOP (int64_t, int8_t)
+TEST_LOOP (uint64_t, int8_t)
+TEST_LOOP (_Float16, int8_t)
+TEST_LOOP (float, int8_t)
+TEST_LOOP (double, int8_t)
+TEST_LOOP (int8_t, int16_t)
+TEST_LOOP (uint8_t, int16_t)
+TEST_LOOP (int16_t, int16_t)
+TEST_LOOP (uint16_t, int16_t)
+TEST_LOOP (int32_t, int16_t)
+TEST_LOOP (uint32_t, int16_t)
+TEST_LOOP (int64_t, int16_t)
+TEST_LOOP (uint64_t, int16_t)
+TEST_LOOP (_Float16, int16_t)
+TEST_LOOP (float, int16_t)
+TEST_LOOP (double, int16_t)
+TEST_LOOP (int8_t, int32_t)
+TEST_LOOP (uint8_t, int32_t)
+TEST_LOOP (int16_t, int32_t)
+TEST_LOOP (uint16_t, int32_t)
+TEST_LOOP (int32_t, int32_t)
+TEST_LOOP (uint32_t, int32_t)
+TEST_LOOP (int64_t, int32_t)
+TEST_LOOP (uint64_t, int32_t)
+TEST_LOOP (_Float16, int32_t)
+TEST_LOOP (float, int32_t)
+TEST_LOOP (double, int32_t)
+TEST_LOOP (int8_t, int64_t)
+TEST_LOOP (uint8_t, int64_t)
+TEST_LOOP (int16_t, int64_t)
+TEST_LOOP (uint16_t, int64_t)
+TEST_LOOP (int32_t, int64_t)
+TEST_LOOP (uint32_t, int64_t)
+TEST_LOOP (int64_t, int64_t)
+TEST_LOOP (uint64_t, int64_t)
+TEST_LOOP (_Float16, int64_t)
+TEST_LOOP (float, int64_t)
+TEST_LOOP (double, int64_t)
+TEST_LOOP (int8_t, uint8_t)
+TEST_LOOP (uint8_t, uint8_t)
+TEST_LOOP (int16_t, uint8_t)
+TEST_LOOP (uint16_t, uint8_t)
+TEST_LOOP (int32_t, uint8_t)
+TEST_LOOP (uint32_t, uint8_t)
+TEST_LOOP (int64_t, uint8_t)
+TEST_LOOP (uint64_t, uint8_t)
+TEST_LOOP (_Float16, uint8_t)
+TEST_LOOP (float, uint8_t)
+TEST_LOOP (double, uint8_t)
+TEST_LOOP (int8_t, uint16_t)
+TEST_LOOP (uint8_t, uint16_t)
+TEST_LOOP (int16_t, uint16_t)
+TEST_LOOP (uint16_t, uint16_t)
+TEST_LOOP (int32_t, uint16_t)
+TEST_LOOP (uint32_t, uint16_t)
+TEST_LOOP (int64_t, uint16_t)
+TEST_LOOP (uint64_t, uint16_t)
+TEST_LOOP (_Float16, uint16_t)
+TEST_LOOP (float, uint16_t)
+TEST_LOOP (double, uint16_t)
+TEST_LOOP (int8_t, uint32_t)
+TEST_LOOP (uint8_t, uint32_t)
+TEST_LOOP (int16_t, uint32_t)
+TEST_LOOP (uint16_t, uint32_t)
+TEST_LOOP (int32_t, uint32_t)
+TEST_LOOP (uint32_t, uint32_t)
+TEST_LOOP (int64_t, uint32_t)
+TEST_LOOP (uint64_t, uint32_t)
+TEST_LOOP (_Float16, uint32_t)
+TEST_LOOP (float, uint32_t)
+TEST_LOOP (double, uint32_t)
+TEST_LOOP (int8_t, uint64_t)
+TEST_LOOP (uint8_t, uint64_t)
+TEST_LOOP (int16_t, uint64_t)
+TEST_LOOP (uint16_t, uint64_t)
+TEST_LOOP (int32_t, uint64_t)
+TEST_LOOP (uint32_t, uint64_t)
+TEST_LOOP (int64_t, uint64_t)
+TEST_LOOP (uint64_t, uint64_t)
+TEST_LOOP (_Float16, uint64_t)
+TEST_LOOP (float, uint64_t)
+TEST_LOOP (double, uint64_t)
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 88 "vect" } } */
+/* { dg-final { scan-tree-dump " \.MASK_LEN_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump-not " \.MASK_GATHER_LOAD" "vect" } } */
+/* { dg-final { scan-assembler "vwsll.vi" } } */

diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index a3992faab5e..836545b4e11 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -1958,6 +1958,17 @@ proc check_effective_target_riscv_zbb { } {
     }]
 }
 
+# Return 1 if the target arch supports the Zvbb extension, 0 otherwise.
+# Cache the result.
+
+proc check_effective_target_riscv_zvbb { } {
+    return [check_no_compiler_messages riscv_ext_zvbb assembly {
+       #ifndef __riscv_zvbb
+       #error "Not __riscv_zvbb"
+       #endif
+    }]
+}
+
 # Return 1 if the target arch supports the XTheadVector extension, 0 otherwise.
 # Cache the result.
 
@@ -2046,10 +2057,33 @@ proc check_effective_target_riscv_zvfh_ok { } {
     return 0
 }
 
+proc check_effective_target_riscv_zvbb_ok { } {
+    # If the target already supports zvbb without any added options,
+    # we may assume we can execute just fine.
+    if { [check_effective_target_riscv_zvbb] } {
+       return 1
+    }
+
+    # check if we can execute vector insns with the given hardware or
+    # simulator
+    set gcc_march [regsub {[[:alnum:]]*} [riscv_get_arch] &zvbb]
+    if { [check_runtime ${gcc_march}_zvbb_exec {
+       int main ()
+       {
+           asm ("vsetivli zero,8,e16,m1,ta,ma");
+           asm ("vwsll.vi v8,v16,2" : : : "v8");
+           return 0;
+       } } "-march=${gcc_march}"] } {
+       return 1
+    }
+
+    return 0
+}
+
 proc riscv_get_arch { } {
     set gcc_march ""
     # ??? do we neeed to add more extensions to the list below?
-    foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvfh ztso } {
+    foreach ext { i m a f d q c v zicsr zifencei zfh zba zbb zbc zbs zvbb zvfh ztso } {
       if { [check_no_compiler_messages riscv_ext_$ext assembly [string map [list DEF __riscv_$ext] {
           #ifndef DEF
           #error "Not DEF"
           #endif
@@ -2144,6 +2178,18 @@ proc add_options_for_riscv_zvfh { flags } {
     return "$flags -march=[riscv_get_arch]_zvfh"
 }
 
+proc add_options_for_riscv_zvbb { flags } {
+    if { [lsearch $flags -march=*] >= 0 } {
+       # If there are multiple -march flags, we have to adjust all of them.
+       set flags [regsub -all -- {(?:^|[[:space:]])-march=[[:alnum:]_.]*} $flags &_zvbb ]
+       return [regsub -all -- {((?:^|[[:space:]])-march=[[:alnum:]_.]*_zvbb[[:alnum:]_.]*)_zvbb} $flags \\1 ]
+    }
+    if { [check_effective_target_riscv_zvbb] } {
+       return "$flags"
+    }
+    return "$flags -march=[riscv_get_arch]_zvbb"
+}
+
 # Return 1 if the target OS supports running SSE executables, 0
 # otherwise.  Cache the result.
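
For illustration, a minimal sketch of the pattern the patch targets (the
function name and the flags below are illustrative assumptions, not taken
from the commit): the offsets are unsigned 16-bit values, so they are
narrower than XLEN and need zero extension, and they index 2-byte elements,
so the gather index must also be scaled by 2, i.e. shift == 1, which is
exactly the use_widening_shift case above.

#include <stdint.h>

/* Gather load: off[] must be zero-extended and shifted left by 1 before
   being used as a byte offset into src[].  With zvbb this index
   preparation should become a single vwsll.vi; without it, it is roughly
   a zero-extend to double width (vzext.vf2) followed by a vsll.vi.  */
void
gather_u16 (uint16_t *restrict dst, uint16_t *restrict src,
            uint16_t *restrict off, int n)
{
  for (int i = 0; i < n; ++i)
    dst[i] = src[off[i]];
}

Compiled with something like -march=rv64gcv_zvbb -O3 -fno-vect-cost-model,
this is expected to vectorize into a gather whose index vector is produced
by vwsll.vi, which is what the new testcase's scan-assembler directive
checks for.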