Hi. Robin. Thanks for working on this bug. As I remembered, the reason I put vleff intrinsic genereted into vleff + csrr vl in gimple level is because:
In this intrinsic code: vleff (.... &vl, avl) vadd (..., vl) The assembly will become: vleff .... vadd ... Instead of vleff .... csrr vl vadd ... The "csrr vl" will be elided. So I wonder, whether "csrr vl" can be elided after this patch ? > From: "Robin Dapp"<[email protected]> > Date: Tue, Jan 20, 2026, 23:22 > Subject: [PATCH] RISC-V: Fix intrinsic FoF load at -O0 [PR122869]. > To: "gcc-patches"<[email protected]> > Cc: <[email protected]>, <[email protected]>, <[email protected]>, > <[email protected]>, <[email protected]> > Hi, > > In the PR we try to compile a loop at -O0 with fault-only-first loads. > We use the VL adjusted by the FoF loads to count the number of > processed elements. Currently, this is implemented as "folding" the FoF > load into a FoF load and a riscv_read_vl directly after. > We cannot guarantee the value of VL between two calls, though. It is > possible that we need a vector store in between which would clobber VL. > > This patch makes the VL -> pseudo semantics of the FoF insn explicit and > adjusts the intrinsics expander accordingly. > > There is a problem with this approach, though: Technically, the VL > adjustment of the FoF loads is modelled as a store and the VL variable > is made TREE_ADDRESSABLE. At the gimple level we managed to elide the > store very early but at RTL level we don't. Also, we don't manage to > re-use the same register for VL at -O2 and -O3 while it still works for > -O1. > > What might help with the second issue above is to add value tracking > to the vsetvl pass. I suppose the first issue would require a larger > intervention. > > Regtested on rv64gcv_zvl512b. > > Regards > Robin > > PR target/122869 > > gcc/ChangeLog: > > * config/riscv/riscv-vector-builtins-bases.cc (fold_fault_load): > Remove > * config/riscv/riscv-vector-builtins.cc > (function_expander::use_contiguous_load_insn): > Use new helper. > (function_expander::prepare_contiguous_load_insn): New helper. > (function_expander::use_fof_load_insn): New function to emit FoF > loads. > * config/riscv/riscv-vector-builtins.h: Declare new functions. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/base/pr122656-1.c: Remove dg-error. > * gcc.target/riscv/rvv/vsetvl/ffload-3.c: XFAIL for -O2 and -O3. > * gcc.target/riscv/rvv/base/pr122869.c: New test. > --- > .../riscv/riscv-vector-builtins-bases.cc | 64 +------------------ > gcc/config/riscv/riscv-vector-builtins.cc | 64 +++++++++++++++++-- > gcc/config/riscv/riscv-vector-builtins.h | 2 + > .../gcc.target/riscv/rvv/base/pr122656-1.c | 2 +- > .../gcc.target/riscv/rvv/base/pr122869.c | 22 +++++++ > .../gcc.target/riscv/rvv/vsetvl/ffload-3.c | 3 +- > 6 files changed, 89 insertions(+), 68 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c > > diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc > b/gcc/config/riscv/riscv-vector-builtins-bases.cc > index 58960037b1b..0bb878f0122 100644 > --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc > +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc > @@ -58,54 +58,6 @@ enum lst_type > LST_INDEXED, > }; > > -/* Helper function to fold vleff and vlsegff. */ > -static gimple * > -fold_fault_load (gimple_folder &f) > -{ > - /* fold fault_load (const *base, size_t *new_vl, size_t vl) > - > - ====> fault_load (const *base, size_t vl) > - new_vl = MEM_REF[read_vl ()]. */ > - > - auto_vec<tree> vargs (gimple_call_num_args (f.call) - 1); > - > - for (unsigned i = 0; i < gimple_call_num_args (f.call); i++) > - { > - /* Exclude size_t *new_vl argument. */ > - if (i == gimple_call_num_args (f.call) - 2) > - continue; > - > - vargs.quick_push (gimple_call_arg (f.call, i)); > - } > - > - gimple *repl = gimple_build_call_vec (gimple_call_fn (f.call), vargs); > - gimple_call_set_lhs (repl, f.lhs); > - > - /* Handle size_t *new_vl by read_vl. */ > - tree new_vl = gimple_call_arg (f.call, gimple_call_num_args (f.call) - 2); > - if (integer_zerop (new_vl)) > - { > - /* This case happens when user passes the nullptr to new_vl argument. > - In this case, we just need to ignore the new_vl argument and return > - fault_load instruction directly. */ > - return repl; > - } > - > - tree tmp_var = create_tmp_var (size_type_node, "new_vl"); > - tree decl = get_read_vl_decl (); > - gimple *g = gimple_build_call (decl, 0); > - gimple_call_set_lhs (g, tmp_var); > - tree indirect > - = fold_build2 (MEM_REF, size_type_node, > - gimple_call_arg (f.call, gimple_call_num_args (f.call) - > 2), > - build_int_cst (build_pointer_type (size_type_node), 0)); > - gassign *assign = gimple_build_assign (indirect, tmp_var); > - > - gsi_insert_after (f.gsi, assign, GSI_SAME_STMT); > - gsi_insert_after (f.gsi, g, GSI_SAME_STMT); > - return repl; > -} > - > /* Implements vsetvl<mode> && vsetvlmax<mode>. */ > template<bool VLMAX_P> > class vsetvl : public function_base > @@ -1995,15 +1947,9 @@ public: > return pred != PRED_TYPE_none; > } > > - gimple *fold (gimple_folder &f) const override > - { > - return fold_fault_load (f); > - } > - > rtx expand (function_expander &e) const override > { > - return e.use_contiguous_load_insn ( > - code_for_pred_fault_load (e.vector_mode ())); > + return e.use_fof_load_insn (); > } > }; > > @@ -2171,15 +2117,9 @@ public: > return pred != PRED_TYPE_none; > } > > - gimple *fold (gimple_folder &f) const override > - { > - return fold_fault_load (f); > - } > - > rtx expand (function_expander &e) const override > { > - return e.use_contiguous_load_insn > - (code_for_pred_fault_load (e.vector_mode ())); > + return e.use_fof_load_insn (); > } > }; > > diff --git a/gcc/config/riscv/riscv-vector-builtins.cc > b/gcc/config/riscv/riscv-vector-builtins.cc > index b7dba4eada2..63cf4d691e7 100644 > --- a/gcc/config/riscv/riscv-vector-builtins.cc > +++ b/gcc/config/riscv/riscv-vector-builtins.cc > @@ -4839,9 +4839,8 @@ function_expander::use_exact_insn (insn_code icode) > return generate_insn (icode); > } > > -/* Use contiguous load INSN. */ > -rtx > -function_expander::use_contiguous_load_insn (insn_code icode) > +int > +function_expander::prepare_contiguous_load_insn () > { > gcc_assert (call_expr_nargs (exp) > 0); > machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); > @@ -4860,10 +4859,19 @@ function_expander::use_contiguous_load_insn > (insn_code icode) > add_vundef_operand (mode); > > add_mem_operand (mode, arg_offset++); > + return arg_offset; > +} > + > +/* Use contiguous load INSN. */ > +rtx > +function_expander::use_contiguous_load_insn (insn_code icode) > +{ > + int arg_offset = prepare_contiguous_load_insn (); > > for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++) > add_input_operand (argno); > > + machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); > if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL) > { > add_input_operand (Pmode, get_tail_policy_for_pred (pred)); > @@ -4872,10 +4880,58 @@ function_expander::use_contiguous_load_insn > (insn_code icode) > > if (opno != insn_data[icode].n_generator_args) > add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX)); > - > return generate_insn (icode); > } > > +/* Similar to use_contiguous_load_insn but skips the vector-length > destination > + operand that a fault-only-first load intrinsic has. Then we add tail and > + mask policy as well as AVL operand. Last, add the vector-length > destination > + operand that we skipped initially. */ > +rtx > +function_expander::use_fof_load_insn () > +{ > + int arg_offset = prepare_contiguous_load_insn (); > + > + int vl_dest_arg = call_expr_nargs (exp) - 2; > + for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++) > + { > + /* Skip argument for VL destination in memory but add the others. */ > + if (argno != vl_dest_arg) > + add_input_operand (argno); > + } > + > + machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); > + if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL) > + { > + add_input_operand (Pmode, get_tail_policy_for_pred (pred)); > + add_input_operand (Pmode, get_mask_policy_for_pred (pred)); > + } > + > + add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX)); > + > + tree arg = CALL_EXPR_ARG (exp, vl_dest_arg); > + > + /* Use a regular FoF load if the user does not want to store VL. */ > + insn_code icode = code_for_pred_fault_load (mode); > + rtx result = generate_insn (icode); > + > + /* If user wants VL stored, emit a read_vl and store to memory. */ > + if (!integer_zerop (arg)) > + { > + rtx vl_reg = gen_reg_rtx (Pmode); > + if (Pmode == SImode) > + emit_insn (gen_read_vlsi (vl_reg)); > + else > + emit_insn (gen_read_vldi_zero_extend (vl_reg)); > + > + rtx addr = expand_normal (arg); > + rtx mem = gen_rtx_MEM (Pmode, memory_address (Pmode, addr)); > + emit_move_insn (mem, vl_reg); > + } > + > + return result; > +} > + > /* Use contiguous store INSN. */ > rtx > function_expander::use_contiguous_store_insn (insn_code icode) > diff --git a/gcc/config/riscv/riscv-vector-builtins.h > b/gcc/config/riscv/riscv-vector-builtins.h > index d864e22be4c..d5fe0cd7a22 100644 > --- a/gcc/config/riscv/riscv-vector-builtins.h > +++ b/gcc/config/riscv/riscv-vector-builtins.h > @@ -548,7 +548,9 @@ public: > machine_mode ret_mode (void) const; > > rtx use_exact_insn (insn_code); > + int prepare_contiguous_load_insn (); > rtx use_contiguous_load_insn (insn_code); > + rtx use_fof_load_insn (); > rtx use_contiguous_store_insn (insn_code); > rtx use_compare_insn (rtx_code, insn_code); > rtx use_ternop_insn (bool, insn_code); > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c > b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c > index 76adbed3f61..1757989856c 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c > @@ -4,4 +4,4 @@ > #include "riscv_vector.h" > int a; > long b, c; > -void d() { __riscv_vlseg2e32ff_v_i32mf2x2(&a, &c, b); } /* { dg-error > "invalid argument to built-in function" } */ > +void d() { __riscv_vlseg2e32ff_v_i32mf2x2(&a, &c, b); } > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c > b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c > new file mode 100644 > index 00000000000..e00ac04bebb > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c > @@ -0,0 +1,22 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-O0 -std=gnu99" } */ > +/* We used to generate a separate riscv_read_vl () after the FoF load. > + In case of -O0 (or otherwise) it could happen that "g" wouldn't > + get a hard reg and we'd need to store it, clobbering VL. > + This leads to an infinite loop or a segfault. */ > + > +#include <riscv_vector.h> > + > +uint8_t a[1]; > +int16_t b[1]; > + > +int main () > +{ > + for (size_t c = 0, avl = 1; avl > 0;) > + { > + size_t d = avl; > + vint16mf2_t g = __riscv_vle16ff_v_i16mf2 (&b[c], &d, d); > + avl -= d; > + c += d; // Segmentation fault > + } > +} > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c > b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c > index b4f7cc4431e..732c70ecdf2 100644 > --- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c > +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c > @@ -25,4 +25,5 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, > int m, int cond) > } > } > > -/* { dg-final { scan-assembler-times > {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { > no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */ > +/* -O2 and -O3 fail now, see PR target/122869. */ > +/* { dg-final { scan-assembler-times > {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { > no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } xfail { any-opts "-O2" > "-O3" } } } } */ > -- > 2.52.0 >
