Hi. Robin. Thanks for working on this bug.

As I remembered, the reason I put vleff intrinsic genereted into vleff + csrr 
vl in gimple level is because:

In this intrinsic code:

vleff (.... &vl, avl)
vadd (..., vl)

The assembly will become:

vleff ....
vadd ...

Instead of

vleff ....
csrr vl
vadd ...

The "csrr vl" will be elided.

So I wonder, whether "csrr vl" can be elided after this patch ?

> From: "Robin Dapp"<[email protected]>
> Date:  Tue, Jan 20, 2026, 23:22
> Subject:  [PATCH] RISC-V: Fix intrinsic FoF load at -O0 [PR122869].
> To: "gcc-patches"<[email protected]>
> Cc: <[email protected]>, <[email protected]>, <[email protected]>, 
> <[email protected]>, <[email protected]>
> Hi,
> 
> In the PR we try to compile a loop at -O0 with fault-only-first loads.
> We use the VL adjusted by the FoF loads to count the number of
> processed elements.  Currently, this is implemented as "folding" the FoF
> load into a FoF load and a riscv_read_vl directly after.
> We cannot guarantee the value of VL between two calls, though.  It is
> possible that we need a vector store in between which would clobber VL.
> 
> This patch makes the VL -> pseudo semantics of the FoF insn explicit and
> adjusts the intrinsics expander accordingly.
> 
> There is a problem with this approach, though:  Technically, the VL
> adjustment of the FoF loads is modelled as a store and the VL variable
> is made TREE_ADDRESSABLE.  At the gimple level we managed to elide the
> store very early but at RTL level we don't.  Also, we don't manage to
> re-use the same register for VL at -O2 and -O3 while it still works for
> -O1.
> 
> What might help with the second issue above is to add value tracking
> to the vsetvl pass.  I suppose the first issue would require a larger
> intervention.
> 
> Regtested on rv64gcv_zvl512b.
> 
> Regards
>  Robin
> 
>         PR target/122869
> 
> gcc/ChangeLog:
> 
>         * config/riscv/riscv-vector-builtins-bases.cc (fold_fault_load):
>         Remove
>         * config/riscv/riscv-vector-builtins.cc 
> (function_expander::use_contiguous_load_insn):
>         Use new helper.
>         (function_expander::prepare_contiguous_load_insn): New helper.
>         (function_expander::use_fof_load_insn): New function to emit FoF
>         loads.
>         * config/riscv/riscv-vector-builtins.h: Declare new functions.
> 
> gcc/testsuite/ChangeLog:
> 
>         * gcc.target/riscv/rvv/base/pr122656-1.c: Remove dg-error.
>         * gcc.target/riscv/rvv/vsetvl/ffload-3.c: XFAIL for -O2 and -O3.
>         * gcc.target/riscv/rvv/base/pr122869.c: New test.
> ---
>  .../riscv/riscv-vector-builtins-bases.cc      | 64 +------------------
>  gcc/config/riscv/riscv-vector-builtins.cc     | 64 +++++++++++++++++--
>  gcc/config/riscv/riscv-vector-builtins.h      |  2 +
>  .../gcc.target/riscv/rvv/base/pr122656-1.c    |  2 +-
>  .../gcc.target/riscv/rvv/base/pr122869.c      | 22 +++++++
>  .../gcc.target/riscv/rvv/vsetvl/ffload-3.c    |  3 +-
>  6 files changed, 89 insertions(+), 68 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c
> 
> diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc 
> b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> index 58960037b1b..0bb878f0122 100644
> --- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
> @@ -58,54 +58,6 @@ enum lst_type
>    LST_INDEXED,
>  };
>  
> -/* Helper function to fold vleff and vlsegff.  */
> -static gimple *
> -fold_fault_load (gimple_folder &f)
> -{
> -  /* fold fault_load (const *base, size_t *new_vl, size_t vl)
> -
> -     ====> fault_load (const *base, size_t vl)
> -           new_vl = MEM_REF[read_vl ()].  */
> -
> -  auto_vec<tree> vargs (gimple_call_num_args (f.call) - 1);
> -
> -  for (unsigned i = 0; i < gimple_call_num_args (f.call); i++)
> -    {
> -      /* Exclude size_t *new_vl argument.  */
> -      if (i == gimple_call_num_args (f.call) - 2)
> -        continue;
> -
> -      vargs.quick_push (gimple_call_arg (f.call, i));
> -    }
> -
> -  gimple *repl = gimple_build_call_vec (gimple_call_fn (f.call), vargs);
> -  gimple_call_set_lhs (repl, f.lhs);
> -
> -  /* Handle size_t *new_vl by read_vl.  */
> -  tree new_vl = gimple_call_arg (f.call, gimple_call_num_args (f.call) - 2);
> -  if (integer_zerop (new_vl))
> -    {
> -      /* This case happens when user passes the nullptr to new_vl argument.
> -         In this case, we just need to ignore the new_vl argument and return
> -         fault_load instruction directly. */
> -      return repl;
> -    }
> -
> -  tree tmp_var = create_tmp_var (size_type_node, "new_vl");
> -  tree decl = get_read_vl_decl ();
> -  gimple *g = gimple_build_call (decl, 0);
> -  gimple_call_set_lhs (g, tmp_var);
> -  tree indirect
> -    = fold_build2 (MEM_REF, size_type_node,
> -                   gimple_call_arg (f.call, gimple_call_num_args (f.call) - 
> 2),
> -                   build_int_cst (build_pointer_type (size_type_node), 0));
> -  gassign *assign = gimple_build_assign (indirect, tmp_var);
> -
> -  gsi_insert_after (f.gsi, assign, GSI_SAME_STMT);
> -  gsi_insert_after (f.gsi, g, GSI_SAME_STMT);
> -  return repl;
> -}
> -
>  /* Implements vsetvl<mode> && vsetvlmax<mode>.  */
>  template<bool VLMAX_P>
>  class vsetvl : public function_base
> @@ -1995,15 +1947,9 @@ public:
>      return pred != PRED_TYPE_none;
>    }
>  
> -  gimple *fold (gimple_folder &f) const override
> -  {
> -    return fold_fault_load (f);
> -  }
> -
>    rtx expand (function_expander &e) const override
>    {
> -    return e.use_contiguous_load_insn (
> -      code_for_pred_fault_load (e.vector_mode ()));
> +    return e.use_fof_load_insn ();
>    }
>  };
>  
> @@ -2171,15 +2117,9 @@ public:
>      return pred != PRED_TYPE_none;
>    }
>  
> -  gimple *fold (gimple_folder &f) const override
> -  {
> -    return fold_fault_load (f);
> -  }
> -
>    rtx expand (function_expander &e) const override
>    {
> -    return e.use_contiguous_load_insn
> -      (code_for_pred_fault_load (e.vector_mode ()));
> +    return e.use_fof_load_insn ();
>    }
>  };
>  
> diff --git a/gcc/config/riscv/riscv-vector-builtins.cc 
> b/gcc/config/riscv/riscv-vector-builtins.cc
> index b7dba4eada2..63cf4d691e7 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.cc
> +++ b/gcc/config/riscv/riscv-vector-builtins.cc
> @@ -4839,9 +4839,8 @@ function_expander::use_exact_insn (insn_code icode)
>    return generate_insn (icode);
>  }
>  
> -/* Use contiguous load INSN.  */
> -rtx
> -function_expander::use_contiguous_load_insn (insn_code icode)
> +int
> +function_expander::prepare_contiguous_load_insn ()
>  {
>    gcc_assert (call_expr_nargs (exp) > 0);
>    machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
> @@ -4860,10 +4859,19 @@ function_expander::use_contiguous_load_insn 
> (insn_code icode)
>      add_vundef_operand (mode);
>  
>    add_mem_operand (mode, arg_offset++);
> +  return arg_offset;
> +}
> +
> +/* Use contiguous load INSN.  */
> +rtx
> +function_expander::use_contiguous_load_insn (insn_code icode)
> +{
> +  int arg_offset = prepare_contiguous_load_insn ();
>  
>    for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
>      add_input_operand (argno);
>  
> +  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
>    if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
>      {
>        add_input_operand (Pmode, get_tail_policy_for_pred (pred));
> @@ -4872,10 +4880,58 @@ function_expander::use_contiguous_load_insn 
> (insn_code icode)
>  
>    if (opno != insn_data[icode].n_generator_args)
>      add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
> -
>    return generate_insn (icode);
>  }
>  
> +/* Similar to use_contiguous_load_insn but skips the vector-length 
> destination
> +   operand that a fault-only-first load intrinsic has.  Then we add tail and
> +   mask policy as well as AVL operand.  Last, add the vector-length 
> destination
> +   operand that we skipped initially.  */
> +rtx
> +function_expander::use_fof_load_insn ()
> +{
> +  int arg_offset = prepare_contiguous_load_insn ();
> +
> +  int vl_dest_arg = call_expr_nargs (exp) - 2;
> +  for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
> +    {
> +      /* Skip argument for VL destination in memory but add the others.  */
> +      if (argno != vl_dest_arg)
> +        add_input_operand (argno);
> +    }
> +
> +  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
> +  if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
> +    {
> +      add_input_operand (Pmode, get_tail_policy_for_pred (pred));
> +      add_input_operand (Pmode, get_mask_policy_for_pred (pred));
> +    }
> +
> +  add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
> +
> +  tree arg = CALL_EXPR_ARG (exp, vl_dest_arg);
> +
> +  /* Use a regular FoF load if the user does not want to store VL.  */
> +  insn_code icode = code_for_pred_fault_load (mode);
> +  rtx result = generate_insn (icode);
> +
> +  /* If user wants VL stored, emit a read_vl and store to memory.  */
> +  if (!integer_zerop (arg))
> +    {
> +      rtx vl_reg = gen_reg_rtx (Pmode);
> +      if (Pmode == SImode)
> +        emit_insn (gen_read_vlsi (vl_reg));
> +      else
> +        emit_insn (gen_read_vldi_zero_extend (vl_reg));
> +
> +      rtx addr = expand_normal (arg);
> +      rtx mem = gen_rtx_MEM (Pmode, memory_address (Pmode, addr));
> +      emit_move_insn (mem, vl_reg);
> +    }
> +
> +  return result;
> +}
> +
>  /* Use contiguous store INSN.  */
>  rtx
>  function_expander::use_contiguous_store_insn (insn_code icode)
> diff --git a/gcc/config/riscv/riscv-vector-builtins.h 
> b/gcc/config/riscv/riscv-vector-builtins.h
> index d864e22be4c..d5fe0cd7a22 100644
> --- a/gcc/config/riscv/riscv-vector-builtins.h
> +++ b/gcc/config/riscv/riscv-vector-builtins.h
> @@ -548,7 +548,9 @@ public:
>    machine_mode ret_mode (void) const;
>  
>    rtx use_exact_insn (insn_code);
> +  int prepare_contiguous_load_insn ();
>    rtx use_contiguous_load_insn (insn_code);
> +  rtx use_fof_load_insn ();
>    rtx use_contiguous_store_insn (insn_code);
>    rtx use_compare_insn (rtx_code, insn_code);
>    rtx use_ternop_insn (bool, insn_code);
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c
> index 76adbed3f61..1757989856c 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c
> @@ -4,4 +4,4 @@
>  #include "riscv_vector.h"
>  int a;
>  long b, c;
> -void d() { __riscv_vlseg2e32ff_v_i32mf2x2(&a, &c, b); } /* { dg-error 
> "invalid argument to built-in function" } */
> +void d() { __riscv_vlseg2e32ff_v_i32mf2x2(&a, &c, b); }
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c
> new file mode 100644
> index 00000000000..e00ac04bebb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c
> @@ -0,0 +1,22 @@
> +/* { dg-do run } */
> +/* { dg-additional-options "-O0 -std=gnu99" } */
> +/* We used to generate a separate riscv_read_vl () after the FoF load.
> +   In case of -O0 (or otherwise) it could happen that "g" wouldn't
> +   get a hard reg and we'd need to store it, clobbering VL.
> +   This leads to an infinite loop or a segfault.  */
> +
> +#include <riscv_vector.h>
> +
> +uint8_t a[1];
> +int16_t b[1];
> +
> +int main ()
> +{
> +  for (size_t c = 0, avl = 1; avl > 0;)
> +    {
> +      size_t d = avl;
> +      vint16mf2_t g = __riscv_vle16ff_v_i16mf2 (&b[c], &d, d);
> +      avl -= d;
> +      c += d; // Segmentation fault
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c
> index b4f7cc4431e..732c70ecdf2 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c
> @@ -25,4 +25,5 @@ void f (int8_t * restrict in, int8_t * restrict out, int n, 
> int m, int cond)
>      }
>  }
>  
> -/* { dg-final { scan-assembler-times 
> {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { 
> no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
> +/* -O2 and -O3 fail now, see PR target/122869.  */
> +/* { dg-final { scan-assembler-times 
> {vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target { 
> no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } xfail { any-opts "-O2" 
> "-O3" } } } } */
> -- 
> 2.52.0
> 

Reply via email to