Hi,
In the PR we try to compile a loop at -O0 with fault-only-first loads.
We use the VL adjusted by the FoF loads to count the number of
processed elements. Currently, this is implemented as "folding" the FoF
load into a FoF load and a riscv_read_vl directly after.
We cannot guarantee the value of VL between two calls, though. It is
possible that we need a vector store in between which would clobber VL.
This patch makes the VL -> pseudo semantics of the FoF insn explicit and
adjusts the intrinsics expander accordingly.
There is a problem with this approach, though: Technically, the VL
adjustment of the FoF loads is modelled as a store and the VL variable
is made TREE_ADDRESSABLE. At the gimple level we managed to elide the
store very early but at RTL level we don't. Also, we don't manage to
re-use the same register for VL at -O2 and -O3 while it still works for
-O1.
What might help with the second issue above is to add value tracking
to the vsetvl pass. I suppose the first issue would require a larger
intervention.
Regtested on rv64gcv_zvl512b.
Regards
Robin
PR target/122869
gcc/ChangeLog:
* config/riscv/riscv-vector-builtins-bases.cc (fold_fault_load):
Remove
* config/riscv/riscv-vector-builtins.cc
(function_expander::use_contiguous_load_insn):
Use new helper.
(function_expander::prepare_contiguous_load_insn): New helper.
(function_expander::use_fof_load_insn): New function to emit FoF
loads.
* config/riscv/riscv-vector-builtins.h: Declare new functions.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/base/pr122656-1.c: Remove dg-error.
* gcc.target/riscv/rvv/vsetvl/ffload-3.c: XFAIL for -O2 and -O3.
* gcc.target/riscv/rvv/base/pr122869.c: New test.
---
.../riscv/riscv-vector-builtins-bases.cc | 64 +------------------
gcc/config/riscv/riscv-vector-builtins.cc | 64 +++++++++++++++++--
gcc/config/riscv/riscv-vector-builtins.h | 2 +
.../gcc.target/riscv/rvv/base/pr122656-1.c | 2 +-
.../gcc.target/riscv/rvv/base/pr122869.c | 22 +++++++
.../gcc.target/riscv/rvv/vsetvl/ffload-3.c | 3 +-
6 files changed, 89 insertions(+), 68 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c
diff --git a/gcc/config/riscv/riscv-vector-builtins-bases.cc
b/gcc/config/riscv/riscv-vector-builtins-bases.cc
index 58960037b1b..0bb878f0122 100644
--- a/gcc/config/riscv/riscv-vector-builtins-bases.cc
+++ b/gcc/config/riscv/riscv-vector-builtins-bases.cc
@@ -58,54 +58,6 @@ enum lst_type
LST_INDEXED,
};
-/* Helper function to fold vleff and vlsegff. */
-static gimple *
-fold_fault_load (gimple_folder &f)
-{
- /* fold fault_load (const *base, size_t *new_vl, size_t vl)
-
- ====> fault_load (const *base, size_t vl)
- new_vl = MEM_REF[read_vl ()]. */
-
- auto_vec<tree> vargs (gimple_call_num_args (f.call) - 1);
-
- for (unsigned i = 0; i < gimple_call_num_args (f.call); i++)
- {
- /* Exclude size_t *new_vl argument. */
- if (i == gimple_call_num_args (f.call) - 2)
- continue;
-
- vargs.quick_push (gimple_call_arg (f.call, i));
- }
-
- gimple *repl = gimple_build_call_vec (gimple_call_fn (f.call), vargs);
- gimple_call_set_lhs (repl, f.lhs);
-
- /* Handle size_t *new_vl by read_vl. */
- tree new_vl = gimple_call_arg (f.call, gimple_call_num_args (f.call) - 2);
- if (integer_zerop (new_vl))
- {
- /* This case happens when user passes the nullptr to new_vl argument.
- In this case, we just need to ignore the new_vl argument and return
- fault_load instruction directly. */
- return repl;
- }
-
- tree tmp_var = create_tmp_var (size_type_node, "new_vl");
- tree decl = get_read_vl_decl ();
- gimple *g = gimple_build_call (decl, 0);
- gimple_call_set_lhs (g, tmp_var);
- tree indirect
- = fold_build2 (MEM_REF, size_type_node,
- gimple_call_arg (f.call, gimple_call_num_args (f.call) - 2),
- build_int_cst (build_pointer_type (size_type_node), 0));
- gassign *assign = gimple_build_assign (indirect, tmp_var);
-
- gsi_insert_after (f.gsi, assign, GSI_SAME_STMT);
- gsi_insert_after (f.gsi, g, GSI_SAME_STMT);
- return repl;
-}
-
/* Implements vsetvl<mode> && vsetvlmax<mode>. */
template<bool VLMAX_P>
class vsetvl : public function_base
@@ -1995,15 +1947,9 @@ public:
return pred != PRED_TYPE_none;
}
- gimple *fold (gimple_folder &f) const override
- {
- return fold_fault_load (f);
- }
-
rtx expand (function_expander &e) const override
{
- return e.use_contiguous_load_insn (
- code_for_pred_fault_load (e.vector_mode ()));
+ return e.use_fof_load_insn ();
}
};
@@ -2171,15 +2117,9 @@ public:
return pred != PRED_TYPE_none;
}
- gimple *fold (gimple_folder &f) const override
- {
- return fold_fault_load (f);
- }
-
rtx expand (function_expander &e) const override
{
- return e.use_contiguous_load_insn
- (code_for_pred_fault_load (e.vector_mode ()));
+ return e.use_fof_load_insn ();
}
};
diff --git a/gcc/config/riscv/riscv-vector-builtins.cc
b/gcc/config/riscv/riscv-vector-builtins.cc
index b7dba4eada2..63cf4d691e7 100644
--- a/gcc/config/riscv/riscv-vector-builtins.cc
+++ b/gcc/config/riscv/riscv-vector-builtins.cc
@@ -4839,9 +4839,8 @@ function_expander::use_exact_insn (insn_code icode)
return generate_insn (icode);
}
-/* Use contiguous load INSN. */
-rtx
-function_expander::use_contiguous_load_insn (insn_code icode)
+int
+function_expander::prepare_contiguous_load_insn ()
{
gcc_assert (call_expr_nargs (exp) > 0);
machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
@@ -4860,10 +4859,19 @@ function_expander::use_contiguous_load_insn (insn_code
icode)
add_vundef_operand (mode);
add_mem_operand (mode, arg_offset++);
+ return arg_offset;
+}
+
+/* Use contiguous load INSN. */
+rtx
+function_expander::use_contiguous_load_insn (insn_code icode)
+{
+ int arg_offset = prepare_contiguous_load_insn ();
for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
add_input_operand (argno);
+ machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
{
add_input_operand (Pmode, get_tail_policy_for_pred (pred));
@@ -4872,10 +4880,58 @@ function_expander::use_contiguous_load_insn (insn_code
icode)
if (opno != insn_data[icode].n_generator_args)
add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
-
return generate_insn (icode);
}
+/* Similar to use_contiguous_load_insn but skips the vector-length destination
+ operand that a fault-only-first load intrinsic has. Then we add tail and
+ mask policy as well as AVL operand. Last, add the vector-length destination
+ operand that we skipped initially. */
+rtx
+function_expander::use_fof_load_insn ()
+{
+ int arg_offset = prepare_contiguous_load_insn ();
+
+ int vl_dest_arg = call_expr_nargs (exp) - 2;
+ for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
+ {
+ /* Skip argument for VL destination in memory but add the others. */
+ if (argno != vl_dest_arg)
+ add_input_operand (argno);
+ }
+
+ machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
+ if (GET_MODE_CLASS (mode) != MODE_VECTOR_BOOL)
+ {
+ add_input_operand (Pmode, get_tail_policy_for_pred (pred));
+ add_input_operand (Pmode, get_mask_policy_for_pred (pred));
+ }
+
+ add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
+
+ tree arg = CALL_EXPR_ARG (exp, vl_dest_arg);
+
+ /* Use a regular FoF load if the user does not want to store VL. */
+ insn_code icode = code_for_pred_fault_load (mode);
+ rtx result = generate_insn (icode);
+
+ /* If user wants VL stored, emit a read_vl and store to memory. */
+ if (!integer_zerop (arg))
+ {
+ rtx vl_reg = gen_reg_rtx (Pmode);
+ if (Pmode == SImode)
+ emit_insn (gen_read_vlsi (vl_reg));
+ else
+ emit_insn (gen_read_vldi_zero_extend (vl_reg));
+
+ rtx addr = expand_normal (arg);
+ rtx mem = gen_rtx_MEM (Pmode, memory_address (Pmode, addr));
+ emit_move_insn (mem, vl_reg);
+ }
+
+ return result;
+}
+
/* Use contiguous store INSN. */
rtx
function_expander::use_contiguous_store_insn (insn_code icode)
diff --git a/gcc/config/riscv/riscv-vector-builtins.h
b/gcc/config/riscv/riscv-vector-builtins.h
index d864e22be4c..d5fe0cd7a22 100644
--- a/gcc/config/riscv/riscv-vector-builtins.h
+++ b/gcc/config/riscv/riscv-vector-builtins.h
@@ -548,7 +548,9 @@ public:
machine_mode ret_mode (void) const;
rtx use_exact_insn (insn_code);
+ int prepare_contiguous_load_insn ();
rtx use_contiguous_load_insn (insn_code);
+ rtx use_fof_load_insn ();
rtx use_contiguous_store_insn (insn_code);
rtx use_compare_insn (rtx_code, insn_code);
rtx use_ternop_insn (bool, insn_code);
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c
index 76adbed3f61..1757989856c 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122656-1.c
@@ -4,4 +4,4 @@
#include "riscv_vector.h"
int a;
long b, c;
-void d() { __riscv_vlseg2e32ff_v_i32mf2x2(&a, &c, b); } /* { dg-error "invalid
argument to built-in function" } */
+void d() { __riscv_vlseg2e32ff_v_i32mf2x2(&a, &c, b); }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c
new file mode 100644
index 00000000000..e00ac04bebb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr122869.c
@@ -0,0 +1,22 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O0 -std=gnu99" } */
+/* We used to generate a separate riscv_read_vl () after the FoF load.
+ In case of -O0 (or otherwise) it could happen that "g" wouldn't
+ get a hard reg and we'd need to store it, clobbering VL.
+ This leads to an infinite loop or a segfault. */
+
+#include <riscv_vector.h>
+
+uint8_t a[1];
+int16_t b[1];
+
+int main ()
+{
+ for (size_t c = 0, avl = 1; avl > 0;)
+ {
+ size_t d = avl;
+ vint16mf2_t g = __riscv_vle16ff_v_i16mf2 (&b[c], &d, d);
+ avl -= d;
+ c += d; // Segmentation fault
+ }
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c
index b4f7cc4431e..732c70ecdf2 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/ffload-3.c
@@ -25,4 +25,5 @@ void f (int8_t * restrict in, int8_t * restrict out, int n,
int m, int cond)
}
}
-/* { dg-final { scan-assembler-times
{vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target {
no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } } } } */
+/* -O2 and -O3 fail now, see PR target/122869. */
+/* { dg-final { scan-assembler-times
{vsetvli\s+zero,\s*[a-x0-9]+,\s*e8,\s*mf8,\s*t[au],\s*m[au]} 2 { target {
no-opts "-O0" no-opts "-g" no-opts "-funroll-loops" } xfail { any-opts "-O2"
"-O3" } } } } */
--
2.52.0