On 2/24/25 16:07, Edwin Lu wrote:
> See [1] thread for original patch which spawned this one.
>
> We are currently seeing the following code where we perform a vsetvl
> before a branching instruction against the avl.
>
> vsetvli a5,a1,e32,m1,tu,ma
> vle32.v v2,0(a0)
> sub a1,a1,a5 <-- a1 potentially set to 0
> sh2add a0,a5,a0
> vfmacc.vv v1,v2,v2
> vsetvli a5,a1,e32,m1,tu,ma <-- incompatible vinfo. update vl to 0
> beq a1,zero,.L12 <-- check if avl is 0
>
> Since we are branching off of the avl, we don't need to update vl until
> after the branch is taken. Search the ready queue for vsetvls scheduled
> before branching instructions that branch off of the same regno and
> promote the branches to execute first. This can improve performance by
> potentially avoiding setting VL=0 which may be expensive on some uarches.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675622.html
>
> PR/117974
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc (vsetvl_avl_regno): New helper function.
> (insn_increases_zeroness_p): Ditto.
> (riscv_promote_ready): Ditto.
> (riscv_sched_reorder): Implement hook.
> (TARGET_SCHED_REORDER): Define Hook.
> * config/riscv/riscv.opt: New flag.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/vsetvl/pr117974.c: New test.
>
> Signed-off-by: Edwin Lu <[email protected]>
> Co-authored-by: Palmer Dabbelt <[email protected]>
> ---
> gcc/config/riscv/riscv.cc | 103 ++++++++++++++++++
> gcc/config/riscv/riscv.opt | 4 +
> .../gcc.target/riscv/rvv/vsetvl/pr117974.c | 15 +++
> 3 files changed, 122 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 89aa25d5da9..cf0866fa3fb 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -14035,6 +14035,106 @@ bool need_shadow_stack_push_pop_p ()
> return is_zicfiss_p () && riscv_save_return_addr_reg_p ();
> }
>
> +static int
> +vsetvl_avl_regno(rtx_insn *insn)
> +{
> + if (recog_memoized (insn) < 0)
> + return -1;
> +
> + if (get_attr_type (insn) != TYPE_VSETVL
> + && get_attr_type (insn) != TYPE_VSETVL_PRE)
> + return -1;
> +
> + extract_insn (insn);
> + /* From vector.md, vsetvl operands are as follows:
> + ;; operands[0]: VL.
> + ;; operands[1]: AVL.
> + ;; operands[2]: SEW
> + ;; operands[3]: LMUL
> + ;; operands[4]: Tail policy 0 or 1 (undisturbed/agnostic)
> + ;; operands[5]: Mask policy 0 or 1 (undisturbed/agnostic)
> + Return regno of avl operand. */
> + return REGNO (recog_data.operand[1]);
> +}
> +
> +static bool
> +insn_increases_zeroness_p(rtx_insn *insn, int regno)
Minor point, but this covers just one case of introducing VL=0 zeroness, the one
specific to a branch, not the general case.
> +{
> + /* Check for branching against zero. */
> + if (JUMP_P (insn))
> + {
> + extract_insn (insn);
> + bool match_reg = false;
> + bool comp_zero = false;
> + for (int i = 0; i < recog_data.n_operands; i++)
> + {
> + if (REG_P (recog_data.operand[i])
> + && REGNO (recog_data.operand[i]) == regno)
> + match_reg = true;
> + if (CONST_INT_P (recog_data.operand[i])
> + && XINT (recog_data.operand[i], 0) == 0
> + && XWINT (recog_data.operand[i], 0) == 0)
> + comp_zero = true;
> + }
> + return match_reg && comp_zero;
> + }
> + return false;
> +}
> +
> +/* Copied from MIPS. Removes the instruction at index LOWER from ready
> + queue READY and reinserts it in front of the instruction at index
> + HIGHER. LOWER must be <= HIGHER. */
> +static void
> +riscv_promote_ready (rtx_insn **ready, int lower, int higher)
> +{
> + rtx_insn *new_head;
> + int i;
> +
> + new_head = ready[lower];
> + for (i = lower; i < higher; i++)
> + ready[i] = ready[i + 1];
> + ready[i] = new_head;
> +}
> +
> +/* Attempt to avoid issuing VSETVL-type instructions before a branch that
> + ensures they are non-zero, as setting VL=0 dynamically can be slow. */
> +static int
> +riscv_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose
> ATTRIBUTE_UNUSED,
> + rtx_insn **ready, int *nreadyp, int cycle ATTRIBUTE_UNUSED)
> +{
> + if (! TARGET_AVOID_VL_EQ_0)
> + return riscv_issue_rate ();
> +
> + for (int i = *nreadyp - 1; i >= 0; i--)
> + {
> + /* Find the vsetvl. */
> + int avl_regno = vsetvl_avl_regno (ready[i]);
> + if (avl_regno == -1 || i == 0)
> + continue;
> + for (int j = i - 1; j >= 0; j--)
> + {
> + /* Exit if another vsetvl is found before finding a branch insn
> + in the ready queue. */
> + if (recog_memoized (ready[j]) >= 0
> + && get_attr_type (ready[j]) == TYPE_VSETVL
> + && get_attr_type (ready[j]) == TYPE_VSETVL_PRE)
> + break;
> + /* Find branch. */
> + if (recog_memoized (ready[j]) >= 0
> + && insn_increases_zeroness_p (ready[j], avl_regno))
> + {
> + /* Right now the only zeroness-increasing pattern we recognize
> + is a branch-not-zero, so there's no sense in looking for any
> + more zeroness at that point. */
> + riscv_promote_ready (ready, j, i);
> + break;
> + }
> + }
> + }
> +
> + return riscv_issue_rate ();
> +}
> +
> /* Initialize the GCC target structure. */
> #undef TARGET_ASM_ALIGNED_HI_OP
> #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> @@ -14430,6 +14530,9 @@ bool need_shadow_stack_push_pop_p ()
> #undef TARGET_DOCUMENTATION_NAME
> #define TARGET_DOCUMENTATION_NAME "RISC-V"
>
> +#undef TARGET_SCHED_REORDER
> +#define TARGET_SCHED_REORDER riscv_sched_reorder
> +
> struct gcc_target targetm = TARGET_INITIALIZER;
>
> #include "gt-riscv.h"
> diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
> index 7515c8ea13d..c6cab61fdc0 100644
> --- a/gcc/config/riscv/riscv.opt
> +++ b/gcc/config/riscv/riscv.opt
> @@ -681,3 +681,7 @@ Specifies whether the fence.tso instruction should be
> used.
> mautovec-segment
> Target Integer Var(riscv_mautovec_segment) Init(1)
> Enable (default) or disable generation of vector segment load/store
> instructions.
> +
> +mavoid-vl0
> +Target Var(TARGET_AVOID_VL_EQ_0) Init(1)
> +Avoid (default) code that dynamically sets VL=0 where possible.
As stated above, this is not the general case of VL=0 avoidance but the
intersection of VL=0 and a branch, so it would be better to phrase it that way.
Also, as others have said in earlier threads, this would be better as a cpu tune,
specifically part of vector tuning, maybe default for OoO tune.
Otherwise looks pretty cool!
-Vineet
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> new file mode 100644
> index 00000000000..275922eb0bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr117974.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -mrvv-vector-bits=zvl
> -Ofast" } */
> +
> +float g(float q[], int N){
> + float dqnorm = 0.0;
> +
> + #pragma GCC unroll 4
> +
> + for (int i=0; i < N; i++) {
> + dqnorm = dqnorm + q[i] * q[i];
> + }
> + return dqnorm;
> +}
> +
> +/* { dg-final { scan-assembler-times {beq\s+[a-x0-9]+,zero,.L12\s+vsetvli} 3
> } } */
> --
> 2.43.0
>