[to-be-committed][RISC-V][PR 121136] Improve various tests which only need to examine upper bits in a GPR

Jeff Law Tue, 04 Nov 2025 15:13:00 -0800

This addresses the first level issues seen in generating betterperforming code for testcases derived from pr121136. It likelyregresses code size in some cases as in many cases it selects codesequences that should be better performing, though larger to encode.

Improving -Os code generation should remain the primary focus ofpr121136. Any improvements in code size with this change are a niceside effect, but not the primary goal.



--





Let's take this test (derived from the PR):

_Bool func1_0x1U (unsigned int x) { return x <= 0x1U; }

_Bool func2_0x1U (unsigned int x) { return ((x >> __builtin_ctz (0x1U +1U)) == 0); }


_Bool func3_0x1U (unsigned int x) { return ((x / (0x1U + 1U)) == 0); }

Those should produce the same output. We currently get these fragmentsfor the 3 cases. In particular note how the second variant is a twoinstruction sequence.


        sltiu   a0,a0,2

        srliw   a0,a0,1
        seqz    a0,a0

        sltiu   a0,a0,2

This patch will adjust that second sequence to match the first and thirdand is optimal.

Let's take another case. This is interesting as it's right at thesimm12 border:



_Bool func1_0x7ffU (unsigned long x) { return x <= 0x7ffU; }

_Bool func2_0x7ffU (unsigned long x) { return ((x >> __builtin_ctzl(0x7ffU + 1UL)) == 0); }


_Bool func3_0x7ffU (unsigned long x) { return ((x / (0x7ffU + 1UL)) == 0); }

We get:

        li      a5,2047
        sltu    a0,a5,a0
        seqz    a0,a0

        srli    a0,a0,11
        seqz    a0,a0

        li      a5,2047
        sltu    a0,a5,a0
        seqz    a0,a0

In this case the second sequence is pretty good. Not perfect, butclearly better than the other two. This patch will fix the code forcase #1 and case #3 to match case #2.

So anyway, that's the basic motivation here. So to be 100% clear, whilethe bug is focused on code size, I'm focused on the performance of theresulting code.

This has been tested on riscv32-elf and riscv64-elf. It's alsobootstrapped and regression tested on the Pioneer. The BPI won't haveresults for this patch until late tomorrow.


Waiting on pre-commit CI before moving forward

Jeff

        PR rtl-optimization/121136
gcc/
        * config/riscv/riscv.md: Add define_insn to test the
        upper bits of a register against zero using sltiu when
        the bits are extracted via zero_extract or logial right shift.
        Add 3->2 define_splits for gtu/leu cases testing upper bits
        against zero.

gcc/testsuite
        * gcc.target/riscv/pr121136.c: New test.

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 640ca5f9b0ea..a09079cb8b1c 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -3752,6 +3752,57 @@ (define_insn "*sle<u>_<X:mode><GPR:mode>"
   [(set_attr "type" "slt")
    (set_attr "mode" "<X:MODE>")])
 
+;; We can sometimes do better for unsigned comparisons against
+;; values where there's a run of 1s in the LSBs.
+;;
+(define_split
+  [(set (match_operand:X 0 "register_operand")
+       (gtu:X (match_operand:X 1 "register_operand")
+              (match_operand 2 "const_int_operand")))
+   (clobber (match_operand:X 3 "register_operand"))]
+  "exact_log2 (INTVAL (operands[2]) + 1) >= 0"
+  [(set (match_dup 3) (lshiftrt:X (match_dup 1) (match_dup 2)))
+   (set (match_dup 0) (ne:X (match_dup 3) (const_int 0)))]
+{ operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2]) + 1)); })
+
+(define_split
+  [(set (match_operand:X 0 "register_operand")
+       (leu:X (match_operand:X 1 "register_operand")
+              (match_operand 2 "const_int_operand")))
+   (clobber (match_operand:X 3 "register_operand"))]
+  "exact_log2 (INTVAL (operands[2]) + 1) >= 0"
+  [(set (match_dup 3) (lshiftrt:X (match_dup 1) (match_dup 2)))
+   (set (match_dup 0) (eq:X (match_dup 3) (const_int 0)))]
+{ operands[2] = GEN_INT (exact_log2 (INTVAL (operands[2]) + 1)); })
+
+;; Alternate forms that are ultimately just sltiu
+(define_insn ""
+  [(set (match_operand:X 0 "register_operand" "=r")
+       (eq:X (zero_extract:X (match_operand:X 1 "register_operand" "r")
+                             (match_operand 2 "const_int_operand")
+                             (match_operand 3 "const_int_operand"))
+             (const_int 0)))]
+  "(INTVAL (operands[3]) < 11
+    && INTVAL (operands[2]) + INTVAL (operands[3]) == BITS_PER_WORD)"
+{
+  operands[2] = GEN_INT (HOST_WIDE_INT_1U << INTVAL (operands[3]));
+  return "sltiu\t%0,%1,%2";
+}
+  [(set_attr "type" "slt")
+   (set_attr "mode" "<X:MODE>")])
+
+(define_insn ""
+  [(set (match_operand:X 0 "register_operand" "=r")
+       (eq:X (lshiftrt:X (match_operand:X 1 "register_operand" "r")
+                         (match_operand 2 "const_int_operand"))
+             (const_int 0)))]
+  "INTVAL (operands[2]) < 11"
+{
+  operands[2] = GEN_INT (HOST_WIDE_INT_1U << INTVAL (operands[2]));
+  return "sltiu\t%0,%1,%2";
+}
+  [(set_attr "type" "slt")
+   (set_attr "mode" "<X:MODE>")])
 ;;
 ;;  ....................
 ;;
diff --git a/gcc/testsuite/gcc.target/riscv/pr121136.c 
b/gcc/testsuite/gcc.target/riscv/pr121136.c
new file mode 100644
index 000000000000..77585f42cbf4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr121136.c
@@ -0,0 +1,103 @@
+/* { dg-do compile } */
+/* { dg-options "-std=gnu23 -O2 -march=rv64gc -mabi=lp64d" { target rv64} } */
+/* { dg-options "-std=gnu23 -O2 -march=rv32gc -mabi=ilp32" { target rv32} } */
+
+/* We need to adjust the constant so this works for rv32 and rv64.  */
+#if __riscv_xlen == 32
+#define ONE 1U
+#define TYPE unsigned int
+#define CTZ __builtin_ctz
+#else
+#define ONE 1UL
+#define TYPE unsigned long
+#define CTZ __builtin_ctzl
+#endif
+
+#define F1(C) _Bool func1_##C(TYPE x) { return x <= C; }
+#define F2(C) _Bool func2_##C(TYPE x) { return ((x >> CTZ (C+ONE)) == 0); }
+#define F3(C) _Bool func3_##C(TYPE x) { return ((x / (C+ONE)) == 0); }
+
+#define F(C) F1(C) F2(C) F3(C)
+
+F (0x1U)
+F (0x3U)
+F (0x7U)
+F (0xfU)
+F (0x1fU)
+F (0x3fU)
+F (0x7fU)
+F (0xffU)
+F (0x1ffU)
+F (0x3ffU)
+F (0x7ffU)
+F (0xfffU)
+F (0x1fffU)
+F (0x3fffU)
+F (0x7fffU)
+F (0xffffU)
+F (0x1ffffU)
+F (0x3ffffU)
+F (0x7ffffU)
+F (0xfffffU)
+F (0x1fffffU)
+F (0x3fffffU)
+F (0x7fffffU)
+F (0xffffffU)
+F (0x1ffffffU)
+F (0x3ffffffU)
+F (0x7ffffffU)
+F (0xfffffffU)
+F (0x1fffffffU)
+F (0x3fffffffU)
+F (0x7fffffffU)
+#if __riscv_xlen == 64
+F (0xffffffffUL)
+F (0x1ffffffffUL)
+F (0x3ffffffffUL)
+F (0x7ffffffffUL)
+F (0xfffffffffUL)
+F (0x1fffffffffUL)
+F (0x3fffffffffUL)
+F (0x7fffffffffUL)
+F (0xffffffffffUL)
+F (0x1ffffffffffUL)
+F (0x3ffffffffffUL)
+F (0x7ffffffffffUL)
+F (0xfffffffffffUL)
+F (0x1fffffffffffUL)
+F (0x3fffffffffffUL)
+F (0x7fffffffffffUL)
+F (0xffffffffffffUL)
+F (0x1ffffffffffffUL)
+F (0x3ffffffffffffUL)
+F (0x7ffffffffffffUL)
+F (0xfffffffffffffUL)
+F (0x1fffffffffffffUL)
+F (0x3fffffffffffffUL)
+F (0x7fffffffffffffUL)
+F (0xffffffffffffffUL)
+F (0x1ffffffffffffffUL)
+F (0x3ffffffffffffffUL)
+F (0x7ffffffffffffffUL)
+F (0xfffffffffffffffUL)
+F (0x1fffffffffffffffUL)
+F (0x3fffffffffffffffUL)
+F (0x7fffffffffffffffUL)
+#endif
+
+/* These represent current state.  They are not optimal as some of the cases
+   where we shift are better implemented by loading 2^n constant and using
+   sltu as the lui has no incoming data dependencies.  */
+/* { dg-final { scan-assembler-times "\tsltiu" 30 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "\tnot" 3 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "\tsrli" 121 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "\tsltu" 38 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "\tseqz" 118 { target { rv64 } } } } */
+/* { dg-final { scan-assembler-times "\tli" 38 { target { rv64 } } } } */
+
+/* { dg-final { scan-assembler-times "\tsltiu" 30 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "\tnot" 3 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "\tsrli" 25 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "\tsltu" 38 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "\tseqz" 22 { target { rv32 } } } } */
+/* { dg-final { scan-assembler-times "\tli" 38 { target { rv32 } } } } */

[to-be-committed][RISC-V][PR 121136] Improve various tests which only need to examine upper bits in a GPR

Reply via email to