On 01/12/2025 13:07, Christophe Lyon wrote:
On Mon, 1 Dec 2025 at 11:24, Richard Earnshaw <[email protected]> wrote:On 01/12/2025 07:47, Christophe Lyon wrote:The second alternative for operand 1 needs a new constraint which does not overlap with Pg, except that it can handle 32 to generate an optimal mov. This patch introduces a new Ph constraint which has the [32..255] range. This fixes a lot of regressions when running the testsuite for an MVE target such as -march=armv8.1-m.main+mve.fp+fp.dp -mfloat-abi=hard. gcc/ChangeLog: PR target/122858 * config/arm/constraints.md (Ph): New constraint. * config/arm/mve.md (mve_asrl_imm, mve_lsll_imm): Fix constraints of operand 1 and handle 32 as special shift amount. gcc/testsuite/ChangeLog: PR target/122858 * gcc.target/arm/mve/pr122858.c: New test. --- gcc/config/arm/constraints.md | 9 +++- gcc/config/arm/mve.md | 57 +++++++++++++-------- gcc/testsuite/gcc.target/arm/mve/pr122858.c | 40 +++++++++++++++ 3 files changed, 82 insertions(+), 24 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/mve/pr122858.c diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md index 86a9e97f514..e3809d31ba9 100644 --- a/gcc/config/arm/constraints.md +++ b/gcc/config/arm/constraints.md @@ -35,8 +35,8 @@ ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, DN, Dm, Dl, DL, Do, Dv, Dy, Di, ;; Dj, Ds, Dt, Dp, Dz, Tu, Te ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe -;; in Thumb-2 state: Ha, Pg, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Ra, Rb, -;; Rd, Rf, Rg, Ri +;; in Thumb-2 state: Ha, Pg, Ph, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Ra, +;; Rb, Rd, Rf, Rg, Ri ;; The following memory constraints have been used: ;; in ARM/Thumb-2 state: Uh, Ut, Uv, Un, Um, Us, Uo, Up, Uf, Ux, Ul, Uz @@ -237,6 +237,11 @@ (define_constraint "Pg" (and (match_code "const_int") (match_test "TARGET_THUMB2 && ival >= 1 && ival <= 32"))) +(define_constraint "Ph" + "@internal In Thumb-2 state a constant in range 32 to 255" + (and (match_code "const_int") + (match_test "TARGET_THUMB2 && ival >= 32 && ival <= 255"))) + (define_constraint "Ps" "@internal In Thumb-2 state a constant in the range -255 to +255" (and (match_code "const_int") diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index b29c5f1bcd5..cba07fd991e 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -4762,7 +4762,7 @@ (define_expand "mve_asrl" (define_insn_and_split "mve_asrl_imm" [(set (match_operand:DI 0 "arm_general_register_operand" "=r,r") (ashiftrt:DI (match_operand:DI 1 "arm_general_register_operand" "0,r") - (match_operand:QI 2 "immediate_operand" "Pg,I")))] + (match_operand:QI 2 "immediate_operand" "Pg,Ph")))] "TARGET_HAVE_MVE" "asrl%?\\t%Q0, %R1, %2" "&& !satisfies_constraint_Pg (operands[2])"As Andre pointed out offline, this should be satisfies_constraint_Ph (here and below), OK with that change?
Yes, but see below on the testcase.
@@ -4781,30 +4781,36 @@ (define_insn_and_split "mve_asrl_imm" } /* ival < 0 should have already been handled by mve_asrl. */ - gcc_assert (ival > 32); + gcc_assert (ival >= 32); - /* Shift amount above immediate range (ival > 32). - out_hi gets the sign bit - out_lo gets in_hi << (ival - 32) or << 31 if ival >= 64. - If ival >= 64, the result is either 0 or -1, depending on the - input sign. */ rtx in_hi = gen_highpart (SImode, operands[1]); rtx out_lo = gen_lowpart (SImode, operands[0]); rtx out_hi = gen_highpart (SImode, operands[0]); - emit_insn (gen_rtx_SET (out_lo, - gen_rtx_fmt_ee (ASHIFTRT, - SImode, - in_hi, - GEN_INT (MIN (ival - 32, - 31))))); + if (ival == 32) + /* out_hi gets the sign bit + out_lo gets in_hi. */ + emit_insn (gen_movsi (out_lo, in_hi)); + else + /* Shift amount above immediate range (ival > 32). + out_hi gets the sign bit + out_lo gets in_hi << (ival - 32) or << 31 if ival >= 64. + If ival >= 64, the result is either 0 or -1, depending on the + input sign. */ + emit_insn (gen_rtx_SET (out_lo, + gen_rtx_fmt_ee (ASHIFTRT, + SImode, + in_hi, + GEN_INT (MIN (ival - 32, + 31))))); + /* Copy sign bit, which is OK even if out_lo == in_hi. */ emit_insn (gen_rtx_SET (out_hi, gen_rtx_fmt_ee (ASHIFTRT, SImode, in_hi, GEN_INT (31)))); - DONE; + DONE; " [(set_attr "predicable" "yes,yes") (set_attr "length" "4,8")]) @@ -4852,7 +4858,7 @@ (define_expand "mve_lsll" (define_insn_and_split "mve_lsll_imm" [(set (match_operand:DI 0 "arm_general_register_operand" "=r,r") (ashift:DI (match_operand:DI 1 "arm_general_register_operand" "0,r") - (match_operand:QI 2 "immediate_operand" "Pg,I")))] + (match_operand:QI 2 "immediate_operand" "Pg,Ph")))] "TARGET_HAVE_MVE" "lsll%?\\t%Q0, %R1, %2" "&& !satisfies_constraint_Pg (operands[2])" @@ -4878,17 +4884,24 @@ (define_insn_and_split "mve_lsll_imm" } /* ival < 0 should have already been handled by mve_asrl. */ - gcc_assert (ival > 32); + gcc_assert (ival >= 32); - /* Shift amount above immediate range: 32 < ival < 64. */ rtx in_lo = gen_lowpart (SImode, operands[1]); rtx out_lo = gen_lowpart (SImode, operands[0]); rtx out_hi = gen_highpart (SImode, operands[0]); - emit_insn (gen_rtx_SET (out_hi, - gen_rtx_fmt_ee (ASHIFT, - SImode, - in_lo, - GEN_INT (ival - 32)))); + + if (ival == 32) + /* Shift by 32 is just a move. */ + emit_insn (gen_movsi (out_hi, in_lo)); + else + /* Shift amount above immediate range: 32 < ival < 64. */ + emit_insn (gen_rtx_SET (out_hi, + gen_rtx_fmt_ee (ASHIFT, + SImode, + in_lo, + GEN_INT (ival - 32)))); + + /* Clear low 32 bits. */ emit_insn (gen_rtx_SET (out_lo, const0_rtx)); DONE; " diff --git a/gcc/testsuite/gcc.target/arm/mve/pr122858.c b/gcc/testsuite/gcc.target/arm/mve/pr122858.c new file mode 100644 index 00000000000..d77d395d154 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/mve/pr122858.c @@ -0,0 +1,40 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_mve_hw } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_v8_1m_mve } */ + +extern void abort (void); + +__attribute__ ((noipa)) +long long ashrl_fn (long long a) +{ + long long c; + + c = a >> 31; + c += a; + return c; +} + +__attribute__ ((noipa)) +long long ashll_fn (long long a) +{ + long long c; + + c = a << 33; + c += a; + return c; +} + +int main(void) +{ + long long var1 = 1; + long long var2 = ashll_fn (var1); + if (var2 != 0x200000001) + abort (); + + var2 = ashrl_fn (var2); + if (var2 != 0x200000005) + abort (); + + return 0; +}The patch itself is OK, but given PR122871, I think we need a more solid testcase to check for regressions - once that is fixed I really hope we will get nowhere near the above test demonstrating a potential regression.Not sure to understand: do you want me to add testcase for PR122871 with this patch? Or an additional patch with just a new testcase for it? Not sure what to test... the presence of a single lsll? But that would make the testcase target-dependent.
The optimal thumb2 code for ashll_fn is add r1, r1, r0, lsl #1 bx lrso what I'm saying is that this testcase is only generating shifts of less than 32 because of PR122871 (it currently expands to 3 shifts of 11 instead of 1 shift of 33). Once that is fixed, this test should go back to generating something very similar to the optimal sequence without any chance of proving that a regression in the ashll late split is not happening.
We need a test that shows that ashll is not being incorrectly assigned to alternative 1 when the shift count is less than 32 that doesn't rely on the sub-optimal expansion of the multiply.
For example, I think something like
long long ashll_fn(long long a /* unused */, long long x)
{
return x << 7;
}
should do that, and shouldn't be subject to the vagaries of the multiply
expansion code.
--- Oh, and one more thing that I noticed just now. We currently have: "lsll%?\\t%Q0, %R1, %2" But it would be more conventional to write "lsll%?\\t%Q0, %R0, %2"since you're referring to the upper and lower halves of the destination register, not a mix of the source and destination regs.
R.
Thanks, ChristopheR.
