On 01/12/2025 13:07, Christophe Lyon wrote:
On Mon, 1 Dec 2025 at 11:24, Richard Earnshaw <[email protected]> wrote:

On 01/12/2025 07:47, Christophe Lyon wrote:
The second alternative for operand 1 needs a new constraint which does
not overlap with Pg, except that it can handle 32 to generate an
optimal mov.

This patch introduces a new Ph constraint which has the [32..255]
range.

This fixes a lot of regressions when running the testsuite for an MVE
target such as -march=armv8.1-m.main+mve.fp+fp.dp -mfloat-abi=hard.

gcc/ChangeLog:

       PR target/122858
       * config/arm/constraints.md (Ph): New constraint.
       * config/arm/mve.md (mve_asrl_imm, mve_lsll_imm): Fix constraints
       of operand 1 and handle 32 as special shift amount.

gcc/testsuite/ChangeLog:

       PR target/122858
       * gcc.target/arm/mve/pr122858.c: New test.
---
   gcc/config/arm/constraints.md               |  9 +++-
   gcc/config/arm/mve.md                       | 57 +++++++++++++--------
   gcc/testsuite/gcc.target/arm/mve/pr122858.c | 40 +++++++++++++++
   3 files changed, 82 insertions(+), 24 deletions(-)
   create mode 100644 gcc/testsuite/gcc.target/arm/mve/pr122858.c

diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index 86a9e97f514..e3809d31ba9 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -35,8 +35,8 @@
   ;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, DN, Dm, Dl, DL, Do, Dv, Dy, Di,
   ;;                   Dj, Ds, Dt, Dp, Dz, Tu, Te
   ;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
-;; in Thumb-2 state: Ha, Pg, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Ra, Rb,
-;;                Rd, Rf, Rg, Ri
+;; in Thumb-2 state: Ha, Pg, Ph, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Ra,
+;;                Rb, Rd, Rf, Rg, Ri

   ;; The following memory constraints have been used:
   ;; in ARM/Thumb-2 state: Uh, Ut, Uv, Un, Um, Us, Uo, Up, Uf, Ux, Ul, Uz
@@ -237,6 +237,11 @@ (define_constraint "Pg"
     (and (match_code "const_int")
          (match_test "TARGET_THUMB2 && ival >= 1 && ival <= 32")))

+(define_constraint "Ph"
+  "@internal In Thumb-2 state a constant in range 32 to 255"
+  (and (match_code "const_int")
+       (match_test "TARGET_THUMB2 && ival >= 32 && ival <= 255")))
+
   (define_constraint "Ps"
     "@internal In Thumb-2 state a constant in the range -255 to +255"
     (and (match_code "const_int")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b29c5f1bcd5..cba07fd991e 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -4762,7 +4762,7 @@ (define_expand "mve_asrl"
   (define_insn_and_split "mve_asrl_imm"
     [(set (match_operand:DI 0 "arm_general_register_operand" "=r,r")
         (ashiftrt:DI (match_operand:DI 1 "arm_general_register_operand" "0,r")
-                    (match_operand:QI 2 "immediate_operand" "Pg,I")))]
+                    (match_operand:QI 2 "immediate_operand" "Pg,Ph")))]
     "TARGET_HAVE_MVE"
     "asrl%?\\t%Q0, %R1, %2"
     "&& !satisfies_constraint_Pg (operands[2])"

As Andre pointed out offline, this should be satisfies_constraint_Ph
(here and below), OK with that change?

Yes, but see below on the testcase.



@@ -4781,30 +4781,36 @@ (define_insn_and_split "mve_asrl_imm"
       }

     /* ival < 0 should have already been handled by mve_asrl. */
-  gcc_assert (ival > 32);
+  gcc_assert (ival >= 32);

-  /* Shift amount above immediate range (ival > 32).
-     out_hi gets the sign bit
-     out_lo gets in_hi << (ival - 32) or << 31 if ival >= 64.
-     If ival >= 64, the result is either 0 or -1, depending on the
-     input sign.  */
     rtx in_hi = gen_highpart (SImode, operands[1]);
     rtx out_lo = gen_lowpart (SImode, operands[0]);
     rtx out_hi = gen_highpart (SImode, operands[0]);

-  emit_insn (gen_rtx_SET (out_lo,
-                       gen_rtx_fmt_ee (ASHIFTRT,
-                                       SImode,
-                                       in_hi,
-                                       GEN_INT (MIN (ival - 32,
-                                                     31)))));
+  if (ival == 32)
+    /* out_hi gets the sign bit
+       out_lo gets in_hi.  */
+    emit_insn (gen_movsi (out_lo, in_hi));
+  else
+    /* Shift amount above immediate range (ival > 32).
+       out_hi gets the sign bit
+       out_lo gets in_hi << (ival - 32) or << 31 if ival >= 64.
+       If ival >= 64, the result is either 0 or -1, depending on the
+       input sign.  */
+    emit_insn (gen_rtx_SET (out_lo,
+                         gen_rtx_fmt_ee (ASHIFTRT,
+                                         SImode,
+                                         in_hi,
+                                         GEN_INT (MIN (ival - 32,
+                                                       31)))));
+
     /* Copy sign bit, which is OK even if out_lo == in_hi.  */
     emit_insn (gen_rtx_SET (out_hi,
                         gen_rtx_fmt_ee (ASHIFTRT,
                                         SImode,
                                         in_hi,
                                         GEN_INT (31))));
-     DONE;
+  DONE;
     "
     [(set_attr "predicable" "yes,yes")
      (set_attr "length" "4,8")])
@@ -4852,7 +4858,7 @@ (define_expand "mve_lsll"
   (define_insn_and_split "mve_lsll_imm"
     [(set (match_operand:DI 0 "arm_general_register_operand" "=r,r")
         (ashift:DI (match_operand:DI 1 "arm_general_register_operand" "0,r")
-                  (match_operand:QI 2 "immediate_operand" "Pg,I")))]
+                  (match_operand:QI 2 "immediate_operand" "Pg,Ph")))]
     "TARGET_HAVE_MVE"
     "lsll%?\\t%Q0, %R1, %2"
     "&& !satisfies_constraint_Pg (operands[2])"
@@ -4878,17 +4884,24 @@ (define_insn_and_split "mve_lsll_imm"
       }

     /* ival < 0 should have already been handled by mve_asrl. */
-  gcc_assert (ival > 32);
+  gcc_assert (ival >= 32);

-  /* Shift amount above immediate range: 32 < ival < 64.  */
     rtx in_lo = gen_lowpart (SImode, operands[1]);
     rtx out_lo = gen_lowpart (SImode, operands[0]);
     rtx out_hi = gen_highpart (SImode, operands[0]);
-  emit_insn (gen_rtx_SET (out_hi,
-                       gen_rtx_fmt_ee (ASHIFT,
-                                       SImode,
-                                       in_lo,
-                                       GEN_INT (ival - 32))));
+
+  if (ival == 32)
+    /* Shift by 32 is just a move.  */
+    emit_insn (gen_movsi (out_hi, in_lo));
+  else
+    /* Shift amount above immediate range: 32 < ival < 64.  */
+    emit_insn (gen_rtx_SET (out_hi,
+                         gen_rtx_fmt_ee (ASHIFT,
+                                         SImode,
+                                         in_lo,
+                                         GEN_INT (ival - 32))));
+
+  /* Clear low 32 bits.  */
     emit_insn (gen_rtx_SET (out_lo, const0_rtx));
     DONE;
     "
diff --git a/gcc/testsuite/gcc.target/arm/mve/pr122858.c 
b/gcc/testsuite/gcc.target/arm/mve/pr122858.c
new file mode 100644
index 00000000000..d77d395d154
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/pr122858.c
@@ -0,0 +1,40 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+extern void abort (void);
+
+__attribute__ ((noipa))
+long long ashrl_fn (long long a)
+{
+  long long c;
+
+  c = a >> 31;
+  c += a;
+  return c;
+}
+
+__attribute__ ((noipa))
+long long ashll_fn (long long a)
+{
+  long long c;
+
+  c = a << 33;
+  c += a;
+  return c;
+}
+
+int main(void)
+{
+  long long var1 = 1;
+  long long var2 = ashll_fn (var1);
+  if (var2 != 0x200000001)
+    abort ();
+
+  var2 = ashrl_fn (var2);
+  if (var2 != 0x200000005)
+    abort ();
+
+  return 0;
+}

The patch itself is OK, but given PR122871, I think we need a more solid
testcase to check for regressions - once that is fixed I really hope we
will get nowhere near the above test demonstrating a potential regression.

Not sure to understand: do you want me to add testcase for PR122871
with this patch? Or an additional patch with just a new testcase for
it?
Not sure what to test... the presence of a single lsll? But that would
make the testcase target-dependent.


The optimal thumb2 code for ashll_fn is

  add r1, r1, r0, lsl #1
  bx lr

so what I'm saying is that this testcase is only generating shifts of less than 32 because of PR122871 (it currently expands to 3 shifts of 11 instead of 1 shift of 33). Once that is fixed, this test should go back to generating something very similar to the optimal sequence without any chance of proving that a regression in the ashll late split is not happening.

We need a test that shows that ashll is not being incorrectly assigned to alternative 1 when the shift count is less than 32 that doesn't rely on the sub-optimal expansion of the multiply.

For example, I think something like

long long ashll_fn(long long a /* unused */, long long x)
{
  return x << 7;
}

should do that, and shouldn't be subject to the vagaries of the multiply expansion code.

---

Oh, and one more thing that I noticed just now.  We currently have:

"lsll%?\\t%Q0, %R1, %2"

But it would be more conventional to write

"lsll%?\\t%Q0, %R0, %2"

since you're referring to the upper and lower halves of the destination register, not a mix of the source and destination regs.


R.

Thanks,

Christophe



R.

Reply via email to