https://gcc.gnu.org/g:a2f4be3dae04fa8606d1cc8451f0b9d450f7e6e6

commit r14-10014-ga2f4be3dae04fa8606d1cc8451f0b9d450f7e6e6
Author: Tamar Christina <tamar.christ...@arm.com>
Date:   Thu Apr 18 11:47:42 2024 +0100

    AArch64: remove reliance on register allocator for simd/gpreg costing [PR114741]
    
    In PR114741 we see a regression in codegen when SVE is enabled, where the
    simple testcase:
    
    void foo(unsigned v, unsigned *p)
    {
        *p = v & 1;
    }
    
    generates
    
    foo:
            fmov    s31, w0
            and     z31.s, z31.s, #1
            str     s31, [x1]
            ret
    
    instead of:
    
    foo:
            and     w0, w0, 1
            str     w0, [x1]
            ret
    
    This impacts not just code size but also performance.  It is caused by the
    use of the ^ constraint modifier in the pattern <optab><mode>3.
    
    The documentation states that this modifier should only affect alternative
    costing, in that a particular alternative is to be preferred unless a
    non-pseudo reload is needed.
    
    The pattern was trying to convey that whenever both r and w are possible,
    it should prefer r unless a reload is needed.  This is because if a reload
    is needed we can construct the constants more flexibly on the SIMD side.
    
    We were using this to simplify the implementation and to get generic cases
    such as:
    
    double negabs (double x)
    {
       unsigned long long y;
       memcpy (&y, &x, sizeof(double));
       y = y | (1UL << 63);
       memcpy (&x, &y, sizeof(double));
       return x;
    }
    
    which don't go through an expander.
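
    For contrast, a direct call to copysign does go through the
    copysign<mode>3 expander.  A minimal sketch (mine, not part of the
    patch) of such a case:

    #include <math.h>

    /* Equivalent to the memcpy/OR idiom above: copysign (x, -1.0) forces
       the sign bit on, so the expander can emit the SIMD orr directly.  */
    double negabs_via_copysign (double x)
    {
       return copysign (x, -1.0);
    }

    Because this form reaches the expander, the backend can handle it
    explicitly without relying on the ^ hint.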
    However, the implementation of ^ in the register allocator does not match
    the documentation: it also has an effect during coloring.  During initial
    register class selection it applies a penalty to a class, similar to how
    ? does.
    
    In this example the penalty makes the use of GP regs expensive enough that
    the allocator no longer considers them:
    
        r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS
    ;;        3--> b  0: i   9 r106=r105&0x1
        :cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0)PR_HI_REGS+0(0):model 4
    
    which is not the expected behavior.  For GCC 14 this is a conservative fix.
    
    1. We remove the ^ modifier from the logical optabs.
    
    2. In order not to regress copysign, we then move the copysign expansion
       to directly use the SIMD variant.  Since copysign only supports
       floating-point modes this is fine, and it no longer relies on the
       register allocator to select the right alternative.
    
    This once again regresses the general case, but that case wasn't optimized
    in earlier GCCs either, so it's not a regression in GCC 14.  This change
    gives strictly better codegen than earlier GCCs and still optimizes the
    important cases.
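
    As an illustration of a case that stays optimized, here is a sketch
    (my example, not from the testsuite) of the variable-sign form, which
    falls through to the gen_copysign<mode>3_insn path in the expander
    below, with the sign-bit mask forced into a register:

    #include <math.h>

    /* Variable second operand: the sign is not a compile-time constant,
       so the expander materialises the sign-bit mask in a register and
       uses the copysign<mode>3_insn pattern.  */
    double apply_sign (double mag, double sgn)
    {
       return copysign (mag, sgn);
    }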
    
    gcc/ChangeLog:
    
            PR target/114741
            * config/aarch64/aarch64.md (<optab><mode>3): Remove ^ from alt 2.
            (copysign<GPF:mode>3): Use SIMD version of IOR directly.
    
    gcc/testsuite/ChangeLog:
    
            PR target/114741
            * gcc.target/aarch64/fneg-abs_2.c: Update codegen.
            * gcc.target/aarch64/fneg-abs_4.c: xfail for now.
            * gcc.target/aarch64/pr114741.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.md                 | 23 +++++++++++++--------
 gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c |  5 ++---
 gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c |  4 ++--
 gcc/testsuite/gcc.target/aarch64/pr114741.c   | 29 +++++++++++++++++++++++++++
 4 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 385a669b9b3..dbde066f747 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4811,7 +4811,7 @@
   ""
   {@ [ cons: =0 , 1  , 2        ; attrs: type , arch  ]
     [ r        , %r , r        ; logic_reg   , *     ] <logical>\t%<w>0, %<w>1, %<w>2
-     [ rk       , ^r , <lconst> ; logic_imm   , *     ] <logical>\t%<w>0, %<w>1, %2
+     [ rk       , r  , <lconst> ; logic_imm   , *     ] <logical>\t%<w>0, %<w>1, %2
     [ w        , 0  , <lconst> ; *           , sve   ] <logical>\t%Z0.<s>, %Z0.<s>, #%2
     [ w        , w  , w        ; neon_logic  , simd  ] <logical>\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
   }
@@ -7192,22 +7192,29 @@
    (match_operand:GPF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  machine_mode int_mode = <V_INT_EQUIV>mode;
-  rtx bitmask = gen_reg_rtx (int_mode);
-  emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
-                                   << (GET_MODE_BITSIZE (<MODE>mode) - 1)));
+  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
+                              << (GET_MODE_BITSIZE (<MODE>mode) - 1));
   /* copysign (x, -1) should instead be expanded as orr with the sign
      bit.  */
   rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
   if (GET_CODE (op2_elt) == CONST_DOUBLE
       && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
     {
-      emit_insn (gen_ior<v_int_equiv>3 (
-       lowpart_subreg (int_mode, operands[0], <MODE>mode),
-       lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
+      rtx v_bitmask
+       = force_reg (V2<V_INT_EQUIV>mode,
+                    gen_const_vec_duplicate (V2<V_INT_EQUIV>mode,
+                                             signbit_const));
+
+      emit_insn (gen_iorv2<v_int_equiv>3 (
+       lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode),
+       lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode),
+       v_bitmask));
       DONE;
     }
 
+  machine_mode int_mode = <V_INT_EQUIV>mode;
+  rtx bitmask = gen_reg_rtx (int_mode);
+  emit_move_insn (bitmask, signbit_const);
   operands[2] = force_reg (<MODE>mode, operands[2]);
   emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
                                       bitmask));
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
index eed41ea18e6..18d10ee834d 100644
--- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
@@ -9,8 +9,7 @@
 
 /*
 ** f1:
-**     movi    v[0-9]+.2s, 0x80, lsl 24
-**     orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+**     orr     v[0-9]+.2s, #?128, lsl #?24
 **     ret
 */
 float32_t f1 (float32_t a)
@@ -22,7 +21,7 @@ float32_t f1 (float32_t a)
 ** f2:
 **     movi    v[0-9]+.4s, #?0
 **     fneg    v[0-9]+.2d, v[0-9]+.2d
-**     orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+**     orr     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
 **     ret
 */
 float64_t f2 (float64_t a)
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
index d45c3d1210c..6da95a4b052 100644
--- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
@@ -7,7 +7,7 @@
 #include <string.h>
 
 /*
-** negabs:
+** negabs: { xfail *-*-* }
 **     movi    v31.4s, #?0
 **     fneg    v[0-9]+.2d, v[0-9]+.2d
 **     orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
@@ -23,7 +23,7 @@ double negabs (double x)
 }
 
 /*
-** negabsf:
+** negabsf: { xfail *-*-* }
 **     movi    v[0-9]+.2s, 0x80, lsl 24
 **     orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
 **     ret
diff --git a/gcc/testsuite/gcc.target/aarch64/pr114741.c b/gcc/testsuite/gcc.target/aarch64/pr114741.c
new file mode 100644
index 00000000000..fd2182da0e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr114741.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+/*
+** foo1:
+**     and     w0, w0, 1
+**     str     w0, \[x1\]
+**     ret
+*/
+void foo1(unsigned v, unsigned *p)
+{
+    *p = v & 1;
+}
+
+#pragma GCC target "+sve"
+
+/*
+** foo2:
+**     and     w0, w0, 1
+**     str     w0, \[x1\]
+**     ret
+*/
+void foo2(unsigned v, unsigned *p)
+{
+    *p = v & 1;
+}
