On 18/04/2024 11:11, Tamar Christina wrote:
> Hi All,
> 
> In PR114741 we see a codegen regression when SVE is enabled, where the
> simple testcase:
> 
> void foo(unsigned v, unsigned *p)
> {
>     *p = v & 1;
> }
> 
> generates
> 
> foo:
>         fmov    s31, w0
>         and     z31.s, z31.s, #1
>         str     s31, [x1]
>         ret
> 
> instead of:
> 
> foo:
>         and     w0, w0, 1
>         str     w0, [x1]
>         ret
> 
> This impacts not just code size but also performance.  It is caused by
> the use of the ^ constraint modifier in the pattern <optab><mode>3.
> 
> The documentation states that this modifier should only have an effect
> on alternative costing: a particular alternative is to be preferred
> unless a non-pseudo reload is needed.
> 
> The pattern was trying to convey that whenever both r and w are
> possible, it should prefer r unless a reload is needed, because when a
> reload is needed we can construct the constants more flexibly on the
> SIMD side.
> 
> We were using this to simplify the implementation and to handle generic
> cases such as:
> 
> double negabs (double x)
> {
>    unsigned long long y;
>    memcpy (&y, &x, sizeof(double));
>    y = y | (1UL << 63);
>    memcpy (&x, &y, sizeof(double));
>    return x;
> }
> 
> which don't go through an expander.
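> 
> For reference, the SIMD sequence the ^ hint was buying us for negabs
> (the sequence fneg-abs_4.c below checks for; register numbers are
> illustrative) is:
> 
> negabs:
>         movi    v31.4s, #0
>         fneg    v31.2d, v31.2d
>         orr     v0.8b, v0.8b, v31.8b
>         ret
> 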
> However, the implementation of ^ in the register allocator does not
> match the documentation: it also has an effect during coloring.  During
> initial register class selection it applies a penalty to a class,
> similar to how ? does.
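> 
> A simplified sketch of the kind of cost adjustment involved (this is an
> illustration only, not GCC's actual ira-costs.cc code; all names here
> are hypothetical):
> 
>   /* Hypothetical sketch: a '?'/'^'-style modifier disparages every
>      register class used by an alternative by bumping its cost during
>      initial class selection, which can make a class look expensive
>      enough to drop out of consideration entirely.  */
>   #define N_CLASSES 4
> 
>   static void
>   disparage_alternative_classes (int class_cost[N_CLASSES],
>                                  const int alt_uses_class[N_CLASSES],
>                                  int penalty)
>   {
>     for (int k = 0; k < N_CLASSES; k++)
>       if (alt_uses_class[k])
>         class_cost[k] += penalty;
>   }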
> 
> In this example the penalty makes the use of GP regs expensive enough
> that the allocator no longer considers them:
> 
>     r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS
> ;;        3--> b  0: i   9 r106=r105&0x1
>     :cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0)PR_HI_REGS+0(0):model 4
> 
> which is not the expected behavior.  For GCC 14 this is a conservative fix.
> 
> 1. We remove the ^ modifier from the logical optabs.
> 
> 2. In order not to regress copysign we then move the copysign expansion
>    to directly use the SIMD variant.  Since copysign only supports
>    floating-point modes, this is fine and no longer relies on the
>    register allocator to select the right alternative (an example
>    follows below).
> 
> This once again regresses the general case, but that case wasn't
> optimized in earlier GCCs either, so it's not a regression in GCC 14.
> This change gives strictly better codegen than earlier GCCs and still
> optimizes the important cases.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       PR target/114741
>       * config/aarch64/aarch64.md (<optab><mode>3): Remove ^ from alt 2.
>       (copysign<GPF:mode>3): Use SIMD version of IOR directly.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR target/114741
>       * gcc.target/aarch64/fneg-abs_2.c: Update codegen.
>       * gcc.target/aarch64/fneg-abs_4.c: xfail for now.
>       * gcc.target/aarch64/pr114741.c: New test.
> 
> ---
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 385a669b9b3c31cc9108a660e881b9091c71fc7c..dbde066f7478bec51a8703b017ea553aa98be309 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4811,7 +4811,7 @@ (define_insn "<optab><mode>3"
>    ""
>    {@ [ cons: =0 , 1  , 2        ; attrs: type , arch  ]
>      [ r        , %r , r        ; logic_reg   , *     ] <logical>\t%<w>0, %<w>1, %<w>2
> -     [ rk       , ^r , <lconst> ; logic_imm   , *     ] <logical>\t%<w>0, %<w>1, %2
> +     [ rk       , r  , <lconst> ; logic_imm   , *     ] <logical>\t%<w>0, %<w>1, %2
>      [ w        , 0  , <lconst> ; *           , sve   ] <logical>\t%Z0.<s>, %Z0.<s>, #%2
>      [ w        , w  , w        ; neon_logic  , simd  ] <logical>\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
>    }
> @@ -7192,22 +7192,29 @@ (define_expand "copysign<GPF:mode>3"
>     (match_operand:GPF 2 "nonmemory_operand")]
>    "TARGET_SIMD"
>  {
> -  machine_mode int_mode = <V_INT_EQUIV>mode;
> -  rtx bitmask = gen_reg_rtx (int_mode);
> -  emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
> -                                 << (GET_MODE_BITSIZE (<MODE>mode) - 1)));
> +  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
> +                            << (GET_MODE_BITSIZE (<MODE>mode) - 1));
>    /* copysign (x, -1) should instead be expanded as orr with the sign
>       bit.  */
>    rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
>    if (GET_CODE (op2_elt) == CONST_DOUBLE
>        && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
>      {
> -      emit_insn (gen_ior<v_int_equiv>3 (
> -     lowpart_subreg (int_mode, operands[0], <MODE>mode),
> -     lowpart_subreg (int_mode, operands[1], <MODE>mode), bitmask));
> +      rtx v_bitmask
> +     = force_reg (V2<V_INT_EQUIV>mode,
> +                  gen_const_vec_duplicate (V2<V_INT_EQUIV>mode,
> +                                           signbit_const));
> +
> +      emit_insn (gen_iorv2<v_int_equiv>3 (
> +     lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode),
> +     lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode),
> +     v_bitmask));
>        DONE;
>      }
>  
> +  machine_mode int_mode = <V_INT_EQUIV>mode;
> +  rtx bitmask = gen_reg_rtx (int_mode);
> +  emit_move_insn (bitmask, signbit_const);
>    operands[2] = force_reg (<MODE>mode, operands[2]);
>    emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
>                                      bitmask));
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> index eed41ea18e69ff60ac79cbdb7c0c935850b49b33..18d10ee834d5d9b4361d890447060e78f09d3a73 100644
> --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> @@ -9,8 +9,7 @@
>  
>  /*
>  ** f1:
> -**   movi    v[0-9]+.2s, 0x80, lsl 24
> -**   orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**   orr     v[0-9]+.2s, #?128, lsl #?24
>  **   ret
>  */
>  float32_t f1 (float32_t a)
> @@ -22,7 +21,7 @@ float32_t f1 (float32_t a)
>  ** f2:
>  **   movi    v[0-9]+.4s, #?0
>  **   fneg    v[0-9]+.2d, v[0-9]+.2d
> -**   orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**   orr     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
>  **   ret
>  */
>  float64_t f2 (float64_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> index d45c3d1210c682f38177a89f1137ab4f6decfbc1..6da95a4b052ca3d9e0b8db0e465c5f40139b2b18 100644
> --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> @@ -7,7 +7,7 @@
>  #include <string.h>
>  
>  /*
> -** negabs:
> +** negabs: { xfail *-*-* }
>  **   movi    v31.4s, #?0
>  **   fneg    v[0-9]+.2d, v[0-9]+.2d
>  **   orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> @@ -23,7 +23,7 @@ double negabs (double x)
>  }
>  
>  /*
> -** negabsf:
> +** negabsf: { xfail *-*-* }
>  **   movi    v[0-9]+.2s, 0x80, lsl 24
>  **   orr     v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>  **   ret
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr114741.c b/gcc/testsuite/gcc.target/aarch64/pr114741.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..fd2182da0e966afcf0ab5bdef2f8d3892c2a3d67
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr114741.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** foo1:
> +**   and     w0, w0, 1
> +**   str     w0, \[x1\]
> +**   ret
> +*/
> +void foo1(unsigned v, unsigned *p)
> +{
> +    *p = v & 1;
> +}
> +
> +#pragma GCC target "+sve"
> +
> +/*
> +** foo2:
> +**   and     w0, w0, 1
> +**   str     w0, \[x1\]
> +**   ret
> +*/
> +void foo2(unsigned v, unsigned *p)
> +{
> +    *p = v & 1;
> +}
OK.

R.
