Jonathan Wright <jonathan.wri...@arm.com> writes:
> Hi,
>
> V2 of this change implements the same approach as for the multiply
> and add-widen patches.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-28  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_rtx_costs): Traverse RTL
>         tree to prevent cost of vec_select high-half from being added
>         into Neon subtract cost.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vsubX_high_cost.c: New test.

OK, thanks.

Richard

> From: Jonathan Wright
> Sent: 29 July 2021 10:23
> To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org>
> Cc: Richard Sandiford <richard.sandif...@arm.com>; Kyrylo Tkachov <kyrylo.tkac...@arm.com>
> Subject: [PATCH] aarch64: Don't include vec_select high-half in SIMD subtract cost
>
> Hi,
>
> The Neon subtract-long/subtract-widen instructions can select the top
> or bottom half of the operand registers. This selection does not
> change the cost of the underlying instruction, and this should be
> reflected by the RTL cost function.
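>
> For example, the kernel below (a minimal illustration, not taken from
> the patch itself) should compile to a single SSUBL2, with the
> high-half selects folded into the subtract rather than emitted as
> separate DUP moves:
>
>   #include <arm_neon.h>
>
>   int16x8_t
>   subl_high (int8x16_t a, int8x16_t b)
>   {
>     /* Both operands select the high half of a 128-bit register;
>        with this patch, combine forms one SSUBL2 from this.  */
>     return vsubl_s8 (vget_high_s8 (a), vget_high_s8 (b));
>   }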
>
> This patch adds RTL tree traversal in the Neon subtract cost function
> to match a vec_select high-half on either operand. This traversal
> prevents the cost of the vec_select from being added into the cost of
> the subtract - meaning that these instructions can now be emitted in
> the combine pass, as they are no longer deemed prohibitively
> expensive.
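>
> For reference, aarch64_strip_extend_vec_half is the helper introduced
> with the earlier multiply-widen patch. From memory it looks roughly
> like the sketch below (not part of this diff; vec_series_highpart_p
> is the high-half check added alongside it): peel off a sign/zero
> extend, then any vec_select of the high half underneath it, so the
> operands are costed as plain vector registers:
>
>   static rtx
>   aarch64_strip_extend_vec_half (rtx x)
>   {
>     if (GET_CODE (x) == SIGN_EXTEND || GET_CODE (x) == ZERO_EXTEND)
>       {
>         x = XEXP (x, 0);
>         /* Strip an inner vec_select that takes the high half.  */
>         if (GET_CODE (x) == VEC_SELECT
>             && vec_series_highpart_p (GET_MODE (x),
>                                       GET_MODE (XEXP (x, 0)),
>                                       XEXP (x, 1)))
>           x = XEXP (x, 0);
>       }
>     return x;
>   }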
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-28  Jonathan Wright  <jonathan.wri...@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_rtx_costs): Traverse RTL
>         tree to prevent cost of vec_select high-half from being added
>         into Neon subtract cost.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vsubX_high_cost.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index cc92cc9c208e63f262c22c7fe8e6915825884775..89129c8ecf1655fbb69437733b0d42d79c864836 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13089,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>       op1 = XEXP (x, 1);
>  
>  cost_minus:
> +     if (VECTOR_MODE_P (mode))
> +       {
> +         /* SUBL2 and SUBW2.  */
> +         unsigned int vec_flags = aarch64_classify_vector_mode (mode);
> +         if (vec_flags & VEC_ADVSIMD)
> +           {
> +             /* The select-operand-high-half versions of the sub instruction
> +                have the same cost as the regular three vector version -
> +                don't add the costs of the select into the costs of the
> +                sub.  */
> +             op0 = aarch64_strip_extend_vec_half (op0);
> +             op1 = aarch64_strip_extend_vec_half (op1);
> +           }
> +       }
> +
>       *cost += rtx_cost (op0, mode, MINUS, 0, speed);
>  
>       /* Detect valid immediates.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..09bc7fc7766e8bcb468d592cbf4005a57cf09397
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c
> @@ -0,0 +1,38 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +#include <arm_neon.h>
> +
> +#define TEST_SUBL(rettype, intype, ts, rs) \
> +  rettype test_vsubl_ ## ts (intype a, intype b, intype c) \
> +     { \
> +             rettype t0 = vsubl_ ## ts (vget_high_ ## ts (a), \
> +                                        vget_high_ ## ts (c)); \
> +             rettype t1 = vsubl_ ## ts (vget_high_ ## ts (b), \
> +                                        vget_high_ ## ts (c)); \
> +             return vaddq ## _ ## rs (t0, t1); \
> +     }
> +
> +TEST_SUBL (int16x8_t, int8x16_t, s8, s16)
> +TEST_SUBL (uint16x8_t, uint8x16_t, u8, u16)
> +TEST_SUBL (int32x4_t, int16x8_t, s16, s32)
> +TEST_SUBL (uint32x4_t, uint16x8_t, u16, u32)
> +TEST_SUBL (int64x2_t, int32x4_t, s32, s64)
> +TEST_SUBL (uint64x2_t, uint32x4_t, u32, u64)
> +
> +#define TEST_SUBW(rettype, intype, intypel, ts, rs) \
> +  rettype test_vsubw_ ## ts (intype a, intype b, intypel c) \
> +     { \
> +             rettype t0 = vsubw_ ## ts (a, vget_high_ ## ts (c)); \
> +             rettype t1 = vsubw_ ## ts (b, vget_high_ ## ts (c)); \
> +             return vaddq ## _ ## rs (t0, t1); \
> +     }
> +
> +TEST_SUBW (int16x8_t, int16x8_t, int8x16_t, s8, s16)
> +TEST_SUBW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16)
> +TEST_SUBW (int32x4_t, int32x4_t, int16x8_t, s16, s32)
> +TEST_SUBW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32)
> +TEST_SUBW (int64x2_t, int64x2_t, int32x4_t, s32, s64)
> +TEST_SUBW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64)
> +
> +/* { dg-final { scan-assembler-not "dup\\t" } } */
