Jonathan Wright <jonathan.wri...@arm.com> writes: > Hi, > > V2 of this change implements the same approach as for the multiply > and add-widen patches. > > Regression tested and bootstrapped on aarch64-none-linux-gnu - no > issues. > > Ok for master? > > Thanks, > Jonathan > > --- > > gcc/ChangeLog: > > 2021-07-28 Jonathan Wright <jonathan.wri...@arm.com> > > * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost > of vec_select high-half from being added into Neon subtract > cost. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/vsubX_high_cost.c: New test.
OK, thanks. Richard > From: Jonathan Wright > Sent: 29 July 2021 10:23 > To: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org> > Cc: Richard Sandiford <richard.sandif...@arm.com>; Kyrylo Tkachov > <kyrylo.tkac...@arm.com> > Subject: [PATCH] aarch64: Don't include vec_select high-half in SIMD subtract > cost > > Hi, > > The Neon subtract-long/subtract-widen instructions can select the top > or bottom half of the operand registers. This selection does not > change the cost of the underlying instruction and this should be > reflected by the RTL cost function. > > This patch adds RTL tree traversal in the Neon subtract cost function > to match vec_select high-half of its operands. This traversal > prevents the cost of the vec_select from being added into the cost of > the subtract - meaning that these instructions can now be emitted in > the combine pass as they are no longer deemed prohibitively > expensive. > > Regression tested and bootstrapped on aarch64-none-linux-gnu - no > issues. > > Ok for master? > > Thanks, > Jonathan > > --- > > gcc/ChangeLog: > > 2021-07-28 Jonathan Wright <jonathan.wri...@arm.com> > > * config/aarch64/aarch64.c: Traverse RTL tree to prevent cost > of vec_select high-half from being added into Neon subtract > cost. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/vsubX_high_cost.c: New test. > > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index > cc92cc9c208e63f262c22c7fe8e6915825884775..89129c8ecf1655fbb69437733b0d42d79c864836 > 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -13089,6 +13089,21 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int > outer ATTRIBUTE_UNUSED, > op1 = XEXP (x, 1); > > cost_minus: > + if (VECTOR_MODE_P (mode)) > + { > + /* SUBL2 and SUBW2. 
*/ > + unsigned int vec_flags = aarch64_classify_vector_mode (mode); > + if (vec_flags & VEC_ADVSIMD) > + { > + /* The select-operand-high-half versions of the sub instruction > + have the same cost as the regular three vector version - > + don't add the costs of the select into the costs of the sub. > + */ > + op0 = aarch64_strip_extend_vec_half (op0); > + op1 = aarch64_strip_extend_vec_half (op1); > + } > + } > + > *cost += rtx_cost (op0, mode, MINUS, 0, speed); > > /* Detect valid immediates. */ > diff --git a/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c > b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..09bc7fc7766e8bcb468d592cbf4005a57cf09397 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/vsubX_high_cost.c > @@ -0,0 +1,38 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O3" } */ > + > +#include <arm_neon.h> > + > +#define TEST_SUBL(rettype, intype, ts, rs) \ > + rettype test_vsubl_ ## ts (intype a, intype b, intype c) \ > + { \ > + rettype t0 = vsubl_ ## ts (vget_high_ ## ts (a), \ > + vget_high_ ## ts (c)); \ > + rettype t1 = vsubl_ ## ts (vget_high_ ## ts (b), \ > + vget_high_ ## ts (c)); \ > + return vaddq ## _ ## rs (t0, t1); \ > + } > + > +TEST_SUBL (int16x8_t, int8x16_t, s8, s16) > +TEST_SUBL (uint16x8_t, uint8x16_t, u8, u16) > +TEST_SUBL (int32x4_t, int16x8_t, s16, s32) > +TEST_SUBL (uint32x4_t, uint16x8_t, u16, u32) > +TEST_SUBL (int64x2_t, int32x4_t, s32, s64) > +TEST_SUBL (uint64x2_t, uint32x4_t, u32, u64) > + > +#define TEST_SUBW(rettype, intype, intypel, ts, rs) \ > + rettype test_vsubw_ ## ts (intype a, intype b, intypel c) \ > + { \ > + rettype t0 = vsubw_ ## ts (a, vget_high_ ## ts (c)); \ > + rettype t1 = vsubw_ ## ts (b, vget_high_ ## ts (c)); \ > + return vaddq ## _ ## rs (t0, t1); \ > + } > + > +TEST_SUBW (int16x8_t, int16x8_t, int8x16_t, s8, s16) > +TEST_SUBW (uint16x8_t, uint16x8_t, uint8x16_t, u8, u16) > +TEST_SUBW (int32x4_t, int32x4_t, 
int16x8_t, s16, s32) > +TEST_SUBW (uint32x4_t, uint32x4_t, uint16x8_t, u16, u32) > +TEST_SUBW (int64x2_t, int64x2_t, int32x4_t, s32, s64) > +TEST_SUBW (uint64x2_t, uint64x2_t, uint32x4_t, u32, u64) > + > +/* { dg-final { scan-assembler-not "dup\\t" } } */