> -----Original Message----- > From: Tamar Christina <tamar.christ...@arm.com> > Sent: 01 February 2021 12:39 > To: gcc-patches@gcc.gnu.org > Cc: nd <n...@arm.com>; Richard Earnshaw <richard.earns...@arm.com>; > Marcus Shawcroft <marcus.shawcr...@arm.com>; Kyrylo Tkachov > <kyrylo.tkac...@arm.com>; Richard Sandiford > <richard.sandif...@arm.com> > Subject: [PATCH]AArch64 Change canonization of smlal and smlsl in order to > be able to optimize the vec_dup > > Hi All, > > g:87301e3956d44ad45e384a8eb16c79029d20213a and > g:ee4c4fe289e768d3c6b6651c8bfa3fdf458934f4 changed the intrinsics to be > proper RTL but accidentally ended up creating a regression because of the > ordering in the RTL pattern. > > The existing RTL patterns that combine should try to match to remove the vec_dup are > aarch64_vec_<su>mlal_lane<Qlane> and > aarch64_vec_<su>mult_lane<Qlane>, which > expect the select register to be the second operand of mult. > > The patterns introduced have it as the first operand, so combine was unable to > remove the vec_dup. This flips the order such that the patterns optimize > correctly. > > Bootstrapped and regtested on aarch64-none-linux-gnu with no issues. > > Ok for master?
Ok. I wonder how many of these unfortunate occurrences we have in the backend... Thanks, Kyrill > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64-simd.md (aarch64_<su>mlal_n<mode>, > aarch64_<su>mlsl_n<mode>, aarch64_<su>mull_n<mode>): Flip mult > operands. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull- > optimized.c: New test. > > --- inline copy of patch -- > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > bca2d8a3437fdcee77c7c357663c78c418b32a88..d1858663a4e78c0861d902 > b37e93c0b00d75e661 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -1950,10 +1950,10 @@ (define_insn "aarch64_<su>mlal_n<mode>" > (plus:<VWIDE> > (mult:<VWIDE> > (ANY_EXTEND:<VWIDE> > - (vec_duplicate:VD_HSI > - (match_operand:<VEL> 3 "register_operand" "<h_con>"))) > + (match_operand:VD_HSI 2 "register_operand" "w")) > (ANY_EXTEND:<VWIDE> > - (match_operand:VD_HSI 2 "register_operand" "w"))) > + (vec_duplicate:VD_HSI > + (match_operand:<VEL> 3 "register_operand" "<h_con>")))) > (match_operand:<VWIDE> 1 "register_operand" "0")))] > "TARGET_SIMD" > "<su>mlal\t%0.<Vwtype>, %2.<Vtype>, %3.<Vetype>[0]" > @@ -1980,10 +1980,10 @@ (define_insn "aarch64_<su>mlsl_n<mode>" > (match_operand:<VWIDE> 1 "register_operand" "0") > (mult:<VWIDE> > (ANY_EXTEND:<VWIDE> > - (vec_duplicate:VD_HSI > - (match_operand:<VEL> 3 "register_operand" "<h_con>"))) > + (match_operand:VD_HSI 2 "register_operand" "w")) > (ANY_EXTEND:<VWIDE> > - (match_operand:VD_HSI 2 "register_operand" "w")))))] > + (vec_duplicate:VD_HSI > + (match_operand:<VEL> 3 "register_operand" "<h_con>"))))))] > "TARGET_SIMD" > "<su>mlsl\t%0.<Vwtype>, %2.<Vtype>, %3.<Vetype>[0]" > [(set_attr "type" "neon_mla_<Vetype>_long")] > @@ -2078,10 +2078,10 @@ (define_insn "aarch64_<su>mull_n<mode>" > [(set (match_operand:<VWIDE> 0 "register_operand" "=w") > (mult:<VWIDE> > (ANY_EXTEND:<VWIDE> > - 
(vec_duplicate:<VCOND> > - (match_operand:<VEL> 2 "register_operand" "<h_con>"))) > + (match_operand:VD_HSI 1 "register_operand" "w")) > (ANY_EXTEND:<VWIDE> > - (match_operand:VD_HSI 1 "register_operand" "w"))))] > + (vec_duplicate:<VCOND> > + (match_operand:<VEL> 2 "register_operand" "<h_con>")))))] > "TARGET_SIMD" > "<su>mull\t%0.<Vwtype>, %1.<Vtype>, %2.<Vetype>[0]" > [(set_attr "type" "neon_mul_<Vetype>_scalar_long")] > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl- > mull-optimized.c b/gcc/testsuite/gcc.target/aarch64/advsimd- > intrinsics/smlal-smlsl-mull-optimized.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..1e963e5002e666e32e12b > 2eef965b206c7344015 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/smlal-smlsl-mull- > optimized.c > @@ -0,0 +1,45 @@ > +/* { dg-do compile { target aarch64-*-* } } */ > + > +#include <arm_neon.h> > + > +/* > +**add: > +** smlal v0.4s, v1.4h, v2.h[3] > +** ret > +*/ > + > +int32x4_t add(int32x4_t acc, int16x4_t b, int16x4_t c) { > + return vmlal_n_s16(acc, b, c[3]); > +} > + > +/* > +**sub: > +** smlsl v0.4s, v1.4h, v2.h[3] > +** ret > +*/ > + > +int32x4_t sub(int32x4_t acc, int16x4_t b, int16x4_t c) { > + return vmlsl_n_s16(acc, b, c[3]); > +} > + > +/* > +**smull: > +** smull v0.4s, v1.4h, v2.h[3] > +** ret > +*/ > + > +int32x4_t smull(int16x4_t b, int16x4_t c) { > + return vmull_n_s16(b, c[3]); > +} > + > +/* > +**umull: > +** umull v0.4s, v1.4h, v2.h[3] > +** ret > +*/ > + > +uint32x4_t umull(uint16x4_t b, uint16x4_t c) { > + return vmull_n_u16(b, c[3]); > +} > + > +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" {-O[^0]} } } */ > > > --