Hi Christophe,

> -----Original Message-----
> From: Gcc-patches <gcc-patches-boun...@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 15 October 2020 18:23
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> intrinsics
>
> This patch adds implementations for the vceqq_p64, vceqz_p64 and
> vceqzq_p64 intrinsics.
>
> vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> into their high and low halves.
>
> The vceqz[q] variants simply call vceq and vceqq with a second
> argument equal to zero.
>
> The added (executable) testcases make sure that the poly64x2_t
> variants have results with one element of all zeroes (false) and the
> other element with all bits set to one (true).
>
> 2020-10-15  Christophe Lyon  <christophe.l...@linaro.org>
>
> gcc/
> 	* config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> 	New.
>
> gcc/testsuite/
> 	* gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> 	vceqz_p64, vceqq_p64 and vceqzq_p64.
> ---
>  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
>  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
>  2 files changed, 76 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index aa21730..f7eff37 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
>    return vreinterpret_u64_u32 (__m);
>  }
>
> +__extension__ extern __inline uint64x1_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqz_p64 (poly64x1_t __a)
> +{
> +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> +  return vceq_p64 (__a, __b);
> +}
This approach is okay, but can we have some kind of test to confirm that it generates the VCEQ instruction with immediate zero rather than a separate DUP? (A rough sketch of the kind of test I mean follows the quoted patch below.)
Thanks,
Kyrill

> +
> +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements. */
> +__extension__ extern __inline uint64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> +{
> +  poly64_t __high_a = vget_high_p64 (__a);
> +  poly64_t __high_b = vget_high_p64 (__b);
> +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> +
> +  poly64_t __low_a = vget_low_p64 (__a);
> +  poly64_t __low_b = vget_low_p64 (__b);
> +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> +  return vcombine_u64 (__low, __high);
> +}
> +
> +__extension__ extern __inline uint64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqzq_p64 (poly64x2_t __a)
> +{
> +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> +  return vceqq_p64 (__a, __b);
> +}
> +
>  /* The vtst_p64 intrinsic does not map to a single instruction.
>     We emulate it in way similar to vceq_p64 above but here we do
>     a reduction with max since if any two corresponding bits
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> index a3210a9..6aed096 100644
> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
>
>  /* Expected results: vceq.  */
>  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> +
> +/* Expected results: vceqz.  */
> +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
>
>  /* Expected results: vcombine.  */
>  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };
> @@ -213,7 +218,7 @@ int main (void)
>
>    /* vceq_p64 tests.  */
>  #undef TEST_MSG
> -#define TEST_MSG "VCEQ"
> +#define TEST_MSG "VCEQ/VCEQQ"
>
>  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)			\
>    VECT_VAR(vceq_vector_res, T3, W, N) =				\
> @@ -227,16 +232,55 @@ int main (void)
>    DECL_VARIABLE(vceq_vector, poly, 64, 1);
>    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
>    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
>
>    CLEAN(result, uint, 64, 1);
> +  CLEAN(result, uint, 64, 2);
>
>    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
>
>    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
>
>    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
>
>    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> +
> +  /* vceqz_p64 tests.  */
> +#undef TEST_MSG
> +#define TEST_MSG "VCEQZ/VCEQZQ"
> +
> +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)			\
> +  VECT_VAR(vceqz_vector_res, T3, W, N) =			\
> +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));	\
> +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
> +
> +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)			\
> +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> +
> +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> +
> +  CLEAN(result, uint, 64, 1);
> +  CLEAN(result, uint, 64, 2);
> +
> +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> +
> +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> +
> +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
>
>    /* vcombine_p64 tests.  */
>  #undef TEST_MSG
> --
> 2.7.4
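
For reference, a rough sketch of the kind of test I have in mind is below. The file name, location and the exact scan-assembler patterns are only suggestions and will likely need adjusting to the actual output (register syntax, mnemonic variants, etc.):

/* Hypothetical gcc.target/arm/simd/vceqz_p64_1.c -- name and location
   are suggestions only.  */
/* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */

#include <arm_neon.h>

uint64x1_t
test_vceqz_p64 (poly64x1_t a)
{
  return vceqz_p64 (a);
}

/* The zero operand should be folded into the comparison...  */
/* { dg-final { scan-assembler {vceq\.i32\t[dD][0-9]+, [dD][0-9]+, #0} } } */
/* ... rather than materialised separately with a VDUP.  */
/* { dg-final { scan-assembler-not {vdup} } } */

The scan-assembler-not line is the important part: it would catch a regression where the zero vector gets built with a separate DUP instead of being folded into the VCEQ as an immediate.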