Hi Christophe,

> -----Original Message-----
> From: Gcc-patches <gcc-patches-boun...@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 15 October 2020 18:23
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> intrinsics
>
> This patch adds implementations for the vceqq_p64, vceqz_p64 and
> vceqzq_p64 intrinsics.
>
> vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> into their high and low halves.
>
> The vceqz[q] variants simply call vceq and vceqq with a second
> argument equal to zero.
>
> The added (executable) testcases make sure that the poly64x2_t
> variants have results with one element of all zeroes (false) and the
> other element with all bits set to one (true).
>
> 2020-10-15  Christophe Lyon  <christophe.l...@linaro.org>
>
> gcc/
> 	* config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> 	New.
>
> gcc/testsuite/
> 	* gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> 	vceqz_p64, vceqq_p64 and vceqzq_p64.
> ---
>  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
>  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
>  2 files changed, 76 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index aa21730..f7eff37 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
>    return vreinterpret_u64_u32 (__m);
>  }
>
> +__extension__ extern __inline uint64x1_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqz_p64 (poly64x1_t __a)
> +{
> +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> +  return vceq_p64 (__a, __b);
> +}
This approach is okay, but can we have some kind of test to confirm that it generates the VCEQ instruction with immediate zero rather than a separate DUP? (A rough sketch of the kind of test I mean follows the quoted patch below.)
Thanks,
Kyrill

> +
> +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements. */
> +__extension__ extern __inline uint64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> +{
> +  poly64_t __high_a = vget_high_p64 (__a);
> +  poly64_t __high_b = vget_high_p64 (__b);
> +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> +
> +  poly64_t __low_a = vget_low_p64 (__a);
> +  poly64_t __low_b = vget_low_p64 (__b);
> +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> +  return vcombine_u64 (__low, __high);
> +}
> +
> +__extension__ extern __inline uint64x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqzq_p64 (poly64x2_t __a)
> +{
> +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> +  return vceqq_p64 (__a, __b);
> +}
> +
>  /* The vtst_p64 intrinsic does not map to a single instruction.
>     We emulate it in way similar to vceq_p64 above but here we do
>     a reduction with max since if any two corresponding bits
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> index a3210a9..6aed096 100644
> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
>
>  /* Expected results: vceq.  */
>  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> +
> +/* Expected results: vceqz.  */
> +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
>
>  /* Expected results: vcombine.  */
>  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };
> @@ -213,7 +218,7 @@ int main (void)
>
>    /* vceq_p64 tests.  */
>  #undef TEST_MSG
> -#define TEST_MSG "VCEQ"
> +#define TEST_MSG "VCEQ/VCEQQ"
>
>  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)			\
>    VECT_VAR(vceq_vector_res, T3, W, N) =				\
> @@ -227,16 +232,55 @@ int main (void)
>    DECL_VARIABLE(vceq_vector, poly, 64, 1);
>    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
>    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
>
>    CLEAN(result, uint, 64, 1);
> +  CLEAN(result, uint, 64, 2);
>
>    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
>
>    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
>
>    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
>
>    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> +
> +  /* vceqz_p64 tests.  */
> +#undef TEST_MSG
> +#define TEST_MSG "VCEQZ/VCEQZQ"
> +
> +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)			\
> +  VECT_VAR(vceqz_vector_res, T3, W, N) =			\
> +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));	\
> +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
> +
> +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)			\
> +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> +
> +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> +
> +  CLEAN(result, uint, 64, 1);
> +  CLEAN(result, uint, 64, 2);
> +
> +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> +
> +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> +
> +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
>
>    /* vcombine_p64 tests.  */
>  #undef TEST_MSG
> --
> 2.7.4
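
For reference, a rough sketch of the kind of test I have in mind is below. The file name, location and the exact scan-assembler patterns are only suggestions and will likely need adjusting to the actual output (register syntax, mnemonic variants, etc.):

/* Hypothetical gcc.target/arm/simd/vceqz_p64_1.c -- name and location
   are suggestions only.  */
/* { dg-do compile } */
/* { dg-require-effective-target arm_crypto_ok } */
/* { dg-add-options arm_crypto } */

#include <arm_neon.h>

uint64x1_t
test_vceqz_p64 (poly64x1_t a)
{
  return vceqz_p64 (a);
}

/* The zero operand should be folded into the comparison...  */
/* { dg-final { scan-assembler {vceq\.i32\t[dD][0-9]+, [dD][0-9]+, #0} } } */
/* ... rather than materialised separately with a VDUP.  */
/* { dg-final { scan-assembler-not {vdup} } } */

The scan-assembler-not line is the important part: it would catch a regression where the zero vector gets built with a separate DUP instead of being folded into the VCEQ as an immediate.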