On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> wrote: > > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni > <prathamesh.kulka...@linaro.org> wrote: > > > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.gli...@inria.fr> wrote: > > > > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote: > > > > > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a > > > > * __b. > > > > > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise, > > > is vmul_n already undefined in case of overflow? > > Hi Marc, > > Sorry for late reply, for vmul_n_s*, I think they are signed > > (int<width>x<width>_t). > Oops, I meant int<width>x<nelems>_t. > > I am not sure how should the intrinsic behave in case of signed overflow, > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too. > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of > > overflow ? The attached version fixes one fallout I missed earlier. Is this OK to commit ?
Thanks, Prathamesh > > > > Thanks, > > Prathamesh > > > > > > -- > > > Marc Glisse
2021-06-07 Prathamesh Kulkarni <prathamesh.kulka...@linaro.org> PR target/66791 * config/arm/arm_neon.h (vmul_n_s16): Replace call to builtin with __a * __b. (vmul_n_s32): Likewise. (vmul_n_u16): Likewise. (vmul_n_u32): Likewise. (vmulq_n_s16): Likewise. (vmulq_n_s32): Likewise. (vmulq_n_u16): Likewise. (vmulq_n_u32): Likewise. (vmul_n_f32): Gate __a * __b conditionally on __FAST_MATH__. (vmulq_n_f32): Likewise. (vmul_n_f16): Likewise. (vmulq_n_f16): Likewise. testsuite/ * gcc.target/arm/armv8_2-fp16-neon-2.c: Adjust. diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index dcd533fd003..8ac00774e6c 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -8331,70 +8331,78 @@ __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_s16 (int16x4_t __a, int16_t __b) { - return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b); + return __a * __b; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_s32 (int32x2_t __a, int32_t __b) { - return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b); + return __a * __b; } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_f32 (float32x2_t __a, float32_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b); +#endif } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_u16 (uint16x4_t __a, uint16_t __b) { - return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b); + return __a * __b; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_u32 (uint32x2_t __a, uint32_t __b) { - return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b); + return __a * __b; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_s16 (int16x8_t __a, int16_t __b) { - return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b); + return __a * __b; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_s32 (int32x4_t __a, int32_t __b) { - return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b); + return __a * __b; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_f32 (float32x4_t __a, float32_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_u16 (uint16x8_t __a, uint16_t __b) { - return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b); + return __a * __b; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_u32 (uint32x4_t __a, uint32_t __b) { - return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b); + return __a * __b; } __extension__ extern __inline int32x4_t @@ -17661,7 +17669,11 @@ __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmul_n_f16 (float16x4_t __a, float16_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return __builtin_neon_vmul_nv4hf (__a, __b); +#endif } __extension__ extern __inline float16x8_t @@ -17686,7 +17698,11 @@ __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmulq_n_f16 (float16x8_t __a, float16_t __b) { +#ifdef __FAST_MATH__ + return __a * __b; +#else return __builtin_neon_vmul_nv8hf (__a, __b); +#endif } __extension__ extern __inline float16x4_t diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c index 50f689352ca..6808576ce59 100644 --- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c +++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c @@ -327,13 +327,13 @@ BINOP_TEST (vminnm) BINOP_TEST (vmul) /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } } - { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ BINOP_LANE_TEST (vmul, 2) /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } } { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } } */ BINOP_N_TEST (vmul) -/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } } - { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/ +/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } } + { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }*/ float16x4_t test_vpadd_16x4 (float16x4_t a, float16x4_t b) @@ -387,7 +387,7 @@ test_vdup_n_f16 (float16_t a) { return vdup_n_f16 (a); } -/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 3 } } */ float16x8_t test_vmovq_n_f16 (float16_t a) @@ -400,7 +400,7 @@ test_vdupq_n_f16 (float16_t a) { return vdupq_n_f16 (a); } -/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 2 } } */ +/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 3 } } */ float16x4_t test_vdup_lane_f16 (float16x4_t a)