Re: [PATCH 16/35] arm: Add integer vector overloading of vsubq_x intrinsic

2022-11-22 Thread Christophe Lyon via Gcc-patches




On 11/17/22 17:37, Andrea Corallo via Gcc-patches wrote:

From: Stam Markianos-Wright 

In the past we had only defined the vsubq_x generic overload of the
vsubq_x_* intrinsics for float vector types.  This would cause them
to fall back to the `__ARM_undef` failure state if they were called
through the generic version.
This patch simply adds these overloads.

gcc/ChangeLog:

 * config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
  (__arm_vsubq_x Integer): New.


Hi Stam,

To hopefully help Kyrill in the review, I think this fix is tested by 
patch #19, where we now have

+/* { dg-final { scan-assembler-not "__ARM_undef" } } */
(this line explains why this bug was not noticed so far)

Thanks,

Christophe


---
  gcc/config/arm/arm_mve.h | 28 
  1 file changed, 28 insertions(+)

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index f6b42dc3fab..09167ec118e 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -38259,6 +38259,18 @@ extern void *__ARM_undef;
  #define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
__typeof(p2) __p2 = (p2); \
_Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: 
__arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, 
int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: 
__arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, 
int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: 
__arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, 
int32x4_t), p3), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 
(__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 
(__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 
(__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: 
__arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, 
uint8x16_t), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: 
__arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, 
uint16x8_t), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: 
__arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, 
uint32x4_t), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u8 
(__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: 
__arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, 
int), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: 
__arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, 
int), p3), \
int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: 
__arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, 
float16x8_t), p3), \
int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: 
__arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, 
float32x4_t), p3), \
int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: 
__arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(p2, 
double), p3), \
@@ -40223,6 +40235,22 @@ extern void *__ARM_undef;
int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 
(__ARM_mve_coerce1(p0, uint16_t *)), \
int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 
(__ARM_mve_coerce1(p0, uint32_t *
  
+#define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \

+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: 
__arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, 
int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: 
__arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, 
int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: 
__arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, 
int32x4_t), p3), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 
(__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 
(__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 
(__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*

Re: [PATCH 16/35] arm: Add integer vector overloading of vsubq_x intrinsic

2022-11-22 Thread Andrea Corallo via Gcc-patches
Christophe Lyon  writes:

> On 11/17/22 17:37, Andrea Corallo via Gcc-patches wrote:
>> From: Stam Markianos-Wright 
>> In the past we had only defined the vsubq_x generic overload of the
>> vsubq_x_* intrinsics for float vector types.  This would cause them
>> to fall back to the `__ARM_undef` failure state if they were called
>> through the generic version.
>> This patch simply adds these overloads.
>> gcc/ChangeLog:
>>  * config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
>>   (__arm_vsubq_x Integer): New.
>
> Hi Stam,
>
> To hopefully help Kyrill in the review, I think this fix is tested by
> patch #19, where we now have
> +/* { dg-final { scan-assembler-not "__ARM_undef" } } */
> (this line explains why this bug was not noticed so far)
>
> Thanks,
>
> Christophe

Exactly

PS: the fact that the tests now use 'check-function-bodies' should also
catch that.

Thanks

  Andrea


RE: [PATCH 16/35] arm: Add integer vector overloading of vsubq_x intrinsic

2022-11-22 Thread Kyrylo Tkachov via Gcc-patches



> -----Original Message-----
> From: Andrea Corallo 
> Sent: Thursday, November 17, 2022 4:38 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Richard Earnshaw
> ; Stam Markianos-Wright  wri...@arm.com>
> Subject: [PATCH 16/35] arm: Add integer vector overloading of vsubq_x
> intrinsic
> 
> From: Stam Markianos-Wright 
> 
> In the past we had only defined the vsubq_x generic overload of the
> vsubq_x_* intrinsics for float vector types.  This would cause them
> to fall back to the `__ARM_undef` failure state if they were called
> through the generic version.
> This patch simply adds these overloads.

Ok.
Thanks,
Kyrill

> 
> gcc/ChangeLog:
> 
> * config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
>  (__arm_vsubq_x Integer): New.
> ---
>  gcc/config/arm/arm_mve.h | 28 
>  1 file changed, 28 insertions(+)
> 
> diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
> index f6b42dc3fab..09167ec118e 100644
> --- a/gcc/config/arm/arm_mve.h
> +++ b/gcc/config/arm/arm_mve.h
> @@ -38259,6 +38259,18 @@ extern void *__ARM_undef;
>  #define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
>__typeof(p2) __p2 = (p2); \
>_Generic( (int
> (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
> __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce(__p2, int8x16_t), p3), \
> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
> __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce(__p2, int16x8_t), p3), \
> +  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
> __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
> __ARM_mve_coerce(__p2, int32x4_t), p3), \
> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s32 (__ARM_mve_coerce(__p1, int32x4_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]:
> __arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce(__p2, uint8x16_t), p3), \
> +  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]:
> __arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce(__p2, uint16x8_t), p3), \
> +  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]:
> __arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce(__p2, uint32x4_t), p3), \
> +  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u8 (__ARM_mve_coerce(__p1, uint8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t),
> __ARM_mve_coerce3(p2, int), p3), \
>int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]:
> __arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t),
> __ARM_mve_coerce(__p2, float16x8_t), p3), \
>int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]:
> __arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t),
> __ARM_mve_coerce(__p2, float32x4_t), p3), \
>int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]:
> __arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t),
> __ARM_mve_coerce2(p2, double), p3), \
> @@ -40223,6 +40235,22 @@ extern void *__ARM_undef;
>int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16
> (__ARM_mve_coerce1(p0, uint16_t *)), \
>int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32
> (__ARM_mve_coerce1(p0, uint32_t *
> 
> +#define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
> +  __typeof(p2) __p2 = (p2); \
> +  _Generic( (int
> (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]:
> __arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce(__p2, int8x16_t), p3), \
> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]:
> __arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce(__p2, int16x8_t), p3), \
> +  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]:
> __arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t),
> __ARM_mve_coerce(__p2, int32x4_t), p3), \
> +  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s8 (__ARM_mve_coerce(__p1, int8x16_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]:
> __arm_vsubq_x_n_s16 (__ARM_mve_coerce(__p1, int16x8_t),
> __ARM_mve_coerce3(p2, int), p3), \
> +  int (*)