On 11/17/22 17:37, Andrea Corallo via Gcc-patches wrote:
From: Stam Markianos-Wright <stam.markianos-wri...@arm.com>

In the past we had only defined the vsubq_x generic overload of the
vsubq_x_* intrinsics for float vector types.  This would cause them
to fall back to the `__ARM_undef` failure state if they was called
through the generic version.
This patch simply adds these overloads.

gcc/ChangeLog:

         * config/arm/arm_mve.h (__arm_vsubq_x FP): New overloads.
          (__arm_vsubq_x Integer): New.

Hi Stam,

To hopefully help Kyrill in the review, I think this fix is tested by patch #19, where we now have
+/* { dg-final { scan-assembler-not "__ARM_undef" } } */
(this line explains why this bug was not noticed so far)

Thanks,

Christophe

---
  gcc/config/arm/arm_mve.h | 28 ++++++++++++++++++++++++++++
  1 file changed, 28 insertions(+)

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index f6b42dc3fab..09167ec118e 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -38259,6 +38259,18 @@ extern void *__ARM_undef;
  #define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
    __typeof(p2) __p2 = (p2); \
    _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: 
__arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, 
int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: 
__arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, 
int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: 
__arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, 
int32x4_t), p3), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 
(__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 
(__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 
(__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: 
__arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, 
uint8x16_t), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: 
__arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, 
uint16x8_t), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: 
__arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, 
uint32x4_t), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u8 
(__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: 
__arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, 
int), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: 
__arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, 
int), p3), \
    int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_float16x8_t]: 
__arm_vsubq_x_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce(__p2, 
float16x8_t), p3), \
    int (*)[__ARM_mve_type_float32x4_t][__ARM_mve_type_float32x4_t]: 
__arm_vsubq_x_f32 (__ARM_mve_coerce(__p1, float32x4_t), __ARM_mve_coerce(__p2, 
float32x4_t), p3), \
    int (*)[__ARM_mve_type_float16x8_t][__ARM_mve_type_fp_n]: 
__arm_vsubq_x_n_f16 (__ARM_mve_coerce(__p1, float16x8_t), __ARM_mve_coerce2(p2, 
double), p3), \
@@ -40223,6 +40235,22 @@ extern void *__ARM_undef;
    int (*)[__ARM_mve_type_uint16_t_ptr]: __arm_vld4q_u16 
(__ARM_mve_coerce1(p0, uint16_t *)), \
    int (*)[__ARM_mve_type_uint32_t_ptr]: __arm_vld4q_u32 
(__ARM_mve_coerce1(p0, uint32_t *))))
+#define __arm_vsubq_x(p1,p2,p3) ({ __typeof(p1) __p1 = (p1); \
+  __typeof(p2) __p2 = (p2); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p1)][__ARM_mve_typeid(__p2)])0, \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int8x16_t]: 
__arm_vsubq_x_s8 (__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce(__p2, 
int8x16_t), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int16x8_t]: 
__arm_vsubq_x_s16 (__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce(__p2, 
int16x8_t), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int32x4_t]: 
__arm_vsubq_x_s32 (__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce(__p2, 
int32x4_t), p3), \
+  int (*)[__ARM_mve_type_int8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s8 
(__ARM_mve_coerce(__p1, int8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int16x8_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s16 
(__ARM_mve_coerce(__p1, int16x8_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_int32x4_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_s32 
(__ARM_mve_coerce(__p1, int32x4_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_uint8x16_t]: 
__arm_vsubq_x_u8 (__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce(__p2, 
uint8x16_t), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_uint16x8_t]: 
__arm_vsubq_x_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce(__p2, 
uint16x8_t), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_uint32x4_t]: 
__arm_vsubq_x_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce(__p2, 
uint32x4_t), p3), \
+  int (*)[__ARM_mve_type_uint8x16_t][__ARM_mve_type_int_n]: __arm_vsubq_x_n_u8 
(__ARM_mve_coerce(__p1, uint8x16_t), __ARM_mve_coerce3(p2, int), p3), \
+  int (*)[__ARM_mve_type_uint16x8_t][__ARM_mve_type_int_n]: 
__arm_vsubq_x_n_u16 (__ARM_mve_coerce(__p1, uint16x8_t), __ARM_mve_coerce3(p2, 
int), p3), \
+  int (*)[__ARM_mve_type_uint32x4_t][__ARM_mve_type_int_n]: 
__arm_vsubq_x_n_u32 (__ARM_mve_coerce(__p1, uint32x4_t), __ARM_mve_coerce3(p2, 
int), p3));})
+
  #define __arm_vgetq_lane(p0,p1) ({ __typeof(p0) __p0 = (p0); \
    _Generic( (int (*)[__ARM_mve_typeid(__p0)])0, \
    int (*)[__ARM_mve_type_int8x16_t]: __arm_vgetq_lane_s8 
(__ARM_mve_coerce(__p0, int8x16_t), p1), \

Reply via email to