https://gcc.gnu.org/g:daee0409d195d346562e423da783d5d1cf8ea175
commit r11-11295-gdaee0409d195d346562e423da783d5d1cf8ea175 Author: Richard Sandiford <richard.sandif...@arm.com> Date: Wed Mar 27 19:26:56 2024 +0000 aarch64: Fix vld1/st1_x4 intrinsic definitions The vld1_x4 and vst1_x4 patterns use XI registers for both 64-bit and 128-bit vectors. This has the nice property that each individual vector is within a separate 16-byte subreg of the XI, which should reduce the number of memory spills needed. However, it means that the 64-bit vector forms must convert between the native 4x64-bit structure layout and the padded 4x128-bit XI layout. The vld4 and vst4 functions did this correctly. But the vld1x4 and vst1x4 functions used a union between the native and padded layouts, even though the layouts are different sizes. This patch makes vld1x4 and vst1x4 use the same approach as vld4 and vst4. It also fixes some uses of variables in the user namespace. gcc/ * config/aarch64/arm_neon.h (vld1_s8_x4, vld1_s16_x4, vld1_s32_x4): (vld1_u8_x4, vld1_u16_x4, vld1_u32_x4, vld1_f16_x4, vld1_f32_x4): (vld1_p8_x4, vld1_p16_x4, vld1_s64_x4, vld1_u64_x4, vld1_p64_x4): (vld1_f64_x4): Avoid using a union of a 256-bit structure and 512-bit XImode integer. Instead use the same approach as the vld4 intrinsics. (vst1_s8_x4, vst1_s16_x4, vst1_s32_x4, vst1_u8_x4, vst1_u16_x4): (vst1_u32_x4, vst1_f16_x4, vst1_f32_x4, vst1_p8_x4, vst1_p16_x4): (vst1_s64_x4, vst1_u64_x4, vst1_p64_x4, vst1_f64_x4, vld1_bf16_x4): (vst1_bf16_x4): Likewise for stores. (vst1q_s8_x4, vst1q_s16_x4, vst1q_s32_x4, vst1q_u8_x4, vst1q_u16_x4): (vst1q_u32_x4, vst1q_f16_x4, vst1q_f32_x4, vst1q_p8_x4, vst1q_p16_x4): (vst1q_s64_x4, vst1q_u64_x4, vst1q_p64_x4, vst1q_f64_x4) (vst1q_bf16_x4): Rename val parameter to __val. Diff: --- gcc/config/aarch64/arm_neon.h | 469 ++++++++++++++++++++++++++++++------------ 1 file changed, 334 insertions(+), 135 deletions(-) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index baa30bd5a9d..8f53f4e1559 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -16498,10 +16498,14 @@ __extension__ extern __inline int8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s8_x4 (const int8_t *__a) { - union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); - return __au.__i; + int8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; } __extension__ extern __inline int8x16x4_t @@ -16518,10 +16522,14 @@ __extension__ extern __inline int16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s16_x4 (const int16_t *__a) { - union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); - return __au.__i; + int16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; } __extension__ extern __inline int16x8x4_t @@ -16538,10 +16546,14 @@ __extension__ extern __inline int32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s32_x4 (const int32_t *__a) { - union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); - return __au.__i; + int32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; } __extension__ extern __inline int32x4x4_t @@ -16558,10 +16570,14 @@ __extension__ extern __inline uint8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u8_x4 (const uint8_t *__a) { - union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); - return __au.__i; + uint8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; } __extension__ extern __inline uint8x16x4_t @@ -16578,10 +16594,14 @@ __extension__ extern __inline uint16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u16_x4 (const uint16_t *__a) { - union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); - return __au.__i; + uint16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; } __extension__ extern __inline uint16x8x4_t @@ -16598,10 +16618,14 @@ __extension__ extern __inline uint32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u32_x4 (const uint32_t *__a) { - union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); - return __au.__i; + uint32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; } __extension__ extern __inline uint32x4x4_t @@ -16618,10 +16642,14 @@ __extension__ extern __inline float16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f16_x4 (const float16_t *__a) { - union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v4hf ((const __builtin_aarch64_simd_hf *) __a); - return __au.__i; + float16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2); + ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3); + return ret; } __extension__ extern __inline float16x8x4_t @@ -16638,10 +16666,14 @@ __extension__ extern __inline float32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f32_x4 (const float32_t *__a) { - union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v2sf ((const __builtin_aarch64_simd_sf *) __a); - return __au.__i; + float32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1); + ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2); + ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3); + return ret; } __extension__ extern __inline float32x4x4_t @@ -16658,10 +16690,14 @@ __extension__ extern __inline poly8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p8_x4 (const poly8_t *__a) { - union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); - return __au.__i; + poly8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; } __extension__ extern __inline poly8x16x4_t @@ -16678,10 +16714,14 @@ __extension__ extern __inline poly16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p16_x4 (const poly16_t *__a) { - union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); - return __au.__i; + poly16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; } __extension__ extern __inline poly16x8x4_t @@ -16698,30 +16738,42 @@ __extension__ extern __inline int64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_s64_x4 (const int64_t *__a) { - union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); - return __au.__i; + int64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return ret; } __extension__ extern __inline uint64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_u64_x4 (const uint64_t *__a) { - union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); - return __au.__i; + uint64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return ret; } __extension__ extern __inline poly64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_p64_x4 (const poly64_t *__a) { - union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); - return __au.__i; + poly64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1); + ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2); + ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3); + return ret; } __extension__ extern __inline int64x2x4_t @@ -16758,10 +16810,14 @@ __extension__ extern __inline float64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_f64_x4 (const float64_t *__a) { - union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4df ((const __builtin_aarch64_simd_df *) __a); - return __au.__i; + float64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)}; + ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)}; + ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)}; + ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)}; + return ret; } __extension__ extern __inline float64x2x4_t @@ -27427,225 +27483,355 @@ vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __val) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s8_x4 (int8_t * __a, int8x8x4_t val) +vst1_s8_x4 (int8_t * __a, int8x8x4_t __val) { - union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s8 (__val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s8_x4 (int8_t * __a, int8x16x4_t val) +vst1q_s8_x4 (int8_t * __a, int8x16x4_t __val) { - union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s16_x4 (int16_t * __a, int16x4x4_t val) +vst1_s16_x4 (int16_t * __a, int16x4x4_t __val) { - union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s16 (__val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s16_x4 (int16_t * __a, int16x8x4_t val) +vst1q_s16_x4 (int16_t * __a, int16x8x4_t __val) { - union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s32_x4 (int32_t * __a, int32x2x4_t val) +vst1_s32_x4 (int32_t * __a, int32x2x4_t __val) { - union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int32x4x4_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s32 (__val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s32_x4 (int32_t * __a, int32x4x4_t val) +vst1q_s32_x4 (int32_t * __a, int32x4x4_t __val) { - union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u8_x4 (uint8_t * __a, uint8x8x4_t val) +vst1_u8_x4 (uint8_t * __a, uint8x8x4_t __val) { - union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u8 (__val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u8_x4 (uint8_t * __a, uint8x16x4_t val) +vst1q_u8_x4 (uint8_t * __a, uint8x16x4_t __val) { - union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u16_x4 (uint16_t * __a, uint16x4x4_t val) +vst1_u16_x4 (uint16_t * __a, uint16x4x4_t __val) { - union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u16 (__val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u16_x4 (uint16_t * __a, uint16x8x4_t val) +vst1q_u16_x4 (uint16_t * __a, uint16x8x4_t __val) { - union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u32_x4 (uint32_t * __a, uint32x2x4_t val) +vst1_u32_x4 (uint32_t * __a, uint32x2x4_t __val) { - union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u32 (__val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u32_x4 (uint32_t * __a, uint32x4x4_t val) +vst1q_u32_x4 (uint32_t * __a, uint32x4x4_t __val) { - union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f16_x4 (float16_t * __a, float16x4x4_t val) +vst1_f16_x4 (float16_t * __a, float16x4x4_t __val) { - union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f16 (__val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f16_x4 (float16_t * __a, float16x8x4_t val) +vst1q_f16_x4 (float16_t * __a, float16x8x4_t __val) { - union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v8hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f32_x4 (float32_t * __a, float32x2x4_t val) +vst1_f32_x4 (float32_t * __a, float32x2x4_t __val) { - union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v2sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f32 (__val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32_x4 (float32_t * __a, float32x4x4_t val) +vst1q_f32_x4 (float32_t * __a, float32x4x4_t __val) { - union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v4sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p8_x4 (poly8_t * __a, poly8x8x4_t val) +vst1_p8_x4 (poly8_t * __a, poly8x8x4_t __val) { - union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p8 (__val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p8_x4 (poly8_t * __a, poly8x16x4_t val) +vst1q_p8_x4 (poly8_t * __a, poly8x16x4_t __val) { - union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p16_x4 (poly16_t * __a, poly16x4x4_t val) +vst1_p16_x4 (poly16_t * __a, poly16x4x4_t __val) { - union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p16 (__val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p16_x4 (poly16_t * __a, poly16x8x4_t val) +vst1q_p16_x4 (poly16_t * __a, poly16x8x4_t __val) { - union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s64_x4 (int64_t * __a, int64x1x4_t val) +vst1_s64_x4 (int64_t * __a, int64x1x4_t __val) { - union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s64 (__val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u64_x4 (uint64_t * __a, uint64x1x4_t val) +vst1_u64_x4 (uint64_t * __a, uint64x1x4_t __val) { - union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u64 (__val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p64_x4 (poly64_t * __a, poly64x1x4_t val) +vst1_p64_x4 (poly64_t * __a, poly64x1x4_t __val) { - union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p64 (__val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[3], 3); + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s64_x4 (int64_t * __a, int64x2x4_t val) +vst1q_s64_x4 (int64_t * __a, int64x2x4_t __val) { - union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u64_x4 (uint64_t * __a, uint64x2x4_t val) +vst1q_u64_x4 (uint64_t * __a, uint64x2x4_t __val) { - union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t val) +vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t __val) { - union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f64_x4 (float64_t * __a, float64x1x4_t val) +vst1_f64_x4 (float64_t * __a, float64x1x4_t __val) { - union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4df ((__builtin_aarch64_simd_df *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f64 (__val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[3], 3); + __builtin_aarch64_st1x4df ((__builtin_aarch64_simd_df *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f64_x4 (float64_t * __a, float64x2x4_t val) +vst1q_f64_x4 (float64_t * __a, float64x2x4_t __val) { - union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v2df ((__builtin_aarch64_simd_df *) __a, __u.__o); } @@ -33380,10 +33566,14 @@ __extension__ extern __inline bfloat16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vld1_bf16_x4 (const bfloat16_t *__a) { - union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; - __au.__o - = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a); - return __au.__i; + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; } __extension__ extern __inline bfloat16x8x4_t @@ -33640,17 +33830,26 @@ vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t __val) { - union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t __val) { - union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val }; __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); }