-#ifdef __ARM_BIG_ENDIAN
-#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))
-#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) -
1))
-#define __arm_laneq(__vec, __idx) (__idx ^
(__ARM_NUM_LANES(__vec)/2 - 1))
-#else
-#define __arm_lane(__vec, __idx) __idx
-#define __arm_laneq(__vec, __idx) __idx
-#endif
#define vget_lane_f16(__v, __idx) \
__extension__ \
@@ -14476,6 +14477,15 @@ vreinterpret_p16_u32 (uint32x2_t __a)
#if defined (__ARM_FP16_FORMAT_IEEE) || defined
(__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f16_bf16 (bfloat16x4_t __a)
+{
+ return (float16x4_t) __a;
+}
+#endif
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined
(__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpret_f16_p8 (poly8x8_t __a)
{
return (float16x4_t) __a;
@@ -15688,6 +15698,15 @@ vreinterpretq_f16_p16 (poly16x8_t __a)
#if defined (__ARM_FP16_FORMAT_IEEE) || defined
(__ARM_FP16_FORMAT_ALTERNATIVE)
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f16_bf16 (bfloat16x8_t __a)
+{
+ return (float16x8_t) __a;
+}
+#endif
+
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined
(__ARM_FP16_FORMAT_ALTERNATIVE)
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_f16_f32 (float32x4_t __a)
{
return (float16x8_t) __a;
@@ -18750,6 +18769,492 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r,
float32x4_t __a, float32x4_t __b,
#pragma GCC push_options
#pragma GCC target ("arch=armv8.2-a+bf16")
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcreate_bf16 (uint64_t __a)
+{
+ return (bfloat16x4_t) __a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_bf16 (bfloat16_t __a)
+{
+ return __builtin_neon_vdup_nv4bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_bf16 (bfloat16_t __a)
+{
+ return __builtin_neon_vdup_nv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+ return __builtin_neon_vdup_lanev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+ return __builtin_neon_vdup_lanev8bf (__a, __b);
+}
+
+#define vset_lane_bf16(__e, __v, __idx) \
+ __extension__ \
+ ({ \
+ bfloat16_t __elem = (__e); \
+ bfloat16x4_t __vec = (__v); \
+ __builtin_arm_lane_check (4, __idx); \
+ __vec[__arm_lane(__vec, __idx)] = __elem; \
+ __vec; \
+ })
+
+#define vsetq_lane_bf16(__e, __v, __idx) \
+ __extension__ \
+ ({ \
+ bfloat16_t __elem = (__e); \
+ bfloat16x8_t __vec = (__v); \
+ __builtin_arm_lane_check (8, __idx); \
+ __vec[__arm_laneq(__vec, __idx)] = __elem; \
+ __vec; \
+ })
+
+#define vget_lane_bf16(__v, __idx) \
+ __extension__ \
+ ({ \
+ bfloat16x4_t __vec = (__v); \
+ __builtin_arm_lane_check (4, __idx); \
+ bfloat16_t __res = __vec[__arm_lane(__vec, __idx)]; \
+ __res; \
+ })
+
+#define vgetq_lane_bf16(__v, __idx) \
+ __extension__ \
+ ({ \
+ bfloat16x8_t __vec = (__v); \
+ __builtin_arm_lane_check (8, __idx); \
+ bfloat16_t __res = __vec[__arm_laneq(__vec, __idx)]; \
+ __res; \
+ })
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+ return vdup_n_bf16( vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+ return vdupq_n_bf16( vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+ return vget_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+ return vgetq_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_neon_vget_highv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_neon_vget_lowv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b)
+{
+ return __builtin_neon_vcombinev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u8 (uint8x8_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u16 (uint16x4_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u32 (uint32x2_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u64 (uint64x1_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s8 (int8x8_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s16 (int16x4_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s32 (int32x2_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s64 (int64x1_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p8 (poly8x8_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p16 (poly16x4_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p64 (poly64x1_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f16 (float16x4_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f32 (float32x2_t __a)
+{
+ return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u8 (uint8x16_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u16 (uint16x8_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u32 (uint32x4_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u64 (uint64x2_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s8 (int8x16_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s16 (int16x8_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s32 (int32x4_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s64 (int64x2_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p8 (poly8x16_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p16 (poly16x8_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p64 (poly64x2_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p128 (poly128_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f16 (float16x8_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f32 (float32x4_t __a)
+{
+ return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s8_bf16 (bfloat16x4_t __a)
+{
+ return (int8x8_t)__a;
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s16_bf16 (bfloat16x4_t __a)
+{
+ return (int16x4_t)__a;
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s32_bf16 (bfloat16x4_t __a)
+{
+ return (int32x2_t)__a;
+}
+
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s64_bf16 (bfloat16x4_t __a)
+{
+ return (int64x1_t)__a;
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u8_bf16 (bfloat16x4_t __a)
+{
+ return (uint8x8_t)__a;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u16_bf16 (bfloat16x4_t __a)
+{
+ return (uint16x4_t)__a;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u32_bf16 (bfloat16x4_t __a)
+{
+ return (uint32x2_t)__a;
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u64_bf16 (bfloat16x4_t __a)
+{
+ return (uint64x1_t)__a;
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f32_bf16 (bfloat16x4_t __a)
+{
+ return (float32x2_t)__a;
+}
+
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p8_bf16 (bfloat16x4_t __a)
+{
+ return (poly8x8_t)__a;
+}
+
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p16_bf16 (bfloat16x4_t __a)
+{
+ return (poly16x4_t)__a;
+}
+
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p64_bf16 (bfloat16x4_t __a)
+{
+ return (poly64x1_t)__a;
+}
+
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s8_bf16 (bfloat16x8_t __a)
+{
+ return (int8x16_t)__a;
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s16_bf16 (bfloat16x8_t __a)
+{
+ return (int16x8_t)__a;
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s32_bf16 (bfloat16x8_t __a)
+{
+ return (int32x4_t)__a;
+}
+
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s64_bf16 (bfloat16x8_t __a)
+{
+ return (int64x2_t)__a;
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_bf16 (bfloat16x8_t __a)
+{
+ return (uint8x16_t)__a;
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_bf16 (bfloat16x8_t __a)
+{
+ return (uint16x8_t)__a;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_bf16 (bfloat16x8_t __a)
+{
+ return (uint32x4_t)__a;
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u64_bf16 (bfloat16x8_t __a)
+{
+ return (uint64x2_t)__a;
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f32_bf16 (bfloat16x8_t __a)
+{
+ return (float32x4_t)__a;
+}
+
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p8_bf16 (bfloat16x8_t __a)
+{
+ return (poly8x16_t)__a;
+}
+
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p16_bf16 (bfloat16x8_t __a)
+{
+ return (poly16x8_t)__a;
+}
+
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p64_bf16 (bfloat16x8_t __a)
+{
+ return (poly64x2_t)__a;
+}
+
+__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p128_bf16 (bfloat16x8_t __a)
+{
+ return (poly128_t)__a;
+}
+
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
diff --git a/gcc/config/arm/arm_neon_builtins.def
b/gcc/config/arm/arm_neon_builtins.def
index
85aeaecf0dc7579f511d0979708635ed65399614..bf28b24b108a081a023aa76f70d4da8bc0cc2d7e
100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane,
VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
VAR10 (UNOP, vdup_n,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR2 (UNOP, vdup_n, v8hf, v4hf)
+VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf)
VAR10 (GETLANE, vdup_lane,
v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR2 (GETLANE, vdup_lane, v8hf, v4hf)
-VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
-VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf)
+VAR7 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf)
+VAR7 (UNOP, vget_high, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
+VAR7 (UNOP, vget_low, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
VAR3 (UNOP, vmovn, v8hi, v4si, v2di)
VAR3 (UNOP, vqmovns, v8hi, v4si, v2di)
VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
@@ -376,4 +376,4 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
VAR2 (TERNOP, vbfdot, v2sf, v4sf)
VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
-VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
\ No newline at end of file
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index
cf5bfb4c77a7be0400bada8c517b877537f4d2c6..1b6aada0d0879a7f521bf868ad2c19166962fff2
100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -82,14 +82,14 @@
(define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI])
;; Double-width vector modes plus 64-bit elements.
-(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI])
+(define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI])
;; Double-width vector modes plus 64-bit elements,
;; with V4BFmode added, suitable for moves.
(define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])
;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane.
-(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF])
+(define_mode_iterator VD_LANE [V8QI V4HI V4HF V4BF V2SI V2SF])
;; Double-width vector modes without floating-point elements.
(define_mode_iterator VDI [V8QI V4HI V2SI])
@@ -104,7 +104,7 @@
(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
;; Quad-width vector modes plus 64-bit elements.
-(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
+(define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI])
;; Quad-width vector modes without floating-point elements.
(define_mode_iterator VQI [V16QI V8HI V4SI])
@@ -153,7 +153,7 @@
;; Vector modes, including 64-bit integer elements.
(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI
- V4HF V8HF V2SF V4SF DI V2DI])
+ V4HF V8HF V4BF V8BF V2SF V4SF DI V2DI])
;; Vector modes including 64-bit integer elements, but no floats.
(define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
@@ -518,6 +518,7 @@
(define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
(V4HI "HI") (V8HI "HI")
(V4HF "HF") (V8HF "HF")
+ (V4BF "BF") (V8BF "BF")
(V2SI "SI") (V4SI "SI")
(V2SF "SF") (V4SF "SF")
(DI "DI") (V2DI "DI")])
@@ -526,6 +527,7 @@
(define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi")
(V4HI "hi") (V8HI "hi")
(V4HF "hf") (V8HF "hf")
+ (V4BF "bf") (V8BF "bf")
(V2SI "si") (V4SI "si")
(V2SF "sf") (V4SF "sf")
(DI "di") (V2DI "di")])
@@ -543,6 +545,7 @@
(define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI")
(V4HI "SI") (V8HI "SI")
(V4HF "SF") (V8HF "SF")
+ (V4BF "BF") (V8BF "BF")
(V2SI "V2SI") (V4SI "V2SI")
(V2SF "V2SF") (V4SF "V2SF")
(DI "V2DI") (V2DI "V2DI")])
@@ -563,6 +566,7 @@
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK")
(V4HF "BLK") (V8HF "BLK")
+ (V4BF "BLK") (V8BF "BLK")
(V2SI "BLK") (V4SI "BLK")
(V2SF "BLK") (V4SF "BLK")
(DI "EI") (V2DI "EI")])
@@ -571,6 +575,7 @@
(define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI")
(V4HI "V4HI") (V8HI "V4HI")
(V4HF "V4HF") (V8HF "V4HF")
+ (V4BF "V4BF") (V8BF "V4BF")
(V2SI "V4SI") (V4SI "V4SI")
(V2SF "V4SF") (V4SF "V4SF")
(DI "OI") (V2DI "OI")])
@@ -579,6 +584,7 @@
(define_mode_attr V_reg [(V8QI "P") (V16QI "q")
(V4HI "P") (V8HI "q")
(V4HF "P") (V8HF "q")
+ (V4BF "P") (V8BF "q")
(V2SI "P") (V4SI "q")
(V2SF "P") (V4SF "q")
(DI "P") (V2DI "q")
@@ -609,7 +615,8 @@
(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
(V8HF "V4HF") (V4SI "V2SI")
(V4SF "V2SF") (V2DF "DF")
- (V2DI "DI") (V4HF "HF")])
+ (V2DI "DI") (V4HF "HF")
+ (V4BF "BF") (V8BF "V4BF")])
;; Same, but lower-case.
(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -620,7 +627,7 @@
(define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
(V2SI "V4SI") (V4HF "V8HF")
�� (V2SF "V4SF") (DF "V2DF")
- (DI "V2DI")])
+ (DI "V2DI") (V4BF "V8BF")])
;; Same, but lower-case.
(define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
@@ -639,6 +646,7 @@
(V4SI "V2SI") (V4SF "V2SF")
(V8QI "V8QI") (V4HI "V4HI")
(V2SI "V2SI") (V2SF "V2SF")
+ (V8BF "V4BF") (V4BF "V4BF")
(V8HF "V4HF") (V4HF "V4HF")])
;; Mode of result of comparison operations (and bit-select operand 1).
@@ -646,6 +654,7 @@
(V4HI "V4HI") (V8HI "V8HI")
(V2SI "V2SI") (V4SI "V4SI")
(V4HF "V4HI") (V8HF "V8HI")
+ (V4BF "V4HI") (V8BF "V8HI")
(V2SF "V2SI") (V4SF "V4SI")
(DI "DI") (V2DI "V2DI")])
@@ -687,6 +696,7 @@
(V4HI "u16") (V8HI "u16")
(V2SI "32") (V4SI "32")
(V4HF "u16") (V8HF "u16")
+ (V4BF "u16") (V8BF "u16")
(V2SF "32") (V4SF "32")])
(define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8")
@@ -694,6 +704,7 @@
(V2SI "32") (V4SI "32")
(DI "64") (V2DI "64")
(V4HF "16") (V8HF "16")
+ (V4BF "16") (V8BF "16")
(V2SF "32") (V4SF "32")])
(define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b")
@@ -764,10 +775,12 @@
(V2SI "true") (V4SI "false")
(V2SF "true") (V4SF "false")
(DI "true") (V2DI "false")
+ (V4BF "true") (V8BF "false")
(V4HF "true") (V8HF "false")])
(define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
(V4HF "4") (V8HF "8")
+ (V4BF "4") (V8BF "8")
(V4HI "4") (V8HI "8")
(V2SI "2") (V4SI "4")
(V2SF "2") (V4SF "4")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index
21701b34fcd2c86bfd310904c7ceca0ce9fb047e..e732600719e2c0df35e1ec0a4ed1cb235dc25726
100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3704,6 +3704,22 @@ if (BYTES_BIG_ENDIAN)
[(set_attr "type" "neon_from_gp_q")]
)
+(define_insn "neon_vdup_nv4bf"
+ [(set (match_operand:V4BF 0 "s_register_operand" "=w")
+ (vec_duplicate:V4BF (match_operand:BF 1
"s_register_operand" "r")))]
+ "TARGET_NEON"
+ "vdup.16\t%P0, %1"
+ [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8bf"
+ [(set (match_operand:V8BF 0 "s_register_operand" "=w")
+ (vec_duplicate:V8BF (match_operand:BF 1
"s_register_operand" "r")))]
+ "TARGET_NEON"
+ "vdup.16\t%q0, %1"
+ [(set_attr "type" "neon_from_gp_q")]
+)
+
(define_insn "neon_vdup_n<mode>"
[(set (match_operand:V32 0 "s_register_operand" "=w,w")
(vec_duplicate:V32 (match_operand:<V_elem> 1
"s_register_operand" "r,t")))]
@@ -3737,7 +3753,7 @@ if (BYTES_BIG_ENDIAN)
(define_insn "neon_vdup_lane<mode>_internal"
[(set (match_operand:VDQW 0 "s_register_operand" "=w")
- (vec_duplicate:VDQW
+ (vec_duplicate:VDQW
(vec_select:<V_elem>
(match_operand:<V_double_vector_mode> 1
"s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand"
"i")]))))]
@@ -3758,12 +3774,12 @@ if (BYTES_BIG_ENDIAN)
)
(define_insn "neon_vdup_lane<mode>_internal"
- [(set (match_operand:VH 0 "s_register_operand" "=w")
- (vec_duplicate:VH
+ [(set (match_operand:VHFBF 0 "s_register_operand" "=w")
+ (vec_duplicate:VHFBF
(vec_select:<V_elem>
(match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
(parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
- "TARGET_NEON && TARGET_FP16"
+ "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{
if (BYTES_BIG_ENDIAN)
{
@@ -3799,10 +3815,10 @@ if (BYTES_BIG_ENDIAN)
})
(define_expand "neon_vdup_lane<mode>"
- [(match_operand:VH 0 "s_register_operand")
+ [(match_operand:VHFBF 0 "s_register_operand")
(match_operand:<V_double_vector_mode> 1 "s_register_operand")
(match_operand:SI 2 "immediate_operand")]
- "TARGET_NEON && TARGET_FP16"
+ "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
{
if (BYTES_BIG_ENDIAN)
{
@@ -6599,4 +6615,4 @@ if (BYTES_BIG_ENDIAN)
}
}
[(set_attr "type" "neon_dot<q>")]
-)
\ No newline at end of file
+)
diff --git a/gcc/testsuite/gcc.target/arm/bf16_dup.c
b/gcc/testsuite/gcc.target/arm/bf16_dup.c
new file mode 100644
index
0000000000000000000000000000000000000000..82dff25fc6e244a1d930375a1e3505e9173e53dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/bf16_dup.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8.2-a+bf16+fp16
-mfloat-abi=softfp" } */
+