Re: [GCC][PATCH][ARM] Add vreinterpret, vdup, vget and vset bfloat16 intrinsic

Kyrill Tkachov Thu, 27 Feb 2020 09:04:25 -0800

Hi Mihail,

On 2/27/20 2:44 PM, Mihail Ionescu wrote:

Hi Kyrill,


On 02/27/2020 11:09 AM, Kyrill Tkachov wrote:

Hi Mihail,

On 2/27/20 10:27 AM, Mihail Ionescu wrote:

Hi,

This patch adds support for the bf16 vector create, get, set,
duplicate and reinterpret intrinsics.
ACLE documents are at https://developer.arm.com/docs/101028/latest
ISA documents are at https://developer.arm.com/docs/ddi0596/latest

Regression tested on arm-none-eabi.


gcc/ChangeLog:

2020-02-27  Mihail Ionescu  <mihail.ione...@arm.com>

        * (__ARM_NUM_LANES, __arm_lane, __arm_lane_q): Move to the
        beginning of the file.
        (vcreate_bf16, vcombine_bf16): New.
        (vdup_n_bf16, vdupq_n_bf16): New.
        (vdup_lane_bf16, vdup_laneq_bf16): New.
        (vdupq_lane_bf16, vdupq_laneq_bf16): New.
        (vduph_lane_bf16, vduph_laneq_bf16): New.
        (vset_lane_bf16, vsetq_lane_bf16): New.
        (vget_lane_bf16, vgetq_lane_bf16): New.
        (vget_high_bf16, vget_low_bf16): New.
        (vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New.
        (vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New.
        (vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New.
        (vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New.
        (vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New.
        (vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New.
        (vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New.
        (vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New.
        (vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New.
        (vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New.
        (vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New.
        (vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New.
        (vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New.
        (vreinterpretq_bf16_p128): New.
        (vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New.
        (vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New.
        (vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New.
        (vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New.
        (vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New.
        (vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New.
        (vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New.
        (vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New.
        (vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New.
        (vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New.
        (vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New.
        (vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New.
        (vreinterpretq_p128_bf16): New.
        * config/arm/arm_neon_builtins.def (VDX): Add V4BF.
        (V_elem): Likewise.
        (V_elem_l): Likewise.
        (VD_LANE): Likewise.
        (VQX) Add V8BF.
        (V_DOUBLE): Likewise.
        (VDQX): Add V4BF and V8BF.
        (V_two_elem, V_three_elem, V_four_elem): Likewise.
        (V_reg): Likewise.
        (V_HALF): Likewise.
        (V_double_vector_mode): Likewise.
        (V_cmp_result): Likewise.
        (V_uf_sclr): Likewise.
        (V_sz_elem): Likewise.
        (Is_d_reg): Likewise.
        (V_mode_nunits): Likewise.
        * config/arm/neon.md (neon_vdup_lane): Enable for BFloat.

gcc/testsuite/ChangeLog:

2020-02-27  Mihail Ionescu  <mihail.ione...@arm.com>

        * gcc.target/arm/bf16_dup.c: New test.
        * gcc.target/arm/bf16_reinterpret.c: Likewise.

Is it ok for trunk?


This looks mostly ok with a few nits...


Regards,
Mihail

############### Attachment also inlined for ease of reply###############



diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h

index09297831cdcd6e695843c17b7724c114f3a129fe..5901a8f1fb84f204ae95f0ccc97bf5ae944c482c100644

--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -42,6 +42,15 @@ extern "C" {
 #include <arm_bf16.h>
 #include <stdint.h>

+#ifdef __ARM_BIG_ENDIAN
+#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))

+#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) -1))+#define __arm_laneq(__vec, __idx) (__idx ^(__ARM_NUM_LANES(__vec)/2 - 1))

+#else
+#define __arm_lane(__vec, __idx) __idx
+#define __arm_laneq(__vec, __idx) __idx
+#endif
+
 typedef __simd64_int8_t int8x8_t;
 typedef __simd64_int16_t int16x4_t;
 typedef __simd64_int32_t int32x2_t;
@@ -6147,14 +6156,6 @@ vget_lane_s32 (int32x2_t __a, const int __b)
   /* For big-endian, GCC's vector indices are reversed within each 64
      bits compared to the architectural lane indices used by Neon
      intrinsics.  */



Please move this comment as well.

-#ifdef __ARM_BIG_ENDIAN
-#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0]))

-#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) -1))-#define __arm_laneq(__vec, __idx) (__idx ^(__ARM_NUM_LANES(__vec)/2 - 1))

-#else
-#define __arm_lane(__vec, __idx) __idx
-#define __arm_laneq(__vec, __idx) __idx
-#endif

 #define vget_lane_f16(__v, __idx)                       \
__extension__ \
@@ -14476,6 +14477,15 @@ vreinterpret_p16_u32 (uint32x2_t __a)

#if defined (__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE)

 __extension__ extern __inline float16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f16_bf16 (bfloat16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+#endif
+

+#if defined (__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE)

+__extension__ extern __inline float16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpret_f16_p8 (poly8x8_t __a)
 {
   return (float16x4_t) __a;
@@ -15688,6 +15698,15 @@ vreinterpretq_f16_p16 (poly16x8_t __a)

#if defined (__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE)

 __extension__ extern __inline float16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f16_bf16 (bfloat16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+#endif
+

+#if defined (__ARM_FP16_FORMAT_IEEE) || defined(__ARM_FP16_FORMAT_ALTERNATIVE)

+__extension__ extern __inline float16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vreinterpretq_f16_f32 (float32x4_t __a)
 {
   return (float16x8_t) __a;

@@ -18750,6 +18769,492 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r,float32x4_t __a, float32x4_t __b,

 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.2-a+bf16")

+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vcreate_bf16 (uint64_t __a)
+{
+  return (bfloat16x4_t) __a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_bf16 (bfloat16_t __a)
+{
+  return __builtin_neon_vdup_nv4bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_bf16 (bfloat16_t __a)
+{
+  return __builtin_neon_vdup_nv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return __builtin_neon_vdup_lanev8bf (__a, __b);
+}
+
+#define vset_lane_bf16(__e, __v, __idx)                \
+  __extension__                                        \
+  ({                                           \
+    bfloat16_t __elem = (__e);                 \
+    bfloat16x4_t __vec = (__v);                        \
+    __builtin_arm_lane_check (4, __idx);       \
+    __vec[__arm_lane(__vec, __idx)] = __elem;  \
+    __vec;                                     \
+  })
+
+#define vsetq_lane_bf16(__e, __v, __idx)               \
+  __extension__                                        \
+  ({                                           \
+    bfloat16_t __elem = (__e);                 \
+    bfloat16x8_t __vec = (__v);                        \
+    __builtin_arm_lane_check (8, __idx);       \
+    __vec[__arm_laneq(__vec, __idx)] = __elem; \
+    __vec;                                     \
+  })
+
+#define vget_lane_bf16(__v, __idx)                     \
+ __extension__ \
+  ({                                                   \
+    bfloat16x4_t __vec = (__v);                                \
+    __builtin_arm_lane_check (4, __idx);               \
+    bfloat16_t __res = __vec[__arm_lane(__vec, __idx)];        \
+    __res;                                             \
+  })
+
+#define vgetq_lane_bf16(__v, __idx)                    \
+ __extension__ \
+  ({                                                   \
+    bfloat16x8_t __vec = (__v);                                \
+    __builtin_arm_lane_check (8, __idx);               \
+    bfloat16_t __res = __vec[__arm_laneq(__vec, __idx)];       \
+    __res;                                             \
+  })
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdup_n_bf16( vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vdupq_n_bf16( vgetq_lane_bf16 (__a, __b));
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  return vget_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  return vgetq_lane_bf16 (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vget_highv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_neon_vget_lowv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_neon_vcombinev4bf (__a, __b);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u8 (uint8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u16 (uint16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u32 (uint32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u64 (uint64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s8 (int8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s16 (int16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s32 (int32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s64 (int64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p8 (poly8x8_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p16 (poly16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p64 (poly64x1_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f16 (float16x4_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f32 (float32x2_t __a)
+{
+  return (bfloat16x4_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u8 (uint8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u16 (uint16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u32 (uint32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u64 (uint64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s8 (int8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s16 (int16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s32 (int32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s64 (int64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p8 (poly8x16_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p16 (poly16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p64 (poly64x2_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p128 (poly128_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f16 (float16x8_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f32 (float32x4_t __a)
+{
+  return (bfloat16x8_t)__a;
+}
+
+__extension__ extern __inline int8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s8_bf16 (bfloat16x4_t __a)
+{
+  return (int8x8_t)__a;
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s16_bf16 (bfloat16x4_t __a)
+{
+  return (int16x4_t)__a;
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s32_bf16 (bfloat16x4_t __a)
+{
+  return (int32x2_t)__a;
+}
+
+__extension__ extern __inline int64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s64_bf16 (bfloat16x4_t __a)
+{
+  return (int64x1_t)__a;
+}
+
+__extension__ extern __inline uint8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u8_bf16 (bfloat16x4_t __a)
+{
+  return (uint8x8_t)__a;
+}
+
+__extension__ extern __inline uint16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u16_bf16 (bfloat16x4_t __a)
+{
+  return (uint16x4_t)__a;
+}
+
+__extension__ extern __inline uint32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u32_bf16 (bfloat16x4_t __a)
+{
+  return (uint32x2_t)__a;
+}
+
+__extension__ extern __inline uint64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u64_bf16 (bfloat16x4_t __a)
+{
+  return (uint64x1_t)__a;
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f32_bf16 (bfloat16x4_t __a)
+{
+  return (float32x2_t)__a;
+}
+
+__extension__ extern __inline poly8x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p8_bf16 (bfloat16x4_t __a)
+{
+  return (poly8x8_t)__a;
+}
+
+__extension__ extern __inline poly16x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p16_bf16 (bfloat16x4_t __a)
+{
+  return (poly16x4_t)__a;
+}
+
+__extension__ extern __inline poly64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p64_bf16 (bfloat16x4_t __a)
+{
+  return (poly64x1_t)__a;
+}
+
+__extension__ extern __inline int8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s8_bf16 (bfloat16x8_t __a)
+{
+  return (int8x16_t)__a;
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s16_bf16 (bfloat16x8_t __a)
+{
+  return (int16x8_t)__a;
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s32_bf16 (bfloat16x8_t __a)
+{
+  return (int32x4_t)__a;
+}
+
+__extension__ extern __inline int64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s64_bf16 (bfloat16x8_t __a)
+{
+  return (int64x2_t)__a;
+}
+
+__extension__ extern __inline uint8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_bf16 (bfloat16x8_t __a)
+{
+  return (uint8x16_t)__a;
+}
+
+__extension__ extern __inline uint16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_bf16 (bfloat16x8_t __a)
+{
+  return (uint16x8_t)__a;
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_bf16 (bfloat16x8_t __a)
+{
+  return (uint32x4_t)__a;
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u64_bf16 (bfloat16x8_t __a)
+{
+  return (uint64x2_t)__a;
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f32_bf16 (bfloat16x8_t __a)
+{
+  return (float32x4_t)__a;
+}
+
+__extension__ extern __inline poly8x16_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p8_bf16 (bfloat16x8_t __a)
+{
+  return (poly8x16_t)__a;
+}
+
+__extension__ extern __inline poly16x8_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p16_bf16 (bfloat16x8_t __a)
+{
+  return (poly16x8_t)__a;
+}
+
+__extension__ extern __inline poly64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p64_bf16 (bfloat16x8_t __a)
+{
+  return (poly64x2_t)__a;
+}
+
+__extension__ extern __inline poly128_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p128_bf16 (bfloat16x8_t __a)
+{
+  return (poly128_t)__a;
+}
+
 __extension__ extern __inline float32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)

diff --git a/gcc/config/arm/arm_neon_builtins.defb/gcc/config/arm/arm_neon_builtins.defindex85aeaecf0dc7579f511d0979708635ed65399614..bf28b24b108a081a023aa76f70d4da8bc0cc2d7e100644

--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane,
 VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di)
 VAR10 (UNOP, vdup_n,
          v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR2 (UNOP, vdup_n, v8hf, v4hf)
+VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf)
 VAR10 (GETLANE, vdup_lane,
          v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR2 (GETLANE, vdup_lane, v8hf, v4hf)
-VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
-VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di)
+VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf)
+VAR7 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf)
+VAR7 (UNOP, vget_high, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
+VAR7 (UNOP, vget_low, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di)
 VAR3 (UNOP, vmovn, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovns, v8hi, v4si, v2di)
 VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di)
@@ -376,4 +376,4 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)

 VAR2 (TERNOP, vbfdot, v2sf, v4sf)
 VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
-VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
\ No newline at end of file
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

indexcf5bfb4c77a7be0400bada8c517b877537f4d2c6..1b6aada0d0879a7f521bf868ad2c19166962fff2100644

--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -82,14 +82,14 @@
 (define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI])

 ;; Double-width vector modes plus 64-bit elements.
-(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI])
+(define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI])

 ;; Double-width vector modes plus 64-bit elements,
 ;; with V4BFmode added, suitable for moves.
 (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])

 ;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane.
-(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF])
+(define_mode_iterator VD_LANE [V8QI V4HI V4HF V4BF V2SI V2SF])

 ;; Double-width vector modes without floating-point elements.
 (define_mode_iterator VDI [V8QI V4HI V2SI])
@@ -104,7 +104,7 @@
 (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])

 ;; Quad-width vector modes plus 64-bit elements.
-(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI])
+(define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI])

 ;; Quad-width vector modes without floating-point elements.
 (define_mode_iterator VQI [V16QI V8HI V4SI])
@@ -153,7 +153,7 @@

 ;; Vector modes, including 64-bit integer elements.
 (define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI
-                           V4HF V8HF V2SF V4SF DI V2DI])
+                           V4HF V8HF V4BF V8BF V2SF V4SF DI V2DI])

 ;; Vector modes including 64-bit integer elements, but no floats.
 (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
@@ -518,6 +518,7 @@
 (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
                           (V4HI "HI") (V8HI "HI")
                           (V4HF "HF") (V8HF "HF")
+                         (V4BF "BF") (V8BF "BF")
                           (V2SI "SI") (V4SI "SI")
                           (V2SF "SF") (V4SF "SF")
                           (DI "DI")   (V2DI "DI")])
@@ -526,6 +527,7 @@
 (define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi")
                             (V4HI "hi") (V8HI "hi")
                             (V4HF "hf") (V8HF "hf")
+                           (V4BF "bf") (V8BF "bf")
                             (V2SI "si") (V4SI "si")
                             (V2SF "sf") (V4SF "sf")
                             (DI "di")   (V2DI "di")])
@@ -543,6 +545,7 @@
 (define_mode_attr V_two_elem [(V8QI "HI")   (V16QI "HI")
                               (V4HI "SI")   (V8HI "SI")
                               (V4HF "SF")   (V8HF "SF")
+                              (V4BF "BF")   (V8BF "BF")
                               (V2SI "V2SI") (V4SI "V2SI")
                               (V2SF "V2SF") (V4SF "V2SF")
                               (DI "V2DI")   (V2DI "V2DI")])
@@ -563,6 +566,7 @@
 (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
                                 (V4HI "BLK") (V8HI "BLK")
                                 (V4HF "BLK") (V8HF "BLK")
+                                (V4BF "BLK") (V8BF "BLK")
                                 (V2SI "BLK") (V4SI "BLK")
                                 (V2SF "BLK") (V4SF "BLK")
                                 (DI "EI")    (V2DI "EI")])
@@ -571,6 +575,7 @@
 (define_mode_attr V_four_elem [(V8QI "SI")   (V16QI "SI")
                                (V4HI "V4HI") (V8HI "V4HI")
                                (V4HF "V4HF") (V8HF "V4HF")
+                               (V4BF "V4BF") (V8BF "V4BF")
                                (V2SI "V4SI") (V4SI "V4SI")
                                (V2SF "V4SF") (V4SF "V4SF")
                                (DI "OI")     (V2DI "OI")])
@@ -579,6 +584,7 @@
 (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
                          (V4HI "P") (V8HI  "q")
                          (V4HF "P") (V8HF  "q")
+                        (V4BF "P") (V8BF  "q")
                          (V2SI "P") (V4SI  "q")
                          (V2SF "P") (V4SF  "q")
                          (DI   "P") (V2DI  "q")
@@ -609,7 +615,8 @@
 (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
                           (V8HF "V4HF") (V4SI  "V2SI")
                           (V4SF "V2SF") (V2DF "DF")
-                         (V2DI "DI") (V4HF "HF")])
+                         (V2DI "DI") (V4HF "HF")
+                         (V4BF "BF") (V8BF  "V4BF")])

 ;; Same, but lower-case.
 (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
@@ -620,7 +627,7 @@
 (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
                             (V2SI "V4SI") (V4HF "V8HF")
                    ��        (V2SF "V4SF") (DF "V2DF")
-                           (DI "V2DI")])
+                           (DI "V2DI")   (V4BF "V8BF")])

 ;; Same, but lower-case.
 (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
@@ -639,6 +646,7 @@
                                         (V4SI "V2SI") (V4SF "V2SF")
                                         (V8QI "V8QI") (V4HI "V4HI")
                                         (V2SI "V2SI") (V2SF "V2SF")
+                                       (V8BF "V4BF") (V4BF "V4BF")
                                         (V8HF "V4HF") (V4HF "V4HF")])

 ;; Mode of result of comparison operations (and bit-select operand 1).
@@ -646,6 +654,7 @@
                                 (V4HI "V4HI") (V8HI "V8HI")
                                 (V2SI "V2SI") (V4SI "V4SI")
                                 (V4HF "V4HI") (V8HF "V8HI")
+                               (V4BF "V4HI") (V8BF  "V8HI")
                                 (V2SF "V2SI") (V4SF "V4SI")
                                 (DI   "DI")   (V2DI "V2DI")])

@@ -687,6 +696,7 @@
                  (V4HI "u16") (V8HI "u16")
                              (V2SI "32") (V4SI "32")
                              (V4HF "u16") (V8HF "u16")
+                             (V4BF "u16") (V8BF "u16")
                              (V2SF "32") (V4SF "32")])

 (define_mode_attr V_sz_elem [(V8QI "8")  (V16QI "8")
@@ -694,6 +704,7 @@
                              (V2SI "32") (V4SI  "32")
                              (DI   "64") (V2DI  "64")
                              (V4HF "16") (V8HF "16")
+                            (V4BF "16") (V8BF "16")
                              (V2SF "32") (V4SF  "32")])

 (define_mode_attr V_elem_ch [(V8QI "b")  (V16QI "b")
@@ -764,10 +775,12 @@
                             (V2SI "true") (V4SI  "false")
                             (V2SF "true") (V4SF  "false")
                             (DI   "true") (V2DI  "false")
+                           (V4BF "true") (V8BF  "false")
                             (V4HF "true") (V8HF  "false")])

 (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
                                  (V4HF "4") (V8HF "8")
+                                (V4BF "4") (V8BF "8")
                                  (V4HI "4") (V8HI "8")
                                  (V2SI "2") (V4SI "4")
                                  (V2SF "2") (V4SF "4")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md

index21701b34fcd2c86bfd310904c7ceca0ce9fb047e..e732600719e2c0df35e1ec0a4ed1cb235dc25726100644

--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3704,6 +3704,22 @@ if (BYTES_BIG_ENDIAN)
   [(set_attr "type" "neon_from_gp_q")]
 )

+(define_insn "neon_vdup_nv4bf"
+  [(set (match_operand:V4BF 0 "s_register_operand" "=w")

+ (vec_duplicate:V4BF (match_operand:BF 1"s_register_operand" "r")))]

+  "TARGET_NEON"
+  "vdup.16\t%P0, %1"
+  [(set_attr "type" "neon_from_gp")]
+)
+
+(define_insn "neon_vdup_nv8bf"
+  [(set (match_operand:V8BF 0 "s_register_operand" "=w")

+ (vec_duplicate:V8BF (match_operand:BF 1"s_register_operand" "r")))]

+  "TARGET_NEON"
+  "vdup.16\t%q0, %1"
+  [(set_attr "type" "neon_from_gp_q")]
+)
+
 (define_insn "neon_vdup_n<mode>"
   [(set (match_operand:V32 0 "s_register_operand" "=w,w")

(vec_duplicate:V32 (match_operand:<V_elem> 1"s_register_operand" "r,t")))]

@@ -3737,7 +3753,7 @@ if (BYTES_BIG_ENDIAN)

 (define_insn "neon_vdup_lane<mode>_internal"
   [(set (match_operand:VDQW 0 "s_register_operand" "=w")
-        (vec_duplicate:VDQW
+        (vec_duplicate:VDQW
           (vec_select:<V_elem>

(match_operand:<V_double_vector_mode> 1"s_register_operand" "w") (parallel [(match_operand:SI 2 "immediate_operand""i")]))))]

@@ -3758,12 +3774,12 @@ if (BYTES_BIG_ENDIAN)
 )

 (define_insn "neon_vdup_lane<mode>_internal"
- [(set (match_operand:VH 0 "s_register_operand" "=w")
-   (vec_duplicate:VH
+ [(set (match_operand:VHFBF 0 "s_register_operand" "=w")
+   (vec_duplicate:VHFBF
     (vec_select:<V_elem>
      (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w")
      (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))]
- "TARGET_NEON && TARGET_FP16"
+ "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
 {
   if (BYTES_BIG_ENDIAN)
     {
@@ -3799,10 +3815,10 @@ if (BYTES_BIG_ENDIAN)
 })

 (define_expand "neon_vdup_lane<mode>"
-  [(match_operand:VH 0 "s_register_operand")
+  [(match_operand:VHFBF 0 "s_register_operand")
    (match_operand:<V_double_vector_mode> 1 "s_register_operand")
    (match_operand:SI 2 "immediate_operand")]
-  "TARGET_NEON && TARGET_FP16"
+  "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)"
 {
   if (BYTES_BIG_ENDIAN)
     {
@@ -6599,4 +6615,4 @@ if (BYTES_BIG_ENDIAN)
       }
   }
   [(set_attr "type" "neon_dot<q>")]
-)
\ No newline at end of file
+)

diff --git a/gcc/testsuite/gcc.target/arm/bf16_dup.cb/gcc/testsuite/gcc.target/arm/bf16_dup.c

new file mode 100644

index0000000000000000000000000000000000000000..82dff25fc6e244a1d930375a1e3505e9173e53dc

--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/bf16_dup.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */

+/* { dg-additional-options "-march=armv8.2-a+bf16+fp16-mfloat-abi=softfp" } */


Doesn't this need something like

/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
?

We wouldn't want to try it on M-profile targets, for example.

+#include "arm_neon.h"
+
+float32x2_t
+test_vbfdot_vcreate (float32x2_t r, uint64_t a, uint64_t b)
+{
+  bfloat16x4_t _a = vcreate_bf16(a);
+  bfloat16x4_t _b = vcreate_bf16(b);
+
+  return vbfdot_f32 (r, _a, _b);
+}

+/* { dg-final { scan-assembler {vdot.bf16\td[0-9]+, d[0-9]+,d[0-9]+} } } */

+
+bfloat16x8_t test_vcombine_bf16 (bfloat16x4_t a, bfloat16x4_t b)
+{
+  return vcombine_bf16 (a, b);
+}
+
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}
+
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a)
+{
+  return vget_lane_bf16 (a, 1);
+}
+
+bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a)
+{
+  return vgetq_lane_bf16 (a, 7);
+}
+
+bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b)
+{
+  return vset_lane_bf16 (a, b, 1);
+}
+
+bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b)
+{
+  return vsetq_lane_bf16 (a, b, 7);
+}
+
+bfloat16x4_t vdup_test (bfloat16_t a)
+{
+  return vdup_n_bf16 (a);
+}
+/* { dg-final { scan-assembler {vdup\.16\td[0-9]+, r[0-9]+} } }  */
+
+bfloat16x8_t vdupq_test (bfloat16_t a)
+{
+  return vdupq_n_bf16 (a);
+}
+/* { dg-final { scan-assembler {vdup\.16\tq[0-9]+, r[0-9]+} } }  */
+
+
+bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a)
+{
+  return vdup_lane_bf16 (a, 1);
+}

+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+,d[0-9]+\[1\]} 1 } } */

+
+bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a)
+{
+  return vdupq_lane_bf16 (a, 1);
+}

+/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+,d[0-9]+\[1\]} 1 } } */

+
+bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdup_laneq_bf16 (a, 3);
+}
+
+bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a)
+{
+  return vdupq_laneq_bf16 (a, 3);
+}
+
+bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a)
+{
+  return vduph_lane_bf16 (a, 1);
+}
+
+bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
+{
+  return vduph_laneq_bf16 (a, 7);
+}

diff --git a/gcc/testsuite/gcc.target/arm/bf16_reinterpret.cb/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c

new file mode 100644

index0000000000000000000000000000000000000000..e7d30a95fbc3ceaf4a92057a10e6be4a34e1957c

--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c
@@ -0,0 +1,435 @@
+/* { dg-do assemble { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon }  */

+/* { dg-additional-options "-save-temps -march=armv8.2-a+fp16+bf16-mfloat-abi=hard -mfpu=crypto-neon-fp-armv8" } */

Why the dg-additional-options ? Doesn't dg-add-options do the rightthing here?


Thanks,

Kyrill

Thanks for the review.
The dg-additional-options is needed here in order to add the other
required extensions. A new target check could be added to do all of
this, but I think it will be too specific to these test cases and it
wouldn't have any other use.
I decided to use the bf16 target check because that one passes, then
fp16 will also pass, and then I overwrite the options with the
additional ones to ensure that the other extensions are added.



Ok, thanks.

Kyrill


I've addressed the other two comments in the attached diff.


Regards,
Mihail

Re: [GCC][PATCH][ARM] Add vreinterpret, vdup, vget and vset bfloat16 intrinsic

Reply via email to