Thanks.  When trying a bootstrap locally I get:

include/arm_neon.h:34709:38: error: cannot convert ‘const __bf16*’ to ‘const __fp16*’
34709 |   __o = __builtin_aarch64_ld1x2v4hf ((const __builtin_aarch64_simd_bf *) __a);
      |                                      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                      |
      |                                      const __bf16*

I think the affected lines are:

Mihail Ionescu <mihail.ione...@foss.arm.com> writes:
> +__extension__ extern __inline bfloat16x4x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vld1_bf16_x2 (const bfloat16_t *__a)
> +{
> +  bfloat16x4x2_t ret;
> +  __builtin_aarch64_simd_oi __o;
> +  __o = __builtin_aarch64_ld1x2v4hf ((const __builtin_aarch64_simd_bf *)
> __a);

This should be the bf variant rather than hf (it is the cause of the error above).

> +  ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0);
> +  ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1);
> +  return ret;
> +}
> [...]
> +__extension__ extern __inline bfloat16x4x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vld2_dup_bf16 (const bfloat16_t * __a)
> +{
> +  bfloat16x4x2_t ret;
> +  __builtin_aarch64_simd_oi __o;
> +  __o = __builtin_aarch64_ld2rv4bf ((const __builtin_aarch64_simd_bf *) __a);
> +  ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 0);

Here too (although in this case the choice is only cosmetic).

> +  ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1);
> +  return ret;
> +}
> +
> +__extension__ extern __inline bfloat16x8x2_t
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vld2q_dup_bf16 (const bfloat16_t * __a)
> +{
> +  bfloat16x8x2_t ret;
> +  __builtin_aarch64_simd_oi __o;
> +  __o = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a);
> +  ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);

Same here (again, only cosmetic).

> +  ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1);
> +  return ret;
> +}
> [...]
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val)
> +{
> +  __builtin_aarch64_simd_ci __o;
> +  bfloat16x8x3_t __temp;
> +  __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16
> (__AARCH64_UINT64_C (0)));
> +  __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16
> (__AARCH64_UINT64_C (0)));
> +  __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16
> (__AARCH64_UINT64_C (0)));
> +  __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0],
> 0);
> +  __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1],
> 1);
> +  __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2],
> 2);
> +  __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_bf *) __a, __o);

Here too, to avoid:

include/arm_neon.h:35000:32: error: cannot convert ‘__bf16*’ to ‘__fp16*’
35000 |   __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_bf *) __a, __o);
      |                                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                                |
      |                                __bf16*

Looks good otherwise.  I guess this shows we should be running the
intrinsics tests for C++ as well as C (in general, not just for this patch).

Richard