Sorry for the slow review.

Andrew Carlotti via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi,
>
> This removes a significant number of intrinsic definitions from the arm_neon.h
> header file, and reduces the amount of code duplication.  The new macros and
> data structures are intended to also facilitate moving other intrinsic
> definitions out of the header file in future.
Nice.

> There is a slight change in the behaviour of the bf16 vreinterpret
> intrinsics when compiling without bf16 support.  Expressions like:
>
> b = vreinterpretq_s32_bf16(vreinterpretq_bf16_s64(a));
>
> are now compiled successfully, instead of causing a 'target specific option
> mismatch' during inlining.

Yeah.  The ACLE says that these are conditional on +bf16, but no-one
seems to have a strong opinion about whether it needs to be that way.
Accepting them unconditionally makes them consistent with arm_sve.h.

Could you split out the part that adds V1DI?  I never did understand
why we had V1DF but not V1DI, but neither one seemed obviously more
right than the other.  Having the V1DI change as a separate patch would
help with bisecting and might help with later archaeology.

> @@ -523,6 +581,99 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
>    FCMLA_LANEQ_BUILTIN (180, v4hf, fcmla_laneq, V4HF, true) \
>    FCMLA_LANEQ_BUILTIN (270, v4hf, fcmla_laneq, V4HF, true) \
>
> +
> +/* vreinterpret intrinsics are defined for any pair of element types.
> +   { _bf16           }   { _bf16           } (if bf16 is supported)

Like you say, the bf16 alternatives are now unconditional, so it might
be better to remove "(if bf16 is supported)".

> +   { _f16 _f32 _f64  }   { _f16 _f32 _f64  }
> +   { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 }
> +   { _u8 _u16 _u32 _u64 }   { _u8 _u16 _u32 _u64 }
> +   { _p8 _p16 _p64   }   { _p8 _p16 _p64   }.  */
> +#define VREINTERPRET_BUILTIN2(A, B) \
> +  VREINTERPRET_BUILTIN (A, B, d)
> […]
> @@ -636,6 +804,22 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
>    AARCH64_SIMD_FCMLA_LANEQ_BUILTINS
>  };
>
> +#undef VREINTERPRET_BUILTIN
> +#define VREINTERPRET_BUILTIN(A, B, L) \
> +  {"vreinterpret" SIMD_INTR_LENGTH_CHAR(L) "_" #A "_" #B, \
> +   AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B, \
> +   2, \
> +   { SIMD_INTR_MODE(A, L), SIMD_INTR_MODE(B, L) }, \
> +   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(B) }, \
> +   FLAG_AUTO_FP, \
> +   !strcmp(#A, #B) \

Could we instead use:

  SIMD_INTR_MODE(A, L) == SIMD_INTR_MODE(B, L)

?  That should be a definite compile-time constant.  It should then be
possible to make…

> +  },
> +
> +static aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
> +  AARCH64_SIMD_VREINTERPRET_BUILTINS
> +};

…this a static const array.

> +
> +
>  #undef CRC32_BUILTIN
>
>  static GTY(()) tree aarch64_builtin_decls[AARCH64_BUILTIN_MAX];
> @@ -1127,6 +1311,58 @@ aarch64_init_fcmla_laneq_builtins (void)
>      }
>  }
>
> +void
> +aarch64_init_simd_intrinsics (void)
> +{
> +  unsigned int i = 0;
> +
> +  for (i = 0; i < ARRAY_SIZE (aarch64_simd_intrinsic_data); ++i)
> +    {
> +      aarch64_simd_intrinsic_datum* d = &aarch64_simd_intrinsic_data[i];

Making the type const would overflow this line, but using auto would be OK.

> +
> +      if (d->skip)
> +        continue;
> +
> +      tree return_type = void_type_node;
> +      tree args = void_list_node;
> +
> +      for (int op_num = d->op_count - 1; op_num >= 0; op_num--)
> +        {
> +          machine_mode op_mode = d->op_modes[op_num];
> +          enum aarch64_type_qualifiers qualifiers = d->qualifiers[op_num];
> +
> +          /* For pointers, we want a pointer to the basic type
> +             of the vector.  */
> +          if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode))
> +            op_mode = GET_MODE_INNER (op_mode);
> +
> +          tree eltype = aarch64_simd_builtin_type
> +            (op_mode,
> +             (qualifiers & qualifier_unsigned) != 0,
> +             (qualifiers & qualifier_poly) != 0);
> +          gcc_assert (eltype != NULL);
> +
> +          /* Add qualifiers.  */
> +          if (qualifiers & qualifier_const)
> +            eltype = build_qualified_type (eltype, TYPE_QUAL_CONST);
> +
> +          if (qualifiers & qualifier_pointer)
> +            eltype = build_pointer_type (eltype);

Would be better to put this eltype stuff in a common subroutine rather
than duplicate it from aarch64_init_simd_builtin_functions.
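E.g. something like the following (just an untested sketch of what I mean;
the helper's name is only a placeholder):

/* Build the tree type of a single operand from its mode and qualifiers.  */
static tree
aarch64_simd_builtin_type_for_op (machine_mode op_mode,
                                  enum aarch64_type_qualifiers qualifiers)
{
  /* For pointers, we want a pointer to the basic type of the vector.  */
  if (qualifiers & qualifier_pointer && VECTOR_MODE_P (op_mode))
    op_mode = GET_MODE_INNER (op_mode);

  tree eltype = aarch64_simd_builtin_type
    (op_mode,
     (qualifiers & qualifier_unsigned) != 0,
     (qualifiers & qualifier_poly) != 0);
  gcc_assert (eltype != NULL);

  /* Add qualifiers.  */
  if (qualifiers & qualifier_const)
    eltype = build_qualified_type (eltype, TYPE_QUAL_CONST);
  if (qualifiers & qualifier_pointer)
    eltype = build_pointer_type (eltype);

  return eltype;
}

Both this loop and aarch64_init_simd_builtin_functions could then call
it once per operand.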
> +
> +          if (op_num == 0)
> +            return_type = eltype;
> +          else
> +            args = tree_cons (NULL_TREE, eltype, args);
> +        }
> +
> +      tree ftype = build_function_type (return_type, args);
> +      tree attrs = aarch64_get_attributes (FLAG_AUTO_FP, d->op_modes[0]);
> +      unsigned int code = (d->fcode << AARCH64_BUILTIN_SHIFT | AARCH64_BUILTIN_GENERAL);
> +      tree fndecl = simulate_builtin_function_decl (input_location, d->name, ftype, code, NULL, attrs);

Formatting nit: last two lines overflow the 80 character limit.

> +      aarch64_builtin_decls[d->fcode] = fndecl;
> +    }
> +}
> +
>  void
>  aarch64_init_simd_builtin_functions (bool called_from_pragma)
>  {
> […]
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index a00e1c6ef8d6b43d8b1a0fe4701e6b8c1f0f622f..a3f8f20e5fb20eeb9b9f48a27d83343638ba3c93 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -8039,6 +8039,20 @@
>    DONE;
>  })
>
> +;; And same for V1DI (this should probably be merged with the above pattern)

Yes please :-)  E.g. maybe add a VQ_2E mode iterator as the 128-bit half
of VP_2E, and a V1HALF mode attribute that gives V1DI and V1DF (instead
of VHALF, which gives DI and DF).  There's a rough sketch of what I mean
at the end of the message, below the quoted pattern.

Thanks,
Richard

> +(define_expand "vec_extractv2div1di"
> +  [(match_operand:V1DI 0 "register_operand")
> +   (match_operand:V2DI 1 "register_operand")
> +   (match_operand 2 "immediate_operand")]
> +  "TARGET_SIMD"
> +{
> +  /* V1DI is rarely used by other patterns, so it should be better to hide
> +     it in a subreg destination of a normal DI op.  */
> +  rtx scalar0 = gen_lowpart (DImode, operands[0]);
> +  emit_insn (gen_vec_extractv2didi (scalar0, operands[1], operands[2]));
> +  DONE;
> +})
> +
>  ;; aes
>
>  (define_insn "aarch64_crypto_aes<aes_op>v16qi"
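To flesh out the merge suggestion above, roughly this (completely
untested; VQ_2E and V1HALF are the names suggested earlier, v1half is an
assumed lower-case counterpart for use in the pattern name, and all three
would need defining in aarch64-iterators.md, with VQ_2E covering V2DI and
V2DF and V1HALF/v1half giving V1DI/v1di and V1DF/v1df):

;; Extract a single-element 64-bit vector from one half of a 128-bit vector.
(define_expand "vec_extract<mode><v1half>"
  [(match_operand:<V1HALF> 0 "register_operand")
   (match_operand:VQ_2E 1 "register_operand")
   (match_operand 2 "immediate_operand")]
  "TARGET_SIMD"
{
  /* V1DI and V1DF are rarely used by other patterns, so it should be
     better to hide them in a subreg destination of a normal DI/DF op.  */
  rtx scalar0 = gen_lowpart (<VEL>mode, operands[0]);
  emit_insn (gen_vec_extract<mode><Vel> (scalar0, operands[1], operands[2]));
  DONE;
})

The existing V1DF-only expander could then be folded into this one
instead of gaining a V1DI twin.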