> On Mon, May 20, 2024 at 2:57 AM Richard Sandiford
> <richard.sandif...@arm.com> wrote:
> >
> > Pengxuan Zheng <quic_pzh...@quicinc.com> writes:
> > > This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up
> > > more optimization opportunities for gimple optimizers.
> > >
> > > While we are here, we also remove the vget_low_* definitions from
> > > arm_neon.h and use the new intrinsics framework.
> > >
> > > PR target/102171
> > >
> > > gcc/ChangeLog:
> > >
> > >         * config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
> > >         New macro to create definitions for all vget_low intrinsics.
> > >         (VGET_LOW_BUILTIN): Likewise.
> > >         (enum aarch64_builtins): Add vget_low function codes.
> > >         (aarch64_general_fold_builtin): Fold vget_low calls.
> > >         * config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
> > >         * config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
> > >         (aarch64_vget_lo_halfv8bf): Likewise.
> > >         * config/aarch64/arm_neon.h (__attribute__): Delete.
> > >         (vget_low_f16): Likewise.
> > >         (vget_low_f32): Likewise.
> > >         (vget_low_f64): Likewise.
> > >         (vget_low_p8): Likewise.
> > >         (vget_low_p16): Likewise.
> > >         (vget_low_p64): Likewise.
> > >         (vget_low_s8): Likewise.
> > >         (vget_low_s16): Likewise.
> > >         (vget_low_s32): Likewise.
> > >         (vget_low_s64): Likewise.
> > >         (vget_low_u8): Likewise.
> > >         (vget_low_u16): Likewise.
> > >         (vget_low_u32): Likewise.
> > >         (vget_low_u64): Likewise.
> > >         (vget_low_bf16): Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
> > >         with vget_low_s16.
> > >         * gcc.target/aarch64/vget_low_2.c: New test.
> > >         * gcc.target/aarch64/vget_low_2_be.c: New test.
> >
> > Ok, thanks.  I suppose the patch has the side effect of allowing
> > vget_low_bf16 to be called without +bf16.  IMO that's the correct
> > behaviour though, and is consistent with how we handle reinterprets.
Thanks, Richard! Yes, it does have the side effect you pointed out and is
consistent with how reinterprets are handled currently.

> Pushed as r15-697-ga2e4fe5a53cf75cd055f64e745ebd51253e42254 .

Thanks, Andrew!

> Thanks,
> Andrew
> >
> > Richard
> >
> > > Signed-off-by: Pengxuan Zheng <quic_pzh...@quicinc.com>
> > > ---
> > >  gcc/config/aarch64/aarch64-builtins.cc        |  60 ++++++++++
> > >  gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
> > >  gcc/config/aarch64/aarch64-simd.md            |  23 +---
> > >  gcc/config/aarch64/arm_neon.h                 | 105 ------------------
> > >  gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
> > >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +++++
> > >  .../gcc.target/aarch64/vget_low_2_be.c        |  31 ++++++
> > >  7 files changed, 124 insertions(+), 132 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> > > index 75d21de1401..4afe7c86ae3 100644
> > > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > > @@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
> > >    VREINTERPRET_BUILTINS \
> > >    VREINTERPRETQ_BUILTINS
> > >
> > > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > > +  VGET_LOW_BUILTIN(f16) \
> > > +  VGET_LOW_BUILTIN(f32) \
> > > +  VGET_LOW_BUILTIN(f64) \
> > > +  VGET_LOW_BUILTIN(p8) \
> > > +  VGET_LOW_BUILTIN(p16) \
> > > +  VGET_LOW_BUILTIN(p64) \
> > > +  VGET_LOW_BUILTIN(s8) \
> > > +  VGET_LOW_BUILTIN(s16) \
> > > +  VGET_LOW_BUILTIN(s32) \
> > > +  VGET_LOW_BUILTIN(s64) \
> > > +  VGET_LOW_BUILTIN(u8) \
> > > +  VGET_LOW_BUILTIN(u16) \
> > > +  VGET_LOW_BUILTIN(u32) \
> > > +  VGET_LOW_BUILTIN(u64) \
> > > +  VGET_LOW_BUILTIN(bf16)
> > > +
> > >  typedef struct
> > >  {
> > >    const char *name;
> > > @@ -697,6 +714,9 @@ typedef struct
> > >  #define VREINTERPRET_BUILTIN(A, B, L) \
> > >    AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
> > >
> > > +#define VGET_LOW_BUILTIN(A) \
> > > +  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
> > > +
> > >  #undef VAR1
> > >  #define VAR1(T, N, MAP, FLAG, A) \
> > >    AARCH64_SIMD_BUILTIN_##T##_##N##A,
> > > @@ -732,6 +752,7 @@ enum aarch64_builtins
> > >    AARCH64_CRC32_BUILTIN_MAX,
> > >    /* SIMD intrinsic builtins.  */
> > >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> > >    /* ARMv8.3-A Pointer Authentication Builtins.  */
> > >    AARCH64_PAUTH_BUILTIN_AUTIA1716,
> > >    AARCH64_PAUTH_BUILTIN_PACIA1716,
> > > @@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
> > >       && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
> > >    },
> > >
> > > +#undef VGET_LOW_BUILTIN
> > > +#define VGET_LOW_BUILTIN(A) \
> > > +  {"vget_low_" #A, \
> > > +   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
> > > +   2, \
> > > +   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
> > > +   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
> > > +   FLAG_AUTO_FP, \
> > > +   false \
> > > +  },
> > > +
> > > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > > +  VGET_LOW_BUILTIN(f16) \
> > > +  VGET_LOW_BUILTIN(f32) \
> > > +  VGET_LOW_BUILTIN(f64) \
> > > +  VGET_LOW_BUILTIN(p8) \
> > > +  VGET_LOW_BUILTIN(p16) \
> > > +  VGET_LOW_BUILTIN(p64) \
> > > +  VGET_LOW_BUILTIN(s8) \
> > > +  VGET_LOW_BUILTIN(s16) \
> > > +  VGET_LOW_BUILTIN(s32) \
> > > +  VGET_LOW_BUILTIN(s64) \
> > > +  VGET_LOW_BUILTIN(u8) \
> > > +  VGET_LOW_BUILTIN(u16) \
> > > +  VGET_LOW_BUILTIN(u32) \
> > > +  VGET_LOW_BUILTIN(u64) \
> > > +  VGET_LOW_BUILTIN(bf16)
> > > +
> > >  static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
> > >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> > >  };
> > >
> > >
> > > @@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
> > >  #define VREINTERPRET_BUILTIN(A, B, L) \
> > >    case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
> > >
> > > +#undef VGET_LOW_BUILTIN
> > > +#define VGET_LOW_BUILTIN(A) \
> > > +  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
> > >
> > >  /* Try to fold a call to the built-in function with subcode FCODE.  The
> > >     function is passed the N_ARGS arguments in ARGS and it returns a value
> > > @@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
> > >         return fold_build1 (FLOAT_EXPR, type, args[0]);
> > >       AARCH64_SIMD_VREINTERPRET_BUILTINS
> > >         return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
> > > +      AARCH64_SIMD_VGET_LOW_BUILTINS
> > > +       {
> > > +         auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
> > > +
> > > +         return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
> > > +                             bitsize_int (pos));
> > > +       }
> > >       case AARCH64_SIMD_BUILTIN_LANE_CHECK:
> > >         gcc_assert (n_args == 3);
> > >         if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
> > > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> > > index da16f602a55..a9f0558f8b6 100644
> > > --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> > > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> > > @@ -65,8 +65,6 @@
> > >    BUILTIN_VS (UNOP, ctz, 2, NONE)
> > >    BUILTIN_VB (UNOP, popcount, 2, NONE)
> > >
> > > -  /* Implemented by aarch64_get_low<mode>.  */
> > > -  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
> > >    /* Implemented by aarch64_get_high<mode>.  */
> > >    BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
> > >
> > > @@ -960,8 +958,7 @@
> > >    VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
> > >    VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
> > >
> > > -  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
> > > -  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
> > > +  /* Implemented by aarch64_vget_hi_halfv8bf.  */
> > >    VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
> > >
> > >    /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
> > > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > > index f8bb973a278..5a28a8e3c6a 100644
> > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > @@ -288,17 +288,6 @@ (define_expand "aarch64_get_half<mode>"
> > >    }
> > >  )
> > >
> > > -(define_expand "aarch64_get_low<mode>"
> > > -  [(match_operand:<VHALF> 0 "register_operand")
> > > -   (match_operand:VQMOV 1 "register_operand")]
> > > -  "TARGET_FLOAT"
> > > -  {
> > > -    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
> > > -    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
> > > -    DONE;
> > > -  }
> > > -)
> > > -
> > >  (define_expand "aarch64_get_high<mode>"
> > >    [(match_operand:<VHALF> 0 "register_operand")
> > >     (match_operand:VQMOV 1 "register_operand")]
> > > @@ -9774,17 +9763,7 @@ (define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> > >    [(set_attr "type" "neon_dot<VDQSF:q>")]
> > >  )
> > >
> > > -;; vget_low/high_bf16
> > > -(define_expand "aarch64_vget_lo_halfv8bf"
> > > -  [(match_operand:V4BF 0 "register_operand")
> > > -   (match_operand:V8BF 1 "register_operand")]
> > > -  "TARGET_BF16_SIMD"
> > > -{
> > > -  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
> > > -  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
> > > -  DONE;
> > > -})
> > > -
> > > +;; vget_high_bf16
> > >  (define_expand "aarch64_vget_hi_halfv8bf"
> > >    [(match_operand:V4BF 0 "register_operand")
> > >     (match_operand:V8BF 1 "register_operand")]
> > > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> > > index 0ee325dccad..92c2c5361cd 100644
> > > --- a/gcc/config/aarch64/arm_neon.h
> > > +++ b/gcc/config/aarch64/arm_neon.h
> > > @@ -3027,104 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
> > >    return __aarch64_vset_lane_any (__elem, __vec, __index);
> > >  }
> > >
> > > -__extension__ extern __inline float16x4_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_f16 (float16x8_t __a)
> > > -{
> > > -  return __builtin_aarch64_get_lowv8hf (__a);
> > > -}
> > > -
> > > -__extension__ extern __inline float32x2_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_f32 (float32x4_t __a)
> > > -{
> > > -  return __builtin_aarch64_get_lowv4sf (__a);
> > > -}
> > > -
> > > -__extension__ extern __inline float64x1_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_f64 (float64x2_t __a)
> > > -{
> > > -  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
> > > -}
> > > -
> > > -__extension__ extern __inline poly8x8_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_p8 (poly8x16_t __a)
> > > -{
> > > -  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > > -}
> > > -
> > > -__extension__ extern __inline poly16x4_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_p16 (poly16x8_t __a)
> > > -{
> > > -  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > > -}
> > > -
> > > -__extension__ extern __inline poly64x1_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_p64 (poly64x2_t __a)
> > > -{
> > > -  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
> > > -}
> > > -
> > > -__extension__ extern __inline int8x8_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s8 (int8x16_t __a)
> > > -{
> > > -  return __builtin_aarch64_get_lowv16qi (__a);
> > > -}
> > > -
> > > -__extension__ extern __inline int16x4_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s16 (int16x8_t __a)
> > > -{
> > > -  return __builtin_aarch64_get_lowv8hi (__a);
> > > -}
> > > -
> > > -__extension__ extern __inline int32x2_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s32 (int32x4_t __a)
> > > -{
> > > -  return __builtin_aarch64_get_lowv4si (__a);
> > > -}
> > > -
> > > -__extension__ extern __inline int64x1_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s64 (int64x2_t __a)
> > > -{
> > > -  return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
> > > -}
> > > -
> > > -__extension__ extern __inline uint8x8_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u8 (uint8x16_t __a)
> > > -{
> > > -  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > > -}
> > > -
> > > -__extension__ extern __inline uint16x4_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u16 (uint16x8_t __a)
> > > -{
> > > -  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > > -}
> > > -
> > > -__extension__ extern __inline uint32x2_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u32 (uint32x4_t __a)
> > > -{
> > > -  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
> > > -}
> > > -
> > > -__extension__ extern __inline uint64x1_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u64 (uint64x2_t __a)
> > > -{
> > > -  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
> > > -}
> > > -
> > >  __extension__ extern __inline float16x4_t
> > >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > >  vget_high_f16 (float16x8_t __a)
> > > @@ -28479,13 +28381,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> > >    return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
> > >  }
> > >
> > > -__extension__ extern __inline bfloat16x4_t
> > > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_bf16 (bfloat16x8_t __a)
> > > -{
> > > -  return __builtin_aarch64_vget_lo_halfv8bf (__a);
> > > -}
> > > -
> > >  __extension__ extern __inline bfloat16x4_t
> > >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > >  vget_high_bf16 (bfloat16x8_t __a)
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > > index a8e445c6e19..fc8607f7218 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > > @@ -26,7 +26,7 @@ void jsimd_extbgrx_ycc_convert_neon() {
> > >    int y_l = vmull_laneq_u16(r);
> > >    uint16x8_t __a = g;
> > >    jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
> > > -      (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
> > > +      (uint16x4_t)vget_low_s16((int16x8_t)__a);
> > >    __a = b;
> > >    int cb_l = scaled_128_5;
> > >    int cb_h = scaled_128_5;
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > new file mode 100644
> > > index 00000000000..44414e1c043
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > > +
> > > +#include <arm_neon.h>
> > > +
> > > +#define VARIANTS \
> > > +VARIANT (uint8x8_t, uint8x16_t, u8) \
> > > +VARIANT (uint16x4_t, uint16x8_t, u16) \
> > > +VARIANT (uint32x2_t, uint32x4_t, u32) \
> > > +VARIANT (uint64x1_t, uint64x2_t, u64) \
> > > +VARIANT (int8x8_t, int8x16_t, s8) \
> > > +VARIANT (int16x4_t, int16x8_t, s16) \
> > > +VARIANT (int32x2_t, int32x4_t, s32) \
> > > +VARIANT (int64x1_t, int64x2_t, s64) \
> > > +VARIANT (float16x4_t, float16x8_t, f16) \
> > > +VARIANT (float32x2_t, float32x4_t, f32) \
> > > +VARIANT (float64x1_t, float64x2_t, f64) \
> > > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > > +
> > > +/* vget_low_* intrinsics should become BIT_FIELD_REF.  */
> > > +#define VARIANT(TYPE64, TYPE128, SUFFIX) \
> > > +TYPE64 \
> > > +test_vget_low_##SUFFIX (TYPE128 vec) \
> > > +{ \
> > > +  return vget_low_##SUFFIX (vec); \
> > > +}
> > > +
> > > +VARIANTS
> > > +
> > > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > > new file mode 100644
> > > index 00000000000..c3f4c4f0e0d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > > @@ -0,0 +1,31 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-require-effective-target stdint_types_mbig_endian } */
> > > +/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
> > > +
> > > +#include <arm_neon.h>
> > > +
> > > +#define VARIANTS \
> > > +VARIANT (uint8x8_t, uint8x16_t, u8) \
> > > +VARIANT (uint16x4_t, uint16x8_t, u16) \
> > > +VARIANT (uint32x2_t, uint32x4_t, u32) \
> > > +VARIANT (uint64x1_t, uint64x2_t, u64) \
> > > +VARIANT (int8x8_t, int8x16_t, s8) \
> > > +VARIANT (int16x4_t, int16x8_t, s16) \
> > > +VARIANT (int32x2_t, int32x4_t, s32) \
> > > +VARIANT (int64x1_t, int64x2_t, s64) \
> > > +VARIANT (float16x4_t, float16x8_t, f16) \
> > > +VARIANT (float32x2_t, float32x4_t, f32) \
> > > +VARIANT (float64x1_t, float64x2_t, f64) \
> > > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > > +
> > > +/* vget_low_* intrinsics should become BIT_FIELD_REF.  */
> > > +#define VARIANT(TYPE64, TYPE128, SUFFIX) \
> > > +TYPE64 \
> > > +test_vget_low_##SUFFIX (TYPE128 vec) \
> > > +{ \
> > > +  return vget_low_##SUFFIX (vec); \
> > > +}
> > > +
> > > +VARIANTS
> > > +
> > > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */
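
For the archives, here is a minimal standalone illustration of the fold, in
the style of the new tests above; the file name, function name, and the exact
SSA names in the dump are illustrative, not taken from the patch.

/* fold-example.c: compile with -O3 -fdump-tree-optimized.  */
#include <arm_neon.h>

int16x4_t
low_half (int16x8_t vec)
{
  /* With this patch, the builtin behind vget_low_s16 is folded by
     aarch64_general_fold_builtin into a 64-bit BIT_FIELD_REF at bit
     position 0 (position 64 on big-endian, per BYTES_BIG_ENDIAN).  */
  return vget_low_s16 (vec);
}

The "optimized" dump should then contain a line like

  BIT_FIELD_REF <vec_1(D), 64, 0>

rather than a call to __builtin_aarch64_get_lowv8hi, which is exactly what
vget_low_2.c and vget_low_2_be.c scan for, and is what lets the gimple
optimizers see through the operation.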