<[email protected]> writes:
> The AArch64 FEAT_FP8 extension introduces instructions for conversion
> and scaling.
>
> This patch introduces the following intrinsics:
> 1. vcvt{1|2}_{bf16|high_bf16|low_bf16}_mf8_fpm.
> 2. vcvt{q}_mf8_f16_fpm.
> 3. vcvt_{high}_mf8_f32_fpm.
> 4. vscale{q}_{f16|f32|f64}.
>
> We introduced three new aarch64_builtin_signatures enum variants:
> 1. binary_fpm.
> 2. ternary_fpm.
> 3. unary_fpm.
>
> We added support for these variants for declaring types and for expanding to
> RTL.
>
> We added new simd_types for integers (s32, s32q, and s64q) and for
> fp8 (f8, and f8q).
>
> Also changed the faminmax intrinsic instruction pattern so that it works
> better with the new fscale pattern.
>
> Since this patch adds fp8 intrinsic support, we removed the checks in
> acle/fp8.c that the __ARM_FEATURE_FP8 macro is not defined.
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-builtins.cc
> (aarch64_builtin_signatures): New variants to support new signatures.
> (aarch64_fntype): Handle new signatures.
> (aarch64_expand_pragma_builtin): Handle new signatures.
> * config/aarch64/aarch64-c.cc
> (aarch64_update_cpp_builtins): Define __ARM_FEATURE_FP8.
> * config/aarch64/aarch64-simd-pragma-builtins.def
> (ENTRY_BINARY_FPM): Macro to declare binary fpm intrinsics.
> (ENTRY_TERNARY_FPM): Macro to declare ternary fpm intrinsics.
> (ENTRY_UNARY_FPM): Macro to declare unary fpm intrinsics.
> (ENTRY_VHSDF_VHSDI): Macro to declare binary intrinsics.
> * config/aarch64/aarch64-simd.md
> (@aarch64_<faminmax_uns_op><mode>): Rename to...
> (@aarch64_<faminmax_uns_op><VHSDF:mode><VHSDF:mode>): ...this.
> (@aarch64_<fpm_uns_name><V8HFBF:mode><VB:mode>): Unary fpm
> pattern.
> (@aarch64_<fpm_uns_name><V8HFBF:mode><V16QI_ONLY:mode>): Unary
> fpm pattern.
> (@aarch64_<fpm_uns_name><VB:mode><VCVTFPM:mode><VH_SF:mode>):
> Binary fpm pattern.
> (@aarch64_<fpm_uns_name><V16QI_ONLY:mode><V8QI_ONLY:mode><V4SF_ONLY:mode><V4SF_ONLY:mode>):
> Ternary fpm pattern.
> (@aarch64_<fpm_uns_op><VHSDF:mode><VHSDI:mode>): Scale fpm
> pattern.
> * config/aarch64/iterators.md: New attributes and iterators.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/acle/fp8.c: Remove checks that the fp8
> feature macro is not defined.
> * gcc.target/aarch64/simd/scale_fpm.c: New test.
> * gcc.target/aarch64/simd/vcvt_fpm.c: New test.
>
> ---
>
> I could not find a way to compress the declarations in
> aarch64-simd-pragma-builtins.def for the convert instructions, as
> there was no pattern apart from the vcvt1/vcvt2 repetition. Let me
> know if those declarations can be expressed more concisely.
>
> In the scale instructions, I am not doing any casting from float to int
> modes in the second operand. Let me know if that's a problem.
> ---
> gcc/config/aarch64/aarch64-builtins.cc | 132 ++++++++++--
> gcc/config/aarch64/aarch64-c.cc | 2 +
> .../aarch64/aarch64-simd-pragma-builtins.def | 56 +++++
> gcc/config/aarch64/aarch64-simd.md | 72 ++++++-
> gcc/config/aarch64/iterators.md | 99 +++++++++
> gcc/testsuite/gcc.target/aarch64/acle/fp8.c | 10 -
> .../gcc.target/aarch64/simd/scale_fpm.c | 60 ++++++
> .../gcc.target/aarch64/simd/vcvt_fpm.c | 197 ++++++++++++++++++
> 8 files changed, 603 insertions(+), 25 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> index ad82c680c6a..df19bff71d0 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -1591,6 +1591,9 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma)
> enum class aarch64_builtin_signatures
> {
> binary,
> + binary_fpm,
> + ternary_fpm,
> + unary_fpm,
> };
I wonder whether we could instead add an extra flags field to
aarch64_pragma_builtins_data and use a flag to indicate whether
it takes an fpm argument. I think most of the behaviour around
that argument could be done orthogonally to unary/binary/ternary.
E.g.:
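
  /* Untested sketch -- the exact field layout and the flag name are
     invented for illustration.  */
  constexpr unsigned int FLAG_USES_FPMR = 1U << 0;

  struct aarch64_pragma_builtins_data
  {
    const char *name;
    aarch64_builtin_signatures signature;
    simd_type types[4];
    int unspec;
    unsigned int flags;
  };

with each .def entry supplying FLAG_USES_FPMR or 0, so that plain
unary/binary/ternary remain the only signatures.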
>
> namespace {
> @@ -1602,6 +1605,9 @@ struct simd_type {
>
> namespace simd_types {
>
> + constexpr simd_type f8 { V8QImode, qualifier_modal_float };
> + constexpr simd_type f8q { V16QImode, qualifier_modal_float };
> +
> constexpr simd_type s8 { V8QImode, qualifier_none };
> constexpr simd_type u8 { V8QImode, qualifier_unsigned };
> constexpr simd_type s8q { V16QImode, qualifier_none };
> @@ -1612,6 +1618,11 @@ namespace simd_types {
> constexpr simd_type s16q { V8HImode, qualifier_none };
> constexpr simd_type u16q { V8HImode, qualifier_unsigned };
>
> + constexpr simd_type s32 { V2SImode, qualifier_none };
> + constexpr simd_type s32q { V4SImode, qualifier_none };
> +
> + constexpr simd_type s64q { V2DImode, qualifier_none };
> +
> constexpr simd_type p8 { V8QImode, qualifier_poly };
> constexpr simd_type p8q { V16QImode, qualifier_poly };
> constexpr simd_type p16 { V4HImode, qualifier_poly };
> @@ -1655,7 +1666,7 @@ static aarch64_pragma_builtins_data aarch64_pragma_builtins[] = {
> static tree
> aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
> {
> - tree type0, type1, type2;
> + tree type0, type1, type2, type3;
>
> switch (builtin_data.signature)
> {
> @@ -1668,6 +1679,36 @@ aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
> builtin_data.types[2].qualifiers);
> return build_function_type_list (type0, type1, type2, NULL_TREE);
>
> + case aarch64_builtin_signatures::binary_fpm:
> + type0 = aarch64_simd_builtin_type (builtin_data.types[0].mode,
> + builtin_data.types[0].qualifiers);
> + type1 = aarch64_simd_builtin_type (builtin_data.types[1].mode,
> + builtin_data.types[1].qualifiers);
> + type2 = aarch64_simd_builtin_type (builtin_data.types[2].mode,
> + builtin_data.types[2].qualifiers);
> + return build_function_type_list (type0, type1, type2, uint64_type_node,
> + NULL_TREE);
> +
> + case aarch64_builtin_signatures::ternary_fpm:
> + type0 = aarch64_simd_builtin_type (builtin_data.types[0].mode,
> + builtin_data.types[0].qualifiers);
> + type1 = aarch64_simd_builtin_type (builtin_data.types[1].mode,
> + builtin_data.types[1].qualifiers);
> + type2 = aarch64_simd_builtin_type (builtin_data.types[2].mode,
> + builtin_data.types[2].qualifiers);
> + type3 = aarch64_simd_builtin_type (builtin_data.types[3].mode,
> + builtin_data.types[3].qualifiers);
> + return build_function_type_list (type0, type1, type2, type3,
> + uint64_type_node, NULL_TREE);
> +
> + case aarch64_builtin_signatures::unary_fpm:
> + type0 = aarch64_simd_builtin_type (builtin_data.types[0].mode,
> + builtin_data.types[0].qualifiers);
> + type1 = aarch64_simd_builtin_type (builtin_data.types[1].mode,
> + builtin_data.types[1].qualifiers);
> + return build_function_type_list (type0, type1, uint64_type_node,
> + NULL_TREE);
> +
> default:
> gcc_unreachable ();
> }
this switch could push the argument types to a local:
auto_vec<tree, 8> args;
and then code after the switch could add an extra argument for fpm_t
where necessary. The final function type could be created using
build_function_type_array.
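Untested sketch, reusing the hypothetical FLAG_USES_FPMR from above and
assuming plain unary/ternary signature variants:

  static tree
  aarch64_fntype (const aarch64_pragma_builtins_data &builtin_data)
  {
    /* The switch now only decides how many arguments there are.  */
    unsigned int nargs;
    switch (builtin_data.signature)
      {
      case aarch64_builtin_signatures::unary: nargs = 1; break;
      case aarch64_builtin_signatures::binary: nargs = 2; break;
      case aarch64_builtin_signatures::ternary: nargs = 3; break;
      default: gcc_unreachable ();
      }

    auto_vec<tree, 8> arg_types;
    for (unsigned int i = 1; i <= nargs; ++i)
      arg_types.safe_push
        (aarch64_simd_builtin_type (builtin_data.types[i].mode,
                                    builtin_data.types[i].qualifiers));
    if (builtin_data.flags & FLAG_USES_FPMR)
      arg_types.safe_push (uint64_type_node);

    tree return_type
      = aarch64_simd_builtin_type (builtin_data.types[0].mode,
                                   builtin_data.types[0].qualifiers);
    return build_function_type_array (return_type, arg_types.length (),
                                      arg_types.address ());
  }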
> @@ -3383,24 +3424,89 @@ static rtx
> aarch64_expand_pragma_builtin (tree exp, rtx target,
> const aarch64_pragma_builtins_data *builtin_data)
> {
> - expand_operand ops[3];
> - auto op1 = expand_normal (CALL_EXPR_ARG (exp, 0));
> - auto op2 = expand_normal (CALL_EXPR_ARG (exp, 1));
> - create_output_operand (&ops[0], target, builtin_data->types[0].mode);
> - create_input_operand (&ops[1], op1, builtin_data->types[1].mode);
> - create_input_operand (&ops[2], op2, builtin_data->types[2].mode);
> -
> auto unspec = builtin_data->unspec;
> - insn_code icode;
> + expand_operand ops[4];
>
> switch (builtin_data->signature)
> {
> case aarch64_builtin_signatures::binary:
> - icode = code_for_aarch64 (unspec, builtin_data->types[0].mode);
> - expand_insn (icode, 3, ops);
> - break;
> + {
> + auto input1 = expand_normal (CALL_EXPR_ARG (exp, 0));
> + auto input2 = expand_normal (CALL_EXPR_ARG (exp, 1));
> +
> + create_output_operand (&ops[0], target, builtin_data->types[0].mode);
> + create_input_operand (&ops[1], input1, builtin_data->types[1].mode);
> + create_input_operand (&ops[2], input2, builtin_data->types[2].mode);
> +
> + auto icode = code_for_aarch64 (unspec,
> + builtin_data->types[1].mode,
> + builtin_data->types[2].mode);
> + expand_insn (icode, 3, ops);
> + break;
> + }
> +
> + case aarch64_builtin_signatures::binary_fpm:
> + {
> + auto input1 = expand_normal (CALL_EXPR_ARG (exp, 0));
> + auto input2 = expand_normal (CALL_EXPR_ARG (exp, 1));
> + auto fpm_input = expand_normal (CALL_EXPR_ARG (exp, 2));
> +
> + auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
> + emit_move_insn (fpmr, fpm_input);
Similarly, the move into FPM_REGNUM could happen once, outside of the switch.
I think we could use a loop to expand the arguments and push them
to a local rtx vector, to avoid repeating that code for every case.
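Roughly (untested):

  int nargs = call_expr_nargs (exp);
  if (builtin_data->flags & FLAG_USES_FPMR)
    {
      /* The fpm_t value is always the trailing argument.  */
      rtx fpm = expand_normal (CALL_EXPR_ARG (exp, --nargs));
      emit_move_insn (gen_rtx_REG (DImode, FPM_REGNUM), fpm);
    }

  expand_operand ops[4];
  create_output_operand (&ops[0], target, builtin_data->types[0].mode);
  for (int i = 0; i < nargs; ++i)
    create_input_operand (&ops[i + 1],
                          expand_normal (CALL_EXPR_ARG (exp, i)),
                          builtin_data->types[i + 1].mode);

The switch would then only need to choose the icode and call
expand_insn (icode, nargs + 1, ops).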
> +
> + create_output_operand (&ops[0], target, builtin_data->types[0].mode);
> + create_input_operand (&ops[1], input1, builtin_data->types[1].mode);
> + create_input_operand (&ops[2], input2, builtin_data->types[2].mode);
> + auto icode = code_for_aarch64 (unspec,
> + builtin_data->types[0].mode,
> + builtin_data->types[1].mode,
> + builtin_data->types[2].mode);
> + expand_insn (icode, 3, ops);
> + break;
> + }
> +
> + case aarch64_builtin_signatures::ternary_fpm:
> + {
> + auto input1 = expand_normal (CALL_EXPR_ARG (exp, 0));
> + auto input2 = expand_normal (CALL_EXPR_ARG (exp, 1));
> + auto input3 = expand_normal (CALL_EXPR_ARG (exp, 2));
> + auto fpm_input = expand_normal (CALL_EXPR_ARG (exp, 3));
> +
> + auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
> + emit_move_insn (fpmr, fpm_input);
> +
> + create_output_operand (&ops[0], target, builtin_data->types[0].mode);
> + create_input_operand (&ops[1], input1, builtin_data->types[1].mode);
> + create_input_operand (&ops[2], input2, builtin_data->types[2].mode);
> + create_input_operand (&ops[3], input3, builtin_data->types[3].mode);
> + auto icode = code_for_aarch64 (unspec,
> + builtin_data->types[0].mode,
> + builtin_data->types[1].mode,
> + builtin_data->types[2].mode,
> + builtin_data->types[3].mode);
> + expand_insn (icode, 4, ops);
> + break;
> + }
> +
> + case aarch64_builtin_signatures::unary_fpm:
> + {
> + auto input = expand_normal (CALL_EXPR_ARG (exp, 0));
> + auto fpm_input = expand_normal (CALL_EXPR_ARG (exp, 1));
> +
> + auto fpmr = gen_rtx_REG (DImode, FPM_REGNUM);
> + emit_move_insn (fpmr, fpm_input);
> +
> + create_output_operand (&ops[0], target, builtin_data->types[0].mode);
> + create_input_operand (&ops[1], input, builtin_data->types[1].mode);
> + auto icode = code_for_aarch64 (unspec,
> + builtin_data->types[0].mode,
> + builtin_data->types[1].mode);
> + expand_insn (icode, 2, ops);
> + break;
> + }
> +
> default:
> - gcc_unreachable();
> + gcc_unreachable ();
> }
> return target;
> }
> diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
> index f9b9e379375..68f9180520a 100644
> --- a/gcc/config/aarch64/aarch64-c.cc
> +++ b/gcc/config/aarch64/aarch64-c.cc
> @@ -257,6 +257,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
> aarch64_def_or_undef (TARGET_SVE_BF16,
> "__ARM_FEATURE_SVE_BF16", pfile);
>
> + aarch64_def_or_undef (TARGET_FP8, "__ARM_FEATURE_FP8", pfile);
> +
> aarch64_def_or_undef (TARGET_LS64,
> "__ARM_FEATURE_LS64", pfile);
> aarch64_def_or_undef (TARGET_RCPC, "__ARM_FEATURE_RCPC", pfile);
> diff --git a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
> index c669919fa04..cb5b546c541 100644
> --- a/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-pragma-builtins.def
> @@ -23,6 +23,16 @@
> #define ENTRY_BINARY(N, S, T0, T1, T2, U) \
> ENTRY (N, S, T0, T1, T2, none, U)
>
> +#undef ENTRY_BINARY_FPM
> +#define ENTRY_BINARY_FPM(N, S, T0, T1, T2, U) \
> + ENTRY (N, S, T0, T1, T2, none, U)
> +
> +#define ENTRY_TERNARY_FPM(N, S, T0, T1, T2, T3, U) \
> + ENTRY (N, S, T0, T1, T2, T3, U)
> +
> +#define ENTRY_UNARY_FPM(N, S, T0, T1, U) \
> + ENTRY (N, S, T0, T1, none, none, U)
> +
> #undef ENTRY_VHSDF
> #define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC) \
> ENTRY_BINARY (NAME##_f16, SIGNATURE, f16, f16, f16, UNSPEC) \
> @@ -31,8 +41,54 @@
> ENTRY_BINARY (NAME##q_f32, SIGNATURE, f32q, f32q, f32q, UNSPEC) \
> ENTRY_BINARY (NAME##q_f64, SIGNATURE, f64q, f64q, f64q, UNSPEC)
>
> +#undef ENTRY_VHSDF_VHSDI
> +#define ENTRY_VHSDF_VHSDI(NAME, SIGNATURE, UNSPEC) \
> + ENTRY_BINARY (NAME##_f16, SIGNATURE, f16, f16, s16, UNSPEC) \
> + ENTRY_BINARY (NAME##q_f16, SIGNATURE, f16q, f16q, s16q, UNSPEC) \
> + ENTRY_BINARY (NAME##_f32, SIGNATURE, f32, f32, s32, UNSPEC) \
> + ENTRY_BINARY (NAME##q_f32, SIGNATURE, f32q, f32q, s32q, UNSPEC) \
> + ENTRY_BINARY (NAME##q_f64, SIGNATURE, f64q, f64q, s64q, UNSPEC)
> +
> // faminmax
> #define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FAMINMAX)
> ENTRY_VHSDF (vamax, binary, UNSPEC_FAMAX)
> ENTRY_VHSDF (vamin, binary, UNSPEC_FAMIN)
> #undef REQUIRED_EXTENSIONS
> +
> +// fpm conversion
> +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
> +ENTRY_UNARY_FPM (vcvt1_bf16_mf8_fpm, unary_fpm, bf16q, f8, UNSPEC_VCVT1_BF16)
> +ENTRY_UNARY_FPM (vcvt1_high_bf16_mf8_fpm, unary_fpm, bf16q, f8q, \
> + UNSPEC_VCVT1_HIGH_BF16)
> +ENTRY_UNARY_FPM (vcvt1_low_bf16_mf8_fpm, unary_fpm, bf16q, f8q, \
> + UNSPEC_VCVT1_LOW_BF16)
> +ENTRY_UNARY_FPM (vcvt1_f16_mf8_fpm, unary_fpm, f16q, f8, UNSPEC_VCVT1_F16)
> +ENTRY_UNARY_FPM (vcvt1_high_f16_mf8_fpm, unary_fpm, f16q, f8q, \
> + UNSPEC_VCVT1_HIGH_F16)
> +ENTRY_UNARY_FPM (vcvt1_low_f16_mf8_fpm, unary_fpm, f16q, f8q, \
> + UNSPEC_VCVT1_LOW_F16)
> +ENTRY_UNARY_FPM (vcvt2_bf16_mf8_fpm, unary_fpm, bf16q, f8, UNSPEC_VCVT2_BF16)
> +ENTRY_UNARY_FPM (vcvt2_high_bf16_mf8_fpm, unary_fpm, bf16q, f8q, \
> + UNSPEC_VCVT2_HIGH_BF16)
> +ENTRY_UNARY_FPM (vcvt2_low_bf16_mf8_fpm, unary_fpm, bf16q, f8q, \
> + UNSPEC_VCVT2_LOW_BF16)
> +ENTRY_UNARY_FPM (vcvt2_f16_mf8_fpm, unary_fpm, f16q, f8, UNSPEC_VCVT2_F16)
> +ENTRY_UNARY_FPM (vcvt2_high_f16_mf8_fpm, unary_fpm, f16q, f8q, \
> + UNSPEC_VCVT2_HIGH_F16)
> +ENTRY_UNARY_FPM (vcvt2_low_f16_mf8_fpm, unary_fpm, f16q, f8q, \
> + UNSPEC_VCVT2_LOW_F16)
> +
> +ENTRY_BINARY_FPM (vcvt_mf8_f16_fpm, binary_fpm, f8, f16, f16, \
> + UNSPEC_VCVT_F16)
> +ENTRY_BINARY_FPM (vcvtq_mf8_f16_fpm, binary_fpm, f8q, f16q, f16q, \
> + UNSPEC_VCVTQ_F16)
> +ENTRY_BINARY_FPM (vcvt_mf8_f32_fpm, binary_fpm, f8, f32q, f32q, \
> + UNSPEC_VCVT_F32)
> +
> +ENTRY_TERNARY_FPM (vcvt_high_mf8_f32_fpm, ternary_fpm, f8q, f8, f32q, f32q, \
> + UNSPEC_VCVT_HIGH_F32)
> +#undef REQUIRED_EXTENSIONS
> +
> +// fpm scaling
> +#define REQUIRED_EXTENSIONS nonstreaming_only (AARCH64_FL_FP8)
> +ENTRY_VHSDF_VHSDI (vscale, binary, UNSPEC_FSCALE)
> +#undef REQUIRED_EXTENSIONS
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index cfe95bd4c31..87bbfb0e586 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -9982,13 +9982,13 @@
> )
>
> ;; faminmax
> -(define_insn "@aarch64_<faminmax_uns_op><mode>"
> +(define_insn "@aarch64_<faminmax_uns_op><VHSDF:mode><VHSDF:mode>"
> [(set (match_operand:VHSDF 0 "register_operand" "=w")
> (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
> (match_operand:VHSDF 2 "register_operand" "w")]
> FAMINMAX_UNS))]
> "TARGET_FAMINMAX"
> - "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
> + "<faminmax_uns_op>\t%0.<Vtype>, %1.<VHSDF:Vtype>, %2.<VHSDF:Vtype>"
> )
>
I don't think this is necessary. The usual style elsewhere is to
omit the iterator names from things like <mode> when there is only
one mode iterator being used.
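i.e. the pattern can stay as it was:

  (define_insn "@aarch64_<faminmax_uns_op><mode>"
    [(set (match_operand:VHSDF 0 "register_operand" "=w")
          (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
                         (match_operand:VHSDF 2 "register_operand" "w")]
                        FAMINMAX_UNS))]
    "TARGET_FAMINMAX"
    "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
  )

with the expansion code passing a single mode to code_for_aarch64.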
> (define_insn "*aarch64_faminmax_fused"
> @@ -9999,3 +9999,71 @@
> "TARGET_FAMINMAX"
> "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
> )
> +
> +;; fpm unary instructions.
> +(define_insn "@aarch64_<fpm_uns_name><V8HFBF:mode><VB:mode>"
> + [(set (match_operand:V8HFBF 0 "register_operand" "=w")
> + (unspec:V8HFBF
> + [(match_operand:VB 1 "register_operand" "w")
> + (reg:DI FPM_REGNUM)]
> + FPM_UNARY_UNS))]
> + "TARGET_FP8"
> + "<fpm_uns_op>\t%0.<V8HFBF:Vtype>, %1.<VB:Vtype>"
> +)
> +
> +;; fpm unary instructions, where the input is lowered from V16QI to
> +;; V8QI.
> +(define_insn "@aarch64_<fpm_uns_name><V8HFBF:mode><V16QI_ONLY:mode>"
> + [(set (match_operand:V8HFBF 0 "register_operand" "=w")
> + (unspec:V8HFBF
> + [(match_operand:V16QI_ONLY 1 "register_operand" "w")
> + (reg:DI FPM_REGNUM)]
> + FPM_UNARY_LOW_UNS))]
> + "TARGET_FP8"
> + {
> + operands[1] = force_lowpart_subreg (V8QImode,
> + operands[1],
> + recog_data.operand[1]->mode);
> + return "<fpm_uns_op>\t%0.<V8HFBF:Vtype>, %1.8b";
> + }
> +)
> +
> +;; fpm binary instructions.
> +(define_insn
> + "@aarch64_<fpm_uns_name><VB:mode><VCVTFPM:mode><VH_SF:mode>"
> + [(set (match_operand:VB 0 "register_operand" "=w")
> + (unspec:VB
> + [(match_operand:VCVTFPM 1 "register_operand" "w")
> + (match_operand:VH_SF 2 "register_operand" "w")
> + (reg:DI FPM_REGNUM)]
> + FPM_BINARY_UNS))]
> + "TARGET_FP8"
> + "<fpm_uns_op>\t%0.<VB:Vtype>, %1.<VCVTFPM:Vtype>, %2.<VH_SF:Vtype>"
> +)
> +
> +;; fpm ternary instructions.
> +(define_insn
> +  "@aarch64_<fpm_uns_name><V16QI_ONLY:mode><V8QI_ONLY:mode><V4SF_ONLY:mode><V4SF_ONLY:mode>"
> + [(set (match_operand:V16QI_ONLY 0 "register_operand" "=w")
> + (unspec:V16QI_ONLY
> + [(match_operand:V8QI_ONLY 1 "register_operand" "w")
> + (match_operand:V4SF_ONLY 2 "register_operand" "w")
> + (match_operand:V4SF_ONLY 3 "register_operand" "w")
> + (reg:DI FPM_REGNUM)]
> + FPM_TERNARY_VCVT_UNS))]
> + "TARGET_FP8"
> + {
> + operands[1] = force_reg (V16QImode, operands[1]);
> + return "<fpm_uns_op>\t%1.16b, %2.<V4SF_ONLY:Vtype>, %3.<V4SF_ONLY:Vtype>";
> + }
> +)
> +
> +;; fpm scale instructions
> +(define_insn "@aarch64_<fpm_uns_op><VHSDF:mode><VHSDI:mode>"
> + [(set (match_operand:VHSDF 0 "register_operand" "=w")
> + (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
> + (match_operand:VHSDI 2 "register_operand" "w")]
> + FPM_SCALE_UNS))]
> + "TARGET_FP8"
> + "<fpm_uns_op>\t%0.<VHSDF:Vtype>, %1.<VHSDF:Vtype>, %2.<VHSDI:Vtype>"
> +)
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 8269b0cdcd9..e3026c36e1c 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -41,6 +41,9 @@
> ;; Iterators for single modes, for "@" patterns.
> (define_mode_iterator SI_ONLY [SI])
> (define_mode_iterator DI_ONLY [DI])
> +(define_mode_iterator V8QI_ONLY [V8QI])
> +(define_mode_iterator V16QI_ONLY [V16QI])
> +(define_mode_iterator V4SF_ONLY [V4SF])
>
> ;; Iterator for all integer modes (up to 64-bit)
> (define_mode_iterator ALLI [QI HI SI DI])
> @@ -163,6 +166,12 @@
> (define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
> (V8HF "TARGET_SIMD_F16INST")
> V2SF V4SF V2DF])
> +(define_mode_iterator VH_SF [(V4HF "TARGET_SIMD_F16INST")
> + (V8HF "TARGET_SIMD_F16INST")
> + V4SF])
> +
> +;; Advanced SIMD Integer modes.
> +(define_mode_iterator VHSDI [V4HI V8HI V2SI V4SI V2DI])
>
> ;; Advanced SIMD Float modes, and DF.
> (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
> @@ -426,6 +435,12 @@
> (V8HF "TARGET_SIMD_F16INST")
> V2SF V4SF])
>
> +;; Modes available for Advanced SIMD FP8 conversion operations.
> +(define_mode_iterator VCVTFPM [V8QI
> + (V4HF "TARGET_SIMD_F16INST")
> + (V8HF "TARGET_SIMD_F16INST")
> + V4SF])
> +
> ;; Iterators for single modes, for "@" patterns.
> (define_mode_iterator VNx16QI_ONLY [VNx16QI])
> (define_mode_iterator VNx16SI_ONLY [VNx16SI])
> @@ -630,6 +645,9 @@
> ;; Bfloat16 modes to which V4SF can be converted
> (define_mode_iterator V4SF_TO_BF [V4BF V8BF])
>
> +;; Float16 and Bfloat16 modes
> +(define_mode_iterator V8HFBF [V8HF V8BF])
> +
> (define_mode_iterator SVE_BHSx24 [VNx32QI VNx16HI VNx8SI
> VNx16BF VNx16HF VNx8SF
> VNx64QI VNx32HI VNx16SI
> @@ -694,6 +712,7 @@
> UNSPEC_FMINV ; Used in aarch64-simd.md.
> UNSPEC_FADDV ; Used in aarch64-simd.md.
> UNSPEC_FNEG ; Used in aarch64-simd.md.
> + UNSPEC_FSCALE ; Used in aarch64-simd.md.
> UNSPEC_ADDV ; Used in aarch64-simd.md.
> UNSPEC_SMAXV ; Used in aarch64-simd.md.
> UNSPEC_SMINV ; Used in aarch64-simd.md.
> @@ -731,6 +750,22 @@
> UNSPEC_SSHLL ; Used in aarch64-simd.md.
> UNSPEC_USHLL ; Used in aarch64-simd.md.
> UNSPEC_ADDP ; Used in aarch64-simd.md.
> + UNSPEC_VCVT_F16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVTQ_F16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT_F32 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT_HIGH_F32 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT1_BF16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT1_F16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT1_HIGH_BF16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT1_HIGH_F16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT1_LOW_BF16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT1_LOW_F16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT2_BF16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT2_F16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT2_HIGH_BF16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT2_HIGH_F16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT2_LOW_BF16 ; Used in aarch64-simd.md.
> + UNSPEC_VCVT2_LOW_F16 ; Used in aarch64-simd.md.
> UNSPEC_TBL ; Used in vector permute patterns.
> UNSPEC_TBX ; Used in vector permute patterns.
> UNSPEC_CONCAT ; Used in vector permute patterns.
> @@ -4534,3 +4569,67 @@
>
> (define_code_attr faminmax_op
> [(smax "famax") (smin "famin")])
> +
> +;; Iterators and attributes for fpm instructions
> +
> +(define_int_iterator FPM_UNARY_UNS
> + [UNSPEC_VCVT1_BF16
> + UNSPEC_VCVT1_F16
> + UNSPEC_VCVT1_HIGH_BF16
> + UNSPEC_VCVT1_HIGH_F16
> + UNSPEC_VCVT2_BF16
> + UNSPEC_VCVT2_F16
> + UNSPEC_VCVT2_HIGH_BF16
> + UNSPEC_VCVT2_HIGH_F16])
We shouldn't need separate unspecs for BF16 vs. F16. That information
is encoded in the mode instead. Same for the others.
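For example (untested, names invented), the non-low vcvt1/vcvt2 forms
could share two unspecs, with mode attributes supplying the "b" prefix
and the "2" suffix:

  (define_int_iterator FPM_VCVT [UNSPEC_VCVT1 UNSPEC_VCVT2])

  (define_int_attr fpm_vcvt_num [(UNSPEC_VCVT1 "1") (UNSPEC_VCVT2 "2")])

  (define_mode_attr fp8_b [(V8BF "b") (V8HF "")])

  (define_mode_attr fp8_2 [(V8QI "") (V16QI "2")])

  (define_insn "@aarch64_vcvt<fpm_vcvt_num><V8HFBF:mode><VB:mode>"
    [(set (match_operand:V8HFBF 0 "register_operand" "=w")
          (unspec:V8HFBF
            [(match_operand:VB 1 "register_operand" "w")
             (reg:DI FPM_REGNUM)]
            FPM_VCVT))]
    "TARGET_FP8"
    "<V8HFBF:fp8_b>f<fpm_vcvt_num>cvtl<VB:fp8_2>\t%0.<V8HFBF:Vtype>, %1.<VB:Vtype>"
  )

The low variants would still need a separate pattern, since they take a
V16QI input but use the non-"2" form of the instruction.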
Thanks,
Richard
> +
> +(define_int_iterator FPM_UNARY_LOW_UNS
> + [UNSPEC_VCVT1_LOW_BF16
> + UNSPEC_VCVT1_LOW_F16
> + UNSPEC_VCVT2_LOW_BF16
> + UNSPEC_VCVT2_LOW_F16])
> +
> +(define_int_iterator FPM_BINARY_UNS
> + [UNSPEC_VCVT_F16
> + UNSPEC_VCVTQ_F16
> + UNSPEC_VCVT_F32])
> +
> +(define_int_iterator FPM_SCALE_UNS [UNSPEC_FSCALE])
> +
> +(define_int_iterator FPM_TERNARY_VCVT_UNS [UNSPEC_VCVT_HIGH_F32])
> +
> +(define_int_attr fpm_uns_op
> + [(UNSPEC_FSCALE "fscale")
> + (UNSPEC_VCVT_F16 "fcvtn")
> + (UNSPEC_VCVTQ_F16 "fcvtn")
> + (UNSPEC_VCVT_F32 "fcvtn")
> + (UNSPEC_VCVT_HIGH_F32 "fcvtn2")
> + (UNSPEC_VCVT1_BF16 "bf1cvtl")
> + (UNSPEC_VCVT1_F16 "f1cvtl")
> + (UNSPEC_VCVT1_HIGH_BF16 "bf1cvtl2")
> + (UNSPEC_VCVT1_HIGH_F16 "f1cvtl2")
> + (UNSPEC_VCVT1_LOW_BF16 "bf1cvtl")
> + (UNSPEC_VCVT1_LOW_F16 "f1cvtl")
> + (UNSPEC_VCVT2_BF16 "bf2cvtl")
> + (UNSPEC_VCVT2_F16 "f2cvtl")
> + (UNSPEC_VCVT2_HIGH_BF16 "bf2cvtl2")
> + (UNSPEC_VCVT2_HIGH_F16 "f2cvtl2")
> + (UNSPEC_VCVT2_LOW_BF16 "bf2cvtl")
> + (UNSPEC_VCVT2_LOW_F16 "f2cvtl")])
> +
> +(define_int_attr fpm_uns_name
> + [(UNSPEC_VCVT_F16 "vcvt_mf8_f16_fpm")
> + (UNSPEC_VCVTQ_F16 "vcvtq_mf8_f16_fpm")
> + (UNSPEC_VCVT_F32 "vcvt_mf8_f32_fpm")
> + (UNSPEC_VCVT_HIGH_F32 "vcvt_high_mf8_f32_fpm")
> + (UNSPEC_VCVT1_BF16 "vcvt1_bf16_mf8_fpm")
> + (UNSPEC_VCVT1_F16 "vcvt1_f16_mf8_fpm")
> + (UNSPEC_VCVT1_HIGH_BF16 "vcvt1_high_bf16_mf8_fpm")
> + (UNSPEC_VCVT1_HIGH_F16 "vcvt1_high_f16_mf8_fpm")
> + (UNSPEC_VCVT1_LOW_BF16 "vcvt1_low_bf16_mf8_fpm")
> + (UNSPEC_VCVT1_LOW_F16 "vcvt1_low_f16_mf8_fpm")
> + (UNSPEC_VCVT2_BF16 "vcvt2_bf16_mf8_fpm")
> + (UNSPEC_VCVT2_F16 "vcvt2_f16_mf8_fpm")
> + (UNSPEC_VCVT2_HIGH_BF16 "vcvt2_high_bf16_mf8_fpm")
> + (UNSPEC_VCVT2_HIGH_F16 "vcvt2_high_f16_mf8_fpm")
> + (UNSPEC_VCVT2_LOW_BF16 "vcvt2_low_bf16_mf8_fpm")
> + (UNSPEC_VCVT2_LOW_F16 "vcvt2_low_f16_mf8_fpm")])
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> index afb44f83f60..635a7eaf4a2 100644
> --- a/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8.c
> @@ -5,19 +5,9 @@
>
> #include <arm_acle.h>
>
> -#ifdef __ARM_FEATURE_FP8
> -#error "__ARM_FEATURE_FP8 feature macro defined."
> -#endif
> -
> #pragma GCC push_options
> #pragma GCC target("arch=armv9.4-a+fp8")
>
> -/* We do not define __ARM_FEATURE_FP8 until all
> - relevant features have been added. */
> -#ifdef __ARM_FEATURE_FP8
> -#error "__ARM_FEATURE_FP8 feature macro defined."
> -#endif
> -
> /*
> **test_write_fpmr_sysreg_asm_64:
> ** msr fpmr, x0
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
> new file mode 100644
> index 00000000000..d95a861fcfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scale_fpm.c
> @@ -0,0 +1,60 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include "arm_neon.h"
> +
> +/*
> +** test_vscale_f16:
> +** fscale v0.4h, v0.4h, v1.4h
> +** ret
> +*/
> +float16x4_t
> +test_vscale_f16 (float16x4_t a, int16x4_t b)
> +{
> + return vscale_f16 (a, b);
> +}
> +
> +/*
> +** test_vscaleq_f16:
> +** fscale v0.8h, v0.8h, v1.8h
> +** ret
> +*/
> +float16x8_t
> +test_vscaleq_f16 (float16x8_t a, int16x8_t b)
> +{
> + return vscaleq_f16 (a, b);
> +}
> +
> +/*
> +** test_vscale_f32:
> +** fscale v0.2s, v0.2s, v1.2s
> +** ret
> +*/
> +float32x2_t
> +test_vscale_f32 (float32x2_t a, int32x2_t b)
> +{
> + return vscale_f32 (a, b);
> +}
> +
> +/*
> +** test_vscaleq_f32:
> +** fscale v0.4s, v0.4s, v1.4s
> +** ret
> +*/
> +float32x4_t
> +test_vscaleq_f32 (float32x4_t a, int32x4_t b)
> +{
> + return vscaleq_f32 (a, b);
> +}
> +
> +/*
> +** test_vscaleq_f64:
> +** fscale v0.2d, v0.2d, v1.2d
> +** ret
> +*/
> +float64x2_t
> +test_vscaleq_f64 (float64x2_t a, int64x2_t b)
> +{
> + return vscaleq_f64 (a, b);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> new file mode 100644
> index 00000000000..39076684345
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/vcvt_fpm.c
> @@ -0,0 +1,197 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv9-a+fp8" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include "arm_neon.h"
> +
> +/*
> +** test_vcvt1_bf16:
> +** msr fpmr, x0
> +** bf1cvtl v0.8h, v0.8b
> +** ret
> +*/
> +bfloat16x8_t
> +test_vcvt1_bf16 (mfloat8x8_t a, fpm_t b)
> +{
> + return vcvt1_bf16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_high_vcvt1_bf16:
> +** msr fpmr, x0
> +** bf1cvtl2 v0.8h, v0.16b
> +** ret
> +*/
> +bfloat16x8_t
> +test_high_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt1_high_bf16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_low_vcvt1_bf16:
> +** msr fpmr, x0
> +** bf1cvtl v0.8h, v0.8b
> +** ret
> +*/
> +bfloat16x8_t
> +test_low_vcvt1_bf16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt1_low_bf16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_vcvt1_f16:
> +** msr fpmr, x0
> +** f1cvtl v0.8h, v0.8b
> +** ret
> +*/
> +float16x8_t
> +test_vcvt1_f16 (mfloat8x8_t a, fpm_t b)
> +{
> + return vcvt1_f16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_high_vcvt1_f16:
> +** msr fpmr, x0
> +** f1cvtl2 v0.8h, v0.16b
> +** ret
> +*/
> +float16x8_t
> +test_high_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt1_high_f16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_low_vcvt1_f16:
> +** msr fpmr, x0
> +** f1cvtl v0.8h, v0.8b
> +** ret
> +*/
> +float16x8_t
> +test_low_vcvt1_f16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt1_low_f16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_vcvt2_bf16:
> +** msr fpmr, x0
> +** bf2cvtl v0.8h, v0.8b
> +** ret
> +*/
> +bfloat16x8_t
> +test_vcvt2_bf16 (mfloat8x8_t a, fpm_t b)
> +{
> + return vcvt2_bf16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_high_vcvt2_bf16:
> +** msr fpmr, x0
> +** bf2cvtl2 v0.8h, v0.16b
> +** ret
> +*/
> +bfloat16x8_t
> +test_high_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt2_high_bf16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_low_vcvt2_bf16:
> +** msr fpmr, x0
> +** bf2cvtl v0.8h, v0.8b
> +** ret
> +*/
> +bfloat16x8_t
> +test_low_vcvt2_bf16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt2_low_bf16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_vcvt2_f16:
> +** msr fpmr, x0
> +** f2cvtl v0.8h, v0.8b
> +** ret
> +*/
> +float16x8_t
> +test_vcvt2_f16 (mfloat8x8_t a, fpm_t b)
> +{
> + return vcvt2_f16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_high_vcvt2_f16:
> +** msr fpmr, x0
> +** f2cvtl2 v0.8h, v0.16b
> +** ret
> +*/
> +float16x8_t
> +test_high_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt2_high_f16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_low_vcvt2_f16:
> +** msr fpmr, x0
> +** f2cvtl v0.8h, v0.8b
> +** ret
> +*/
> +float16x8_t
> +test_low_vcvt2_f16 (mfloat8x16_t a, fpm_t b)
> +{
> + return vcvt2_low_f16_mf8_fpm(a, b);
> +}
> +
> +/*
> +** test_vcvt_f16:
> +** msr fpmr, x0
> +** fcvtn v0.8b, v0.4h, v1.4h
> +** ret
> +*/
> +mfloat8x8_t
> +test_vcvt_f16 (float16x4_t a, float16x4_t b, fpm_t c)
> +{
> + return vcvt_mf8_f16_fpm(a, b, c);
> +}
> +
> +/*
> +** test_vcvtq_f16:
> +** msr fpmr, x0
> +** fcvtn v0.16b, v0.8h, v1.8h
> +** ret
> +*/
> +mfloat8x16_t
> +test_vcvtq_f16 (float16x8_t a, float16x8_t b, fpm_t c)
> +{
> + return vcvtq_mf8_f16_fpm(a, b, c);
> +}
> +
> +/*
> +** test_vcvt_f32:
> +** msr fpmr, x0
> +** fcvtn v0.8b, v0.4s, v1.4s
> +** ret
> +*/
> +mfloat8x8_t
> +test_vcvt_f32 (float32x4_t a, float32x4_t b, fpm_t c)
> +{
> + return vcvt_mf8_f32_fpm(a, b, c);
> +}
> +
> +/*
> +** test_vcvt_high_f32:
> +** msr fpmr, x0
> +** fcvtn2 v0.16b, v1.4s, v2.4s
> +** ret
> +*/
> +mfloat8x16_t
> +test_vcvt_high_f32 (mfloat8x8_t a, float32x4_t b, float32x4_t c, fpm_t d)
> +{
> + return vcvt_high_mf8_f32_fpm(a, b, c, d);
> +}