On Fri, Jan 02, 2026 at 08:24:19PM +0000, Karl Meakin wrote:
> Add support for the `SVE_BFSCALE` architecture extension.
>
> gcc/ChangeLog:
>
> * doc/invoke.texi: Document `+sve-bfscale` flag.
> * config/aarch64/aarch64.h (TARGET_SVE_BFSCALE): New macro.
> * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins):
> Define `__ARM_FEATURE_SVE_BFSCALE`.
> * config/aarch64/aarch64-sve-builtins-base.cc (svmul_impl::fold): Skip
> constant folding for floating-point or unpredicated multiplications.
> * config/aarch64/aarch64-sve-builtins-sve2.def: New `SVE_FUNCTION`s.
> * config/aarch64/aarch64-sve.md: Modify insns for
> `SVE_COND_FP_BINARY_INT` to handle BF16 modes.
> (@aarch64_sve_<optab><mode>, @aarch64_sve_<optab><mode>_single): New
> insn for `BFSCALE`.
> * config/aarch64/aarch64-sve2.md: Modify insns for `UNSPEC_FSCALE` to
> handle BF16 modes.
> * config/aarch64/iterators.md (SVE_FULL_F_SCALAR): Add `VNx8BF` to
> iterator.
> (SVE_FULL_F_BFSCALE): New iterator.
> (SVE_Fx24_BFSCALE): New iterator.
> (SVE_BFx24): New iterator.
> (UNSPEC_FMUL): New unspec.
> (V_INT_EQUIV): Add entries for BF16 modes.
> (b): Add entries for scalar float modes.
> (is_bf16): Add entries for BF16 modes and reformat.
> (SVSCALE_SINGLE_INTARG): Likewise.
> (SVSCALE_INTARG): Likewise.
> (SVE_FP_MUL): New iterator.
>
> gcc/testsuite/ChangeLog:
>
> * lib/target-supports.exp: Add `sve-bfscale` to `sve_exts`.
> * gcc.target/aarch64/pragma_cpp_predefs_4.c: Add test for
> `__ARM_FEATURE_SVE_BFSCALE`.
> * gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c: New test.
> * gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c: New test.
> * gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c: New test.
> * gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c: New test.
> * gcc.target/aarch64/sve/acle/asm/scale_bf16.c: New test.
> * gcc.target/aarch64/sve/acle/general-c/bfscale.c: New test.
> ---
> gcc/config/aarch64/aarch64-c.cc | 2 +
> .../aarch64/aarch64-sve-builtins-base.cc | 9 +-
> .../aarch64/aarch64-sve-builtins-sve2.def | 43 +++
> gcc/config/aarch64/aarch64-sve.md | 119 ++++---
> gcc/config/aarch64/aarch64-sve2.md | 32 +-
> gcc/config/aarch64/aarch64.h | 1 +
> gcc/config/aarch64/iterators.md | 64 +++-
> gcc/doc/invoke.texi | 3 +-
> .../gcc.target/aarch64/pragma_cpp_predefs_4.c | 5 +
> .../aarch64/sme2/acle-asm/mul_bf16_x2.c | 191 ++++++++++
> .../aarch64/sme2/acle-asm/mul_bf16_x4.c | 225 ++++++++++++
> .../aarch64/sme2/acle-asm/scale_bf16_x2.c | 192 ++++++++++
> .../aarch64/sme2/acle-asm/scale_bf16_x4.c | 229 ++++++++++++
> .../aarch64/sve/acle/asm/scale_bf16.c | 335 ++++++++++++++++++
> .../aarch64/sve/acle/general-c/bfscale.c | 114 ++++++
> gcc/testsuite/lib/target-supports.exp | 2 +-
> 16 files changed, 1491 insertions(+), 75 deletions(-)
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
>
> diff --git a/gcc/config/aarch64/aarch64-c.cc b/gcc/config/aarch64/aarch64-c.cc
> index ee539531d36..d4c396906e4 100644
> --- a/gcc/config/aarch64/aarch64-c.cc
> +++ b/gcc/config/aarch64/aarch64-c.cc
> @@ -275,6 +275,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile)
> "__ARM_FEATURE_BF16", pfile);
> aarch64_def_or_undef (TARGET_SVE_BF16,
> "__ARM_FEATURE_SVE_BF16", pfile);
> + aarch64_def_or_undef (TARGET_SVE_BFSCALE,
> + "__ARM_FEATURE_SVE_BFSCALE", pfile);
>
> aarch64_def_or_undef (TARGET_LUT, "__ARM_FEATURE_LUT", pfile);
> aarch64_def_or_undef (TARGET_SME_LUTv2, "__ARM_FEATURE_SME_LUTv2", pfile);
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index f07727416b5..999c029f3e8 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -2308,11 +2308,18 @@ class svmul_impl : public rtx_code_function
> {
> public:
> CONSTEXPR svmul_impl ()
> - : rtx_code_function (MULT, MULT, UNSPEC_COND_FMUL) {}
> + : rtx_code_function (MULT, MULT, UNSPEC_COND_FMUL, UNSPEC_FMUL) {}
>
> gimple *
> fold (gimple_folder &f) const override
> {
> +  /* The code below assumes that the function has 3 arguments (pg, rn, rm).
> +   * Unpredicated functions have only 2 arguments (rn, rm) so will cause the
> +   * code below to crash.  Also skip if it does not operate on integers,
> +   * since all the optimizations below are for integer multiplication.  */
> + if (!f.type_suffix (0).integer_p || f.pred == aarch64_sve::PRED_none)
> + return nullptr;
> +
> if (auto *res = f.fold_const_binary (MULT_EXPR))
> return res;
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> index e7142080c05..62506a76a57 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def
> @@ -430,3 +430,46 @@ DEF_SVE_FUNCTION_GS_FPM (svcvtl1, unary_convert,
> cvt_mf8, x2, none, set)
> DEF_SVE_FUNCTION_GS_FPM (svcvtl2, unary_convert, cvt_mf8, x2, none, set)
> DEF_SVE_FUNCTION_GS (svscale, binary_int_opt_single_n, all_float, x24, none)
> #undef REQUIRED_EXTENSIONS
> +
> +/*
> +- BFSCALE (predicated)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SVE2 != 0
> + svbfloat16_t svscale[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svint16_t
> zm);
> + svbfloat16_t svscale[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svint16_t
> zm);
> + svbfloat16_t svscale[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svint16_t
> zm);
> + svbfloat16_t svscale[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, int16_t
> zm);
> + svbfloat16_t svscale[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, int16_t
> zm);
> + svbfloat16_t svscale[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, int16_t
> zm); */
> +#define REQUIRED_EXTENSIONS \
> + sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_SVE_BFSCALE, \
> + AARCH64_FL_SME2 | AARCH64_FL_SVE_BFSCALE)
> +DEF_SVE_FUNCTION (svscale, binary_int_opt_n, h_bfloat, mxz)
> +#undef REQUIRED_EXTENSIONS
> +
> +/*
> +- BFSCALE (multiple vectors)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svscale[_bf16_x2] (svbfloat16x2_t zdn, svint16x2_t zm)
> __arm_streaming;
> + svbfloat16x4_t svscale[_bf16_x4] (svbfloat16x4_t zdn, svint16x4_t zm)
> __arm_streaming;
> +
> +- BFSCALE (multiple and single vector)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svscale[_single_bf16_x2] (svbfloat16x2_t zn, svint16_t zm)
> __arm_streaming;
> + svbfloat16x4_t svscale[_single_bf16_x4] (svbfloat16x4_t zn, svint16_t zm)
> __arm_streaming; */
> +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SVE_BFSCALE |
> AARCH64_FL_SME2)
> +DEF_SVE_FUNCTION_GS (svscale, binary_int_opt_single_n, h_bfloat, x24, none)
> +#undef REQUIRED_EXTENSIONS
> +
> +/*
> +- BFMUL (multiple vectors)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svmul[_bf16_x2] (svbfloat16x2_t zdn, svbfloat16x2_t zm)
> __arm_streaming;
> + svbfloat16x4_t svmul[_bf16_x4] (svbfloat16x4_t zdn, svbfloat16x4_t zm)
> __arm_streaming;
> +
> +- BFMUL (multiple and single vector)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svmul[_single_bf16_x2] (svbfloat16x2_t zn, svbfloat16x2_t
> zm) __arm_streaming;
> + svbfloat16x4_t svmul[_single_bf16_x4] (svbfloat16x4_t zn, svbfloat16x4_t
> zm) __arm_streaming; */
> +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SVE_BFSCALE |
> AARCH64_FL_SME2)
> +DEF_SVE_FUNCTION_GS (svmul, binary_opt_single_n, h_bfloat, x24, none)
> +#undef REQUIRED_EXTENSIONS
> \ No newline at end of file
> diff --git a/gcc/config/aarch64/aarch64-sve.md
> b/gcc/config/aarch64/aarch64-sve.md
> index cc024a81746..5d790feb2f5 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -5527,6 +5527,7 @@ (define_insn_and_rewrite "*cond_<sve_int_op><mode>_any"
> ;; -------------------------------------------------------------------------
> ;; Includes:
> ;; - FSCALE
> +;; - BFSCALE (SVE_BFSCALE)
> ;; - FTSMUL
> ;; - FTSSEL
> ;; -------------------------------------------------------------------------
> @@ -5564,15 +5565,15 @@ (define_insn "@aarch64_sve_<optab><mode>"
> (define_insn "@aarch64_pred_<optab><mode>"
> [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand")
> (unspec:SVE_FULL_F_SCALAR
> - [(match_operand:<VPRED> 1 "register_operand")
> - (match_operand:SI 4 "aarch64_sve_gp_strictness")
> + [(match_operand:<VPRED> 1 "register_operand")
> + (match_operand:SI 4 "aarch64_sve_gp_strictness")
> (match_operand:SVE_FULL_F_SCALAR 2 "register_operand")
> - (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> + (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> SVE_COND_FP_BINARY_INT))]
> "TARGET_SVE"
> - {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
> - [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%Z0.<Vetype>,
> %1/m, %Z0.<Vetype>, %Z3.<Vetype>
> - [ ?&w , Upl , w , w ; yes ] movprfx\t%Z0,
> %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
> + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
> + [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%Z0.<Vetype>,
> %1/m, %Z0.<Vetype>, %Z3.<Vetype>
> + [ ?&w , Upl , w , w ; yes ] movprfx\t%Z0,
> %Z2\;<b><sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
> }
> [(set_attr "sve_type" "sve_fp_mul")]
> )
> @@ -5580,16 +5581,16 @@ (define_insn "@aarch64_pred_<optab><mode>"
> ;; Predicated floating-point binary operations with merging, taking an
> ;; integer as their second operand.
> (define_expand "@cond_<optab><mode>"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand")
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> - (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand")
> + (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> SVE_COND_FP_BINARY_INT)
> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_FULL_F_BFSCALE 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE"
> )
> @@ -5597,21 +5598,21 @@ (define_expand "@cond_<optab><mode>"
> ;; Predicated floating-point binary operations that take an integer as their
> ;; second operand, with inactive lanes coming from the first operand.
> (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand")
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_operand 4)
> (const_int SVE_RELAXED_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand")
> (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> SVE_COND_FP_BINARY_INT)
> (match_dup 2)]
> UNSPEC_SEL))]
> "TARGET_SVE"
> - {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
> - [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%0.<Vetype>,
> %1/m, %0.<Vetype>, %3.<Vetype>
> - [ ?&w , Upl , w , w ; yes ] movprfx\t%0,
> %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
> + [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%0.<Vetype>,
> %1/m, %0.<Vetype>, %3.<Vetype>
> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0,
> %2\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> }
> "&& !rtx_equal_p (operands[1], operands[4])"
> {
> @@ -5621,21 +5622,21 @@ (define_insn_and_rewrite
> "*cond_<optab><mode>_2_relaxed"
> )
>
> (define_insn "*cond_<optab><mode>_2_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand")
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand")
> (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> SVE_COND_FP_BINARY_INT)
> (match_dup 2)]
> UNSPEC_SEL))]
> "TARGET_SVE"
> - {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
> - [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%0.<Vetype>,
> %1/m, %0.<Vetype>, %3.<Vetype>
> - [ ?&w , Upl , w , w ; yes ] movprfx\t%0,
> %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ]
> + [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%0.<Vetype>,
> %1/m, %0.<Vetype>, %3.<Vetype>
> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0,
> %2\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> }
> [(set_attr "sve_type" "sve_fp_mul")]
> )
> @@ -5644,22 +5645,22 @@ (define_insn "*cond_<optab><mode>_2_strict"
> ;; their second operand, with the values of inactive lanes being distinct
> ;; from the other inputs.
> (define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand")
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_operand 5)
> (const_int SVE_RELAXED_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand")
> (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> SVE_COND_FP_BINARY_INT)
> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_FULL_F_BFSCALE 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
> {@ [ cons: =0 , 1 , 2 , 3 , 4 ]
> - [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> - [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> - [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m,
> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m,
> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> [ ?&w , Upl , w , w , w ] #
> }
> "&& 1"
> @@ -5682,22 +5683,22 @@ (define_insn_and_rewrite
> "*cond_<optab><mode>_any_relaxed"
> )
>
> (define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand")
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + (unspec:SVE_FULL_F_BFSCALE
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> - (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand")
> + (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> SVE_COND_FP_BINARY_INT)
> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_FULL_F_BFSCALE 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
> {@ [ cons: =0 , 1 , 2 , 3 , 4 ]
> - [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> - [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> - [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m,
> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z,
> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> + [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m,
> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> [ ?&w , Upl , w , w , w ] #
> }
> "&& reload_completed
> @@ -6895,7 +6896,7 @@ (define_insn_and_rewrite
> "*aarch64_cond_abd<mode>_any_strict"
> ;; ---- [FP] Multiplication
> ;; -------------------------------------------------------------------------
> ;; Includes:
> -;; - BFMUL (SVE_B16B16)
> +;; - BFMUL (SVE_B16B16 || SVE_BFSCALE)
> ;; - FMUL
> ;; -------------------------------------------------------------------------
>
> @@ -6936,6 +6937,36 @@ (define_insn "@aarch64_mul_lane_<mode>"
> [(set_attr "sve_type" "sve_fp_mul")]
> )
>
> +;; BFMUL (multiple vectors)
> +;; svbfloat16x2_t svmul[_bf16_x2](svbfloat16x2_t zd, svbfloat16x2_t zm)
> __arm_streaming;
> +;; svbfloat16x4_t svmul[_bf16_x4](svbfloat16x4_t zd, svbfloat16x4_t zm)
> __arm_streaming;
> +;; BFMUL { <Zd1>.H-<Zd2>.H }, { <Zn1>.H-<Zn2>.H }, { <Zm1>.H-<Zm2>.H }
> +;; BFMUL { <Zd1>.H-<Zd4>.H }, { <Zn1>.H-<Zn4>.H }, { <Zm1>.H-<Zm4>.H }
> +(define_insn "@aarch64_sve_<optab><mode>"
> + [(set (match_operand:SVE_BFx24 0 "register_operand" "=Uw<vector_count>")
> + (unspec:SVE_BFx24
> + [(match_operand:SVE_BFx24 1 "register_operand" "Uw<vector_count>")
> + (match_operand:SVE_BFx24 2 "register_operand" "Uw<vector_count>")]
> + SVE_FP_MUL))]
> + "TARGET_SVE_BFSCALE && TARGET_SME2"
> + "bfmul\t%0, %1, %2"
> +)
> +
> +;; BFMUL (multiple and single vector)
> +;; svbfloat16x2_t svmul[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zm)
> __arm_streaming;
> +;; svbfloat16x4_t svmul[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zm)
> __arm_streaming;
> +;; BFMUL { <Zd1>.H-<Zd2>.H }, { <Zn1>.H-<Zn2>.H }, <Zm>.H
> +;; BFMUL { <Zd1>.H-<Zd4>.H }, { <Zn1>.H-<Zn4>.H }, <Zm>.H
> +(define_insn "@aarch64_sve_<optab><mode>_single"
> + [(set (match_operand:SVE_BFx24 0 "register_operand" "=Uw<vector_count>")
> + (unspec:SVE_BFx24
> + [(match_operand:SVE_BFx24 1 "register_operand" "Uw<vector_count>")
> + (match_operand:<VSINGLE> 2 "register_operand" "x")]
> + SVE_FP_MUL))]
> + "TARGET_SVE_BFSCALE && TARGET_SME2"
> + "bfmul\t%0, %1, %2.h"
> +)
These two patterns should be moved to aarch64-sve2.md; multi-vector
operations like this are currently handled there.
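Something like the following, for instance (untested sketch, reusing your
SVE_BFx24/SVE_FP_MUL iterators and folding in the streaming requirement
mentioned further down):

(define_insn "@aarch64_sve_<optab><mode>"
  [(set (match_operand:SVE_BFx24 0 "register_operand" "=Uw<vector_count>")
	(unspec:SVE_BFx24
	  [(match_operand:SVE_BFx24 1 "register_operand" "Uw<vector_count>")
	   (match_operand:SVE_BFx24 2 "register_operand" "Uw<vector_count>")]
	  SVE_FP_MUL))]
  "TARGET_STREAMING_SME2 && TARGET_SVE_BFSCALE"
  "bfmul\t%0, %1, %2"
)

and similarly for the _single variant.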
> +
> ;; -------------------------------------------------------------------------
> ;; ---- [FP] Division
> ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/aarch64-sve2.md
> b/gcc/config/aarch64/aarch64-sve2.md
> index 1aa885abedd..3bf6344c345 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1487,26 +1487,38 @@ (define_insn "@aarch64_sve_fclamp_single<mode>"
> ;; -------------------------------------------------------------------------
> ;; Includes the multiple and single vector and multiple vectors forms of
> ;; - FSCALE
> +;; - BFSCALE
> ;; -------------------------------------------------------------------------
>
> +;; FSCALE (multiple vectors)
> +;; svfloat16x2_t svscale[_f16_x2] (svfloat16x2_t zdn, svint16x2_t zm) __arm_streaming;
> +;; svfloat16x4_t svscale[_f16_x4] (svfloat16x4_t zdn, svint16x4_t zm) __arm_streaming;
> +;; FSCALE { <Zdn1>.H-<Zdn4>.H }, { <Zdn1>.H-<Zdn4>.H }, { <Zm1>.H-<Zm4>.H }
> (define_insn "@aarch64_sve_fscale<mode>"
> - [(set (match_operand:SVE_Fx24_NOBF 0 "register_operand"
> "=Uw<vector_count>")
> - (unspec:SVE_Fx24_NOBF
> - [(match_operand:SVE_Fx24_NOBF 1 "register_operand" "0")
> + [(set (match_operand:SVE_Fx24_BFSCALE 0 "register_operand"
> "=Uw<vector_count>")
> + (unspec:SVE_Fx24_BFSCALE
> + [(match_operand:SVE_Fx24_BFSCALE 1 "register_operand" "0")
> (match_operand:<SVSCALE_INTARG> 2 "register_operand"
> "Uw<vector_count>")]
> UNSPEC_FSCALE))]
> - "TARGET_STREAMING_SME2 && TARGET_FP8"
> - "fscale\t%0, %1, %2"
> + "<is_bf16> ? (TARGET_SME2 && TARGET_SVE_BFSCALE)
> + : (TARGET_SME2 && TARGET_FP8)"
Where did TARGET_STREAMING_SME2 from the original condition go?  Moreover,
all of the multi-vector patterns that you're adding should probably use
TARGET_STREAMING_SME2 rather than plain TARGET_SME2 as well.
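Something along these lines, say (just a sketch, assuming the BF16 forms
are streaming-only like the existing FP8 ones):

  "<is_bf16> ? (TARGET_STREAMING_SME2 && TARGET_SVE_BFSCALE)
	     : (TARGET_STREAMING_SME2 && TARGET_FP8)"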
> + "<b>fscale\t%0, %1, %2"
> )
>
> +;; FSCALE (multiple and single vector)
> +;; svfloat16x2_t svscale[_single_f16_x2](svfloat16x2_t zn, svint16_t zm)
> __arm_streaming;
> +;; svfloat16x4_t svscale[_single_f16_x4](svfloat16x4_t zn, svint16_t zm)
> __arm_streaming;
> +;; FSCALE { <Zdn1>.H-<Zdn2>.H }, { <Zdn1>.H-<Zdn2>.H }, <Zm>.H
> +;; FSCALE { <Zdn1>.H-<Zdn4>.H }, { <Zdn1>.H-<Zdn4>.H }, <Zm>.H
> (define_insn "@aarch64_sve_single_fscale<mode>"
> - [(set (match_operand:SVE_Fx24_NOBF 0 "register_operand"
> "=Uw<vector_count>")
> - (unspec:SVE_Fx24_NOBF
> - [(match_operand:SVE_Fx24_NOBF 1 "register_operand" "0")
> + [(set (match_operand:SVE_Fx24_BFSCALE 0 "register_operand"
> "=Uw<vector_count>")
> + (unspec:SVE_Fx24_BFSCALE
> + [(match_operand:SVE_Fx24_BFSCALE 1 "register_operand" "0")
> (match_operand:<SVSCALE_SINGLE_INTARG> 2 "register_operand" "x")]
> UNSPEC_FSCALE))]
> - "TARGET_STREAMING_SME2 && TARGET_FP8"
> - "fscale\t%0, %1, %2.<Vetype>"
> + "<is_bf16> ? (TARGET_SME2 && TARGET_SVE_BFSCALE)
> + : (TARGET_SME2 && TARGET_FP8)"
> + "<b>fscale\t%0, %1, %2.<Vetype>"
> )
>
> ;; =========================================================================
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index 2b7d266de10..7c70be500ee 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -385,6 +385,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE
> ATTRIBUTE_UNUSED
> #define TARGET_BF16_FP AARCH64_HAVE_ISA (BF16)
> #define TARGET_BF16_SIMD (TARGET_BF16_FP && TARGET_SIMD)
> #define TARGET_SVE_BF16 (TARGET_BF16_FP && TARGET_SVE)
> +#define TARGET_SVE_BFSCALE (AARCH64_HAVE_ISA (SVE_BFSCALE))
>
> /* PAUTH instructions are enabled through +pauth. */
> #define TARGET_PAUTH AARCH64_HAVE_ISA (PAUTH)
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 569c0876fab..7bed30682ca 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -498,10 +498,13 @@ (define_mode_iterator SVE_PARTIAL_F [VNx2HF VNx4HF
> VNx2SF])
> (define_mode_iterator SVE_F [SVE_PARTIAL_F SVE_FULL_F])
>
> ;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
> -(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
> +(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF (VNx8BF
> "TARGET_SVE_BFSCALE")])
>
> (define_mode_iterator SVE_FULL_F_B16B16 [(VNx8BF "TARGET_SSVE_B16B16")
> SVE_FULL_F])
>
> +(define_mode_iterator SVE_FULL_F_BFSCALE [SVE_FULL_F
> + (VNx8BF "TARGET_SVE_BFSCALE")])
> +
> (define_mode_iterator SVE_PARTIAL_F_B16B16 [(VNx2BF "TARGET_SSVE_B16B16")
> (VNx4BF "TARGET_SSVE_B16B16")
> SVE_PARTIAL_F])
> @@ -738,10 +741,19 @@ (define_mode_iterator SVE_Ix24 [VNx32QI VNx16HI VNx8SI
> VNx4DI
> (define_mode_iterator SVE_Fx24_NOBF [VNx16HF VNx8SF VNx4DF
> VNx32HF VNx16SF VNx8DF])
>
> +(define_mode_iterator SVE_Fx24_BFSCALE [
> + VNx16HF VNx8SF VNx4DF
> + VNx32HF VNx16SF VNx8DF
> + (VNx16BF "TARGET_SVE_BFSCALE")
> + (VNx32BF "TARGET_SVE_BFSCALE")
> +])
> +
> (define_mode_iterator SVE_Fx24 [(VNx16BF "TARGET_SSVE_B16B16")
> (VNx32BF "TARGET_SSVE_B16B16")
> SVE_Fx24_NOBF])
>
> +(define_mode_iterator SVE_BFx24 [VNx16BF VNx32BF])
> +
> (define_mode_iterator SVE_SFx24 [VNx8SF VNx16SF])
>
> ;; The modes used to represent different ZA access sizes.
> @@ -816,6 +828,7 @@ (define_c_enum "unspec"
> UNSPEC_FMAX ; Used in aarch64-simd.md.
> UNSPEC_FMAXNMV ; Used in aarch64-simd.md.
> UNSPEC_FMAXV ; Used in aarch64-simd.md.
> + UNSPEC_FMUL ; Used in aarch64-sve.md.
> UNSPEC_FMIN ; Used in aarch64-simd.md.
> UNSPEC_FMINNMV ; Used in aarch64-simd.md.
> UNSPEC_FMINV ; Used in aarch64-simd.md.
> @@ -2202,6 +2215,8 @@ (define_mode_attr V_INT_EQUIV [(V8QI "V8QI") (V16QI
> "V16QI")
> (VNx16QI "VNx16QI")
> (VNx8HI "VNx8HI") (VNx8HF "VNx8HI")
> (VNx8BF "VNx8HI")
> + (VNx16BF "VNx16HI")
> + (VNx32BF "VNx32HI")
> (VNx4SI "VNx4SI") (VNx4SF "VNx4SI")
> (VNx2DI "VNx2DI") (VNx2DF "VNx2DI")
> (VNx8SF "VNx8SI") (VNx16SF "VNx16SI")
> @@ -2783,17 +2798,20 @@ (define_mode_attr vec_or_offset [(V8QI "vec") (V16QI
> "vec") (V4HI "vec")
> (V8HI "vec") (V2SI "vec") (V4SI "vec")
> (V2DI "vec") (DI "offset")])
>
> -(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
> +(define_mode_attr b [(BF "b") (HF "") (SF "") (DF "")
> + (V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
> (VNx2BF "b") (VNx2HF "") (VNx2SF "")
> (VNx4BF "b") (VNx4HF "") (VNx4SF "")
> (VNx8BF "b") (VNx8HF "") (VNx2DF "")
> (VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "")
> (VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")])
>
> -(define_mode_attr is_bf16 [(VNx2BF "true") (VNx4BF "true") (VNx8BF "true")
> - (VNx2HF "false") (VNx4HF "false") (VNx8HF "false")
> - (VNx2SF "false") (VNx4SF "false")
> - (VNx2DF "false")])
> +(define_mode_attr is_bf16 [
> + (VNx2BF "true") (VNx4BF "true") (VNx8BF "true") (VNx16BF "true")
> (VNx32BF "true")
> + (VNx2HF "false") (VNx4HF "false") (VNx8HF "false") (VNx16HF "false")
> (VNx32HF "false")
> + (VNx2SF "false") (VNx4SF "false") (VNx8SF "false") (VNx16SF "false")
> + (VNx2DF "false") (VNx4DF "false") (VNx8DF "false")
> +])
>
> (define_mode_attr aligned_operand [(VNx16QI "register_operand")
> (VNx8HI "register_operand")
> @@ -2820,22 +2838,29 @@ (define_mode_attr LD1_EXTENDQ_MEM [(VNx4SI "VNx1SI")
> (VNx4SF "VNx1SI")
>
> ;; Maps the output type of svscale to the corresponding int vector type in
> the
> ;; second argument.
> -(define_mode_attr SVSCALE_SINGLE_INTARG [(VNx16HF "VNx8HI") ;; f16_x2 -> s16
> - (VNx32HF "VNx8HI") ;; f16_x4 -> s16
> - (VNx8SF "VNx4SI") ;; f32_x2 -> s32
> - (VNx16SF "VNx4SI") ;; f32_x4 -> s32
> - (VNx4DF "VNx2DI") ;; f64_x2 -> s64
> - (VNx8DF "VNx2DI") ;; f64_x4 -> s64
> +(define_mode_attr SVSCALE_SINGLE_INTARG [
> + (VNx16HF "VNx8HI") ;; f16_x2 -> s16
> + (VNx32HF "VNx8HI") ;; f16_x4 -> s16
> + (VNx16BF "VNx8HI") ;; bf16_x2 -> s16
> + (VNx32BF "VNx8HI") ;; bf16_x4 -> s16
> + (VNx8SF "VNx4SI") ;; f32_x2 -> s32
> + (VNx16SF "VNx4SI") ;; f32_x4 -> s32
> + (VNx4DF "VNx2DI") ;; f64_x2 -> s64
> + (VNx8DF "VNx2DI") ;; f64_x4 -> s64
> ])
>
> -(define_mode_attr SVSCALE_INTARG [(VNx16HF "VNx16HI") ;; f16_x2 -> s16x2
> - (VNx32HF "VNx32HI") ;; f16_x4 -> s16x4
> - (VNx8SF "VNx8SI") ;; f32_x2 -> s32_x2
> - (VNx16SF "VNx16SI") ;; f32_x4 -> s32_x4
> - (VNx4DF "VNx4DI") ;; f64_x2 -> s64_x2
> - (VNx8DF "VNx8DI") ;; f64_x4 -> s64_x4
> +(define_mode_attr SVSCALE_INTARG [
> + (VNx16HF "VNx16HI") ;; f16_x2 -> s16x2
> + (VNx32HF "VNx32HI") ;; f16_x4 -> s16x4
> + (VNx16BF "VNx16HI") ;; bf16_x2 -> s16x2
> + (VNx32BF "VNx32HI") ;; bf16_x4 -> s16x4
> + (VNx8SF "VNx8SI") ;; f32_x2 -> s32_x2
> + (VNx16SF "VNx16SI") ;; f32_x4 -> s32_x4
> + (VNx4DF "VNx4DI") ;; f64_x2 -> s64_x2
> + (VNx8DF "VNx8DI") ;; f64_x4 -> s64_x4
> ])
>
> +
> ;; -------------------------------------------------------------------
> ;; Code Iterators
> ;; -------------------------------------------------------------------
> @@ -3635,6 +3660,8 @@ (define_int_iterator SVE_COND_FP_ADD [UNSPEC_COND_FADD])
> (define_int_iterator SVE_COND_FP_SUB [UNSPEC_COND_FSUB])
> (define_int_iterator SVE_COND_FP_MUL [UNSPEC_COND_FMUL])
>
> +(define_int_iterator SVE_FP_MUL [UNSPEC_FMUL])
> +
> (define_int_iterator SVE_COND_FP_BINARY_I1 [UNSPEC_COND_FMAX
> UNSPEC_COND_FMAXNM
> UNSPEC_COND_FMIN
> @@ -4195,6 +4222,7 @@ (define_int_attr optab [(UNSPEC_ANDF "and")
> (UNSPEC_FMINNMQV "fminnmqv")
> (UNSPEC_FMINNMV "smin")
> (UNSPEC_FMINV "smin_nan")
> + (UNSPEC_FMUL "fmul")
> (UNSPEC_SMUL_HIGHPART "smulh")
> (UNSPEC_UMUL_HIGHPART "umulh")
> (UNSPEC_FMLA "fma")
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index a25b43b3e85..cf7930a4858 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -23146,7 +23146,8 @@ Enable the Checked Pointer Arithmetic instructions.
> @item sve-b16b16
> Enable the SVE non-widening brain floating-point (@code{bf16}) extension.
> This only has an effect when @code{sve2} or @code{sme2} are also enabled.
> -
> +@item sve-bfscale
> +Enable the SVE_BFSCALE extension.
> @end table
>
> Feature @option{crypto} implies @option{aes}, @option{sha2}, and
> @option{simd},
> diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
> b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
> index 284c2a23252..70f59b47aee 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
> @@ -111,6 +111,11 @@
> #error Foo
> #endif
>
> +#pragma GCC target "+nothing+sve-bfscale"
> +#ifndef __ARM_FEATURE_SVE_BFSCALE
> +#error "__ARM_FEATURE_SVE_BFSCALE should be defined but isn't"
> +#endif
> +
> #pragma GCC target "+nothing+sve2+sme-f8f16"
> #ifndef __ARM_FEATURE_SME_F8F16
> #error Foo
> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c
> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c
> new file mode 100644
> index 00000000000..b89c1ac98bf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c
> @@ -0,0 +1,191 @@
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
Here and in the rest of the tests, you want to have something like:
/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
so that they don't fail if gas isn't recent enough to know about this
extension.
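For example, the start of mul_bf16_x2.c would then look something like this
(assuming sve-bfscale is in sve_exts, as in your target-supports.exp change,
so the effective-target keyword gets generated):

/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */

#include "test_sme2_acle.h"
#pragma GCC target "+sve-bfscale"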
Thanks,
Artemiy
> +
> +#include "test_sme2_acle.h"
> +#pragma GCC target "+sve-bfscale"
> +
> +/*
> +** mul_z0_z0_z4:
> +** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
> +** ret
> +*/
> +TEST_XN (mul_z0_z0_z4, svbfloat16x2_t, z0,
> + svmul_bf16_x2 (z0, z4),
> + svmul (z0, z4))
> +
> +/*
> +** mul_z0_z4_z0:
> +** bfmul {z0\.h - z1\.h}, {z4\.h - z5\.h}, {z0\.h - z1\.h}
> +** ret
> +*/
> +TEST_XN (mul_z0_z4_z0, svbfloat16x2_t, z0,
> + svmul_bf16_x2 (z4, z0),
> + svmul (z4, z0))
> +
> +/*
> +** mul_z0_z4_z28:
> +** bfmul {z0\.h - z1\.h}, {z4\.h - z5\.h}, {z28\.h - z29\.h}
> +** ret
> +*/
> +TEST_XN (mul_z0_z4_z28, svbfloat16x2_t, z0,
> + svmul_bf16_x2 (z4, z28),
> + svmul (z4, z28))
> +
> +/*
> +** mul_z18_z18_z4:
> +** bfmul {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
> +** ret
> +*/
> +TEST_XN (mul_z18_z18_z4, svbfloat16x2_t, z18,
> + svmul_bf16_x2 (z18, z4),
> + svmul (z18, z4))
> +
> +/*
> +** mul_z23_z23_z18:
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul [^\n]+, {z18\.h - z19\.h}
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN (mul_z23_z23_z18, svbfloat16x2_t, z23,
> + svmul_bf16_x2 (z23, z18),
> + svmul (z23, z18))
> +
> +/*
> +** mul_z28_z28_z0:
> +** bfmul {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z0\.h - z1\.h}
> +** ret
> +*/
> +TEST_XN (mul_z28_z28_z0, svbfloat16x2_t, z28,
> + svmul_bf16_x2 (z28, z0),
> + svmul (z28, z0))
> +
> +/*
> +** mul_z0_z0_z18:
> +** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z18\.h - z19\.h}
> +** ret
> +*/
> +TEST_XN (mul_z0_z0_z18, svbfloat16x2_t, z0,
> + svmul_bf16_x2 (z0, z18),
> + svmul (z0, z18))
> +
> +/*
> +** mul_z4_z4_z23:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
> +** |
> +** bfmul {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_XN (mul_z4_z4_z23, svbfloat16x2_t, z4,
> + svmul_bf16_x2 (z4, z23),
> + svmul (z4, z23))
> +
> +/*
> +** mul_single_z24_z24_z0:
> +** bfmul {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z24_z0, svbfloat16x2_t, svbfloat16_t, z24,
> + svmul_single_bf16_x2 (z24, z0),
> + svmul (z24, z0))
> +
> +/*
> +** mul_single_z24_z28_z0:
> +** bfmul {z24\.h - z25\.h}, {z28\.h - z29\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z28_z0, svbfloat16x2_t, svbfloat16_t, z24,
> + svmul_single_bf16_x2 (z28, z0),
> + svmul (z28, z0))
> +
> +/*
> +** mul_single_z24_z1_z0:
> +** (
> +** mov z30\.d, z1\.d
> +** mov z31\.d, z2\.d
> +** |
> +** mov z31\.d, z2\.d
> +** mov z30\.d, z1\.d
> +** )
> +** bfmul {z24\.h - z25\.h}, {z30\.h - z31\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z1_z0, svbfloat16x2_t, svbfloat16_t, z24,
> + svmul_single_bf16_x2 (z1, z0),
> + svmul (z1, z0))
> +
> +/*
> +** mul_single_z1_z24_z0:
> +** bfmul {z30\.h - z31\.h}, {z24\.h - z25\.h}, z0\.h
> +** (
> +** mov z2\.d, z31\.d
> +** mov z1\.d, z30\.d
> +** |
> +** mov z1\.d, z30\.d
> +** mov z2\.d, z31\.d
> +** )
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z1_z24_z0, svbfloat16x2_t, svbfloat16_t, z1,
> + svmul_single_bf16_x2 (z24, z0),
> + svmul (z24, z0))
> +
> +/*
> +** mul_single_z1_z1_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z1_z1_z0, svbfloat16x2_t, svbfloat16_t, z1,
> + svmul_single_bf16_x2 (z1, z0),
> + svmul (z1, z0))
> +
> +/*
> +** mul_single_z18_z18_z0:
> +** bfmul {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z18_z18_z0, svbfloat16x2_t, svbfloat16_t, z18,
> + svmul_single_bf16_x2 (z18, z0),
> + svmul (z18, z0))
> +
> +/*
> +** mul_single_awkward:
> +** ...
> +** bfmul {z0\.h - z1\.h}, {z30\.h - z31\.h}, z[0-9]+\.h
> +** ret
> +*/
> +TEST_XN_SINGLE_AWKWARD (mul_single_awkward, svbfloat16x2_t, svbfloat16_t,
> + z0_res = svmul_single_bf16_x2 (z1, z0),
> + z0_res = svmul (z1, z0))
> +
> +/*
> +** mul_single_z0_z0_z15:
> +** ...
> +** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
> +** ...
> +** ret
> +*/
> +TEST_XN_SINGLE_Z15 (mul_single_z0_z0_z15, svbfloat16x2_t, svbfloat16_t,
> + z0 = svmul_single_bf16_x2 (z0, z15),
> + z0 = svmul (z0, z15))
> +
> +/*
> +** mul_single_z24_z24_z16:
> +** mov (z[0-7])\.d, z16\.d
> +** bfmul {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z24_z16, svbfloat16x2_t, svbfloat16_t, z24,
> + svmul_single_bf16_x2 (z24, z16),
> + svmul (z24, z16))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c
> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c
> new file mode 100644
> index 00000000000..1078a65e7fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c
> @@ -0,0 +1,225 @@
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sme2_acle.h"
> +#pragma GCC target "+sve-bfscale"
> +
> +/*
> +** mul_z0_z0_z4:
> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
> +** ret
> +*/
> +TEST_XN (mul_z0_z0_z4, svbfloat16x4_t, z0,
> + svmul_bf16_x4 (z0, z4),
> + svmul (z0, z4))
> +
> +/*
> +** mul_z0_z4_z0:
> +** bfmul {z0\.h - z3\.h}, {z4\.h - z7\.h}, {z0\.h - z3\.h}
> +** ret
> +*/
> +TEST_XN (mul_z0_z4_z0, svbfloat16x4_t, z0,
> + svmul_bf16_x4 (z4, z0),
> + svmul (z4, z0))
> +
> +/*
> +** mul_z0_z4_z28:
> +** bfmul {z0\.h - z3\.h}, {z4\.h - z7\.h}, {z28\.h - z31\.h}
> +** ret
> +*/
> +TEST_XN (mul_z0_z4_z28, svbfloat16x4_t, z0,
> + svmul_bf16_x4 (z4, z28),
> + svmul (z4, z28))
> +
> +/*
> +** mul_z18_z18_z4:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul [^\n]+, {z4\.h - z7\.h}
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN (mul_z18_z18_z4, svbfloat16x4_t, z18,
> + svmul_bf16_x4 (z18, z4),
> + svmul (z18, z4))
> +
> +/*
> +** mul_z23_z23_z28:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul [^\n]+, {z28\.h - z31\.h}
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN (mul_z23_z23_z28, svbfloat16x4_t, z23,
> + svmul_bf16_x4 (z23, z28),
> + svmul (z23, z28))
> +
> +/*
> +** mul_z28_z28_z0:
> +** bfmul {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z0\.h - z3\.h}
> +** ret
> +*/
> +TEST_XN (mul_z28_z28_z0, svbfloat16x4_t, z28,
> + svmul_bf16_x4 (z28, z0),
> + svmul (z28, z0))
> +
> +/*
> +** mul_z0_z0_z18:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
> +** |
> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_XN (mul_z0_z0_z18, svbfloat16x4_t, z0,
> + svmul_bf16_x4 (z0, z18),
> + svmul (z0, z18))
> +
> +/*
> +** mul_z4_z4_z23:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
> +** |
> +** bfmul {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_XN (mul_z4_z4_z23, svbfloat16x4_t, z4,
> + svmul_bf16_x4 (z4, z23),
> + svmul (z4, z23))
> +
> +/*
> +** mul_single_z24_z24_z0:
> +** bfmul {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z24_z0, svbfloat16x4_t, svbfloat16_t, z24,
> + svmul_single_bf16_x4 (z24, z0),
> + svmul (z24, z0))
> +
> +/*
> +** mul_single_z24_z28_z0:
> +** bfmul {z24\.h - z27\.h}, {z28\.h - z31\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z28_z0, svbfloat16x4_t, svbfloat16_t, z24,
> + svmul_single_bf16_x4 (z28, z0),
> + svmul (z28, z0))
> +
> +/*
> +** mul_single_z24_z1_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul {z24\.h - z27\.h}, {z28\.h - z31\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z1_z0, svbfloat16x4_t, svbfloat16_t, z24,
> + svmul_single_bf16_x4 (z1, z0),
> + svmul (z1, z0))
> +
> +/*
> +** mul_single_z1_z24_z0:
> +** bfmul {z28\.h - z31\.h}, {z24\.h - z27\.h}, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z1_z24_z0, svbfloat16x4_t, svbfloat16_t, z1,
> + svmul_single_bf16_x4 (z24, z0),
> + svmul (z24, z0))
> +
> +/*
> +** mul_single_z1_z1_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z1_z1_z0, svbfloat16x4_t, svbfloat16_t, z1,
> + svmul_single_bf16_x4 (z1, z0),
> + svmul (z1, z0))
> +
> +/*
> +** mul_single_z18_z18_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfmul [^\n]+, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z18_z18_z0, svbfloat16x4_t, svbfloat16_t, z18,
> + svmul_single_bf16_x4 (z18, z0),
> + svmul (z18, z0))
> +
> +/*
> +** mul_single_awkward:
> +** ...
> +** bfmul {z0\.h - z3\.h}, {z[0-9]+\.h - z[0-9]+\.h}, z[0-9]+\.h
> +** ret
> +*/
> +TEST_XN_SINGLE_AWKWARD (mul_single_awkward, svbfloat16x4_t, svbfloat16_t,
> + z0_res = svmul_single_bf16_x4 (z1, z0),
> + z0_res = svmul (z1, z0))
> +
> +/*
> +** mul_single_z0_z0_z15:
> +** ...
> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
> +** ...
> +** ret
> +*/
> +TEST_XN_SINGLE_Z15 (mul_single_z0_z0_z15, svbfloat16x4_t, svbfloat16_t,
> + z0 = svmul_single_bf16_x4 (z0, z15),
> + z0 = svmul (z0, z15))
> +
> +/*
> +** mul_single_z24_z24_z16:
> +** mov (z[0-7])\.d, z16\.d
> +** bfmul {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (mul_single_z24_z24_z16, svbfloat16x4_t, svbfloat16_t, z24,
> + svmul_single_bf16_x4 (z24, z16),
> + svmul (z24, z16))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c
> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c
> new file mode 100644
> index 00000000000..aabfbfddfc8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c
> @@ -0,0 +1,192 @@
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sme2_acle.h"
> +#pragma GCC target "+sve-bfscale"
> +
> +/*
> +** bfscale_z0_z0_z4:
> +** bfscale {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z0_z0_z4, svbfloat16x2_t, svint16x2_t, z0,
> + svscale_bf16_x2 (z0, z4),
> + svscale (z0, z4))
> +
> +/*
> +** bfscale_z4_z4_z0:
> +** bfscale {z4\.h - z5\.h}, {z4\.h - z5\.h}, {z0\.h - z1\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z4_z4_z0, svint16x2_t, svbfloat16x2_t, z4,
> + svscale_bf16_x2 (z4, z0),
> + svscale (z4, z0))
> +
> +/*
> +** bfscale_z18_z18_z4:
> +** bfscale {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z18_z18_z4, svbfloat16x2_t, svint16x2_t, z18,
> + svscale_bf16_x2 (z18, z4),
> + svscale (z18, z4))
> +
> +/*
> +** bfscale_z23_z23_z18:
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale [^\n]+, {z18\.h - z19\.h}
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z23_z23_z18, svint16x2_t, svbfloat16x2_t, z23,
> + svscale_bf16_x2 (z23, z18),
> + svscale (z23, z18))
> +
> +
> +/*
> +** bfscale_z28_z28_z4:
> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z4\.h - z5\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z28_z28_z4, svbfloat16x2_t, svint16x2_t, z28,
> + svscale_bf16_x2 (z28, z4),
> + svscale (z28, z4))
> +
> +/*
> +** bfscale_z4_z4_z18:
> +** bfscale {z4\.h - z5\.h}, {z4\.h - z5\.h}, {z18\.h - z19\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z4_z4_z18, svint16x2_t, svbfloat16x2_t, z4,
> + svscale_bf16_x2 (z4, z18),
> + svscale (z4, z18))
> +
> +/*
> +** bfscale_z28_28_z23:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, [^\n]+
> +** |
> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z28_28_z23, svbfloat16x2_t, svint16x2_t, z28,
> + svscale_bf16_x2 (z28, z23),
> + svscale (z28, z23))
> +
> +/*
> +** bfscale_single_z24_z24_z0:
> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z24_z0, svbfloat16x2_t, svint16_t, z24,
> + svscale_single_bf16_x2 (z24, z0),
> + svscale (z24, z0))
> +
> +/*
> +** bfscale_single_z24_z28_z0:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
> +** |
> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z28_z0, svbfloat16x2_t, svint16_t, z24,
> + svscale_single_bf16_x2 (z28, z0),
> + svscale (z28, z0))
> +
> +/*
> +** bfscale_single_z24_z1_z0:
> +** (
> +** mov z24\.d, z1\.d
> +** mov z25\.d, z2\.d
> +** |
> +** mov z25\.d, z2\.d
> +** mov z24\.d, z1\.d
> +** )
> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z1_z0, svbfloat16x2_t, svint16_t, z24,
> + svscale_single_bf16_x2 (z1, z0),
> + svscale (z1, z0))
> +
> +/*
> +** bfscale_single_z1_z24_z0:
> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
> +** (
> +** mov z1\.d, z24\.d
> +** mov z2\.d, z25\.d
> +** |
> +** mov z2\.d, z25\.d
> +** mov z1\.d, z24\.d
> +** )
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z1_z24_z0, svbfloat16x2_t, svint16_t, z1,
> + svscale_single_bf16_x2 (z24, z0),
> + svscale (z24, z0))
> +
> +/*
> +** bfscale_single_z1_z1_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z1_z1_z0, svbfloat16x2_t, svint16_t, z1,
> + svscale_single_bf16_x2 (z1, z0),
> + svscale (z1, z0))
> +
> +/*
> +** bfscale_single_z18_z18_z0:
> +** bfscale {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z18_z18_z0, svbfloat16x2_t, svint16_t, z18,
> + svscale_single_bf16_x2 (z18, z0),
> + svscale (z18, z0))
> +
> +/*
> +** bfscale_single_awkward:
> +** ...
> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
> +** ...
> +** ret
> +*/
> +TEST_XN_SINGLE_AWKWARD (bfscale_single_awkward, svbfloat16x2_t, svint16_t,
> + z0_res = svscale_single_bf16_x2 (z1, z0),
> + z0_res = svscale (z1, z0))
> +
> +/*
> +** bfscale_single_z0_z0_z15:
> +** ...
> +** bfscale {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
> +** ...
> +** ret
> +*/
> +TEST_XN_SINGLE_Z15 (bfscale_single_z0_z0_z15, svbfloat16x2_t, svint16_t,
> + z0 = svscale_single_bf16_x2 (z0, z15),
> + z0 = svscale (z0, z15))
> +
> +/*
> +** bfscale_single_z24_z24_z16:
> +** mov (z[0-7])\.d, z16\.d
> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z24_z16, svbfloat16x2_t, svint16_t, z24,
> + svscale_single_bf16_x2 (z24, z16),
> + svscale (z24, z16))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c
> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c
> new file mode 100644
> index 00000000000..3103abf48fb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c
> @@ -0,0 +1,229 @@
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sme2_acle.h"
> +#pragma GCC target "+sve-bfscale"
> +
> +/*
> +** bfscale_z0_z0_z4:
> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z0_z0_z4, svbfloat16x4_t, svint16x4_t, z0,
> + svscale_bf16_x4 (z0, z4),
> + svscale (z0, z4))
> +
> +/*
> +** bfscale_z4_z4_z0:
> +** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, {z0\.h - z3\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z4_z4_z0, svint16x4_t, svbfloat16x4_t, z4,
> + svscale_bf16_x4 (z4, z0),
> + svscale (z4, z0))
> +
> +/*
> +** bfscale_z18_z18_z4:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale [^\n]+, {z4\.h - z7\.h}
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z18_z18_z4, svbfloat16x4_t, svint16x4_t, z18,
> + svscale_bf16_x4 (z18, z4),
> + svscale (z18, z4))
> +
> +/*
> +** bfscale_z23_z23_z28:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale [^\n]+, {z28\.h - z31\.h}
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z23_z23_z28, svint16x4_t, svbfloat16x4_t, z23,
> + svscale_bf16_x4 (z23, z28),
> + svscale (z23, z28))
> +
> +/*
> +** bfscale_z28_z28_z4:
> +** bfscale {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z4\.h - z7\.h}
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z28_z28_z4, svbfloat16x4_t, svint16x4_t, z28,
> + svscale_bf16_x4 (z28, z4),
> + svscale (z28, z4))
> +
> +/*
> +** bfscale_z4_z4_z18:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
> +** |
> +** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z4_z4_z18, svint16x4_t, svbfloat16x4_t, z4,
> + svscale_bf16_x4 (z4, z18),
> + svscale (z4, z18))
> +
> +/*
> +** bfscale_z0_z0_z23:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
> +** |
> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_DUAL_XN (bfscale_z0_z0_z23, svbfloat16x4_t, svint16x4_t, z0,
> + svscale_bf16_x4 (z0, z23),
> + svscale (z0, z23))
> +
> +/*
> +** bfscale_single_z24_z24_z0:
> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z24_z0, svbfloat16x4_t, svint16_t, z24,
> + svscale_single_bf16_x4 (z24, z0),
> + svscale (z24, z0))
> +
> +/*
> +** bfscale_single_z24_z28_z0:
> +** (
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
> +** |
> +** bfscale {z28\.h - z31\.h}, {z28\.h - z31\.h}, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** )
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z28_z0, svbfloat16x4_t, svint16_t, z24,
> + svscale_single_bf16_x4 (z28, z0),
> + svscale (z28, z0))
> +
> +/*
> +** bfscale_single_z24_z1_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z1_z0, svbfloat16x4_t, svint16_t, z24,
> + svscale_single_bf16_x4 (z1, z0),
> + svscale (z1, z0))
> +
> +/*
> +** bfscale_single_z1_z24_z0:
> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z1_z24_z0, svbfloat16x4_t, svint16_t, z1,
> + svscale_single_bf16_x4 (z24, z0),
> + svscale (z24, z0))
> +
> +/*
> +** bfscale_single_z1_z1_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z1_z1_z0, svbfloat16x4_t, svint16_t, z1,
> + svscale_single_bf16_x4 (z1, z0),
> + svscale (z1, z0))
> +
> +/*
> +** bfscale_single_z18_z18_z0:
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** bfscale [^\n]+, z0\.h
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** mov [^\n]+
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z18_z18_z0, svbfloat16x4_t, svint16_t, z18,
> + svscale_single_bf16_x4 (z18, z0),
> + svscale (z18, z0))
> +
> +/*
> +** bfscale_single_awkward:
> +** ...
> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
> +** ...
> +** ret
> +*/
> +TEST_XN_SINGLE_AWKWARD (bfscale_single_awkward, svbfloat16x4_t, svint16_t,
> + z0_res = svscale_single_bf16_x4 (z1, z0),
> + z0_res = svscale (z1, z0))
> +
> +/*
> +** bfscale_single_z0_z0_z15:
> +** ...
> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
> +** ...
> +** ret
> +*/
> +TEST_XN_SINGLE_Z15 (bfscale_single_z0_z0_z15, svbfloat16x4_t, svint16_t,
> + z0 = svscale_single_bf16_x4 (z0, z15),
> + z0 = svscale (z0, z15))
> +
> +/*
> +** bfscale_single_z24_z24_z16:
> +** mov (z[0-7])\.d, z16\.d
> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
> +** ret
> +*/
> +TEST_XN_SINGLE (bfscale_single_z24_z24_z16, svbfloat16x4_t, svint16_t, z24,
> + svscale_single_bf16_x4 (z24, z16),
> + svscale (z24, z16))
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
> b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
> new file mode 100644
> index 00000000000..021cd44fe81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
> @@ -0,0 +1,335 @@
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +#include "test_sve_acle.h"
> +#pragma GCC target "+sve2,+sve-bfscale"
> +#ifdef STREAMING_COMPATIBLE
> +#pragma GCC target "+sme2"
> +#endif
> +
> +/*
> +** scale_bf16_m_tied1:
> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
> +** ret
> +*/
> +TEST_DUAL_Z (scale_bf16_m_tied1, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_m (p0, z0, z4),
> + z0 = svscale_m (p0, z0, z4))
> +
> +/*
> +** scale_bf16_m_tied2:
> +** mov (z[0-9]+)\.d, z0\.d
> +** movprfx z0, z4
> +** bfscale z0\.h, p0/m, z0\.h, \1\.h
> +** ret
> +*/
> +TEST_DUAL_Z_REV (scale_bf16_m_tied2, svbfloat16_t, svint16_t,
> + z0_res = svscale_bf16_m (p0, z4, z0),
> + z0_res = svscale_m (p0, z4, z0))
> +
> +/*
> +** scale_bf16_m_untied:
> +** movprfx z0, z1
> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
> +** ret
> +*/
> +TEST_DUAL_Z (scale_bf16_m_untied, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_m (p0, z1, z4),
> + z0 = svscale_m (p0, z1, z4))
> +
> +/*
> +** scale_w0_bf16_m_tied1:
> +** mov (z[0-9]+\.h), w0
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_ZX (scale_w0_bf16_m_tied1, svbfloat16_t, int16_t,
> + z0 = svscale_n_bf16_m (p0, z0, x0),
> + z0 = svscale_m (p0, z0, x0))
> +
> +/*
> +** scale_w0_bf16_m_untied:
> +** mov (z[0-9]+\.h), w0
> +** movprfx z0, z1
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_ZX (scale_w0_bf16_m_untied, svbfloat16_t, int16_t,
> + z0 = svscale_n_bf16_m (p0, z1, x0),
> + z0 = svscale_m (p0, z1, x0))
> +
> +/*
> +** scale_3_bf16_m_tied1:
> +** mov (z[0-9]+\.h), #3
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_3_bf16_m_tied1, svbfloat16_t,
> + z0 = svscale_n_bf16_m (p0, z0, 3),
> + z0 = svscale_m (p0, z0, 3))
> +
> +/*
> +** scale_3_bf16_m_untied:
> +** mov (z[0-9]+\.h), #3
> +** movprfx z0, z1
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_3_bf16_m_untied, svbfloat16_t,
> + z0 = svscale_n_bf16_m (p0, z1, 3),
> + z0 = svscale_m (p0, z1, 3))
> +
> +/*
> +** scale_m3_bf16_m:
> +** mov (z[0-9]+\.h), #-3
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_m3_bf16_m, svbfloat16_t,
> + z0 = svscale_n_bf16_m (p0, z0, -3),
> + z0 = svscale_m (p0, z0, -3))
> +
> +/*
> +** scale_bf16_z_tied1:
> +** movprfx z0\.h, p0/z, z0\.h
> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
> +** ret
> +*/
> +TEST_DUAL_Z (scale_bf16_z_tied1, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_z (p0, z0, z4),
> + z0 = svscale_z (p0, z0, z4))
> +
> +/*
> +** scale_bf16_z_tied2:
> +** mov (z[0-9]+)\.d, z0\.d
> +** movprfx z0\.h, p0/z, z4\.h
> +** bfscale z0\.h, p0/m, z0\.h, \1\.h
> +** ret
> +*/
> +TEST_DUAL_Z_REV (scale_bf16_z_tied2, svbfloat16_t, svint16_t,
> + z0_res = svscale_bf16_z (p0, z4, z0),
> + z0_res = svscale_z (p0, z4, z0))
> +
> +/*
> +** scale_bf16_z_untied:
> +** movprfx z0\.h, p0/z, z1\.h
> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
> +** ret
> +*/
> +TEST_DUAL_Z (scale_bf16_z_untied, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_z (p0, z1, z4),
> + z0 = svscale_z (p0, z1, z4))
> +
> +/*
> +** scale_w0_bf16_z_tied1:
> +** mov (z[0-9]+\.h), w0
> +** movprfx z0\.h, p0/z, z0\.h
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_ZX (scale_w0_bf16_z_tied1, svbfloat16_t, int16_t,
> + z0 = svscale_n_bf16_z (p0, z0, x0),
> + z0 = svscale_z (p0, z0, x0))
> +
> +/*
> +** scale_w0_bf16_z_untied:
> +** mov (z[0-9]+\.h), w0
> +** movprfx z0\.h, p0/z, z1\.h
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_ZX (scale_w0_bf16_z_untied, svbfloat16_t, int16_t,
> + z0 = svscale_n_bf16_z (p0, z1, x0),
> + z0 = svscale_z (p0, z1, x0))
> +
> +/*
> +** scale_3_bf16_z_tied1:
> +** mov (z[0-9]+\.h), #3
> +** movprfx z0\.h, p0/z, z0\.h
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_3_bf16_z_tied1, svbfloat16_t,
> + z0 = svscale_n_bf16_z (p0, z0, 3),
> + z0 = svscale_z (p0, z0, 3))
> +
> +/*
> +** scale_3_bf16_z_untied:
> +** mov (z[0-9]+\.h), #3
> +** movprfx z0\.h, p0/z, z1\.h
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_3_bf16_z_untied, svbfloat16_t,
> + z0 = svscale_n_bf16_z (p0, z1, 3),
> + z0 = svscale_z (p0, z1, 3))
> +
> +/*
> +** scale_m3_bf16_z:
> +** mov (z[0-9]+\.h), #-3
> +** movprfx z0\.h, p0/z, z0\.h
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_m3_bf16_z, svbfloat16_t,
> + z0 = svscale_n_bf16_z (p0, z0, -3),
> + z0 = svscale_z (p0, z0, -3))
> +
> +/*
> +** scale_bf16_x_tied1:
> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
> +** ret
> +*/
> +TEST_DUAL_Z (scale_bf16_x_tied1, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_x (p0, z0, z4),
> + z0 = svscale_x (p0, z0, z4))
> +
> +/*
> +** scale_bf16_x_tied2:
> +** mov (z[0-9]+)\.d, z0\.d
> +** movprfx z0, z4
> +** bfscale z0\.h, p0/m, z0\.h, \1\.h
> +** ret
> +*/
> +TEST_DUAL_Z_REV (scale_bf16_x_tied2, svbfloat16_t, svint16_t,
> + z0_res = svscale_bf16_x (p0, z4, z0),
> + z0_res = svscale_x (p0, z4, z0))
> +
> +/*
> +** scale_bf16_x_untied:
> +** movprfx z0, z1
> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
> +** ret
> +*/
> +TEST_DUAL_Z (scale_bf16_x_untied, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_x (p0, z1, z4),
> + z0 = svscale_x (p0, z1, z4))
> +
> +/*
> +** scale_w0_bf16_x_tied1:
> +** mov (z[0-9]+\.h), w0
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_ZX (scale_w0_bf16_x_tied1, svbfloat16_t, int16_t,
> + z0 = svscale_n_bf16_x (p0, z0, x0),
> + z0 = svscale_x (p0, z0, x0))
> +
> +/*
> +** scale_w0_bf16_x_untied:
> +** mov (z[0-9]+\.h), w0
> +** movprfx z0, z1
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_ZX (scale_w0_bf16_x_untied, svbfloat16_t, int16_t,
> + z0 = svscale_n_bf16_x (p0, z1, x0),
> + z0 = svscale_x (p0, z1, x0))
> +
> +/*
> +** scale_3_bf16_x_tied1:
> +** mov (z[0-9]+\.h), #3
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_3_bf16_x_tied1, svbfloat16_t,
> + z0 = svscale_n_bf16_x (p0, z0, 3),
> + z0 = svscale_x (p0, z0, 3))
> +
> +/*
> +** scale_3_bf16_x_untied:
> +** mov (z[0-9]+\.h), #3
> +** movprfx z0, z1
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_3_bf16_x_untied, svbfloat16_t,
> + z0 = svscale_n_bf16_x (p0, z1, 3),
> + z0 = svscale_x (p0, z1, 3))
> +
> +/*
> +** scale_m3_bf16_x:
> +** mov (z[0-9]+\.h), #-3
> +** bfscale z0\.h, p0/m, z0\.h, \1
> +** ret
> +*/
> +TEST_UNIFORM_Z (scale_m3_bf16_x, svbfloat16_t,
> + z0 = svscale_n_bf16_x (p0, z0, -3),
> + z0 = svscale_x (p0, z0, -3))
> +
> +/*
> +** ptrue_scale_bf16_x_tied1:
> +** ...
> +** ptrue p[0-9]+\.b[^\n]*
> +** ...
> +** ret
> +*/
> +TEST_DUAL_Z (ptrue_scale_bf16_x_tied1, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_x (svptrue_b16 (), z0, z4),
> + z0 = svscale_x (svptrue_b16 (), z0, z4))
> +
> +/*
> +** ptrue_scale_bf16_x_tied2:
> +** ...
> +** ptrue p[0-9]+\.b[^\n]*
> +** ...
> +** ret
> +*/
> +TEST_DUAL_Z_REV (ptrue_scale_bf16_x_tied2, svbfloat16_t, svint16_t,
> + z0_res = svscale_bf16_x (svptrue_b16 (), z4, z0),
> + z0_res = svscale_x (svptrue_b16 (), z4, z0))
> +
> +/*
> +** ptrue_scale_bf16_x_untied:
> +** ...
> +** ptrue p[0-9]+\.b[^\n]*
> +** ...
> +** ret
> +*/
> +TEST_DUAL_Z (ptrue_scale_bf16_x_untied, svbfloat16_t, svint16_t,
> + z0 = svscale_bf16_x (svptrue_b16 (), z1, z4),
> + z0 = svscale_x (svptrue_b16 (), z1, z4))
> +
> +/*
> +** ptrue_scale_3_bf16_x_tied1:
> +** ...
> +** ptrue p[0-9]+\.b[^\n]*
> +** ...
> +** ret
> +*/
> +TEST_UNIFORM_Z (ptrue_scale_3_bf16_x_tied1, svbfloat16_t,
> + z0 = svscale_n_bf16_x (svptrue_b16 (), z0, 3),
> + z0 = svscale_x (svptrue_b16 (), z0, 3))
> +
> +/*
> +** ptrue_scale_3_bf16_x_untied:
> +** ...
> +** ptrue p[0-9]+\.b[^\n]*
> +** ...
> +** ret
> +*/
> +TEST_UNIFORM_Z (ptrue_scale_3_bf16_x_untied, svbfloat16_t,
> + z0 = svscale_n_bf16_x (svptrue_b16 (), z1, 3),
> + z0 = svscale_x (svptrue_b16 (), z1, 3))
> +
> +/*
> +** ptrue_scale_m3_bf16_x_tied1:
> +** ...
> +** ptrue p[0-9]+\.b[^\n]*
> +** ...
> +** ret
> +*/
> +TEST_UNIFORM_Z (ptrue_scale_m3_bf16_x_tied1, svbfloat16_t,
> + z0 = svscale_n_bf16_x (svptrue_b16 (), z0, -3),
> + z0 = svscale_x (svptrue_b16 (), z0, -3))
> +
> +/*
> +** ptrue_scale_m3_bf16_x_untied:
> +** ...
> +** ptrue p[0-9]+\.b[^\n]*
> +** ...
> +** ret
> +*/
> +TEST_UNIFORM_Z (ptrue_scale_m3_bf16_x_untied, svbfloat16_t,
> + z0 = svscale_n_bf16_x (svptrue_b16 (), z1, -3),
> + z0 = svscale_x (svptrue_b16 (), z1, -3))
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
> new file mode 100644
> index 00000000000..051ff47b3bc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
> @@ -0,0 +1,114 @@
> +// { dg-do compile }
> +// { dg-options "-std=c23 -fsyntax-only" }
> +
> +#pragma GCC target "+sve,+sve2,+sme,+sme2,+sve-bfscale"
> +static_assert (__ARM_FEATURE_SVE2 == 1);
> +static_assert (__ARM_FEATURE_SME2 == 1);
> +static_assert (__ARM_FEATURE_SVE_BFSCALE == 1);
> +#include <arm_sve.h>
> +#include <arm_sme.h>
> +
> +/*
> +- BFSCALE (predicated)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SVE2 != 0
> + svbfloat16_t svscale[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
> + svbfloat16_t svscale[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
> + svbfloat16_t svscale[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
> + svbfloat16_t svscale[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, int16_t zm);
> + svbfloat16_t svscale[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, int16_t zm);
> + svbfloat16_t svscale[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, int16_t zm); */
> +
> +void
> +svscale_predicated_explicit_ok (svbool_t p, svbfloat16_t bf16x1,
> + svint16_t i16x1, int16_t i16)
> +{
> + bf16x1 = svscale_bf16_m (p, bf16x1, i16x1);
> + bf16x1 = svscale_bf16_x (p, bf16x1, i16x1);
> + bf16x1 = svscale_bf16_z (p, bf16x1, i16x1);
> +
> + bf16x1 = svscale_n_bf16_m (p, bf16x1, i16);
> + bf16x1 = svscale_n_bf16_x (p, bf16x1, i16);
> + bf16x1 = svscale_n_bf16_z (p, bf16x1, i16);
> +}
> +
> +void
> +svscale_predicated_inferred_ok (svbool_t p, svbfloat16_t bf16x1,
> + svbfloat16x4_t bf16x4, svint16_t i16x1,
> + int16_t i16)
> +{
> + bf16x1 = svscale_m (p, bf16x1, i16x1);
> + bf16x1 = svscale_x (p, bf16x1, i16x1);
> + bf16x1 = svscale_z (p, bf16x1, i16x1);
> +
> + bf16x1 = svscale_m (p, bf16x1, i16);
> + bf16x1 = svscale_x (p, bf16x1, i16);
> + bf16x1 = svscale_z (p, bf16x1, i16);
> +}
> +
> +/*
> +- BFSCALE (multiple vectors)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svscale[_bf16_x2] (svbfloat16x2_t zdn, svint16x2_t zm) __arm_streaming;
> + svbfloat16x4_t svscale[_bf16_x4] (svbfloat16x4_t zdn, svint16x4_t zm) __arm_streaming;
> +
> +- BFSCALE (multiple and single vector)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svscale[_single_bf16_x2] (svbfloat16x2_t zn, svint16_t zm) __arm_streaming;
> + svbfloat16x4_t svscale[_single_bf16_x4] (svbfloat16x4_t zn, svint16_t zm) __arm_streaming; */
> +
> +void
> +svscale_explicit_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
> + svbfloat16x4_t bf16x4, svint16_t i16x1, svint16x2_t i16x2,
> + svint16x4_t i16x4) __arm_streaming
> +{
> + bf16x2 = svscale_bf16_x2 (bf16x2, i16x2);
> + bf16x4 = svscale_bf16_x4 (bf16x4, i16x4);
> +
> + bf16x2 = svscale_single_bf16_x2 (bf16x2, i16x1);
> + bf16x4 = svscale_single_bf16_x4 (bf16x4, i16x1);
> +}
> +
> +void
> +svscale_inferred_ok (svbfloat16x2_t bf16x2, svbfloat16x4_t bf16x4,
> + svint16_t i16x1, svint16x2_t i16x2,
> + svint16x4_t i16x4) __arm_streaming
> +{
> + bf16x2 = svscale (bf16x2, i16x2);
> + bf16x4 = svscale (bf16x4, i16x4);
> +
> + bf16x2 = svscale (bf16x2, i16x1);
> + bf16x4 = svscale (bf16x4, i16x1);
> +}
> +
> +/*
> +- BFMUL (multiple vectors)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svmul[_bf16_x2] (svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming;
> + svbfloat16x4_t svmul[_bf16_x4] (svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming;
> +
> +- BFMUL (multiple and single vector)
> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
> + svbfloat16x2_t svmul[_single_bf16_x2] (svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming;
> + svbfloat16x4_t svmul[_single_bf16_x4] (svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming; */
> +
> +void
> +svmul_explicit_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
> + svbfloat16x4_t bf16x4) __arm_streaming
> +{
> + svmul_bf16_x2 (bf16x2, bf16x2);
> + svmul_bf16_x4 (bf16x4, bf16x4);
> +
> + svmul_single_bf16_x2 (bf16x2, bf16x1);
> + svmul_single_bf16_x4 (bf16x4, bf16x1);
> +}
> +
> +void
> +svmul_inferred_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
> + svbfloat16x4_t bf16x4) __arm_streaming
> +{
> + svmul (bf16x2, bf16x2);
> + svmul (bf16x4, bf16x4);
> +
> + svmul (bf16x2, bf16x1);
> + svmul (bf16x4, bf16x1);
> +}
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index d335735382c..97e5d6560fa 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -12672,7 +12672,7 @@ set exts {
> set exts_sve2 {
> "sme-f8f16" "sme-f8f32"
> "sme-b16b16" "sme-f16f16" "sme-i16i64" "sme" "sme2" "sme2p1"
> - "ssve-fp8dot2" "ssve-fp8dot4" "ssve-fp8fma"
> + "ssve-fp8dot2" "ssve-fp8dot4" "ssve-fp8fma" "sve-bfscale"
> }
>
> foreach { aarch64_ext } $exts {
> --
> 2.43.0
>
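
For reference, assuming the new "sve-bfscale" entry above gets an
aarch64_asm_sve-bfscale_ok effective target generated from this list in the
same way as the other extensions, a standalone test that exercises the new
assembler support outside the pragma-based ACLE files could gate on it along
these lines (a minimal sketch, names and options are illustrative only):

    /* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
    /* { dg-options "-O2 -march=armv9-a+sve2+sve-bfscale" } */

    #include <arm_sve.h>

    /* Scale each BF16 lane of X by 2^AMOUNT, using the overloaded form
       exercised in svscale_predicated_inferred_ok above.  */
    svbfloat16_t
    scale_bf16 (svbool_t pg, svbfloat16_t x, svint16_t amount)
    {
      return svscale_x (pg, x, amount);
    }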