Spencer Abson <[email protected]> writes:
> This patch extends the expander for conditional smax, smin, add, sub, mul,
> min, max, and div to support partial SVE FP modes.
>
> If exceptions from undefined vector elements must be suppressed, this
> expansion converts the container-level predicate to an element-level one, and
> ensures that these elements are inactive for the operation. In practice, this
> is a predicate AND with the existing mask and a container-size PTRUE.
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-protos.h (aarch64_sve_emit_masked_fp_pred):
> Declare.
> * config/aarch64/aarch64-sve.md (and<mode>3): Change this to...
> (@and<mode>3): ...this, so that we can use gen_and3.
> (@cond_<optab><mode>): Extend from SVE_FULL_F_B16B16 to SVE_F_B16B16,
> use aarch64_predicate_operand.
> (*cond_<optab><mode>_2_strict): Likewise.
> (*cond_<optab><mode>_3_strict): Likewise.
> (*cond_<optab><mode>_any_strict): Likewise.
> (*cond_<optab><mode>_2_const_strict): Extend from SVE_FULL_F to SVE_F,
> use aarch64_predicate_operand.
> (*cond_<optab><mode>_any_const_strict): Likewise.
> (*cond_add<mode>_2_const_strict): Likewise.
> (*cond_add<mode>_any_const_strict): Likewise.
> (*cond_sub<mode>_3_const_strict): Likewise.
> (*cond_sub<mode>_const_strict): Likewise.
> (*vcond_mask_<mode><vpred>): Use aarch64_predicate_operand, and update
> the comment here.
> * config/aarch64/aarch64.cc (aarch64_sve_emit_masked_fp_pred): New
> function. Helper to mask the predicate in conditional expanders.
>
> gcc/testsuite/ChangeLog:
>
> * g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C: New test.
> * gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c: Likewise.
> * gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c: Likewise.
> * gcc.target/aarch64/sve/unpacked_cond_fadd_2.c: Likewise.
> * gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c: Likewise.
> * gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c: Likewise.
> * gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c: Likewise.
> * gcc.target/aarch64/sve/unpacked_cond_fmul_2.c: Likewise.
> * gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c: Likewise.
OK, thanks.
Richard
> ---
> gcc/config/aarch64/aarch64-protos.h | 1 +
> gcc/config/aarch64/aarch64-sve.md | 152 +++++++++---------
> gcc/config/aarch64/aarch64.cc | 27 ++++
> .../aarch64/sve/unpacked_cond_binary_bf16_2.C | 18 +++
> .../sve/unpacked_cond_builtin_fmax_2.c | 24 +++
> .../sve/unpacked_cond_builtin_fmin_2.c | 24 +++
> .../aarch64/sve/unpacked_cond_fadd_2.c | 28 ++++
> .../aarch64/sve/unpacked_cond_fdiv_2.c | 22 +++
> .../aarch64/sve/unpacked_cond_fmaxnm_2.c | 24 +++
> .../aarch64/sve/unpacked_cond_fminnm_2.c | 24 +++
> .../aarch64/sve/unpacked_cond_fmul_2.c | 22 +++
> .../aarch64/sve/unpacked_cond_fsubr_2.c | 26 +++
> 12 files changed, 319 insertions(+), 73 deletions(-)
> create mode 100644
> gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h
> b/gcc/config/aarch64/aarch64-protos.h
> index e946e8da11d..38c307cdc3a 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode);
> bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
> rtx aarch64_sve_packed_pred (machine_mode);
> rtx aarch64_sve_fp_pred (machine_mode, rtx *);
> +rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx);
> void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
> bool aarch64_expand_maskloadstore (rtx *, machine_mode);
> void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
> diff --git a/gcc/config/aarch64/aarch64-sve.md
> b/gcc/config/aarch64/aarch64-sve.md
> index b252eef411c..fe407f7e77f 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -5605,18 +5605,21 @@
>
> ;; Predicated floating-point operations with merging.
> (define_expand "@cond_<optab><mode>"
> - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> + [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> + (unspec:SVE_F_B16B16
> [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> + (unspec:SVE_F_B16B16
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
> - (match_operand:SVE_FULL_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
> + (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
> + (match_operand:SVE_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
> SVE_COND_FP_BINARY)
> - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
> + {
> + operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
> + }
> )
>
> ;; Predicated floating-point operations, merging with the first input.
> @@ -5644,14 +5647,14 @@
> )
>
> (define_insn "*cond_<optab><mode>_2_strict"
> - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> + [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> + (unspec:SVE_F_B16B16
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F_B16B16
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
> - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
> + (match_operand:SVE_F_B16B16 2 "register_operand")
> + (match_operand:SVE_F_B16B16 3 "register_operand")]
> SVE_COND_FP_BINARY)
> (match_dup 2)]
> UNSPEC_SEL))]
> @@ -5687,14 +5690,14 @@
> )
>
> (define_insn "*cond_<optab><mode>_2_const_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_F 0 "register_operand")
> + (unspec:SVE_F
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
> + (match_operand:SVE_F 2 "register_operand")
> + (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
> SVE_COND_FP_BINARY_I1)
> (match_dup 2)]
> UNSPEC_SEL))]
> @@ -5730,14 +5733,14 @@
> )
>
> (define_insn "*cond_<optab><mode>_3_strict"
> - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> + [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> + (unspec:SVE_F_B16B16
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F_B16B16
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
> - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
> + (match_operand:SVE_F_B16B16 2 "register_operand")
> + (match_operand:SVE_F_B16B16 3 "register_operand")]
> SVE_COND_FP_BINARY)
> (match_dup 3)]
> UNSPEC_SEL))]
> @@ -5794,16 +5797,16 @@
> )
>
> (define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
> - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F_B16B16
> + [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> + (unspec:SVE_F_B16B16
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F_B16B16
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
> - (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
> + (match_operand:SVE_F_B16B16 2 "register_operand")
> + (match_operand:SVE_F_B16B16 3 "register_operand")]
> SVE_COND_FP_BINARY)
> - (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE
> && (<supports_bf16> || !<is_bf16>)
> @@ -5868,16 +5871,16 @@
> )
>
> (define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_F 0 "register_operand")
> + (unspec:SVE_F
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> - (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
> + (match_operand:SVE_F 2 "register_operand")
> + (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
> SVE_COND_FP_BINARY_I1)
> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
> {@ [ cons: =0 , 1 , 2 , 4 ]
> @@ -5953,14 +5956,14 @@
> )
>
> (define_insn "*cond_add<mode>_2_const_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_F 0 "register_operand")
> + (unspec:SVE_F
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> - (match_operand:SVE_FULL_F 3
> "aarch64_sve_float_arith_with_sub_immediate")]
> + (match_operand:SVE_F 2 "register_operand")
> + (match_operand:SVE_F 3
> "aarch64_sve_float_arith_with_sub_immediate")]
> UNSPEC_COND_FADD)
> (match_dup 2)]
> UNSPEC_SEL))]
> @@ -6015,16 +6018,16 @@
> )
>
> (define_insn_and_rewrite "*cond_add<mode>_any_const_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_F 0 "register_operand")
> + (unspec:SVE_F
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "register_operand")
> - (match_operand:SVE_FULL_F 3
> "aarch64_sve_float_arith_with_sub_immediate")]
> + (match_operand:SVE_F 2 "register_operand")
> + (match_operand:SVE_F 3
> "aarch64_sve_float_arith_with_sub_immediate")]
> UNSPEC_COND_FADD)
> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
> {@ [ cons: =0 , 1 , 2 , 3 , 4 ]
> @@ -6266,14 +6269,14 @@
> )
>
> (define_insn "*cond_sub<mode>_3_const_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_F 0 "register_operand")
> + (unspec:SVE_F
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
> - (match_operand:SVE_FULL_F 3 "register_operand")]
> + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
> + (match_operand:SVE_F 3 "register_operand")]
> UNSPEC_COND_FSUB)
> (match_dup 3)]
> UNSPEC_SEL))]
> @@ -6323,16 +6326,16 @@
> )
>
> (define_insn_and_rewrite "*cond_sub<mode>_const_strict"
> - [(set (match_operand:SVE_FULL_F 0 "register_operand")
> - (unspec:SVE_FULL_F
> - [(match_operand:<VPRED> 1 "register_operand")
> - (unspec:SVE_FULL_F
> + [(set (match_operand:SVE_F 0 "register_operand")
> + (unspec:SVE_F
> + [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> + (unspec:SVE_F
> [(match_dup 1)
> (const_int SVE_STRICT_GP)
> - (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
> - (match_operand:SVE_FULL_F 3 "register_operand")]
> + (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
> + (match_operand:SVE_F 3 "register_operand")]
> UNSPEC_COND_FSUB)
> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> + (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])"
> {@ [ cons: =0 , 1 , 3 , 4 ]
> @@ -6913,7 +6916,7 @@
> ;; Predicate AND. We can reuse one of the inputs as the GP.
> ;; Doubling the second operand is the preferred implementation
> ;; of the MOV alias, so we use that instead of %1/z, %1, %2.
> -(define_insn "and<mode>3"
> +(define_insn "@and<mode>3"
> [(set (match_operand:PRED_ALL 0 "register_operand")
> (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
> (match_operand:PRED_ALL 2 "register_operand")))]
> @@ -8201,20 +8204,23 @@
> ;;
> ;; For unpacked vectors, it doesn't really matter whether SEL uses the
> ;; the container size or the element size. If SEL used the container size,
> -;; it would ignore undefined bits of the predicate but would copy the
> -;; upper (undefined) bits of each container along with the defined bits.
> -;; If SEL used the element size, it would use undefined bits of the predicate
> -;; to select between undefined elements in each input vector. Thus the only
> -;; difference is whether the undefined bits in a container always come from
> -;; the same input as the defined bits, or whether the choice can vary
> -;; independently of the defined bits.
> +;; it would copy the upper (undefined) bits of each container along
> +;; with the corresponding defined bits. If SEL used the element size,
> +;; it would use separate predicate bits to select between the undefined
> +;; elements in each input vector; these separate predicate bits might
> +;; themselves be undefined, depending on the mode of the predicate.
> +;;
> +;; Thus the only difference is whether the undefined bits in a container
> +;; always come from the same input as the defined bits, or whether the
> +;; choice can vary independently of the defined bits.
> ;;
> ;; For the other instructions, using the element size is more natural,
> ;; so we do that for SEL as well.
> +;;
> (define_insn "*vcond_mask_<mode><vpred>"
> [(set (match_operand:SVE_ALL 0 "register_operand")
> (unspec:SVE_ALL
> - [(match_operand:<VPRED> 3 "register_operand")
> + [(match_operand:<VPRED> 3 "aarch64_predicate_operand")
> (match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm")
> (match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")]
> UNSPEC_SEL))]
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 9e4a37bcaff..a06d34bf4ed 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3931,6 +3931,33 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx
> *strictness)
> return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
> }
>
> +/* PRED is a predicate that governs an operation on DATA_MODE. If DATA_MODE
> + is a partial vector mode, and if exceptions must be suppressed for its
> + undefined elements, convert PRED from a container-level predicate to
> + an element-level predicate and ensure that the undefined elements
> + are inactive. Make no changes otherwise.
> +
> + Return the resultant predicate. */
> +rtx
> +aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred)
> +{
> + unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
> + if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
> + {
> + /* Generate an element-level mask. */
> + rtx mask = aarch64_sve_packed_pred (data_mode);
> + machine_mode pmode = GET_MODE (mask);
> +
> + /* Apply the existing predicate. */
> + rtx dst = gen_reg_rtx (pmode);
> + emit_insn (gen_and3 (pmode, dst, mask,
> + gen_lowpart (pmode, pred)));
> + return dst;
> + }
> +
> + return pred;
> +}
> +
> /* Emit a comparison CMP between OP0 and OP1, both of which have mode
> DATA_MODE, and return the result in a predicate of mode PRED_MODE.
> Use TARGET as the target register if nonnull and convenient. */
> diff --git
> a/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
> b/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
> new file mode 100644
> index 00000000000..02880efa333
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
> @@ -0,0 +1,18 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-O -ffinite-math-only -fno-signed-zeros
> -msve-vector-bits=2048 " } */
> +
> +#include "unpacked_cond_binary_bf16_1.C"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 15 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 15 } } */
> +/* { dg-final { scan-assembler-times {\tand} 30 } } */
> +
> +/* { dg-final { scan-assembler-times {\tbfadd\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tbfsub\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tbfmul\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +
> +/* { dg-final { scan-assembler-times {\tbfminnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tbfmaxnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +
> +// There's no BFSUBR.
> +/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */
> diff --git
> a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
> new file mode 100644
> index 00000000000..f84ded5ea3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_builtin_fmax_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git
> a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
> new file mode 100644
> index 00000000000..bceddf9ef74
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_builtin_fmin_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
> new file mode 100644
> index 00000000000..e59864b5e8f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fadd_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 11 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 22 } } */
> +/* { dg-final { scan-assembler-times {\tand} 33 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 19 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 19 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 19 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 5 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.5\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #1.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.5\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 10 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.5\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #1.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.5\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
> new file mode 100644
> index 00000000000..1ca3dbf2242
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fdiv_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 3 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 6 } } */
> +/* { dg-final { scan-assembler-times {\tand} 9 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 7 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 7 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tfdivr\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfdivr\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
> new file mode 100644
> index 00000000000..282f3ed0830
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048
> -fno-signed-zeros -ffinite-math-only" } */
> +
> +#include "unpacked_cond_fmaxnm_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
> new file mode 100644
> index 00000000000..8226a6fadc4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048
> -fno-signed-zeros -ffinite-math-only" } */
> +
> +#include "unpacked_cond_fminnm_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
> new file mode 100644
> index 00000000000..21713f58379
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fmul_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 5 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 10 } } */
> +/* { dg-final { scan-assembler-times {\tand} 15 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 10 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 10 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 10 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.5\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.5\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c
> new file mode 100644
> index 00000000000..cd7a0e16047
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fsubr_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #0.5\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m,
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #0.5\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m,
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */