Re: [PATCH v2 1/1] aarch64: Add support for unpacked SVE FP conditional binary arithmetic

Richard Sandiford Tue, 29 Jul 2025 08:29:22 -0700

Spencer Abson <[email protected]> writes:
> This patch extends the expander for conditional smax, smin, add, sub, mul,
> min, max, and div to support partial SVE FP modes.
>
> If exceptions from undefined vector elements must be suppressed, this
> expansion converts the container-level predicate to an element-level one, and
> ensures that these elements are inactive for the operation.  In practice, this
> is a predicate AND with the existing mask and a container-size PTRUE.
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-protos.h (aarch64_sve_emit_masked_fp_pred):
>       Declare.
>       * config/aarch64/aarch64-sve.md (and<mode>3):  Change this to...
>       (@and<mode>3): ...this, so that we can use gen_and3.
>       (@cond_<optab><mode>): Extend from SVE_FULL_F_B16B16 to SVE_F_B16B16,
>       use aarch64_predicate_operand.
>       (*cond_<optab><mode>_2_strict): Likewise.
>       (*cond_<optab><mode>_3_strict): Likewise.
>       (*cond_<optab><mode>_any_strict): Likewise.
>       (*cond_<optab><mode>_2_const_strict): Extend from SVE_FULL_F to SVE_F,
>       use aarch64_predicate_operand.
>       (*cond_<optab><mode>_any_const_strict): Likewise.
>       (*cond_sub<mode>_3_const_strict): Likewise.
>       (*cond_sub<mode>_const_strict): Likewise.
>       (*vcond_mask_<mode><vpred>): Use aarch64_predicate_operand, and update
>       the comment here.
>       * config/aarch64/aarch64.cc (aarch64_sve_emit_masked_fp_pred): New
>       function.  Helper to mask the predicate in conditional expanders.
>
> gcc/testsuite/ChangeLog:
>
>       * g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C: New test.
>       * gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c: Likewise.
>       * gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c: Likewise.
>       * gcc.target/aarch64/sve/unpacked_cond_fadd_2.c: Likewise.
>       * gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c: Likewise.
>       * gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c: Likewise.
>       * gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c: Likewise.
>       * gcc.target/aarch64/sve/unpacked_cond_fmul_2.c: Likewise.
>       * gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c: Likewise.


OK, thanks.

Richard

> ---
>  gcc/config/aarch64/aarch64-protos.h           |   1 +
>  gcc/config/aarch64/aarch64-sve.md             | 152 +++++++++---------
>  gcc/config/aarch64/aarch64.cc                 |  27 ++++
>  .../aarch64/sve/unpacked_cond_binary_bf16_2.C |  18 +++
>  .../sve/unpacked_cond_builtin_fmax_2.c        |  24 +++
>  .../sve/unpacked_cond_builtin_fmin_2.c        |  24 +++
>  .../aarch64/sve/unpacked_cond_fadd_2.c        |  28 ++++
>  .../aarch64/sve/unpacked_cond_fdiv_2.c        |  22 +++
>  .../aarch64/sve/unpacked_cond_fmaxnm_2.c      |  24 +++
>  .../aarch64/sve/unpacked_cond_fminnm_2.c      |  24 +++
>  .../aarch64/sve/unpacked_cond_fmul_2.c        |  22 +++
>  .../aarch64/sve/unpacked_cond_fsubr_2.c       |  26 +++
>  12 files changed, 319 insertions(+), 73 deletions(-)
>  create mode 100644 
> gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index e946e8da11d..38c307cdc3a 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -1031,6 +1031,7 @@ rtx aarch64_pfalse_reg (machine_mode);
>  bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
>  rtx aarch64_sve_packed_pred (machine_mode);
>  rtx aarch64_sve_fp_pred (machine_mode, rtx *);
> +rtx aarch64_sve_emit_masked_fp_pred (machine_mode, rtx);
>  void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
>  bool aarch64_expand_maskloadstore (rtx *, machine_mode);
>  void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
> diff --git a/gcc/config/aarch64/aarch64-sve.md 
> b/gcc/config/aarch64/aarch64-sve.md
> index b252eef411c..fe407f7e77f 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -5605,18 +5605,21 @@
>  
>  ;; Predicated floating-point operations with merging.
>  (define_expand "@cond_<optab><mode>"
> -  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> -     (unspec:SVE_FULL_F_B16B16
> +  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> +     (unspec:SVE_F_B16B16
>         [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F_B16B16
> +        (unspec:SVE_F_B16B16
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
> -           (match_operand:SVE_FULL_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
> +           (match_operand:SVE_F_B16B16 2 "<sve_pred_fp_rhs1_operand>")
> +           (match_operand:SVE_F_B16B16 3 "<sve_pred_fp_rhs2_operand>")]
>            SVE_COND_FP_BINARY)
> -        (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
> +        (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
>         UNSPEC_SEL))]
>    "TARGET_SVE && (<supports_bf16> || !<is_bf16>)"
> +  {
> +    operands[1] = aarch64_sve_emit_masked_fp_pred (<MODE>mode, operands[1]);
> +  }
>  )
>  
>  ;; Predicated floating-point operations, merging with the first input.
> @@ -5644,14 +5647,14 @@
>  )
>  
>  (define_insn "*cond_<optab><mode>_2_strict"
> -  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> -     (unspec:SVE_FULL_F_B16B16
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F_B16B16
> +  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> +     (unspec:SVE_F_B16B16
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F_B16B16
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
> -           (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
> +           (match_operand:SVE_F_B16B16 2 "register_operand")
> +           (match_operand:SVE_F_B16B16 3 "register_operand")]
>            SVE_COND_FP_BINARY)
>          (match_dup 2)]
>         UNSPEC_SEL))]
> @@ -5687,14 +5690,14 @@
>  )
>  
>  (define_insn "*cond_<optab><mode>_2_const_strict"
> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> -     (unspec:SVE_FULL_F
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F
> +  [(set (match_operand:SVE_F 0 "register_operand")
> +     (unspec:SVE_F
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F 2 "register_operand")
> -           (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
> +           (match_operand:SVE_F 2 "register_operand")
> +           (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
>            SVE_COND_FP_BINARY_I1)
>          (match_dup 2)]
>         UNSPEC_SEL))]
> @@ -5730,14 +5733,14 @@
>  )
>  
>  (define_insn "*cond_<optab><mode>_3_strict"
> -  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> -     (unspec:SVE_FULL_F_B16B16
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F_B16B16
> +  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> +     (unspec:SVE_F_B16B16
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F_B16B16
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
> -           (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
> +           (match_operand:SVE_F_B16B16 2 "register_operand")
> +           (match_operand:SVE_F_B16B16 3 "register_operand")]
>            SVE_COND_FP_BINARY)
>          (match_dup 3)]
>         UNSPEC_SEL))]
> @@ -5794,16 +5797,16 @@
>  )
>  
>  (define_insn_and_rewrite "*cond_<optab><mode>_any_strict"
> -  [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand")
> -     (unspec:SVE_FULL_F_B16B16
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F_B16B16
> +  [(set (match_operand:SVE_F_B16B16 0 "register_operand")
> +     (unspec:SVE_F_B16B16
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F_B16B16
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F_B16B16 2 "register_operand")
> -           (match_operand:SVE_FULL_F_B16B16 3 "register_operand")]
> +           (match_operand:SVE_F_B16B16 2 "register_operand")
> +           (match_operand:SVE_F_B16B16 3 "register_operand")]
>            SVE_COND_FP_BINARY)
> -        (match_operand:SVE_FULL_F_B16B16 4 "aarch64_simd_reg_or_zero")]
> +        (match_operand:SVE_F_B16B16 4 "aarch64_simd_reg_or_zero")]
>         UNSPEC_SEL))]
>    "TARGET_SVE
>     && (<supports_bf16> || !<is_bf16>)
> @@ -5868,16 +5871,16 @@
>  )
>  
>  (define_insn_and_rewrite "*cond_<optab><mode>_any_const_strict"
> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> -     (unspec:SVE_FULL_F
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F
> +  [(set (match_operand:SVE_F 0 "register_operand")
> +     (unspec:SVE_F
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F 2 "register_operand")
> -           (match_operand:SVE_FULL_F 3 "<sve_pred_fp_rhs2_immediate>")]
> +           (match_operand:SVE_F 2 "register_operand")
> +           (match_operand:SVE_F 3 "<sve_pred_fp_rhs2_immediate>")]
>            SVE_COND_FP_BINARY_I1)
> -        (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> +        (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
>         UNSPEC_SEL))]
>    "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
>    {@ [ cons: =0 , 1   , 2 , 4   ]
> @@ -5953,14 +5956,14 @@
>  )
>  
>  (define_insn "*cond_add<mode>_2_const_strict"
> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> -     (unspec:SVE_FULL_F
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F
> +  [(set (match_operand:SVE_F 0 "register_operand")
> +     (unspec:SVE_F
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F 2 "register_operand")
> -           (match_operand:SVE_FULL_F 3 
> "aarch64_sve_float_arith_with_sub_immediate")]
> +           (match_operand:SVE_F 2 "register_operand")
> +           (match_operand:SVE_F 3 
> "aarch64_sve_float_arith_with_sub_immediate")]
>            UNSPEC_COND_FADD)
>          (match_dup 2)]
>         UNSPEC_SEL))]
> @@ -6015,16 +6018,16 @@
>  )
>  
>  (define_insn_and_rewrite "*cond_add<mode>_any_const_strict"
> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> -     (unspec:SVE_FULL_F
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F
> +  [(set (match_operand:SVE_F 0 "register_operand")
> +     (unspec:SVE_F
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F 2 "register_operand")
> -           (match_operand:SVE_FULL_F 3 
> "aarch64_sve_float_arith_with_sub_immediate")]
> +           (match_operand:SVE_F 2 "register_operand")
> +           (match_operand:SVE_F 3 
> "aarch64_sve_float_arith_with_sub_immediate")]
>            UNSPEC_COND_FADD)
> -        (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> +        (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
>         UNSPEC_SEL))]
>    "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])"
>    {@ [ cons: =0 , 1   , 2 , 3   , 4   ]
> @@ -6266,14 +6269,14 @@
>  )
>  
>  (define_insn "*cond_sub<mode>_3_const_strict"
> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> -     (unspec:SVE_FULL_F
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F
> +  [(set (match_operand:SVE_F 0 "register_operand")
> +     (unspec:SVE_F
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
> -           (match_operand:SVE_FULL_F 3 "register_operand")]
> +           (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
> +           (match_operand:SVE_F 3 "register_operand")]
>            UNSPEC_COND_FSUB)
>          (match_dup 3)]
>         UNSPEC_SEL))]
> @@ -6323,16 +6326,16 @@
>  )
>  
>  (define_insn_and_rewrite "*cond_sub<mode>_const_strict"
> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> -     (unspec:SVE_FULL_F
> -       [(match_operand:<VPRED> 1 "register_operand")
> -        (unspec:SVE_FULL_F
> +  [(set (match_operand:SVE_F 0 "register_operand")
> +     (unspec:SVE_F
> +       [(match_operand:<VPRED> 1 "aarch64_predicate_operand")
> +        (unspec:SVE_F
>            [(match_dup 1)
>             (const_int SVE_STRICT_GP)
> -           (match_operand:SVE_FULL_F 2 "aarch64_sve_float_arith_immediate")
> -           (match_operand:SVE_FULL_F 3 "register_operand")]
> +           (match_operand:SVE_F 2 "aarch64_sve_float_arith_immediate")
> +           (match_operand:SVE_F 3 "register_operand")]
>            UNSPEC_COND_FSUB)
> -        (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> +        (match_operand:SVE_F 4 "aarch64_simd_reg_or_zero")]
>         UNSPEC_SEL))]
>    "TARGET_SVE && !rtx_equal_p (operands[3], operands[4])"
>    {@ [ cons: =0 , 1   , 3 , 4   ]
> @@ -6913,7 +6916,7 @@
>  ;; Predicate AND.  We can reuse one of the inputs as the GP.
>  ;; Doubling the second operand is the preferred implementation
>  ;; of the MOV alias, so we use that instead of %1/z, %1, %2.
> -(define_insn "and<mode>3"
> +(define_insn "@and<mode>3"
>    [(set (match_operand:PRED_ALL 0 "register_operand")
>       (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
>                     (match_operand:PRED_ALL 2 "register_operand")))]
> @@ -8201,20 +8204,23 @@
>  ;;
>  ;; For unpacked vectors, it doesn't really matter whether SEL uses the
>  ;; the container size or the element size.  If SEL used the container size,
> -;; it would ignore undefined bits of the predicate but would copy the
> -;; upper (undefined) bits of each container along with the defined bits.
> -;; If SEL used the element size, it would use undefined bits of the predicate
> -;; to select between undefined elements in each input vector.  Thus the only
> -;; difference is whether the undefined bits in a container always come from
> -;; the same input as the defined bits, or whether the choice can vary
> -;; independently of the defined bits.
> +;; it would copy the upper (undefined) bits of each container along
> +;; with the corresponding defined bits.  If SEL used the element size,
> +;; it would use separate predicate bits to select between the undefined
> +;; elements in each input vector; these separate predicate bits might
> +;; themselves be undefined, depending on the mode of the predicate.
> +;;
> +;; Thus the only difference is whether the undefined bits in a container
> +;; always come from the same input as the defined bits, or whether the
> +;; choice can vary independently of the defined bits.
>  ;;
>  ;; For the other instructions, using the element size is more natural,
>  ;; so we do that for SEL as well.
> +;;
>  (define_insn "*vcond_mask_<mode><vpred>"
>    [(set (match_operand:SVE_ALL 0 "register_operand")
>       (unspec:SVE_ALL
> -       [(match_operand:<VPRED> 3 "register_operand")
> +       [(match_operand:<VPRED> 3 "aarch64_predicate_operand")
>          (match_operand:SVE_ALL 1 "aarch64_sve_reg_or_dup_imm")
>          (match_operand:SVE_ALL 2 "aarch64_simd_reg_or_zero")]
>         UNSPEC_SEL))]
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 9e4a37bcaff..a06d34bf4ed 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -3931,6 +3931,33 @@ aarch64_sve_fp_pred (machine_mode data_mode, rtx 
> *strictness)
>     return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
>  }
>  
> +/* PRED is a predicate that governs an operation on DATA_MODE.  If DATA_MODE
> +   is a partial vector mode, and if exceptions must be suppressed for its
> +   undefined elements, convert PRED from a container-level predicate to
> +   an element-level predicate and ensure that the undefined elements
> +   are inactive.  Make no changes otherwise.
> +
> +   Return the resultant predicate.  */
> +rtx
> +aarch64_sve_emit_masked_fp_pred (machine_mode data_mode, rtx pred)
> +{
> +  unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
> +  if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
> +    {
> +      /* Generate an element-level mask.  */
> +      rtx mask = aarch64_sve_packed_pred (data_mode);
> +      machine_mode pmode = GET_MODE (mask);
> +
> +      /* Apply the existing predicate.  */
> +      rtx dst = gen_reg_rtx (pmode);
> +      emit_insn (gen_and3 (pmode, dst, mask,
> +                        gen_lowpart (pmode, pred)));
> +      return dst;
> +    }
> +
> +  return pred;
> +}
> +
>  /* Emit a comparison CMP between OP0 and OP1, both of which have mode
>     DATA_MODE, and return the result in a predicate of mode PRED_MODE.
>     Use TARGET as the target register if nonnull and convenient.  */
> diff --git 
> a/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C 
> b/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
> new file mode 100644
> index 00000000000..02880efa333
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/aarch64/sve/unpacked_cond_binary_bf16_2.C
> @@ -0,0 +1,18 @@
> +/* { dg-do compile }*/
> +/* { dg-options "-O -ffinite-math-only -fno-signed-zeros 
> -msve-vector-bits=2048 " } */
> +
> +#include "unpacked_cond_binary_bf16_1.C"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 15 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 15 } } */
> +/* { dg-final { scan-assembler-times {\tand} 30 } } */
> +
> +/* { dg-final { scan-assembler-times {\tbfadd\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tbfsub\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tbfmul\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +
> +/* { dg-final { scan-assembler-times {\tbfminnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tbfmaxnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +
> +// There's no BFSUBR.
> +/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */
> diff --git 
> a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
> new file mode 100644
> index 00000000000..f84ded5ea3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmax_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_builtin_fmax_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git 
> a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
> new file mode 100644
> index 00000000000..bceddf9ef74
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_builtin_fmin_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_builtin_fmin_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
> new file mode 100644
> index 00000000000..e59864b5e8f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fadd_2.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fadd_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 11 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 22 } } */
> +/* { dg-final { scan-assembler-times {\tand} 33 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 19 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 19 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 19 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 5 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.5\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #1.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.5\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 10 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.5\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #1.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.5\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
> new file mode 100644
> index 00000000000..1ca3dbf2242
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fdiv_2.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fdiv_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 3 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 6 } } */
> +/* { dg-final { scan-assembler-times {\tand} 9 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 7 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 7 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tfdivr\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfdivr\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
> new file mode 100644
> index 00000000000..282f3ed0830
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmaxnm_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 
> -fno-signed-zeros -ffinite-math-only" } */
> +
> +#include "unpacked_cond_fmaxnm_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfmaxnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
> new file mode 100644
> index 00000000000..8226a6fadc4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fminnm_2.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048 
> -fno-signed-zeros -ffinite-math-only" } */
> +
> +#include "unpacked_cond_fminnm_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.0\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.0\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfminnm\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
> new file mode 100644
> index 00000000000..21713f58379
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmul_2.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fmul_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 5 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 10 } } */
> +/* { dg-final { scan-assembler-times {\tand} 15 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 10 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 10 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 10 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.5\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.5\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c
> new file mode 100644
> index 00000000000..cd7a0e16047
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fsubr_2.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize -moverride=sve_width=2048" } */
> +
> +#include "unpacked_cond_fsubr_1.c"
> +
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s} 7 } } */
> +/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d} 14 } } */
> +/* { dg-final { scan-assembler-times {\tand} 21 } } */
> +
> +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 13 } } */
> +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 13 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s\n} 3 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #0.5\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.s, p[0-7]/m, 
> z[0-9]+\.s, #1.0\n} 2 } } */
> +
> +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h\n} 6 } } */
> +/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, z[0-9]+\.h\n} 2 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #0.5\n} 4 } } */
> +/* { dg-final { scan-assembler-times {\tfsubr\tz[0-9]+\.h, p[0-7]/m, 
> z[0-9]+\.h, #1.0\n} 4 } } */
> +
> +/* { dg-final { scan-assembler-not {\tsel\t} } } */

Re: [PATCH v2 1/1] aarch64: Add support for unpacked SVE FP conditional binary arithmetic

Reply via email to