Hi Akram,
> On 14 Nov 2024, at 16:53, Akram Ahmad <[email protected]> wrote:
>
> This renames the existing {s,u}q{add,sub} insn patterns to use the
> standard names {s,u}s{add,sub}3, which are used by IFN_SAT_ADD and
> IFN_SAT_SUB.
>
> The NEON intrinsics for saturating arithmetic and their corresponding
> builtins are changed to use these standard names too.
>
> Using the standard names for the instructions causes 32 and 64-bit
> unsigned scalar saturating arithmetic to use the NEON instructions,
> resulting in an additional (and inefficient) FMOV being generated when
> the original operands are in GP registers. This patch therefore also
> restores the original behaviour of using the adds/subs instructions
> in this circumstance.
>
> Furthermore, this patch introduces a new optimisation for signed 32
> and 64-bit scalar saturating arithmetic which uses adds/subs in place
> of the NEON instruction.
>
> Addition, before:
> fmov d0, x0
> fmov d1, x1
> sqadd d0, d0, d1
> fmov x0, d0
>
> Addition, after:
> asr x2, x1, 63
> adds x0, x0, x1
> eor x2, x2, 0x8000000000000000
> csinv x0, x0, x2, vc
>
> In the above example, subtraction replaces the adds with subs and the
> csinv with csel. The 32-bit case follows the same approach. Arithmetic
> with a constant operand is simplified further by directly storing the
> saturating limit in the temporary register, resulting in only three
> instructions being used. It is important to note that this only works
> when early-ra is disabled, due to an early-ra bug which erroneously
> assigns FP registers to the operands; if early-ra is enabled, then the
> original behaviour (NEON instruction) occurs.
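> 
> For reference, the corresponding subtraction and immediate-operand
> sequences (as checked by the new tests) are:
> 
> Subtraction, after:
> asr x2, x1, 63
> subs x0, x0, x1
> eor x2, x2, 0x8000000000000000
> csel x0, x0, x2, vc
> 
> Addition with an immediate operand, after:
> adds w1, w0, #67
> mov w2, 2147483647
> csel w0, w1, w2, vc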
>
> Additional tests are written for the scalar and Adv. SIMD cases to
> ensure that the correct instructions are used. The NEON intrinsics are
> already tested elsewhere. The signed scalar case is also tested with
> an execution test to check the results.
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-builtins.cc: Expand iterators.
> * config/aarch64/aarch64-simd-builtins.def: Use standard names.
> * config/aarch64/aarch64-simd.md: Use standard names, split insn
> definitions on signedness of operator and type of operands.
> * config/aarch64/arm_neon.h: Use standard builtin names.
> * config/aarch64/iterators.md: Add VSDQ_I_QI_HI iterator to
> simplify splitting of insn for scalar arithmetic.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc:
> Template file for unsigned vector saturating arithmetic tests.
> * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c:
> 8-bit vector type tests.
> * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c:
> 16-bit vector type tests.
> * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c:
> 32-bit vector type tests.
> * gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c:
> 64-bit vector type tests.
> * gcc.target/aarch64/saturating_arithmetic.inc: Template file
> for scalar saturating arithmetic tests.
> * gcc.target/aarch64/saturating_arithmetic_1.c: 8-bit tests.
> * gcc.target/aarch64/saturating_arithmetic_2.c: 16-bit tests.
> * gcc.target/aarch64/saturating_arithmetic_3.c: 32-bit tests.
> * gcc.target/aarch64/saturating_arithmetic_4.c: 64-bit tests.
> * gcc.target/aarch64/saturating-arithmetic-signed.c: Signed tests.
> ---
> gcc/config/aarch64/aarch64-builtins.cc | 13 +
> gcc/config/aarch64/aarch64-simd-builtins.def | 8 +-
> gcc/config/aarch64/aarch64-simd.md | 209 ++++++++++++++-
> gcc/config/aarch64/arm_neon.h | 96 +++----
> gcc/config/aarch64/iterators.md | 4 +
> .../saturating_arithmetic_autovect.inc | 58 +++++
> .../saturating_arithmetic_autovect_1.c | 79 ++++++
> .../saturating_arithmetic_autovect_2.c | 79 ++++++
> .../saturating_arithmetic_autovect_3.c | 75 ++++++
> .../saturating_arithmetic_autovect_4.c | 77 ++++++
> .../aarch64/saturating-arithmetic-signed.c | 244 ++++++++++++++++++
> .../aarch64/saturating_arithmetic.inc | 39 +++
> .../aarch64/saturating_arithmetic_1.c | 36 +++
> .../aarch64/saturating_arithmetic_2.c | 36 +++
> .../aarch64/saturating_arithmetic_3.c | 30 +++
> .../aarch64/saturating_arithmetic_4.c | 30 +++
> 16 files changed, 1057 insertions(+), 56 deletions(-)
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect.inc
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_1.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_2.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_3.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/saturating_arithmetic_autovect_4.c
> create mode 100644
> gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc
> b/gcc/config/aarch64/aarch64-builtins.cc
> index 86d96e47f01..79e43d0c0b3 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -3863,6 +3863,19 @@ aarch64_general_gimple_fold_builtin (unsigned int
> fcode, gcall *stmt,
> new_stmt = gimple_build_assign (gimple_call_lhs (stmt),
> LSHIFT_EXPR, args[0], args[1]);
> break;
> +
> + /* Lower saturating add/sub Neon builtins to gimple.  */
> + BUILTIN_VSDQ_I (BINOP, ssadd, 3, NONE)
> + BUILTIN_VSDQ_I (BINOPU, usadd, 3, NONE)
> + new_stmt = gimple_build_call_internal (IFN_SAT_ADD, 2, args[0], args[1]);
> + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> + break;
> + BUILTIN_VSDQ_I (BINOP, sssub, 3, NONE)
> + BUILTIN_VSDQ_I (BINOPU, ussub, 3, NONE)
> + new_stmt = gimple_build_call_internal (IFN_SAT_SUB, 2, args[0], args[1]);
> + gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> + break;
> +
> BUILTIN_VSDQ_I_DI (BINOP, sshl, 0, NONE)
> BUILTIN_VSDQ_I_DI (BINOP_UUS, ushl, 0, NONE)
> {
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def
> b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 0814f8ba14f..43a0a62caee 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -71,10 +71,10 @@
> BUILTIN_VSDQ_I (BINOP, sqrshl, 0, NONE)
> BUILTIN_VSDQ_I (BINOP_UUS, uqrshl, 0, NONE)
> /* Implemented by aarch64_<su_optab><optab><mode>. */
> - BUILTIN_VSDQ_I (BINOP, sqadd, 0, NONE)
> - BUILTIN_VSDQ_I (BINOPU, uqadd, 0, NONE)
> - BUILTIN_VSDQ_I (BINOP, sqsub, 0, NONE)
> - BUILTIN_VSDQ_I (BINOPU, uqsub, 0, NONE)
> + BUILTIN_VSDQ_I (BINOP, ssadd, 3, NONE)
> + BUILTIN_VSDQ_I (BINOPU, usadd, 3, NONE)
> + BUILTIN_VSDQ_I (BINOP, sssub, 3, NONE)
> + BUILTIN_VSDQ_I (BINOPU, ussub, 3, NONE)
> /* Implemented by aarch64_<sur>qadd<mode>. */
> BUILTIN_VSDQ_I (BINOP_SSU, suqadd, 0, NONE)
> BUILTIN_VSDQ_I (BINOP_UUS, usqadd, 0, NONE)
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index e456f693d2f..fc18a822c52 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -5230,15 +5230,216 @@
> )
> ;; <su>q<addsub>
>
> -(define_insn "aarch64_<su_optab>q<addsub><mode><vczle><vczbe>"
> - [(set (match_operand:VSDQ_I 0 "register_operand" "=w")
> - (BINQOPS:VSDQ_I (match_operand:VSDQ_I 1 "register_operand" "w")
> - (match_operand:VSDQ_I 2 "register_operand" "w")))]
> +(define_insn "<su_optab>s<addsub><mode>3<vczle><vczbe>"
> + [(set (match_operand:VSDQ_I_QI_HI 0 "register_operand" "=w")
> + (BINQOPS:VSDQ_I_QI_HI (match_operand:VSDQ_I_QI_HI 1 "register_operand" "w")
> + (match_operand:VSDQ_I_QI_HI 2 "register_operand" "w")))]
> "TARGET_SIMD"
> "<su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
> [(set_attr "type" "neon_q<addsub><q>")]
> )
>
> +(define_expand "<su_optab>s<addsub><mode>3<vczle><vczbe>"
You shouldn’t need the <vczle><vczbe> for the define_expand; they only do
something useful for define_insns.
> + [(parallel [(set (match_operand:GPI 0 "register_operand")
> + (SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
> + (match_operand:GPI 2 "aarch64_plus_operand")))
> + (clobber (scratch:GPI))
> + (clobber (reg:CC CC_REGNUM))])]
> +)
> +
> +;; Signed saturating arithmetic with GPR operands can be calculated without
> +;; moving these operands to and from FP regs if we introduce an additional
> +;; temporary GP reg. This uses asr and xor to calculate the saturating limit
> +;; based on the sign of the second (register) operand, with adds/subs and
> +;; csinv or csel respectively being used to select the saturating limit if
> +;; the overflow flag is set. The additional asr and xor instructions are
> +;; cheaper than introducing the three fmov instructions that would be needed
> +;; to calculate this result using the NEON instruction. If operand2 is a constant
Minor nit, but we prefer to refer to NEON as “Advanced SIMD” in the AArch64
world.
Generally, it would be good to have an example sequence for the =r,r,JIr,=&r
alternative in the comment here, as it’s quite specific.
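Something like the DImode addition sequence from the commit message would do,
e.g.:

;; Example, DImode addition with register operands:
;;   asr   x2, x1, 63
;;   adds  x0, x0, x1
;;   eor   x2, x2, 0x8000000000000000
;;   csinv x0, x0, x2, vc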
> +;; value, then the temporary register is used to store the saturating limit
> +;; without the need for asr and xor.
> +
> +(define_insn_and_split "aarch64_<su_optab>s<addsub><mode>3<vczle><vczbe>"
> + [(set (match_operand:GPI 0 "register_operand")
> + (SBINQOPS:GPI (match_operand:GPI 1 "register_operand")
> + (match_operand:GPI 2 "aarch64_plus_operand")))
> + (clobber (match_scratch:GPI 3))
> + (clobber (reg:CC CC_REGNUM))]
> + ""
> + {@ [ cons: =0, 1 , 2 , =3 ; attrs: type, arch, length ]
> + [ w , w , w , X ; neon_q<addsub><q>, *, 4 ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
The arch attribute for this alternative should be simd.
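i.e.:
  [ w , w , w , X ; neon_q<addsub><q>, simd, 4 ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>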
> + [ r , r , JIr , &r ; * , *, 8 ] #
> + }
> + "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
> + [(set (match_dup 0)
> + (if_then_else:GPI
> + (match_operator 4 "comparison_operator" [(reg:CC_V CC_REGNUM) (const_int 0)])
> + (match_dup 5)
> + (match_dup 6)))]
> + {
> + if (REG_P (operands[2]))
> + {
> + switch (<MODE>mode)
> + {
> + case SImode:
> + emit_insn (gen_ashr<mode>3 (operands[3], operands[2],
> + gen_int_mode (31, <MODE>mode)));
> + emit_insn (gen_xor<mode>3 (operands[3], operands[3],
> + gen_int_mode (0x80000000, <MODE>mode)));
> + break;
> + case DImode:
> + emit_insn (gen_ashr<mode>3 (operands[3], operands[2],
> + gen_int_mode (63, <MODE>mode)));
> + emit_insn (gen_xor<mode>3 (operands[3], operands[3],
> + gen_int_mode (0x8000000000000000,
> + <MODE>mode)));
> + break;
> + default:
> + break;
> + }
> + switch (<CODE>)
> + {
> + case SS_MINUS:
> + emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
> + operands[2]));
> + break;
> + case SS_PLUS:
> + emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
> + operands[2]));
> + break;
> + default:
> + break;
> + }
> +
> + rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
> + switch (<CODE>)
> + {
> + case SS_PLUS:
> + operands[4] = gen_rtx_NE (<MODE>mode, ccin, const0_rtx);
> + operands[5] = gen_rtx_NOT (<MODE>mode, operands[3]);
> + operands[6] = operands[0];
> + break;
> + case SS_MINUS:
> + operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
> + operands[5] = operands[0];
> + operands[6] = operands[3];
> + break;
> + default:
> + break;
> + }
> + }
> + else
> + {
> + HOST_WIDE_INT imm = INTVAL (operands[2]);
> + gcc_assert (imm != 0);
> + rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
> + wide_int limit;
> +
> + switch (<CODE>)
> + {
> + case SS_MINUS:
> + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> + operands[2], neg_imm));
> + limit = (imm >> 63) + 1 ? wi::min_value (<MODE>mode, SIGNED)
> + : wi::max_value (<MODE>mode, SIGNED);
> + break;
> + case SS_PLUS:
> + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> + neg_imm, operands[2]));
> + limit = (imm >> 63) + 1 ? wi::max_value (<MODE>mode, SIGNED)
> + : wi::min_value (<MODE>mode, SIGNED);
> + break;
> + default:
> + break;
> + }
> +
> + rtx sat_limit = immed_wide_int_const (limit, <MODE>mode);
> + emit_insn (gen_rtx_SET (operands[3], sat_limit));
> +
> + rtx ccin = gen_rtx_REG (E_CC_Vmode, CC_REGNUM);
> + operands[4] = gen_rtx_EQ (<MODE>mode, ccin, const0_rtx);
> + operands[5] = operands[0];
> + operands[6] = operands[3];
> + }
> + }
> +)
> +
> +;; If this is unsigned saturating arithmetic and the operands arrive in GP
> +;; registers, then it is possible to perform this arithmetic without using
> +;; the NEON instructions. This avoids unnecessary fmov instructions to move
> +;; either the operands or the result between GP and FP regs. This is only
> +;; possible with SImode and DImode.
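> +;;
> +;; For example (as checked by the new tests), 64-bit unsigned addition and
> +;; subtraction with register operands become:
> +;;   adds  x0, x0, x1
> +;;   csinv x0, x0, xzr, cc
> +;; and:
> +;;   subs  x0, x0, x1
> +;;   csel  x0, x0, xzr, cs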
> +
> +(define_insn_and_split "<su_optab>s<addsub><mode>3<vczle><vczbe>"
> + [(set (match_operand:GPI 0 "register_operand")
> + (UBINQOPS:GPI (match_operand:GPI 1 "register_operand")
> + (match_operand:GPI 2 "aarch64_plus_operand")))
> + (clobber (reg:CC CC_REGNUM))]
> + ""
> + {@ [ cons: =0, 1 , 2 ; attrs: type, arch, length ]
> + [ w , w , w ; neon_q<addsub><q>, *, 4 ] <su_optab>q<addsub>\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
> + [ r , r , JIr ; * , *, 8 ] #
> + }
> + "&& reload_completed && GP_REGNUM_P (REGNO (operands[0]))"
> + [(set (match_dup 0)
> + (if_then_else:GPI
> + (match_operator 3 "comparison_operator" [(reg:CC CC_REGNUM) (const_int 0)])
> + (match_dup 0)
> + (match_operand:GPI 4 "immediate_operand" "i")))]
> + {
> +
> + if (REG_P (operands[2]))
> + {
> + switch (<CODE>)
> + {
> + case US_MINUS:
> + emit_insn (gen_sub<mode>3_compare1 (operands[0], operands[1],
> + operands[2]));
> + break;
> + case US_PLUS:
> + emit_insn (gen_add<mode>3_compare0 (operands[0], operands[1],
> + operands[2]));
> + break;
> + default:
> + break;
> + }
> + }
> + else
> + {
> + unsigned HOST_WIDE_INT imm = UINTVAL (operands[2]);
> + gcc_assert (imm != 0);
> + rtx neg_imm = gen_int_mode (-imm, <MODE>mode);
> + switch (<CODE>)
> + {
> + case US_MINUS:
> + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> + operands[2], neg_imm));
> + break;
> + case US_PLUS:
> + emit_insn (gen_sub<mode>3_compare1_imm (operands[0], operands[1],
> + neg_imm, operands[2]));
> + break;
> + default:
> + break;
> + }
> + }
> +
> + rtx ccin = gen_rtx_REG (CC_Cmode, CC_REGNUM);
> + switch (<CODE>)
> + {
> + case US_PLUS:
> + operands[3] = gen_rtx_LTU (<MODE>mode, ccin, const0_rtx);
> + operands[4] = gen_int_mode (-1, <MODE>mode);
> + break;
> + case US_MINUS:
> + operands[3] = gen_rtx_GEU (<MODE>mode, ccin, const0_rtx);
> + operands[4] = const0_rtx;
> + break;
> + default:
> + break;
> + }
> + }
> +)
> +
> ;; suqadd and usqadd
>
> (define_insn "aarch64_<sur>qadd<mode><vczle><vczbe>"
….
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
> b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
> new file mode 100644
> index 00000000000..429a2f9ed28
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating-arithmetic-signed.c
> @@ -0,0 +1,244 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 --save-temps -mearly-ra=none" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** sadd32:
> +** asr w([0-9]+), w1, 31
> +** adds w([0-9]+), (?:w0, w1|w1, w0)
> +** eor w\1, w\1, -2147483648
> +** csinv w0, w\2, w\1, vc
> +** ret
> +*/
> +int32_t __attribute__((noipa))
> +sadd32 (int32_t __a, int32_t __b)
> +{
> + return __builtin_aarch64_ssaddsi (__a, __b);
> +}
We avoid using the __builtin_aarch64_* builtins in test cases as they are
undocumented and we don’t make any guarantees about their stability to users.
I’d prefer it if the saturating operations were open-coded in C. I expect the midend
machinery is smart enough to recognize the saturating logic for scalars by now?
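For example, something like this untested sketch ought to be recognized as
.SAT_ADD these days:

#include <stdint.h>

int32_t
sadd32 (int32_t a, int32_t b)
{
  int32_t sum;
  /* On signed overflow both operands must have the same sign, so we
     saturate towards that sign.  */
  if (__builtin_add_overflow (a, b, &sum))
    return a < 0 ? INT32_MIN : INT32_MAX;
  return sum;
}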
Thanks,
Kyrill
> +
> +/*
> +** sadd32_imm:
> +** adds w([0-9]+), w0, #67
> +** mov w([0-9]+), 2147483647
> +** csel w0, w\1, w\2, vc
> +** ret
> +*/
> +int32_t __attribute__((noipa))
> +sadd32_imm (int32_t __a)
> +{
> + return __builtin_aarch64_ssaddsi (__a, 67);
> +}
> +
> +/*
> +** sadd32_imm2:
> +** subs w([0-9]+), w0, 67
> +** mov w([0-9]+), -2147483648
> +** csel w0, w\1, w\2, vc
> +** ret
> +*/
> +int32_t __attribute__((noipa))
> +sadd32_imm2 (int32_t __a)
> +{
> + return __builtin_aarch64_ssaddsi (__a, -67);
> +}
> +
> +/*
> +** ssub32:
> +** asr w([0-9]+), w1, 31
> +** subs w([0-9]+), w0, w1
> +** eor w\1, w\1, -2147483648
> +** csel w0, w\2, w\1, vc
> +** ret
> +*/
> +int32_t __attribute__((noipa))
> +ssub32 (int32_t __a, int32_t __b)
> +{
> + return __builtin_aarch64_sssubsi (__a, __b);
> +}
> +
> +/*
> +** ssub32_imm:
> +** subs w([0-9]+), w0, 67
> +** mov w([0-9]+), -2147483648
> +** csel w0, w\1, w\2, vc
> +** ret
> +*/
> +int32_t __attribute__((noipa))
> +ssub32_imm (int32_t __a)
> +{
> + return __builtin_aarch64_sssubsi (__a, 67);
> +}
> +
> +/*
> +** ssub32_imm2:
> +** adds w([0-9]+), w0, #67
> +** mov w([0-9]+), 2147483647
> +** csel w0, w\1, w\2, vc
> +** ret
> +*/
> +int32_t __attribute__((noipa))
> +ssub32_imm2 (int32_t __a)
> +{
> + return __builtin_aarch64_sssubsi (__a, -67);
> +}
> +
> +/*
> +** sadd64:
> +** asr x([0-9]+), x1, 63
> +** adds x([0-9]+), (?:x0, x1|x1, x0)
> +** eor x\1, x\1, -9223372036854775808
> +** csinv x0, x\2, x\1, vc
> +** ret
> +*/
> +int64_t __attribute__((noipa))
> +sadd64 (int64_t __a, int64_t __b)
> +{
> + return __builtin_aarch64_ssadddi (__a, __b);
> +}
> +
> +/*
> +** sadd64_imm:
> +** adds x([0-9]+), x0, #67
> +** mov x([0-9]+), 9223372036854775807
> +** csel x0, x\1, x\2, vc
> +** ret
> +*/
> +int64_t __attribute__((noipa))
> +sadd64_imm (int64_t __a)
> +{
> + return __builtin_aarch64_ssadddi (__a, (int64_t) 67);
> +}
> +
> +/*
> +** sadd64_imm2:
> +** subs x([0-9]+), x0, 67
> +** mov x([0-9]+), -9223372036854775808
> +** csel x0, x\1, x\2, vc
> +** ret
> +*/
> +int64_t __attribute__((noipa))
> +sadd64_imm2 (int64_t __a)
> +{
> + return __builtin_aarch64_ssadddi (__a, (int64_t) -67);
> +}
> +
> +/*
> +** ssub64:
> +** asr x([0-9]+), x1, 63
> +** subs x([0-9]+), x0, x1
> +** eor x\1, x\1, -9223372036854775808
> +** csel x0, x\2, x\1, vc
> +** ret
> +*/
> +int64_t __attribute__((noipa))
> +ssub64 (int64_t __a, int64_t __b)
> +{
> + return __builtin_aarch64_sssubdi (__a, __b);
> +}
> +
> +/*
> +** ssub64_imm:
> +** subs x([0-9]+), x0, 67
> +** mov x([0-9]+), -9223372036854775808
> +** csel x0, x\1, x\2, vc
> +** ret
> +*/
> +int64_t __attribute__((noipa))
> +ssub64_imm (int64_t __a)
> +{
> + return __builtin_aarch64_sssubdi (__a, (int64_t) 67);
> +}
> +
> +/*
> +** ssub64_imm2:
> +** adds x([0-9]+), x0, #67
> +** mov x([0-9]+), 9223372036854775807
> +** csel x0, x\1, x\2, vc
> +** ret
> +*/
> +int64_t __attribute__((noipa))
> +ssub64_imm2 (int64_t __a)
> +{
> + return __builtin_aarch64_sssubdi (__a, (int64_t) -67);
> +}
> +
> +int
> +main (void)
> +{
> + /* Addition:
> + SAT_ADD(x, +ve), non-saturating
> + SAT_ADD(x, +ve), saturating
> + SAT_ADD(x, immediate +ve)
> + SAT_ADD(x, immediate -ve)
> + SAT_ADD(x, -ve), non-saturating
> + SAT_ADD(x, -ve), saturating
> +
> + Subtraction:
> + SAT_SUB(x, +ve), non-saturating
> + SAT_SUB(x, +ve), saturating
> + SAT_SUB(x, immediate +ve)
> + SAT_SUB(x, immediate -ve)
> + SAT_SUB(x, -ve), non-saturating */
> +
> + int32_t a = 4;
> + int32_t b = 70;
> + int32_t c = 2147483647;
> + int32_t d = (int32_t) -2147483648;
> +
> + if (sadd32 (a, b) != (a + b))
> + __builtin_abort ();
> + if (sadd32 (a, c) != c)
> + __builtin_abort ();
> + if (sadd32_imm (a) != (a + 67))
> + __builtin_abort ();
> + if (sadd32_imm2 (a) != (a - 67))
> + __builtin_abort ();
> + if (sadd32 (a, -b) != (a - b))
> + __builtin_abort ();
> + if (sadd32 (a, d) != (d + 4))
> + __builtin_abort ();
> +
> + if (ssub32 (a, b) != (a - b))
> + __builtin_abort ();
> + if (ssub32 (-a, c) != d)
> + __builtin_abort ();
> + if (ssub32_imm (a) != (a - 67))
> + __builtin_abort ();
> + if (ssub32_imm2 (a) != (a + 67))
> + __builtin_abort ();
> + if (ssub32 (a, -b) != (a + b))
> + __builtin_abort ();
> +
> + int64_t a_64 = a;
> + int64_t b_64 = b;
> + int64_t c_64 = (int64_t) 9223372036854775807;
> + int64_t d_64 = (int64_t) 0x8000000000000000;
> +
> + if (sadd64 (a_64, b_64) != (a_64 + b_64))
> + __builtin_abort ();
> + if (sadd64 (a_64, c_64) != c_64)
> + __builtin_abort ();
> + if (sadd64_imm (a_64) != (a_64 + 67))
> + __builtin_abort ();
> + if (sadd64_imm2 (a_64) != (a_64 - 67))
> + __builtin_abort ();
> + if (sadd64 (a_64, -b_64) != (a_64 - b_64))
> + __builtin_abort ();
> + if (sadd64 (a_64, d_64) != (d_64 + 4))
> + __builtin_abort ();
> +
> + if (ssub64 (a_64, b_64) != (a_64 - b_64))
> + __builtin_abort ();
> + if (ssub64 (-a_64, c_64) != d_64)
> + __builtin_abort ();
> + if (ssub64_imm (a_64) != (a_64 - 67))
> + __builtin_abort ();
> + if (ssub64_imm2 (a_64) != (a_64 + 67))
> + __builtin_abort ();
> + if (ssub64 (a_64, -b_64) != (a_64 + b_64))
> + __builtin_abort ();
> +
> + return 0;
> +}
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
> b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
> new file mode 100644
> index 00000000000..e979d535405
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic.inc
> @@ -0,0 +1,39 @@
> +/* Template file for scalar saturating arithmetic validation.
> +
> + This file defines scalar saturating addition and subtraction functions for a
> + given type. This type, along with the corresponding minimum and maximum
> + values for that type, must be defined by any test file which includes this
> + template file. */
> +
> +#ifndef SAT_ARIT_INC
> +#define SAT_ARIT_INC
> +
> +#include <limits.h>
> +
> +#ifndef UT
> +#define UT unsigned int
> +#define UMAX UINT_MAX
> +#define UMIN 0
> +#endif
> +
> +UT uadd (UT a, UT b)
> +{
> + UT sum = a + b;
> + return sum < a ? UMAX : sum;
> +}
> +
> +UT uadd2 (UT a, UT b)
> +{
> + UT c;
> + if (!__builtin_add_overflow (a, b, &c))
> + return c;
> + return UMAX;
> +}
> +
> +UT usub (UT a, UT b)
> +{
> + UT sum = a - b;
> + return sum > a ? UMIN : sum;
> +}
> +
> +#endif
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
> b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
> new file mode 100644
> index 00000000000..56873f99b81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_1.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +** dup v([0-9]+).8b, w0
> +** dup v([0-9]+).8b, w1
> +** uqadd b([0-9]+), (?:b\2, b\1|b\1, b\2)
> +** umov w0, v\3.b\[0\]
> +** ret
> +*/
> +/*
> +** uadd2:
> +** dup v([0-9]+).8b, w0
> +** dup v([0-9]+).8b, w1
> +** uqadd b([0-9]+), (?:b\2, b\1|b\1, b\2)
> +** umov w0, v\3.b\[0\]
> +** ret
> +*/
> +/*
> +** usub: { xfail *-*-* }
> +** dup v([0-9]+).8b, w0
> +** dup v([0-9]+).8b, w1
> +** uqsub b([0-9]+), b\1, b\2
> +** umov w0, v\3.b\[0\]
> +** ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned char
> +#define UMAX UCHAR_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
> b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
> new file mode 100644
> index 00000000000..a719aebbcf3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_2.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 --save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +** dup v([0-9]+).4h, w0
> +** dup v([0-9]+).4h, w1
> +** uqadd h([0-9]+), (?:h\2, h\1|h\1, h\2)
> +** umov w0, v\3.h\[0\]
> +** ret
> +*/
> +/*
> +** uadd2:
> +** dup v([0-9]+).4h, w0
> +** dup v([0-9]+).4h, w1
> +** uqadd h([0-9]+), (?:h\2, h\1|h\1, h\2)
> +** umov w0, v\3.h\[0\]
> +** ret
> +*/
> +/*
> +** usub: { xfail *-*-* }
> +** dup v([0-9]+).4h, w0
> +** dup v([0-9]+).4h, w1
> +** uqsub h([0-9]+), h\1, h\2
> +** umov w0, v\3.h\[0\]
> +** ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned short
> +#define UMAX USHRT_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
> b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
> new file mode 100644
> index 00000000000..21517254519
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_3.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +** adds\tw([0-9]+), w([0-9]+), w([0-9]+)
> +** csinv\tw\1, w\1, wzr, cc
> +** ret
> +*/
> +/*
> +** uadd2:
> +** adds\tw([0-9]+), w([0-9]+), w([0-9]+)
> +** csinv\tw\1, w\1, wzr, cc
> +** ret
> +*/
> +/*
> +** usub:
> +** subs\tw([0-9]+), w([0-9]+), w([0-9]+)
> +** csel\tw\1, w\1, wzr, cs
> +** ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned int
> +#define UMAX UINT_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
> b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
> new file mode 100644
> index 00000000000..363d0a79a73
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/saturating_arithmetic_4.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-options "-O2 --save-temps -ftree-vectorize" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +/*
> +** uadd:
> +** adds\tx([0-9]+), x([0-9]+), x([0-9]+)
> +** csinv\tx\1, x\1, xzr, cc
> +** ret
> +*/
> +/*
> +** uadd2:
> +** adds\tx([0-9]+), x([0-9]+), x([0-9]+)
> +** csinv\tx\1, x\1, xzr, cc
> +** ret
> +*/
> +/*
> +** usub:
> +** subs\tx([0-9]+), x([0-9]+), x([0-9]+)
> +** csel\tx\1, x\1, xzr, cs
> +** ret
> +*/
> +
> +#include <limits.h>
> +
> +#define UT unsigned long
> +#define UMAX ULONG_MAX
> +#define UMIN 0
> +
> +#include "saturating_arithmetic.inc"
> \ No newline at end of file
> --
> 2.34.1
>