Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

Richard Biener Mon, 04 Mar 2024 02:31:33 -0800

On Sat, Mar 2, 2024 at 8:46 AM Li, Pan2 <pan2...@intel.com> wrote:
>
> Hi Richard and Tamar,
>
> I have a try with DEF_INTERNAL_SIGNED_OPTAB_FN for SAT_ADD/SUB/MUL but meet 
> some problem when match.pd.
>
> For unsigned SAT_ADD = (x + y) | - ((x + y) < x), the match.pd can be 
> (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0)))).
> For unsigned SAT_SUB = x >= y ? x - y : 0, and then match.pd can be (cond (ge 
> @0 @1) (minus @0 @1) integer_zerop).
>
> For signed SAT_ADD/SAT_SUB as below, seems not easy to make the simplify 
> pattern works well as expected up to a point.
> sint64_t sat_add (sint64_t x, sint64_t y)
> {
>   sint64_t a = x ^ y;
>   sint64_t add = x + y;
>   sint64_t b = sum ^ x;
>
>   return (a < 0 || (a >= 0 && b >= 0)) ? add : (MAX_INT64 + (x < 0));
> }
>
> sint64_t sad_sub (sint64_t x, sint64_t y)
> {
>   sint64_t a = x ^ y;
>   sint64_t sub = x - y;
>   sint64_t b = sub ^ x;
>
>   return (a >= 0 || (a < 0 && b >= 0) ? sub : (MAX_INT64 + (x < 0));
> }
>
> For SAT_MUL as below, looks we may need widen type. I am not sure if we can 
> leverage MUL_OVERFLOW or not in match.pd.
>
> uint32_t sat_mul (uint32_t x, uint32_t y)
> {
>   uint64_t mul = (uint64_t)x * (uint64_t)y;
>   return mul > UINT32_MAX ? UINT32_MAX : (uint32_t)mul;
> }
>
> sint32_t sat_mul (sint32_t x, sint32_t y)
> {
>   sint64_t mul = (sint64_t)x * (sint64_t))y;
>
>   return mul <= MAX_INT32 && mul >= MIN_INT32 ? mul : MAX_INT32 + (x ^ y) > 0;
> }
>
> Below diff only contains unsigned SAT_ADD and SAT_SUB for prototype 
> validation.
> I will continue to try the rest part in match.pd and keep you posted.
>
> -------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 80efdf2b7e5..d9ad6fe2b58 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -132,6 +132,9 @@ extern void riscv_asm_output_external (FILE *, const 
> tree, const char *);
>  extern bool
>  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
>  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, rtx);
> +extern void riscv_expand_usadd (rtx, rtx, rtx);
> +extern void riscv_expand_ussub (rtx, rtx, rtx);
>
>  #ifdef RTX_CODE
>  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, bool 
> *invert_ptr = 0);
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 5e984ee2a55..795462526df 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -10655,6 +10655,28 @@ riscv_vector_mode_supported_any_target_p 
> (machine_mode)
>    return true;
>  }
>
> +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < x).  */
> +void
> +riscv_expand_usadd (rtx dest, rtx x, rtx y)
> +{
> +  fprintf (stdout, "Hit riscv_expand_usadd.\n");
> +  // ToDo
> +}
> +
> +void
> +riscv_expand_ussub (rtx dest, rtx x, rtx y)
> +{
> +  fprintf (stdout, "Hit riscv_expand_ussub.\n");
> +  // ToDo
> +}
> +
>  /* Initialize the GCC target structure.  */
>  #undef TARGET_ASM_ALIGNED_HI_OP
>  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> index 1fec13092e2..e2dbadb3ead 100644
> --- a/gcc/config/riscv/riscv.md
> +++ b/gcc/config/riscv/riscv.md
> @@ -3839,6 +3839,39 @@ (define_insn "*large_load_address"
>    [(set_attr "type" "load")
>     (set (attr "length") (const_int 8))])
>
> +(define_expand "usadd<mode>3"
> +  [(match_operand:ANYI 0 "register_operand")
> +   (match_operand:ANYI 1 "register_operand")
> +   (match_operand:ANYI 2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_usadd (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
> +(define_expand "ussub<mode>3"
> +  [(match_operand:ANYI 0 "register_operand")
> +   (match_operand:ANYI 1 "register_operand")
> +   (match_operand:ANYI 2 "register_operand")]
> +  ""
> +  {
> +    riscv_expand_ussub (operands[0], operands[1], operands[2]);
> +    DONE;
> +  }
> +)
> +
>  (include "bitmanip.md")
>  (include "crypto.md")
>  (include "sync.md")
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 848bb9dbff3..0fff19c875f 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -275,6 +275,13 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST | 
> ECF_NOTHROW, first,
>  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, first,
>                               smulhrs, umulhrs, binary)
>
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
> +                             ssadd, usadd, binary)
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST | ECF_NOTHROW, first,
> +                             sssub, ussub, binary)
> +
>  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
>  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
>  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> diff --git a/gcc/match.pd b/gcc/match.pd
> index f3fffd8dec2..6592dea643a 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -10276,3 +10276,32 @@ and,
>        }
>        (if (full_perm_p)
>         (vec_perm (op@3 @0 @1) @3 @2))))))
> +
> +#if GIMPLE
> +
> +/* Unsigned saturation add, aka:
> +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> +(simplify
> + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> +   (if (optimize
> +       && INTEGRAL_TYPE_P (type)
> +       && types_match (type, TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@1))
> +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> +       && direct_internal_fn_supported_p (IFN_SAT_ADD, type, 
> OPTIMIZE_FOR_BOTH))
> +   (IFN_SAT_ADD @0 @1)))
> +
> +/* Unsigned saturation sub , aka
> +   SAT_SUBU = x >= y ? x - y : 0.  */
> +(simplify
> +  (cond (ge @0 @1) (minus @0 @1) integer_zerop)
> +    (if (optimize
> +       && INTEGRAL_TYPE_P (type)
> +       && TYPE_UNSIGNED (TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@0))
> +       && types_match (type, TREE_TYPE (@1))
> +       && direct_internal_fn_supported_p (IFN_SAT_SUB, type, 
> OPTIMIZE_FOR_BOTH))
> +    (IFN_SAT_SUB @0 @1)))
> +
> +#endif
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index ad14f9328b9..bebe38c888b 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -111,15 +111,15 @@ OPTAB_NX(add_optab, "add$F$a3")
>  OPTAB_NX(add_optab, "add$Q$a3")
>  OPTAB_VL(addv_optab, "addv$I$a3", PLUS, "add", '3', gen_intv_fp_libfunc)
>  OPTAB_VX(addv_optab, "add$F$a3")
> -OPTAB_NL(ssadd_optab, "ssadd$Q$a3", SS_PLUS, "ssadd", '3', 
> gen_signed_fixed_libfunc)
> -OPTAB_NL(usadd_optab, "usadd$Q$a3", US_PLUS, "usadd", '3', 
> gen_unsigned_fixed_libfunc)
> +OPTAB_NL(ssadd_optab, "ssadd$a3", SS_PLUS, "ssadd", '3', gen_int_libfunc)
> +OPTAB_NL(usadd_optab, "usadd$a3", US_PLUS, "usadd", '3', gen_int_libfunc)
>  OPTAB_NL(sub_optab, "sub$P$a3", MINUS, "sub", '3', gen_int_fp_fixed_libfunc)
>  OPTAB_NX(sub_optab, "sub$F$a3")
>  OPTAB_NX(sub_optab, "sub$Q$a3")
>  OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
>  OPTAB_VX(subv_optab, "sub$F$a3")
> -OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', 
> gen_signed_fixed_libfunc)
> -OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', 
> gen_unsigned_fixed_libfunc)
> +OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', gen_int_libfunc)
> +OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', gen_int_libfunc)


I do wonder what the existing usadd patterns with integer vector modes
in various targets do?
Those define_insn will at least not end up in the optab set I guess,
so they must end up
being either unused or used by explicit gen_* (via intrinsic
functions?) or by combine?

I think simply changing gen_*_fixed_libfunc to gen_int_libfunc won't
work.  Since there's
no libgcc support I'd leave it as gen_*_fixed_libfunc thus no library
fallback for integers?

>  OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
>  OPTAB_NX(smul_optab, "mul$P$a3")
>  OPTAB_NX(smul_optab, "mul$F$a3")
>
> Pan
>
> -----Original Message-----
> From: Li, Pan2 <pan2...@intel.com>
> Sent: Tuesday, February 27, 2024 10:36 PM
> To: Richard Biener <richard.guent...@gmail.com>; Tamar Christina 
> <tamar.christ...@arm.com>
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang 
> <yanzhang.w...@intel.com>; kito.ch...@gmail.com; richard.sandiford@arm.com2; 
> jeffreya...@gmail.com
> Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation 
> US_PLUS
>
> Thanks Richard and Tammer for moving this forward.
>
> > That said, I would like to see the bigger picture to be kept in mind
> > before altering the GIMPLE IL.
>
> > Adding an internal function for an already present optab is a
> > no-brainer.  Adding a vectorizer
> > and/or if-conversion pattern to make use of this during vectorization
> > is existing practice.
> > Adding pattern recognition to ISEL or widening-mul passes for
> > instructions the CPU can do
> > is existing practice and OK.
>
> Thanks for explaining, got the point here.
>
> > So I'd suggest writing some example of both signed and unsigned saturating 
> > add and multiply
>
> > Because signed addition, will likely require a branch and signed 
> > multiplication would require a
> > larger type.
>
> Ack, will prepare one prototype validation patch for add, sub and mul (both 
> unsigned and signed) soon.
>
> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guent...@gmail.com>
> Sent: Tuesday, February 27, 2024 9:42 PM
> To: Tamar Christina <tamar.christ...@arm.com>
> Cc: Li, Pan2 <pan2...@intel.com>; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai; Wang, Yanzhang <yanzhang.w...@intel.com>; 
> kito.ch...@gmail.com; richard.sandiford@arm.com2; jeffreya...@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation 
> US_PLUS
>
> On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina <tamar.christ...@arm.com> 
> wrote:
> >
> > > Thanks Tamar.
> > >
> > > > Those two cases also *completely* stop vectorization because of either 
> > > > the
> > > > control flow or the fact the vectorizer can't handle complex types.
> > >
> > > Yes, we eventually would like to vectorize the SAT ALU but we start with 
> > > scalar part
> > > first.
> > > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > > well with some additions as below.
> > > Feel free to correct me if any misunderstandings.
> > >
> > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > > usadd$a3(as well as gen_int_libfunc) for int.
> > > 2. We need to implement a default implementation of SAT_ADD if
> > > direct_binary_optab_supported_p is false.
> > >     It looks like the default implementation is difficult to make every 
> > > backend happy.
> > > That is why you suggest just normal
> > >     DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> > >
> > > Thanks Richard.
> > >
> > > > But what I'd like to see is that we do more instruction selection on 
> > > > GIMPLE
> > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > passes doing what I'd call instruction selection).  But that means not 
> > > > adding
> > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > machinery for that.
> > >
> > > > So as a start I would go for a direct optab and see to recognize it 
> > > > during
> > > > ISEL?
> > >
> > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, 
> > > good
> > > to know isel and I am happy to
> > > try that once we have conclusion.
> > >
> >
> > So after a lively discussion on IRC, the conclusion is that before we 
> > proceed Richi would
> > like to see some examples of various operations.  The problem is that 
> > unsigned saturating
> > addition is the simplest example and it may lead to an implementation 
> > strategy that doesn't
> > scale.
> >
> > So I'd suggest writing some example of both signed and unsigned saturating 
> > add and multiply
> >
> > Because signed addition, will likely require a branch and signed 
> > multiplication would require a
> > larger type.
> >
> > This would allow us to better understand what kind of gimple would have to 
> > to deal with in
> > ISEL and VECT if we decide not to lower early.
>
> More specifically before making something like .SAT_ADD a core part of
> GIMPLE I'd like
> to point out that we have saturating PLUS_EXPR but just for
> fixed-point types.  I realize
> Joseph thinks that keying this on the type was wrong and it should
> have used integer
> types and special saturating operations.  Still having both,
> type-keyed saturating PLUS_EXPR
> and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.
>
> It might be that the way to go is to turn all existing saturating type
> *_EXPR into
> .SAT_* internal function calls, in the end mapping to the optabs and
> eventual RTX codes.
>
> That could work for both integer types and fixed-point types.
>
> I'll also note that "saturating" is just another variant of overflow
> behavior of which we have
> trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
> also (kind-of) sanitized.
> We do lack direct IL representation of -ftrapv and -fwrapv, the
> semantics on a PLUS_EXPR
> depend on per-function flags.  Eventually a common representation
> could be found here.
> For saturating I was thinking of .ADD_OVERFLOW (a, b,
> saturation-value), a "trap" could
> be a "trapping" saturation value, "undefined" could be a
> "not-a-thing".  But I didn't think much
> about this.
>
> That said, I would like to see the bigger picture to be kept in mind
> before altering the GIMPLE IL.
>
> Adding an internal function for an already present optab is a
> no-brainer.  Adding a vectorizer
> and/or if-conversion pattern to make use of this during vectorization
> is existing practice.
> Adding pattern recognition to ISEL or widening-mul passes for
> instructions the CPU can do
> is existing practice and OK.
>
> Thanks,
> Richard.
>
>
> > Thanks,
> > Tamar
> >
> > > Pan
> > >
> > > -----Original Message-----
> > > From: Tamar Christina <tamar.christ...@arm.com>
> > > Sent: Tuesday, February 27, 2024 5:57 PM
> > > To: Richard Biener <richard.guent...@gmail.com>
> > > Cc: Li, Pan2 <pan2...@intel.com>; gcc-patches@gcc.gnu.org; 
> > > juzhe.zh...@rivai.ai;
> > > Wang, Yanzhang <yanzhang.w...@intel.com>; kito.ch...@gmail.com;
> > > richard.sandiford@arm.com2; jeffreya...@gmail.com
> > > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn 
> > > saturation
> > > US_PLUS
> > >
> > > > -----Original Message-----
> > > > From: Richard Biener <richard.guent...@gmail.com>
> > > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > > To: Tamar Christina <tamar.christ...@arm.com>
> > > > Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai;
> > > > yanzhang.w...@intel.com; kito.ch...@gmail.com;
> > > > richard.sandiford@arm.com2; jeffreya...@gmail.com
> > > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn 
> > > > saturation
> > > > US_PLUS
> > > >
> > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > > > <tamar.christ...@arm.com> wrote:
> > > > >
> > > > > Hi Pan,
> > > > >
> > > > > > From: Pan Li <pan2...@intel.com>
> > > > > >
> > > > > > Hi Richard & Tamar,
> > > > > >
> > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > > > >
> > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, 
> > > > > > given
> > > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) 
> > > > > > already.
> > > > > >
> > > > >
> > > > > I think a couple of things are being confused here.  So lets break it 
> > > > > down:
> > > > >
> > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > > > we only want one internal function for both signed and unsigned 
> > > > > SAT_ADD.
> > > > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > > > >
> > > > > Now the downside of this is that this is a direct internal optab.  
> > > > > This means
> > > > > that for the representation to be used the target *must* have the 
> > > > > optab
> > > > > implemented.   This is a bit annoying because it doesn't allow us to 
> > > > > generically
> > > > > assume that all targets use SAT_ADD for saturating add and thus only 
> > > > > have to
> > > > > write optimization for this representation.
> > > > >
> > > > > This is why Richi said we may need to use a new tree_code because we 
> > > > > can
> > > > > override tree code expansions.  However the same can be done with the
> > > _EXT_FN
> > > > > internal functions.
> > > > >
> > > > > So what I meant was that we want to have a combination of the two. 
> > > > > i.e. a
> > > > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> > > >
> > > > Whether we want/need _EXT or only direct depends mainly on how we want 
> > > > to
> > > > leverage support.  If it's only during vectorization and possibly 
> > > > instruction
> > > > selection a direct optab is IMO the way to go.  Generic optimization 
> > > > only
> > > > marginally improves when you explode the number of basic operations you
> > > > expose - in fact it gets quite unwieldly to support all of them in
> > > > simplifications
> > > > and/or canonicalization and you possibly need to translate them back to 
> > > > what
> > > > the target CPU supports.
> > > >
> > > > We already do have too many (IMO) "special" operations exposed "early"
> > > > in the GIMPLE pipeline.
> > > >
> > > > But what I'd like to see is that we do more instruction selection on 
> > > > GIMPLE
> > > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > > passes doing what I'd call instruction selection).  But that means not 
> > > > adding
> > > > match.pd patterns for that or at least have a separate isel-match.pd
> > > > machinery for that.
> > > >
> > > > So as a start I would go for a direct optab and see to recognize it 
> > > > during
> > > > ISEL?
> > > >
> > >
> > > The problem with ISEL and the reason I suggested an indirect IFN is that 
> > > there
> > > Are benefit to be had from recognizing it early.  Saturating arithmetic 
> > > can be
> > > optimized
> > > Differently from non-saturating ones.
> > >
> > > But additionally a common way of specifying them decomposes to branches
> > > and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).
> > >
> > > These two representation can be detected in PHI-opts and it's beneficial 
> > > to all
> > > targets to canonicalize them to the branchless code.
> > >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> > >
> > > So really, gimple ISEL would fix just 1 of the 3 very common cases, and 
> > > then
> > > We'd still need to hack the vectorizer cost models for targets with 
> > > saturating
> > > vector instructions.
> > >
> > > I of course defer to you, but it seems quite suboptimal to do it this way 
> > > and
> > > doesn't get us first class saturation support.
> > >
> > > Additionally there have been discussions whether both clang and gcc should
> > > provide __builtin_saturate_* methods, which the non-direct IFN would help
> > > support.
> > >
> > > Tamar.
> > >
> > > > > If Richi agrees, the below is what I meant. It creates the 
> > > > > infrastructure for this
> > > > > and for now only allows a default fallback for unsigned saturating 
> > > > > add and
> > > makes
> > > > > it easier for us to add the rest later
> > > > >
> > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and 
> > > > > ss_plus are
> > > > the
> > > > > RTL expression, but the optab for saturation are ssadd and usadd.  So 
> > > > > you don't
> > > > > need to make new us_plus and ss_plus ones.
> > > > >
> > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > index a07f25f3aee..aaf9f8991b3 100644
> > > > > --- a/gcc/internal-fn.cc
> > > > > +++ b/gcc/internal-fn.cc
> > > > > @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn,
> > > > tree_pair types,
> > > > >         return direct_##TYPE##_optab_supported_p (which_optab, types, 
> > > > >   \
> > > > >                                                   opt_type);          
> > > > >   \
> > > > >        }
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                        UNSIGNED_OPTAB, TYPE)        
> > > > >   \
> > > > > +    case IFN_##CODE:                                                 
> > > > >   \
> > > > > +      {                                                              
> > > > >           \
> > > > > +       optab which_optab = (TYPE_UNSIGNED (types.SELECTOR)           
> > > > >   \
> > > > > +                            ? UNSIGNED_OPTAB ## _optab               
> > > > >   \
> > > > > +                            : SIGNED_OPTAB ## _optab);               
> > > > >   \
> > > > > +       return direct_##TYPE##_optab_supported_p (which_optab, types, 
> > > > >   \
> > > > > +                                                 opt_type)           
> > > > >   \
> > > > > +              || internal_##CODE##_fn_supported_p (types.SELECTOR, 
> > > > > opt_type); \
> > > > > +      }
> > > > >  #include "internal-fn.def"
> > > > >
> > > > >      case IFN_LAST:
> > > > > @@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
> > > > >      optab which_optab = direct_internal_fn_optab (fn, types);        
> > > > >   \
> > > > >      expand_##TYPE##_optab_fn (fn, stmt, which_optab);                
> > > > >   \
> > > > >    }
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                        UNSIGNED_OPTAB, TYPE)
> > > > >  #include "internal-fn.def"
> > > > >
> > > > >  /* Routines to expand each internal function, indexed by function 
> > > > > number.
> > > > > @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > >        emit_move_insn (plhs, cmp);
> > > > >      }
> > > > >  }
> > > > > +
> > > > > +void
> > > > > +expand_SAT_ADD (internal_fn fn, gcall *stmt)
> > > > > +{
> > > > > +  /* Check if the target supports the expansion through an IFN.  */
> > > > > +  tree_pair types = direct_internal_fn_types (fn, stmt);
> > > > > +  optab which_optab = direct_internal_fn_optab (fn, types);
> > > > > +  if (direct_binary_optab_supported_p (which_optab, types,
> > > > > +                                      insn_optimization_type ()))
> > > > > +    {
> > > > > +      expand_binary_optab_fn (fn, stmt, which_optab);
> > > > > +      return;
> > > > > +    }
> > > > > +
> > > > > +  /* Target does not support the optab, but we can de-compose it.  */
> > > > > +  /*
> > > > > +  ... decompose to a canonical representation ...
> > > > > +  if (TYPE_UNSIGNED (types.SELECTOR))
> > > > > +    {
> > > > > +      ...
> > > > > +      decompose back to (X + Y) | - ((X + Y) < X)
> > > > > +    }
> > > > > +  else
> > > > > +    {
> > > > > +      ...
> > > > > +    }
> > > > > +  */
> > > > > +}
> > > > > +
> > > > > +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /*
> > > > optype */)
> > > > > +{
> > > > > +  /* For now, don't support decomposing vector ops.  */
> > > > > +  if (VECTOR_TYPE_P (type))
> > > > > +    return false;
> > > > > +
> > > > > +  /* Signed saturating arithmetic is harder to do since we'll so for 
> > > > > now
> > > > > +     lets ignore.  */
> > > > > +  if (!TYPE_UNSIGNED (type))
> > > > > +    return false;
> > > > > +
> > > > > +  return TREE_CODE (type) == INTEGER_TYPE;
> > > > > +}
> > > > > \ No newline at end of file
> > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > index c14d30365c1..5a2491228d5 100644
> > > > > --- a/gcc/internal-fn.def
> > > > > +++ b/gcc/internal-fn.def
> > > > > @@ -92,6 +92,10 @@ along with GCC; see the file COPYING3.  If not see
> > > > >     unsigned inputs respectively, both without the trailing "_optab".
> > > > >     SELECTOR says which type in the tree_pair determines the 
> > > > > signedness.
> > > > >
> > > > > +   DEF_INTERNAL_SIGNED_OPTAB_EXT_FN is like
> > > > DEF_INTERNAL_SIGNED_OPTAB_FN, except
> > > > > +   that it has expand_##NAME defined in internal-fn.cc to override 
> > > > > the
> > > > > +   DEF_INTERNAL_SIGNED_OPTAB_FN expansion behavior.
> > > > > +
> > > > >     DEF_INTERNAL_FLT_FN is like DEF_INTERNAL_OPTAB_FN, but in 
> > > > > addition,
> > > > >     the function implements the computational part of a built-in math
> > > > >     function BUILT_IN_<NAME>{F,,L}.  Unlike some built-in functions,
> > > > > @@ -153,6 +157,13 @@ along with GCC; see the file COPYING3.  If not 
> > > > > see
> > > > >    DEF_INTERNAL_FN (NAME, FLAGS | ECF_LEAF, NULL)
> > > > >  #endif
> > > > >
> > > > > +#ifndef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > > +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(NAME, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                                    UNSIGNED_OPTAB, TYPE) \
> > > > > +  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR,
> > > > SIGNED_OPTAB, \
> > > > > +                               UNSIGNED_OPTAB, TYPE)
> > > > > +#endif
> > > > > +
> > > > >  #ifndef DEF_INTERNAL_FLT_FN
> > > > >  #define DEF_INTERNAL_FLT_FN(NAME, FLAGS, OPTAB, TYPE) \
> > > > >    DEF_INTERNAL_OPTAB_FN (NAME, FLAGS, OPTAB, TYPE)
> > > > > @@ -274,6 +285,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS,
> > > > ECF_CONST | ECF_NOTHROW, first,
> > > > >                               smulhs, umulhs, binary)
> > > > >  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> > > > first,
> > > > >                               smulhrs, umulhrs, binary)
> > > > > +DEF_INTERNAL_SIGNED_OPTAB_EXT_FN (SAT_ADD, ECF_CONST |
> > > > ECF_NOTHROW, first,
> > > > > +                                 ssadd, usadd, binary)
> > > > >
> > > > >  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
> > > > >  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
> > > > > @@ -593,5 +606,6 @@ DEF_INTERNAL_FN (BITINTTOFLOAT, ECF_PURE |
> > > > ECF_LEAF, ". R . ")
> > > > >  #undef DEF_INTERNAL_FLT_FN
> > > > >  #undef DEF_INTERNAL_FLT_FLOATN_FN
> > > > >  #undef DEF_INTERNAL_SIGNED_OPTAB_FN
> > > > > +#undef DEF_INTERNAL_SIGNED_OPTAB_EXT_FN
> > > > >  #undef DEF_INTERNAL_OPTAB_FN
> > > > >  #undef DEF_INTERNAL_FN
> > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > index bccee1c3e09..dbdb1e6bad2 100644
> > > > > --- a/gcc/internal-fn.h
> > > > > +++ b/gcc/internal-fn.h
> > > > > @@ -263,6 +263,8 @@ extern void expand_DIVMODBITINT (internal_fn, 
> > > > > gcall
> > > > *);
> > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > +extern void expand_SAT_ADD (internal_fn, gcall *);
> > > > > +extern bool internal_SAT_ADD_fn_supported_p (tree, 
> > > > > optimization_type);
> > > > >
> > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > >
> > > > > > Note this patch is a draft for validation, no test are invovled 
> > > > > > here.
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > >       * builtins.def (BUILT_IN_US_PLUS): Add builtin def.
> > > > > >       (BUILT_IN_US_PLUSIMAX): Ditto.
> > > > > >       (BUILT_IN_US_PLUSL): Ditto.
> > > > > >       (BUILT_IN_US_PLUSLL): Ditto.
> > > > > >       (BUILT_IN_US_PLUSG): Ditto.
> > > > > >       * config/riscv/riscv-protos.h (riscv_expand_us_plus): Add new
> > > > > >       func decl for expanding us_plus.
> > > > > >       * config/riscv/riscv.cc (riscv_expand_us_plus): Add new func
> > > > > >       impl for expanding us_plus.
> > > > > >       * config/riscv/riscv.md (us_plus<mode>3): Add new pattern impl
> > > > > >       us_plus<mode>3.
> > > > > >       * internal-fn.cc (expand_US_PLUS): Add new func impl to expand
> > > > > >       US_PLUS.
> > > > > >       * internal-fn.def (US_PLUS): Add new INT_EXT_FN.
> > > > > >       * internal-fn.h (expand_US_PLUS): Add new func decl.
> > > > > >       * match.pd: Add new simplify pattern for us_plus.
> > > > > >       * optabs.def (OPTAB_NL): Add new OPTAB_NL to US_PLUS rtl.
> > > > > >
> > > > > > Signed-off-by: Pan Li <pan2...@intel.com>
> > > > > > ---
> > > > > >  gcc/builtins.def                |  7 +++++
> > > > > >  gcc/config/riscv/riscv-protos.h |  1 +
> > > > > >  gcc/config/riscv/riscv.cc       | 46 
> > > > > > +++++++++++++++++++++++++++++++++
> > > > > >  gcc/config/riscv/riscv.md       | 11 ++++++++
> > > > > >  gcc/internal-fn.cc              | 26 +++++++++++++++++++
> > > > > >  gcc/internal-fn.def             |  3 +++
> > > > > >  gcc/internal-fn.h               |  1 +
> > > > > >  gcc/match.pd                    | 17 ++++++++++++
> > > > > >  gcc/optabs.def                  |  2 ++
> > > > > >  9 files changed, 114 insertions(+)
> > > > > >
> > > > > > diff --git a/gcc/builtins.def b/gcc/builtins.def
> > > > > > index f6f3e104f6a..0777b912cfa 100644
> > > > > > --- a/gcc/builtins.def
> > > > > > +++ b/gcc/builtins.def
> > > > > > @@ -1055,6 +1055,13 @@ DEF_GCC_BUILTIN
> > > > (BUILT_IN_POPCOUNTIMAX,
> > > > > > "popcountimax", BT_FN_INT_UINTMAX
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTL, "popcountl",
> > > > BT_FN_INT_ULONG,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTLL, "popcountll",
> > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_POPCOUNTG, "popcountg",
> > > > BT_FN_INT_VAR,
> > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > +
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUS, "us_plus", 
> > > > > > BT_FN_INT_UINT,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSIMAX, "us_plusimax",
> > > > > > BT_FN_INT_UINTMAX, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSL, "us_plusl",
> > > BT_FN_INT_ULONG,
> > > > > > ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSLL, "us_plusll",
> > > > > > BT_FN_INT_ULONGLONG, ATTR_CONST_NOTHROW_LEAF_LIST)
> > > > > > +DEF_GCC_BUILTIN        (BUILT_IN_US_PLUSG, "us_plusg", 
> > > > > > BT_FN_INT_VAR,
> > > > > > ATTR_CONST_NOTHROW_TYPEGENERIC_LEAF)
> > > > > > +
> > > > > >  DEF_EXT_LIB_BUILTIN    (BUILT_IN_POSIX_MEMALIGN, "posix_memalign",
> > > > > > BT_FN_INT_PTRPTR_SIZE_SIZE, ATTR_NOTHROW_NONNULL_LEAF)
> > > > > >  DEF_GCC_BUILTIN        (BUILT_IN_PREFETCH, "prefetch",
> > > > > > BT_FN_VOID_CONST_PTR_VAR, ATTR_NOVOPS_LEAF_LIST)
> > > > > >  DEF_LIB_BUILTIN        (BUILT_IN_REALLOC, "realloc", 
> > > > > > BT_FN_PTR_PTR_SIZE,
> > > > > > ATTR_ALLOC_WARN_UNUSED_RESULT_SIZE_2_NOTHROW_LEAF_LIST)
> > > > > > diff --git a/gcc/config/riscv/riscv-protos.h 
> > > > > > b/gcc/config/riscv/riscv-protos.h
> > > > > > index 80efdf2b7e5..ba6086f1f25 100644
> > > > > > --- a/gcc/config/riscv/riscv-protos.h
> > > > > > +++ b/gcc/config/riscv/riscv-protos.h
> > > > > > @@ -132,6 +132,7 @@ extern void riscv_asm_output_external (FILE *, 
> > > > > > const
> > > > tree,
> > > > > > const char *);
> > > > > >  extern bool
> > > > > >  riscv_zcmp_valid_stack_adj_bytes_p (HOST_WIDE_INT, int);
> > > > > >  extern void riscv_legitimize_poly_move (machine_mode, rtx, rtx, 
> > > > > > rtx);
> > > > > > +extern void riscv_expand_us_plus (rtx, rtx, rtx);
> > > > > >
> > > > > >  #ifdef RTX_CODE
> > > > > >  extern void riscv_expand_int_scc (rtx, enum rtx_code, rtx, rtx, 
> > > > > > bool
> > > *invert_ptr
> > > > =
> > > > > > 0);
> > > > > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > > > > > index 4100abc9dd1..23f08974f07 100644
> > > > > > --- a/gcc/config/riscv/riscv.cc
> > > > > > +++ b/gcc/config/riscv/riscv.cc
> > > > > > @@ -10657,6 +10657,52 @@ riscv_vector_mode_supported_any_target_p
> > > > > > (machine_mode)
> > > > > >    return true;
> > > > > >  }
> > > > > >
> > > > > > +/* Emit insn for the saturation addu, aka (x + y) | - ((x + y) < 
> > > > > > x).  */
> > > > > > +void
> > > > > > +riscv_expand_us_plus (rtx dest, rtx x, rtx y)
> > > > > > +{
> > > > > > +  machine_mode mode = GET_MODE (dest);
> > > > > > +  rtx pmode_sum = gen_reg_rtx (Pmode);
> > > > > > +  rtx pmode_lt = gen_reg_rtx (Pmode);
> > > > > > +  rtx pmode_x = gen_lowpart (Pmode, x);
> > > > > > +  rtx pmode_y = gen_lowpart (Pmode, y);
> > > > > > +  rtx pmode_dest = gen_reg_rtx (Pmode);
> > > > > > +
> > > > > > +  /* Step-1: sum = x + y  */
> > > > > > +  if (mode == SImode && mode != Pmode)
> > > > > > +    { /* Take addw to avoid the sum truncate.  */
> > > > > > +      rtx simode_sum = gen_reg_rtx (SImode);
> > > > > > +      riscv_emit_binary (PLUS, simode_sum, x, y);
> > > > > > +      emit_move_insn (pmode_sum, gen_lowpart (Pmode, simode_sum));
> > > > > > +    }
> > > > > > +  else
> > > > > > +    riscv_emit_binary (PLUS, pmode_sum, pmode_x, pmode_y);
> > > > > > +
> > > > > > +  /* Step-1.1: truncate sum for HI and QI as we have no insn for 
> > > > > > add QI/HI.
> > > */
> > > > > > +  if (mode == HImode || mode == QImode)
> > > > > > +    {
> > > > > > +      int mode_bits = GET_MODE_BITSIZE (mode).to_constant ();
> > > > > > +      int shift_bits = GET_MODE_BITSIZE (Pmode) - mode_bits;
> > > > > > +
> > > > > > +      gcc_assert (shift_bits > 0);
> > > > > > +
> > > > > > +      riscv_emit_binary (ASHIFT, pmode_sum, pmode_sum, GEN_INT
> > > > (shift_bits));
> > > > > > +      riscv_emit_binary (LSHIFTRT, pmode_sum, pmode_sum, GEN_INT
> > > > > > (shift_bits));
> > > > > > +    }
> > > > > > +
> > > > > > +  /* Step-2: lt = sum < x  */
> > > > > > +  riscv_emit_binary (LTU, pmode_lt, pmode_sum, pmode_x);
> > > > > > +
> > > > > > +  /* Step-3: lt = -lt  */
> > > > > > +  riscv_emit_unary (NEG, pmode_lt, pmode_lt);
> > > > > > +
> > > > > > +  /* Step-4: pmode_dest = sum | lt  */
> > > > > > +  riscv_emit_binary (IOR, pmode_dest, pmode_lt, pmode_sum);
> > > > > > +
> > > > > > +  /* Step-5: dest = pmode_dest */
> > > > > > +  emit_move_insn (dest, gen_lowpart (mode, pmode_dest));
> > > > > > +}
> > > > > > +
> > > > > >  /* Initialize the GCC target structure.  */
> > > > > >  #undef TARGET_ASM_ALIGNED_HI_OP
> > > > > >  #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
> > > > > > diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
> > > > > > index 3f7a023d941..eaa9867023c 100644
> > > > > > --- a/gcc/config/riscv/riscv.md
> > > > > > +++ b/gcc/config/riscv/riscv.md
> > > > > > @@ -3841,6 +3841,17 @@ (define_insn "*large_load_address"
> > > > > >    [(set_attr "type" "load")
> > > > > >     (set (attr "length") (const_int 8))])
> > > > > >
> > > > > > +(define_expand "us_plus<mode>3"
> > > > > > +  [(match_operand:ANYI   0 "register_operand")
> > > > > > +   (match_operand:ANYI   1 "register_operand")
> > > > > > +   (match_operand:ANYI   2 "register_operand")]
> > > > > > +  ""
> > > > > > +  {
> > > > > > +    riscv_expand_us_plus (operands[0], operands[1], operands[2]);
> > > > > > +    DONE;
> > > > > > +  }
> > > > > > +)
> > > > > > +
> > > > > >  (include "bitmanip.md")
> > > > > >  (include "crypto.md")
> > > > > >  (include "sync.md")
> > > > > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> > > > > > index a07f25f3aee..a7341a57ffa 100644
> > > > > > --- a/gcc/internal-fn.cc
> > > > > > +++ b/gcc/internal-fn.cc
> > > > > > @@ -5177,3 +5177,29 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
> > > > > >        emit_move_insn (plhs, cmp);
> > > > > >      }
> > > > > >  }
> > > > > > +
> > > > > > +void
> > > > > > +expand_US_PLUS (internal_fn fn, gcall *stmt)
> > > > > > +{
> > > > > > +  tree lhs = gimple_call_lhs (stmt);
> > > > > > +  tree rhs_0 = gimple_call_arg (stmt, 0);
> > > > > > +  tree rhs_1 = gimple_call_arg (stmt, 1);
> > > > > > +
> > > > > > +  do_pending_stack_adjust ();
> > > > > > +
> > > > > > +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> > > > > > +  rtx op_0 = expand_normal (rhs_0);
> > > > > > +  rtx op_1 = expand_normal (rhs_1);
> > > > > > +
> > > > > > +  class expand_operand ops[3];
> > > > > > +
> > > > > > +  create_output_operand (&ops[0], target, TYPE_MODE (TREE_TYPE 
> > > > > > (lhs)));
> > > > > > +  create_output_operand (&ops[1], op_0, TYPE_MODE (TREE_TYPE
> > > (rhs_0)));
> > > > > > +  create_output_operand (&ops[2], op_1, TYPE_MODE (TREE_TYPE
> > > (rhs_1)));
> > > > > > +
> > > > > > +  insn_code code = optab_handler (us_plus_optab, TYPE_MODE 
> > > > > > (TREE_TYPE
> > > > > > (rhs_0)));
> > > > > > +  expand_insn (code, 3, ops);
> > > > > > +
> > > > > > +  if (!rtx_equal_p (target, ops[0].value))
> > > > > > +    emit_move_insn (target, ops[0].value);
> > > > > > +}
> > > > >
> > > > > This can be simplified by calling expand_binary_optab_fn instead. See 
> > > > > my
> > > > template
> > > > >
> > > > > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> > > > > > index c14d30365c1..b1d7b5a0307 100644
> > > > > > --- a/gcc/internal-fn.def
> > > > > > +++ b/gcc/internal-fn.def
> > > > > > @@ -447,6 +447,9 @@ DEF_INTERNAL_INT_FN (FFS, ECF_CONST |
> > > > > > ECF_NOTHROW, ffs, unary)
> > > > > >  DEF_INTERNAL_INT_FN (PARITY, ECF_CONST | ECF_NOTHROW, parity,
> > > unary)
> > > > > >  DEF_INTERNAL_INT_EXT_FN (POPCOUNT, ECF_CONST | ECF_NOTHROW,
> > > > > > popcount, unary)
> > > > > >
> > > > > > +/* Binary integer ops.  */
> > > > > > +DEF_INTERNAL_INT_EXT_FN (US_PLUS, ECF_CONST | ECF_NOTHROW,
> > > > us_plus,
> > > > > > binary)
> > > > > > +
> > > > > >  DEF_INTERNAL_FN (GOMP_TARGET_REV, ECF_NOVOPS | ECF_LEAF |
> > > > > > ECF_NOTHROW, NULL)
> > > > > >  DEF_INTERNAL_FN (GOMP_USE_SIMT, ECF_NOVOPS | ECF_LEAF |
> > > > > > ECF_NOTHROW, NULL)
> > > > > >  DEF_INTERNAL_FN (GOMP_SIMT_ENTER, ECF_LEAF | ECF_NOTHROW,
> > > NULL)
> > > > > > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> > > > > > index bccee1c3e09..46e404b4a49 100644
> > > > > > --- a/gcc/internal-fn.h
> > > > > > +++ b/gcc/internal-fn.h
> > > > > > @@ -263,6 +263,7 @@ extern void expand_DIVMODBITINT (internal_fn,
> > > gcall
> > > > *);
> > > > > >  extern void expand_FLOATTOBITINT (internal_fn, gcall *);
> > > > > >  extern void expand_BITINTTOFLOAT (internal_fn, gcall *);
> > > > > >  extern void expand_POPCOUNT (internal_fn, gcall *);
> > > > > > +extern void expand_US_PLUS (internal_fn, gcall *);
> > > > > >
> > > > > >  extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
> > > > > >
> > > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > > index c5b6540f939..f45fd58ad23 100644
> > > > > > --- a/gcc/match.pd
> > > > > > +++ b/gcc/match.pd
> > > > > > @@ -10265,3 +10265,20 @@ and,
> > > > > >        }
> > > > > >        (if (full_perm_p)
> > > > > >       (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > +
> > > > > > +#if GIMPLE
> > > > > > +
> > > > > > +/* Unsigned saturation add, aka:
> > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < X) or
> > > > > > +   SAT_ADDU = (X + Y) | - ((X + Y) < Y).  */
> > > > > > +(simplify
> > > > > > + (bit_ior:c (plus:c@2 @0 @1) (negate (convert (lt @2 @0))))
> > > > > > +   (if (optimize
> > > > > > +     && INTEGRAL_TYPE_P (type)
> > > > > > +     && TYPE_UNSIGNED (TREE_TYPE (@0))
> > > > > > +     && types_match (type, TREE_TYPE (@0))
> > > > > > +     && types_match (type, TREE_TYPE (@1))
> > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > > > OPTIMIZE_FOR_BOTH))
> > > > > > +   (IFN_US_PLUS @0 @1)))
> > > > > > +
> > > > > > +#endif
> > > > >
> > > > > With the version above you can drop the #if GIMPLE and the
> > > > > > +     && direct_internal_fn_supported_p (IFN_US_PLUS, type,
> > > > >
> > > > > Check.
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > > diff --git a/gcc/optabs.def b/gcc/optabs.def
> > > > > > index ad14f9328b9..5855c4e0834 100644
> > > > > > --- a/gcc/optabs.def
> > > > > > +++ b/gcc/optabs.def
> > > > > > @@ -179,6 +179,8 @@ OPTAB_NL(clrsb_optab, "clrsb$a2", CLRSB, 
> > > > > > "clrsb",
> > > '2',
> > > > > > gen_int_libfunc)
> > > > > >  OPTAB_NL(popcount_optab, "popcount$a2", POPCOUNT, "popcount", '2',
> > > > > > gen_int_libfunc)
> > > > > >  OPTAB_NL(parity_optab, "parity$a2", PARITY, "parity", '2', 
> > > > > > gen_int_libfunc)
> > > > > >
> > > > > > +OPTAB_NL(us_plus_optab, "us_plus$a3", US_PLUS, "us_plus", '3',
> > > > > > gen_int_libfunc)
> > > > > > +
> > > > > >  /* Comparison libcalls for integers MUST come in pairs, 
> > > > > > signed/unsigned.  */
> > > > > >  OPTAB_NL(cmp_optab, NULL, UNKNOWN, "cmp", '2',
> > > > gen_int_fp_fixed_libfunc)
> > > > > >  OPTAB_NL(ucmp_optab, NULL, UNKNOWN, "ucmp", '2', gen_int_libfunc)
> > > > > > --
> > > > > > 2.34.1
> > > > >

Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

Reply via email to