On Wed, Oct 14, 2020 at 11:01 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi!
>
> These builtins have two known issues and this patch fixes one of them.
>
> One issue is that the builtins effectively return two results and
> they make the destination addressable until expansion, which means
> a stack slot is allocated for them and e.g. with -fstack-protector*
> DSE isn't able to optimize that away.  I think for that we want to use
> the technique of returning complex value; the patch doesn't handle that
> though.  See PR93990 for that.
>
> The other problem is optimization of successive uses of the builtin
> e.g. for arbitrary precision arithmetic additions/subtractions.
> As shown PR93990, combine is able to optimize the case when the first
> argument to these builtins is 0 (the first instance when several are used
> together), and also the last one if the last one ignores its result (i.e.
> the carry/borrow is dead and thrown away in that case).
> As shown in this PR, combiner refuses to optimize the rest, where it sees:
> (insn 10 9 11 2 (set (reg:QI 88 [ _31 ])
>         (ltu:QI (reg:CCC 17 flags)
>             (const_int 0 [0]))) "include/adxintrin.h":69:10 785 {*setcc_qi}
>      (expr_list:REG_DEAD (reg:CCC 17 flags)
>         (nil)))
> - set pseudo 88 to CF from flags, then some uninteresting insns that
> don't modify flags, and finally:
> (insn 17 15 18 2 (parallel [
>             (set (reg:CCC 17 flags)
>                 (compare:CCC (plus:QI (reg:QI 88 [ _31 ])
>                         (const_int -1 [0xffffffffffffffff]))
>                     (reg:QI 88 [ _31 ])))
>             (clobber (scratch:QI))
>         ]) "include/adxintrin.h":69:10 350 {*addqi3_cconly_overflow_1}
>      (expr_list:REG_DEAD (reg:QI 88 [ _31 ])
>         (nil)))
> to set CF in flags back to what we saved earlier.  The combiner just punts
> trying to combine the 10, 17 and following addcarrydi (etc.) instruction,
> because
>   if (i1 && !can_combine_p (i1, i3, i0, NULL, i2, NULL, &i1dest, &i1src))
>     {
>       if (dump_file && (dump_flags & TDF_DETAILS))
>         fprintf (dump_file, "Can't combine i1 into i3\n");
>       undo_all ();
>       return 0;
>     }
> fails - the 3 insns aren't all adjacent and
>       || (! all_adjacent
>           && (((!MEM_P (src)
>                 || ! find_reg_note (insn, REG_EQUIV, src))
>                && modified_between_p (src, insn, i3))
> src (flags hard register) is modified between the first and third insn - in
> the second insn.
>
> The following patch optimizes this by optimizing just the two insns,
> 10 and 17 above, i.e. save CF into pseudo, set CF from that pseudo, into
> a nop.  The new define_insn_and_split matches how combine simplifies those
> two together (except without the ix86_cc_mode change it was choosing CCmode
> for the destination instead of CCCmode, so had to change that function too,
> and also adjust costs so that combiner understand it is beneficial).
>
> With this, all the testcases are optimized, so that the:
>         setc    %dl
> ...
>         addb    $-1, %dl
> insns in between the ad[dc][lq] or s[ub]b[lq] instructions are all optimized
> away (sure, if something would clobber flags in between they wouldn't, but
> there is nothing that can be done about that).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-10-14  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/97387
>         * config/i386/i386.md (CC_CCC): New mode iterator.
>         (*setcc_qi_addqi3_cconly_overflow_1_<mode>): New
>         define_insn_and_split.
>         * config/i386/i386.c (ix86_cc_mode): Return CCCmode for
>         *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern operands.
>         (ix86_rtx_costs): Return true and *total = 0; for
>         *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern.
>
>         * gcc.target/i386/pr97387-1.c: New test.
>         * gcc.target/i386/pr97387-2.c: New test.
>
> --- gcc/config/i386/i386.md.jj  2020-10-01 10:40:09.955758167 +0200
> +++ gcc/config/i386/i386.md     2020-10-13 13:38:24.644980815 +0200
> @@ -7039,6 +7039,20 @@ (define_expand "subborrow<mode>_0"
>        (set (match_operand:SWI48 0 "register_operand")
>            (minus:SWI48 (match_dup 1) (match_dup 2)))])]
>    "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)")
> +
> +(define_mode_iterator CC_CCC [CC CCC])
> +
> +;; Pre-reload splitter to optimize
> +;; *setcc_qi followed by *addqi3_cconly_overflow_1 with the same QI
> +;; operand and no intervening flags modifications into nothing.
> +(define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1_<mode>"
> +  [(set (reg:CCC FLAGS_REG)
> +       (compare:CCC (neg:QI (geu:QI (reg:CC_CCC FLAGS_REG) (const_int 0)))
> +                    (ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))))]
> +  "ix86_pre_reload_split ()"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)])
>
>  ;; Overflow setting add instructions
>
> --- gcc/config/i386/i386.c.jj   2020-10-01 10:40:09.951758225 +0200
> +++ gcc/config/i386/i386.c      2020-10-13 13:40:20.471300518 +0200
> @@ -15136,6 +15136,22 @@ ix86_cc_mode (enum rtx_code code, rtx op
>           && (rtx_equal_p (op1, XEXP (op0, 0))
>               || rtx_equal_p (op1, XEXP (op0, 1))))
>         return CCCmode;
> +      /* Similarly for *setcc_qi_addqi3_cconly_overflow_1_* patterns.  */
> +      else if (code == LTU
> +              && GET_CODE (op0) == NEG
> +              && GET_CODE (XEXP (op0, 0)) == GEU
> +              && REG_P (XEXP (XEXP (op0, 0), 0))
> +              && (GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
> +                  || GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCmode)
> +              && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG
> +              && XEXP (XEXP (op0, 0), 1) == const0_rtx
> +              && GET_CODE (op1) == LTU
> +              && REG_P (XEXP (op1, 0))
> +              && (GET_MODE (XEXP (op1, 0))
> +                  == GET_MODE (XEXP (XEXP (op0, 0), 0)))
> +              && REGNO (XEXP (op1, 0)) == FLAGS_REG
> +              && XEXP (op1, 1) == const0_rtx)
> +       return CCCmode;
>        else
>         return CCmode;
>      case GTU:                  /* CF=0 & ZF=0 */
> @@ -19773,6 +19789,26 @@ ix86_rtx_costs (rtx x, machine_mode mode
>           return true;
>         }
>
> +      if (mode == CCCmode
> +         && GET_CODE (XEXP (x, 0)) == NEG
> +         && GET_CODE (XEXP (XEXP (x, 0), 0)) == GEU
> +         && REG_P (XEXP (XEXP (XEXP (x, 0), 0), 0))
> +         && (GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == CCCmode
> +             || GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == CCmode)
> +         && REGNO (XEXP (XEXP (XEXP (x, 0), 0), 0)) == FLAGS_REG
> +         && XEXP (XEXP (XEXP (x, 0), 0), 1) == const0_rtx
> +         && GET_CODE (XEXP (x, 1)) == LTU
> +         && REG_P (XEXP (XEXP (x, 1), 0))
> +         && (GET_MODE (XEXP (XEXP (x, 1), 0))
> +             == GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)))
> +         && REGNO (XEXP (XEXP (x, 1), 0)) == FLAGS_REG
> +         && XEXP (XEXP (x, 1), 1) == const0_rtx)

Meh ;)  templates to the rescue?

  rtx_match < un<NEG, bin<GEU, reg, ..> > ().matches (x)

and with fancy metaprogramming expand it to above?  Not sure if it's easier
to read that way.  Maybe

  rtx neg, geu;
  if (mode == CCCmode
      && (neg = XEXP (x, 0), GET_CODE (neg) == NEG)
      && (geu = XEXP (neg, 0), GET_CODE (geu) == GEU)
...

or

  if (mode == CCCmode
      && GET_CODE (neg = XEXP (x, 0)) == NEG

thus some manual CSE and naming in this matching would help?

> +       {
> +         /* This is *setcc_qi_addqi3_cconly_overflow_1_* patterns, a nop.  */
> +         *total = 0;
> +         return true;
> +       }
> +
>        /* The embedded comparison operand is completely free.  */
>        if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
>           && XEXP (x, 1) == const0_rtx)
> --- gcc/testsuite/gcc.target/i386/pr97387-1.c.jj        2020-10-13 
> 13:47:04.106444976 +0200
> +++ gcc/testsuite/gcc.target/i386/pr97387-1.c   2020-10-13 13:46:41.808768448 
> +0200
> @@ -0,0 +1,31 @@
> +/* PR target/97387 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fomit-frame-pointer" } */
> +/* { dg-final { scan-assembler-times "\taddl\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tadcl\t" 3 } } */
> +/* { dg-final { scan-assembler-times "\tsubl\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tsbbl\t" 3 } } */
> +/* { dg-final { scan-assembler-not "\tset\[bc]\t" } } */
> +/* { dg-final { scan-assembler-not "\taddb\t" } } */
> +
> +#include <x86intrin.h>
> +
> +void
> +foo (unsigned int a[4], unsigned int b[4])
> +{
> +  unsigned char carry = 0;
> +  carry = _addcarry_u32 (carry, a[0], b[0], &a[0]);
> +  carry = _addcarry_u32 (carry, a[1], b[1], &a[1]);
> +  carry = _addcarry_u32 (carry, a[2], b[2], &a[2]);
> +  _addcarry_u32 (carry, a[3], b[3], &a[3]);
> +}
> +
> +void
> +bar (unsigned int a[4], unsigned int b[4])
> +{
> +  unsigned char carry = 0;
> +  carry = _subborrow_u32 (carry, a[0], b[0], &a[0]);
> +  carry = _subborrow_u32 (carry, a[1], b[1], &a[1]);
> +  carry = _subborrow_u32 (carry, a[2], b[2], &a[2]);
> +  _subborrow_u32 (carry, a[3], b[3], &a[3]);
> +}
> --- gcc/testsuite/gcc.target/i386/pr97387-2.c.jj        2020-10-13 
> 13:47:12.385324872 +0200
> +++ gcc/testsuite/gcc.target/i386/pr97387-2.c   2020-10-13 13:47:54.356715987 
> +0200
> @@ -0,0 +1,31 @@
> +/* PR target/97387 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -fomit-frame-pointer" } */
> +/* { dg-final { scan-assembler-times "\taddq\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tadcq\t" 3 } } */
> +/* { dg-final { scan-assembler-times "\tsubq\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tsbbq\t" 3 } } */
> +/* { dg-final { scan-assembler-not "\tset\[bc]\t" } } */
> +/* { dg-final { scan-assembler-not "\taddb\t" } } */
> +
> +#include <x86intrin.h>
> +
> +void
> +foo (unsigned long long a[4], unsigned long long b[4])
> +{
> +  unsigned char carry = 0;
> +  carry = _addcarry_u64 (carry, a[0], b[0], &a[0]);
> +  carry = _addcarry_u64 (carry, a[1], b[1], &a[1]);
> +  carry = _addcarry_u64 (carry, a[2], b[2], &a[2]);
> +  _addcarry_u64 (carry, a[3], b[3], &a[3]);
> +}
> +
> +void
> +bar (unsigned long long a[4], unsigned long long b[4])
> +{
> +  unsigned char carry = 0;
> +  carry = _subborrow_u64 (carry, a[0], b[0], &a[0]);
> +  carry = _subborrow_u64 (carry, a[1], b[1], &a[1]);
> +  carry = _subborrow_u64 (carry, a[2], b[2], &a[2]);
> +  _subborrow_u64 (carry, a[3], b[3], &a[3]);
> +}
>
>         Jakub
>

Reply via email to