On Wed, Oct 14, 2020 at 11:01 AM Jakub Jelinek via Gcc-patches
<[email protected]> wrote:
>
> Hi!
>
> These builtins have two known issues and this patch fixes one of them.
>
> One issue is that the builtins effectively return two results and
> they make the destination addressable until expansion, which means
> a stack slot is allocated for them and e.g. with -fstack-protector*
> DSE isn't able to optimize that away. I think for that we want to use
> the technique of returning complex value; the patch doesn't handle that
> though. See PR93990 for that.
>
> The other problem is optimization of successive uses of the builtin
> e.g. for arbitrary precision arithmetic additions/subtractions.
> As shown PR93990, combine is able to optimize the case when the first
> argument to these builtins is 0 (the first instance when several are used
> together), and also the last one if the last one ignores its result (i.e.
> the carry/borrow is dead and thrown away in that case).
> As shown in this PR, combiner refuses to optimize the rest, where it sees:
> (insn 10 9 11 2 (set (reg:QI 88 [ _31 ])
> (ltu:QI (reg:CCC 17 flags)
> (const_int 0 [0]))) "include/adxintrin.h":69:10 785 {*setcc_qi}
> (expr_list:REG_DEAD (reg:CCC 17 flags)
> (nil)))
> - set pseudo 88 to CF from flags, then some uninteresting insns that
> don't modify flags, and finally:
> (insn 17 15 18 2 (parallel [
> (set (reg:CCC 17 flags)
> (compare:CCC (plus:QI (reg:QI 88 [ _31 ])
> (const_int -1 [0xffffffffffffffff]))
> (reg:QI 88 [ _31 ])))
> (clobber (scratch:QI))
> ]) "include/adxintrin.h":69:10 350 {*addqi3_cconly_overflow_1}
> (expr_list:REG_DEAD (reg:QI 88 [ _31 ])
> (nil)))
> to set CF in flags back to what we saved earlier. The combiner just punts
> trying to combine the 10, 17 and following addcarrydi (etc.) instruction,
> because
> if (i1 && !can_combine_p (i1, i3, i0, NULL, i2, NULL, &i1dest, &i1src))
> {
> if (dump_file && (dump_flags & TDF_DETAILS))
> fprintf (dump_file, "Can't combine i1 into i3\n");
> undo_all ();
> return 0;
> }
> fails - the 3 insns aren't all adjacent and
> || (! all_adjacent
> && (((!MEM_P (src)
> || ! find_reg_note (insn, REG_EQUIV, src))
> && modified_between_p (src, insn, i3))
> src (flags hard register) is modified between the first and third insn - in
> the second insn.
>
> The following patch optimizes this by optimizing just the two insns,
> 10 and 17 above, i.e. save CF into pseudo, set CF from that pseudo, into
> a nop. The new define_insn_and_split matches how combine simplifies those
> two together (except without the ix86_cc_mode change it was choosing CCmode
> for the destination instead of CCCmode, so had to change that function too,
> and also adjust costs so that combiner understand it is beneficial).
>
> With this, all the testcases are optimized, so that the:
> setc %dl
> ...
> addb $-1, %dl
> insns in between the ad[dc][lq] or s[ub]b[lq] instructions are all optimized
> away (sure, if something would clobber flags in between they wouldn't, but
> there is nothing that can be done about that).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-10-14 Jakub Jelinek <[email protected]>
>
> PR target/97387
> * config/i386/i386.md (CC_CCC): New mode iterator.
> (*setcc_qi_addqi3_cconly_overflow_1_<mode>): New
> define_insn_and_split.
> * config/i386/i386.c (ix86_cc_mode): Return CCCmode for
> *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern operands.
> (ix86_rtx_costs): Return true and *total = 0; for
> *setcc_qi_addqi3_cconly_overflow_1_<mode> pattern.
>
> * gcc.target/i386/pr97387-1.c: New test.
> * gcc.target/i386/pr97387-2.c: New test.
>
> --- gcc/config/i386/i386.md.jj 2020-10-01 10:40:09.955758167 +0200
> +++ gcc/config/i386/i386.md 2020-10-13 13:38:24.644980815 +0200
> @@ -7039,6 +7039,20 @@ (define_expand "subborrow<mode>_0"
> (set (match_operand:SWI48 0 "register_operand")
> (minus:SWI48 (match_dup 1) (match_dup 2)))])]
> "ix86_binary_operator_ok (MINUS, <MODE>mode, operands)")
> +
> +(define_mode_iterator CC_CCC [CC CCC])
> +
> +;; Pre-reload splitter to optimize
> +;; *setcc_qi followed by *addqi3_cconly_overflow_1 with the same QI
> +;; operand and no intervening flags modifications into nothing.
> +(define_insn_and_split "*setcc_qi_addqi3_cconly_overflow_1_<mode>"
> + [(set (reg:CCC FLAGS_REG)
> + (compare:CCC (neg:QI (geu:QI (reg:CC_CCC FLAGS_REG) (const_int 0)))
> + (ltu:QI (reg:CC_CCC FLAGS_REG) (const_int 0))))]
> + "ix86_pre_reload_split ()"
> + "#"
> + "&& 1"
> + [(const_int 0)])
>
> ;; Overflow setting add instructions
>
> --- gcc/config/i386/i386.c.jj 2020-10-01 10:40:09.951758225 +0200
> +++ gcc/config/i386/i386.c 2020-10-13 13:40:20.471300518 +0200
> @@ -15136,6 +15136,22 @@ ix86_cc_mode (enum rtx_code code, rtx op
> && (rtx_equal_p (op1, XEXP (op0, 0))
> || rtx_equal_p (op1, XEXP (op0, 1))))
> return CCCmode;
> + /* Similarly for *setcc_qi_addqi3_cconly_overflow_1_* patterns. */
> + else if (code == LTU
> + && GET_CODE (op0) == NEG
> + && GET_CODE (XEXP (op0, 0)) == GEU
> + && REG_P (XEXP (XEXP (op0, 0), 0))
> + && (GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCCmode
> + || GET_MODE (XEXP (XEXP (op0, 0), 0)) == CCmode)
> + && REGNO (XEXP (XEXP (op0, 0), 0)) == FLAGS_REG
> + && XEXP (XEXP (op0, 0), 1) == const0_rtx
> + && GET_CODE (op1) == LTU
> + && REG_P (XEXP (op1, 0))
> + && (GET_MODE (XEXP (op1, 0))
> + == GET_MODE (XEXP (XEXP (op0, 0), 0)))
> + && REGNO (XEXP (op1, 0)) == FLAGS_REG
> + && XEXP (op1, 1) == const0_rtx)
> + return CCCmode;
> else
> return CCmode;
> case GTU: /* CF=0 & ZF=0 */
> @@ -19773,6 +19789,26 @@ ix86_rtx_costs (rtx x, machine_mode mode
> return true;
> }
>
> + if (mode == CCCmode
> + && GET_CODE (XEXP (x, 0)) == NEG
> + && GET_CODE (XEXP (XEXP (x, 0), 0)) == GEU
> + && REG_P (XEXP (XEXP (XEXP (x, 0), 0), 0))
> + && (GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == CCCmode
> + || GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == CCmode)
> + && REGNO (XEXP (XEXP (XEXP (x, 0), 0), 0)) == FLAGS_REG
> + && XEXP (XEXP (XEXP (x, 0), 0), 1) == const0_rtx
> + && GET_CODE (XEXP (x, 1)) == LTU
> + && REG_P (XEXP (XEXP (x, 1), 0))
> + && (GET_MODE (XEXP (XEXP (x, 1), 0))
> + == GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)))
> + && REGNO (XEXP (XEXP (x, 1), 0)) == FLAGS_REG
> + && XEXP (XEXP (x, 1), 1) == const0_rtx)
Meh ;) templates to the rescue?
rtx_match < un<NEG, bin<GEU, reg, ..> > ().matches (x)
and with fancy metaprogramming expand it to above? Not sure if it's easier
to read that way. Maybe
rtx neg, geu;
if (mode == CCCmode
&& (neg = XEXP (x, 0), GET_CODE (neg) == NEG)
&& (geu = XEXP (neg, 0), GET_CODE (geu) == GEU)
...
or
if (mode == CCCmode
&& GET_CODE (neg = XEXP (x, 0)) == NEG
thus some manual CSE and naming in this matching would help?
> + {
> + /* This is *setcc_qi_addqi3_cconly_overflow_1_* patterns, a nop. */
> + *total = 0;
> + return true;
> + }
> +
> /* The embedded comparison operand is completely free. */
> if (!general_operand (XEXP (x, 0), GET_MODE (XEXP (x, 0)))
> && XEXP (x, 1) == const0_rtx)
> --- gcc/testsuite/gcc.target/i386/pr97387-1.c.jj 2020-10-13
> 13:47:04.106444976 +0200
> +++ gcc/testsuite/gcc.target/i386/pr97387-1.c 2020-10-13 13:46:41.808768448
> +0200
> @@ -0,0 +1,31 @@
> +/* PR target/97387 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fomit-frame-pointer" } */
> +/* { dg-final { scan-assembler-times "\taddl\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tadcl\t" 3 } } */
> +/* { dg-final { scan-assembler-times "\tsubl\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tsbbl\t" 3 } } */
> +/* { dg-final { scan-assembler-not "\tset\[bc]\t" } } */
> +/* { dg-final { scan-assembler-not "\taddb\t" } } */
> +
> +#include <x86intrin.h>
> +
> +void
> +foo (unsigned int a[4], unsigned int b[4])
> +{
> + unsigned char carry = 0;
> + carry = _addcarry_u32 (carry, a[0], b[0], &a[0]);
> + carry = _addcarry_u32 (carry, a[1], b[1], &a[1]);
> + carry = _addcarry_u32 (carry, a[2], b[2], &a[2]);
> + _addcarry_u32 (carry, a[3], b[3], &a[3]);
> +}
> +
> +void
> +bar (unsigned int a[4], unsigned int b[4])
> +{
> + unsigned char carry = 0;
> + carry = _subborrow_u32 (carry, a[0], b[0], &a[0]);
> + carry = _subborrow_u32 (carry, a[1], b[1], &a[1]);
> + carry = _subborrow_u32 (carry, a[2], b[2], &a[2]);
> + _subborrow_u32 (carry, a[3], b[3], &a[3]);
> +}
> --- gcc/testsuite/gcc.target/i386/pr97387-2.c.jj 2020-10-13
> 13:47:12.385324872 +0200
> +++ gcc/testsuite/gcc.target/i386/pr97387-2.c 2020-10-13 13:47:54.356715987
> +0200
> @@ -0,0 +1,31 @@
> +/* PR target/97387 */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-O2 -fomit-frame-pointer" } */
> +/* { dg-final { scan-assembler-times "\taddq\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tadcq\t" 3 } } */
> +/* { dg-final { scan-assembler-times "\tsubq\t" 1 } } */
> +/* { dg-final { scan-assembler-times "\tsbbq\t" 3 } } */
> +/* { dg-final { scan-assembler-not "\tset\[bc]\t" } } */
> +/* { dg-final { scan-assembler-not "\taddb\t" } } */
> +
> +#include <x86intrin.h>
> +
> +void
> +foo (unsigned long long a[4], unsigned long long b[4])
> +{
> + unsigned char carry = 0;
> + carry = _addcarry_u64 (carry, a[0], b[0], &a[0]);
> + carry = _addcarry_u64 (carry, a[1], b[1], &a[1]);
> + carry = _addcarry_u64 (carry, a[2], b[2], &a[2]);
> + _addcarry_u64 (carry, a[3], b[3], &a[3]);
> +}
> +
> +void
> +bar (unsigned long long a[4], unsigned long long b[4])
> +{
> + unsigned char carry = 0;
> + carry = _subborrow_u64 (carry, a[0], b[0], &a[0]);
> + carry = _subborrow_u64 (carry, a[1], b[1], &a[1]);
> + carry = _subborrow_u64 (carry, a[2], b[2], &a[2]);
> + _subborrow_u64 (carry, a[3], b[3], &a[3]);
> +}
>
> Jakub
>