On Sun, Aug 1, 2021 at 7:12 PM H.J. Lu <hjl.to...@gmail.com> wrote: > > On Sat, Jul 31, 2021 at 12:53:44PM -0700, H.J. Lu wrote: > > On Fri, Jul 30, 2021 at 6:27 AM Jakub Jelinek via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote: > > > > Please put some space here, e.g.: > > > ... > > > > Can you just name the relevant insn pattern and use > > > > > > > > emit_insn (gen_bsr_1)? > > > > > > Here is the updated patch. I'll bootstrap/regtest it tonight. > > > > > > 2021-07-30 Jakub Jelinek <ja...@redhat.com> > > > > > > PR target/78103 > > > * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New > > > define_insn patterns. > > > (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns. > > > Add combine splitters for constant - clz. > > > (clz<mode>2): Use a temporary pseudo for bsr result. > > > > > > * gcc.target/i386/pr78103-1.c: New test. > > > * gcc.target/i386/pr78103-2.c: New test. > > > * gcc.target/i386/pr78103-3.c: New test. > > > > > > --- gcc/config/i386/i386.md.jj 2021-07-28 12:05:56.857977764 +0200 > > > +++ gcc/config/i386/i386.md 2021-07-30 15:13:49.994946550 +0200 > > > @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64" > > > (set_attr "znver1_decode" "vector") > > > (set_attr "mode" "DI")]) > > > > > > +(define_insn "bsr_rex64_1" > > > + [(set (match_operand:DI 0 "register_operand" "=r") > > > + (minus:DI (const_int 63) > > > + (clz:DI (match_operand:DI 1 "nonimmediate_operand" > > > "rm")))) > > > + (clobber (reg:CC FLAGS_REG))] > > > + "!TARGET_LZCNT && TARGET_64BIT" > > > + "bsr{q}\t{%1, %0|%0, %1}" > > > + [(set_attr "type" "alu1") > > > + (set_attr "prefix_0f" "1") > > > + (set_attr "znver1_decode" "vector") > > > + (set_attr "mode" "DI")]) > > > + > > > (define_insn "bsr" > > > [(set (reg:CCZ FLAGS_REG) > > > (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm") > > > @@ -14775,17 +14787,204 @@ (define_insn "bsr" > > > (set_attr "znver1_decode" "vector") > > > (set_attr "mode" "SI")]) > > > > > > +(define_insn "bsr_1" > > > + [(set (match_operand:SI 0 "register_operand" "=r") > > > + (minus:SI (const_int 31) > > > + (clz:SI (match_operand:SI 1 "nonimmediate_operand" > > > "rm")))) > > > + (clobber (reg:CC FLAGS_REG))] > > > + "!TARGET_LZCNT" > > > + "bsr{l}\t{%1, %0|%0, %1}" > > > + [(set_attr "type" "alu1") > > > + (set_attr "prefix_0f" "1") > > > + (set_attr "znver1_decode" "vector") > > > + (set_attr "mode" "SI")]) > > > + > > > +(define_insn "bsr_zext_1" > > > + [(set (match_operand:DI 0 "register_operand" "=r") > > > + (zero_extend:DI > > > + (minus:SI > > > + (const_int 31) > > > + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))) > > > + (clobber (reg:CC FLAGS_REG))] > > > + "!TARGET_LZCNT && TARGET_64BIT" > > > + "bsr{l}\t{%1, %k0|%k0, %1}" > > > + [(set_attr "type" "alu1") > > > + (set_attr "prefix_0f" "1") > > > + (set_attr "znver1_decode" "vector") > > > + (set_attr "mode" "SI")]) > > > + > > > +; As bsr is undefined behavior on zero and for other input > > > +; values it is in range 0 to 63, we can optimize away sign-extends. > > > +(define_insn_and_split "*bsr_rex64_2" > > > + [(set (match_operand:DI 0 "register_operand") > > > + (xor:DI > > > + (sign_extend:DI > > > + (minus:SI > > > + (const_int 63) > > > + (subreg:SI (clz:DI (match_operand:DI 1 > > > "nonimmediate_operand")) > > > + 0))) > > > + (const_int 63))) > > > + (clobber (reg:CC FLAGS_REG))] > > > + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" > > > + "#" > > > + "&& 1" > > > + [(parallel [(set (reg:CCZ FLAGS_REG) > > > + (compare:CCZ (match_dup 1) (const_int 0))) > > > + (set (match_dup 2) > > > + (minus:DI (const_int 63) (clz:DI (match_dup 1))))]) > > > + (parallel [(set (match_dup 0) > > > + (zero_extend:DI (xor:SI (match_dup 3) (const_int 63)))) > > > + (clobber (reg:CC FLAGS_REG))])] > > > +{ > > > + operands[2] = gen_reg_rtx (DImode); > > > + operands[3] = lowpart_subreg (SImode, operands[2], DImode); > > > +}) > > > + > > > +(define_insn_and_split "*bsr_2" > > > + [(set (match_operand:DI 0 "register_operand") > > > + (sign_extend:DI > > > + (xor:SI > > > + (minus:SI > > > + (const_int 31) > > > + (clz:SI (match_operand:SI 1 "nonimmediate_operand"))) > > > + (const_int 31)))) > > > + (clobber (reg:CC FLAGS_REG))] > > > + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" > > > + "#" > > > + "&& 1" > > > + [(parallel [(set (reg:CCZ FLAGS_REG) > > > + (compare:CCZ (match_dup 1) (const_int 0))) > > > + (set (match_dup 2) > > > + (minus:SI (const_int 31) (clz:SI (match_dup 1))))]) > > > + (parallel [(set (match_dup 0) > > > + (zero_extend:DI (xor:SI (match_dup 2) (const_int 31)))) > > > + (clobber (reg:CC FLAGS_REG))])] > > > + "operands[2] = gen_reg_rtx (SImode);") > > > + > > > +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz > > > (x). > > > +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be > > > +; in [0, 63] or [0, 31] range. > > > +(define_split > > > + [(set (match_operand:SI 0 "register_operand") > > > + (minus:SI > > > + (match_operand:SI 2 "const_int_operand") > > > + (xor:SI > > > + (minus:SI (const_int 63) > > > + (subreg:SI > > > + (clz:DI (match_operand:DI 1 > > > "nonimmediate_operand")) > > > + 0)) > > > + (const_int 63))))] > > > + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" > > > + [(set (match_dup 3) > > > + (minus:DI (const_int 63) (clz:DI (match_dup 1)))) > > > + (set (match_dup 0) > > > + (plus:SI (match_dup 5) (match_dup 4)))] > > > +{ > > > + operands[3] = gen_reg_rtx (DImode); > > > + operands[5] = lowpart_subreg (SImode, operands[3], DImode); > > > + if (INTVAL (operands[2]) == 63) > > > + { > > > + emit_insn (gen_bsr_rex64_1 (operands[3], operands[1])); > > > + emit_move_insn (operands[0], operands[5]); > > > + DONE; > > > + } > > > + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode); > > > +}) > > > + > > > +(define_split > > > + [(set (match_operand:SI 0 "register_operand") > > > + (minus:SI > > > + (match_operand:SI 2 "const_int_operand") > > > + (xor:SI > > > + (minus:SI (const_int 31) > > > + (clz:SI (match_operand:SI 1 > > > "nonimmediate_operand"))) > > > + (const_int 31))))] > > > + "!TARGET_LZCNT && ix86_pre_reload_split ()" > > > + [(set (match_dup 3) > > > + (minus:SI (const_int 31) (clz:SI (match_dup 1)))) > > > + (set (match_dup 0) > > > + (plus:SI (match_dup 3) (match_dup 4)))] > > > +{ > > > + if (INTVAL (operands[2]) == 31) > > > + { > > > + emit_insn (gen_bsr_1 (operands[0], operands[1])); > > > + DONE; > > > + } > > > + operands[3] = gen_reg_rtx (SImode); > > > + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode); > > > +}) > > > + > > > +(define_split > > > + [(set (match_operand:DI 0 "register_operand") > > > + (minus:DI > > > + (match_operand:DI 2 "const_int_operand") > > > + (xor:DI > > > + (sign_extend:DI > > > + (minus:SI (const_int 63) > > > + (subreg:SI > > > + (clz:DI (match_operand:DI 1 > > > "nonimmediate_operand")) > > > + 0))) > > > + (const_int 63))))] > > > + "!TARGET_LZCNT > > > + && TARGET_64BIT > > > + && ix86_pre_reload_split () > > > + && ((unsigned HOST_WIDE_INT) > > > + trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode) > > > + == UINTVAL (operands[2]) - 63)" > > > + [(set (match_dup 3) > > > + (minus:DI (const_int 63) (clz:DI (match_dup 1)))) > > > + (set (match_dup 0) > > > + (plus:DI (match_dup 3) (match_dup 4)))] > > > +{ > > > + if (INTVAL (operands[2]) == 63) > > > + { > > > + emit_insn (gen_bsr_rex64_1 (operands[0], operands[1])); > > > + DONE; > > > + } > > > + operands[3] = gen_reg_rtx (DImode); > > > + operands[4] = GEN_INT (UINTVAL (operands[2]) - 63); > > > +}) > > > + > > > +(define_split > > > + [(set (match_operand:DI 0 "register_operand") > > > + (minus:DI > > > + (match_operand:DI 2 "const_int_operand") > > > + (sign_extend:DI > > > + (xor:SI > > > + (minus:SI (const_int 31) > > > + (clz:SI (match_operand:SI 1 > > > "nonimmediate_operand"))) > > > + (const_int 31)))))] > > > + "!TARGET_LZCNT > > > + && TARGET_64BIT > > > + && ix86_pre_reload_split () > > > + && ((unsigned HOST_WIDE_INT) > > > + trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode) > > > + == UINTVAL (operands[2]) - 31)" > > > + [(set (match_dup 3) > > > + (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1))))) > > > + (set (match_dup 0) > > > + (plus:DI (match_dup 3) (match_dup 4)))] > > > +{ > > > + if (INTVAL (operands[2]) == 31) > > > + { > > > + emit_insn (gen_bsr_zext_1 (operands[0], operands[1])); > > > + DONE; > > > + } > > > + operands[3] = gen_reg_rtx (DImode); > > > + operands[4] = GEN_INT (UINTVAL (operands[2]) - 31); > > > +}) > > > + > > > (define_expand "clz<mode>2" > > > [(parallel > > > [(set (reg:CCZ FLAGS_REG) > > > (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm") > > > (const_int 0))) > > > - (set (match_operand:SWI48 0 "register_operand") > > > - (minus:SWI48 > > > - (match_dup 2) > > > - (clz:SWI48 (match_dup 1))))]) > > > + (set (match_dup 3) (minus:SWI48 > > > + (match_dup 2) > > > + (clz:SWI48 (match_dup 1))))]) > > > (parallel > > > - [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2))) > > > + [(set (match_operand:SWI48 0 "register_operand") > > > + (xor:SWI48 (match_dup 3) (match_dup 2))) > > > (clobber (reg:CC FLAGS_REG))])] > > > "" > > > { > > > @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2" > > > DONE; > > > } > > > operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1); > > > + operands[3] = gen_reg_rtx (<MODE>mode); > > > }) > > > > > > (define_insn_and_split "clz<mode>2_lzcnt" > > > --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj 2021-07-30 > > > 15:07:26.104139537 +0200 > > > +++ gcc/testsuite/gcc.target/i386/pr78103-1.c 2021-07-30 > > > 15:07:26.104139537 +0200 > > > @@ -0,0 +1,28 @@ > > > +/* PR target/78103 */ > > > +/* { dg-do compile { target { ! ia32 } } } */ > > > +/* { dg-options "-O2 -mno-lzcnt" } */ > > > +/* { dg-final { scan-assembler-not {\mcltq\M} } } */ > > > + > > > +long long > > > +foo (long long x) > > > +{ > > > + return __builtin_clzll (x); > > > +} > > > + > > > +long long > > > +bar (long long x) > > > +{ > > > + return (unsigned int) __builtin_clzll (x); > > > +} > > > + > > > +long long > > > +baz (int x) > > > +{ > > > + return __builtin_clz (x); > > > +} > > > + > > > +long long > > > +qux (int x) > > > +{ > > > + return (unsigned int) __builtin_clz (x); > > > +} > > > --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj 2021-07-30 > > > 15:07:26.104139537 +0200 > > > +++ gcc/testsuite/gcc.target/i386/pr78103-2.c 2021-07-30 > > > 15:07:26.104139537 +0200 > > > @@ -0,0 +1,33 @@ > > > +/* PR target/78103 */ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-O2 -mno-lzcnt" } */ > > > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */ > > > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */ > > > +/* { dg-final { scan-assembler-not {\msubl\M} } } */ > > > +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */ > > > + > > > +unsigned int > > > +foo (unsigned int x) > > > +{ > > > + return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x); > > > +} > > > + > > > +unsigned int > > > +bar (unsigned int x) > > > +{ > > > + return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x); > > > +} > > > + > > > +#ifdef __x86_64__ > > > +unsigned int > > > +baz (unsigned long long x) > > > +{ > > > + return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll > > > (x); > > > +} > > > + > > > +unsigned int > > > +qux (unsigned long long x) > > > +{ > > > + return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - > > > __builtin_clzll (x); > > > +} > > > +#endif > > > --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj 2021-07-30 > > > 15:07:26.104139537 +0200 > > > +++ gcc/testsuite/gcc.target/i386/pr78103-3.c 2021-07-30 > > > 15:07:26.104139537 +0200 > > > @@ -0,0 +1,32 @@ > > > +/* PR target/78103 */ > > > +/* { dg-do compile { target { ! ia32 } } } */ > > > +/* { dg-options "-O2 -mno-lzcnt" } */ > > > +/* { dg-final { scan-assembler-not {\mmovl\M} } } */ > > > +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */ > > > +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */ > > > +/* { dg-final { scan-assembler-not {\msubq\M} } } */ > > > +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */ > > > > It fails for -mx32: > > > > @@ -7,7 +7,7 @@ foo: > > .LFB0: > > .cfi_startproc > > bsrl %edi, %edi > > - leaq 1(%rdi), %rax > > + leal 1(%rdi), %eax << This should also work for -m64. Why isn't it > > used for -m64? > > ret > > .cfi_endproc > > .LFE0: > > @@ -30,7 +30,7 @@ baz: > > .LFB2: > > .cfi_startproc > > bsrq %rdi, %rdi > > - leaq 1(%rdi), %rax > > + leal 1(%rdi), %eax << This should also work for -m64. Why isn't > > it used for -m64? > > ret > > .cfi_endproc > > .LFE2: > > @@ -42,6 +42,7 @@ qux: > > .LFB3: > > .cfi_startproc > > bsrq %rdi, %rax > > + movl %eax, %eax << Why is it generated for -mx32? > > ret > > .cfi_endproc > > .LFE3: > > > > Add a zero_extend patten for bsr_rex64_1 and use it to split SImode > constant - __builtin_clzll to avoid unncessary zero_extend. > > OK for master? > > H.J. > --- > gcc/ > > PR target/78103 > * config/i386/i386.md (bsr_rex64_1_zext): New. > (combine splitter for SImode constant - clzll): Replace > gen_bsr_rex64_1 with gen_bsr_rex64_1_zext. > > gcc/testsuite/ > > PR target/78103 > * gcc.target/i386/pr78103-2.c: Also scan incl. > * gcc.target/i386/pr78103-3.c: Scan leal|addl|incl for x32. Also > scan incq.
OK. Thanks, Uros. > --- > gcc/config/i386/i386.md | 17 ++++++++++++++++- > gcc/testsuite/gcc.target/i386/pr78103-2.c | 2 +- > gcc/testsuite/gcc.target/i386/pr78103-3.c | 3 ++- > 3 files changed, 19 insertions(+), 3 deletions(-) > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index c9787d73262..0c23ddb8d1f 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -14796,6 +14796,21 @@ (define_insn "bsr_rex64_1" > (set_attr "znver1_decode" "vector") > (set_attr "mode" "DI")]) > > +(define_insn "bsr_rex64_1_zext" > + [(set (match_operand:DI 0 "register_operand" "=r") > + (zero_extend:DI > + (minus:SI (const_int 63) > + (subreg:SI > + (clz:DI (match_operand:DI 1 "nonimmediate_operand" > "rm")) > + 0)))) > + (clobber (reg:CC FLAGS_REG))] > + "!TARGET_LZCNT && TARGET_64BIT" > + "bsr{q}\t{%1, %0|%0, %1}" > + [(set_attr "type" "alu1") > + (set_attr "prefix_0f" "1") > + (set_attr "znver1_decode" "vector") > + (set_attr "mode" "DI")]) > + > (define_insn "bsr" > [(set (reg:CCZ FLAGS_REG) > (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm") > @@ -14907,7 +14922,7 @@ (define_split > operands[5] = lowpart_subreg (SImode, operands[3], DImode); > if (INTVAL (operands[2]) == 63) > { > - emit_insn (gen_bsr_rex64_1 (operands[3], operands[1])); > + emit_insn (gen_bsr_rex64_1_zext (operands[3], operands[1])); > emit_move_insn (operands[0], operands[5]); > DONE; > } > diff --git a/gcc/testsuite/gcc.target/i386/pr78103-2.c > b/gcc/testsuite/gcc.target/i386/pr78103-2.c > index b3523382926..30f7f98f60a 100644 > --- a/gcc/testsuite/gcc.target/i386/pr78103-2.c > +++ b/gcc/testsuite/gcc.target/i386/pr78103-2.c > @@ -4,7 +4,7 @@ > /* { dg-final { scan-assembler-not {\mmovl\M} } } */ > /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */ > /* { dg-final { scan-assembler-not {\msubl\M} } } */ > -/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */ > +/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} } } */ > > unsigned int > foo (unsigned int x) > diff --git a/gcc/testsuite/gcc.target/i386/pr78103-3.c > b/gcc/testsuite/gcc.target/i386/pr78103-3.c > index 49a36eccf4d..b8d82312a0e 100644 > --- a/gcc/testsuite/gcc.target/i386/pr78103-3.c > +++ b/gcc/testsuite/gcc.target/i386/pr78103-3.c > @@ -5,7 +5,8 @@ > /* { dg-final { scan-assembler-not {\mmovslq\M} } } */ > /* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */ > /* { dg-final { scan-assembler-not {\msubq\M} } } */ > -/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */ > +/* { dg-final { scan-assembler {\m(leaq|addq|incq)\M} { target { ! x32 } } } > } */ > +/* { dg-final { scan-assembler {\m(leal|addl|incl)\M} { target x32 } } } */ > > unsigned long long > foo (unsigned int x) > -- > 2.31.1 >