On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote: > Please put some space here, e.g.: ... > Can you just name the relevant insn pattern and use > > emit_insn (gen_bsr_1)?
Here is the updated patch. I'll bootstrap/regtest it tonight. 2021-07-30 Jakub Jelinek <ja...@redhat.com> PR target/78103 * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New define_insn patterns. (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns. Add combine splitters for constant - clz. (clz<mode>2): Use a temporary pseudo for bsr result. * gcc.target/i386/pr78103-1.c: New test. * gcc.target/i386/pr78103-2.c: New test. * gcc.target/i386/pr78103-3.c: New test. --- gcc/config/i386/i386.md.jj 2021-07-28 12:05:56.857977764 +0200 +++ gcc/config/i386/i386.md 2021-07-30 15:13:49.994946550 +0200 @@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64" (set_attr "znver1_decode" "vector") (set_attr "mode" "DI")]) +(define_insn "bsr_rex64_1" + [(set (match_operand:DI 0 "register_operand" "=r") + (minus:DI (const_int 63) + (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT" + "bsr{q}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "znver1_decode" "vector") + (set_attr "mode" "DI")]) + (define_insn "bsr" [(set (reg:CCZ FLAGS_REG) (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm") @@ -14775,17 +14787,204 @@ (define_insn "bsr" (set_attr "znver1_decode" "vector") (set_attr "mode" "SI")]) +(define_insn "bsr_1" + [(set (match_operand:SI 0 "register_operand" "=r") + (minus:SI (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT" + "bsr{l}\t{%1, %0|%0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "znver1_decode" "vector") + (set_attr "mode" "SI")]) + +(define_insn "bsr_zext_1" + [(set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI + (minus:SI + (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT" + "bsr{l}\t{%1, %k0|%k0, %1}" + [(set_attr "type" "alu1") + (set_attr "prefix_0f" "1") + (set_attr "znver1_decode" "vector") + (set_attr "mode" "SI")]) + +; As bsr is undefined behavior on zero and for other input +; values it is in range 0 to 63, we can optimize away sign-extends. +(define_insn_and_split "*bsr_rex64_2" + [(set (match_operand:DI 0 "register_operand") + (xor:DI + (sign_extend:DI + (minus:SI + (const_int 63) + (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand")) + 0))) + (const_int 63))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_dup 1) (const_int 0))) + (set (match_dup 2) + (minus:DI (const_int 63) (clz:DI (match_dup 1))))]) + (parallel [(set (match_dup 0) + (zero_extend:DI (xor:SI (match_dup 3) (const_int 63)))) + (clobber (reg:CC FLAGS_REG))])] +{ + operands[2] = gen_reg_rtx (DImode); + operands[3] = lowpart_subreg (SImode, operands[2], DImode); +}) + +(define_insn_and_split "*bsr_2" + [(set (match_operand:DI 0 "register_operand") + (sign_extend:DI + (xor:SI + (minus:SI + (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand"))) + (const_int 31)))) + (clobber (reg:CC FLAGS_REG))] + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" + "#" + "&& 1" + [(parallel [(set (reg:CCZ FLAGS_REG) + (compare:CCZ (match_dup 1) (const_int 0))) + (set (match_dup 2) + (minus:SI (const_int 31) (clz:SI (match_dup 1))))]) + (parallel [(set (match_dup 0) + (zero_extend:DI (xor:SI (match_dup 2) (const_int 31)))) + (clobber (reg:CC FLAGS_REG))])] + "operands[2] = gen_reg_rtx (SImode);") + +; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x). +; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be +; in [0, 63] or [0, 31] range. +(define_split + [(set (match_operand:SI 0 "register_operand") + (minus:SI + (match_operand:SI 2 "const_int_operand") + (xor:SI + (minus:SI (const_int 63) + (subreg:SI + (clz:DI (match_operand:DI 1 "nonimmediate_operand")) + 0)) + (const_int 63))))] + "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()" + [(set (match_dup 3) + (minus:DI (const_int 63) (clz:DI (match_dup 1)))) + (set (match_dup 0) + (plus:SI (match_dup 5) (match_dup 4)))] +{ + operands[3] = gen_reg_rtx (DImode); + operands[5] = lowpart_subreg (SImode, operands[3], DImode); + if (INTVAL (operands[2]) == 63) + { + emit_insn (gen_bsr_rex64_1 (operands[3], operands[1])); + emit_move_insn (operands[0], operands[5]); + DONE; + } + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode); +}) + +(define_split + [(set (match_operand:SI 0 "register_operand") + (minus:SI + (match_operand:SI 2 "const_int_operand") + (xor:SI + (minus:SI (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand"))) + (const_int 31))))] + "!TARGET_LZCNT && ix86_pre_reload_split ()" + [(set (match_dup 3) + (minus:SI (const_int 31) (clz:SI (match_dup 1)))) + (set (match_dup 0) + (plus:SI (match_dup 3) (match_dup 4)))] +{ + if (INTVAL (operands[2]) == 31) + { + emit_insn (gen_bsr_1 (operands[0], operands[1])); + DONE; + } + operands[3] = gen_reg_rtx (SImode); + operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode); +}) + +(define_split + [(set (match_operand:DI 0 "register_operand") + (minus:DI + (match_operand:DI 2 "const_int_operand") + (xor:DI + (sign_extend:DI + (minus:SI (const_int 63) + (subreg:SI + (clz:DI (match_operand:DI 1 "nonimmediate_operand")) + 0))) + (const_int 63))))] + "!TARGET_LZCNT + && TARGET_64BIT + && ix86_pre_reload_split () + && ((unsigned HOST_WIDE_INT) + trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode) + == UINTVAL (operands[2]) - 63)" + [(set (match_dup 3) + (minus:DI (const_int 63) (clz:DI (match_dup 1)))) + (set (match_dup 0) + (plus:DI (match_dup 3) (match_dup 4)))] +{ + if (INTVAL (operands[2]) == 63) + { + emit_insn (gen_bsr_rex64_1 (operands[0], operands[1])); + DONE; + } + operands[3] = gen_reg_rtx (DImode); + operands[4] = GEN_INT (UINTVAL (operands[2]) - 63); +}) + +(define_split + [(set (match_operand:DI 0 "register_operand") + (minus:DI + (match_operand:DI 2 "const_int_operand") + (sign_extend:DI + (xor:SI + (minus:SI (const_int 31) + (clz:SI (match_operand:SI 1 "nonimmediate_operand"))) + (const_int 31)))))] + "!TARGET_LZCNT + && TARGET_64BIT + && ix86_pre_reload_split () + && ((unsigned HOST_WIDE_INT) + trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode) + == UINTVAL (operands[2]) - 31)" + [(set (match_dup 3) + (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1))))) + (set (match_dup 0) + (plus:DI (match_dup 3) (match_dup 4)))] +{ + if (INTVAL (operands[2]) == 31) + { + emit_insn (gen_bsr_zext_1 (operands[0], operands[1])); + DONE; + } + operands[3] = gen_reg_rtx (DImode); + operands[4] = GEN_INT (UINTVAL (operands[2]) - 31); +}) + (define_expand "clz<mode>2" [(parallel [(set (reg:CCZ FLAGS_REG) (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm") (const_int 0))) - (set (match_operand:SWI48 0 "register_operand") - (minus:SWI48 - (match_dup 2) - (clz:SWI48 (match_dup 1))))]) + (set (match_dup 3) (minus:SWI48 + (match_dup 2) + (clz:SWI48 (match_dup 1))))]) (parallel - [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2))) + [(set (match_operand:SWI48 0 "register_operand") + (xor:SWI48 (match_dup 3) (match_dup 2))) (clobber (reg:CC FLAGS_REG))])] "" { @@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2" DONE; } operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1); + operands[3] = gen_reg_rtx (<MODE>mode); }) (define_insn_and_split "clz<mode>2_lzcnt" --- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj 2021-07-30 15:07:26.104139537 +0200 +++ gcc/testsuite/gcc.target/i386/pr78103-1.c 2021-07-30 15:07:26.104139537 +0200 @@ -0,0 +1,28 @@ +/* PR target/78103 */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mno-lzcnt" } */ +/* { dg-final { scan-assembler-not {\mcltq\M} } } */ + +long long +foo (long long x) +{ + return __builtin_clzll (x); +} + +long long +bar (long long x) +{ + return (unsigned int) __builtin_clzll (x); +} + +long long +baz (int x) +{ + return __builtin_clz (x); +} + +long long +qux (int x) +{ + return (unsigned int) __builtin_clz (x); +} --- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj 2021-07-30 15:07:26.104139537 +0200 +++ gcc/testsuite/gcc.target/i386/pr78103-2.c 2021-07-30 15:07:26.104139537 +0200 @@ -0,0 +1,33 @@ +/* PR target/78103 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-lzcnt" } */ +/* { dg-final { scan-assembler-not {\mmovl\M} } } */ +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */ +/* { dg-final { scan-assembler-not {\msubl\M} } } */ +/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */ + +unsigned int +foo (unsigned int x) +{ + return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x); +} + +unsigned int +bar (unsigned int x) +{ + return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x); +} + +#ifdef __x86_64__ +unsigned int +baz (unsigned long long x) +{ + return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x); +} + +unsigned int +qux (unsigned long long x) +{ + return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x); +} +#endif --- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj 2021-07-30 15:07:26.104139537 +0200 +++ gcc/testsuite/gcc.target/i386/pr78103-3.c 2021-07-30 15:07:26.104139537 +0200 @@ -0,0 +1,32 @@ +/* PR target/78103 */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mno-lzcnt" } */ +/* { dg-final { scan-assembler-not {\mmovl\M} } } */ +/* { dg-final { scan-assembler-not {\mmovslq\M} } } */ +/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */ +/* { dg-final { scan-assembler-not {\msubq\M} } } */ +/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */ + +unsigned long long +foo (unsigned int x) +{ + return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x); +} + +unsigned long long +bar (unsigned int x) +{ + return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x); +} + +unsigned long long +baz (unsigned long long x) +{ + return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x); +} + +unsigned long long +qux (unsigned long long x) +{ + return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x); +} Jakub