Re: [PATCH] expand: Add new clrsb fallback expansion [PR101950]
On Thu, 19 Aug 2021, Jakub Jelinek wrote: > Hi! > > As suggested in the PR, the following patch adds two new clrsb > expansion possibilities if target doesn't have clrsb_optab for the > requested nor wider modes, but does have clz_optab for the requested > mode. > One expansion is > clrsb (op0) > expands as > clz (op0 ^ (((stype)op0) >> (prec-1))) - 1 > which is usable if CLZ_DEFINED_VALUE_AT_ZERO is 2 with value > of prec, because the clz argument can be 0 and clrsb should give > prec-1 in that case. > The other expansion is > clz (((op0 << 1) ^ (((stype)op0) >> (prec-1))) | 1) > where the clz argument is never 0, but it is one operation longer. > E.g. on x86_64-linux with -O2 -mno-lzcnt, this results for > int foo (int x) { return __builtin_clrsb (x); } > in > - subq$8, %rsp > - movslq %edi, %rdi > - call__clrsbdi2 > - addq$8, %rsp > - subl$32, %eax > + leal(%rdi,%rdi), %eax > + sarl$31, %edi > + xorl%edi, %eax > + orl $1, %eax > + bsrl%eax, %eax > + xorl$31, %eax > and with -O2 -mlzcnt: > + movl%edi, %eax > + sarl$31, %eax > + xorl%edi, %eax > + lzcntl %eax, %eax > + subl$1, %eax > On armv7hl-linux-gnueabi with -O2: > - push{r4, lr} > - bl __clrsbsi2 > - pop {r4, pc} > + @ link register save eliminated. > + eor r0, r0, r0, asr #31 > + clz r0, r0 > + sub r0, r0, #1 > + bx lr > As it (at least usually) will make code larger, it is > disabled for -Os or cold instructions. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? OK. Richard. > 2021-08-19 Jakub Jelinek > > PR middle-end/101950 > * optabs.c (expand_clrsb_using_clz): New function. > (expand_unop): Use it as another clrsb expansion fallback. > > * gcc.target/i386/pr101950-1.c: New test. > * gcc.target/i386/pr101950-2.c: New test. > > --- gcc/optabs.c.jj 2021-07-15 10:16:13.027581160 +0200 > +++ gcc/optabs.c 2021-08-18 13:36:56.410818265 +0200 > @@ -2600,6 +2600,82 @@ widen_leading (scalar_int_mode mode, rtx >return 0; > } > > +/* Attempt to emit (clrsb:mode op0) as > + (plus:mode (clz:mode (xor:mode op0 (ashr:mode op0 (const_int prec-1 > + (const_int -1)) > + if CLZ_DEFINED_VALUE_AT_ZERO (mode, val) is 2 and val is prec, > + or as > + (clz:mode (ior:mode (xor:mode (ashl:mode op0 (const_int 1)) > + (ashr:mode op0 (const_int prec-1))) > +(const_int 1))) > + otherwise. */ > + > +static rtx > +expand_clrsb_using_clz (scalar_int_mode mode, rtx op0, rtx target) > +{ > + if (optimize_insn_for_size_p () > + || optab_handler (clz_optab, mode) == CODE_FOR_nothing) > +return NULL_RTX; > + > + start_sequence (); > + HOST_WIDE_INT val = 0; > + if (CLZ_DEFINED_VALUE_AT_ZERO (mode, val) != 2 > + || val != GET_MODE_PRECISION (mode)) > +val = 0; > + else > +val = 1; > + > + rtx temp2 = op0; > + if (!val) > +{ > + temp2 = expand_binop (mode, ashl_optab, op0, const1_rtx, > + NULL_RTX, 0, OPTAB_DIRECT); > + if (!temp2) > + { > + fail: > + end_sequence (); > + return NULL_RTX; > + } > +} > + > + rtx temp = expand_binop (mode, ashr_optab, op0, > +GEN_INT (GET_MODE_PRECISION (mode) - 1), > +NULL_RTX, 0, OPTAB_DIRECT); > + if (!temp) > +goto fail; > + > + temp = expand_binop (mode, xor_optab, temp2, temp, NULL_RTX, 0, > +OPTAB_DIRECT); > + if (!temp) > +goto fail; > + > + if (!val) > +{ > + temp = expand_binop (mode, ior_optab, temp, const1_rtx, > +NULL_RTX, 0, OPTAB_DIRECT); > + if (!temp) > + goto fail; > +} > + temp = expand_unop_direct (mode, clz_optab, temp, val ? NULL_RTX : target, > + true); > + if (!temp) > +goto fail; > + if (val) > +{ > + temp = expand_binop (mode, add_optab, temp, constm1_rtx, > +target, 0, OPTAB_DIRECT); > + if (!temp) > + goto fail; > +} > + > + rtx_insn *seq = get_insns (); > + end_sequence (); > + > + add_equal_note (seq, temp, CLRSB, op0, NULL_RTX, mode); > + emit_insn (seq); > + return temp; > +} > + > /* Try calculating clz of a double-word quantity as two clz's of word-sized > quantities, choosing which based on whether the high word is nonzero. */ > static rtx > @@ -3171,6 +3247,9 @@ expand_unop (machine_mode mode, optab un > temp = widen_leading (int_mode, op0, target, unoptab); > if (temp) > return temp; > + temp = expand_clrsb_using_clz (int_mode, op0, target); > + if (temp) > + return temp; > } >goto try_libcall; > } > --- gcc/testsuite/gcc.target/i386/pr101950-1.c.jj 2021-08-18 > 13:58:05.363093681 +0200 > +++
[PATCH] expand: Add new clrsb fallback expansion [PR101950]
Hi! As suggested in the PR, the following patch adds two new clrsb expansion possibilities if target doesn't have clrsb_optab for the requested nor wider modes, but does have clz_optab for the requested mode. One expansion is clrsb (op0) expands as clz (op0 ^ (((stype)op0) >> (prec-1))) - 1 which is usable if CLZ_DEFINED_VALUE_AT_ZERO is 2 with value of prec, because the clz argument can be 0 and clrsb should give prec-1 in that case. The other expansion is clz (((op0 << 1) ^ (((stype)op0) >> (prec-1))) | 1) where the clz argument is never 0, but it is one operation longer. E.g. on x86_64-linux with -O2 -mno-lzcnt, this results for int foo (int x) { return __builtin_clrsb (x); } in - subq$8, %rsp - movslq %edi, %rdi - call__clrsbdi2 - addq$8, %rsp - subl$32, %eax + leal(%rdi,%rdi), %eax + sarl$31, %edi + xorl%edi, %eax + orl $1, %eax + bsrl%eax, %eax + xorl$31, %eax and with -O2 -mlzcnt: + movl%edi, %eax + sarl$31, %eax + xorl%edi, %eax + lzcntl %eax, %eax + subl$1, %eax On armv7hl-linux-gnueabi with -O2: - push{r4, lr} - bl __clrsbsi2 - pop {r4, pc} + @ link register save eliminated. + eor r0, r0, r0, asr #31 + clz r0, r0 + sub r0, r0, #1 + bx lr As it (at least usually) will make code larger, it is disabled for -Os or cold instructions. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2021-08-19 Jakub Jelinek PR middle-end/101950 * optabs.c (expand_clrsb_using_clz): New function. (expand_unop): Use it as another clrsb expansion fallback. * gcc.target/i386/pr101950-1.c: New test. * gcc.target/i386/pr101950-2.c: New test. --- gcc/optabs.c.jj 2021-07-15 10:16:13.027581160 +0200 +++ gcc/optabs.c2021-08-18 13:36:56.410818265 +0200 @@ -2600,6 +2600,82 @@ widen_leading (scalar_int_mode mode, rtx return 0; } +/* Attempt to emit (clrsb:mode op0) as + (plus:mode (clz:mode (xor:mode op0 (ashr:mode op0 (const_int prec-1 + (const_int -1)) + if CLZ_DEFINED_VALUE_AT_ZERO (mode, val) is 2 and val is prec, + or as + (clz:mode (ior:mode (xor:mode (ashl:mode op0 (const_int 1)) +(ashr:mode op0 (const_int prec-1))) + (const_int 1))) + otherwise. */ + +static rtx +expand_clrsb_using_clz (scalar_int_mode mode, rtx op0, rtx target) +{ + if (optimize_insn_for_size_p () + || optab_handler (clz_optab, mode) == CODE_FOR_nothing) +return NULL_RTX; + + start_sequence (); + HOST_WIDE_INT val = 0; + if (CLZ_DEFINED_VALUE_AT_ZERO (mode, val) != 2 + || val != GET_MODE_PRECISION (mode)) +val = 0; + else +val = 1; + + rtx temp2 = op0; + if (!val) +{ + temp2 = expand_binop (mode, ashl_optab, op0, const1_rtx, + NULL_RTX, 0, OPTAB_DIRECT); + if (!temp2) + { + fail: + end_sequence (); + return NULL_RTX; + } +} + + rtx temp = expand_binop (mode, ashr_optab, op0, + GEN_INT (GET_MODE_PRECISION (mode) - 1), + NULL_RTX, 0, OPTAB_DIRECT); + if (!temp) +goto fail; + + temp = expand_binop (mode, xor_optab, temp2, temp, NULL_RTX, 0, + OPTAB_DIRECT); + if (!temp) +goto fail; + + if (!val) +{ + temp = expand_binop (mode, ior_optab, temp, const1_rtx, + NULL_RTX, 0, OPTAB_DIRECT); + if (!temp) + goto fail; +} + temp = expand_unop_direct (mode, clz_optab, temp, val ? NULL_RTX : target, +true); + if (!temp) +goto fail; + if (val) +{ + temp = expand_binop (mode, add_optab, temp, constm1_rtx, + target, 0, OPTAB_DIRECT); + if (!temp) + goto fail; +} + + rtx_insn *seq = get_insns (); + end_sequence (); + + add_equal_note (seq, temp, CLRSB, op0, NULL_RTX, mode); + emit_insn (seq); + return temp; +} + /* Try calculating clz of a double-word quantity as two clz's of word-sized quantities, choosing which based on whether the high word is nonzero. */ static rtx @@ -3171,6 +3247,9 @@ expand_unop (machine_mode mode, optab un temp = widen_leading (int_mode, op0, target, unoptab); if (temp) return temp; + temp = expand_clrsb_using_clz (int_mode, op0, target); + if (temp) + return temp; } goto try_libcall; } --- gcc/testsuite/gcc.target/i386/pr101950-1.c.jj 2021-08-18 13:58:05.363093681 +0200 +++ gcc/testsuite/gcc.target/i386/pr101950-1.c 2021-08-18 14:01:22.905335834 +0200 @@ -0,0 +1,20 @@ +/* PR middle-end/101950 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-lzcnt" } */ +/* { dg-final { scan-assembler-not "call\[^\n\r]*__clrsb.i2" } } */ +/* { dg-final {