On Thu, 19 Aug 2021, Jakub Jelinek wrote: > Hi! > > As suggested in the PR, the following patch adds two new clrsb > expansion possibilities if the target doesn't have clrsb_optab for the > requested mode or any wider mode, but does have clz_optab for the requested > mode. > One expansion is > clrsb (op0) > expands as > clz (op0 ^ (((stype)op0) >> (prec-1))) - 1 > which is usable if CLZ_DEFINED_VALUE_AT_ZERO is 2 with value > of prec, because the clz argument can be 0 and clrsb should give > prec-1 in that case. > The other expansion is > clz (((op0 << 1) ^ (((stype)op0) >> (prec-1))) | 1) > where the clz argument is never 0, but it is one operation longer. > E.g. on x86_64-linux with -O2 -mno-lzcnt, this results for > int foo (int x) { return __builtin_clrsb (x); } > in > - subq $8, %rsp > - movslq %edi, %rdi > - call __clrsbdi2 > - addq $8, %rsp > - subl $32, %eax > + leal (%rdi,%rdi), %eax > + sarl $31, %edi > + xorl %edi, %eax > + orl $1, %eax > + bsrl %eax, %eax > + xorl $31, %eax > and with -O2 -mlzcnt: > + movl %edi, %eax > + sarl $31, %eax > + xorl %edi, %eax > + lzcntl %eax, %eax > + subl $1, %eax > On armv7hl-linux-gnueabi with -O2: > - push {r4, lr} > - bl __clrsbsi2 > - pop {r4, pc} > + @ link register save eliminated. > + eor r0, r0, r0, asr #31 > + clz r0, r0 > + sub r0, r0, #1 > + bx lr > As it (at least usually) will make code larger, it is > disabled for -Os or cold instructions. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK. Richard. > 2021-08-19 Jakub Jelinek <ja...@redhat.com> > > PR middle-end/101950 > * optabs.c (expand_clrsb_using_clz): New function. > (expand_unop): Use it as another clrsb expansion fallback. > > * gcc.target/i386/pr101950-1.c: New test. > * gcc.target/i386/pr101950-2.c: New test. > > --- gcc/optabs.c.jj 2021-07-15 10:16:13.027581160 +0200 > +++ gcc/optabs.c 2021-08-18 13:36:56.410818265 +0200 > @@ -2600,6 +2600,82 @@ widen_leading (scalar_int_mode mode, rtx > return 0; > } > > +/* Attempt to emit (clrsb:mode op0) as > + (plus:mode (clz:mode (xor:mode op0 (ashr:mode op0 (const_int prec-1)))) > + (const_int -1)) > + if CLZ_DEFINED_VALUE_AT_ZERO (mode, val) is 2 and val is prec, > + or as > + (clz:mode (ior:mode (xor:mode (ashl:mode op0 (const_int 1)) > + (ashr:mode op0 (const_int prec-1))) > + (const_int 1))) > + otherwise. */ > + > +static rtx > +expand_clrsb_using_clz (scalar_int_mode mode, rtx op0, rtx target) > +{ > + if (optimize_insn_for_size_p () > + || optab_handler (clz_optab, mode) == CODE_FOR_nothing) > + return NULL_RTX; > + > + start_sequence (); > + HOST_WIDE_INT val = 0; > + if (CLZ_DEFINED_VALUE_AT_ZERO (mode, val) != 2 > + || val != GET_MODE_PRECISION (mode)) > + val = 0; > + else > + val = 1; > + > + rtx temp2 = op0; > + if (!val) > + { > + temp2 = expand_binop (mode, ashl_optab, op0, const1_rtx, > + NULL_RTX, 0, OPTAB_DIRECT); > + if (!temp2) > + { > + fail: > + end_sequence (); > + return NULL_RTX; > + } > + } > + > + rtx temp = expand_binop (mode, ashr_optab, op0, > + GEN_INT (GET_MODE_PRECISION (mode) - 1), > + NULL_RTX, 0, OPTAB_DIRECT); > + if (!temp) > + goto fail; > + > + temp = expand_binop (mode, xor_optab, temp2, temp, NULL_RTX, 0, > + OPTAB_DIRECT); > + if (!temp) > + goto fail; > + > + if (!val) > + { > + temp = expand_binop (mode, ior_optab, temp, const1_rtx, > + NULL_RTX, 0, OPTAB_DIRECT); > + if (!temp) > + goto fail; > + } > + temp = expand_unop_direct (mode, clz_optab, temp, val ? 
NULL_RTX : target, > + true); > + if (!temp) > + goto fail; > + if (val) > + { > + temp = expand_binop (mode, add_optab, temp, constm1_rtx, > + target, 0, OPTAB_DIRECT); > + if (!temp) > + goto fail; > + } > + > + rtx_insn *seq = get_insns (); > + end_sequence (); > + > + add_equal_note (seq, temp, CLRSB, op0, NULL_RTX, mode); > + emit_insn (seq); > + return temp; > +} > + > /* Try calculating clz of a double-word quantity as two clz's of word-sized > quantities, choosing which based on whether the high word is nonzero. */ > static rtx > @@ -3171,6 +3247,9 @@ expand_unop (machine_mode mode, optab un > temp = widen_leading (int_mode, op0, target, unoptab); > if (temp) > return temp; > + temp = expand_clrsb_using_clz (int_mode, op0, target); > + if (temp) > + return temp; > } > goto try_libcall; > } > --- gcc/testsuite/gcc.target/i386/pr101950-1.c.jj 2021-08-18 > 13:58:05.363093681 +0200 > +++ gcc/testsuite/gcc.target/i386/pr101950-1.c 2021-08-18 > 14:01:22.905335834 +0200 > @@ -0,0 +1,20 @@ > +/* PR middle-end/101950 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mno-lzcnt" } */ > +/* { dg-final { scan-assembler-not "call\[^\n\r]*__clrsb.i2" } } */ > +/* { dg-final { scan-assembler-times "\tbsr\[ql]\t" 2 } } */ > +/* { dg-final { scan-assembler-times "\txor\[ql]\t" 4 } } */ > +/* { dg-final { scan-assembler-times "\tor\[ql]\t" 2 } } */ > +/* { dg-final { scan-assembler-times "\tsar\[ql]\t|\tcltd" 2 } } */ > + > +int > +foo (long x) > +{ > + return __builtin_clrsbl (x); > +} > + > +int > +bar (int x) > +{ > + return __builtin_clrsb (x); > +} > --- gcc/testsuite/gcc.target/i386/pr101950-2.c.jj 2021-08-18 > 13:58:11.367009865 +0200 > +++ gcc/testsuite/gcc.target/i386/pr101950-2.c 2021-08-18 > 14:01:18.540396835 +0200 > @@ -0,0 +1,19 @@ > +/* PR middle-end/101950 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mlzcnt" } */ > +/* { dg-final { scan-assembler-not "call\[^\n\r]*__clrsb.i2" } } */ > +/* { dg-final { scan-assembler-times "\tlzcnt\[ql]\t" 2 } } 
*/ > +/* { dg-final { scan-assembler-times "\txor\[ql]\t" 2 } } */ > +/* { dg-final { scan-assembler-times "\tsar\[ql]\t|\tcltd" 2 } } */ > + > +int > +foo (long x) > +{ > + return __builtin_clrsbl (x); > +} > + > +int > +bar (int x) > +{ > + return __builtin_clrsb (x); > +} > > Jakub > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)