Re: [PATCH] expand: Add new clrsb fallback expansion [PR101950]

2021-08-19 Thread Richard Biener via Gcc-patches
On Thu, 19 Aug 2021, Jakub Jelinek wrote:

> Hi!
> 
> As suggested in the PR, the following patch adds two new clrsb
> expansion possibilities if target doesn't have clrsb_optab for the
> requested nor wider modes, but does have clz_optab for the requested
> mode.
> One expansion is
> clrsb (op0)
> expands as
> clz (op0 ^ (((stype)op0) >> (prec-1))) - 1
> which is usable if CLZ_DEFINED_VALUE_AT_ZERO is 2 with value
> of prec, because the clz argument can be 0 and clrsb should give
> prec-1 in that case.
> The other expansion is
> clz (((op0 << 1) ^ (((stype)op0) >> (prec-1))) | 1)
> where the clz argument is never 0, but it is one operation longer.
> E.g. on x86_64-linux with -O2 -mno-lzcnt, this results for
> int foo (int x) { return __builtin_clrsb (x); }
> in
> - subq$8, %rsp
> - movslq  %edi, %rdi
> - call__clrsbdi2
> - addq$8, %rsp
> - subl$32, %eax
> + leal(%rdi,%rdi), %eax
> + sarl$31, %edi
> + xorl%edi, %eax
> + orl $1, %eax
> + bsrl%eax, %eax
> + xorl$31, %eax
> and with -O2 -mlzcnt:
> + movl%edi, %eax
> + sarl$31, %eax
> + xorl%edi, %eax
> + lzcntl  %eax, %eax
> + subl$1, %eax
> On armv7hl-linux-gnueabi with -O2:
> - push{r4, lr}
> - bl  __clrsbsi2
> - pop {r4, pc}
> + @ link register save eliminated.
> + eor r0, r0, r0, asr #31
> + clz r0, r0
> + sub r0, r0, #1
> + bx  lr
> As it (at least usually) will make code larger, it is
> disabled for -Os or cold instructions.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Richard.

> 2021-08-19  Jakub Jelinek  
> 
>   PR middle-end/101950
>   * optabs.c (expand_clrsb_using_clz): New function.
>   (expand_unop): Use it as another clrsb expansion fallback.
> 
>   * gcc.target/i386/pr101950-1.c: New test.
>   * gcc.target/i386/pr101950-2.c: New test.
> 
> --- gcc/optabs.c.jj   2021-07-15 10:16:13.027581160 +0200
> +++ gcc/optabs.c  2021-08-18 13:36:56.410818265 +0200
> @@ -2600,6 +2600,82 @@ widen_leading (scalar_int_mode mode, rtx
>return 0;
>  }
>  
> +/* Attempt to emit (clrsb:mode op0) as
> +   (plus:mode (clz:mode (xor:mode op0 (ashr:mode op0 (const_int prec-1
> +   (const_int -1))
> +   if CLZ_DEFINED_VALUE_AT_ZERO (mode, val) is 2 and val is prec,
> +   or as
> +   (clz:mode (ior:mode (xor:mode (ashl:mode op0 (const_int 1))
> +  (ashr:mode op0 (const_int prec-1)))
> +(const_int 1)))
> +   otherwise.  */
> +
> +static rtx
> +expand_clrsb_using_clz (scalar_int_mode mode, rtx op0, rtx target)
> +{
> +  if (optimize_insn_for_size_p ()
> +  || optab_handler (clz_optab, mode) == CODE_FOR_nothing)
> +return NULL_RTX;
> +
> +  start_sequence ();
> +  HOST_WIDE_INT val = 0;
> +  if (CLZ_DEFINED_VALUE_AT_ZERO (mode, val) != 2
> +  || val != GET_MODE_PRECISION (mode))
> +val = 0;
> +  else
> +val = 1;
> +
> +  rtx temp2 = op0;
> +  if (!val)
> +{
> +  temp2 = expand_binop (mode, ashl_optab, op0, const1_rtx,
> + NULL_RTX, 0, OPTAB_DIRECT);
> +  if (!temp2)
> + {
> + fail:
> +   end_sequence ();
> +   return NULL_RTX;
> + }
> +}
> +
> +  rtx temp = expand_binop (mode, ashr_optab, op0,
> +GEN_INT (GET_MODE_PRECISION (mode) - 1),
> +NULL_RTX, 0, OPTAB_DIRECT);
> +  if (!temp)
> +goto fail;
> +
> +  temp = expand_binop (mode, xor_optab, temp2, temp, NULL_RTX, 0,
> +OPTAB_DIRECT);
> +  if (!temp)
> +goto fail;
> +
> +  if (!val)
> +{
> +  temp = expand_binop (mode, ior_optab, temp, const1_rtx,
> +NULL_RTX, 0, OPTAB_DIRECT);
> +  if (!temp)
> + goto fail;
> +}
> +  temp = expand_unop_direct (mode, clz_optab, temp, val ? NULL_RTX : target,
> +  true);
> +  if (!temp)
> +goto fail;
> +  if (val)
> +{
> +  temp = expand_binop (mode, add_optab, temp, constm1_rtx,
> +target, 0, OPTAB_DIRECT);
> +  if (!temp)
> + goto fail;
> +}
> +
> +  rtx_insn *seq = get_insns ();
> +  end_sequence ();
> +
> +  add_equal_note (seq, temp, CLRSB, op0, NULL_RTX, mode);
> +  emit_insn (seq);
> +  return temp;
> +}
> +
>  /* Try calculating clz of a double-word quantity as two clz's of word-sized
> quantities, choosing which based on whether the high word is nonzero.  */
>  static rtx
> @@ -3171,6 +3247,9 @@ expand_unop (machine_mode mode, optab un
> temp = widen_leading (int_mode, op0, target, unoptab);
> if (temp)
>   return temp;
> +   temp = expand_clrsb_using_clz (int_mode, op0, target);
> +   if (temp)
> + return temp;
>   }
>goto try_libcall;
>  }
> --- gcc/testsuite/gcc.target/i386/pr101950-1.c.jj 2021-08-18 
> 13:58:05.363093681 +0200
> +++ 

[PATCH] expand: Add new clrsb fallback expansion [PR101950]

2021-08-19 Thread Jakub Jelinek via Gcc-patches
Hi!

As suggested in the PR, the following patch adds two new clrsb
expansion possibilities if target doesn't have clrsb_optab for the
requested nor wider modes, but does have clz_optab for the requested
mode.
One expansion is
clrsb (op0)
expands as
clz (op0 ^ (((stype)op0) >> (prec-1))) - 1
which is usable if CLZ_DEFINED_VALUE_AT_ZERO is 2 with value
of prec, because the clz argument can be 0 and clrsb should give
prec-1 in that case.
The other expansion is
clz (((op0 << 1) ^ (((stype)op0) >> (prec-1))) | 1)
where the clz argument is never 0, but it is one operation longer.
E.g. on x86_64-linux with -O2 -mno-lzcnt, this results for
int foo (int x) { return __builtin_clrsb (x); }
in
-   subq$8, %rsp
-   movslq  %edi, %rdi
-   call__clrsbdi2
-   addq$8, %rsp
-   subl$32, %eax
+   leal(%rdi,%rdi), %eax
+   sarl$31, %edi
+   xorl%edi, %eax
+   orl $1, %eax
+   bsrl%eax, %eax
+   xorl$31, %eax
and with -O2 -mlzcnt:
+   movl%edi, %eax
+   sarl$31, %eax
+   xorl%edi, %eax
+   lzcntl  %eax, %eax
+   subl$1, %eax
On armv7hl-linux-gnueabi with -O2:
-   push{r4, lr}
-   bl  __clrsbsi2
-   pop {r4, pc}
+   @ link register save eliminated.
+   eor r0, r0, r0, asr #31
+   clz r0, r0
+   sub r0, r0, #1
+   bx  lr
As it (at least usually) will make code larger, it is
disabled for -Os or cold instructions.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2021-08-19  Jakub Jelinek  

PR middle-end/101950
* optabs.c (expand_clrsb_using_clz): New function.
(expand_unop): Use it as another clrsb expansion fallback.

* gcc.target/i386/pr101950-1.c: New test.
* gcc.target/i386/pr101950-2.c: New test.

--- gcc/optabs.c.jj 2021-07-15 10:16:13.027581160 +0200
+++ gcc/optabs.c2021-08-18 13:36:56.410818265 +0200
@@ -2600,6 +2600,82 @@ widen_leading (scalar_int_mode mode, rtx
   return 0;
 }
 
+/* Attempt to emit (clrsb:mode op0) as
+   (plus:mode (clz:mode (xor:mode op0 (ashr:mode op0 (const_int prec-1
+ (const_int -1))
+   if CLZ_DEFINED_VALUE_AT_ZERO (mode, val) is 2 and val is prec,
+   or as
+   (clz:mode (ior:mode (xor:mode (ashl:mode op0 (const_int 1))
+(ashr:mode op0 (const_int prec-1)))
+  (const_int 1)))
+   otherwise.  */
+
+static rtx
+expand_clrsb_using_clz (scalar_int_mode mode, rtx op0, rtx target)
+{
+  if (optimize_insn_for_size_p ()
+  || optab_handler (clz_optab, mode) == CODE_FOR_nothing)
+return NULL_RTX;
+
+  start_sequence ();
+  HOST_WIDE_INT val = 0;
+  if (CLZ_DEFINED_VALUE_AT_ZERO (mode, val) != 2
+  || val != GET_MODE_PRECISION (mode))
+val = 0;
+  else
+val = 1;
+
+  rtx temp2 = op0;
+  if (!val)
+{
+  temp2 = expand_binop (mode, ashl_optab, op0, const1_rtx,
+   NULL_RTX, 0, OPTAB_DIRECT);
+  if (!temp2)
+   {
+   fail:
+ end_sequence ();
+ return NULL_RTX;
+   }
+}
+
+  rtx temp = expand_binop (mode, ashr_optab, op0,
+  GEN_INT (GET_MODE_PRECISION (mode) - 1),
+  NULL_RTX, 0, OPTAB_DIRECT);
+  if (!temp)
+goto fail;
+
+  temp = expand_binop (mode, xor_optab, temp2, temp, NULL_RTX, 0,
+  OPTAB_DIRECT);
+  if (!temp)
+goto fail;
+
+  if (!val)
+{
+  temp = expand_binop (mode, ior_optab, temp, const1_rtx,
+  NULL_RTX, 0, OPTAB_DIRECT);
+  if (!temp)
+   goto fail;
+}
+  temp = expand_unop_direct (mode, clz_optab, temp, val ? NULL_RTX : target,
+true);
+  if (!temp)
+goto fail;
+  if (val)
+{
+  temp = expand_binop (mode, add_optab, temp, constm1_rtx,
+  target, 0, OPTAB_DIRECT);
+  if (!temp)
+   goto fail;
+}
+
+  rtx_insn *seq = get_insns ();
+  end_sequence ();
+
+  add_equal_note (seq, temp, CLRSB, op0, NULL_RTX, mode);
+  emit_insn (seq);
+  return temp;
+}
+
 /* Try calculating clz of a double-word quantity as two clz's of word-sized
quantities, choosing which based on whether the high word is nonzero.  */
 static rtx
@@ -3171,6 +3247,9 @@ expand_unop (machine_mode mode, optab un
  temp = widen_leading (int_mode, op0, target, unoptab);
  if (temp)
return temp;
+ temp = expand_clrsb_using_clz (int_mode, op0, target);
+ if (temp)
+   return temp;
}
   goto try_libcall;
 }
--- gcc/testsuite/gcc.target/i386/pr101950-1.c.jj   2021-08-18 
13:58:05.363093681 +0200
+++ gcc/testsuite/gcc.target/i386/pr101950-1.c  2021-08-18 14:01:22.905335834 
+0200
@@ -0,0 +1,20 @@
+/* PR middle-end/101950 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not "call\[^\n\r]*__clrsb.i2" } } */
+/* { dg-final {