On Fri, Jul 30, 2021 at 12:27:39PM +0200, Uros Bizjak wrote:
> Please put some space here, e.g.:
...
> Can you just name the relevant insn pattern and use
> 
> emit_insn (gen_bsr_1)?

Here is the updated patch.  I'll bootstrap/regtest it tonight.

2021-07-30  Jakub Jelinek  <ja...@redhat.com>

        PR target/78103
        * config/i386/i386.md (bsr_rex64_1, bsr_1, bsr_zext_1): New
        define_insn patterns.
        (*bsr_rex64_2, *bsr_2): New define_insn_and_split patterns.
        Add combine splitters for constant - clz.
        (clz<mode>2): Use a temporary pseudo for bsr result.

        * gcc.target/i386/pr78103-1.c: New test.
        * gcc.target/i386/pr78103-2.c: New test.
        * gcc.target/i386/pr78103-3.c: New test.

--- gcc/config/i386/i386.md.jj  2021-07-28 12:05:56.857977764 +0200
+++ gcc/config/i386/i386.md     2021-07-30 15:13:49.994946550 +0200
@@ -14761,6 +14761,18 @@ (define_insn "bsr_rex64"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "DI")])
 
+(define_insn "bsr_rex64_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (minus:DI (const_int 63)
+                 (clz:DI (match_operand:DI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{q}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "DI")])
+
 (define_insn "bsr"
   [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ (match_operand:SI 1 "nonimmediate_operand" "rm")
@@ -14775,17 +14787,204 @@ (define_insn "bsr"
    (set_attr "znver1_decode" "vector")
    (set_attr "mode" "SI")])
 
+(define_insn "bsr_1"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (minus:SI (const_int 31)
+                 (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT"
+  "bsr{l}\t{%1, %0|%0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+(define_insn "bsr_zext_1"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+       (zero_extend:DI
+         (minus:SI
+           (const_int 31)
+           (clz:SI (match_operand:SI 1 "nonimmediate_operand" "rm")))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT"
+  "bsr{l}\t{%1, %k0|%k0, %1}"
+  [(set_attr "type" "alu1")
+   (set_attr "prefix_0f" "1")
+   (set_attr "znver1_decode" "vector")
+   (set_attr "mode" "SI")])
+
+; As the result of bsr is undefined for a zero input and for any other input
+; it is in the range 0 to 63, we can optimize away sign-extends.
+(define_insn_and_split "*bsr_rex64_2"
+  [(set (match_operand:DI 0 "register_operand")
+       (xor:DI
+         (sign_extend:DI
+           (minus:SI
+             (const_int 63)
+             (subreg:SI (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+                        0)))
+         (const_int 63)))
+    (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+                  (compare:CCZ (match_dup 1) (const_int 0)))
+             (set (match_dup 2)
+                  (minus:DI (const_int 63) (clz:DI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+                  (zero_extend:DI (xor:SI (match_dup 3) (const_int 63))))
+             (clobber (reg:CC FLAGS_REG))])]
+{
+  operands[2] = gen_reg_rtx (DImode);
+  operands[3] = lowpart_subreg (SImode, operands[2], DImode);
+})
+
+(define_insn_and_split "*bsr_2"
+  [(set (match_operand:DI 0 "register_operand")
+       (sign_extend:DI
+         (xor:SI
+           (minus:SI
+             (const_int 31)
+             (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+           (const_int 31))))
+   (clobber (reg:CC FLAGS_REG))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(parallel [(set (reg:CCZ FLAGS_REG)
+                  (compare:CCZ (match_dup 1) (const_int 0)))
+             (set (match_dup 2)
+                  (minus:SI (const_int 31) (clz:SI (match_dup 1))))])
+   (parallel [(set (match_dup 0)
+                  (zero_extend:DI (xor:SI (match_dup 2) (const_int 31))))
+             (clobber (reg:CC FLAGS_REG))])]
+  "operands[2] = gen_reg_rtx (SImode);")
+
+; Splitters to optimize 64 - __builtin_clzl (x) or 32 - __builtin_clz (x).
+; Again, as for !TARGET_LZCNT CLZ is UB at zero, CLZ is guaranteed to be
+; in [0, 63] or [0, 31] range.
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+       (minus:SI
+         (match_operand:SI 2 "const_int_operand")
+         (xor:SI
+           (minus:SI (const_int 63)
+                     (subreg:SI
+                       (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+                       0))
+           (const_int 63))))]
+  "!TARGET_LZCNT && TARGET_64BIT && ix86_pre_reload_split ()"
+  [(set (match_dup 3)
+       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0)
+       (plus:SI (match_dup 5) (match_dup 4)))]
+{
+  operands[3] = gen_reg_rtx (DImode);
+  operands[5] = lowpart_subreg (SImode, operands[3], DImode);
+  if (INTVAL (operands[2]) == 63)
+    {
+      emit_insn (gen_bsr_rex64_1 (operands[3], operands[1]));
+      emit_move_insn (operands[0], operands[5]);
+      DONE;
+    }
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 63, SImode);
+})
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+       (minus:SI
+         (match_operand:SI 2 "const_int_operand")
+         (xor:SI
+           (minus:SI (const_int 31)
+                     (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+           (const_int 31))))]
+  "!TARGET_LZCNT && ix86_pre_reload_split ()"
+  [(set (match_dup 3)
+       (minus:SI (const_int 31) (clz:SI (match_dup 1))))
+   (set (match_dup 0)
+       (plus:SI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      emit_insn (gen_bsr_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (SImode);
+  operands[4] = gen_int_mode (UINTVAL (operands[2]) - 31, SImode);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+       (minus:DI
+         (match_operand:DI 2 "const_int_operand")
+         (xor:DI
+           (sign_extend:DI
+             (minus:SI (const_int 63)
+                       (subreg:SI
+                         (clz:DI (match_operand:DI 1 "nonimmediate_operand"))
+                         0)))
+           (const_int 63))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 63, SImode)
+       == UINTVAL (operands[2]) - 63)"
+  [(set (match_dup 3)
+       (minus:DI (const_int 63) (clz:DI (match_dup 1))))
+   (set (match_dup 0)
+       (plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 63)
+    {
+      emit_insn (gen_bsr_rex64_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 63);
+})
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+       (minus:DI
+         (match_operand:DI 2 "const_int_operand")
+         (sign_extend:DI
+           (xor:SI
+             (minus:SI (const_int 31)
+                       (clz:SI (match_operand:SI 1 "nonimmediate_operand")))
+             (const_int 31)))))]
+  "!TARGET_LZCNT
+   && TARGET_64BIT
+   && ix86_pre_reload_split ()
+   && ((unsigned HOST_WIDE_INT)
+       trunc_int_for_mode (UINTVAL (operands[2]) - 31, SImode)
+       == UINTVAL (operands[2]) - 31)"
+  [(set (match_dup 3)
+       (zero_extend:DI (minus:SI (const_int 31) (clz:SI (match_dup 1)))))
+   (set (match_dup 0)
+       (plus:DI (match_dup 3) (match_dup 4)))]
+{
+  if (INTVAL (operands[2]) == 31)
+    {
+      emit_insn (gen_bsr_zext_1 (operands[0], operands[1]));
+      DONE;
+    }
+  operands[3] = gen_reg_rtx (DImode);
+  operands[4] = GEN_INT (UINTVAL (operands[2]) - 31);
+})
+
 (define_expand "clz<mode>2"
   [(parallel
      [(set (reg:CCZ FLAGS_REG)
        (compare:CCZ (match_operand:SWI48 1 "nonimmediate_operand" "rm")
                     (const_int 0)))
-      (set (match_operand:SWI48 0 "register_operand")
-          (minus:SWI48
-            (match_dup 2)
-            (clz:SWI48 (match_dup 1))))])
+      (set (match_dup 3) (minus:SWI48
+                          (match_dup 2)
+                          (clz:SWI48 (match_dup 1))))])
    (parallel
-     [(set (match_dup 0) (xor:SWI48 (match_dup 0) (match_dup 2)))
+     [(set (match_operand:SWI48 0 "register_operand")
+          (xor:SWI48 (match_dup 3) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
   ""
 {
@@ -14795,6 +14994,7 @@ (define_expand "clz<mode>2"
       DONE;
     }
   operands[2] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode)-1);
+  operands[3] = gen_reg_rtx (<MODE>mode);
 })
 
 (define_insn_and_split "clz<mode>2_lzcnt"
--- gcc/testsuite/gcc.target/i386/pr78103-1.c.jj        2021-07-30 15:07:26.104139537 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-1.c   2021-07-30 15:07:26.104139537 +0200
@@ -0,0 +1,28 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mcltq\M} } } */
+
+long long
+foo (long long x)
+{
+  return __builtin_clzll (x);
+}
+
+long long
+bar (long long x)
+{
+  return (unsigned int) __builtin_clzll (x);
+}
+
+long long
+baz (int x)
+{
+  return __builtin_clz (x);
+}
+
+long long
+qux (int x)
+{
+  return (unsigned int) __builtin_clz (x);
+}
--- gcc/testsuite/gcc.target/i386/pr78103-2.c.jj        2021-07-30 15:07:26.104139537 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-2.c   2021-07-30 15:07:26.104139537 +0200
@@ -0,0 +1,33 @@
+/* PR target/78103 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubl\M} } } */
+/* { dg-final { scan-assembler {\m(leal|addl)\M} } } */
+
+unsigned int
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned int
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+#ifdef __x86_64__
+unsigned int
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned int
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}
+#endif
--- gcc/testsuite/gcc.target/i386/pr78103-3.c.jj        2021-07-30 15:07:26.104139537 +0200
+++ gcc/testsuite/gcc.target/i386/pr78103-3.c   2021-07-30 15:07:26.104139537 +0200
@@ -0,0 +1,32 @@
+/* PR target/78103 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-lzcnt" } */
+/* { dg-final { scan-assembler-not {\mmovl\M} } } */
+/* { dg-final { scan-assembler-not {\mmovslq\M} } } */
+/* { dg-final { scan-assembler-not {\mxor[lq]\M} } } */
+/* { dg-final { scan-assembler-not {\msubq\M} } } */
+/* { dg-final { scan-assembler {\m(leaq|addq)\M} } } */
+
+unsigned long long
+foo (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - __builtin_clz (x);
+}
+
+unsigned long long
+bar (unsigned int x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned int) - 1 - __builtin_clz (x);
+}
+
+unsigned long long
+baz (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - __builtin_clzll (x);
+}
+
+unsigned long long
+qux (unsigned long long x)
+{
+  return __CHAR_BIT__ * sizeof (unsigned long long) - 1 - __builtin_clzll (x);
+}


        Jakub

Reply via email to