https://gcc.gnu.org/g:c0fd049ae75349dc273de868fd3b3e8935115418

commit r17-563-gc0fd049ae75349dc273de868fd3b3e8935115418
Author: Jakub Jelinek <[email protected]>
Date:   Mon May 18 09:41:59 2026 +0200

    i386: Implement bitreverse<mode>2 optab for GFNI [PR50481]
    
    The following patch implements the bitreverse<mode>2 optab for
    -mgfni -msse2 (SSE2 because apparently -mgfni doesn't imply -msse nor
    -msse2).
    This is done by using gf2p8affineqb insn with a special constant
    which reverses bits in each byte, and for modes wider than QImode
    also by doing a byteswap afterwards.
    With -m64 it emits
    .LC0:
            .byte   1, 2, 4, 8, 16, 32, 64, -128
            .byte   1, 2, 4, 8, 16, 32, 64, -128
    and
            movd    %edi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movd    %xmm0, %eax
    for __builtin_bitreverse8,
            movd    %edi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movd    %xmm0, %eax
            rolw    $8, %ax
    for __builtin_bitreverse16,
            movd    %edi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movd    %xmm0, %eax
            bswap   %eax
    for __builtin_bitreverse32,
            movq    %rdi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movq    %xmm0, %rax
            bswap   %rax
    for __builtin_bitreverse64, and
            movq    %rdi, %xmm0
            pinsrq  $1, %rsi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movq    %xmm0, %rax
            pextrq  $1, %xmm0, %rdx
            bswap   %rax
            bswap   %rdx
            xchgq   %rdx, %rax
    for __builtin_bitreverse128 (only the xchgq is unnecessary
    and surprising, some RA issue).
    
    2026-05-18  Jakub Jelinek  <[email protected]>
    
            PR target/50481
            * config/i386/i386-protos.h (ix86_expand_gfni_bitreverse): Declare.
            * config/i386/i386-expand.cc (ix86_expand_gfni_bitreverse): New
            function.
            * config/i386/i386.md (bitreverse<mode>2): New expander.
    
            * gcc.target/i386/gfni-builtin-bitreverse-1.c: New test.
    
    Reviewed-by: Hongtao Liu <[email protected]>

Diff:
---
 gcc/config/i386/i386-expand.cc                     | 117 +++++++++++++++++++++
 gcc/config/i386/i386-protos.h                      |   1 +
 gcc/config/i386/i386.md                            |   9 ++
 .../gcc.target/i386/gfni-builtin-bitreverse-1.c    |  13 +++
 4 files changed, 140 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index f234f884f9e0..1a1d0652f885 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -28069,5 +28069,122 @@ ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
 }
 
+/* Implement bitreverse<mode>2 using gf2p8affineqb.  */
+
+void
+ix86_expand_gfni_bitreverse (rtx dest, rtx src)
+{
+  machine_mode mode = GET_MODE (dest);
+  rtx temp;
+  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+    {
+      rtx temp1 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
+      rtx temp2 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
+      if (mode == TImode)
+       {
+         temp = lowpart_subreg (DImode, src, TImode);
+         emit_insn (gen_rtx_SET (temp1, gen_rtx_VEC_CONCAT (V2DImode, temp,
+                                                            const0_rtx)));
+         temp = gen_highpart (DImode, src);
+         emit_insn (gen_rtx_SET (temp2, gen_rtx_VEC_CONCAT (V2DImode, temp,
+                                                            const0_rtx)));
+       }
+      else
+       {
+         temp = lowpart_subreg (SImode, src, DImode);
+         emit_insn (gen_vec_setv4si_0 (temp1, CONST0_RTX (V4SImode), temp));
+         temp = gen_highpart (SImode, src);
+         emit_insn (gen_vec_setv4si_0 (temp2, CONST0_RTX (V4SImode), temp));
+         temp1 = lowpart_subreg (V2DImode, temp1, V4SImode);
+         temp2 = lowpart_subreg (V2DImode, temp2, V4SImode);
+       }
+      temp = gen_reg_rtx (V2DImode);
+      emit_insn (gen_vec_interleave_lowv2di (temp, temp1, temp2));
+    }
+  else if (mode != DImode)
+    {
+      if (mode != SImode)
+       {
+         src = force_reg (mode, src);
+         src = lowpart_subreg (SImode, src, mode);
+       }
+      temp = gen_reg_rtx (V4SImode);
+      emit_insn (gen_vec_setv4si_0 (temp, CONST0_RTX (V4SImode), src));
+    }
+  else
+    {
+      temp = gen_reg_rtx (V2DImode);
+      emit_insn (gen_rtx_SET (temp, gen_rtx_VEC_CONCAT (V2DImode, src,
+                                                       const0_rtx)));
+    }
+  src = temp;
+  temp = gen_reg_rtx (V16QImode);
+  rtx src2 = gen_rtx_CONST_VECTOR (V16QImode,
+                                  gen_rtvec (16, GEN_INT (1), GEN_INT (2),
+                                             GEN_INT (4), GEN_INT (8),
+                                             GEN_INT (16), GEN_INT (32),
+                                             GEN_INT (64), GEN_INT (-128),
+                                             GEN_INT (1), GEN_INT (2),
+                                             GEN_INT (4), GEN_INT (8),
+                                             GEN_INT (16), GEN_INT (32),
+                                             GEN_INT (64), GEN_INT (-128)));
+  src2 = validize_mem (force_const_mem (V16QImode, src2));
+  src = lowpart_subreg (V16QImode, src, GET_MODE (src));
+  emit_insn (gen_vgf2p8affineqb_v16qi (temp, src, src2, const0_rtx));
+  if (mode == QImode)
+    {
+      rtx temp1 = gen_reg_rtx (SImode);
+      rtx temp2 = lowpart_subreg (V4SImode, temp, V16QImode);
+      rtx temp3 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+      emit_insn (gen_rtx_SET (temp1,
+                             gen_rtx_VEC_SELECT (SImode, temp2, temp3)));
+      emit_move_insn (dest, lowpart_subreg (QImode, temp1, SImode));
+      return;
+    }
+  rtx target = gen_reg_rtx ((GET_MODE_SIZE (mode) < 4 || !TARGET_64BIT)
+                           ? SImode : mode == TImode ? DImode : mode);
+  emit_move_insn (target, lowpart_subreg (GET_MODE (target), temp, V16QImode));
+  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
+    {
+      rtx temp1 = gen_reg_rtx (GET_MODE (target));
+      if (mode == TImode || TARGET_SSE4_1)
+       {
+         rtx temp2 = lowpart_subreg (mode == TImode ? V2DImode : V4SImode,
+                                     temp, V16QImode);
+         rtx temp3 = gen_rtx_PARALLEL (VOIDmode,
+                                       gen_rtvec (1, GEN_INT (mode == TImode
+                                                              ? 1 : 2)));
+         emit_insn (gen_rtx_SET (temp1,
+                                 gen_rtx_VEC_SELECT (GET_MODE (target), temp2,
+                                                     temp3)));
+       }
+      else
+       {
+         rtx temp2 = gen_reg_rtx (V4SImode);
+         rtx temp3 = lowpart_subreg (V4SImode, temp, V16QImode);
+         emit_insn (gen_sse2_pshufd (temp2, temp3, GEN_INT (0xaa)));
+         emit_move_insn (temp1, lowpart_subreg (GET_MODE (target), temp2,
+                                                V4SImode));
+       }
+      rtx temp4 = gen_reg_rtx (GET_MODE (target));
+      rtx temp5 = gen_reg_rtx (GET_MODE (target));
+      rtx (*gen_bswap) (rtx, rtx)
+       = mode == TImode ? gen_bswapdi2 : gen_bswapsi2;
+      emit_insn (gen_bswap (temp4, target));
+      emit_insn (gen_bswap (temp5, temp1));
+      temp4 = gen_rtx_ZERO_EXTEND (mode, temp4);
+      temp5 = gen_rtx_ZERO_EXTEND (mode, temp5);
+      rtx shift = GEN_INT (GET_MODE_PRECISION (GET_MODE (target)));
+      temp4 = gen_rtx_ASHIFT (mode, temp4, shift);
+      emit_insn (gen_rtx_SET (dest, gen_rtx_IOR (mode, temp4, temp5)));
+      return;
+    }
+  if (mode == HImode)
+    target = lowpart_subreg (mode, target, SImode);
+  if (mode == SImode)
+    emit_insn (gen_bswapsi2 (dest, target));
+  else
+    emit_insn (gen_rtx_SET (dest, gen_rtx_BSWAP (mode, target)));
+}
 
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 4ba4fb08556e..106a79dd41ee 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -263,6 +263,7 @@ extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
                                int idx, rtx target);
 extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);
 extern void ix86_expand_vector_bf2sf_with_vec_perm (rtx, rtx);
+extern void ix86_expand_gfni_bitreverse (rtx, rtx);
 
 
 #ifdef TREE_CODE
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fa84da26aa89..051105d5c1f0 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -23157,6 +23157,15 @@
   operands[3] = gen_lowpart (HImode, operands[2]);
 })
 
+(define_expand "bitreverse<mode>2"
+  [(set (match_operand:SWIDWI 0 "register_operand")
+       (bitreverse:SWIDWI (match_operand:SWIDWI 1 "nonimmediate_operand")))]
+  "TARGET_GFNI && TARGET_SSE2"
+{
+  ix86_expand_gfni_bitreverse (operands[0], operands[1]);
+  DONE;
+})
+
 (define_expand "paritydi2"
   [(set (match_operand:DI 0 "register_operand")
        (parity:DI (match_operand:DI 1 "register_operand")))]
diff --git a/gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c b/gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c
new file mode 100644
index 000000000000..6c190a3de846
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c
@@ -0,0 +1,13 @@
+/* { dg-do run { target gfni } } */
+/* { dg-options "-mgfni -O2" } */
+
+#define main do_test
+#include "../../gcc.dg/builtin-bitreverse-1.c"
+#undef main
+
+int
+main ()
+{
+  if (__builtin_cpu_supports ("gfni"))
+    do_test ();
+}

Reply via email to