https://gcc.gnu.org/g:c0fd049ae75349dc273de868fd3b3e8935115418
commit r17-563-gc0fd049ae75349dc273de868fd3b3e8935115418
Author: Jakub Jelinek <[email protected]>
Date:   Mon May 18 09:41:59 2026 +0200

    i386: Implement bitreverse<mode>2 optab for GFNI [PR50481]

    The following patch implements the bitreverse<mode>2 optab for
    -mgfni -msse2 (SSE2 because apparently -mgfni implies neither -msse
    nor -msse2).
    This is done by using gf2p8affineqb insn with a special constant which
    reverses bits in each byte, and for modes wider than QImode also by
    doing a byteswap afterwards.

    With -m64 it emits
    .LC0:
            .byte   1, 2, 4, 8, 16, 32, 64, -128
            .byte   1, 2, 4, 8, 16, 32, 64, -128
    and
            movd    %edi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movd    %xmm0, %eax
    for __builtin_bitreverse8,
            movd    %edi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movd    %xmm0, %eax
            rolw    $8, %ax
    for __builtin_bitreverse16,
            movd    %edi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movd    %xmm0, %eax
            bswap   %eax
    for __builtin_bitreverse32,
            movq    %rdi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movq    %xmm0, %rax
            bswap   %rax
    for __builtin_bitreverse64, and
            movq    %rdi, %xmm0
            pinsrq  $1, %rsi, %xmm0
            gf2p8affineqb   $0, .LC0(%rip), %xmm0
            movq    %xmm0, %rax
            pextrq  $1, %xmm0, %rdx
            bswap   %rax
            bswap   %rdx
            xchgq   %rdx, %rax
    for __builtin_bitreverse128 (only the xchgq is unnecessary and
    surprising, some register allocation (RA) issue).

    2026-05-18  Jakub Jelinek  <[email protected]>

            PR target/50481
            * config/i386/i386-protos.h (ix86_expand_gfni_bitreverse): Declare.
            * config/i386/i386-expand.cc (ix86_expand_gfni_bitreverse): New
            function.
            * config/i386/i386.md (bitreverse<mode>2): New expander.

            * gcc.target/i386/gfni-builtin-bitreverse-1.c: New test.
Reviewed-by: Hongtao Liu <[email protected]> Diff: --- gcc/config/i386/i386-expand.cc | 117 +++++++++++++++++++++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.md | 9 ++ .../gcc.target/i386/gfni-builtin-bitreverse-1.c | 13 +++ 4 files changed, 140 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index f234f884f9e0..1a1d0652f885 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -28069,5 +28069,122 @@ ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src) emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode)); } +/* Implement bitreverse<mode>2 using gf2p8affineqb. */ + +void +ix86_expand_gfni_bitreverse (rtx dest, rtx src) +{ + machine_mode mode = GET_MODE (dest); + rtx temp; + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { + rtx temp1 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode); + rtx temp2 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode); + if (mode == TImode) + { + temp = lowpart_subreg (DImode, src, TImode); + emit_insn (gen_rtx_SET (temp1, gen_rtx_VEC_CONCAT (V2DImode, temp, + const0_rtx))); + temp = gen_highpart (DImode, src); + emit_insn (gen_rtx_SET (temp2, gen_rtx_VEC_CONCAT (V2DImode, temp, + const0_rtx))); + } + else + { + temp = lowpart_subreg (SImode, src, DImode); + emit_insn (gen_vec_setv4si_0 (temp1, CONST0_RTX (V4SImode), temp)); + temp = gen_highpart (SImode, src); + emit_insn (gen_vec_setv4si_0 (temp2, CONST0_RTX (V4SImode), temp)); + temp1 = lowpart_subreg (V2DImode, temp1, V4SImode); + temp2 = lowpart_subreg (V2DImode, temp2, V4SImode); + } + temp = gen_reg_rtx (V2DImode); + emit_insn (gen_vec_interleave_lowv2di (temp, temp1, temp2)); + } + else if (mode != DImode) + { + if (mode != SImode) + { + src = force_reg (mode, src); + src = lowpart_subreg (SImode, src, mode); + } + temp = gen_reg_rtx (V4SImode); + emit_insn (gen_vec_setv4si_0 (temp, CONST0_RTX (V4SImode), src)); + } + else + { + temp = gen_reg_rtx (V2DImode); + 
emit_insn (gen_rtx_SET (temp, gen_rtx_VEC_CONCAT (V2DImode, src, + const0_rtx))); + } + src = temp; + temp = gen_reg_rtx (V16QImode); + rtx src2 = gen_rtx_CONST_VECTOR (V16QImode, + gen_rtvec (16, GEN_INT (1), GEN_INT (2), + GEN_INT (4), GEN_INT (8), + GEN_INT (16), GEN_INT (32), + GEN_INT (64), GEN_INT (-128), + GEN_INT (1), GEN_INT (2), + GEN_INT (4), GEN_INT (8), + GEN_INT (16), GEN_INT (32), + GEN_INT (64), GEN_INT (-128))); + src2 = validize_mem (force_const_mem (V16QImode, src2)); + src = lowpart_subreg (V16QImode, src, GET_MODE (src)); + emit_insn (gen_vgf2p8affineqb_v16qi (temp, src, src2, const0_rtx)); + if (mode == QImode) + { + rtx temp1 = gen_reg_rtx (SImode); + rtx temp2 = lowpart_subreg (V4SImode, temp, V16QImode); + rtx temp3 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx)); + emit_insn (gen_rtx_SET (temp1, + gen_rtx_VEC_SELECT (SImode, temp2, temp3))); + emit_move_insn (dest, lowpart_subreg (QImode, temp1, SImode)); + return; + } + rtx target = gen_reg_rtx ((GET_MODE_SIZE (mode) < 4 || !TARGET_64BIT) + ? SImode : mode == TImode ? DImode : mode); + emit_move_insn (target, lowpart_subreg (GET_MODE (target), temp, V16QImode)); + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + { + rtx temp1 = gen_reg_rtx (GET_MODE (target)); + if (mode == TImode || TARGET_SSE4_1) + { + rtx temp2 = lowpart_subreg (mode == TImode ? V2DImode : V4SImode, + temp, V16QImode); + rtx temp3 = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (1, GEN_INT (mode == TImode + ? 1 : 2))); + emit_insn (gen_rtx_SET (temp1, + gen_rtx_VEC_SELECT (GET_MODE (target), temp2, + temp3))); + } + else + { + rtx temp2 = gen_reg_rtx (V4SImode); + rtx temp3 = lowpart_subreg (V4SImode, temp, V16QImode); + emit_insn (gen_sse2_pshufd (temp2, temp3, GEN_INT (0xaa))); + emit_move_insn (temp1, lowpart_subreg (GET_MODE (target), temp2, + V4SImode)); + } + rtx temp4 = gen_reg_rtx (GET_MODE (target)); + rtx temp5 = gen_reg_rtx (GET_MODE (target)); + rtx (*gen_bswap) (rtx, rtx) + = mode == TImode ? 
gen_bswapdi2 : gen_bswapsi2; + emit_insn (gen_bswap (temp4, target)); + emit_insn (gen_bswap (temp5, temp1)); + temp4 = gen_rtx_ZERO_EXTEND (mode, temp4); + temp5 = gen_rtx_ZERO_EXTEND (mode, temp5); + rtx shift = GEN_INT (GET_MODE_PRECISION (GET_MODE (target))); + temp4 = gen_rtx_ASHIFT (mode, temp4, shift); + emit_insn (gen_rtx_SET (dest, gen_rtx_IOR (mode, temp4, temp5))); + return; + } + if (mode == HImode) + target = lowpart_subreg (mode, target, SImode); + if (mode == SImode) + emit_insn (gen_bswapsi2 (dest, target)); + else + emit_insn (gen_rtx_SET (dest, gen_rtx_BSWAP (mode, target))); +} #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 4ba4fb08556e..106a79dd41ee 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -263,6 +263,7 @@ extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, rtx target); extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx); extern void ix86_expand_vector_bf2sf_with_vec_perm (rtx, rtx); +extern void ix86_expand_gfni_bitreverse (rtx, rtx); #ifdef TREE_CODE diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index fa84da26aa89..051105d5c1f0 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -23157,6 +23157,15 @@ operands[3] = gen_lowpart (HImode, operands[2]); }) +(define_expand "bitreverse<mode>2" + [(set (match_operand:SWIDWI 0 "register_operand") + (bitreverse:SWIDWI (match_operand:SWIDWI 1 "nonimmediate_operand")))] + "TARGET_GFNI && TARGET_SSE2" +{ + ix86_expand_gfni_bitreverse (operands[0], operands[1]); + DONE; +}) + (define_expand "paritydi2" [(set (match_operand:DI 0 "register_operand") (parity:DI (match_operand:DI 1 "register_operand")))] diff --git a/gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c b/gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c new file mode 100644 index 000000000000..6c190a3de846 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/gfni-builtin-bitreverse-1.c @@ -0,0 +1,13 @@ +/* { dg-do run { target gfni } } */ +/* { dg-options "-mgfni -O2" } */ + +#define main do_test +#include "../../gcc.dg/builtin-bitreverse-1.c" +#undef main + +int +main () +{ + if (__builtin_cpu_supports ("gfni")) + do_test (); +}
