https://gcc.gnu.org/g:727f8b142b7d5442af6c2e903293abc367a8de5f

commit r15-1835-g727f8b142b7d5442af6c2e903293abc367a8de5f
Author: Roger Sayle <ro...@nextmovesoftware.com>
Date:   Thu Jul 4 07:31:17 2024 +0100

    i386: Add additional variant of bswaphisi2_lowpart peephole2.
    
    This patch adds an additional variation of the peephole2 used to convert
    bswaphisi2_lowpart into rotlhi3_1_slp, which converts xchgb %ah,%al into
    rolw if the flags register isn't live.  The motivating example is:
    
    void ext(int x);
    void foo(int x)
    {
      ext((x&~0xffff)|((x>>8)&0xff)|((x&0xff)<<8));
    }
    
    where GCC with -O2 currently produces:
    
    foo:    movl    %edi, %eax
            rolw    $8, %ax
            movl    %eax, %edi
            jmp     ext
    
    The issue is that the original xchgb (bswaphisi2_lowpart) can only be
    performed in "Q" registers that allow the %?h register to be used, so
    reload generates the above two movl.  However, it's later in peephole2
    where we see that CC_FLAGS can be clobbered, so we can use a rotate word,
    which is more forgiving with register allocations.  With the additional
    peephole2 proposed here, we now generate:
    
    foo:    rolw    $8, %di
            jmp     ext
    
    2024-07-04  Roger Sayle  <ro...@nextmovesoftware.com>
    
    gcc/ChangeLog
            * config/i386/i386.md (bswaphisi2_lowpart peephole2): New
            peephole2 variant to eliminate register shuffling.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/xchg-4.c: New test case.

Diff:
---
 gcc/config/i386/i386.md                | 24 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/xchg-4.c | 11 +++++++++++
 2 files changed, 35 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4a44b69b5fc..b24c4fe5875 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -21489,6 +21489,30 @@
              (clobber (reg:CC FLAGS_REG))])]
   "operands[0] = gen_lowpart (HImode, operands[0]);")
 
+;; Variant of above peephole2 to improve register allocation.
+(define_peephole2
+  [(set (match_operand:SI 0 "general_reg_operand") ; insn 1: op0 = op1
+        (match_operand:SI 1 "register_operand"))
+   (set (match_dup 0) ; insn 2: swap the two low bytes of op0 in place
+       (ior:SI (and:SI (match_dup 0)
+                       (const_int -65536)) ; keep bits 16-31 unchanged
+               (lshiftrt:SI (bswap:SI (match_dup 0))
+                            (const_int 16)))) ; byte-swapped low half in bits 0-15
+   (set (match_operand:SI 2 "general_reg_operand") (match_dup 0))] ; insn 3: op2 = op0
+  "!(TARGET_USE_XCHGB ||
+     TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && peep2_regno_dead_p (0, FLAGS_REG)
+   && peep2_reg_dead_p(3, operands[0])"
+  [(parallel ; replacement: a single insn operating on op3 (set up below)
+    [(set (strict_low_part (match_dup 3))
+         (rotate:HI (match_dup 3) (const_int 8))) ; 16-bit rotate by 8 bits
+     (clobber (reg:CC FLAGS_REG))])] ; hence the FLAGS_REG check in the condition
+{
+  if (!rtx_equal_p (operands[1], operands[2]))
+    emit_move_insn (operands[2], operands[1]); /* op2 = op1, bypassing op0.  */
+  operands[3] = gen_lowpart (HImode, operands[2]); /* HImode lowpart of op2.  */
+})
+
 (define_expand "paritydi2"
   [(set (match_operand:DI 0 "register_operand")
        (parity:DI (match_operand:DI 1 "register_operand")))]
diff --git a/gcc/testsuite/gcc.target/i386/xchg-4.c b/gcc/testsuite/gcc.target/i386/xchg-4.c
new file mode 100644
index 00000000000..de099e79f5d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/xchg-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+void ext(int x); /* Declared only; receives the byte-swapped value.  */
+void foo(int x) 
+{
+    ext((x&~0xffff)|((x>>8)&0xff)|((x&0xff)<<8)); /* Swap bytes 0 and 1 of x; keep bits 16-31.  */
+}
+
+/* { dg-final { scan-assembler "rolw" } } */
+/* { dg-final { scan-assembler-not "mov" } } */

Reply via email to