https://gcc.gnu.org/g:f6971e28e71eda7f8a1d1247f57ee7f16828864d

commit r17-916-gf6971e28e71eda7f8a1d1247f57ee7f16828864d
Author: Roger Sayle <[email protected]>
Date:   Thu May 28 20:50:11 2026 +0100

    x86_64 SSE: Handle SUBREG conversions in TImode STV (for ptest).
    
    This patch teaches i386's STV pass how to handle SUBREG conversions,
    i.e. that a TImode SUBREG can be transformed into a V1TImode SUBREG,
    without worrying about other DEFs and USEs.
    
    One example where this is useful is
    
    typedef long long __m128i __attribute__ ((__vector_size__ (16)));
    int foo (__m128i x, __m128i y) {
      return (__int128)x == (__int128)y;
    }
    
    where with -O2 -msse4 we can now scalar-to-vector transform:
    
    (insn 7 4 8 2 (set (reg:CCZ 17 flags)
            (compare:CCZ (subreg:TI (reg/v:V2DI 86 [ x ]) 0)
                (subreg:TI (reg/v:V2DI 87 [ y ]) 0))) {*cmpti_doubleword}
    
    into
    
    (insn 17 4 7 2 (set (reg:V1TI 91)
            (xor:V1TI (subreg:V1TI (reg/v:V2DI 86 [ x ]) 0)
                (subreg:V1TI (reg/v:V2DI 87 [ y ]) 0)))
         (nil))
    (insn 7 17 8 2 (set (reg:CCZ 17 flags)
            (unspec:CCZ [
                    (reg:V1TI 91) repeated x2
                ] UNSPEC_PTEST)) {*sse4_1_ptestv1ti}
         (expr_list:REG_DEAD (reg/v:V2DI 87 [ y ])
            (expr_list:REG_DEAD (reg/v:V2DI 86 [ x ])
                (nil))))
    
    with the dramatic effect that the assembly output before:
    
    foo:    movaps  %xmm0, -40(%rsp)
            movq    -32(%rsp), %rdx
            movq    %xmm0, %rax
            movq    %xmm1, %rsi
            movaps  %xmm1, -24(%rsp)
            movq    -16(%rsp), %rcx
            xorq    %rsi, %rax
            xorq    %rcx, %rdx
            orq     %rdx, %rax
            sete    %al
            movzbl  %al, %eax
            ret
    
    now becomes
    
    foo:    pxor    %xmm1, %xmm0
            xorl    %eax, %eax
            ptest   %xmm0, %xmm0
            sete    %al
            ret
    
    i.e. a 128-bit vector doesn't need to be transferred to the
    scalar unit to be tested for equality.  The new test case includes
    additional related examples that show similar improvements.
    
    Previously we explicitly checked *cmpti_doubleword operands to be
    either immediate constants, or a TImode REG or a TImode MEM.  By
    enhancing this to allow a TImode SUBREG, we now handle everything
    that would match the general_operand predicate, making this part
    of STV more like other RTL passes (lra/reload).  The big change is
    that unlike a regular DF USE, a SUBREG USE doesn't require us to
    analyze and convert the rest of the DEF-USE chain.
    
    2026-05-28  Roger Sayle  <[email protected]>
                Hongtao Liu  <[email protected]>
    
    gcc/ChangeLog
            * config/i386/i386-features.cc (scalar_chain::add_insn): Don't
            call analyze_register_chain if the USE is a SUBREG.
            (scalar_chain::convert_op): Call gen_lowpart to convert
            scalar (TImode) SUBREGs to vector (V1TImode) SUBREGs.
            (convertible_comparison_p): We can now handle all general_operands
            of *cmp<dwi>_doubleword.
            (timode_remove_non_convertible_regs): We only need to check TImode
            uses that aren't TImode SUBREGs of registers in other modes.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/sse4_1-ptest-7.c: New test case.

Diff:
---
 gcc/config/i386/i386-features.cc               | 19 +++++++++----------
 gcc/testsuite/gcc.target/i386/sse4_1-ptest-7.c | 22 ++++++++++++++++++++++
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 30c15e63a5e2..4f3f50a65248 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -527,7 +527,8 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
       }
 
   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
-    if (!DF_REF_REG_MEM_P (ref))
+    if (DF_REF_TYPE (ref) == DF_REF_REG_USE
+       && !SUBREG_P (DF_REF_REG (ref)))
       if (!analyze_register_chain (candidates, ref, disallowed))
        return false;
 
@@ -1167,7 +1168,8 @@ scalar_chain::convert_op (rtx *op, rtx_insn *insn)
   else
     {
       gcc_assert (SUBREG_P (*op));
-      gcc_assert (GET_MODE (*op) == vmode);
+      if (GET_MODE (*op) != vmode)
+       *op = gen_lowpart (vmode, *op);
     }
 }
 
@@ -2343,12 +2345,8 @@ convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
   rtx op2 = XEXP (src, 1);
 
   /* *cmp<dwi>_doubleword.  */
-  if ((CONST_SCALAR_INT_P (op1)
-       || ((REG_P (op1) || MEM_P (op1))
-          && GET_MODE (op1) == mode))
-      && (CONST_SCALAR_INT_P (op2)
-         || ((REG_P (op2) || MEM_P (op2))
-             && GET_MODE (op2) == mode)))
+  if (general_operand (op1, mode)
+      && general_operand (op2, mode))
     return true;
 
   /* *testti_doubleword.  */
@@ -2715,8 +2713,9 @@ timode_remove_non_convertible_regs (bitmap candidates)
                                               DF_REF_REGNO (ref));
 
        FOR_EACH_INSN_USE (ref, insn)
-         if (!DF_REF_REG_MEM_P (ref)
-             && GET_MODE (DF_REF_REG (ref)) == TImode)
+         if (DF_REF_TYPE (ref) == DF_REF_REG_USE
+             && GET_MODE (DF_REF_REG (ref)) == TImode
+             && !SUBREG_P (DF_REF_REG (ref)))
            timode_check_non_convertible_regs (candidates, regs,
                                               DF_REF_REGNO (ref));
       }
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-ptest-7.c b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-7.c
new file mode 100644
index 000000000000..2b7c049d155e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-ptest-7.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mno-stackrealign" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo (__m128i x, __m128i y)
+{
+  return (__int128)x == (__int128)y;
+}
+
+int bar (__m128i x, __m128i y)
+{
+  return (__int128)(x^y) == 0;
+}
+
+int baz (__m128i x, __m128i y)
+{
+  return (__int128)(x==y) == ~0;
+}
+
+/* { dg-final { scan-assembler-times "ptest\[ \\t\]+%" 3 } } */
+/* { dg-final { scan-assembler-not "%\[er\]sp" } } */

Reply via email to