https://gcc.gnu.org/g:f4afefbbbee1414e130ca2f1552216bb702a985c

commit r16-4539-gf4afefbbbee1414e130ca2f1552216bb702a985c
Author: Roger Sayle <[email protected]>
Date:   Tue Oct 21 13:14:58 2025 +0100

    x86_64: Start TImode STV chains from zero-extension or *concatditi.
    
    Currently x86_64's TImode STV pass has the restriction that candidate
    chains must start with a TImode load from memory.  This patch improves
    the functionality of STV to allow zero-extensions and construction of
    TImode pseudos from two DImode values (i.e. *concatditi) to both be
    considered candidate chain initiators.  For example, this allows chains
    starting from an __int128 function argument to be processed by STV.
    
    Compiled with -O2 on x86_64:
    
    __int128 m0,m1,m2,m3;
    void foo(__int128 m)
    {
        m0 = m;
        m1 = m;
        m2 = m;
        m3 = m;
    }
    
    Previously generated:
    
    foo:    xchgq   %rdi, %rsi
            movq    %rsi, m0(%rip)
            movq    %rdi, m0+8(%rip)
            movq    %rsi, m1(%rip)
            movq    %rdi, m1+8(%rip)
            movq    %rsi, m2(%rip)
            movq    %rdi, m2+8(%rip)
            movq    %rsi, m3(%rip)
            movq    %rdi, m3+8(%rip)
            ret
    
    With the patch, we now generate:
    
    foo:    movq    %rdi, %xmm0
            movq    %rsi, %xmm1
            punpcklqdq      %xmm1, %xmm0
            movaps  %xmm0, m0(%rip)
            movaps  %xmm0, m1(%rip)
            movaps  %xmm0, m2(%rip)
            movaps  %xmm0, m3(%rip)
            ret
    
    or with -mavx2:
    
    foo:    vmovq   %rdi, %xmm1
            vpinsrq $1, %rsi, %xmm1, %xmm0
            vmovdqa %xmm0, m0(%rip)
            vmovdqa %xmm0, m1(%rip)
            vmovdqa %xmm0, m2(%rip)
            vmovdqa %xmm0, m3(%rip)
            ret
    
    Likewise, for zero-extension:
    
    __int128 m0,m1,m2,m3;
    void bar(unsigned long x)
    {
        __int128 m = x;
        m0 = m;
        m1 = m;
        m2 = m;
        m3 = m;
    }
    
    Previously with -O2:
    
    bar:    movq    %rdi, m0(%rip)
            movq    $0, m0+8(%rip)
            movq    %rdi, m1(%rip)
            movq    $0, m1+8(%rip)
            movq    %rdi, m2(%rip)
            movq    $0, m2+8(%rip)
            movq    %rdi, m3(%rip)
            movq    $0, m3+8(%rip)
            ret
    
    with this patch:
    
    bar:    movq    %rdi, %xmm0
            movaps  %xmm0, m0(%rip)
            movaps  %xmm0, m1(%rip)
            movaps  %xmm0, m2(%rip)
            movaps  %xmm0, m3(%rip)
            ret
    
    As shown in the examples above, the scalar-to-vector (STV) conversion of
    *concatditi has an overhead (whereas treating two DImode registers as a
    TImode value is free on x86_64), but specifying this penalty allows the
    STV pass to make an informed decision as to whether the total cost/gain
    of the chain is a net win.
    
    2025-10-21  Roger Sayle  <[email protected]>
    
    gcc/ChangeLog
            * config/i386/i386-features.cc (timode_concatdi_p): New
            function to recognize the various variants of *concatditi3_[1-7].
            (scalar_chain::add_insn): Like VEC_SELECT, ZERO_EXTEND and
            timode_concatdi_p instructions don't require their input
            operands to be converted (to TImode).
            (timode_scalar_chain::compute_convert_gain): Split/clone XOR and
            IOR cases from AND case, to handle timode_concatdi_p costs.
            <case PLUS>: Handle timode_concatdi_p conversion costs.
            <case ZERO_EXTEND>: Provide costs of DImode to TImode extension.
            (timode_convert_concatdi): Helper function to transform
            a *concatditi3 instruction into a vec_concatv2di instruction.
            (timode_scalar_chain::convert_insn): Split/clone XOR and IOR
            cases from AND case, to handle timode_concatdi_p using the new
            timode_convert_concatdi helper function.
            <case ZERO_EXTEND>: Convert zero_extendditi2 to *vec_concatv2di_0.
            <case PLUS>: Handle timode_concatdi_p using the new
            timode_convert_concatdi helper function.
            (timode_scalar_to_vector_candidate_p): Support timode_concatdi_p
            instructions in IOR, XOR and PLUS cases.
            <case ZERO_EXTEND>: Consider zero extension of a register from
            DImode to TImode to be a candidate.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/sse4_1-stv-10.c: New test case.
            * gcc.target/i386/sse4_1-stv-11.c: Likewise.
            * gcc.target/i386/sse4_1-stv-12.c: Likewise.

Diff:
---
 gcc/config/i386/i386-features.cc              | 141 +++++++++++++++++++++++++-
 gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c |  13 +++
 gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c |  14 +++
 gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c |  14 +++
 4 files changed, 178 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 9348f55c2cd1..8e277843f23c 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -449,6 +449,30 @@ scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
   return true;
 }
 
+/* Check whether X is a convertible *concatditi_? variant.  X is known
+   to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI.  */
+
+static bool
+timode_concatdi_p (rtx x)
+{
+  rtx op0 = XEXP (x, 0);
+  rtx op1 = XEXP (x, 1);
+
+  if (GET_CODE (op1) == ASHIFT)
+    std::swap (op0, op1);
+
+  return GET_CODE (op0) == ASHIFT
+        && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
+        && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
+        && REG_P (XEXP (XEXP (op0, 0), 0))
+        && CONST_INT_P (XEXP (op0, 1))
+        && INTVAL (XEXP (op0, 1)) == 64
+        && GET_CODE (op1) == ZERO_EXTEND
+        && GET_MODE (XEXP (op1, 0)) == DImode
+        && REG_P (XEXP (op1, 0));
+}
+
+
 /* Add instruction into a chain.  Return true if OK, false if the search
    was aborted.  */
 
@@ -477,9 +501,26 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
       if (!analyze_register_chain (candidates, ref, disallowed))
        return false;
 
-  /* The operand(s) of VEC_SELECT don't need to be converted/convertible.  */
-  if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
-    return true;
+  /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
+     to be converted/convertible.  */
+  if (def_set)
+    switch (GET_CODE (SET_SRC (def_set)))
+      {
+      case VEC_SELECT:
+       return true;
+      case ZERO_EXTEND:
+       if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
+         return true;
+       break;
+      case PLUS:
+      case IOR:
+      case XOR:
+       if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
+         return true;
+       break;
+      default:
+       break;
+      }
 
   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
     if (!DF_REF_REG_MEM_P (ref))
@@ -1628,14 +1669,34 @@ timode_scalar_chain::compute_convert_gain ()
          break;
 
        case AND:
+         if (!MEM_P (dst))
+           igain = COSTS_N_INSNS (1);
+         if (CONST_SCALAR_INT_P (XEXP (src, 1)))
+           igain += timode_immed_const_gain (XEXP (src, 1), bb);
+         break;
+
        case XOR:
        case IOR:
+         if (timode_concatdi_p (src))
+           {
+             /* vmovq;vpinsrq (11 bytes).  */
+             igain = speed_p ? -2 * ix86_cost->sse_to_integer
+                             : -COSTS_N_BYTES (11);
+             break;
+           }
          if (!MEM_P (dst))
            igain = COSTS_N_INSNS (1);
          if (CONST_SCALAR_INT_P (XEXP (src, 1)))
            igain += timode_immed_const_gain (XEXP (src, 1), bb);
          break;
 
+       case PLUS:
+         if (timode_concatdi_p (src))
+           /* vmovq;vpinsrq (11 bytes).  */
+           igain = speed_p ? -2 * ix86_cost->sse_to_integer
+                           : -COSTS_N_BYTES (11);
+         break;
+
        case ASHIFT:
        case LSHIFTRT:
          /* See ix86_expand_v1ti_shift.  */
@@ -1794,6 +1855,13 @@ timode_scalar_chain::compute_convert_gain ()
            igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
          break;
 
+       case ZERO_EXTEND:
+         if (GET_MODE (XEXP (src, 0)) == DImode)
+           /* xor (2 bytes) vs. vmovq (5 bytes).  */
+           igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
+                           : -COSTS_N_BYTES (3);
+         break;
+
        default:
          break;
        }
@@ -1858,6 +1926,28 @@ timode_scalar_chain::fix_debug_reg_uses (rtx reg)
     }
 }
 
+/* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
+   Insert this before INSN, and return the result as a V1TImode subreg.  */
+
+static rtx
+timode_convert_concatdi (rtx src, rtx_insn *insn)
+{
+  rtx hi, lo;
+  rtx tmp = gen_reg_rtx (V2DImode);
+  if (GET_CODE (XEXP (src, 0)) == ASHIFT)
+    {
+      hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
+      lo = XEXP (XEXP (src, 1), 0);
+    }
+  else
+    {
+      hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
+      lo = XEXP (XEXP (src, 0), 0);
+    }
+  emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
+  return gen_rtx_SUBREG (V1TImode, tmp, 0);
+}
+
 /* Convert INSN from TImode to V1T1mode.  */
 
 void
@@ -1967,10 +2057,24 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
          PUT_MODE (src, V1TImode);
          break;
        }
-      /* FALLTHRU */
+      convert_op (&XEXP (src, 0), insn);
+      convert_op (&XEXP (src, 1), insn);
+      PUT_MODE (src, V1TImode);
+      if (MEM_P (dst))
+       {
+         tmp = gen_reg_rtx (V1TImode);
+         emit_insn_before (gen_rtx_SET (tmp, src), insn);
+         src = tmp;
+       }
+      break;
 
     case XOR:
     case IOR:
+      if (timode_concatdi_p (src))
+       {
+         src = timode_convert_concatdi (src, insn);
+         break;
+       }
       convert_op (&XEXP (src, 0), insn);
       convert_op (&XEXP (src, 1), insn);
       PUT_MODE (src, V1TImode);
@@ -2010,6 +2114,26 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
       PUT_MODE (src, V1TImode);
       break;
 
+    case ZERO_EXTEND:
+      if (GET_MODE (XEXP (src, 0)) == DImode)
+       {
+         /* Convert to *vec_concatv2di_0.  */
+         rtx tmp = gen_reg_rtx (V2DImode);
+         rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
+         emit_insn_before (gen_move_insn (tmp, pat), insn);
+         src = gen_rtx_SUBREG (vmode, tmp, 0);
+       }
+      else
+       gcc_unreachable ();
+      break;
+
+    case PLUS:
+      if (timode_concatdi_p (src))
+       src = timode_convert_concatdi (src, insn);
+      else
+       gcc_unreachable ();
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -2389,6 +2513,8 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
 
     case IOR:
     case XOR:
+      if (timode_concatdi_p (src))
+       return true;
       return (REG_P (XEXP (src, 0))
              || timode_mem_p (XEXP (src, 0)))
             && (REG_P (XEXP (src, 1))
@@ -2408,6 +2534,13 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
             && CONST_INT_P (XEXP (src, 1))
             && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
 
+    case PLUS:
+      return timode_concatdi_p (src);
+
+    case ZERO_EXTEND:
+      return REG_P (XEXP (src, 0))
+            && GET_MODE (XEXP (src, 0)) == DImode;
+
     default:
       return false;
     }
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c
new file mode 100644
index 000000000000..229bc459747c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-10.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 m0,m1,m2,m3;
+void foo(__int128 m)
+{
+    m0 = m;
+    m1 = m;
+    m2 = m;
+    m3 = m;
+}
+
+/* { dg-final { scan-assembler-times "movaps" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c
new file mode 100644
index 000000000000..3508bfb67263
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-11.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 m0,m1,m2,m3;
+void foo(unsigned long x)
+{
+    __int128 m = x;
+    m0 = m;
+    m1 = m;
+    m2 = m;
+    m3 = m;
+}
+
+/* { dg-final { scan-assembler-times "movaps" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c b/gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c
new file mode 100644
index 000000000000..9587b6405d70
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse4_1-stv-12.c
@@ -0,0 +1,14 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse4.1 -mstv -mno-stackrealign" } */
+
+__int128 m0,m1,m2,m3;
+void foo(unsigned int x)
+{
+    __int128 m = x;
+    m0 = m;
+    m1 = m;
+    m2 = m;
+    m3 = m;
+}
+
+/* { dg-final { scan-assembler-times "movaps" 4 } } */

Reply via email to