This patch adds an RTL optimization to simplify-rtx.cc to simplify a
vec_select of a vec_select.  It's very similar conceptually to yesterday's
patch to simplify a vec_merge of a vec_merge.

A motivating example is the following code on x86_64:

typedef unsigned int v4si __attribute__((vector_size(16)));

v4si foo(v4si vec, int val) {
    vec[1] = val;
    vec[2] = val;
    return vec;
}

with -O2, GCC currently generates the following code:

foo:    movd    %edi, %xmm1
        pshufd  $225, %xmm0, %xmm0      // swap elements 0 and 1
        movss   %xmm1, %xmm0            // overwrite element 0
        pshufd  $225, %xmm0, %xmm0      // swap elements 0 and 1
        pshufd  $198, %xmm0, %xmm0      // swap elements 0 and 2
        movss   %xmm1, %xmm0            // overwrite element 0
        pshufd  $198, %xmm0, %xmm0      // swap elements 0 and 2
        ret


Notice there are two consecutive pshufd instructions, permuting the
same register.  During combine, we see:

Trying 11 -> 14:
   11: r103:V4SI=vec_select(r103:V4SI,parallel)
   14: r105:V4SI=vec_select(r103:V4SI,parallel)
      REG_DEAD r103:V4SI
Failed to match this instruction:
(set (reg:V4SI 105 [ vec_5 ])
    (vec_select:V4SI (vec_select:V4SI (reg:V4SI 103 [ vec_4 ])
            (parallel [
                    (const_int 1 [0x1])
                    (const_int 0 [0])
                    (const_int 2 [0x2])
                    (const_int 3 [0x3])
                ]))
        (parallel [
                (const_int 2 [0x2])
                (const_int 1 [0x1])
                (const_int 0 [0])
                (const_int 3 [0x3])
            ])))

Clearly a permutation of a permutation is another permutation, so
the above expression can be simplified/canonicalized.  Conveniently
there's already code in simplify-rtx.cc to spot that a vec_select of
a vec_select is an identity; this patch extends that functionality to
simplify a vec_select of a vec_select to a single vec_select.

With this transformation in simplify-rtx.cc, combine now reports:

Trying 11 -> 14:
   11: r103:V4SI=vec_select(r103:V4SI,parallel)
   14: r105:V4SI=vec_select(r103:V4SI,parallel)
      REG_DEAD r103:V4SI
Successfully matched this instruction:
(set (reg:V4SI 105 [ vec_5 ])
    (vec_select:V4SI (reg:V4SI 103 [ vec_4 ])
        (parallel [
                (const_int 2 [0x2])
                (const_int 0 [0])
                (const_int 1 [0x1])
                (const_int 3 [0x3])
            ])))
allowing combination of insns 11 and 14
original costs 4 + 4 = 8
replacement cost 4

And for the example above, we now generate:

foo:    movd    %edi, %xmm1
        pshufd  $225, %xmm0, %xmm0
        movss   %xmm1, %xmm0
        pshufd  $210, %xmm0, %xmm0
        movss   %xmm1, %xmm0
        pshufd  $198, %xmm0, %xmm0
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline in stage 1?


2026-01-21  Roger Sayle  <[email protected]>

gcc/ChangeLog
        * simplify-rtx.cc (simplify_context::simplify_binary_operation_1)
        <case VEC_SELECT>: Simplify a (non-identity) vec_select of a
        vec_select.

gcc/testsuite/ChangeLog
        * gcc.target/i386/sse2-pshufd-2.c: New test case.


Thanks in advance,
Roger
--

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 8016e02e925..8acda29d8e5 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -5277,6 +5277,7 @@ simplify_ashift:
          rtx op0_subop1 = XEXP (trueop0, 1);
          gcc_assert (GET_CODE (op0_subop1) == PARALLEL);
          gcc_assert (known_eq (XVECLEN (trueop1, 0), GET_MODE_NUNITS (mode)));
+         bool identity_p = true;
 
          /* Apply the outer ordering vector to the inner one.  (The inner
             ordering vector is expressly permitted to be of a different
@@ -5288,10 +5289,25 @@ simplify_ashift:
              if (!CONST_INT_P (x))
                return 0;
              rtx y = XVECEXP (op0_subop1, 0, INTVAL (x));
-             if (!CONST_INT_P (y) || i != INTVAL (y))
+             if (!CONST_INT_P (y))
                return 0;
+             if (i != INTVAL (y))
+               identity_p = false;
            }
-         return XEXP (trueop0, 0);
+         if (identity_p)
+           return XEXP (trueop0, 0);
+
+         /* Otherwise a permutation of a permutation is a permutation.  */
+         int len = XVECLEN (trueop1, 0);
+         rtvec vec = rtvec_alloc (len);
+         for (int i = 0; i < len; ++i)
+           {
+             rtx x = XVECEXP (trueop1, 0, i);
+             rtx y = XVECEXP (op0_subop1, 0, INTVAL (x));
+             RTVEC_ELT (vec, i) = y;
+           }
+           return gen_rtx_fmt_ee (code, mode, XEXP (trueop0, 0),
+                                  gen_rtx_PARALLEL (VOIDmode, vec));
        }
 
       return 0;
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pshufd-2.c b/gcc/testsuite/gcc.target/i386/sse2-pshufd-2.c
new file mode 100644
index 00000000000..6d5be2b7293
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pshufd-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+
+typedef int __v4si __attribute__ ((__vector_size__ (16)));
+
+__v4si foo(__v4si x)
+{
+  __v4si t = __builtin_ia32_pshufd (x, 225);
+  return __builtin_ia32_pshufd (t, 198);
+}
+
+/* { dg-final { scan-assembler-times "pshufd\[ \\t\]+" 1 } } */

Reply via email to