This patch adds an RTL optimization to simplify-rtx.cc to simplify a
vec_select of a vec_select. It's very similar conceptually to yesterday's
patch to simplify a vec_merge of a vec_merge.
A motivating example is the following code on x86_64:
typedef unsigned int v4si __attribute__((vector_size(16)));
v4si foo(v4si vec, int val) {
vec[1] = val;
vec[2] = val;
return vec;
}
with -O2, GCC currently generates the following code:
foo: movd %edi, %xmm1
pshufd $225, %xmm0, %xmm0 // swap elements 0 and 1
movss %xmm1, %xmm0 // overwrite element 0
pshufd $225, %xmm0, %xmm0 // swap elements 0 and 1
pshufd $198, %xmm0, %xmm0 // swap elements 0 and 2
movss %xmm1, %xmm0 // overwrite element 0
pshufd $198, %xmm0, %xmm0 // swap elements 0 and 2
ret
Notice there are two consecutive pshufd instructions, permuting the
same register. During combine, we see:
Trying 11 -> 14:
11: r103:V4SI=vec_select(r103:V4SI,parallel)
14: r105:V4SI=vec_select(r103:V4SI,parallel)
REG_DEAD r103:V4SI
Failed to match this instruction:
(set (reg:V4SI 105 [ vec_5 ])
(vec_select:V4SI (vec_select:V4SI (reg:V4SI 103 [ vec_4 ])
(parallel [
(const_int 1 [0x1])
(const_int 0 [0])
(const_int 2 [0x2])
(const_int 3 [0x3])
]))
(parallel [
(const_int 2 [0x2])
(const_int 1 [0x1])
(const_int 0 [0])
(const_int 3 [0x3])
])))
Clearly a permutation of a permutation is another permutation, so
the above expression can be simplified/canonicalized. Conveniently
there's already code in simplify-rtx.cc to spot that a vec_select of
a vec_select is an identity; this patch extends that functionality to
simplify a vec_select of a vec_select to a single vec_select.
With this transformation in simplify-rtx.cc, combine now reports:
Trying 11 -> 14:
11: r103:V4SI=vec_select(r103:V4SI,parallel)
14: r105:V4SI=vec_select(r103:V4SI,parallel)
REG_DEAD r103:V4SI
Successfully matched this instruction:
(set (reg:V4SI 105 [ vec_5 ])
(vec_select:V4SI (reg:V4SI 103 [ vec_4 ])
(parallel [
(const_int 2 [0x2])
(const_int 0 [0])
(const_int 1 [0x1])
(const_int 3 [0x3])
])))
allowing combination of insns 11 and 14
original costs 4 + 4 = 8
replacement cost 4
And for the example above, we now generate:
foo: movd %edi, %xmm1
pshufd $225, %xmm0, %xmm0
movss %xmm1, %xmm0
pshufd $210, %xmm0, %xmm0
movss %xmm1, %xmm0
pshufd $198, %xmm0, %xmm0
ret
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures. Ok for mainline in stage 1?
2026-01-21 Roger Sayle <[email protected]>
gcc/ChangeLog
* simplify-rtx.cc (simplify_context::simplify_binary_operation_1)
<case VEC_SELECT>: Simplify a (non-identity) vec_select of a
vec_select.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse2-pshufd-2.c: New test case.
Thanks in advance,
Roger
--
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 8016e02e925..8acda29d8e5 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -5277,6 +5277,7 @@ simplify_ashift:
rtx op0_subop1 = XEXP (trueop0, 1);
gcc_assert (GET_CODE (op0_subop1) == PARALLEL);
gcc_assert (known_eq (XVECLEN (trueop1, 0), GET_MODE_NUNITS (mode)));
+ bool identity_p = true;
/* Apply the outer ordering vector to the inner one. (The inner
ordering vector is expressly permitted to be of a different
@@ -5288,10 +5289,25 @@ simplify_ashift:
if (!CONST_INT_P (x))
return 0;
rtx y = XVECEXP (op0_subop1, 0, INTVAL (x));
- if (!CONST_INT_P (y) || i != INTVAL (y))
+ if (!CONST_INT_P (y))
return 0;
+ if (i != INTVAL (y))
+ identity_p = false;
}
- return XEXP (trueop0, 0);
+ if (identity_p)
+ return XEXP (trueop0, 0);
+
+ /* Otherwise a permutation of a permutation is a permutation. */
+ int len = XVECLEN (trueop1, 0);
+ rtvec vec = rtvec_alloc (len);
+ for (int i = 0; i < len; ++i)
+ {
+ rtx x = XVECEXP (trueop1, 0, i);
+ rtx y = XVECEXP (op0_subop1, 0, INTVAL (x));
+ RTVEC_ELT (vec, i) = y;
+ }
+ return gen_rtx_fmt_ee (code, mode, XEXP (trueop0, 0),
+ gen_rtx_PARALLEL (VOIDmode, vec));
}
return 0;
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pshufd-2.c b/gcc/testsuite/gcc.target/i386/sse2-pshufd-2.c
new file mode 100644
index 00000000000..6d5be2b7293
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-pshufd-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+
+typedef int __v4si __attribute__ ((__vector_size__ (16)));
+
+__v4si foo(__v4si x)
+{
+ __v4si t = __builtin_ia32_pshufd (x, 225);
+ return __builtin_ia32_pshufd (t, 198);
+}
+
+/* { dg-final { scan-assembler-times "pshufd\[ \\t\]+" 1 } } */