This patch adds an RTL optimization to simplify-rtx.cc to simplify a
vec_merge of a vec_merge.
A motivating example is the following code on x86_64:
typedef unsigned int v4si __attribute__((vector_size(16)));
v4si foo(v4si vec, int val) {
vec[1] = val;
vec[3] = val;
return vec;
}
with -O2 -mavx2, GCC currently generates the following code:
foo: vpinsrd $1, %edi, %xmm0, %xmm0
vpinsrd $3, %edi, %xmm0, %xmm0
ret
During combine, we see:
Trying 9 -> 12:
9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2)
REG_DEAD r106:V4SI
12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8)
REG_DEAD r103:V4SI
REG_DEAD r102:SI
Failed to match this instruction:
(set (reg:V4SI 105 [ vec_5 ])
(vec_merge:V4SI (vec_merge:V4SI (vec_duplicate:V4SI (reg/v:SI 102 [
valD.3392 ]))
(reg:V4SI 106 [ vecD.3391 ])
(const_int 2 [0x2]))
(vec_duplicate:V4SI (reg/v:SI 102 [ valD.3392 ]))
(const_int 7 [0x7])))
This can be simplified/canonicalized as (vec_merge (vec_merge a b m) a n)
is (vec_merge a b (m|~n)). This is easy to see: the first two operands
of a vec_merge may be swapped by inverting the third, i.e.
(vec_merge a b n) is equivalent to (vec_merge b a ~n), and merging
one set of elements from a vector, followed by another set of elements
from the same vector, can be done in a single step/instruction, i.e.
(vec_merge a (vec_merge a b m) n) = (vec_merge a b (m|n)).
With this transformation in simplify-rtx.cc, combine now reports:
Trying 3, 9 -> 12:
3: r102:SI=r107:SI
REG_DEAD r107:SI
9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2)
REG_DEAD r106:V4SI
12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8)
REG_DEAD r103:V4SI
REG_DEAD r102:SI
Failed to match this instruction:
(set (reg:V4SI 105 [ vec_5 ])
(vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ]))
(reg:V4SI 106 [ vecD.3391 ])
(const_int 10 [0xa])))
Successfully matched this instruction:
(set (reg:V4SI 103 [ vec_4 ])
(vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ])))
Successfully matched this instruction:
(set (reg:V4SI 105 [ vec_5 ])
(vec_merge:V4SI (reg:V4SI 103 [ vec_4 ])
(reg:V4SI 106 [ vecD.3391 ])
(const_int 10 [0xa])))
allowing combination of insns 3, 9 and 12
original costs 4 + 4 + 4 = 12
replacement costs 4 + 4 = 8
And for the example above, we now generate the faster:
foo: vmovd %edi, %xmm2
vpbroadcastd %xmm2, %xmm1
vpblendd $10, %xmm1, %xmm0, %xmm0
ret
which uses only 1 inter-unit move.
The effect is even more dramatic when all the elements of a vector
get set:
v4si bar(v4si vec, int val) {
vec[0] = val;
vec[1] = val;
vec[2] = val;
vec[3] = val;
return vec;
}
Before:
bar: vpinsrd $0, %edi, %xmm0, %xmm0
vpinsrd $1, %edi, %xmm0, %xmm0
vpinsrd $2, %edi, %xmm0, %xmm0
vpinsrd $3, %edi, %xmm0, %xmm0
ret
After:
bar: vmovd %edi, %xmm1
vpbroadcastd %xmm1, %xmm0
ret
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures. Ok for mainline in stage 1?
2026-01-20 Roger Sayle <[email protected]>
gcc/ChangeLog
* simplify-rtx.cc (simplify_context::simplify_ternary_operation)
<case VEC_MERGE>: Simplify a vec_merge of a vec_merge with a
repeated operand.
gcc/testsuite/ChangeLog
* gcc.target/i386/avx2-vpblendd128-3.c: New test case.
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 8016e02e925..6b0160c71a3 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -7560,6 +7560,21 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode,
if (!(sel & ~sel0 & mask) && !side_effects_p (XEXP (op0, 1)))
return simplify_gen_ternary (code, mode, mode,
XEXP (op0, 0), op1, op2);
+
+ /* Replace (vec_merge (vec_merge a b m) a n) with
+ (vec_merge a b (m|~n)). */
+ if (rtx_equal_p (XEXP (op0, 0), op1)
+ && ! side_effects_p (op1))
+ return simplify_gen_ternary (code, mode, mode,
+ op1, XEXP (op0, 1),
+ GEN_INT ((sel0 | ~sel) & mask));
+ /* Replace (vec_merge (vec_merge b a m) a n) with
+ (vec_merge b a (m&n)). */
+ if (rtx_equal_p (XEXP (op0, 1), op1)
+ && ! side_effects_p (op1))
+ return simplify_gen_ternary (code, mode, mode,
+ XEXP (op0, 0), op1,
+ GEN_INT (sel & sel0 & mask));
}
}
if (GET_CODE (op1) == VEC_MERGE)
@@ -7574,6 +7589,22 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode,
if (!(~sel & ~sel1 & mask) && !side_effects_p (XEXP (op1, 1)))
return simplify_gen_ternary (code, mode, mode,
op0, XEXP (op1, 0), op2);
+
+ /* Replace (vec_merge a (vec_merge a b m) n) with
+ (vec_merge a b (m|n)). */
+ if (rtx_equal_p (XEXP (op1, 0), op0)
+ && ! side_effects_p (op0))
+ return simplify_gen_ternary (code, mode, mode,
+ op0, XEXP (op1, 1),
+ GEN_INT ((sel | sel1) & mask));
+
+ /* Replace (vec_merge a (vec_merge b a m) n) with
+ (vec_merge a b (~m|n)). */
+ if (rtx_equal_p (XEXP (op1, 1), op0)
+ && ! side_effects_p (op0))
+ return simplify_gen_ternary (code, mode, mode,
+ op0, XEXP (op1, 0),
+ GEN_INT ((sel | ~sel1) & mask));
}
}
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c
new file mode 100644
index 00000000000..a4bd90b48d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+
+v4si foo(v4si vec, int val) {
+ vec[0] = val;
+ vec[2] = val;
+ return vec;
+}
+
+v4si bar(v4si vec, int val) {
+ vec[0] = val;
+ vec[1] = val;
+ vec[2] = val;
+ vec[3] = val;
+ return vec;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */
+/* { dg-final { scan-assembler-times "vpblendd\[ \\t\]+" 1 } } */
+/* { dg-final { scan-assembler-not "vpinsrd" } } */