This patch adds an RTL optimization to simplify-rtx.cc to simplify a
vec_merge of a vec_merge.

A motivating example is the following code on x86_64:

typedef unsigned int v4si __attribute__((vector_size(16)));

v4si foo(v4si vec, int val) {
    vec[1] = val;
    vec[3] = val;
    return vec;
}

With -O2 -mavx2, GCC currently generates the following code:

foo:    vpinsrd $1, %edi, %xmm0, %xmm0
        vpinsrd $3, %edi, %xmm0, %xmm0
        ret

During combine, we see:

Trying 9 -> 12:
    9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2)
      REG_DEAD r106:V4SI
   12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8)
      REG_DEAD r103:V4SI
      REG_DEAD r102:SI
Failed to match this instruction:
(set (reg:V4SI 105 [ vec_5 ])
    (vec_merge:V4SI (vec_merge:V4SI (vec_duplicate:V4SI (reg/v:SI 102 [ valD.3392 ]))
            (reg:V4SI 106 [ vecD.3391 ])
            (const_int 2 [0x2]))
        (vec_duplicate:V4SI (reg/v:SI 102 [ valD.3392 ]))
        (const_int 7 [0x7])))

This can be simplified/canonicalized: (vec_merge (vec_merge a b m) a n)
is equivalent to (vec_merge a b (m|~n)).  This is easy to see, as the
first two operands of a vec_merge may be swapped by inverting the
third, i.e. (vec_merge a b n) is equivalent to (vec_merge b a ~n), and
merging one set of elements from a vector, followed by another set of
elements from the same vector, can be done in a single
step/instruction, i.e. (vec_merge a (vec_merge a b m) n) =
(vec_merge a b (m|n)).
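
As a sanity check (not part of the patch), these identities are easy to
verify against a scalar model of vec_merge, in which bit i of the
selector takes element i from the first operand.  The helper below is
purely illustrative; the masks m = 0x2 and n = 0x7 are taken from the
combine dump above, and (m|~n) restricted to four bits is 0xa, the
constant that combine reports below.

#include <assert.h>

#define NELTS 4
#define MASK ((1u << NELTS) - 1)

/* Scalar model of RTL vec_merge: bit i of SEL set means element i
   is taken from A, otherwise from B.  */
static void
vec_merge4 (const unsigned *a, const unsigned *b, unsigned sel,
            unsigned *out)
{
  for (int i = 0; i < NELTS; i++)
    out[i] = (sel >> i) & 1 ? a[i] : b[i];
}

int
main (void)
{
  unsigned a[NELTS] = { 10, 11, 12, 13 };
  unsigned b[NELTS] = { 20, 21, 22, 23 };
  unsigned m = 0x2, n = 0x7;
  unsigned inner[NELTS], lhs[NELTS], rhs[NELTS];

  /* lhs = (vec_merge (vec_merge a b m) a n).  */
  vec_merge4 (a, b, m, inner);
  vec_merge4 (inner, a, n, lhs);

  /* rhs = (vec_merge a b (m|~n)); (0x2 | ~0x7) & 0xf == 0xa.  */
  vec_merge4 (a, b, (m | ~n) & MASK, rhs);

  for (int i = 0; i < NELTS; i++)
    assert (lhs[i] == rhs[i]);
  return 0;
}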

With this transformation in simplify-rtx.cc, combine now reports:

Trying 3, 9 -> 12:
    3: r102:SI=r107:SI
      REG_DEAD r107:SI
    9: r103:V4SI=vec_merge(vec_duplicate(r102:SI),r106:V4SI,0x2)
      REG_DEAD r106:V4SI
   12: r105:V4SI=vec_merge(vec_duplicate(r102:SI),r103:V4SI,0x8)
      REG_DEAD r103:V4SI
      REG_DEAD r102:SI
Failed to match this instruction:
(set (reg:V4SI 105 [ vec_5 ])
    (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ]))
        (reg:V4SI 106 [ vecD.3391 ])
        (const_int 10 [0xa])))
Successfully matched this instruction:
(set (reg:V4SI 103 [ vec_4 ])
    (vec_duplicate:V4SI (reg:SI 107 [ valD.3392 ])))
Successfully matched this instruction:
(set (reg:V4SI 105 [ vec_5 ])
    (vec_merge:V4SI (reg:V4SI 103 [ vec_4 ])
        (reg:V4SI 106 [ vecD.3391 ])
        (const_int 10 [0xa])))
allowing combination of insns 3, 9 and 12
original costs 4 + 4 + 4 = 12
replacement costs 4 + 4 = 8

And for the example above, we now generate the faster code:

foo:    vmovd   %edi, %xmm2
        vpbroadcastd    %xmm2, %xmm1
        vpblendd        $10, %xmm1, %xmm0, %xmm0
        ret

which uses only one inter-unit move (the vmovd); the vpblendd
immediate 10 (binary 1010) takes elements 1 and 3 from the broadcast
value, matching the 0xa merge mask above.

The effect is even more dramatic when all the elements of a vector
get set:

v4si bar(v4si vec, int val) {
    vec[0] = val;
    vec[1] = val;
    vec[2] = val;
    vec[3] = val;
    return vec;
}

Before:
bar:    vpinsrd $0, %edi, %xmm0, %xmm0
        vpinsrd $1, %edi, %xmm0, %xmm0
        vpinsrd $2, %edi, %xmm0, %xmm0
        vpinsrd $3, %edi, %xmm0, %xmm0
        ret

After:
bar:    vmovd   %edi, %xmm1
        vpbroadcastd    %xmm1, %xmm0
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
with no new failures.  Ok for mainline in stage 1?


2026-01-20  Roger Sayle  <[email protected]>

gcc/ChangeLog
        * simplify-rtx.cc (simplify_context::simplify_ternary_operation)
        <case VEC_MERGE>: Simplify a vec_merge of a vec_merge with a
        repeated operand.

gcc/testsuite/ChangeLog
        * gcc.target/i386/avx2-vpblendd128-3.c: New test case.


diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 8016e02e925..6b0160c71a3 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -7560,6 +7560,21 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode,
                  if (!(sel & ~sel0 & mask) && !side_effects_p (XEXP (op0, 1)))
                    return simplify_gen_ternary (code, mode, mode,
                                                 XEXP (op0, 0), op1, op2);
+
+                 /* Replace (vec_merge (vec_merge a b m) a n) with
+                    (vec_merge a b (m|~n)).  */
+                 if (rtx_equal_p (XEXP (op0, 0), op1)
+                     && ! side_effects_p (op1))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                op1, XEXP (op0, 1),
+                                                GEN_INT ((sel0 | ~sel) & mask));
+                 /* Replace (vec_merge (vec_merge b a m) a n) with
+                    (vec_merge b a (m&n)).  */
+                 if (rtx_equal_p (XEXP (op0, 1), op1)
+                     && ! side_effects_p (op1))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                XEXP (op0, 0), op1,
+                                                GEN_INT (sel & sel0 & mask));
                }
            }
          if (GET_CODE (op1) == VEC_MERGE)
@@ -7574,6 +7589,22 @@ simplify_context::simplify_ternary_operation (rtx_code code, machine_mode mode,
                  if (!(~sel & ~sel1 & mask) && !side_effects_p (XEXP (op1, 1)))
                    return simplify_gen_ternary (code, mode, mode,
                                                 op0, XEXP (op1, 0), op2);
+
+                 /* Replace (vec_merge a (vec_merge a b m) n) with
+                    (vec_merge a b (m|n)).  */
+                 if (rtx_equal_p (XEXP (op1, 0), op0)
+                     && ! side_effects_p (op0))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                op0, XEXP (op1, 1),
+                                                GEN_INT ((sel | sel1) & mask));
+
+                 /* Replace (vec_merge a (vec_merge b a m) n) with
+                    (vec_merge a b (~m|n)).  */
+                 if (rtx_equal_p (XEXP (op1, 1), op0)
+                     && ! side_effects_p (op0))
+                   return simplify_gen_ternary (code, mode, mode,
+                                                op0, XEXP (op1, 0),
+                                                GEN_INT ((sel | ~sel1) & mask));
                }
            }
 
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c
new file mode 100644
index 00000000000..a4bd90b48d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vpblendd128-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+
+typedef unsigned int v4si __attribute__((vector_size(16)));
+
+v4si foo(v4si vec, int val) {
+    vec[0] = val;
+    vec[2] = val;
+    return vec;
+}
+
+v4si bar(v4si vec, int val) {
+    vec[0] = val;
+    vec[1] = val;
+    vec[2] = val;
+    vec[3] = val;
+    return vec;
+}
+
+/* { dg-final { scan-assembler-times "vpbroadcastd" 2 } } */
+/* { dg-final { scan-assembler-times "vpblendd\[ \\t\]+" 1 } } */
+/* { dg-final { scan-assembler-not "vpinsrd" } } */
