Hi Kyrill,
Thanks for the review and your comments.
>> It would be useful if you expanded a bit on the approach used to
>> generate the improved codegen
The patch creates a duplicate of most common element and tries to optimize
the insertion using dup for the element followed by insertions.
Current code:
============================================
movi v2.4s, 0
ins v2.s[0], v0.s[0]
ins v2.s[1], v1.s[0]
ins v2.s[2], v0.s[0]
orr v0.16b, v2.16b, v2.16b
ins v0.s[3], v3.s[0]
ret
============================================
Code after the patch:
============================================
dup v0.4s, v0.s[0]
ins v0.s[1], v1.s[0]
ins v0.s[3], v3.s[0]
ret
============================================
>> Some typos
Modified as required
>> worth adding a testcase where one of the vector elements appears more than
>> the others?
Modified the testcase as required using common element.
Please review the patch and let us know if its okay?
Bootstrapped and Regression tested on aarch64-thunder-linux.
Thanks,
Naveen
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 2e385c4..8747a23 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11671,11 +11671,54 @@ aarch64_expand_vector_init (rtx target, rtx vals)
aarch64_expand_vector_init (target, copy);
}
- /* Insert the variable lanes directly. */
-
enum insn_code icode = optab_handler (vec_set_optab, mode);
gcc_assert (icode != CODE_FOR_nothing);
+ /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions. */
+ if (n_var == n_elts && n_elts <= 16)
+ {
+ int matches[16][2];
+ int nummatches = 0;
+ memset (matches, 0, sizeof(matches));
+ for(int i = 0; i < n_elts; i++)
+ {
+ for (int j = 0; j <= i; j++)
+ {
+ if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+ {
+ matches[i][0] = j;
+ matches[j][1]++;
+ if (i != j)
+ nummatches++;
+ break;
+ }
+ }
+ }
+ int maxelement = 0;
+ int maxv = 0;
+ for (int i = 0; i < n_elts; i++)
+ if (matches[i][1] > maxv)
+ maxelement = i, maxv = matches[i][1];
+
+ /* Create a duplicate of the most common element. */
+ rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+ aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+ /* Insert the rest. */
+ for (int i = 0; i < n_elts; i++)
+ {
+ rtx x = XVECEXP (vals, 0, i);
+ if (matches[i][0] == maxelement)
+ continue;
+ x = copy_to_mode_reg (inner_mode, x);
+ emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+ }
+ return;
+ }
+
+ /* Insert the variable lanes directly. */
for (int i = 0; i < n_elts; i++)
{
rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 0000000..a043a21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float c, float d)
+{
+ return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 2 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */