SLP may cost a broadcast as kind == vec_perm; this patch checks the
permutation indices to exclude such false positives.
> > > so the vectorizer costs sth with count == 0? I'll see to fix that,
> > > but this also
> > > means the code should have used m_num_avx256_vec_perm[where] += count.
Changed.
>
> && (is_a <bb_vec_info> (m_vinfo)
> || SLP_TREE_LANES (node) % nunits == 0)
For the case mentioned in the comments, SLP_TREE_LANES (node) % nunits == 0
also holds (avx256_avoid_vec_perm-5.c), so that check can't distinguish
avx256_avoid_vec_perm-5.c from avx256_avoid_vec_perm-3.c.
Both of them are "legacy" load permutations in loop vectorization.
So I only handled is_a <bb_vec_info> (m_vinfo) in this patch and
left loop vectorization for a follow-up patch.
>
> the case of SLP_TREE_PERMUTE_P would need to be added separately,
> but those are also costed as kind == vec_perm. A common use-case were
> blends but now that we lower most load permutations to explicit
> SLP permute nodes there are also those when vectorizing loops.
>
> I guess it's reasonable to first handle SLP_TREE_LOAD_PERMUTATION,
> the other case could be done as followup.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Check permutation index for vec_perm, don't count it if we
know it's not a cross-lane permutation.
gcc/testsuite/ChangeLog:
* gcc.target/i386/avx256_avoid_vec_perm.c: Adjust testcase.
* gcc.target/i386/avx256_avoid_vec_perm-2.c: New test.
* gcc.target/i386/avx256_avoid_vec_perm-5.c: New test.
---
gcc/config/i386/i386.cc | 59 ++++++++++++++++++-
.../gcc.target/i386/avx256_avoid_vec_perm-2.c | 21 +++++++
.../gcc.target/i386/avx256_avoid_vec_perm-5.c | 24 ++++++++
.../gcc.target/i386/avx256_avoid_vec_perm.c | 2 +-
4 files changed, 103 insertions(+), 3 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 55c9b16dd38..932e3feedc3 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26237,8 +26237,63 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
stmt_cost = ix86_default_vector_cost (kind, mode);
if (kind == vec_perm && vectype
- && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
- m_num_avx256_vec_perm[where]++;
+ && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32
+ /* BIT_FIELD_REF <vect_**, 64, 0> 0 times vec_perm costs 0 in body. */
+ && count != 0)
+ {
+ bool real_perm = true;
+ unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
+
+ if (node
+ && SLP_TREE_LOAD_PERMUTATION (node).exists ()
+ /* Loop vectorization will have 4 times vec_perm
+ with index as {0, 0, 0, 0}.
+ But it actually generates
+ vec_perm_expr <vect, vect, 0, 0, 0, 0>
+ vec_perm_expr <vect, vect, 1, 1, 1, 1>
+ vec_perm_expr <vect, vect, 2, 2, 2, 2>
+ Need to be handled separately. */
+ && is_a <bb_vec_info> (m_vinfo))
+ {
+ unsigned half = nunits / 2;
+ unsigned i = 0;
+ bool allsame = true;
+ unsigned first = SLP_TREE_LOAD_PERMUTATION (node)[0];
+ bool cross_lane_p = false;
+ for (i = 0 ; i != SLP_TREE_LANES (node); i++)
+ {
+ unsigned tmp = SLP_TREE_LOAD_PERMUTATION (node)[i];
+ /* allsame is just a broadcast. */
+ if (tmp != first)
+ allsame = false;
+
+ /* 4 times vec_perm with number of lanes multiple of nunits. */
+ tmp = tmp & (nunits - 1);
+ unsigned index = i & (nunits - 1);
+ if ((index < half && tmp >= half)
+ || (index >= half && tmp < half))
+ cross_lane_p = true;
+
+ if (!allsame && cross_lane_p)
+ break;
+ }
+
+ if (i == SLP_TREE_LANES (node))
+ real_perm = false;
+ }
+
+ if (real_perm)
+ {
+ m_num_avx256_vec_perm[where] += count;
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ {
+ fprintf (dump_file, "Detected avx256 cross-lane permutation: ");
+ if (stmt_info)
+ print_gimple_expr (dump_file, stmt_info->stmt, 0, TDF_SLIM);
+ fprintf (dump_file, " \n");
+ }
+ }
+ }
/* Penalize DFmode vector operations for Bonnell. */
if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c
new file mode 100644
index 00000000000..8d4e641444d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sierraforest -O2 -fdump-tree-slp-details" } */
+/* { dg-final { scan-tree-dump-times {(?n)Detected avx256 cross-lane permutation} 1 "slp2" } } */
+
+void
+foo (double* a, double* __restrict b, int c, int n) /* c and n are unused. */
+{
+ a[0] = b[100] * b[2]; /* b[100] is the common factor of all four stores. */
+ a[1] = b[100] * b[3]; /* b[] is read in the order 2, 3, 0, 1, i.e. the */
+ a[2] = b[100] * b[0]; /* pairs (0,1) and (2,3) are swapped relative to */
+ a[3] = b[100] * b[1]; /* the order in which a[] is written. */
+}
+
+void
+foo1 (double* a, double* __restrict b, int c, int n) /* c and n are unused. */
+{
+ a[0] = b[100] * b[0]; /* b[100] is the common factor of all four stores. */
+ a[1] = b[100] * b[1]; /* b[] is read in the order 0, 1, 3, 2: only the */
+ a[2] = b[100] * b[3]; /* last two elements are swapped, so every element */
+ a[3] = b[100] * b[2]; /* stays within its own pair (0,1) or (2,3). */
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c
new file mode 100644
index 00000000000..c11bea8c7b3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-march=sierraforest -Ofast" } */
+/* { dg-final { scan-assembler-not {(?n)vpermpd.*%ymm} } } */
+
+typedef struct {
+ unsigned short m1, m2, m3, m4; /* The four integer factors foo reads per element. */
+} the_struct_t;
+typedef struct {
+ double m1, m2, m3, m4, m5; /* foo accumulates into m1..m4; it never touches m5. */
+} the_struct2_t;
+
+double bar1 (the_struct2_t*); /* Declared only; foo passes it the address of its result. */
+
+double foo (double* k, unsigned int n, the_struct_t* the_struct) {
+ unsigned int u;
+ the_struct2_t result; /* Not initialized before the accumulations below. */
+ for (u=0; u < n; u++, k--) { /* k steps backwards while u steps forwards. */
+ result.m1 += (*k)*the_struct[u].m1; /* Each field of the_struct[u] is scaled */
+ result.m2 += (*k)*the_struct[u].m2; /* by the same *k and added to the */
+ result.m3 += (*k)*the_struct[u].m3; /* matching field of result. */
+ result.m4 += (*k)*the_struct[u].m4;
+ }
+ return bar1 (&result); /* Hands the accumulated struct to the external bar1. */
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
index d4f00b3fb52..e0399041ad9 100644
--- a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
+++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
@@ -13,7 +13,7 @@ foo (void)
b[i*8+0] = a[i*8+0];
b[i*8+1] = a[i*8+0];
b[i*8+2] = a[i*8+3];
- b[i*8+3] = a[i*8+3];
+ b[i*8+3] = a[i*8+5];
b[i*8+4] = a[i*8+4];
b[i*8+5] = a[i*8+6];
b[i*8+6] = a[i*8+4];
--
2.34.1