Re: [PATCH v3] [x86] Exclude fake cross-lane permutation from avx256_avoid_vec_perm.

Richard Biener Wed, 17 Sep 2025 14:05:45 -0700

On Mon, Sep 8, 2025 at 10:15 AM liuhongt <[email protected]> wrote:
>
> SLP may treat a broadcast as a kind of vec_perm; the patch checks the
> permutation indices to exclude those false positives.
>
> > > > so the vectorizer costs something with count == 0?  I'll see to fix that,
> > > > but this also
> > > > means the code should have used m_num_avx256_vec_perm[where] += count.
> Changed.
>
> >
> >  && (is_a <bb_vec_info> (m_vinfo)
> >  || SLP_TREE_LANES (node) % nunits == 0)
> For the case mentioned in the comments, it's also
> SLP_TREE_LANES (node) % nunits == 0 (avx256_avoid_vec_perm-5.c), hence it
> can't distinguish avx256_avoid_vec_perm-5.c from avx256_avoid_vec_perm-3.c.
> Both of them are "legacy" load permutations in loop vectorization.
>
> So I just handled is_a <bb_vec_info> (m_vinfo) in this patch and
> leave loop vectorization to a follow-up patch.
> >
> > the case of SLP_TREE_PERMUTE_P would need to be added separately,
> > but those are also costed as kind == vec_perm.  A common use-case was
> > blends but now that we lower most load permutations to explicit
> > SLP permute nodes there are also those when vectorizing loops.
> >
> > I guess it's reasonable to first handle SLP_TREE_LOAD_PERMUTATION,
> > the other case could be done as followup.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?


OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
>         Check permutation index for vec_perm, don't count it if we
>         know it's not a cross-lane permutation.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx256_avoid_vec_perm.c: Adjust testcase.
>         * gcc.target/i386/avx256_avoid_vec_perm-2.c: New test.
>         * gcc.target/i386/avx256_avoid_vec_perm-5.c: New test.
> ---
>  gcc/config/i386/i386.cc                       | 59 ++++++++++++++++++-
>  .../gcc.target/i386/avx256_avoid_vec_perm-2.c | 21 +++++++
>  .../gcc.target/i386/avx256_avoid_vec_perm-5.c | 24 ++++++++
>  .../gcc.target/i386/avx256_avoid_vec_perm.c   |  2 +-
>  4 files changed, 103 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 55c9b16dd38..932e3feedc3 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -26237,8 +26237,63 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>      stmt_cost = ix86_default_vector_cost (kind, mode);
>
>    if (kind == vec_perm && vectype
> -      && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
> -    m_num_avx256_vec_perm[where]++;
> +      && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32
> +      /* BIT_FIELD_REF <vect_**, 64, 0> 0 times vec_perm costs 0 in body.  */
> +      && count != 0)
> +    {
> +      bool real_perm = true;
> +      unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +
> +      if (node
> +         && SLP_TREE_LOAD_PERMUTATION (node).exists ()
> +         /* Loop vectorization will have 4 times vec_perm
> +            with index as {0, 0, 0, 0}.
> +            But it actually generates
> +            vec_perm_expr <vect, vect, 0, 0, 0, 0>
> +            vec_perm_expr <vect, vect, 1, 1, 1, 1>
> +            vec_perm_expr <vect, vect, 2, 2, 2, 2>
> +            Need to be handled separately.  */
> +         && is_a <bb_vec_info> (m_vinfo))
> +       {
> +         unsigned half = nunits / 2;
> +         unsigned i = 0;
> +         bool allsame = true;
> +         unsigned first = SLP_TREE_LOAD_PERMUTATION (node)[0];
> +         bool cross_lane_p = false;
> +         for (i = 0 ; i != SLP_TREE_LANES (node); i++)
> +           {
> +             unsigned tmp = SLP_TREE_LOAD_PERMUTATION (node)[i];
> +             /* allsame is just a broadcast.  */
> +             if (tmp != first)
> +               allsame = false;
> +
> +             /* 4 times vec_perm with number of lanes multiple of nunits.  */
> +             tmp = tmp & (nunits - 1);
> +             unsigned index = i & (nunits - 1);
> +             if ((index < half && tmp >= half)
> +                 || (index >= half && tmp < half))
> +               cross_lane_p = true;
> +
> +             if (!allsame && cross_lane_p)
> +               break;
> +           }
> +
> +         if (i == SLP_TREE_LANES (node))
> +           real_perm = false;
> +       }
> +
> +      if (real_perm)
> +       {
> +         m_num_avx256_vec_perm[where] += count;
> +         if (dump_file && (dump_flags & TDF_DETAILS))
> +           {
> +             fprintf (dump_file, "Detected avx256 cross-lane permutation: ");
> +             if (stmt_info)
> +               print_gimple_expr (dump_file, stmt_info->stmt, 0, TDF_SLIM);
> +             fprintf (dump_file, " \n");
> +           }
> +       }
> +    }
>
>    /* Penalize DFmode vector operations for Bonnell.  */
>    if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
> diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c
> new file mode 100644
> index 00000000000..8d4e641444d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-2.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=sierraforest -O2 -fdump-tree-slp-details" } */
> +/* { dg-final { scan-tree-dump-times {(?n)Detected avx256 cross-lane permutation} 1 "slp2" } } */
> +
> +void
> +foo (double* a, double* __restrict b, int c, int n)
> +{
> +  a[0] = b[100] * b[2];
> +  a[1] = b[100] * b[3];
> +  a[2] = b[100] * b[0];
> +  a[3] = b[100] * b[1];
> +}
> +
> +void
> +foo1 (double* a, double* __restrict b, int c, int n)
> +{
> +  a[0] = b[100] * b[0];
> +  a[1] = b[100] * b[1];
> +  a[2] = b[100] * b[3];
> +  a[3] = b[100] * b[2];
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c
> new file mode 100644
> index 00000000000..c11bea8c7b3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm-5.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=sierraforest -Ofast" } */
> +/* { dg-final { scan-assembler-not {(?n)vpermpd.*%ymm} } } */
> +
> +typedef struct {
> +  unsigned short m1, m2, m3, m4;
> +} the_struct_t;
> +typedef struct {
> +  double m1, m2, m3, m4, m5;
> +} the_struct2_t;
> +
> +double bar1 (the_struct2_t*);
> +
> +double foo (double* k, unsigned int n, the_struct_t* the_struct) {
> +  unsigned int u;
> +  the_struct2_t result;
> +  for (u=0; u < n; u++, k--) {
> +    result.m1 += (*k)*the_struct[u].m1;
> +    result.m2 += (*k)*the_struct[u].m2;
> +    result.m3 += (*k)*the_struct[u].m3;
> +    result.m4 += (*k)*the_struct[u].m4;
> +  }
> +  return bar1 (&result);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
> index d4f00b3fb52..e0399041ad9 100644
> --- a/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
> +++ b/gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
> @@ -13,7 +13,7 @@ foo (void)
>        b[i*8+0] = a[i*8+0];
>        b[i*8+1] = a[i*8+0];
>        b[i*8+2] = a[i*8+3];
> -      b[i*8+3] = a[i*8+3];
> +      b[i*8+3] = a[i*8+5];
>        b[i*8+4] = a[i*8+4];
>        b[i*8+5] = a[i*8+6];
>        b[i*8+6] = a[i*8+4];
> --
> 2.34.1
>

Re: [PATCH v3] [x86] Exclude fake cross-lane permutation from avx256_avoid_vec_perm.

Reply via email to