[PATCH] tree-optimization/123190 - fix costing of permuted contiguous loads

Richard Biener Wed, 14 Jan 2026 06:39:29 -0800

The following fixes a regression from the time we split load groups
along SLP boundaries.  When we face a permuted load from an access
that is contiguous across loop iterations we emit code that loads
the whole group and then emit required permutations.  The permutations
might not need all those loads, and if we split the group we would
not have emitted them.  Fortunately when analyzing a permutation
we compute both the number of required permutes and the number of
loads that will survive the following DCE.  So make sure to use that
when costing.  This allows the previously added testcase for PR123190
to undergo epilog vectorization at -O2 as well, and also when using
non-generic tuning, such as tuning for Zen4 which ups the cost for XMM loads.


Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed together
with the other fix.

        PR tree-optimization/123190
        * tree-vectorizer.h (vect_load_store_data): Add n_loads member.
        * tree-vect-stmts.cc (get_load_store_type): Record the
        number of required loads for permuted loads.
        (vectorizable_load): Make use of this when costing loads
        for VMAT_CONTIGUOUS[_REVERSE].

        * gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c: Do not
        require -mtune=generic.
        * gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c: Add
        variant with -O2 instead of -O3, inner loop not unrolled.
---
 .../vect/costmodel/x86_64/costmodel-pr123190-1.c   |  2 +-
 .../vect/costmodel/x86_64/costmodel-pr123190-2.c   |  7 +++++++
 gcc/tree-vect-stmts.cc                             | 14 +++++++++-----
 gcc/tree-vectorizer.h                              |  1 +
 4 files changed, 18 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
index 4265ac80a43..098468627f0 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-O3 -mavx2 -mno-avx512f -mtune=generic" } */
+/* { dg-additional-options "-O3 -mavx2 -mno-avx512f" } */
 
 typedef struct {
    double real;
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c
new file mode 100644
index 00000000000..abc63b23695
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -mavx2 -mno-avx512f" } */
+
+#include "costmodel-pr123190-1.c"
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 32" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a563238c4be..83983742467 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2087,6 +2087,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
   tree *ls_type = &ls->ls_type;
   bool *slp_perm = &ls->slp_perm;
   unsigned *n_perms = &ls->n_perms;
+  unsigned *n_loads = &ls->n_loads;
   tree *supported_offset_vectype = &ls->supported_offset_vectype;
   int *supported_scale = &ls->supported_scale;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
@@ -2103,6 +2104,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
   *ls_type = NULL_TREE;
   *slp_perm = false;
   *n_perms = -1U;
+  *n_loads = -1U;
   ls->subchain_p = false;
 
   bool perm_ok = true;
@@ -2110,7 +2112,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
 
   if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
     perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
-                                           vf, true, n_perms);
+                                           vf, true, n_perms, n_loads);
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -11880,18 +11882,20 @@ vectorizable_load (vec_info *vinfo,
         in PR101120 and friends.  */
       if (costing_p)
        {
-         gcc_assert (ls.n_perms != -1U);
+         gcc_assert (ls.n_perms != -1U && ls.n_loads != -1U);
          if (ls.n_perms != 0)
            inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
                                            slp_node, 0, vect_body);
+         if (n_adjacent_loads > 0)
+           n_adjacent_loads = ls.n_loads;
        }
       else
        {
-         unsigned n_perms2;
+         unsigned n_perms2, n_loads2;
          bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
                                                  gsi, vf, false, &n_perms2,
-                                                 nullptr, true);
-         gcc_assert (ok && ls.n_perms == n_perms2);
+                                                 &n_loads2, true);
+         gcc_assert (ok && ls.n_perms == n_perms2 && ls.n_loads == n_loads2);
        }
     }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 7a38d4969cf..2cbf752e4e7 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -307,6 +307,7 @@ struct vect_load_store_data : vect_data {
   /* True if the load requires a load permutation.  */
   bool slp_perm;    // SLP_TREE_LOAD_PERMUTATION
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
+  unsigned n_loads; // SLP_TREE_LOAD_PERMUTATION
   /* Whether the load permutation is consecutive and simple.  */
   bool subchain_p; // VMAT_STRIDED_SLP and VMAT_GATHER_SCATTER
 };
-- 
2.51.0

[PATCH] tree-optimization/123190 - fix costing of permuted contiguous loads

Reply via email to