[gcc r16-6767] tree-optimization/123190 - fix costing of permuted contiguous loads

Richard Biener via Gcc-cvs Wed, 14 Jan 2026 05:47:02 -0800

https://gcc.gnu.org/g:948d33f490a6b0051376da6bdcf55223a552b30f


commit r16-6767-g948d33f490a6b0051376da6bdcf55223a552b30f
Author: Richard Biener <[email protected]>
Date:   Wed Jan 14 12:45:19 2026 +0100

    tree-optimization/123190 - fix costing of permuted contiguous loads
    
    The following fixes a regression from the time we split load groups
    along SLP boundaries.  When we face a permuted load from an access
    that is contiguous across loop iterations we emit code that loads
    the whole group and then emit required permutations.  The permutations
    might not need all those loads, and if we split the group we would
    not have emitted them.  Fortunately when analyzing a permutation
    we compute both the number of required permutes and the number of
    loads that will survive the following DCE.  So make sure to use that
    when costing.  This allows the previously added testcase for PR123190
    to undergo epilog vectorization also at -O2 plus when using non-generic
    tuning, such as tuning for Zen4 which ups the cost for XMM loads.
    
            PR tree-optimization/123190
            * tree-vectorizer.h (vect_load_store_data): Add n_loads member.
            * tree-vect-stmts.cc (get_load_store_type): Record the
            number of required loads for permuted loads.
            (vectorizable_load): Make use of this when costing loads
            for VMAT_CONTIGUOUS[_REVERSE].
    
            * gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c: Do not
            require -mtune=generic.
            * gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c: Add
            variant with -O2 instead of -O3, inner loop not unrolled.

Diff:
---
 .../gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c    |  2 +-
 .../gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c    |  7 +++++++
 gcc/tree-vect-stmts.cc                                     | 14 +++++++++-----
 gcc/tree-vectorizer.h                                      |  1 +
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
index 4265ac80a43d..098468627f05 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-O3 -mavx2 -mno-avx512f -mtune=generic" } */
+/* { dg-additional-options "-O3 -mavx2 -mno-avx512f" } */
 
 typedef struct {
    double real;
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c
new file mode 100644
index 000000000000..abc63b236955
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c
@@ -0,0 +1,7 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -mavx2 -mno-avx512f" } */
+
+#include "costmodel-pr123190-1.c"
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 32" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a563238c4be0..83983742467c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2087,6 +2087,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
   tree *ls_type = &ls->ls_type;
   bool *slp_perm = &ls->slp_perm;
   unsigned *n_perms = &ls->n_perms;
+  unsigned *n_loads = &ls->n_loads;
   tree *supported_offset_vectype = &ls->supported_offset_vectype;
   int *supported_scale = &ls->supported_scale;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
@@ -2103,6 +2104,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
   *ls_type = NULL_TREE;
   *slp_perm = false;
   *n_perms = -1U;
+  *n_loads = -1U;
   ls->subchain_p = false;
 
   bool perm_ok = true;
@@ -2110,7 +2112,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
 
   if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
     perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
-                                           vf, true, n_perms);
+                                           vf, true, n_perms, n_loads);
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -11880,18 +11882,20 @@ vectorizable_load (vec_info *vinfo,
         in PR101120 and friends.  */
       if (costing_p)
        {
-         gcc_assert (ls.n_perms != -1U);
+         gcc_assert (ls.n_perms != -1U && ls.n_loads != -1U);
          if (ls.n_perms != 0)
            inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
                                            slp_node, 0, vect_body);
+         if (n_adjacent_loads > 0)
+           n_adjacent_loads = ls.n_loads;
        }
       else
        {
-         unsigned n_perms2;
+         unsigned n_perms2, n_loads2;
          bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
                                                  gsi, vf, false, &n_perms2,
-                                                 nullptr, true);
-         gcc_assert (ok && ls.n_perms == n_perms2);
+                                                 &n_loads2, true);
+         gcc_assert (ok && ls.n_perms == n_perms2 && ls.n_loads == n_loads2);
        }
     }
 
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 7a38d4969cf2..2cbf752e4e76 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -307,6 +307,7 @@ struct vect_load_store_data : vect_data {
   /* True if the load requires a load permutation.  */
   bool slp_perm;    // SLP_TREE_LOAD_PERMUTATION
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
+  unsigned n_loads; // SLP_TREE_LOAD_PERMUTATION
   /* Whether the load permutation is consecutive and simple.  */
   bool subchain_p; // VMAT_STRIDED_SLP and VMAT_GATHER_SCATTER
 };

[gcc r16-6767] tree-optimization/123190 - fix costing of permuted contiguous loads

Reply via email to