https://gcc.gnu.org/g:948d33f490a6b0051376da6bdcf55223a552b30f
commit r16-6767-g948d33f490a6b0051376da6bdcf55223a552b30f Author: Richard Biener <[email protected]> Date: Wed Jan 14 12:45:19 2026 +0100 tree-optimization/123190 - fix costing of permuted contiguous loads The following fixes a regression from the time we split load groups along SLP boundaries. When we face a permuted load from an access that is contiguous across loop iterations we emit code that loads the whole group and then emit required permutations. The permutations might not need all those loads, and if we split the group we would not have emitted them. Fortunately when analyzing a permutation we compute both the number of required permutes and the number of loads that will survive the following DCE. So make sure to use that when costing. This allows the previously added testcase for PR123190 to undergo epilog vectorization also at -O2 plus when using non-generic tuning, such as tuning for Zen4 which ups the cost for XMM loads. PR tree-optimization/123190 * tree-vectorizer.h (vect_load_store_data): Add n_loads member. * tree-vect-stmts.cc (get_load_store_type): Record the number of required loads for permuted loads. (vectorizable_load): Make use of this when costing loads for VMAT_CONTIGUOUS[_REVERSE]. * gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c: Do not require -mtune=generic. * gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c: Add variant with -O2 instead of -O3, inner loop not unrolled. 
Diff: --- .../gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c | 2 +- .../gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c | 7 +++++++ gcc/tree-vect-stmts.cc | 14 +++++++++----- gcc/tree-vectorizer.h | 1 + 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c index 4265ac80a43d..098468627f05 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-additional-options "-O3 -mavx2 -mno-avx512f -mtune=generic" } */ +/* { dg-additional-options "-O3 -mavx2 -mno-avx512f" } */ typedef struct { double real; diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c new file mode 100644 index 000000000000..abc63b236955 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr123190-2.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O2 -mavx2 -mno-avx512f" } */ + +#include "costmodel-pr123190-1.c" + +/* { dg-final { scan-tree-dump "optimized: loop vectorized using 32" "vect" } } */ +/* { dg-final { scan-tree-dump "optimized: epilogue loop vectorized using 16 byte vectors and unroll factor 1" "vect" } } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index a563238c4be0..83983742467c 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2087,6 +2087,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, tree *ls_type = &ls->ls_type; bool *slp_perm = &ls->slp_perm; unsigned *n_perms = &ls->n_perms; + unsigned *n_loads = &ls->n_loads; tree *supported_offset_vectype = &ls->supported_offset_vectype; int *supported_scale = &ls->supported_scale; loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); @@ -2103,6 +2104,7 @@ 
get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, *ls_type = NULL_TREE; *slp_perm = false; *n_perms = -1U; + *n_loads = -1U; ls->subchain_p = false; bool perm_ok = true; @@ -2110,7 +2112,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, - vf, true, n_perms); + vf, true, n_perms, n_loads); if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) { @@ -11880,18 +11882,20 @@ vectorizable_load (vec_info *vinfo, in PR101120 and friends. */ if (costing_p) { - gcc_assert (ls.n_perms != -1U); + gcc_assert (ls.n_perms != -1U && ls.n_loads != -1U); if (ls.n_perms != 0) inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm, slp_node, 0, vect_body); + if (n_adjacent_loads > 0) + n_adjacent_loads = ls.n_loads; } else { - unsigned n_perms2; + unsigned n_perms2, n_loads2; bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf, false, &n_perms2, - nullptr, true); - gcc_assert (ok && ls.n_perms == n_perms2); + &n_loads2, true); + gcc_assert (ok && ls.n_perms == n_perms2 && ls.n_loads == n_loads2); } } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 7a38d4969cf2..2cbf752e4e76 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -307,6 +307,7 @@ struct vect_load_store_data : vect_data { /* True if the load requires a load permutation. */ bool slp_perm; // SLP_TREE_LOAD_PERMUTATION unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION + unsigned n_loads; // SLP_TREE_LOAD_PERMUTATION /* Whether the load permutation is consecutive and simple. */ bool subchain_p; // VMAT_STRIDED_SLP and VMAT_GATHER_SCATTER };
