The following makes sure that live scalar stmts appearing in multiple
SLP nodes are costed only once and are code-generated from the SLP
node for which we verified that all scalar uses can be replaced.
* tree-vectorizer.h (_slp_tree::live_lanes): New vector.
(SLP_TREE_LIVE_LANES): New.
* tree-vect-loop.cc (vectorizable_live_operation): Append
to SLP_TREE_LIVE_LANES.
* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize
SLP_TREE_LIVE_LANES.
(_slp_tree::~_slp_tree): Release SLP_TREE_LIVE_LANES.
(vect_print_slp_tree): Adjust live lane dumping, indicating
the SLP node a lane is code generated from.
(vect_bb_slp_mark_live_stmts): No longer verify that we can
code-generate from all SLP nodes; require only one, picking
the first suitable node.
* tree-vect-stmts.cc (vect_transform_stmt): Iterate over
SLP_TREE_LIVE_LANES.
(vect_analyze_stmt): No longer exclude reduction nodes when
analyzing live lanes.
---
gcc/tree-vect-loop.cc | 11 ++++++++---
gcc/tree-vect-slp.cc | 31 +++++++++++++++----------------
gcc/tree-vect-stmts.cc | 12 ++++++++----
gcc/tree-vectorizer.h | 5 +++++
4 files changed, 36 insertions(+), 23 deletions(-)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 5518be4d392..dd239a0b015 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10165,7 +10165,10 @@ vectorizable_live_operation (vec_info *vinfo,
stmt_vec_info stmt_info,
if (vect_is_reduction (slp_node))
{
if (!vec_stmt_p)
- return true;
+ {
+ SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
+ return true;
+ }
/* For SLP reductions we vectorize the epilogue for all involved stmts
together. For SLP reduction chains we only get here once. */
if (SLP_INSTANCE_KIND (slp_node_instance) == slp_inst_kind_reduc_group
@@ -10289,6 +10292,7 @@ vectorizable_live_operation (vec_info *vinfo,
stmt_vec_info stmt_info,
if (!loop_vinfo)
record_stmt_cost (cost_vec, 1, vec_to_scalar, slp_node,
0, vect_epilogue);
+ SLP_TREE_LIVE_LANES (slp_node).safe_push (slp_index);
return true;
}
@@ -10423,7 +10427,8 @@ vectorizable_live_operation (vec_info *vinfo,
stmt_vec_info stmt_info,
/* ??? This can happen when the live lane ends up being
rooted in a vector construction code-generated by an
external SLP node (and code-generation for that already
- happened). See gcc.dg/vect/bb-slp-47.c.
+ happened). See gcc.dg/vect/bb-slp-47.c or
+ gcc.dg/vect/pr97173.c or gcc.dg/vect/bb-slp-pr115777.c.
Doing this is what would happen if that vector CTOR
were not code-generated yet so it is not too bad.
??? In fact we'd likely want to avoid this situation
@@ -10444,7 +10449,7 @@ vectorizable_live_operation (vec_info *vinfo,
stmt_vec_info stmt_info,
/* ??? It can also happen that we end up pulling a def into
a loop where replacing out-of-loop uses would require
a new LC SSA PHI node. Retain the original scalar in
- those cases as well. PR98064. */
+ those cases as well. PR98064, gcc.dg/vect/bb-slp-57.c. */
if (TREE_CODE (new_tree) == SSA_NAME
&& !SSA_NAME_IS_DEFAULT_DEF (new_tree)
&& (gimple_bb (use_stmt)->loop_father
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index e4b3352f958..10e9ff607ad 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -114,6 +114,7 @@ _slp_tree::_slp_tree ()
slp_first_node = this;
SLP_TREE_SCALAR_STMTS (this) = vNULL;
SLP_TREE_SCALAR_OPS (this) = vNULL;
+ SLP_TREE_LIVE_LANES (this) = vNULL;
SLP_TREE_VEC_DEFS (this) = vNULL;
SLP_TREE_CHILDREN (this) = vNULL;
SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
@@ -149,6 +150,7 @@ _slp_tree::~_slp_tree ()
SLP_TREE_CHILDREN (this).release ();
SLP_TREE_SCALAR_STMTS (this).release ();
SLP_TREE_SCALAR_OPS (this).release ();
+ SLP_TREE_LIVE_LANES (this).release ();
SLP_TREE_VEC_DEFS (this).release ();
SLP_TREE_LOAD_PERMUTATION (this).release ();
SLP_TREE_LANE_PERMUTATION (this).release ();
@@ -3340,7 +3342,9 @@ vect_print_slp_tree (dump_flags_t dump_kind,
dump_location_t loc,
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
if (stmt_info)
dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
- STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
+ SLP_TREE_LIVE_LANES (node).contains (i)
+ ? "[l*]" : (STMT_VINFO_LIVE_P (stmt_info)
+ ? "[l] " : ""),
i, stmt_info->stmt);
else
dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
@@ -9023,28 +9027,23 @@ vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo,
slp_tree node,
}
if (live_p && can_insert)
{
+ /* Only record a live stmt when we can replace all uses. We
+ record from which SLP tree we vectorize the uses, so we'll
+ cost once and can deal with the case that not all SLP nodes
+ may be suitable for code-generation of all live uses.
+ ??? But we never split up the work between multiple SLP
+ nodes. */
STMT_VINFO_LIVE_P (stmt_info) = true;
- if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
- instance, i, false, cost_vec))
+ if (!vectorizable_live_operation (bb_vinfo, stmt_info, node,
+ instance, i, false, cost_vec))
{
- /* ??? So we know we can vectorize the live stmt from one SLP
- node. If we cannot do so from all or none consistently
- we'd have to record which SLP node (and lane) we want to
- use for the live operation. So make sure we can
- code-generate from all nodes. */
- /* ??? We are costing the extract possibly multiple times,
- but code-generation also works this way, leaving uses
- that are not valid for one extraction to be handled
- by another. */
+ STMT_VINFO_LIVE_P (stmt_info) = false;
mark_visited = false;
}
}
}
if (mark_visited)
- {
- STMT_VINFO_LIVE_P (stmt_info) = false;
- svisited.add (stmt_info);
- }
+ svisited.add (stmt_info);
}
slp_tree child;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2c3214cc196..c895e143473 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13316,7 +13316,6 @@ vect_analyze_stmt (vec_info *vinfo,
/* Stmts that are (also) "live" (i.e. - that are used out of the loop)
need extra handling, except for vectorizable reductions. */
if (!bb_vinfo
- && SLP_TREE_TYPE (node) != reduc_vec_info_type
&& (SLP_TREE_TYPE (node) != lc_phi_info_type
|| SLP_TREE_DEF_TYPE (node) == vect_internal_def)
&& (!node->ldst_lanes || SLP_TREE_PERMUTE_P (node))
@@ -13466,9 +13465,14 @@ vect_transform_stmt (vec_info *vinfo,
{
/* Handle stmts whose DEF is used outside the loop-nest that is
being vectorized. */
- done = can_vectorize_live_stmts (vinfo, slp_node,
- slp_node_instance, true, NULL);
- gcc_assert (done);
+ for (unsigned lane : SLP_TREE_LIVE_LANES (slp_node))
+ {
+ stmt_vec_info slp_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[lane];
+ done = vectorizable_live_operation (vinfo, slp_stmt_info, slp_node,
+ slp_node_instance, lane,
+ true, NULL);
+ gcc_assert (done);
+ }
}
return is_store;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 9a4126e0cec..e2aa1c4bad5 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -328,6 +328,10 @@ struct _slp_tree {
vec<stmt_vec_info> stmts;
/* A group of scalar operands to be vectorized together. */
vec<tree> ops;
+ /* A set of lane indices that are live and to be code-generated from
+ this SLP node. */
+ vec<unsigned> live_lanes;
+
/* The representative that should be used for analysis and
code generation. */
stmt_vec_info representative;
@@ -457,6 +461,7 @@ public:
#define SLP_TREE_CHILDREN(S) (S)->children
#define SLP_TREE_SCALAR_STMTS(S) (S)->stmts
#define SLP_TREE_SCALAR_OPS(S) (S)->ops
+#define SLP_TREE_LIVE_LANES(S) (S)->live_lanes
#define SLP_TREE_REF_COUNT(S) (S)->refcnt
#define SLP_TREE_VEC_DEFS(S) (S)->vec_defs
#define SLP_TREE_LOAD_PERMUTATION(S) (S)->load_permutation
--
2.51.0