The following implements VMAT_ELEMENTWISE for grouped loads, in
particular so it can serve as a fallback for unhandled load
permutations, since with elementwise accesses it is trivial to load
the elements in the correct order.
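
For illustration, a hypothetical reduced loop of the kind this
fallback is meant to serve (not the PR testcase, and whether the
rotate below is actually left unhandled depends on the target):
a three-element grouped load whose lanes are consumed in rotated
order.  When the permutation cannot be code generated, the load can
now fall back to VMAT_ELEMENTWISE, with the permutation applied
while computing each element's offset.

void
foo (double *restrict out, const double *restrict in, int n)
{
  for (int i = 0; i < n; ++i)
    {
      /* Grouped load of in[3*i .. 3*i+2], lanes used in rotated
	 order.  */
      out[3*i]     = in[3*i + 1];
      out[3*i + 1] = in[3*i + 2];
      out[3*i + 2] = in[3*i];
    }
}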
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
PR tree-optimization/116816
* tree-vect-stmts.cc (get_load_store_type): Allow multi-lane
single-element interleaving to fall back to VMAT_ELEMENTWISE.
Fall back to VMAT_ELEMENTWISE when we cannot handle a load
permutation.
(vectorizable_load): Do not check a load permutation
for VMAT_ELEMENTWISE. Handle grouped loads with
VMAT_ELEMENTWISE and directly apply a load permutation.
---
gcc/tree-vect-stmts.cc | 81 ++++++++++++++++++------------------------
1 file changed, 34 insertions(+), 47 deletions(-)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 841733349b5..bdedb25efe9 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2134,41 +2134,26 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
|| *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
&& maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
{
- if (SLP_TREE_LANES (slp_node) == 1)
- {
- *memory_access_type = VMAT_ELEMENTWISE;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads, using "
- "elementwise access\n");
- }
- else
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads\n");
- return false;
- }
+ *memory_access_type = VMAT_ELEMENTWISE;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads, using "
+ "elementwise access\n");
}
- /* For single-element interleaving also fall back to elementwise
- access in case we did not lower a permutation and cannot
- code generate it. */
+ /* Also fall back to elementwise access in case we did not lower a
+ permutation and cannot code generate it. */
if (loop_vinfo
- && single_element_p
- && SLP_TREE_LANES (slp_node) == 1
- && (*memory_access_type == VMAT_CONTIGUOUS
- || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ && *memory_access_type != VMAT_ELEMENTWISE
&& SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
&& !perm_ok)
{
*memory_access_type = VMAT_ELEMENTWISE;
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving permutation not "
- "supported, using elementwise access\n");
+ "permutation not supported, using elementwise "
+ "access\n");
}
overrun_p = (loop_vinfo && gap != 0
@@ -2498,9 +2483,9 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
traditional behavior until that can be fixed. */
if (*memory_access_type == VMAT_ELEMENTWISE
&& !STMT_VINFO_STRIDED_P (first_stmt_info)
- && !(stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info)
- && !DR_GROUP_NEXT_ELEMENT (stmt_info)
- && !pow2p_hwi (DR_GROUP_SIZE (stmt_info))))
+ && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && single_element_p
+ && !pow2p_hwi (group_size)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9480,11 +9465,11 @@ vectorizable_load (vec_info *vinfo,
/* ??? The following checks should really be part of
get_load_store_type. */
if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
- && !((memory_access_type == VMAT_ELEMENTWISE
- || mat_gather_scatter_p (memory_access_type))
- && SLP_TREE_LANES (slp_node) == 1
- && (!grouped_load
- || !DR_GROUP_NEXT_ELEMENT (first_stmt_info))))
+ && !(memory_access_type == VMAT_ELEMENTWISE
+ || (mat_gather_scatter_p (memory_access_type)
+ && SLP_TREE_LANES (slp_node) == 1
+ && (!grouped_load
+ || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
{
slp_perm = true;
@@ -9727,28 +9712,24 @@ vectorizable_load (vec_info *vinfo,
{
first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
+ ref_type = get_group_alias_ptr_type (first_stmt_info);
}
else
{
first_stmt_info = stmt_info;
first_dr_info = dr_info;
+ ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
}
- if (grouped_load && memory_access_type == VMAT_STRIDED_SLP)
+ if (grouped_load)
{
- group_size = DR_GROUP_SIZE (first_stmt_info);
- ref_type = get_group_alias_ptr_type (first_stmt_info);
+ if (memory_access_type == VMAT_STRIDED_SLP)
+ group_size = DR_GROUP_SIZE (first_stmt_info);
+ else /* VMAT_ELEMENTWISE */
+ group_size = SLP_TREE_LANES (slp_node);
}
else
- {
- if (grouped_load)
- cst_offset
- = (tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))
- * vect_get_place_in_interleaving_chain (stmt_info,
- first_stmt_info));
- group_size = 1;
- ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
- }
+ group_size = 1;
if (!costing_p)
{
@@ -9887,6 +9868,7 @@ vectorizable_load (vec_info *vinfo,
int ncopies;
if (slp_perm)
{
+ gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
/* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
variable VF. */
unsigned int const_vf = vf.to_constant ();
@@ -9922,8 +9904,13 @@ vectorizable_load (vec_info *vinfo,
slp_node, 0, vect_body);
continue;
}
+ unsigned int load_el = group_el;
+ /* For elementwise accesses apply a load permutation directly. */
+ if (memory_access_type == VMAT_ELEMENTWISE
+ && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+ load_el = SLP_TREE_LOAD_PERMUTATION (slp_node)[group_el];
tree this_off = build_int_cst (TREE_TYPE (alias_off),
- group_el * elsz + cst_offset);
+ load_el * elsz + cst_offset);
tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
new_temp = make_ssa_name (ltype);
--
2.51.0