This is the main patch in the series. It adds a new enum and routines for classifying how a vector load or store will be implemented.
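For reference, here is a condensed sketch (not part of the patch, with the surrounding declarations and error paths omitted) of how a vectorizable_* routine is expected to use the new classification; the real code is in the vectorizable_store/vectorizable_load changes below:

  /* Analysis: classify the access and, for loop vectorization,
     remember the decision for the transform phase.  */
  vect_memory_access_type memory_access_type;
  if (!get_load_store_type (stmt, vectype, slp, vls_type,
                            &memory_access_type, &gs_info))
    return false;

  if (!vec_stmt)
    {
      if (!slp)
        STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
      /* ...record costs based on memory_access_type...  */
      return true;
    }

  /* Transform: generate code according to the classification
     made during analysis.  */
  if (memory_access_type == VMAT_GATHER_SCATTER)
    /* ...emit gather loads or scatter stores...  */;
  else if (memory_access_type == VMAT_LOAD_STORE_LANES)
    /* ...use IFN_LOAD_LANES or IFN_STORE_LANES...  */;
  else if (memory_access_type == VMAT_ELEMENTWISE
           || memory_access_type == VMAT_STRIDED_SLP)
    /* ...use separate scalar or small-vector accesses...  */;
  else
    /* VMAT_CONTIGUOUS or VMAT_CONTIGUOUS_PERMUTE: full-width vector
       accesses, with interleaving permutes where needed.  */;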
Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install? Thanks, Richard gcc/ * tree-vectorizer.h (vect_memory_access_type): New enum. (_stmt_vec_info): Add a memory_access_type field. (STMT_VINFO_MEMORY_ACCESS_TYPE): New macro. (vect_model_store_cost): Take an access type instead of a boolean. (vect_model_load_cost): Likewise. * tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to vect_model_store_cost and vect_model_load_cost. * tree-vect-stmts.c (vec_load_store_type): New enum. (vect_model_store_cost): Take an access type instead of a store_lanes_p boolean. Simplify tests. (vect_model_load_cost): Likewise, but for load_lanes_p. (get_group_load_store_type, get_load_store_type): New functions. (vectorizable_store): Use get_load_store_type. Record the access type in STMT_VINFO_MEMORY_ACCESS_TYPE. (vectorizable_load): Likewise. (vectorizable_mask_load_store): Likewise. Replace is_store variable with vls_type. Index: gcc/tree-vectorizer.h =================================================================== --- gcc/tree-vectorizer.h +++ gcc/tree-vectorizer.h @@ -485,6 +485,33 @@ enum slp_vect_type { hybrid }; +/* Describes how we're going to vectorize an individual load or store, + or a group of loads or stores. */ +enum vect_memory_access_type { + /* A simple contiguous access. */ + VMAT_CONTIGUOUS, + + /* A simple contiguous access in which the elements need to be permuted + after loading or before storing. Only used for loop vectorization; + SLP uses separate permutes. */ + VMAT_CONTIGUOUS_PERMUTE, + + /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES. */ + VMAT_LOAD_STORE_LANES, + + /* An access in which each scalar element is loaded or stored + individually. */ + VMAT_ELEMENTWISE, + + /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped + SLP accesses. Each unrolled iteration uses a contiguous load + or store for the whole group, but the groups from separate iterations + are combined in the same way as for VMAT_ELEMENTWISE. */ + VMAT_STRIDED_SLP, + + /* The access uses gather loads or scatter stores. */ + VMAT_GATHER_SCATTER +}; typedef struct data_reference *dr_p; @@ -602,6 +629,10 @@ typedef struct _stmt_vec_info { /* True if this is an access with loop-invariant stride. */ bool strided_p; + /* Classifies how the load or store is going to be implemented + for loop vectorization. */ + vect_memory_access_type memory_access_type; + /* For both loads and stores. 
*/ bool simd_lane_access_p; @@ -659,6 +690,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info #define STMT_VINFO_GATHER_SCATTER_P(S) (S)->gather_scatter_p #define STMT_VINFO_STRIDED_P(S) (S)->strided_p +#define STMT_VINFO_MEMORY_ACCESS_TYPE(S) (S)->memory_access_type #define STMT_VINFO_SIMD_LANE_ACCESS_P(S) (S)->simd_lane_access_p #define STMT_VINFO_VEC_REDUCTION_TYPE(S) (S)->v_reduc_type @@ -1006,12 +1038,12 @@ extern void free_stmt_vec_info (gimple *stmt); extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *, stmt_vector_for_cost *, stmt_vector_for_cost *); -extern void vect_model_store_cost (stmt_vec_info, int, bool, +extern void vect_model_store_cost (stmt_vec_info, int, vect_memory_access_type, enum vect_def_type, slp_tree, stmt_vector_for_cost *, stmt_vector_for_cost *); -extern void vect_model_load_cost (stmt_vec_info, int, bool, slp_tree, - stmt_vector_for_cost *, +extern void vect_model_load_cost (stmt_vec_info, int, vect_memory_access_type, + slp_tree, stmt_vector_for_cost *, stmt_vector_for_cost *); extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, enum vect_cost_for_stmt, stmt_vec_info, Index: gcc/tree-vect-slp.c =================================================================== --- gcc/tree-vect-slp.c +++ gcc/tree-vect-slp.c @@ -1490,9 +1490,13 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node, stmt_info = vinfo_for_stmt (stmt); if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) { + vect_memory_access_type memory_access_type + = (STMT_VINFO_STRIDED_P (stmt_info) + ? VMAT_STRIDED_SLP + : VMAT_CONTIGUOUS); if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))) - vect_model_store_cost (stmt_info, ncopies_for_cost, false, - vect_uninitialized_def, + vect_model_store_cost (stmt_info, ncopies_for_cost, + memory_access_type, vect_uninitialized_def, node, prologue_cost_vec, body_cost_vec); else { @@ -1515,8 +1519,9 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node, ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance); } /* Record the cost for the vector loads. */ - vect_model_load_cost (stmt_info, ncopies_for_cost, false, - node, prologue_cost_vec, body_cost_vec); + vect_model_load_cost (stmt_info, ncopies_for_cost, + memory_access_type, node, prologue_cost_vec, + body_cost_vec); return; } } Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c +++ gcc/tree-vect-stmts.c @@ -52,6 +52,14 @@ along with GCC; see the file COPYING3. If not see /* For lang_hooks.types.type_for_mode. */ #include "langhooks.h" +/* Says whether a statement is a load, a store of a vectorized statement + result, or a store of an invariant value. */ +enum vec_load_store_type { + VLS_LOAD, + VLS_STORE, + VLS_STORE_INVARIANT +}; + /* Return the vectorized type for the given statement. */ tree @@ -873,8 +881,8 @@ vect_model_promotion_demotion_cost (stmt_vec_info stmt_info, void vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, - bool store_lanes_p, enum vect_def_type dt, - slp_tree slp_node, + vect_memory_access_type memory_access_type, + enum vect_def_type dt, slp_tree slp_node, stmt_vector_for_cost *prologue_cost_vec, stmt_vector_for_cost *body_cost_vec) { @@ -903,14 +911,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, /* We assume that the cost of a single store-lanes instruction is equivalent to the cost of GROUP_SIZE separate stores. 
If a grouped access is instead being provided by a permute-and-store operation, - include the cost of the permutes. - - For SLP, the caller has already counted the permutation, if any. */ - if (grouped_access_p - && first_stmt_p - && !store_lanes_p - && !STMT_VINFO_STRIDED_P (stmt_info) - && !slp_node) + include the cost of the permutes. */ + if (first_stmt_p + && memory_access_type == VMAT_CONTIGUOUS_PERMUTE) { /* Uses a high and low interleave or shuffle operations for each needed permute. */ @@ -927,17 +930,16 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, tree vectype = STMT_VINFO_VECTYPE (stmt_info); /* Costs of the stores. */ - if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p)) - { - /* N scalar stores plus extracting the elements. */ - inside_cost += record_stmt_cost (body_cost_vec, - ncopies * TYPE_VECTOR_SUBPARTS (vectype), - scalar_store, stmt_info, 0, vect_body); - } + if (memory_access_type == VMAT_ELEMENTWISE) + /* N scalar stores plus extracting the elements. */ + inside_cost += record_stmt_cost (body_cost_vec, + ncopies * TYPE_VECTOR_SUBPARTS (vectype), + scalar_store, stmt_info, 0, vect_body); else vect_get_store_cost (dr, ncopies, &inside_cost, body_cost_vec); - if (STMT_VINFO_STRIDED_P (stmt_info)) + if (memory_access_type == VMAT_ELEMENTWISE + || memory_access_type == VMAT_STRIDED_SLP) inside_cost += record_stmt_cost (body_cost_vec, ncopies * TYPE_VECTOR_SUBPARTS (vectype), vec_to_scalar, stmt_info, 0, vect_body); @@ -1011,7 +1013,8 @@ vect_get_store_cost (struct data_reference *dr, int ncopies, void vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, - bool load_lanes_p, slp_tree slp_node, + vect_memory_access_type memory_access_type, + slp_tree slp_node, stmt_vector_for_cost *prologue_cost_vec, stmt_vector_for_cost *body_cost_vec) { @@ -1036,14 +1039,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, /* We assume that the cost of a single load-lanes instruction is equivalent to the cost of GROUP_SIZE separate loads. If a grouped access is instead being provided by a load-and-permute operation, - include the cost of the permutes. - - For SLP, the caller has already counted the permutation, if any. */ - if (grouped_access_p - && first_stmt_p - && !load_lanes_p - && !STMT_VINFO_STRIDED_P (stmt_info) - && !slp_node) + include the cost of the permutes. */ + if (first_stmt_p + && memory_access_type == VMAT_CONTIGUOUS_PERMUTE) { /* Uses an even and odd extract operations or shuffle operations for each needed permute. */ @@ -1059,7 +1057,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, } /* The loads themselves. */ - if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p)) + if (memory_access_type == VMAT_ELEMENTWISE) { /* N scalar loads plus gathering them into a vector. */ tree vectype = STMT_VINFO_VECTYPE (stmt_info); @@ -1071,7 +1069,8 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, vect_get_load_cost (dr, ncopies, first_stmt_p, &inside_cost, &prologue_cost, prologue_cost_vec, body_cost_vec, true); - if (STMT_VINFO_STRIDED_P (stmt_info)) + if (memory_access_type == VMAT_ELEMENTWISE + || memory_access_type == VMAT_STRIDED_SLP) inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct, stmt_info, 0, vect_body); @@ -1674,6 +1673,209 @@ static tree permute_vec_elements (tree, tree, tree, gimple *, gimple_stmt_iterator *); +/* A subroutine of get_load_store_type, with a subset of the same + arguments. 
Handle the case where STMT is part of a grouped load + or store. + + For stores, the statements in the group are all consecutive + and there is no gap at the end. For loads, the statements in the + group might not be consecutive; there can be gaps between statements + as well as at the end. */ + +static bool +get_group_load_store_type (gimple *stmt, tree vectype, bool slp, + vec_load_store_type vls_type, + vect_memory_access_type *memory_access_type) +{ + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + vec_info *vinfo = stmt_info->vinfo; + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL; + gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info); + unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); + bool single_element_p = (stmt == first_stmt + && !GROUP_NEXT_ELEMENT (stmt_info)); + unsigned HOST_WIDE_INT gap = GROUP_GAP (vinfo_for_stmt (first_stmt)); + int nunits = TYPE_VECTOR_SUBPARTS (vectype); + + /* True if the vectorized statements would access beyond the last + statement in the group. */ + bool overrun_p = false; + + /* True if we can cope with such overrun by peeling for gaps, so that + there is at least one final scalar iteration after the vector loop. */ + bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner); + + /* There can only be a gap at the end of the group if the stride is + known at compile time. */ + gcc_assert (!STMT_VINFO_STRIDED_P (stmt_info) || gap == 0); + + /* Stores can't yet have gaps. */ + gcc_assert (slp || vls_type == VLS_LOAD || gap == 0); + + if (slp) + { + if (STMT_VINFO_STRIDED_P (stmt_info)) + { + /* Try to use consecutive accesses of GROUP_SIZE elements, + separated by the stride, until we have a complete vector. + Fall back to scalar accesses if that isn't possible. */ + if (nunits % group_size == 0) + *memory_access_type = VMAT_STRIDED_SLP; + else + *memory_access_type = VMAT_ELEMENTWISE; + } + else + { + overrun_p = loop_vinfo && gap != 0; + if (overrun_p && vls_type != VLS_LOAD) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Grouped store with gaps requires" + " non-consecutive accesses\n"); + return false; + } + if (overrun_p && !can_overrun_p) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Peeling for outer loop is not supported\n"); + return false; + } + *memory_access_type = VMAT_CONTIGUOUS; + } + } + else + { + /* We can always handle this case using elementwise accesses, + but see if something more efficient is available. */ + *memory_access_type = VMAT_ELEMENTWISE; + + /* If there is a gap at the end of the group then these optimizations + would access excess elements in the last iteration. */ + bool would_overrun_p = (gap != 0); + if (!STMT_VINFO_STRIDED_P (stmt_info) + && (can_overrun_p || !would_overrun_p)) + { + /* First try using LOAD/STORE_LANES. */ + if (vls_type == VLS_LOAD + ? vect_load_lanes_supported (vectype, group_size) + : vect_store_lanes_supported (vectype, group_size)) + { + *memory_access_type = VMAT_LOAD_STORE_LANES; + overrun_p = would_overrun_p; + } + + /* If that fails, try using permuting loads. */ + if (*memory_access_type == VMAT_ELEMENTWISE + && (vls_type == VLS_LOAD + ? 
vect_grouped_load_supported (vectype, single_element_p, + group_size) + : vect_grouped_store_supported (vectype, group_size))) + { + *memory_access_type = VMAT_CONTIGUOUS_PERMUTE; + overrun_p = would_overrun_p; + } + } + } + + if (vls_type != VLS_LOAD && first_stmt == stmt) + { + /* STMT is the leader of the group. Check the operands of all the + stmts of the group. */ + gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info); + while (next_stmt) + { + gcc_assert (gimple_assign_single_p (next_stmt)); + tree op = gimple_assign_rhs1 (next_stmt); + gimple *def_stmt; + enum vect_def_type dt; + if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "use not simple.\n"); + return false; + } + next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); + } + } + + if (overrun_p) + { + gcc_assert (can_overrun_p); + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Data access with gaps requires scalar " + "epilogue loop\n"); + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; + } + + return true; +} + +/* Analyze load or store statement STMT of type VLS_TYPE. Return true + if there is a memory access type that the vectorized form can use, + storing it in *MEMORY_ACCESS_TYPE if so. If we decide to use gathers + or scatters, fill in GS_INFO accordingly. + + SLP says whether we're performing SLP rather than loop vectorization. + VECTYPE is the vector type that the vectorized statements will use. */ + +static bool +get_load_store_type (gimple *stmt, tree vectype, bool slp, + vec_load_store_type vls_type, + vect_memory_access_type *memory_access_type, + gather_scatter_info *gs_info) +{ + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + vec_info *vinfo = stmt_info->vinfo; + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + { + *memory_access_type = VMAT_GATHER_SCATTER; + gimple *def_stmt; + if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info)) + gcc_unreachable (); + else if (!vect_is_simple_use (gs_info->offset, vinfo, &def_stmt, + &gs_info->offset_dt, + &gs_info->offset_vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "%s index use not simple.\n", + vls_type == VLS_LOAD ? "gather" : "scatter"); + return false; + } + } + else if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { + if (!get_group_load_store_type (stmt, vectype, slp, vls_type, + memory_access_type)) + return false; + } + else if (STMT_VINFO_STRIDED_P (stmt_info)) + { + gcc_assert (!slp); + *memory_access_type = VMAT_ELEMENTWISE; + } + else + *memory_access_type = VMAT_CONTIGUOUS; + + /* FIXME: At the moment the cost model seems to underestimate the + cost of using elementwise accesses. This check preserves the + traditional behavior until that can be fixed. */ + if (*memory_access_type == VMAT_ELEMENTWISE + && !STMT_VINFO_STRIDED_P (stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not falling back to elementwise accesses\n"); + return false; + } + return true; +} + /* Function vectorizable_mask_load_store. Check if STMT performs a conditional load or store that can be vectorized. 
@@ -1705,7 +1907,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, int i, j; bool inv_p; gather_scatter_info gs_info; - bool is_store; + vec_load_store_type vls_type; tree mask; gimple *def_stmt; enum vect_def_type dt; @@ -1716,7 +1918,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; gcc_assert (ncopies >= 1); - is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE; mask = gimple_call_arg (stmt, 2); if (TREE_CODE (TREE_TYPE (mask)) != BOOLEAN_TYPE) @@ -1743,12 +1944,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, elem_type = TREE_TYPE (vectype); - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) - return false; - - if (STMT_VINFO_STRIDED_P (stmt_info)) - return false; - if (TREE_CODE (mask) != SSA_NAME) return false; @@ -1762,27 +1957,26 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, || TYPE_VECTOR_SUBPARTS (mask_vectype) != TYPE_VECTOR_SUBPARTS (vectype)) return false; - if (is_store) + if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) { tree rhs = gimple_call_arg (stmt, 3); if (!vect_is_simple_use (rhs, loop_vinfo, &def_stmt, &dt, &rhs_vectype)) return false; + if (dt == vect_constant_def || dt == vect_external_def) + vls_type = VLS_STORE_INVARIANT; + else + vls_type = VLS_STORE; } + else + vls_type = VLS_LOAD; - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - { - gimple *def_stmt; - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info)) - gcc_unreachable (); - if (!vect_is_simple_use (gs_info.offset, loop_vinfo, &def_stmt, - &gs_info.offset_dt, &gs_info.offset_vectype)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "gather index use not simple."); - return false; - } + vect_memory_access_type memory_access_type; + if (!get_load_store_type (stmt, vectype, false, vls_type, + &memory_access_type, &gs_info)) + return false; + if (memory_access_type == VMAT_GATHER_SCATTER) + { tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); tree masktype = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist)))); @@ -1794,6 +1988,14 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, return false; } } + else if (memory_access_type != VMAT_CONTIGUOUS) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "unsupported access type for masked %s\n", + vls_type == VLS_LOAD ? "load" : "store"); + return false; + } else if (tree_int_cst_compare (nested_in_vect_loop ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr), size_zero_node) <= 0) @@ -1801,25 +2003,28 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, else if (!VECTOR_MODE_P (TYPE_MODE (vectype)) || !can_vec_mask_load_store_p (TYPE_MODE (vectype), TYPE_MODE (mask_vectype), - !is_store) + vls_type == VLS_LOAD) || (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))) return false; if (!vec_stmt) /* transformation not required. 
*/ { + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type; STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; - if (is_store) - vect_model_store_cost (stmt_info, ncopies, false, dt, - NULL, NULL, NULL); + if (vls_type == VLS_LOAD) + vect_model_load_cost (stmt_info, ncopies, memory_access_type, + NULL, NULL, NULL); else - vect_model_load_cost (stmt_info, ncopies, false, NULL, NULL, NULL); + vect_model_store_cost (stmt_info, ncopies, memory_access_type, + dt, NULL, NULL, NULL); return true; } + gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); /** Transform. **/ - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + if (memory_access_type == VMAT_GATHER_SCATTER) { tree vec_oprnd0 = NULL_TREE, op; tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); @@ -1993,7 +2198,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, gsi_replace (gsi, new_stmt, true); return true; } - else if (is_store) + else if (vls_type != VLS_LOAD) { tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE; prev_stmt_info = NULL; @@ -2102,7 +2307,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi, } } - if (!is_store) + if (vls_type == VLS_LOAD) { /* Ensure that even with -fno-tree-dce the scalar MASK_LOAD is removed from the IL. */ @@ -5188,9 +5393,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, gimple *ptr_incr = NULL; int ncopies; int j; - gimple *next_stmt, *first_stmt = NULL; - bool grouped_store = false; - bool store_lanes_p = false; + gimple *next_stmt, *first_stmt; + bool grouped_store; unsigned int group_size, i; vec<tree> dr_chain = vNULL; vec<tree> oprnds = vNULL; @@ -5207,6 +5411,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, gather_scatter_info gs_info; enum vect_def_type scatter_src_dt = vect_unknown_def_type; gimple *new_stmt; + vec_load_store_type vls_type; if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) return false; @@ -5274,6 +5479,11 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return false; } + if (dt == vect_constant_def || dt == vect_external_def) + vls_type = VLS_STORE_INVARIANT; + else + vls_type = VLS_STORE; + if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype)) return false; @@ -5303,7 +5513,6 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } if (negative) { - gcc_assert (!grouped_store); alignment_support_scheme = vect_supportable_dr_alignment (dr, false); if (alignment_support_scheme != dr_aligned && alignment_support_scheme != dr_unaligned_supported) @@ -5325,80 +5534,31 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } } - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) - { - grouped_store = true; - first_stmt = GROUP_FIRST_ELEMENT (stmt_info); - group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); - if (!slp && !STMT_VINFO_STRIDED_P (stmt_info)) - { - if (vect_store_lanes_supported (vectype, group_size)) - store_lanes_p = true; - else if (!vect_grouped_store_supported (vectype, group_size)) - return false; - } - - if (STMT_VINFO_STRIDED_P (stmt_info) - && slp - && (group_size > nunits - || nunits % group_size != 0)) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "unhandled strided group store\n"); - return false; - } - - if (first_stmt == stmt) - { - /* STMT is the leader of the group. Check the operands of all the - stmts of the group. 
*/ - next_stmt = GROUP_NEXT_ELEMENT (stmt_info); - while (next_stmt) - { - gcc_assert (gimple_assign_single_p (next_stmt)); - op = gimple_assign_rhs1 (next_stmt); - if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "use not simple.\n"); - return false; - } - next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); - } - } - } - - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - { - gimple *def_stmt; - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info)) - gcc_unreachable (); - if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt, - &gs_info.offset_dt, &gs_info.offset_vectype)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "scatter index use not simple."); - return false; - } - } + vect_memory_access_type memory_access_type; + if (!get_load_store_type (stmt, vectype, slp, vls_type, + &memory_access_type, &gs_info)) + return false; if (!vec_stmt) /* transformation not required. */ { + if (!slp) + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type; STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; /* The SLP costs are calculated during SLP analysis. */ if (!PURE_SLP_STMT (stmt_info)) - vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt, + vect_model_store_cost (stmt_info, ncopies, memory_access_type, dt, NULL, NULL, NULL); return true; } + if (!slp) + gcc_assert (memory_access_type + == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); /** Transform. **/ ensure_base_align (stmt_info, dr); - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + if (memory_access_type == VMAT_GATHER_SCATTER) { tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src; tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); @@ -5538,8 +5698,10 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, return true; } + grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info); if (grouped_store) { + first_stmt = GROUP_FIRST_ELEMENT (stmt_info); first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); @@ -5585,7 +5747,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, dump_printf_loc (MSG_NOTE, vect_location, "transform store. ncopies = %d\n", ncopies); - if (STMT_VINFO_STRIDED_P (stmt_info)) + if (memory_access_type == VMAT_ELEMENTWISE + || memory_access_type == VMAT_STRIDED_SLP) { gimple_stmt_iterator incr_gsi; bool insert_after; @@ -5756,14 +5919,14 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, gcc_assert (alignment_support_scheme); /* Targets with store-lane instructions must not require explicit realignment. 
*/ - gcc_assert (!store_lanes_p + gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES || alignment_support_scheme == dr_aligned || alignment_support_scheme == dr_unaligned_supported); if (negative) offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1); - if (store_lanes_p) + if (memory_access_type == VMAT_LOAD_STORE_LANES) aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); else aggr_type = vectype; @@ -5901,7 +6064,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, TYPE_SIZE_UNIT (aggr_type)); } - if (store_lanes_p) + if (memory_access_type == VMAT_LOAD_STORE_LANES) { tree vec_array; @@ -6185,7 +6348,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, gphi *phi = NULL; vec<tree> dr_chain = vNULL; bool grouped_load = false; - bool load_lanes_p = false; gimple *first_stmt; gimple *first_stmt_for_drptr = NULL; bool inv_p; @@ -6294,48 +6456,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, { grouped_load = true; /* FORNOW */ - gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)); + gcc_assert (!nested_in_vect_loop); + gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)); first_stmt = GROUP_FIRST_ELEMENT (stmt_info); group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); - bool single_element_p = (first_stmt == stmt - && !GROUP_NEXT_ELEMENT (stmt_info)); - - if (!slp && !STMT_VINFO_STRIDED_P (stmt_info)) - { - if (vect_load_lanes_supported (vectype, group_size)) - load_lanes_p = true; - else if (!vect_grouped_load_supported (vectype, single_element_p, - group_size)) - return false; - } - - if (single_element_p) - { - /* Single-element interleaving requires peeling for gaps. */ - gcc_assert (GROUP_GAP (stmt_info)); - } - - /* If there is a gap in the end of the group then we access excess - elements in the last iteration and thus need to peel that off. */ - if (loop_vinfo - && ! STMT_VINFO_STRIDED_P (stmt_info) - && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Data access with gaps requires scalar " - "epilogue loop\n"); - if (loop->inner) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Peeling for outer loop is not supported\n"); - return false; - } - - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; - } if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) slp_perm = true; @@ -6381,24 +6506,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } } + vect_memory_access_type memory_access_type; + if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD, + &memory_access_type, &gs_info)) + return false; - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) - { - gimple *def_stmt; - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info)) - gcc_unreachable (); - if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt, - &gs_info.offset_dt, &gs_info.offset_vectype)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "gather index use not simple.\n"); - return false; - } - } - else if (STMT_VINFO_STRIDED_P (stmt_info)) - ; - else + if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info) + && !STMT_VINFO_STRIDED_P (stmt_info)) { negative = tree_int_cst_compare (nested_in_vect_loop ? STMT_VINFO_DR_STEP (stmt_info) @@ -6444,14 +6558,20 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (!vec_stmt) /* transformation not required. 
*/ { + if (!slp) + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type; STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; /* The SLP costs are calculated during SLP analysis. */ if (!PURE_SLP_STMT (stmt_info)) - vect_model_load_cost (stmt_info, ncopies, load_lanes_p, + vect_model_load_cost (stmt_info, ncopies, memory_access_type, NULL, NULL, NULL); return true; } + if (!slp) + gcc_assert (memory_access_type + == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); + if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "transform load. ncopies = %d\n", ncopies); @@ -6460,7 +6580,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, ensure_base_align (stmt_info, dr); - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) + if (memory_access_type == VMAT_GATHER_SCATTER) { tree vec_oprnd0 = NULL_TREE, op; tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); @@ -6627,7 +6747,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, } return true; } - else if (STMT_VINFO_STRIDED_P (stmt_info)) + + if (memory_access_type == VMAT_ELEMENTWISE + || memory_access_type == VMAT_STRIDED_SLP) { gimple_stmt_iterator incr_gsi; bool insert_after; @@ -6694,26 +6816,23 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, int lnel = 1; tree ltype = TREE_TYPE (vectype); auto_vec<tree> dr_chain; - if (slp) + if (memory_access_type == VMAT_STRIDED_SLP) { - if (group_size < nunits - && nunits % group_size == 0) + nloads = nunits / group_size; + if (group_size < nunits) { - nloads = nunits / group_size; lnel = group_size; ltype = build_vector_type (TREE_TYPE (vectype), group_size); - ltype = build_aligned_type (ltype, - TYPE_ALIGN (TREE_TYPE (vectype))); } - else if (group_size >= nunits - && group_size % nunits == 0) + else { - nloads = 1; lnel = nunits; ltype = vectype; - ltype = build_aligned_type (ltype, - TYPE_ALIGN (TREE_TYPE (vectype))); } + ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype))); + } + if (slp) + { /* For SLP permutation support we need to load the whole group, not only the number of vector stmts the permutation result fits in. */ @@ -6845,7 +6964,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, gcc_assert (alignment_support_scheme); /* Targets with load-lane instructions must not require explicit realignment. */ - gcc_assert (!load_lanes_p + gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES || alignment_support_scheme == dr_aligned || alignment_support_scheme == dr_unaligned_supported); @@ -6980,7 +7099,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (negative) offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1); - if (load_lanes_p) + if (memory_access_type == VMAT_LOAD_STORE_LANES) aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); else aggr_type = vectype; @@ -7043,7 +7162,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, if (grouped_load || slp_perm) dr_chain.create (vec_num); - if (load_lanes_p) + if (memory_access_type == VMAT_LOAD_STORE_LANES) { tree vec_array; @@ -7313,7 +7432,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, { if (grouped_load) { - if (!load_lanes_p) + if (memory_access_type != VMAT_LOAD_STORE_LANES) vect_transform_grouped_load (stmt, dr_chain, group_size, gsi); *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); }
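As a small worked example of the grouped-SLP classification above (numbers chosen purely for illustration): with V4SI vectors (nunits == 4), a strided SLP group of size 2 satisfies nunits % group_size == 0, so get_group_load_store_type chooses VMAT_STRIDED_SLP and each vector is assembled from two contiguous two-element accesses separated by the stride. A group of size 3 does not divide nunits, so we fall back to VMAT_ELEMENTWISE and access each scalar element individually.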