This is the main patch in the series.  It adds a new enum and routines
for classifying how a vector load or store will be implemented.
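
As a rough stand-alone illustration of the classification (not part of
the patch; the toy_* names below are invented for the sketch), the key
decision for a strided SLP group is whether the group size divides the
number of vector lanes, as in get_group_load_store_type:

#include <stdio.h>

/* Toy mirror of the nunits % group_size test in
   get_group_load_store_type; illustrative only.  */
enum toy_access_type { TOY_CONTIGUOUS, TOY_STRIDED_SLP, TOY_ELEMENTWISE };

static enum toy_access_type
toy_classify (unsigned nunits, unsigned group_size, int strided_p)
{
  if (!strided_p)
    return TOY_CONTIGUOUS;
  /* Consecutive accesses of GROUP_SIZE elements only fill whole
     vectors when the group size divides the number of lanes.  */
  return nunits % group_size == 0 ? TOY_STRIDED_SLP : TOY_ELEMENTWISE;
}

int
main (void)
{
  printf ("%d %d %d\n",
          toy_classify (4, 2, 1),   /* TOY_STRIDED_SLP */
          toy_classify (4, 3, 1),   /* TOY_ELEMENTWISE */
          toy_classify (4, 3, 0));  /* TOY_CONTIGUOUS */
  return 0;
}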

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Thanks,
Richard


gcc/
        * tree-vectorizer.h (vect_memory_access_type): New enum.
        (_stmt_vec_info): Add a memory_access_type field.
        (STMT_VINFO_MEMORY_ACCESS_TYPE): New macro.
        (vect_model_store_cost): Take an access type instead of a boolean.
        (vect_model_load_cost): Likewise.
        * tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to
        vect_model_store_cost and vect_model_load_cost.
        * tree-vect-stmts.c (vec_load_store_type): New enum.
        (vect_model_store_cost): Take an access type instead of a
        store_lanes_p boolean.  Simplify tests.
        (vect_model_load_cost): Likewise, but for load_lanes_p.
        (get_group_load_store_type, get_load_store_type): New functions.
        (vectorizable_store): Use get_load_store_type.  Record the access
        type in STMT_VINFO_MEMORY_ACCESS_TYPE.
        (vectorizable_load): Likewise.
        (vectorizable_mask_load_store): Likewise.  Replace is_store
        variable with vls_type.

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h
+++ gcc/tree-vectorizer.h
@@ -485,6 +485,33 @@ enum slp_vect_type {
   hybrid
 };
 
+/* Describes how we're going to vectorize an individual load or store,
+   or a group of loads or stores.  */
+enum vect_memory_access_type {
+  /* A simple contiguous access.  */
+  VMAT_CONTIGUOUS,
+
+  /* A simple contiguous access in which the elements need to be permuted
+     after loading or before storing.  Only used for loop vectorization;
+     SLP uses separate permutes.  */
+  VMAT_CONTIGUOUS_PERMUTE,
+
+  /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES.  */
+  VMAT_LOAD_STORE_LANES,
+
+  /* An access in which each scalar element is loaded or stored
+     individually.  */
+  VMAT_ELEMENTWISE,
+
+  /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
+     SLP accesses.  Each unrolled iteration uses a contiguous load
+     or store for the whole group, but the groups from separate iterations
+     are combined in the same way as for VMAT_ELEMENTWISE.  */
+  VMAT_STRIDED_SLP,
+
+  /* The access uses gather loads or scatter stores.  */
+  VMAT_GATHER_SCATTER
+};
 
 typedef struct data_reference *dr_p;
 
@@ -602,6 +629,10 @@ typedef struct _stmt_vec_info {
   /* True if this is an access with loop-invariant stride.  */
   bool strided_p;
 
+  /* Classifies how the load or store is going to be implemented
+     for loop vectorization.  */
+  vect_memory_access_type memory_access_type;
+
   /* For both loads and stores.  */
   bool simd_lane_access_p;
 
@@ -659,6 +690,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
 #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
 #define STMT_VINFO_GATHER_SCATTER_P(S)    (S)->gather_scatter_p
 #define STMT_VINFO_STRIDED_P(S)                   (S)->strided_p
+#define STMT_VINFO_MEMORY_ACCESS_TYPE(S)   (S)->memory_access_type
 #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
 #define STMT_VINFO_VEC_REDUCTION_TYPE(S)   (S)->v_reduc_type
 
@@ -1006,12 +1038,12 @@ extern void free_stmt_vec_info (gimple *stmt);
 extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
                                     stmt_vector_for_cost *,
                                    stmt_vector_for_cost *);
-extern void vect_model_store_cost (stmt_vec_info, int, bool,
+extern void vect_model_store_cost (stmt_vec_info, int, vect_memory_access_type,
                                   enum vect_def_type, slp_tree,
                                   stmt_vector_for_cost *,
                                   stmt_vector_for_cost *);
-extern void vect_model_load_cost (stmt_vec_info, int, bool, slp_tree,
-                                 stmt_vector_for_cost *,
+extern void vect_model_load_cost (stmt_vec_info, int, vect_memory_access_type,
+                                 slp_tree, stmt_vector_for_cost *,
                                  stmt_vector_for_cost *);
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
                                  enum vect_cost_for_stmt, stmt_vec_info,
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c
+++ gcc/tree-vect-slp.c
@@ -1490,9 +1490,13 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
   stmt_info = vinfo_for_stmt (stmt);
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
+      vect_memory_access_type memory_access_type
+       = (STMT_VINFO_STRIDED_P (stmt_info)
+          ? VMAT_STRIDED_SLP
+          : VMAT_CONTIGUOUS);
       if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
-       vect_model_store_cost (stmt_info, ncopies_for_cost, false,
-                              vect_uninitialized_def,
+       vect_model_store_cost (stmt_info, ncopies_for_cost,
+                              memory_access_type, vect_uninitialized_def,
                               node, prologue_cost_vec, body_cost_vec);
       else
        {
@@ -1515,8 +1519,9 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
              ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
            }
          /* Record the cost for the vector loads.  */
-         vect_model_load_cost (stmt_info, ncopies_for_cost, false,
-                               node, prologue_cost_vec, body_cost_vec);
+         vect_model_load_cost (stmt_info, ncopies_for_cost,
+                               memory_access_type, node, prologue_cost_vec,
+                               body_cost_vec);
          return;
        }
     }
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c
+++ gcc/tree-vect-stmts.c
@@ -52,6 +52,14 @@ along with GCC; see the file COPYING3.  If not see
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
 
+/* Says whether a statement is a load, a store of a vectorized statement
+   result, or a store of an invariant value.  */
+enum vec_load_store_type {
+  VLS_LOAD,
+  VLS_STORE,
+  VLS_STORE_INVARIANT
+};
+
 /* Return the vectorized type for the given statement.  */
 
 tree
@@ -873,8 +881,8 @@ vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
 
 void
 vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
-                      bool store_lanes_p, enum vect_def_type dt,
-                      slp_tree slp_node,
+                      vect_memory_access_type memory_access_type,
+                      enum vect_def_type dt, slp_tree slp_node,
                       stmt_vector_for_cost *prologue_cost_vec,
                       stmt_vector_for_cost *body_cost_vec)
 {
@@ -903,14 +911,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
   /* We assume that the cost of a single store-lanes instruction is
      equivalent to the cost of GROUP_SIZE separate stores.  If a grouped
      access is instead being provided by a permute-and-store operation,
-     include the cost of the permutes.
-
-     For SLP, the caller has already counted the permutation, if any.  */
-  if (grouped_access_p
-      && first_stmt_p
-      && !store_lanes_p
-      && !STMT_VINFO_STRIDED_P (stmt_info)
-      && !slp_node)
+     include the cost of the permutes.  */
+  if (first_stmt_p
+      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
     {
       /* Uses a high and low interleave or shuffle operations for each
         needed permute.  */
@@ -927,17 +930,16 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
 
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   /* Costs of the stores.  */
-  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
-    {
-      /* N scalar stores plus extracting the elements.  */
-      inside_cost += record_stmt_cost (body_cost_vec,
-                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
-                                      scalar_store, stmt_info, 0, vect_body);
-    }
+  if (memory_access_type == VMAT_ELEMENTWISE)
+    /* N scalar stores plus extracting the elements.  */
+    inside_cost += record_stmt_cost (body_cost_vec,
+                                    ncopies * TYPE_VECTOR_SUBPARTS (vectype),
+                                    scalar_store, stmt_info, 0, vect_body);
   else
     vect_get_store_cost (dr, ncopies, &inside_cost, body_cost_vec);
 
-  if (STMT_VINFO_STRIDED_P (stmt_info))
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     inside_cost += record_stmt_cost (body_cost_vec,
                                     ncopies * TYPE_VECTOR_SUBPARTS (vectype),
                                     vec_to_scalar, stmt_info, 0, vect_body);
@@ -1011,7 +1013,8 @@ vect_get_store_cost (struct data_reference *dr, int ncopies,
 
 void
 vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
-                     bool load_lanes_p, slp_tree slp_node,
+                     vect_memory_access_type memory_access_type,
+                     slp_tree slp_node,
                      stmt_vector_for_cost *prologue_cost_vec,
                      stmt_vector_for_cost *body_cost_vec)
 {
@@ -1036,14 +1039,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
   /* We assume that the cost of a single load-lanes instruction is
      equivalent to the cost of GROUP_SIZE separate loads.  If a grouped
      access is instead being provided by a load-and-permute operation,
-     include the cost of the permutes.
-
-     For SLP, the caller has already counted the permutation, if any.  */
-  if (grouped_access_p
-      && first_stmt_p
-      && !load_lanes_p
-      && !STMT_VINFO_STRIDED_P (stmt_info)
-      && !slp_node)
+     include the cost of the permutes.  */
+  if (first_stmt_p
+      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
     {
       /* Uses an even and odd extract operations or shuffle operations
         for each needed permute.  */
@@ -1059,7 +1057,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
     }
 
   /* The loads themselves.  */
-  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
+  if (memory_access_type == VMAT_ELEMENTWISE)
     {
       /* N scalar loads plus gathering them into a vector.  */
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
@@ -1071,7 +1069,8 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
     vect_get_load_cost (dr, ncopies, first_stmt_p,
                        &inside_cost, &prologue_cost, 
                        prologue_cost_vec, body_cost_vec, true);
-  if (STMT_VINFO_STRIDED_P (stmt_info))
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct,
                                     stmt_info, 0, vect_body);
 
@@ -1674,6 +1673,209 @@ static tree permute_vec_elements (tree, tree, tree, gimple *,
                                  gimple_stmt_iterator *);
 
 
+/* A subroutine of get_load_store_type, with a subset of the same
+   arguments.  Handle the case where STMT is part of a grouped load
+   or store.
+
+   For stores, the statements in the group are all consecutive
+   and there is no gap at the end.  For loads, the statements in the
+   group might not be consecutive; there can be gaps between statements
+   as well as at the end.  */
+
+static bool
+get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
+                          vec_load_store_type vls_type,
+                          vect_memory_access_type *memory_access_type)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  vec_info *vinfo = stmt_info->vinfo;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
+  gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+  bool single_element_p = (stmt == first_stmt
+                          && !GROUP_NEXT_ELEMENT (stmt_info));
+  unsigned HOST_WIDE_INT gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
+  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
+
+  /* True if the vectorized statements would access beyond the last
+     statement in the group.  */
+  bool overrun_p = false;
+
+  /* True if we can cope with such overrun by peeling for gaps, so that
+     there is at least one final scalar iteration after the vector loop.  */
+  bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner);
+
+  /* There can only be a gap at the end of the group if the stride is
+     known at compile time.  */
+  gcc_assert (!STMT_VINFO_STRIDED_P (stmt_info) || gap == 0);
+
+  /* Stores can't yet have gaps.  */
+  gcc_assert (slp || vls_type == VLS_LOAD || gap == 0);
+
+  if (slp)
+    {
+      if (STMT_VINFO_STRIDED_P (stmt_info))
+       {
+         /* Try to use consecutive accesses of GROUP_SIZE elements,
+            separated by the stride, until we have a complete vector.
+            Fall back to scalar accesses if that isn't possible.  */
+         if (nunits % group_size == 0)
+           *memory_access_type = VMAT_STRIDED_SLP;
+         else
+           *memory_access_type = VMAT_ELEMENTWISE;
+       }
+      else
+       {
+         overrun_p = loop_vinfo && gap != 0;
+         if (overrun_p && vls_type != VLS_LOAD)
+           {
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "Grouped store with gaps requires"
+                              " non-consecutive accesses\n");
+             return false;
+           }
+         if (overrun_p && !can_overrun_p)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "Peeling for outer loop is not supported\n");
+             return false;
+           }
+         *memory_access_type = VMAT_CONTIGUOUS;
+       }
+    }
+  else
+    {
+      /* We can always handle this case using elementwise accesses,
+        but see if something more efficient is available.  */
+      *memory_access_type = VMAT_ELEMENTWISE;
+
+      /* If there is a gap at the end of the group then these optimizations
+        would access excess elements in the last iteration.  */
+      bool would_overrun_p = (gap != 0);
+      if (!STMT_VINFO_STRIDED_P (stmt_info)
+         && (can_overrun_p || !would_overrun_p))
+       {
+         /* First try using LOAD/STORE_LANES.  */
+         if (vls_type == VLS_LOAD
+             ? vect_load_lanes_supported (vectype, group_size)
+             : vect_store_lanes_supported (vectype, group_size))
+           {
+             *memory_access_type = VMAT_LOAD_STORE_LANES;
+             overrun_p = would_overrun_p;
+           }
+
+         /* If that fails, try using permuting loads.  */
+         if (*memory_access_type == VMAT_ELEMENTWISE
+             && (vls_type == VLS_LOAD
+                 ? vect_grouped_load_supported (vectype, single_element_p,
+                                                group_size)
+                 : vect_grouped_store_supported (vectype, group_size)))
+           {
+             *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
+             overrun_p = would_overrun_p;
+           }
+       }
+    }
+
+  if (vls_type != VLS_LOAD && first_stmt == stmt)
+    {
+      /* STMT is the leader of the group. Check the operands of all the
+        stmts of the group.  */
+      gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
+      while (next_stmt)
+       {
+         gcc_assert (gimple_assign_single_p (next_stmt));
+         tree op = gimple_assign_rhs1 (next_stmt);
+         gimple *def_stmt;
+         enum vect_def_type dt;
+         if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "use not simple.\n");
+             return false;
+           }
+         next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
+       }
+    }
+
+  if (overrun_p)
+    {
+      gcc_assert (can_overrun_p);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "Data access with gaps requires scalar "
+                        "epilogue loop\n");
+      LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
+    }
+
+  return true;
+}
+
+/* Analyze load or store statement STMT of type VLS_TYPE.  Return true
+   if there is a memory access type that the vectorized form can use,
+   storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
+   or scatters, fill in GS_INFO accordingly.
+
+   SLP says whether we're performing SLP rather than loop vectorization.
+   VECTYPE is the vector type that the vectorized statements will use.  */
+
+static bool
+get_load_store_type (gimple *stmt, tree vectype, bool slp,
+                    vec_load_store_type vls_type,
+                    vect_memory_access_type *memory_access_type,
+                    gather_scatter_info *gs_info)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  vec_info *vinfo = stmt_info->vinfo;
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+    {
+      *memory_access_type = VMAT_GATHER_SCATTER;
+      gimple *def_stmt;
+      if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
+       gcc_unreachable ();
+      else if (!vect_is_simple_use (gs_info->offset, vinfo, &def_stmt,
+                                   &gs_info->offset_dt,
+                                   &gs_info->offset_vectype))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "%s index use not simple.\n",
+                            vls_type == VLS_LOAD ? "gather" : "scatter");
+         return false;
+       }
+    }
+  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      if (!get_group_load_store_type (stmt, vectype, slp, vls_type,
+                                     memory_access_type))
+       return false;
+    }
+  else if (STMT_VINFO_STRIDED_P (stmt_info))
+    {
+      gcc_assert (!slp);
+      *memory_access_type = VMAT_ELEMENTWISE;
+    }
+  else
+    *memory_access_type = VMAT_CONTIGUOUS;
+
+  /* FIXME: At the moment the cost model seems to underestimate the
+     cost of using elementwise accesses.  This check preserves the
+     traditional behavior until that can be fixed.  */
+  if (*memory_access_type == VMAT_ELEMENTWISE
+      && !STMT_VINFO_STRIDED_P (stmt_info))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not falling back to elementwise accesses\n");
+      return false;
+    }
+  return true;
+}
+
 /* Function vectorizable_mask_load_store.
 
    Check if STMT performs a conditional load or store that can be vectorized.
@@ -1705,7 +1907,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
   int i, j;
   bool inv_p;
   gather_scatter_info gs_info;
-  bool is_store;
+  vec_load_store_type vls_type;
   tree mask;
   gimple *def_stmt;
   enum vect_def_type dt;
@@ -1716,7 +1918,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
   ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
   gcc_assert (ncopies >= 1);
 
-  is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE;
   mask = gimple_call_arg (stmt, 2);
 
   if (TREE_CODE (TREE_TYPE (mask)) != BOOLEAN_TYPE)
@@ -1743,12 +1944,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
 
   elem_type = TREE_TYPE (vectype);
 
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  if (STMT_VINFO_STRIDED_P (stmt_info))
-    return false;
-
   if (TREE_CODE (mask) != SSA_NAME)
     return false;
 
@@ -1762,27 +1957,26 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
       || TYPE_VECTOR_SUBPARTS (mask_vectype) != TYPE_VECTOR_SUBPARTS (vectype))
     return false;
 
-  if (is_store)
+  if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
     {
       tree rhs = gimple_call_arg (stmt, 3);
       if (!vect_is_simple_use (rhs, loop_vinfo, &def_stmt, &dt, &rhs_vectype))
        return false;
+      if (dt == vect_constant_def || dt == vect_external_def)
+       vls_type = VLS_STORE_INVARIANT;
+      else
+       vls_type = VLS_STORE;
     }
+  else
+    vls_type = VLS_LOAD;
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-    {
-      gimple *def_stmt;
-      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
-       gcc_unreachable ();
-      if (!vect_is_simple_use (gs_info.offset, loop_vinfo, &def_stmt,
-                              &gs_info.offset_dt, &gs_info.offset_vectype))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "gather index use not simple.");
-         return false;
-       }
+  vect_memory_access_type memory_access_type;
+  if (!get_load_store_type (stmt, vectype, false, vls_type,
+                           &memory_access_type, &gs_info))
+    return false;
 
+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
       tree masktype
        = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
@@ -1794,6 +1988,14 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
          return false;
        }
     }
+  else if (memory_access_type != VMAT_CONTIGUOUS)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "unsupported access type for masked %s\n",
+                        vls_type == VLS_LOAD ? "load" : "store");
+      return false;
+    }
   else if (tree_int_cst_compare (nested_in_vect_loop
                                 ? STMT_VINFO_DR_STEP (stmt_info)
                                 : DR_STEP (dr), size_zero_node) <= 0)
@@ -1801,25 +2003,28 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
   else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
           || !can_vec_mask_load_store_p (TYPE_MODE (vectype),
                                          TYPE_MODE (mask_vectype),
-                                         !is_store)
+                                         vls_type == VLS_LOAD)
           || (rhs_vectype
               && !useless_type_conversion_p (vectype, rhs_vectype)))
     return false;
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
       STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
-      if (is_store)
-       vect_model_store_cost (stmt_info, ncopies, false, dt,
-                              NULL, NULL, NULL);
+      if (vls_type == VLS_LOAD)
+       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
+                             NULL, NULL, NULL);
       else
-       vect_model_load_cost (stmt_info, ncopies, false, NULL, NULL, NULL);
+       vect_model_store_cost (stmt_info, ncopies, memory_access_type,
+                              dt, NULL, NULL, NULL);
       return true;
     }
+  gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
 
   /** Transform.  **/
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+  if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
@@ -1993,7 +2198,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
       gsi_replace (gsi, new_stmt, true);
       return true;
     }
-  else if (is_store)
+  else if (vls_type != VLS_LOAD)
     {
       tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
       prev_stmt_info = NULL;
@@ -2102,7 +2307,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
        }
     }
 
-  if (!is_store)
+  if (vls_type == VLS_LOAD)
     {
       /* Ensure that even with -fno-tree-dce the scalar MASK_LOAD is removed
         from the IL.  */
@@ -5188,9 +5393,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gimple *ptr_incr = NULL;
   int ncopies;
   int j;
-  gimple *next_stmt, *first_stmt = NULL;
-  bool grouped_store = false;
-  bool store_lanes_p = false;
+  gimple *next_stmt, *first_stmt;
+  bool grouped_store;
   unsigned int group_size, i;
   vec<tree> dr_chain = vNULL;
   vec<tree> oprnds = vNULL;
@@ -5207,6 +5411,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gather_scatter_info gs_info;
   enum vect_def_type scatter_src_dt = vect_unknown_def_type;
   gimple *new_stmt;
+  vec_load_store_type vls_type;
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
     return false;
@@ -5274,6 +5479,11 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       return false;
     }
 
+  if (dt == vect_constant_def || dt == vect_external_def)
+    vls_type = VLS_STORE_INVARIANT;
+  else
+    vls_type = VLS_STORE;
+
   if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))
     return false;
 
@@ -5303,7 +5513,6 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
        }
       if (negative)
        {
-         gcc_assert (!grouped_store);
          alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
          if (alignment_support_scheme != dr_aligned
              && alignment_support_scheme != dr_unaligned_supported)
@@ -5325,80 +5534,31 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
        }
     }
 
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    {
-      grouped_store = true;
-      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
-      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
-       {
-         if (vect_store_lanes_supported (vectype, group_size))
-           store_lanes_p = true;
-         else if (!vect_grouped_store_supported (vectype, group_size))
-           return false;
-       }
-
-      if (STMT_VINFO_STRIDED_P (stmt_info)
-         && slp
-         && (group_size > nunits
-             || nunits % group_size != 0))
-       {
-         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "unhandled strided group store\n");
-         return false;
-       }
-
-      if (first_stmt == stmt)
-       {
-          /* STMT is the leader of the group. Check the operands of all the
-             stmts of the group.  */
-          next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
-          while (next_stmt)
-            {
-             gcc_assert (gimple_assign_single_p (next_stmt));
-             op = gimple_assign_rhs1 (next_stmt);
-              if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
-                {
-                  if (dump_enabled_p ())
-                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                     "use not simple.\n");
-                  return false;
-                }
-              next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
-            }
-        }
-    }
-
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-    {
-      gimple *def_stmt;
-      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
-       gcc_unreachable ();
-      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
-                              &gs_info.offset_dt, &gs_info.offset_vectype))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "scatter index use not simple.");
-         return false;
-       }
-    }
+  vect_memory_access_type memory_access_type;
+  if (!get_load_store_type (stmt, vectype, slp, vls_type,
+                           &memory_access_type, &gs_info))
+    return false;
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      if (!slp)
+       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
       /* The SLP costs are calculated during SLP analysis.  */
       if (!PURE_SLP_STMT (stmt_info))
-       vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt,
+       vect_model_store_cost (stmt_info, ncopies, memory_access_type, dt,
                               NULL, NULL, NULL);
       return true;
     }
+  if (!slp)
+    gcc_assert (memory_access_type
+               == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
 
   /** Transform.  **/
 
   ensure_base_align (stmt_info, dr);
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+  if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
@@ -5538,8 +5698,10 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       return true;
     }
 
+  grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info);
   if (grouped_store)
     {
+      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
       first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
       group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
 
@@ -5585,7 +5747,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
     dump_printf_loc (MSG_NOTE, vect_location,
                      "transform store. ncopies = %d\n", ncopies);
 
-  if (STMT_VINFO_STRIDED_P (stmt_info))
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     {
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
@@ -5756,14 +5919,14 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gcc_assert (alignment_support_scheme);
   /* Targets with store-lane instructions must not require explicit
      realignment.  */
-  gcc_assert (!store_lanes_p
+  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
              || alignment_support_scheme == dr_aligned
              || alignment_support_scheme == dr_unaligned_supported);
 
   if (negative)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  if (store_lanes_p)
+  if (memory_access_type == VMAT_LOAD_STORE_LANES)
     aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
   else
     aggr_type = vectype;
@@ -5901,7 +6064,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
                                           TYPE_SIZE_UNIT (aggr_type));
        }
 
-      if (store_lanes_p)
+      if (memory_access_type == VMAT_LOAD_STORE_LANES)
        {
          tree vec_array;
 
@@ -6185,7 +6348,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gphi *phi = NULL;
   vec<tree> dr_chain = vNULL;
   bool grouped_load = false;
-  bool load_lanes_p = false;
   gimple *first_stmt;
   gimple *first_stmt_for_drptr = NULL;
   bool inv_p;
@@ -6294,48 +6456,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
     {
       grouped_load = true;
       /* FORNOW */
-      gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
+      gcc_assert (!nested_in_vect_loop);
+      gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
       group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-      bool single_element_p = (first_stmt == stmt
-                              && !GROUP_NEXT_ELEMENT (stmt_info));
-
-      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
-       {
-         if (vect_load_lanes_supported (vectype, group_size))
-           load_lanes_p = true;
-         else if (!vect_grouped_load_supported (vectype, single_element_p,
-                                                group_size))
-           return false;
-       }
-
-      if (single_element_p)
-       {
-         /* Single-element interleaving requires peeling for gaps.  */
-         gcc_assert (GROUP_GAP (stmt_info));
-       }
-
-      /* If there is a gap in the end of the group then we access excess
-        elements in the last iteration and thus need to peel that off.  */
-      if (loop_vinfo
-         && ! STMT_VINFO_STRIDED_P (stmt_info)
-         && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Data access with gaps requires scalar "
-                            "epilogue loop\n");
-         if (loop->inner)
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "Peeling for outer loop is not supported\n");
-             return false;
-           }
-
-         LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
-       }
 
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
        slp_perm = true;
@@ -6381,24 +6506,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
        }
     }
 
+  vect_memory_access_type memory_access_type;
+  if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD,
+                           &memory_access_type, &gs_info))
+    return false;
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-    {
-      gimple *def_stmt;
-      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
-       gcc_unreachable ();
-      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
-                              &gs_info.offset_dt, &gs_info.offset_vectype))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "gather index use not simple.\n");
-         return false;
-       }
-    }
-  else if (STMT_VINFO_STRIDED_P (stmt_info))
-    ;
-  else
+  if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)
+      && !STMT_VINFO_STRIDED_P (stmt_info))
     {
       negative = tree_int_cst_compare (nested_in_vect_loop
                                       ? STMT_VINFO_DR_STEP (stmt_info)
@@ -6444,14 +6558,20 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      if (!slp)
+       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
       /* The SLP costs are calculated during SLP analysis.  */
       if (!PURE_SLP_STMT (stmt_info))
-       vect_model_load_cost (stmt_info, ncopies, load_lanes_p,
+       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
                              NULL, NULL, NULL);
       return true;
     }
 
+  if (!slp)
+    gcc_assert (memory_access_type
+               == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
+
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                      "transform load. ncopies = %d\n", ncopies);
@@ -6460,7 +6580,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
   ensure_base_align (stmt_info, dr);
 
-  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+  if (memory_access_type == VMAT_GATHER_SCATTER)
     {
       tree vec_oprnd0 = NULL_TREE, op;
       tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
@@ -6627,7 +6747,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
        }
       return true;
     }
-  else if (STMT_VINFO_STRIDED_P (stmt_info))
+
+  if (memory_access_type == VMAT_ELEMENTWISE
+      || memory_access_type == VMAT_STRIDED_SLP)
     {
       gimple_stmt_iterator incr_gsi;
       bool insert_after;
@@ -6694,26 +6816,23 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       int lnel = 1;
       tree ltype = TREE_TYPE (vectype);
       auto_vec<tree> dr_chain;
-      if (slp)
+      if (memory_access_type == VMAT_STRIDED_SLP)
        {
-         if (group_size < nunits
-             && nunits % group_size == 0)
+         nloads = nunits / group_size;
+         if (group_size < nunits)
            {
-             nloads = nunits / group_size;
              lnel = group_size;
              ltype = build_vector_type (TREE_TYPE (vectype), group_size);
-             ltype = build_aligned_type (ltype,
-                                         TYPE_ALIGN (TREE_TYPE (vectype)));
            }
-         else if (group_size >= nunits
-                  && group_size % nunits == 0)
+         else
            {
-             nloads = 1;
              lnel = nunits;
              ltype = vectype;
-             ltype = build_aligned_type (ltype,
-                                         TYPE_ALIGN (TREE_TYPE (vectype)));
            }
+         ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
+       }
+      if (slp)
+       {
          /* For SLP permutation support we need to load the whole group,
             not only the number of vector stmts the permutation result
             fits in.  */
@@ -6845,7 +6964,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   gcc_assert (alignment_support_scheme);
   /* Targets with load-lane instructions must not require explicit
      realignment.  */
-  gcc_assert (!load_lanes_p
+  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
              || alignment_support_scheme == dr_aligned
              || alignment_support_scheme == dr_unaligned_supported);
 
@@ -6980,7 +7099,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   if (negative)
     offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
 
-  if (load_lanes_p)
+  if (memory_access_type == VMAT_LOAD_STORE_LANES)
     aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
   else
     aggr_type = vectype;
@@ -7043,7 +7162,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (grouped_load || slp_perm)
        dr_chain.create (vec_num);
 
-      if (load_lanes_p)
+      if (memory_access_type == VMAT_LOAD_STORE_LANES)
        {
          tree vec_array;
 
@@ -7313,7 +7432,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
         {
           if (grouped_load)
            {
-             if (!load_lanes_p)
+             if (memory_access_type != VMAT_LOAD_STORE_LANES)
                vect_transform_grouped_load (stmt, dr_chain, group_size, gsi);
              *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
            }
