The following implements VMAT_ELEMENTWISE for grouped loads, in
particular so it can serve as a fallback for unhandled load
permutations, since with elementwise accesses it is trivial to load
the elements in the correct order.
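
For illustration, a hypothetical reduced loop of the kind this
fallback is meant to serve (not the PR testcase, and whether the
rotate below is actually left unhandled depends on the target):
a three-element grouped load whose lanes are consumed in rotated
order.  When the permutation cannot be code generated, the load can
now fall back to VMAT_ELEMENTWISE, with the permutation applied
while computing each element's offset.

void
foo (double *restrict out, const double *restrict in, int n)
{
  for (int i = 0; i < n; ++i)
    {
      /* Grouped load of in[3*i .. 3*i+2], lanes used in rotated
	 order.  */
      out[3*i]     = in[3*i + 1];
      out[3*i + 1] = in[3*i + 2];
      out[3*i + 2] = in[3*i];
    }
}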
Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.
PR tree-optimization/116816
* tree-vect-stmts.cc (get_load_store_type): Allow multi-lane
single-element interleaving to fall back to VMAT_ELEMENTWISE.
Fall back to VMAT_ELEMENTWISE when we cannot handle a load
permutation.
(vectorizable_load): Do not check a load permutation
for VMAT_ELEMENTWISE. Handle grouped loads with
VMAT_ELEMENTWISE and directly apply a load permutation.
---
gcc/tree-vect-stmts.cc | 81 ++++++++++++++++++------------------------
1 file changed, 34 insertions(+), 47 deletions(-)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 841733349b5..bdedb25efe9 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2134,41 +2134,26 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
|| *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
&& maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
{
- if (SLP_TREE_LANES (slp_node) == 1)
- {
- *memory_access_type = VMAT_ELEMENTWISE;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads, using "
- "elementwise access\n");
- }
- else
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving not supported "
- "for not adjacent vector loads\n");
- return false;
- }
+ *memory_access_type = VMAT_ELEMENTWISE;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving not supported "
+ "for not adjacent vector loads, using "
+ "elementwise access\n");
}
- /* For single-element interleaving also fall back to elementwise
- access in case we did not lower a permutation and cannot
- code generate it. */
+ /* Also fall back to elementwise access in case we did not lower a
+ permutation and cannot code generate it. */
if (loop_vinfo
- && single_element_p
- && SLP_TREE_LANES (slp_node) == 1
- && (*memory_access_type == VMAT_CONTIGUOUS
- || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ && *memory_access_type != VMAT_ELEMENTWISE
&& SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
&& !perm_ok)
{
*memory_access_type = VMAT_ELEMENTWISE;
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "single-element interleaving permutation not "
- "supported, using elementwise access\n");
+ "permutation not supported, using elementwise "
+ "access\n");
}
overrun_p = (loop_vinfo && gap != 0
@@ -2498,9 +2483,9 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
traditional behavior until that can be fixed. */
if (*memory_access_type == VMAT_ELEMENTWISE
&& !STMT_VINFO_STRIDED_P (first_stmt_info)
- && !(stmt_info == DR_GROUP_FIRST_ELEMENT (stmt_info)
- && !DR_GROUP_NEXT_ELEMENT (stmt_info)
- && !pow2p_hwi (DR_GROUP_SIZE (stmt_info))))
+ && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
+ && single_element_p
+ && !pow2p_hwi (group_size)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9480,11 +9465,11 @@ vectorizable_load (vec_info *vinfo,
/* ??? The following checks should really be part of
get_load_store_type. */
if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
- && !((memory_access_type == VMAT_ELEMENTWISE
- || mat_gather_scatter_p (memory_access_type))
- && SLP_TREE_LANES (slp_node) == 1
- && (!grouped_load
- || !DR_GROUP_NEXT_ELEMENT (first_stmt_info))))
+ && !(memory_access_type == VMAT_ELEMENTWISE
+ || (mat_gather_scatter_p (memory_access_type)
+ && SLP_TREE_LANES (slp_node) == 1
+ && (!grouped_load
+ || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
{
slp_perm = true;
@@ -9727,28 +9712,24 @@ vectorizable_load (vec_info *vinfo,
{
first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
+ ref_type = get_group_alias_ptr_type (first_stmt_info);
}
else
{
first_stmt_info = stmt_info;
first_dr_info = dr_info;
+ ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
}
- if (grouped_load && memory_access_type == VMAT_STRIDED_SLP)
+ if (grouped_load)
{
- group_size = DR_GROUP_SIZE (first_stmt_info);
- ref_type = get_group_alias_ptr_type (first_stmt_info);
+ if (memory_access_type == VMAT_STRIDED_SLP)
+ group_size = DR_GROUP_SIZE (first_stmt_info);
+ else /* VMAT_ELEMENTWISE */
+ group_size = SLP_TREE_LANES (slp_node);
}
else
- {
- if (grouped_load)
- cst_offset
- = (tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)))
- * vect_get_place_in_interleaving_chain (stmt_info,
- first_stmt_info));
- group_size = 1;
- ref_type = reference_alias_ptr_type (DR_REF (dr_info->dr));
- }
+ group_size = 1;
if (!costing_p)
{
@@ -9887,6 +9868,7 @@ vectorizable_load (vec_info *vinfo,
int ncopies;
if (slp_perm)
{
+ gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
/* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
variable VF. */
unsigned int const_vf = vf.to_constant ();
@@ -9922,8 +9904,13 @@ vectorizable_load (vec_info *vinfo,
slp_node, 0, vect_body);
continue;
}
+ unsigned int load_el = group_el;
+ /* For elementwise accesses apply a load permutation directly. */
+ if (memory_access_type == VMAT_ELEMENTWISE
+ && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+ load_el = SLP_TREE_LOAD_PERMUTATION (slp_node)[group_el];
tree this_off = build_int_cst (TREE_TYPE (alias_off),
- group_el * elsz + cst_offset);
+ load_el * elsz + cst_offset);
tree data_ref = build2 (MEM_REF, ltype, running_off, this_off);
vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
new_temp = make_ssa_name (ltype);
--
2.51.0