The following fixes SLP loads with gaps in the case of no permutation.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk
so far.

Richard.

2017-06-18  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/81410
        * tree-vect-stmts.c (vectorizable_load): Properly adjust for
        the gap in the ! slp_perm SLP case after each group.

        * gcc.dg/vect/pr81410.c: New testcase.

Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c       (revision 250296)
--- gcc/tree-vect-stmts.c       (working copy)
*************** vectorizable_load (gimple *stmt, gimple_
*** 7118,7123 ****
--- 7118,7124 ----
      {
        first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
        group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+       int group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
        /* For SLP vectorization we directly vectorize a subchain
           without permutation.  */
        if (slp && ! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
*************** vectorizable_load (gimple *stmt, gimple_
*** 7153,7162 ****
             not only the number of vector stmts the permutation result
             fits in.  */
          if (slp_perm)
!           vec_num = (group_size * vf + nunits - 1) / nunits;
          else
!           vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!         group_gap_adj = vf * group_size - nunits * vec_num;
        }
        else
        vec_num = group_size;
--- 7154,7168 ----
             not only the number of vector stmts the permutation result
             fits in.  */
          if (slp_perm)
!           {
!             vec_num = (group_size * vf + nunits - 1) / nunits;
!             group_gap_adj = vf * group_size - nunits * vec_num;
!           }
          else
!           {
!             vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
!             group_gap_adj = group_gap;
!           }
        }
        else
        vec_num = group_size;
*************** vectorizable_load (gimple *stmt, gimple_
*** 7316,7321 ****
--- 7322,7328 ----
      aggr_type = vectype;
  
    prev_stmt_info = NULL;
+   int group_elt = 0;
    for (j = 0; j < ncopies; j++)
      {
        /* 1. Create the vector or array pointer update chain.  */
*************** vectorizable_load (gimple *stmt, gimple_
*** 7603,7612 ****
              /* Store vector loads in the corresponding SLP_NODE.  */
              if (slp && !slp_perm)
                SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
            }
          /* Bump the vector pointer to account for a gap or for excess
             elements loaded for a permuted SLP load.  */
!         if (group_gap_adj != 0)
            {
              bool ovf;
              tree bump
--- 7610,7636 ----
              /* Store vector loads in the corresponding SLP_NODE.  */
              if (slp && !slp_perm)
                SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+ 
+             /* With SLP permutation we load the gaps as well, without
+                we need to skip the gaps after we manage to fully load
+                all elements.  group_gap_adj is GROUP_GAP here.  */
+             group_elt += nunits;
+             if (group_gap_adj != 0 && ! slp_perm
+                 && group_elt == group_size - group_gap_adj)
+               {
+                 bool ovf;
+                 tree bump
+                   = wide_int_to_tree (sizetype,
+                                       wi::smul (TYPE_SIZE_UNIT (elem_type),
+                                                 group_gap_adj, &ovf));
+                 dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi,
+                                                stmt, bump);
+                 group_elt = 0;
+               }
            }
          /* Bump the vector pointer to account for a gap or for excess
             elements loaded for a permuted SLP load.  */
!         if (group_gap_adj != 0 && slp_perm)
            {
              bool ovf;
              tree bump
Index: gcc/testsuite/gcc.dg/vect/pr81410.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/pr81410.c (nonexistent)
--- gcc/testsuite/gcc.dg/vect/pr81410.c (working copy)
***************
*** 0 ****
--- 1,38 ----
+ /* { dg-do run } */
+ /* { dg-require-effective-target vect_long_long } */
+ 
+ #include "tree-vect.h"
+ 
+ typedef long long uint64_t;
+ uint64_t x[24];
+ uint64_t y[16];
+ uint64_t z[8];
+ 
+ void __attribute__((noinline)) foo()
+ {
+   for (int i = 0; i < 8; ++i)
+     {
+       y[2*i] = x[3*i];
+       y[2*i + 1] = x[3*i + 1];
+       z[i] = 1;
+     }
+ }
+ 
+ int main()
+ {
+   check_vect ();
+ 
+   for (int i = 0; i < 24; ++i)
+     {
+       x[i] = i;
+       __asm__ volatile ("" : : : "memory");
+     }
+   foo ();
+   for (int i = 0; i < 8; ++i)
+     if (y[2*i] != 3*i || y[2*i+1] != 3*i + 1)
+       __builtin_abort ();
+ 
+   return 0;
+ }
+ 
+ /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */

Reply via email to