Need to think more about that one. I'm leaving the testcase in; it compiles somewhat slowly (6s with a cc1 built at -O0 with checking enabled) but is still reasonable due to the other fix for said PR.
Applied. Richard. 2019-07-19 Richard Biener <rguent...@suse.de> PR tree-optimization/91207 Revert 2019-07-17 Richard Biener <rguent...@suse.de> PR tree-optimization/91178 * tree-vect-stmts.c (get_group_load_store_type): For SLP loads with a gap larger than the vector size always use VMAT_STRIDED_SLP. (vectorizable_load): For VMAT_STRIDED_SLP with a permutation avoid loading vectors that are only contained in the gap and thus are not needed. * gcc.dg/torture/pr91207.c: New testcase. Index: gcc/tree-vect-stmts.c =================================================================== --- gcc/tree-vect-stmts.c (revision 273590) +++ gcc/tree-vect-stmts.c (working copy) @@ -2267,14 +2267,6 @@ get_group_load_store_type (stmt_vec_info / vect_get_scalar_dr_size (first_dr_info))) overrun_p = false; - /* If the gap at the end of the group exceeds a whole vector - in size use the strided SLP code which can skip code-generation - for the gap. */ - if (vls_type == VLS_LOAD && known_gt (gap, nunits)) - *memory_access_type = VMAT_STRIDED_SLP; - else - *memory_access_type = VMAT_CONTIGUOUS; - /* If the gap splits the vector in half and the target can do half-vector operations avoid the epilogue peeling by simply loading half of the vector only. Usually @@ -2282,8 +2274,7 @@ get_group_load_store_type (stmt_vec_info dr_alignment_support alignment_support_scheme; scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); machine_mode vmode; - if (*memory_access_type == VMAT_CONTIGUOUS - && overrun_p + if (overrun_p && !masked_p && (((alignment_support_scheme = vect_supportable_dr_alignment (first_dr_info, false))) @@ -2306,6 +2297,7 @@ get_group_load_store_type (stmt_vec_info "Peeling for outer loop is not supported\n"); return false; } + *memory_access_type = VMAT_CONTIGUOUS; } } else @@ -8740,7 +8732,6 @@ vectorizable_load (stmt_vec_info stmt_in /* Checked by get_load_store_type. 
*/ unsigned int const_nunits = nunits.to_constant (); unsigned HOST_WIDE_INT cst_offset = 0; - unsigned int group_gap = 0; gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); gcc_assert (!nested_in_vect_loop); @@ -8758,7 +8749,6 @@ vectorizable_load (stmt_vec_info stmt_in if (slp && grouped_load) { group_size = DR_GROUP_SIZE (first_stmt_info); - group_gap = DR_GROUP_GAP (first_stmt_info); ref_type = get_group_alias_ptr_type (first_stmt_info); } else @@ -8902,14 +8892,6 @@ vectorizable_load (stmt_vec_info stmt_in if (nloads > 1) vec_alloc (v, nloads); stmt_vec_info new_stmt_info = NULL; - if (slp && slp_perm - && (group_el % group_size) > group_size - group_gap - && (group_el % group_size) + nloads * lnel < group_size) - { - dr_chain.quick_push (NULL_TREE); - group_el += nloads * lnel; - continue; - } for (i = 0; i < nloads; i++) { tree this_off = build_int_cst (TREE_TYPE (alias_off), Index: gcc/testsuite/gcc.dg/torture/pr91207.c =================================================================== --- gcc/testsuite/gcc.dg/torture/pr91207.c (nonexistent) +++ gcc/testsuite/gcc.dg/torture/pr91207.c (working copy) @@ -0,0 +1,25 @@ +/* { dg-do run } */ + +long long a; +int b[92][32]; +unsigned int c, d; + +void e(long long *f, int p2) { *f = p2; } + +int main() +{ + for (int i = 6; i <= 20; d = i++) + for (int j = 6; j <= 91; j++) { + for (int k = 16; k <= 31;k++) + b[j][k] ^= 7; + c *= d; + } + + for (int i = 0; i < 21; ++i) + for (int j = 0; j < 32; ++j) + e(&a, b[i][j]); + + if (a != 7) + __builtin_abort (); + return 0; +}