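The following patch is the store-side counterpart of the strided grouped load fix I made recently: it removes the remaining restrictions on strided grouped stores, so all the previously missing cases are now handled. The testcase requires the previous dependence fix.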
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. 2016-06-14 Richard Biener <rguent...@suse.de> * tree-vect-stmts.c (vectorizable_store): Remove strided grouped store restrictions. * gcc.dg/vect/slp-45.c: New testcase. Index: gcc/tree-vect-stmts.c =================================================================== *** gcc/tree-vect-stmts.c (revision 237428) --- gcc/tree-vect-stmts.c (working copy) *************** vectorizable_store (gimple *stmt, gimple *** 5234,5239 **** --- 5297,5303 ---- enum vect_def_type scatter_idx_dt = vect_unknown_def_type; enum vect_def_type scatter_src_dt = vect_unknown_def_type; gimple *new_stmt; + int vf; if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) return false; *************** vectorizable_store (gimple *stmt, gimple *** 5270,5276 **** unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype); if (loop_vinfo) ! loop = LOOP_VINFO_LOOP (loop_vinfo); /* Multiple types in SLP are handled by creating the appropriate number of vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in --- 5334,5345 ---- unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype); if (loop_vinfo) ! { ! loop = LOOP_VINFO_LOOP (loop_vinfo); ! vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); ! } ! else ! vf = 1; /* Multiple types in SLP are handled by creating the appropriate number of vectorized stmts for each SLP node. Hence, NCOPIES is always 1 in *************** vectorizable_store (gimple *stmt, gimple *** 5365,5380 **** return false; } - if (STMT_VINFO_STRIDED_P (stmt_info) - && slp - && (group_size > nunits - || nunits % group_size != 0)) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "unhandled strided group store\n"); - return false; - } - if (first_stmt == stmt) { /* STMT is the leader of the group. Check the operands of all the --- 5434,5439 ---- *************** vectorizable_store (gimple *stmt, gimple *** 5653,5675 **** */ unsigned nstores = nunits; tree ltype = elem_type; if (slp) { ! 
nstores = nunits / group_size; ! if (group_size < nunits) ! ltype = build_vector_type (elem_type, group_size); ! else ! ltype = vectype; ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type)); ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); - group_size = 1; } ivstep = stride_step; ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep, ! build_int_cst (TREE_TYPE (ivstep), ! ncopies * nstores)); standard_iv_increment_position (loop, &incr_gsi, &insert_after); --- 5712,5742 ---- */ unsigned nstores = nunits; + unsigned lnel = 1; tree ltype = elem_type; if (slp) { ! if (group_size < nunits ! && nunits % group_size == 0) ! { ! nstores = nunits / group_size; ! lnel = group_size; ! ltype = build_vector_type (elem_type, group_size); ! } ! else if (group_size >= nunits ! && group_size % nunits == 0) ! { ! nstores = 1; ! lnel = nunits; ! ltype = vectype; ! } ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type)); ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); } ivstep = stride_step; ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep, ! build_int_cst (TREE_TYPE (ivstep), vf)); standard_iv_increment_position (loop, &incr_gsi, &insert_after); *************** vectorizable_store (gimple *stmt, gimple *** 5700,5705 **** --- 5767,5775 ---- vect_finish_stmt_generation (stmt, incr, gsi); running_off = newoff; } + unsigned int group_el = 0; + unsigned HOST_WIDE_INT + elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); for (j = 0; j < ncopies; j++) { /* We've set op and dt above, from gimple_assign_rhs1(stmt), *************** vectorizable_store (gimple *stmt, gimple *** 5745,5763 **** NULL_TREE, true, GSI_SAME_STMT); newref = build2 (MEM_REF, ltype, ! running_off, alias_off); /* And store it to *running_off. */ assign = gimple_build_assign (newref, elem); vect_finish_stmt_generation (stmt, assign, gsi); ! newoff = copy_ssa_name (running_off, NULL); ! incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, ! running_off, stride_step); ! 
vect_finish_stmt_generation (stmt, incr, gsi); ! running_off = newoff; if (g == group_size - 1 && !slp) { --- 5815,5841 ---- NULL_TREE, true, GSI_SAME_STMT); + tree this_off = build_int_cst (TREE_TYPE (alias_off), + group_el * elsz); newref = build2 (MEM_REF, ltype, ! running_off, this_off); /* And store it to *running_off. */ assign = gimple_build_assign (newref, elem); vect_finish_stmt_generation (stmt, assign, gsi); ! group_el += lnel; ! if (! slp ! || group_el == group_size) ! { ! newoff = copy_ssa_name (running_off, NULL); ! incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR, ! running_off, stride_step); ! vect_finish_stmt_generation (stmt, incr, gsi); ! running_off = newoff; ! group_el = 0; ! } if (g == group_size - 1 && !slp) { *************** vectorizable_store (gimple *stmt, gimple *** 5771,5776 **** --- 5849,5856 ---- } } next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); + if (slp) + break; } return true; } Index: gcc/testsuite/gcc.dg/vect/slp-45.c =================================================================== *** gcc/testsuite/gcc.dg/vect/slp-45.c (revision 0) --- gcc/testsuite/gcc.dg/vect/slp-45.c (working copy) *************** *** 0 **** --- 1,78 ---- + /* { dg-do run } */ + /* { dg-require-effective-target vect_int } */ + /* { dg-additional-options "-O3" } */ + + #include <string.h> + #include "tree-vect.h" + + #define FOO(T,N) \ + void __attribute__((noinline,noclone)) \ + foo_ ## T ## _ ## N (T * __restrict__ in_, T * __restrict__ out_, int s) \ + { \ + T *in = __builtin_assume_aligned (in_, __BIGGEST_ALIGNMENT__); \ + T *out = __builtin_assume_aligned (out_, __BIGGEST_ALIGNMENT__); \ + for (int i = 0; i < 16; i++) \ + { \ + for (int j = 0; j < N; ++j) \ + out[j] = in[j]; \ + in += N; \ + out += s*N; \ + } \ + } + + #define TEST(T,N) \ + do { \ + memset (out, 0, 4096); \ + foo_ ## T ## _ ## N ((T *)in, (T *)out, 1); \ + if (memcmp (in, out, sizeof (T) * 16 * N) != 0) \ + __builtin_abort (); \ + for (int i = sizeof (T) * 16 * N; i 
< 4096; ++i) \ + if (out[i] != 0) \ + __builtin_abort (); \ + } while (0) + + FOO(char, 1) + FOO(char, 2) + FOO(char, 3) + FOO(char, 4) + FOO(char, 6) + FOO(char, 8) + FOO(int, 1) + FOO(int, 2) + FOO(int, 3) + FOO(int, 4) + FOO(int, 6) + FOO(int, 8) + FOO(int, 16) + + char in[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__))); + char out[4096] __attribute__((aligned(__BIGGEST_ALIGNMENT__))); + + int main() + { + check_vect (); + + for (int i = 0; i < 4096; ++i) + { + in[i] = i; + __asm__ volatile ("" : : : "memory"); + } + + TEST(char, 1); + TEST(char, 2); + TEST(char, 3); + TEST(char, 4); + TEST(char, 6); + TEST(char, 8); + TEST(int, 1); + TEST(int, 2); + TEST(int, 3); + TEST(int, 4); + TEST(int, 6); + TEST(int, 8); + TEST(int, 16); + + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 13 "vect" } } */