Richard Biener <[email protected]> writes:
> This avoids falling back to elementwise accesses for strided SLP
> loads when the group size does not evenly divide the number of
> vector elements.  Instead we can use a smaller vector or integer
> type for the loads.
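
For concreteness, here is a hand-written SSE2 sketch (not vectorizer
output, and foo6 is a made-up name) of what a 6-element strided load
group can now use with V4SI: gcd (6, 4) == 2, so each group is covered
by three 64-bit loads instead of six scalar ones:

  #include <emmintrin.h>

  void foo6 (int * __restrict a, const int *b, int s)
  {
    for (int i = 0; i < 1024; ++i)
      {
        /* Three two-element (64-bit) loads cover the 6-element group.  */
        __m128i v01 = _mm_loadl_epi64 ((const __m128i *) &b[s*i + 0]);
        __m128i v23 = _mm_loadl_epi64 ((const __m128i *) &b[s*i + 2]);
        __m128i v45 = _mm_loadl_epi64 ((const __m128i *) &b[s*i + 4]);
        /* Two of the pieces compose a full V4SI vector.  */
        __m128i v0123 = _mm_unpacklo_epi64 (v01, v23);
        _mm_storeu_si128 ((__m128i *) &a[6*i + 0], v0123);
        _mm_storel_epi64 ((__m128i *) &a[6*i + 4], v45);
      }
  }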
>
> For stores we can do the same, though restrictions on the stores we
> handle and the fact that store-merging papers over the issue make
> this mostly a cost-modeling improvement.  That shows in
> gcc.target/i386/vect-strided-3.c, which we now vectorize with V4SI
> vectors rather than just V2SI ones.
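
Roughly, the store side of gcc.target/i386/vect-strided-3.c below,
hand-vectorized with SSE2 intrinsics (a sketch only; bar6 is a made-up
name): the 6-element strided store group is written as three 64-bit
pieces per group:

  #include <emmintrin.h>

  void bar6 (int * __restrict a, const int *b, int s)
  {
    if (s < 6)
      return;
    for (int i = 0; i < 1024; ++i)
      {
        /* One V4SI load feeds the whole 6-element store group.  */
        __m128i v = _mm_loadu_si128 ((const __m128i *) &b[4*i]);
        /* Elements {0,1}, {2,3} and {0,1} again, stored 64 bits at
           a time at the strided positions.  */
        _mm_storel_epi64 ((__m128i *) &a[s*i + 0], v);
        _mm_storel_epi64 ((__m128i *) &a[s*i + 2],
                          _mm_unpackhi_epi64 (v, v));
        _mm_storel_epi64 ((__m128i *) &a[s*i + 4], v);
      }
  }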
>
> For all of this there's still the opportunity to use non-uniform
> accesses, say doing V4SI, { V2SI, V2SI }, V4SI for a 6-element
> group with a VF of two.  But that's left for a possible followup.
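
And a hand-written illustration of that non-uniform scheme for loads
(again SSE2; foo6_vf2 is a made-up name, and this is not something the
patch emits), in contrast to the uniform two-element pieces above:

  #include <emmintrin.h>

  void foo6_vf2 (int * __restrict a, const int *b, int s)
  {
    for (int i = 0; i < 1024; i += 2)
      {
        /* The 12 elements of two adjacent 6-element groups accessed
           as V4SI, { V2SI, V2SI }, V4SI.  */
        __m128i g0lo = _mm_loadu_si128 ((const __m128i *) &b[s*i + 0]);
        __m128i g0hi = _mm_loadl_epi64 ((const __m128i *) &b[s*i + 4]);
        __m128i g1lo = _mm_loadl_epi64 ((const __m128i *) &b[s*(i+1) + 0]);
        __m128i g1hi = _mm_loadu_si128 ((const __m128i *) &b[s*(i+1) + 2]);
        _mm_storeu_si128 ((__m128i *) &a[6*i + 0], g0lo);
        _mm_storel_epi64 ((__m128i *) &a[6*i + 4], g0hi);
        _mm_storel_epi64 ((__m128i *) &a[6*i + 6], g1lo);
        _mm_storeu_si128 ((__m128i *) &a[6*i + 8], g1hi);
      }
  }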
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Textually
> this depends on the gap improvement series, so I'll push only
> after those.  Target-independent testing is difficult: strided
> accesses are hard for VLA - I suppose they should go through
> gather/scatter, but we have to be able to construct the offset
> vector there.
Yeah, agreed. And I suppose for tests like these, which load
consecutive pairs of 32-bit elements, we'd want to generate a gather
of 64-bit elements. So there'd be a similar accretion process,
but only if it applies regularly across the whole vector.
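
Something like the following scalar shape, as a rough sketch of that
reinterpretation (gather_pairs is just an illustrative name, not an
interface anywhere):

  #include <stdint.h>
  #include <string.h>

  void gather_pairs (uint64_t * __restrict out, const int32_t *b,
                     int s, int n)
  {
    for (int i = 0; i < n; ++i)
      {
        /* b[s*i] and b[s*i+1] are consecutive, so the pair can be
           treated as a single 64-bit gather element at byte offset
           s*i*4.  */
        uint64_t pair;
        memcpy (&pair, &b[s*i], sizeof pair);
        out[i] = pair;
      }
  }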
Richard
>
> Richard.
>
> * gcc.target/i386/vect-strided-1.c: New testcase.
> * gcc.target/i386/vect-strided-2.c: Likewise.
> * gcc.target/i386/vect-strided-3.c: Likewise.
> * gcc.target/i386/vect-strided-4.c: Likewise.
> ---
> .../gcc.target/i386/vect-strided-1.c |  24 +++++
> .../gcc.target/i386/vect-strided-2.c |  17 +++
> .../gcc.target/i386/vect-strided-3.c |  20 ++++
> .../gcc.target/i386/vect-strided-4.c |  20 ++++
> gcc/tree-vect-stmts.cc               | 100 ++++++++----------
> 5 files changed, 127 insertions(+), 54 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-1.c
> create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-2.c
> create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-3.c
> create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-4.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-1.c b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
> new file mode 100644
> index 00000000000..db4a06711f1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +
> +void foo (int * __restrict a, int *b, int s)
> +{
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[8*i+0] = b[s*i+0];
> +      a[8*i+1] = b[s*i+1];
> +      a[8*i+2] = b[s*i+2];
> +      a[8*i+3] = b[s*i+3];
> +      a[8*i+4] = b[s*i+4];
> +      a[8*i+5] = b[s*i+5];
> +      a[8*i+6] = b[s*i+4];
> +      a[8*i+7] = b[s*i+5];
> +    }
> +}
> +
> +/* Three two-element loads, two four-element stores. On ia32 we elide
> + a permute and perform a redundant load. */
> +/* { dg-final { scan-assembler-times "movq" 2 } } */
> +/* { dg-final { scan-assembler-times "movhps" 2 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "movhps" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "movups" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-2.c b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
> new file mode 100644
> index 00000000000..6fd64e28cf0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx" } */
> +
> +void foo (int * __restrict a, int *b, int s)
> +{
> +  for (int i = 0; i < 1024; ++i)
> +    {
> +      a[4*i+0] = b[s*i+0];
> +      a[4*i+1] = b[s*i+1];
> +      a[4*i+2] = b[s*i+0];
> +      a[4*i+3] = b[s*i+1];
> +    }
> +}
> +
> +/* One two-element load, one four-element store. */
> +/* { dg-final { scan-assembler-times "movq" 1 } } */
> +/* { dg-final { scan-assembler-times "movups" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-3.c b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
> new file mode 100644
> index 00000000000..b462701a0b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-avx -fno-tree-slp-vectorize" } */
> +
> +void foo (int * __restrict a, int *b, int s)
> +{
> +  if (s >= 6)
> +    for (int i = 0; i < 1024; ++i)
> +      {
> +        a[s*i+0] = b[4*i+0];
> +        a[s*i+1] = b[4*i+1];
> +        a[s*i+2] = b[4*i+2];
> +        a[s*i+3] = b[4*i+3];
> +        a[s*i+4] = b[4*i+0];
> +        a[s*i+5] = b[4*i+1];
> +      }
> +}
> +
> +/* The vectorizer generates six uint64 stores, emitted as four movq
> +   and two movhps.  */
> +/* { dg-final { scan-assembler-times "movq" 4 } } */
> +/* { dg-final { scan-assembler-times "movhps" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
> new file mode 100644
> index 00000000000..dd922926a2a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize" } */
> +
> +void foo (int * __restrict a, int * __restrict b, int *c, int s)
> +{
> +  if (s >= 2)
> +    for (int i = 0; i < 1024; ++i)
> +      {
> +        a[s*i+0] = c[4*i+0];
> +        a[s*i+1] = c[4*i+1];
> +        b[s*i+0] = c[4*i+2];
> +        b[s*i+1] = c[4*i+3];
> +      }
> +}
> +
> +/* Vectorization factor two, two two-element stores to a using movq
> + and two two-element stores to b via pextrq/movhps of the high part. */
> +/* { dg-final { scan-assembler-times "movq" 2 } } */
> +/* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 8aa41833433..03a5db45976 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2036,15 +2036,10 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> first_dr_info
> = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
> if (STMT_VINFO_STRIDED_P (first_stmt_info))
> - {
> - /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
> - separated by the stride, until we have a complete vector.
> - Fall back to scalar accesses if that isn't possible. */
> - if (multiple_p (nunits, group_size))
> - *memory_access_type = VMAT_STRIDED_SLP;
> - else
> - *memory_access_type = VMAT_ELEMENTWISE;
> - }
> + /* Try to use consecutive accesses of as many elements as possible,
> + separated by the stride, until we have a complete vector.
> + Fall back to scalar accesses if that isn't possible. */
> + *memory_access_type = VMAT_STRIDED_SLP;
> else
> {
> int cmp = compare_step_with_zero (vinfo, stmt_info);
> @@ -8512,12 +8507,29 @@ vectorizable_store (vec_info *vinfo,
> tree lvectype = vectype;
> if (slp)
> {
> - if (group_size < const_nunits
> - && const_nunits % group_size == 0)
> + HOST_WIDE_INT n = gcd (group_size, const_nunits);
> + if (n == const_nunits)
> {
> - nstores = const_nunits / group_size;
> - lnel = group_size;
> - ltype = build_vector_type (elem_type, group_size);
> + int mis_align = dr_misalignment (first_dr_info, vectype);
> + dr_alignment_support dr_align
> + = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
> + mis_align);
> + if (dr_align == dr_aligned
> + || dr_align == dr_unaligned_supported)
> + {
> + nstores = 1;
> + lnel = const_nunits;
> + ltype = vectype;
> + lvectype = vectype;
> + alignment_support_scheme = dr_align;
> + misalignment = mis_align;
> + }
> + }
> + else if (n > 1)
> + {
> + nstores = const_nunits / n;
> + lnel = n;
> + ltype = build_vector_type (elem_type, n);
> lvectype = vectype;
>
> /* First check if vec_extract optab doesn't support extraction
> @@ -8526,7 +8538,7 @@ vectorizable_store (vec_info *vinfo,
> machine_mode vmode;
> if (!VECTOR_MODE_P (TYPE_MODE (vectype))
> || !related_vector_mode (TYPE_MODE (vectype), elmode,
> - group_size).exists (&vmode)
> + n).exists (&vmode)
> || (convert_optab_handler (vec_extract_optab,
> TYPE_MODE (vectype), vmode)
> == CODE_FOR_nothing))
> @@ -8537,8 +8549,8 @@ vectorizable_store (vec_info *vinfo,
> re-interpreting it as the original vector type if
> supported. */
> unsigned lsize
> - = group_size * GET_MODE_BITSIZE (elmode);
> - unsigned int lnunits = const_nunits / group_size;
> + = n * GET_MODE_BITSIZE (elmode);
> + unsigned int lnunits = const_nunits / n;
> /* If we can't construct such a vector fall back to
> element extracts from the original vector type and
> element size stores. */
> @@ -8551,7 +8563,7 @@ vectorizable_store (vec_info *vinfo,
> != CODE_FOR_nothing))
> {
> nstores = lnunits;
> - lnel = group_size;
> + lnel = n;
> ltype = build_nonstandard_integer_type (lsize, 1);
> lvectype = build_vector_type (ltype, nstores);
> }
> @@ -8562,24 +8574,6 @@ vectorizable_store (vec_info *vinfo,
> issue exists here for reasonable archs. */
> }
> }
> - else if (group_size >= const_nunits
> - && group_size % const_nunits == 0)
> - {
> - int mis_align = dr_misalignment (first_dr_info, vectype);
> - dr_alignment_support dr_align
> - = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
> - mis_align);
> - if (dr_align == dr_aligned
> - || dr_align == dr_unaligned_supported)
> - {
> - nstores = 1;
> - lnel = const_nunits;
> - ltype = vectype;
> - lvectype = vectype;
> - alignment_support_scheme = dr_align;
> - misalignment = mis_align;
> - }
> - }
> ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
> ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> }
> @@ -10364,34 +10358,32 @@ vectorizable_load (vec_info *vinfo,
> auto_vec<tree> dr_chain;
> if (memory_access_type == VMAT_STRIDED_SLP)
> {
> - if (group_size < const_nunits)
> + HOST_WIDE_INT n = gcd (group_size, const_nunits);
> + /* Use the target vector type if the group size is a multiple
> + of it. */
> + if (n == const_nunits)
> + {
> + nloads = 1;
> + lnel = const_nunits;
> + ltype = vectype;
> + }
> + /* Else use the biggest vector we can load the group without
> + accessing excess elements. */
> + else if (n > 1)
> {
> - /* First check if vec_init optab supports construction from vector
> - elts directly. Otherwise avoid emitting a constructor of
> - vector elements by performing the loads using an integer type
> - of the same size, constructing a vector of those and then
> - re-interpreting it as the original vector type. This avoids a
> - huge runtime penalty due to the general inability to perform
> - store forwarding from smaller stores to a larger load. */
> tree ptype;
> tree vtype
> - = vector_vector_composition_type (vectype,
> - const_nunits / group_size,
> + = vector_vector_composition_type (vectype, const_nunits / n,
> &ptype);
> if (vtype != NULL_TREE)
> {
> - nloads = const_nunits / group_size;
> - lnel = group_size;
> + nloads = const_nunits / n;
> + lnel = n;
> lvectype = vtype;
> ltype = ptype;
> }
> }
> - else
> - {
> - nloads = 1;
> - lnel = const_nunits;
> - ltype = vectype;
> - }
> + /* Else fall back to the default element-wise access. */
> ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
> }
> /* Load vector(1) scalar_type if it's 1 element-wise vectype. */
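
For anyone skimming the new selection logic: a tiny standalone
illustration (not GCC code; gcd_int is a local stand-in for the gcd
used in the patch) of how n = gcd (group_size, const_nunits) picks the
access granularity for a V4SI vector type:

  #include <stdio.h>

  static int gcd_int (int a, int b)
  {
    while (b)
      {
        int t = a % b;
        a = b;
        b = t;
      }
    return a;
  }

  int main (void)
  {
    const int const_nunits = 4;            /* V4SI has four lanes.  */
    const int group_sizes[] = { 2, 3, 4, 6, 8 };
    for (unsigned i = 0; i < sizeof group_sizes / sizeof group_sizes[0]; ++i)
      {
        int group_size = group_sizes[i];
        int n = gcd_int (group_size, const_nunits);
        if (n == const_nunits)
          printf ("group %d: whole-vector accesses\n", group_size);
        else if (n > 1)
          printf ("group %d: %d pieces of %d elements per vector\n",
                  group_size, const_nunits / n, n);
        else
          printf ("group %d: element-wise fallback\n", group_size);
      }
    return 0;
  }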