On Mon, Sep 15, 2025 at 8:53 PM Robin Dapp <rdapp....@gmail.com> wrote:
>
> Hi,
>
> This patch adds gather/scatter handling for grouped accesses.  The idea is
> to e.g. replace accesses (of uint8_t elements) like
>   arr[0]
>   arr[1]
>   arr[2]
>   arr[3]
>   arr[0 + step]
>   arr[1 + step]
>   ...
> by a gather load of uint32_t
>   arr[0]
>   arr[0 + step * 1]
>   arr[0 + step * 2]
>   ...
> where the offset vector is a simple series with step STEP.
> If supported, such a gather can be implemented as a strided load.
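>
> A rough scalar sketch of the idea (hypothetical names, group size 4;
> assuming misaligned 32-bit accesses are OK, cf. -mno-vector-strict-align):
>
>   /* Before: four grouped uint8_t accesses per iteration.  */
>   for (int i = 0; i < n; i++)
>     for (int k = 0; k < 4; k++)
>       use (arr[i * step + k]);
>
>   /* After punning: one uint32_t access per iteration, i.e. a gather
>      with offset vector {0, step, 2 * step, ...}.  */
>   for (int i = 0; i < n; i++)
>     use4 (*(uint32_t *) (arr + i * step));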
>
> Two changes from v1:
>
> - Re-use vector_vector_composition_type.
>
> - For now, if we have a masked access, the transformation is not
> performed.  We could perform the load using a punned type and later
> mask with an explicit VCOND_MASK once we have converted it back.
> The same is true for non-trivial load permutations.

Load permutations should be pushed out to a separate VEC_PERM SLP node.
I'm not sure the masked case is worth bothering with, but of course with any form
of loop masking (mask or len) this would need adjustments as well.  We might
be fine with simply requiring no gaps at the end (so VMAT_STRIDED_SLP
originally) and asking for an appropriate loop mask/len for the new punned
element type.

Some comments below.

>
> Regtested and bootstrapped on x86 and power10.  Regtested on aarch64 and
> rv64gcv_zvl512b.  This now introduces a new failure in a dynamic-lmul
> testcase but I'm not going to bother for now.  That's stage 3+ material
> if we care enough.
>
> Regards
>  Robin
>
>         PR target/118019
>
> gcc/ChangeLog:
>
>         * internal-fn.cc (get_supported_else_vals): Exit at invalid
>         index.
>         (internal_strided_fn_supported_p): New function.
>         * internal-fn.h (internal_strided_fn_supported_p): Declare.
>         * tree-vect-stmts.cc (vector_vector_composition_type):
>         Add scalar_ptype_only argument.
>         (vect_use_grouped_gather): New function.
>         (vect_get_store_rhs): Adjust docs of
>         vector_vector_composition_type.
>         (get_load_store_type): Try grouped gather.
>         (vectorizable_store): Use punned vectype.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (struct vect_load_store_data): Add punned
>         vectype.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
> ---
>  gcc/internal-fn.cc                            |  22 +-
>  gcc/internal-fn.h                             |   2 +
>  .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  50 ++++
>  gcc/tree-vect-stmts.cc                        | 219 ++++++++++++++++--
>  gcc/tree-vectorizer.h                         |   1 +
>  5 files changed, 270 insertions(+), 24 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
>
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index bf2fac81807..db396c69ec5 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -5234,7 +5234,7 @@ get_supported_else_vals (enum insn_code icode, unsigned else_index,
>                          vec<int> &else_vals)
>  {
>    const struct insn_data_d *data = &insn_data[icode];
> -  if ((char)else_index >= data->n_operands)
> +  if ((int)else_index >= data->n_operands || (int)else_index == -1)
>      return;
>
>    machine_mode else_mode = data->operand[else_index].mode;
> @@ -5309,6 +5309,26 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
>    return ok;
>  }
>
> +/* Return true if the target supports a strided load/store function IFN
> +   with VECTOR_TYPE.  If supported and ELSVALS is nonzero the supported else
> +   values will be added to the vector ELSVALS points to.  */
> +
> +bool
> +internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
> +                                vec<int> *elsvals)
> +{
> +  machine_mode mode = TYPE_MODE (vector_type);
> +  optab optab = direct_internal_fn_optab (ifn);
> +  insn_code icode = direct_optab_handler (optab, mode);
> +
> +  bool ok = icode != CODE_FOR_nothing;
> +
> +  if (ok && elsvals)
> +    get_supported_else_vals (icode, internal_fn_else_index (ifn), *elsvals);
> +
> +  return ok;
> +}
> +
>  /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
>     for pointers of type TYPE when the accesses have LENGTH bytes and their
>     common byte alignment is ALIGN.  */
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index fd21694dfeb..dcb707251f8 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -246,6 +246,8 @@ extern int internal_fn_alias_ptr_index (internal_fn fn);
>  extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>                                                     tree, tree, int,
>                                                     vec<int> * = nullptr);
> +extern bool internal_strided_fn_supported_p (internal_fn, tree,
> +                                             vec<int> * = nullptr);
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>                                                 poly_uint64, unsigned int);
>
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> new file mode 100644
> index 00000000000..d3436b78377
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
> @@ -0,0 +1,50 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
> +
> +/* Ensure we use strided loads.  */
> +
> +typedef unsigned char uint8_t;
> +typedef unsigned short uint16_t;
> +typedef unsigned int uint32_t;
> +
> +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
> +    int t0 = s0 + s1;\
> +    int t1 = s0 - s1;\
> +    int t2 = s2 + s3;\
> +    int t3 = s2 - s3;\
> +    d0 = t0 + t2;\
> +    d2 = t0 - t2;\
> +    d1 = t1 + t3;\
> +    d3 = t1 - t3;\
> +}
> +
> +uint32_t
> +abs2 (uint32_t a)
> +{
> +  uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
> +  return (a + s) ^ s;
> +}
> +
> +int
> +x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
> +{
> +  uint32_t tmp[4][4];
> +  uint32_t a0, a1, a2, a3;
> +  int sum = 0;
> +  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
> +    {
> +      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
> +      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
> +      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
> +      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
> +      HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
> +    }
> +  for (int i = 0; i < 4; i++)
> +    {
> +      HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
> +      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
> +    }
> +  return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
> +}
> +
> +/* { dg-final { scan-assembler-times "vlse32" 4 } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index d46c1e3d56d..b2f67ea3849 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -62,6 +62,9 @@ along with GCC; see the file COPYING3.  If not see
>  /* For lang_hooks.types.type_for_mode.  */
>  #include "langhooks.h"
>
> +static tree vector_vector_composition_type (tree, poly_uint64, tree *,
> +                                           bool = false);
> +
>  /* Return TRUE iff the given statement is in an inner loop relative to
>     the loop being vectorized.  */
>  bool
> @@ -1723,6 +1726,96 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
>    return false;
>  }
>
> +/* Return true if we can use gather/scatter or strided internal functions
> +   to vectorize STMT_INFO, which is a grouped or strided load or store
> +   with multiple lanes and will be implemented by a type-punned access
> +   of a vector with element size that matches the number of lanes.
> +
> +   MASKED_P is true if load or store is conditional.
> +   When returning true, fill in GS_INFO with the information required to
> +   perform the operation.  Also, store the punning type in PUN_VECTYPE.
> +
> +   If successful and ELSVALS is nonzero the supported
> +   else values will be stored in the vector ELSVALS points to.  */
> +
> +static bool
> +vect_use_grouped_gather (stmt_vec_info stmt_info, tree vectype,

I don't like seeing stmt_vec_info much; can we get away with
passing in dr_vec_info instead?
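
Something like this, say (just a sketch of the suggested signature; at
least in the hunk shown the body only uses STMT_VINFO_DR_INFO to get at
the data_reference anyway):

  static bool
  vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
                           loop_vec_info loop_vinfo, bool masked_p,
                           unsigned int nelts, gather_scatter_info *info,
                           vec<int> *elsvals, tree *pun_vectype);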

> +                        loop_vec_info loop_vinfo, bool masked_p,
> +                        unsigned int nelts,
> +                        gather_scatter_info *info, vec<int> *elsvals,
> +                        tree *pun_vectype)
> +{
> +  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
> +  data_reference *dr = dr_info->dr;
> +
> +  /* TODO: We can support nelts > BITS_PER_UNIT or non-power-of-two by
> +     multiple gathers/scatters.  */
> +  if (nelts > BITS_PER_UNIT || !pow2p_hwi (nelts))
> +    return false;
> +
> +  /* Pun the vectype with one of the same size but an element spanning
> +     NELTS elements of VECTYPE.
> +     The punned type of a V16QI with NELTS = 4 would be V4SI.
> +     */
> +  tree tmp;
> +  unsigned int pieces;
> +  if (!can_div_trunc_p (TYPE_VECTOR_SUBPARTS (vectype), nelts, &pieces)
> +      || !pieces)
> +    return false;
> +
> +  *pun_vectype = vector_vector_composition_type (vectype, pieces, &tmp, true);
> +
> +  if (!*pun_vectype || !VECTOR_TYPE_P (*pun_vectype))
> +    return false;
> +
> +  internal_fn ifn;
> +  tree offset_vectype = *pun_vectype;
> +
> +  internal_fn strided_ifn = DR_IS_READ (dr)
> +    ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
> +
> +  /* Check if we have a gather/scatter with the new type.  We're just trying
> +     with the type itself as offset for now.  If not, check if we have a
> +     strided load/store.  These have fewer constraints (for example, no offset
> +     type needs to exist), so it is possible that even though a gather/scatter is
> +     not available we still have a strided load/store.  */
> +  bool ok = false;
> +  if (vect_gather_scatter_fn_p
> +      (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
> +       TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
> +       &offset_vectype, elsvals))
> +    ok = true;
> +  else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
> +                                           elsvals))
> +    {
> +      /* Use gather/scatter IFNs, vect_get_strided_load_store_ops
> +        will switch back to the strided variants.  */
> +      ifn = DR_IS_READ (dr) ? IFN_MASK_LEN_GATHER_LOAD :
> +       IFN_MASK_LEN_SCATTER_STORE;
> +      ok = true;
> +    }
> +
> +  if (ok)
> +    {
> +      info->ifn = ifn;
> +      info->decl = NULL_TREE;
> +      info->base = dr->ref;
> +      info->alias_ptr = build_int_cst
> +       (reference_alias_ptr_type (DR_REF (dr)),
> +        get_object_alignment (DR_REF (dr)));
> +      info->element_type = TREE_TYPE (vectype);

Is that correct?  I'm not sure where we eventually pass it to.

> +      info->offset_vectype = offset_vectype;
> +      /* No need to set the offset, vect_get_strided_load_store_ops
> +        will do that.  */
> +      info->scale = 1;
> +      info->memory_type = TREE_TYPE (DR_REF (dr));
> +      return true;
> +    }
> +
> +  return false;
> +}
> +
> +
>  /* Return true if we can use gather/scatter internal functions to
>     vectorize STMT_INFO, which is a grouped or strided load or store.
>     MASKED_P is true if load or store is conditional.  When returning
> @@ -1888,12 +1981,14 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
>
>  /* Function VECTOR_VECTOR_COMPOSITION_TYPE
>
> -   This function returns a vector type which can be composed with NETLS pieces,
> +   This function returns a vector type which can be composed with NELTS pieces,
>     whose type is recorded in PTYPE.  VTYPE should be a vector type, and has the
>     same vector size as the return vector.  It checks target whether supports
>     pieces-size vector mode for construction firstly, if target fails to, check
>     pieces-size scalar mode for construction further.  It returns NULL_TREE if
> -   fails to find the available composition.
> +   fails to find the available composition.  If the caller only wants scalar
> +   pieces, where PTYPE e.g. is a possible gather/scatter element type,
> +   SCALAR_PTYPE_ONLY must be true.
>
>     For example, for (vtype=V16QI, nelts=4), we can probably get:
>       - V16QI with PTYPE V4QI.
> @@ -1901,7 +1996,8 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
>       - NULL_TREE.  */
>
>  static tree
> -vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
> +vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
> +                               bool scalar_ptype_only)
>  {
>    gcc_assert (VECTOR_TYPE_P (vtype));
>    gcc_assert (known_gt (nelts, 0U));
> @@ -1927,7 +2023,8 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
>        scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
>        poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
>        machine_mode rmode;
> -      if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
> +      if (!scalar_ptype_only
> +         && related_vector_mode (vmode, elmode, inelts).exists (&rmode)
>           && (convert_optab_handler (vec_init_optab, vmode, rmode)
>               != CODE_FOR_nothing))
>         {
> @@ -1938,12 +2035,15 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
>        /* Otherwise check if exists an integer type of the same piece size and
>          if vec_init optab supports construction from it directly.  */
>        if (int_mode_for_size (pbsize, 0).exists (&elmode)
> -         && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
> -         && (convert_optab_handler (vec_init_optab, rmode, elmode)
> -             != CODE_FOR_nothing))
> +         && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
>         {
> -         *ptype = build_nonstandard_integer_type (pbsize, 1);
> -         return build_vector_type (*ptype, nelts);
> +         if (scalar_ptype_only
> +             || convert_optab_handler (vec_init_optab, rmode, elmode)
> +             != CODE_FOR_nothing)
> +           {
> +             *ptype = build_nonstandard_integer_type (pbsize, 1);
> +             return build_vector_type (*ptype, nelts);
> +           }
>         }
>      }
>
> @@ -1978,6 +2078,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    int *misalignment = &ls->misalignment;
>    internal_fn *lanes_ifn = &ls->lanes_ifn;
>    vec<int> *elsvals = &ls->elsvals;
> +  tree *pun_vectype = &ls->pun_vectype;
>    loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> @@ -1989,6 +2090,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>
>    *misalignment = DR_MISALIGNMENT_UNKNOWN;
>    *poffset = 0;
> +  *pun_vectype = NULL_TREE;
>
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>      {
> @@ -2317,13 +2419,18 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>    if ((*memory_access_type == VMAT_ELEMENTWISE
>         || *memory_access_type == VMAT_STRIDED_SLP)
>        && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> -      && SLP_TREE_LANES (slp_node) == 1

I think this now conflicts a bit with what I just pushed (sorry).

>        && loop_vinfo)
>      {
> +      unsigned i, j;
> +      bool simple_perm_series = true;
> +      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), i, j)
> +       if (i != j)
> +         simple_perm_series = false;

In particular I disallow all load permutes, since we are going to elide them.

>        gather_scatter_info gs_info;
> -      if (vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
> -                                             masked_p, &gs_info, elsvals,
> -                                             group_size, single_element_p))
> +      if (SLP_TREE_LANES (slp_node) == 1
> +         && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
> +                                                masked_p, &gs_info, elsvals,
> +                                                group_size, single_element_p))
>         {
>           SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
>           SLP_TREE_GS_BASE (slp_node) = error_mark_node;
> @@ -2331,6 +2438,35 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>           ls->strided_offset_vectype = gs_info.offset_vectype;
>           *memory_access_type = VMAT_GATHER_SCATTER_IFN;
>         }
> +      /* For now we don't allow masked loads or complex (other than
> +        0, 1, 2, ...) load permutations.  Masking can be supported
> +        by a VCOND_MASK after the load (and returning to the
> +        original vectype).  Similar for a permutation with an
> +        additional VEC_PERM.  */
> +      else if (SLP_TREE_LANES (slp_node) > 1
> +              && !masked_p
> +              && simple_perm_series
> +              && vect_use_grouped_gather (stmt_info, vectype, loop_vinfo,
> +                                          masked_p, SLP_TREE_LANES (slp_node),
> +                                          &gs_info, elsvals, pun_vectype))
> +       {
> +         int puntype_misalignment = dr_misalignment
> +           (first_dr_info, *pun_vectype, *poffset);
> +         dr_alignment_support puntype_alignment_scheme
> +           = vect_supportable_dr_alignment
> +           (vinfo, first_dr_info, *pun_vectype, puntype_misalignment,
> +            true);
> +
> +         if (puntype_alignment_scheme == dr_aligned
> +             || puntype_alignment_scheme == dr_unaligned_supported)
> +           {
> +             SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
> +             SLP_TREE_GS_BASE (slp_node) = error_mark_node;
> +             ls->gs.ifn = gs_info.ifn;
> +             ls->strided_offset_vectype = gs_info.offset_vectype;
> +             *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> +           }
> +       }
>      }
>
>    if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
> @@ -2347,14 +2483,15 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>      }
>    else
>      {

Re-doing this after the above is a bit ugly.  Likewise having pun_vectype.
It's also an opportunity to move the VMAT_STRIDED_SLP punning and
alignment (re-)computation code here.  OTOH this is complicated enough
already.

I was working on disentangling it a bit, but ...

> +      tree vtype = ls->pun_vectype ? ls->pun_vectype : vectype;
>        if (mat_gather_scatter_p (*memory_access_type)
>           && !first_dr_info)
>         *misalignment = DR_MISALIGNMENT_UNKNOWN;
>        else
> -       *misalignment = dr_misalignment (first_dr_info, vectype, *poffset);
> +       *misalignment = dr_misalignment (first_dr_info, vtype, *poffset);
>        *alignment_support_scheme
>         = vect_supportable_dr_alignment
> -          (vinfo, first_dr_info, vectype, *misalignment,
> +          (vinfo, first_dr_info, vtype, *misalignment,
>             mat_gather_scatter_p (*memory_access_type));
>      }
>
> @@ -8360,10 +8497,13 @@ vectorizable_store (vec_info *vinfo,
>      {
>        aggr_type = elem_type;
>        if (!costing_p)
> -       vect_get_strided_load_store_ops (stmt_info, slp_node, vectype,
> -                                        ls.strided_offset_vectype,
> -                                        loop_vinfo, gsi,
> -                                        &bump, &vec_offset, loop_lens);
> +       {
> +         tree vtype = ls.pun_vectype ? ls.pun_vectype : vectype;
> +         vect_get_strided_load_store_ops (stmt_info, slp_node, vtype,
> +                                          ls.strided_offset_vectype,
> +                                          loop_vinfo, gsi,
> +                                          &bump, &vec_offset, loop_lens);
> +       }
>      }
>    else
>      {
> @@ -8549,7 +8689,9 @@ vectorizable_store (vec_info *vinfo,
>
>    if (mat_gather_scatter_p (memory_access_type))
>      {
> -      gcc_assert (!grouped_store);
> +      gcc_assert (!grouped_store || ls.pun_vectype);
> +      if (ls.pun_vectype)
> +       vectype = ls.pun_vectype;
>        auto_vec<tree> vec_offsets;
>        unsigned int inside_cost = 0, prologue_cost = 0;
>        int num_stmts = vec_num;
> @@ -8596,8 +8738,9 @@ vectorizable_store (vec_info *vinfo,
>               if (mask_node)
>                 vec_mask = vec_masks[j];
>               /* We should have caught mismatched types earlier.  */
> -             gcc_assert (useless_type_conversion_p (vectype,
> -                                                    TREE_TYPE (vec_oprnd)));
> +             gcc_assert (ls.pun_vectype
> +                         || useless_type_conversion_p
> +                         (vectype, TREE_TYPE (vec_oprnd)));
>             }
>           tree final_mask = NULL_TREE;
>           tree final_len = NULL_TREE;
> @@ -8650,6 +8793,18 @@ vectorizable_store (vec_info *vinfo,
>                     }
>                 }
>
> +             if (ls.pun_vectype)
> +               {
> +                 gimple *conv_stmt
> +                   = gimple_build_assign (make_ssa_name (vectype),
> +                                          VIEW_CONVERT_EXPR,
> +                                          build1 (VIEW_CONVERT_EXPR, vectype,
> +                                                  vec_oprnd));
> +                 vect_finish_stmt_generation (vinfo, stmt_info, conv_stmt,
> +                                              gsi);
> +                 vec_oprnd = gimple_get_lhs (conv_stmt);
> +               }
> +
>               gcall *call;
>               if (final_len && final_mask)
>                 {
> @@ -10415,7 +10570,14 @@ vectorizable_load (vec_info *vinfo,

I believe you now have to change

  /* ???  The following checks should really be part of
     get_load_store_type.  */
  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
      && !((memory_access_type == VMAT_ELEMENTWISE
            || mat_gather_scatter_p (memory_access_type))
           && SLP_TREE_LANES (slp_node) == 1
           && (!grouped_load
               || !DR_GROUP_NEXT_ELEMENT (first_stmt_info))))
    {

as otherwise you'll get slp_perm set but the permute will be silently
elided.  This is all a bit ugly already :/
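
One possible shape (untested sketch; assuming the only load permutes that
reach here with ls.pun_vectype set are the trivial ones being elided):

  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
      && !ls.pun_vectype
      && !((memory_access_type == VMAT_ELEMENTWISE
            || mat_gather_scatter_p (memory_access_type))
           && SLP_TREE_LANES (slp_node) == 1
           && (!grouped_load
               || !DR_GROUP_NEXT_ELEMENT (first_stmt_info))))
    {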

Now I wonder if/how we handle

 for (i = 0; i < n; ++i)
   {
      int j = offset[i];
      sum += data[2*j] + data[2*j+1];
   }

aka a gather grouped load.  Maybe there's an opportunity to handle
the "punning" higher up, representing this as a single-element group
in the first place.  Hmm.

Anyway, I think the general direction of the patch is OK.  You'll have to
figure out what I just broke though, and I'm still somewhat missing the
"beauty" moment when thinking of how the VMAT_STRIDED_SLP

      /* ???  Modify local copies of alignment_support_scheme and
         misalignment, but this part of analysis should be done
         earlier and remembered, likewise the chosen load mode.  */

parts resolve themselves into happiness with this ... ;)

Richard.

>    if (mat_gather_scatter_p (memory_access_type))
>      {
> -      gcc_assert (!grouped_load && !slp_perm);
> +      gcc_assert ((!grouped_load && !slp_perm) || ls.pun_vectype);
> +
> +      /* If we pun the original vectype the loads as well as costing, length,
> +        etc. are performed with the new type.  After loading we VIEW_CONVERT
> +        the data to the original vectype.  */
> +      tree original_vectype = vectype;
> +      if (ls.pun_vectype)
> +       vectype = ls.pun_vectype;
>
>        /* 1. Create the vector or array pointer update chain.  */
>        if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> @@ -10756,6 +10918,17 @@ vectorizable_load (vec_info *vinfo,
>               new_temp = new_temp2;
>             }
>
> +         if (ls.pun_vectype)
> +           {
> +             new_stmt = gimple_build_assign (make_ssa_name
> +                                             (original_vectype),
> +                                             VIEW_CONVERT_EXPR,
> +                                             build1 (VIEW_CONVERT_EXPR,
> +                                                     original_vectype,
> +                                                     new_temp));
> +             vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +           }
> +
>           /* Store vector loads in the corresponding SLP_NODE.  */
>           slp_node->push_vec_def (new_stmt);
>         }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index b7c2188ab3d..fe05b53fe4e 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -288,6 +288,7 @@ struct vect_load_store_data : vect_data {
>        tree decl;       // VMAT_GATHER_SCATTER_DECL
>    } gs;
>    tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
> +  tree pun_vectype; // VMAT_GATHER_SCATTER_IFN
>    auto_vec<int> elsvals;
>    unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
>  };
> --
> 2.51.0
>
