On Tue, 4 Jul 2023, juzhe.zh...@rivai.ai wrote:

> From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>
> 
> Hi, Richard and Richi.
> 
> The len_mask_gather_load/len_mask_scatter_store patterns have been added.
> Now this patch makes the vectorizer use them.
> 
> Here is an example:
> 
> void
> f (int *restrict a,
>    int *restrict b, int n,
>    int base, int step,
>    int *restrict cond)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       if (cond[i])
>         a[i * 4] = b[i];
>     }
> }
> 
> Gimple IR:
> 
>   <bb 3> [local count: 105119324]:
>   _58 = (unsigned long) n_13(D);
> 
>   <bb 4> [local count: 630715945]:
>   # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)>
>   # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)>
>   # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)>
>   # ivtmp_59 = PHI <ivtmp_60(4), _58(3)>
>   _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]);
>   ivtmp_44 = _61 * 4;
>   vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... });
>   mask__24.10_49 = vect__4.9_47 != { 0, ... };
>   vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49);
>   ivtmp_54 = _61 * 16;
>   .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1,
>                            vect__8.13_53, _61, 0, mask__24.10_49);
>   vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44;
>   vectp_b.11_52 = vectp_b.11_51 + ivtmp_44;
>   vectp_a.14_56 = vectp_a.14_55 + ivtmp_54;
>   ivtmp_60 = ivtmp_59 - _61;
>   if (ivtmp_60 != 0)
>     goto <bb 4>; [83.33%]
>   else
>     goto <bb 5>; [16.67%]
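> 
> The .LEN_MASK_LOAD operands above are (pointer, alignment, len, bias,
> mask): _61 is the length produced by .SELECT_VL, the bias is 0, and the
> first load uses an all-ones dummy mask.  .LEN_MASK_SCATTER_STORE takes
> (pointer, offset, scale, data, len, bias, mask).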
> 
> gcc/ChangeLog:
> 
>         * optabs-query.cc (supports_vec_gather_load_p): Support
>         LEN_MASK_GATHER_LOAD/SCATTER_STORE in the vectorizer.
>         (supports_vec_scatter_store_p): Ditto.
>         * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
>         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
>         (vect_get_strided_load_store_ops): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
> 
> ---
>  gcc/optabs-query.cc        |   2 +
>  gcc/tree-vect-data-refs.cc |  15 +++-
>  gcc/tree-vect-stmts.cc     | 136 ++++++++++++++++++++++++++++++++-----
>  3 files changed, 134 insertions(+), 19 deletions(-)
> 
> diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
> index 2fdd0d34354..bf1f484e874 100644
> --- a/gcc/optabs-query.cc
> +++ b/gcc/optabs-query.cc
> @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode)
>      this_fn_optabs->supports_vec_gather_load[mode]
>        = (supports_vec_convert_optab_p (gather_load_optab, mode)
>        || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
> +      || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode)
>        ? 1 : -1);
>  
>    return this_fn_optabs->supports_vec_gather_load[mode] > 0;
> @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode)
>      this_fn_optabs->supports_vec_scatter_store[mode]
>        = (supports_vec_convert_optab_p (scatter_store_optab, mode)
>        || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
> +      || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode)
>        ? 1 : -1);
>  
>    return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index ebe93832b1e..8d32eb3c83b 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
>      return false;
>  
>    /* Work out which function we need.  */
> -  internal_fn ifn, alt_ifn;
> +  internal_fn ifn, alt_ifn, len_mask_ifn;
>    if (read_p)
>      {
>        ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
>        alt_ifn = IFN_MASK_GATHER_LOAD;
> +      /* When the target supports LEN_MASK_GATHER_LOAD, we always
> +         use LEN_MASK_GATHER_LOAD regardless of whether len and
> +         mask are valid.  */
> +      len_mask_ifn = IFN_LEN_MASK_GATHER_LOAD;
>      }
>    else
>      {
>        ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
>        alt_ifn = IFN_MASK_SCATTER_STORE;
> +      /* When the target supports LEN_MASK_SCATTER_STORE, we always
> +         use LEN_MASK_SCATTER_STORE regardless of whether len and
> +         mask are valid.  */
> +      len_mask_ifn = IFN_LEN_MASK_SCATTER_STORE;
>      }
>  
>    for (;;)
> @@ -3893,7 +3901,10 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
>  
>        /* Test whether the target supports this combination.  */
>        if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
> -                                               offset_vectype, scale))
> +                                               offset_vectype, scale)
> +       || internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype,
> +                                                  memory_type,
> +                                                  offset_vectype, scale))
>       {
>         *ifn_out = ifn;

I think *ifn_out should be len_mask_ifn if that matched, so instead add
an else if just like we do for alt_ifn?
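Something like this (untested sketch; it assumes the new branch can
mirror the existing alt_ifn else-if, setting *ifn_out and
*offset_vectype_out and returning true the same way):

      else if (internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype,
                                                       memory_type,
                                                       offset_vectype, scale))
        {
          *ifn_out = len_mask_ifn;
          *offset_vectype_out = offset_vectype;
          return true;
        }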

>         *offset_vectype_out = offset_vectype;
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a0c39268bf0..1f607b7102b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>                                                  gs_info->offset_vectype,
>                                                  gs_info->scale))
>       {
> +       ifn = (is_load
> +              ? IFN_LEN_MASK_GATHER_LOAD
> +              : IFN_LEN_MASK_SCATTER_STORE);
> +       if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> +                                                   gs_info->memory_type,
> +                                                   gs_info->offset_vectype,
> +                                                   gs_info->scale))
> +         {
> +           vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
> +           vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> +           return;
> +         }
>         if (dump_enabled_p ())
>           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                            "can't operate on partial vectors because"
> @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo,
>  static void
>  vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
>                                loop_vec_info loop_vinfo,
> +                              gimple_stmt_iterator *gsi,
>                                gather_scatter_info *gs_info,
> -                              tree *dataref_bump, tree *vec_offset)
> +                              tree *dataref_bump, tree *vec_offset,
> +                              vec_loop_lens *loop_lens)
>  {
>    struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
>    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>  
> -  tree bump = size_binop (MULT_EXPR,
> -                       fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> -                       size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> -  *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> +  if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
> +    {
> +      /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]);
> +         ivtmp_8 = _31 * 16 (step in bytes);
> +         .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... );
> +         vectp_a.9_26 = vectp_a.9_7 + ivtmp_8;  */
> +      tree loop_len
> +     = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0);
> +      tree tmp
> +     = fold_build2 (MULT_EXPR, sizetype,
> +                    fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> +                    loop_len);
> +      tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp");
> +      gassign *assign = gimple_build_assign (bump, tmp);
> +      gsi_insert_before (gsi, assign, GSI_SAME_STMT);
> +      *dataref_bump = bump;
> +    }
> +  else
> +    {
> +      tree bump
> +     = size_binop (MULT_EXPR,
> +                   fold_convert (sizetype, unshare_expr (DR_STEP (dr))),
> +                   size_int (TYPE_VECTOR_SUBPARTS (vectype)));
> +      *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
> +    }
>  
>    /* The offset given in GS_INFO can have pointer type, so use the element
>       type of the vector instead.  */
> @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo,
>    else if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        aggr_type = elem_type;
> -      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> -                                    &bump, &vec_offset);
> +      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> +                                    &bump, &vec_offset, loop_lens);
>      }
>    else
>      {
> @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo,
>             unsigned HOST_WIDE_INT align;
>  
>             tree final_mask = NULL_TREE;
> +           tree final_len = NULL_TREE;
> +           tree bias = NULL_TREE;
>             if (loop_masks)
>               final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>                                                vec_num * ncopies,
> @@ -8929,8 +8966,43 @@ vectorizable_store (vec_info *vinfo,
>                 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
>                   vec_offset = vec_offsets[vec_num * j + i];
>                 tree scale = size_int (gs_info.scale);
> +
> +               if (internal_gather_scatter_fn_supported_p (
> +                     IFN_LEN_MASK_SCATTER_STORE, vectype, gs_info.memory_type,
> +                     TREE_TYPE (vec_offset), gs_info.scale))

gs_info.ifn should have recorded the appropriate ifn, so you should
be able to simplify this.
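
I.e. (untested) check the recorded ifn instead of re-querying the
target:

                if (gs_info.ifn == IFN_LEN_MASK_SCATTER_STORE)
                  {
                    /* ... set up final_len, bias and the dummy all-ones
                       mask as in the patch ... */
                  }

assuming vect_gather_scatter_fn_p is changed as suggested above so that
gs_info.ifn records IFN_LEN_MASK_SCATTER_STORE when that matched.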

> +                 {
> +                   if (loop_lens)
> +                     {
> +                       final_len
> +                         = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                              vec_num * ncopies, vectype,
> +                                              vec_num * j + i, 1);
> +                     }
> +                   else
> +                     {
> +                       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +                       final_len
> +                         = build_int_cst (iv_type,
> +                                          TYPE_VECTOR_SUBPARTS (vectype));
> +                     }
> +                   signed char biasval
> +                     = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +                   bias = build_int_cst (intQI_type_node, biasval);
> +                   if (!final_mask)
> +                     {
> +                       mask_vectype = truth_type_for (vectype);
> +                       final_mask = build_minus_one_cst (mask_vectype);
> +                     }
> +                 }
> +
>                 gcall *call;
> -               if (final_mask)
> +               if (final_len && final_mask)
> +                 call
> +                   = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE,
> +                                                 7, dataref_ptr, vec_offset,
> +                                                 scale, vec_oprnd, final_len,
> +                                                 bias, final_mask);
> +               else if (final_mask)
>                   call = gimple_build_call_internal
>                     (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset,
>                      scale, vec_oprnd, final_mask);
> @@ -9047,9 +9119,6 @@ vectorizable_store (vec_info *vinfo,
>             machine_mode vmode = TYPE_MODE (vectype);
>             machine_mode new_vmode = vmode;
>             internal_fn partial_ifn = IFN_LAST;
> -           /* Produce 'len' and 'bias' argument.  */
> -           tree final_len = NULL_TREE;
> -           tree bias = NULL_TREE;
>             if (loop_lens)
>               {
>                 opt_machine_mode new_ovmode
> @@ -10177,8 +10246,8 @@ vectorizable_load (vec_info *vinfo,
>    else if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        aggr_type = elem_type;
> -      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info,
> -                                    &bump, &vec_offset);
> +      vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> +                                    &bump, &vec_offset, loop_lens);
>      }
>    else
>      {
> @@ -10339,6 +10408,8 @@ vectorizable_load (vec_info *vinfo,
>         for (i = 0; i < vec_num; i++)
>           {
>             tree final_mask = NULL_TREE;
> +           tree final_len = NULL_TREE;
> +           tree bias = NULL_TREE;
>             if (loop_masks
>                 && memory_access_type != VMAT_INVARIANT)
>               final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> @@ -10368,8 +10439,42 @@ vectorizable_load (vec_info *vinfo,
>                         vec_offset = vec_offsets[vec_num * j + i];
>                       tree zero = build_zero_cst (vectype);
>                       tree scale = size_int (gs_info.scale);
> +
> +                     if (internal_gather_scatter_fn_supported_p (
> +                           IFN_LEN_MASK_GATHER_LOAD, vectype,
> +                           gs_info.memory_type, TREE_TYPE (vec_offset),
> +                           gs_info.scale))

Likewise.
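
(I.e. gs_info.ifn == IFN_LEN_MASK_GATHER_LOAD here, untested.)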

> +                       {
> +                         if (loop_lens)
> +                           {
> +                             final_len = vect_get_loop_len (
> +                               loop_vinfo, gsi, loop_lens, vec_num * ncopies,
> +                               vectype, vec_num * j + i, 1);
> +                           }
> +                         else
> +                           {
> +                             tree iv_type
> +                               = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
> +                             final_len = build_int_cst (
> +                               iv_type, TYPE_VECTOR_SUBPARTS (vectype));
> +                           }
> +                         signed char biasval
> +                           = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +                         bias = build_int_cst (intQI_type_node, biasval);
> +                         if (!final_mask)
> +                           {
> +                             mask_vectype = truth_type_for (vectype);
> +                             final_mask = build_minus_one_cst (mask_vectype);
> +                           }
> +                       }
> +
>                       gcall *call;
> -                     if (final_mask)
> +                     if (final_len && final_mask)
> +                       call = gimple_build_call_internal (
> +                         IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr,
> +                         vec_offset, scale, zero, final_len, bias,
> +                         final_mask);
> +                     else if (final_mask)
>                         call = gimple_build_call_internal
>                           (IFN_MASK_GATHER_LOAD, 5, dataref_ptr,
>                            vec_offset, scale, zero, final_mask);
> @@ -10462,9 +10567,6 @@ vectorizable_load (vec_info *vinfo,
>                   machine_mode vmode = TYPE_MODE (vectype);
>                   machine_mode new_vmode = vmode;
>                   internal_fn partial_ifn = IFN_LAST;
> -                 /* Produce 'len' and 'bias' argument.  */
> -                 tree final_len = NULL_TREE;
> -                 tree bias = NULL_TREE;
>                   if (loop_lens)
>                     {
>                       opt_machine_mode new_ovmode
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
