juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>
>
> Hi, Richi and Richard.
>
> This patch is adding LEN_MASK_{GATHER_LOAD,SCATTER_STORE} to allow targets
> handle flow control by mask and loop control by length on gather/scatter 
> memory
> operations. Consider this following case:
>
> #include <stdint.h>
> void
> f (uint8_t *restrict a,
>    uint8_t *restrict b, int n,
>    int base, int step,
>    int *restrict cond)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       if (cond[i])
>         a[i * step + base] = b[i * step + base];
>     }
> }
>
> We hope RVV can vectorize such case into following IR:
>
> loop_len = SELECT_VL
> control_mask = comparison
> v = LEN_MASK_GATHER_LOAD (.., loop_len, control_mask, bias)
> LEN_SCATTER_STORE (... v, ..., loop_len, control_mask, bias)
>
> This patch doesn't apply such patterns into vectorizer, just add patterns
> and update the documents.
>
> Will send patch which apply such patterns into vectorizer soon after this
> patch is approved.
>
> Thanks.
>
> ---
>  gcc/doc/md.texi     | 17 ++++++++++++
>  gcc/internal-fn.cc  | 67 +++++++++++++++++++++++++++++++++++++++++++--
>  gcc/internal-fn.def |  8 ++++--
>  gcc/internal-fn.h   |  1 +
>  gcc/optabs.def      |  2 ++
>  5 files changed, 90 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 9648fdc846a..df41b5251d4 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5040,6 +5040,15 @@ operand 5.  Bit @var{i} of the mask is set if element 
> @var{i}
>  of the result should be loaded from memory and clear if element @var{i}
>  of the result should be set to zero.
>  
> +@cindex @code{len_mask_gather_load@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_gather_load@var{m}@var{n}}
> +Like @samp{gather_load@var{m}@var{n}}, but takes an extra length operand 
> (operand 5),
> +a mask operand (operand 6) as well as a bias operand (operand 7).  Similar 
> to len_maskload,
> +the instruction loads at most (operand 5 + operand 7) elements from memory.
> +Bit @var{i} of the mask is set if element @var{i} of the result should
> +be loaded from memory and clear if element @var{i} of the result should be 
> undefined.
> +Mask elements @var{i} with i > (operand 5) are ignored.

Nit: second i should be @var{i} too.

> +
>  @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
>  @item @samp{scatter_store@var{m}@var{n}}
>  Store a vector of mode @var{m} into several distinct memory locations.
> @@ -5069,6 +5078,14 @@ Like @samp{scatter_store@var{m}@var{n}}, but takes an 
> extra mask operand as
>  operand 5.  Bit @var{i} of the mask is set if element @var{i}
>  of the result should be stored to memory.
>  
> +@cindex @code{len_mask_scatter_store@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_scatter_store@var{m}@var{n}}
> +Like @samp{scatter_store@var{m}@var{n}}, but takes an extra length operand 
> (operand 5),
> +a mask operand (operand 6) as well as a bias operand (operand 7).  The 
> instruction stores
> +at most (operand 5 + operand 7) elements of (operand 4) to memory.
> +Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be 
> stored.
> +Mask elements @var{i} with i > (operand 5) are ignored.

Same here.

> +
>  @cindex @code{vec_set@var{m}} instruction pattern
>  @item @samp{vec_set@var{m}}
>  Set given field in the vector value.  Operand 0 is the vector to modify,
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 9017176dc7a..da3827481e9 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -3537,7 +3537,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall 
> *stmt, direct_optab optab)
>    HOST_WIDE_INT scale_int = tree_to_shwi (scale);
>    rtx rhs_rtx = expand_normal (rhs);
>  
> -  class expand_operand ops[6];
> +  class expand_operand ops[8];
>    int i = 0;
>    create_address_operand (&ops[i++], base_rtx);
>    create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE 
> (offset)));
> @@ -3546,9 +3546,23 @@ expand_scatter_store_optab_fn (internal_fn, gcall 
> *stmt, direct_optab optab)
>    create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
>    if (mask_index >= 0)
>      {
> +      if (optab == len_mask_scatter_store_optab)
> +     {
> +       tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn));
> +       rtx len_rtx = expand_normal (len);
> +       create_convert_operand_from (&ops[i++], len_rtx,
> +                                    TYPE_MODE (TREE_TYPE (len)),
> +                                    TYPE_UNSIGNED (TREE_TYPE (len)));
> +     }
>        tree mask = gimple_call_arg (stmt, mask_index);
>        rtx mask_rtx = expand_normal (mask);
>        create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE 
> (mask)));
> +      if (optab == len_mask_scatter_store_optab)
> +     {
> +       tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1);
> +       rtx bias = expand_normal (biast);
> +       create_input_operand (&ops[i++], bias, QImode);
> +     }
>      }

Guess this is personal preference, but IMO it would be more natural to have:

  if (optab == len_mask_scatter_store_optab)
    {
      ...
    }
  if (mask_index >= 0)
    {
      ...
    }
  if (optab == len_mask_scatter_store_optab)
    {
      ...
    }

since that's the structure we'd need if LEN_GATHER_LOAD was a thing.
Also...

>  
>    insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE 
> (rhs)),
> @@ -3559,7 +3573,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall 
> *stmt, direct_optab optab)
>  /* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB.  */
>  
>  static void
> -expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab)
> +expand_gather_load_optab_fn (internal_fn ifn, gcall *stmt, direct_optab 
> optab)
>  {
>    tree lhs = gimple_call_lhs (stmt);
>    tree base = gimple_call_arg (stmt, 0);
> @@ -3572,7 +3586,7 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, 
> direct_optab optab)
>    HOST_WIDE_INT scale_int = tree_to_shwi (scale);
>  
>    int i = 0;
> -  class expand_operand ops[6];
> +  class expand_operand ops[8];
>    create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs)));
>    create_address_operand (&ops[i++], base_rtx);
>    create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE 
> (offset)));
> @@ -3584,6 +3598,20 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, 
> direct_optab optab)
>        rtx mask_rtx = expand_normal (mask);
>        create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE 
> (mask)));
>      }
> +  else if (optab == len_mask_gather_load_optab)
> +    {
> +      tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn));
> +      rtx len_rtx = expand_normal (len);
> +      create_convert_operand_from (&ops[i++], len_rtx,
> +                                TYPE_MODE (TREE_TYPE (len)),
> +                                TYPE_UNSIGNED (TREE_TYPE (len)));
> +      tree mask = gimple_call_arg (stmt, internal_fn_mask_index (ifn));
> +      rtx mask_rtx = expand_normal (mask);
> +      create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE 
> (mask)));
> +      tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1);
> +      rtx bias = expand_normal (biast);
> +      create_input_operand (&ops[i++], bias, QImode);
> +    }

...I think we've now got too many copies of this construct: the two above,
plus expand_partial_load_optab_fn and expand_partial_store_optab_fn.
It would be good to have a helper along the lines of:

static unsigned int
add_mask_and_len_args (expand_operand *opno, unsigned int opno,
                       gcall *stmt)
{
  ...
}

that makes use of internal_fn_len_index and internal_fn_mask_index.

Don't shoot me, but I think we might have made a mistake by putting
the mask between the length and the bias.  It seems more natural
to keep the length-related arguments consecutive, so that any bias
argument is always immediately after any length argument.

Would you mind doing a separate patch to swap the len_maskload/len_maskstore
arguments around?  “mask, len, bias” or “len, bias, mask” would be OK,
although I suppose “len, bias, mask” fits the order in the name.

>    insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE 
> (lhs)),
>                                          TYPE_MODE (TREE_TYPE (offset)));
>    expand_insn (icode, i, ops);
> @@ -4434,6 +4462,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_MASK_LOAD_LANES:
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_GATHER_LOAD:
>      case IFN_LEN_LOAD:
>      case IFN_LEN_MASK_LOAD:
>        return true;
> @@ -4455,6 +4484,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_MASK_STORE_LANES:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
> +    case IFN_LEN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
>      case IFN_LEN_MASK_STORE:
>        return true;
> @@ -4473,8 +4503,10 @@ internal_gather_scatter_fn_p (internal_fn fn)
>      {
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_GATHER_LOAD:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
> +    case IFN_LEN_MASK_SCATTER_STORE:
>        return true;
>  
>      default:
> @@ -4504,6 +4536,34 @@ internal_fn_mask_index (internal_fn fn)
>      case IFN_LEN_MASK_STORE:
>        return 3;
>  
> +    case IFN_LEN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_SCATTER_STORE:
> +      return 5;
> +
> +    default:
> +      return (conditional_internal_fn_code (fn) != ERROR_MARK
> +           || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
> +    }
> +}
> +
> +/* If FN takes a vector len argument, return the index of that argument,
> +   otherwise return -1.  */
> +
> +int
> +internal_fn_len_index (internal_fn fn)
> +{
> +  switch (fn)
> +    {
> +    case IFN_LEN_LOAD:
> +    case IFN_LEN_STORE:
> +    case IFN_LEN_MASK_LOAD:
> +    case IFN_LEN_MASK_STORE:
> +      return 2;
> +
> +    case IFN_LEN_MASK_GATHER_LOAD:
> +    case IFN_LEN_MASK_SCATTER_STORE:
> +      return 4;
> +
>      default:
>        return (conditional_internal_fn_code (fn) != ERROR_MARK
>             || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);

This default case isn't correct for this function.

Thanks,
Richard

> @@ -4522,6 +4582,7 @@ internal_fn_stored_value_index (internal_fn fn)
>      case IFN_MASK_STORE_LANES:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
> +    case IFN_LEN_MASK_SCATTER_STORE:
>      case IFN_LEN_STORE:
>        return 3;
>  
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index bc947c0fde7..5be24decf88 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -48,14 +48,14 @@ along with GCC; see the file COPYING3.  If not see
>     - mask_load: currently just maskload
>     - load_lanes: currently just vec_load_lanes
>     - mask_load_lanes: currently just vec_mask_load_lanes
> -   - gather_load: used for {mask_,}gather_load
> +   - gather_load: used for {mask_,len_mask_,}gather_load
>     - len_load: currently just len_load
>     - len_maskload: currently just len_maskload
>  
>     - mask_store: currently just maskstore
>     - store_lanes: currently just vec_store_lanes
>     - mask_store_lanes: currently just vec_mask_store_lanes
> -   - scatter_store: used for {mask_,}scatter_store
> +   - scatter_store: used for {mask_,len_mask_,}scatter_store
>     - len_store: currently just len_store
>     - len_maskstore: currently just len_maskstore
>  
> @@ -157,6 +157,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE,
>  DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load)
>  DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
>                      mask_gather_load, gather_load)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_GATHER_LOAD, ECF_PURE,
> +                    len_mask_gather_load, gather_load)
>  
>  DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
>  DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
> @@ -164,6 +166,8 @@ DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, 
> len_maskload, len_maskload)
>  DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
>  DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
>                      mask_scatter_store, scatter_store)
> +DEF_INTERNAL_OPTAB_FN (LEN_MASK_SCATTER_STORE, 0,
> +                    len_mask_scatter_store, scatter_store)
>  
>  DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store)
>  DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes)
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 8f21068e300..4234bbfed87 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -234,6 +234,7 @@ extern bool internal_load_fn_p (internal_fn);
>  extern bool internal_store_fn_p (internal_fn);
>  extern bool internal_gather_scatter_fn_p (internal_fn);
>  extern int internal_fn_mask_index (internal_fn);
> +extern int internal_fn_len_index (internal_fn);
>  extern int internal_fn_stored_value_index (internal_fn);
>  extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
>                                                   tree, tree, int);
> diff --git a/gcc/optabs.def b/gcc/optabs.def
> index 9533eb11565..58933e61817 100644
> --- a/gcc/optabs.def
> +++ b/gcc/optabs.def
> @@ -95,8 +95,10 @@ OPTAB_CD(len_maskload_optab, "len_maskload$a$b")
>  OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b")
>  OPTAB_CD(gather_load_optab, "gather_load$a$b")
>  OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
> +OPTAB_CD(len_mask_gather_load_optab, "len_mask_gather_load$a$b")
>  OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
>  OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b")
> +OPTAB_CD(len_mask_scatter_store_optab, "len_mask_scatter_store$a$b")
>  OPTAB_CD(vec_extract_optab, "vec_extract$a$b")
>  OPTAB_CD(vec_init_optab, "vec_init$a$b")

Reply via email to