juzhe.zh...@rivai.ai writes: > From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai> > > Hi, Richi and Richard. > > This patch is adding LEN_MASK_{GATHER_LOAD,SCATTER_STORE} to allow targets > handle flow control by mask and loop control by length on gather/scatter > memory > operations. Consider this following case: > > #include <stdint.h> > void > f (uint8_t *restrict a, > uint8_t *restrict b, int n, > int base, int step, > int *restrict cond) > { > for (int i = 0; i < n; ++i) > { > if (cond[i]) > a[i * step + base] = b[i * step + base]; > } > } > > We hope RVV can vectorize such case into following IR: > > loop_len = SELECT_VL > control_mask = comparison > v = LEN_MASK_GATHER_LOAD (.., loop_len, control_mask, bias) > LEN_SCATTER_STORE (... v, ..., loop_len, control_mask, bias) > > This patch doesn't apply such patterns into vectorizer, just add patterns > and update the documents. > > Will send patch which apply such patterns into vectorizer soon after this > patch is approved. > > Thanks. > > --- > gcc/doc/md.texi | 17 ++++++++++++ > gcc/internal-fn.cc | 67 +++++++++++++++++++++++++++++++++++++++++++-- > gcc/internal-fn.def | 8 ++++-- > gcc/internal-fn.h | 1 + > gcc/optabs.def | 2 ++ > 5 files changed, 90 insertions(+), 5 deletions(-) > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi > index 9648fdc846a..df41b5251d4 100644 > --- a/gcc/doc/md.texi > +++ b/gcc/doc/md.texi > @@ -5040,6 +5040,15 @@ operand 5. Bit @var{i} of the mask is set if element > @var{i} > of the result should be loaded from memory and clear if element @var{i} > of the result should be set to zero. > > +@cindex @code{len_mask_gather_load@var{m}@var{n}} instruction pattern > +@item @samp{len_mask_gather_load@var{m}@var{n}} > +Like @samp{gather_load@var{m}@var{n}}, but takes an extra length operand > (operand 5), > +a mask operand (operand 6) as well as a bias operand (operand 7). Similar > to len_maskload, > +the instruction loads at most (operand 5 + operand 7) elements from memory. > +Bit @var{i} of the mask is set if element @var{i} of the result should > +be loaded from memory and clear if element @var{i} of the result should be > undefined. > +Mask elements @var{i} with i > (operand 5) are ignored.
Nit: second i should be @var{i} too. > + > @cindex @code{scatter_store@var{m}@var{n}} instruction pattern > @item @samp{scatter_store@var{m}@var{n}} > Store a vector of mode @var{m} into several distinct memory locations. > @@ -5069,6 +5078,14 @@ Like @samp{scatter_store@var{m}@var{n}}, but takes an > extra mask operand as > operand 5. Bit @var{i} of the mask is set if element @var{i} > of the result should be stored to memory. > > +@cindex @code{len_mask_scatter_store@var{m}@var{n}} instruction pattern > +@item @samp{len_mask_scatter_store@var{m}@var{n}} > +Like @samp{scatter_store@var{m}@var{n}}, but takes an extra length operand > (operand 5), > +a mask operand (operand 6) as well as a bias operand (operand 7). The > instruction stores > +at most (operand 5 + operand 7) elements of (operand 4) to memory. > +Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be > stored. > +Mask elements @var{i} with i > (operand 5) are ignored. Same here. > + > @cindex @code{vec_set@var{m}} instruction pattern > @item @samp{vec_set@var{m}} > Set given field in the vector value. Operand 0 is the vector to modify, > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 9017176dc7a..da3827481e9 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -3537,7 +3537,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall > *stmt, direct_optab optab) > HOST_WIDE_INT scale_int = tree_to_shwi (scale); > rtx rhs_rtx = expand_normal (rhs); > > - class expand_operand ops[6]; > + class expand_operand ops[8]; > int i = 0; > create_address_operand (&ops[i++], base_rtx); > create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE > (offset))); > @@ -3546,9 +3546,23 @@ expand_scatter_store_optab_fn (internal_fn, gcall > *stmt, direct_optab optab) > create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs))); > if (mask_index >= 0) > { > + if (optab == len_mask_scatter_store_optab) > + { > + tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn)); > + rtx len_rtx = expand_normal (len); > + create_convert_operand_from (&ops[i++], len_rtx, > + TYPE_MODE (TREE_TYPE (len)), > + TYPE_UNSIGNED (TREE_TYPE (len))); > + } > tree mask = gimple_call_arg (stmt, mask_index); > rtx mask_rtx = expand_normal (mask); > create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE > (mask))); > + if (optab == len_mask_scatter_store_optab) > + { > + tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1); > + rtx bias = expand_normal (biast); > + create_input_operand (&ops[i++], bias, QImode); > + } > } Guess this is personal preference, but IMO it would be more natural to have: if (optab == len_mask_scatter_store_optab) { ... } if (mask_index >= 0) { ... } if (optab == len_mask_scatter_store_optab) { ... } since that's the structure we'd need if LEN_GATHER_LOAD was a thing. Also... > > insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE > (rhs)), > @@ -3559,7 +3573,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall > *stmt, direct_optab optab) > /* Expand {MASK_,}GATHER_LOAD call CALL using optab OPTAB. */ > > static void > -expand_gather_load_optab_fn (internal_fn, gcall *stmt, direct_optab optab) > +expand_gather_load_optab_fn (internal_fn ifn, gcall *stmt, direct_optab > optab) > { > tree lhs = gimple_call_lhs (stmt); > tree base = gimple_call_arg (stmt, 0); > @@ -3572,7 +3586,7 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, > direct_optab optab) > HOST_WIDE_INT scale_int = tree_to_shwi (scale); > > int i = 0; > - class expand_operand ops[6]; > + class expand_operand ops[8]; > create_output_operand (&ops[i++], lhs_rtx, TYPE_MODE (TREE_TYPE (lhs))); > create_address_operand (&ops[i++], base_rtx); > create_input_operand (&ops[i++], offset_rtx, TYPE_MODE (TREE_TYPE > (offset))); > @@ -3584,6 +3598,20 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, > direct_optab optab) > rtx mask_rtx = expand_normal (mask); > create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE > (mask))); > } > + else if (optab == len_mask_gather_load_optab) > + { > + tree len = gimple_call_arg (stmt, internal_fn_len_index (ifn)); > + rtx len_rtx = expand_normal (len); > + create_convert_operand_from (&ops[i++], len_rtx, > + TYPE_MODE (TREE_TYPE (len)), > + TYPE_UNSIGNED (TREE_TYPE (len))); > + tree mask = gimple_call_arg (stmt, internal_fn_mask_index (ifn)); > + rtx mask_rtx = expand_normal (mask); > + create_input_operand (&ops[i++], mask_rtx, TYPE_MODE (TREE_TYPE > (mask))); > + tree biast = gimple_call_arg (stmt, gimple_call_num_args (stmt) - 1); > + rtx bias = expand_normal (biast); > + create_input_operand (&ops[i++], bias, QImode); > + } ...I think we've now got too many copies of this construct: the two above, plus expand_partial_load_optab_fn and expand_partial_store_optab_fn. It would be good to have a helper along the lines of: static unsigned int add_mask_and_len_args (expand_operand *opno, unsigned int opno, gcall *stmt) { ... } that makes use of internal_fn_len_index and internal_fn_mask_index. Don't shoot me, but I think we might have made a mistake by putting the mask between the length and the bias. It seems more natural to keep the length-related arguments consecutive, so that any bias argument is always immediately after any length argument. Would you mind doing a separate patch to swap the len_maskload/len_maskstore arguments around? “mask, len, bias” or “len, bias, mask” would be OK, although I suppose “len, bias, mask” fits the order in the name. > insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE > (lhs)), > TYPE_MODE (TREE_TYPE (offset))); > expand_insn (icode, i, ops); > @@ -4434,6 +4462,7 @@ internal_load_fn_p (internal_fn fn) > case IFN_MASK_LOAD_LANES: > case IFN_GATHER_LOAD: > case IFN_MASK_GATHER_LOAD: > + case IFN_LEN_MASK_GATHER_LOAD: > case IFN_LEN_LOAD: > case IFN_LEN_MASK_LOAD: > return true; > @@ -4455,6 +4484,7 @@ internal_store_fn_p (internal_fn fn) > case IFN_MASK_STORE_LANES: > case IFN_SCATTER_STORE: > case IFN_MASK_SCATTER_STORE: > + case IFN_LEN_MASK_SCATTER_STORE: > case IFN_LEN_STORE: > case IFN_LEN_MASK_STORE: > return true; > @@ -4473,8 +4503,10 @@ internal_gather_scatter_fn_p (internal_fn fn) > { > case IFN_GATHER_LOAD: > case IFN_MASK_GATHER_LOAD: > + case IFN_LEN_MASK_GATHER_LOAD: > case IFN_SCATTER_STORE: > case IFN_MASK_SCATTER_STORE: > + case IFN_LEN_MASK_SCATTER_STORE: > return true; > > default: > @@ -4504,6 +4536,34 @@ internal_fn_mask_index (internal_fn fn) > case IFN_LEN_MASK_STORE: > return 3; > > + case IFN_LEN_MASK_GATHER_LOAD: > + case IFN_LEN_MASK_SCATTER_STORE: > + return 5; > + > + default: > + return (conditional_internal_fn_code (fn) != ERROR_MARK > + || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1); > + } > +} > + > +/* If FN takes a vector len argument, return the index of that argument, > + otherwise return -1. */ > + > +int > +internal_fn_len_index (internal_fn fn) > +{ > + switch (fn) > + { > + case IFN_LEN_LOAD: > + case IFN_LEN_STORE: > + case IFN_LEN_MASK_LOAD: > + case IFN_LEN_MASK_STORE: > + return 2; > + > + case IFN_LEN_MASK_GATHER_LOAD: > + case IFN_LEN_MASK_SCATTER_STORE: > + return 4; > + > default: > return (conditional_internal_fn_code (fn) != ERROR_MARK > || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1); This default case isn't correct for this function. Thanks, Richard > @@ -4522,6 +4582,7 @@ internal_fn_stored_value_index (internal_fn fn) > case IFN_MASK_STORE_LANES: > case IFN_SCATTER_STORE: > case IFN_MASK_SCATTER_STORE: > + case IFN_LEN_MASK_SCATTER_STORE: > case IFN_LEN_STORE: > return 3; > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index bc947c0fde7..5be24decf88 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -48,14 +48,14 @@ along with GCC; see the file COPYING3. If not see > - mask_load: currently just maskload > - load_lanes: currently just vec_load_lanes > - mask_load_lanes: currently just vec_mask_load_lanes > - - gather_load: used for {mask_,}gather_load > + - gather_load: used for {mask_,len_mask_,}gather_load > - len_load: currently just len_load > - len_maskload: currently just len_maskload > > - mask_store: currently just maskstore > - store_lanes: currently just vec_store_lanes > - mask_store_lanes: currently just vec_mask_store_lanes > - - scatter_store: used for {mask_,}scatter_store > + - scatter_store: used for {mask_,len_mask_,}scatter_store > - len_store: currently just len_store > - len_maskstore: currently just len_maskstore > > @@ -157,6 +157,8 @@ DEF_INTERNAL_OPTAB_FN (MASK_LOAD_LANES, ECF_PURE, > DEF_INTERNAL_OPTAB_FN (GATHER_LOAD, ECF_PURE, gather_load, gather_load) > DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE, > mask_gather_load, gather_load) > +DEF_INTERNAL_OPTAB_FN (LEN_MASK_GATHER_LOAD, ECF_PURE, > + len_mask_gather_load, gather_load) > > DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load) > DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload) > @@ -164,6 +166,8 @@ DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, > len_maskload, len_maskload) > DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store) > DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0, > mask_scatter_store, scatter_store) > +DEF_INTERNAL_OPTAB_FN (LEN_MASK_SCATTER_STORE, 0, > + len_mask_scatter_store, scatter_store) > > DEF_INTERNAL_OPTAB_FN (MASK_STORE, 0, maskstore, mask_store) > DEF_INTERNAL_OPTAB_FN (STORE_LANES, ECF_CONST, vec_store_lanes, store_lanes) > diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h > index 8f21068e300..4234bbfed87 100644 > --- a/gcc/internal-fn.h > +++ b/gcc/internal-fn.h > @@ -234,6 +234,7 @@ extern bool internal_load_fn_p (internal_fn); > extern bool internal_store_fn_p (internal_fn); > extern bool internal_gather_scatter_fn_p (internal_fn); > extern int internal_fn_mask_index (internal_fn); > +extern int internal_fn_len_index (internal_fn); > extern int internal_fn_stored_value_index (internal_fn); > extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree, > tree, tree, int); > diff --git a/gcc/optabs.def b/gcc/optabs.def > index 9533eb11565..58933e61817 100644 > --- a/gcc/optabs.def > +++ b/gcc/optabs.def > @@ -95,8 +95,10 @@ OPTAB_CD(len_maskload_optab, "len_maskload$a$b") > OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b") > OPTAB_CD(gather_load_optab, "gather_load$a$b") > OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b") > +OPTAB_CD(len_mask_gather_load_optab, "len_mask_gather_load$a$b") > OPTAB_CD(scatter_store_optab, "scatter_store$a$b") > OPTAB_CD(mask_scatter_store_optab, "mask_scatter_store$a$b") > +OPTAB_CD(len_mask_scatter_store_optab, "len_mask_scatter_store$a$b") > OPTAB_CD(vec_extract_optab, "vec_extract$a$b") > OPTAB_CD(vec_init_optab, "vec_init$a$b")