On Wed, 9 Aug 2023, juzhe.zh...@rivai.ai wrote: > From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai> > > Hi, this patch is adding loop len control on extract_last autovectorization. > > Consider this following case: > > #include <stdint.h> > > #define EXTRACT_LAST(TYPE) \ > TYPE __attribute__ ((noinline, noclone)) \ > test_##TYPE (TYPE *x, int n, TYPE value) \ > { \ > TYPE last; \ > for (int j = 0; j < n; ++j) \ > { \ > last = x[j]; \ > x[j] = last * value; \ > } \ > return last; \ > } > > #define TEST_ALL(T) \ > T (uint8_t) \ > > TEST_ALL (EXTRACT_LAST) > > ARM SVE IR: > > Preheader: > max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... }); > > Loop: > ... > # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> > ... > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); > ... > next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); > ... > > Epilogue: > _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); > > For RVV since we prefer len in loop control, after this patch for RVV: > > Loop: > ... > loop_len_22 = SELECT_VL; > vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22); > vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; > .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27); > ... > > Epilogue: > _25 = .EXTRACT_LAST (loop_len_22, vect_last_12.8_23); > > This patch didn't add a new pattern for length loop control of extract_last. > Instead we reuse current extract_last. > > Here is the code: > > Step 1 - Enable length and record length for extract_last: > > + machine_mode vec_mode = TYPE_MODE (vectype); > + if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode)) > + vect_record_loop_len (loop_vinfo, > + &LOOP_VINFO_LENS (loop_vinfo), 1, > + vectype, 1); > + else > + vect_record_loop_mask (loop_vinfo, > + &LOOP_VINFO_MASKS (loop_vinfo), 1, > + vectype, NULL); > > We use 'get_len_load_store_mode' to check whether targets support loop len > control or not. 
> If yes, record a loop len. > > Step 2 - Build EXTRACT_LAST with len: > > - tree mask = vect_get_loop_mask (loop_vinfo, gsi, > - &LOOP_VINFO_MASKS (loop_vinfo), > - 1, vectype, 0); > + tree control; > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > + control = vect_get_loop_len (loop_vinfo, gsi, > + &LOOP_VINFO_LENS (loop_vinfo), 1, > + vectype, 0, 0); > + else > + control = vect_get_loop_mask (loop_vinfo, gsi, > + &LOOP_VINFO_MASKS (loop_vinfo), 1, > + vectype, 0); > tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, > - mask, vec_lhs_phi); > + control, vec_lhs_phi); > > Reuse the current codes (build EXTRACT_LAST with mask), build length instead > if > 'LOOP_VINFO_FULLY_WITH_LENGTH_P' is true. > > This patch has been fully tested in RISC-V port. > > Bootstrap and Regression on X86 passed. > > Ok for trunk ? > > gcc/ChangeLog: > > * tree-vect-loop.cc (vectorizable_live_operation): Add length control. > > --- > gcc/tree-vect-loop.cc | 40 ++++++++++++++++++++++++++++------------ > 1 file changed, 28 insertions(+), 12 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 00058c3c13e..fde098cafde 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -10311,9 +10311,15 @@ vectorizable_live_operation (vec_info *vinfo, > else > { > gcc_assert (ncopies == 1 && !slp_node); > - vect_record_loop_mask (loop_vinfo, > - &LOOP_VINFO_MASKS (loop_vinfo), > - 1, vectype, NULL); > + machine_mode vec_mode = TYPE_MODE (vectype); > + if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode)) > + vect_record_loop_len (loop_vinfo, > + &LOOP_VINFO_LENS (loop_vinfo), 1, > + vectype, 1); > + else > + vect_record_loop_mask (loop_vinfo, > + &LOOP_VINFO_MASKS (loop_vinfo), 1, > + vectype, NULL); > } > } > /* ??? Enable for loop costing as well. 
*/ > @@ -10339,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, > gimple *vec_stmt; > if (slp_node) > { > - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); > + gcc_assert (!loop_vinfo > + || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > + || !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo));
That should be || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)), I think. As written, the assert can only fail when the loop is both fully masked and fully length-controlled at the same time, so it does not reject either mode on its own. It seems to imply that SLP isn't supported with masking or length control. > > /* Get the correct slp vectorized stmt. */ > vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; > @@ -10383,21 +10391,29 @@ vectorizable_live_operation (vec_info *vinfo, > > gimple_seq stmts = NULL; > tree new_tree; > - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) > + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > + || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > { > /* Emit: > > - SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> > + SCALAR_RES = EXTRACT_LAST <VEC_LHS, CONTROL> > > - where VEC_LHS is the vectorized live-out result and MASK is > - the loop mask for the final iteration. */ > + where VEC_LHS is the vectorized live-out result and CONTROL can > + be either the loop mask for the final iteration or the loop len > + for the final iteration. */ > gcc_assert (ncopies == 1 && !slp_node); > tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); > - tree mask = vect_get_loop_mask (loop_vinfo, gsi, > - &LOOP_VINFO_MASKS (loop_vinfo), > - 1, vectype, 0); > + tree control; > + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) > + control = vect_get_loop_len (loop_vinfo, gsi, > + &LOOP_VINFO_LENS (loop_vinfo), 1, > + vectype, 0, 0); > + else > + control = vect_get_loop_mask (loop_vinfo, gsi, > + &LOOP_VINFO_MASKS (loop_vinfo), 1, > + vectype, 0); > tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, > - mask, vec_lhs_phi); > + control, vec_lhs_phi); Hmm, how does CFN_EXTRACT_LAST handle both mask and length transparently? Don't you need some CFN_LEN_EXTRACT_LAST instead? > /* Convert the extracted vector element to the scalar type. */ > new_tree = gimple_convert (&stmts, lhs_type, scalar_res); > Otherwise looks OK to me. Thanks, Richard.