From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai> Hi, this patch adds loop length (len) control to extract_last auto-vectorization.
Consider the following case: #include <stdint.h> #define EXTRACT_LAST(TYPE) \ TYPE __attribute__ ((noinline, noclone)) \ test_##TYPE (TYPE *x, int n, TYPE value) \ { \ TYPE last; \ for (int j = 0; j < n; ++j) \ { \ last = x[j]; \ x[j] = last * value; \ } \ return last; \ } #define TEST_ALL(T) \ T (uint8_t) \ TEST_ALL (EXTRACT_LAST) ARM SVE IR: Preheader: max_mask_34 = .WHILE_ULT (0, bnd.5_6, { 0, ... }); Loop: ... # loop_mask_22 = PHI <next_mask_35(4), max_mask_34(3)> ... vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_mask_22); vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; .MASK_STORE (_7, 8B, loop_mask_22, vect__4.9_27); ... next_mask_35 = .WHILE_ULT (_1, bnd.5_6, { 0, ... }); ... Epilogue: _25 = .EXTRACT_LAST (loop_mask_22, vect_last_12.8_23); For RVV, since we prefer length (len) for loop control, the IR after this patch is: Loop: ... loop_len_22 = SELECT_VL; vect_last_12.8_23 = .MASK_LOAD (_7, 8B, loop_len_22); vect__4.9_27 = vect_last_12.8_23 * vect_cst__26; .MASK_STORE (_7, 8B, loop_len_22, vect__4.9_27); ... Epilogue: _25 = .EXTRACT_LAST (loop_len_22, vect_last_12.8_23); This patch does not add a new pattern for length-based loop control of extract_last; instead, it reuses the existing extract_last pattern. Here is the code: Step 1 - Enable length and record length for extract_last: + machine_mode vec_mode = TYPE_MODE (vectype); + if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode)) + vect_record_loop_len (loop_vinfo, + &LOOP_VINFO_LENS (loop_vinfo), 1, + vectype, 1); + else + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), 1, + vectype, NULL); We use 'get_len_load_store_mode' to check whether the target supports length-based loop control. If it does, a loop len is recorded; otherwise a loop mask is recorded as before.
Step 2 - Build EXTRACT_LAST with len: - tree mask = vect_get_loop_mask (loop_vinfo, gsi, - &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype, 0); + tree control; + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + control = vect_get_loop_len (loop_vinfo, gsi, + &LOOP_VINFO_LENS (loop_vinfo), 1, + vectype, 0, 0); + else + control = vect_get_loop_mask (loop_vinfo, gsi, + &LOOP_VINFO_MASKS (loop_vinfo), 1, + vectype, 0); tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, - mask, vec_lhs_phi); + control, vec_lhs_phi); We reuse the existing code (which builds EXTRACT_LAST with a mask) and pass the loop length instead when 'LOOP_VINFO_FULLY_WITH_LENGTH_P' is true. This patch has been fully tested on the RISC-V port. Bootstrap and regression testing on x86 passed. OK for trunk? gcc/ChangeLog: * tree-vect-loop.cc (vectorizable_live_operation): Add length control. --- gcc/tree-vect-loop.cc | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 00058c3c13e..fde098cafde 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10311,9 +10311,15 @@ vectorizable_live_operation (vec_info *vinfo, else { gcc_assert (ncopies == 1 && !slp_node); - vect_record_loop_mask (loop_vinfo, - &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype, NULL); + machine_mode vec_mode = TYPE_MODE (vectype); + if (get_len_load_store_mode (vec_mode, true).exists (&vec_mode)) + vect_record_loop_len (loop_vinfo, + &LOOP_VINFO_LENS (loop_vinfo), 1, + vectype, 1); + else + vect_record_loop_mask (loop_vinfo, + &LOOP_VINFO_MASKS (loop_vinfo), 1, + vectype, NULL); } } /* ??? Enable for loop costing as well. 
*/ @@ -10339,7 +10345,9 @@ vectorizable_live_operation (vec_info *vinfo, gimple *vec_stmt; if (slp_node) { - gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); + gcc_assert (!loop_vinfo + || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + || !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)); /* Get the correct slp vectorized stmt. */ vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry]; @@ -10383,21 +10391,29 @@ vectorizable_live_operation (vec_info *vinfo, gimple_seq stmts = NULL; tree new_tree; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) + || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) { /* Emit: - SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> + SCALAR_RES = EXTRACT_LAST <VEC_LHS, CONTROL> - where VEC_LHS is the vectorized live-out result and MASK is - the loop mask for the final iteration. */ + where VEC_LHS is the vectorized live-out result and CONTROL can + be either the loop mask for the final iteration or the loop len + for the final iteration. */ gcc_assert (ncopies == 1 && !slp_node); tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); - tree mask = vect_get_loop_mask (loop_vinfo, gsi, - &LOOP_VINFO_MASKS (loop_vinfo), - 1, vectype, 0); + tree control; + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + control = vect_get_loop_len (loop_vinfo, gsi, + &LOOP_VINFO_LENS (loop_vinfo), 1, + vectype, 0, 0); + else + control = vect_get_loop_mask (loop_vinfo, gsi, + &LOOP_VINFO_MASKS (loop_vinfo), 1, + vectype, 0); tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, - mask, vec_lhs_phi); + control, vec_lhs_phi); /* Convert the extracted vector element to the scalar type. */ new_tree = gimple_convert (&stmts, lhs_type, scalar_res); -- 2.36.1