From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai> Hi, Richard and Richi. It seems that the implementation of LEN_MASK_GATHER_LOAD/LEN_MASK_SCATTER_STORE is simple and the code change is not big.
Here is an example: #include <stdint.h> void f (uint8_t *restrict a, uint8_t *restrict b, int n, int base, int step, int *restrict cond) { for (int i = 0; i < n; ++i) { if (cond[i]) a[i * step + base] = b[i * step + base]; } } With this patch: <bb 6> [local count: 84095460]: _58 = (unsigned int) base_19(D); _61 = (unsigned long) b_20(D); _63 = (unsigned long) a_21(D); vect_cst__105 = [vec_duplicate_expr] _58; _110 = (unsigned long) n_16(D); <bb 7> [local count: 504572759]: # vect_vec_iv_.8_95 = PHI <_96(7), { 0, 1, 2, ... }(6)> # vectp_cond.9_99 = PHI <vectp_cond.9_100(7), cond_17(D)(6)> # ivtmp_111 = PHI <ivtmp_112(7), _110(6)> _113 = .SELECT_VL (ivtmp_111, POLY_INT_CST [4, 4]); _96 = vect_vec_iv_.8_95 + { POLY_INT_CST [4, 4], ... }; ivtmp_98 = _113 * 4; vect__24.11_101 = .LEN_MASK_LOAD (vectp_cond.9_99, 32B, _113, { -1, ... }, 0); mask__14.12_103 = vect__24.11_101 != { 0, ... }; vect__59.13_104 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned int>(vect_vec_iv_.8_95); vect__60.14_106 = vect__59.13_104 + vect_cst__105; vect__12.15_107 = VIEW_CONVERT_EXPR<vector([4,4]) int>(vect__60.14_106); vect_patt_5.16_108 = .LEN_MASK_GATHER_LOAD (_61, vect__12.15_107, 4, { 0, ... }, _113, mask__14.12_103, 0); .LEN_MASK_SCATTER_STORE (_63, vect__12.15_107, 4, vect_patt_5.16_108, _113, mask__14.12_103, 0); vectp_cond.9_100 = vectp_cond.9_99 + ivtmp_98; ivtmp_112 = ivtmp_111 - _113; if (ivtmp_112 != 0) goto <bb 7>; [83.33%] else goto <bb 8>; [16.67%] gcc/ChangeLog: * optabs-query.cc (supports_vec_gather_load_p): Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE. (supports_vec_scatter_store_p): Ditto. * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto. * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto. (vectorizable_store): Ditto. (vectorizable_load): Ditto. 
--- gcc/optabs-query.cc | 2 + gcc/tree-vect-data-refs.cc | 18 ++++++++- gcc/tree-vect-stmts.cc | 81 +++++++++++++++++++++++++++++++++++++- 3 files changed, 98 insertions(+), 3 deletions(-) diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc index 2fdd0d34354..bf1f484e874 100644 --- a/gcc/optabs-query.cc +++ b/gcc/optabs-query.cc @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode) this_fn_optabs->supports_vec_gather_load[mode] = (supports_vec_convert_optab_p (gather_load_optab, mode) || supports_vec_convert_optab_p (mask_gather_load_optab, mode) + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode) ? 1 : -1); return this_fn_optabs->supports_vec_gather_load[mode] > 0; @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode) this_fn_optabs->supports_vec_scatter_store[mode] = (supports_vec_convert_optab_p (scatter_store_optab, mode) || supports_vec_convert_optab_p (mask_scatter_store_optab, mode) + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode) ? 1 : -1); return this_fn_optabs->supports_vec_scatter_store[mode] > 0; diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index ebe93832b1e..01016284c48 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p, return false; /* Work out which function we need. */ - internal_fn ifn, alt_ifn; + internal_fn ifn, alt_ifn, len_mask_ifn; if (read_p) { ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; alt_ifn = IFN_MASK_GATHER_LOAD; + /* When target supports LEN_MASK_GATHER_LOAD, we always + use LEN_MASK_GATHER_LOAD regardless whether len and + mask are valid or not. */ + len_mask_ifn = IFN_LEN_MASK_GATHER_LOAD; } else { ifn = masked_p ? 
IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; alt_ifn = IFN_MASK_SCATTER_STORE; + /* When target supports LEN_MASK_SCATTER_STORE, we always + use LEN_MASK_SCATTER_STORE regardless whether len and + mask are valid or not. */ + len_mask_ifn = IFN_LEN_MASK_SCATTER_STORE; } for (;;) @@ -3909,6 +3917,14 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p, *offset_vectype_out = offset_vectype; return true; } + else if (internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype, + memory_type, + offset_vectype, scale)) + { + *ifn_out = ifn; + *offset_vectype_out = offset_vectype; + return true; + } if (TYPE_PRECISION (offset_type) >= POINTER_SIZE && TYPE_PRECISION (offset_type) >= element_bits) diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 68faa8ead39..fa0387353cf 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -1771,6 +1771,17 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype, gs_info->offset_vectype, gs_info->scale)) { + internal_fn len_mask_ifn + = (is_load ? 
IFN_LEN_MASK_GATHER_LOAD : IFN_LEN_MASK_SCATTER_STORE); + if (internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype, + gs_info->memory_type, + gs_info->offset_vectype, + gs_info->scale)) + { + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1); + return; + } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "can't operate on partial vectors because" @@ -8930,7 +8941,40 @@ vectorizable_store (vec_info *vinfo, vec_offset = vec_offsets[vec_num * j + i]; tree scale = size_int (gs_info.scale); gcall *call; - if (final_mask) + if (internal_gather_scatter_fn_supported_p ( + IFN_LEN_MASK_SCATTER_STORE, vectype, + gs_info.memory_type, TREE_TYPE (vec_offset), + gs_info.scale)) + { + tree final_len = NULL_TREE; + tree bias = NULL_TREE; + if (loop_lens) + { + final_len + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, + vec_num * ncopies, vectype, + vec_num * j + i, 1); + } + else + { + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); + final_len + = build_int_cst (iv_type, + TYPE_VECTOR_SUBPARTS (vectype)); + } + signed char biasval + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + bias = build_int_cst (intQI_type_node, biasval); + if (!final_mask) + { + mask_vectype = truth_type_for (vectype); + final_mask = build_minus_one_cst (mask_vectype); + } + call = gimple_build_call_internal ( + IFN_LEN_MASK_SCATTER_STORE, 7, dataref_ptr, vec_offset, + scale, vec_oprnd, final_len, final_mask, bias); + } + else if (final_mask) call = gimple_build_call_internal (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset, scale, vec_oprnd, final_mask); @@ -10368,7 +10412,40 @@ vectorizable_load (vec_info *vinfo, tree zero = build_zero_cst (vectype); tree scale = size_int (gs_info.scale); gcall *call; - if (final_mask) + if (internal_gather_scatter_fn_supported_p ( + IFN_LEN_MASK_GATHER_LOAD, vectype, + gs_info.memory_type, TREE_TYPE (vec_offset), + gs_info.scale)) + { + tree 
final_len = NULL_TREE; + tree bias = NULL_TREE; + if (loop_lens) + { + final_len = vect_get_loop_len ( + loop_vinfo, gsi, loop_lens, vec_num * ncopies, + vectype, vec_num * j + i, 1); + } + else + { + tree iv_type + = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); + final_len = build_int_cst ( + iv_type, TYPE_VECTOR_SUBPARTS (vectype)); + } + signed char biasval + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); + bias = build_int_cst (intQI_type_node, biasval); + if (!final_mask) + { + mask_vectype = truth_type_for (vectype); + final_mask = build_minus_one_cst (mask_vectype); + } + call = gimple_build_call_internal ( + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr, + vec_offset, scale, zero, final_len, final_mask, + bias); + } + else if (final_mask) call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5, dataref_ptr, vec_offset, scale, zero, final_mask); -- 2.36.3