From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai>

Targets like ARM SVE in GCC have an elegant way to handle both loop
control and flow control simultaneously:

  loop_control_mask = WHILE_ULT
  flow_control_mask = comparison
  control_mask = loop_control_mask & flow_control_mask;
  MASK_LOAD (control_mask)
  MASK_STORE (control_mask)

However, targets like RVV (RISC-V Vector) cannot use this approach in
auto-vectorization, since RVV uses a length in loop control.  This patch
adds LEN_MASK_LOAD/STORE to support flow control for targets like RISC-V
that use a length in loop control.

Loads and stores are normalized into LEN_MASK_LOAD/STORE as long as
either the length or the mask is valid.  The length is the outcome of
SELECT_VL or MIN_EXPR; the mask is the outcome of a comparison.

The LEN_MASK_LOAD/STORE format is defined as follows:
  1) LEN_MASK_LOAD (ptr, align, length, mask).
  2) LEN_MASK_STORE (ptr, align, length, mask, vec).
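To pin down the intended semantics, a scalar C sketch of the store form
(illustrative only; ELEM_T and NUNITS are placeholder names, not part of
this patch):

  /* Model of LEN_MASK_STORE (ptr, align, length, mask, vec):
     element I is written iff I < length and mask[I] is set.  */
  void
  len_mask_store_model (ELEM_T *ptr, unsigned length,
                        const unsigned char *mask, const ELEM_T *vec)
  {
    for (unsigned i = 0; i < NUNITS; i++)
      if (i < length && mask[i])
        ptr[i] = vec[i];
  }

The load form is analogous: inactive elements are not read from memory.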
Consider the following 4 cases:

  VLA: Variable-length auto-vectorization
  VLS: Specific-length auto-vectorization

Case 1 (VLS, -mrvv-vector-bits=128):

  Code:
    for (int i = 0; i < 4; i++)
      a[i] = b[i] + c[i];

  IR (does not use LEN_MASK_*):
    v1 = MEM (...)
    v2 = MEM (...)
    v3 = v1 + v2
    MEM[...] = v3

Case 2 (VLS, -mrvv-vector-bits=128):

  Code:
    for (int i = 0; i < 4; i++)
      if (cond[i])
        a[i] = b[i] + c[i];

  IR (LEN_MASK_* with length = VF, mask = comparison):
    mask = comparison
    v1 = LEN_MASK_LOAD (length = VF, mask)
    v2 = LEN_MASK_LOAD (length = VF, mask)
    v3 = v1 + v2
    LEN_MASK_STORE (length = VF, mask, v3)

Case 3 (VLA):

  Code:
    for (int i = 0; i < n; i++)
      a[i] = b[i] + c[i];

  IR:
    loop_len = SELECT_VL or MIN
    v1 = LEN_MASK_LOAD (length = loop_len, mask = {-1,-1,...})
    v2 = LEN_MASK_LOAD (length = loop_len, mask = {-1,-1,...})
    v3 = v1 + v2
    LEN_MASK_STORE (length = loop_len, mask = {-1,-1,...}, v3)

Case 4 (VLA):

  Code:
    for (int i = 0; i < n; i++)
      if (cond[i])
        a[i] = b[i] + c[i];

  IR:
    loop_len = SELECT_VL or MIN
    mask = comparison
    v1 = LEN_MASK_LOAD (length = loop_len, mask)
    v2 = LEN_MASK_LOAD (length = loop_len, mask)
    v3 = v1 + v2
    LEN_MASK_STORE (length = loop_len, mask, v3)

More features:

1. Support gimple folding for LEN_MASK_LOAD/STORE:
     LEN_MASK_STORE (length = vf, mask = {-1,-1,...}, v)
   ===>
     MEM[...] = v

2. Allow DSE for LEN_MASK_LOAD/STORE.

Bootstrapped && regression tested on X86 with no surprising differences.
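For reference, a loop of the case-4 shape; on a target providing the new
optabs it should now if-convert and vectorize via LEN_MASK_LOAD/STORE
(hypothetical test, not part of this patch):

  void
  foo (int *restrict a, int *restrict b, int *restrict c,
       int *restrict cond, int n)
  {
    for (int i = 0; i < n; i++)
      if (cond[i])
        a[i] = b[i] + c[i];
  }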
gcc/ChangeLog:

	* doc/md.texi: Add LEN_MASK_LOAD/STORE.
	* genopinit.cc (main): Ditto.
	(CMP_NAME): Ditto.
	* gimple-fold.cc (arith_overflowed_p): Ditto.
	(gimple_fold_partial_load_store_mem_ref): Ditto.
	(gimple_fold_partial_store): Ditto.
	(gimple_fold_call): Ditto.
	* internal-fn.cc (len_maskload_direct): Ditto.
	(len_maskstore_direct): Ditto.
	(expand_partial_load_optab_fn): Ditto.
	(expand_len_maskload_optab_fn): Ditto.
	(expand_partial_store_optab_fn): Ditto.
	(expand_len_maskstore_optab_fn): Ditto.
	(direct_len_maskload_optab_supported_p): Ditto.
	(direct_len_maskstore_optab_supported_p): Ditto.
	(internal_load_fn_p): Ditto.
	(internal_store_fn_p): Ditto.
	(internal_fn_mask_index): Ditto.
	(internal_fn_stored_value_index): Ditto.
	* internal-fn.def (LEN_MASK_LOAD): Ditto.
	(LEN_MASK_STORE): Ditto.
	* optabs-query.cc (can_vec_len_mask_load_store_p): Ditto.
	* optabs-query.h (can_vec_len_mask_load_store_p): Ditto.
	* optabs.def (OPTAB_CD): Ditto.
	* tree-data-ref.cc (get_references_in_stmt): Ditto.
	* tree-if-conv.cc (ifcvt_can_use_mask_load_store): Ditto.
	* tree-ssa-alias.cc (ref_maybe_used_by_call_p_1): Ditto.
	(call_may_clobber_ref_p_1): Ditto.
	* tree-ssa-dse.cc (initialize_ao_ref_for_dse): Ditto.
	(dse_optimize_stmt): Ditto.
	* tree-ssa-loop-ivopts.cc (get_mem_type_for_internal_fn): Ditto.
	(get_alias_ptr_type_for_ptr_address): Ditto.
	* tree-ssa-sccvn.cc (vn_reference_lookup_3): Ditto.
	* tree-vect-data-refs.cc (can_group_stmts_p): Ditto.
	(vect_find_stmt_data_reference): Ditto.
	(vect_supportable_dr_alignment): Ditto.
	* tree-vect-loop.cc (vect_verify_loop_lens): Ditto.
	(optimize_mask_stores): Ditto.
	* tree-vect-slp.cc (vect_get_operand_map): Ditto.
	(vect_build_slp_tree_2): Ditto.
	* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
	(vectorizable_store): Ditto.
	(vectorizable_load): Ditto.
	(vect_get_vector_types_for_stmt): Ditto.
	* tree-vectorizer.cc (try_vectorize_loop_1): Ditto.
---
 gcc/doc/md.texi             | 32 ++++++++++++
 gcc/genopinit.cc            |  6 ++-
 gcc/gimple-fold.cc          | 28 ++++++++---
 gcc/internal-fn.cc          | 37 +++++++++++++-
 gcc/internal-fn.def         |  4 ++
 gcc/optabs-query.cc         | 39 +++++++++++++++
 gcc/optabs-query.h          |  1 +
 gcc/optabs.def              |  2 +
 gcc/tree-data-ref.cc        |  4 ++
 gcc/tree-if-conv.cc         |  3 ++
 gcc/tree-ssa-alias.cc       |  3 ++
 gcc/tree-ssa-dse.cc         | 12 +++++
 gcc/tree-ssa-loop-ivopts.cc |  8 +++
 gcc/tree-ssa-sccvn.cc       |  6 +++
 gcc/tree-vect-data-refs.cc  | 20 +++++---
 gcc/tree-vect-loop.cc       | 52 +++++++++++--------
 gcc/tree-vect-slp.cc        |  5 ++
 gcc/tree-vect-stmts.cc      | 99 ++++++++++++++++++++++++++++++++++---
 gcc/tree-vectorizer.cc      |  2 +
 19 files changed, 320 insertions(+), 43 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 95f7fe1f802..fc99990465d 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5136,6 +5136,38 @@ of @code{QI} elements.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{len_maskload@var{m}@var{n}} instruction pattern
+@item @samp{len_maskload@var{m}@var{n}}
+Perform a vector load, predicated by both length and mask, from memory
+operand 1 of mode @var{m} into register operand 0.
+The length is provided in operand 2, which has whichever
+integer mode the target prefers.
+The mask is provided in register operand 3 of mode @var{n}.
+
+Operand 2 can be a variable or a constant amount.  It may be the
+vectorization factor, a special constant value representing the maximum length.
+
+Operand 3 can be a variable or a constant.  It may be all ones,
+the special constant value representing the full mask.
+
+This pattern is not allowed to @code{FAIL}.
+
+@cindex @code{len_maskstore@var{m}@var{n}} instruction pattern
+@item @samp{len_maskstore@var{m}@var{n}}
+Perform a vector store, predicated by both length and mask, from register
+operand 1 of mode @var{m} into memory operand 0.
+The length is provided in operand 2, which has whichever
+integer mode the target prefers.
+The mask is provided in register operand 3 of mode @var{n}.
+
+Operand 2 can be a variable or a constant amount.  It may be the
+vectorization factor, a special constant value representing the maximum length.
+
+Operand 3 can be a variable or a constant.  It may be all ones,
+the special constant value representing the full mask.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{vec_perm@var{m}} instruction pattern
 @item @samp{vec_perm@var{m}}
 Output a (variable) vector permutation.  Operand 0 is the destination
diff --git a/gcc/genopinit.cc b/gcc/genopinit.cc
index 0c1b6859ca0..9aeebd66724 100644
--- a/gcc/genopinit.cc
+++ b/gcc/genopinit.cc
@@ -376,7 +376,8 @@ main (int argc, const char **argv)
 
   fprintf (s_file,
	   "/* Returns TRUE if the target supports any of the partial vector\n"
-	   "   optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
+	   "   optabs: while_ult_optab, len_load_optab, len_store_optab,\n"
+	   "   len_maskload_optab or len_maskstore_optab,\n"
	   "   for any mode.  */\n"
*/\n" "bool\npartial_vectors_supported_p (void)\n{\n"); bool any_match = false; @@ -386,7 +387,8 @@ main (int argc, const char **argv) { #define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N))) if (CMP_NAME("while_ult") || CMP_NAME ("len_load") - || CMP_NAME ("len_store")) + || CMP_NAME ("len_store") || CMP_NAME ("len_maskload") + || CMP_NAME ("len_maskstore")) { if (first) fprintf (s_file, " HAVE_%s", p->name); diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc index 581575b65ec..a2c2ad5bfe7 100644 --- a/gcc/gimple-fold.cc +++ b/gcc/gimple-fold.cc @@ -5370,8 +5370,8 @@ arith_overflowed_p (enum tree_code code, const_tree type, return wi::min_precision (wres, sign) > TYPE_PRECISION (type); } -/* If IFN_{MASK,LEN}_LOAD/STORE call CALL is unconditional, return a MEM_REF - for the memory it references, otherwise return null. VECTYPE is the +/* If IFN_{MASK,LEN,LEN_MASK}_LOAD/STORE call CALL is unconditional, return a + MEM_REF for the memory it references, otherwise return null. VECTYPE is the type of the memory vector. MASK_P indicates it's for MASK if true, otherwise it's for LEN. */ @@ -5383,7 +5383,20 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p) if (!tree_fits_uhwi_p (alias_align)) return NULL_TREE; - if (mask_p) + if (gimple_call_internal_fn (call) == IFN_LEN_MASK_LOAD + || gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE) + { + tree basic_len = gimple_call_arg (call, 2); + if (!poly_int_tree_p (basic_len)) + return NULL_TREE; + if (maybe_ne (tree_to_poly_uint64 (basic_len), + TYPE_VECTOR_SUBPARTS (vectype))) + return NULL_TREE; + tree mask = gimple_call_arg (call, 3); + if (!integer_all_onesp (mask)) + return NULL_TREE; + } + else if (mask_p) { tree mask = gimple_call_arg (call, 2); if (!integer_all_onesp (mask)) @@ -5409,7 +5422,7 @@ gimple_fold_partial_load_store_mem_ref (gcall *call, tree vectype, bool mask_p) return fold_build2 (MEM_REF, vectype, ptr, offset); } -/* Try to fold IFN_{MASK,LEN}_LOAD call CALL. Return true on success. +/* Try to fold IFN_{MASK,LEN,LEN_MASK}_LOAD call CALL. Return true on success. MASK_P indicates it's for MASK if true, otherwise it's for LEN. */ static bool @@ -5431,14 +5444,15 @@ gimple_fold_partial_load (gimple_stmt_iterator *gsi, gcall *call, bool mask_p) return false; } -/* Try to fold IFN_{MASK,LEN}_STORE call CALL. Return true on success. +/* Try to fold IFN_{MASK,LEN,LEN_MASK}_STORE call CALL. Return true on success. MASK_P indicates it's for MASK if true, otherwise it's for LEN. */ static bool gimple_fold_partial_store (gimple_stmt_iterator *gsi, gcall *call, bool mask_p) { - tree rhs = gimple_call_arg (call, 3); + tree rhs = gimple_call_arg ( + call, gimple_call_internal_fn (call) == IFN_LEN_MASK_STORE ? 
   if (tree lhs = gimple_fold_partial_load_store_mem_ref (call, TREE_TYPE (rhs),
							  mask_p))
     {
@@ -5659,9 +5673,11 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace)
	  cplx_result = true;
	  break;
	case IFN_MASK_LOAD:
+	case IFN_LEN_MASK_LOAD:
	  changed |= gimple_fold_partial_load (gsi, stmt, true);
	  break;
	case IFN_MASK_STORE:
+	case IFN_LEN_MASK_STORE:
	  changed |= gimple_fold_partial_store (gsi, stmt, true);
	  break;
	case IFN_LEN_LOAD:
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index da9b944dd5d..4a9fe388eed 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -165,6 +165,7 @@ init_internal_fns ()
 #define mask_load_lanes_direct { -1, -1, false }
 #define gather_load_direct { 3, 1, false }
 #define len_load_direct { -1, -1, false }
+#define len_maskload_direct { -1, 3, false }
 #define mask_store_direct { 3, 2, false }
 #define store_lanes_direct { 0, 0, false }
 #define mask_store_lanes_direct { 0, 0, false }
@@ -172,6 +173,7 @@ init_internal_fns ()
 #define vec_cond_direct { 2, 0, false }
 #define scatter_store_direct { 3, 1, false }
 #define len_store_direct { 3, 3, false }
+#define len_maskstore_direct { 4, 3, false }
 #define vec_set_direct { 3, 3, false }
 #define unary_direct { 0, 0, true }
 #define unary_convert_direct { -1, 0, true }
@@ -2875,6 +2877,17 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
       create_input_operand (&ops[3], bias, QImode);
       expand_insn (icode, 4, ops);
     }
+  else if (optab == len_maskload_optab)
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      maskt = gimple_call_arg (stmt, 3);
+      mask = expand_normal (maskt);
+      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      icode = convert_optab_handler (optab, TYPE_MODE (type),
+				     TYPE_MODE (TREE_TYPE (maskt)));
+      expand_insn (icode, 4, ops);
+    }
   else
     {
       create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
@@ -2888,6 +2901,7 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 #define expand_mask_load_optab_fn expand_partial_load_optab_fn
 #define expand_mask_load_lanes_optab_fn expand_mask_load_optab_fn
 #define expand_len_load_optab_fn expand_partial_load_optab_fn
+#define expand_len_maskload_optab_fn expand_partial_load_optab_fn
 
 /* Expand MASK_STORE{,_LANES} or LEN_STORE call STMT using optab OPTAB.  */
 
@@ -2900,7 +2914,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
   insn_code icode;
 
   maskt = gimple_call_arg (stmt, 2);
-  rhs = gimple_call_arg (stmt, 3);
+  rhs = gimple_call_arg (stmt, optab == len_maskstore_optab ? 4 : 3);
   type = TREE_TYPE (rhs);
   lhs = expand_call_mem_ref (type, stmt, 0);
@@ -2927,6 +2941,16 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
       create_input_operand (&ops[3], bias, QImode);
       expand_insn (icode, 4, ops);
     }
+  else if (optab == len_maskstore_optab)
+    {
+      create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+				   TYPE_UNSIGNED (TREE_TYPE (maskt)));
+      maskt = gimple_call_arg (stmt, 3);
+      mask = expand_normal (maskt);
+      create_input_operand (&ops[3], mask, TYPE_MODE (TREE_TYPE (maskt)));
+      icode = convert_optab_handler (optab, TYPE_MODE (type), GET_MODE (mask));
+      expand_insn (icode, 4, ops);
+    }
   else
     {
       create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
@@ -2937,6 +2961,7 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 #define expand_mask_store_optab_fn expand_partial_store_optab_fn
 #define expand_mask_store_lanes_optab_fn expand_mask_store_optab_fn
 #define expand_len_store_optab_fn expand_partial_store_optab_fn
+#define expand_len_maskstore_optab_fn expand_partial_store_optab_fn
 
 /* Expand VCOND, VCONDU and VCONDEQ optab internal functions.
    The expansion of STMT happens based on OPTAB table associated.  */
@@ -3890,6 +3915,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_mask_load_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_gather_load_optab_supported_p convert_optab_supported_p
 #define direct_len_load_optab_supported_p direct_optab_supported_p
+#define direct_len_maskload_optab_supported_p convert_optab_supported_p
 #define direct_mask_store_optab_supported_p convert_optab_supported_p
 #define direct_store_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
@@ -3897,6 +3923,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_vec_cond_optab_supported_p convert_optab_supported_p
 #define direct_scatter_store_optab_supported_p convert_optab_supported_p
 #define direct_len_store_optab_supported_p direct_optab_supported_p
+#define direct_len_maskstore_optab_supported_p convert_optab_supported_p
 #define direct_while_optab_supported_p convert_optab_supported_p
 #define direct_fold_extract_optab_supported_p direct_optab_supported_p
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
@@ -4361,6 +4388,7 @@ internal_load_fn_p (internal_fn fn)
     case IFN_GATHER_LOAD:
     case IFN_MASK_GATHER_LOAD:
     case IFN_LEN_LOAD:
+    case IFN_LEN_MASK_LOAD:
       return true;
 
     default:
@@ -4381,6 +4409,7 @@ internal_store_fn_p (internal_fn fn)
     case IFN_SCATTER_STORE:
     case IFN_MASK_SCATTER_STORE:
     case IFN_LEN_STORE:
+    case IFN_LEN_MASK_STORE:
       return true;
 
     default:
@@ -4420,6 +4449,10 @@ internal_fn_mask_index (internal_fn fn)
     case IFN_MASK_STORE_LANES:
       return 2;
 
+    case IFN_LEN_MASK_LOAD:
+    case IFN_LEN_MASK_STORE:
+      return 3;
+
     case IFN_MASK_GATHER_LOAD:
     case IFN_MASK_SCATTER_STORE:
       return 4;
@@ -4444,6 +4477,8 @@ internal_fn_stored_value_index (internal_fn fn)
     case IFN_MASK_SCATTER_STORE:
     case IFN_LEN_STORE:
       return 3;
+    case IFN_LEN_MASK_STORE:
+      return 4;
 
     default:
       return -1;
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 5d638de6d06..cf0bcea5ac7 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -50,12 +50,14 @@ along with GCC; see the file COPYING3.  If not see
   - mask_load_lanes: currently just vec_mask_load_lanes
   - gather_load: used for {mask_,}gather_load
   - len_load: currently just len_load
+  - len_maskload: currently just len_maskload
   - mask_store: currently just maskstore
   - store_lanes: currently just vec_store_lanes
   - mask_store_lanes: currently just vec_mask_store_lanes
   - scatter_store: used for {mask_,}scatter_store
   - len_store: currently just len_store
+  - len_maskstore: currently just len_maskstore
 
   - unary: a normal unary optab, such as vec_reverse_<mode>
   - binary: a normal binary optab, such as vec_interleave_lo_<mode>
@@ -157,6 +159,7 @@ DEF_INTERNAL_OPTAB_FN (MASK_GATHER_LOAD, ECF_PURE,
		       mask_gather_load, gather_load)
 
 DEF_INTERNAL_OPTAB_FN (LEN_LOAD, ECF_PURE, len_load, len_load)
+DEF_INTERNAL_OPTAB_FN (LEN_MASK_LOAD, ECF_PURE, len_maskload, len_maskload)
 
 DEF_INTERNAL_OPTAB_FN (SCATTER_STORE, 0, scatter_store, scatter_store)
 DEF_INTERNAL_OPTAB_FN (MASK_SCATTER_STORE, 0,
@@ -175,6 +178,7 @@ DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
 DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
 
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
+DEF_INTERNAL_OPTAB_FN (LEN_MASK_STORE, 0, len_maskstore, len_maskstore)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
 DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 276f8408dd7..ec765e78088 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -624,6 +624,45 @@ get_len_load_store_mode (machine_mode mode, bool is_load)
   return opt_machine_mode ();
 }
 
+/* Return true if the target supports len-mask load/store for MODE, where
+   the length is used for loop control and the mask for flow control.  */
+
+bool
+can_vec_len_mask_load_store_p (machine_mode mode, bool is_load)
+{
+  optab op = is_load ? len_maskload_optab : len_maskstore_optab;
+  machine_mode vmode;
+  machine_mode mask_mode;
+
+  /* If MODE is a vector mode, check it directly.  */
+  if (VECTOR_MODE_P (mode))
+    return targetm.vectorize.get_mask_mode (mode).exists (&mask_mode)
+	   && convert_optab_handler (op, mode, mask_mode) != CODE_FOR_nothing;
+
+  scalar_mode smode;
+  if (is_a<scalar_mode> (mode, &smode))
+    /* See if there is any chance the mask load or store might be
+       vectorized.  If not, punt.  */
+    vmode = targetm.vectorize.preferred_simd_mode (smode);
+  else
+    vmode = mode;
+
+  if (VECTOR_MODE_P (vmode)
+      && targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
+      && convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
+    return true;
+
+  auto_vector_modes vector_modes;
+  targetm.vectorize.autovectorize_vector_modes (&vector_modes, true);
+  for (machine_mode base_mode : vector_modes)
+    if (related_vector_mode (base_mode, smode).exists (&vmode)
+	&& targetm.vectorize.get_mask_mode (vmode).exists (&mask_mode)
+	&& convert_optab_handler (op, vmode, mask_mode) != CODE_FOR_nothing)
+      return true;
+
+  return false;
+}
+
 /* Return true if there is a compare_and_swap pattern.  */
 
 bool
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index b266d2fe990..2b9c9b44af2 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -189,6 +189,7 @@ enum insn_code find_widening_optab_handler_and_mode (optab, machine_mode,
 int can_mult_highpart_p (machine_mode, bool);
 bool can_vec_mask_load_store_p (machine_mode, machine_mode, bool);
 opt_machine_mode get_len_load_store_mode (machine_mode, bool);
+bool can_vec_len_mask_load_store_p (machine_mode, bool);
 bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
diff --git a/gcc/optabs.def b/gcc/optabs.def
index f31b69c5d85..f5401aea364 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -91,6 +91,8 @@ OPTAB_CD(vec_cmpu_optab, "vec_cmpu$a$b")
 OPTAB_CD(vec_cmpeq_optab, "vec_cmpeq$a$b")
 OPTAB_CD(maskload_optab, "maskload$a$b")
 OPTAB_CD(maskstore_optab, "maskstore$a$b")
+OPTAB_CD(len_maskload_optab, "len_maskload$a$b")
+OPTAB_CD(len_maskstore_optab, "len_maskstore$a$b")
 OPTAB_CD(gather_load_optab, "gather_load$a$b")
 OPTAB_CD(mask_gather_load_optab, "mask_gather_load$a$b")
 OPTAB_CD(scatter_store_optab, "scatter_store$a$b")
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index b576cce6db6..99aca44e6a5 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -5816,6 +5816,8 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
	  }
	case IFN_MASK_LOAD:
	case IFN_MASK_STORE:
+	case IFN_LEN_MASK_LOAD:
+	case IFN_LEN_MASK_STORE:
	  break;
	default:
	  clobbers_memory = true;
@@ -5861,11 +5863,13 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
       switch (gimple_call_internal_fn (stmt))
	{
	case IFN_MASK_LOAD:
+	case IFN_LEN_MASK_LOAD:
	  if (gimple_call_lhs (stmt) == NULL_TREE)
	    break;
	  ref.is_read = true;
	  /* FALLTHRU */
	case IFN_MASK_STORE:
+	case IFN_LEN_MASK_STORE:
	  ptr = build_int_cst (TREE_TYPE (gimple_call_arg (stmt, 1)), 0);
	  align = tree_to_shwi (gimple_call_arg (stmt, 1));
	  if (ref.is_read)
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 1393ce184e3..0f549fa528d 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -960,6 +960,9 @@ ifcvt_can_use_mask_load_store (gimple *stmt)
   if (can_vec_mask_load_store_p (mode, VOIDmode, is_load))
     return true;
 
+  if (can_vec_len_mask_load_store_p (mode, is_load))
+    return true;
+
   return false;
 }
 
diff --git a/gcc/tree-ssa-alias.cc b/gcc/tree-ssa-alias.cc
index 79ed956e300..100c4b2e7d9 100644
--- a/gcc/tree-ssa-alias.cc
+++ b/gcc/tree-ssa-alias.cc
@@ -2815,11 +2815,13 @@ ref_maybe_used_by_call_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
	case IFN_SCATTER_STORE:
	case IFN_MASK_SCATTER_STORE:
	case IFN_LEN_STORE:
+	case IFN_LEN_MASK_STORE:
	  return false;
	case IFN_MASK_STORE_LANES:
	  goto process_args;
	case IFN_MASK_LOAD:
	case IFN_LEN_LOAD:
+	case IFN_LEN_MASK_LOAD:
	case IFN_MASK_LOAD_LANES:
	  {
	    ao_ref rhs_ref;
@@ -3065,6 +3067,7 @@ call_may_clobber_ref_p_1 (gcall *call, ao_ref *ref, bool tbaa_p)
	  return false;
	case IFN_MASK_STORE:
	case IFN_LEN_STORE:
+	case IFN_LEN_MASK_STORE:
	case IFN_MASK_STORE_LANES:
	  {
	    tree rhs = gimple_call_arg (call,
diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index eabe8ba4522..acaf844b8ef 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -174,6 +174,17 @@ initialize_ao_ref_for_dse (gimple *stmt, ao_ref *write, bool may_def_ok = false)
	    return true;
	  }
	  break;
+	case IFN_LEN_MASK_STORE:
+	  /* We cannot initialize a must-def ao_ref (in all cases) but we
+	     can provide a may-def variant.  */
+	  if (may_def_ok)
+	    {
+	      ao_ref_init_from_ptr_and_size
+		(write, gimple_call_arg (stmt, 0),
+		 TYPE_SIZE_UNIT (TREE_TYPE (gimple_call_arg (stmt, 4))));
+	      return true;
+	    }
+	  break;
	default:;
	}
     }
@@ -1483,6 +1494,7 @@ dse_optimize_stmt (function *fun, gimple_stmt_iterator *gsi, sbitmap live_bytes)
	{
	  case IFN_LEN_STORE:
	  case IFN_MASK_STORE:
+	  case IFN_LEN_MASK_STORE:
	    {
	      enum dse_store_status store_status;
	      store_status = dse_classify_store (&ref, stmt, false, live_bytes);
diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
index 6fbd2d59318..e8e9df1ab74 100644
--- a/gcc/tree-ssa-loop-ivopts.cc
+++ b/gcc/tree-ssa-loop-ivopts.cc
@@ -2439,6 +2439,7 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
     case IFN_MASK_LOAD:
     case IFN_MASK_LOAD_LANES:
     case IFN_LEN_LOAD:
+    case IFN_LEN_MASK_LOAD:
       if (op_p == gimple_call_arg_ptr (call, 0))
	return TREE_TYPE (gimple_call_lhs (call));
       return NULL_TREE;
@@ -2450,6 +2451,11 @@ get_mem_type_for_internal_fn (gcall *call, tree *op_p)
	return TREE_TYPE (gimple_call_arg (call, 3));
       return NULL_TREE;
 
+    case IFN_LEN_MASK_STORE:
+      if (op_p == gimple_call_arg_ptr (call, 0))
+	return TREE_TYPE (gimple_call_arg (call, 4));
+      return NULL_TREE;
+
     default:
       return NULL_TREE;
     }
@@ -7555,6 +7561,8 @@ get_alias_ptr_type_for_ptr_address (iv_use *use)
     case IFN_MASK_STORE_LANES:
     case IFN_LEN_LOAD:
     case IFN_LEN_STORE:
+    case IFN_LEN_MASK_LOAD:
+    case IFN_LEN_MASK_STORE:
       /* The second argument contains the correct alias type.  */
       gcc_assert (use->op_p = gimple_call_arg_ptr (call, 0));
       return TREE_TYPE (gimple_call_arg (call, 1));
diff --git a/gcc/tree-ssa-sccvn.cc b/gcc/tree-ssa-sccvn.cc
index 27c84e78fcf..02fbc4a2dfa 100644
--- a/gcc/tree-ssa-sccvn.cc
+++ b/gcc/tree-ssa-sccvn.cc
@@ -3304,6 +3304,12 @@ vn_reference_lookup_3 (ao_ref *ref, tree vuse, void *data_,
	    if (!tree_fits_uhwi_p (len) || !tree_fits_shwi_p (bias))
	      return (void *)-1;
	    break;
+	  case IFN_LEN_MASK_STORE:
+	    len = gimple_call_arg (call, 2);
+	    mask = gimple_call_arg (call, internal_fn_mask_index (fn));
+	    if (!tree_fits_uhwi_p (len) || TREE_CODE (mask) != VECTOR_CST)
+	      return (void *)-1;
+	    break;
	  default:
	    return (void *)-1;
	  }
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index ebe93832b1e..fb83446519a 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3039,17 +3039,21 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
   if (!call2 || !gimple_call_internal_p (call2))
     return false;
   internal_fn ifn = gimple_call_internal_fn (call1);
-  if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
+  if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE
+      && ifn != IFN_LEN_MASK_LOAD && ifn != IFN_LEN_MASK_STORE)
     return false;
   if (ifn != gimple_call_internal_fn (call2))
     return false;
 
   /* Check that the masks are the same.  Cope with casts of masks,
      like those created by build_mask_conversion.  */
-  tree mask1 = gimple_call_arg (call1, 2);
-  tree mask2 = gimple_call_arg (call2, 2);
+  unsigned int mask_argno
+    = ifn == IFN_LEN_MASK_LOAD || ifn == IFN_LEN_MASK_STORE ? 3 : 2;
+  tree mask1 = gimple_call_arg (call1, mask_argno);
+  tree mask2 = gimple_call_arg (call2, mask_argno);
   if (!operand_equal_p (mask1, mask2, 0)
-      && (ifn == IFN_MASK_STORE || !allow_slp_p))
+      && (ifn == IFN_MASK_STORE || ifn == IFN_LEN_MASK_STORE
+	  || !allow_slp_p))
     {
       mask1 = strip_conversion (mask1);
       if (!mask1)
@@ -4292,7 +4296,9 @@ vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
   if (gcall *call = dyn_cast <gcall *> (stmt))
     if (!gimple_call_internal_p (call)
	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
-	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
+	    && gimple_call_internal_fn (call) != IFN_MASK_STORE
+	    && gimple_call_internal_fn (call) != IFN_LEN_MASK_LOAD
+	    && gimple_call_internal_fn (call) != IFN_LEN_MASK_STORE))
       {
	free_data_ref (dr);
	return opt_result::failure_at (stmt,
@@ -6731,7 +6737,9 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
       if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	if (gimple_call_internal_p (stmt)
	    && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
-		|| gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+		|| gimple_call_internal_fn (stmt) == IFN_MASK_STORE
+		|| gimple_call_internal_fn (stmt) == IFN_LEN_MASK_LOAD
+		|| gimple_call_internal_fn (stmt) == IFN_LEN_MASK_STORE))
	  return dr_unaligned_supported;
 
       if (loop_vinfo)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ace9e759f5b..03de41d4988 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1296,30 +1296,33 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     return false;
 
-  machine_mode len_load_mode = get_len_load_store_mode
-    (loop_vinfo->vector_mode, true).require ();
-  machine_mode len_store_mode = get_len_load_store_mode
-    (loop_vinfo->vector_mode, false).require ();
+  if (!can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, true)
+      && !can_vec_len_mask_load_store_p (loop_vinfo->vector_mode, false))
+    {
+      machine_mode len_load_mode
+	= get_len_load_store_mode (loop_vinfo->vector_mode, true).require ();
+      machine_mode len_store_mode
+	= get_len_load_store_mode (loop_vinfo->vector_mode, false).require ();
 
-  signed char partial_load_bias = internal_len_load_store_bias
-    (IFN_LEN_LOAD, len_load_mode);
+      signed char partial_load_bias
+	= internal_len_load_store_bias (IFN_LEN_LOAD, len_load_mode);
 
-  signed char partial_store_bias = internal_len_load_store_bias
-    (IFN_LEN_STORE, len_store_mode);
+      signed char partial_store_bias
+	= internal_len_load_store_bias (IFN_LEN_STORE, len_store_mode);
 
-  gcc_assert (partial_load_bias == partial_store_bias);
+      gcc_assert (partial_load_bias == partial_store_bias);
 
-  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
-    return false;
+      if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
+	return false;
 
-  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
-     len_loads with a length of zero.  In order to avoid that we prohibit
-     more than one loop length here.  */
-  if (partial_load_bias == -1
-      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
-    return false;
+      /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
+	 len_loads with a length of zero.  In order to avoid that we prohibit
+	 more than one loop length here.  */
+      if (partial_load_bias == -1 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
+	return false;
 
-  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
+      LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
+    }
 
   unsigned int max_nitems_per_iter = 1;
   unsigned int i;
@@ -11317,7 +11320,8 @@ optimize_mask_stores (class loop *loop)
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
-	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
+	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE)
+	      || gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
     }
@@ -11340,7 +11344,8 @@ optimize_mask_stores (class loop *loop)
       tree zero;
 
       last = worklist.pop ();
-      mask = gimple_call_arg (last, 2);
+      mask = gimple_call_arg (
+	last, gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE) ? 3 : 2);
       bb = gimple_bb (last);
       /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
@@ -11473,7 +11478,12 @@ optimize_mask_stores (class loop *loop)
	}
       /* Put other masked stores with the same mask to STORE_BB.  */
       if (worklist.is_empty ()
-	  || gimple_call_arg (worklist.last (), 2) != mask
+	  || gimple_call_arg (worklist.last (),
+			      gimple_call_internal_p (worklist.last (),
+						      IFN_LEN_MASK_STORE)
+			      ? 3
+			      : 2)
+	     != mask
	  || worklist.last () != stmt1)
	break;
       last = worklist.pop ();
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index ab89a82f1b3..937b5295df4 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -489,6 +489,7 @@ static const int cond_expr_maps[3][5] = {
 };
 static const int arg1_map[] = { 1, 1 };
 static const int arg2_map[] = { 1, 2 };
+static const int arg3_map[] = { 1, 3 };
 static const int arg1_arg4_map[] = { 2, 1, 4 };
 static const int op1_op0_map[] = { 2, 1, 0 };
 
@@ -524,6 +525,9 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0)
       case IFN_MASK_LOAD:
	return arg2_map;
 
+      case IFN_LEN_MASK_LOAD:
+	return arg3_map;
+
       case IFN_GATHER_LOAD:
	return arg1_map;
 
@@ -1779,6 +1783,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
	{
	  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
	    gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
+			|| gimple_call_internal_p (stmt, IFN_LEN_MASK_LOAD)
			|| gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
			|| gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD));
	  else
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a7acc032d47..9b797c61c88 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1837,6 +1837,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       using_partial_vectors_p = true;
     }
 
+  if (can_vec_len_mask_load_store_p (vecmode, is_load))
+    {
+      nvectors = group_memory_nvectors (group_size * vf, nunits);
+      /* The length is used for loop control and the mask for flow control.  */
+      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
+      vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      using_partial_vectors_p = true;
+    }
+
   if (!using_partial_vectors_p)
     {
       if (dump_enabled_p ())
@@ -7978,8 +7987,9 @@ vectorizable_store (vec_info *vinfo,
       if (memory_access_type == VMAT_CONTIGUOUS)
	{
	  if (!VECTOR_MODE_P (vec_mode)
-	      || !can_vec_mask_load_store_p (vec_mode,
-					     TYPE_MODE (mask_vectype), false))
+	      || (!can_vec_mask_load_store_p (vec_mode,
+					      TYPE_MODE (mask_vectype), false)
+		  && !can_vec_len_mask_load_store_p (vec_mode, false)))
	    return false;
	}
       else if (memory_access_type != VMAT_LOAD_STORE_LANES
@@ -8942,7 +8952,38 @@ vectorizable_store (vec_info *vinfo,
	    }
 
	  /* Arguments are ready.  Create the new vector stmt.  */
-	  if (final_mask)
+	  if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype), false)
+	      && (final_mask || loop_lens))
+	    {
+	      tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
+	      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+	      if (!final_mask)
+		{
+		  machine_mode mask_mode
+		    = targetm.vectorize.get_mask_mode (TYPE_MODE (vectype))
+			.require ();
+		  mask_vectype
+		    = build_truth_vector_type_for_mode (nunits, mask_mode);
+		  tree mask = build_int_cst (TREE_TYPE (mask_vectype), -1);
+		  final_mask = build_vector_from_val (mask_vectype, mask);
+		}
+	      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+	      tree final_len;
+	      if (loop_lens)
+		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       vec_num * ncopies, vectype,
+					       vec_num * j + i, 1);
+	      else
+		final_len = build_int_cst (iv_type, nunits);
+	      gcall *call
+		= gimple_build_call_internal (IFN_LEN_MASK_STORE, 5,
+					      dataref_ptr, ptr, final_len,
+					      final_mask, vec_oprnd);
+	      gimple_call_set_nothrow (call, true);
+	      vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
+	      new_stmt = call;
+	    }
+	  else if (final_mask)
	    {
	      tree ptr = build_int_cst (ref_type, align * BITS_PER_UNIT);
	      gcall *call
@@ -9407,8 +9448,9 @@ vectorizable_load (vec_info *vinfo,
	{
	  machine_mode vec_mode = TYPE_MODE (vectype);
	  if (!VECTOR_MODE_P (vec_mode)
-	      || !can_vec_mask_load_store_p (vec_mode,
-					     TYPE_MODE (mask_vectype), true))
+	      || (!can_vec_mask_load_store_p (vec_mode,
+					      TYPE_MODE (mask_vectype), true)
+		  && !can_vec_len_mask_load_store_p (vec_mode, true)))
	    return false;
	}
       else if (memory_access_type != VMAT_LOAD_STORE_LANES
@@ -10301,7 +10343,47 @@ vectorizable_load (vec_info *vinfo,
						      align, misalign);
		    align = least_bit_hwi (misalign | align);
 
-		    if (final_mask)
+		    if (can_vec_len_mask_load_store_p (TYPE_MODE (vectype),
+						       true)
+			&& (final_mask || loop_lens)
+			&& memory_access_type != VMAT_INVARIANT)
+		      {
+			tree ptr
+			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
+			poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
+			if (!final_mask)
+			  {
+			    machine_mode mask_mode
+			      = targetm.vectorize
+				  .get_mask_mode (TYPE_MODE (vectype))
+				  .require ();
+			    mask_vectype
+			      = build_truth_vector_type_for_mode (nunits,
+								  mask_mode);
+			    tree mask
+			      = build_int_cst (TREE_TYPE (mask_vectype), -1);
+			    final_mask
+			      = build_vector_from_val (mask_vectype, mask);
+			  }
+			tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
+			tree final_len;
+			if (loop_lens)
+			  final_len
+			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+						 vec_num * ncopies, vectype,
+						 vec_num * j + i, 1);
+			else
+			  final_len = build_int_cst (iv_type, nunits);
+
+			gcall *call
+			  = gimple_build_call_internal (IFN_LEN_MASK_LOAD, 4,
+							dataref_ptr, ptr,
+							final_len, final_mask);
+			gimple_call_set_nothrow (call, true);
+			new_stmt = call;
+			data_ref = NULL_TREE;
+		      }
+		    else if (final_mask)
		      {
			tree ptr
			  = build_int_cst (ref_type, align * BITS_PER_UNIT);
@@ -13027,7 +13109,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
 
   if (gimple_get_lhs (stmt) == NULL_TREE
       /* MASK_STORE has no lhs, but is ok.  */
-      && !gimple_call_internal_p (stmt, IFN_MASK_STORE))
+      && !gimple_call_internal_p (stmt, IFN_MASK_STORE)
+      && !gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
     {
       if (is_a <gcall *> (stmt))
	{
@@ -13071,6 +13154,8 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info,
	scalar_type = TREE_TYPE (DR_REF (dr));
       else if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+      else if (gimple_call_internal_p (stmt, IFN_LEN_MASK_STORE))
+	scalar_type = TREE_TYPE (gimple_call_arg (stmt, 4));
       else
	scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index a048e9d8917..19312404ac4 100644
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -1101,6 +1101,8 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab,
	    {
	      internal_fn ifn = gimple_call_internal_fn (call);
	      if (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE
+		  || ifn == IFN_LEN_MASK_LOAD
+		  || ifn == IFN_LEN_MASK_STORE
		  /* Don't keep the if-converted parts when the ifn with
		     specifc type is not supported by the backend.  */
		  || (direct_internal_fn_p (ifn)
-- 
2.36.1