Thanks for the clean-ups.  But...

"Kewen.Lin" <li...@linux.ibm.com> writes:
> Hi,
>
> Following Richi's suggestion [1], this patch moves the handling of
> VMAT_GATHER_SCATTER in the final loop nest of function
> vectorizable_load to its own loop.  Basically it duplicates the
> final loop nest, cleans up some useless set-up code for the
> VMAT_GATHER_SCATTER case, and removes some unreachable code.  It
> also removes the corresponding handling in the final loop nest.
>
> Bootstrapped and regtested on x86_64-redhat-linux,
> aarch64-linux-gnu and powerpc64{,le}-linux-gnu.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
>
> Is it ok for trunk?
>
> BR,
> Kewen
> -----
>
> gcc/ChangeLog:
>
>         * tree-vect-stmts.cc (vectorizable_load): Move the handling of
>         VMAT_GATHER_SCATTER in the final loop nest to its own loop,
>         and update the final nest accordingly.
> ---
>  gcc/tree-vect-stmts.cc | 361 +++++++++++++++++++++++++----------------
>  1 file changed, 219 insertions(+), 142 deletions(-)

...that seems like quite a lot of +s.  Is there nothing we can do to
avoid the cut-&-paste?
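To make the question concrete: the usual way out of this kind of
duplication is to hoist the shared per-vector body into one helper that
both loop nests call.  What follows is only a rough, standalone C++
sketch of that pattern -- it is not GCC code, and every name in it
(emit_ctx, emit_one_load, emit_loads) is invented for illustration:

    #include <cstdio>

    /* Hypothetical stand-in for the vectorizer state that the real
       per-vector body would consult.  */
    struct emit_ctx
    {
      int ncopies;
      int vec_num;
    };

    /* The shared per-vector work lives in exactly one place...  */
    static void
    emit_one_load (const emit_ctx &ctx, int j, int i, bool gather)
    {
      std::printf ("%s: copy %d/%d, vector %d/%d\n",
                   gather ? "gather_load" : "contiguous_load",
                   j + 1, ctx.ncopies, i + 1, ctx.vec_num);
    }

    /* ...so each specialised loop nest shrinks to a thin driver.  */
    static void
    emit_loads (const emit_ctx &ctx, bool gather)
    {
      for (int j = 0; j < ctx.ncopies; j++)
        for (int i = 0; i < ctx.vec_num; i++)
          emit_one_load (ctx, j, i, gather);
    }

    int
    main ()
    {
      emit_ctx ctx = { 2, 2 };
      emit_loads (ctx, /*gather=*/true);   /* the new dedicated nest */
      emit_loads (ctx, /*gather=*/false);  /* the remaining final nest */
    }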
Richard

>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c361e16cb7b..5e514eca19b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -10455,6 +10455,218 @@ vectorizable_load (vec_info *vinfo,
>        return true;
>      }
>
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> +    {
> +      gcc_assert (alignment_support_scheme == dr_aligned
> +                  || alignment_support_scheme == dr_unaligned_supported);
> +      gcc_assert (!grouped_load && !slp_perm);
> +
> +      unsigned int inside_cost = 0, prologue_cost = 0;
> +      for (j = 0; j < ncopies; j++)
> +        {
> +          /* 1. Create the vector or array pointer update chain.  */
> +          if (j == 0 && !costing_p)
> +            {
> +              if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +                vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> +                                             slp_node, &gs_info, &dataref_ptr,
> +                                             &vec_offsets);
> +              else
> +                dataref_ptr
> +                  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> +                                              at_loop, offset, &dummy, gsi,
> +                                              &ptr_incr, false, bump);
> +            }
> +          else if (!costing_p)
> +            {
> +              gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
> +              if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +                dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +                                               gsi, stmt_info, bump);
> +            }
> +
> +          if (mask && !costing_p)
> +            vec_mask = vec_masks[j];
> +
> +          gimple *new_stmt = NULL;
> +          for (i = 0; i < vec_num; i++)
> +            {
> +              tree final_mask = NULL_TREE;
> +              tree final_len = NULL_TREE;
> +              tree bias = NULL_TREE;
> +              if (!costing_p)
> +                {
> +                  if (loop_masks)
> +                    final_mask
> +                      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
> +                                            vec_num * ncopies, vectype,
> +                                            vec_num * j + i);
> +                  if (vec_mask)
> +                    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
> +                                                   final_mask, vec_mask, gsi);
> +
> +                  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +                    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> +                                                   gsi, stmt_info, bump);
> +                }
> +
> +              /* 2. Create the vector-load in the loop.  */
> +              unsigned HOST_WIDE_INT align;
> +              if (gs_info.ifn != IFN_LAST)
> +                {
> +                  if (costing_p)
> +                    {
> +                      unsigned int cnunits = vect_nunits_for_cost (vectype);
> +                      inside_cost
> +                        = record_stmt_cost (cost_vec, cnunits, scalar_load,
> +                                            stmt_info, 0, vect_body);
> +                      continue;
> +                    }
> +                  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +                    vec_offset = vec_offsets[vec_num * j + i];
> +                  tree zero = build_zero_cst (vectype);
> +                  tree scale = size_int (gs_info.scale);
> +
> +                  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> +                    {
> +                      if (loop_lens)
> +                        final_len
> +                          = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                               vec_num * ncopies, vectype,
> +                                               vec_num * j + i, 1);
> +                      else
> +                        final_len
> +                          = build_int_cst (sizetype,
> +                                           TYPE_VECTOR_SUBPARTS (vectype));
> +                      signed char biasval
> +                        = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +                      bias = build_int_cst (intQI_type_node, biasval);
> +                      if (!final_mask)
> +                        {
> +                          mask_vectype = truth_type_for (vectype);
> +                          final_mask = build_minus_one_cst (mask_vectype);
> +                        }
> +                    }
> +
> +                  gcall *call;
> +                  if (final_len && final_mask)
> +                    call
> +                      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
> +                                                    dataref_ptr, vec_offset,
> +                                                    scale, zero, final_mask,
> +                                                    final_len, bias);
> +                  else if (final_mask)
> +                    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
> +                                                       dataref_ptr, vec_offset,
> +                                                       scale, zero, final_mask);
> +                  else
> +                    call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
> +                                                       dataref_ptr, vec_offset,
> +                                                       scale, zero);
> +                  gimple_call_set_nothrow (call, true);
> +                  new_stmt = call;
> +                  data_ref = NULL_TREE;
> +                }
> +              else
> +                {
> +                  /* Emulated gather-scatter.  */
> +                  gcc_assert (!final_mask);
> +                  unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> +                  if (costing_p)
> +                    {
> +                      /* For emulated gathers N offset vector element
> +                         offset add is consumed by the load).  */
> +                      inside_cost = record_stmt_cost (cost_vec, const_nunits,
> +                                                      vec_to_scalar, stmt_info,
> +                                                      0, vect_body);
> +                      /* N scalar loads plus gathering them into a
> +                         vector.  */
> +                      inside_cost
> +                        = record_stmt_cost (cost_vec, const_nunits, scalar_load,
> +                                            stmt_info, 0, vect_body);
> +                      inside_cost
> +                        = record_stmt_cost (cost_vec, 1, vec_construct,
> +                                            stmt_info, 0, vect_body);
> +                      continue;
> +                    }
> +                  unsigned HOST_WIDE_INT const_offset_nunits
> +                    = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> +                        .to_constant ();
> +                  vec<constructor_elt, va_gc> *ctor_elts;
> +                  vec_alloc (ctor_elts, const_nunits);
> +                  gimple_seq stmts = NULL;
> +                  /* We support offset vectors with more elements
> +                     than the data vector for now.  */
> +                  unsigned HOST_WIDE_INT factor
> +                    = const_offset_nunits / const_nunits;
> +                  vec_offset = vec_offsets[j / factor];
> +                  unsigned elt_offset = (j % factor) * const_nunits;
> +                  tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> +                  tree scale = size_int (gs_info.scale);
> +                  align = get_object_alignment (DR_REF (first_dr_info->dr));
> +                  tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
> +                  for (unsigned k = 0; k < const_nunits; ++k)
> +                    {
> +                      tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> +                                              bitsize_int (k + elt_offset));
> +                      tree idx
> +                        = gimple_build (&stmts, BIT_FIELD_REF, idx_type,
> +                                        vec_offset, TYPE_SIZE (idx_type), boff);
> +                      idx = gimple_convert (&stmts, sizetype, idx);
> +                      idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
> +                                          scale);
> +                      tree ptr = gimple_build (&stmts, PLUS_EXPR,
> +                                               TREE_TYPE (dataref_ptr),
> +                                               dataref_ptr, idx);
> +                      ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> +                      tree elt = make_ssa_name (TREE_TYPE (vectype));
> +                      tree ref = build2 (MEM_REF, ltype, ptr,
> +                                         build_int_cst (ref_type, 0));
> +                      new_stmt = gimple_build_assign (elt, ref);
> +                      gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
> +                      gimple_seq_add_stmt (&stmts, new_stmt);
> +                      CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> +                    }
> +                  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> +                  new_stmt = gimple_build_assign (
> +                    NULL_TREE, build_constructor (vectype, ctor_elts));
> +                  data_ref = NULL_TREE;
> +                }
> +
> +              vec_dest = vect_create_destination_var (scalar_dest, vectype);
> +              /* DATA_REF is null if we've already built the statement.  */
> +              if (data_ref)
> +                {
> +                  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
> +                  new_stmt = gimple_build_assign (vec_dest, data_ref);
> +                }
> +              new_temp = make_ssa_name (vec_dest, new_stmt);
> +              gimple_set_lhs (new_stmt, new_temp);
> +              vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +
> +              /* Store vector loads in the corresponding SLP_NODE.  */
> +              if (slp)
> +                slp_node->push_vec_def (new_stmt);
> +            }
> +
> +          if (!slp && !costing_p)
> +            STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +        }
> +
> +      if (!slp && !costing_p)
> +        *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +
> +      if (costing_p)
> +        {
> +          if (dump_enabled_p ())
> +            dump_printf_loc (MSG_NOTE, vect_location,
> +                             "vect_model_load_cost: inside_cost = %u, "
> +                             "prologue_cost = %u .\n",
> +                             inside_cost, prologue_cost);
> +        }
> +      return true;
> +    }
> +
>    poly_uint64 group_elt = 0;
>    unsigned int inside_cost = 0, prologue_cost = 0;
>    for (j = 0; j < ncopies; j++)
> @@ -10504,12 +10716,6 @@ vectorizable_load (vec_info *vinfo,
>                gcc_assert (!compute_in_loop);
>              }
>          }
> -      else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -        {
> -          vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
> -                                       slp_node, &gs_info, &dataref_ptr,
> -                                       &vec_offsets);
> -        }
>        else
>          dataref_ptr
>            = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
> @@ -10525,7 +10731,7 @@ vectorizable_load (vec_info *vinfo,
>        if (dataref_offset)
>          dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
>                                            bump);
> -      else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +      else
>          dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>                                         stmt_info, bump);
>        if (mask)
> @@ -10551,7 +10757,7 @@ vectorizable_load (vec_info *vinfo,
>                final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>                                               final_mask, vec_mask, gsi);
>
> -          if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +          if (i > 0)
>              dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
>                                             gsi, stmt_info, bump);
>          }
> @@ -10562,139 +10768,11 @@ vectorizable_load (vec_info *vinfo,
>          case dr_aligned:
>          case dr_unaligned_supported:
>            {
> -            unsigned int misalign;
> -            unsigned HOST_WIDE_INT align;
> -
> -            if (memory_access_type == VMAT_GATHER_SCATTER
> -                && gs_info.ifn != IFN_LAST)
> -              {
> -                if (costing_p)
> -                  {
> -                    unsigned int cnunits = vect_nunits_for_cost (vectype);
> -                    inside_cost
> -                      = record_stmt_cost (cost_vec, cnunits, scalar_load,
> -                                          stmt_info, 0, vect_body);
> -                    break;
> -                  }
> -                if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -                  vec_offset = vec_offsets[vec_num * j + i];
> -                tree zero = build_zero_cst (vectype);
> -                tree scale = size_int (gs_info.scale);
> -
> -                if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
> -                  {
> -                    if (loop_lens)
> -                      final_len
> -                        = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> -                                             vec_num * ncopies, vectype,
> -                                             vec_num * j + i, 1);
> -                    else
> -                      final_len
> -                        = build_int_cst (sizetype,
> -                                         TYPE_VECTOR_SUBPARTS (vectype));
> -                    signed char biasval
> -                      = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -                    bias = build_int_cst (intQI_type_node, biasval);
> -                    if (!final_mask)
> -                      {
> -                        mask_vectype = truth_type_for (vectype);
> -                        final_mask = build_minus_one_cst (mask_vectype);
> -                      }
> -                  }
> -
> -                gcall *call;
> -                if (final_len && final_mask)
> -                  call = gimple_build_call_internal (
> -                    IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
> -                    scale, zero, final_mask, final_len, bias);
> -                else if (final_mask)
> -                  call
> -                    = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
> -                                                  dataref_ptr, vec_offset,
> -                                                  scale, zero, final_mask);
> -                else
> -                  call
> -                    = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
> -                                                  dataref_ptr, vec_offset,
> -                                                  scale, zero);
> -                gimple_call_set_nothrow (call, true);
> -                new_stmt = call;
> -                data_ref = NULL_TREE;
> -                break;
> -              }
> -            else if (memory_access_type == VMAT_GATHER_SCATTER)
> -              {
> -                /* Emulated gather-scatter.  */
> -                gcc_assert (!final_mask);
> -                unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
> -                if (costing_p)
> -                  {
> -                    /* For emulated gathers N offset vector element
> -                       offset add is consumed by the load).  */
> -                    inside_cost
> -                      = record_stmt_cost (cost_vec, const_nunits,
> -                                          vec_to_scalar, stmt_info, 0,
> -                                          vect_body);
> -                    /* N scalar loads plus gathering them into a
> -                       vector.  */
> -                    inside_cost = record_stmt_cost (cost_vec, const_nunits,
> -                                                    scalar_load, stmt_info,
> -                                                    0, vect_body);
> -                    inside_cost
> -                      = record_stmt_cost (cost_vec, 1, vec_construct,
> -                                          stmt_info, 0, vect_body);
> -                    break;
> -                  }
> -                unsigned HOST_WIDE_INT const_offset_nunits
> -                  = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
> -                      .to_constant ();
> -                vec<constructor_elt, va_gc> *ctor_elts;
> -                vec_alloc (ctor_elts, const_nunits);
> -                gimple_seq stmts = NULL;
> -                /* We support offset vectors with more elements
> -                   than the data vector for now.  */
> -                unsigned HOST_WIDE_INT factor
> -                  = const_offset_nunits / const_nunits;
> -                vec_offset = vec_offsets[j / factor];
> -                unsigned elt_offset = (j % factor) * const_nunits;
> -                tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
> -                tree scale = size_int (gs_info.scale);
> -                align = get_object_alignment (DR_REF (first_dr_info->dr));
> -                tree ltype
> -                  = build_aligned_type (TREE_TYPE (vectype), align);
> -                for (unsigned k = 0; k < const_nunits; ++k)
> -                  {
> -                    tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
> -                                            bitsize_int (k + elt_offset));
> -                    tree idx = gimple_build (&stmts, BIT_FIELD_REF,
> -                                             idx_type, vec_offset,
> -                                             TYPE_SIZE (idx_type), boff);
> -                    idx = gimple_convert (&stmts, sizetype, idx);
> -                    idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
> -                                        scale);
> -                    tree ptr = gimple_build (&stmts, PLUS_EXPR,
> -                                             TREE_TYPE (dataref_ptr),
> -                                             dataref_ptr, idx);
> -                    ptr = gimple_convert (&stmts, ptr_type_node, ptr);
> -                    tree elt = make_ssa_name (TREE_TYPE (vectype));
> -                    tree ref = build2 (MEM_REF, ltype, ptr,
> -                                       build_int_cst (ref_type, 0));
> -                    new_stmt = gimple_build_assign (elt, ref);
> -                    gimple_set_vuse (new_stmt,
> -                                     gimple_vuse (gsi_stmt (*gsi)));
> -                    gimple_seq_add_stmt (&stmts, new_stmt);
> -                    CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
> -                  }
> -                gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> -                new_stmt = gimple_build_assign (
> -                  NULL_TREE, build_constructor (vectype, ctor_elts));
> -                data_ref = NULL_TREE;
> -                break;
> -              }
> -
>              if (costing_p)
>                break;
>
> +            unsigned int misalign;
> +            unsigned HOST_WIDE_INT align;
>              align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
>              if (alignment_support_scheme == dr_aligned)
>                misalign = 0;
> @@ -11156,10 +11234,9 @@ vectorizable_load (vec_info *vinfo,
>
>    if (costing_p)
>      {
> -      gcc_assert (memory_access_type != VMAT_INVARIANT
> -                  && memory_access_type != VMAT_ELEMENTWISE
> -                  && memory_access_type != VMAT_STRIDED_SLP
> -                  && memory_access_type != VMAT_LOAD_STORE_LANES);
> +      gcc_assert (memory_access_type == VMAT_CONTIGUOUS
> +                  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
> +                  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
>        if (dump_enabled_p ())
>          dump_printf_loc (MSG_NOTE, vect_location,
>                           "vect_model_load_cost: inside_cost = %u, "
> --
> 2.39.1
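A side note for anyone reading the emulated-gather hunk being moved
above: per element it computes idx = offset[k] * scale, adds that to
the data pointer, and emits one scalar load, with a CONSTRUCTOR
finally reassembling the vector.  Written out directly as standalone
C++ (illustrative only -- the element type, width and signature are
assumptions, not what the vectorizer actually emits):

    #include <cstddef>
    #include <cstdio>

    /* Illustrative expansion of the emulated-gather path: one scalar
       load per lane at base + offset[k] * scale; the dst array plays
       the role the CONSTRUCTOR plays in gimple.  */
    static void
    emulated_gather (float *dst, const char *base,
                     const int *offset, long scale, int nunits)
    {
      for (int k = 0; k < nunits; ++k)
        {
          const char *ptr = base + (std::size_t) offset[k] * scale;
          dst[k] = *(const float *) ptr;
        }
    }

    int
    main ()
    {
      float data[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
      int off[4] = { 6, 0, 2, 4 };
      float v[4];
      emulated_gather (v, (const char *) data, off, sizeof (float), 4);
      for (float f : v)
        std::printf ("%g ", f);   /* prints: 6 0 2 4 */
      std::printf ("\n");
    }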