Hi,

this patch adds gather/scatter handling for grouped accesses.  The idea
is to replace, e.g., an access (of uint8_t elements) like

  arr[0] arr[1] arr[2] arr[3]
  arr[0 + step] arr[1 + step] ...

by a gather load of uint32_t

  arr[0]
  arr[0 + step * 1]
  arr[0 + step * 2]
  ...

where the offset vector is a simple series with step STEP.  If the target
supports it, such a gather can be implemented as a strided load.
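
As an illustration (a sketch only, not part of the patch; the function and
variable names below are made up for the example), a loop of the following
shape contains such a grouped access: four adjacent uint8_t loads per group,
with the groups step bytes apart, which can then be done as one uint32_t
strided load per group if the target supports it.  Whether the vectorizer
actually chooses that form depends on the target and its costs.

/* Illustrative sketch only: each iteration reads four adjacent uint8_t
   elements; consecutive groups are step bytes apart.  The whole group can
   be loaded as a single uint32_t whose offsets form the series
   {0, step, 2 * step, ...}, i.e. a strided load.  */
void
consume_groups (unsigned char *restrict dst, unsigned char *restrict arr,
                int step, int n)
{
  for (int i = 0; i < n; i++)
    {
      dst[i * 4 + 0] = arr[i * step + 0];
      dst[i * 4 + 1] = arr[i * step + 1];
      dst[i * 4 + 2] = arr[i * step + 2];
      dst[i * 4 + 3] = arr[i * step + 3];
    }
}

The new testcase in the patch (x264_pixel_satd_8x4) exercises the same
pattern with the group loads feeding further arithmetic.
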
Bootstrapped on x86 and power10.  Regtested on aarch64 and riscv.

Regards
 Robin

	PR target/118019

gcc/ChangeLog:

	* internal-fn.cc (get_supported_else_vals): Exit at invalid index.
	(internal_strided_fn_supported_p): New function.
	* internal-fn.h (internal_strided_fn_supported_p): Declare.
	* tree-vect-data-refs.cc (vect_supportable_dr_alignment): Assume
	packed if access mode unit size unequal data ref type.
	* tree-vect-stmts.cc (vect_get_punning_vectype): New function.
	(vect_use_grouped_gather): New function.
	(get_load_store_type): Call new function.
	(vectorizable_store): Use punned vectype.
	(vectorizable_load): Ditto.
	* tree-vectorizer.h (struct vect_load_store_data): Add punned
	vectype.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/pr118019-2.c: New test.
---
 gcc/internal-fn.cc                            |  22 +-
 gcc/internal-fn.h                             |   2 +
 .../gcc.target/riscv/rvv/autovec/pr118019-2.c |  50 +++++
 gcc/tree-vect-data-refs.cc                    |   4 +-
 gcc/tree-vect-stmts.cc                        | 207 ++++++++++++++++--
 gcc/tree-vectorizer.h                         |   1 +
 6 files changed, 270 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index bf2fac81807..db396c69ec5 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -5234,7 +5234,7 @@ get_supported_else_vals (enum insn_code icode, unsigned else_index,
 			 vec<int> &else_vals)
 {
   const struct insn_data_d *data = &insn_data[icode];
-  if ((char)else_index >= data->n_operands)
+  if ((int)else_index >= data->n_operands || (int)else_index == -1)
     return;
 
   machine_mode else_mode = data->operand[else_index].mode;
@@ -5309,6 +5309,26 @@ internal_gather_scatter_fn_supported_p (internal_fn ifn, tree vector_type,
   return ok;
 }
 
+/* Return true if the target supports a strided load/store function IFN
+   with VECTOR_TYPE.  If supported and ELSVALS is nonzero the supported else
+   values will be added to the vector ELSVALS points to.  */
+
+bool
+internal_strided_fn_supported_p (internal_fn ifn, tree vector_type,
+				 vec<int> *elsvals)
+{
+  machine_mode mode = TYPE_MODE (vector_type);
+  optab optab = direct_internal_fn_optab (ifn);
+  insn_code icode = direct_optab_handler (optab, mode);
+
+  bool ok = icode != CODE_FOR_nothing;
+
+  if (ok && elsvals)
+    get_supported_else_vals (icode, internal_fn_else_index (ifn), *elsvals);
+
+  return ok;
+}
+
 /* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
    for pointers of type TYPE when the accesses have LENGTH bytes and their
    common byte alignment is ALIGN.  */
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index fd21694dfeb..dcb707251f8 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -246,6 +246,8 @@ extern int internal_fn_alias_ptr_index (internal_fn fn);
 extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
						     tree, tree, int,
						     vec<int> * = nullptr);
+extern bool internal_strided_fn_supported_p (internal_fn, tree,
+					     vec<int> * = nullptr);
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
						 poly_uint64, unsigned int);
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
new file mode 100644
index 00000000000..d3436b78377
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
@@ -0,0 +1,50 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=rv64gcv_zvl512b -mabi=lp64d -mno-vector-strict-align" } */
+
+/* Ensure we use strided loads.  */
+
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+    int t0 = s0 + s1;\
+    int t1 = s0 - s1;\
+    int t2 = s2 + s3;\
+    int t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d2 = t0 - t2;\
+    d1 = t1 + t3;\
+    d3 = t1 - t3;\
+}
+
+uint32_t
+abs2 (uint32_t a)
+{
+  uint32_t s = ((a >> 15) & 0x10001) * 0xffff;
+  return (a + s) ^ s;
+}
+
+int
+x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
+{
+  uint32_t tmp[4][4];
+  uint32_t a0, a1, a2, a3;
+  int sum = 0;
+  for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+      HADAMARD4 (tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
+    }
+  for (int i = 0; i < 4; i++)
+    {
+      HADAMARD4 (a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+      sum += abs2 (a0) + abs2 (a1) + abs2 (a2) + abs2 (a3);
+    }
+  return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
+}
+
+/* { dg-final { scan-assembler-times "vlse32" 4 } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index e451b72e07e..be03495c6b9 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -6640,7 +6640,9 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
   bool is_packed = false;
   tree type = TREE_TYPE (DR_REF (dr));
   if (misalignment == DR_MISALIGNMENT_UNKNOWN)
-    is_packed = not_size_aligned (DR_REF (dr));
+    is_packed = not_size_aligned (DR_REF (dr))
+		|| tree_to_uhwi (TYPE_SIZE (type))
+		   < tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
   if (targetm.vectorize.support_vector_misalignment (mode, type,
						      misalignment, is_packed,
						      is_gather_scatter))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 5b1f291fa8d..619b99083a3 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1723,6 +1723,124 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
   return false;
 }
 
+/* Check if there is an integer vector type with the same size as VECTYPE but
+   NELTS units of size (TYPE_SIZE (VECTYPE) / NELTS).  If so, return the
+   appropriate type.  */
+
+static tree
+vect_get_punning_vectype (tree vectype, int nelts)
+{
+  gcc_assert (VECTOR_TYPE_P (vectype));
+
+  machine_mode vmode = TYPE_MODE (vectype);
+  if (!VECTOR_MODE_P (vmode))
+    return NULL_TREE;
+
+  poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
+  unsigned int pbsize;
+  scalar_int_mode elmode;
+  if (constant_multiple_p (vbsize, nelts, &pbsize)
+      && (int_mode_for_mode (SCALAR_TYPE_MODE
+			     (TREE_TYPE (vectype))).exists (&elmode)))
+    {
+      machine_mode rmode;
+      if (int_mode_for_size (pbsize, 0).exists (&elmode)
+	  && tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype))) * nelts
+	     == GET_MODE_SIZE (elmode)
+	  && related_vector_mode (vmode, elmode, nelts).exists (&rmode))
+	{
+	  tree ptype = build_nonstandard_integer_type (pbsize, 1);
+	  return build_vector_type (ptype, nelts);
+	}
+    }
+
+  return NULL_TREE;
+}
+
+/* Return true if we can use gather/scatter or strided internal functions
+   to vectorize STMT_INFO, which is a grouped or strided load or store
+   with multiple lanes and will be implemented by a type-punned access
+   of a vector with element size that matches the number of lanes.
+
+   MASKED_P is true if load or store is conditional.
+   When returning true, fill in GS_INFO with the information required to
+   perform the operation.  Also, store the punning type in PUNNED_VECTYPE.
+
+   If successful and ELSVALS is nonzero the supported
+   else values will be stored in the vector ELSVALS points to.  */
+
+static bool
+vect_use_grouped_gather (stmt_vec_info stmt_info, tree vectype,
+			 loop_vec_info loop_vinfo, bool masked_p,
+			 unsigned int nelts,
+			 gather_scatter_info *info, vec<int> *elsvals,
+			 tree *pun_vectype)
+{
+  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+  data_reference *dr = dr_info->dr;
+
+  /* TODO: We can support nelts > BITS_PER_UNIT or non-power-of-two by
+     multiple gathers/scatter.  */
+  if (nelts > BITS_PER_UNIT || !pow2p_hwi (nelts))
+    return false;
+
+  /* Pun the vectype with one of the same size but an element spanning
+     NELTS elements of VECTYPE.
+     The punned type of a V16QI with NELTS = 4 would be V4SI.
+   */
+  *pun_vectype = vect_get_punning_vectype (vectype, nelts);
+
+  if (!*pun_vectype)
+    return false;
+
+  internal_fn ifn;
+  tree offset_vectype = *pun_vectype;
+
+  internal_fn strided_ifn = DR_IS_READ (dr)
+    ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
+
+  /* Check if we have a gather/scatter with the new type.  We're just trying
+     with the type itself as offset for now.  If not, check if we have a
+     strided load/store.  These have fewer constraints (for example no offset
+     type must exist) so it is possible that even though a gather/scatter is
+     not available we still have a strided load/store.  */
+  bool ok = false;
+  if (vect_gather_scatter_fn_p
+      (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
+       TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
+       &offset_vectype, elsvals))
+    ok = true;
+  else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
+					    elsvals))
+    {
+      /* Use gather/scatter IFNs, vect_get_strided_load_store_ops
+	 will switch back to the strided variants.  */
+      ifn = DR_IS_READ (dr) ? IFN_MASK_LEN_GATHER_LOAD :
+	IFN_MASK_LEN_SCATTER_STORE;
+      ok = true;
+    }
+
+  if (ok)
+    {
+      info->ifn = ifn;
+      info->decl = NULL_TREE;
+      info->base = dr->ref;
+      info->alias_ptr = build_int_cst
+	(reference_alias_ptr_type (DR_REF (dr)),
+	 get_object_alignment (DR_REF (dr)));
+      info->element_type = TREE_TYPE (vectype);
+      info->offset_vectype = offset_vectype;
+      /* No need to set the offset, vect_get_strided_load_store_ops
+	 will do that.  */
+      info->scale = 1;
+      info->memory_type = TREE_TYPE (DR_REF (dr));
+      return true;
+    }
+
+  return false;
+}
+
+
 /* Return true if we can use gather/scatter internal functions
    to vectorize STMT_INFO, which is a grouped or strided load or store.
    MASKED_P is true if load or store is conditional.  When returning
@@ -1978,6 +2096,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   int *misalignment = &ls->misalignment;
   internal_fn *lanes_ifn = &ls->lanes_ifn;
   vec<int> *elsvals = &ls->elsvals;
+  tree *pun_vectype = &ls->pun_vectype;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -1989,6 +2108,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
 
   *misalignment = DR_MISALIGNMENT_UNKNOWN;
   *poffset = 0;
+  *pun_vectype = NULL_TREE;
 
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
@@ -2321,13 +2441,13 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   if ((*memory_access_type == VMAT_ELEMENTWISE
        || *memory_access_type == VMAT_STRIDED_SLP)
       && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
-      && SLP_TREE_LANES (slp_node) == 1
       && loop_vinfo)
     {
       gather_scatter_info gs_info;
-      if (vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
-					      masked_p, &gs_info, elsvals,
-					      group_size, single_element_p))
+      if (SLP_TREE_LANES (slp_node) == 1
+	  && vect_use_strided_gather_scatters_p (stmt_info, vectype, loop_vinfo,
+						 masked_p, &gs_info, elsvals,
+						 group_size, single_element_p))
	{
	  SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
	  SLP_TREE_GS_BASE (slp_node) = error_mark_node;
@@ -2335,6 +2455,28 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
	  ls->strided_offset_vectype = gs_info.offset_vectype;
	  *memory_access_type = VMAT_GATHER_SCATTER_IFN;
	}
+      else if (SLP_TREE_LANES (slp_node) > 1
+	       && vect_use_grouped_gather (stmt_info, vectype, loop_vinfo,
+					   masked_p, SLP_TREE_LANES (slp_node),
+					   &gs_info, elsvals, pun_vectype))
+	{
+	  int puntype_misalignment = dr_misalignment
+	    (first_dr_info, *pun_vectype, *poffset);
+	  dr_alignment_support puntype_alignment_scheme
+	    = vect_supportable_dr_alignment
+	      (vinfo, first_dr_info, *pun_vectype, puntype_misalignment,
+	       true);
+
+	  if (puntype_alignment_scheme == dr_aligned
+	      || puntype_alignment_scheme == dr_unaligned_supported)
+	    {
+	      SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
+	      SLP_TREE_GS_BASE (slp_node) = error_mark_node;
+	      ls->gs.ifn = gs_info.ifn;
+	      ls->strided_offset_vectype = gs_info.offset_vectype;
+	      *memory_access_type = VMAT_GATHER_SCATTER_IFN;
+	    }
+	}
     }
 
   if (*memory_access_type == VMAT_CONTIGUOUS_DOWN
@@ -2351,14 +2493,15 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
     }
   else
     {
+      tree vtype = ls->pun_vectype ? ls->pun_vectype : vectype;
       if (mat_gather_scatter_p (*memory_access_type)
	  && !first_dr_info)
	*misalignment = DR_MISALIGNMENT_UNKNOWN;
       else
-	*misalignment = dr_misalignment (first_dr_info, vectype, *poffset);
+	*misalignment = dr_misalignment (first_dr_info, vtype, *poffset);
       *alignment_support_scheme
	= vect_supportable_dr_alignment
-	  (vinfo, first_dr_info, vectype, *misalignment,
+	  (vinfo, first_dr_info, vtype, *misalignment,
	   mat_gather_scatter_p (*memory_access_type));
     }
 
@@ -8364,10 +8507,13 @@ vectorizable_store (vec_info *vinfo,
	{
	  aggr_type = elem_type;
	  if (!costing_p)
-	    vect_get_strided_load_store_ops (stmt_info, slp_node, vectype,
-					     ls.strided_offset_vectype,
-					     loop_vinfo, gsi,
-					     &bump, &vec_offset, loop_lens);
+	    {
+	      tree vtype = ls.pun_vectype ? ls.pun_vectype : vectype;
+	      vect_get_strided_load_store_ops (stmt_info, slp_node, vtype,
+					       ls.strided_offset_vectype,
+					       loop_vinfo, gsi,
+					       &bump, &vec_offset, loop_lens);
+	    }
	}
       else
	{
@@ -8553,7 +8699,9 @@ vectorizable_store (vec_info *vinfo,
 
   if (mat_gather_scatter_p (memory_access_type))
     {
-      gcc_assert (!grouped_store);
+      gcc_assert (!grouped_store || ls.pun_vectype);
+      if (ls.pun_vectype)
+	vectype = ls.pun_vectype;
       auto_vec<tree> vec_offsets;
       unsigned int inside_cost = 0, prologue_cost = 0;
       int num_stmts = vec_num;
@@ -8600,8 +8748,9 @@ vectorizable_store (vec_info *vinfo,
	  if (mask_node)
	    vec_mask = vec_masks[j];
	  /* We should have catched mismatched types earlier.  */
-	  gcc_assert (useless_type_conversion_p (vectype,
-						 TREE_TYPE (vec_oprnd)));
+	  gcc_assert (ls.pun_vectype
+		      || useless_type_conversion_p
+			   (vectype, TREE_TYPE (vec_oprnd)));
	}
       tree final_mask = NULL_TREE;
       tree final_len = NULL_TREE;
@@ -8654,6 +8803,18 @@ vectorizable_store (vec_info *vinfo,
	    }
	}
 
+      if (ls.pun_vectype)
+	{
+	  gimple *conv_stmt
+	    = gimple_build_assign (make_ssa_name (vectype),
+				   VIEW_CONVERT_EXPR,
+				   build1 (VIEW_CONVERT_EXPR, vectype,
+					   vec_oprnd));
+	  vect_finish_stmt_generation (vinfo, stmt_info, conv_stmt,
+				       gsi);
+	  vec_oprnd = gimple_get_lhs (conv_stmt);
+	}
+
       gcall *call;
       if (final_len && final_mask)
	{
@@ -10413,7 +10574,14 @@ vectorizable_load (vec_info *vinfo,
 
   if (mat_gather_scatter_p (memory_access_type))
     {
-      gcc_assert (!grouped_load && !slp_perm);
+      gcc_assert ((!grouped_load && !slp_perm) || ls.pun_vectype);
+
+      /* If we pun the original vectype the loads as well as costing, length,
+	 etc. is performed with the new type.  After loading we VIEW_CONVERT
+	 the data to the original vectype.  */
+      tree original_vectype = vectype;
+      if (ls.pun_vectype)
+	vectype = ls.pun_vectype;
 
       /* 1. Create the vector or array pointer update chain.  */
       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
@@ -10754,6 +10922,17 @@ vectorizable_load (vec_info *vinfo,
	      new_temp = new_temp2;
	    }
 
+	  if (ls.pun_vectype)
+	    {
+	      new_stmt = gimple_build_assign (make_ssa_name
+					      (original_vectype),
+					      VIEW_CONVERT_EXPR,
+					      build1 (VIEW_CONVERT_EXPR,
+						      original_vectype,
+						      new_temp));
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+	    }
+
	  /* Store vector loads in the corresponding SLP_NODE.  */
	  slp_node->push_vec_def (new_stmt);
	}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index df805c6ade9..c7533d7a35b 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -287,6 +287,7 @@ struct vect_load_store_data : vect_data {
     tree decl;			// VMAT_GATHER_SCATTER_DECL
   } gs;
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
+  tree pun_vectype;	       // VMAT_GATHER_SCATTER_IFN
   auto_vec<int> elsvals;
 };
 
-- 
2.51.0