https://gcc.gnu.org/g:573ea59a48d95fc4e9f520865eae71c5d9de614f
commit r16-5086-g573ea59a48d95fc4e9f520865eae71c5d9de614f
Author: Robin Dapp <[email protected]>
Date:   Wed Oct 29 16:02:51 2025 +0100

    vect: Relax gather/scatter scale handling.

    Similar to the signed/unsigned patch before, this one relaxes the
    gather/scatter restrictions on scale factors.  The basic idea is that
    a natively unsupported scale factor can still be reached by emitting
    a multiplication before the actual gather operation.  As before, we
    need to make sure that there is no overflow when multiplying.

    gcc/ChangeLog:

            * tree-vect-data-refs.cc (struct gather_scatter_config): Add
            scale.
            (vect_gather_scatter_get_configs): Try various scales.
            (vect_gather_scatter_fn_p): Add scale handling.
            (vect_check_gather_scatter): Add scale parameter.
            * tree-vect-stmts.cc (check_load_store_for_partial_vectors):
            Ditto.
            (vect_truncate_gather_scatter_offset): Ditto.
            (vect_use_grouped_gather): Ditto.
            (get_load_store_type): Ditto.
            (vectorizable_store): Scale offset if necessary.
            (vectorizable_load): Ditto.
            * tree-vectorizer.h (struct vect_load_store_data): Add
            supported_scale.
            (vect_gather_scatter_fn_p): Add argument.

Diff:
---
 gcc/tree-vect-data-refs.cc | 180 +++++++++++++++++++++++++++++++++++----------
 gcc/tree-vect-stmts.cc     |  71 +++++++++++++++---
 gcc/tree-vectorizer.h      |  11 ++-
 3 files changed, 209 insertions(+), 53 deletions(-)

diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index fb2450a30c45..e8cfb884c1d0 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -4431,6 +4431,7 @@ struct gather_scatter_config
 {
   internal_fn ifn;
   tree offset_vectype;
+  int scale;
   vec<int> elsvals;
 };
 
@@ -4523,38 +4524,62 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
       if (!offset_vectype)
         continue;
 
-      vec<int> elsvals = vNULL;
+      /* Try multiple scale values.  Start with exact match, then try
+         smaller common scales that a target might support.  */
+      int scales_to_try[] = {scale, 1, 2, 4, 8};
 
-      /* If we haven't determined which IFN is supported yet, try all three
-         to find which one the target supports.  */
-      if (ifn == IFN_LAST)
+      for (unsigned int j = 0;
+           j < sizeof (scales_to_try) / sizeof (*scales_to_try);
+           j++)
         {
-          ifn = vect_gather_scatter_which_ifn (read_p, masked_p,
-                                               vectype, memory_type,
-                                               offset_vectype, scale, &elsvals);
-          if (ifn != IFN_LAST)
+          int try_scale = scales_to_try[j];
+
+          /* Skip scales >= requested scale (except for exact match).  */
+          if (j > 0 && try_scale >= scale)
+            continue;
+
+          /* Skip if requested scale is not a multiple of this scale.  */
+          if (j > 0 && scale % try_scale != 0)
+            continue;
+
+          vec<int> elsvals = vNULL;
+
+          /* If we haven't determined which IFN is supported yet, try all three
+             to find which one the target supports.  */
+          if (ifn == IFN_LAST)
             {
-              /* Found which IFN is supported.  Save this configuration.  */
-              gather_scatter_config config;
-              config.ifn = ifn;
-              config.offset_vectype = offset_vectype;
-              config.elsvals = elsvals;
-              configs.safe_push (config);
+              ifn = vect_gather_scatter_which_ifn (read_p, masked_p,
+                                                   vectype, memory_type,
+                                                   offset_vectype, try_scale,
+                                                   &elsvals);
+              if (ifn != IFN_LAST)
+                {
+                  /* Found which IFN is supported.  Save this configuration.  */
+                  gather_scatter_config config;
+                  config.ifn = ifn;
+                  config.offset_vectype = offset_vectype;
+                  config.scale = try_scale;
+                  config.elsvals = elsvals;
+                  configs.safe_push (config);
+                }
             }
-        }
-      else
-        {
-          /* We already know which IFN is supported, just check if this
-             offset type works with it.  */
-          if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
-                                                      offset_vectype, scale,
-                                                      &elsvals))
+          else
             {
-              gather_scatter_config config;
-              config.ifn = ifn;
-              config.offset_vectype = offset_vectype;
-              config.elsvals = elsvals;
-              configs.safe_push (config);
+              /* We already know which IFN is supported, just check if this
+                 offset type and scale work with it.  */
+              if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+                                                          memory_type,
+                                                          offset_vectype,
+                                                          try_scale,
+                                                          &elsvals))
+                {
+                  gather_scatter_config config;
+                  config.ifn = ifn;
+                  config.offset_vectype = offset_vectype;
+                  config.scale = try_scale;
+                  config.elsvals = elsvals;
+                  configs.safe_push (config);
+                }
             }
         }
     }
@@ -4570,6 +4595,10 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
    base address.  If OFFSET_TYPE is scalar the function chooses an
    appropriate vector type for it.  SCALE is the amount by which the
    offset should be multiplied *after* it has been converted to address width.
+   If the target does not support the requested SCALE, SUPPORTED_SCALE
+   will contain the scale that is actually supported
+   (which may be smaller, requiring additional multiplication).
+   Otherwise SUPPORTED_SCALE is 0.
 
    Return true if the function is supported, storing the function id in
    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
@@ -4582,12 +4611,14 @@
 bool
 vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
                           tree vectype, tree memory_type, tree offset_type,
-                          int scale, internal_fn *ifn_out,
+                          int scale, int *supported_scale,
+                          internal_fn *ifn_out,
                           tree *offset_vectype_out,
                           tree *supported_offset_vectype,
                           vec<int> *elsvals)
 {
   *supported_offset_vectype = NULL_TREE;
+  *supported_scale = 0;
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
   unsigned int element_bits = vector_element_bits (vectype);
   if (element_bits != memory_bits)
@@ -4609,11 +4640,19 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
   if (configs.is_empty ())
     return false;
 
-  /* First, try to find a configuration that matches our offset type
-     (no conversion needed).  */
+  /* Selection priority:
+     1 - Exact scale match + offset type match
+     2 - Exact scale match + sign-swapped offset
+     3 - Smaller scale + offset type match
+     4 - Smaller scale + sign-swapped offset
+     Within each category, prefer smaller offset types.  */
+
+  /* First pass: exact scale match with no conversion.  */
   for (unsigned int i = 0; i < configs.length (); i++)
     {
-      if (TYPE_SIGN (configs[i].offset_vectype) == TYPE_SIGN (offset_vectype))
+      if (configs[i].scale == scale
+          && TYPE_SIGN (configs[i].offset_vectype)
+             == TYPE_SIGN (offset_vectype))
         {
           *ifn_out = configs[i].ifn;
           *offset_vectype_out = configs[i].offset_vectype;
@@ -4623,19 +4662,77 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
         }
     }
 
-  /* No direct match.  This means we try to find a sign-swapped offset
-     vectype.  */
+  /* No direct match.  This means we try to find either
+     - a sign-swapped offset vectype or
+     - a different scale and 2x larger offset type
+     - a different scale and larger sign-swapped offset vectype.  */
   unsigned int offset_precision = TYPE_PRECISION (TREE_TYPE (offset_vectype));
   unsigned int needed_precision
     = TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
   needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
 
+  /* Second pass: No direct match.  This means we try to find a sign-swapped
+     offset vectype.  */
   enum tree_code tmp;
   for (unsigned int i = 0; i < configs.length (); i++)
     {
       unsigned int precision
         = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
-      if (precision >= needed_precision
+      if (configs[i].scale == scale
+          && precision >= needed_precision
+          && (supportable_convert_operation (CONVERT_EXPR,
+                                             configs[i].offset_vectype,
+                                             offset_vectype, &tmp)
+              || (needed_precision == offset_precision
+                  && tree_nop_conversion_p (configs[i].offset_vectype,
+                                            offset_vectype))))
+        {
+          *ifn_out = configs[i].ifn;
+          *offset_vectype_out = offset_vectype;
+          *supported_offset_vectype = configs[i].offset_vectype;
+          if (elsvals)
+            *elsvals = configs[i].elsvals;
+          return true;
+        }
+    }
+
+  /* Third pass: Try a smaller scale with the same signedness.  */
+  needed_precision = offset_precision * 2;
+  needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
+
+  for (unsigned int i = 0; i < configs.length (); i++)
+    {
+      unsigned int precision
+        = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
+      if (configs[i].scale < scale
+          && precision >= needed_precision
+          && (supportable_convert_operation (CONVERT_EXPR,
+                                             configs[i].offset_vectype,
+                                             offset_vectype, &tmp)
+              || (needed_precision == offset_precision
+                  && tree_nop_conversion_p (configs[i].offset_vectype,
+                                            offset_vectype))))
+        {
+          *ifn_out = configs[i].ifn;
+          *offset_vectype_out = configs[i].offset_vectype;
+          *supported_scale = configs[i].scale;
+          if (elsvals)
+            *elsvals = configs[i].elsvals;
+          return true;
+        }
+    }
+
+  /* Fourth pass: Try a smaller scale and sign-swapped offset vectype.  */
+  needed_precision
+    = TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
+  needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
+
+  for (unsigned int i = 0; i < configs.length (); i++)
+    {
+      unsigned int precision
+        = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
+      if (configs[i].scale < scale
+          && precision >= needed_precision
           && (supportable_convert_operation (CONVERT_EXPR,
                                              configs[i].offset_vectype,
                                              offset_vectype, &tmp)
@@ -4646,6 +4743,7 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
           *ifn_out = configs[i].ifn;
           *offset_vectype_out = offset_vectype;
           *supported_offset_vectype = configs[i].offset_vectype;
+          *supported_scale = configs[i].scale;
           if (elsvals)
             *elsvals = configs[i].elsvals;
           return true;
@@ -4805,6 +4903,7 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
   base = fold_convert (sizetype, base);
   base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
 
+  int tmp_scale;
   tree tmp_offset_vectype;
 
   /* OFF at this point may be either a SSA_NAME or some tree expression
@@ -4878,14 +4977,16 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
              && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
                                            vectype, memory_type,
                                            signed_char_type_node,
-                                           new_scale, &ifn,
+                                           new_scale, &tmp_scale,
+                                           &ifn,
                                            &offset_vectype,
                                            &tmp_offset_vectype,
                                            elsvals)
              && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
                                            vectype, memory_type,
                                            unsigned_char_type_node,
-                                           new_scale, &ifn,
+                                           new_scale, &tmp_scale,
+                                           &ifn,
                                            &offset_vectype,
                                            &tmp_offset_vectype,
                                            elsvals))
@@ -4910,7 +5011,9 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
           && !POINTER_TYPE_P (TREE_TYPE (off))
           && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
                                        vectype, memory_type,
-                                       TREE_TYPE (off), scale, &ifn,
+                                       TREE_TYPE (off),
+                                       scale, &tmp_scale,
+                                       &ifn,
                                        &offset_vectype,
                                       &tmp_offset_vectype,
                                       elsvals))
@@ -4966,7 +5069,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
   if (use_ifn_p)
     {
       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-                                     vectype, memory_type, offtype, scale,
+                                     vectype, memory_type, offtype,
+                                     scale, &tmp_scale,
                                      &ifn, &offset_vectype,
                                      &tmp_offset_vectype, elsvals))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index da093d5021bc..2054f2afa6e0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1512,6 +1512,9 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
          we chose a different one use this instead.  */
       if (ls->supported_offset_vectype)
         off_vectype = ls->supported_offset_vectype;
+      /* Same for scale.  */
+      if (ls->supported_scale)
+        scale = ls->supported_scale;
 
       if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
                                                   memory_type,
@@ -1706,8 +1709,10 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
          no narrower than OFFSET_TYPE.  */
       tree memory_type = TREE_TYPE (DR_REF (dr));
       tree tmp_offset_vectype;
+      int tmp_scale;
       if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-                                     vectype, memory_type, offset_type, scale,
+                                     vectype, memory_type, offset_type,
+                                     scale, &tmp_scale,
                                      &gs_info->ifn, &gs_info->offset_vectype,
                                      &tmp_offset_vectype, elsvals)
           || gs_info->ifn == IFN_LAST)
@@ -1789,9 +1794,10 @@ vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
      not available we still have a strided load/store.  */
   bool ok = false;
   tree tmp_vectype;
+  int tmp_scale;
   if (vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
                                 *pun_vectype,
-                                TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
+                                TREE_TYPE (*pun_vectype), *pun_vectype, 1, &tmp_scale, &ifn,
                                 &offset_vectype, &tmp_vectype, elsvals))
     ok = true;
   else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
@@ -2091,6 +2097,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
   bool *slp_perm = &ls->slp_perm;
   unsigned *n_perms = &ls->n_perms;
   tree *supported_offset_vectype = &ls->supported_offset_vectype;
+  int *supported_scale = &ls->supported_scale;
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -2164,7 +2171,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
       tree tem;
       if (vect_gather_scatter_fn_p (loop_vinfo, vls_type == VLS_LOAD,
                                     masked_p, vectype, memory_type,
-                                    offset_vectype, scale,
+                                    offset_vectype, scale, supported_scale,
                                     &ls->gs.ifn, &tem,
                                     supported_offset_vectype, elsvals))
         {
@@ -2179,6 +2186,10 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
               dump_printf_loc (MSG_NOTE, vect_location,
                                "  target supports offset type %T.\n",
                                *supported_offset_vectype);
+            if (*supported_scale)
+              dump_printf_loc (MSG_NOTE, vect_location,
+                               "  target supports offset scale %d.\n",
+                               *supported_scale);
             }
           *memory_access_type = VMAT_GATHER_SCATTER_IFN;
         }
@@ -2455,7 +2466,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
       gcc_assert (vect_gather_scatter_fn_p (loop_vinfo, vls_type == VLS_LOAD,
                                             masked_p, vectype, gs_info.memory_type,
                                             TREE_TYPE (gs_info.offset),
-                                            gs_info.scale, &gs_info.ifn,
+                                            gs_info.scale, supported_scale, &gs_info.ifn,
                                             &tmp, supported_offset_vectype,
                                             elsvals));
       SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
@@ -8850,6 +8861,10 @@ vectorizable_store (vec_info *vinfo,
             inside_cost
               += record_stmt_cost (cost_vec, 1, vector_stmt,
                                    slp_node, 0, vect_body);
+          if (ls.supported_scale)
+            inside_cost
+              += record_stmt_cost (cost_vec, 1, vector_stmt,
+                                   slp_node, 0, vect_body);
 
           unsigned int cnunits = vect_nunits_for_cost (vectype);
           inside_cost
@@ -8864,12 +8879,26 @@ vectorizable_store (vec_info *vinfo,
           tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
           bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
 
-          /* Perform the offset conversion if necessary.  */
-          if (!strided && ls.supported_offset_vectype)
+          /* Perform the offset conversion and scaling if necessary.  */
+          if (!strided
+              && (ls.supported_offset_vectype || ls.supported_scale))
             {
               gimple_seq stmts = NULL;
-              vec_offset = gimple_convert
-                (&stmts, ls.supported_offset_vectype, vec_offset);
+              if (ls.supported_offset_vectype)
+                vec_offset = gimple_convert
+                  (&stmts, ls.supported_offset_vectype, vec_offset);
+              if (ls.supported_scale)
+                {
+                  tree mult_cst = build_int_cst
+                    (TREE_TYPE (TREE_TYPE (vec_offset)),
+                     SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
+                  tree mult = build_vector_from_val
+                    (TREE_TYPE (vec_offset), mult_cst);
+                  vec_offset = gimple_build
+                    (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
+                     vec_offset, mult);
+                  scale = size_int (ls.supported_scale);
+                }
               gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
             }
@@ -10691,6 +10720,10 @@ vectorizable_load (vec_info *vinfo,
             inside_cost
               += record_stmt_cost (cost_vec, 1, vector_stmt,
                                    slp_node, 0, vect_body);
+          if (ls.supported_scale)
+            inside_cost
+              += record_stmt_cost (cost_vec, 1, vector_stmt,
+                                   slp_node, 0, vect_body);
 
           unsigned int cnunits = vect_nunits_for_cost (vectype);
           inside_cost
@@ -10704,12 +10737,26 @@ vectorizable_load (vec_info *vinfo,
           tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
           bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
 
-          /* Perform the offset conversion if necessary.  */
-          if (!strided && ls.supported_offset_vectype)
+          /* Perform the offset conversion and scaling if necessary.  */
+          if (!strided
+              && (ls.supported_offset_vectype || ls.supported_scale))
             {
               gimple_seq stmts = NULL;
-              vec_offset = gimple_convert
-                (&stmts, ls.supported_offset_vectype, vec_offset);
+              if (ls.supported_offset_vectype)
+                vec_offset = gimple_convert
+                  (&stmts, ls.supported_offset_vectype, vec_offset);
+              if (ls.supported_scale)
+                {
+                  tree mult_cst = build_int_cst
+                    (TREE_TYPE (TREE_TYPE (vec_offset)),
+                     SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
+                  tree mult = build_vector_from_val
+                    (TREE_TYPE (vec_offset), mult_cst);
+                  vec_offset = gimple_build
+                    (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
+                     vec_offset, mult);
+                  scale = size_int (ls.supported_scale);
+                }
               gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
             }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b940a763a3c7..b7f3297a16b9 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -290,9 +290,14 @@ struct vect_load_store_data : vect_data {
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
   tree ls_type; // VMAT_GATHER_SCATTER_IFN
   /* This is set to a supported offset vector type if we don't support the
-     originally requested offset type.  In that case there will be an
-     additional offset conversion before the gather/scatter.  */
+     originally requested offset type, otherwise NULL.
+     If nonzero there will be an additional offset conversion before
+     the gather/scatter.  */
   tree supported_offset_vectype; // VMAT_GATHER_SCATTER_IFN
+  /* Similar for scale.  Only nonzero if we don't support the requested
+     scale.  Then we need to multiply the offset vector before the
+     gather/scatter.  */
+  int supported_scale; // VMAT_GATHER_SCATTER_IFN
   auto_vec<int> elsvals;
   /* True if the load requires a load permutation.  */
   bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
@@ -2592,7 +2597,7 @@ extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
 extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
 extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
 extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
-                                      tree, int, internal_fn *, tree *,
+                                      tree, int, int *, internal_fn *, tree *,
                                       tree *, vec<int> * = nullptr);
 extern bool vect_check_gather_scatter (stmt_vec_info, tree, loop_vec_info,
                                        gather_scatter_info *,
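
The transformation the patch performs is easiest to see outside the vectorizer.
The following standalone C sketch (gather_scale2 and gather_scale8 are invented
names for illustration, not GCC internals) shows why pre-multiplying the offset
lets a gather that only supports a smaller scale reach the requested one:
base + off * 8 and base + (off * 4) * 2 address the same element.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for a hardware gather element access with a fixed
   scale of 2: loads the int16_t at base + off * 2.  */
static int16_t
gather_scale2 (const char *base, int32_t off)
{
  return *(const int16_t *) (base + (intptr_t) off * 2);
}

/* Requested access with scale 8.  Only scale 2 is supported, so
   pre-multiply the offset by 8 / 2 = 4 and use the scale-2 form.
   The multiplication must not overflow the offset type.  */
static int16_t
gather_scale8 (const char *base, int32_t off)
{
  return gather_scale2 (base, off * 4);
}

int
main (void)
{
  int16_t data[32];
  for (int i = 0; i < 32; i++)
    data[i] = (int16_t) (i * 10);

  /* base + off * 8 addresses every fourth int16_t element.  */
  for (int32_t off = 0; off < 8; off++)
    printf ("%d ", gather_scale8 ((const char *) data, off));
  printf ("\n");  /* 0 40 80 120 160 200 240 280 */
  return 0;
}

The pre-multiplication is also why the overflow checks matter: multiplying the
offset consumes extra bits, which the needed_precision = offset_precision * 2
requirement in the third and fourth passes accounts for.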
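The candidate scales probed by vect_gather_scatter_get_configs can likewise be
sketched in isolation.  The minimal program below (print_candidate_scales is an
invented name) mirrors the filter from the patch: besides the exact match, only
scales strictly smaller than the requested one that divide it evenly are tried,
since the quotient must fold into a single integer multiplication of the offset
vector.

#include <stdio.h>

/* Print the scales that would be probed for a requested scale,
   following the scales_to_try filter from the patch.  */
static void
print_candidate_scales (int scale)
{
  int scales_to_try[] = {scale, 1, 2, 4, 8};
  printf ("scale %d:", scale);
  for (unsigned int j = 0;
       j < sizeof (scales_to_try) / sizeof (*scales_to_try);
       j++)
    {
      int try_scale = scales_to_try[j];
      /* Skip scales >= the requested one (except the exact match)
         and scales that do not divide the requested one.  */
      if (j > 0 && (try_scale >= scale || scale % try_scale != 0))
        continue;
      printf (" %d", try_scale);
    }
  printf ("\n");
}

int
main (void)
{
  print_candidate_scales (8);  /* scale 8: 8 1 2 4 */
  print_candidate_scales (4);  /* scale 4: 4 1 2 */
  print_candidate_scales (3);  /* scale 3: 3 1 */
  return 0;
}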