Hi Kewen and Richard,
the attached v3 addresses the comments on v2. Among other changes:
- Rename to load_store where appropriate.
- Save the bias-adjusted length as a separate control. With a bias != 0
it is used instead of loop_len, and its computation is added to the loop header.
- Update the costs to reflect a bias.
Bootstrap and regtest were fine on z15 and p9.
Regards
Robin
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index bf033e31c1c..dc2756f83e9 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -5637,7 +5637,8 @@
(define_expand "len_load_v16qi"
[(match_operand:V16QI 0 "vlogical_operand")
(match_operand:V16QI 1 "memory_operand")
- (match_operand:QI 2 "gpc_reg_operand")]
+ (match_operand:QI 2 "gpc_reg_operand")
+ (match_operand:QI 3 "zero_constant")]
"TARGET_P9_VECTOR && TARGET_64BIT"
{
rtx mem = XEXP (operands[1], 0);
@@ -5651,6 +5652,7 @@
[(match_operand:V16QI 0 "memory_operand")
(match_operand:V16QI 1 "vlogical_operand")
(match_operand:QI 2 "gpc_reg_operand")
+ (match_operand:QI 3 "zero_constant")
]
"TARGET_P9_VECTOR && TARGET_64BIT"
{
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 2b41cb7fb7b..8df61f578bd 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5213,7 +5213,10 @@ which must be a vector mode. Operand 2 has whichever integer mode the
target prefers. If operand 2 exceeds the number of elements in mode
@var{m}, the behavior is undefined. If the target prefers the length
to be measured in bytes rather than elements, it should only implement
-this pattern for vectors of @code{QI} elements.
+this pattern for vectors of @code{QI} elements. Operand 3 specifies
+a bias predicate that determines whether a length of zero is permitted
+or not. If permitted, the predicate should only allow a zero immediate;
+otherwise it should only allow an immediate value of -1.
This pattern is not allowed to @code{FAIL}.
@@ -5226,7 +5229,10 @@ a vector mode. Operand 2 has whichever integer mode the target prefers.
If operand 2 exceeds the number of elements in mode @var{m}, the behavior
is undefined. If the target prefers the length to be measured in bytes
rather than elements, it should only implement this pattern for vectors
-of @code{QI} elements.
+of @code{QI} elements. Operand 3 specifies a bias predicate that
+determines whether a length of zero is permitted or not. If permitted,
+the predicate should only allow a zero immediate; otherwise it should
+only allow an immediate value of -1.
This pattern is not allowed to @code{FAIL}.
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8312d08aab2..d45f080c06f 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2696,9 +2696,9 @@ expand_call_mem_ref (tree type, gcall *stmt, int index)
static void
expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
{
- class expand_operand ops[3];
- tree type, lhs, rhs, maskt;
- rtx mem, target, mask;
+ class expand_operand ops[4];
+ tree type, lhs, rhs, maskt, biast;
+ rtx mem, target, mask, bias;
insn_code icode;
maskt = gimple_call_arg (stmt, 2);
@@ -2723,11 +2723,20 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
create_output_operand (&ops[0], target, TYPE_MODE (type));
create_fixed_operand (&ops[1], mem);
if (optab == len_load_optab)
- create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
- TYPE_UNSIGNED (TREE_TYPE (maskt)));
+ {
+ create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+ TYPE_UNSIGNED (TREE_TYPE (maskt)));
+ biast = gimple_call_arg (stmt, 3);
+ bias = expand_normal (biast);
+ create_input_operand (&ops[3], bias, QImode);
+ expand_insn (icode, 4, ops);
+ }
else
+ {
create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
- expand_insn (icode, 3, ops);
+ expand_insn (icode, 3, ops);
+ }
+
if (!rtx_equal_p (target, ops[0].value))
emit_move_insn (target, ops[0].value);
}
@@ -2741,9 +2750,9 @@ expand_partial_load_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
static void
expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
{
- class expand_operand ops[3];
- tree type, lhs, rhs, maskt;
- rtx mem, reg, mask;
+ class expand_operand ops[4];
+ tree type, lhs, rhs, maskt, biast;
+ rtx mem, reg, mask, bias;
insn_code icode;
maskt = gimple_call_arg (stmt, 2);
@@ -2766,11 +2775,19 @@ expand_partial_store_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
create_fixed_operand (&ops[0], mem);
create_input_operand (&ops[1], reg, TYPE_MODE (type));
if (optab == len_store_optab)
- create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
- TYPE_UNSIGNED (TREE_TYPE (maskt)));
+ {
+ create_convert_operand_from (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)),
+ TYPE_UNSIGNED (TREE_TYPE (maskt)));
+ biast = gimple_call_arg (stmt, 4);
+ bias = expand_normal (biast);
+ create_input_operand (&ops[3], bias, QImode);
+ expand_insn (icode, 4, ops);
+ }
else
- create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
- expand_insn (icode, 3, ops);
+ {
+ create_input_operand (&ops[2], mask, TYPE_MODE (TREE_TYPE (maskt)));
+ expand_insn (icode, 3, ops);
+ }
}
#define expand_mask_store_optab_fn expand_partial_store_optab_fn
@@ -4172,6 +4189,29 @@ internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
&& insn_operand_matches (icode, 4, GEN_INT (align)));
}
+/* Return the supported bias for IFN, which is either IFN_LEN_LOAD or
+ IFN_LEN_STORE. For now we only support the biases of 0 and -1 (in case
+ 0 is not an allowable length). If none of the biases match what the
+ backend provides, return VECT_PARTIAL_BIAS_UNSUPPORTED. */
+
+signed char
+internal_len_load_store_bias (internal_fn ifn, machine_mode mode)
+{
+ optab optab = direct_internal_fn_optab (ifn);
+ insn_code icode = direct_optab_handler (optab, mode);
+
+ if (icode != CODE_FOR_nothing)
+ {
+ /* Operand 3 is the bias. Only 0 and -1 are supported; try 0 first. */
+ if (insn_operand_matches (icode, 3, GEN_INT (0)))
+ return 0;
+ if (insn_operand_matches (icode, 3, GEN_INT (-1)))
+ return -1;
+ }
+
+ return VECT_PARTIAL_BIAS_UNSUPPORTED;
+}
+
/* Expand STMT as though it were a call to internal function FN. */
void
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 19d0f849a5a..d46aa4ebf33 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -227,6 +227,10 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
tree, tree, int);
extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
poly_uint64, unsigned int);
+#define VECT_PARTIAL_BIAS_UNSUPPORTED 127
+
+extern signed char internal_len_load_store_bias (internal_fn ifn,
+ machine_mode);
extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
bool, bool, bool, bool, tree *);
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 4988c93fdb6..931378820ac 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -421,6 +421,7 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm,
static tree
vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
gimple_seq *preheader_seq,
+ gimple_seq *header_seq,
gimple_stmt_iterator loop_cond_gsi,
rgroup_controls *rgc, tree niters,
tree niters_skip, bool might_wrap_p)
@@ -436,7 +437,7 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
tree length_limit = NULL_TREE;
/* For length, we need length_limit to ensure length in range. */
if (!use_masks_p)
- length_limit = build_int_cst (compare_type, nitems_per_ctrl);
+ length_limit = build_int_cst (compare_type, nitems_per_ctrl);
/* Calculate the maximum number of item values that the rgroup
handles in total, the number that it handles for each iteration
@@ -560,8 +561,9 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
{
/* Previous controls will cover BIAS items. This control covers the
next batch. */
+ tree bias_tree;
poly_uint64 bias = nitems_per_ctrl * i;
- tree bias_tree = build_int_cst (compare_type, bias);
+ bias_tree = build_int_cst (compare_type, bias);
/* See whether the first iteration of the vector loop is known
to have a full control. */
@@ -664,6 +666,20 @@ vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo,
vect_set_loop_control (loop, ctrl, init_ctrl, next_ctrl);
}
+
+ int partial_load_bias = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+ if (partial_load_bias != 0
+ && partial_load_bias != VECT_PARTIAL_BIAS_UNSUPPORTED)
+ {
+ tree adjusted_len = rgc->bias_adjusted_ctrl;
+ gassign *minus = gimple_build_assign (adjusted_len, MINUS_EXPR,
+ rgc->controls[0],
+ build_int_cst
+ (TREE_TYPE (rgc->controls[0]),
+ -partial_load_bias));
+ gimple_seq_add_stmt (header_seq, minus);
+ }
+
return next_ctrl;
}
@@ -744,6 +760,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop,
/* Set up all controls for this group. */
test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo,
&preheader_seq,
+ &header_seq,
loop_cond_gsi, rgc,
niters, niters_skip,
might_wrap_p);
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index e94356d76e9..ceeb6920871 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1163,6 +1163,31 @@ vect_verify_loop_lens (loop_vec_info loop_vinfo)
if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
return false;
+ machine_mode len_load_mode = get_len_load_store_mode
+ (loop_vinfo->vector_mode, true).require ();
+ machine_mode len_store_mode = get_len_load_store_mode
+ (loop_vinfo->vector_mode, false).require ();
+
+ signed char partial_load_bias = internal_len_load_store_bias
+ (IFN_LEN_LOAD, len_load_mode);
+
+ signed char partial_store_bias = internal_len_load_store_bias
+ (IFN_LEN_STORE, len_store_mode);
+
+ gcc_assert (partial_load_bias == partial_store_bias);
+
+ if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
+ return false;
+
+ LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
+
+ /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
+ len_loads with a length of zero. In order to avoid that we prohibit
+ more than one loop length here. */
+ if (partial_load_bias == -1
+ && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
+ return false;
+
unsigned int max_nitems_per_iter = 1;
unsigned int i;
rgroup_controls *rgl;
@@ -4125,6 +4150,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
here. */
bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+ signed char partial_load_store_bias
+ = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
bool need_iterate_p
= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
&& !vect_known_niters_smaller_than_vf (loop_vinfo));
@@ -4157,6 +4184,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
for each since start index is zero. */
prologue_stmts += num_vectors;
+ /* If we have a non-zero partial load bias, we need one MINUS
+ and a MAX to adjust the load length. */
+ if (partial_load_store_bias != 0)
+ prologue_stmts += 2;
+
/* Each may need two MINs and one MINUS to update lengths in body
for next iteration. */
if (need_iterate_p)
@@ -9226,6 +9258,13 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
{
rgroup_controls *rgl = &(*lens)[nvectors - 1];
+ signed char partial_load_store_bias =
+ LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+
+ bool use_bias_adjusted_len =
+ partial_load_store_bias != VECT_PARTIAL_BIAS_UNSUPPORTED
+ && partial_load_store_bias != 0;
+
/* Populate the rgroup's len array, if this is the first time we've
used it. */
if (rgl->controls.is_empty ())
@@ -9235,6 +9274,15 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
{
tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
gcc_assert (len_type != NULL_TREE);
+
+ if (i == 0 && use_bias_adjusted_len)
+ {
+ tree adjusted_len =
+ make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
+ SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
+ rgl->bias_adjusted_ctrl = adjusted_len;
+ }
+
tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
/* Provide a dummy definition until the real one is available. */
@@ -9243,7 +9291,10 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
}
}
- return rgl->controls[index];
+ if (use_bias_adjusted_len)
+ return rgl->bias_adjusted_ctrl;
+ else
+ return rgl->controls[index];
}
/* Scale profiling counters by estimation for LOOP which is vectorized
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 17849b575b7..006f4c31217 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -8289,9 +8289,16 @@ vectorizable_store (vec_info *vinfo,
gsi);
vec_oprnd = var;
}
+
+ /* Check which bias value to use. */
+ signed char biasval = internal_len_load_store_bias
+ (IFN_LEN_STORE, new_vmode);
+
+ tree bias = build_int_cst (intQI_type_node, biasval);
gcall *call
- = gimple_build_call_internal (IFN_LEN_STORE, 4, dataref_ptr,
- ptr, final_len, vec_oprnd);
+ = gimple_build_call_internal (IFN_LEN_STORE, 5, dataref_ptr,
+ ptr, final_len, vec_oprnd,
+ bias);
gimple_call_set_nothrow (call, true);
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
new_stmt = call;
@@ -9588,22 +9595,30 @@ vectorizable_load (vec_info *vinfo,
vec_num * j + i);
tree ptr = build_int_cst (ref_type,
align * BITS_PER_UNIT);
+
+ machine_mode vmode = TYPE_MODE (vectype);
+ opt_machine_mode new_ovmode
+ = get_len_load_store_mode (vmode, true);
+ machine_mode new_vmode = new_ovmode.require ();
+ tree qi_type = unsigned_intQI_type_node;
+
+ /* Check which bias value to use. */
+ signed char biasval = internal_len_load_store_bias
+ (IFN_LEN_LOAD, new_vmode);
+
+ tree bias = build_int_cst (intQI_type_node, biasval);
+
gcall *call
- = gimple_build_call_internal (IFN_LEN_LOAD, 3,
+ = gimple_build_call_internal (IFN_LEN_LOAD, 4,
dataref_ptr, ptr,
- final_len);
+ final_len, bias);
gimple_call_set_nothrow (call, true);
new_stmt = call;
data_ref = NULL_TREE;
/* Need conversion if it's wrapped with VnQI. */
- machine_mode vmode = TYPE_MODE (vectype);
- opt_machine_mode new_ovmode
- = get_len_load_store_mode (vmode, true);
- machine_mode new_vmode = new_ovmode.require ();
if (vmode != new_vmode)
{
- tree qi_type = unsigned_intQI_type_node;
tree new_vtype
= build_vector_type_for_mode (qi_type, new_vmode);
tree var = vect_get_new_ssa_name (new_vtype,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index c4c5678e7f1..75fdfa31405 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -544,6 +544,10 @@ struct rgroup_controls {
/* A vector of nV controls, in iteration order. */
vec<tree> controls;
+
+ /* In case of len_load and len_store with a bias there is only one
+ rgroup. This holds the adjusted loop length for this rgroup. */
+ tree bias_adjusted_ctrl;
};
typedef auto_vec<rgroup_controls> vec_loop_masks;
@@ -749,6 +753,11 @@ public:
epilogue of loop. */
bool epil_using_partial_vectors_p;
+ /* The bias for len_load and len_store. For now, only 0 and -1 are
+ supported. -1 must be used when a backend does not support
+ len_load/len_store with a length of zero. */
+ signed char partial_load_store_bias;
+
/* When we have grouped data accesses with gaps, we may introduce invalid
memory accesses. We peel the last iteration of the loop to prevent
this. */
@@ -814,6 +823,7 @@ public:
#define LOOP_VINFO_USING_PARTIAL_VECTORS_P(L) (L)->using_partial_vectors_p
#define LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P(L) \
(L)->epil_using_partial_vectors_p
+#define LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS(L) (L)->partial_load_store_bias
#define LOOP_VINFO_VECT_FACTOR(L) (L)->vectorization_factor
#define LOOP_VINFO_MAX_VECT_FACTOR(L) (L)->max_vectorization_factor
#define LOOP_VINFO_MASKS(L) (L)->masks