From: Andrei Nichita Tirziu <[email protected]>
The `vectorizable_call` function analyzes and transforms a given
`gcall` GIMPLE statement.
Prior to this commit, if the original `gcall` was:
- standard IFN => no change; just add its arguments.
- conditional IFN => if conditional-len IFN is available,
do a conversion; then add its arguments,
plus a `len` and `bias`.
- conditional-len IFN => no change; just add its arguments.
After this commit, if the original `gcall` is:
- standard IFN =>
  - if the conditional-len IFN is available, do a conversion;
    then add its arguments, plus a newly created `mask`, `else`,
    `len` and `bias`;
  - else, if the conditional IFN is available, do a conversion;
    then add its arguments, plus a newly created `mask` and `else`;
  - otherwise, leave it as it is and just add its arguments.
- conditional IFN => if conditional-len IFN is available,
do a conversion; then add its arguments,
plus a `len` and `bias`.
- conditional-len IFN => no change; just add its arguments.
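
For illustration, on a target where the conditional-len version is
available, the first case rewrites a call roughly as follows (a sketch:
the SSA names are invented, the conditional-len counterpart is assumed
to be named .COND_LEN_MATCH_EQ, and the exact operand positions come
from internal_fn_mask_index, internal_fn_else_index and
internal_fn_len_index):

    before:  res_1 = .MATCH_EQ (vect_a_2, vect_b_3);
    after:   res_1 = .COND_LEN_MATCH_EQ (loop_mask_4, vect_a_2,
                                         vect_b_3, else_5,
                                         loop_len_6, bias_7);

If only the conditional version is available, the rewrite instead adds
just the newly created `mask` and `else` operands.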
gcc/ChangeLog:
	* internal-fn.h (transform_to_conditional_version)
	(transform_to_conditional_len_version): New helper functions.
	* tree-vect-stmts.cc (vect_finish_stmt_generation): Request a
	later virtual SSA update when no virtual operand could be reused.
	(vectorizable_call): Also convert standard internal functions to
	their conditional or conditional-len versions when available.
Change-Id: Ib1f95f0b391e55b5f8935b5221cfe2b07a783c80
---
gcc/internal-fn.h | 40 ++++
gcc/tree-vect-stmts.cc | 448 ++++++++++++++++++++++++++++++++++-------
2 files changed, 417 insertions(+), 71 deletions(-)
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index b97c0dc60315..8462a9d359a6 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -183,6 +183,46 @@ vectorizable_internal_fn_p (internal_fn fn)
return direct_internal_fn_array[fn].vectorizable;
}
+/* Return true if an internal function should be transformed to its
+   conditional version by the vectorizer.  The transformation might also
+   depend on other things, such as the availability of an optab (or
+   other conditions), so this function only indicates that a
+   transformation should be considered, not necessarily applied.  */
+
+inline bool
+transform_to_conditional_version (internal_fn fn)
+{
+  switch (fn)
+    {
+    case IFN_MATCH_EQ:
+    case IFN_MATCH_NE:
+      return true;
+    default:
+      return false;
+    }
+}
+
+/* Return true if an internal function should be transformed to its
+   conditional-len version by the vectorizer.  The transformation might
+   also depend on other things, such as the availability of an optab
+   (or other conditions), so this function only indicates that a
+   transformation should be considered, not necessarily applied.  */
+
+inline bool
+transform_to_conditional_len_version (internal_fn fn)
+{
+  switch (fn)
+    {
+    case IFN_MATCH_EQ:
+    case IFN_MATCH_NE:
+    case IFN_COND_MATCH_EQ:
+    case IFN_COND_MATCH_NE:
+      return true;
+    default:
+      return false;
+    }
+}
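+
+/* Note: these predicates only say that a conversion is worth
+   considering; vectorizable_call additionally checks
+   direct_internal_fn_supported_p on the candidate conditional or
+   conditional-len function before converting.  */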
+
/* Return optab information about internal function FN. Only meaningful
if direct_internal_fn_p (FN). */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a563238c4be0..075c20fd8d91 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1311,6 +1311,8 @@ vect_finish_stmt_generation (vec_info *vinfo,
stmt_vec_info stmt_info, gimple *vec_stmt,
gimple_stmt_iterator *gsi)
{
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
+
gcc_assert (!stmt_info || gimple_code (stmt_info->stmt) != GIMPLE_LABEL);
if (!gsi_end_p (*gsi)
@@ -1342,6 +1344,16 @@ vect_finish_stmt_generation (vec_info *vinfo,
}
}
}
+
+ /* If we could not re-use an existing virtual operand, updating virtual
+ SSA form will be needed later. */
+  if (loop_vinfo
+      && gimple_has_mem_ops (vec_stmt)
+      && gimple_vuse (vec_stmt) == NULL_TREE)
+    loop_vinfo->any_known_not_updated_vssa = true;
+
gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
vect_finish_stmt_generation_1 (vinfo, stmt_info, vec_stmt);
}
@@ -3568,14 +3580,23 @@ vectorizable_call (vec_info *vinfo,
return false;
}
- if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
- != VECTOR_BOOLEAN_TYPE_P (vectype_in))
+  /* In the case of IFN_MATCH_EQ and IFN_MATCH_NE, we know that the
+     input vectype is an integer vector while the output is a boolean
+     mask, so the two legitimately differ.  Only enforce matching
+     mask/nonmask vector types for other calls.  */
+  if (!gimple_call_internal_p (stmt)
+      || (gimple_call_internal_fn (stmt) != IFN_MATCH_EQ
+	  && gimple_call_internal_fn (stmt) != IFN_MATCH_NE))
+    {
+      if (VECTOR_BOOLEAN_TYPE_P (vectype_out)
+	  != VECTOR_BOOLEAN_TYPE_P (vectype_in))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"mixed mask and nonmask vector types\n");
return false;
}
+ }
if (vect_emulated_vector_p (vectype_in)
|| vect_emulated_vector_p (vectype_out))
@@ -3694,10 +3715,76 @@ vectorizable_call (vec_info *vinfo,
}
int reduc_idx = SLP_TREE_REDUC_IDX (slp_node);
- internal_fn cond_fn = (internal_fn_mask_index (ifn) != -1
- ? ifn : get_conditional_internal_fn (ifn));
- internal_fn cond_len_fn = get_len_internal_fn (ifn);
+
+ /* If the original IFN was already conditional, keep it.
+ Otherwise, for a "standard" IFN, we only get its conditional version
+ if the IFN supports being converted to such a form. */
+ internal_fn cond_fn = IFN_LAST;
+ if (internal_fn_mask_index (ifn) != -1)
+ cond_fn = ifn;
+  else if (transform_to_conditional_version (ifn) || could_trap)
+    cond_fn = get_conditional_internal_fn (ifn);
+
+ /* If the original IFN was conditional-len, keep it.
+ If the original IFN was conditional, look for its conditional-len version.
+ Otherwise, for a "standard" IFN, we only get its conditional-len version
+ if the IFN supports being converted to such a form. */
+ internal_fn cond_len_fn = IFN_LAST;
+ if (internal_fn_len_index (ifn) != -1)
+ cond_len_fn = ifn;
+  else if (internal_fn_mask_index (ifn) != -1
+	   || transform_to_conditional_len_version (ifn)
+	   || could_trap)
+    cond_len_fn = get_len_internal_fn (ifn);
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "For given IFN %s, found potential conditional version "
+ "%s and conditional-len version %s\n",
+ internal_fn_name (ifn),
+ internal_fn_name (cond_fn),
+ internal_fn_name (cond_len_fn));
+
int len_opno = internal_fn_len_index (cond_len_fn);
+
+ /* In accordance with the definition of `direct_internal_fn_info`,
+ get the types of an internal function (these are given by the values
+ of the `type0` and `type1` fields). */
+ auto internal_fn_vector_types = [&](internal_fn fn) -> tree_pair {
+ if (fn == IFN_LAST || !direct_internal_fn_p (fn))
+ return tree_pair (NULL_TREE, NULL_TREE);
+
+ const direct_internal_fn_info &info = direct_internal_fn (fn);
+
+    /* Map a type0/type1 value to a vectype, following the conventions
+       of `direct_internal_fn_info`: a negative value means the return
+       type; otherwise it is the index of the argument whose vectype
+       to use.  */
+ auto pick_type = [&](int idx) -> tree {
+ if (idx < 0)
+ return vectype_out;
+
+ if ((unsigned) idx < nargs && vectypes[idx])
+ return vectypes[idx];
+
+ return vectype_in;
+ };
+
+ return tree_pair (pick_type (info.type0), pick_type (info.type1));
+ };
+
+ /* The types of the conditional versions (with and without the length)
+ of the IFN. */
+ tree_pair cond_fn_types = internal_fn_vector_types (cond_fn);
+ tree_pair cond_len_fn_types = internal_fn_vector_types (cond_len_fn);
+
vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
vec_loop_lens *lens = (loop_vinfo ? &LOOP_VINFO_LENS (loop_vinfo) : NULL);
unsigned int nvectors = vect_get_num_copies (vinfo, slp_node);
@@ -3707,49 +3794,105 @@ vectorizable_call (vec_info *vinfo,
if (!vect_maybe_update_slp_op_vectype (slp_op[i],
vectypes[i]
? vectypes[i] : vectype_in))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "incompatible vector types for invariants\n");
- return false;
- }
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "incompatible vector types for invariants\n");
+ return false;
+ }
+
SLP_TREE_TYPE (slp_node) = call_vec_info_type;
DUMP_VECT_SCOPE ("vectorizable_call");
vect_model_simple_cost (vinfo, 1, slp_node, cost_vec);
+  /* Check whether we can use partial vectors: we need either a
+     reduction, an IFN that already had a mask when `vectorizable_call`
+     was entered (`mask_opno >= 0`), or an IFN that has a conditional
+     (or conditional-len) version.  */
if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
- && (reduc_idx >= 0 || mask_opno >= 0))
+ && (reduc_idx >= 0 || mask_opno >= 0 || cond_fn != IFN_LAST
+ || cond_len_fn != IFN_LAST))
+ {
+ if (reduc_idx >= 0
+ && (cond_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_fn, cond_fn_types,
+ OPTIMIZE_FOR_SPEED))
+ && (cond_len_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_len_fn,
+ cond_len_fn_types,
+ OPTIMIZE_FOR_SPEED)))
{
- if (reduc_idx >= 0
- && (cond_fn == IFN_LAST
- || !direct_internal_fn_supported_p (cond_fn, vectype_out,
- OPTIMIZE_FOR_SPEED))
- && (cond_len_fn == IFN_LAST
- || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
- OPTIMIZE_FOR_SPEED)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "can't use a fully-masked loop because no"
- " conditional operation is available.\n");
- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
- }
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because no"
+ " conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ else if (reduc_idx >= 0 || mask_opno >= 0)
+ {
+ tree scalar_mask = NULL_TREE;
+ if (mask_opno >= 0)
+ scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
+ if (cond_len_fn != IFN_LAST
+ && direct_internal_fn_supported_p (cond_len_fn, cond_len_fn_types,
+ OPTIMIZE_FOR_SPEED))
+ vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out, 1);
else
- {
- tree scalar_mask = NULL_TREE;
- if (mask_opno >= 0)
- scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
- if (cond_len_fn != IFN_LAST
- && direct_internal_fn_supported_p (cond_len_fn, vectype_out,
- OPTIMIZE_FOR_SPEED))
- vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out,
- 1);
- else
- vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
- scalar_mask);
- }
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
+ scalar_mask);
}
+ else
+ {
+	  /* Here we know there is no reduction and the IFN given to
+	     `vectorizable_call` had no mask initially, but it does have
+	     at least one conditional version.  */
+
+	  if (cond_len_fn != IFN_LAST
+	      && direct_internal_fn_supported_p (cond_len_fn,
+						 cond_len_fn_types,
+						 OPTIMIZE_FOR_SPEED))
+ {
+ /* We have a conditional-len version and there is a direct optab
+ that supports it. */
+ vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_out, 1);
+ LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Choosing conditional-len (%s) version "
+ "of original IFN %s\n",
+ internal_fn_name (cond_len_fn),
+ internal_fn_name (ifn));
+ }
+	  else if (cond_fn != IFN_LAST
+		   && direct_internal_fn_supported_p (cond_fn,
+						      cond_fn_types,
+						      OPTIMIZE_FOR_SPEED))
+ {
+ /* We have a conditional version and there is a direct optab that
+ supports it. */
+ vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out,
+ NULL_TREE);
+ LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Choosing conditional (%s) version "
+ "of original IFN %s\n",
+ internal_fn_name (cond_fn),
+ internal_fn_name (ifn));
+ }
+ else
+ {
+	      /* Conditional versions exist, but none of them is
+		 supported by a direct optab.  */
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because no"
+ " conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ }
+ }
return true;
}
@@ -3765,22 +3908,71 @@ vectorizable_call (vec_info *vinfo,
bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
bool len_loop_p = loop_vinfo && LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
unsigned int vect_nargs = nargs;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Transform phase: "
+		     "masked_loop_p = %d, "
+ "len_loop_p = %d\n",
+ masked_loop_p, len_loop_p);
+
if (len_loop_p)
{
+      /* Here we must transform the original IFN into its
+	 conditional-len version.  */
+
if (len_opno >= 0)
+ {
+	  /* Here we will emit a conditional-len function, which may take
+	     mask + else + len + bias.  Unlike `mask_opno`, which refers
+	     to the original IFN, `len_opno` refers to the conditional-len
+	     version of the original call, so reaching this point does not
+	     mean that the original IFN had a mask.  We know from the
+	     analysis phase that the conditional-len version exists and
+	     that it is supported by an optab.  */
+
+ if (mask_opno == -1 && internal_fn_mask_index (cond_len_fn) >= 0)
+ {
+	      /* The original IFN had no mask (this also implies that it
+		 had no len), so we have to add 4 arguments (mask, else,
+		 len, bias).  */
+ vect_nargs += 4;
+ }
+ else
{
- ifn = cond_len_fn;
- /* COND_* -> COND_LEN_* takes 2 extra arguments:LEN,BIAS. */
+ /* The original IFN either had a mask (in which case `mask_opno >= 0`)
+ or the conditional-len version doesn't require a mask.
+ This means that we only have to add 2 arguments (len, bias). */
vect_nargs += 2;
}
+
+ ifn = cond_len_fn;
+ }
else if (reduc_idx >= 0)
gcc_unreachable ();
}
- else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
+ else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ {
+ ifn = cond_fn;
+ vect_nargs += 2;
+ }
+ else if (masked_loop_p
+ && mask_opno == -1
+ && cond_fn != IFN_LAST
+ && internal_fn_mask_index (cond_fn) >= 0)
{
+      /* Here we must transform the original, non-masked IFN (since
+	 `mask_opno == -1`) into its conditional version.  We know from
+	 the analysis phase that the conditional version exists and that
+	 it is supported by an optab.  We have to add 2 arguments (mask,
+	 else).  */
ifn = cond_fn;
vect_nargs += 2;
}
+
if (clz_ctz_arg1)
++vect_nargs;
@@ -3817,39 +4009,143 @@ vectorizable_call (vec_info *vinfo,
/* Arguments are ready. Create the new vector stmt. */
FOR_EACH_VEC_ELT (vec_oprnds0, i, vec_oprnd0)
+ {
+ /* The indices of the mask, else, len. The index of the bias can
+ be determined from the index of len. */
+ int mask_index = internal_fn_mask_index (ifn);
+ int else_index = internal_fn_else_index (ifn);
+ int len_index = internal_fn_len_index (ifn);
+
+	/* If we generate a new loop mask, keep track of it here.  */
+ tree loop_mask_for_stmt = NULL_TREE;
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Transform phase: For IFN %s, got "
+			 "mask_index = %d, "
+			 "else_index = %d, "
+			 "len_index = %d, "
+ "bias_index = %d\n",
+ internal_fn_name (ifn),
+ mask_index, else_index,
+ len_index, len_index + 1);
+
+	/* An else operand is only meaningful if there is a mask.  */
+	if (mask_index < 0 && else_index >= 0)
+	  gcc_unreachable ();
+
+ /* If `mask_opno != -1`, we already had a mask for this loop (of
+ course, other things such as len, bias might still have
+ to be added).
+ Otherwise, we need to also create a mask. */
+ bool needs_new_loop_mask = ((masked_loop_p || len_loop_p)
+ && mask_opno == -1
+ && mask_index >= 0);
+
+	/* Helper that says whether a slot is reserved for the mask, else
+	   or len operand, so that we can skip it and drop the other
+	   operands into the correct positions.  */
+ auto is_reserved_slot = [&](int idx) -> bool {
+ if (needs_new_loop_mask)
+ {
+ /* We didn't have a mask originally, but we have to
+	      /* We did not have a mask originally, but we have to build
+		 one now, so keep the mask and else positions clear.  If
+		 the call already had a mask, there is no need to reserve
+		 any slots: the mask and else are already among its usual
+		 arguments that we copy.  */
+	      return idx == mask_index
+		     || (else_index >= 0 && idx == else_index);
+ }
+
+ if (len_loop_p)
+ {
+	      /* We need to keep clear the slots for len and bias.  */
+ return idx == len_index || idx == (len_index + 1);
+ }
+
+ return false;
+ };
+
+	/* If the initial call had no explicit mask but we need it now,
+	   synthesize the loop mask into the conditional IFN's mask
+	   operand.  */
+ if (needs_new_loop_mask)
{
- int varg = 0;
- /* Add the mask if necessary. */
- if (masked_loop_p && mask_opno == -1
- && (reduc_idx >= 0 || could_trap))
+ tree new_loop_mask;
+ if (masked_loop_p)
+ {
+	      /* We have a conditional version of the original IFN.  */
+ unsigned int vec_num = vec_oprnds0.length ();
+ new_loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
+ vec_num, vectype_out, i);
+
+ /* We only set this if we have a masked loop (conditional
+ version). In the case of conditional-len versions, there's
+ no need to carry the loop mask. */
+ loop_mask_for_stmt = new_loop_mask;
+ }
+ else
+ {
+	      /* We have a conditional-len version of the original
+		 IFN.  */
+	      new_loop_mask
+		= build_minus_one_cst (truth_type_for (vectype_out));
+ }
+
+ vargs[mask_index] = new_loop_mask;
+
+	  /* Check whether we also need to add the else operand.  */
+ if (else_index >= 0)
+ {
+ tree else_val = NULL_TREE;
+
+ if (reduc_idx >= 0)
{
- gcc_assert (internal_fn_mask_index (ifn) == varg);
- unsigned int vec_num = vec_oprnds0.length ();
- vargs[varg++] = vect_get_loop_mask (loop_vinfo, gsi, masks,
- vec_num, vectype_out, i);
+		  /* For reductions, use the running reduction value.  */
+ else_val = vec_defs[reduc_idx][i];
}
- size_t k;
- for (k = 0; k < nargs; k++)
+ else
{
- vec<tree> vec_oprndsk = vec_defs[k];
- vargs[varg++] = vec_oprndsk[i];
+ /* Pick the target-preferred inactive value when there is
+ no reduction seed we can reuse. */
+ auto_vec<tree> data_ops;
+ for (size_t k = 0; k < nargs; k++)
+ data_ops.safe_push (vec_defs[k][i]);
+ else_val = targetm.preferred_else_value (ifn,
+ vectype_out,
+ data_ops.length (),
+ data_ops.address ());
}
- /* Add the else value if necessary. */
- if (masked_loop_p && mask_opno == -1
- && (reduc_idx >= 0 || could_trap))
- {
- gcc_assert (internal_fn_else_index (ifn) == varg);
- if (reduc_idx >= 0)
- vargs[varg++] = vargs[reduc_idx + 1];
- else
- {
- auto else_value = targetm.preferred_else_value
- (cond_fn, vectype_out, varg - 1, &vargs[1]);
- vargs[varg++] = else_value;
- }
- }
- if (clz_ctz_arg1)
- vargs[varg++] = clz_ctz_arg1;
+
+ vargs[else_index] = else_val;
+ }
+ }
+
+ /* Copy the vector arguments into the non-reserved slots, skipping
+ over mask/else/len positions as needed. */
+ unsigned int current_varg_slot = 0;
+ for (size_t k = 0; k < nargs; k++)
+ {
+ while (is_reserved_slot (current_varg_slot))
+ current_varg_slot++;
+
+ vec<tree> vec_oprndsk = vec_defs[k];
+ vargs[current_varg_slot] = vec_oprndsk[i];
+ current_varg_slot++;
+ }
+
+ /* Ensure the clz/ctz extra argument, if present, lands after the
+ synthesized mask/else/len operands. */
+ if (clz_ctz_arg1)
+ {
+ while (is_reserved_slot (current_varg_slot))
+ current_varg_slot++;
+
+ vargs[current_varg_slot] = clz_ctz_arg1;
+ current_varg_slot++;
+ }
gimple *new_stmt;
if (modifier == NARROW)
@@ -3905,6 +4201,17 @@ vectorizable_call (vec_info *vinfo,
gimple_call_set_nothrow (call, true);
vect_finish_stmt_generation (vinfo, stmt_info, call, gsi);
new_stmt = call;
+
+ /* The result of this call is already masked by LOOP_MASK,
+ so note that to avoid re-applying it later. */
+ if (loop_mask_for_stmt
+ && loop_vinfo
+ && VECTOR_BOOLEAN_TYPE_P (vectype_out))
+	    loop_vinfo->vec_cond_masked_set.add ({new_temp,
+						  loop_mask_for_stmt});
}
slp_node->push_vec_def (new_stmt);
}
@@ -14818,4 +15125,3 @@ vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit)
return stmts;
}
-
--
2.52.0