The following example
/* Example input: replace each of the first n elements of c with its
   single-precision square root, in place.  The parameter d is never
   referenced in the body.  */
void f (float *__restrict c, int *__restrict d, int n)
{
/* The body runs zero times when n <= 0.  */
for (int i = 0; i < n; i++)
{
/* Each element is read and written back at the same index.  */
c[i] = __builtin_sqrtf (c[i]);
}
}
compiled with -O3 -march=armv9-a -fno-math-errno -ftrapping-math needs to be
predicated on the conditional. It's invalid to execute the branch and use a
select to extract it later unless using -fno-trapping-math.
We currently generate:
f:
cmp w2, 0
ble .L1
mov x1, 0
whilelo p7.s, wzr, w2
ptrue p6.b, all
.L3:
ld1w z31.s, p7/z, [x0, x1, lsl 2]
fsqrt z31.s, p6/m, z31.s
st1w z31.s, p7, [x0, x1, lsl 2]
incw x1
whilelo p7.s, w1, w2
b.any .L3
.L1:
ret
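This means the inactive lanes of the operation can raise a floating-point
exception (FE), because the fsqrt is governed by the all-true predicate p6
rather than the loop predicate p7. With this change we now generate: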
f:
cmp w2, 0
ble .L1
mov x1, 0
whilelo p7.s, wzr, w2
.p2align 5,,15
.L3:
ld1w z31.s, p7/z, [x0, x1, lsl 2]
fsqrt z31.s, p7/m, z31.s
st1w z31.s, p7, [x0, x1, lsl 2]
incw x1
whilelo p7.s, w1, w2
b.any .L3
.L1:
ret
However, as discussed in PR96373, while we probably shouldn't vectorize in the
cases where the operation can trap but the conditional operation isn't
supported, there doesn't seem to be a clear consensus on how GCC should handle
trapping math.
As such, similar to PR96373, I don't stop vectorization when trapping math is
enabled and the conditional operation isn't supported.
Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.
Any comments?
Thanks,
Tamar
gcc/ChangeLog:
PR tree-optimization/122103
* tree-vect-stmts.cc (vectorizable_call): Handle trapping math.
---
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index
a2f345c97d1c94ecdcfaf8e50461157e90127a7a..0ab8839e8006bf0e6b1bd25031a0291478a385bd
100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo,
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
tree fndecl, new_temp, rhs_type;
- enum vect_def_type dt[4]
+ enum vect_def_type dt[5]
= { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
- vect_unknown_def_type };
+ vect_unknown_def_type, vect_unknown_def_type };
tree vectypes[ARRAY_SIZE (dt)] = {};
slp_tree slp_op[ARRAY_SIZE (dt)] = {};
auto_vec<tree, 8> vargs;
@@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo,
/* Bail out if the function has more than four arguments, we do not have
interesting builtin functions to vectorize with more than two arguments
- except for fma. No arguments is also not good. */
- if (nargs == 0 || nargs > 4)
+ except for fma (cond_fma has more). No arguments is also not good. */
+ if (nargs == 0 || nargs > 5)
return false;
/* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */
@@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo,
ifn = vectorizable_internal_function (cfn, callee, vectype_out,
vectype_in);
+ /* Check if the operation traps. */
+ bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
+ if (could_trap && cost_vec && loop_vinfo)
+ {
+ /* If the operation can trap it must be conditional, otherwise fail. */
+ internal_fn cond_fn = get_conditional_internal_fn (ifn);
+ internal_fn cond_len_fn = get_len_internal_fn (ifn);
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ /* We assume that BB SLP fills all lanes, so no inactive lanes can
+ cause issues. */
+ if ((cond_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_fn, vectype_out,
+ OPTIMIZE_FOR_SPEED))
+ && (cond_len_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
+ OPTIMIZE_FOR_SPEED)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because no"
+ " conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ }
+ }
+
/* If that fails, try asking for a target-specific built-in function. */
if (ifn == IFN_LAST)
{
@@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo,
else if (reduc_idx >= 0)
gcc_unreachable ();
}
- else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
{
ifn = cond_fn;
vect_nargs += 2;
@@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo,
{
int varg = 0;
/* Add the mask if necessary. */
- if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ if (masked_loop_p && mask_opno == -1
+ && (reduc_idx >= 0 || could_trap))
{
gcc_assert (internal_fn_mask_index (ifn) == varg);
unsigned int vec_num = vec_oprnds0.length ();
@@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo,
vargs[varg++] = vec_oprndsk[i];
}
/* Add the else value if necessary. */
- if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ if (masked_loop_p && mask_opno == -1
+ && (reduc_idx >= 0 || could_trap))
{
gcc_assert (internal_fn_else_index (ifn) == varg);
- vargs[varg++] = vargs[reduc_idx + 1];
+ if (reduc_idx >= 0)
+ vargs[varg++] = vargs[reduc_idx + 1];
+ else
+ {
+ auto else_value = targetm.preferred_else_value
+ (cond_fn, vectype_out, varg - 1, &vargs[1]);
+ vargs[varg++] = else_value;
+ }
}
if (clz_ctz_arg1)
vargs[varg++] = clz_ctz_arg1;
--
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a2f345c97d1c94ecdcfaf8e50461157e90127a7a..0ab8839e8006bf0e6b1bd25031a0291478a385bd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo,
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
tree fndecl, new_temp, rhs_type;
- enum vect_def_type dt[4]
+ enum vect_def_type dt[5]
= { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
- vect_unknown_def_type };
+ vect_unknown_def_type, vect_unknown_def_type };
tree vectypes[ARRAY_SIZE (dt)] = {};
slp_tree slp_op[ARRAY_SIZE (dt)] = {};
auto_vec<tree, 8> vargs;
@@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo,
/* Bail out if the function has more than four arguments, we do not have
interesting builtin functions to vectorize with more than two arguments
- except for fma. No arguments is also not good. */
- if (nargs == 0 || nargs > 4)
+ except for fma (cond_fma has more). No arguments is also not good. */
+ if (nargs == 0 || nargs > 5)
return false;
/* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */
@@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo,
ifn = vectorizable_internal_function (cfn, callee, vectype_out,
vectype_in);
+ /* Check if the operation traps. */
+ bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
+ if (could_trap && cost_vec && loop_vinfo)
+ {
+ /* If the operation can trap it must be conditional, otherwise fail. */
+ internal_fn cond_fn = get_conditional_internal_fn (ifn);
+ internal_fn cond_len_fn = get_len_internal_fn (ifn);
+ if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+ {
+ /* We assume that BB SLP fills all lanes, so no inactive lanes can
+ cause issues. */
+ if ((cond_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_fn, vectype_out,
+ OPTIMIZE_FOR_SPEED))
+ && (cond_len_fn == IFN_LAST
+ || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
+ OPTIMIZE_FOR_SPEED)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't use a fully-masked loop because no"
+ " conditional operation is available.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+ }
+ }
+ }
+
/* If that fails, try asking for a target-specific built-in function. */
if (ifn == IFN_LAST)
{
@@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo,
else if (reduc_idx >= 0)
gcc_unreachable ();
}
- else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
{
ifn = cond_fn;
vect_nargs += 2;
@@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo,
{
int varg = 0;
/* Add the mask if necessary. */
- if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ if (masked_loop_p && mask_opno == -1
+ && (reduc_idx >= 0 || could_trap))
{
gcc_assert (internal_fn_mask_index (ifn) == varg);
unsigned int vec_num = vec_oprnds0.length ();
@@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo,
vargs[varg++] = vec_oprndsk[i];
}
/* Add the else value if necessary. */
- if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+ if (masked_loop_p && mask_opno == -1
+ && (reduc_idx >= 0 || could_trap))
{
gcc_assert (internal_fn_else_index (ifn) == varg);
- vargs[varg++] = vargs[reduc_idx + 1];
+ if (reduc_idx >= 0)
+ vargs[varg++] = vargs[reduc_idx + 1];
+ else
+ {
+ auto else_value = targetm.preferred_else_value
+ (cond_fn, vectype_out, varg - 1, &vargs[1]);
+ vargs[varg++] = else_value;
+ }
}
if (clz_ctz_arg1)
vargs[varg++] = clz_ctz_arg1;