On Mon, Dec 29, 2025 at 4:30 PM Tamar Christina <[email protected]> wrote:
>
> The following example
>
> void f (float *__restrict c, int *__restrict d, int n)
> {
> for (int i = 0; i < n; i++)
> {
> c[i] = __builtin_sqrtf (c[i]);
> }
> }
>
> compiled with -O3 -march=armv9-a -fno-math-errno -ftrapping-math needs the
> operation to be predicated on the conditional (here, the loop mask). It's
> invalid to execute the operation unconditionally and use a select to extract
> the result later unless using -fno-trapping-math.
>
> We currently generate:
>
> f:
> cmp w2, 0
> ble .L1
> mov x1, 0
> whilelo p7.s, wzr, w2
> ptrue p6.b, all
> .L3:
> ld1w z31.s, p7/z, [x0, x1, lsl 2]
> fsqrt z31.s, p6/m, z31.s
> st1w z31.s, p7, [x0, x1, lsl 2]
> incw x1
> whilelo p7.s, w1, w2
> b.any .L3
> .L1:
> ret
>
> This means the inactive lanes of the operation can raise a floating-point
> exception (FE). With this change we now generate:
>
> f:
> cmp w2, 0
> ble .L1
> mov x1, 0
> whilelo p7.s, wzr, w2
> .p2align 5,,15
> .L3:
> ld1w z31.s, p7/z, [x0, x1, lsl 2]
> fsqrt z31.s, p7/m, z31.s
> st1w z31.s, p7, [x0, x1, lsl 2]
> incw x1
> whilelo p7.s, w1, w2
> b.any .L3
> .L1:
> ret
>
> However, as discussed in PR96373, while we probably shouldn't vectorize in the
> cases where we can trap but don't support the conditional operation, there
> doesn't seem to be a clear consensus on how GCC should handle trapping math.
>
> As such, similar to PR96373, I don't stop vectorization if trapping math is
> enabled and the conditional operation isn't supported.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 with no issues.
>
> Any comments?
OK. Testcase?
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/122103
> * tree-vect-stmts.cc (vectorizable_call): Handle trapping math.
>
> ---
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index
> a2f345c97d1c94ecdcfaf8e50461157e90127a7a..0ab8839e8006bf0e6b1bd25031a0291478a385bd
> 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo,
> loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
> tree fndecl, new_temp, rhs_type;
> - enum vect_def_type dt[4]
> + enum vect_def_type dt[5]
> = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
> - vect_unknown_def_type };
> + vect_unknown_def_type, vect_unknown_def_type };
> tree vectypes[ARRAY_SIZE (dt)] = {};
> slp_tree slp_op[ARRAY_SIZE (dt)] = {};
> auto_vec<tree, 8> vargs;
> @@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo,
>
> /* Bail out if the function has more than four arguments, we do not have
> interesting builtin functions to vectorize with more than two arguments
> - except for fma. No arguments is also not good. */
> - if (nargs == 0 || nargs > 4)
> + except for fma (cond_fma has more). No arguments is also not good. */
> + if (nargs == 0 || nargs > 5)
> return false;
>
> /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic. */
> @@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo,
> ifn = vectorizable_internal_function (cfn, callee, vectype_out,
> vectype_in);
>
> + /* Check if the operation traps. */
> + bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
> + if (could_trap && cost_vec && loop_vinfo)
> + {
> + /* If the operation can trap it must be conditional, otherwise fail.
> */
> + internal_fn cond_fn = get_conditional_internal_fn (ifn);
> + internal_fn cond_len_fn = get_len_internal_fn (ifn);
> + if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> + {
> + /* We assume that BB SLP fills all lanes, so no inactive lanes can
> + cause issues. */
> + if ((cond_fn == IFN_LAST
> + || !direct_internal_fn_supported_p (cond_fn, vectype_out,
> + OPTIMIZE_FOR_SPEED))
> + && (cond_len_fn == IFN_LAST
> + || !direct_internal_fn_supported_p (cond_len_fn,
> vectype_out,
> + OPTIMIZE_FOR_SPEED)))
> + {
> + if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> + "can't use a fully-masked loop because no"
> + " conditional operation is available.\n");
> + LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> + }
> + }
> + }
> +
> /* If that fails, try asking for a target-specific built-in function. */
> if (ifn == IFN_LAST)
> {
> @@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo,
> else if (reduc_idx >= 0)
> gcc_unreachable ();
> }
> - else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
> + else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 ||
> could_trap))
> {
> ifn = cond_fn;
> vect_nargs += 2;
> @@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo,
> {
> int varg = 0;
> /* Add the mask if necessary. */
> - if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
> + if (masked_loop_p && mask_opno == -1
> + && (reduc_idx >= 0 || could_trap))
> {
> gcc_assert (internal_fn_mask_index (ifn) == varg);
> unsigned int vec_num = vec_oprnds0.length ();
> @@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo,
> vargs[varg++] = vec_oprndsk[i];
> }
> /* Add the else value if necessary. */
> - if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
> + if (masked_loop_p && mask_opno == -1
> + && (reduc_idx >= 0 || could_trap))
> {
> gcc_assert (internal_fn_else_index (ifn) == varg);
> - vargs[varg++] = vargs[reduc_idx + 1];
> + if (reduc_idx >= 0)
> + vargs[varg++] = vargs[reduc_idx + 1];
> + else
> + {
> + auto else_value = targetm.preferred_else_value
> + (cond_fn, vectype_out, varg - 1, &vargs[1]);
> + vargs[varg++] = else_value;
> + }
> }
> if (clz_ctz_arg1)
> vargs[varg++] = clz_ctz_arg1;
>
>
> --