On Mon, Dec 29, 2025 at 4:30 PM Tamar Christina <[email protected]> wrote:
>
> The following example
>
> void f (float *__restrict c, int *__restrict d, int n)
> {
>     for (int i = 0; i < n; i++)
>     {
>       c[i] = __builtin_sqrtf (c[i]);
>     }
> }
>
> when compiled with -O3 -march=armv9-a -fno-math-errno -ftrapping-math, needs
> the operation to be predicated on the conditional.  It's invalid to execute the
> branch unconditionally and use a select to extract the result later unless
> using -fno-trapping-math.
>
> We currently generate:
>
> f:
>         cmp     w2, 0
>         ble     .L1
>         mov     x1, 0
>         whilelo p7.s, wzr, w2
>         ptrue   p6.b, all
> .L3:
>         ld1w    z31.s, p7/z, [x0, x1, lsl 2]
>         fsqrt   z31.s, p6/m, z31.s
>         st1w    z31.s, p7, [x0, x1, lsl 2]
>         incw    x1
>         whilelo p7.s, w1, w2
>         b.any   .L3
> .L1:
>         ret
>
> This means the inactive lanes of the operation can raise a floating-point
> exception (FE).  With this change we now generate:
>
> f:
>         cmp     w2, 0
>         ble     .L1
>         mov     x1, 0
>         whilelo p7.s, wzr, w2
>         .p2align 5,,15
> .L3:
>         ld1w    z31.s, p7/z, [x0, x1, lsl 2]
>         fsqrt   z31.s, p7/m, z31.s
>         st1w    z31.s, p7, [x0, x1, lsl 2]
>         incw    x1
>         whilelo p7.s, w1, w2
>         b.any   .L3
> .L1:
>         ret
>
> However, as discussed in PR96373, while we probably shouldn't vectorize for the
> cases where we can trap but don't support a conditional operation, there doesn't
> seem to be a clear consensus on how GCC should handle trapping math.
>
> As such, similar to PR96373, I don't stop vectorization if trapping math is
> enabled and the conditional operation isn't supported.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 with no issues.
>
> Any comments?

OK.  Testcase?

> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         PR tree-optimization/122103
>         * tree-vect-stmts.cc (vectorizable_call): Handle trapping math.
>
> ---
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a2f345c97d1c94ecdcfaf8e50461157e90127a7a..0ab8839e8006bf0e6b1bd25031a0291478a385bd 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo,
>    loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
>    tree fndecl, new_temp, rhs_type;
> -  enum vect_def_type dt[4]
> +  enum vect_def_type dt[5]
>      = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
> -       vect_unknown_def_type };
> +       vect_unknown_def_type, vect_unknown_def_type };
>    tree vectypes[ARRAY_SIZE (dt)] = {};
>    slp_tree slp_op[ARRAY_SIZE (dt)] = {};
>    auto_vec<tree, 8> vargs;
> @@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo,
>
>    /* Bail out if the function has more than four arguments, we do not have
>       interesting builtin functions to vectorize with more than two arguments
> -     except for fma.  No arguments is also not good.  */
> -  if (nargs == 0 || nargs > 4)
> +     except for fma (cond_fma has more).  No arguments is also not good.  */
> +  if (nargs == 0 || nargs > 5)
>      return false;
>
>    /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic.  */
> @@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo,
>      ifn = vectorizable_internal_function (cfn, callee, vectype_out,
>                                           vectype_in);
>
> +  /* Check if the operation traps.  */
> +  bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
> +  if (could_trap && cost_vec && loop_vinfo)
> +    {
> +      /* If the operation can trap it must be conditional, otherwise fail.  */
> +      internal_fn cond_fn = get_conditional_internal_fn (ifn);
> +      internal_fn cond_len_fn = get_len_internal_fn (ifn);
> +      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
> +       {
> +         /* We assume that BB SLP fills all lanes, so no inactive lanes can
> +            cause issues.  */
> +         if ((cond_fn == IFN_LAST
> +              || !direct_internal_fn_supported_p (cond_fn, vectype_out,
> +                                                  OPTIMIZE_FOR_SPEED))
> +             && (cond_len_fn == IFN_LAST
> +                 || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
> +                                                     OPTIMIZE_FOR_SPEED)))
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "can't use a fully-masked loop because no"
> +                                " conditional operation is available.\n");
> +             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> +           }
> +       }
> +    }
> +
>    /* If that fails, try asking for a target-specific built-in function.  */
>    if (ifn == IFN_LAST)
>      {
> @@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo,
>        else if (reduc_idx >= 0)
>         gcc_unreachable ();
>      }
> -  else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
> +  else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
>      {
>        ifn = cond_fn;
>        vect_nargs += 2;
> @@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo,
>             {
>               int varg = 0;
>               /* Add the mask if necessary.  */
> -             if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
> +             if (masked_loop_p && mask_opno == -1
> +                 && (reduc_idx >= 0 || could_trap))
>                 {
>                   gcc_assert (internal_fn_mask_index (ifn) == varg);
>                   unsigned int vec_num = vec_oprnds0.length ();
> @@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo,
>                   vargs[varg++] = vec_oprndsk[i];
>                 }
>               /* Add the else value if necessary.  */
> -             if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
> +             if (masked_loop_p && mask_opno == -1
> +                && (reduc_idx >= 0 || could_trap))
>                 {
>                   gcc_assert (internal_fn_else_index (ifn) == varg);
> -                 vargs[varg++] = vargs[reduc_idx + 1];
> +                 if (reduc_idx >= 0)
> +                   vargs[varg++] = vargs[reduc_idx + 1];
> +                 else
> +                   {
> +                     auto else_value = targetm.preferred_else_value
> +                       (cond_fn, vectype_out, varg - 1, &vargs[1]);
> +                     vargs[varg++] = else_value;
> +                   }
>                 }
>               if (clz_ctz_arg1)
>                 vargs[varg++] = clz_ctz_arg1;
>
>
> --

Reply via email to