[PATCH 6/6][vect]: teach vectorizable_call to predicate calls when they can trap [PR122103]

Tamar Christina Mon, 29 Dec 2025 07:29:56 -0800

The following example

/* Reproducer: replaces each of the first n elements of c with its square
   root via __builtin_sqrtf.  The parameter d is not used.  */
void f (float *__restrict c, int *__restrict d, int n)
{
    for (int i = 0; i < n; i++)
    {
      c[i] = __builtin_sqrtf (c[i]);
    }
}


compiled with -O3 -march=armv9-a -fno-math-errno -ftrapping-math needs the sqrt
to be predicated on the loop mask.  It's invalid to execute the operation
unpredicated and use a select to extract the active lanes later unless using
-fno-trapping-math.

We currently generate:

f:
        cmp     w2, 0
        ble     .L1
        mov     x1, 0
        whilelo p7.s, wzr, w2
        ptrue   p6.b, all
.L3:
        ld1w    z31.s, p7/z, [x0, x1, lsl 2]
        fsqrt   z31.s, p6/m, z31.s
        st1w    z31.s, p7, [x0, x1, lsl 2]
        incw    x1
        whilelo p7.s, w1, w2
        b.any   .L3
.L1:
        ret

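This means the inactive lanes of the operation can raise a floating-point
exception (FE), because the fsqrt is governed by the all-true predicate p6
rather than the loop mask p7.  With this change we now generate: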

f:
        cmp     w2, 0
        ble     .L1
        mov     x1, 0
        whilelo p7.s, wzr, w2
        .p2align 5,,15
.L3:
        ld1w    z31.s, p7/z, [x0, x1, lsl 2]
        fsqrt   z31.s, p7/m, z31.s
        st1w    z31.s, p7, [x0, x1, lsl 2]
        incw    x1
        whilelo p7.s, w1, w2
        b.any   .L3
.L1:
        ret

However, as discussed in PR96373, while we probably shouldn't vectorize the
cases where the operation can trap but the conditional operation isn't
supported, there doesn't seem to be a clear consensus on how GCC should handle
trapping math.

As such, similar to PR96373, I don't stop vectorization when trapping math is
enabled and the conditional operation isn't supported.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf and x86_64-pc-linux-gnu (-m32 and -m64)
with no issues.

Any comments?

Thanks,
Tamar

gcc/ChangeLog:

        PR tree-optimization/122103
        * tree-vect-stmts.cc (vectorizable_call): Handle trapping math.

---
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a2f345c97d1c94ecdcfaf8e50461157e90127a7a..0ab8839e8006bf0e6b1bd25031a0291478a385bd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo,
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
   tree fndecl, new_temp, rhs_type;
-  enum vect_def_type dt[4]
+  enum vect_def_type dt[5]
     = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
-       vect_unknown_def_type };
+       vect_unknown_def_type, vect_unknown_def_type };
   tree vectypes[ARRAY_SIZE (dt)] = {};
   slp_tree slp_op[ARRAY_SIZE (dt)] = {};
   auto_vec<tree, 8> vargs;
@@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo,
 
   /* Bail out if the function has more than four arguments, we do not have
      interesting builtin functions to vectorize with more than two arguments
-     except for fma.  No arguments is also not good.  */
-  if (nargs == 0 || nargs > 4)
+     except for fma (cond_fma has more).  No arguments is also not good.  */
+  if (nargs == 0 || nargs > 5)
     return false;
 
   /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic.  */
@@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo,
     ifn = vectorizable_internal_function (cfn, callee, vectype_out,
                                          vectype_in);
 
+  /* Check if the operation traps.  */
+  bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
+  if (could_trap && cost_vec && loop_vinfo)
+    {
+      /* If the operation can trap it must be conditional, otherwise fail.  */
+      internal_fn cond_fn = get_conditional_internal_fn (ifn);
+      internal_fn cond_len_fn = get_len_internal_fn (ifn);
+      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+       {
+         /* We assume that BB SLP fills all lanes, so no inactive lanes can
+            cause issues.  */
+         if ((cond_fn == IFN_LAST
+              || !direct_internal_fn_supported_p (cond_fn, vectype_out,
+                                                  OPTIMIZE_FOR_SPEED))
+             && (cond_len_fn == IFN_LAST
+                 || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
+                                                     OPTIMIZE_FOR_SPEED)))
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "can't use a fully-masked loop because no"
+                                " conditional operation is available.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+           }
+       }
+    }
+
   /* If that fails, try asking for a target-specific built-in function.  */
   if (ifn == IFN_LAST)
     {
@@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo,
       else if (reduc_idx >= 0)
        gcc_unreachable ();
     }
-  else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+  else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
     {
       ifn = cond_fn;
       vect_nargs += 2;
@@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo,
            {
              int varg = 0;
              /* Add the mask if necessary.  */
-             if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+             if (masked_loop_p && mask_opno == -1
+                 && (reduc_idx >= 0 || could_trap))
                {
                  gcc_assert (internal_fn_mask_index (ifn) == varg);
                  unsigned int vec_num = vec_oprnds0.length ();
@@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo,
                  vargs[varg++] = vec_oprndsk[i];
                }
              /* Add the else value if necessary.  */
-             if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+             if (masked_loop_p && mask_opno == -1
+                && (reduc_idx >= 0 || could_trap))
                {
                  gcc_assert (internal_fn_else_index (ifn) == varg);
-                 vargs[varg++] = vargs[reduc_idx + 1];
+                 if (reduc_idx >= 0)
+                   vargs[varg++] = vargs[reduc_idx + 1];
+                 else
+                   {
+                     auto else_value = targetm.preferred_else_value
+                       (cond_fn, vectype_out, varg - 1, &vargs[1]);
+                     vargs[varg++] = else_value;
+                   }
                }
              if (clz_ctz_arg1)
                vargs[varg++] = clz_ctz_arg1;


--

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a2f345c97d1c94ecdcfaf8e50461157e90127a7a..0ab8839e8006bf0e6b1bd25031a0291478a385bd 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3438,9 +3438,9 @@ vectorizable_call (vec_info *vinfo,
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
   tree fndecl, new_temp, rhs_type;
-  enum vect_def_type dt[4]
+  enum vect_def_type dt[5]
     = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
-	vect_unknown_def_type };
+	vect_unknown_def_type, vect_unknown_def_type };
   tree vectypes[ARRAY_SIZE (dt)] = {};
   slp_tree slp_op[ARRAY_SIZE (dt)] = {};
   auto_vec<tree, 8> vargs;
@@ -3481,8 +3481,8 @@ vectorizable_call (vec_info *vinfo,
 
   /* Bail out if the function has more than four arguments, we do not have
      interesting builtin functions to vectorize with more than two arguments
-     except for fma.  No arguments is also not good.  */
-  if (nargs == 0 || nargs > 4)
+     except for fma (cond_fma has more).  No arguments is also not good.  */
+  if (nargs == 0 || nargs > 5)
     return false;
 
   /* Ignore the arguments of IFN_GOMP_SIMD_LANE, they are magic.  */
@@ -3625,6 +3625,33 @@ vectorizable_call (vec_info *vinfo,
     ifn = vectorizable_internal_function (cfn, callee, vectype_out,
 					  vectype_in);
 
+  /* Check if the operation traps.  */
+  bool could_trap = gimple_could_trap_p (STMT_VINFO_STMT (stmt_info));
+  if (could_trap && cost_vec && loop_vinfo)
+    {
+      /* If the operation can trap it must be conditional, otherwise fail.  */
+      internal_fn cond_fn = get_conditional_internal_fn (ifn);
+      internal_fn cond_len_fn = get_len_internal_fn (ifn);
+      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+	{
+	  /* We assume that BB SLP fills all lanes, so no inactive lanes can
+	     cause issues.  */
+	  if ((cond_fn == IFN_LAST
+	       || !direct_internal_fn_supported_p (cond_fn, vectype_out,
+						   OPTIMIZE_FOR_SPEED))
+	      && (cond_len_fn == IFN_LAST
+		  || !direct_internal_fn_supported_p (cond_len_fn, vectype_out,
+						      OPTIMIZE_FOR_SPEED)))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				 "can't use a fully-masked loop because no"
+				 " conditional operation is available.\n");
+	      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+	    }
+	}
+    }
+
   /* If that fails, try asking for a target-specific built-in function.  */
   if (ifn == IFN_LAST)
     {
@@ -3749,7 +3776,7 @@ vectorizable_call (vec_info *vinfo,
       else if (reduc_idx >= 0)
 	gcc_unreachable ();
     }
-  else if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+  else if (masked_loop_p && mask_opno == -1 && (reduc_idx >= 0 || could_trap))
     {
       ifn = cond_fn;
       vect_nargs += 2;
@@ -3793,7 +3820,8 @@ vectorizable_call (vec_info *vinfo,
 	    {
 	      int varg = 0;
 	      /* Add the mask if necessary.  */
-	      if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+	      if (masked_loop_p && mask_opno == -1
+		  && (reduc_idx >= 0 || could_trap))
 		{
 		  gcc_assert (internal_fn_mask_index (ifn) == varg);
 		  unsigned int vec_num = vec_oprnds0.length ();
@@ -3807,10 +3835,18 @@ vectorizable_call (vec_info *vinfo,
 		  vargs[varg++] = vec_oprndsk[i];
 		}
 	      /* Add the else value if necessary.  */
-	      if (masked_loop_p && mask_opno == -1 && reduc_idx >= 0)
+	      if (masked_loop_p && mask_opno == -1
+		 && (reduc_idx >= 0 || could_trap))
 		{
 		  gcc_assert (internal_fn_else_index (ifn) == varg);
-		  vargs[varg++] = vargs[reduc_idx + 1];
+		  if (reduc_idx >= 0)
+		    vargs[varg++] = vargs[reduc_idx + 1];
+		  else
+		    {
+		      auto else_value = targetm.preferred_else_value
+			(cond_fn, vectype_out, varg - 1, &vargs[1]);
+		      vargs[varg++] = else_value;
+		    }
 		}
 	      if (clz_ctz_arg1)
 		vargs[varg++] = clz_ctz_arg1;

[PATCH 6/6][vect]: teach vectorizable_call to predicate calls when they can trap [PR122103]

Reply via email to