On Fri, 27 Jan 2023, Richard Sandiford wrote: > PR96373 points out that a predicated SVE loop currently converts > trapping unconditional ops into unpredicated vector ops. Doing > the operation on inactive lanes can then raise an exception. > > As discussed in the PR trail, we aren't 100% consistent about > whether we preserve traps or not. But the direction of travel > is clearly to improve that rather than live with it. This patch > tries to do that for the SVE case. > > Doing this regresses gcc.target/aarch64/sve/fabd_1.c. I've added > -fno-trapping-math for now and filed PR108571 to track it. > A similar problem applies to fsubr_1.d. > > I think this is likely to regress Power 10, since conditional > operations are only available for masked loops. I think we'll > need to add -fno-trapping-math to any affected testcases, > but I don't have a Power 10 system to test on. Kewen, would you > mind giving this a spin and seeing how bad the fallout is? > > Tested on aarch64-linux-gnu. OK to install assuming no blockers > on the Power 10 side?
OK. Thanks, Richard. > Richard > > > gcc/ > PR tree-optimization/96373 > * tree-vect-stmts.cc (vectorizable_operation): Predicate trapping > operations on the loop mask. Reject partial vectors if this isn't > possible. > > gcc/testsuite/ > PR tree-optimization/96373 > PR tree-optimization/108571 > * gcc.target/aarch64/sve/fabd_1.c: Add -fno-trapping-math. > * gcc.target/aarch64/sve/fsubr_1.c: Likewise. > * gcc.target/aarch64/sve/fmul_1.c: Expect predicate ops. > * gcc.target/aarch64/sve/fp_arith_1.c: Likewise. > --- > gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c | 2 +- > gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c | 12 +++---- > .../gcc.target/aarch64/sve/fp_arith_1.c | 12 +++---- > .../gcc.target/aarch64/sve/fsubr_1.c | 2 +- > gcc/tree-vect-stmts.cc | 32 ++++++++++++++----- > 5 files changed, 38 insertions(+), 22 deletions(-) > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c > index 13ad83be24c..30bde6f0df7 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fabd_1.c > @@ -1,5 +1,5 @@ > /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > -/* { dg-options "-O3 --save-temps" } */ > +/* { dg-options "-O3 --save-temps -fno-trapping-math" } */ > > #define N 16 > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c > index 4a3e7c06745..0245a8c1422 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fmul_1.c > @@ -27,20 +27,20 @@ DO_ARITH_OPS (_Float16, *, mul) > DO_ARITH_OPS (float, *, mul) > DO_ARITH_OPS (double, *, mul) > > -/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, z[0-9]+\.h, > z[0-9]+\.h\n} 4 } } */ > +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #0.5\n} 1 } } */ > -/* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #2} } } */ > +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #2.0\n} 1 } } */ > /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #5} } } */ > /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #-} } } */ > > -/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, z[0-9]+\.s, > z[0-9]+\.s\n} 4 } } */ > +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, z[0-9]+\.s\n} 4 } } */ > /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #0.5\n} 1 } } */ > -/* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #2} } } */ > +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #2.0\n} 1 } } */ > /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #5} } } */ > /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #-} } } */ > > -/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, z[0-9]+\.d, > z[0-9]+\.d\n} 4 } } */ > +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, z[0-9]+\.d\n} 4 } } */ > /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #0.5\n} 1 } } */ > -/* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #2} } } */ > +/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #2.0\n} 1 } } */ > /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #5} } } */ > /* { dg-final { scan-assembler-not {\tfmul\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #-} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c > index 5aed0dcb490..419d6e1b5ec 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fp_arith_1.c > @@ -34,37 +34,37 @@ DO_ARITH_OPS (double, -, minus) > > /* No specific count because it's valid to use fadd or fsub for the > out-of-range constants. */ > -/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.h, z[0-9]+\.h, > z[0-9]+\.h\n} } } */ > +/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, > z[0-9]+\.h\n} } } */ > /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #1.0\n} 2 } } */ > /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #0.5\n} 2 } } */ > /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #2} } } */ > /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #-} } } */ > > -/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.h, z[0-9]+\.h, > z[0-9]+\.h\n} } } */ > +/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, > z[0-9]+\.h\n} } } */ > /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #1.0\n} 2 } } */ > /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #0.5\n} 2 } } */ > /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #2} } } */ > /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.h, p[0-7]/m, > z[0-9]+\.h, #-} } } */ > > -/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.s, z[0-9]+\.s, > z[0-9]+\.s\n} } } */ > +/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, > z[0-9]+\.s\n} } } */ > /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #1.0\n} 2 } } */ > /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #0.5\n} 2 } } */ > /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #2} } } */ > /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #-} } } */ > > -/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.s, z[0-9]+\.s, > z[0-9]+\.s\n} } } */ > +/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, > z[0-9]+\.s\n} } } */ > /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #1.0\n} 2 } } */ > /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #0.5\n} 2 } } */ > /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #2} } } */ > /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.s, p[0-7]/m, > z[0-9]+\.s, #-} } } */ > > -/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.d, z[0-9]+\.d, > z[0-9]+\.d\n} } } */ > +/* { dg-final { scan-assembler {\tfadd\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, > z[0-9]+\.d\n} } } */ > /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #1.0\n} 2 } } */ > /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #0.5\n} 2 } } */ > /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #2} } } */ > /* { dg-final { scan-assembler-not {\tfadd\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #-} } } */ > > -/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.d, z[0-9]+\.d, > z[0-9]+\.d\n} } } */ > +/* { dg-final { scan-assembler {\tfsub\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, > z[0-9]+\.d\n} } } */ > /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #1.0\n} 2 } } */ > /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #0.5\n} 2 } } */ > /* { dg-final { scan-assembler-not {\tfsub\tz[0-9]+\.d, p[0-7]/m, > z[0-9]+\.d, #2} } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c > index f47a360dee9..012cf6e9e5d 100644 > --- a/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fsubr_1.c > @@ -1,5 +1,5 @@ > /* { dg-do assemble { target aarch64_asm_sve_ok } } */ > -/* { dg-options "-O3 --save-temps" } */ > +/* { dg-options "-O3 --save-temps -fno-trapping-math" } */ > > #define DO_IMMEDIATE_OPS(VALUE, TYPE, NAME) \ > void vsubrarithimm_##NAME##_##TYPE (TYPE *dst, int count) \ > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index eb4ca1f184e..56e3c30658e 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -6301,6 +6301,7 @@ vectorizable_operation (vec_info *vinfo, > int reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info); > vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : > NULL); > internal_fn cond_fn = get_conditional_internal_fn (code); > + bool could_trap = gimple_could_trap_p (stmt); > > if (!vec_stmt) /* transformation not required. */ > { > @@ -6309,7 +6310,7 @@ vectorizable_operation (vec_info *vinfo, > keeping the inactive lanes as-is. */ > if (loop_vinfo > && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) > - && reduc_idx >= 0) > + && (could_trap || reduc_idx >= 0)) > { > if (cond_fn == IFN_LAST > || !direct_internal_fn_supported_p (cond_fn, vectype, > @@ -6452,16 +6453,31 @@ vectorizable_operation (vec_info *vinfo, > vop1 = ((op_type == binary_op || op_type == ternary_op) > ? vec_oprnds1[i] : NULL_TREE); > vop2 = ((op_type == ternary_op) ? vec_oprnds2[i] : NULL_TREE); > - if (masked_loop_p && reduc_idx >= 0) > + if (masked_loop_p && (reduc_idx >= 0 || could_trap)) > { > - /* Perform the operation on active elements only and take > - inactive elements from the reduction chain input. */ > - gcc_assert (!vop2); > - vop2 = reduc_idx == 1 ? vop1 : vop0; > tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies, > vectype, i); > - gcall *call = gimple_build_call_internal (cond_fn, 4, mask, > - vop0, vop1, vop2); > + auto_vec<tree> vops (5); > + vops.quick_push (mask); > + vops.quick_push (vop0); > + if (vop1) > + vops.quick_push (vop1); > + if (vop2) > + vops.quick_push (vop2); > + if (reduc_idx >= 0) > + { > + /* Perform the operation on active elements only and take > + inactive elements from the reduction chain input. */ > + gcc_assert (!vop2); > + vops.quick_push (reduc_idx == 1 ? vop1 : vop0); > + } > + else > + { > + auto else_value = targetm.preferred_else_value > + (cond_fn, vectype, vops.length () - 1, &vops[1]); > + vops.quick_push (else_value); > + } > + gcall *call = gimple_build_call_internal_vec (cond_fn, vops); > new_temp = make_ssa_name (vec_dest, call); > gimple_call_set_lhs (call, new_temp); > gimple_call_set_nothrow (call, true); > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; HRB 36809 (AG Nuernberg)