On Thu, 30 Oct 2025, Tamar Christina wrote:
> This fixes the two reported ICEs by accounting for that neutral_op can be NULL
> and that we should require IFN_VEC_SHL_INSERT for non reduction defs.
>
> The conditions were taken from what is required to delay the reduction with
> the
> initial values later on in vect_transform_cycle_phi.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 and no issues
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/122474
> PR tree-optimization/122475
> * tree-vect-loop.cc (vectorizable_reduction): Tighten IFN_VEC_SHL_INSERT
> requirement.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/122474
> PR tree-optimization/122475
> * gcc.dg/vect/pr122474.c: New test.
> * gcc.dg/vect/pr122475.c: New test.
> * gcc.target/aarch64/sve/vect-reduc-bool-19.c: New test.
> * gcc.target/aarch64/sve/vect-reduc-bool-20.c: New test.
>
> ---
> diff --git a/gcc/testsuite/gcc.dg/vect/pr122474.c
> b/gcc/testsuite/gcc.dg/vect/pr122474.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..c5d70b518a0078a6ec887a37d1c898365ec6f01d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr122474.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-fwrapv -Waggressive-loop-optimizations -w" } */
> +/* { dg-additional-options "-march=armv8-a+sve" { target aarch64*-*-* } } */
> +/* Check that we don't ICE. */
> +
> +_Bool a;
> +void b(unsigned c[][8][8][8][8][8]) {
> + for (int d = 1; d; d += 3)
> + for (int e = 0; e < 11; e += 2)
> + for (short f = 0; f < 1; f = 30482)
> + for (short g = 0; g < 014; g++)
> + a = ({
> + int h = a;
> + int i = c[2][f][d][d][d][d] ^ c[g][g][1][2][g][g];
> + i ? h : i;
> + });
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" {
> target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/pr122475.c
> b/gcc/testsuite/gcc.dg/vect/pr122475.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..ed229c5475cc96fc0ef3beea71b101db89e08916
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr122475.c
> @@ -0,0 +1,13 @@
> +/* { dg-additional-options "-march=armv8-a+sve" { target aarch64*-*-* } } */
> +/* Check that we don't ICE. */
> +int a;
> +int b;
> +int main() {
> + for (char t = 0; t < 14; t += 2)
> + for (int u = 0; u < 242; u += 4) {
> + a = a < 0 ? a : 0;
> + b = b < 0 ? b : 0;
> + }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" {
> target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-19.c
> b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-19.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..6492c44c065159d06d7e69499dfc9834721d62ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-19.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mautovec-preference=sve-only
> -fdump-tree-vect-details -O3 --param vect-epilogues-nomask=0" } */
> +
> +int p[128];
> +
> +bool __attribute__((noipa))
> +fand (int n, bool r1, bool r2)
> +{
> + bool r = true;
> + for (int i = 0; i < (n/2); i+=2)
> + {
> + r &= (p[i] != 0) & r1;
> + r &= (p[i+1] != 0) & r2;
> + }
> + return r;
> +}
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" }
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-20.c
> b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-20.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..83c5c206dd2597c3776317005a3dd38ee5991f16
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-20.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mautovec-preference=sve-only
> -fdump-tree-vect-details -O3 --param vect-epilogues-nomask=0" } */
> +
> +#include <stdbool.h>
> +#include <stdint.h>
> +
> +void vec_slp_cmp (char* restrict a, char* restrict b, int n) {
> + bool x0 = b[0] != 0;
> + bool x1 = b[1] != 0;
> + bool x2 = b[2] != 0;
> + bool x3 = b[3] != 0;
> + for (int i = 0; i < n; ++i) {
> + x0 &= (a[i * 4] != 0);
> + x1 &= (a[i * 4 + 1] != 0);
> + x2 &= (a[i * 4 + 2] != 0);
> + x3 &= (a[i * 4 + 3] != 0);
> + }
> + b[0] = x0;
> + b[1] = x1;
> + b[2] = x2;
> + b[3] = x3;
> +}
> +
> +void vec_slp_cmp1 (char* restrict a, char* restrict b, int n) {
> + bool x0 = b[0] != 0;
> + for (int i = 0; i < n; ++i) {
> + x0 &= (a[i] != 0);
> + }
> + b[0] = x0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" {
> target aarch64*-*-* } } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index
> 50cdc2a90fa29c1e0d116c0589bc246e6d8fcc84..bfcea0d52337eead7ee80572963f986bee00591d
> 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7577,8 +7577,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> values into the low-numbered elements. */
> if ((double_reduc || neutral_op)
> && !nunits_out.is_constant ()
> - && (SLP_TREE_LANES (slp_node) != 1 && !reduc_chain)
> - && !operand_equal_p (neutral_op, vect_phi_initial_value
> (reduc_def_phi))
> + && (!neutral_op
> + || (!reduc_chain
> + && (SLP_TREE_LANES (slp_node) != 1
> + || (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)
So this will be always true given we set
VECT_REDUC_INFO_REUSED_ACCUMULATOR only during transform. Meaning
this will be a source of mismatch which is why I earlier suggested
that we'd need to ensure we can later drop the requirement, like
forcing of no accumulator re-use. Thus, after the checks we can
perform here, if ...
> + && STMT_VINFO_DEF_TYPE (stmt_info)
> + != vect_reduction_def))
> + && !operand_equal_p (neutral_op,
> + vect_phi_initial_value (reduc_def_phi))))
> && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
> vectype_out, OPTIMIZE_FOR_SPEED))
... this fails, allow to continue but set some
VECT_REDUC_INFO_MAY_NOT_REUSE_ACCUMULATOR or so?
I'd suggest to push the !neutral_op change and rework the rest in
a way suggested?
That said, the accumulator re-use might be not an actual blocker
to delay the initial value adjustment, but just an implementation
detail missing. Does the other bug run into exactly that?
Richard.
> {
>
>
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)