This fixes a miscompilation issue introduced by the enablement of combined loop peeling and versioning. A test case that reproduces the issue is included in the patch.
When performing loop peeling, GCC usually inserts a skip-vector check. This ensures that after peeling, there are enough remaining iterations to enter the main vectorized loop. Previously, the check was omitted if loop versioning for alignment was applied. Omitting it used to be safe because versioning and peeling for alignment were mutually exclusive. However, with combined peeling and versioning enabled, this is no longer safe: a loop may be peeled and versioned at the same time. Without the skip-vector check, the main vectorized loop can be entered even if its iteration count is zero. This can cause the loop to run many more iterations than needed, producing incorrect results. To fix this, the patch changes the condition for omitting the skip-vector check so that the check is omitted only when versioning is performed alone, without peeling. This patch is bootstrapped and regression-tested on x86_64-linux-gnu, arm-linux-gnueabihf and aarch64-linux-gnu. PR tree-optimization/121020 gcc/ChangeLog: * tree-vect-loop-manip.cc (vect_do_peeling): Update the condition for omitting the skip-vector check. * tree-vectorizer.h (LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING): Add a helper macro. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-early-break_138-pr121020.c: New test. 
--- .../vect/vect-early-break_138-pr121020.c | 52 +++++++++++++++++++ gcc/tree-vect-loop-manip.cc | 2 +- gcc/tree-vectorizer.h | 4 ++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c new file mode 100644 index 00000000000..86661e445a8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_138-pr121020.c @@ -0,0 +1,52 @@ +/* PR tree-optimization/121020 */ +/* { dg-do run } */ +/* { dg-options "-O3 --vect-cost-model=unlimited" } */ +/* { dg-additional-options "-march=znver2" { target x86_64-*-* i?86-*-* } } */ +/* { dg-require-effective-target mmap } */ +/* { dg-require-effective-target vect_early_break } */ + +#include <stdint.h> +#include <stdio.h> +#include <sys/mman.h> +#include <unistd.h> + +__attribute__((noipa)) +bool equal (uint64_t *restrict p, uint64_t *restrict q, int length) +{ + for (int i = 0; i < length; i++) { + if (*(p + i) != *(q + i)) + return false; + } + return true; +} + +int main () +{ + long pgsz = sysconf (_SC_PAGESIZE); + if (pgsz == -1) { + fprintf (stderr, "sysconf failed\n"); + return 0; + } + + /* Allocate a whole page of memory. */ + void *mem = mmap (NULL, pgsz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + fprintf (stderr, "mmap failed\n"); + return 0; + } + uint64_t *p1 = (uint64_t *) mem; + uint64_t *p2 = (uint64_t *) mem + 32; + + /* The first 16 elements pointed to by p1 and p2 are the same. */ + for (int i = 0; i < 32; i++) { + *(p1 + i) = 0; + *(p2 + i) = (i < 16 ? 0 : -1); + } + + /* All calls to equal should return true. 
*/ + for (int len = 0; len < 16; len++) { + if (!equal (p1 + 1, p2 + 1, len)) + __builtin_abort(); + } +} diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 2d01a4b0ed1..7fcbc1ad2eb 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -3295,7 +3295,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), bound_prolog + bound_epilog) - : (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) + : (!LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING (loop_vinfo) || vect_epilogues)); /* Epilog loop must be executed if the number of iterations for epilog diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 799d5fed7a9..bbfeecacf5b 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1168,6 +1168,10 @@ public: || LOOP_REQUIRES_VERSIONING_FOR_NITERS (L) \ || LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (L)) +#define LOOP_VINFO_USE_VERSIONING_WITHOUT_PEELING(L) \ + ((L)->may_misalign_stmts.length () > 0 \ + && !LOOP_VINFO_ALLOW_MUTUAL_ALIGNMENT (L)) + #define LOOP_VINFO_NITERS_KNOWN_P(L) \ (tree_fits_shwi_p ((L)->num_iters) && tree_to_shwi ((L)->num_iters) > 0) -- 2.43.0