On Mon, 23 Jun 2025, Tamar Christina wrote: > This patch fixes a bug where the current code assumed that exact_log2 returns > NULL on failure, but it instead returns -1. So there are some cases where the > right shift could shift out the entire value. > > Secondly it also removes the requirement that VF be a power of two. With an > uneven unroll factor we can easily end up with a non-power of two VF which SLP > can handle. This replaces shifts with multiplication and division. > > The 32-bit x86 testcase from PR64110 was always wrong, it used to match by > pure > coincidence a vmovd inside the vector loop. What it intended to match was > that > the argument to the function isn't spilled and then reloaded from the stack > for > no reason. > > But on 32-bit x86 all arguments are passed on the stack anyway and so the > match > would have never worked. The patch seems to simplify the loop preheader which > gets it to remove an intermediate zero extend which causes the match to now > properly fail. > > As such I'm skipping the test on 32-bit x86. > > Bootstrapped Regtested on aarch64-none-linux-gnu, > arm-none-linux-gnueabihf, x86_64-pc-linux-gnu > -m32, -m64 and no issues. > > Ok for master?
OK. Thanks, Richard. > Thanks, > Tamar > > gcc/ChangeLog: > > * tree-vect-loop-manip.cc (vect_gen_vector_loop_niters, > vect_gen_vector_loop_niters_mult_vf): Remove uses of log_vf. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr64110.c: Update testcase. > > --- > diff --git a/gcc/testsuite/gcc.target/i386/pr64110.c > b/gcc/testsuite/gcc.target/i386/pr64110.c > index > 99e391916cb7a94e7dd40207f35f29f62acc412c..11a6929835f4e6490d3ff5022a4f33050559b022 > 100644 > --- a/gcc/testsuite/gcc.target/i386/pr64110.c > +++ b/gcc/testsuite/gcc.target/i386/pr64110.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-O3 -march=core-avx2" } */ > -/* { dg-final { scan-assembler "vmovd\[\\t \]" } } */ > +/* { dg-final { scan-assembler "vmovd\[\\t \]" { target { ! ilp32 } } } } */ > > int foo (void); > int a; > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc > index > 56a4e9a8b63f3cae0bf596bf5d22893887dc80e8..90034fe2d681e2740f54486cfd9e7ddc778e84e1 > 100644 > --- a/gcc/tree-vect-loop-manip.cc > +++ b/gcc/tree-vect-loop-manip.cc > @@ -2794,7 +2794,6 @@ vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, > tree niters, > tree niters_vector, step_vector, type = TREE_TYPE (niters); > poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); > edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); > - tree log_vf = NULL_TREE; > > /* If epilogue loop is required because of data accesses with gaps, we > subtract one iteration from the total number of iterations here for > @@ -2820,22 +2819,25 @@ vect_gen_vector_loop_niters (loop_vec_info > loop_vinfo, tree niters, > if (vf.is_constant (&const_vf) > && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) > { > - /* Create: niters >> log2(vf) */ > + /* Create: niters / vf, which is equivalent to niters >> log2(vf) when > + vf is a power of two, and when not we approximate using a > + truncating division. */ > /* If it's known that niters == number of latch executions + 1 doesn't > - overflow, we can generate niters >> log2(vf); otherwise we generate > - (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio > + overflow, we can generate niters / vf; otherwise we generate > + (niters - vf) / vf + 1 by using the fact that we know ratio > will be at least one. */ > - log_vf = build_int_cst (type, exact_log2 (const_vf)); > + tree var_vf = build_int_cst (type, const_vf); > if (niters_no_overflow) > - niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf); > + niters_vector = fold_build2 (TRUNC_DIV_EXPR, type, ni_minus_gap, > + var_vf); > else > niters_vector > = fold_build2 (PLUS_EXPR, type, > - fold_build2 (RSHIFT_EXPR, type, > + fold_build2 (TRUNC_DIV_EXPR, type, > fold_build2 (MINUS_EXPR, type, > ni_minus_gap, > - build_int_cst (type, vf)), > - log_vf), > + var_vf), > + var_vf), > build_int_cst (type, 1)); > step_vector = build_one_cst (type); > } > @@ -2854,16 +2856,17 @@ vect_gen_vector_loop_niters (loop_vec_info > loop_vinfo, tree niters, > /* Peeling algorithm guarantees that vector loop bound is at least ONE, > we set range information to make niters analyzer's life easier. > Note the number of latch iteration value can be TYPE_MAX_VALUE so > - we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf. */ > - if (stmts != NULL && log_vf) > + we have to represent the vector niter TYPE_MAX_VALUE + 1 / vf. */ > + if (stmts != NULL && const_vf > 0) > { > if (niters_no_overflow) > { > int_range<1> vr (type, > wi::one (TYPE_PRECISION (type)), > - wi::rshift (wi::max_value (TYPE_PRECISION (type), > - TYPE_SIGN (type)), > - exact_log2 (const_vf), > + wi::div_trunc (wi::max_value > + (TYPE_PRECISION (type), > + TYPE_SIGN (type)), > + const_vf, > TYPE_SIGN (type))); > set_range_info (niters_vector, vr); > } > @@ -2901,13 +2904,12 @@ vect_gen_vector_loop_niters_mult_vf (loop_vec_info > loop_vinfo, > /* We should be using a step_vector of VF if VF is variable. */ > int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant (); > tree type = TREE_TYPE (niters_vector); > - tree log_vf = build_int_cst (type, exact_log2 (vf)); > tree tree_vf = build_int_cst (type, vf); > basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest; > > gcc_assert (niters_vector_mult_vf_ptr != NULL); > - tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type, > - niters_vector, log_vf); > + tree niters_vector_mult_vf = fold_build2 (MULT_EXPR, type, > + niters_vector, tree_vf); > > /* If we've peeled a vector iteration then subtract one full vector > iteration. */ > > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)