On Wed, Nov 27, 2024 at 4:26 AM liuhongt <[email protected]> wrote:
>
> When loop requires any kind of versioning which could increase register
> pressure too much, and it's in a deeply nest big loop, don't do
> vectorization.
>
> I tested the patch with both Ofast and O2 for SPEC2017, besides
> 548.exchange_r,
> other benchmarks are same binary.
>
> Bootstrapped and regtested 0on x86_64-pc-linux-gnu{-m32,}
> Any comments?
The vectorizer tries to version an outer loop when vectorizing a loop nest
and the versioning condition is invariant. See vect_loop_versioning. This
tries to handle such cases. Often the generated runtime alias checks are
not invariant because we do not consider the outer evolutions. I think we
should instead fix this there.
Question below ...
> gcc/ChangeLog:
>
> pr target/117088
> * config/i386/i386.cc
> (ix86_vector_costs::ix86_vect_in_deep_nested_loop_p): New function.
> (ix86_vector_costs::finish_cost): Prevent loop vectorization
> if it's in a deeply nested loop and require versioning.
> * config/i386/i386.opt (--param=vect-max-loop-depth=): New
> param.
> ---
> gcc/config/i386/i386.cc | 89 ++++++++++++++++++++++++++++++++++++++++
> gcc/config/i386/i386.opt | 4 ++
> 2 files changed, 93 insertions(+)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 526c9df7618..608f40413d2 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -25019,6 +25019,8 @@ private:
>
> /* Estimate register pressure of the vectorized code. */
> void ix86_vect_estimate_reg_pressure ();
> + /* Check if vect_loop is in a deeply-nested loop. */
> + bool ix86_vect_in_deep_nested_loop_p (class loop *vect_loop);
> /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
> estimation of register pressure.
> ??? Currently it's only used by vec_construct/scalar_to_vec
> @@ -25324,6 +25326,84 @@ ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
> }
> }
>
> +/* Return true if vect_loop is in a deeply-nested loop.
> + .i.e vect_loop_n in below loop structure.
> +loop1
> +{
> + loop2
> + {
> + loop3
> + {
> + vect_loop_1;
> + loop4
> + {
> + vect_loop_2;
> + loop5
> + {
> + vect_loop_3;
> + loop6
> + {
> + vect_loop_4;
> + loop7
> + {
> + vect_loop_5;
> + loop8
> + {
> + loop9
> + }
> + vect_loop_6;
> + }
> + vect_loop_7;
> + }
> + }
> + }
> + }
> + It's a big hammer to fix O2 regression for 548.exchange_r after
> vectorization
> + is enhanced by (r15-4225-g70c3db511ba14f) */
> +bool
> +ix86_vector_costs::ix86_vect_in_deep_nested_loop_p (class loop *vect_loop)
> +{
> + if (loop_depth (vect_loop) > (unsigned) ix86_vect_max_loop_depth)
> + return true;
> +
> + if (loop_depth (vect_loop) < 2)
> + return false;
> +
while the above two are "obvious", what you check below isn't clear to me.
Is this trying to compute whether 'vect_loop' is inside of a loop nest which
at any sibling of vect_loop (or even sibling of an outer loop of vect_loop,
recursively) is a sub-nest with a loop depth (relative to what?) exceeds
ix86_vect_max_loop_depth?
> + class loop* outer_loop = loop_outer (vect_loop);
> +
> + auto_vec<class loop*> m_loop_stack;
> + auto_sbitmap m_visited_loops (number_of_loops (cfun));
> +
> + /* Get all sibling loops for vect_loop. */
> + class loop* next_loop = outer_loop->inner;
> + for (; next_loop; next_loop = next_loop->next)
> + {
> + m_loop_stack.safe_push (next_loop);
> + bitmap_set_bit (m_visited_loops, next_loop->num);
> + }
> +
> + /* DFS the max depth of all sibling loop. */
> + while (!m_loop_stack.is_empty ())
> + {
> + next_loop = m_loop_stack.pop ();
> + if (loop_depth (next_loop) > (unsigned) ix86_vect_max_loop_depth)
> + return true;
> +
> + class loop* inner_loop = next_loop->inner;
> + while (inner_loop)
> + {
> + if (!bitmap_bit_p (m_visited_loops, inner_loop->num))
> + {
> + m_loop_stack.safe_push (inner_loop);
> + bitmap_set_bit (m_visited_loops, inner_loop->num);
> + }
> + inner_loop = inner_loop->next;
> + }
> + }
> +
> + return false;
> +}
> +
> void
> ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
> {
> @@ -25344,6 +25424,15 @@ ix86_vector_costs::finish_cost (const vector_costs
> *scalar_costs)
> && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
> > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
> m_costs[vect_body] = INT_MAX;
> +
> + /* Prohibit vectorization when the loop requires versioning
> + and loop_depth exceeds threshold. */
> + if ((LOOP_REQUIRES_VERSIONING (loop_vinfo)
> + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
> + || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
> + || vect_apply_runtime_profitability_check_p (loop_vinfo))
this also applies to loops not requiring versioning - it practically applies to
all loops that do not run for a constant VF-times times.
> + && ix86_vect_in_deep_nested_loop_p (LOOP_VINFO_LOOP (loop_vinfo)))
> + m_costs[vect_body] = INT_MAX;
> }
>
> ix86_vect_estimate_reg_pressure ();
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index 99e86f545e8..c5abf83473d 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1414,3 +1414,7 @@ Support MOVRS built-in functions and code generation.
> mamx-movrs
> Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save
> Support AMX-MOVRS built-in functions and code generation.
> +
> +-param=vect-max-loop-depth=
> +Target Joined UInteger Var(ix86_vect_max_loop_depth) Init(8) Param
> +Preversion loop vectorization when it's in a deeply nested loop and requires
> versioning, since it may increase register pressure too much.
> --
> 2.34.1
>