When a loop requires any kind of versioning, which could increase register
pressure too much, and it is inside a deeply nested big loop, don't
vectorize it.
I tested the patch with both Ofast and O2 on SPEC2017; apart from
548.exchange_r, the binaries of all other benchmarks are unchanged.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?
gcc/ChangeLog:
PR target/117088
* config/i386/i386.cc
(ix86_vector_costs::ix86_vect_in_deep_nested_loop_p): New function.
(ix86_vector_costs::finish_cost): Prevent loop vectorization
if it's in a deeply nested loop and requires versioning.
* config/i386/i386.opt (--param=vect-max-loop-depth=): New
param.
---
gcc/config/i386/i386.cc | 89 ++++++++++++++++++++++++++++++++++++++++
gcc/config/i386/i386.opt | 4 ++
2 files changed, 93 insertions(+)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 526c9df7618..608f40413d2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25019,6 +25019,8 @@ private:
/* Estimate register pressure of the vectorized code. */
void ix86_vect_estimate_reg_pressure ();
+ /* Check if vect_loop is in a deeply-nested loop. */
+ bool ix86_vect_in_deep_nested_loop_p (class loop *vect_loop);
/* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
estimation of register pressure.
??? Currently it's only used by vec_construct/scalar_to_vec
@@ -25324,6 +25326,84 @@ ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
}
}
+/* Return true if VECT_LOOP is in a deeply-nested loop, i.e. if VECT_LOOP
+   itself, one of its sibling loops, or any loop nested inside a sibling is
+   deeper than --param=vect-max-loop-depth=.  With the default limit of 8
+   this holds for every vect_loop_N in the loop structure below, since
+   loop9 has depth 9:
+
+   loop1
+   {
+     loop2
+     {
+       loop3
+       {
+         vect_loop_1;
+         loop4
+         {
+           vect_loop_2;
+           loop5
+           {
+             vect_loop_3;
+             loop6
+             {
+               vect_loop_4;
+               loop7
+               {
+                 vect_loop_5;
+                 loop8
+                 {
+                   loop9
+                 }
+                 vect_loop_6;
+               }
+               vect_loop_7;
+             }
+           }
+         }
+       }
+     }
+   }
+
+   It's a big hammer to fix the O2 regression for 548.exchange_r after
+   vectorization was enhanced by r15-4225-g70c3db511ba14f.  */
+bool
+ix86_vector_costs::ix86_vect_in_deep_nested_loop_p (class loop *vect_loop)
+{
+  /* ix86_vect_max_loop_depth holds --param=vect-max-loop-depth=.  */
+  const unsigned max_depth = (unsigned) ix86_vect_max_loop_depth;
+
+  if (loop_depth (vect_loop) > max_depth)
+    return true;
+
+  /* Loops of depth < 2 are never treated as deeply nested, regardless of
+     what their siblings contain.  */
+  if (loop_depth (vect_loop) < 2)
+    return false;
+
+  /* Seed the worklist with VECT_LOOP and all its siblings.  No visited set
+     is needed: only the inner/next links of the loop tree are followed, so
+     each loop is reached through exactly one path.  */
+  auto_vec<class loop *> worklist;
+  for (class loop *sibling = loop_outer (vect_loop)->inner; sibling;
+       sibling = sibling->next)
+    worklist.safe_push (sibling);
+
+  /* DFS the subtrees rooted at the siblings, looking for any loop deeper
+     than the limit.  */
+  while (!worklist.is_empty ())
+    {
+      class loop *cur = worklist.pop ();
+      if (loop_depth (cur) > max_depth)
+	return true;
+
+      for (class loop *child = cur->inner; child; child = child->next)
+	worklist.safe_push (child);
+    }
+
+  return false;
+}
+
void
ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
{
@@ -25344,6 +25424,15 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
&& (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
> ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
m_costs[vect_body] = INT_MAX;
+
+ /* Prohibit vectorization when the loop requires versioning
+ and loop_depth exceeds threshold. */
+ if ((LOOP_REQUIRES_VERSIONING (loop_vinfo)
+ || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+ || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+ || vect_apply_runtime_profitability_check_p (loop_vinfo))
+ && ix86_vect_in_deep_nested_loop_p (LOOP_VINFO_LOOP (loop_vinfo)))
+ m_costs[vect_body] = INT_MAX;
}
ix86_vect_estimate_reg_pressure ();
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 99e86f545e8..c5abf83473d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1414,3 +1414,7 @@ Support MOVRS built-in functions and code generation.
mamx-movrs
Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save
Support AMX-MOVRS built-in functions and code generation.
+
+-param=vect-max-loop-depth=
+Target Joined UInteger Var(ix86_vect_max_loop_depth) Init(8) Param
+Prevent loop vectorization when it's in a deeply nested loop and requires versioning, since it may increase register pressure too much.
--
2.34.1