https://gcc.gnu.org/g:b9cf6721ef0f13cd27d399024931634f571e9186

commit r17-894-gb9cf6721ef0f13cd27d399024931634f571e9186
Author: Zhongyao Chen <[email protected]>
Date:   Wed May 20 17:30:22 2026 +0800

    RISC-V: Add RISC-V RVV main-loop overhead comparison in cost model
    
    Add an RVV-specific loop-overhead comparison in the RISC-V cost model and
    use it after inside-loop cost comparison.
    
    The RISC-V implementation prefers the RVV mode that eliminates the main
    loop, and otherwise compares the main-loop overhead of the two modes.
    
    Local testing shows no regressions.  This is likely because few testcases
    have equal inside-loop costs, especially before VLS LMUL cost scaling
    support is added.
    
    I also ran regression tests with temporary VLS lmul cost scaling support.
    Only 3 regressions were found:
      - dyn-lmul-conv-1.c & dyn-lmul-conv-2.c: The cost model now prefers
        smaller LMULs due to VLS LMUL scaling.  This is reasonable; the
        expected results just need to be updated.
      - pr123414.c: This test relies on large LMULs to trigger a specific
        bug, so the change is reasonable too; it can be fixed by adding
        -fno-vect-cost-model.
    
    The VLS LMUL cost scaling patch will be updated after this is pushed.
    
    gcc/ChangeLog:
            * config/riscv/riscv-vector-costs.cc
            (estimated_loop_iters): New function.
            (compare_loop_overhead): New function.
            (costs::better_main_loop_than_p): Compare RVV loop overhead after
            inside-loop cost.
    
    Signed-off-by: Zhongyao Chen <[email protected]>

Diff:
---
 gcc/config/riscv/riscv-vector-costs.cc | 91 +++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 6d37519dbfee..833a525abd65 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1095,6 +1095,74 @@ costs::prefer_unrolled_loop () const
              <= (unsigned int) param_max_completely_peeled_insns));
 }
 
+/* Return the estimated number of vector iterations for LOOP_VINFO, or
+   HOST_WIDE_INT_M1U if the scalar iteration count is not known.  */
+static unsigned HOST_WIDE_INT
+estimated_loop_iters (loop_vec_info loop_vinfo)
+{
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    return HOST_WIDE_INT_M1U;
+
+  unsigned HOST_WIDE_INT scalar_niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
+  unsigned int vf = vect_vf_for_cost (loop_vinfo);
+  return (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+         ? CEIL (scalar_niters, vf)  /* Round up.  */
+         : scalar_niters / vf);      /* Round down.  */
+}
+
+/* Compare the estimated loop overheads of two loops whose niters are known.
+   With LMUL cost scaling, inside-loop costs can tie across LMULs, so count
+   taken loop-back branches: a loop needing one vector iteration wins, else
+   the lower branch overhead.  Return -1 if THIS wins, 1 if OTHER, else 0.  */
+static int
+compare_loop_overhead (loop_vec_info this_loop_vinfo,
+                      loop_vec_info other_loop_vinfo)
+{
+  gcc_assert (LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo));
+  gcc_assert (LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo));
+
+  unsigned HOST_WIDE_INT this_niters = estimated_loop_iters (this_loop_vinfo);
+  unsigned HOST_WIDE_INT other_niters = estimated_loop_iters (other_loop_vinfo);
+  bool this_eliminate_loop_p = this_niters == 1;
+  bool other_eliminate_loop_p = other_niters == 1;
+
+  if (this_eliminate_loop_p != other_eliminate_loop_p)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Preferring %s loop because it is estimated to"
+                        " eliminate the main loop entirely\n",
+                        GET_MODE_NAME ((this_eliminate_loop_p
+                                        ? this_loop_vinfo
+                                        : other_loop_vinfo)->vector_mode));
+      return this_eliminate_loop_p ? -1 : 1;
+    }
+
+  unsigned int branch_cost
+    = builtin_vectorization_cost (cond_branch_taken, NULL_TREE, 0);
+  unsigned HOST_WIDE_INT this_overhead
+    = this_niters > 1 ? (this_niters - 1) * branch_cost : 0;
+  unsigned HOST_WIDE_INT other_overhead
+    = other_niters > 1 ? (other_niters - 1) * branch_cost : 0;
+
+  if (this_overhead != other_overhead)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Preferring %s loop because it has lower"
+                        " loop overhead ("
+                        HOST_WIDE_INT_PRINT_UNSIGNED " vs. "
+                        HOST_WIDE_INT_PRINT_UNSIGNED ")\n",
+                        GET_MODE_NAME ((this_overhead < other_overhead
+                                        ? this_loop_vinfo
+                                        : other_loop_vinfo)->vector_mode),
+                        this_overhead, other_overhead);
+      return this_overhead < other_overhead ? -1 : 1;
+    }
+
+  return 0;
+}
+
 bool
 costs::better_main_loop_than_p (const vector_costs *uncast_other) const
 {
@@ -1213,7 +1281,28 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const
           && m_cost_type == VLS_VECTOR_COST)
     return false;
 
-  return vector_costs::better_main_loop_than_p (other);
+  /* Fall back to generic costing if either iteration count is unknown.  For
+     known iteration counts, include loop overhead when comparing different
+     LMULs.  This handles such cases better than the vector_costs version,
+     especially while outside costs can still overestimate prologue costs
+     (PR target/125476).  */
+  if (!LOOP_VINFO_NITERS_KNOWN_P (this_loop_vinfo)
+      || !LOOP_VINFO_NITERS_KNOWN_P (other_loop_vinfo))
+    return vector_costs::better_main_loop_than_p (other);
+
+  int diff = compare_inside_loop_cost (other);
+  if (diff != 0)
+    return diff < 0;
+
+  diff = compare_loop_overhead (this_loop_vinfo, other_loop_vinfo);
+  if (diff != 0)
+    return diff < 0;
+
+  diff = compare_outside_loop_cost (other);
+  if (diff != 0)
+    return diff < 0;
+
+  return false;
 }
 
 /* Returns the group size i.e. the number of vectors to be loaded by a

Reply via email to