The current mechanism for determining mTHP collapse scales the
khugepaged_max_ptes_none value based on the target order. This
introduces an undesirable feedback loop, or "creep", when max_ptes_none
is set to a value greater than HPAGE_PMD_NR / 2.

With this configuration, a successful collapse to order N will populate
enough pages to satisfy the collapse condition on order N+1 on the next
scan. This leads to unnecessary work and memory churn.

To fix this issue, introduce a helper function that caps max_ptes_none at
HPAGE_PMD_NR / 2 - 1 (255 with a 4K page size) for mTHP collapses. The helper
then scales the capped value down by (HPAGE_PMD_ORDER - target collapse order).

The limits can be ignored by passing full_scan=true. This is useful for
madvise_collapse (which ignores limits) and, in the case of
collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
collapse is available.

Signed-off-by: Nico Pache <[email protected]>
---
 mm/khugepaged.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4ccebf5dda97..286c3a7afdee 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -459,6 +459,39 @@ void __khugepaged_enter(struct mm_struct *mm)
                wake_up_interruptible(&khugepaged_wait);
 }
 
+/**
+ * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
+ * @order: The folio order being collapsed to
+ * @full_scan: If true, ignore the max_ptes_none limits entirely
+ *
+ * When @full_scan is true (e.g. a madvise-triggered collapse), all limits
+ * are bypassed and up to HPAGE_PMD_NR - 1 empty PTEs are allowed.
+ *
+ * For order == HPAGE_PMD_ORDER, khugepaged_max_ptes_none is used unchanged.
+ *
+ * For any other order (mTHP collapses), khugepaged_max_ptes_none is first
+ * capped at HPAGE_PMD_NR/2 - 1, to prevent a collapse feedback loop, and is
+ * then scaled down by right-shifting it by (HPAGE_PMD_ORDER - order).
+ *
+ * Return: Maximum number of empty PTEs allowed for the collapse operation
+ */
+static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan)
+{
+       unsigned int max_ptes_none;
+
+       /* ignore max_ptes_none limits */
+       if (full_scan)
+               return HPAGE_PMD_NR - 1;
+
+       if (order == HPAGE_PMD_ORDER)
+               return khugepaged_max_ptes_none;
+
+       max_ptes_none = min(khugepaged_max_ptes_none, HPAGE_PMD_NR/2 - 1);
+
+       return max_ptes_none >> (HPAGE_PMD_ORDER - order);
+
+}
+
 void khugepaged_enter_vma(struct vm_area_struct *vma,
                          vm_flags_t vm_flags)
 {
@@ -546,7 +579,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
        pte_t *_pte;
        int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
        const unsigned long nr_pages = 1UL << order;
-       int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
+       int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged);
 
        for (_pte = pte; _pte < pte + nr_pages;
             _pte++, addr += PAGE_SIZE) {
-- 
2.51.0


Reply via email to