An existing selftest can quickly demonstrate the effectiveness of this
patch. On a generic workstation equipped with 128 CPUs and 256GB DRAM:

  $ sudo max_guest_memory_test -c 64 -m 250 -s 250

  MGLRU      run2
  ---------------
  Before    ~600s
  After      ~50s
  Off       ~250s

  kswapd (MGLRU before)
    100.00%  balance_pgdat
      100.00%  shrink_node
        100.00%  shrink_one
          99.97%  try_to_shrink_lruvec
            99.06%  evict_folios
              97.41%  shrink_folio_list
                31.33%  folio_referenced
                  31.06%  rmap_walk_file
                    30.89%  folio_referenced_one
                      20.83%  __mmu_notifier_clear_flush_young
                        20.54%  kvm_mmu_notifier_clear_flush_young
  =>                      19.34%  _raw_write_lock

  kswapd (MGLRU after)
    100.00%  balance_pgdat
      100.00%  shrink_node
        100.00%  shrink_one
          99.97%  try_to_shrink_lruvec
            99.51%  evict_folios
              71.70%  shrink_folio_list
                7.08%  folio_referenced
                  6.78%  rmap_walk_file
                    6.72%  folio_referenced_one
                      5.60%  lru_gen_look_around
  =>                    1.53%  __mmu_notifier_test_clear_young

  kswapd (MGLRU off)
    100.00%  balance_pgdat
      100.00%  shrink_node
        99.92%  shrink_lruvec
          69.95%  shrink_folio_list
            19.35%  folio_referenced
              18.37%  rmap_walk_file
                17.88%  folio_referenced_one
                  13.20%  __mmu_notifier_clear_flush_young
                    11.64%  kvm_mmu_notifier_clear_flush_young
  =>                  9.93%  _raw_write_lock
          26.23%  shrink_active_list
            25.50%  folio_referenced
              25.35%  rmap_walk_file
                25.28%  folio_referenced_one
                  23.87%  __mmu_notifier_clear_flush_young
                    23.69%  kvm_mmu_notifier_clear_flush_young
  =>                  18.98%  _raw_write_lock

Signed-off-by: Yu Zhao <yuz...@google.com>
---
 include/linux/mmzone.h |   6 +-
 mm/rmap.c              |   8 +--
 mm/vmscan.c            | 127 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 121 insertions(+), 20 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9fb1b03b83b2..ce34b7ea8e4c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -379,6 +379,7 @@ enum {
        LRU_GEN_CORE,
        LRU_GEN_MM_WALK,
        LRU_GEN_NONLEAF_YOUNG,
+       LRU_GEN_SPTE_WALK,
        NR_LRU_GEN_CAPS
 };
 
@@ -485,7 +486,7 @@ struct lru_gen_mm_walk {
 };
 
 void lru_gen_init_lruvec(struct lruvec *lruvec);
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 #ifdef CONFIG_MEMCG
 
@@ -573,8 +574,9 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 }
 
-static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
+       return false;   /* empty stub variant: always reports false; presumably the build without MGLRU — confirm against the enclosing #ifdef */
 }
 
 #ifdef CONFIG_MEMCG
diff --git a/mm/rmap.c b/mm/rmap.c
index 15ae24585fc4..eb0089f8f112 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -823,12 +823,10 @@ static bool folio_referenced_one(struct folio *folio,
                        return false; /* To break the loop */
                }
 
-               if (pvmw.pte) {
-                       if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
-                               lru_gen_look_around(&pvmw);
+               if (lru_gen_enabled() && pvmw.pte) {
+                       if (lru_gen_look_around(&pvmw))
                                referenced++;
-                       }
-
+               } else if (pvmw.pte) {
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte))
                                referenced++;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c1c5e8b24b8..d6d69f0baabf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -57,6 +57,8 @@
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/mmu_notifier.h>
+#include <linux/kvm_host.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3923,6 +3925,55 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
        return folio;
 }
 
+static bool test_spte_young(struct mm_struct *mm, unsigned long addr, unsigned 
long end,
+                           unsigned long *bitmap, unsigned long *last)
+{ /* Report whether @bitmap marks the page at @addr as young, querying a fresh batch via the MMU notifier when @addr is not below *last. */
+       if (!kvm_arch_has_test_clear_young() || !get_cap(LRU_GEN_SPTE_WALK))
+               return false; /* arch support missing or the LRU_GEN_SPTE_WALK capability is off */
+
+       if (*last > addr)
+               goto done; /* @addr is still inside the batch already queried; reuse @bitmap */
+
+       *last = end - addr > MIN_LRU_BATCH * PAGE_SIZE ?
+               addr + MIN_LRU_BATCH * PAGE_SIZE - 1 : end - 1; /* last byte of the new batch: at most MIN_LRU_BATCH pages from @addr, capped at @end */
+       bitmap_zero(bitmap, MIN_LRU_BATCH);
+
+       mmu_notifier_test_clear_young(mm, addr, *last + 1, false, bitmap); /* NOTE(review): presumably sets bits in @bitmap for young pages in [addr, *last + 1); the meaning of the false argument is not visible in this hunk — confirm */
+done:
+       return test_bit((*last - addr) / PAGE_SIZE, bitmap); /* index counts pages back from *last, so (for page-aligned @addr) bit 0 is the batch's final page */
+}
+
+static void clear_spte_young(struct mm_struct *mm, unsigned long addr,
+                            unsigned long *bitmap, unsigned long *last)
+{ /* When @addr is the final page of the current batch, re-issue the notifier call over the address span bounded by the set bits of @bitmap. */
+       int i;
+       unsigned long start, end = *last + 1; /* exclusive end of the current batch */
+
+       if (addr + PAGE_SIZE != end)
+               return; /* act only once, on the batch's final page */
+
+       i = find_last_bit(bitmap, MIN_LRU_BATCH);
+       if (i == MIN_LRU_BATCH)
+               return; /* find_last_bit() yielded the size, i.e. no bit is set: nothing to do */
+
+       start = end - (i + 1) * PAGE_SIZE; /* bits index pages backwards from the batch end, so the highest set bit gives the lowest address */
+
+       i = find_first_bit(bitmap, MIN_LRU_BATCH);
+
+       end -= i * PAGE_SIZE; /* drop the final i pages, whose bits (below the first set bit) are clear */
+
+       mmu_notifier_test_clear_young(mm, start, end, false, bitmap); /* NOTE(review): presumably clears the young state for pages whose bits are still set; not verifiable from this hunk — confirm against the notifier's definition */
+}
+
+static void skip_spte_young(struct mm_struct *mm, unsigned long addr,
+                           unsigned long *bitmap, unsigned long *last)
+{ /* Remove @addr's bit from @bitmap, then let clear_spte_young() run, which acts only if @addr is the batch's final page. */
+       if (*last > addr)
+               __clear_bit((*last - addr) / PAGE_SIZE, bitmap); /* only when @addr lies below *last, i.e. inside the current batch; same back-from-*last index as test_spte_young() */
+
+       clear_spte_young(mm, addr, bitmap, last);
+}
+
 static bool suitable_to_scan(int total, int young)
 {
        int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
@@ -3938,6 +3989,8 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
        pte_t *pte;
        spinlock_t *ptl;
        unsigned long addr;
+       unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+       unsigned long last = 0;
        int total = 0;
        int young = 0;
        struct lru_gen_mm_walk *walk = args->private;
@@ -3956,6 +4009,7 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
        pte = pte_offset_map(pmd, start & PMD_MASK);
 restart:
        for (i = pte_index(start), addr = start; addr != end; i++, addr += 
PAGE_SIZE) {
+               bool success;
                unsigned long pfn;
                struct folio *folio;
 
@@ -3963,20 +4017,27 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
                walk->mm_stats[MM_LEAF_TOTAL]++;
 
                pfn = get_pte_pfn(pte[i], args->vma, addr);
-               if (pfn == -1)
+               if (pfn == -1) {
+                       skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
                        continue;
+               }
 
-               if (!pte_young(pte[i])) {
+               success = test_spte_young(args->vma->vm_mm, addr, end, bitmap, 
&last);
+               if (!success && !pte_young(pte[i])) {
+                       skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
                        walk->mm_stats[MM_LEAF_OLD]++;
                        continue;
                }
 
                folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
-               if (!folio)
+               if (!folio) {
+                       skip_spte_young(args->vma->vm_mm, addr, bitmap, &last);
                        continue;
+               }
 
-               if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
-                       VM_WARN_ON_ONCE(true);
+               clear_spte_young(args->vma->vm_mm, addr, bitmap, &last);
+               if (pte_young(pte[i]))
+                       ptep_test_and_clear_young(args->vma, addr, pte + i);
 
                young++;
                walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -4581,6 +4642,24 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  *                          rmap/PT walk feedback
  
******************************************************************************/
 
+static bool should_look_around(struct vm_area_struct *vma, unsigned long addr,
+                              pte_t *pte, int *young)
+{ /* Tell lru_gen_look_around() whether to go on scanning around @addr; *young is set from the notifier's return value, or to true when bit 0 of the local bitmap was cleared or the PTE is young. */
+       unsigned long old = true; /* one-word bitmap with bit 0 preset, handed to the notifier below */
+
+       *young = mmu_notifier_test_clear_young(vma->vm_mm, addr, addr + 
PAGE_SIZE, true, &old);
+       if (!old)
+               *young = true; /* the call cleared bit 0; this code treats that as "young" — NOTE(review): confirm against the notifier's bitmap contract */
+
+       if (pte_young(*pte)) {
+               ptep_test_and_clear_young(vma, addr, pte);
+               *young = true;
+               return true; /* a young primary PTE always allows the look-around */
+       }
+
+       return !old && get_cap(LRU_GEN_SPTE_WALK); /* otherwise only when bit 0 was cleared and the LRU_GEN_SPTE_WALK capability is enabled */
+}
+
 /*
  * This function exploits spatial locality when shrink_folio_list() walks the
  * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
@@ -4588,12 +4667,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
        int i;
        unsigned long start;
        unsigned long end;
        struct lru_gen_mm_walk *walk;
+       unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
+       unsigned long last = 0;
        int young = 0;
        pte_t *pte = pvmw->pte;
        unsigned long addr = pvmw->address;
@@ -4607,8 +4688,11 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
        lockdep_assert_held(pvmw->ptl);
        VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
+       if (!should_look_around(pvmw->vma, addr, pte, &young))
+               return young;
+
        if (spin_is_contended(pvmw->ptl))
-               return;
+               return young;
 
        /* avoid taking the LRU lock under the PTL when possible */
        walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4616,6 +4700,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
        start = max(addr & PMD_MASK, pvmw->vma->vm_start);
        end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
 
+       if (end - start == PAGE_SIZE)
+               return young;
+
        if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
                if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
                        end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4629,28 +4716,37 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
        /* folio_update_gen() requires stable folio_memcg() */
        if (!mem_cgroup_trylock_pages(memcg))
-               return;
+               return young;
 
        arch_enter_lazy_mmu_mode();
 
        pte -= (addr - start) / PAGE_SIZE;
 
        for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+               bool success;
                unsigned long pfn;
 
                pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
-               if (pfn == -1)
+               if (pfn == -1) {
+                       skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
                        continue;
+               }
 
-               if (!pte_young(pte[i]))
+               success = test_spte_young(pvmw->vma->vm_mm, addr, end, bitmap, 
&last);
+               if (!success && !pte_young(pte[i])) {
+                       skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
                        continue;
+               }
 
                folio = get_pfn_folio(pfn, memcg, pgdat, !walk || 
walk->can_swap);
-               if (!folio)
+               if (!folio) {
+                       skip_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
                        continue;
+               }
 
-               if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
-                       VM_WARN_ON_ONCE(true);
+               clear_spte_young(pvmw->vma->vm_mm, addr, bitmap, &last);
+               if (pte_young(pte[i]))
+                       ptep_test_and_clear_young(pvmw->vma, addr, pte + i);
 
                young++;
 
@@ -4680,6 +4776,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
        /* feedback from rmap walkers to page table walkers */
        if (suitable_to_scan(i, young))
                update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+
+       return young;
 }
 
 /******************************************************************************
@@ -5699,6 +5797,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
        if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
                caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 
+       if (kvm_arch_has_test_clear_young() && get_cap(LRU_GEN_SPTE_WALK))
+               caps |= BIT(LRU_GEN_SPTE_WALK);
+
        return sysfs_emit(buf, "0x%04x\n", caps);
 }
 
-- 
2.39.2.637.g21b0678d19-goog

Reply via email to