In the page fault path, we want to add pages to the per-zone lists
indexed by max_seq, as they cannot be evicted without going through
the aging first. For anon pages, we rename
lru_cache_add_inactive_or_unevictable() to lru_cache_add_page_vma()
and add a new parameter, which is set to true in the page fault path,
to indicate whether they should be added to the per-zone lists indexed
by max_seq. For page/swap cache, since we cannot differentiate the
page fault path from the read ahead path at the time we call
lru_cache_add() in add_to_page_cache_lru() and
__read_swap_cache_async(), we have to add a new function
lru_gen_activate_page(), which is essentially activate_page(), to move
pages to the per-zone lists indexed by max_seq at a later time.
Hopefully, we will find the pages we want to activate in
lru_pvecs.lru_add and can simply set PageActive() on them without
having to actually move them.

In the reclaim path, pages mapped around a referenced PTE may also
have been referenced due to spatial locality. We add a new function
lru_gen_scan_around() to scan the vicinity of such a PTE.

In addition, we add a new function page_is_active() to tell whether a
page is active. We cannot use PageActive() because it is only set on
active pages while they are not on the multigenerational lru. It is
cleared while pages are on the multigenerational lru, in order to
spare the aging the trouble of clearing it when an active generation
becomes inactive. Internally, page_is_active() compares the generation
number of a page with max_seq and max_seq-1, which are the active
generations and are protected from eviction. Other generations, which
may or may not exist, are inactive.

Signed-off-by: Yu Zhao <yuz...@google.com>
---
 fs/proc/task_mmu.c        |  3 ++-
 include/linux/mm_inline.h | 52 ++++++++++++++++++++++++++++++++++++++
 include/linux/mmzone.h    |  6 +++++
 include/linux/swap.h      |  4 +--
 kernel/events/uprobes.c   |  2 +-
 mm/huge_memory.c          |  2 +-
 mm/khugepaged.c           |  2 +-
 mm/memory.c               | 14 +++++++----
 mm/migrate.c              |  2 +-
 mm/rmap.c                 |  6 +++++
 mm/swap.c                 | 26 +++++++++++--------
 mm/swapfile.c             |  2 +-
 mm/userfaultfd.c          |  2 +-
 mm/vmscan.c               | 53 ++++++++++++++++++++++++++++++++++++++-
 14 files changed, 150 insertions(+), 26 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3cec6fbef725..7cd173710e76 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/uaccess.h>
 #include <linux/pkeys.h>
+#include <linux/mm_inline.h>
 
 #include <asm/elf.h>
 #include <asm/tlb.h>
@@ -1720,7 +1721,7 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
        if (PageSwapCache(page))
                md->swapcache += nr_pages;
 
-       if (PageActive(page) || PageUnevictable(page))
+       if (PageUnevictable(page) || page_is_active(compound_head(page), NULL))
                md->active += nr_pages;
 
        if (PageWriteback(page))
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2d306cab36bc..a1a382418fc4 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -116,6 +116,49 @@ static inline int page_lru_gen(struct page *page)
        return ((READ_ONCE(page->flags) & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
 }
 
+/* This function works regardless whether multigenerational lru is enabled. */
+static inline bool page_is_active(struct page *page, struct lruvec *lruvec)
+{
+       struct mem_cgroup *memcg;
+       int gen = page_lru_gen(page);
+       bool active = false;
+
+       VM_BUG_ON_PAGE(PageTail(page), page);
+
+       if (gen < 0)
+               return PageActive(page);
+
+       if (lruvec) {
+               VM_BUG_ON_PAGE(PageUnevictable(page), page);
+               VM_BUG_ON_PAGE(PageActive(page), page);
+               lockdep_assert_held(&lruvec->lru_lock);
+
+               return lru_gen_is_active(lruvec, gen);
+       }
+
+       rcu_read_lock();
+
+       memcg = page_memcg_rcu(page);
+       lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
+       active = lru_gen_is_active(lruvec, gen);
+
+       rcu_read_unlock();
+
+       return active;
+}
+
+/* Activate a page from page cache or swap cache after it's mapped. */
+static inline void lru_gen_activate_page(struct page *page, struct vm_area_struct *vma)
+{
+       if (!lru_gen_enabled() || PageActive(page))
+               return;
+
+       if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_HUGETLB))
+               return;
+
+       activate_page(page);
+}
+
 /* Update multigenerational lru sizes in addition to active/inactive lru sizes. */
 static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
                                       int old_gen, int new_gen)
@@ -252,6 +295,15 @@ static inline bool lru_gen_enabled(void)
        return false;
 }
 
+static inline bool page_is_active(struct page *page, struct lruvec *lruvec)
+{
+       return PageActive(page);
+}
+
+static inline void lru_gen_activate_page(struct page *page, struct vm_area_struct *vma)
+{
+}
+
 static inline bool page_set_lru_gen(struct page *page, struct lruvec *lruvec, bool front)
 {
        return false;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 173083bb846e..99156602cd06 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -292,6 +292,7 @@ enum lruvec_flags {
 };
 
 struct lruvec;
+struct page_vma_mapped_walk;
 
 #define LRU_GEN_MASK   ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 
@@ -328,6 +329,7 @@ struct lru_gen {
 
 void lru_gen_init_lruvec(struct lruvec *lruvec);
 void lru_gen_set_state(bool enable, bool main, bool swap);
+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw);
 
 #else /* CONFIG_LRU_GEN */
 
@@ -339,6 +341,10 @@ static inline void lru_gen_set_state(bool enable, bool main, bool swap)
 {
 }
 
+static inline void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
 #endif /* CONFIG_LRU_GEN */
 
 struct lruvec {
diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2bbbf181ba..0e7532c7db22 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -350,8 +350,8 @@ extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
 extern void swap_setup(void);
 
-extern void lru_cache_add_inactive_or_unevictable(struct page *page,
-                                               struct vm_area_struct *vma);
+extern void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+                                  bool faulting);
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6addc9780319..4e93e5602723 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -184,7 +184,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
        if (new_page) {
                get_page(new_page);
                page_add_new_anon_rmap(new_page, vma, addr, false);
-               lru_cache_add_inactive_or_unevictable(new_page, vma);
+               lru_cache_add_page_vma(new_page, vma, false);
        } else
                /* no new page, just dec_mm_counter for old_page */
                dec_mm_counter(mm, MM_ANONPAGES);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index be9bf681313c..62e14da5264e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -637,7 +637,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                page_add_new_anon_rmap(page, vma, haddr, true);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               lru_cache_add_page_vma(page, vma, true);
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a7d6cb912b05..08a43910f232 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1199,7 +1199,7 @@ static void collapse_huge_page(struct mm_struct *mm,
        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address, true);
-       lru_cache_add_inactive_or_unevictable(new_page, vma);
+       lru_cache_add_page_vma(new_page, vma, true);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
diff --git a/mm/memory.c b/mm/memory.c
index c8e357627318..7188607bddb9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -73,6 +73,7 @@
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
+#include <linux/mm_inline.h>
 
 #include <trace/events/kmem.h>
 
@@ -845,7 +846,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
        copy_user_highpage(new_page, page, addr, src_vma);
        __SetPageUptodate(new_page);
        page_add_new_anon_rmap(new_page, dst_vma, addr, false);
-       lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
+       lru_cache_add_page_vma(new_page, dst_vma, false);
        rss[mm_counter(new_page)]++;
 
        /* All done, just insert the new page copy in the child */
@@ -2913,7 +2914,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                 */
                ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
                page_add_new_anon_rmap(new_page, vma, vmf->address, false);
-               lru_cache_add_inactive_or_unevictable(new_page, vma);
+               lru_cache_add_page_vma(new_page, vma, true);
                /*
                 * We call the notify macro here because, when using secondary
                 * mmu page tables (such as kvm shadow page tables), we want the
@@ -3436,9 +3437,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        /* ksm created a completely new copy */
        if (unlikely(page != swapcache && swapcache)) {
                page_add_new_anon_rmap(page, vma, vmf->address, false);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               lru_cache_add_page_vma(page, vma, true);
        } else {
                do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
+               lru_gen_activate_page(page, vma);
        }
 
        swap_free(entry);
@@ -3582,7 +3584,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
        inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, vmf->address, false);
-       lru_cache_add_inactive_or_unevictable(page, vma);
+       lru_cache_add_page_vma(page, vma, true);
 setpte:
        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
 
@@ -3707,6 +3709,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 
        add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
        page_add_file_rmap(page, true);
+       lru_gen_activate_page(page, vma);
        /*
         * deposit and withdraw with pmd lock held
         */
@@ -3750,10 +3753,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
        if (write && !(vma->vm_flags & VM_SHARED)) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
                page_add_new_anon_rmap(page, vma, addr, false);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               lru_cache_add_page_vma(page, vma, true);
        } else {
                inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page, false);
+               lru_gen_activate_page(page, vma);
        }
        set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..1064b03cac33 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3004,7 +3004,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
        inc_mm_counter(mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, addr, false);
        if (!is_zone_device_page(page))
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               lru_cache_add_page_vma(page, vma, false);
        get_page(page);
 
        if (flush) {
diff --git a/mm/rmap.c b/mm/rmap.c
index b0fc27e77d6d..a44f9ee74ee1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,7 @@
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
 
@@ -792,6 +793,11 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
                }
 
                if (pvmw.pte) {
+                       /* multigenerational lru exploits spatial locality */
+                       if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
+                               lru_gen_scan_around(&pvmw);
+                               referenced++;
+                       }
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte)) {
                                /*
diff --git a/mm/swap.c b/mm/swap.c
index bd10efe00684..7aa85004b490 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -310,7 +310,7 @@ void lru_note_cost_page(struct page *page)
 
 static void __activate_page(struct page *page, struct lruvec *lruvec)
 {
-       if (!PageActive(page) && !PageUnevictable(page)) {
+       if (!PageUnevictable(page) && !page_is_active(page, lruvec)) {
                int nr_pages = thp_nr_pages(page);
 
                del_page_from_lru_list(page, lruvec);
@@ -341,7 +341,7 @@ static bool need_activate_page_drain(int cpu)
 static void activate_page_on_lru(struct page *page)
 {
        page = compound_head(page);
-       if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+       if (PageLRU(page) && !PageUnevictable(page) && !page_is_active(page, NULL)) {
                struct pagevec *pvec;
 
                local_lock(&lru_pvecs.lock);
@@ -435,7 +435,7 @@ void mark_page_accessed(struct page *page)
                 * this list is never rotated or maintained, so marking an
                 * evictable page accessed has no effect.
                 */
-       } else if (!PageActive(page)) {
+       } else if (!page_is_active(page, NULL)) {
                activate_page(page);
                ClearPageReferenced(page);
                workingset_activation(page);
@@ -471,15 +471,14 @@ void lru_cache_add(struct page *page)
 EXPORT_SYMBOL(lru_cache_add);
 
 /**
- * lru_cache_add_inactive_or_unevictable
+ * lru_cache_add_page_vma
  * @page:  the page to be added to LRU
  * @vma:   vma in which page is mapped for determining reclaimability
  *
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * Place @page on an LRU list, depending on its evictability.
  */
-void lru_cache_add_inactive_or_unevictable(struct page *page,
-                                        struct vm_area_struct *vma)
+void lru_cache_add_page_vma(struct page *page, struct vm_area_struct *vma,
+                           bool faulting)
 {
        bool unevictable;
 
@@ -496,6 +495,11 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
                __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
                count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
        }
+
+       /* multigenerational lru uses PageActive() to track page faults */
+       if (lru_gen_enabled() && !unevictable && faulting)
+               SetPageActive(page);
+
        lru_cache_add(page);
 }
 
@@ -522,7 +526,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
  */
 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 {
-       bool active = PageActive(page);
+       bool active = page_is_active(page, lruvec);
        int nr_pages = thp_nr_pages(page);
 
        if (PageUnevictable(page))
@@ -562,7 +566,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
 
 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 {
-       if (PageActive(page) && !PageUnevictable(page)) {
+       if (!PageUnevictable(page) && page_is_active(page, lruvec)) {
                int nr_pages = thp_nr_pages(page);
 
                del_page_from_lru_list(page, lruvec);
@@ -676,7 +680,7 @@ void deactivate_file_page(struct page *page)
  */
 void deactivate_page(struct page *page)
 {
-       if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+       if (PageLRU(page) && !PageUnevictable(page) && page_is_active(page, NULL)) {
                struct pagevec *pvec;
 
                local_lock(&lru_pvecs.lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fe03cfeaa08f..c0956b3bde03 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1936,7 +1936,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                page_add_anon_rmap(page, vma, addr, false);
        } else { /* ksm created a completely new copy */
                page_add_new_anon_rmap(page, vma, addr, false);
-               lru_cache_add_inactive_or_unevictable(page, vma);
+               lru_cache_add_page_vma(page, vma, false);
        }
        swap_free(entry);
 out:
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..e1d4cd3103b8 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 
        inc_mm_counter(dst_mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
-       lru_cache_add_inactive_or_unevictable(page, dst_vma);
+       lru_cache_add_page_vma(page, dst_vma, true);
 
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fd49a9a5d7f5..ce868d89dc53 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1876,7 +1876,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
                add_page_to_lru_list(page, lruvec);
                nr_pages = thp_nr_pages(page);
                nr_moved += nr_pages;
-               if (PageActive(page))
+               if (page_is_active(page, lruvec))
                        workingset_age_nonresident(lruvec, nr_pages);
        }
 
@@ -4688,6 +4688,57 @@ static int page_update_lru_gen(struct page *page, int new_gen)
        return old_gen;
 }
 
+void lru_gen_scan_around(struct page_vma_mapped_walk *pvmw)
+{
+       pte_t *pte;
+       unsigned long start, end;
+       int old_gen, new_gen;
+       unsigned long flags;
+       struct lruvec *lruvec;
+       struct mem_cgroup *memcg;
+       struct pglist_data *pgdat = page_pgdat(pvmw->page);
+
+       lockdep_assert_held(pvmw->ptl);
+       VM_BUG_ON_VMA(pvmw->address < pvmw->vma->vm_start, pvmw->vma);
+
+       start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+       end = pmd_addr_end(pvmw->address, pvmw->vma->vm_end);
+       pte = pvmw->pte - ((pvmw->address - start) >> PAGE_SHIFT);
+
+       memcg = lock_page_memcg(pvmw->page);
+       lruvec = lock_page_lruvec_irqsave(pvmw->page, &flags);
+
+       new_gen = lru_gen_from_seq(lruvec->evictable.max_seq);
+
+       for (; start != end; pte++, start += PAGE_SIZE) {
+               struct page *page;
+               unsigned long pfn = pte_pfn(*pte);
+
+               if (!pte_present(*pte) || !pte_young(*pte) || is_zero_pfn(pfn))
+                       continue;
+
+               if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+                       continue;
+
+               page = compound_head(pte_page(*pte));
+               if (page_to_nid(page) != pgdat->node_id)
+                       continue;
+               if (page_memcg_rcu(page) != memcg)
+                       continue;
+               /*
+                * We may be holding many locks. So try to finish as fast as
+                * possible and leave the accessed and the dirty bits to page
+                * table walk.
+                */
+               old_gen = page_update_lru_gen(page, new_gen);
+               if (old_gen >= 0 && old_gen != new_gen)
+                       lru_gen_update_size(page, lruvec, old_gen, new_gen);
+       }
+
+       unlock_page_lruvec_irqrestore(lruvec, flags);
+       unlock_page_memcg(pvmw->page);
+}
+
 struct mm_walk_args {
        struct mem_cgroup *memcg;
        unsigned long max_seq;
-- 
2.31.0.rc2.261.g7f71774620-goog

Reply via email to