On Tue, Sep 16, 2025 at 2:22 AM Lei Liu <[email protected]> wrote:
...
2. Solution Proposal
Introduce a Readahead LRU to track pages brought in via readahead. During
memory reclamation, prioritize scanning this LRU to reclaim pages that
have not been accessed recently. Pages in the Readahead LRU that are
accessed are moved to the inactive_file LRU to await subsequent
reclamation.
I'm not sure this is the right solution, though: the readahead LRU would
be enabled for all users, and the only performance numbers here are for
application startup.
My impression is that readahead behavior depends heavily on the hardware,
the workload, and the desired outcome, so making the readahead{-adjacent}
behavior more amenable to tuning seems like the right direction.
A possibly relevant discussion: https://lwn.net/Articles/897786/
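To illustrate the kind of tuning that already exists (nothing new here,
just the standard per-fd hints; the file argument is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Access will be random: readahead for this file is minimized. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);

	/* Or the opposite: sequential access, larger readahead window. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);

	/* Cached pages that won't be reused can be dropped eagerly. */
	posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);

	close(fd);
	return 0;
}

There is also the per-device /sys/block/<dev>/queue/read_ahead_kb knob.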
I only skimmed the code but noticed a few things:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index a458f1e112fd..4f3f031134fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -71,6 +71,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]);
show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
+ show_val_kb(m, "ReadAhead(file):",
I notice both "readahead" and "read ahead" spellings in this patch
(ReadAhead(file), LRU_READ_AHEAD_FILE, ...); please stick to the
conventional one (readahead).
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 8d3fa3a91ce4..57dac828aa4f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -127,6 +127,7 @@ enum pageflags {
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
PG_arch_3,
#endif
+ PG_readahead_lru,
Another page flag... these are a scarce resource, so a new bit needs a
strong justification.
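For reference, the existing readahead marker deliberately avoids consuming
a new bit by sharing one with PG_reclaim (paraphrasing
include/linux/page-flags.h from memory; the exact spelling varies by
kernel version):

/*
 * Readahead only matters on the read side and PG_reclaim only on the
 * writeback side, so the two never need to coexist and can share a bit;
 * folio_test_readahead()/folio_clear_readahead() accessors already exist
 * for it.
 */
PG_readahead = PG_reclaim,

Whether that bit could carry this state too is a separate question (today
it only marks the async-readahead trigger folio and is cleared once the
next batch is kicked off), but it shows how reluctant we have been to
spend flags on readahead so far.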
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index aa441f593e9a..2dbc1701e838 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -159,7 +159,8 @@ TRACE_DEFINE_ENUM(___GFP_LAST_BIT);
DEF_PAGEFLAG_NAME(reclaim), \
DEF_PAGEFLAG_NAME(swapbacked), \
DEF_PAGEFLAG_NAME(unevictable), \
- DEF_PAGEFLAG_NAME(dropbehind) \
+ DEF_PAGEFLAG_NAME(dropbehind), \
+ DEF_PAGEFLAG_NAME(readahead_lru) \
IF_HAVE_PG_MLOCK(mlocked) \
IF_HAVE_PG_HWPOISON(hwpoison) \
IF_HAVE_PG_IDLE(idle) \
@@ -309,6 +310,7 @@ IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" )
\
EM (LRU_ACTIVE_ANON, "active_anon") \
EM (LRU_INACTIVE_FILE, "inactive_file") \
EM (LRU_ACTIVE_FILE, "active_file") \
+ EM(LRU_READ_AHEAD_FILE, "readahead_file") \
Likewise, inconsistent naming (LRU_READ_AHEAD_FILE vs. "readahead_file").
diff --git a/mm/migrate.c b/mm/migrate.c
index 9e5ef39ce73a..0feab4d89d47 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -760,6 +760,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_workingset(newfolio);
if (folio_test_checked(folio))
folio_set_checked(newfolio);
+ if (folio_test_readahead_lru(folio))
+ folio_set_readahead_lru(folio);
Shouldn't this set the flag on newfolio, like the surrounding lines do?
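That is, presumably (assuming the intent mirrors the folio_set_checked()
pair just above):

	if (folio_test_readahead_lru(folio))
		folio_set_readahead_lru(newfolio);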
@@ -5800,6 +5837,87 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
#endif /* CONFIG_LRU_GEN */
+static unsigned long shrink_read_ahead_list(unsigned long nr_to_scan,
+ unsigned long nr_to_reclaim,
+ struct lruvec *lruvec,
+ struct scan_control *sc)
+{
+ LIST_HEAD(l_hold);
+ LIST_HEAD(l_reclaim);
+ LIST_HEAD(l_inactive);
+ unsigned long nr_scanned = 0;
+ unsigned long nr_taken = 0;
+ unsigned long nr_reclaimed = 0;
+ unsigned long vm_flags;
+ enum vm_event_item item;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ struct reclaim_stat stat = { 0 };
+
+ lru_add_drain();
+
+ spin_lock_irq(&lruvec->lru_lock);
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned,
+ sc, LRU_READ_AHEAD_FILE);
+
+ __count_vm_events(PGSCAN_READAHEAD_FILE, nr_scanned);
+ __mod_node_page_state(pgdat, NR_ISOLATED_FILE, nr_taken);
+ item = PGSCAN_KSWAPD + reclaimer_offset(sc);
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ __count_vm_events(PGSCAN_FILE, nr_scanned);
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ if (nr_taken == 0)
+ return 0;
+
+ while (!list_empty(&l_hold)) {
+ struct folio *folio;
+
+ cond_resched();
+ folio = lru_to_folio(&l_hold);
+ list_del(&folio->lru);
+ folio_clear_readahead_lru(folio);
+
+ if (folio_referenced(folio, 0, sc->target_mem_cgroup,
+ &vm_flags)) {
+ list_add(&folio->lru, &l_inactive);
+ continue;
+ }
+ folio_clear_active(folio);
+ list_add(&folio->lru, &l_reclaim);
+ }
+
+ nr_reclaimed = shrink_folio_list(&l_reclaim, pgdat, sc, &stat, true,
+ lruvec_memcg(lruvec));
+
+ list_splice(&l_reclaim, &l_inactive);
+
+ spin_lock_irq(&lruvec->lru_lock);
+ move_folios_to_lru(lruvec, &l_inactive);
+ __mod_node_page_state(pgdat, NR_ISOLATED_FILE, -nr_taken);
+
+ __count_vm_events(PGSTEAL_READAHEAD_FILE, nr_reclaimed);
+ item = PGSTEAL_KSWAPD + reclaimer_offset(sc);
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ __count_vm_events(PGSTEAL_FILE, nr_reclaimed);
+ spin_unlock_irq(&lruvec->lru_lock);
I see the idea is that readahead pages should be scanned before the rest
of the inactive file list. I wonder whether this is achievable without
adding another LRU.
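For instance, something along these lines might get readahead folios
scanned first without a third file list (an untested sketch only;
folio_queued_by_ra() is a placeholder for however we decide a folio came
from readahead, and queueing at the tail obviously changes aging, so this
is just to illustrate the ordering):

#include <linux/mm_inline.h>

static void lru_add_readahead_first(struct lruvec *lruvec,
				    struct folio *folio)
{
	/*
	 * Reclaim isolates folios from the tail of the inactive list, so
	 * adding readahead folios there means they are considered before
	 * older page cache.
	 */
	if (folio_queued_by_ra(folio) && !folio_test_active(folio))
		lruvec_add_folio_tail(lruvec, folio);
	else
		lruvec_add_folio(lruvec, folio);
}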
Thanks,
Yuanchu