On Tue, 20 Jan 2026 10:43:47 +0800 Jiayuan Chen <[email protected]> wrote:
> == Problem ==
>
> We observed an issue in production on a multi-NUMA system where kswapd
> runs endlessly, causing sustained heavy IO READ pressure across the
> entire system.
>
> The root cause is that direct reclaim triggered by cgroup memory.high
> keeps resetting kswapd_failures to 0, even when the node cannot be
> balanced.  This prevents kswapd from ever stopping after reaching
> MAX_RECLAIM_RETRIES.
>

Updated, thanks.

> v3 -> v4:
> https://lore.kernel.org/linux-mm/[email protected]/
> - Add Acked-by tags
> - Some modifications suggested by Johannes Weiner
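
To make the quoted feedback loop concrete, below is a minimal userspace
model of the counter (illustration only, not kernel code: the
"node_balanced" flag and the simplified reset helpers stand in for the
real pgdat state, pgdat_balanced() and the kswapd_failures handling):

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16	/* same give-up threshold as the kernel */

static int kswapd_failures;
static bool node_balanced;	/* stays false: the node cannot be balanced */

/* one kswapd balancing run that fails to balance the node */
static void kswapd_attempt(void)
{
	if (!node_balanced)
		kswapd_failures++;
}

/* direct reclaim (e.g. memory.high throttling) made progress */
static void direct_reclaim_success(bool gate_on_balanced)
{
	if (!gate_on_balanced)
		kswapd_failures = 0;	/* old: unconditional reset */
	else if (node_balanced)
		kswapd_failures = 0;	/* new: reset only when balanced */
}

static int runs_until_hopeless(bool gate_on_balanced)
{
	int runs;

	kswapd_failures = 0;
	for (runs = 1; runs <= 1000; runs++) {
		kswapd_attempt();
		direct_reclaim_success(gate_on_balanced);
		if (kswapd_failures >= MAX_RECLAIM_RETRIES)
			return runs;	/* node marked hopeless, kswapd sleeps */
	}
	return -1;			/* never stops: the endless-kswapd bug */
}

int main(void)
{
	printf("ungated reset:     %d\n", runs_until_hopeless(false));	/* -1 */
	printf("gated on balanced: %d\n", runs_until_hopeless(true));	/* 16 */
	return 0;
}

With the unconditional reset, kswapd_failures can never reach
MAX_RECLAIM_RETRIES while memory.high throttling keeps direct reclaim
succeeding; gating the reset on the balanced check lets the counter
saturate so kswapd can go dormant.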
Here's how v4 altered mm.git:

 include/linux/mmzone.h        |   26 ++++++++-----
 include/trace/events/vmscan.h |   24 ++++++------
 mm/memory-tiers.c             |    2 -
 mm/page_alloc.c               |    4 +-
 mm/show_mem.c                 |    3 -
 mm/vmscan.c                   |   60 +++++++++++++++++---------------
 mm/vmstat.c                   |    2 -
 7 files changed, 64 insertions(+), 57 deletions(-)

--- a/include/linux/mmzone.h~b
+++ a/include/linux/mmzone.h
@@ -1531,26 +1531,30 @@ static inline unsigned long pgdat_end_pf
 	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
 }
 
-enum reset_kswapd_failures_reason {
-	RESET_KSWAPD_FAILURES_OTHER = 0,
-	RESET_KSWAPD_FAILURES_KSWAPD,
-	RESET_KSWAPD_FAILURES_DIRECT,
-	RESET_KSWAPD_FAILURES_PCP,
-};
-
-void pgdat_reset_kswapd_failures(pg_data_t *pgdat, enum reset_kswapd_failures_reason reason);
-
 #include <linux/memory_hotplug.h>
 
 void build_all_zonelists(pg_data_t *pgdat);
-void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
-		enum zone_type highest_zoneidx);
 bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
		int highest_zoneidx, unsigned int alloc_flags, long free_pages);
 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
		int highest_zoneidx, unsigned int alloc_flags);
+
+enum kswapd_clear_hopeless_reason {
+	KSWAPD_CLEAR_HOPELESS_OTHER = 0,
+	KSWAPD_CLEAR_HOPELESS_KSWAPD,
+	KSWAPD_CLEAR_HOPELESS_DIRECT,
+	KSWAPD_CLEAR_HOPELESS_PCP,
+};
+
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
+		enum zone_type highest_zoneidx);
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+		unsigned int order, int highest_zoneidx);
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason);
+bool kswapd_test_hopeless(pg_data_t *pgdat);
+
 /*
  * Memory initialization context, use to differentiate memory added by
  * the platform statically or via memory hotplug interface.
--- a/include/trace/events/vmscan.h~b
+++ a/include/trace/events/vmscan.h
@@ -40,16 +40,16 @@
 		{_VMSCAN_THROTTLE_CONGESTED, "VMSCAN_THROTTLE_CONGESTED"} \
 		) : "VMSCAN_THROTTLE_NONE"
 
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_OTHER);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_KSWAPD);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_DIRECT);
-TRACE_DEFINE_ENUM(RESET_KSWAPD_FAILURES_PCP);
-
-#define reset_kswapd_src \
-	{RESET_KSWAPD_FAILURES_KSWAPD, "KSWAPD"}, \
-	{RESET_KSWAPD_FAILURES_DIRECT, "DIRECT"}, \
-	{RESET_KSWAPD_FAILURES_PCP, "PCP"}, \
-	{RESET_KSWAPD_FAILURES_OTHER, "OTHER"}
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_OTHER);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_KSWAPD);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_DIRECT);
+TRACE_DEFINE_ENUM(KSWAPD_CLEAR_HOPELESS_PCP);
+
+#define kswapd_clear_hopeless_reason_ops \
+	{KSWAPD_CLEAR_HOPELESS_KSWAPD, "KSWAPD"}, \
+	{KSWAPD_CLEAR_HOPELESS_DIRECT, "DIRECT"}, \
+	{KSWAPD_CLEAR_HOPELESS_PCP, "PCP"}, \
+	{KSWAPD_CLEAR_HOPELESS_OTHER, "OTHER"}
 
 #define trace_reclaim_flags(file) ( \
	(file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \
@@ -566,7 +566,7 @@ TRACE_EVENT(mm_vmscan_kswapd_reclaim_fai
 		__entry->nid, __entry->failures)
 );
 
-TRACE_EVENT(mm_vmscan_reset_kswapd_failures,
+TRACE_EVENT(mm_vmscan_kswapd_clear_hopeless,
 
 	TP_PROTO(int nid, int reason),
 
@@ -584,7 +584,7 @@ TRACE_EVENT(mm_vmscan_reset_kswapd_failu
 
 	TP_printk("nid=%d reason=%s",
 		__entry->nid,
-		__print_symbolic(__entry->reason, reset_kswapd_src))
+		__print_symbolic(__entry->reason, kswapd_clear_hopeless_reason_ops))
 );
 
 #endif /* _TRACE_VMSCAN_H */
--- a/mm/memory-tiers.c~b
+++ a/mm/memory-tiers.c
@@ -955,7 +955,7 @@ static ssize_t demotion_enabled_store(st
		struct pglist_data *pgdat;
 
		for_each_online_pgdat(pgdat)
-			pgdat_reset_kswapd_failures(pgdat, RESET_KSWAPD_FAILURES_OTHER);
+			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_OTHER);
	}
 
	return count;
--- a/mm/page_alloc.c~b
+++ a/mm/page_alloc.c
@@ -2945,9 +2945,9 @@ static bool free_frozen_page_commit(stru
		 * 'hopeless node' to stay in that state for a while. Let
		 * kswapd work again by resetting kswapd_failures.
		 */
-		if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+		if (kswapd_test_hopeless(pgdat) &&
		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
-			pgdat_reset_kswapd_failures(pgdat, RESET_KSWAPD_FAILURES_PCP);
+			kswapd_clear_hopeless(pgdat, KSWAPD_CLEAR_HOPELESS_PCP);
	}
	return ret;
 }
--- a/mm/show_mem.c~b
+++ a/mm/show_mem.c
@@ -278,8 +278,7 @@ static void show_free_areas(unsigned int
 #endif
			K(node_page_state(pgdat, NR_PAGETABLE)),
			K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
-			str_yes_no(atomic_read(&pgdat->kswapd_failures) >=
-				MAX_RECLAIM_RETRIES),
+			str_yes_no(kswapd_test_hopeless(pgdat)),
			K(node_page_state(pgdat, NR_BALLOON_PAGES)));
 }
--- a/mm/vmscan.c~b
+++ a/mm/vmscan.c
@@ -506,7 +506,7 @@ static bool skip_throttle_noprogress(pg_
	 * If kswapd is disabled, reschedule if necessary but do not
	 * throttle as the system is likely near OOM.
	 */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
		return true;
 
	/*
@@ -2647,28 +2647,6 @@ static bool can_age_anon_pages(struct lr
			lruvec_memcg(lruvec));
 }
 
-void pgdat_reset_kswapd_failures(pg_data_t *pgdat, enum reset_kswapd_failures_reason reason)
-{
-	/* Only trace actual resets, not redundant zero-to-zero */
-	if (atomic_xchg(&pgdat->kswapd_failures, 0))
-		trace_mm_vmscan_reset_kswapd_failures(pgdat->node_id, reason);
-}
-
-/*
- * Reset kswapd_failures only when the node is balanced. Without this
- * check, successful direct reclaim (e.g., from cgroup memory.high
- * throttling) can keep resetting kswapd_failures even when the node
- * cannot be balanced, causing kswapd to run endlessly.
- */
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx);
-static inline void pgdat_try_reset_kswapd_failures(struct pglist_data *pgdat,
-						struct scan_control *sc)
-{
-	if (pgdat_balanced(pgdat, sc->order, sc->reclaim_idx))
-		pgdat_reset_kswapd_failures(pgdat, current_is_kswapd() ?
-			RESET_KSWAPD_FAILURES_KSWAPD : RESET_KSWAPD_FAILURES_DIRECT);
-}
-
 #ifdef CONFIG_LRU_GEN
 
 #ifdef CONFIG_LRU_GEN_ENABLED
@@ -5086,7 +5064,7 @@ static void lru_gen_shrink_node(struct p
	blk_finish_plug(&plug);
 done:
	if (sc->nr_reclaimed > reclaimed)
-		pgdat_try_reset_kswapd_failures(pgdat, sc);
+		kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
 }
 
 /******************************************************************************
@@ -6153,7 +6131,7 @@ again:
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
-		pgdat_try_reset_kswapd_failures(pgdat, sc);
+		kswapd_try_clear_hopeless(pgdat, sc->order, sc->reclaim_idx);
	else if (sc->cache_trim_mode)
		sc->cache_trim_mode_failed = 1;
 }
@@ -6458,7 +6436,7 @@ static bool allow_direct_reclaim(pg_data
	int i;
	bool wmark_ok;
 
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
		return true;
 
	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
@@ -6867,7 +6845,7 @@ static bool prepare_kswapd_sleep(pg_data
		wake_up_all(&pgdat->pfmemalloc_wait);
 
	/* Hopeless node, leave it to direct reclaim */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES)
+	if (kswapd_test_hopeless(pgdat))
		return true;
 
	if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
@@ -7395,7 +7373,7 @@ void wakeup_kswapd(struct zone *zone, gf
		return;
 
	/* Hopeless node, leave it to direct reclaim if possible */
-	if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES ||
+	if (kswapd_test_hopeless(pgdat) ||
	    (pgdat_balanced(pgdat, order, highest_zoneidx) &&
	     !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
		/*
@@ -7415,6 +7393,32 @@ void wakeup_kswapd(struct zone *zone, gf
	wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
+void kswapd_clear_hopeless(pg_data_t *pgdat, enum kswapd_clear_hopeless_reason reason)
+{
+	/* Only trace actual resets, not redundant zero-to-zero */
+	if (atomic_xchg(&pgdat->kswapd_failures, 0))
+		trace_mm_vmscan_kswapd_clear_hopeless(pgdat->node_id, reason);
+}
+
+/*
+ * Reset kswapd_failures only when the node is balanced. Without this
+ * check, successful direct reclaim (e.g., from cgroup memory.high
+ * throttling) can keep resetting kswapd_failures even when the node
+ * cannot be balanced, causing kswapd to run endlessly.
+ */
+void kswapd_try_clear_hopeless(struct pglist_data *pgdat,
+		unsigned int order, int highest_zoneidx)
+{
+	if (pgdat_balanced(pgdat, order, highest_zoneidx))
+		kswapd_clear_hopeless(pgdat, current_is_kswapd() ?
+			KSWAPD_CLEAR_HOPELESS_KSWAPD : KSWAPD_CLEAR_HOPELESS_DIRECT);
+}
+
+bool kswapd_test_hopeless(pg_data_t *pgdat)
+{
+	return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
+}
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
--- a/mm/vmstat.c~b
+++ a/mm/vmstat.c
@@ -1840,7 +1840,7 @@ static void zoneinfo_show_print(struct s
		   "\n  start_pfn:           %lu"
		   "\n  reserved_highatomic: %lu"
		   "\n  free_highatomic:     %lu",
-		   atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES,
+		   kswapd_test_hopeless(pgdat),
		   zone->zone_start_pfn,
		   zone->nr_reserved_highatomic,
		   zone->nr_free_highatomic);
_
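
Once this lands, clears of kswapd_failures should be observable through
the vmscan trace system: assuming tracefs is mounted at the usual
/sys/kernel/tracing, enabling
events/vmscan/mm_vmscan_kswapd_clear_hopeless/enable emits one event per
actual reset, and per the TP_printk above each event reads
"nid=%d reason=%s", with reason attributing the clear to KSWAPD, DIRECT
or PCP (OTHER covers e.g. the memory-tiers demotion toggle).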
