Instead of iterating over all cgroups, reclaim first from the cgroup that triggered the allocation: direct reclaim starts with the current task's memcg, while kswapd starts with the memcg that wakeup_kswapd() recorded in the pgdat. Fall back to the full mem_cgroup_iter() walk only if that targeted pass reclaims nothing. Also, don't reclaim from a cgroup that is refaulting, since evicting more of its working set would only cause thrashing.
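To make the intended ordering concrete, here is a small userspace model of the new shrink_zone()/shrink_memcg() flow (a sketch only: every structure, name and number below is invented for illustration, and kernel details such as slab shrinking, scan priorities, the memcg-low protection and the priority-0 override of the refault check are left out):

/*
 * Userspace model of the reclaim ordering introduced by this patch: try
 * the cgroup that triggered the allocation first, skip cgroups that are
 * refaulting, and fall back to walking all cgroups only if the targeted
 * pass reclaimed nothing.
 */
#include <stdbool.h>
#include <stdio.h>

struct cgroup {
	const char *name;
	unsigned long refaults;		/* running refault counter */
	unsigned long refault_snapshot;	/* value seen after the last scan */
	unsigned long reclaimable;	/* pages a scan would free */
};

/* Mirrors mem_cgroup_refaults(): the counter moved since the snapshot,
 * so the cgroup is thrashing and reclaiming it again would evict pages
 * it is actively using. */
static bool is_refaulting(struct cgroup *cg)
{
	return cg->refaults != cg->refault_snapshot;
}

/* Mirrors shrink_memcg(): returns how many pages were reclaimed. */
static unsigned long shrink_one(struct cgroup *cg)
{
	if (is_refaulting(cg))
		return 0;
	printf("reclaiming %lu pages from %s\n", cg->reclaimable, cg->name);
	return cg->reclaimable;
}

/* Mirrors the reworked shrink_zone(): targeted pass, then fallback walk. */
static void shrink_zone_model(struct cgroup *target, struct cgroup **all,
			      int n, unsigned long nr_to_reclaim)
{
	unsigned long done = shrink_one(target);

	if (done)
		return;
	for (int i = 0; i < n && done < nr_to_reclaim; i++)
		done += shrink_one(all[i]);
}

int main(void)
{
	/* The triggering cgroup is refaulting, so the targeted pass
	 * reclaims nothing and the fallback walk runs instead. */
	struct cgroup noisy = { "noisy", 12, 10, 64 };
	struct cgroup victim = { "victim", 5, 5, 32 };
	struct cgroup *all[] = { &noisy, &victim };

	shrink_zone_model(&noisy, all, 2, 32);
	return 0;
}

With the numbers above, only "victim" is scanned: the cgroup that woke kswapd is refaulting, so reclaim is satisfied from elsewhere in the hierarchy.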
https://pmc.acronis.com/browse/VSTOR-19037

Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 include/linux/memcontrol.h |   5 ++
 include/linux/mmzone.h     |   1 +
 mm/vmscan.c                | 122 ++++++++++++++++++++++---------
 3 files changed, 79 insertions(+), 49 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 575584dc1651..3dc16313a366 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -176,6 +176,11 @@ static inline void mem_cgroup_get(struct mem_cgroup *memcg)
 	css_get(mem_cgroup_css(memcg));
 }
 
+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+	return css_tryget(mem_cgroup_css(memcg));
+}
+
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 	css_put(mem_cgroup_css(memcg));
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 70e925d41445..59f53adfc1c5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -807,6 +807,7 @@ typedef struct pglist_data {
 					   mem_hotplug_begin/end() */
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
+	struct mem_cgroup *memcg;
 #ifdef CONFIG_NUMA_BALANCING
 	/* Lock serializing the migrate rate limiting window */
 	spinlock_t numabalancing_migrate_lock;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe651c6047db..583ba1abfc44 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2510,27 +2510,75 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
+static bool mem_cgroup_refaults(struct zone *zone, struct mem_cgroup *memcg)
+{
+	if (memcg) {
+		unsigned long refaults = memcg_ws_activates(memcg);
+		unsigned long snapshot = mem_cgroup_zone_lruvec(zone, memcg)->refaults;
+
+		return refaults != snapshot;
+	}
+	return false;
+}
+
+static unsigned long shrink_memcg(struct zone *zone, struct scan_control *sc,
+				  struct mem_cgroup *memcg, bool is_classzone)
+{
+	struct mem_cgroup *root = sc->target_mem_cgroup;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	unsigned long lru_pages, reclaimed;
+	bool slab_only = sc->slab_only;
+	struct lruvec *lruvec;
+
+	if (!sc->may_thrash && mem_cgroup_low(root, memcg))
+		return 0;
+
+	if (sc->priority && mem_cgroup_refaults(zone, memcg))
+		return 0;
+
+	reclaimed = sc->nr_reclaimed;
+
+	if (!slab_only) {
+		lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+		sc->swappiness = mem_cgroup_swappiness(memcg);
+		shrink_lruvec(lruvec, sc, &lru_pages);
+	}
+
+	if (is_classzone) {
+		shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+			    memcg, sc->priority, false);
+		if (reclaim_state) {
+			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+			sc->nr_scanned += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
+		}
+	}
+
+	return sc->nr_reclaimed - reclaimed;
+}
 
 static void shrink_zone(struct zone *zone, struct scan_control *sc,
 			bool is_classzone)
 {
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
-	gfp_t slab_gfp = sc->gfp_mask;
-	bool slab_only = sc->slab_only;
+	struct mem_cgroup *target_memcg = NULL;
+
+	if (current_is_kswapd()) {
+		target_memcg = smp_load_acquire(&zone->zone_pgdat->memcg);
+		mem_cgroup_get(target_memcg);
+	}
 
-	/* Disable fs-related IO for direct reclaim */
-	if (!sc->target_mem_cgroup &&
-	    (current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
-		slab_gfp &= ~__GFP_FS;
+	target_memcg = target_memcg ?
+			: get_mem_cgroup_from_mm(current->mm);
 
 	do {
+		unsigned long shrinked;
 		struct mem_cgroup *root = sc->target_mem_cgroup;
 		struct mem_cgroup_reclaim_cookie reclaim = {
 			.zone = zone,
 			.priority = sc->priority,
 		};
-		unsigned long zone_lru_pages = 0;
-		struct mem_cgroup *memcg;
+		struct mem_cgroup *memcg = target_memcg;
 		struct reclaim_stat stat = {};
 
 		sc->stat = &stat;
@@ -2538,50 +2586,19 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 		nr_reclaimed = sc->nr_reclaimed;
 		nr_scanned = sc->nr_scanned;
 
-		memcg = mem_cgroup_iter(root, NULL, &reclaim);
-		do {
-			unsigned long lru_pages, scanned;
-			struct lruvec *lruvec;
-
-			if (!sc->may_thrash && mem_cgroup_low(root, memcg))
-				continue;
-
-			scanned = sc->nr_scanned;
+		shrinked = shrink_memcg(zone, sc, memcg, is_classzone);
 
-			if (!slab_only) {
-				lruvec = mem_cgroup_zone_lruvec(zone, memcg);
-				sc->swappiness = mem_cgroup_swappiness(memcg);
-				shrink_lruvec(lruvec, sc, &lru_pages);
-				zone_lru_pages += lru_pages;
-			}
+		if (!shrinked) {
+			memcg = mem_cgroup_iter(root, NULL, &reclaim);
+			do {
+				shrink_memcg(zone, sc, memcg, is_classzone);
 
-			if (is_classzone) {
-				shrink_slab(slab_gfp, zone_to_nid(zone),
-					    memcg, sc->priority, false);
-				if (reclaim_state) {
-					sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-					sc->nr_scanned += reclaim_state->reclaimed_slab;
-					reclaim_state->reclaimed_slab = 0;
+				if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
+					mem_cgroup_iter_break(root, memcg);
+					break;
 				}
-
-			}
-
-			/*
-			 * Direct reclaim and kswapd have to scan all memory
-			 * cgroups to fulfill the overall scan target for the
-			 * zone.
-			 *
-			 * Limit reclaim, on the other hand, only cares about
-			 * nr_to_reclaim pages to be reclaimed and it will
-			 * retry with decreasing priority if one round over the
-			 * whole hierarchy is not sufficient.
-			 */
-			if (!global_reclaim(sc) &&
-			    sc->nr_reclaimed >= sc->nr_to_reclaim) {
-				mem_cgroup_iter_break(root, memcg);
-				break;
-			}
-		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+			} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+		}
 
 		if (global_reclaim(sc)) {
 			/*
@@ -2649,6 +2666,8 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 
 	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
+
+	mem_cgroup_put(target_memcg);
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
@@ -3811,6 +3830,7 @@ static int kswapd(void *p)
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 {
 	pg_data_t *pgdat;
+	struct mem_cgroup *prev_memcg;
 
 	if (!populated_zone(zone))
 		return;
@@ -3827,6 +3847,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
 		return;
 
+	prev_memcg = xchg(&pgdat->memcg, get_mem_cgroup_from_mm(current->mm));
+	if (prev_memcg)
+		mem_cgroup_put(prev_memcg);
+
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	wake_up_interruptible(&pgdat->kswapd_wait);
}
-- 
2.19.2
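The pgdat->memcg handoff in wakeup_kswapd()/shrink_zone() is, at its core, a reference-counted pointer publish over an atomic exchange. Below is a minimal userspace model of that protocol (a sketch only: all names are invented, C11 seq_cst atomics stand in for the kernel's xchg() and smp_load_acquire(), and the single-threaded demo does not exercise the concurrent-teardown races the kernel side has to care about):

/*
 * Userspace model of the pgdat->memcg handoff: the task that wakes
 * kswapd publishes its memcg with an atomic exchange and drops the
 * reference to whatever was published before; kswapd loads the pointer
 * and takes its own reference.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct memcg {
	atomic_int refcount;
	const char *name;
};

static struct memcg *memcg_get(struct memcg *m)
{
	if (m)
		atomic_fetch_add(&m->refcount, 1);
	return m;
}

static void memcg_put(struct memcg *m)
{
	if (m && atomic_fetch_sub(&m->refcount, 1) == 1)
		free(m);
}

static _Atomic(struct memcg *) pgdat_memcg;

/* Waker side, as in wakeup_kswapd(): publish ours, drop the previous one. */
static void record_waker_memcg(struct memcg *mine)
{
	struct memcg *prev = atomic_exchange(&pgdat_memcg, memcg_get(mine));

	memcg_put(prev);
}

/* kswapd side, as in shrink_zone(): pin the most recently published target. */
static struct memcg *read_target_memcg(void)
{
	return memcg_get(atomic_load(&pgdat_memcg));
}

int main(void)
{
	struct memcg *m = malloc(sizeof(*m));
	struct memcg *target;

	atomic_init(&m->refcount, 1);
	m->name = "allocating-cgroup";

	record_waker_memcg(m);
	target = read_target_memcg();
	printf("kswapd reclaims from %s first\n", target->name);
	memcg_put(target);
	memcg_put(m);
	return 0;
}

The memcg_put(prev) in record_waker_memcg() drops the reference the previous waker left behind, mirroring the xchg() + mem_cgroup_put() pair in wakeup_kswapd() above.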