Currently we collect reclaim stats per LRU list and set zone flags based
on these stats. This seems wrong: LRU lists are per-memcg, so one zone
could have hundreds of them.
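For illustration, the idea is to fold the per-lruvec counters into one
per-zone total before acting on them. A minimal sketch (the struct is an
abridged view of the reclaim stats used in the diff below; the helper
name sum_reclaim_stat is made up for this example):

	/* Abridged view of the per-scan counters this patch sums per zone. */
	struct reclaim_stat {
		unsigned long nr_taken;
		unsigned long nr_dirty;
		unsigned long nr_unqueued_dirty;
		unsigned long nr_congested;
		unsigned long nr_writeback;
		unsigned long nr_immediate;
	};

	/*
	 * Hypothetical helper: fold one lruvec's scan results into the
	 * zone-wide total that the flag/throttle checks then consume.
	 */
	static void sum_reclaim_stat(struct reclaim_stat *zone_stat,
				     const struct reclaim_stat *lru_stat)
	{
		zone_stat->nr_taken          += lru_stat->nr_taken;
		zone_stat->nr_dirty          += lru_stat->nr_dirty;
		zone_stat->nr_unqueued_dirty += lru_stat->nr_unqueued_dirty;
		zone_stat->nr_congested      += lru_stat->nr_congested;
		zone_stat->nr_writeback      += lru_stat->nr_writeback;
		zone_stat->nr_immediate      += lru_stat->nr_immediate;
	}

This way a check like "all taken pages were under writeback" only fires
when it holds for the zone as a whole, not merely for one memcg's LRU.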
Move all that zone-related logic from shrink_inactive_list() to
shrink_zone(), and base the decisions on the per-zone sum of the reclaim
stats instead of the stats of a single LRU.

https://jira.sw.ru/browse/PSBM-61409

Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 mm/vmscan.c | 109 ++++++++++++++++++++++++++++++------------------------------
 1 file changed, 54 insertions(+), 55 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e6dde1e15a54..d71fa15a1750 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1628,61 +1628,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		sc->stat->nr_immediate += stat.nr_immediate;
 	}
 
-	/*
-	 * If reclaim is isolating dirty pages under writeback, it implies
-	 * that the long-lived page allocation rate is exceeding the page
-	 * laundering rate. Either the global limits are not being effective
-	 * at throttling processes due to the page distribution throughout
-	 * zones or there is heavy usage of a slow backing device. The
-	 * only option is to throttle from reclaim context which is not ideal
-	 * as there is no guarantee the dirtying process is throttled in the
-	 * same way balance_dirty_pages() manages.
-	 *
-	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
-	 * of pages under pages flagged for immediate reclaim and stall if any
-	 * are encountered in the nr_immediate check below.
-	 */
-	if (stat.nr_writeback && stat.nr_writeback == nr_taken)
-		zone_set_flag(zone, ZONE_WRITEBACK);
-
-	if (!global_reclaim(sc) && stat.nr_immediate)
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
-
-	if (sane_reclaim(sc)) {
-		/*
-		 * Tag a zone as congested if all the dirty pages scanned were
-		 * backed by a congested BDI and wait_iff_congested will stall.
-		 */
-		if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
-			zone_set_flag(zone, ZONE_CONGESTED);
-
-		/*
-		 * If dirty pages are scanned that are not queued for IO, it
-		 * implies that flushers are not keeping up. In this case, flag
-		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
-		 * pages from reclaim context.
-		 */
-		if (stat.nr_unqueued_dirty == nr_taken)
-			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
-
-		/*
-		 * If kswapd scans pages marked marked for immediate
-		 * reclaim and under writeback (nr_immediate), it implies
-		 * that pages are cycling through the LRU faster than
-		 * they are written so also forcibly stall.
-		 */
-		if (stat.nr_immediate)
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
-	}
-
-	/*
-	 * Stall direct reclaim for IO completions if underlying BDIs or zone
-	 * is congested. Allow kswapd to continue until it starts encountering
-	 * unqueued dirty pages or cycling through the LRU too quickly.
-	 */
-	if (!sc->hibernation_mode && !current_is_kswapd())
-		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
-
 	trace_mm_vmscan_lru_shrink_inactive(zone_to_nid(zone), zone_idx(zone),
 			nr_scanned, nr_reclaimed,
 			stat.nr_dirty, stat.nr_writeback,
@@ -2485,6 +2430,60 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc,
 	shrink_slab(slab_gfp, zone_to_nid(zone), NULL,
 		    sc->priority, false);
 
+	if (global_reclaim(sc)) {
+		/*
+		 * If reclaim is isolating dirty pages under writeback, it implies
+		 * that the long-lived page allocation rate is exceeding the page
+		 * laundering rate. Either the global limits are not being effective
+		 * at throttling processes due to the page distribution throughout
+		 * zones or there is heavy usage of a slow backing device. The
+		 * only option is to throttle from reclaim context which is not ideal
+		 * as there is no guarantee the dirtying process is throttled in the
+		 * same way balance_dirty_pages() manages.
+		 *
+		 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
+		 * of pages under writeback flagged for immediate reclaim and stall if any
+		 * are encountered in the nr_immediate check below.
+		 */
+		if (stat.nr_writeback && stat.nr_writeback == stat.nr_taken)
+			zone_set_flag(zone, ZONE_WRITEBACK);
+
+		/*
+		 * Tag a zone as congested if all the dirty pages scanned were
+		 * backed by a congested BDI and wait_iff_congested will stall.
+		 */
+		if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
+			zone_set_flag(zone, ZONE_CONGESTED);
+		/*
+		 * If dirty pages are scanned that are not queued for IO, it
+		 * implies that flushers are not keeping up. In this case, flag
+		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
+		 * pages from reclaim context.
+		 */
+		if (stat.nr_unqueued_dirty == stat.nr_taken)
+			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+
+		/*
+		 * If kswapd scans pages marked for immediate
+		 * reclaim and under writeback (nr_immediate), it implies
+		 * that pages are cycling through the LRU faster than
+		 * they are written so also forcibly stall.
+		 */
+		if (stat.nr_immediate)
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
+
+	if (!global_reclaim(sc) && stat.nr_immediate)
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+	/*
+	 * Stall direct reclaim for IO completions if underlying BDIs or zone
+	 * is congested. Allow kswapd to continue until it starts encountering
+	 * unqueued dirty pages or cycling through the LRU too quickly.
+	 */
+	if (!sc->hibernation_mode && !current_is_kswapd())
+		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
 	if (reclaim_state) {
 		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 		reclaim_state->reclaimed_slab = 0;
-- 
2.13.6
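P.S.: For review convenience, the zone-level policy after this move
condenses to roughly the sketch below (paraphrased from the second hunk;
zone_reclaim_throttle is a made-up name, not a drop-in replacement):

	static void zone_reclaim_throttle(struct zone *zone,
					  struct scan_control *sc,
					  const struct reclaim_stat *stat)
	{
		if (global_reclaim(sc)) {
			/* All taken pages were already under writeback:
			 * allocations outrun the laundering rate. */
			if (stat->nr_writeback && stat->nr_writeback == stat->nr_taken)
				zone_set_flag(zone, ZONE_WRITEBACK);

			/* Every dirty page scanned sat on a congested BDI. */
			if (stat->nr_dirty && stat->nr_dirty == stat->nr_congested)
				zone_set_flag(zone, ZONE_CONGESTED);

			/* Dirty pages not yet queued for IO: flushers lag. */
			if (stat->nr_unqueued_dirty == stat->nr_taken)
				zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
		}

		/* Pages flagged for immediate reclaim are cycling through the
		 * LRU faster than they are written back: stall in either
		 * reclaim mode. */
		if (stat->nr_immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* Direct reclaim stalls on congestion; kswapd presses on. */
		if (!sc->hibernation_mode && !current_is_kswapd())
			wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
	}

With the summed stats, a single small memcg LRU can no longer flip
zone-wide flags on its own: the equalities above must hold across
everything taken from the zone during this round of shrink_zone().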