Re: [PATCH 5/8] mm: vmscan: replace shrink_node() loop with a retry jump

Roman Gushchin Tue, 22 Oct 2019 12:57:21 -0700

On Tue, Oct 22, 2019 at 10:48:00AM -0400, Johannes Weiner wrote:
> Most of the function body is inside a loop, which imposes an
> additional indentation and scoping level that makes the code a bit
> hard to follow and modify.
> 
> The looping only happens in case of reclaim-compaction, which isn't
> the common case. So rather than adding yet another function level to
> the reclaim path and have every reclaim invocation go through a level
> that only exists for one specific cornercase, use a retry goto.
> 
> Signed-off-by: Johannes Weiner <[email protected]>
> ---
>  mm/vmscan.c | 231 ++++++++++++++++++++++++++--------------------------
>  1 file changed, 115 insertions(+), 116 deletions(-)
> 
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 302dad112f75..235d1fc72311 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2729,144 +2729,143 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, 
> struct mem_cgroup *memcg)
>  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
>  {
>       struct reclaim_state *reclaim_state = current->reclaim_state;
> +     struct mem_cgroup *root = sc->target_mem_cgroup;
>       unsigned long nr_reclaimed, nr_scanned;
>       bool reclaimable = false;
> +     struct mem_cgroup *memcg;
> +again:
> +     memset(&sc->nr, 0, sizeof(sc->nr));
>  
> -     do {
> -             struct mem_cgroup *root = sc->target_mem_cgroup;
> -             struct mem_cgroup *memcg;
> -
> -             memset(&sc->nr, 0, sizeof(sc->nr));
> -
> -             nr_reclaimed = sc->nr_reclaimed;
> -             nr_scanned = sc->nr_scanned;
> +     nr_reclaimed = sc->nr_reclaimed;
> +     nr_scanned = sc->nr_scanned;
>  
> -             memcg = mem_cgroup_iter(root, NULL, NULL);
> -             do {
> -                     unsigned long reclaimed;
> -                     unsigned long scanned;
> +     memcg = mem_cgroup_iter(root, NULL, NULL);
> +     do {
> +             unsigned long reclaimed;
> +             unsigned long scanned;
>  
> -                     switch (mem_cgroup_protected(root, memcg)) {
> -                     case MEMCG_PROT_MIN:
> -                             /*
> -                              * Hard protection.
> -                              * If there is no reclaimable memory, OOM.
> -                              */
> +             switch (mem_cgroup_protected(root, memcg)) {
> +             case MEMCG_PROT_MIN:
> +                     /*
> +                      * Hard protection.
> +                      * If there is no reclaimable memory, OOM.
> +                      */
> +                     continue;
> +             case MEMCG_PROT_LOW:
> +                     /*
> +                      * Soft protection.
> +                      * Respect the protection only as long as
> +                      * there is an unprotected supply
> +                      * of reclaimable memory from other cgroups.
> +                      */
> +                     if (!sc->memcg_low_reclaim) {
> +                             sc->memcg_low_skipped = 1;
>                               continue;
> -                     case MEMCG_PROT_LOW:
> -                             /*
> -                              * Soft protection.
> -                              * Respect the protection only as long as
> -                              * there is an unprotected supply
> -                              * of reclaimable memory from other cgroups.
> -                              */
> -                             if (!sc->memcg_low_reclaim) {
> -                                     sc->memcg_low_skipped = 1;
> -                                     continue;
> -                             }
> -                             memcg_memory_event(memcg, MEMCG_LOW);
> -                             break;
> -                     case MEMCG_PROT_NONE:
> -                             /*
> -                              * All protection thresholds breached. We may
> -                              * still choose to vary the scan pressure
> -                              * applied based on by how much the cgroup in
> -                              * question has exceeded its protection
> -                              * thresholds (see get_scan_count).
> -                              */
> -                             break;
>                       }
> +                     memcg_memory_event(memcg, MEMCG_LOW);
> +                     break;
> +             case MEMCG_PROT_NONE:
> +                     /*
> +                      * All protection thresholds breached. We may
> +                      * still choose to vary the scan pressure
> +                      * applied based on by how much the cgroup in
> +                      * question has exceeded its protection
> +                      * thresholds (see get_scan_count).
> +                      */
> +                     break;
> +             }
>  
> -                     reclaimed = sc->nr_reclaimed;
> -                     scanned = sc->nr_scanned;
> -                     shrink_node_memcg(pgdat, memcg, sc);
> -
> -                     shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> -                                     sc->priority);
> -
> -                     /* Record the group's reclaim efficiency */
> -                     vmpressure(sc->gfp_mask, memcg, false,
> -                                sc->nr_scanned - scanned,
> -                                sc->nr_reclaimed - reclaimed);
> -
> -             } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
> +             reclaimed = sc->nr_reclaimed;
> +             scanned = sc->nr_scanned;
> +             shrink_node_memcg(pgdat, memcg, sc);
>  
> -             if (reclaim_state) {
> -                     sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> -                     reclaim_state->reclaimed_slab = 0;
> -             }
> +             shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
> +                         sc->priority);
>  
> -             /* Record the subtree's reclaim efficiency */
> -             vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> -                        sc->nr_scanned - nr_scanned,
> -                        sc->nr_reclaimed - nr_reclaimed);
> +             /* Record the group's reclaim efficiency */
> +             vmpressure(sc->gfp_mask, memcg, false,
> +                        sc->nr_scanned - scanned,
> +                        sc->nr_reclaimed - reclaimed);


It doesn't look as a trivial change. I'd add some comments to the commit message
why it's safe to do.

Thanks!

>  
> -             if (sc->nr_reclaimed - nr_reclaimed)
> -                     reclaimable = true;
> +     } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
>  
> -             if (current_is_kswapd()) {
> -                     /*
> -                      * If reclaim is isolating dirty pages under writeback,
> -                      * it implies that the long-lived page allocation rate
> -                      * is exceeding the page laundering rate. Either the
> -                      * global limits are not being effective at throttling
> -                      * processes due to the page distribution throughout
> -                      * zones or there is heavy usage of a slow backing
> -                      * device. The only option is to throttle from reclaim
> -                      * context which is not ideal as there is no guarantee
> -                      * the dirtying process is throttled in the same way
> -                      * balance_dirty_pages() manages.
> -                      *
> -                      * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> -                      * count the number of pages under pages flagged for
> -                      * immediate reclaim and stall if any are encountered
> -                      * in the nr_immediate check below.
> -                      */
> -                     if (sc->nr.writeback && sc->nr.writeback == 
> sc->nr.taken)
> -                             set_bit(PGDAT_WRITEBACK, &pgdat->flags);
> +     if (reclaim_state) {
> +             sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> +             reclaim_state->reclaimed_slab = 0;
> +     }
>  
> -                     /*
> -                      * Tag a node as congested if all the dirty pages
> -                      * scanned were backed by a congested BDI and
> -                      * wait_iff_congested will stall.
> -                      */
> -                     if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -                             set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +     /* Record the subtree's reclaim efficiency */
> +     vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
> +                sc->nr_scanned - nr_scanned,
> +                sc->nr_reclaimed - nr_reclaimed);
>  
> -                     /* Allow kswapd to start writing pages during reclaim.*/
> -                     if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> -                             set_bit(PGDAT_DIRTY, &pgdat->flags);
> +     if (sc->nr_reclaimed - nr_reclaimed)
> +             reclaimable = true;
>  
> -                     /*
> -                      * If kswapd scans pages marked marked for immediate
> -                      * reclaim and under writeback (nr_immediate), it
> -                      * implies that pages are cycling through the LRU
> -                      * faster than they are written so also forcibly stall.
> -                      */
> -                     if (sc->nr.immediate)
> -                             congestion_wait(BLK_RW_ASYNC, HZ/10);
> -             }

Don't you want to separate the block below into a separate function?
It can probably make the big loop shorter and easier to follow.

Thanks!

> +     if (current_is_kswapd()) {
> +             /*
> +              * If reclaim is isolating dirty pages under writeback,
> +              * it implies that the long-lived page allocation rate
> +              * is exceeding the page laundering rate. Either the
> +              * global limits are not being effective at throttling
> +              * processes due to the page distribution throughout
> +              * zones or there is heavy usage of a slow backing
> +              * device. The only option is to throttle from reclaim
> +              * context which is not ideal as there is no guarantee
> +              * the dirtying process is throttled in the same way
> +              * balance_dirty_pages() manages.
> +              *
> +              * Once a node is flagged PGDAT_WRITEBACK, kswapd will
> +              * count the number of pages under pages flagged for
> +              * immediate reclaim and stall if any are encountered
> +              * in the nr_immediate check below.
> +              */
> +             if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
> +                     set_bit(PGDAT_WRITEBACK, &pgdat->flags);
>  
>               /*
> -              * Legacy memcg will stall in page writeback so avoid forcibly
> -              * stalling in wait_iff_congested().
> +              * Tag a node as congested if all the dirty pages
> +              * scanned were backed by a congested BDI and
> +              * wait_iff_congested will stall.
>                */
> -             if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> -                 sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> -                     set_memcg_congestion(pgdat, root, true);
> +             if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +                     set_bit(PGDAT_CONGESTED, &pgdat->flags);
> +
> +             /* Allow kswapd to start writing pages during reclaim.*/
> +             if (sc->nr.unqueued_dirty == sc->nr.file_taken)
> +                     set_bit(PGDAT_DIRTY, &pgdat->flags);
>  
>               /*
> -              * Stall direct reclaim for IO completions if underlying BDIs
> -              * and node is congested. Allow kswapd to continue until it
> -              * starts encountering unqueued dirty pages or cycling through
> -              * the LRU too quickly.
> +              * If kswapd scans pages marked marked for immediate
> +              * reclaim and under writeback (nr_immediate), it
> +              * implies that pages are cycling through the LRU
> +              * faster than they are written so also forcibly stall.
>                */
> -             if (!sc->hibernation_mode && !current_is_kswapd() &&
> -                current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> -                     wait_iff_congested(BLK_RW_ASYNC, HZ/10);
> +             if (sc->nr.immediate)
> +                     congestion_wait(BLK_RW_ASYNC, HZ/10);
> +     }
> +
> +     /*
> +      * Legacy memcg will stall in page writeback so avoid forcibly
> +      * stalling in wait_iff_congested().
> +      */
> +     if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) &&
> +         sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
> +             set_memcg_congestion(pgdat, root, true);
> +
> +     /*
> +      * Stall direct reclaim for IO completions if underlying BDIs
> +      * and node is congested. Allow kswapd to continue until it
> +      * starts encountering unqueued dirty pages or cycling through
> +      * the LRU too quickly.
> +      */
> +     if (!sc->hibernation_mode && !current_is_kswapd() &&
> +         current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> +             wait_iff_congested(BLK_RW_ASYNC, HZ/10);
>  
> -     } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> -                                      sc));
> +     if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
> +                                 sc))
> +             goto again;
>  
>       /*
>        * Kswapd gives up on balancing particular nodes after too
> -- 
> 2.23.0
> 
>

Re: [PATCH 5/8] mm: vmscan: replace shrink_node() loop with a retry jump

Reply via email to