As reclaim is now per-node based, convert zone_reclaim to be node_reclaim
and avoid reclaiming a node multiple times just because it has multiple
populated zones. The documentation and interface to userspace are
unchanged from a configuration perspective, and behaviour will be similar
unless node-local allocation requests were also limited to lower zones.

Signed-off-by: Mel Gorman <mgor...@suse.de>
---
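Note for reviewers: the dedup in the zonelist walk works by remembering
the last pgdat on which node_reclaim() was attempted and skipping straight
to the watermark handling for later zones on the same node. A standalone
sketch of that logic, for illustration only (userspace C; the types, zone
names and loop are invented stand-ins, not the kernel's):

#include <stdio.h>

struct pgdat { int node_id; };
struct zone  { struct pgdat *zone_pgdat; const char *name; };

/* stand-in for the kernel's node_reclaim(); just records the attempt */
static void node_reclaim(struct pgdat *pgdat)
{
	printf("node_reclaim() on node %d\n", pgdat->node_id);
}

int main(void)
{
	struct pgdat node0 = { .node_id = 0 };
	/* one node with two populated zones, as in the changelog */
	struct zone zonelist[] = {
		{ .zone_pgdat = &node0, .name = "DMA32"  },
		{ .zone_pgdat = &node0, .name = "Normal" },
	};
	struct pgdat *last_pgdat = NULL;

	for (int i = 0; i < 2; i++) {
		struct zone *zone = &zonelist[i];

		/* skip if we have already attempted node_reclaim */
		if (last_pgdat == zone->zone_pgdat)
			continue;
		node_reclaim(zone->zone_pgdat);
		last_pgdat = zone->zone_pgdat;
	}
	return 0;
}

Only one node_reclaim() call is made for the two zones above.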
 include/linux/mmzone.h   | 18 +++++------
 include/linux/swap.h     |  9 +++---
 include/linux/topology.h |  2 +-
 kernel/sysctl.c          |  4 +--
 mm/huge_memory.c         |  4 +--
 mm/internal.h            |  8 ++---
 mm/page_alloc.c          | 35 +++++++++++++++-------
 mm/vmscan.c              | 77 ++++++++++++++++++++++++------------------------
 8 files changed, 85 insertions(+), 72 deletions(-)
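
Note: min_unmapped_pages and min_slab_pages are now per-pgdat sums of the
old per-zone values, which is why the sysctl handlers zero each pgdat
counter before re-walking the zones to accumulate it. A toy illustration
of the accumulation (userspace C; the zone sizes are made up):

#include <stdio.h>

int main(void)
{
	/* hypothetical managed page counts for two zones on one node */
	unsigned long managed[] = { 262144, 786432 };	/* e.g. DMA32, Normal */
	unsigned long min_unmapped_ratio = 1;		/* percent, as the sysctl */
	unsigned long min_unmapped_pages = 0;		/* per-pgdat total */

	/* mirrors the handler: zero once, then add each zone's share */
	for (int i = 0; i < 2; i++)
		min_unmapped_pages += managed[i] * min_unmapped_ratio / 100;

	printf("node min_unmapped_pages: %lu\n", min_unmapped_pages);
	return 0;
}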

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c551f70951fa..84fcb7aafb2b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -362,14 +362,6 @@ struct zone {
        unsigned long           *pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
 
-#ifdef CONFIG_NUMA
-       /*
-        * zone reclaim becomes active if more unmapped pages exist.
-        */
-       unsigned long           min_unmapped_pages;
-       unsigned long           min_slab_pages;
-#endif /* CONFIG_NUMA */
-
        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long           zone_start_pfn;
 
@@ -518,7 +510,6 @@ struct zone {
 } ____cacheline_internodealigned_in_smp;
 
 enum zone_flags {
-       ZONE_RECLAIM_LOCKED,            /* prevents concurrent reclaim */
        ZONE_OOM_LOCKED,                /* zone is in OOM killer zonelist */
 };
 
@@ -533,6 +524,7 @@ enum pgdat_flags {
        PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
+       PGDAT_RECLAIM_LOCKED,           /* prevents concurrent reclaim */
 };
 
 static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -758,6 +750,14 @@ typedef struct pglist_data {
         */
        unsigned long           dirty_balance_reserve;
 
+#ifdef CONFIG_NUMA
+       /*
+        * zone reclaim becomes active if more unmapped pages exist.
+        */
+       unsigned long           min_unmapped_pages;
+       unsigned long           min_slab_pages;
+#endif /* CONFIG_NUMA */
+
        /* Write-intensive fields used from the page allocator */
        ZONE_PADDING(_pad1_)
        spinlock_t              lru_lock;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bb9597213e39..59d70fd04ec8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -333,13 +333,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
 #ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
 extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
 #else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+                               unsigned int order)
 {
        return 0;
 }
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 909b6e43b694..55a9b2bbb4de 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -58,7 +58,7 @@ int arch_update_cpu_topology(void);
 /*
  * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
  * (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
  * on nodes within this distance.
  */
 #define RECLAIM_DISTANCE 30
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ce410bb9f2e1..f80921283f06 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1394,8 +1394,8 @@ static struct ctl_table vm_table[] = {
 #ifdef CONFIG_NUMA
        {
                .procname       = "zone_reclaim_mode",
-               .data           = &zone_reclaim_mode,
-               .maxlen         = sizeof(zone_reclaim_mode),
+               .data           = &node_reclaim_mode,
+               .maxlen         = sizeof(node_reclaim_mode),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
                .extra1         = &zero,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b56c14a41d96..a5c4e36f200c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2249,10 +2249,10 @@ static bool khugepaged_scan_abort(int nid)
        int i;
 
        /*
-        * If zone_reclaim_mode is disabled, then no extra effort is made to
+        * If node_reclaim_mode is disabled, then no extra effort is made to
         * allocate memory locally.
         */
-       if (!zone_reclaim_mode)
+       if (!node_reclaim_mode)
                return false;
 
        /* If there is a count for this node already, it must be acceptable */
diff --git a/mm/internal.h b/mm/internal.h
index a24c4a50c33f..a0b0d20ead97 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -395,10 +395,10 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
-#define ZONE_RECLAIM_NOSCAN    -2
-#define ZONE_RECLAIM_FULL      -1
-#define ZONE_RECLAIM_SOME      0
-#define ZONE_RECLAIM_SUCCESS   1
+#define NODE_RECLAIM_NOSCAN    -2
+#define NODE_RECLAIM_FULL      -1
+#define NODE_RECLAIM_SOME      0
+#define NODE_RECLAIM_SUCCESS   1
 
 extern int hwpoison_filter(struct page *p);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 637b293cd5d1..47e6332d7566 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2064,7 +2064,6 @@ zonelist_scan:
                                        !node_dirty_ok(zone->zone_pgdat)) {
                        continue;
                }
-               last_pgdat = zone->zone_pgdat;
 
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
                if (!zone_watermark_ok(zone, order, mark,
@@ -2076,7 +2075,7 @@ zonelist_scan:
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
 
-                       if (zone_reclaim_mode == 0 ||
+                       if (node_reclaim_mode == 0 ||
                            !zone_allows_reclaim(ac->preferred_zone, zone))
                                goto this_zone_full;
 
@@ -2094,18 +2093,22 @@ zonelist_scan:
 
                        /*
                         * As we may have just activated ZLC, check if the first
-                        * eligible zone has failed zone_reclaim recently.
+                        * eligible zone has failed node_reclaim recently.
                         */
                        if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
                                !zlc_zone_worth_trying(zonelist, z, allowednodes))
                                continue;
 
-                       ret = zone_reclaim(zone, gfp_mask, order);
+                       /* Skip if we have already attempted node_reclaim */
+                       if (last_pgdat == zone->zone_pgdat)
+                               goto try_this_zone;
+
+                       ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
                        switch (ret) {
-                       case ZONE_RECLAIM_NOSCAN:
+                       case NODE_RECLAIM_NOSCAN:
                                /* did not scan */
                                continue;
-                       case ZONE_RECLAIM_FULL:
+                       case NODE_RECLAIM_FULL:
                                /* scanned but unreclaimable */
                                continue;
                        default:
@@ -2124,7 +2127,7 @@ zonelist_scan:
                                 * min watermarks.
                                 */
                                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
-                                   ret == ZONE_RECLAIM_SOME)
+                                   ret == NODE_RECLAIM_SOME)
                                        goto this_zone_full;
 
                                continue;
@@ -2132,6 +2135,7 @@ zonelist_scan:
                }
 
 try_this_zone:
+               last_pgdat = zone->zone_pgdat;
                page = buffered_rmqueue(ac->preferred_zone, zone, order,
                                                gfp_mask, ac->migratetype);
                if (page) {
@@ -2140,6 +2144,7 @@ try_this_zone:
                        return page;
                }
 this_zone_full:
+               last_pgdat = zone->zone_pgdat;
                if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
                        zlc_mark_zone_full(zonelist, z);
        }
@@ -4879,9 +4884,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
                zone->node = nid;
-               zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+               pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
                                                / 100;
-               zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+               pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
 #endif
                zone->name = zone_names[j];
                zone->zone_pgdat = pgdat;
@@ -5839,6 +5844,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int rc;
 
@@ -5846,8 +5852,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
        if (rc)
                return rc;
 
+       for_each_online_pgdat(pgdat)
+               pgdat->min_unmapped_pages = 0;
+
        for_each_zone(zone)
-               zone->min_unmapped_pages = (zone->managed_pages *
+               zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
                                sysctl_min_unmapped_ratio) / 100;
        return 0;
 }
@@ -5855,6 +5864,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
+       struct pglist_data *pgdat;
        struct zone *zone;
        int rc;
 
@@ -5862,8 +5872,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
        if (rc)
                return rc;
 
+       for_each_online_pgdat(pgdat)
+               pgdat->min_slab_pages = 0;
+
        for_each_zone(zone)
-               zone->min_slab_pages = (zone->managed_pages *
+               zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
                                sysctl_min_slab_ratio) / 100;
        return 0;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3cb0cc70ddbd..cf9ae51c9a5c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3529,12 +3529,12 @@ module_init(kswapd_init)
 
 #ifdef CONFIG_NUMA
 /*
- * Zone reclaim mode
+ * Node reclaim mode
  *
- * If non-zero call zone_reclaim when the number of free pages falls below
+ * If non-zero call node_reclaim when the number of free pages falls below
  * the watermarks.
  */
-int zone_reclaim_mode __read_mostly;
+int node_reclaim_mode __read_mostly;
 
 #define RECLAIM_OFF 0
 #define RECLAIM_ZONE (1<<0)    /* Run shrink_inactive_list on the zone */
@@ -3542,14 +3542,14 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_SWAP (1<<2)    /* Swap pages out during reclaim */
 
 /*
- * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * Priority for NODE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
  * a zone.
  */
-#define ZONE_RECLAIM_PRIORITY 4
+#define NODE_RECLAIM_PRIORITY 4
 
 /*
- * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * Percentage of pages in a zone that must be unmapped for node_reclaim to
  * occur.
  */
 int sysctl_min_unmapped_ratio = 1;
@@ -3575,9 +3575,9 @@ static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
 }
 
 /* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static long zone_pagecache_reclaimable(struct zone *zone)
+static long node_pagecache_reclaimable(struct pglist_data *pgdat)
 {
-       long nr_pagecache_reclaimable;
+       long nr_pagecache_reclaimable = 0;
        long delta = 0;
 
        /*
@@ -3586,14 +3586,14 @@ static long zone_pagecache_reclaimable(struct zone *zone)
         * pages like swapcache and node_unmapped_file_pages() provides
         * a better estimate
         */
-       if (zone_reclaim_mode & RECLAIM_SWAP)
-               nr_pagecache_reclaimable = node_page_state(zone->zone_pgdat, NR_FILE_PAGES);
+       if (node_reclaim_mode & RECLAIM_SWAP)
+               nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
        else
-               nr_pagecache_reclaimable = node_unmapped_file_pages(zone->zone_pgdat);
+               nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
 
        /* If we can't clean pages, remove dirty pages from consideration */
-       if (!(zone_reclaim_mode & RECLAIM_WRITE))
-               delta += node_page_state(zone->zone_pgdat, NR_FILE_DIRTY);
+       if (!(node_reclaim_mode & RECLAIM_WRITE))
+               delta += node_page_state(pgdat, NR_FILE_DIRTY);
 
        /* Watch for any possible underflows due to delta */
        if (unlikely(delta > nr_pagecache_reclaimable))
@@ -3603,21 +3603,22 @@ static long zone_pagecache_reclaimable(struct zone *zone)
 }
 
 /*
- * Try to free up some pages from this zone through reclaim.
+ * Try to free up some pages from this node through reclaim.
  */
-static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 {
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
+       int classzone_idx = gfp_zone(gfp_mask);
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                .order = order,
-               .priority = ZONE_RECLAIM_PRIORITY,
-               .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-               .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+               .priority = NODE_RECLAIM_PRIORITY,
+               .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
+               .may_unmap = !!(node_reclaim_mode & RECLAIM_SWAP),
                .may_swap = 1,
        };
 
@@ -3632,13 +3633,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
-       if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+       if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                /*
                 * Free memory by calling shrink zone with increasing
                 * priorities until we have enough memory freed.
                 */
                do {
-                       shrink_node(zone->zone_pgdat, &sc, zone_idx(zone), zone_idx(zone));
+                       shrink_node(pgdat, &sc, classzone_idx, classzone_idx);
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }
 
@@ -3648,49 +3649,47 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        return sc.nr_reclaimed >= nr_pages;
 }
 
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 {
-       int node_id;
        int ret;
 
        /*
-        * Zone reclaim reclaims unmapped file backed pages and
+        * Node reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
-        * thrown out if the zone is overallocated. So we do not reclaim
-        * if less than a specified percentage of the zone is used by
+        * thrown out if the node is overallocated. So we do not reclaim
+        * if less than a specified percentage of the node is used by
         * unmapped file backed pages.
         */
-       if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
-           zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
-               return ZONE_RECLAIM_FULL;
+       if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
+           sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+               return NODE_RECLAIM_FULL;
 
-       if (!pgdat_reclaimable(zone->zone_pgdat))
-               return ZONE_RECLAIM_FULL;
+       if (!pgdat_reclaimable(pgdat))
+               return NODE_RECLAIM_FULL;
 
        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
-               return ZONE_RECLAIM_NOSCAN;
+               return NODE_RECLAIM_NOSCAN;
 
        /*
-        * Only run zone reclaim on the local zone or on zones that do not
+        * Only run node reclaim on the local node or on nodes that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
-       node_id = zone_to_nid(zone);
-       if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-               return ZONE_RECLAIM_NOSCAN;
+       if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
+               return NODE_RECLAIM_NOSCAN;
 
-       if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
-               return ZONE_RECLAIM_NOSCAN;
+       if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
+               return NODE_RECLAIM_NOSCAN;
 
-       ret = __zone_reclaim(zone, gfp_mask, order);
-       clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
+       ret = __node_reclaim(pgdat, gfp_mask, order);
+       clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
 
        if (!ret)
                count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
-- 
2.3.5
