Demote pages from a memory cgroup that has excess top tier memory usage
when top tier memory is tight.

When free top tier memory in a node falls below the fraction
"toptier_scale_factor/10000" of the node's overall top tier memory, kswapd
reclaims top tier memory from those mem cgroups that have exceeded their
top tier memory soft limit by demoting their top tier pages to a lower
memory tier.
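
For example (illustrative numbers only), on a node whose top tier zone
manages 4,194,304 pages (16 GiB with 4 KiB pages), the default
toptier_scale_factor of 2000 puts the new WMARK_TOPTIER watermark at
roughly 838,860 free pages (about 3.2 GiB), clamped to no less than twice
the high watermark; once free pages in that zone drop below the mark,
kswapd starts demoting from cgroups that are over their soft limit.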

Signed-off-by: Tim Chen <tim.c.c...@linux.intel.com>
---
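Note for reviewers (not part of the patch): a minimal userspace sketch of
the threshold arithmetic described above, using made-up numbers. The real
computation lives in __setup_per_zone_wmarks() via mult_frac() and is
additionally clamped between twice the high watermark and the zone's
managed pages.

#include <stdio.h>

int main(void)
{
	/* All values below are hypothetical, for illustration only. */
	unsigned long managed = 16UL << 18;	/* 16 GiB of top tier memory, in 4 KiB pages */
	unsigned long factor = 2000;		/* default toptier_scale_factor */
	unsigned long free_pages = 3UL << 18;	/* 3 GiB currently free */

	/* Same arithmetic as mult_frac(managed, factor, 10000) in the patch */
	unsigned long wmark = managed / 10000 * factor +
			      managed % 10000 * factor / 10000;

	printf("toptier watermark: %lu pages\n", wmark);
	if (free_pages < wmark)
		printf("below watermark: kswapd would demote from over-limit cgroups\n");
	return 0;
}

Like the other watermark factors, the knob is exposed as
/proc/sys/vm/toptier_scale_factor per the vm.rst change below.
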
 Documentation/admin-guide/sysctl/vm.rst | 12 +++++
 include/linux/mmzone.h                  |  2 +
 mm/page_alloc.c                         | 14 +++++
 mm/vmscan.c                             | 69 ++++++++++++++++++++++++-
 4 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 9de3847c3469..6b49e2e90953 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -74,6 +74,7 @@ Currently, these files are in /proc/sys/vm:
 - vfs_cache_pressure
 - watermark_boost_factor
 - watermark_scale_factor
+- toptier_scale_factor
 - zone_reclaim_mode
 
 
@@ -962,6 +963,17 @@ too small for the allocation bursts occurring in the system. This knob
 can then be used to tune kswapd aggressiveness accordingly.
 
 
+toptier_scale_factor
+====================
+
+This factor controls when kswapd wakes up to demote pages of those
+cgroups that have exceeded their memory soft limit.
+
+The unit is in fractions of 10,000. The default value of 2000 means
+that if less than 20% of the top tier memory in a node is free,
+kswapd will start to demote pages of those memory cgroups that have
+exceeded their memory soft limit.
+
 zone_reclaim_mode
 =================
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index bbe649c4fdee..4ee0073d255f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -332,12 +332,14 @@ enum zone_watermarks {
        WMARK_MIN,
        WMARK_LOW,
        WMARK_HIGH,
+       WMARK_TOPTIER,
        NR_WMARK
 };
 
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define toptier_wmark_pages(z) (z->_watermark[WMARK_TOPTIER] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
 struct per_cpu_pages {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 471a2c342c4f..20f3caee60f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7964,6 +7964,20 @@ static void __setup_per_zone_wmarks(void)
                zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
                zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
 
+               tmp = mult_frac(zone_managed_pages(zone),
+                               toptier_scale_factor, 10000);
+               /*
+                * Clamp toptier watermark between twice high watermark
+                * and max managed pages.
+                */
+               if (tmp < 2 * zone->_watermark[WMARK_HIGH])
+                       tmp = 2 * zone->_watermark[WMARK_HIGH];
+               if (tmp > zone_managed_pages(zone))
+                       tmp = zone_managed_pages(zone);
+               zone->_watermark[WMARK_TOPTIER] = tmp;
+
+               zone->watermark_boost = 0;
+
                spin_unlock_irqrestore(&zone->lock, flags);
        }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 11bb0c6fa524..270880c8baef 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,6 +185,7 @@ static void set_task_reclaim_state(struct task_struct *task,
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
+int toptier_scale_factor = 2000;
 
 #ifdef CONFIG_MEMCG
 /*
@@ -3624,6 +3625,34 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
        return false;
 }
 
+static bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+{
+       int i;
+       unsigned long mark;
+       struct zone *zone;
+
+       zone = pgdat->node_zones + ZONE_NORMAL;
+
+       if (!node_state(pgdat->node_id, N_TOPTIER) ||
+           next_demotion_node(pgdat->node_id) == -1 ||
+           order > 0 || classzone_idx < ZONE_NORMAL) {
+               return true;
+       }
+
+       zone = pgdat->node_zones + ZONE_NORMAL;
+
+       if (!managed_zone(zone))
+               return true;
+
+       mark = min(toptier_wmark_pages(zone),
+                  zone_managed_pages(zone));
+
+       if (zone_page_state(zone, NR_FREE_PAGES) < mark)
+               return false;
+
+       return true;
+}
+
 /* Clear pgdat state for congested, dirty or under writeback. */
 static void clear_pgdat_congested(pg_data_t *pgdat)
 {
@@ -4049,6 +4078,39 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
        finish_wait(&pgdat->kswapd_wait, &wait);
 }
 
+static bool toptier_soft_reclaim(pg_data_t *pgdat,
+                             unsigned int reclaim_order,
+                             unsigned int classzone_idx)
+{
+       unsigned long nr_soft_scanned, nr_soft_reclaimed;
+       int ret;
+       struct scan_control sc = {
+               .gfp_mask = GFP_KERNEL,
+               .order = reclaim_order,
+               .may_unmap = 1,
+       };
+
+       if (!node_state(pgdat->node_id, N_TOPTIER) || kthread_should_stop())
+               return false;
+
+       set_task_reclaim_state(current, &sc.reclaim_state);
+
+       if (!pgdat_toptier_balanced(pgdat, 0, classzone_idx)) {
+               nr_soft_scanned = 0;
+               nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat,
+                                       0, GFP_KERNEL,
+                                       &nr_soft_scanned, N_TOPTIER);
+       }
+
+       set_task_reclaim_state(current, NULL);
+
+       if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx) &&
+          !kthread_should_stop())
+               return true;
+       else
+               return false;
+}
+
 /*
  * The background pageout daemon, started as a kernel thread
  * from the init process.
@@ -4108,6 +4170,10 @@ static int kswapd(void *p)
                WRITE_ONCE(pgdat->kswapd_order, 0);
                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
 
+               if (toptier_soft_reclaim(pgdat, 0,
+                                       highest_zoneidx))
+                       goto kswapd_try_sleep;
+
                ret = try_to_freeze();
                if (kthread_should_stop())
                        break;
@@ -4173,7 +4239,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
 
        /* Hopeless node, leave it to direct reclaim if possible */
        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
-           (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+           (pgdat_toptier_balanced(pgdat, 0, highest_zoneidx) &&
+            pgdat_balanced(pgdat, order, highest_zoneidx) &&
             !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
                /*
                 * There may be plenty of free memory available, but it's too
-- 
2.20.1
