This is proof-of-concept code for balancing node zone occupancy, where
the imbalance may be caused by memory hotplug.
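
A possible way to exercise this interface (the sysctl is registered in
vm_table, so it appears under /proc/sys/vm; the page count below is
illustrative only):

  echo 16384 > /proc/sys/vm/balance_node_occupancy_pages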

Signed-off-by: Charan Teja Reddy <chara...@codeaurora.org>
---
 include/linux/migrate.h |   8 +-
 include/linux/mm.h      |   3 +
 include/linux/mmzone.h  |   2 +
 kernel/sysctl.c         |  11 ++
 mm/compaction.c         |   4 +-
 mm/memory_hotplug.c     | 265 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 290 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838..b7dc259 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -53,6 +53,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count);
+extern void split_map_pages(struct list_head *list);
+extern unsigned long release_freepages(struct list_head *freelist);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -81,7 +83,11 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 {
        return -ENOSYS;
 }
-
+static inline void split_map_pages(struct list_head *list) { }
+static inline unsigned long release_freepages(struct list_head *freelist)
+{
+       return 0;
+}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8..1014139 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2465,6 +2465,9 @@ extern int watermark_boost_factor;
 extern int watermark_scale_factor;
 extern bool arch_has_descending_max_zone_pfns(void);
 
+/* memory_hotplug.c */
+extern int balance_node_occupancy_pages;
+
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316..ce417c3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -977,6 +977,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
 int numa_zonelist_order_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
+extern int sysctl_balance_node_occupancy_handler(struct ctl_table *tbl,
+               int write, void *buf, size_t *len, loff_t *pos);
 extern int percpu_pagelist_fraction;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN        16
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c9fbdd8..4b95a90 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3140,6 +3140,17 @@ static struct ctl_table vm_table[] = {
                .extra2         = SYSCTL_ONE,
        },
 #endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+       {
+               .procname       = "balance_node_occupancy_pages",
+               .data           = &balance_node_occupancy_pages,
+               .maxlen         = sizeof(balance_node_occupancy_pages),
+               .mode           = 0200,
+               .proc_handler   = sysctl_balance_node_occupancy_handler,
+               .extra1         = SYSCTL_ZERO,
+       },
+#endif
        { }
 };
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 190ccda..da3c015 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -68,7 +68,7 @@ static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
 #define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
 #endif
 
-static unsigned long release_freepages(struct list_head *freelist)
+unsigned long release_freepages(struct list_head *freelist)
 {
        struct page *page, *next;
        unsigned long high_pfn = 0;
@@ -84,7 +84,7 @@ static unsigned long release_freepages(struct list_head *freelist)
        return high_pfn;
 }
 
-static void split_map_pages(struct list_head *list)
+void split_map_pages(struct list_head *list)
 {
        unsigned int i, order, nr_pages;
        struct page *page, *next;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9d57b9..2780c91 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -97,6 +97,271 @@ void mem_hotplug_done(void)
 
 u64 max_mem_size = U64_MAX;
 
+int balance_node_occupancy_pages;
+static atomic_t target_migrate_pages = ATOMIC_INIT(0);
+
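+/*
+ * Control/state for refilling ZONE_MOVABLE: free pages isolated from the
+ * movable zone serve as migration targets for anon pages pulled out of
+ * ZONE_NORMAL.
+ */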
+struct movable_zone_fill_control {
+       struct list_head freepages;
+       unsigned long start_pfn;
+       unsigned long end_pfn;
+       unsigned long nr_migrate_pages;
+       unsigned long nr_free_pages;
+       unsigned long limit;
+       int target;
+       struct zone *zone;
+};
+
+static void fill_movable_zone_fn(struct work_struct *work);
+static DECLARE_WORK(fill_movable_zone_work, fill_movable_zone_fn);
+static DEFINE_MUTEX(page_migrate_lock);
+
+static inline void reset_page_order(struct page *page)
+{
+       __ClearPageBuddy(page);
+       set_page_private(page, 0);
+}
+
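+/* Detach a page from the buddy free list; caller must hold zone->lock. */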
+static int isolate_free_page(struct page *page, unsigned int order)
+{
+       struct zone *zone;
+
+       zone = page_zone(page);
+       list_del(&page->lru);
+       zone->free_area[order].nr_free--;
+       reset_page_order(page);
+
+       return 1UL << order;
+}
+
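+/*
+ * Scan [fc->start_pfn, fc->end_pfn) and isolate free buddy pages as
+ * migration targets, dropping zone->lock once SWAP_CLUSTER_MAX pages
+ * have been collected.
+ */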
+static void isolate_free_pages(struct movable_zone_fill_control *fc)
+{
+       struct page *page;
+       unsigned long flags;
+       unsigned int order;
+       unsigned long start_pfn = fc->start_pfn;
+       unsigned long end_pfn = fc->end_pfn;
+
+       spin_lock_irqsave(&fc->zone->lock, flags);
+       for (; start_pfn < end_pfn; start_pfn++) {
+               unsigned long isolated;
+
+               if (!pfn_valid(start_pfn))
+                       continue;
+
+               page = pfn_to_page(start_pfn);
+               if (!page)
+                       continue;
+
+               if (PageCompound(page)) {
+                       struct page *head = compound_head(page);
+                       int skip;
+
+                       skip = (1 << compound_order(head)) - (page - head);
+                       start_pfn += skip - 1;
+                       continue;
+               }
+
+               if (!PageBuddy(page))
+                       continue;
+
+               order = page_private(page);
+               isolated = isolate_free_page(page, order);
+               set_page_private(page, order);
+               list_add_tail(&page->lru, &fc->freepages);
+               fc->nr_free_pages += isolated;
+               __mod_zone_page_state(fc->zone, NR_FREE_PAGES, -isolated);
+               start_pfn += isolated - 1;
+
+               /*
+                * Make sure that the zone->lock is not held for long by
+                * returning once we have SWAP_CLUSTER_MAX pages in the
+                * free list for migration.
+                */
+               if (fc->nr_free_pages >= SWAP_CLUSTER_MAX)
+                       break;
+       }
+       fc->start_pfn = start_pfn + 1;
+       spin_unlock_irqrestore(&fc->zone->lock, flags);
+
+       split_map_pages(&fc->freepages);
+}
+
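+/* migrate_pages() allocation callback: hand out one isolated free page. */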
+static struct page *movable_page_alloc(struct page *page, unsigned long data)
+{
+       struct movable_zone_fill_control *fc;
+       struct page *freepage;
+
+       fc = (struct movable_zone_fill_control *)data;
+       if (list_empty(&fc->freepages)) {
+               isolate_free_pages(fc);
+               if (list_empty(&fc->freepages))
+                       return NULL;
+       }
+
+       freepage = list_entry(fc->freepages.next, struct page, lru);
+       list_del(&freepage->lru);
+       fc->nr_free_pages--;
+
+       return freepage;
+}
+
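+/* migrate_pages() free callback: return an unused target page to the pool. */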
+static void movable_page_free(struct page *page, unsigned long data)
+{
+       struct movable_zone_fill_control *fc;
+
+       fc = (struct movable_zone_fill_control *)data;
+       list_add(&page->lru, &fc->freepages);
+       fc->nr_free_pages++;
+}
+
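+/*
+ * Isolate up to min(fc->target, pageblock_nr_pages) anonymous LRU pages
+ * from [start_pfn, end_pfn) onto @list; returns the next pfn to scan.
+ */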
+static unsigned long get_anon_movable_pages(
+                       struct movable_zone_fill_control *fc,
+                       unsigned long start_pfn,
+                       unsigned long end_pfn, struct list_head *list)
+{
+       int found = 0, ret;
+       unsigned long pfn;
+       int limit = min_t(int, fc->target, (int)pageblock_nr_pages);
+
+       fc->nr_migrate_pages = 0;
+       for (pfn = start_pfn; pfn < end_pfn && found < limit; ++pfn) {
+               struct page *page;
+
+               if (!pfn_valid(pfn))
+                       continue;
+
+               page = pfn_to_page(pfn);
+
+               if (PageCompound(page)) {
+                       struct page *head = compound_head(page);
+                       int skip;
+
+                       skip = (1 << compound_order(head)) - (page - head);
+                       pfn += skip - 1;
+                       continue;
+               }
+
+               if (PageBuddy(page)) {
+                       unsigned long freepage_order;
+
+                       freepage_order = READ_ONCE(page_private(page));
+                       if (freepage_order > 0 && freepage_order < MAX_ORDER)
+                               pfn += (1 << freepage_order) - 1;
+                       continue;
+               }
+
+               if (!PageLRU(page) || !PageAnon(page))
+                       continue;
+
+               if (!get_page_unless_zero(page))
+                       continue;
+
+               found++;
+               ret = isolate_lru_page(page);
+               if (!ret) {
+                       list_add_tail(&page->lru, list);
+                       inc_node_page_state(page, NR_ISOLATED_ANON +
+                                       page_is_file_lru(page));
+                       ++fc->nr_migrate_pages;
+               }
+
+               put_page(page);
+       }
+
+       return pfn;
+}
+
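+/* PoC limitation: only node 0's ZONE_MOVABLE is considered. */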
+static void prepare_fc(struct movable_zone_fill_control *fc)
+{
+       struct zone *zone;
+
+       zone = &(NODE_DATA(0)->node_zones[ZONE_MOVABLE]);
+       fc->zone = zone;
+       fc->start_pfn = zone->zone_start_pfn;
+       fc->end_pfn = zone_end_pfn(zone);
+       fc->limit = zone_managed_pages(zone);
+       INIT_LIST_HEAD(&fc->freepages);
+}
+
+#define MIGRATE_TIMEOUT_SEC (20)
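+/*
+ * Worker: migrate anon pages from ZONE_NORMAL into ZONE_MOVABLE until the
+ * accumulated target is met, the movable zone's free pages would fall to
+ * its high watermark, or MIGRATE_TIMEOUT_SEC elapses.
+ */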
+static void fill_movable_zone_fn(struct work_struct *work)
+{
+       unsigned long start_pfn, end_pfn;
+       unsigned long movable_highmark;
+       struct zone *normal_zone = &(NODE_DATA(0)->node_zones[ZONE_NORMAL]);
+       struct zone *movable_zone = &(NODE_DATA(0)->node_zones[ZONE_MOVABLE]);
+       LIST_HEAD(source);
+       int ret;
+       long free;
+       struct movable_zone_fill_control fc = { {0} };
+       unsigned long timeout = MIGRATE_TIMEOUT_SEC * HZ, expire;
+
+       start_pfn = normal_zone->zone_start_pfn;
+       end_pfn = zone_end_pfn(normal_zone);
+       movable_highmark = high_wmark_pages(movable_zone);
+
+       lru_add_drain_all();
+       drain_all_pages(normal_zone);
+       if (!mutex_trylock(&page_migrate_lock))
+               return;
+       prepare_fc(&fc);
+       if (!fc.limit)
+               goto out;
+       expire = jiffies + timeout;
+restart:
+       fc.target = atomic_xchg(&target_migrate_pages, 0);
+       if (!fc.target)
+               goto out;
+repeat:
+       cond_resched();
+       if (time_after(jiffies, expire))
+               goto out;
+       free = zone_page_state(movable_zone, NR_FREE_PAGES);
+       if (free - fc.target <= (long)movable_highmark)
+               fc.target = free - movable_highmark;
+       if (fc.target <= 0)
+               goto out;
+
+       start_pfn = get_anon_movable_pages(&fc, start_pfn, end_pfn, &source);
+       if (list_empty(&source) && start_pfn < end_pfn)
+               goto repeat;
+
+       ret = migrate_pages(&source, movable_page_alloc, movable_page_free,
+                       (unsigned long) &fc,
+                       MIGRATE_ASYNC, MR_MEMORY_HOTPLUG);
+       if (ret)
+               putback_movable_pages(&source);
+
+       fc.target -= fc.nr_migrate_pages;
+       if (ret == -ENOMEM || start_pfn >= end_pfn)
+               goto out;
+       else if (fc.target <= 0)
+               goto restart;
+
+       goto repeat;
+out:
+       mutex_unlock(&page_migrate_lock);
+       if (fc.nr_free_pages > 0)
+               release_freepages(&fc.freepages);
+}
+
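+/* Sysctl handler: accumulate the requested page count and kick the worker. */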
+int sysctl_balance_node_occupancy_handler(struct ctl_table *table, int write,
+               void *buffer, size_t *length, loff_t *ppos)
+{
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
+       if (write) {
+               atomic_add(balance_node_occupancy_pages, &target_migrate_pages);
+
+               if (!work_pending(&fill_movable_zone_work))
+                       queue_work(system_unbound_wq, &fill_movable_zone_work);
+       }
+
+       return 0;
+}
+
 /* add this memory to iomem resource */
 static struct resource *register_memory_resource(u64 start, u64 size,
                                                 const char *resource_name)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation
