From: Alexander Duyck <alexander.h.du...@linux.intel.com>

Add support for "aerating" memory in a guest by pushing individual pages
out. This patch is meant to add generic support for this by adding a common
framework that can be used later by drivers such as virtio-balloon.

Signed-off-by: Alexander Duyck <alexander.h.du...@linux.intel.com>
---
 include/linux/memory_aeration.h |   54 +++++++
 mm/Kconfig                      |    5 +
 mm/Makefile                     |    1 
 mm/aeration.c                   |  320 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 380 insertions(+)
 create mode 100644 include/linux/memory_aeration.h
 create mode 100644 mm/aeration.c

diff --git a/include/linux/memory_aeration.h b/include/linux/memory_aeration.h
new file mode 100644
index 000000000000..5ba0e634f240
--- /dev/null
+++ b/include/linux/memory_aeration.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_AERATION_H
+#define _LINUX_MEMORY_AERATION_H
+
+#include <linux/pageblock-flags.h>
+#include <linux/jump_label.h>
+#include <asm/pgtable_types.h>
+
+struct zone;
+
+#define AERATOR_MIN_ORDER      pageblock_order
+
+struct aerator_dev_info {
+       unsigned long capacity;         /* pages the device can treat per batch; fill budget */
+       struct list_head batch_reactor; /* pages pulled from the free lists, awaiting treatment */
+       atomic_t refcnt;                /* outstanding zone hints plus the in-flight batch */
+       void (*react)(struct aerator_dev_info *a_dev_info);    /* driver callback to process batch_reactor */
+};
+
+extern struct static_key aerator_notify_enabled;
+
+void aerator_cycle(void);
+void __aerator_notify(struct zone *zone, int order);
+
+/**
+ * aerator_notify_free - Free page notification that will start page processing
+ * @page: Last page processed
+ * @zone: Pointer to current zone of last page processed
+ * @order: Order of last page added to zone
+ *
+ * This function is meant to act as a screener for __aerator_notify which
+ * will determine if a given zone has crossed over the high-water mark that
+ * will justify us beginning page treatment. If we have crossed that
+ * threshold then it will start the process of pulling some pages and
+ * placing them in the batch_reactor list for treatment.
+ */
+static inline void
+aerator_notify_free(struct page *page, struct zone *zone, int order)
+{
+       if (!static_key_false(&aerator_notify_enabled))
+               return; /* no aerator device registered via aerator_startup() */
+
+       if (order < AERATOR_MIN_ORDER)
+               return; /* only pageblock-order and larger pages are treated */
+
+       __aerator_notify(zone, order);
+}
+
+void aerator_shutdown(void);
+int aerator_startup(struct aerator_dev_info *sdev);
+
+#define AERATOR_ZONE_BITS      (BITS_TO_LONGS(MAX_NR_ZONES) * BITS_PER_LONG)
+#define AERATOR_HWM_BITS       (AERATOR_ZONE_BITS * MAX_NUMNODES)
+#endif /*_LINUX_MEMORY_AERATION_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index f0c76ba47695..34680214cefa 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -236,6 +236,11 @@ config COMPACTION
           linux...@kvack.org.
 
 #
+# support for memory aeration
+config AERATION
+       bool
+
+#
 # support for page migration
 #
 config MIGRATION
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..26c2fcd2b89d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_HMM) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AERATION) += aeration.o
diff --git a/mm/aeration.c b/mm/aeration.c
new file mode 100644
index 000000000000..aaf8af8d822f
--- /dev/null
+++ b/mm/aeration.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/memory_aeration.h>
+#include <linux/mmzone.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+static unsigned long *aerator_hwm;
+static struct aerator_dev_info *a_dev_info;
+struct static_key aerator_notify_enabled;
+
+void aerator_shutdown(void)
+{
+       static_key_slow_dec(&aerator_notify_enabled); /* stop new free-page notifications */
+
+       while (atomic_read(&a_dev_info->refcnt)) /* NOTE(review): busy-poll; a completion would avoid the msleep loop */
+               msleep(20);
+
+       kfree(aerator_hwm);
+       aerator_hwm = NULL;
+
+       a_dev_info = NULL;
+}
+EXPORT_SYMBOL_GPL(aerator_shutdown);
+
+int aerator_startup(struct aerator_dev_info *sdev) /* register aerator device; 0 or -errno */
+{
+       size_t size = BITS_TO_LONGS(AERATOR_HWM_BITS) * sizeof(unsigned long);
+       unsigned long *hwm;
+
+       if (a_dev_info || aerator_hwm)
+               return -EBUSY; /* only one aerator device may be registered */
+
+       a_dev_info = sdev;
+
+       atomic_set(&sdev->refcnt, 0);
+
+       hwm = kzalloc(size, GFP_KERNEL);
+       if (!hwm) {
+               a_dev_info = NULL; /* don't call aerator_shutdown() here: it would dec a static key we never inc'd */
+               return -ENOMEM;
+       }
+
+       aerator_hwm = hwm;
+
+       static_key_slow_inc(&aerator_notify_enabled); /* enable the notify fast path last */
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(aerator_startup);
+
+static inline unsigned long *get_aerator_hwm(int nid)
+{
+       if (!aerator_hwm)
+               return NULL; /* aerator not started (or already shut down) */
+
+       return aerator_hwm + (BITS_TO_LONGS(MAX_NR_ZONES) * nid); /* per-node slice of the zone HWM bitmap */
+}
+
+static int __aerator_fill(struct zone *zone, unsigned int size)
+{
+       struct list_head *batch = &a_dev_info->batch_reactor;
+       unsigned long nr_raw = 0;
+       unsigned int len = 0;
+       unsigned int order;
+
+       for (order = MAX_ORDER; order-- != AERATOR_MIN_ORDER;) { /* highest order down to pageblock order */
+               struct free_area *area = &(zone->free_area[order]);
+               int mt = area->treatment_mt;
+
+               /*
+                * If there are no untreated pages to pull
+                * then we might as well skip the area.
+                */
+               while (area->nr_free_raw) {
+                       unsigned int count = 0;
+                       struct page *page;
+
+                       /*
+                        * If we completed aeration we can let the current
+                        * free list work on settling so that a batch of
+                        * new raw pages can build. In the meantime move on
+                        * to the next migratetype.
+                        */
+                       if (++mt >= MIGRATE_TYPES)
+                               mt = 0;
+
+                       /*
+                        * Pull pages from free list until we have drained
+                        * it or we have filled the batch reactor.
+                        */
+                       while ((page = get_raw_pages(zone, order, mt))) {
+                               list_add(&page->lru, batch);
+
+                               if (++count == (size - len))
+                                       return size; /* reactor full */
+                       }
+
+                       /*
+                        * If we pulled any pages from this migratetype then
+                        * we must move on to a new free area as we cannot
+                        * move the membrane until after we have decanted the
+                        * pages currently being aerated.
+                        */
+                       if (count) {
+                               len += count;
+                               break;
+                       }
+               }
+
+               /*
+                * Keep a running total of the raw pages we have left
+                * behind. We will use this to determine if we should
+                * clear the HWM flag.
+                */
+               nr_raw += area->nr_free_raw;
+       }
+
+       /*
+        * If there are no longer enough free pages to fully populate
+        * the aerator, then we can just shut it down for this zone.
+        */
+       if (nr_raw < a_dev_info->capacity) {
+               unsigned long *hwm = get_aerator_hwm(zone_to_nid(zone));
+
+               clear_bit(zone_idx(zone), hwm);
+               atomic_dec(&a_dev_info->refcnt); /* drop the hint reference taken in __aerator_notify */
+       }
+
+       return len; /* number of pages actually placed in the reactor */
+}
+
+static unsigned int aerator_fill(int nid, int zid, int budget)
+{
+       pg_data_t *pgdat = NODE_DATA(nid);
+       struct zone *zone = &pgdat->node_zones[zid];
+       unsigned long flags;
+       int len;
+
+       spin_lock_irqsave(&zone->lock, flags); /* zone->lock guards the free lists we pull from */
+
+       /* fill aerator with "raw" pages */
+       len = __aerator_fill(zone, budget);
+
+       spin_unlock_irqrestore(&zone->lock, flags);
+
+       return len; /* pages pulled (at most budget) */
+}
+
+static void aerator_fill_and_react(void)
+{
+       int budget = a_dev_info->capacity;
+       int nr;
+
+       /*
+        * We should never be calling this function while there are already
+        * pages in the reactor being aerated. If we are called under such
+        * a circumstance report an error.
+        */
+       BUG_ON(!list_empty(&a_dev_info->batch_reactor));
+retry:
+       /*
+        * We want to hold one additional reference against the number of
+        * active hints as we may clear the hint that originally brought us
+        * here. We will clear it after we have either vaporized the content
+        * of the pages, or if we discover all pages were stolen out from
+        * under us.
+        */
+       atomic_inc(&a_dev_info->refcnt);
+
+       for_each_set_bit(nr, aerator_hwm, AERATOR_HWM_BITS) { /* bit index encodes (node, zone) */
+               int node_id = nr / AERATOR_ZONE_BITS;
+               int zone_id = nr % AERATOR_ZONE_BITS;
+
+               budget -= aerator_fill(node_id, zone_id, budget);
+               if (!budget)
+                       goto start_aerating;
+       }
+
+       if (unlikely(list_empty(&a_dev_info->batch_reactor))) {
+               /*
+                * If we never generated any pages, and we were holding the
+                * only remaining reference to active hints then we can
+                * just let this go for now and go idle.
+                */
+               if (atomic_dec_and_test(&a_dev_info->refcnt))
+                       return;
+
+               /*
+                * There must be a bit populated somewhere, try going
+                * back through and finding it.
+                */
+               goto retry;
+       }
+
+start_aerating:
+       a_dev_info->react(a_dev_info); /* hand the batch to the device driver */
+}
+
+static void aerator_decant(void) /* file-local: not in the header and not exported, so mark it static */
+{
+       struct list_head *list = &a_dev_info->batch_reactor;
+       struct page *page;
+
+       /*
+        * This function should never be called on an empty list. If so it
+        * points to a bug as we should never be running the aerator when
+        * the list is empty.
+        */
+       WARN_ON(list_empty(&a_dev_info->batch_reactor));
+
+       while ((page = list_first_entry_or_null(list, struct page, lru))) {
+               list_del(&page->lru);
+
+               __SetPageTreated(page); /* flag so the fill path won't pull it again as "raw" */
+
+               free_treated_page(page);
+       }
+}
+
+/**
+ * aerator_cycle - drain, fill, and start aerating another batch of pages
+ *
+ * This function is at the heart of the aerator. It should be called after
+ * the previous batch of pages has finished being processed by the aerator.
+ * It will drain the aerator, refill it, and start the next set of pages
+ * being processed.
+ */
+void aerator_cycle(void)
+{
+       aerator_decant(); /* return the just-treated batch to the free lists */
+
+       /*
+        * Now that the pages have been flushed we can drop our reference to
+        * the active hints list. If there are no further hints that need to
+        * be processed we can simply go idle.
+        */
+       if (atomic_dec_and_test(&a_dev_info->refcnt))
+               return;
+
+       aerator_fill_and_react();
+}
+EXPORT_SYMBOL_GPL(aerator_cycle);
+
+static void __aerator_fill_and_react(struct zone *zone) /* called with zone->lock held; drops and reacquires it */
+{
+       /*
+        * We should never be calling this function while there are already
+        * pages in the list being aerated. If we are called under such a
+        * circumstance report an error.
+        */
+       BUG_ON(!list_empty(&a_dev_info->batch_reactor));
+
+       /*
+        * We want to hold one additional reference against the number of
+        * active hints as we may clear the hint that originally brought us
+        * here. We will clear it after we have either vaporized the content
+        * of the pages, or if we discover all pages were stolen out from
+        * under us.
+        */
+       atomic_inc(&a_dev_info->refcnt);
+
+       __aerator_fill(zone, a_dev_info->capacity);
+
+       if (unlikely(list_empty(&a_dev_info->batch_reactor))) {
+               /*
+                * If we never generated any pages, and we were holding the
+                * only remaining reference to active hints then we can just
+                * let this go for now and go idle.
+                */
+               if (atomic_dec_and_test(&a_dev_info->refcnt))
+                       return;
+
+               /*
+                * Another zone must have populated some raw pages that
+                * need to be processed. Release the zone lock and process
+                * that zone instead.
+                */
+               spin_unlock(&zone->lock);
+               aerator_fill_and_react();
+       } else {
+               /* Release the zone lock and begin the page aerator */
+               spin_unlock(&zone->lock);
+               a_dev_info->react(a_dev_info);
+       }
+
+       /* Reacquire lock so we can resume processing this zone */
+       spin_lock(&zone->lock);
+}
+
+void __aerator_notify(struct zone *zone, int order)
+{
+       int node_id = zone_to_nid(zone);
+       int zone_id = zone_idx(zone);
+       unsigned long *hwm;
+
+       if (zone->free_area[order].nr_free_raw < (2 * a_dev_info->capacity))
+               return; /* not enough raw pages yet to justify a batch */
+
+       hwm = get_aerator_hwm(node_id);
+
+       /*
+        * We can use separate test and set operations here as there
+        * is nothing else that can set or clear this bit while we are
+        * holding the zone lock. The advantage to doing it this way is
+        * that we don't have to dirty the cacheline unless we are
+        * changing the value.
+        */
+       if (test_bit(zone_id, hwm))
+               return;
+       set_bit(zone_id, hwm);
+
+       if (atomic_fetch_inc(&a_dev_info->refcnt))
+               return; /* aerator already running; it will find our HWM bit */
+
+       __aerator_fill_and_react(zone);
+}
+EXPORT_SYMBOL_GPL(__aerator_notify);
+

Reply via email to