From: Jérôme Glisse <[email protected]>

This patch adds infrastructure to track heterogeneous memory policies
within the kernel. Policies are defined over ranges of virtual addresses
of a process and are attached to the corresponding mm_struct.

Users can reset a range of virtual addresses to the default policy by
using the hbind() default command on that range.

Signed-off-by: Jérôme Glisse <[email protected]>
Cc: Rafael J. Wysocki <[email protected]>
Cc: Ross Zwisler <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Haggai Eran <[email protected]>
Cc: Balbir Singh <[email protected]>
Cc: Aneesh Kumar K.V <[email protected]>
Cc: Benjamin Herrenschmidt <[email protected]>
Cc: Felix Kuehling <[email protected]>
Cc: Philip Yang <[email protected]>
Cc: Christian König <[email protected]>
Cc: Paul Blinzer <[email protected]>
Cc: Logan Gunthorpe <[email protected]>
Cc: John Hubbard <[email protected]>
Cc: Ralph Campbell <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Jonathan Cameron <[email protected]>
Cc: Mark Hairgrove <[email protected]>
Cc: Vivek Kini <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Dave Airlie <[email protected]>
Cc: Ben Skeggs <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
---
 include/linux/hms.h        |  46 ++++++
 include/linux/mm_types.h   |   6 +
 include/uapi/linux/hbind.h |   8 +
 kernel/fork.c              |   3 +
 mm/hms.c                   | 306 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 368 insertions(+), 1 deletion(-)

diff --git a/include/linux/hms.h b/include/linux/hms.h
index 511b5363d8f2..f39c390b3afb 100644
--- a/include/linux/hms.h
+++ b/include/linux/hms.h
@@ -20,6 +20,8 @@
 
 #include <linux/device.h>
 #include <linux/types.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
 
 
 struct hms_target;
@@ -34,6 +36,10 @@ struct hms_target_hbind {
 #if IS_ENABLED(CONFIG_HMS)
 
 
+#include <linux/interval_tree.h>
+#include <linux/rwsem.h>
+
+
 #define to_hms_object(device) container_of(device, struct hms_object, device)
 
 enum hms_type {
@@ -133,6 +139,42 @@ void hms_bridge_register(struct hms_bridge **bridgep,
 void hms_bridge_unregister(struct hms_bridge **bridgep);
 
 
+/*
+ * Reference-counted array of target pointers. hms_policy_range_dup() makes
+ * a second range point at the same object, so it can outlive any one range.
+ */
+struct hms_policy_targets {
+       /* Array of ntargets target pointers, freed together with this object */
+       struct hms_target **targets;
+       /* Number of entries in targets */
+       unsigned ntargets;
+       /* Released through hms_policy_targets_free() */
+       struct kref kref;
+};
+
+/*
+ * Policy attached to one span of virtual addresses. It is linked into
+ * hms_policy.ranges through @node.
+ */
+struct hms_policy_range {
+       /* Targets for this span; the range holds one reference on them */
+       struct hms_policy_targets *ptargets;
+       /* Covers [node.start, node.last], with node.last inclusive */
+       struct interval_tree_node node;
+       /* Released through hms_policy_range_free() */
+       struct kref kref;
+};
+
+/* Per-mm policy state, reached through mm_struct.hpolicy. */
+struct hms_policy {
+       /* Interval tree of hms_policy_range.node entries */
+       struct rb_root_cached ranges;
+       /* Taken for write around modifications of ranges */
+       struct rw_semaphore sem;
+       /* Notifier registered against the mm this policy is attached to */
+       struct mmu_notifier mn;
+};
+
+/* First address covered by @r (inclusive). */
+static inline unsigned long hms_policy_range_start(struct hms_policy_range *r)
+{
+       return r->node.start;
+}
+
+/*
+ * First address past @r (exclusive end). The tree stores an inclusive
+ * last address, hence the + 1.
+ */
+static inline unsigned long hms_policy_range_end(struct hms_policy_range *r)
+{
+       return r->node.last + 1;
+}
+
+/*
+ * Start @mm with no policy attached. The hms_policy structure itself is
+ * only allocated later, by hms_policy_get() in mm/hms.c.
+ */
+static inline void hms_policy_init(struct mm_struct *mm)
+{
+       mm->hpolicy = NULL;
+}
+
+void hms_policy_fini(struct mm_struct *mm);
+
+
 int hms_init(void);
 
 
@@ -163,6 +205,10 @@ int hms_init(void);
 #define hms_bridge_unregister(bridgep)
 
 
+/*
+ * Empty expansions: calls to these compile away.
+ * NOTE(review): this hunk appears to sit in the !CONFIG_HMS branch of the
+ * header - confirm against the full file.
+ */
+#define hms_policy_init(mm)
+#define hms_policy_fini(mm)
+
+
 static inline int hms_init(void)
 {
        return 0;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5ed8f6292a53..3da91767c689 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -26,6 +26,7 @@ typedef int vm_fault_t;
 
 struct address_space;
 struct mem_cgroup;
+struct hms_policy;
 struct hmm;
 
 /*
@@ -491,6 +492,11 @@ struct mm_struct {
                /* HMM needs to track a few things per mm */
                struct hmm *hmm;
 #endif
+
+#if IS_ENABLED(CONFIG_HMS)
+               /* Heterogeneous Memory System policy */
+               struct hms_policy *hpolicy;
+#endif
        } __randomize_layout;
 
        /*
diff --git a/include/uapi/linux/hbind.h b/include/uapi/linux/hbind.h
index a9aba17ab142..cc4687587f5a 100644
--- a/include/uapi/linux/hbind.h
+++ b/include/uapi/linux/hbind.h
@@ -39,6 +39,14 @@ struct hbind_params {
 #define HBIND_ATOM_GET_CMD(v) ((v) & 0xfffff)
 #define HBIND_ATOM_SET_CMD(v) ((v) & 0xfffff)
 
+/*
+ * HBIND_CMD_DEFAULT restores the default policy, i.e. undoes any previously
+ * set policy for the range.
+ *
+ * Additional dwords:
+ *      NONE (DWORDS MUST BE 0 !)
+ */
+#define HBIND_CMD_DEFAULT 0
+
 
 #define HBIND_IOCTL            _IOWR('H', 0x00, struct hbind_params)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cddff89c7b..bc40edcadc69 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -38,6 +38,7 @@
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/hmm.h>
+#include <linux/hms.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/vmacache.h>
@@ -671,6 +672,7 @@ void __mmdrop(struct mm_struct *mm)
        mm_free_pgd(mm);
        destroy_context(mm);
        hmm_mm_destroy(mm);
+       hms_policy_fini(mm);
        mmu_notifier_mm_destroy(mm);
        check_mm(mm);
        put_user_ns(mm->user_ns);
@@ -989,6 +991,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, 
struct task_struct *p,
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_mm_init(mm);
        hmm_mm_init(mm);
+       hms_policy_init(mm);
        init_tlb_flush_pending(mm);
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
        mm->pmd_huge_pte = NULL;
diff --git a/mm/hms.c b/mm/hms.c
index bf328bd577dc..be2c4e526f25 100644
--- a/mm/hms.c
+++ b/mm/hms.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/hms.h>
+#include <linux/mm.h>
 #include <linux/fs.h>
 
 #include <uapi/linux/hbind.h>
@@ -31,7 +32,6 @@
 
 #define HBIND_FIX_ARRAY 64
 
-
 static ssize_t hbind_read(struct file *file, char __user *buf,
                        size_t count, loff_t *ppos)
 {
@@ -44,6 +44,300 @@ static ssize_t hbind_write(struct file *file, const char 
__user *buf,
        return -EINVAL;
 }
 
+
+/* Take an additional reference on @ptargets. */
+static void hms_policy_targets_get(struct hms_policy_targets *ptargets)
+{
+       kref_get(&ptargets->kref);
+}
+
+/*
+ * kref release callback for hms_policy_targets: frees the target array and
+ * then the object embedding @kref.
+ */
+static void hms_policy_targets_free(struct kref *kref)
+{
+       struct hms_policy_targets *dead =
+               container_of(kref, struct hms_policy_targets, kref);
+
+       kfree(dead->targets);
+       kfree(dead);
+}
+
+/*
+ * Drop one reference on @ptargets; the last put frees it through
+ * hms_policy_targets_free().
+ */
+static void hms_policy_targets_put(struct hms_policy_targets *ptargets)
+{
+       kref_put(&ptargets->kref, hms_policy_targets_free);
+}
+
+/*
+ * hms_policy_targets_new() - build a reference-counted set of targets
+ * @targets: array of @ntargets values, each passed to hms_target_find()
+ * @ntargets: number of entries in @targets
+ *
+ * Values for which hms_target_find() returns NULL are skipped; the pointers
+ * that were found are packed at the front of the array and ->ntargets is the
+ * number of them.
+ *
+ * Returns the new object holding a single reference, or NULL if allocation
+ * failed or no target was found.
+ *
+ * NOTE(review): whether hms_target_find() takes a reference on the target it
+ * returns is not visible here; if it does, hms_policy_targets_free() needs
+ * to drop it.
+ */
+static struct hms_policy_targets *
+hms_policy_targets_new(const uint32_t *targets, unsigned ntargets)
+{
+       struct hms_policy_targets *ptargets;
+       unsigned i, c;
+
+       ptargets = kmalloc(sizeof(*ptargets), GFP_KERNEL);
+       if (ptargets == NULL)
+               return NULL;
+
+       /* kcalloc() zeroes the array and rejects an overflowing n * size. */
+       ptargets->targets = kcalloc(ntargets, sizeof(*ptargets->targets),
+                                   GFP_KERNEL);
+       if (ptargets->targets == NULL) {
+               kfree(ptargets);
+               return NULL;
+       }
+
+       kref_init(&ptargets->kref);
+
+       for (i = 0, c = 0; i < ntargets; ++i) {
+               struct hms_target *target = hms_target_find(targets[i]);
+
+               /*
+                * Only advance the output slot when the lookup succeeded.
+                * Testing slot [i] instead of the slot just written stops
+                * counting after the first failed lookup.
+                */
+               if (target)
+                       ptargets->targets[c++] = target;
+       }
+
+       /* Only the first c slots are populated. */
+       ptargets->ntargets = c;
+
+       if (!c) {
+               /* No valid targets pointless to waste memory ... */
+               hms_policy_targets_put(ptargets);
+               return NULL;
+       }
+
+       return ptargets;
+}
+
+
+/* Take an additional reference on @prange. */
+static void hms_policy_range_get(struct hms_policy_range *prange)
+{
+       kref_get(&prange->kref);
+}
+
+/*
+ * kref release callback for hms_policy_range: drops the range's reference
+ * on its target set, then frees the range embedding @kref.
+ */
+static void hms_policy_range_free(struct kref *kref)
+{
+       struct hms_policy_range *dead =
+               container_of(kref, struct hms_policy_range, kref);
+
+       hms_policy_targets_put(dead->ptargets);
+       kfree(dead);
+}
+
+/*
+ * Drop one reference on @prange; the last put frees it through
+ * hms_policy_range_free().
+ */
+static void hms_policy_range_put(struct hms_policy_range *prange)
+{
+       kref_put(&prange->kref, hms_policy_range_free);
+}
+
+/*
+ * hms_policy_range_new() - allocate a policy range with its own target set
+ * @targets: array of @ntargets values handed to hms_policy_targets_new()
+ * @start: first address of the range, rounded down to a page boundary
+ * @end: end address of the range, rounded up to a page boundary
+ * @ntargets: number of entries in @targets
+ *
+ * The range is not inserted in any tree; node.last is stored inclusive.
+ *
+ * Returns the new range holding a single reference, or NULL if the target
+ * set could not be built or allocation failed.
+ */
+static struct hms_policy_range *hms_policy_range_new(const uint32_t *targets,
+                                                    unsigned long start,
+                                                    unsigned long end,
+                                                    unsigned ntargets)
+{
+       struct hms_policy_targets *ptargets;
+       struct hms_policy_range *prange;
+
+       ptargets = hms_policy_targets_new(targets, ntargets);
+       if (ptargets == NULL)
+               return NULL;
+
+       prange = kmalloc(sizeof(*prange), GFP_KERNEL);
+       if (prange == NULL) {
+               /* Nothing else owns the target set yet, so release it. */
+               hms_policy_targets_put(ptargets);
+               return NULL;
+       }
+
+       prange->node.start = start & PAGE_MASK;
+       prange->node.last = PAGE_ALIGN(end) - 1;
+       prange->ptargets = ptargets;
+       kref_init(&prange->kref);
+
+       return prange;
+}
+
+/*
+ * Allocate a copy of @_prange covering the same [start, last] span and
+ * sharing the same target set, on which an extra reference is taken.
+ * The copy starts with a single reference and is not linked in any tree.
+ *
+ * Returns the copy, or NULL if allocation failed.
+ */
+static struct hms_policy_range *
+hms_policy_range_dup(struct hms_policy_range *_prange)
+{
+       struct hms_policy_range *copy = kmalloc(sizeof(*copy), GFP_KERNEL);
+
+       if (!copy)
+               return NULL;
+
+       kref_init(&copy->kref);
+       copy->node.start = _prange->node.start;
+       copy->node.last = _prange->node.last;
+
+       /* Both ranges now point at the same target set. */
+       hms_policy_targets_get(_prange->ptargets);
+       copy->ptargets = _prange->ptargets;
+
+       return copy;
+}
+
+
+/*
+ * hms_policy_fini() - detach and free the policy attached to @mm, if any
+ *
+ * Clears mm->hpolicy, unregisters the policy's mmu notifier, drops every
+ * range still in the tree and frees the hms_policy structure.
+ *
+ * NOTE(review): this is also called from hms_policy_notifier_release();
+ * confirm that calling mmu_notifier_unregister_no_release() from inside the
+ * notifier's own release callback is allowed.
+ */
+void hms_policy_fini(struct mm_struct *mm)
+{
+       /* This first read is overwritten by the read under the lock below. */
+       struct hms_policy *hpolicy = READ_ONCE(mm->hpolicy);
+       struct interval_tree_node *node;
+
+       /* Fetch the pointer and clear mm->hpolicy under page_table_lock. */
+       spin_lock(&mm->page_table_lock);
+       hpolicy = READ_ONCE(mm->hpolicy);
+       mm->hpolicy = NULL;
+       spin_unlock(&mm->page_table_lock);
+
+       /* No active heterogeneous policy structure so nothing to cleanup. */
+       if (hpolicy == NULL)
+               return;
+
+       mmu_notifier_unregister_no_release(&hpolicy->mn, mm);
+
+       down_write(&hpolicy->sem);
+       /* [0, -1UL] matches every range in the tree. */
+       node = interval_tree_iter_first(&hpolicy->ranges, 0, -1UL);
+       while (node) {
+               struct hms_policy_range *prange;
+               struct interval_tree_node *next;
+
+               prange = container_of(node, struct hms_policy_range, node);
+               /* Look up the successor before node is unlinked. */
+               next = interval_tree_iter_next(node, 0, -1UL);
+               interval_tree_remove(node, &hpolicy->ranges);
+               hms_policy_range_put(prange);
+               node = next;
+       }
+       up_write(&hpolicy->sem);
+
+       kfree(hpolicy);
+}
+
+
+/*
+ * hbind_default_locked() - remove policy over [params->start, params->end)
+ * @hpolicy: policy whose range tree is edited; hbind_default() calls this
+ *           with hpolicy->sem held for write
+ * @params: only ->start and ->end are read
+ *
+ * Ranges fully inside the span are unlinked and their reference dropped,
+ * ranges overlapping one edge are trimmed, and a range covering the whole
+ * span is split in two around it.
+ *
+ * Returns 0 on success, -ENOMEM if the second half of a split could not be
+ * allocated (the tree is left untouched for that range).
+ *
+ * NOTE(review): start/end are used as given while hms_policy_range_new()
+ * stores page-aligned bounds, and params->end == 0 makes 'last' wrap to
+ * -1UL; callers may need to validate the span.
+ */
+static int hbind_default_locked(struct hms_policy *hpolicy,
+                               struct hbind_params *params)
+{
+       struct interval_tree_node *node;
+       unsigned long start, last;
+       int ret = 0;
+
+       start = params->start;
+       last = params->end - 1UL;
+
+       node = interval_tree_iter_first(&hpolicy->ranges, start, last);
+       while (node) {
+               struct hms_policy_range *prange;
+               struct interval_tree_node *next;
+
+               prange = container_of(node, struct hms_policy_range, node);
+               /* Look up the successor before node is unlinked. */
+               next = interval_tree_iter_next(node, start, last);
+
+               if (node->start < start && node->last > last) {
+                       /* Node is split in 2 */
+                       struct hms_policy_range *_prange;
+
+                       _prange = hms_policy_range_dup(prange);
+                       if (_prange == NULL) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+                       /*
+                        * The tree caches per-subtree data derived from
+                        * start/last, so a node is unlinked before its
+                        * bounds change and linked again afterwards.
+                        */
+                       interval_tree_remove(node, &hpolicy->ranges);
+                       node->last = start - 1;
+                       interval_tree_insert(node, &hpolicy->ranges);
+                       _prange->node.start = last + 1;
+                       interval_tree_insert(&_prange->node, &hpolicy->ranges);
+                       break;
+               }
+
+               interval_tree_remove(node, &hpolicy->ranges);
+               if (node->start < start) {
+                       /* Keep the part below the span. */
+                       node->last = start - 1;
+                       interval_tree_insert(node, &hpolicy->ranges);
+               } else if (node->last > last) {
+                       /* Keep the part above the span. */
+                       node->start = last + 1;
+                       interval_tree_insert(node, &hpolicy->ranges);
+               } else {
+                       /*
+                        * Fully inside [start, last]: the range leaves the
+                        * tree for good, so drop the reference it held, as
+                        * hms_policy_fini() does.
+                        */
+                       hms_policy_range_put(prange);
+               }
+
+               node = next;
+       }
+
+       return ret;
+}
+
+/*
+ * hbind_default() - reset [params->start, params->end) of @mm to default
+ * @mm: address space whose policy is edited
+ * @params: supplies the span; passed on to hbind_default_locked()
+ * @targets: not used by this command
+ * @atoms: not used by this command
+ *
+ * Returns 0 if @mm has no policy attached, otherwise the result of
+ * hbind_default_locked() run with the policy semaphore held for write.
+ */
+static int hbind_default(struct mm_struct *mm, struct hbind_params *params,
+                        const uint32_t *targets, uint32_t *atoms)
+{
+       struct hms_policy *policy = READ_ONCE(mm->hpolicy);
+       int err;
+
+       /* Without a policy structure there is no range to reset. */
+       if (!policy)
+               return 0;
+
+       down_write(&policy->sem);
+       err = hbind_default_locked(policy, params);
+       up_write(&policy->sem);
+
+       return err;
+}
+
+
+/*
+ * mmu notifier ->release callback: tears down the policy attached to @mm.
+ * @mn is not used; hms_policy_fini() finds the policy through mm->hpolicy.
+ */
+static void hms_policy_notifier_release(struct mmu_notifier *mn,
+                                       struct mm_struct *mm)
+{
+       hms_policy_fini(mm);
+}
+
+/*
+ * mmu notifier ->invalidate_range_start callback.
+ *
+ * For MMU_NOTIFY_UNMAP events, resets [range->start, range->end) of
+ * range->mm to the default policy. All other events are ignored.
+ *
+ * Returns -EBUSY for an unmap event that is not blockable (hbind_default()
+ * takes a rw_semaphore), 0 otherwise. The result of hbind_default() is not
+ * propagated.
+ */
+static int hms_policy_notifier_invalidate_range_start(struct mmu_notifier *mn,
+                                      const struct mmu_notifier_range *range)
+{
+       struct hbind_params params;
+
+       if (range->event != MMU_NOTIFY_UNMAP)
+               return 0;
+
+       if (!range->blockable)
+               return -EBUSY;
+
+       params.start = range->start;
+       params.end = range->end;
+       params.ntargets = 0;
+       params.natoms = 0;
+       hbind_default(range->mm, &params, NULL, NULL);
+
+       return 0;
+}
+
+/* Callbacks installed in hms_policy.mn by hms_policy_get(). */
+static const struct mmu_notifier_ops hms_policy_notifier_ops = {
+       .release = hms_policy_notifier_release,
+       .invalidate_range_start = hms_policy_notifier_invalidate_range_start,
+};
+
+/*
+ * hms_policy_get() - return the policy attached to @mm, creating it if needed
+ * @mm: address space to look up
+ *
+ * A new policy is allocated outside the lock, published in mm->hpolicy under
+ * page_table_lock if none is attached yet, and then its mmu notifier is
+ * registered. If another caller published one first, the local allocation
+ * is freed and the published policy is returned.
+ *
+ * Returns the policy, or NULL if allocation or notifier registration failed.
+ *
+ * NOTE(review): if registration fails, the policy is unpublished and freed
+ * although a concurrent caller may already have read the pointer; confirm
+ * whether that window matters.
+ */
+static struct hms_policy *hms_policy_get(struct mm_struct *mm)
+{
+       struct hms_policy *hpolicy = READ_ONCE(mm->hpolicy);
+       struct hms_policy *installed = NULL;
+
+       /*
+        * The hpolicy struct can only be freed once the mm_struct goes away,
+        * hence only pre-allocate if none is attached yet.
+        */
+       if (hpolicy)
+               return hpolicy;
+
+       hpolicy = kzalloc(sizeof(*hpolicy), GFP_KERNEL);
+       if (hpolicy == NULL)
+               return NULL;
+
+       /* Fully initialise the structure before it becomes visible. */
+       init_rwsem(&hpolicy->sem);
+       hpolicy->mn.ops = &hms_policy_notifier_ops;
+
+       spin_lock(&mm->page_table_lock);
+       if (!mm->hpolicy) {
+               mm->hpolicy = hpolicy;
+               /*
+                * Keep a handle on what was published: hpolicy is cleared so
+                * the kfree() below leaves it alone, and must not be
+                * dereferenced after this point.
+                */
+               installed = hpolicy;
+               hpolicy = NULL;
+       }
+       spin_unlock(&mm->page_table_lock);
+
+       if (installed && mmu_notifier_register(&installed->mn, mm)) {
+               /* Registration failed: unpublish and free below. */
+               spin_lock(&mm->page_table_lock);
+               hpolicy = mm->hpolicy;
+               mm->hpolicy = NULL;
+               spin_unlock(&mm->page_table_lock);
+       }
+
+       /* kfree(NULL) is a no-op; frees a lost-race or failed allocation. */
+       kfree(hpolicy);
+
+       /* NULL here only if notifier registration failed. */
+       return READ_ONCE(mm->hpolicy);
+}
+
+
 static long hbind_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
        uint32_t *targets, *_dtargets = NULL, _ftargets[HBIND_FIX_ARRAY];
@@ -114,6 +408,16 @@ static long hbind_ioctl(struct file *file, unsigned cmd, 
unsigned long arg)
        for (i = 0, ndwords = 1; i < params.natoms; i += ndwords) {
                ndwords = 1 + HBIND_ATOM_GET_DWORDS(atoms[i]);
                switch (HBIND_ATOM_GET_CMD(atoms[i])) {
+               case HBIND_CMD_DEFAULT:
+                       if (ndwords != 1) {
+                               ret = -EINVAL;
+                               goto out_mm;
+                       }
+                       ret = hbind_default(current->mm, &params,
+                                           targets, atoms);
+                       if (ret)
+                               goto out_mm;
+                       break;
                default:
                        ret = -EINVAL;
                        goto out_mm;
-- 
2.17.2

Reply via email to