Add a debugfs interface to dnuma/memlayout. It keeps track of a
variable backlog of memory layouts, provides some statistics on dnuma
moved pages & cache performance, and allows the setting of a new global
memlayout.

TODO: split out statistics, backlog, & write interfaces from each other.

Signed-off-by: Cody P Schafer <c...@linux.vnet.ibm.com>
---
 include/linux/dnuma.h     |   2 +-
 include/linux/memlayout.h |   7 +
 mm/Kconfig                |  30 ++++
 mm/Makefile               |   1 +
 mm/dnuma.c                |   4 +-
 mm/memlayout-debugfs.c    | 339 ++++++++++++++++++++++++++++++++++++++++++++++
 mm/memlayout-debugfs.h    |  39 ++++++
 mm/memlayout.c            |  20 ++-
 8 files changed, 436 insertions(+), 6 deletions(-)
 create mode 100644 mm/memlayout-debugfs.c
 create mode 100644 mm/memlayout-debugfs.h

diff --git a/include/linux/dnuma.h b/include/linux/dnuma.h
index 029a984..7a33131 100644
--- a/include/linux/dnuma.h
+++ b/include/linux/dnuma.h
@@ -64,7 +64,7 @@ static inline int dnuma_page_needs_move(struct page *page)
        return new_nid;
 }
 
-void dnuma_post_free_to_new_zone(struct page *page, int order);
+void dnuma_post_free_to_new_zone(int order);
 void dnuma_prior_free_to_new_zone(struct page *page, int order,
                                  struct zone *dest_zone,
                                  int dest_nid);
diff --git a/include/linux/memlayout.h b/include/linux/memlayout.h
index 6c26c52..14dbf35 100644
--- a/include/linux/memlayout.h
+++ b/include/linux/memlayout.h
@@ -56,6 +56,7 @@ struct memlayout {
 };
 
 extern __rcu struct memlayout *pfn_to_node_map;
+extern struct mutex memlayout_lock; /* update-side lock */
 
 /* FIXME: overflow potential in completion check */
 #define ml_for_each_pfn_in_range(rme, pfn)     \
@@ -90,7 +91,13 @@ static inline struct rangemap_entry *rme_first(struct memlayout *ml)
             rme = rme_next(rme))
 
 struct memlayout *memlayout_create(enum memlayout_type);
+
+/*
+ * In most cases, these should only be used by the memlayout debugfs code (or
+ * internally within memlayout)
+ */
 void memlayout_destroy(struct memlayout *ml);
+void memlayout_destroy_mem(struct memlayout *ml);
 
 int memlayout_new_range(struct memlayout *ml,
                unsigned long pfn_start, unsigned long pfn_end, int nid);
diff --git a/mm/Kconfig b/mm/Kconfig
index 86f0984..3820b3c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -193,6 +193,36 @@ config DYNAMIC_NUMA
         Choose Y if you have are running linux under a hypervisor that uses
         this feature, otherwise choose N if unsure.
 
+config DNUMA_DEBUGFS
+       bool "Export DNUMA & memlayout internals via debugfs"
+       depends on DYNAMIC_NUMA
+       help
+        Export some dynamic numa info via debugfs in <debugfs>/memlayout.
+
+        Enables the tracking and export of statistics and the exporting of the
+        current memory layout.
+
+        If you are not debugging Dynamic NUMA or memlayout, choose N.
+
+config DNUMA_BACKLOG
+       int "Number of old memlayouts to keep (0 = None, -1 = unlimited)"
+       depends on DNUMA_DEBUGFS
+       help
+        Allows access to old memory layouts & statistics in debugfs.
+
+        Each memlayout will consume some memory, and when set to -1
+        (unlimited), this can result in unbounded kernel memory use.
+
+config DNUMA_DEBUGFS_WRITE
+       bool "Change NUMA layout via debugfs"
+       depends on DNUMA_DEBUGFS
+       help
+        Enable the use of <debugfs>/memlayout/{start,end,node,commit,clear}
+
+        Write a PFN to 'start' & 'end', then a node id to 'node'.
+        Repeat this until you are satisfied with your memory layout, then
+        write '1' to 'commit' (or to 'clear' to discard the pending layout).
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
diff --git a/mm/Makefile b/mm/Makefile
index 82fe7c9b..b07926c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -59,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_DYNAMIC_NUMA) += dnuma.o memlayout.o
+obj-$(CONFIG_DNUMA_DEBUGFS) += memlayout-debugfs.o
diff --git a/mm/dnuma.c b/mm/dnuma.c
index 2ee0903..eb00b7b 100644
--- a/mm/dnuma.c
+++ b/mm/dnuma.c
@@ -11,6 +11,7 @@
 #include <linux/types.h>
 
 #include "internal.h"
+#include "memlayout-debugfs.h"
 
 /* - must be called under lock_memory_hotplug() */
 /* TODO: avoid iterating over all PFNs. */
@@ -117,8 +118,9 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 }
 #endif
 
-void dnuma_post_free_to_new_zone(struct page *page, int order)
+void dnuma_post_free_to_new_zone(int order)
 {
+       ml_stat_count_moved_pages(order);
 }
 
 static void dnuma_prior_return_to_new_zone(struct page *page, int order,
diff --git a/mm/memlayout-debugfs.c b/mm/memlayout-debugfs.c
new file mode 100644
index 0000000..a4fc2cb
--- /dev/null
+++ b/mm/memlayout-debugfs.c
@@ -0,0 +1,339 @@
+#include <linux/debugfs.h>
+
+#include <linux/slab.h> /* kmalloc */
+#include <linux/module.h> /* THIS_MODULE, needed for DEFINE_SIMPLE_ATTR */
+
+#include "memlayout-debugfs.h"
+
+#if CONFIG_DNUMA_BACKLOG > 0
+/* Fixed size backlog */
+#include <linux/kfifo.h>
+#include <linux/log2.h> /* roundup_pow_of_two */
+DEFINE_KFIFO(ml_backlog, struct memlayout *,
+               roundup_pow_of_two(CONFIG_DNUMA_BACKLOG));
+void ml_backlog_feed(struct memlayout *ml)
+{
+       struct memlayout *old_ml;
+
+       /* Evict the oldest layout to make room; never BUG() in debug code. */
+       if (kfifo_is_full(&ml_backlog) && kfifo_get(&ml_backlog, &old_ml))
+               memlayout_destroy(old_ml);
+
+       kfifo_put(&ml_backlog, (const struct memlayout **)&ml);
+}
+#elif CONFIG_DNUMA_BACKLOG < 0
+/* Unlimited backlog */
+void ml_backlog_feed(struct memlayout *ml)
+{
+       /* we never use the rme_tree, so we destroy the non-debugfs portions to
+        * save memory */
+       memlayout_destroy_mem(ml);
+}
+#else /* CONFIG_DNUMA_BACKLOG == 0 */
+/* No backlog */
+void ml_backlog_feed(struct memlayout *ml)
+{
+       memlayout_destroy(ml);
+}
+#endif
+
+static atomic64_t dnuma_moved_page_ct;
+void ml_stat_count_moved_pages(int order)
+{
+       atomic64_add(1 << order, &dnuma_moved_page_ct);
+}
+
+static atomic_t ml_seq = ATOMIC_INIT(0);
+static struct dentry *root_dentry, *current_dentry;
+#define ML_LAYOUT_NAME_SZ \
+       ((size_t)(DIV_ROUND_UP(sizeof(unsigned) * 8, 3) \
+                               + 1 + strlen("layout.")))
+#define ML_REGION_NAME_SZ ((size_t)(2 * BITS_PER_LONG / 4 + 2))
+
+static void ml_layout_name(struct memlayout *ml, char *name)
+{
+       sprintf(name, "layout.%u", ml->seq);
+}
+
+static int dfs_range_get(void *data, u64 *val)
+{
+       *val = (uintptr_t)data; /* data is the nid itself, not a pointer */
+       return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(range_fops, dfs_range_get, NULL, "%lld\n");
+
+static void _ml_dbgfs_create_range(struct dentry *base,
+               struct rangemap_entry *rme, char *name)
+{
+       struct dentry *rd;
+       sprintf(name, "%05lx-%05lx", rme->pfn_start, rme->pfn_end);
+       rd = debugfs_create_file(name, 0400, base,
+                               (void *)(uintptr_t)rme->nid, &range_fops);
+       if (!rd)
+               pr_devel("debugfs: failed to create "RME_FMT"\n",
+                               RME_EXP(rme));
+       else
+               pr_devel("debugfs: created "RME_FMT"\n", RME_EXP(rme));
+}
+
+/* Must be called with memlayout_lock held */
+static void _ml_dbgfs_set_current(struct memlayout *ml, char *name)
+{
+       ml_layout_name(ml, name);
+       debugfs_remove(current_dentry);
+       current_dentry = debugfs_create_symlink("current", root_dentry, name);
+}
+
+static void ml_dbgfs_create_layout_assume_root(struct memlayout *ml)
+{
+       char name[ML_LAYOUT_NAME_SZ];
+       ml_layout_name(ml, name);
+       WARN_ON(!root_dentry);
+       ml->d = debugfs_create_dir(name, root_dentry);
+       WARN_ON(!ml->d);
+}
+
+# if defined(CONFIG_DNUMA_DEBUGFS_WRITE)
+
+#define DEFINE_DEBUGFS_GET(___type)                                    \
+       static int debugfs_## ___type ## _get(void *data, u64 *val)     \
+       {                                                               \
+               *val = *(___type *)data;                                \
+               return 0;                                               \
+       }
+
+DEFINE_DEBUGFS_GET(u32);
+DEFINE_DEBUGFS_GET(u8);
+
+#define DEFINE_WATCHED_ATTR(___type, ___var)                   \
+       static int ___var ## _watch_set(void *data, u64 val)    \
+       {                                                       \
+               ___type old_val = *(___type *)data;             \
+               int ret = ___var ## _watch(old_val, val);       \
+               if (!ret)                                       \
+                       *(___type *)data = val;                 \
+               return ret;                                     \
+       }                                                       \
+       DEFINE_SIMPLE_ATTRIBUTE(___var ## _fops,                \
+                       debugfs_ ## ___type ## _get,            \
+                       ___var ## _watch_set, "%llu\n");
+
+#define DEFINE_ACTION_ATTR(___name)
+
+static u64 dnuma_user_start;
+static u64 dnuma_user_end;
+static u32 dnuma_user_node; /* XXX: I don't care about this var, remove? */
+static u8  dnuma_user_commit, dnuma_user_clear; /* same here */
+static struct memlayout *user_ml;
+static DEFINE_MUTEX(dnuma_user_lock);
+/* Write handler for 'node': adds the start..end PFN range -> new_val to the
+ * pending layout (created on demand); on success start/end are reset to 0.
+ * Returns 0, -EINVAL, -ENOMEM, or the memlayout_new_range() error. */
+static int dnuma_user_node_watch(u32 old_val, u32 new_val)
+{
+       int ret = 0;
+       mutex_lock(&dnuma_user_lock);
+       /* Validate first so a rejected write never creates a pending layout. */
+       if (new_val >= nr_node_ids || dnuma_user_start > dnuma_user_end) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (!user_ml)
+               user_ml = memlayout_create(ML_USER_DEBUG);
+
+       if (WARN_ON(!user_ml)) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       /* Adding the range is what a write to 'node' means; old_val is unused. */
+       ret = memlayout_new_range(user_ml, dnuma_user_start, dnuma_user_end,
+                                 new_val);
+
+       if (!ret) {
+               dnuma_user_start = 0;
+               dnuma_user_end = 0;
+       }
+out:
+       mutex_unlock(&dnuma_user_lock);
+       return ret;
+}
+
+static int dnuma_user_commit_watch(u8 old_val, u8 new_val)
+{
+       mutex_lock(&dnuma_user_lock); /* shared with node/clear handlers */
+       if (user_ml) /* any written value commits; new_val is not checked */
+               memlayout_commit(user_ml);
+       user_ml = NULL; /* the next 'node' write creates a fresh layout */
+       mutex_unlock(&dnuma_user_lock);
+       return 0;
+}
+
+static int dnuma_user_clear_watch(u8 old_val, u8 new_val)
+{
+       mutex_lock(&dnuma_user_lock);
+       if (user_ml)
+               memlayout_destroy(user_ml);
+       user_ml = NULL;
+       mutex_unlock(&dnuma_user_lock);
+       return 0;
+}
+
+DEFINE_WATCHED_ATTR(u32, dnuma_user_node);
+DEFINE_WATCHED_ATTR(u8, dnuma_user_commit);
+DEFINE_WATCHED_ATTR(u8, dnuma_user_clear);
+# endif /* defined(CONFIG_DNUMA_DEBUGFS_WRITE) */
+
+/* create the entire current memlayout.
+ * only used for the layout which exists prior to fs initialization
+ */
+static void ml_dbgfs_create_initial_layout(void)
+{
+       struct rangemap_entry *rme;
+       char name[max(ML_REGION_NAME_SZ, ML_LAYOUT_NAME_SZ)];
+       struct memlayout *old_ml, *new_ml;
+
+       new_ml = kmalloc(sizeof(*new_ml), GFP_KERNEL);
+       if (WARN(!new_ml, "memlayout allocation failed\n"))
+               return;
+
+       mutex_lock(&memlayout_lock);
+
+       old_ml = rcu_dereference_protected(pfn_to_node_map,
+                       mutex_is_locked(&memlayout_lock));
+       if (WARN_ON(!old_ml))
+               goto e_out;
+       *new_ml = *old_ml;
+
+       if (WARN_ON(new_ml->d))
+               goto e_out;
+
+       /* this assumption holds as ml_dbgfs_create_initial_layout() (this
+        * function) is only called by ml_dbgfs_create_root() */
+       ml_dbgfs_create_layout_assume_root(new_ml);
+       if (!new_ml->d)
+               goto e_out;
+
+       ml_for_each_range(new_ml, rme) {
+               _ml_dbgfs_create_range(new_ml->d, rme, name);
+       }
+
+       _ml_dbgfs_set_current(new_ml, name);
+       rcu_assign_pointer(pfn_to_node_map, new_ml);
+       mutex_unlock(&memlayout_lock);
+
+       synchronize_rcu();
+       kfree(old_ml);
+       return;
+e_out:
+       mutex_unlock(&memlayout_lock);
+       kfree(new_ml);
+}
+
+static atomic64_t ml_cache_hits;
+static atomic64_t ml_cache_misses;
+
+void ml_stat_cache_miss(void)
+{
+       atomic64_inc(&ml_cache_misses);
+}
+
+void ml_stat_cache_hit(void)
+{
+       atomic64_inc(&ml_cache_hits);
+}
+
+/* returns 0 if root_dentry has been created */
+static int ml_dbgfs_create_root(void)
+{
+       if (root_dentry)
+               return 0;
+
+       if (!debugfs_initialized()) {
+               pr_devel("debugfs not registered or disabled.\n");
+               return -EINVAL;
+       }
+
+       root_dentry = debugfs_create_dir("memlayout", NULL);
+       if (!root_dentry) {
+               pr_devel("root dir creation failed\n");
+               return -EINVAL;
+       }
+
+       /* TODO: place in a different dir? (to keep memlayout & dnuma separate)
+        */
+       /* FIXME: use debugfs_create_atomic64() [does not yet exist]. */
+       debugfs_create_u64("moved-pages", 0400, root_dentry,
+                          (uint64_t *)&dnuma_moved_page_ct.counter);
+       debugfs_create_u64("pfn-lookup-cache-misses", 0400, root_dentry,
+                          (uint64_t *)&ml_cache_misses.counter);
+       debugfs_create_u64("pfn-lookup-cache-hits", 0400, root_dentry,
+                          (uint64_t *)&ml_cache_hits.counter);
+
+# if defined(CONFIG_DNUMA_DEBUGFS_WRITE)
+       /* Set node last: on write, it adds the range. */
+       debugfs_create_x64("start", 0600, root_dentry, &dnuma_user_start);
+       debugfs_create_x64("end",   0600, root_dentry, &dnuma_user_end);
+       debugfs_create_file("node",  0200, root_dentry,
+                       &dnuma_user_node, &dnuma_user_node_fops);
+       debugfs_create_file("commit",  0200, root_dentry,
+                       &dnuma_user_commit, &dnuma_user_commit_fops);
+       debugfs_create_file("clear",  0200, root_dentry,
+                       &dnuma_user_clear, &dnuma_user_clear_fops);
+# endif
+
+       /* uses root_dentry */
+       ml_dbgfs_create_initial_layout();
+
+       return 0;
+}
+
+static void ml_dbgfs_create_layout(struct memlayout *ml)
+{
+       if (ml_dbgfs_create_root()) {
+               ml->d = NULL;
+               return;
+       }
+       ml_dbgfs_create_layout_assume_root(ml);
+}
+
+static int ml_dbgfs_init_root(void)
+{
+       ml_dbgfs_create_root();
+       return 0;
+}
+
+void ml_dbgfs_init(struct memlayout *ml)
+{
+       ml->seq = atomic_inc_return(&ml_seq) - 1;
+       ml_dbgfs_create_layout(ml);
+}
+
+void ml_dbgfs_create_range(struct memlayout *ml, struct rangemap_entry *rme)
+{
+       char name[ML_REGION_NAME_SZ];
+       if (ml->d)
+               _ml_dbgfs_create_range(ml->d, rme, name);
+}
+
+void ml_dbgfs_set_current(struct memlayout *ml)
+{
+       char name[ML_LAYOUT_NAME_SZ];
+       _ml_dbgfs_set_current(ml, name);
+}
+
+void ml_destroy_dbgfs(struct memlayout *ml)
+{
+       if (ml && ml->d)
+               debugfs_remove_recursive(ml->d);
+}
+
+static void __exit ml_dbgfs_exit(void)
+{
+       debugfs_remove_recursive(root_dentry);
+       root_dentry = NULL;
+}
+
+module_init(ml_dbgfs_init_root);
+module_exit(ml_dbgfs_exit);
diff --git a/mm/memlayout-debugfs.h b/mm/memlayout-debugfs.h
new file mode 100644
index 0000000..12dc1eb
--- /dev/null
+++ b/mm/memlayout-debugfs.h
@@ -0,0 +1,39 @@
+#ifndef LINUX_MM_MEMLAYOUT_DEBUGFS_H_
+#define LINUX_MM_MEMLAYOUT_DEBUGFS_H_
+
+#include <linux/memlayout.h>
+
+#ifdef CONFIG_DNUMA_DEBUGFS
+void ml_stat_count_moved_pages(int order);
+void ml_stat_cache_hit(void);
+void ml_stat_cache_miss(void);
+void ml_dbgfs_init(struct memlayout *ml);
+void ml_dbgfs_create_range(struct memlayout *ml, struct rangemap_entry *rme);
+void ml_destroy_dbgfs(struct memlayout *ml);
+void ml_dbgfs_set_current(struct memlayout *ml);
+void ml_backlog_feed(struct memlayout *ml);
+#else /* !defined(CONFIG_DNUMA_DEBUGFS) */
+static inline void ml_stat_count_moved_pages(int order)
+{}
+static inline void ml_stat_cache_hit(void)
+{}
+static inline void ml_stat_cache_miss(void)
+{}
+
+static inline void ml_dbgfs_init(struct memlayout *ml)
+{}
+static inline void ml_dbgfs_create_range(struct memlayout *ml,
+               struct rangemap_entry *rme)
+{}
+static inline void ml_destroy_dbgfs(struct memlayout *ml)
+{}
+static inline void ml_dbgfs_set_current(struct memlayout *ml)
+{}
+
+static inline void ml_backlog_feed(struct memlayout *ml)
+{
+       memlayout_destroy(ml);
+}
+#endif
+
+#endif
diff --git a/mm/memlayout.c b/mm/memlayout.c
index 7d2905b..45e7df6 100644
--- a/mm/memlayout.c
+++ b/mm/memlayout.c
@@ -14,6 +14,8 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 
+#include "memlayout-debugfs.h"
+
 /* protected by memlayout_lock */
 __rcu struct memlayout *pfn_to_node_map;
 DEFINE_MUTEX(memlayout_lock);
@@ -26,7 +28,7 @@ static void free_rme_tree(struct rb_root *root)
        }
 }
 
-static void ml_destroy_mem(struct memlayout *ml)
+void memlayout_destroy_mem(struct memlayout *ml)
 {
        if (!ml)
                return;
@@ -88,6 +90,8 @@ int memlayout_new_range(struct memlayout *ml, unsigned long pfn_start,
 
        rb_link_node(&rme->node, parent, new);
        rb_insert_color(&rme->node, &ml->root);
+
+       ml_dbgfs_create_range(ml, rme);
        return 0;
 }
 
@@ -104,9 +108,12 @@ int memlayout_pfn_to_nid(unsigned long pfn)
        rme = ACCESS_ONCE(ml->cache);
        if (rme && rme_bounds_pfn(rme, pfn)) {
                rcu_read_unlock();
+               ml_stat_cache_hit();
                return rme->nid;
        }
 
+       ml_stat_cache_miss();
+
        node = ml->root.rb_node;
        while (node) {
                struct rangemap_entry *rme = rb_entry(node, typeof(*rme), node);
@@ -135,7 +142,8 @@ out:
 
 void memlayout_destroy(struct memlayout *ml)
 {
-       ml_destroy_mem(ml);
+       ml_destroy_dbgfs(ml);
+       memlayout_destroy_mem(ml);
 }
 
 struct memlayout *memlayout_create(enum memlayout_type type)
@@ -153,6 +161,7 @@ struct memlayout *memlayout_create(enum memlayout_type type)
        ml->type = type;
        ml->cache = NULL;
 
+       ml_dbgfs_init(ml);
        return ml;
 }
 
@@ -163,12 +172,12 @@ void memlayout_commit(struct memlayout *ml)
        if (ml->type == ML_INITIAL) {
                if (WARN(dnuma_has_memlayout(),
                                "memlayout marked first is not first, ignoring.\n")) {
-                       memlayout_destroy(ml);
                        ml_backlog_feed(ml);
                        return;
                }
 
                mutex_lock(&memlayout_lock);
+               ml_dbgfs_set_current(ml);
                rcu_assign_pointer(pfn_to_node_map, ml);
                mutex_unlock(&memlayout_lock);
                return;
@@ -179,13 +188,16 @@ void memlayout_commit(struct memlayout *ml)
        unlock_memory_hotplug();
 
        mutex_lock(&memlayout_lock);
+
+       ml_dbgfs_set_current(ml);
+
        old_ml = rcu_dereference_protected(pfn_to_node_map,
                        mutex_is_locked(&memlayout_lock));
 
        rcu_assign_pointer(pfn_to_node_map, ml);
 
        synchronize_rcu();
-       memlayout_destroy(old_ml);
+       ml_backlog_feed(old_ml);
 
        /* Must be called only after the new value for pfn_to_node_map has
         * propogated to all tasks, otherwise some pages may lookup the old
-- 
1.8.2.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to