This commit implements David Rientjes' idea of mempressure cgroup.

The main characteristics are the same as what I tried to add to the vmevent
API; internally, it uses Mel Gorman's idea of a scanned/reclaimed ratio for
the pressure index calculation. But we don't expose the index to userland.
Instead, there are three levels of pressure:

 o low (just reclaiming, e.g. caches are draining);
 o medium (allocation cost becomes high, e.g. swapping);
 o oom (about to oom very soon).

The rationale behind exposing levels and not the raw pressure index is
described here: http://lkml.org/lkml/2012/11/16/675

A task can be in the cpuset, memcg and mempressure cgroups at the same
time, so by rearranging the tasks it is possible to watch a specific
pressure (i.e. caused by cpuset and/or memcg).

Note that while this adds the cgroups support, the code is well separated
and eventually we might add a lightweight, non-cgroups API, i.e. vmevent.
But this is another story.

Signed-off-by: Anton Vorontsov <anton.voront...@linaro.org>
---
 Documentation/cgroups/mempressure.txt |  50 ++++++
 include/linux/cgroup_subsys.h         |   6 +
 include/linux/vmstat.h                |  11 ++
 init/Kconfig                          |  12 ++
 mm/Makefile                           |   1 +
 mm/mempressure.c                      | 330 ++++++++++++++++++++++++++++++++++
 mm/vmscan.c                           |   4 +
 7 files changed, 414 insertions(+)
 create mode 100644 Documentation/cgroups/mempressure.txt
 create mode 100644 mm/mempressure.c

diff --git a/Documentation/cgroups/mempressure.txt b/Documentation/cgroups/mempressure.txt
new file mode 100644
index 0000000..dbc0aca
--- /dev/null
+++ b/Documentation/cgroups/mempressure.txt
@@ -0,0 +1,50 @@
+  Memory pressure cgroup
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+  Before using the mempressure cgroup, make sure you have it mounted:
+
+   # cd /sys/fs/cgroup/
+   # mkdir mempressure
+   # mount -t cgroup cgroup ./mempressure -o mempressure
+
+  It is possible to combine cgroups, for example you can mount memory
+  (memcg) and mempressure cgroups together:
+
+   # mount -t cgroup cgroup ./mempressure -o memory,mempressure
+
+  That way the reported pressure will honour memory cgroup limits. The
+  same goes for cpusets.
+
+  After the hierarchy is mounted, you can use the following API:
+
+  /sys/fs/cgroup/.../mempressure.level
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  To maintain the interactivity/memory allocation cost, one can use the
+  pressure level notifications, and the levels are defined like this:
+
+  The "low" level means that the system is reclaiming memory for new
+  allocations. Monitoring reclaiming activity might be useful for
+  maintaining overall system's cache level. Upon notification, the program
+  (typically "Activity Manager") might analyze vmstat and act in advance
+  (i.e. prematurely shutdown unimportant services).
+
+  The "medium" level means that the system is experiencing medium memory
+  pressure, there is some mild swapping activity. Upon this event
+  applications may decide to free any resources that can be easily
+  reconstructed or re-read from a disk.
+
+  The "oom" level means that the system is actively thrashing, it is about
+  to run out of memory (OOM) or even the in-kernel OOM killer is on its way
+  to trigger. Applications should do whatever they can to help the system.
+
+  Event control:
+    Is used to setup an eventfd with a level threshold. The argument to
+    the event control specifies the level threshold.
+  Read:
+    Reads memory pressure levels: low, medium or oom.
+  Write:
+    Not implemented.
+  Test:
+    To set up a notification:
+
+    # cgroup_event_listener ./mempressure.level low
+    ("low", "medium", "oom" are permitted.)
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index f204a7a..b9802e2 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -37,6 +37,12 @@ SUBSYS(mem_cgroup)
 
 /* */
 
+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_MEMPRESSURE)
+SUBSYS(mpc_cgroup)
+#endif
+
+/* */
+
 #if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE)
 SUBSYS(devices)
 #endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index a13291f..c1a66c7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -10,6 +10,17 @@
 
 extern int sysctl_stat_interval;
 
+struct mem_cgroup;
+#ifdef CONFIG_CGROUP_MEMPRESSURE
+extern void vmpressure(struct mem_cgroup *memcg,
+                      ulong scanned, ulong reclaimed);
+extern void vmpressure_prio(struct mem_cgroup *memcg, int prio);
+#else
+static inline void vmpressure(struct mem_cgroup *memcg,
+                             ulong scanned, ulong reclaimed) {}
+static inline void vmpressure_prio(struct mem_cgroup *memcg, int prio) {}
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a/init/Kconfig b/init/Kconfig
index 7d30240..d526249 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -891,6 +891,18 @@ config MEMCG_KMEM
          the kmem extension can use it to guarantee that no group of processes
          will ever exhaust kernel resources alone.
 
+config CGROUP_MEMPRESSURE
+       bool "Memory pressure monitor for Control Groups"
+       help
+         The memory pressure monitor cgroup provides a facility for
+         userland programs so that they could easily assist the kernel
+         with the memory management. So far the API provides simple,
+         levels-based memory pressure notifications.
+
+         For more information see Documentation/cgroups/mempressure.txt
+
+         If unsure, say N.
+
 config CGROUP_HUGETLB
        bool "HugeTLB Resource Controller for Control Groups"
        depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL
diff --git a/mm/Makefile b/mm/Makefile
index 3a46287..e69bbda 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_MEMPRESSURE) += mempressure.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/mempressure.c b/mm/mempressure.c
new file mode 100644
index 0000000..ea312bb
--- /dev/null
+++ b/mm/mempressure.c
@@ -0,0 +1,330 @@
+/*
+ * Linux VM pressure
+ *
+ * Copyright 2012 Linaro Ltd.
+ *               Anton Vorontsov <anton.voront...@linaro.org>
+ *
+ * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
+ * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/eventfd.h>
+#include <linux/swap.h>
+#include <linux/printk.h>
+
+static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r);
+
+/*
+ * Generic VM Pressure routines (no cgroups or any other API details)
+ */
+
+/*
+ * The window size is the number of scanned pages before we try to analyze
+ * the scanned/reclaimed ratio (or difference).
+ *
+ * It is used as a rate-limit tunable for the "low" level notification,
+ * and for averaging medium/oom levels. Using small window sizes can cause
+ * a lot of false positives, but too big a window size will delay the
+ * notifications.
+ */
+static const uint vmpressure_win = SWAP_CLUSTER_MAX * 16;
+static const uint vmpressure_level_med = 60;
+static const uint vmpressure_level_oom = 99;
+static const uint vmpressure_level_oom_prio = 4;
+
+enum vmpressure_levels {
+       VMPRESSURE_LOW = 0,
+       VMPRESSURE_MEDIUM,
+       VMPRESSURE_OOM,
+       VMPRESSURE_NUM_LEVELS,
+};
+
+static const char *vmpressure_str_levels[] = {
+       [VMPRESSURE_LOW] = "low",
+       [VMPRESSURE_MEDIUM] = "medium",
+       [VMPRESSURE_OOM] = "oom",
+};
+
+static enum vmpressure_levels vmpressure_level(uint pressure)
+{
+       if (pressure >= vmpressure_level_oom)
+               return VMPRESSURE_OOM;
+       else if (pressure >= vmpressure_level_med)
+               return VMPRESSURE_MEDIUM;
+       return VMPRESSURE_LOW;
+}
+
+static ulong vmpressure_calc_level(uint win, uint s, uint r)
+{
+       ulong p;
+
+       if (!s)
+               return 0;
+
+       /*
+        * We calculate the ratio (in percents) of how many pages were
+        * scanned vs. reclaimed in a given time frame (window). Note that
+        * time is in VM reclaimer's "ticks", i.e. number of pages
+        * scanned. This makes it possible to set desired reaction time
+        * and serves as a ratelimit.
+        */
+       p = win - (r * win / s);
+       p = p * 100 / win;
+
+       pr_debug("%s: %3lu  (s: %6u  r: %6u)\n", __func__, p, s, r);
+
+       return vmpressure_level(p);
+}
+
+void vmpressure(struct mem_cgroup *memcg, ulong scanned, ulong reclaimed)
+{
+       if (!scanned)
+               return;
+       mpc_vmpressure(memcg, scanned, reclaimed);
+}
+
+void vmpressure_prio(struct mem_cgroup *memcg, int prio)
+{
+       if (prio > vmpressure_level_oom_prio)
+               return;
+
+       /* OK, the prio is below the threshold, send the pre-OOM event. */
+       vmpressure(memcg, vmpressure_win, 0);
+}
+
+/*
+ * Memory pressure cgroup code
+ */
+
+struct mpc_event {
+       struct eventfd_ctx *efd;
+       enum vmpressure_levels level;
+       struct list_head node;
+};
+
+struct mpc_state {
+       struct cgroup_subsys_state css;
+
+       uint scanned;
+       uint reclaimed;
+       struct mutex sr_lock;
+
+       struct list_head events;
+       struct mutex events_lock;
+
+       struct work_struct work;
+};
+
+static struct mpc_state *wk2mpc(struct work_struct *wk)
+{
+       return container_of(wk, struct mpc_state, work);
+}
+
+static struct mpc_state *css2mpc(struct cgroup_subsys_state *css)
+{
+       return container_of(css, struct mpc_state, css);
+}
+
+static struct mpc_state *tsk2mpc(struct task_struct *tsk)
+{
+       return css2mpc(task_subsys_state(tsk, mpc_cgroup_subsys_id));
+}
+
+static struct mpc_state *cg2mpc(struct cgroup *cg)
+{
+       return css2mpc(cgroup_subsys_state(cg, mpc_cgroup_subsys_id));
+}
+
+static void mpc_event(struct mpc_state *mpc, ulong s, ulong r)
+{
+       struct mpc_event *ev;
+       int level = vmpressure_calc_level(vmpressure_win, s, r);
+
+       mutex_lock(&mpc->events_lock);
+
+       list_for_each_entry(ev, &mpc->events, node) {
+               if (level >= ev->level)
+                       eventfd_signal(ev->efd, 1);
+       }
+
+       mutex_unlock(&mpc->events_lock);
+}
+
+static void mpc_vmpressure_wk_fn(struct work_struct *wk)
+{
+       struct mpc_state *mpc = wk2mpc(wk);
+       ulong s;
+       ulong r;
+
+       mutex_lock(&mpc->sr_lock);
+       s = mpc->scanned;
+       r = mpc->reclaimed;
+       mpc->scanned = 0;
+       mpc->reclaimed = 0;
+       mutex_unlock(&mpc->sr_lock);
+
+       mpc_event(mpc, s, r);
+}
+
+static void __mpc_vmpressure(struct mpc_state *mpc, ulong s, ulong r)
+{
+       mutex_lock(&mpc->sr_lock);
+       mpc->scanned += s;
+       mpc->reclaimed += r;
+       mutex_unlock(&mpc->sr_lock);
+
+       if (s < vmpressure_win || work_pending(&mpc->work))
+               return;
+
+       schedule_work(&mpc->work);
+}
+
+static void mpc_vmpressure(struct mem_cgroup *memcg, ulong s, ulong r)
+{
+       /*
+        * There are two options for implementing cgroup pressure
+        * notifications:
+        *
+        * - Store pressure counter atomically in the task struct. Upon
+        *   hitting 'window' wake up a workqueue that will walk every
+        *   task and sum per-thread pressure into cgroup pressure (to
+        *   which the task belongs). The cons are obvious: bloats task
+        *   struct, have to walk all processes and makes pressure less
+        *   accurate (the window becomes per-thread);
+        *
+        * - Store pressure counters in per-cgroup state. This is easy and
+        *   straightforward, and that's how we do things here. But this
+        *   requires us to not put the vmpressure hooks into hotpath,
+        *   since we have to grab some locks.
+        */
+
+#ifdef CONFIG_MEMCG
+       if (memcg) {
+               struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+               struct cgroup *cg = css->cgroup;
+               struct mpc_state *mpc = cg2mpc(cg);
+
+               if (mpc)
+                       __mpc_vmpressure(mpc, s, r);
+               return;
+       }
+#endif
+       task_lock(current);
+       __mpc_vmpressure(tsk2mpc(current), s, r);
+       task_unlock(current);
+}
+
+static struct cgroup_subsys_state *mpc_css_alloc(struct cgroup *cg)
+{
+       struct mpc_state *mpc;
+
+       mpc = kzalloc(sizeof(*mpc), GFP_KERNEL);
+       if (!mpc)
+               return ERR_PTR(-ENOMEM);
+
+       mutex_init(&mpc->sr_lock);
+       mutex_init(&mpc->events_lock);
+       INIT_LIST_HEAD(&mpc->events);
+       INIT_WORK(&mpc->work, mpc_vmpressure_wk_fn);
+
+       return &mpc->css;
+}
+
+static void mpc_css_free(struct cgroup *cg)
+{
+       struct mpc_state *mpc = cg2mpc(cg);
+
+       kfree(mpc);
+}
+
+static ssize_t mpc_read_level(struct cgroup *cg, struct cftype *cft,
+                             struct file *file, char __user *buf,
+                             size_t sz, loff_t *ppos)
+{
+       struct mpc_state *mpc = cg2mpc(cg);
+       uint level;
+       const char *str;
+
+       mutex_lock(&mpc->sr_lock);
+
+       level = vmpressure_calc_level(vmpressure_win,
+                       mpc->scanned, mpc->reclaimed);
+
+       mutex_unlock(&mpc->sr_lock);
+
+       str = vmpressure_str_levels[level];
+       return simple_read_from_buffer(buf, sz, ppos, str, strlen(str));
+}
+
+static int mpc_register_level(struct cgroup *cg, struct cftype *cft,
+                             struct eventfd_ctx *eventfd, const char *args)
+{
+       struct mpc_state *mpc = cg2mpc(cg);
+       struct mpc_event *ev;
+       int lvl;
+
+       for (lvl = 0; lvl < VMPRESSURE_NUM_LEVELS; lvl++) {
+               if (!strcmp(vmpressure_str_levels[lvl], args))
+                       break;
+       }
+
+       if (lvl >= VMPRESSURE_NUM_LEVELS)
+               return -EINVAL;
+
+       ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+       if (!ev)
+               return -ENOMEM;
+
+       ev->efd = eventfd;
+       ev->level = lvl;
+
+       mutex_lock(&mpc->events_lock);
+       list_add(&ev->node, &mpc->events);
+       mutex_unlock(&mpc->events_lock);
+
+       return 0;
+}
+
+static void mpc_unregister_level(struct cgroup *cg, struct cftype *cft,
+                                struct eventfd_ctx *eventfd)
+{
+       struct mpc_state *mpc = cg2mpc(cg);
+       struct mpc_event *ev;
+
+       mutex_lock(&mpc->events_lock);
+       list_for_each_entry(ev, &mpc->events, node) {
+               if (ev->efd != eventfd)
+                       continue;
+               list_del(&ev->node);
+               kfree(ev);
+               break;
+       }
+       mutex_unlock(&mpc->events_lock);
+}
+
+static struct cftype mpc_files[] = {
+       {
+               .name = "level",
+               .read = mpc_read_level,
+               .register_event = mpc_register_level,
+               .unregister_event = mpc_unregister_level,
+       },
+       {},
+};
+
+struct cgroup_subsys mpc_cgroup_subsys = {
+       .name = "mempressure",
+       .subsys_id = mpc_cgroup_subsys_id,
+       .css_alloc = mpc_css_alloc,
+       .css_free = mpc_css_free,
+       .base_cftypes = mpc_files,
+};
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 16b42af..fed0e04 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1900,6 +1900,9 @@ restart:
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 
+       vmpressure(sc->target_mem_cgroup,
+                  sc->nr_scanned - nr_scanned, nr_reclaimed);
+
        /* reclaim/compaction might need reclaim to continue */
        if (should_continue_reclaim(lruvec, nr_reclaimed,
                                    sc->nr_scanned - nr_scanned, sc))
@@ -2122,6 +2125,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                count_vm_event(ALLOCSTALL);
 
        do {
+               vmpressure_prio(sc->target_mem_cgroup, sc->priority);
                sc->nr_scanned = 0;
                aborted_reclaim = shrink_zones(zonelist, sc);
 
-- 
1.8.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to