The per-CPU vmstat worker is a problem for -RT workloads, where
ideally the CPU is reserved entirely for the -RT application,
without interference. The worker periodically wakes up to transfer
accumulated per-CPU vmstat counters to the global counters.

To resolve the problem, introduce a userspace-configurable per-CPU
vmstat threshold: by default the VM code calculates the per-CPU
vmstat thresholds itself; this tunable allows userspace to override
those threshold values.

The patch below adds documentation describing the tunables in more
detail.
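
As a rough sketch of the intended interface (the CPU number and the
value are chosen arbitrarily for illustration; the sysfs path is the
one created by this patch):

  # Pin cpu2's vmstat threshold to the minimum, so its counters
  # are flushed to the global counters immediately:
  echo 1 > /sys/devices/system/cpu/cpu2/vmstat/vmstat_threshold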

Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>

---
 Documentation/vm/vmstat_thresholds.txt |   78 +++++++++++++
 mm/vmstat.c                            |  188 +++++++++++++++++++++++++++++----
 2 files changed, 247 insertions(+), 19 deletions(-)

Index: linux-2.6-git-disable-vmstat-worker/mm/vmstat.c
===================================================================
--- linux-2.6-git-disable-vmstat-worker.orig/mm/vmstat.c        2017-04-25 07:39:13.941019853 -0300
+++ linux-2.6-git-disable-vmstat-worker/mm/vmstat.c     2017-05-03 10:59:43.495714336 -0300
@@ -91,8 +91,16 @@
 EXPORT_SYMBOL(vm_zone_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
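+/*
+ * Userspace override of the per-CPU vmstat threshold; 0 means the
+ * kernel-calculated threshold is used.
+ */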
+struct vmstat_uparam {
+       atomic_t user_stat_thresh;
+};
+
+static DEFINE_PER_CPU(struct vmstat_uparam, vmstat_uparam);
+
 #ifdef CONFIG_SMP
 
+#define MAX_THRESHOLD 125
+
 int calculate_pressure_threshold(struct zone *zone)
 {
        int threshold;
@@ -110,9 +118,9 @@
        threshold = max(1, (int)(watermark_distance / num_online_cpus()));
 
        /*
-        * Maximum threshold is 125
+        * Maximum threshold is MAX_THRESHOLD == 125
         */
-       threshold = min(125, threshold);
+       threshold = min(MAX_THRESHOLD, threshold);
 
        return threshold;
 }
@@ -188,15 +196,31 @@
                threshold = calculate_normal_threshold(zone);
 
                for_each_online_cpu(cpu) {
-                       int pgdat_threshold;
+                       int pgdat_threshold, ustat_thresh;
+                       struct vmstat_uparam *vup;
 
-                       per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-                                                       = threshold;
+                       struct per_cpu_nodestat *pcp;
+                       struct per_cpu_pageset *p;
+
+                       p = per_cpu_ptr(zone->pageset, cpu);
+
+                       vup = &per_cpu(vmstat_uparam, cpu);
+                       ustat_thresh = atomic_read(&vup->user_stat_thresh);
+
+                       if (ustat_thresh)
+                               p->stat_threshold = ustat_thresh;
+                       else
+                               p->stat_threshold = threshold;
+
+                       pcp = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
 
                        /* Base nodestat threshold on the largest populated zone. */
-                       pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
-                       per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
-                               = max(threshold, pgdat_threshold);
+                       pgdat_threshold = pcp->stat_threshold;
+                       if (ustat_thresh)
+                               pcp->stat_threshold = ustat_thresh;
+                       else
+                               pcp->stat_threshold = max(threshold,
+                                                         pgdat_threshold);
                }
 
                /*
@@ -226,9 +250,24 @@
                        continue;
 
                threshold = (*calculate_pressure)(zone);
-               for_each_online_cpu(cpu)
+               for_each_online_cpu(cpu) {
+                       int t, ustat_thresh;
+                       struct vmstat_uparam *vup;
+
+                       vup = &per_cpu(vmstat_uparam, cpu);
+                       ustat_thresh = atomic_read(&vup->user_stat_thresh);
+                       t = threshold;
+
+                       /*
+                        * Use min() because memory pressure can make
+                        * the calculated pressure threshold smaller
+                        * than the user-set value.
+                        */
+                       if (ustat_thresh)
+                               t = min(threshold, ustat_thresh);
+
                        per_cpu_ptr(zone->pageset, cpu)->stat_threshold
-                                                       = threshold;
+                                                       = t;
+               }
        }
 }
 
@@ -249,7 +288,7 @@
 
        t = __this_cpu_read(pcp->stat_threshold);
 
-       if (unlikely(x > t || x < -t)) {
+       if (unlikely(x >= t || x <= -t)) {
                zone_page_state_add(x, zone, item);
                x = 0;
        }
@@ -269,7 +308,7 @@
 
        t = __this_cpu_read(pcp->stat_threshold);
 
-       if (unlikely(x > t || x < -t)) {
+       if (unlikely(x >= t || x <= -t)) {
                node_page_state_add(x, pgdat, item);
                x = 0;
        }
@@ -308,7 +347,7 @@
 
        v = __this_cpu_inc_return(*p);
        t = __this_cpu_read(pcp->stat_threshold);
-       if (unlikely(v > t)) {
+       if (unlikely(v >= t)) {
                s8 overstep = t >> 1;
 
                zone_page_state_add(v + overstep, zone, item);
@@ -324,7 +363,7 @@
 
        v = __this_cpu_inc_return(*p);
        t = __this_cpu_read(pcp->stat_threshold);
-       if (unlikely(v > t)) {
+       if (unlikely(v >= t)) {
                s8 overstep = t >> 1;
 
                node_page_state_add(v + overstep, pgdat, item);
@@ -352,7 +391,7 @@
 
        v = __this_cpu_dec_return(*p);
        t = __this_cpu_read(pcp->stat_threshold);
-       if (unlikely(v < - t)) {
+       if (unlikely(v <= - t)) {
                s8 overstep = t >> 1;
 
                zone_page_state_add(v - overstep, zone, item);
@@ -368,7 +407,7 @@
 
        v = __this_cpu_dec_return(*p);
        t = __this_cpu_read(pcp->stat_threshold);
-       if (unlikely(v < - t)) {
+       if (unlikely(v <= - t)) {
                s8 overstep = t >> 1;
 
                node_page_state_add(v - overstep, pgdat, item);
@@ -426,7 +465,7 @@
                o = this_cpu_read(*p);
                n = delta + o;
 
-               if (n > t || n < -t) {
+               if (n >= t || n <= -t) {
                        int os = overstep_mode * (t >> 1) ;
 
                        /* Overflow must be added to zone counters */
@@ -483,7 +522,7 @@
                o = this_cpu_read(*p);
                n = delta + o;
 
-               if (n > t || n < -t) {
+               if (n >= t || n <= -t) {
                        int os = overstep_mode * (t >> 1) ;
 
                        /* Overflow must be added to node counters */
@@ -1696,6 +1735,96 @@
                round_jiffies_relative(sysctl_stat_interval));
 }
 
+#ifdef CONFIG_SYSFS
+
+static ssize_t vmstat_thresh_show(struct device *dev,
+                                 struct device_attribute *attr, char *buf)
+{
+       int ret;
+       struct vmstat_uparam *vup;
+       unsigned int cpu = dev->id;
+
+       preempt_disable();
+
+       vup = &per_cpu(vmstat_uparam, cpu);
+       ret = sprintf(buf, "%d\n", atomic_read(&vup->user_stat_thresh));
+
+       preempt_enable();
+
+       return ret;
+}
+
+static ssize_t vmstat_thresh_store(struct device *dev,
+                                  struct device_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       int ret, val;
+       unsigned int cpu = dev->id;
+       struct vmstat_uparam *vup;
+
+       ret = sscanf(buf, "%d", &val);
+       if (ret != 1 || val < 0 || val > MAX_THRESHOLD)
+               return -EINVAL;
+
+       preempt_disable();
+
+       if (cpu_online(cpu)) {
+               vup = &per_cpu(vmstat_uparam, cpu);
+               atomic_set(&vup->user_stat_thresh, val);
+       } else {
+               count = -EINVAL;
+       }
+
+       preempt_enable();
+
+       return count;
+}
+
+static struct device_attribute vmstat_threshold_attr =
+       __ATTR(vmstat_threshold, 0644, vmstat_thresh_show, vmstat_thresh_store);
+
+static struct attribute *vmstat_attrs[] = {
+       &vmstat_threshold_attr.attr,
+       NULL
+};
+
+static struct attribute_group vmstat_attr_group = {
+       .attrs  =  vmstat_attrs,
+       .name   = "vmstat"
+};
+
+static int vmstat_thresh_cpu_online(unsigned int cpu)
+{
+       struct device *dev = get_cpu_device(cpu);
+       int ret;
+
+       ret = sysfs_create_group(&dev->kobj, &vmstat_attr_group);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+static int vmstat_thresh_cpu_down_prep(unsigned int cpu)
+{
+       struct device *dev = get_cpu_device(cpu);
+
+       sysfs_remove_group(&dev->kobj, &vmstat_attr_group);
+       return 0;
+}
+
+static void init_vmstat_sysfs(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
+
+               atomic_set(&vup->user_stat_thresh, 0);
+       }
+}
+
+#endif /* CONFIG_SYSFS */
+
 static void __init init_cpu_node_state(void)
 {
        int node;
@@ -1723,9 +1852,12 @@
 {
        const struct cpumask *node_cpus;
        int node;
+       struct vmstat_uparam *vup = &per_cpu(vmstat_uparam, cpu);
 
        node = cpu_to_node(cpu);
 
+       atomic_set(&vup->user_stat_thresh, 0);
+
        refresh_zone_stat_thresholds();
        node_cpus = cpumask_of_node(node);
        if (cpumask_weight(node_cpus) > 0)
@@ -1735,7 +1867,7 @@
        return 0;
 }
 
-#endif
+#endif /* CONFIG_SMP */
 
 struct workqueue_struct *mm_percpu_wq;
 
@@ -1772,6 +1904,24 @@
 #endif
 }
 
+static int __init init_mm_internals_late(void)
+{
+#ifdef CONFIG_SYSFS
+       int ret;
+
+       init_vmstat_sysfs();
+
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/vmstat_thresh:online",
+                                       vmstat_thresh_cpu_online,
+                                       vmstat_thresh_cpu_down_prep);
+       if (ret < 0)
+               pr_err("vmstat_thresh: failed to register 'online' hotplug state\n");
+#endif
+       return 0;
+}
+
+late_initcall(init_mm_internals_late);
+
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
 
 /*
Index: linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt
===================================================================
--- /dev/null   1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-git-disable-vmstat-worker/Documentation/vm/vmstat_thresholds.txt  2017-05-02 13:48:45.946840708 -0300
@@ -0,0 +1,78 @@
+Userspace configurable vmstat thresholds
+========================================
+
+This document describes the tunables that control the
+per-CPU vmstat thresholds and the per-CPU vmstat worker
+thread.
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_threshold:
+
+This file contains the per-CPU vmstat threshold.
+This value is the maximum that a single per-CPU vmstat statistic
+can accumulate before being transferred to the global counters.
+
+A value of 0 indicates that the threshold is set
+by the in-kernel algorithm.
+
+A value other than 0 indicates that this particular
+value is used for vmstat_threshold.
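+
+For example, reading the file for cpu1 (any CPU; chosen here for
+illustration) on a freshly booted system reports the default of 0,
+i.e. the kernel-managed threshold:
+
+$ cat /sys/devices/system/cpu/cpu1/vmstat/vmstat_threshold
+0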
+
+/sys/devices/system/cpu/cpuN/vmstat/vmstat_worker:
+
+Enable/disable the per-CPU vmstat worker.
+
+What does the vmstat_threshold value mean? What are the implications
+of changing this value? What is the difference between choosing 1, 2,
+3 or 125?
+====================================================================
+
+It is the maximum value for a per-CPU vmstat statistics counter to
+hold; once the threshold is reached, the statistics are transferred
+to the global counters:
+
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                long delta)
+{
+        struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+        s8 __percpu *p = pcp->vm_node_stat_diff + item;
+        long x;
+        long t;
+
+        x = delta + __this_cpu_read(*p);
+
+        t = __this_cpu_read(pcp->stat_threshold);
+
+        if (unlikely(x >= t || x <= -t)) {
+                node_page_state_add(x, pgdat, item);
+                x = 0;
+        }
+        __this_cpu_write(*p, x);
+}
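+
+For example, with vmstat_threshold set to 1, any nonzero x satisfies
+the comparison above, so every update is propagated to the global
+counter immediately and nothing accumulates in the per-CPU counter.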
+
+Increasing the threshold value does two things:
+        1) It decreases the number of inter-processor accesses
+           to the shared global counters.
+        2) It increases how far the global counters can drift
+           from the actual current values (see the example below).
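+
+As a rough illustration, with hypothetical numbers: with a threshold
+of 125 on a 16-CPU machine, each CPU may hold up to 125 uncommitted
+events per statistic, so a global counter can lag the true value by
+up to 125 * 16 = 2000 events.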
+
+
+Usage example:
+=============
+
+In a realtime system, the worker thread waking up and executing
+vmstat_update can be an undesired source of latency.
+
+To prevent the worker thread from waking up and executing
+vmstat_update on cpu1, for example, perform the following steps:
+
+
+$ cd /sys/devices/system/cpu/cpu1/vmstat/
+
+# Set vmstat_threshold to 1 for cpu1, so that no
+# vmstat statistics accumulate in cpu1's per-cpu
+# counters; instead they are immediately transferred
+# to the global counters.
+
+$ echo 1 > vmstat_threshold
+
+# Disable vmstat_update worker for cpu1:
+$ echo 0 > vmstat_worker
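+
+# Optionally verify that the settings took effect; reading the
+# threshold back reports the user-configured value:
+$ cat vmstat_threshold
+1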
+