[PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node

2017-12-18 Thread Kemi Wang
There is not really any use in getting NUMA stats separated by zone; the
current per-zone NUMA stats are only consumed in /proc/zoneinfo. For code
cleanup purposes, move the NUMA stats from per-zone to per-node and reuse
the existing per-cpu infrastructure.
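
For illustration, a minimal sketch (not a hunk from this patch) of what the
update side can look like once NUMA_HIT and friends are node_stat_items:
the counter simply goes through the generic per-node, per-cpu vmstat
helper. The function name count_numa_hit() is made up for this example.

	/*
	 * Illustrative only: with NUMA_HIT defined as a node_stat_item,
	 * the counter update can use the standard per-node, per-cpu
	 * helper.  Like zone_statistics(), this assumes interrupts are
	 * disabled.
	 */
	static void count_numa_hit(struct zone *z)
	{
		__inc_node_state(z->zone_pgdat, NUMA_HIT);
	}

The read side is visible in the node_read_numastat() hunk below, which
switches from sum_zone_numa_state() to node_page_state().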

Suggested-by: Andi Kleen <a...@linux.intel.com>
Suggested-by: Michal Hocko <mho...@kernel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 drivers/base/node.c|  23 +++
 include/linux/mmzone.h |  27 
 include/linux/vmstat.h |  31 -
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  16 +++--
 mm/vmstat.c| 177 +
 6 files changed, 46 insertions(+), 230 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index ee090ab..a045ea1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev,
   "interleave_hit %lu\n"
   "local_node %lu\n"
   "other_node %lu\n",
-  sum_zone_numa_state(dev->id, NUMA_HIT),
-  sum_zone_numa_state(dev->id, NUMA_MISS),
-  sum_zone_numa_state(dev->id, NUMA_FOREIGN),
-  sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
-  sum_zone_numa_state(dev->id, NUMA_LOCAL),
-  sum_zone_numa_state(dev->id, NUMA_OTHER));
+  node_page_state(NODE_DATA(dev->id), NUMA_HIT),
+  node_page_state(NODE_DATA(dev->id), NUMA_MISS),
+  node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
+  node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
+  node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
+  node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
 }
+
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
 static ssize_t node_read_vmstat(struct device *dev,
@@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev,
n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
 sum_zone_node_page_state(nid, i));
 
-#ifdef CONFIG_NUMA
-   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-   n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-sum_zone_numa_state(nid, i));
-#endif
-
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
-NR_VM_NUMA_STAT_ITEMS],
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
 node_page_state(pgdat, i));
 
return n;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 67f2e3c..c06d880 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -115,20 +115,6 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
-#ifdef CONFIG_NUMA
-enum numa_stat_item {
-   NUMA_HIT,   /* allocated in intended node */
-   NUMA_MISS,  /* allocated in non intended node */
-   NUMA_FOREIGN,   /* was intended here, hit elsewhere */
-   NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
-   NUMA_LOCAL, /* allocation from local node */
-   NUMA_OTHER, /* allocation from other node */
-   NR_VM_NUMA_STAT_ITEMS
-};
-#else
-#define NR_VM_NUMA_STAT_ITEMS 0
-#endif
-
 enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
@@ -151,7 +137,18 @@ enum zone_stat_item {
NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
-   NR_LRU_BASE,
+#ifdef CONFIG_NUMA
+   NUMA_HIT,   /* allocated in intended node */
+   NUMA_MISS,  /* allocated in non intended node */
+   NUMA_FOREIGN,   /* was intended here, hit elsewhere */
+   NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
+   NUMA_LOCAL, /* allocation from local node */
+   NUMA_OTHER, /* allocation from other node */
+   NR_VM_NUMA_STAT_ITEMS,
+#else
+#defineNR_VM_NUMA_STAT_ITEMS 0
+#endif
+   NR_LRU_BASE = NR_VM_NUMA_STAT_ITEMS,
NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
NR_ACTIVE_ANON, /*  " " "   "   " */
NR_INACTIVE_FILE,   /*  " " "   "   " */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1779c98..80bf290 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -118,37 +118,8 @@ static inline vo

[PATCH v2 5/5] mm: Rename zone_statistics() to numa_statistics()

2017-12-18 Thread Kemi Wang
zone_statistics() updates NUMA counters, but NUMA statistics have been
separated from the zone statistics framework, so the function name is
confusing. Rename it to numa_statistics() and update its call sites
accordingly.

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 mm/page_alloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 81e8d8f..f7583de 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2790,7 +2790,7 @@ int __isolate_free_page(struct page *page, unsigned int 
order)
  *
  * Must be called with interrupts disabled.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void numa_statistics(struct zone *preferred_zone, struct zone *z)
 {
 #ifdef CONFIG_NUMA
int preferred_nid = preferred_zone->node;
@@ -2854,7 +2854,7 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
page = __rmqueue_pcplist(zone,  migratetype, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   numa_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
return page;
@@ -2902,7 +2902,7 @@ struct page *rmqueue(struct zone *preferred_zone,
  get_pcppage_migratetype(page));
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   numa_statistics(preferred_zone, zone);
local_irq_restore(flags);
 
 out:
-- 
2.7.4



[PATCH v2 4/5] mm: use node_page_state_snapshot to avoid deviation

2017-12-18 Thread Kemi Wang
To avoid deviation, this patch uses node_page_state_snapshot instead of
node_page_state when querying node page stats, e.g. for:
 cat /proc/zoneinfo
 cat /sys/devices/system/node/node*/vmstat
 cat /sys/devices/system/node/node*/numastat

As these are slow paths that are not read frequently, the extra cost of the
snapshot is not a concern.
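
For reference, node_page_state_snapshot() is roughly the following
(paraphrased from include/linux/vmstat.h; the exact code may differ between
kernel versions): it folds the not-yet-flushed per-cpu deltas on top of the
global atomic counter, which is what removes the deviation mentioned above.

	static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
						enum node_stat_item item)
	{
		long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
		int cpu;

		/* add the per-cpu diffs that have not been folded back yet */
		for_each_online_cpu(cpu)
			x += per_cpu_ptr(pgdat->per_cpu_nodestats,
					 cpu)->vm_node_stat_diff[item];

		if (x < 0)
			x = 0;
#endif
		return x;
	}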

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 drivers/base/node.c | 17 ++---
 mm/vmstat.c |  2 +-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index a045ea1..cf303f8 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -169,12 +169,15 @@ static ssize_t node_read_numastat(struct device *dev,
   "interleave_hit %lu\n"
   "local_node %lu\n"
   "other_node %lu\n",
-  node_page_state(NODE_DATA(dev->id), NUMA_HIT),
-  node_page_state(NODE_DATA(dev->id), NUMA_MISS),
-  node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
-  node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
-  node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
-  node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
+  node_page_state_snapshot(NODE_DATA(dev->id), NUMA_HIT),
+  node_page_state_snapshot(NODE_DATA(dev->id), NUMA_MISS),
+  node_page_state_snapshot(NODE_DATA(dev->id),
+  NUMA_FOREIGN),
+  node_page_state_snapshot(NODE_DATA(dev->id),
+  NUMA_INTERLEAVE_HIT),
+  node_page_state_snapshot(NODE_DATA(dev->id), NUMA_LOCAL),
+  node_page_state_snapshot(NODE_DATA(dev->id),
+  NUMA_OTHER));
 }
 
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
@@ -194,7 +197,7 @@ static ssize_t node_read_vmstat(struct device *dev,
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-node_page_state(pgdat, i));
+node_page_state_snapshot(pgdat, i));
 
return n;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 64e08ae..d65f28d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1466,7 +1466,7 @@ static void zoneinfo_show_print(struct seq_file *m, 
pg_data_t *pgdat,
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n  %-12s %lu",
vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-   node_page_state(pgdat, i));
+   node_page_state_snapshot(pgdat, i));
}
}
seq_printf(m,
-- 
2.7.4



[PATCH v2 2/5] mm: Extends local cpu counter vm_diff_nodestat from s8 to s16

2017-12-18 Thread Kemi Wang
The s8 type used for the vm_node_stat_diff[] local per-cpu counters caps
the per-cpu batching and therefore forces frequent global counter updates,
especially for monotonically increasing counters such as the NUMA counters
as the number of cpus/nodes grows. This patch extends the type of
vm_node_stat_diff from s8 to s16 without any functional change.

                                  before  after
 sizeof(struct per_cpu_nodestat)      28     68
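
For scale (a rough estimate, not a number measured for this patch): the
per-cpu diff together with the overstep scheme has to fit in the counter
type, so with s8 the effective stat_threshold is capped at roughly 125 and
each cpu spills into the global atomic at least once per ~125 events; with
s16 the same scheme can defer on the order of 32K events per cpu, which is
what the next patch exploits with its (S16_MAX - 2) NUMA threshold.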

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 include/linux/mmzone.h |  4 ++--
 mm/vmstat.c| 16 
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c06d880..2da6b6f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -289,8 +289,8 @@ struct per_cpu_pageset {
 };
 
 struct per_cpu_nodestat {
-   s8 stat_threshold;
-   s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
+   s16 stat_threshold;
+   s16 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
 };
 
 #endif /* !__GENERATING_BOUNDS.H */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1dd12ae..9c681cc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -332,7 +332,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum 
node_stat_item item,
long delta)
 {
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-   s8 __percpu *p = pcp->vm_node_stat_diff + item;
+   s16 __percpu *p = pcp->vm_node_stat_diff + item;
long x;
long t;
 
@@ -390,13 +390,13 @@ void __inc_zone_state(struct zone *zone, enum 
zone_stat_item item)
 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 {
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-   s8 __percpu *p = pcp->vm_node_stat_diff + item;
-   s8 v, t;
+   s16 __percpu *p = pcp->vm_node_stat_diff + item;
+   s16 v, t;
 
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
-   s8 overstep = t >> 1;
+   s16 overstep = t >> 1;
 
node_page_state_add(v + overstep, pgdat, item);
__this_cpu_write(*p, -overstep);
@@ -434,13 +434,13 @@ void __dec_zone_state(struct zone *zone, enum 
zone_stat_item item)
 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 {
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-   s8 __percpu *p = pcp->vm_node_stat_diff + item;
-   s8 v, t;
+   s16 __percpu *p = pcp->vm_node_stat_diff + item;
+   s16 v, t;
 
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
-   s8 overstep = t >> 1;
+   s16 overstep = t >> 1;
 
node_page_state_add(v - overstep, pgdat, item);
__this_cpu_write(*p, overstep);
@@ -533,7 +533,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
enum node_stat_item item, int delta, int overstep_mode)
 {
struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-   s8 __percpu *p = pcp->vm_node_stat_diff + item;
+   s16 __percpu *p = pcp->vm_node_stat_diff + item;
long o, n, t, z;
 
do {
-- 
2.7.4



[PATCH v2 3/5] mm: enlarge NUMA counters threshold size

2017-12-18 Thread Kemi Wang
We have seen significant overhead from cache bouncing caused by NUMA
counter updates in multi-threaded page allocation. See commit 1d90ca897cb0
("mm: update NUMA counter threshold size") for more details.

This patch fixes the NUMA counter threshold at (S16_MAX - 2) and uses that
threshold, rather than the regular per-cpu stat_threshold, when folding
NUMA items into the global node page stats.
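
A rough estimate of the effect (assuming the default per-cpu stat_threshold
tops out around 125): with the NUMA threshold fixed at S16_MAX - 2 = 32765,
each cpu folds its delta into the global atomic counter roughly 1/260th as
often for these monotonically increasing counters, which is where the
reduction in cache-line bouncing comes from.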

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 mm/vmstat.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9c681cc..64e08ae 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -30,6 +30,8 @@
 
 #include "internal.h"
 
+#define VM_NUMA_STAT_THRESHOLD (S16_MAX - 2)
+
 #ifdef CONFIG_NUMA
 int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
 
@@ -394,7 +396,11 @@ void __inc_node_state(struct pglist_data *pgdat, enum 
node_stat_item item)
s16 v, t;
 
v = __this_cpu_inc_return(*p);
-   t = __this_cpu_read(pcp->stat_threshold);
+   if (item >= NR_VM_NUMA_STAT_ITEMS)
+   t = __this_cpu_read(pcp->stat_threshold);
+   else
+   t = VM_NUMA_STAT_THRESHOLD;
+
if (unlikely(v > t)) {
s16 overstep = t >> 1;
 
@@ -549,7 +555,10 @@ static inline void mod_node_state(struct pglist_data 
*pgdat,
 * Most of the time the thresholds are the same anyways
 * for all cpus in a node.
 */
-   t = this_cpu_read(pcp->stat_threshold);
+   if (item >= NR_VM_NUMA_STAT_ITEMS)
+   t = this_cpu_read(pcp->stat_threshold);
+   else
+   t = VM_NUMA_STAT_THRESHOLD;
 
o = this_cpu_read(*p);
n = delta + o;
-- 
2.7.4



[PATCH v2 0/5] mm: NUMA stats code cleanup and enhancement

2017-12-18 Thread Kemi Wang
The existing implementation of NUMA counters is per logical CPU along with
zone->vm_numa_stat[] separated by zone, plus a global numa counter array
vm_numa_stat[]. However, unlike the other vmstat counters, NUMA stats don't
affect the system's decisions and are only consumed when reading from /proc
and /sys. Also, nodes usually have only a single zone, except for node 0,
and there isn't really any use case that needs these hit counts separated
by zone.

Therefore, we can migrate the implementation of numa stats from per-zone to
per-node (as suggested by Andi Kleen), and reuse the existing per-cpu
infrastructure with a little enhancement for NUMA stats. In this way, we
can get rid of the special handling for NUMA stats and keep the performance
gain at the same time. With this patch series, about 170 lines of code can
be saved.

The first patch migrates NUMA stats from per-zone to per-node using the
existing per-cpu infrastructure. There is a small user-visible change when
reading /proc/zoneinfo, shown below:
     Before                            After
     Node 0, zone      DMA             Node 0, zone      DMA
       per-node stats                    per-node stats
       nr_inactive_anon 7244             *numa_hit        98665086*
       nr_active_anon   177064           *numa_miss       0*
       ...                               *numa_foreign    0*
       nr_bounce        0                *numa_interleave 21059*
       nr_free_cma      0                *numa_local      98665086*
       *numa_hit        0*               *numa_other      0*
       *numa_miss       0*               nr_inactive_anon 20055
       *numa_foreign    0*               nr_active_anon   389771
       *numa_interleave 0*               ...
       *numa_local      0*               nr_bounce        0
       *numa_other      0*               nr_free_cma      0

The second patch extends the local cpu counter vm_node_stat_diff from s8 to
s16. It does not introduce any functional change.

The third patch uses a large and constant threshold size for NUMA counters
to reduce how often the global NUMA counters are updated.

The fourth patch uses node_page_state_snapshot instead of node_page_state
when querying node stats (e.g. cat /sys/devices/system/node/node*/vmstat).
The only difference is that node_page_state_snapshot also includes the
values still sitting in the local per-cpu counters.

The last patch renames zone_statistics() to numa_statistics().

Finally, my heartiest appreciation goes to Michal Hocko for suggesting the
reuse of the existing per-cpu infrastructure, which made this series much
better than before.

Changelog:
  v1->v2:
  a) enhance the existing per-cpu infrastructure for node page stats by
  extending the local cpu counters vm_node_stat_diff from s8 to s16
  b) reuse the per-cpu infrastructure for NUMA stats

Kemi Wang (5):
  mm: migrate NUMA stats from per-zone to per-node
  mm: Extends local cpu counter vm_diff_nodestat from s8 to s16
  mm: enlarge NUMA counters threshold size
  mm: use node_page_state_snapshot to avoid deviation
  mm: Rename zone_statistics() to numa_statistics()

 drivers/base/node.c|  28 +++
 include/linux/mmzone.h |  31 
 include/linux/vmstat.h |  31 
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  22 +++---
 mm/vmstat.c| 206 +
 6 files changed, 74 insertions(+), 246 deletions(-)

-- 
2.7.4



[PATCH 2/2] mm: Rename zone_statistics() to numa_statistics()

2017-11-27 Thread Kemi Wang
NUMA statistics have been separated from the zone statistics framework, but
zone_statistics() still updates NUMA counters, so the function name is
confusing. Rename it to numa_statistics() and update its call sites
accordingly.

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 mm/page_alloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 142e1ba..61fa717 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2783,7 +2783,7 @@ int __isolate_free_page(struct page *page, unsigned int 
order)
  *
  * Must be called with interrupts disabled.
  */
-static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
+static inline void numa_statistics(struct zone *preferred_zone, struct zone *z)
 {
 #ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
@@ -2845,7 +2845,7 @@ static struct page *rmqueue_pcplist(struct zone 
*preferred_zone,
page = __rmqueue_pcplist(zone,  migratetype, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   numa_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
return page;
@@ -2893,7 +2893,7 @@ struct page *rmqueue(struct zone *preferred_zone,
  get_pcppage_migratetype(page));
 
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-   zone_statistics(preferred_zone, zone);
+   numa_statistics(preferred_zone, zone);
local_irq_restore(flags);
 
 out:
-- 
2.7.4



[PATCH 1/2] mm: NUMA stats code cleanup and enhancement

2017-11-27 Thread Kemi Wang
The existing implementation of NUMA counters is per logical CPU along with
zone->vm_numa_stat[] separated by zone, plus a global numa counter array
vm_numa_stat[]. However, unlike the other vmstat counters, numa stats don't
affect the system's decisions and are only read from /proc and /sys; that
is a slow-path operation which can likely tolerate higher overhead.
Additionally, nodes usually have only a single zone, except for node 0, and
there isn't really any use case that needs these hit counts separated by
zone.

Therefore, we can migrate the implementation of numa stats from per-zone to
per-node, and get rid of these global numa counters. It is good enough to
keep everything in a per-cpu pointer of type u64 and sum the values up when
needed, as suggested by Andi Kleen. That helps code cleanup and enhancement
(e.g. it saves more than 130 lines of code); a minimal sketch of the idea
follows.
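
A hedged sketch of that idea (the identifiers below are illustrative, not
the exact ones introduced by this patch): the fast path only touches a
cpu-local u64, and the slow-path readers walk all cpus and sum.

	static DEFINE_PER_CPU(u64, numa_hit_count);	/* one counter per cpu */

	static inline void count_numa_hit(void)
	{
		/* fast path: cpu-local increment, no shared atomic */
		this_cpu_inc(numa_hit_count);
	}

	static u64 sum_numa_hit(void)
	{
		/* slow path: /proc and /sys readers sum over all cpus */
		u64 sum = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			sum += per_cpu(numa_hit_count, cpu);
		return sum;
	}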

With this patch, we see a 1.8% (335->329) drop in CPU cycles per single
page allocation and deallocation running concurrently with 112 threads,
tested on a 2-socket Skylake platform using Jesper's page_bench03
benchmark.

Benchmark provided by Jesper D. Brouer (loop count increased to 1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

Also, it does not cause an obvious latency increase when reading /proc and
/sys on a 2-socket Skylake platform. Latency shown by the time command:

                                            base           head
 /proc/vmstat                               sys 0m0.001s   sys 0m0.001s
 /sys/devices/system/node/node*/numastat    sys 0m0.001s   sys 0m0.000s

We need not worry about this much, as it is a slow path and will not be
read frequently.

Suggested-by: Andi Kleen <a...@linux.intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 drivers/base/node.c|  14 ++---
 include/linux/mmzone.h |   2 -
 include/linux/vmstat.h |  61 +-
 mm/page_alloc.c|   7 +++
 mm/vmstat.c| 167 -
 5 files changed, 56 insertions(+), 195 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index ee090ab..0be5fbd 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -169,12 +169,12 @@ static ssize_t node_read_numastat(struct device *dev,
   "interleave_hit %lu\n"
   "local_node %lu\n"
   "other_node %lu\n",
-  sum_zone_numa_state(dev->id, NUMA_HIT),
-  sum_zone_numa_state(dev->id, NUMA_MISS),
-  sum_zone_numa_state(dev->id, NUMA_FOREIGN),
-  sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
-  sum_zone_numa_state(dev->id, NUMA_LOCAL),
-  sum_zone_numa_state(dev->id, NUMA_OTHER));
+  node_numa_state_snapshot(dev->id, NUMA_HIT),
+  node_numa_state_snapshot(dev->id, NUMA_MISS),
+  node_numa_state_snapshot(dev->id, NUMA_FOREIGN),
+  node_numa_state_snapshot(dev->id, NUMA_INTERLEAVE_HIT),
+  node_numa_state_snapshot(dev->id, NUMA_LOCAL),
+  node_numa_state_snapshot(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
@@ -194,7 +194,7 @@ static ssize_t node_read_vmstat(struct device *dev,
for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-sum_zone_numa_state(nid, i));
+node_numa_state_snapshot(nid, i));
 #endif
 
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 67f2e3c..b2d264f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -283,7 +283,6 @@ struct per_cpu_pageset {
struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
s8 expire;
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 #ifdef CONFIG_SMP
s8 stat_threshold;
@@ -504,7 +503,6 @@ struct zone {
ZONE_PADDING(_pad3_)
/* Zone statistics */
atomic_long_t   vm_stat[NR_VM_ZONE_STAT_ITEMS];
-   atomic_long_t   vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
 } cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1779c98..7383d66 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -118,36 +118,8 @@ static inline void vm_events_fold_cpu(int cpu)
  * Zone and node-based page accounting with per cpu differentials.
  */
 extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
-extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
 extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
-
-#ifdef CONFIG_NUMA
-static inline void zone_numa_state_add(long x, struct zone *zone,
-  

[PATCH v2] buffer: Avoid setting buffer bits that are already set

2017-10-23 Thread Kemi Wang
It's expensive to set buffer flags that are already set, because that
causes a costly cache line transition.

A common case is setting the "verified" flag during ext4 writes.
This patch checks for the flag being set first.
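
For the ext4 "verified" case mentioned above, the generated helper after
this change looks roughly like the snippet below. This is a sketch of what
BUFFER_FNS(Verified, verified) expands to, not literal preprocessor output,
and it assumes ext4's BH_Verified flag as the example bit.

	static __always_inline void set_buffer_verified(struct buffer_head *bh)
	{
		/* test_bit() is a plain read: the cache line can stay shared */
		if (!test_bit(BH_Verified, &bh->b_state))
			/* only take the line exclusive when the bit changes */
			set_bit(BH_Verified, &bh->b_state);
	}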

Testing with the AIM7/creat-clo benchmark on a 48G ramdisk backed by an
ext4 file system, we see a 3.3% (15431->15936) improvement in
aim7.jobs-per-min on a 2-socket Broadwell platform.

What the benchmark does: it forks 3000 processes, and each process runs the
following in a loop of 100*1000 iterations (a rough sketch of the loop is
shown below):
a) open a new file
b) close the file
c) delete the file
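
A minimal sketch of that per-process loop (illustrative only, not the
actual AIM7 source):

	#include <fcntl.h>
	#include <unistd.h>

	static void creat_clo_loop(void)
	{
		int i;

		for (i = 0; i < 100 * 1000; i++) {
			/* open a new file (created if it does not exist) */
			int fd = open("tmpfile", O_CREAT | O_RDWR, 0644);

			close(fd);		/* close the file */
			unlink("tmpfile");	/* delete the file */
		}
	}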

The original patch was contributed by Andi Kleen.

Signed-off-by: Andi Kleen <a...@linux.intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
Tested-by: Kemi Wang <kemi.w...@intel.com>
Reviewed-by: Jens Axboe <ax...@kernel.dk>
---
 include/linux/buffer_head.h | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c8dae55..211d8f5 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -80,11 +80,14 @@ struct buffer_head {
 /*
  * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
  * and buffer_foo() functions.
+ * To avoid reset buffer flags that are already set, because that causes
+ * a costly cache line transition, check the flag first.
  */
 #define BUFFER_FNS(bit, name)  \
 static __always_inline void set_buffer_##name(struct buffer_head *bh)  \
 {  \
-   set_bit(BH_##bit, &(bh)->b_state);  \
+   if (!test_bit(BH_##bit, &(bh)->b_state))\
+   set_bit(BH_##bit, &(bh)->b_state);  \
 }  \
 static __always_inline void clear_buffer_##name(struct buffer_head *bh)
\
 {  \
-- 
2.7.4



[PATCH] buffer: Avoid setting buffer bits that are already set

2017-10-23 Thread Kemi Wang
It's expensive to set buffer flags that are already set, because that
causes a costly cache line transition.

A common case is setting the "verified" flag during ext4 writes.
This patch checks for the flag being set first.

Testing with the AIM7/creat-clo benchmark on a 48G ramdisk backed by an
ext4 file system, we see a 3.3% (15431->15936) improvement in
aim7.jobs-per-min on a 2-socket Broadwell platform.

What the benchmark does: it forks 3000 processes, and each process runs the
following in a loop of 100*1000 iterations:
a) open a new file
b) close the file
c) delete the file

The original patch was contributed by Andi Kleen.

Signed-off-by: Andi Kleen <a...@linux.intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
Tested-by: Kemi Wang <kemi.w...@intel.com>
---
 include/linux/buffer_head.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c8dae55..e1799f7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -84,7 +84,8 @@ struct buffer_head {
 #define BUFFER_FNS(bit, name)  \
 static __always_inline void set_buffer_##name(struct buffer_head *bh)  \
 {  \
-   set_bit(BH_##bit, &(bh)->b_state);  \
+   if (!test_bit(BH_##bit, &(bh)->b_state))\
+   set_bit(BH_##bit, &(bh)->b_state);  \
 }  \
 static __always_inline void clear_buffer_##name(struct buffer_head *bh)
\
 {  \
-- 
2.7.4



[PATCH v5] mm, sysctl: make NUMA stats configurable

2017-10-17 Thread Kemi Wang
This is the second step: it introduces a tunable interface that allows NUMA
stats to be configured, in order to optimize zone_statistics(), as
suggested by Dave Hansen and Ying Huang.

=
When page allocation performance becomes a bottleneck and you can tolerate
some possible tool breakage and decreased numa counter precision, you can
do:
echo 0 > /proc/sys/vm/numa_stat
In this case, numa counter updates are ignored. We see about a *4.8%*
(185->176) drop in cpu cycles per single page allocation and reclaim on
Jesper's page_bench01 (single thread) and an *8.1%* (343->315) drop in cpu
cycles per single page allocation and reclaim on Jesper's page_bench03 (88
threads), running on a 2-socket Broadwell-based server (88 threads, 126G
memory).

Benchmark link provided by Jesper D. Brouer (loop count increased to 1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

=
When page allocation performance is not a bottleneck and you want all
tooling to work, you can do:
echo 1 > /proc/sys/vm/numa_stat
This is the system default setting.
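
The declarations visible in this patch (sysctl_vm_numa_stat and the
vm_numa_stat_key static key) suggest how the fast path is gated. The
mm/page_alloc.c hunk is not shown in this excerpt, so the following is only
a sketch of the idea, not the exact code:

	static inline void zone_statistics(struct zone *preferred_zone,
					   struct zone *z)
	{
#ifdef CONFIG_NUMA
		/*
		 * After "echo 0 > /proc/sys/vm/numa_stat" the key is
		 * disabled and this test is patched into a plain jump, so
		 * the page allocation fast path pays essentially nothing.
		 */
		if (!static_branch_likely(&vm_numa_stat_key))
			return;

		/* ... existing NUMA_HIT/NUMA_MISS/... accounting ... */
#endif
	}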

Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka
for comments to help improve the original patch.

ChangeLog:
  V4->V5
  a) Scope vm_numa_stat_lock into the sysctl handler function, as suggested
  by Michal Hocko;
  b) Only allow the values 0/1 when setting numa_stat from userspace, which
  keeps the possibility of adding an auto mode in the future (e.g. 2 for
  auto mode), as suggested by Michal Hocko.

  V3->V4
  a) Get rid of the auto mode of numa stats; it may be added back if
  necessary, as aligned on before;
  b) Skip the NUMA_INTERLEAVE_HIT counter update when numa stats are
  disabled, as reported by Andrey Ryabinin. See commit "de55c8b2519" for
  details;
  c) Remove the extern declarations for the clear_numa_ functions, and make
  them static in vmstat.c, as suggested by Vlastimil Babka.

  V2->V3:
  a) Propose a better way to use jump label to eliminate the overhead of
  branch selection in zone_statistics(), as inspired by Ying Huang;
  b) Add a paragraph in commit log to describe the way for branch target
  selection;
  c) Use a more descriptive name numa_stats_mode instead of vmstat_mode,
  and change the description accordingly, as suggested by Michal Hocko;
  d) Make this functionality NUMA-specific via ifdef

  V1->V2:
  a) Merge to one patch;
  b) Use jump label to eliminate the overhead of branch selection;
  c) Add a single-time log message at boot time to help tell users what
  happened.

Reported-by: Jesper Dangaard Brouer <bro...@redhat.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 Documentation/sysctl/vm.txt | 16 +++
 include/linux/vmstat.h  | 10 +++
 kernel/sysctl.c |  9 ++
 mm/mempolicy.c  |  3 ++
 mm/page_alloc.c |  6 
 mm/vmstat.c | 70 +
 6 files changed, 114 insertions(+)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a..f65c5c7 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm:
 - percpu_pagelist_fraction
 - stat_interval
 - stat_refresh
+- numa_stat
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -792,6 +793,21 @@ with no ill effects: errors and warnings on these stats 
are suppressed.)
 
 ==
 
+numa_stat
+
+This interface allows runtime configuration of numa statistics.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do:
+   echo 0 > /proc/sys/vm/numa_stat
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do:
+   echo 1 > /proc/sys/vm/numa_stat
+
+==
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ade7cb5..c605c94 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -6,9 +6,19 @@
 #include 
 #include 
 #include 
+#include 
 
 extern int sysctl_stat_interval;
 
+#ifdef CONFIG_NUMA
+#define ENABLE_NUMA_STAT   1
+#define DISABLE_NUMA_STAT   0
+extern int sysctl_vm_numa_stat;
+DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
+extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
+   int write, void __user *buffer, size_t *length, loff_t *ppos);
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a

[PATCH v5] mm, sysctl: make NUMA stats configurable

2017-10-17 Thread Kemi Wang
This is the second step which introduces a tunable interface that allow
numa stats configurable for optimizing zone_statistics(), as suggested by
Dave Hansen and Ying Huang.

=
When page allocation performance becomes a bottleneck and you can tolerate
some possible tool breakage and decreased numa counter precision, you can
do:
echo 0 > /proc/sys/vm/numa_stat
In this case, numa counter update is ignored. We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim
on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu
cycles per single page allocation and reclaim on Jesper's page_bench03 (88
threads) running on a 2-Socket Broadwell-based server (88 threads, 126G
memory).

Benchmark link provided by Jesper D Brouer(increase loop times to
1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/
bench

=
When page allocation performance is not a bottleneck and you want all
tooling to work, you can do:
echo 1 > /proc/sys/vm/numa_stat
This is system default setting.

Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka
for comments to help improve the original patch.

ChangeLog:
  V4->V5
  a) Scope vm_numa_stat_lock into the sysctl handler function, as suggested
  by Michal Hocko;
  b) Only allow 0/1 value when setting a value to numa_stat at userspace,
  that would keep the possibility for add auto mode in future (e.g. 2 for
  auto mode), as suggested by Michal Hocko.

  V3->V4
  a) Get rid of auto mode of numa stats, and may add it back if necessary,
  as alignment before;
  b) Skip NUMA_INTERLEAVE_HIT counter update when numa stats is disabled,
  as reported by Andrey Ryabinin. See commit "de55c8b2519" for details
  c) Remove extern declaration for those clear_numa_ function, and make
  them static in vmstat.c, as suggested by Vlastimil Babka.

  V2->V3:
  a) Propose a better way to use jump label to eliminate the overhead of
  branch selection in zone_statistics(), as inspired by Ying Huang;
  b) Add a paragraph in commit log to describe the way for branch target
  selection;
  c) Use a more descriptive name numa_stats_mode instead of vmstat_mode,
  and change the description accordingly, as suggested by Michal Hocko;
  d) Make this functionality NUMA-specific via ifdef

  V1->V2:
  a) Merge to one patch;
  b) Use jump label to eliminate the overhead of branch selection;
  c) Add a single-time log message at boot time to help tell users what
  happened.

Reported-by: Jesper Dangaard Brouer 
Suggested-by: Dave Hansen 
Suggested-by: Ying Huang 
Signed-off-by: Kemi Wang 
---
 Documentation/sysctl/vm.txt | 16 +++
 include/linux/vmstat.h  | 10 +++
 kernel/sysctl.c |  9 ++
 mm/mempolicy.c  |  3 ++
 mm/page_alloc.c |  6 
 mm/vmstat.c | 70 +
 6 files changed, 114 insertions(+)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a..f65c5c7 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm:
 - percpu_pagelist_fraction
 - stat_interval
 - stat_refresh
+- numa_stat
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -792,6 +793,21 @@ with no ill effects: errors and warnings on these stats 
are suppressed.)
 
 ==
 
+numa_stat
+
+This interface allows runtime configuration of numa statistics.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do:
+   echo 0 > /proc/sys/vm/numa_stat
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do:
+   echo 1 > /proc/sys/vm/numa_stat
+
+==
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ade7cb5..c605c94 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -6,9 +6,19 @@
 #include 
 #include 
 #include 
+#include <linux/static_key.h>
 
 extern int sysctl_stat_interval;
 
+#ifdef CONFIG_NUMA
+#define ENABLE_NUMA_STAT   1
+#define DISABLE_NUMA_STAT   0
+extern int sysctl_vm_numa_stat;
+DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
+extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
+   int write, void __user *buffer, size_t *length, loff_t *ppos);
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9c31bc..8f272db 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysct

[PATCH v4] mm, sysctl: make NUMA stats configurable

2017-10-16 Thread Kemi Wang
This is the second step: it introduces a tunable interface that allows numa
stats to be configured at runtime, in order to optimize zone_statistics(), as
suggested by Dave Hansen and Ying Huang.

=
When page allocation performance becomes a bottleneck and you can tolerate
some possible tool breakage and decreased numa counter precision, you can
do:
echo 0 > /proc/sys/vm/numa_stat
In this case, numa counter update is ignored. We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim
on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu
cycles per single page allocation and reclaim on Jesper's page_bench03 (88
threads) running on a 2-Socket Broadwell-based server (88 threads, 126G
memory).

Benchmark link provided by Jesper D Brouer (increase loop times to
1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/
bench

=
When page allocation performance is not a bottleneck and you want all
tooling to work, you can do:
echo 1 > /proc/sys/vm/numa_stat
This is the system default setting.

Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka
for comments to help improve the original patch.

ChangeLog:
  V3->V4
  a) Get rid of the auto mode of numa stats; it may be added back later if
  necessary, as agreed before;
  b) Skip the NUMA_INTERLEAVE_HIT counter update when numa stats are disabled,
  as reported by Andrey Ryabinin. See commit "de55c8b2519" for details;
  c) Remove the extern declarations for the clear_numa_* functions and make
  them static in vmstat.c, as suggested by Vlastimil Babka.

  V2->V3:
  a) Propose a better way to use jump label to eliminate the overhead of
  branch selection in zone_statistics(), as inspired by Ying Huang;
  b) Add a paragraph in commit log to describe the way for branch target
  selection;
  c) Use a more descriptive name numa_stats_mode instead of vmstat_mode,
  and change the description accordingly, as suggested by Michal Hocko;
  d) Make this functionality NUMA-specific via ifdef

  V1->V2:
  a) Merge to one patch;
  b) Use jump label to eliminate the overhead of branch selection;
  c) Add a single-time log message at boot time to help tell users what
  happened.

Reported-by: Jesper Dangaard Brouer <bro...@redhat.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 Documentation/sysctl/vm.txt | 16 +++
 include/linux/vmstat.h  | 10 +++
 kernel/sysctl.c |  7 +
 mm/mempolicy.c  |  3 ++
 mm/page_alloc.c |  6 
 mm/vmstat.c | 70 +
 6 files changed, 112 insertions(+)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a..f65c5c7 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm:
 - percpu_pagelist_fraction
 - stat_interval
 - stat_refresh
+- numa_stat
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -792,6 +793,21 @@ with no ill effects: errors and warnings on these stats 
are suppressed.)
 
 ==
 
+numa_stat
+
+This interface allows runtime configuration of numa statistics.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do:
+   echo 0 > /proc/sys/vm/numa_stat
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do:
+   echo 1 > /proc/sys/vm/numa_stat
+
+==
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ade7cb5..c605c94 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -6,9 +6,19 @@
 #include 
 #include 
 #include 
+#include <linux/static_key.h>
 
 extern int sysctl_stat_interval;
 
+#ifdef CONFIG_NUMA
+#define ENABLE_NUMA_STAT   1
+#define DISABLE_NUMA_STAT   0
+extern int sysctl_vm_numa_stat;
+DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
+extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
+   int write, void __user *buffer, size_t *length, loff_t *ppos);
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9c31bc..f6a79a3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1371,6 +1371,13 @@ static struct ctl_table vm_table[] = {
.mode   = 0644,
.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
},
+

[PATCH v3] mm, sysctl: make NUMA stats configurable

2017-09-28 Thread Kemi Wang
This is the second step: it introduces a tunable interface that allows numa
stats to be configured at runtime, in order to optimize zone_statistics(), as
suggested by Dave Hansen and Ying Huang.

=
When page allocation performance becomes a bottleneck and you can tolerate
some possible tool breakage and decreased numa counter precision, you can
do:
echo [C|c]oarse > /proc/sys/vm/numa_stats_mode
In this case, numa counter update is ignored. We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim
on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu
cycles per single page allocation and reclaim on Jesper's page_bench03 (88
threads) running on a 2-Socket Broadwell-based server (88 threads, 126G
memory).

Benchmark link provided by Jesper D Brouer (increase loop times to
1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/
bench

=
When page allocation performance is not a bottleneck and you want all
tooling to work, you can do:
echo [S|s]trict > /proc/sys/vm/numa_stats_mode

=
We recommend automatic detection of numa statistics by the system; this is
also the system default configuration. You can do:
echo [A|a]uto > /proc/sys/vm/numa_stats_mode
In this case, the numa counter update is skipped unless the counters have
been read by users at least once, e.g. cat /proc/zoneinfo.

Branch target selection with jump label:
a) When numa_stats_mode is changed to *strict*, jump to the branch that
updates the numa counters.
b) When numa_stats_mode is changed to *coarse*, return directly.
c) When numa_stats_mode is changed to *auto*, the branch target used last
time is kept, and it is switched to the numa counter update branch once the
numa counters are *read* by users.

Therefore, with the help of a jump label, page allocation performance is
hardly affected by the branch selection around the numa counter update in
zone_statistics(). Meanwhile, the auto mode gives people the benefit without
manual tuning.
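
As an illustrative sketch of the guard described above (the key name follows
the later revisions of this series and is an assumption here; the actual
NUMA_HIT/NUMA_MISS/... accounting in zone_statistics() stays unchanged):

#include <linux/jump_label.h>
#include <linux/mmzone.h>

/* default-true key: numa counters are updated unless the key is disabled */
DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

static void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
        /*
         * Patched out to a NOP in coarse mode, or in auto mode before the
         * counters have been read, so the fast path pays no branch cost.
         */
        if (!static_branch_likely(&vm_numa_stat_key))
                return;

        /* ... NUMA_HIT/NUMA_MISS/NUMA_LOCAL/... updates as before ... */
}

Switching to strict mode then maps to static_branch_enable(&vm_numa_stat_key),
coarse mode to static_branch_disable(&vm_numa_stat_key), and auto mode leaves
the key alone until the first reader enables it.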

Many thanks to Michal Hocko, Dave Hansen and Ying Huang for comments to
help improve the original patch.

ChangeLog:
  V2->V3:
  a) Propose a better way to use jump label to eliminate the overhead of
  branch selection in zone_statistics(), as inspired by Ying Huang;
  b) Add a paragraph in commit log to describe the way for branch target
  selection;
  c) Use a more descriptive name numa_stats_mode instead of vmstat_mode,
  and change the description accordingly, as suggested by Michal Hocko;
  d) Make this functionality NUMA-specific via ifdef

  V1->V2:
  a) Merge to one patch;
  b) Use jump label to eliminate the overhead of branch selection;
  c) Add a single-time log message at boot time to help tell users what
  happened.

Reported-by: Jesper Dangaard Brouer <bro...@redhat.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 Documentation/sysctl/vm.txt |  24 +
 drivers/base/node.c |   4 ++
 include/linux/vmstat.h  |  23 
 init/main.c |   3 ++
 kernel/sysctl.c |   7 +++
 mm/page_alloc.c |  10 
 mm/vmstat.c | 129 
 7 files changed, 200 insertions(+)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a..e310e69 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
+- numa_stats_mode
 - watermark_scale_factor
 - zone_reclaim_mode
 
@@ -843,6 +844,29 @@ ten times more freeable objects than there are.
 
 =
 
+numa_stats_mode
+
+This interface makes numa statistics configurable.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do:
+   echo [C|c]oarse > /proc/sys/vm/numa_stats_mode
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do:
+   echo [S|s]trict > /proc/sys/vm/numa_stats_mode
+
+We recommend automatic detection of numa statistics by the system, because
+numa statistics do not affect the system's decisions and are very rarely
+consumed. You can do:
+   echo [A|a]uto > /proc/sys/vm/numa_stats_mode
+This is also the system default configuration. With this default setting, the
+numa counter update is skipped unless the counters are *read* by users at
+least once.
+
+==
+
 watermark_scale_factor:
 
 This fa

[PATCH v2] mm, sysctl: make VM stats configurable

2017-09-22 Thread Kemi Wang
This is the second step: it introduces a tunable interface that allows VM
stats to be configured, in order to optimize zone_statistics(), as suggested
by Dave Hansen and Ying Huang.

===
When performance becomes a bottleneck and you can tolerate some possible
tool breakage and some decreased counter precision (e.g. numa counter), you
can do:
echo [C|c]oarse > /proc/sys/vm/vmstat_mode
In this case, numa counter update is ignored. We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim
on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu
cycles per single page allocation and reclaim on Jesper's page_bench03 (88
threads) running on a 2-Socket Broadwell-based server (88 threads, 126G
memory).

Benchmark link provided by Jesper D Brouer (increase loop times to
1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/
bench

===
When performance is not a bottleneck and you want all tooling to work, you
can do:
echo [S|s]trict > /proc/sys/vm/vmstat_mode

===
We recommend automatic detection of virtual memory statistics by the system;
this is also the system default configuration. You can do:
echo [A|a]uto > /proc/sys/vm/vmstat_mode
In this case, VM statistics are detected automatically: the numa counter
update is skipped unless the counters have been read by users at least once,
e.g. cat /proc/zoneinfo.

Therefore, with the different VM stats modes, the numa counter update can
behave differently so that everybody can benefit.
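
Condensed, the resulting gate in zone_statistics() looks roughly like the
sketch below (see the page_alloc.c hunk in the standalone 2/3 posting further
down for the actual code):

static void zone_statistics(struct zone *preferred_zone, struct zone *z)
{
#ifdef CONFIG_NUMA
        /* coarse mode: never account */
        if (vmstat_mode == VMSTAT_COARSE_MODE)
                return;
        /* auto mode: stay quiet until somebody actually reads the counters */
        if (vmstat_mode == VMSTAT_AUTO_MODE && disable_zone_statistics)
                return;

        /* ... NUMA_HIT/NUMA_MISS/NUMA_LOCAL/... updates ... */
#endif
}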

Many thanks to Michal Hocko and Dave Hansen for comments to help improve
the original patch.

ChangeLog:
  Since V1->V2:
  a) Merge to one patch;
  b) Use jump label to eliminate the overhead of branch selection;
  c) Add a single-time log message at boot time to help tell users what
  happened.

Reported-by: Jesper Dangaard Brouer <bro...@redhat.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 Documentation/sysctl/vm.txt |  26 +
 drivers/base/node.c |   2 +
 include/linux/vmstat.h  |  22 
 init/main.c |   2 +
 kernel/sysctl.c |   7 +++
 mm/page_alloc.c |  14 +
 mm/vmstat.c | 126 
 7 files changed, 199 insertions(+)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a..6ab2843 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
+- vmstat_mode
 - watermark_scale_factor
 - zone_reclaim_mode
 
@@ -843,6 +844,31 @@ ten times more freeable objects than there are.
 
 =
 
+vmstat_mode
+
+This interface makes virtual memory statistics configurable.
+
+When performance becomes a bottleneck and you can tolerate some possible
+tool breakage and some decreased counter precision (e.g. numa counter), you
+can do:
+   echo [C|c]oarse > /proc/sys/vm/vmstat_mode
+ignorable statistics list:
+- numa counters
+
+When performance is not a bottleneck and you want all tooling to work, you
+can do:
+   echo [S|s]trict > /proc/sys/vm/vmstat_mode
+
+We recommend automatic detection of virtual memory statistics by the system;
+this is also the system default configuration. You can do:
+   echo [A|a]uto > /proc/sys/vm/vmstat_mode
+
+E.g. numa statistics do not affect the system's decisions and are very
+rarely consumed. If vmstat_mode = auto is set, the numa counter update is
+skipped unless the counters are *read* by users at least once.
+
+==
+
 watermark_scale_factor:
 
 This factor controls the aggressiveness of kswapd. It defines the
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 3855902..033c0c3 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, 
NULL);
 static ssize_t node_read_numastat(struct device *dev,
struct device_attribute *attr, char *buf)
 {
+   disable_zone_statistics = false;
return sprintf(buf,
   "numa_hit %lu\n"
   "numa_miss %lu\n"
@@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev,
 NR_VM_NUMA_STAT_ITEMS],
 node_page_state(pgdat, i));
 
+   disable_zone_statistics = false;
return n;
 }
 static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ade7cb5..22670cf 100644
--- a/include/linux/vmstat.h
+++ b/

[PATCH 2/3] mm: Handle numa statistics distinctively based-on different VM stats modes

2017-09-15 Thread Kemi Wang
Each page allocation updates a set of per-zone statistics with a call to
zone_statistics().  As discussed at the 2017 MM Summit, these are a
substantial source of overhead in the page allocator and are very rarely
consumed.

A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017
-JesperBrouer.pdf

Therefore, with the different VM stats modes, the numa counter update can
behave differently so that everybody can benefit:
If vmstat_mode = auto, numa statistics are detected automatically: the numa
counter update is skipped unless the counters have been read by users at
least once, e.g. cat /proc/zoneinfo.

If vmstat_mode = strict, numa counter is updated for each page allocation.

If vmstat_mode = coarse, numa counter update is ignored. We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim
on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu
cycles per single page allocation and reclaim on Jesper's page_bench03 (88
threads) running on a 2-Socket Broadwell-based server (88 threads, 126G
memory).

Benchmark link provided by Jesper D Brouer (increase loop times to
1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/
bench

Reported-by: Jesper Dangaard Brouer <bro...@redhat.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 drivers/base/node.c|  2 ++
 include/linux/vmstat.h |  6 +
 mm/page_alloc.c| 13 +++
 mm/vmstat.c| 60 +++---
 4 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 3855902..033c0c3 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, 
NULL);
 static ssize_t node_read_numastat(struct device *dev,
struct device_attribute *attr, char *buf)
 {
+   disable_zone_statistics = false;
return sprintf(buf,
   "numa_hit %lu\n"
   "numa_miss %lu\n"
@@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev,
 NR_VM_NUMA_STAT_ITEMS],
 node_page_state(pgdat, i));
 
+   disable_zone_statistics = false;
return n;
 }
 static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index c3634c7..ca9854c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -9,6 +9,7 @@
 
 extern int sysctl_stat_interval;
 
+extern bool disable_zone_statistics;
 /*
  * vmstat_mode:
  * 0 = auto mode of vmstat, automatic detection of VM statistics.
@@ -19,6 +20,7 @@ extern int sysctl_stat_interval;
 #define VMSTAT_STRICT_MODE  1
 #define VMSTAT_COARSE_MODE  2
 #define VMSTAT_MODE_LEN 16
+extern int vmstat_mode;
 extern char sysctl_vmstat_mode[];
 extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos);
@@ -243,6 +245,10 @@ extern unsigned long sum_zone_node_page_state(int node,
 extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
 extern unsigned long node_page_state(struct pglist_data *pgdat,
enum node_stat_item item);
+extern void zero_zone_numa_counters(struct zone *zone);
+extern void zero_zones_numa_counters(void);
+extern void zero_global_numa_counters(void);
+extern void invalid_numa_statistics(void);
 #else
 #define sum_zone_node_page_state(node, item) global_zone_page_state(item)
 #define node_page_state(node, item) global_node_page_state(item)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c841af8..010a620 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -83,6 +83,8 @@ DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
 #endif
 
+bool disable_zone_statistics = true;
+
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 /*
  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -2743,6 +2745,17 @@ static inline void zone_statistics(struct zone 
*preferred_zone, struct zone *z)
 #ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
 
+   /*
+* skip zone_statistics() if vmstat is in coarse mode or if zone statistics
+* are inactive in auto vmstat mode
+*/
+
+   if (vmstat_mode) {
+   if (vmstat_mode == VMSTAT_COARSE_MODE)
+   return;
+   } else if (disable_zone_statistics)
+   return;
+
if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e675ad2..bcaef62 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -85,15 +85,31 @@ int sysctl_vm

[PATCH 1/3] mm, sysctl: make VM stats configurable

2017-09-15 Thread Kemi Wang
This patch adds a tunable interface that makes VM stats configurable, as
suggested by Dave Hansen and Ying Huang.

When performance becomes a bottleneck and you can tolerate some possible
tool breakage and some decreased counter precision (e.g. numa counter), you
can do:
echo [C|c]oarse > /proc/sys/vm/vmstat_mode

When performance is not a bottleneck and you want all tooling to work, you
can do:
echo [S|s]trict > /proc/sys/vm/vmstat_mode

We recommend automatic detection of virtual memory statistics by the system;
this is also the system default configuration. You can do:
echo [A|a]uto > /proc/sys/vm/vmstat_mode

The next patch handles numa statistics distinctively based on the different
VM stats modes.

Reported-by: Jesper Dangaard Brouer <bro...@redhat.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 include/linux/vmstat.h | 14 ++
 kernel/sysctl.c|  7 +
 mm/vmstat.c| 70 ++
 3 files changed, 91 insertions(+)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index ade7cb5..c3634c7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -9,6 +9,20 @@
 
 extern int sysctl_stat_interval;
 
+/*
+ * vmstat_mode:
+ * 0 = auto mode of vmstat, automatic detection of VM statistics.
+ * 1 = strict mode of vmstat, keep all VM statistics.
+ * 2 = coarse mode of vmstat, ignore unimportant VM statistics.
+ */
+#define VMSTAT_AUTO_MODE 0
+#define VMSTAT_STRICT_MODE  1
+#define VMSTAT_COARSE_MODE  2
+#define VMSTAT_MODE_LEN 16
+extern char sysctl_vmstat_mode[];
+extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
+   void __user *buffer, size_t *length, loff_t *ppos);
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbb..f5b813b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1234,6 +1234,13 @@ static struct ctl_table kern_table[] = {
 
 static struct ctl_table vm_table[] = {
{
+   .procname   = "vmstat_mode",
+   .data   = &sysctl_vmstat_mode,
+   .maxlen = VMSTAT_MODE_LEN,
+   .mode   = 0644,
+   .proc_handler   = sysctl_vmstat_mode_handler,
+   },
+   {
.procname   = "overcommit_memory",
+   .data   = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e7..e675ad2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -32,6 +32,76 @@
 
 #define NUMA_STATS_THRESHOLD (U16_MAX - 2)
 
+int vmstat_mode = VMSTAT_AUTO_MODE;
+char sysctl_vmstat_mode[VMSTAT_MODE_LEN] = "auto";
+static const char *vmstat_mode_name[3] = {"auto", "strict", "coarse"};
+static DEFINE_MUTEX(vmstat_mode_lock);
+
+
+static int __parse_vmstat_mode(char *s)
+{
+   const char *str = s;
+
+   if (strcmp(str, "auto") == 0 || strcmp(str, "Auto") == 0)
+   vmstat_mode = VMSTAT_AUTO_MODE;
+   else if (strcmp(str, "strict") == 0 || strcmp(str, "Strict") == 0)
+   vmstat_mode = VMSTAT_STRICT_MODE;
+   else if (strcmp(str, "coarse") == 0 || strcmp(str, "Coarse") == 0)
+   vmstat_mode = VMSTAT_COARSE_MODE;
+   else {
+   pr_warn("Ignoring invalid vmstat_mode value: %s\n", s);
+   return -EINVAL;
+   }
+   return 0;
+}
+
+int sysctl_vmstat_mode_handler(struct ctl_table *table, int write,
+   void __user *buffer, size_t *length, loff_t *ppos)
+{
+   char old_string[VMSTAT_MODE_LEN];
+   int ret, oldval;
+
+   mutex_lock(&vmstat_mode_lock);
+   if (write)
+   strncpy(old_string, (char *)table->data, VMSTAT_MODE_LEN);
+   ret = proc_dostring(table, write, buffer, length, ppos);
+   if (ret || !write) {
+   mutex_unlock(&vmstat_mode_lock);
+   return ret;
+   }
+
+   oldval = vmstat_mode;
+   if (__parse_vmstat_mode((char *)table->data)) {
+   /*
+* invalid sysctl_vmstat_mode value, restore saved string
+*/
+   strncpy((char *)table->data, old_string, VMSTAT_MODE_LEN);
+   vmstat_mode = oldval;
+   } else {
+   /*
+* check whether vmstat mode changes or not
+*/
+   if (vmstat_mode == oldval) {
+   /* no change */
+   mutex_unlock(&vmstat_mode_lock);
+   return 0;
+   } else if (vmstat_mode == VMSTAT_AUTO_MODE)
+   pr_info("vmstat mode changes from %s to auto mod

[PATCH 3/3] sysctl/vm.txt: Update document

2017-09-15 Thread Kemi Wang
Add a paragraph to sysctl/vm.txt introducing the functionality and usage of
vmstat_mode.

Reported-by: Jesper Dangaard Brouer <bro...@redhat.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 Documentation/sysctl/vm.txt | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 9baf66a..6ab2843 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
+- vmstat_mode
 - watermark_scale_factor
 - zone_reclaim_mode
 
@@ -843,6 +844,31 @@ ten times more freeable objects than there are.
 
 =
 
+vmstat_mode
+
+This interface makes virtual memory statistics configurable.
+
+When performance becomes a bottleneck and you can tolerate some possible
+tool breakage and some decreased counter precision (e.g. numa counter), you
+can do:
+   echo [C|c]oarse > /proc/sys/vm/vmstat_mode
+ignorable statistics list:
+- numa counters
+
+When performance is not a bottleneck and you want all tooling to work, you
+can do:
+   echo [S|s]trict > /proc/sys/vm/vmstat_mode
+
+We recommend automatic detection of virtual memory statistics by the system;
+this is also the system default configuration. You can do:
+   echo [A|a]uto > /proc/sys/vm/vmstat_mode
+
+E.g. numa statistics do not affect the system's decisions and are very
+rarely consumed. If vmstat_mode = auto is set, the numa counter update is
+skipped unless the counters are *read* by users at least once.
+
+==
+
 watermark_scale_factor:
 
 This factor controls the aggressiveness of kswapd. It defines the
-- 
2.7.4



[PATCH 0/3] Handle zone statistics distinctively based-on

2017-09-15 Thread Kemi Wang
Each page allocation updates a set of per-zone statistics with a call to
zone_statistics(), as discussed at the 2017 MM Summit.
A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017
-JesperBrouer.pdf

This is the second step for optimizing zone statistics: the first patch
introduces a tunable interface that makes VM statistics configurable (see
the first patch for details):
if vmstat_mode = auto, automatic detection of VM statistics
if vmstat_mode = strict, keep all the VM statistics
if vmstat_mode = coarse, ignore unimportant VM statistics
As suggested by Dave Hansen and Ying Huang.

With this interface, the second patch handles numa counters distinctively
according to the different vmstat modes, and the test result shows about a
4.8% (185->176) drop of cpu cycles with a single thread and an 8.1%
(343->315) drop of cpu cycles with 88 threads for single page allocation.

The third patch updates ABI document accordingly.

Kemi Wang (3):
  mm, sysctl: make VM stats configurable
  mm: Handle numa statistics distinctively based-on different VM stats
modes
  sysctl/vm.txt: Update document

 Documentation/sysctl/vm.txt |  26 ++
 drivers/base/node.c |   2 +
 include/linux/vmstat.h  |  20 +++
 kernel/sysctl.c |   7 +++
 mm/page_alloc.c |  13 +
 mm/vmstat.c | 124 
 6 files changed, 192 insertions(+)

-- 
2.7.4



[PATCH v2 2/3] mm: Update NUMA counter threshold size

2017-08-24 Thread Kemi Wang
There is significant overhead from cache bouncing caused by the zone
counters (NUMA associated counters) being updated in parallel in
multi-threaded page allocation (pointed out by Dave Hansen).

This patch updates the NUMA counter threshold to a fixed size of U16_MAX - 2,
as a small threshold greatly increases the update frequency of the global
counter from the local per cpu counter (suggested by Ying Huang).

The rationale is that these statistics counters don't affect the kernel's
decisions, unlike other VM counters, so it's not a problem to use a large
threshold.
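
The effect on the update path is roughly the following sketch, mirroring the
__inc_numa_state() hunk below (which is truncated in this archive); the
helper name used here is illustrative:

#define NUMA_STATS_THRESHOLD (U16_MAX - 2)

static void numa_stat_inc_sketch(struct zone *zone, enum numa_stat_item item)
{
        struct per_cpu_pageset __percpu *pcp = zone->pageset;
        u16 __percpu *p = pcp->vm_numa_stat_diff + item;
        u16 v;

        v = __this_cpu_inc_return(*p);
        if (unlikely(v > NUMA_STATS_THRESHOLD)) {
                /* fold the whole per-cpu batch into the global counter */
                zone_numa_state_add(v, zone, item);
                /* and restart the local window */
                __this_cpu_write(*p, 0);
        }
}

So the global (cache-bounced) counter is touched at most once per roughly 64K
local events on each CPU, instead of once per stat_threshold events.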

With this patchset, we see 31.3% drop of CPU cycles(537-->369) for per
single page allocation and reclaim on Jesper's page_bench03 benchmark.

Benchmark provided by Jesper D Brouer (increase loop times to 1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/
bench

 Threshold   CPU cyclesThroughput(88 threads)
 32  799 241760478
 64  640 301628829
 125 537 358906028 <==> system by default (base)
 256 468 412397590
 512 428 450550704
 4096399 482520943
 2   394 489009617
 3   395 488017817
 65533   369(-31.3%) 521661345(+45.3%) <==> with this patchset
 N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics

Changelog:
v2:
a) Change the type of vm_numa_stat_diff[] from s16 to u16, since the numa
stats counter is always an incremental field.
b) Remove numa_stat_threshold field in struct per_cpu_pageset, since it
is a constant value and rarely be changed.
c) Cut down instructions in __inc_numa_state() due to the incremental
numa counter and the constant numa threshold.
d) Move zone_numa_state_snapshot() to an individual patch, since it
does not appear to be related to this patch.

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
---
 include/linux/mmzone.h |  3 +--
 mm/vmstat.c| 28 ++--
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 582f6d9..c386ec4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -282,8 +282,7 @@ struct per_cpu_pageset {
struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
s8 expire;
-   s8 numa_stat_threshold;
-   s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
+   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 #ifdef CONFIG_SMP
s8 stat_threshold;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0c3b54b..b015f39 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -30,6 +30,8 @@
 
 #include "internal.h"
 
+#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void)
 
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
-#ifdef CONFIG_NUMA
-   per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
-   = threshold;
-#endif
+
/* Base nodestat threshold on the largest populated 
zone. */
pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, 
cpu)->stat_threshold;
per_cpu_ptr(pgdat->per_cpu_nodestats, 
cpu)->stat_threshold
@@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
continue;
 
threshold = (*calculate_pressure)(zone);
-   for_each_online_cpu(cpu) {
+   for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
-#ifdef CONFIG_NUMA
-   per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
-   = threshold;
-#endif
-   }
}
 }
 
@@ -872,16 +866,14 @@ void __inc_numa_state(struct zone *zone,
 enum numa_stat_item item)
 {
struct per_cpu_pageset __percpu *pcp = zone->pageset;
-   s8 __percpu *p = pcp->vm_numa_stat_diff + item;
-   s8 v, t;
+   u16 __percpu *p = pcp->vm_numa_stat_diff + item;
+   u16 v;
 
v = __this_cpu_inc_return(*p);
-   t = __this_cpu_read(pcp->numa_stat_threshold);
-   if (unlikely(v > t)) {
-   s8 overstep = t >> 1;
 
-   zone_numa_state_add(v + overstep, zone, item);
-   __this_cpu_write(*p, -overstep);
+   if (unlikely(v > NUMA_S

[PATCH v2 0/3] Separate NUMA statistics from zone statistics

2017-08-24 Thread Kemi Wang
Each page allocation updates a set of per-zone statistics with a call to
zone_statistics(). As discussed at the 2017 MM summit, these are a substantial
source of overhead in the page allocator and are very rarely consumed. The
overhead comes mainly from cache bouncing when the zone counters (NUMA
associated counters) are updated in parallel in multi-threaded page
allocation (pointed out by Dave Hansen).

A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017
-JesperBrouer.pdf

To mitigate this overhead, this patchset separates the NUMA statistics from
the zone statistics framework and raises the NUMA counter threshold to a
fixed size of U16_MAX - 2, since a small threshold greatly increases how
often the local per-cpu counters are folded into the global counter
(suggested by Ying Huang). The rationale is that these statistics counters
don't need to be read often, unlike other VM counters, so it's not a problem
to use a large threshold and make readers more expensive.
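
To see why the threshold matters on the write side, here is a tiny
single-threaded user-space model (illustrative only; in the kernel the diff
is a per-cpu variable and the fold is an atomic add). It simply counts how
many times the shared counter has to be touched for a given number of
events:

    #include <stdio.h>

    /* How many shared-counter updates are needed when the local diff is
     * flushed each time it crosses the given threshold? */
    static unsigned long global_updates(unsigned long events, unsigned int threshold)
    {
            unsigned long flushes = 0, diff = 0;
            unsigned long i;

            for (i = 0; i < events; i++) {
                    if (++diff > threshold) {
                            flushes++;      /* would be an atomic add on the global counter */
                            diff = 0;
                    }
            }
            return flushes;
    }

    int main(void)
    {
            unsigned long events = 10UL * 1000 * 1000;

            /* default zone stat threshold on this box vs. the fixed NUMA one */
            printf("threshold   125: %lu global updates\n", global_updates(events, 125));
            printf("threshold 65533: %lu global updates\n", global_updates(events, 65533));
            return 0;
    }

For ten million events, the default threshold of 125 touches the shared
counter roughly 79,000 times, while 65533 touches it only about 150 times,
which is the cache-bouncing reduction the numbers below measure.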

With this patchset, we see a 31.3% drop in CPU cycles (537-->369, see below)
per single-page allocation and reclaim on Jesper's page_bench03 benchmark.
Meanwhile, the patchset keeps the same style of virtual memory statistics
with little end-user-visible effect (the numa stats are merely shown after
the zone page stats; see the first patch for details).

I ran an experiment of concurrent single-page allocation and reclaim using
Jesper's page_bench03 benchmark on a 2-socket Broadwell-based server (88
processors, 126G memory) with different pcp counter threshold sizes.

Benchmark provided by Jesper D. Brouer (with the loop count increased to 1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

 Threshold   CPU cycles    Throughput(88 threads)
 32          799           241760478
 64          640           301628829
 125         537           358906028    <==> system by default
 256         468           412397590
 512         428           450550704
 4096        399           482520943
 2           394           489009617
 3           395           488017817
 65533       369(-31.3%)   521661345(+45.3%)    <==> with this patchset
 N/A         342(-36.3%)   562900157(+56.8%)    <==> disable zone_statistics

Kemi Wang (3):
  mm: Change the call sites of numa statistics items
  mm: Update NUMA counter threshold size
  mm: Consider the number in local CPUs when *reads* NUMA stats

 drivers/base/node.c|  22 ---
 include/linux/mmzone.h |  24 +---
 include/linux/vmstat.h |  33 +++
 mm/page_alloc.c|  10 ++--
 mm/vmstat.c| 152 +++--
 5 files changed, 217 insertions(+), 24 deletions(-)

-- 
2.7.4



[PATCH v2 1/3] mm: Change the call sites of numa statistics items

2017-08-24 Thread Kemi Wang
In this patch, NUMA statistics are separated from the zone statistics
framework, and all call sites of the NUMA stats are switched to
numa-stats-specific functions. There is no functional change except that the
NUMA stats are shown after the zone page stats when users *read* the zone
info.

E.g. cat /proc/zoneinfo
***Base***                         ***With this patch***
nr_free_pages 3976                 nr_free_pages 3976
nr_zone_inactive_anon 0            nr_zone_inactive_anon 0
nr_zone_active_anon 0              nr_zone_active_anon 0
nr_zone_inactive_file 0            nr_zone_inactive_file 0
nr_zone_active_file 0              nr_zone_active_file 0
nr_zone_unevictable 0              nr_zone_unevictable 0
nr_zone_write_pending 0            nr_zone_write_pending 0
nr_mlock 0                         nr_mlock 0
nr_page_table_pages 0              nr_page_table_pages 0
nr_kernel_stack 0                  nr_kernel_stack 0
nr_bounce 0                        nr_bounce 0
nr_zspages 0                       nr_zspages 0
numa_hit 0                         *nr_free_cma 0*
numa_miss 0                        numa_hit 0
numa_foreign 0                     numa_miss 0
numa_interleave 0                  numa_foreign 0
numa_local 0                       numa_interleave 0
numa_other 0                       numa_local 0
*nr_free_cma 0*                    numa_other 0
...                                ...
vm stats threshold: 10             vm stats threshold: 10
...                                ...

The next patch updates the numa stats counter size and threshold.
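
The overall shape of the split (a dedicated item enum, dedicated per-zone
counters and dedicated accessors) can be modelled in plain user-space C.
This is only an illustrative sketch with simplified types (plain long
instead of atomic_long_t, a single node, no per-cpu diffs), not the patch
code itself:

    #include <stdio.h>

    /* NUMA events get their own item enum instead of living in zone_stat_item. */
    enum numa_stat_item {
            NUMA_HIT, NUMA_MISS, NUMA_FOREIGN, NUMA_INTERLEAVE_HIT,
            NUMA_LOCAL, NUMA_OTHER, NR_VM_NUMA_STAT_ITEMS
    };

    #define MAX_NR_ZONES 4

    struct zone {
            long vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];  /* atomic_long_t in the kernel */
    };

    static struct zone node_zones[MAX_NR_ZONES];

    /* numa-stats-specific accessors, kept apart from the zone page state helpers */
    static void zone_numa_state_add(long x, struct zone *zone, enum numa_stat_item item)
    {
            zone->vm_numa_stat[item] += x;
    }

    static long sum_zone_numa_state(enum numa_stat_item item)
    {
            long count = 0;
            int i;

            for (i = 0; i < MAX_NR_ZONES; i++)
                    count += node_zones[i].vm_numa_stat[item];
            return count;
    }

    int main(void)
    {
            zone_numa_state_add(3, &node_zones[0], NUMA_HIT);
            zone_numa_state_add(1, &node_zones[1], NUMA_HIT);
            printf("numa_hit %ld\n", sum_zone_numa_state(NUMA_HIT));
            return 0;
    }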

Changelog:

v2:
a) Modify the names of the numa-stats-specific functions and parameters to
avoid confusion with those for zone/node page stats.
b) Stop showing the numa stats threshold in /proc/zoneinfo, since its value
is a constant.

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 drivers/base/node.c|  22 ---
 include/linux/mmzone.h |  25 +---
 include/linux/vmstat.h |  29 +
 mm/page_alloc.c|  10 ++--
 mm/vmstat.c| 159 +++--
 5 files changed, 219 insertions(+), 26 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index d8dc830..3855902 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
   "interleave_hit %lu\n"
   "local_node %lu\n"
   "other_node %lu\n",
-  sum_zone_node_page_state(dev->id, NUMA_HIT),
-  sum_zone_node_page_state(dev->id, NUMA_MISS),
-  sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
-  sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
-  sum_zone_node_page_state(dev->id, NUMA_LOCAL),
-  sum_zone_node_page_state(dev->id, NUMA_OTHER));
+  sum_zone_numa_state(dev->id, NUMA_HIT),
+  sum_zone_numa_state(dev->id, NUMA_MISS),
+  sum_zone_numa_state(dev->id, NUMA_FOREIGN),
+  sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
+  sum_zone_numa_state(dev->id, NUMA_LOCAL),
+  sum_zone_numa_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
@@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev,
n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
 sum_zone_node_page_state(nid, i));
 
-   for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+#ifdef CONFIG_NUMA
+   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+sum_zone_numa_state(nid, i));
+#endif
+
+   for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+   n += sprintf(buf+n, "%s %lu\n",
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+NR_VM_NUMA_STAT_ITEMS],
 node_page_state(pgdat, i));
 
return n;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fda9afb..582f6d9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,20 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+#ifdef CONFIG_NUMA
+enum numa_stat_item {
+   NUMA_HIT,   /* allocated in intended node */
+   NUMA_MISS,  /* allocated in non intended node */
+   NUMA_FOREIGN,   /* was intended here, hit elsewhere */

[PATCH v2 3/3] mm: Consider the number in local CPUs when *reads* NUMA stats

2017-08-24 Thread Kemi Wang
To avoid deviation, the per-cpu NUMA stats pending in vm_numa_stat_diff[]
are included when a user *reads* the NUMA stats.

Since the NUMA stats are not read frequently by users, and the kernel does
not need them to make decisions, it is not a problem to make the readers
more expensive.
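
The read side then looks roughly like the user-space model below
(illustrative only; in the kernel the total is an atomic_long_t and the
diffs are per-cpu variables). The point is that even a large amount of
not-yet-folded data on one cpu does not make the reported value stale,
because the reader folds it in:

    #include <stdio.h>

    #define NR_CPUS 4

    static long vm_numa_stat;                           /* models zone->vm_numa_stat[item] */
    static unsigned short vm_numa_stat_diff[NR_CPUS];   /* models pcp->vm_numa_stat_diff[item] */

    /* Plain read: may lag by up to nr_cpus * threshold events. */
    static long zone_numa_state(void)
    {
            return vm_numa_stat;
    }

    /* Snapshot read, as introduced here: fold in every cpu's pending diff. */
    static long zone_numa_state_snapshot(void)
    {
            long x = vm_numa_stat;
            int cpu;

            for (cpu = 0; cpu < NR_CPUS; cpu++)
                    x += vm_numa_stat_diff[cpu];
            return x;
    }

    int main(void)
    {
            vm_numa_stat = 1000;
            vm_numa_stat_diff[0] = 7;
            vm_numa_stat_diff[2] = 40000;   /* large pending diff on one cpu */

            printf("plain    read: %ld\n", zone_numa_state());          /* 1000  */
            printf("snapshot read: %ld\n", zone_numa_state_snapshot()); /* 41007 */
            return 0;
    }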

Changelog:
v2:
a) new creation.

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 include/linux/vmstat.h | 6 +-
 mm/vmstat.c| 9 +++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index a29bd98..72e9ca6 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum 
numa_stat_item item)
return x;
 }
 
-static inline unsigned long zone_numa_state(struct zone *zone,
+static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
enum numa_stat_item item)
 {
long x = atomic_long_read(&zone->vm_numa_stat[item]);
+   int cpu;
+
+   for_each_online_cpu(cpu)
+   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
 
return x;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b015f39..abeab81 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -895,6 +895,10 @@ unsigned long sum_zone_node_page_state(int node,
return count;
 }
 
+/*
+ * Determine the per node value of a numa stat item. To avoid deviation,
+ * the per cpu stat number in vm_numa_stat_diff[] is also included.
+ */
 unsigned long sum_zone_numa_state(int node,
 enum numa_stat_item item)
 {
@@ -903,7 +907,7 @@ unsigned long sum_zone_numa_state(int node,
unsigned long count = 0;
 
for (i = 0; i < MAX_NR_ZONES; i++)
-   count += zone_numa_state(zones + i, item);
+   count += zone_numa_state_snapshot(zones + i, item);
 
return count;
 }
@@ -1534,7 +1538,7 @@ static void zoneinfo_show_print(struct seq_file *m, 
pg_data_t *pgdat,
for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
seq_printf(m, "\n  %-12s %lu",
vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-   zone_numa_state(zone, i));
+   zone_numa_state_snapshot(zone, i));
 #endif
 
seq_printf(m, "\n  pagesets");
@@ -1790,6 +1794,7 @@ static bool need_update(int cpu)
 #ifdef CONFIG_NUMA
BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
 #endif
+
/*
 * The fast way of checking if there are any vmstat diffs.
 * This works because the diffs are byte sized items.
-- 
2.7.4



[PATCH 1/2] mm: Change the call sites of numa statistics items

2017-08-15 Thread Kemi Wang
In this patch, NUMA statistics are separated from the zone statistics
framework, and all call sites of the NUMA stats are switched to
numa-stats-specific functions. There is no functional change except that the
NUMA stats are shown after the zone page stats, and the NUMA stats threshold
is shown after the pcp threshold, when users *read* the zone info.

E.g. cat /proc/zoneinfo
***Base***                         ***With this patch***
nr_free_pages 3976                 nr_free_pages 3976
nr_zone_inactive_anon 0            nr_zone_inactive_anon 0
nr_zone_active_anon 0              nr_zone_active_anon 0
nr_zone_inactive_file 0            nr_zone_inactive_file 0
nr_zone_active_file 0              nr_zone_active_file 0
nr_zone_unevictable 0              nr_zone_unevictable 0
nr_zone_write_pending 0            nr_zone_write_pending 0
nr_mlock 0                         nr_mlock 0
nr_page_table_pages 0              nr_page_table_pages 0
nr_kernel_stack 0                  nr_kernel_stack 0
nr_bounce 0                        nr_bounce 0
nr_zspages 0                       nr_zspages 0
numa_hit 0                         *nr_free_cma 0*
numa_miss 0                        numa_hit 0
numa_foreign 0                     numa_miss 0
numa_interleave 0                  numa_foreign 0
numa_local 0                       numa_interleave 0
numa_other 0                       numa_local 0
*nr_free_cma 0*                    numa_other 0
...                                ...
vm stats threshold: 10             vm stats threshold: 10
...                                *vm numa stats threshold: 10*
                                   ...

The next patch updates the numa stats counter size and threshold.

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
---
 drivers/base/node.c|  22 ---
 include/linux/mmzone.h |  25 +---
 include/linux/vmstat.h |  29 +
 mm/page_alloc.c|  10 +--
 mm/vmstat.c| 167 +++--
 5 files changed, 227 insertions(+), 26 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index d8dc830..12080c6 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
   "interleave_hit %lu\n"
   "local_node %lu\n"
   "other_node %lu\n",
-  sum_zone_node_page_state(dev->id, NUMA_HIT),
-  sum_zone_node_page_state(dev->id, NUMA_MISS),
-  sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
-  sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
-  sum_zone_node_page_state(dev->id, NUMA_LOCAL),
-  sum_zone_node_page_state(dev->id, NUMA_OTHER));
+  sum_zone_node_numa_state(dev->id, NUMA_HIT),
+  sum_zone_node_numa_state(dev->id, NUMA_MISS),
+  sum_zone_node_numa_state(dev->id, NUMA_FOREIGN),
+  sum_zone_node_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
+  sum_zone_node_numa_state(dev->id, NUMA_LOCAL),
+  sum_zone_node_numa_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
@@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev,
n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
 sum_zone_node_page_state(nid, i));
 
-   for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+#ifdef CONFIG_NUMA
+   for (i = 0; i < NR_VM_ZONE_NUMA_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
 vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+sum_zone_node_numa_state(nid, i));
+#endif
+
+   for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+   n += sprintf(buf+n, "%s %lu\n",
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+NR_VM_ZONE_NUMA_STAT_ITEMS],
 node_page_state(pgdat, i));
 
return n;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc14b8b..0b11ba7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,20 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+#ifdef CONFIG_NUMA
+enum zone_numa_stat_item {
+   NUMA_HIT,   /* allocated in intended node */
+   NUMA_MISS,  /* allocated in non intended node */
+   NUMA_FOREIGN,   /* was intended here, hit elsewhere */
+   NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */

[PATCH 0/2] Separate NUMA statistics from zone statistics

2017-08-15 Thread Kemi Wang
Each page allocation updates a set of per-zone statistics with a call to
zone_statistics(). As discussed at the 2017 MM summit, these are a substantial
source of overhead in the page allocator and are very rarely consumed. The
overhead comes mainly from cache bouncing when the zone counters (NUMA
associated counters) are updated in parallel in multi-threaded page
allocation (pointed out by Dave Hansen).

To mitigate this overhead, this patchset separates the NUMA statistics from
the zone statistics framework and raises the NUMA counter threshold to a
fixed size of 32765, since a small threshold greatly increases how often the
local per-cpu counters are folded into the global counter (suggested by Ying
Huang). The rationale is that these statistics counters don't need to be
read often, unlike other VM counters, so it's not a problem to use a large
threshold and make readers more expensive.

With this patchset, we see a 26.6% drop in CPU cycles (537-->394, see below)
per single-page allocation and reclaim on Jesper's page_bench03 benchmark.
Meanwhile, the patchset keeps the same style of virtual memory statistics
with little end-user-visible effect (see the first patch for details), except
that the NUMA counts pending in each cpu's vm_numa_stat_diff[] are added to
zone->vm_numa_stat[] when a user *reads* the value of a NUMA counter, to
eliminate deviation.

I ran an experiment of concurrent single-page allocation and reclaim using
Jesper's page_bench03 benchmark on a 2-socket Broadwell-based server (88
processors, 126G memory) with different pcp counter threshold sizes.

Benchmark provided by Jesper D. Brouer (with the loop count increased to 1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

 Threshold   CPU cycles    Throughput(88 threads)
 32          799           241760478
 64          640           301628829
 125         537           358906028    <==> system by default
 256         468           412397590
 512         428           450550704
 4096        399           482520943
 2           394           489009617
 3           395           488017817
 32765       394(-26.6%)   488932078(+36.2%)    <==> with this patchset
 N/A         342(-36.3%)   562900157(+56.8%)    <==> disable zone_statistics

Kemi Wang (2):
  mm: Change the call sites of numa statistics items
  mm: Update NUMA counter threshold size

 drivers/base/node.c|  22 ---
 include/linux/mmzone.h |  25 +---
 include/linux/vmstat.h |  33 ++
 mm/page_alloc.c|  10 +--
 mm/vmstat.c| 162 +++--
 5 files changed, 227 insertions(+), 25 deletions(-)

-- 
2.7.4


[PATCH 2/2] mm: Update NUMA counter threshold size

2017-08-15 Thread Kemi Wang
There is significant overhead from cache bouncing caused by the zone
counters (NUMA associated counters) being updated in parallel in
multi-threaded page allocation (pointed out by Dave Hansen).

This patch updates the NUMA counter threshold to a fixed size of 32765,
since a small threshold greatly increases how often the local per-cpu
counters are folded into the global counter. In addition, the NUMA counts
pending in each cpu's vm_numa_stat_diff[] are added to zone->vm_numa_stat[]
when a user *reads* the value of a numa counter, to eliminate deviation
(suggested by Ying Huang).

The rationale is that these statistics counters don't need to be read often,
unlike other VM counters, so it's not a problem to use a large threshold and
make readers more expensive.

With this patchset, we see a 26.6% drop in CPU cycles (537-->394) per
single-page allocation and reclaim on Jesper's page_bench03 benchmark.

Benchmark provided by Jesper D. Brouer (with the loop count increased to 1000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

 Threshold   CPU cycles    Throughput(88 threads)
 32          799           241760478
 64          640           301628829
 125         537           358906028    <==> system by default (base)
 256         468           412397590
 512         428           450550704
 4096        399           482520943
 2           394           489009617
 3           395           488017817
 32765       394(-26.6%)   488932078(+36.2%)    <==> with this patchset
 N/A         342(-36.3%)   562900157(+56.8%)    <==> disable zone_statistics

Signed-off-by: Kemi Wang <kemi.w...@intel.com>
Suggested-by: Dave Hansen <dave.han...@intel.com>
Suggested-by: Ying Huang <ying.hu...@intel.com>
---
 include/linux/mmzone.h |  4 ++--
 include/linux/vmstat.h |  6 +-
 mm/vmstat.c| 23 ++-
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0b11ba7..7eaf0e8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -282,8 +282,8 @@ struct per_cpu_pageset {
struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
s8 expire;
-   s8 numa_stat_threshold;
-   s8 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS];
+   s16 numa_stat_threshold;
+   s16 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS];
 #endif
 #ifdef CONFIG_SMP
s8 stat_threshold;
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1e19379..d97cc34 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum 
zone_numa_stat_item item)
return x;
 }
 
-static inline unsigned long zone_numa_state(struct zone *zone,
+static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
enum zone_numa_stat_item item)
 {
long x = atomic_long_read(&zone->vm_numa_stat[item]);
+   int cpu;
+
+   for_each_online_cpu(cpu)
+   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
 
return x;
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5a7fa30..c7f50ed 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -30,6 +30,8 @@
 
 #include "internal.h"
 
+#define NUMA_STAT_THRESHOLD  32765
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -196,7 +198,7 @@ void refresh_zone_stat_thresholds(void)
= threshold;
 #ifdef CONFIG_NUMA
per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
-   = threshold;
+   = NUMA_STAT_THRESHOLD;
 #endif
/* Base nodestat threshold on the largest populated 
zone. */
pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, 
cpu)->stat_threshold;
@@ -231,14 +233,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
continue;
 
threshold = (*calculate_pressure)(zone);
-   for_each_online_cpu(cpu) {
+   for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
-#ifdef CONFIG_NUMA
-   per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
-   = threshold;
-#endif
-   }
}
 }
 
@@ -872,13 +869,13 @@ void __inc_zone_numa_state(struct zone *zone,
 enum zone_numa_stat_item item)
 {
struct per_cpu_pageset __percpu *pcp = zone->pageset;
-   s8 __percpu *p = pcp->vm_numa_stat_diff + item;
-   s8 v, t;
+   s16 __percpu *p = pcp->vm_numa_stat_diff + item;
+   s16 v, t;
 
v = __this_cpu_inc_return(*p);