[PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node
There is not really any use to get NUMA stats separated by zone, and current per-zone NUMA stats is only consumed in /proc/zoneinfo. For code cleanup purpose, we move NUMA stats from per-zone to per-node and reuse the existed per-cpu infrastructure. Suggested-by: Andi Kleen <a...@linux.intel.com> Suggested-by: Michal Hocko <mho...@kernel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- drivers/base/node.c| 23 +++ include/linux/mmzone.h | 27 include/linux/vmstat.h | 31 - mm/mempolicy.c | 2 +- mm/page_alloc.c| 16 +++-- mm/vmstat.c| 177 + 6 files changed, 46 insertions(+), 230 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ee090ab..a045ea1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_numa_state(dev->id, NUMA_HIT), - sum_zone_numa_state(dev->id, NUMA_MISS), - sum_zone_numa_state(dev->id, NUMA_FOREIGN), - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_numa_state(dev->id, NUMA_LOCAL), - sum_zone_numa_state(dev->id, NUMA_OTHER)); + node_page_state(NODE_DATA(dev->id), NUMA_HIT), + node_page_state(NODE_DATA(dev->id), NUMA_MISS), + node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN), + node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT), + node_page_state(NODE_DATA(dev->id), NUMA_LOCAL), + node_page_state(NODE_DATA(dev->id), NUMA_OTHER)); } + static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); static ssize_t node_read_vmstat(struct device *dev, @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - n += sprintf(buf+n, "%s %lu\n", -vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], -sum_zone_numa_state(nid, i)); -#endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", 
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + -NR_VM_NUMA_STAT_ITEMS], +vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c..c06d880 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -115,20 +115,6 @@ struct zone_padding { #define ZONE_PADDING(name) #endif -#ifdef CONFIG_NUMA -enum numa_stat_item { - NUMA_HIT, /* allocated in intended node */ - NUMA_MISS, /* allocated in non intended node */ - NUMA_FOREIGN, /* was intended here, hit elsewhere */ - NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */ - NUMA_LOCAL, /* allocation from local node */ - NUMA_OTHER, /* allocation from other node */ - NR_VM_NUMA_STAT_ITEMS -}; -#else -#define NR_VM_NUMA_STAT_ITEMS 0 -#endif - enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, @@ -151,7 +137,18 @@ enum zone_stat_item { NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { - NR_LRU_BASE, +#ifdef CONFIG_NUMA + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */ + NUMA_LOCAL, /* allocation from local node */ + NUMA_OTHER, /* allocation from other node */ + NR_VM_NUMA_STAT_ITEMS, +#else +#defineNR_VM_NUMA_STAT_ITEMS 0 +#endif + NR_LRU_BASE = NR_VM_NUMA_STAT_ITEMS, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c98..80bf290 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -118,37 +118,8 @@ static inline vo
[PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node
There is not really any use to get NUMA stats separated by zone, and current per-zone NUMA stats is only consumed in /proc/zoneinfo. For code cleanup purpose, we move NUMA stats from per-zone to per-node and reuse the existed per-cpu infrastructure. Suggested-by: Andi Kleen Suggested-by: Michal Hocko Signed-off-by: Kemi Wang --- drivers/base/node.c| 23 +++ include/linux/mmzone.h | 27 include/linux/vmstat.h | 31 - mm/mempolicy.c | 2 +- mm/page_alloc.c| 16 +++-- mm/vmstat.c| 177 + 6 files changed, 46 insertions(+), 230 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ee090ab..a045ea1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_numa_state(dev->id, NUMA_HIT), - sum_zone_numa_state(dev->id, NUMA_MISS), - sum_zone_numa_state(dev->id, NUMA_FOREIGN), - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_numa_state(dev->id, NUMA_LOCAL), - sum_zone_numa_state(dev->id, NUMA_OTHER)); + node_page_state(NODE_DATA(dev->id), NUMA_HIT), + node_page_state(NODE_DATA(dev->id), NUMA_MISS), + node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN), + node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT), + node_page_state(NODE_DATA(dev->id), NUMA_LOCAL), + node_page_state(NODE_DATA(dev->id), NUMA_OTHER)); } + static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); static ssize_t node_read_vmstat(struct device *dev, @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); -#ifdef CONFIG_NUMA - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) - n += sprintf(buf+n, "%s %lu\n", -vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], -sum_zone_numa_state(nid, i)); -#endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", -vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + -NR_VM_NUMA_STAT_ITEMS], 
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c..c06d880 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -115,20 +115,6 @@ struct zone_padding { #define ZONE_PADDING(name) #endif -#ifdef CONFIG_NUMA -enum numa_stat_item { - NUMA_HIT, /* allocated in intended node */ - NUMA_MISS, /* allocated in non intended node */ - NUMA_FOREIGN, /* was intended here, hit elsewhere */ - NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */ - NUMA_LOCAL, /* allocation from local node */ - NUMA_OTHER, /* allocation from other node */ - NR_VM_NUMA_STAT_ITEMS -}; -#else -#define NR_VM_NUMA_STAT_ITEMS 0 -#endif - enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, @@ -151,7 +137,18 @@ enum zone_stat_item { NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { - NR_LRU_BASE, +#ifdef CONFIG_NUMA + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */ + NUMA_LOCAL, /* allocation from local node */ + NUMA_OTHER, /* allocation from other node */ + NR_VM_NUMA_STAT_ITEMS, +#else +#defineNR_VM_NUMA_STAT_ITEMS 0 +#endif + NR_LRU_BASE = NR_VM_NUMA_STAT_ITEMS, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c98..80bf290 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -118,37 +118,8 @@ static inline void vm_events_fold_cpu(int cpu) * Zone and node-based page accounting with p
[PATCH v2 5/5] mm: Rename zone_statistics() to numa_statistics()
Since the functionality of zone_statistics() updates numa counters, but numa statistics has been separated from zone statistics framework. Thus, the function name makes people confused. So, change the name to numa_statistics() as well as its call sites accordingly. Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81e8d8f..f7583de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2790,7 +2790,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void numa_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA int preferred_nid = preferred_zone->node; @@ -2854,7 +2854,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); } local_irq_restore(flags); return page; @@ -2902,7 +2902,7 @@ struct page *rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); local_irq_restore(flags); out: -- 2.7.4
[PATCH v2 5/5] mm: Rename zone_statistics() to numa_statistics()
Since the functionality of zone_statistics() updates numa counters, but numa statistics has been separated from zone statistics framework. Thus, the function name makes people confused. So, change the name to numa_statistics() as well as its call sites accordingly. Signed-off-by: Kemi Wang --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81e8d8f..f7583de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2790,7 +2790,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void numa_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA int preferred_nid = preferred_zone->node; @@ -2854,7 +2854,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); } local_irq_restore(flags); return page; @@ -2902,7 +2902,7 @@ struct page *rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); local_irq_restore(flags); out: -- 2.7.4
[PATCH v2 4/5] mm: use node_page_state_snapshot to avoid deviation
To avoid deviation, this patch uses node_page_state_snapshot instead of node_page_state for node page stats query. e.g. cat /proc/zoneinfo cat /sys/devices/system/node/node*/vmstat cat /sys/devices/system/node/node*/numastat As it is a slow path and would not be read frequently, I would not worry about it. Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- drivers/base/node.c | 17 ++--- mm/vmstat.c | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index a045ea1..cf303f8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -169,12 +169,15 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - node_page_state(NODE_DATA(dev->id), NUMA_HIT), - node_page_state(NODE_DATA(dev->id), NUMA_MISS), - node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN), - node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT), - node_page_state(NODE_DATA(dev->id), NUMA_LOCAL), - node_page_state(NODE_DATA(dev->id), NUMA_OTHER)); + node_page_state_snapshot(NODE_DATA(dev->id), NUMA_HIT), + node_page_state_snapshot(NODE_DATA(dev->id), NUMA_MISS), + node_page_state_snapshot(NODE_DATA(dev->id), + NUMA_FOREIGN), + node_page_state_snapshot(NODE_DATA(dev->id), + NUMA_INTERLEAVE_HIT), + node_page_state_snapshot(NODE_DATA(dev->id), NUMA_LOCAL), + node_page_state_snapshot(NODE_DATA(dev->id), + NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -194,7 +197,7 @@ static ssize_t node_read_vmstat(struct device *dev, for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], -node_page_state(pgdat, i)); +node_page_state_snapshot(pgdat, i)); return n; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 64e08ae..d65f28d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1466,7 +1466,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n %-12s %lu", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - node_page_state(pgdat, i)); + node_page_state_snapshot(pgdat, i)); } } seq_printf(m, -- 2.7.4
[PATCH v2 4/5] mm: use node_page_state_snapshot to avoid deviation
To avoid deviation, this patch uses node_page_state_snapshot instead of node_page_state for node page stats query. e.g. cat /proc/zoneinfo cat /sys/devices/system/node/node*/vmstat cat /sys/devices/system/node/node*/numastat As it is a slow path and would not be read frequently, I would not worry about it. Signed-off-by: Kemi Wang --- drivers/base/node.c | 17 ++--- mm/vmstat.c | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index a045ea1..cf303f8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -169,12 +169,15 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - node_page_state(NODE_DATA(dev->id), NUMA_HIT), - node_page_state(NODE_DATA(dev->id), NUMA_MISS), - node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN), - node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT), - node_page_state(NODE_DATA(dev->id), NUMA_LOCAL), - node_page_state(NODE_DATA(dev->id), NUMA_OTHER)); + node_page_state_snapshot(NODE_DATA(dev->id), NUMA_HIT), + node_page_state_snapshot(NODE_DATA(dev->id), NUMA_MISS), + node_page_state_snapshot(NODE_DATA(dev->id), + NUMA_FOREIGN), + node_page_state_snapshot(NODE_DATA(dev->id), + NUMA_INTERLEAVE_HIT), + node_page_state_snapshot(NODE_DATA(dev->id), NUMA_LOCAL), + node_page_state_snapshot(NODE_DATA(dev->id), + NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -194,7 +197,7 @@ static ssize_t node_read_vmstat(struct device *dev, for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], -node_page_state(pgdat, i)); +node_page_state_snapshot(pgdat, i)); return n; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 64e08ae..d65f28d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1466,7 +1466,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { seq_printf(m, "\n %-12s 
%lu", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - node_page_state(pgdat, i)); + node_page_state_snapshot(pgdat, i)); } } seq_printf(m, -- 2.7.4
[PATCH v2 2/5] mm: Extend local cpu counter vm_node_stat_diff from s8 to s16
The type s8 used for vm_diff_nodestat[] as local cpu counters has the limitation of global counters update frequency, especially for those monotone increasing type of counters like NUMA counters with more and more cpus/nodes. This patch extends the type of vm_diff_nodestat from s8 to s16 without any functionality change. before after sizeof(struct per_cpu_nodestat)28 68 Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- include/linux/mmzone.h | 4 ++-- mm/vmstat.c| 16 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c06d880..2da6b6f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -289,8 +289,8 @@ struct per_cpu_pageset { }; struct per_cpu_nodestat { - s8 stat_threshold; - s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; + s16 stat_threshold; + s16 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 1dd12ae..9c681cc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -332,7 +332,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long delta) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; + s16 __percpu *p = pcp->vm_node_stat_diff + item; long x; long t; @@ -390,13 +390,13 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; - s8 v, t; + s16 __percpu *p = pcp->vm_node_stat_diff + item; + s16 v, t; v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { - s8 overstep = t >> 1; + s16 overstep = t >> 1; node_page_state_add(v + overstep, pgdat, item); __this_cpu_write(*p, -overstep); @@ -434,13 +434,13 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; - s8 v, t; + s16 __percpu *p = pcp->vm_node_stat_diff + item; + s16 v, t; v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { - s8 overstep = t >> 1; + s16 overstep = t >> 1; node_page_state_add(v - overstep, pgdat, item); __this_cpu_write(*p, overstep); @@ -533,7 +533,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, enum node_stat_item item, int delta, int overstep_mode) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; + s16 __percpu *p = pcp->vm_node_stat_diff + item; long o, n, t, z; do { -- 2.7.4
[PATCH v2 3/5] mm: enlarge NUMA counters threshold size
We have seen significant overhead in cache bouncing caused by NUMA counters update in multi-threaded page allocation. See 'commit 1d90ca897cb0 ("mm: update NUMA counter threshold size")' for more details. This patch updates NUMA counters to a fixed size of (S16_MAX - 2) and deals with global counter update using different threshold size for node page stats. Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- mm/vmstat.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 9c681cc..64e08ae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define VM_NUMA_STAT_THRESHOLD (S16_MAX - 2) + #ifdef CONFIG_NUMA int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; @@ -394,7 +396,11 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) s16 v, t; v = __this_cpu_inc_return(*p); - t = __this_cpu_read(pcp->stat_threshold); + if (item >= NR_VM_NUMA_STAT_ITEMS) + t = __this_cpu_read(pcp->stat_threshold); + else + t = VM_NUMA_STAT_THRESHOLD; + if (unlikely(v > t)) { s16 overstep = t >> 1; @@ -549,7 +555,10 @@ static inline void mod_node_state(struct pglist_data *pgdat, * Most of the time the thresholds are the same anyways * for all cpus in a node. */ - t = this_cpu_read(pcp->stat_threshold); + if (item >= NR_VM_NUMA_STAT_ITEMS) + t = this_cpu_read(pcp->stat_threshold); + else + t = VM_NUMA_STAT_THRESHOLD; o = this_cpu_read(*p); n = delta + o; -- 2.7.4
[PATCH v2 2/5] mm: Extend local cpu counter vm_node_stat_diff from s8 to s16
The type s8 used for vm_diff_nodestat[] as local cpu counters has the limitation of global counters update frequency, especially for those monotone increasing type of counters like NUMA counters with more and more cpus/nodes. This patch extends the type of vm_diff_nodestat from s8 to s16 without any functionality change. before after sizeof(struct per_cpu_nodestat)28 68 Signed-off-by: Kemi Wang --- include/linux/mmzone.h | 4 ++-- mm/vmstat.c| 16 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c06d880..2da6b6f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -289,8 +289,8 @@ struct per_cpu_pageset { }; struct per_cpu_nodestat { - s8 stat_threshold; - s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; + s16 stat_threshold; + s16 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ diff --git a/mm/vmstat.c b/mm/vmstat.c index 1dd12ae..9c681cc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -332,7 +332,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long delta) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; + s16 __percpu *p = pcp->vm_node_stat_diff + item; long x; long t; @@ -390,13 +390,13 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; - s8 v, t; + s16 __percpu *p = pcp->vm_node_stat_diff + item; + s16 v, t; v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v > t)) { - s8 overstep = t >> 1; + s16 overstep = t >> 1; node_page_state_add(v + overstep, pgdat, item); __this_cpu_write(*p, -overstep); @@ -434,13 +434,13 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) void 
__dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; - s8 v, t; + s16 __percpu *p = pcp->vm_node_stat_diff + item; + s16 v, t; v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); if (unlikely(v < - t)) { - s8 overstep = t >> 1; + s16 overstep = t >> 1; node_page_state_add(v - overstep, pgdat, item); __this_cpu_write(*p, overstep); @@ -533,7 +533,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, enum node_stat_item item, int delta, int overstep_mode) { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; - s8 __percpu *p = pcp->vm_node_stat_diff + item; + s16 __percpu *p = pcp->vm_node_stat_diff + item; long o, n, t, z; do { -- 2.7.4
[PATCH v2 3/5] mm: enlarge NUMA counters threshold size
We have seen significant overhead in cache bouncing caused by NUMA counters update in multi-threaded page allocation. See 'commit 1d90ca897cb0 ("mm: update NUMA counter threshold size")' for more details. This patch updates NUMA counters to a fixed size of (S16_MAX - 2) and deals with global counter update using different threshold size for node page stats. Signed-off-by: Kemi Wang --- mm/vmstat.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 9c681cc..64e08ae 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define VM_NUMA_STAT_THRESHOLD (S16_MAX - 2) + #ifdef CONFIG_NUMA int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; @@ -394,7 +396,11 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) s16 v, t; v = __this_cpu_inc_return(*p); - t = __this_cpu_read(pcp->stat_threshold); + if (item >= NR_VM_NUMA_STAT_ITEMS) + t = __this_cpu_read(pcp->stat_threshold); + else + t = VM_NUMA_STAT_THRESHOLD; + if (unlikely(v > t)) { s16 overstep = t >> 1; @@ -549,7 +555,10 @@ static inline void mod_node_state(struct pglist_data *pgdat, * Most of the time the thresholds are the same anyways * for all cpus in a node. */ - t = this_cpu_read(pcp->stat_threshold); + if (item >= NR_VM_NUMA_STAT_ITEMS) + t = this_cpu_read(pcp->stat_threshold); + else + t = VM_NUMA_STAT_THRESHOLD; o = this_cpu_read(*p); n = delta + o; -- 2.7.4
[PATCH v2 0/5] mm: NUMA stats code cleanup and enhancement
The existing implementation of NUMA counters is per logical CPU along with zone->vm_numa_stat[] separated by zone, plus a global numa counter array vm_numa_stat[]. However, unlike the other vmstat counters, NUMA stats don't affect the system's decision and are only consumed when reading from /proc and /sys. Also, usually nodes only have a single zone, except for node 0, and there isn't really any use where you need these hit counts separated by zone. Therefore, we can migrate the implementation of numa stats from per-zone to per-node (as suggested by Andi Kleen), and reuse the existing per-cpu infrastructure with a little enhancement for NUMA stats. In this way, we can get rid of the special way for NUMA stats and keep the performance gain at the same time. With this patch series, about 170 lines of code can be saved. The first patch migrates NUMA stats from per-zone to per-node using the existing per-cpu infrastructure. There is a little user-visible change when reading /proc/zoneinfo listed below: Before After Node 0, zone DMA Node 0, zone DMA per-node stats per-node stats nr_inactive_anon 7244 *numa_hit 98665086* nr_active_anon 177064 *numa_miss0* ...*numa_foreign 0* nr_bounce0 *numa_interleave 21059* nr_free_cma 0 *numa_local 98665086* *numa_hit 0**numa_other 0* *numa_miss0* nr_inactive_anon 20055 *numa_foreign 0* nr_active_anon 389771 *numa_interleave 0* ... *numa_local 0* nr_bounce0 *numa_other 0* nr_free_cma 0 The second patch extends the local cpu counter vm_node_stat_diff from s8 to s16. It does not have any functionality change. The third patch uses a large and constant threshold size for NUMA counters to reduce the global NUMA counters update frequency. The fourth patch uses node_page_state_snapshot instead of node_page_state when querying node stats (e.g. cat /sys/devices/system/node/node*/vmstat). The only difference is that the stats values in local cpus are also included in node_page_state_snapshot. The last patch renames zone_statistics() to numa_statistics(). 
At last, I want to extend my heartiest appreciation for Michal Hocko's suggestion of reusing the existing per-cpu infrastructure, which makes it much better than before. Changelog: v1->v2: a) enhance the existing per-cpu infrastructure for node page stats by extending local cpu counters vm_node_stat_diff from s8 to s16 b) reuse the per-cpu infrastructure for NUMA stats Kemi Wang (5): mm: migrate NUMA stats from per-zone to per-node mm: Extend local cpu counter vm_node_stat_diff from s8 to s16 mm: enlarge NUMA counters threshold size mm: use node_page_state_snapshot to avoid deviation mm: Rename zone_statistics() to numa_statistics() drivers/base/node.c| 28 +++ include/linux/mmzone.h | 31 include/linux/vmstat.h | 31 mm/mempolicy.c | 2 +- mm/page_alloc.c| 22 +++--- mm/vmstat.c| 206 + 6 files changed, 74 insertions(+), 246 deletions(-) -- 2.7.4
[PATCH v2 0/5] mm: NUMA stats code cleanup and enhancement
The existing implementation of NUMA counters is per logical CPU along with zone->vm_numa_stat[] separated by zone, plus a global numa counter array vm_numa_stat[]. However, unlike the other vmstat counters, NUMA stats don't affect the system's decision and are only consumed when reading from /proc and /sys. Also, usually nodes only have a single zone, except for node 0, and there isn't really any use where you need these hit counts separated by zone. Therefore, we can migrate the implementation of numa stats from per-zone to per-node (as suggested by Andi Kleen), and reuse the existing per-cpu infrastructure with a little enhancement for NUMA stats. In this way, we can get rid of the special way for NUMA stats and keep the performance gain at the same time. With this patch series, about 170 lines of code can be saved. The first patch migrates NUMA stats from per-zone to per-node using the existing per-cpu infrastructure. There is a little user-visible change when reading /proc/zoneinfo listed below: Before After Node 0, zone DMA Node 0, zone DMA per-node stats per-node stats nr_inactive_anon 7244 *numa_hit 98665086* nr_active_anon 177064 *numa_miss0* ...*numa_foreign 0* nr_bounce0 *numa_interleave 21059* nr_free_cma 0 *numa_local 98665086* *numa_hit 0**numa_other 0* *numa_miss0* nr_inactive_anon 20055 *numa_foreign 0* nr_active_anon 389771 *numa_interleave 0* ... *numa_local 0* nr_bounce0 *numa_other 0* nr_free_cma 0 The second patch extends the local cpu counter vm_node_stat_diff from s8 to s16. It does not have any functionality change. The third patch uses a large and constant threshold size for NUMA counters to reduce the global NUMA counters update frequency. The fourth patch uses node_page_state_snapshot instead of node_page_state when querying node stats (e.g. cat /sys/devices/system/node/node*/vmstat). The only difference is that the stats values in local cpus are also included in node_page_state_snapshot. The last patch renames zone_statistics() to numa_statistics(). 
At last, I want to extend my heartiest appreciation for Michal Hocko's suggestion of reusing the existing per-cpu infrastructure, which makes it much better than before. Changelog: v1->v2: a) enhance the existing per-cpu infrastructure for node page stats by extending local cpu counters vm_node_stat_diff from s8 to s16 b) reuse the per-cpu infrastructure for NUMA stats Kemi Wang (5): mm: migrate NUMA stats from per-zone to per-node mm: Extend local cpu counter vm_node_stat_diff from s8 to s16 mm: enlarge NUMA counters threshold size mm: use node_page_state_snapshot to avoid deviation mm: Rename zone_statistics() to numa_statistics() drivers/base/node.c| 28 +++ include/linux/mmzone.h | 31 include/linux/vmstat.h | 31 mm/mempolicy.c | 2 +- mm/page_alloc.c| 22 +++--- mm/vmstat.c| 206 + 6 files changed, 74 insertions(+), 246 deletions(-) -- 2.7.4
[PATCH 2/2] mm: Rename zone_statistics() to numa_statistics()
Since numa statistics has been separated from zone statistics framework, but the functionality of zone_statistics() updates numa counters. Thus, the function name makes people confused. So, change the name to numa_statistics() as well as its call sites accordingly. Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 142e1ba..61fa717 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2783,7 +2783,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void numa_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; @@ -2845,7 +2845,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); } local_irq_restore(flags); return page; @@ -2893,7 +2893,7 @@ struct page *rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); local_irq_restore(flags); out: -- 2.7.4
[PATCH 2/2] mm: Rename zone_statistics() to numa_statistics()
Since numa statistics has been separated from zone statistics framework, but the functionality of zone_statistics() updates numa counters. Thus, the function name makes people confused. So, change the name to numa_statistics() as well as its call sites accordingly. Signed-off-by: Kemi Wang --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 142e1ba..61fa717 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2783,7 +2783,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) +static inline void numa_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; @@ -2845,7 +2845,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); } local_irq_restore(flags); return page; @@ -2893,7 +2893,7 @@ struct page *rmqueue(struct zone *preferred_zone, get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone); + numa_statistics(preferred_zone, zone); local_irq_restore(flags); out: -- 2.7.4
[PATCH 1/2] mm: NUMA stats code cleanup and enhancement
The existing implementation of NUMA counters is per logical CPU along with zone->vm_numa_stat[] separated by zone, plus a global NUMA counter array vm_numa_stat[]. However, unlike the other vmstat counters, NUMA stats don't affect the system's decisions and are only read from /proc and /sys; reading them is a slow-path operation that can likely tolerate higher overhead. Additionally, nodes usually have only a single zone, except for node 0, and there isn't really any use case where you need these hit counts separated by zone. Therefore, we can migrate the implementation of NUMA stats from per-zone to per-node and get rid of these global NUMA counters. It's good enough to keep everything in a per-cpu pointer of type u64 and sum the values up when needed, as suggested by Andi Kleen. That's helpful for code cleanup and enhancement (e.g. it removes more than 130 lines of code). With this patch, we see a 1.8% (335->329) drop of CPU cycles for concurrent single page allocation and deallocation with 112 threads, tested on a 2-socket Skylake platform using Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Brouer (loop count increased to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Also, it does not cause an obvious latency increase when reading /proc and /sys on a 2-socket Skylake platform. Latency shown by the time command: base head /proc/vmstatsys 0m0.001s sys 0m0.001s /sys/devices/system/sys 0m0.001s sys 0m0.000s node/node*/numastat We need not worry much about it, as it is a slow path and will not be read frequently. 
Suggested-by: Andi Kleen <a...@linux.intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- drivers/base/node.c| 14 ++--- include/linux/mmzone.h | 2 - include/linux/vmstat.h | 61 +- mm/page_alloc.c| 7 +++ mm/vmstat.c| 167 - 5 files changed, 56 insertions(+), 195 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ee090ab..0be5fbd 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -169,12 +169,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_numa_state(dev->id, NUMA_HIT), - sum_zone_numa_state(dev->id, NUMA_MISS), - sum_zone_numa_state(dev->id, NUMA_FOREIGN), - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_numa_state(dev->id, NUMA_LOCAL), - sum_zone_numa_state(dev->id, NUMA_OTHER)); + node_numa_state_snapshot(dev->id, NUMA_HIT), + node_numa_state_snapshot(dev->id, NUMA_MISS), + node_numa_state_snapshot(dev->id, NUMA_FOREIGN), + node_numa_state_snapshot(dev->id, NUMA_INTERLEAVE_HIT), + node_numa_state_snapshot(dev->id, NUMA_LOCAL), + node_numa_state_snapshot(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -194,7 +194,7 @@ static ssize_t node_read_vmstat(struct device *dev, for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], -sum_zone_numa_state(nid, i)); +node_numa_state_snapshot(nid, i)); #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c..b2d264f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -283,7 +283,6 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; @@ -504,7 +503,6 @@ struct zone { ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - atomic_long_t 
vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c98..7383d66 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -118,36 +118,8 @@ static inline void vm_events_fold_cpu(int cpu) * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; -extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; - -#ifdef CONFIG_NUMA -static inline void zone_numa_state_add(long x, struct zone *zone, -
[PATCH 1/2] mm: NUMA stats code cleanup and enhancement
The existing implementation of NUMA counters is per logical CPU along with zone->vm_numa_stat[] separated by zone, plus a global NUMA counter array vm_numa_stat[]. However, unlike the other vmstat counters, NUMA stats don't affect the system's decisions and are only read from /proc and /sys; reading them is a slow-path operation that can likely tolerate higher overhead. Additionally, nodes usually have only a single zone, except for node 0, and there isn't really any use case where you need these hit counts separated by zone. Therefore, we can migrate the implementation of NUMA stats from per-zone to per-node and get rid of these global NUMA counters. It's good enough to keep everything in a per-cpu pointer of type u64 and sum the values up when needed, as suggested by Andi Kleen. That's helpful for code cleanup and enhancement (e.g. it removes more than 130 lines of code). With this patch, we see a 1.8% (335->329) drop of CPU cycles for concurrent single page allocation and deallocation with 112 threads, tested on a 2-socket Skylake platform using Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Brouer (loop count increased to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Also, it does not cause an obvious latency increase when reading /proc and /sys on a 2-socket Skylake platform. Latency shown by the time command: base head /proc/vmstatsys 0m0.001s sys 0m0.001s /sys/devices/system/sys 0m0.001s sys 0m0.000s node/node*/numastat We need not worry much about it, as it is a slow path and will not be read frequently. 
Suggested-by: Andi Kleen Signed-off-by: Kemi Wang --- drivers/base/node.c| 14 ++--- include/linux/mmzone.h | 2 - include/linux/vmstat.h | 61 +- mm/page_alloc.c| 7 +++ mm/vmstat.c| 167 - 5 files changed, 56 insertions(+), 195 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ee090ab..0be5fbd 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -169,12 +169,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_numa_state(dev->id, NUMA_HIT), - sum_zone_numa_state(dev->id, NUMA_MISS), - sum_zone_numa_state(dev->id, NUMA_FOREIGN), - sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_numa_state(dev->id, NUMA_LOCAL), - sum_zone_numa_state(dev->id, NUMA_OTHER)); + node_numa_state_snapshot(dev->id, NUMA_HIT), + node_numa_state_snapshot(dev->id, NUMA_MISS), + node_numa_state_snapshot(dev->id, NUMA_FOREIGN), + node_numa_state_snapshot(dev->id, NUMA_INTERLEAVE_HIT), + node_numa_state_snapshot(dev->id, NUMA_LOCAL), + node_numa_state_snapshot(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -194,7 +194,7 @@ static ssize_t node_read_vmstat(struct device *dev, for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], -sum_zone_numa_state(nid, i)); +node_numa_state_snapshot(nid, i)); #endif for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c..b2d264f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -283,7 +283,6 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; @@ -504,7 +503,6 @@ struct zone { ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } 
cacheline_internodealigned_in_smp; enum pgdat_flags { diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c98..7383d66 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -118,36 +118,8 @@ static inline void vm_events_fold_cpu(int cpu) * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; -extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; - -#ifdef CONFIG_NUMA -static inline void zone_numa_state_add(long x, struct zone *zone, -enum numa_stat_item item) -{ -
[PATCH v2] buffer: Avoid setting buffer bits that are already set
It's expensive to set buffer flags that are already set, because that causes a costly cache line transition. A common case is setting the "verified" flag during ext4 writes. This patch checks for the flag being set first. With the AIM7/creat-clo benchmark testing on a 48G ramdisk based-on ext4 file system, we see 3.3%(15431->15936) improvement of aim7.jobs-per-min on a 2-sockets broadwell platform. What the benchmark does is: it forks 3000 processes, and each process do the following: a) open a new file b) close the file c) delete the file until loop=100*1000 times. The original patch is contributed by Andi Kleen. Signed-off-by: Andi Kleen <a...@linux.intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> Tested-by: Kemi Wang <kemi.w...@intel.com> Reviewed-by: Jens Axboe <ax...@kernel.dk> --- include/linux/buffer_head.h | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae55..211d8f5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -80,11 +80,14 @@ struct buffer_head { /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. + * To avoid reset buffer flags that are already set, because that causes + * a costly cache line transition, check the flag first. */ #define BUFFER_FNS(bit, name) \ static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ - set_bit(BH_##bit, &(bh)->b_state); \ + if (!test_bit(BH_##bit, &(bh)->b_state))\ + set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ -- 2.7.4
[PATCH v2] buffer: Avoid setting buffer bits that are already set
It's expensive to set buffer flags that are already set, because that causes a costly cache line transition. A common case is setting the "verified" flag during ext4 writes. This patch checks for the flag being set first. With the AIM7/creat-clo benchmark testing on a 48G ramdisk based-on ext4 file system, we see 3.3%(15431->15936) improvement of aim7.jobs-per-min on a 2-sockets broadwell platform. What the benchmark does is: it forks 3000 processes, and each process do the following: a) open a new file b) close the file c) delete the file until loop=100*1000 times. The original patch is contributed by Andi Kleen. Signed-off-by: Andi Kleen Signed-off-by: Kemi Wang Tested-by: Kemi Wang Reviewed-by: Jens Axboe --- include/linux/buffer_head.h | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae55..211d8f5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -80,11 +80,14 @@ struct buffer_head { /* * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() * and buffer_foo() functions. + * To avoid reset buffer flags that are already set, because that causes + * a costly cache line transition, check the flag first. */ #define BUFFER_FNS(bit, name) \ static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ - set_bit(BH_##bit, &(bh)->b_state); \ + if (!test_bit(BH_##bit, &(bh)->b_state))\ + set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ -- 2.7.4
[PATCH] buffer: Avoid setting buffer bits that are already set
It's expensive to set buffer flags that are already set, because that causes a costly cache line transition. A common case is setting the "verified" flag during ext4 writes. This patch checks for the flag being set first. With the AIM7/creat-clo benchmark running on a 48G ramdisk backed by an ext4 file system, we see a 3.3% (15431->15936) improvement in aim7.jobs-per-min on a 2-socket Broadwell platform. What the benchmark does is: it forks 3000 processes, and each process does the following in a loop of 100*1000 iterations: a) open a new file, b) close the file, c) delete the file. The original patch was contributed by Andi Kleen. Signed-off-by: Andi Kleen <a...@linux.intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> Tested-by: Kemi Wang <kemi.w...@intel.com> --- include/linux/buffer_head.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae55..e1799f7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -84,7 +84,8 @@ struct buffer_head { #define BUFFER_FNS(bit, name) \ static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ - set_bit(BH_##bit, &(bh)->b_state); \ + if (!test_bit(BH_##bit, &(bh)->b_state))\ + set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ -- 2.7.4
[PATCH] buffer: Avoid setting buffer bits that are already set
It's expensive to set buffer flags that are already set, because that causes a costly cache line transition. A common case is setting the "verified" flag during ext4 writes. This patch checks for the flag being set first. With the AIM7/creat-clo benchmark running on a 48G ramdisk backed by an ext4 file system, we see a 3.3% (15431->15936) improvement in aim7.jobs-per-min on a 2-socket Broadwell platform. What the benchmark does is: it forks 3000 processes, and each process does the following in a loop of 100*1000 iterations: a) open a new file, b) close the file, c) delete the file. The original patch was contributed by Andi Kleen. Signed-off-by: Andi Kleen Signed-off-by: Kemi Wang Tested-by: Kemi Wang --- include/linux/buffer_head.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c8dae55..e1799f7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -84,7 +84,8 @@ struct buffer_head { #define BUFFER_FNS(bit, name) \ static __always_inline void set_buffer_##name(struct buffer_head *bh) \ { \ - set_bit(BH_##bit, &(bh)->b_state); \ + if (!test_bit(BH_##bit, &(bh)->b_state))\ + set_bit(BH_##bit, &(bh)->b_state); \ } \ static __always_inline void clear_buffer_##name(struct buffer_head *bh) \ { \ -- 2.7.4
[PATCH v5] mm, sysctl: make NUMA stats configurable
This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. = When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench = When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. ChangeLog: V4->V5 a) Scope vm_numa_stat_lock into the sysctl handler function, as suggested by Michal Hocko; b) Only allow 0/1 value when setting a value to numa_stat at userspace, that would keep the possibility for add auto mode in future (e.g. 2 for auto mode), as suggested by Michal Hocko. V3->V4 a) Get rid of auto mode of numa stats, and may add it back if necessary, as alignment before; b) Skip NUMA_INTERLEAVE_HIT counter update when numa stats is disabled, as reported by Andrey Ryabinin. See commit "de55c8b2519" for details c) Remove extern declaration for those clear_numa_ function, and make them static in vmstat.c, as suggested by Vlastimil Babka. 
V2->V3: a) Propose a better way to use jump label to eliminate the overhead of branch selection in zone_statistics(), as inspired by Ying Huang; b) Add a paragraph in commit log to describe the way for branch target selection; c) Use a more descriptive name numa_stats_mode instead of vmstat_mode, and change the description accordingly, as suggested by Michal Hocko; d) Make this functionality NUMA-specific via ifdef V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. Reported-by: Jesper Dangaard Brouer <bro...@redhat.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- Documentation/sysctl/vm.txt | 16 +++ include/linux/vmstat.h | 10 +++ kernel/sysctl.c | 9 ++ mm/mempolicy.c | 3 ++ mm/page_alloc.c | 6 mm/vmstat.c | 70 + 6 files changed, 114 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..f65c5c7 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - stat_refresh +- numa_stat - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -792,6 +793,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) == +numa_stat + +This interface allows runtime configuration of numa statistics. 
+ +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo 1 > /proc/sys/vm/numa_stat + +== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..c605c94 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -6,9 +6,19 @@ #include #include #include +#include extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a
[PATCH v5] mm, sysctl: make NUMA stats configurable
This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. = When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench = When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. ChangeLog: V4->V5 a) Scope vm_numa_stat_lock into the sysctl handler function, as suggested by Michal Hocko; b) Only allow 0/1 value when setting a value to numa_stat at userspace, that would keep the possibility for add auto mode in future (e.g. 2 for auto mode), as suggested by Michal Hocko. V3->V4 a) Get rid of auto mode of numa stats, and may add it back if necessary, as alignment before; b) Skip NUMA_INTERLEAVE_HIT counter update when numa stats is disabled, as reported by Andrey Ryabinin. See commit "de55c8b2519" for details c) Remove extern declaration for those clear_numa_ function, and make them static in vmstat.c, as suggested by Vlastimil Babka. 
V2->V3: a) Propose a better way to use jump label to eliminate the overhead of branch selection in zone_statistics(), as inspired by Ying Huang; b) Add a paragraph in commit log to describe the way for branch target selection; c) Use a more descriptive name numa_stats_mode instead of vmstat_mode, and change the description accordingly, as suggested by Michal Hocko; d) Make this functionality NUMA-specific via ifdef V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Signed-off-by: Kemi Wang --- Documentation/sysctl/vm.txt | 16 +++ include/linux/vmstat.h | 10 +++ kernel/sysctl.c | 9 ++ mm/mempolicy.c | 3 ++ mm/page_alloc.c | 6 mm/vmstat.c | 70 + 6 files changed, 114 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..f65c5c7 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - stat_refresh +- numa_stat - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -792,6 +793,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) == +numa_stat + +This interface allows runtime configuration of numa statistics. 
+ +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo 1 > /proc/sys/vm/numa_stat + +== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..c605c94 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -6,9 +6,19 @@ #include #include #include +#include extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d9c31bc..8f272db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysct
[PATCH v4] mm, sysctl: make NUMA stats configurable
This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. = When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench = When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. ChangeLog: V3->V4 a) Get rid of auto mode of numa stats, and may add it back if necessary, as alignment before; b) Skip NUMA_INTERLEAVE_HIT counter update when numa stats is disabled, as reported by Andrey Ryabinin. See commit "de55c8b2519" for details c) Remove extern declaration for those clear_numa_ function, and make them static in vmstat.c, as suggested by Vlastimil Babka. 
V2->V3: a) Propose a better way to use jump label to eliminate the overhead of branch selection in zone_statistics(), as inspired by Ying Huang; b) Add a paragraph in commit log to describe the way for branch target selection; c) Use a more descriptive name numa_stats_mode instead of vmstat_mode, and change the description accordingly, as suggested by Michal Hocko; d) Make this functionality NUMA-specific via ifdef V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. Reported-by: Jesper Dangaard Brouer <bro...@redhat.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- Documentation/sysctl/vm.txt | 16 +++ include/linux/vmstat.h | 10 +++ kernel/sysctl.c | 7 + mm/mempolicy.c | 3 ++ mm/page_alloc.c | 6 mm/vmstat.c | 70 + 6 files changed, 112 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..f65c5c7 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - stat_refresh +- numa_stat - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -792,6 +793,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) == +numa_stat + +This interface allows runtime configuration of numa statistics. 
+ +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo 1 > /proc/sys/vm/numa_stat + +== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..c605c94 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -6,9 +6,19 @@ #include #include #include +#include extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d9c31bc..f6a79a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1371,6 +1371,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = _mempolicy_sysctl_handler, }, +
[PATCH v4] mm, sysctl: make NUMA stats configurable
This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. = When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench = When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. ChangeLog: V3->V4 a) Get rid of auto mode of numa stats, and may add it back if necessary, as alignment before; b) Skip NUMA_INTERLEAVE_HIT counter update when numa stats is disabled, as reported by Andrey Ryabinin. See commit "de55c8b2519" for details c) Remove extern declaration for those clear_numa_ function, and make them static in vmstat.c, as suggested by Vlastimil Babka. 
V2->V3: a) Propose a better way to use jump label to eliminate the overhead of branch selection in zone_statistics(), as inspired by Ying Huang; b) Add a paragraph in commit log to describe the way for branch target selection; c) Use a more descriptive name numa_stats_mode instead of vmstat_mode, and change the description accordingly, as suggested by Michal Hocko; d) Make this functionality NUMA-specific via ifdef V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Signed-off-by: Kemi Wang --- Documentation/sysctl/vm.txt | 16 +++ include/linux/vmstat.h | 10 +++ kernel/sysctl.c | 7 + mm/mempolicy.c | 3 ++ mm/page_alloc.c | 6 mm/vmstat.c | 70 + 6 files changed, 112 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..f65c5c7 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - stat_refresh +- numa_stat - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -792,6 +793,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) == +numa_stat + +This interface allows runtime configuration of numa statistics. 
+ +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo 1 > /proc/sys/vm/numa_stat + +== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..c605c94 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -6,9 +6,19 @@ #include #include #include +#include extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d9c31bc..f6a79a3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1371,6 +1371,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = _mempolicy_sysctl_handler, }, + { + .procname = "numa_stat", + .data
[PATCH v3] mm, sysctl: make NUMA stats configurable
This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. = When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo [C|c]oarse > /proc/sys/vm/numa_stats_mode In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench = When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo [S|s]trict > /proc/sys/vm/numa_stats_mode = We recommend automatic detection of numa statistics by system, this is also system default configuration, you can do: echo [A|a]uto > /proc/sys/vm/numa_stats_mode In this case, numa counter update is skipped unless it has been read by users at least once, e.g. cat /proc/zoneinfo. Branch target selection with jump label: a) When numa_stats_mode is changed to *strict*, jump to the branch for numa counters update. b) When numa_stats_mode is changed to *coarse*, return back directly. c) When numa_stats_mode is changed to *auto*, the branch target used in last time is kept, and the branch target is changed to the branch for numa counters update once numa counters are *read* by users. Therefore, with the help of jump label, the page allocation performance is hardly affected when numa counters are updated with a call in zone_statistics(). Meanwhile, the auto mode can give people benefit without manual tuning. 
Many thanks to Michal Hocko, Dave Hansen and Ying Huang for comments to help improve the original patch. ChangeLog: V2->V3: a) Propose a better way to use jump label to eliminate the overhead of branch selection in zone_statistics(), as inspired by Ying Huang; b) Add a paragraph in commit log to describe the way for branch target selection; c) Use a more descriptive name numa_stats_mode instead of vmstat_mode, and change the description accordingly, as suggested by Michal Hocko; d) Make this functionality NUMA-specific via ifdef V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. Reported-by: Jesper Dangaard Brouer <bro...@redhat.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- Documentation/sysctl/vm.txt | 24 + drivers/base/node.c | 4 ++ include/linux/vmstat.h | 23 init/main.c | 3 ++ kernel/sysctl.c | 7 +++ mm/page_alloc.c | 10 mm/vmstat.c | 129 7 files changed, 200 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..e310e69 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- numa_stats_mode - watermark_scale_factor - zone_reclaim_mode @@ -843,6 +844,29 @@ ten times more freeable objects than there are. = +numa_stats_mode + +This interface allows numa statistics configurable. 
+ +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo [C|c]oarse > /proc/sys/vm/numa_stats_mode + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo [S|s]trict > /proc/sys/vm/numa_stats_mode + +We recommend automatic detection of numa statistics by system, because numa +statistics does not affect system's decision and it is very rarely +consumed. You can do: + echo [A|a]uto > /proc/sys/vm/numa_stats_mode +This is also system default configuration, with this default setting, numa +counters update is skipped unless the counter is *read* by users at least +once. + +== + watermark_scale_factor: This fa
[PATCH v3] mm, sysctl: make NUMA stats configurable
This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. = When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo [C|c]oarse > /proc/sys/vm/numa_stats_mode In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench = When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo [S|s]trict > /proc/sys/vm/numa_stats_mode = We recommend automatic detection of numa statistics by system, this is also system default configuration, you can do: echo [A|a]uto > /proc/sys/vm/numa_stats_mode In this case, numa counter update is skipped unless it has been read by users at least once, e.g. cat /proc/zoneinfo. Branch target selection with jump label: a) When numa_stats_mode is changed to *strict*, jump to the branch for numa counters update. b) When numa_stats_mode is changed to *coarse*, return back directly. c) When numa_stats_mode is changed to *auto*, the branch target used in last time is kept, and the branch target is changed to the branch for numa counters update once numa counters are *read* by users. Therefore, with the help of jump label, the page allocation performance is hardly affected when numa counters are updated with a call in zone_statistics(). Meanwhile, the auto mode can give people benefit without manual tuning. 
Many thanks to Michal Hocko, Dave Hansen and Ying Huang for comments to help improve the original patch. ChangeLog: V2->V3: a) Propose a better way to use jump label to eliminate the overhead of branch selection in zone_statistics(), as inspired by Ying Huang; b) Add a paragraph in commit log to describe the way for branch target selection; c) Use a more descriptive name numa_stats_mode instead of vmstat_mode, and change the description accordingly, as suggested by Michal Hocko; d) Make this functionality NUMA-specific via ifdef V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Signed-off-by: Kemi Wang --- Documentation/sysctl/vm.txt | 24 + drivers/base/node.c | 4 ++ include/linux/vmstat.h | 23 init/main.c | 3 ++ kernel/sysctl.c | 7 +++ mm/page_alloc.c | 10 mm/vmstat.c | 129 7 files changed, 200 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..e310e69 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- numa_stats_mode - watermark_scale_factor - zone_reclaim_mode @@ -843,6 +844,29 @@ ten times more freeable objects than there are. = +numa_stats_mode + +This interface allows numa statistics configurable. 
+ +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo [C|c]oarse > /proc/sys/vm/numa_stats_mode + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo [S|s]trict > /proc/sys/vm/numa_stats_mode + +We recommend automatic detection of numa statistics by system, because numa +statistics does not affect system's decision and it is very rarely +consumed. You can do: + echo [A|a]uto > /proc/sys/vm/numa_stats_mode +This is also system default configuration, with this default setting, numa +counters update is skipped unless the counter is *read* by users at least +once. + +== + watermark_scale_factor: This factor controls the aggressiveness of kswapd. It defines the diff --git a/drivers/base/node.c b/drivers/base/node.c
[PATCH v2] mm, sysctl: make VM stats configurable
This is the second step which introduces a tunable interface that allow VM stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. === When performance becomes a bottleneck and you can tolerate some possible tool breakage and some decreased counter precision (e.g. numa counter), you can do: echo [C|c]oarse > /proc/sys/vm/vmstat_mode In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench === When performance is not a bottleneck and you want all tooling to work, you can do: echo [S|s]trict > /proc/sys/vm/vmstat_mode === We recommend automatic detection of virtual memory statistics by system, this is also system default configuration, you can do: echo [A|a]uto > /proc/sys/vm/vmstat_mode In this case, automatic detection of VM statistics, numa counter update is skipped unless it has been read by users at least once, e.g. cat /proc/zoneinfo. Therefore, with different VM stats mode, numa counters update can operate differently so that everybody can benefit. Many thanks to Michal Hocko and Dave Hansen for comments to help improve the original patch. ChangeLog: Since V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. 
Reported-by: Jesper Dangaard Brouer <bro...@redhat.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- Documentation/sysctl/vm.txt | 26 + drivers/base/node.c | 2 + include/linux/vmstat.h | 22 init/main.c | 2 + kernel/sysctl.c | 7 +++ mm/page_alloc.c | 14 + mm/vmstat.c | 126 7 files changed, 199 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..6ab2843 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- vmstat_mode - watermark_scale_factor - zone_reclaim_mode @@ -843,6 +844,31 @@ ten times more freeable objects than there are. = +vmstat_mode + +This interface allows virtual memory statistics configurable. + +When performance becomes a bottleneck and you can tolerate some possible +tool breakage and some decreased counter precision (e.g. numa counter), you +can do: + echo [C|c]oarse > /proc/sys/vm/vmstat_mode +ignorable statistics list: +- numa counters + +When performance is not a bottleneck and you want all tooling to work, you +can do: + echo [S|s]trict > /proc/sys/vm/vmstat_mode + +We recommend automatic detection of virtual memory statistics by system, +this is also system default configuration, you can do: + echo [A|a]uto > /proc/sys/vm/vmstat_mode + +E.g. numa statistics does not affect system's decision and it is very +rarely consumed. If set vmstat_mode = auto, numa counters update is skipped +unless the counter is *read* by users at least once. + +== + watermark_scale_factor: This factor controls the aggressiveness of kswapd. 
It defines the diff --git a/drivers/base/node.c b/drivers/base/node.c index 3855902..033c0c3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL); static ssize_t node_read_numastat(struct device *dev, struct device_attribute *attr, char *buf) { + disable_zone_statistics = false; return sprintf(buf, "numa_hit %lu\n" "numa_miss %lu\n" @@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev, NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); + disable_zone_statistics = false; return n; } static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..22670cf 100644 --- a/include/linux/vmstat.h +++ b/
[PATCH v2] mm, sysctl: make VM stats configurable
This is the second step which introduces a tunable interface that allow VM stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. === When performance becomes a bottleneck and you can tolerate some possible tool breakage and some decreased counter precision (e.g. numa counter), you can do: echo [C|c]oarse > /proc/sys/vm/vmstat_mode In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench === When performance is not a bottleneck and you want all tooling to work, you can do: echo [S|s]trict > /proc/sys/vm/vmstat_mode === We recommend automatic detection of virtual memory statistics by system, this is also system default configuration, you can do: echo [A|a]uto > /proc/sys/vm/vmstat_mode In this case, automatic detection of VM statistics, numa counter update is skipped unless it has been read by users at least once, e.g. cat /proc/zoneinfo. Therefore, with different VM stats mode, numa counters update can operate differently so that everybody can benefit. Many thanks to Michal Hocko and Dave Hansen for comments to help improve the original patch. ChangeLog: Since V1->V2: a) Merge to one patch; b) Use jump label to eliminate the overhead of branch selection; c) Add a single-time log message at boot time to help tell users what happened. 
Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Signed-off-by: Kemi Wang --- Documentation/sysctl/vm.txt | 26 + drivers/base/node.c | 2 + include/linux/vmstat.h | 22 init/main.c | 2 + kernel/sysctl.c | 7 +++ mm/page_alloc.c | 14 + mm/vmstat.c | 126 7 files changed, 199 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..6ab2843 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- vmstat_mode - watermark_scale_factor - zone_reclaim_mode @@ -843,6 +844,31 @@ ten times more freeable objects than there are. = +vmstat_mode + +This interface allows virtual memory statistics configurable. + +When performance becomes a bottleneck and you can tolerate some possible +tool breakage and some decreased counter precision (e.g. numa counter), you +can do: + echo [C|c]oarse > /proc/sys/vm/vmstat_mode +ignorable statistics list: +- numa counters + +When performance is not a bottleneck and you want all tooling to work, you +can do: + echo [S|s]trict > /proc/sys/vm/vmstat_mode + +We recommend automatic detection of virtual memory statistics by system, +this is also system default configuration, you can do: + echo [A|a]uto > /proc/sys/vm/vmstat_mode + +E.g. numa statistics does not affect system's decision and it is very +rarely consumed. If set vmstat_mode = auto, numa counters update is skipped +unless the counter is *read* by users at least once. + +== + watermark_scale_factor: This factor controls the aggressiveness of kswapd. 
It defines the diff --git a/drivers/base/node.c b/drivers/base/node.c index 3855902..033c0c3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL); static ssize_t node_read_numastat(struct device *dev, struct device_attribute *attr, char *buf) { + disable_zone_statistics = false; return sprintf(buf, "numa_hit %lu\n" "numa_miss %lu\n" @@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev, NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); + disable_zone_statistics = false; return n; } static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..22670cf 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -6,9 +6,27 @@ #include #include #include +#include extern int sy
[PATCH 2/3] mm: Handle numa statistics distinctively based-on different VM stats modes
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed at the 2017 MM Summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017 -JesperBrouer.pdf Therefore, with different VM stats mode, numa counters update can operate differently so that everybody can benefit: If vmstat_mode = auto, automatic detection of numa statistics, numa counter update is skipped unless it has been read by users at least once, e.g. cat /proc/zoneinfo. If vmstat_mode = strict, numa counter is updated for each page allocation. If vmstat_mode = coarse, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). 
Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Reported-by: Jesper Dangaard Brouer <bro...@redhat.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- drivers/base/node.c| 2 ++ include/linux/vmstat.h | 6 + mm/page_alloc.c| 13 +++ mm/vmstat.c| 60 +++--- 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 3855902..033c0c3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL); static ssize_t node_read_numastat(struct device *dev, struct device_attribute *attr, char *buf) { + disable_zone_statistics = false; return sprintf(buf, "numa_hit %lu\n" "numa_miss %lu\n" @@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev, NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); + disable_zone_statistics = false; return n; } static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c3634c7..ca9854c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -9,6 +9,7 @@ extern int sysctl_stat_interval; +extern bool disable_zone_statistics; /* * vmstat_mode: * 0 = auto mode of vmstat, automatic detection of VM statistics. 
@@ -19,6 +20,7 @@ extern int sysctl_stat_interval; #define VMSTAT_STRICT_MODE 1 #define VMSTAT_COARSE_MODE 2 #define VMSTAT_MODE_LEN 16 +extern int vmstat_mode; extern char sysctl_vmstat_mode[]; extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); @@ -243,6 +245,10 @@ extern unsigned long sum_zone_node_page_state(int node, extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); +extern void zero_zone_numa_counters(struct zone *zone); +extern void zero_zones_numa_counters(void); +extern void zero_global_numa_counters(void); +extern void invalid_numa_statistics(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c841af8..010a620 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -83,6 +83,8 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +bool disable_zone_statistics = true; + #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. @@ -2743,6 +2745,17 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; + /* +* skip zone_statistics() if vmstat is a coarse mode or zone statistics +* is inactive in auto vmstat mode +*/ + + if (vmstat_mode) { + if (vmstat_mode == VMSTAT_COARSE_MODE) + return; + } else if (disable_zone_statistics) + return; + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; diff --git a/mm/vmstat.c b/mm/vmstat.c index e675ad2..bcaef62 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -85,15 +85,31 @@ int sysctl_vm
[PATCH 1/3] mm, sysctl: make VM stats configurable
This patch adds a tunable interface that allows VM stats configurable, as suggested by Dave Hansen and Ying Huang. When performance becomes a bottleneck and you can tolerate some possible tool breakage and some decreased counter precision (e.g. numa counter), you can do: echo [C|c]oarse > /proc/sys/vm/vmstat_mode When performance is not a bottleneck and you want all tooling to work, you can do: echo [S|s]trict > /proc/sys/vm/vmstat_mode We recommend automatic detection of virtual memory statistics by system, this is also system default configuration, you can do: echo [A|a]uto > /proc/sys/vm/vmstat_mode The next patch handles numa statistics distinctively based-on different VM stats mode. Reported-by: Jesper Dangaard Brouer <bro...@redhat.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- include/linux/vmstat.h | 14 ++ kernel/sysctl.c| 7 + mm/vmstat.c| 70 ++ 3 files changed, 91 insertions(+) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..c3634c7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -9,6 +9,20 @@ extern int sysctl_stat_interval; +/* + * vmstat_mode: + * 0 = auto mode of vmstat, automatic detection of VM statistics. + * 1 = strict mode of vmstat, keep all VM statistics. + * 2 = coarse mode of vmstat, ignore unimportant VM statistics. + */ +#define VMSTAT_AUTO_MODE 0 +#define VMSTAT_STRICT_MODE 1 +#define VMSTAT_COARSE_MODE 2 +#define VMSTAT_MODE_LEN 16 +extern char sysctl_vmstat_mode[]; +extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbb..f5b813b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1234,6 +1234,13 @@ static struct ctl_table kern_table[] = { static struct ctl_table vm_table[] = { { + .procname = "vmstat_mode", + .data = _vmstat_mode, + .maxlen = VMSTAT_MODE_LEN, + .mode = 0644, + .proc_handler = sysctl_vmstat_mode_handler, + }, + { .procname = "overcommit_memory", .data = _overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e7..e675ad2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -32,6 +32,76 @@ #define NUMA_STATS_THRESHOLD (U16_MAX - 2) +int vmstat_mode = VMSTAT_AUTO_MODE; +char sysctl_vmstat_mode[VMSTAT_MODE_LEN] = "auto"; +static const char *vmstat_mode_name[3] = {"auto", "strict", "coarse"}; +static DEFINE_MUTEX(vmstat_mode_lock); + + +static int __parse_vmstat_mode(char *s) +{ + const char *str = s; + + if (strcmp(str, "auto") == 0 || strcmp(str, "Auto") == 0) + vmstat_mode = VMSTAT_AUTO_MODE; + else if (strcmp(str, "strict") == 0 || strcmp(str, "Strict") == 0) + vmstat_mode = VMSTAT_STRICT_MODE; + else if (strcmp(str, "coarse") == 0 || strcmp(str, "Coarse") == 0) + vmstat_mode = VMSTAT_COARSE_MODE; + else { + pr_warn("Ignoring invalid vmstat_mode value: %s\n", s); + return -EINVAL; + } + return 0; +} + +int sysctl_vmstat_mode_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + char old_string[VMSTAT_MODE_LEN]; + int ret, oldval; + + mutex_lock(_mode_lock); + if (write) + strncpy(old_string, (char *)table->data, VMSTAT_MODE_LEN); + ret = proc_dostring(table, write, buffer, length, ppos); + if (ret || !write) { + mutex_unlock(_mode_lock); + return ret; + } + + oldval = vmstat_mode; + if (__parse_vmstat_mode((char *)table->data)) { + /* +* invalid sysctl_vmstat_mode value, restore saved string +*/ + strncpy((char *)table->data, old_string, VMSTAT_MODE_LEN); + vmstat_mode = oldval; + } else { + /* +* check 
whether vmstat mode changes or not +*/ + if (vmstat_mode == oldval) { + /* no change */ + mutex_unlock(_mode_lock); + return 0; + } else if (vmstat_mode == VMSTAT_AUTO_MODE) + pr_info("vmstat mode changes from %s to auto mod
[PATCH 2/3] mm: Handle numa statistics distinctively based-on different VM stats modes
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed at the 2017 MM Summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017 -JesperBrouer.pdf Therefore, with different VM stats mode, numa counters update can operate differently so that everybody can benefit: If vmstat_mode = auto, automatic detection of numa statistics, numa counter update is skipped unless it has been read by users at least once, e.g. cat /proc/zoneinfo. If vmstat_mode = strict, numa counter is updated for each page allocation. If vmstat_mode = coarse, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). 
Benchmark link provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Signed-off-by: Kemi Wang --- drivers/base/node.c| 2 ++ include/linux/vmstat.h | 6 + mm/page_alloc.c| 13 +++ mm/vmstat.c| 60 +++--- 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 3855902..033c0c3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -153,6 +153,7 @@ static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL); static ssize_t node_read_numastat(struct device *dev, struct device_attribute *attr, char *buf) { + disable_zone_statistics = false; return sprintf(buf, "numa_hit %lu\n" "numa_miss %lu\n" @@ -194,6 +195,7 @@ static ssize_t node_read_vmstat(struct device *dev, NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); + disable_zone_statistics = false; return n; } static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c3634c7..ca9854c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -9,6 +9,7 @@ extern int sysctl_stat_interval; +extern bool disable_zone_statistics; /* * vmstat_mode: * 0 = auto mode of vmstat, automatic detection of VM statistics. 
@@ -19,6 +20,7 @@ extern int sysctl_stat_interval; #define VMSTAT_STRICT_MODE 1 #define VMSTAT_COARSE_MODE 2 #define VMSTAT_MODE_LEN 16 +extern int vmstat_mode; extern char sysctl_vmstat_mode[]; extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); @@ -243,6 +245,10 @@ extern unsigned long sum_zone_node_page_state(int node, extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); +extern void zero_zone_numa_counters(struct zone *zone); +extern void zero_zones_numa_counters(void); +extern void zero_global_numa_counters(void); +extern void invalid_numa_statistics(void); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c841af8..010a620 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -83,6 +83,8 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +bool disable_zone_statistics = true; + #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. @@ -2743,6 +2745,17 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; + /* +* skip zone_statistics() if vmstat is a coarse mode or zone statistics +* is inactive in auto vmstat mode +*/ + + if (vmstat_mode) { + if (vmstat_mode == VMSTAT_COARSE_MODE) + return; + } else if (disable_zone_statistics) + return; + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; diff --git a/mm/vmstat.c b/mm/vmstat.c index e675ad2..bcaef62 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -85,15 +85,31 @@ int sysctl_vmstat_mode_handler(struct ctl_table *table, int write, /* no change */
[PATCH 1/3] mm, sysctl: make VM stats configurable
This patch adds a tunable interface that allows VM stats configurable, as suggested by Dave Hansen and Ying Huang. When performance becomes a bottleneck and you can tolerate some possible tool breakage and some decreased counter precision (e.g. numa counter), you can do: echo [C|c]oarse > /proc/sys/vm/vmstat_mode When performance is not a bottleneck and you want all tooling to work, you can do: echo [S|s]trict > /proc/sys/vm/vmstat_mode We recommend automatic detection of virtual memory statistics by system, this is also system default configuration, you can do: echo [A|a]uto > /proc/sys/vm/vmstat_mode The next patch handles numa statistics distinctively based-on different VM stats mode. Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Signed-off-by: Kemi Wang --- include/linux/vmstat.h | 14 ++ kernel/sysctl.c| 7 + mm/vmstat.c| 70 ++ 3 files changed, 91 insertions(+) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index ade7cb5..c3634c7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -9,6 +9,20 @@ extern int sysctl_stat_interval; +/* + * vmstat_mode: + * 0 = auto mode of vmstat, automatic detection of VM statistics. + * 1 = strict mode of vmstat, keep all VM statistics. + * 2 = coarse mode of vmstat, ignore unimportant VM statistics. + */ +#define VMSTAT_AUTO_MODE 0 +#define VMSTAT_STRICT_MODE 1 +#define VMSTAT_COARSE_MODE 2 +#define VMSTAT_MODE_LEN 16 +extern char sysctl_vmstat_mode[]; +extern int sysctl_vmstat_mode_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos); + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6648fbb..f5b813b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1234,6 +1234,13 @@ static struct ctl_table kern_table[] = { static struct ctl_table vm_table[] = { { + .procname = "vmstat_mode", + .data = _vmstat_mode, + .maxlen = VMSTAT_MODE_LEN, + .mode = 0644, + .proc_handler = sysctl_vmstat_mode_handler, + }, + { .procname = "overcommit_memory", .data = _overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e7..e675ad2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -32,6 +32,76 @@ #define NUMA_STATS_THRESHOLD (U16_MAX - 2) +int vmstat_mode = VMSTAT_AUTO_MODE; +char sysctl_vmstat_mode[VMSTAT_MODE_LEN] = "auto"; +static const char *vmstat_mode_name[3] = {"auto", "strict", "coarse"}; +static DEFINE_MUTEX(vmstat_mode_lock); + + +static int __parse_vmstat_mode(char *s) +{ + const char *str = s; + + if (strcmp(str, "auto") == 0 || strcmp(str, "Auto") == 0) + vmstat_mode = VMSTAT_AUTO_MODE; + else if (strcmp(str, "strict") == 0 || strcmp(str, "Strict") == 0) + vmstat_mode = VMSTAT_STRICT_MODE; + else if (strcmp(str, "coarse") == 0 || strcmp(str, "Coarse") == 0) + vmstat_mode = VMSTAT_COARSE_MODE; + else { + pr_warn("Ignoring invalid vmstat_mode value: %s\n", s); + return -EINVAL; + } + return 0; +} + +int sysctl_vmstat_mode_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + char old_string[VMSTAT_MODE_LEN]; + int ret, oldval; + + mutex_lock(_mode_lock); + if (write) + strncpy(old_string, (char *)table->data, VMSTAT_MODE_LEN); + ret = proc_dostring(table, write, buffer, length, ppos); + if (ret || !write) { + mutex_unlock(_mode_lock); + return ret; + } + + oldval = vmstat_mode; + if (__parse_vmstat_mode((char *)table->data)) { + /* +* invalid sysctl_vmstat_mode value, restore saved string +*/ + strncpy((char *)table->data, old_string, VMSTAT_MODE_LEN); + vmstat_mode = oldval; + } else { + /* +* check 
whether vmstat mode changes or not +*/ + if (vmstat_mode == oldval) { + /* no change */ + mutex_unlock(_mode_lock); + return 0; + } else if (vmstat_mode == VMSTAT_AUTO_MODE) + pr_info("vmstat mode changes from %s to auto mode\n", + vmstat_mode_name[oldval]); + else if (vmstat_mode =
[PATCH 3/3] sysctl/vm.txt: Update document
Add a paragraph to introduce the functionality and usage on vmstat_mode in sysctl/vm.txt Reported-by: Jesper Dangaard Brouer <bro...@redhat.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- Documentation/sysctl/vm.txt | 26 ++ 1 file changed, 26 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..6ab2843 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- vmstat_mode - watermark_scale_factor - zone_reclaim_mode @@ -843,6 +844,31 @@ ten times more freeable objects than there are. = +vmstat_mode + +This interface allows virtual memory statistics configurable. + +When performance becomes a bottleneck and you can tolerate some possible +tool breakage and some decreased counter precision (e.g. numa counter), you +can do: + echo [C|c]oarse > /proc/sys/vm/vmstat_mode +ignorable statistics list: +- numa counters + +When performance is not a bottleneck and you want all tooling to work, you +can do: + echo [S|s]trict > /proc/sys/vm/vmstat_mode + +We recommend automatic detection of virtual memory statistics by system, +this is also system default configuration, you can do: + echo [A|a]uto > /proc/sys/vm/vmstat_mode + +E.g. numa statistics does not affect system's decision and it is very +rarely consumed. If set vmstat_mode = auto, numa counters update is skipped +unless the counter is *read* by users at least once. + +== + watermark_scale_factor: This factor controls the aggressiveness of kswapd. It defines the -- 2.7.4
[PATCH 3/3] sysctl/vm.txt: Update document
Add a paragraph to introduce the functionality and usage on vmstat_mode in sysctl/vm.txt Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Signed-off-by: Kemi Wang --- Documentation/sysctl/vm.txt | 26 ++ 1 file changed, 26 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a..6ab2843 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- vmstat_mode - watermark_scale_factor - zone_reclaim_mode @@ -843,6 +844,31 @@ ten times more freeable objects than there are. = +vmstat_mode + +This interface allows virtual memory statistics configurable. + +When performance becomes a bottleneck and you can tolerate some possible +tool breakage and some decreased counter precision (e.g. numa counter), you +can do: + echo [C|c]oarse > /proc/sys/vm/vmstat_mode +ignorable statistics list: +- numa counters + +When performance is not a bottleneck and you want all tooling to work, you +can do: + echo [S|s]trict > /proc/sys/vm/vmstat_mode + +We recommend automatic detection of virtual memory statistics by system, +this is also system default configuration, you can do: + echo [A|a]uto > /proc/sys/vm/vmstat_mode + +E.g. numa statistics does not affect system's decision and it is very +rarely consumed. If set vmstat_mode = auto, numa counters update is skipped +unless the counter is *read* by users at least once. + +== + watermark_scale_factor: This factor controls the aggressiveness of kswapd. It defines the -- 2.7.4
[PATCH 0/3] Handle zone statistics distinctively based on
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit. A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017 -JesperBrouer.pdf This is the second step for optimizing zone statistics, the first patch introduces a tunable interface that allow VM statistics configurable(see the first patch for details): if vmstat_mode = auto, automatic detection of VM statistics if vmstat_mode = strict, keep all the VM statistics if vmstat_mode = coarse, ignore unimportant VM statistics As suggested by Dave Hansen and Ying Huang. With this interface, the second patch handles numa counters distinctively according to different vmstat mode, and the test result shows about 4.8% (185->176) drop of cpu cycles with single thread and 8.1% (343->315) drop of of cpu cycles with 88 threads for single page allocation. The third patch updates ABI document accordingly. Kemi Wang (3): mm, sysctl: make VM stats configurable mm: Handle numa statistics distinctively based-on different VM stats modes sysctl/vm.txt: Update document Documentation/sysctl/vm.txt | 26 ++ drivers/base/node.c | 2 + include/linux/vmstat.h | 20 +++ kernel/sysctl.c | 7 +++ mm/page_alloc.c | 13 + mm/vmstat.c | 124 6 files changed, 192 insertions(+) -- 2.7.4
[PATCH 0/3] Handle zone statistics distinctively based on
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit. A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017 -JesperBrouer.pdf This is the second step for optimizing zone statistics, the first patch introduces a tunable interface that allow VM statistics configurable(see the first patch for details): if vmstat_mode = auto, automatic detection of VM statistics if vmstat_mode = strict, keep all the VM statistics if vmstat_mode = coarse, ignore unimportant VM statistics As suggested by Dave Hansen and Ying Huang. With this interface, the second patch handles numa counters distinctively according to different vmstat mode, and the test result shows about 4.8% (185->176) drop of cpu cycles with single thread and 8.1% (343->315) drop of of cpu cycles with 88 threads for single page allocation. The third patch updates ABI document accordingly. Kemi Wang (3): mm, sysctl: make VM stats configurable mm: Handle numa statistics distinctively based-on different VM stats modes sysctl/vm.txt: Update document Documentation/sysctl/vm.txt | 26 ++ drivers/base/node.c | 2 + include/linux/vmstat.h | 20 +++ kernel/sysctl.c | 7 +++ mm/page_alloc.c | 13 + mm/vmstat.c | 124 6 files changed, 192 insertions(+) -- 2.7.4
[PATCH v2 2/3] mm: Update NUMA counter threshold size
There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (suggested by Dave Hansen). This patch updates NUMA counter threshold to a fixed size of MAX_U16 - 2, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter(suggested by Ying Huang). The rationality is that these statistics counters don't affect the kernel's decision, unlike other VM counters, so it's not a problem to use a large threshold. With this patchset, we see 31.3% drop of CPU cycles(537-->369) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Threshold CPU cyclesThroughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default (base) 256 468 412397590 512 428 450550704 4096399 482520943 2 394 489009617 3 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Changelog: v2: a) Change the type of vm_numa_stat_diff[] from s16 to u16, since numa stats counter is always a incremental field. b) Remove numa_stat_threshold field in struct per_cpu_pageset, since it is a constant value and rarely be changed. c) Cut down instructions in __inc_numa_state() due to the incremental numa counter and the consistant numa threshold. d) Move zone_numa_state_snapshot() to an individual patch, since it does not appear to be related to this patch. 
Signed-off-by: Kemi Wang <kemi.w...@intel.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> --- include/linux/mmzone.h | 3 +-- mm/vmstat.c| 28 ++-- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 582f6d9..c386ec4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -282,8 +282,7 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - s8 numa_stat_threshold; - s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; diff --git a/mm/vmstat.c b/mm/vmstat.c index 0c3b54b..b015f39 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STATS_THRESHOLD (U16_MAX - 2) + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif + /* Base nodestat threshold on the largest populated zone. 
*/ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold @@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif - } } } @@ -872,16 +866,14 @@ void __inc_numa_state(struct zone *zone, enum numa_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; - s8 __percpu *p = pcp->vm_numa_stat_diff + item; - s8 v, t; + u16 __percpu *p = pcp->vm_numa_stat_diff + item; + u16 v; v = __this_cpu_inc_return(*p); - t = __this_cpu_read(pcp->numa_stat_threshold); - if (unlikely(v > t)) { - s8 overstep = t >> 1; - zone_numa_state_add(v + overstep, zone, item); - __this_cpu_write(*p, -overstep); + if (unlikely(v > NUMA_S
[PATCH v2 0/3] Separate NUMA statistics from zone statistics
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. This significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (pointed out by Dave Hansen). A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017 -JesperBrouer.pdf To mitigate this overhead, this patchset separates NUMA statistics from zone statistics framework, and update NUMA counter threshold to a fixed size of MAX_U16 - 2, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter (suggested by Ying Huang). The rationality is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 31.3% drop of CPU cycles(537-->369, see below) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Meanwhile, this patchset keeps the same style of virtual memory statistics with little end-user-visible effects (only move the numa stats to show behind zone page stats, see the first patch for details). I did an experiment of single page allocation and reclaim concurrently using Jesper's page_bench03 benchmark on a 2-Socket Broadwell-based server (88 processors with 126G memory) with different size of threshold of pcp counter. 
Benchmark provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Threshold CPU cyclesThroughput(88 threads) 32799 241760478 64640 301628829 125 537 358906028 <==> system by default 256 468 412397590 512 428 450550704 4096 399 482520943 2 394 489009617 3 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Kemi Wang (3): mm: Change the call sites of numa statistics items mm: Update NUMA counter threshold size mm: Consider the number in local CPUs when *reads* NUMA stats drivers/base/node.c| 22 --- include/linux/mmzone.h | 24 +--- include/linux/vmstat.h | 33 +++ mm/page_alloc.c| 10 ++-- mm/vmstat.c| 152 +++-- 5 files changed, 217 insertions(+), 24 deletions(-) -- 2.7.4
[PATCH v2 2/3] mm: Update NUMA counter threshold size
There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (suggested by Dave Hansen). This patch updates NUMA counter threshold to a fixed size of MAX_U16 - 2, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter(suggested by Ying Huang). The rationality is that these statistics counters don't affect the kernel's decision, unlike other VM counters, so it's not a problem to use a large threshold. With this patchset, we see 31.3% drop of CPU cycles(537-->369) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Threshold CPU cyclesThroughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default (base) 256 468 412397590 512 428 450550704 4096399 482520943 2 394 489009617 3 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Changelog: v2: a) Change the type of vm_numa_stat_diff[] from s16 to u16, since numa stats counter is always a incremental field. b) Remove numa_stat_threshold field in struct per_cpu_pageset, since it is a constant value and rarely be changed. c) Cut down instructions in __inc_numa_state() due to the incremental numa counter and the consistant numa threshold. d) Move zone_numa_state_snapshot() to an individual patch, since it does not appear to be related to this patch. 
Signed-off-by: Kemi Wang Suggested-by: Dave Hansen Suggested-by: Ying Huang --- include/linux/mmzone.h | 3 +-- mm/vmstat.c| 28 ++-- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 582f6d9..c386ec4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -282,8 +282,7 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - s8 numa_stat_threshold; - s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; diff --git a/mm/vmstat.c b/mm/vmstat.c index 0c3b54b..b015f39 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STATS_THRESHOLD (U16_MAX - 2) + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -194,10 +196,7 @@ void refresh_zone_stat_thresholds(void) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif + /* Base nodestat threshold on the largest populated zone. 
*/ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold @@ -231,14 +230,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif - } } } @@ -872,16 +866,14 @@ void __inc_numa_state(struct zone *zone, enum numa_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; - s8 __percpu *p = pcp->vm_numa_stat_diff + item; - s8 v, t; + u16 __percpu *p = pcp->vm_numa_stat_diff + item; + u16 v; v = __this_cpu_inc_return(*p); - t = __this_cpu_read(pcp->numa_stat_threshold); - if (unlikely(v > t)) { - s8 overstep = t >> 1; - zone_numa_state_add(v + overstep, zone, item); - __this_cpu_write(*p, -overstep); + if (unlikely(v > NUMA_STATS_THRESHOLD)) { + zone_numa_state_add(v, zone, item); + __
[PATCH v2 0/3] Separate NUMA statistics from zone statistics
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. This significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (pointed out by Dave Hansen). A link to the MM summit slides: http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017 -JesperBrouer.pdf To mitigate this overhead, this patchset separates NUMA statistics from zone statistics framework, and update NUMA counter threshold to a fixed size of MAX_U16 - 2, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter (suggested by Ying Huang). The rationality is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 31.3% drop of CPU cycles(537-->369, see below) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Meanwhile, this patchset keeps the same style of virtual memory statistics with little end-user-visible effects (only move the numa stats to show behind zone page stats, see the first patch for details). I did an experiment of single page allocation and reclaim concurrently using Jesper's page_bench03 benchmark on a 2-Socket Broadwell-based server (88 processors with 126G memory) with different size of threshold of pcp counter. 
Benchmark provided by Jesper D Brouer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/ bench Threshold CPU cyclesThroughput(88 threads) 32799 241760478 64640 301628829 125 537 358906028 <==> system by default 256 468 412397590 512 428 450550704 4096 399 482520943 2 394 489009617 3 395 488017817 65533 369(-31.3%) 521661345(+45.3%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Kemi Wang (3): mm: Change the call sites of numa statistics items mm: Update NUMA counter threshold size mm: Consider the number in local CPUs when *reads* NUMA stats drivers/base/node.c| 22 --- include/linux/mmzone.h | 24 +--- include/linux/vmstat.h | 33 +++ mm/page_alloc.c| 10 ++-- mm/vmstat.c| 152 +++-- 5 files changed, 217 insertions(+), 24 deletions(-) -- 2.7.4
[PATCH v2 1/3] mm: Change the call sites of numa statistics items
In this patch, NUMA statistics is separated from zone statistics framework, all the call sites of NUMA stats are changed to use numa-stats-specific functions, it does not have any functionality change except that the number of NUMA stats is shown behind zone page stats when users *read* the zone info. E.g. cat /proc/zoneinfo ***Base*** ***With this patch*** nr_free_pages 3976 nr_free_pages 3976 nr_zone_inactive_anon 0nr_zone_inactive_anon 0 nr_zone_active_anon 0 nr_zone_active_anon 0 nr_zone_inactive_file 0nr_zone_inactive_file 0 nr_zone_active_file 0 nr_zone_active_file 0 nr_zone_unevictable 0 nr_zone_unevictable 0 nr_zone_write_pending 0nr_zone_write_pending 0 nr_mlock 0 nr_mlock 0 nr_page_table_pages 0 nr_page_table_pages 0 nr_kernel_stack 0 nr_kernel_stack 0 nr_bounce0 nr_bounce0 nr_zspages 0 nr_zspages 0 numa_hit 0*nr_free_cma 0* numa_miss 0numa_hit 0 numa_foreign 0 numa_miss0 numa_interleave 0 numa_foreign 0 numa_local 0 numa_interleave 0 numa_other 0 numa_local 0 *nr_free_cma 0*numa_other 0 ...... vm stats threshold: 10 vm stats threshold: 10 ...... The next patch updates the numa stats counter size and threshold. Changelog: v2: a) Modify the name of numa-stats-specific functions and params to avoid confusion with those for zone/node page stats. b) Get rid of showing the number of numa stat threshold in /proc/zoneinfo since the value of this item is a constant. 
Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- drivers/base/node.c| 22 --- include/linux/mmzone.h | 25 +--- include/linux/vmstat.h | 29 + mm/page_alloc.c| 10 ++-- mm/vmstat.c| 159 +++-- 5 files changed, 219 insertions(+), 26 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index d8dc830..3855902 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_node_page_state(dev->id, NUMA_HIT), - sum_zone_node_page_state(dev->id, NUMA_MISS), - sum_zone_node_page_state(dev->id, NUMA_FOREIGN), - sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_node_page_state(dev->id, NUMA_LOCAL), - sum_zone_node_page_state(dev->id, NUMA_OTHER)); + sum_zone_numa_state(dev->id, NUMA_HIT), + sum_zone_numa_state(dev->id, NUMA_MISS), + sum_zone_numa_state(dev->id, NUMA_FOREIGN), + sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_numa_state(dev->id, NUMA_LOCAL), + sum_zone_numa_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], +sum_zone_numa_state(nid, i)); +#endif + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + n += sprintf(buf+n, "%s %lu\n", +vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + +NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fda9afb..582f6d9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,20 @@ struct zone_padding { #define ZONE_PADDING(name) #endif +#ifdef CONFIG_NUMA +enum numa_stat_item { + 
NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN,
[PATCH v2 1/3] mm: Change the call sites of numa statistics items
In this patch, NUMA statistics is separated from zone statistics framework, all the call sites of NUMA stats are changed to use numa-stats-specific functions, it does not have any functionality change except that the number of NUMA stats is shown behind zone page stats when users *read* the zone info. E.g. cat /proc/zoneinfo ***Base*** ***With this patch*** nr_free_pages 3976 nr_free_pages 3976 nr_zone_inactive_anon 0nr_zone_inactive_anon 0 nr_zone_active_anon 0 nr_zone_active_anon 0 nr_zone_inactive_file 0nr_zone_inactive_file 0 nr_zone_active_file 0 nr_zone_active_file 0 nr_zone_unevictable 0 nr_zone_unevictable 0 nr_zone_write_pending 0nr_zone_write_pending 0 nr_mlock 0 nr_mlock 0 nr_page_table_pages 0 nr_page_table_pages 0 nr_kernel_stack 0 nr_kernel_stack 0 nr_bounce0 nr_bounce0 nr_zspages 0 nr_zspages 0 numa_hit 0*nr_free_cma 0* numa_miss 0numa_hit 0 numa_foreign 0 numa_miss0 numa_interleave 0 numa_foreign 0 numa_local 0 numa_interleave 0 numa_other 0 numa_local 0 *nr_free_cma 0*numa_other 0 ...... vm stats threshold: 10 vm stats threshold: 10 ...... The next patch updates the numa stats counter size and threshold. Changelog: v2: a) Modify the name of numa-stats-specific functions and params to avoid confusion with those for zone/node page stats. b) Get rid of showing the number of numa stat threshold in /proc/zoneinfo since the value of this item is a constant. 
Signed-off-by: Kemi Wang --- drivers/base/node.c| 22 --- include/linux/mmzone.h | 25 +--- include/linux/vmstat.h | 29 + mm/page_alloc.c| 10 ++-- mm/vmstat.c| 159 +++-- 5 files changed, 219 insertions(+), 26 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index d8dc830..3855902 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_node_page_state(dev->id, NUMA_HIT), - sum_zone_node_page_state(dev->id, NUMA_MISS), - sum_zone_node_page_state(dev->id, NUMA_FOREIGN), - sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_node_page_state(dev->id, NUMA_LOCAL), - sum_zone_node_page_state(dev->id, NUMA_OTHER)); + sum_zone_numa_state(dev->id, NUMA_HIT), + sum_zone_numa_state(dev->id, NUMA_MISS), + sum_zone_numa_state(dev->id, NUMA_FOREIGN), + sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_numa_state(dev->id, NUMA_LOCAL), + sum_zone_numa_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], +sum_zone_numa_state(nid, i)); +#endif + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + n += sprintf(buf+n, "%s %lu\n", +vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + +NR_VM_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fda9afb..582f6d9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,20 @@ struct zone_padding { #define ZONE_PADDING(name) #endif +#ifdef CONFIG_NUMA +enum numa_stat_item { + NUMA_HIT, /* allocated in 
intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here,
[PATCH v2 3/3] mm: Consider the number in local CPUs when *reading* NUMA stats
To avoid deviation, the per cpu number of NUMA stats in vm_numa_stat_diff[] is included when a user *reads* the NUMA stats. Since NUMA stats are not read by users frequently, and the kernel does not need them to make a decision, it will not be a problem to make the readers more expensive. Changelog: v2: a) new creation. Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- include/linux/vmstat.h | 6 +- mm/vmstat.c| 9 +++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a29bd98..72e9ca6 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum numa_stat_item item) return x; } -static inline unsigned long zone_numa_state(struct zone *zone, +static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum numa_stat_item item) { long x = atomic_long_read(&zone->vm_numa_stat[item]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b015f39..abeab81 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -895,6 +895,10 @@ unsigned long sum_zone_node_page_state(int node, return count; } +/* + * Determine the per node value of a numa stat item. To avoid deviation, + * the per cpu stat number in vm_numa_stat_diff[] is also included. 
+ */ unsigned long sum_zone_numa_state(int node, enum numa_stat_item item) { @@ -903,7 +907,7 @@ unsigned long sum_zone_numa_state(int node, unsigned long count = 0; for (i = 0; i < MAX_NR_ZONES; i++) - count += zone_numa_state(zones + i, item); + count += zone_numa_state_snapshot(zones + i, item); return count; } @@ -1534,7 +1538,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - zone_numa_state(zone, i)); + zone_numa_state_snapshot(zone, i)); #endif seq_printf(m, "\n pagesets"); @@ -1790,6 +1794,7 @@ static bool need_update(int cpu) #ifdef CONFIG_NUMA BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); #endif + /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. -- 2.7.4
[PATCH v2 3/3] mm: Consider the number in local CPUs when *reading* NUMA stats
To avoid deviation, the per cpu number of NUMA stats in vm_numa_stat_diff[] is included when a user *reads* the NUMA stats. Since NUMA stats are not read by users frequently, and the kernel does not need them to make a decision, it will not be a problem to make the readers more expensive. Changelog: v2: a) new creation. Signed-off-by: Kemi Wang --- include/linux/vmstat.h | 6 +- mm/vmstat.c| 9 +++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index a29bd98..72e9ca6 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum numa_stat_item item) return x; } -static inline unsigned long zone_numa_state(struct zone *zone, +static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum numa_stat_item item) { long x = atomic_long_read(&zone->vm_numa_stat[item]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b015f39..abeab81 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -895,6 +895,10 @@ unsigned long sum_zone_node_page_state(int node, return count; } +/* + * Determine the per node value of a numa stat item. To avoid deviation, + * the per cpu stat number in vm_numa_stat_diff[] is also included. 
+ */ unsigned long sum_zone_numa_state(int node, enum numa_stat_item item) { @@ -903,7 +907,7 @@ unsigned long sum_zone_numa_state(int node, unsigned long count = 0; for (i = 0; i < MAX_NR_ZONES; i++) - count += zone_numa_state(zones + i, item); + count += zone_numa_state_snapshot(zones + i, item); return count; } @@ -1534,7 +1538,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], - zone_numa_state(zone, i)); + zone_numa_state_snapshot(zone, i)); #endif seq_printf(m, "\n pagesets"); @@ -1790,6 +1794,7 @@ static bool need_update(int cpu) #ifdef CONFIG_NUMA BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2); #endif + /* * The fast way of checking if there are any vmstat diffs. * This works because the diffs are byte sized items. -- 2.7.4
[PATCH 1/2] mm: Change the call sites of numa statistics items
In this patch, NUMA statistics is separated from zone statistics framework, all the call sites of NUMA stats are changed to use numa-stats-specific functions, it does not have any functionality change except that the value of NUMA stats is shown behind zone page stats, and the threshold size of NUMA stats is shown behind pcp threshold when users *read* the zone info. E.g. cat /proc/zoneinfo ***Base*** ***With this patch*** nr_free_pages 3976 nr_free_pages 3976 nr_zone_inactive_anon 0nr_zone_inactive_anon 0 nr_zone_active_anon 0 nr_zone_active_anon 0 nr_zone_inactive_file 0nr_zone_inactive_file 0 nr_zone_active_file 0 nr_zone_active_file 0 nr_zone_unevictable 0 nr_zone_unevictable 0 nr_zone_write_pending 0nr_zone_write_pending 0 nr_mlock 0 nr_mlock 0 nr_page_table_pages 0 nr_page_table_pages 0 nr_kernel_stack 0 nr_kernel_stack 0 nr_bounce0 nr_bounce0 nr_zspages 0 nr_zspages 0 numa_hit 0*nr_free_cma 0* numa_miss 0numa_hit 0 numa_foreign 0 numa_miss0 numa_interleave 0 numa_foreign 0 numa_local 0 numa_interleave 0 numa_other 0 numa_local 0 *nr_free_cma 0*numa_other 0 ...... vm stats threshold: 10 vm stats threshold: 10 ... *vm numa stats threshold: 10* ... The next patch updates the numa stats counter size and threshold. 
Signed-off-by: Kemi Wang <kemi.w...@intel.com> --- drivers/base/node.c| 22 --- include/linux/mmzone.h | 25 +--- include/linux/vmstat.h | 29 + mm/page_alloc.c| 10 +-- mm/vmstat.c| 167 +++-- 5 files changed, 227 insertions(+), 26 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index d8dc830..12080c6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_node_page_state(dev->id, NUMA_HIT), - sum_zone_node_page_state(dev->id, NUMA_MISS), - sum_zone_node_page_state(dev->id, NUMA_FOREIGN), - sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_node_page_state(dev->id, NUMA_LOCAL), - sum_zone_node_page_state(dev->id, NUMA_OTHER)); + sum_zone_node_numa_state(dev->id, NUMA_HIT), + sum_zone_node_numa_state(dev->id, NUMA_MISS), + sum_zone_node_numa_state(dev->id, NUMA_FOREIGN), + sum_zone_node_numa_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_node_numa_state(dev->id, NUMA_LOCAL), + sum_zone_node_numa_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_ZONE_NUMA_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], +sum_zone_node_numa_state(nid, i)); +#endif + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + n += sprintf(buf+n, "%s %lu\n", +vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + +NR_VM_ZONE_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc14b8b..0b11ba7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,20 @@ struct zone_padding { #define ZONE_PADDING(name) #endif 
+#ifdef CONFIG_NUMA +enum zone_numa_stat_item { + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLE
[PATCH 1/2] mm: Change the call sites of numa statistics items
In this patch, NUMA statistics is separated from zone statistics framework, all the call sites of NUMA stats are changed to use numa-stats-specific functions, it does not have any functionality change except that the value of NUMA stats is shown behind zone page stats, and the threshold size of NUMA stats is shown behind pcp threshold when users *read* the zone info. E.g. cat /proc/zoneinfo ***Base*** ***With this patch*** nr_free_pages 3976 nr_free_pages 3976 nr_zone_inactive_anon 0nr_zone_inactive_anon 0 nr_zone_active_anon 0 nr_zone_active_anon 0 nr_zone_inactive_file 0nr_zone_inactive_file 0 nr_zone_active_file 0 nr_zone_active_file 0 nr_zone_unevictable 0 nr_zone_unevictable 0 nr_zone_write_pending 0nr_zone_write_pending 0 nr_mlock 0 nr_mlock 0 nr_page_table_pages 0 nr_page_table_pages 0 nr_kernel_stack 0 nr_kernel_stack 0 nr_bounce0 nr_bounce0 nr_zspages 0 nr_zspages 0 numa_hit 0*nr_free_cma 0* numa_miss 0numa_hit 0 numa_foreign 0 numa_miss0 numa_interleave 0 numa_foreign 0 numa_local 0 numa_interleave 0 numa_other 0 numa_local 0 *nr_free_cma 0*numa_other 0 ...... vm stats threshold: 10 vm stats threshold: 10 ... *vm numa stats threshold: 10* ... The next patch updates the numa stats counter size and threshold. 
Signed-off-by: Kemi Wang --- drivers/base/node.c| 22 --- include/linux/mmzone.h | 25 +--- include/linux/vmstat.h | 29 + mm/page_alloc.c| 10 +-- mm/vmstat.c| 167 +++-- 5 files changed, 227 insertions(+), 26 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index d8dc830..12080c6 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev, "interleave_hit %lu\n" "local_node %lu\n" "other_node %lu\n", - sum_zone_node_page_state(dev->id, NUMA_HIT), - sum_zone_node_page_state(dev->id, NUMA_MISS), - sum_zone_node_page_state(dev->id, NUMA_FOREIGN), - sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT), - sum_zone_node_page_state(dev->id, NUMA_LOCAL), - sum_zone_node_page_state(dev->id, NUMA_OTHER)); + sum_zone_node_numa_state(dev->id, NUMA_HIT), + sum_zone_node_numa_state(dev->id, NUMA_MISS), + sum_zone_node_numa_state(dev->id, NUMA_FOREIGN), + sum_zone_node_numa_state(dev->id, NUMA_INTERLEAVE_HIT), + sum_zone_node_numa_state(dev->id, NUMA_LOCAL), + sum_zone_node_numa_state(dev->id, NUMA_OTHER)); } static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL); @@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev, n += sprintf(buf+n, "%s %lu\n", vmstat_text[i], sum_zone_node_page_state(nid, i)); - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) +#ifdef CONFIG_NUMA + for (i = 0; i < NR_VM_ZONE_NUMA_STAT_ITEMS; i++) n += sprintf(buf+n, "%s %lu\n", vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], +sum_zone_node_numa_state(nid, i)); +#endif + + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + n += sprintf(buf+n, "%s %lu\n", +vmstat_text[i + NR_VM_ZONE_STAT_ITEMS + +NR_VM_ZONE_NUMA_STAT_ITEMS], node_page_state(pgdat, i)); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc14b8b..0b11ba7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,20 @@ struct zone_padding { #define ZONE_PADDING(name) #endif +#ifdef CONFIG_NUMA +enum 
zone_numa_stat_item { + NUMA_HIT, /* allocated in intended node */ + NUMA_MISS, /* allocated in non intended node */ + NUMA_FOREIGN, /* was intended here, hit elsewhere */ + NUMA_INTERLEAVE_HIT,/* interleaver p
[PATCH 0/2] Separate NUMA statistics from zone statistics
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) updated in parallel in multi-threaded page allocation (pointed out by Dave Hansen). To mitigate this overhead, this patchset separates NUMA statistics from zone statistics framework, and updates the NUMA counter threshold to a fixed size of 32765, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter (suggested by Ying Huang). The rationale is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 26.6% drop of CPU cycles(537-->394, see below) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Meanwhile, this patchset keeps the same style of virtual memory statistics with little end-user-visible effects (see the first patch for details), except that the number of NUMA items in each cpu (vm_numa_stat_diff[]) is added to zone->vm_numa_stat[] when a user *reads* the value of NUMA counter to eliminate deviation. I did an experiment of single page allocation and reclaim concurrently using Jesper's page_bench03 benchmark on a 2-Socket Broadwell-based server (88 processors with 126G memory) with different size of threshold of pcp counter. 
Benchmark provided by Jesper D. Brouer (increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Threshold CPU cycles Throughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default 256 468 412397590 512 428 450550704 4096 399 482520943 2 394 489009617 3 395 488017817 32765 394(-26.6%) 488932078(+36.2%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Kemi Wang (2): mm: Change the call sites of numa statistics items mm: Update NUMA counter threshold size drivers/base/node.c| 22 --- include/linux/mmzone.h | 25 +--- include/linux/vmstat.h | 33 ++ mm/page_alloc.c| 10 +-- mm/vmstat.c| 162 +++-- 5 files changed, 227 insertions(+), 25 deletions(-) -- 2.7.4
[PATCH 0/2] Separate NUMA statistics from zone statistics
Each page allocation updates a set of per-zone statistics with a call to zone_statistics(). As discussed in 2017 MM summit, these are a substantial source of overhead in the page allocator and are very rarely consumed. There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) updated in parallel in multi-threaded page allocation (pointed out by Dave Hansen). To mitigate this overhead, this patchset separates NUMA statistics from zone statistics framework, and updates the NUMA counter threshold to a fixed size of 32765, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter (suggested by Ying Huang). The rationale is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 26.6% drop of CPU cycles(537-->394, see below) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Meanwhile, this patchset keeps the same style of virtual memory statistics with little end-user-visible effects (see the first patch for details), except that the number of NUMA items in each cpu (vm_numa_stat_diff[]) is added to zone->vm_numa_stat[] when a user *reads* the value of NUMA counter to eliminate deviation. I did an experiment of single page allocation and reclaim concurrently using Jesper's page_bench03 benchmark on a 2-Socket Broadwell-based server (88 processors with 126G memory) with different size of threshold of pcp counter. 
Benchmark provided by Jesper D. Brouer (increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Threshold CPU cycles Throughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default 256 468 412397590 512 428 450550704 4096 399 482520943 2 394 489009617 3 395 488017817 32765 394(-26.6%) 488932078(+36.2%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Kemi Wang (2): mm: Change the call sites of numa statistics items mm: Update NUMA counter threshold size drivers/base/node.c| 22 --- include/linux/mmzone.h | 25 +--- include/linux/vmstat.h | 33 ++ mm/page_alloc.c| 10 +-- mm/vmstat.c| 162 +++-- 5 files changed, 227 insertions(+), 25 deletions(-) -- 2.7.4
[PATCH 2/2] mm: Update NUMA counter threshold size
There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (suggested by Dave Hansen). This patch updates NUMA counter threshold to a fixed size of 32765, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter, and the number of NUMA items in each cpu (vm_numa_stat_diff[]) is added to zone->vm_numa_stat[] when a user *reads* the value of numa counter to eliminate deviation (suggested by Ying Huang). The rationality is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 26.6% drop of CPU cycles(537-->394) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Broucer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Threshold CPU cyclesThroughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default (base) 256 468 412397590 512 428 450550704 4096399 482520943 2 394 489009617 3 395 488017817 32765 394(-26.6%) 488932078(+36.2%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Signed-off-by: Kemi Wang <kemi.w...@intel.com> Suggested-by: Dave Hansen <dave.han...@intel.com> Suggested-by: Ying Huang <ying.hu...@intel.com> --- include/linux/mmzone.h | 4 ++-- include/linux/vmstat.h | 6 +- mm/vmstat.c| 23 ++- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0b11ba7..7eaf0e8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -282,8 +282,8 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - s8 numa_stat_threshold; - s8 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS]; + s16 numa_stat_threshold; + s16 
vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1e19379..d97cc34 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum zone_numa_stat_item item) return x; } -static inline unsigned long zone_numa_state(struct zone *zone, +static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum zone_numa_stat_item item) { long x = atomic_long_read(>vm_numa_stat[item]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 5a7fa30..c7f50ed 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STAT_THRESHOLD 32765 + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -196,7 +198,7 @@ void refresh_zone_stat_thresholds(void) = threshold; #ifdef CONFIG_NUMA per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; + = NUMA_STAT_THRESHOLD; #endif /* Base nodestat threshold on the largest populated zone. */ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; @@ -231,14 +233,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif - } } } @@ -872,13 +869,13 @@ void __inc_zone_numa_state(struct zone *zone, enum zone_numa_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; - s8 __percpu *p = pcp->vm_numa_stat_diff + item; - s8 v, t; + s16 __percpu *p = pcp->
[PATCH 2/2] mm: Update NUMA counter threshold size
There is significant overhead in cache bouncing caused by zone counters (NUMA associated counters) update in parallel in multi-threaded page allocation (suggested by Dave Hansen). This patch updates NUMA counter threshold to a fixed size of 32765, as a small threshold greatly increases the update frequency of the global counter from local per cpu counter, and the number of NUMA items in each cpu (vm_numa_stat_diff[]) is added to zone->vm_numa_stat[] when a user *reads* the value of numa counter to eliminate deviation (suggested by Ying Huang). The rationality is that these statistics counters don't need to be read often, unlike other VM counters, so it's not a problem to use a large threshold and make readers more expensive. With this patchset, we see 26.6% drop of CPU cycles(537-->394) for per single page allocation and reclaim on Jesper's page_bench03 benchmark. Benchmark provided by Jesper D Broucer(increase loop times to 1000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench Threshold CPU cyclesThroughput(88 threads) 32 799 241760478 64 640 301628829 125 537 358906028 <==> system by default (base) 256 468 412397590 512 428 450550704 4096399 482520943 2 394 489009617 3 395 488017817 32765 394(-26.6%) 488932078(+36.2%) <==> with this patchset N/A 342(-36.3%) 562900157(+56.8%) <==> disable zone_statistics Signed-off-by: Kemi Wang Suggested-by: Dave Hansen Suggested-by: Ying Huang --- include/linux/mmzone.h | 4 ++-- include/linux/vmstat.h | 6 +- mm/vmstat.c| 23 ++- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0b11ba7..7eaf0e8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -282,8 +282,8 @@ struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; - s8 numa_stat_threshold; - s8 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS]; + s16 numa_stat_threshold; + s16 vm_numa_stat_diff[NR_VM_ZONE_NUMA_STAT_ITEMS]; #endif #ifdef 
CONFIG_SMP s8 stat_threshold; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1e19379..d97cc34 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,10 +125,14 @@ static inline unsigned long global_numa_state(enum zone_numa_stat_item item) return x; } -static inline unsigned long zone_numa_state(struct zone *zone, +static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum zone_numa_stat_item item) { long x = atomic_long_read(>vm_numa_stat[item]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 5a7fa30..c7f50ed 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -30,6 +30,8 @@ #include "internal.h" +#define NUMA_STAT_THRESHOLD 32765 + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); @@ -196,7 +198,7 @@ void refresh_zone_stat_thresholds(void) = threshold; #ifdef CONFIG_NUMA per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; + = NUMA_STAT_THRESHOLD; #endif /* Base nodestat threshold on the largest populated zone. */ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold; @@ -231,14 +233,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_online_cpu(cpu) { + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; -#ifdef CONFIG_NUMA - per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold - = threshold; -#endif - } } } @@ -872,13 +869,13 @@ void __inc_zone_numa_state(struct zone *zone, enum zone_numa_stat_item item) { struct per_cpu_pageset __percpu *pcp = zone->pageset; - s8 __percpu *p = pcp->vm_numa_stat_diff + item; - s8 v, t; + s16 __percpu *p = pcp->vm_numa_stat_diff + item; + s16 v, t; v = __this_cpu_inc_return(*p);