[PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored in the same struct per_cpu_pages even though vmstats have no direct impact on the per-cpu page lists. This is inconsistent because the vmstats for a node are stored on a dedicated structure. The bigger issue is that the per_cpu_pages structure is not cache-aligned and stat updates either cache conflict with adjacent per-cpu lists incurring a runtime cost or padding is required incurring a memory cost. This patch splits the per-cpu pagelists and the vmstat deltas into separate structures. It's mostly a mechanical conversion but some variable renaming is done to clearly distinguish the per-cpu pages structure (pcp) from the vmstats (pzstats). Superficially, this appears to increase the size of the per_cpu_pages structure but the movement of expire fills a structure hole so there is no impact overall. [l...@intel.com: Check struct per_cpu_zonestat has a non-zero size] [vba...@suse.cz: Init zone->per_cpu_zonestats properly] Signed-off-by: Mel Gorman --- include/linux/mmzone.h | 18 include/linux/vmstat.h | 8 ++-- mm/page_alloc.c| 85 - mm/vmstat.c| 96 ++ 4 files changed, 111 insertions(+), 96 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 47946cec7584..a4393ac27336 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -341,20 +341,21 @@ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ +#ifdef CONFIG_NUMA + int expire; /* When 0, remote pagesets are drained */ +#endif /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[MIGRATE_PCPTYPES]; }; -struct per_cpu_pageset { - struct per_cpu_pages pcp; -#ifdef CONFIG_NUMA - s8 expire; - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; -#endif +struct per_cpu_zonestat { #ifdef CONFIG_SMP - s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; + s8 stat_threshold; +#endif +#ifdef CONFIG_NUMA + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif }; @@ -470,7 +471,8 @@ struct zone { int node; #endif struct pglist_data *zone_pgdat; - struct per_cpu_pageset __percpu *pageset; + struct per_cpu_pages__percpu *per_cpu_pageset; + struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 506d625163a1..1736ea9d24a7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone, int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]; return x; } @@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; @@ -291,7 +291,7 @@ struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); -void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); @@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, - struct per_cpu_pageset *pset) { } + struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9bf0db982f14..2d6283cab22d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = >pcp; + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count)
Re: [PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats
On Mon, Apr 12, 2021 at 07:43:18PM +0200, Vlastimil Babka wrote: > On 4/7/21 10:24 PM, Mel Gorman wrote: > > @@ -6691,7 +6697,7 @@ static __meminit void zone_pcp_init(struct zone *zone) > > * relies on the ability of the linker to provide the > > * offset of a (static) per cpu variable into the per cpu area. > > */ > > - zone->pageset = _pageset; > > + zone->per_cpu_pageset = _pageset; > > I don't see any _zonestats assignment here in zone_pcp_init() or its > caller(s), which seems strange, as zone_pcp_reset() does it. > Yes, it's required, well spotted! -- Mel Gorman SUSE Labs
Re: [PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats
On 4/7/21 10:24 PM, Mel Gorman wrote: > @@ -6691,7 +6697,7 @@ static __meminit void zone_pcp_init(struct zone *zone) >* relies on the ability of the linker to provide the >* offset of a (static) per cpu variable into the per cpu area. >*/ > - zone->pageset = _pageset; > + zone->per_cpu_pageset = _pageset; I don't see any _zonestats assignment here in zone_pcp_init() or its caller(s), which seems strange, as zone_pcp_reset() does it. > zone->pageset_high = BOOT_PAGESET_HIGH; > zone->pageset_batch = BOOT_PAGESET_BATCH; > > @@ -8954,17 +8960,19 @@ void zone_pcp_reset(struct zone *zone) > { > unsigned long flags; > int cpu; > - struct per_cpu_pageset *pset; > + struct per_cpu_zonestat *pzstats; > > /* avoid races with drain_pages() */ > local_irq_save(flags); > - if (zone->pageset != _pageset) { > + if (zone->per_cpu_pageset != _pageset) { > for_each_online_cpu(cpu) { > - pset = per_cpu_ptr(zone->pageset, cpu); > - drain_zonestat(zone, pset); > + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu); > + drain_zonestat(zone, pzstats); > } > - free_percpu(zone->pageset); > - zone->pageset = _pageset; > + free_percpu(zone->per_cpu_pageset); > + free_percpu(zone->per_cpu_zonestats); > + zone->per_cpu_pageset = _pageset; > + zone->per_cpu_zonestats = _zonestats; ^ here > } > local_irq_restore(flags); > }
[PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored in the same struct per_cpu_pages even though vmstats have no direct impact on the per-cpu page lists. This is inconsistent because the vmstats for a node are stored on a dedicated structure. The bigger issue is that the per_cpu_pages structure is not cache-aligned and stat updates either cache conflict with adjacent per-cpu lists incurring a runtime cost or padding is required incurring a memory cost. This patch splits the per-cpu pagelists and the vmstat deltas into separate structures. It's mostly a mechanical conversion but some variable renaming is done to clearly distinguish the per-cpu pages structure (pcp) from the vmstats (pzstats). Superficially, this appears to increase the size of the per_cpu_pages structure but the movement of expire fills a structure hole so there is no impact overall. [l...@intel.com: Check struct per_cpu_zonestat has a non-zero size] Signed-off-by: Mel Gorman --- include/linux/mmzone.h | 18 include/linux/vmstat.h | 8 ++-- mm/page_alloc.c| 84 +++- mm/vmstat.c| 96 ++ 4 files changed, 110 insertions(+), 96 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 47946cec7584..a4393ac27336 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -341,20 +341,21 @@ struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ +#ifdef CONFIG_NUMA + int expire; /* When 0, remote pagesets are drained */ +#endif /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[MIGRATE_PCPTYPES]; }; -struct per_cpu_pageset { - struct per_cpu_pages pcp; -#ifdef CONFIG_NUMA - s8 expire; - u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; -#endif +struct per_cpu_zonestat { #ifdef CONFIG_SMP - s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; + s8 stat_threshold; +#endif +#ifdef CONFIG_NUMA + u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif }; @@ -470,7 +471,8 @@ struct zone { int node; #endif struct pglist_data *zone_pgdat; - struct per_cpu_pageset __percpu *pageset; + struct per_cpu_pages__percpu *per_cpu_pageset; + struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* * the high and batch values are copied to individual pagesets for * faster access diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 506d625163a1..1736ea9d24a7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone, int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item]; return x; } @@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) - x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; + x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; @@ -291,7 +291,7 @@ struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); -void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); +void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); @@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, - struct per_cpu_pageset *pset) { } + struct per_cpu_zonestat *pzstats) { } #endif /* CONFIG_SMP */ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e8aedb64b57..a68bacddcae0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = >pcp; + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); +