[PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-14 Thread Mel Gorman
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored
in the same struct per_cpu_pageset even though vmstats have no direct
impact on the per-cpu page lists. This is inconsistent because the vmstats
for a node are stored in a dedicated structure. The bigger issue is that
the per_cpu_pages structure is not cache-aligned, so stat updates either
cache conflict with adjacent per-cpu lists, incurring a runtime cost, or
padding is required, incurring a memory cost.
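
(Editorial illustration, not part of the patch: a compilable userspace mock of
the trade-off described above. The 64-byte cacheline size, type names and
array sizes are assumptions for illustration only.)

	/* If the stat deltas live in the same per-cpu structure as the page
	 * lists, a stat update dirties a cacheline the allocator fast path
	 * also touches (runtime cost).  Forcing the deltas onto their own
	 * cacheline avoids the conflict but grows every per-cpu instance
	 * (memory cost). */
	#include <stdalign.h>
	#include <stdio.h>

	struct unpadded {		/* deltas can share a line with the lists */
		int count, high, batch;
		signed char stat_diff[11];
	};

	struct padded {			/* deltas start on their own 64-byte line */
		int count, high, batch;
		alignas(64) signed char stat_diff[11];
	};

	int main(void)
	{
		printf("unpadded: %zu bytes\n", sizeof(struct unpadded));	/* e.g. 24 */
		printf("padded:   %zu bytes\n", sizeof(struct padded));	/* e.g. 128 */
		return 0;
	}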

This patch splits the per-cpu pagelists and the vmstat deltas into separate
structures. It's mostly a mechanical conversion but some variable renaming
is done to clearly distinguish the per-cpu pages structure (pcp) from
the vmstats (pzstats).

Superficially, this appears to increase the size of the per_cpu_pages
structure but the movement of expire fills a structure hole so there is
no impact overall.
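
(Editorial illustration, not part of the patch: a userspace mock of why moving
"expire" into per_cpu_pages does not change its size on a typical 64-bit ABI.
The MIGRATE_PCPTYPES value below is an assumption for illustration only.)

	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };
	#define MIGRATE_PCPTYPES 3	/* illustrative value */

	struct pcp_before {		/* count/high/batch followed by the lists */
		int count, high, batch;
		/* 4-byte hole: lists[] needs 8-byte alignment */
		struct list_head lists[MIGRATE_PCPTYPES];
	};

	struct pcp_after {		/* expire moved in from per_cpu_pageset */
		int count, high, batch;
		int expire;		/* fills the hole above */
		struct list_head lists[MIGRATE_PCPTYPES];
	};

	int main(void)
	{
		/* Both print the same size, e.g. 64 bytes on x86-64 */
		printf("before: %zu, after: %zu\n",
		       sizeof(struct pcp_before), sizeof(struct pcp_after));
		return 0;
	}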

[l...@intel.com: Check struct per_cpu_zonestat has a non-zero size]
[vba...@suse.cz: Init zone->per_cpu_zonestats properly]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 18 
 include/linux/vmstat.h |  8 ++--
 mm/page_alloc.c        | 85 -
 mm/vmstat.c            | 96 ++
 4 files changed, 111 insertions(+), 96 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..a4393ac27336 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -341,20 +341,21 @@ struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+   int expire; /* When 0, remote pagesets are drained */
+#endif
 
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
 };
 
-struct per_cpu_pageset {
-   struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
-   s8 expire;
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
-   s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+   s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 };
 
@@ -470,7 +471,8 @@ struct zone {
int node;
 #endif
struct pglist_data  *zone_pgdat;
-   struct per_cpu_pageset __percpu *pageset;
+   struct per_cpu_pages    __percpu *per_cpu_pageset;
+   struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
 * the high and batch values are copied to individual pagesets for
 * faster access
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..1736ea9d24a7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
int cpu;
 
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item];
 
return x;
 }
@@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 #ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item];
 
if (x < 0)
x = 0;
@@ -291,7 +291,7 @@ struct ctl_table;
 int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
loff_t *ppos);
 
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
 
 int calculate_pressure_threshold(struct zone *zone);
 int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
 static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
-   struct per_cpu_pageset *pset) { }
+   struct per_cpu_zonestat *pzstats) { }
 #endif /* CONFIG_SMP */
 
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bf0db982f14..2d6283cab22d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
unsigned long flags;
-   struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
 
local_irq_save(flags);
-   pset = per_cpu_ptr(zone->pageset, cpu);
 
-   pcp = &pset->pcp;
+   pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)

Re: [PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-13 Thread Mel Gorman
On Mon, Apr 12, 2021 at 07:43:18PM +0200, Vlastimil Babka wrote:
> On 4/7/21 10:24 PM, Mel Gorman wrote:
> > @@ -6691,7 +6697,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
> >  * relies on the ability of the linker to provide the
> >  * offset of a (static) per cpu variable into the per cpu area.
> >  */
> > -   zone->pageset = &boot_pageset;
> > +   zone->per_cpu_pageset = &boot_pageset;
> 
> I don't see any &boot_zonestats assignment here in zone_pcp_init() or its
> caller(s), which seems strange, as zone_pcp_reset() does it.
> 

Yes, it's required, well spotted!
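
[Editorial note: a sketch of the fix presumably folded into the later posting
(per its "Init zone->per_cpu_zonestats properly" attribution), mirroring the
zone_pcp_reset() hunk later in the thread, from which the boot_zonestats name
is inferred.]

@@ static __meminit void zone_pcp_init(struct zone *zone)
 	zone->per_cpu_pageset = &boot_pageset;
+	zone->per_cpu_zonestats = &boot_zonestats;
 	zone->pageset_high = BOOT_PAGESET_HIGH;
 	zone->pageset_batch = BOOT_PAGESET_BATCH;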

-- 
Mel Gorman
SUSE Labs


Re: [PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-12 Thread Vlastimil Babka
On 4/7/21 10:24 PM, Mel Gorman wrote:
> @@ -6691,7 +6697,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
>* relies on the ability of the linker to provide the
>* offset of a (static) per cpu variable into the per cpu area.
>*/
> - zone->pageset = &boot_pageset;
> + zone->per_cpu_pageset = &boot_pageset;

I don't see any &boot_zonestats assignment here in zone_pcp_init() or its
caller(s), which seems strange, as zone_pcp_reset() does it.

>   zone->pageset_high = BOOT_PAGESET_HIGH;
>   zone->pageset_batch = BOOT_PAGESET_BATCH;
>  
> @@ -8954,17 +8960,19 @@ void zone_pcp_reset(struct zone *zone)
>  {
>   unsigned long flags;
>   int cpu;
> - struct per_cpu_pageset *pset;
> + struct per_cpu_zonestat *pzstats;
>  
>   /* avoid races with drain_pages()  */
>   local_irq_save(flags);
> - if (zone->pageset != &boot_pageset) {
> + if (zone->per_cpu_pageset != &boot_pageset) {
>   for_each_online_cpu(cpu) {
> - pset = per_cpu_ptr(zone->pageset, cpu);
> - drain_zonestat(zone, pset);
> + pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
> + drain_zonestat(zone, pzstats);
>   }
> - free_percpu(zone->pageset);
> - zone->pageset = &boot_pageset;
> + free_percpu(zone->per_cpu_pageset);
> + free_percpu(zone->per_cpu_zonestats);
> + zone->per_cpu_pageset = &boot_pageset;
> + zone->per_cpu_zonestats = &boot_zonestats;

^ here

>   }
>   local_irq_restore(flags);
>  }


[PATCH 01/11] mm/page_alloc: Split per cpu page lists and zone stats

2021-04-07 Thread Mel Gorman
The per-cpu page allocator lists and the per-cpu vmstat deltas are stored
in the same struct per_cpu_pageset even though vmstats have no direct
impact on the per-cpu page lists. This is inconsistent because the vmstats
for a node are stored in a dedicated structure. The bigger issue is that
the per_cpu_pages structure is not cache-aligned, so stat updates either
cache conflict with adjacent per-cpu lists, incurring a runtime cost, or
padding is required, incurring a memory cost.

This patch splits the per-cpu pagelists and the vmstat deltas into separate
structures. It's mostly a mechanical conversion but some variable renaming
is done to clearly distinguish the per-cpu pages structure (pcp) from
the vmstats (pzstats).

Superficially, this appears to increase the size of the per_cpu_pages
structure but the movement of expire fills a structure hole so there is
no impact overall.

[l...@intel.com: Check struct per_cpu_zonestat has a non-zero size]
Signed-off-by: Mel Gorman 
---
 include/linux/mmzone.h | 18 
 include/linux/vmstat.h |  8 ++--
 mm/page_alloc.c        | 84 +++-
 mm/vmstat.c            | 96 ++
 4 files changed, 110 insertions(+), 96 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 47946cec7584..a4393ac27336 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -341,20 +341,21 @@ struct per_cpu_pages {
int count;  /* number of pages in the list */
int high;   /* high watermark, emptying needed */
int batch;  /* chunk size for buddy add/remove */
+#ifdef CONFIG_NUMA
+   int expire; /* When 0, remote pagesets are drained */
+#endif
 
/* Lists of pages, one per migrate type stored on the pcp-lists */
struct list_head lists[MIGRATE_PCPTYPES];
 };
 
-struct per_cpu_pageset {
-   struct per_cpu_pages pcp;
-#ifdef CONFIG_NUMA
-   s8 expire;
-   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
-#endif
+struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
-   s8 stat_threshold;
s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
+   s8 stat_threshold;
+#endif
+#ifdef CONFIG_NUMA
+   u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 };
 
@@ -470,7 +471,8 @@ struct zone {
int node;
 #endif
struct pglist_data  *zone_pgdat;
-   struct per_cpu_pageset __percpu *pageset;
+   struct per_cpu_pages    __percpu *per_cpu_pageset;
+   struct per_cpu_zonestat __percpu *per_cpu_zonestats;
/*
 * the high and batch values are copied to individual pagesets for
 * faster access
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 506d625163a1..1736ea9d24a7 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,7 +163,7 @@ static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
int cpu;
 
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_numa_stat_diff[item];
 
return x;
 }
@@ -236,7 +236,7 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 #ifdef CONFIG_SMP
int cpu;
for_each_online_cpu(cpu)
-   x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];
+   x += per_cpu_ptr(zone->per_cpu_zonestats, cpu)->vm_stat_diff[item];
 
if (x < 0)
x = 0;
@@ -291,7 +291,7 @@ struct ctl_table;
 int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
loff_t *ppos);
 
-void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
+void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);
 
 int calculate_pressure_threshold(struct zone *zone);
 int calculate_normal_threshold(struct zone *zone);
@@ -399,7 +399,7 @@ static inline void cpu_vm_stats_fold(int cpu) { }
 static inline void quiet_vmstat(void) { }
 
 static inline void drain_zonestat(struct zone *zone,
-   struct per_cpu_pageset *pset) { }
+   struct per_cpu_zonestat *pzstats) { }
 #endif /* CONFIG_SMP */
 
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e8aedb64b57..a68bacddcae0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2981,15 +2981,14 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
unsigned long flags;
-   struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
 
local_irq_save(flags);
-   pset = per_cpu_ptr(zone->pageset, cpu);
 
-   pcp = &pset->pcp;
+   pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
+