From: Dave Hansen <[email protected]>

The percpu pages calculations are a bit convoluted.  Right now,
zone_batchsize() claims to be calculating the ->batch size, but
what actually happens is:

1. Calculate how large we want the entire pcp set to be (->high)
2. Scale that down by the desired high:batch ratio
3. Adjust ->batch for good cache-coloring behavior
4. Re-derive ->high by scaling ->batch back up by the ratio from (2)

We actually feed the cache-coloring scaling back into the ->high
value, when it really only *should* apply to the batch value.
That was probably unintentional, and it was one of the things
that led to the mismatched high:batch ratio that we saw in the
previous patch.
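
Condensed, the old flow looked something like this (step 4 is
paraphrased from the previous patch's pageset helper, which is
not visible in this diff, so its exact form here is an
assumption):

	/* old flow: the cache-coloring tweak leaks into ->high */
	batch = zone->managed_pages / 1024;	/* (1) size the whole pool */
	if (batch * PAGE_SIZE > 512 * 1024)
		batch = (512 * 1024) / PAGE_SIZE;
	batch /= pcp_high_to_batch_ratio;	/* (2) scale down to ->batch */
	batch = rounddown_pow_of_two(batch + batch/2) - 1;
						/* (3) cache-coloring tweak */
	high = batch * pcp_high_to_batch_ratio;	/* (4) ->high inherits (3) */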

This patch reorganizes the code.  It separates out the ->batch
and ->high calculations so that it's clear when we are
calculating each of them.  It also ensures that we always
calculate ->high _first_, then derive ->batch from it, and
finally adjust ->batch for good cache-coloring behavior.
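
After this patch the order is fixed and the coloring tweak stays
local to ->batch, roughly:

	high  = calculate_zone_pcp_high(zone);		   /* ->high first */
	batch = max(1UL, high / pcp_high_to_batch_ratio);  /* derive ->batch */
	batch = rounddown_pow_of_two(batch + batch/2) - 1; /* coloring, ->batch only */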

Since we are no longer calculating the batch size by itself, it
is not simple to print it in zone_pcp_init() during boot.
Instead, we print out the 'high' value.  If anyone really misses
the old output, they can just read /proc/zoneinfo after boot.
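
For reference, the per-zone pageset values show up in
/proc/zoneinfo looking something like this (the values and exact
layout here are illustrative):

	  pagesets
	    cpu: 0
	              count: 45
	              high:  186
	              batch: 31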

Signed-off-by: Dave Hansen <[email protected]>
---

 linux.git-davehans/mm/page_alloc.c |   54 ++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

diff -puN mm/page_alloc.c~rename-zone_batchsize mm/page_alloc.c
--- linux.git/mm/page_alloc.c~rename-zone_batchsize     2013-10-15 09:57:07.597688692 -0700
+++ linux.git-davehans/mm/page_alloc.c  2013-10-15 09:57:07.602688914 -0700
@@ -4061,10 +4061,10 @@ static void __meminit zone_init_free_lis
 
 static int pcp_high_to_batch_ratio = 4;
 
-static int zone_batchsize(struct zone *zone)
+static int calculate_zone_pcp_high(struct zone *zone)
 {
 #ifdef CONFIG_MMU
-       int batch;
+       int high;
 
        /*
         * The per-cpu-pages pools are set to around 1000th of the
@@ -4072,26 +4072,13 @@ static int zone_batchsize(struct zone *z
         *
         * OK, so we don't know how big the cache is.  So guess.
         */
-       batch = zone->managed_pages / 1024;
-       if (batch * PAGE_SIZE > 512 * 1024)
-               batch = (512 * 1024) / PAGE_SIZE;
-       batch /= pcp_high_to_batch_ratio;
-       if (batch < 1)
-               batch = 1;
-
-       /*
-        * Clamp the batch to a 2^n - 1 value. Having a power
-        * of 2 value was found to be more likely to have
-        * suboptimal cache aliasing properties in some cases.
-        *
-        * For example if 2 tasks are alternately allocating
-        * batches of pages, one task can end up with a lot
-        * of pages of one half of the possible page colors
-        * and the other with pages of the other colors.
-        */
-       batch = rounddown_pow_of_two(batch + batch/2) - 1;
+       high = zone->managed_pages / 1024;
+       if (high * PAGE_SIZE > 512 * 1024)
+               high = (512 * 1024) / PAGE_SIZE;
+       if (high < 1)
+               high = 1;
 
-       return batch;
+       return high;
 
 #else
        /* The deferral and batching of frees should be suppressed under NOMMU
@@ -4181,6 +4168,19 @@ static void pageset_setup_from_high_mark
        unsigned long batch = max(1UL, high / pcp_high_to_batch_ratio);
        if ((high / pcp_high_to_batch_ratio) > (PAGE_SHIFT * 8))
                batch = PAGE_SHIFT * 8;
+       /*
+        * Clamp the batch to a 2^n - 1 value. Having a power
+        * of 2 value was found to be more likely to have
+        * suboptimal cache aliasing properties in some cases.
+        *
+        * For example if 2 tasks are alternately allocating
+        * batches of pages, one task can end up with a lot
+        * of pages of one half of the possible page colors
+        * and the other with pages of the other colors.
+        */
+       batch = rounddown_pow_of_two(batch + batch/2) - 1;
+       if (!batch)
+               batch = 1;
 
        pageset_update(&p->pcp, high, batch);
 }
@@ -4188,12 +4188,12 @@ static void pageset_setup_from_high_mark
 static void pageset_set_high_and_batch(struct zone *zone,
                struct per_cpu_pageset *pcp)
 {
+       int high;
        if (percpu_pagelist_fraction)
-               pageset_setup_from_high_mark(pcp,
-                       (zone->managed_pages /
-                               percpu_pagelist_fraction));
+               high = (zone->managed_pages / percpu_pagelist_fraction);
        else
-               pageset_setup_from_batch_size(pcp, zone_batchsize(zone));
+               high = calculate_zone_pcp_high(zone);
+       pageset_setup_from_high_mark(pcp, high);
 }
 
 static void __meminit zone_pageset_init(struct zone *zone, int cpu)
@@ -4277,9 +4277,9 @@ static __meminit void zone_pcp_init(stru
        zone->pageset = &boot_pageset;
 
        if (zone->present_pages)
-               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
+               printk(KERN_DEBUG "  %s zone: %lu pages, pcp high:%d\n",
                        zone->name, zone->present_pages,
-                                        zone_batchsize(zone));
+                                        calculate_zone_pcp_high(zone));
 }
 
 int __meminit init_currently_empty_zone(struct zone *zone,
_