Re: [PATCH v2 3/8] mm/page_alloc: fix pcp high, batch management

2014-08-07 Thread Joonsoo Kim
On Thu, Aug 07, 2014 at 10:11:14AM +0800, Zhang Yanfei wrote:
> Hi Joonsoo,
> 
> On 08/06/2014 03:18 PM, Joonsoo Kim wrote:
> > The per-cpu pages structure, aka pcp, has high and batch values that
> > control how many pages we cache. These values can be updated
> > asynchronously, and the updater must ensure the update cannot cause any
> > problem. For this purpose, pageset_update() is implemented and does some
> > memory synchronization. But it turns out to be wrong, as I found while
> > implementing a new feature on top of it: there is no corresponding
> > smp_rmb() on the read side,
> 
> Out of curiosity, what new feature are you implementing?

I mean just zone_pcp_disable() and zone_pcp_enable(). :)

> IIRC, pageset_update() is used to update high and batch, which can be
> changed during:
> 
> system boot
> sysfs
> memory hot-plug
> 
> So it seems to me that the latter two would have the problems you described 
> here.

Yes, I think so. But I'm not sure, because I didn't look at it
in detail. :)

Thanks.



Re: [PATCH v2 3/8] mm/page_alloc: fix pcp high, batch management

2014-08-06 Thread Zhang Yanfei
Hi Joonsoo,

On 08/06/2014 03:18 PM, Joonsoo Kim wrote:
> The per-cpu pages structure, aka pcp, has high and batch values that
> control how many pages we cache. These values can be updated
> asynchronously, and the updater must ensure the update cannot cause any
> problem. For this purpose, pageset_update() is implemented and does some
> memory synchronization. But it turns out to be wrong, as I found while
> implementing a new feature on top of it: there is no corresponding
> smp_rmb() on the read side,

Out of curiosity, what new feature are you implementing?

IIRC, pageset_update() is used to update high and batch, which can be
changed during:

system boot
sysfs
memory hot-plug

So it seems to me that the latter two would have the problems you described 
here.
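
For reference, the sysfs path is the percpu_pagelist_fraction sysctl.
Below is a stand-alone sketch of the arithmetic it ends up publishing,
using the same formulas as pageset_get_values_by_high() in the patch
quoted below (the zone size and fraction are made-up example values):

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4K pages, for the example */

int main(void)
{
        long managed_pages = 262144;    /* hypothetical 1GB zone */
        int fraction = 8;  /* echo 8 > /proc/sys/vm/percpu_pagelist_fraction */
        int high = managed_pages / fraction;    /* 32768 */
        int batch = high / 4;

        if (batch > PAGE_SHIFT * 8)
                batch = PAGE_SHIFT * 8; /* capped, 96 with 4K pages */

        printf("high=%d batch=%d\n", high, batch);
        return 0;
}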

Thanks.

> so the write barriers guarantee nothing. Without a correct update, the
> system can hang in free_pcppages_bulk() because a CPU sees a batch value
> larger than high. Updating these values properly would require
> synchronization primitives on the read side, but that hurts the
> allocator's fastpath.
> 
> There is another choice for synchronization, namely sending an IPI. This
> is somewhat expensive, but updates are really rare, so I guess it is not
> a problem here. Still, reducing the number of IPIs is very helpful. The
> current logic handles each CPU's pcp update one by one; to reduce the
> IPIs, this patch reorganizes the code to handle all CPUs' pcp updates in
> one go.
> 
> Signed-off-by: Joonsoo Kim 
> ---
>  mm/page_alloc.c |  140 +++++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 81 insertions(+), 59 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e6fee4b..3e1e344 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3800,7 +3800,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
>   * not check if the processor is online before following the pageset pointer.
>   * Other parts of the kernel may not check if the zone is available.
>   */
> -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
> +static void setup_pageset(struct per_cpu_pageset __percpu *pcp);
>  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
>  static void setup_zone_pageset(struct zone *zone);
>  
> @@ -3846,9 +3846,9 @@ static int __build_all_zonelists(void *data)
>  * needs the percpu allocator in order to allocate its pagesets
>  * (a chicken-egg dilemma).
>  */
> - for_each_possible_cpu(cpu) {
> - setup_pageset(&per_cpu(boot_pageset, cpu), 0);
> + setup_pageset(&boot_pageset);
>  
> + for_each_possible_cpu(cpu) {
>  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
>   /*
>  * We now know the "local memory node" for each node--
> @@ -4230,24 +4230,60 @@ static int zone_batchsize(struct zone *zone)
>   * outside of boot time (or some other assurance that no concurrent updaters
>   * exist).
>   */
> -static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
> - unsigned long batch)
> +static void pageset_update(struct zone *zone, int high, int batch)
>  {
> -   /* start with a fail safe value for batch */
> - pcp->batch = 1;
> - smp_wmb();
> + int cpu;
> + struct per_cpu_pages *pcp;
> +
> + /* start with a fail safe value for batch */
> + for_each_possible_cpu(cpu) {
> + pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
> + pcp->batch = 1;
> + }
> + kick_all_cpus_sync();
> +
> + /* Update high, then batch, in order */
> + for_each_possible_cpu(cpu) {
> + pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
> + pcp->high = high;
> + }
> + kick_all_cpus_sync();
>  
> -   /* Update high, then batch, in order */
> - pcp->high = high;
> - smp_wmb();
> + for_each_possible_cpu(cpu) {
> + pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
> + pcp->batch = batch;
> + }
> +}
> +
> +/*
> + * pageset_get_values_by_high() derives the batch value to pair with a
> + * given per-cpu-pagelist high, and passes the high value through.
> + */
> +static void pageset_get_values_by_high(int input_high,
> + int *output_high, int *output_batch)
> +{
> + *output_high = input_high;
> + *output_batch = max(1, input_high / 4);
> + if ((input_high / 4) > (PAGE_SHIFT * 8))
> +     *output_batch = PAGE_SHIFT * 8;
> +}
>  
> - pcp->batch = batch;
> +/* a companion to pageset_get_values_by_high() */
> +static void pageset_get_values_by_batch(int input_batch,
> + int *output_high, int *output_batch)
> +{
> + *output_high = 6 * input_batch;
> + *output_batch = max(1, 1 * input_batch);
>  }
>  
> -/* a companion to pageset_set_high() */
> -static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
> +static void pageset_get_values(struct zone *zone, int *high, int *batch)
>  {
> - pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
> + if (percpu_pagelist_fraction) {
> +     pageset_get_values_by_high(
> +             (zone->managed_pages / percpu_pagelist_fraction),
> +             high, batch);
> + } else
> +     pageset_get_values_by_batch(zone_batchsize(zone), high, batch);
>  }

[PATCH v2 3/8] mm/page_alloc: fix pcp high, batch management

2014-08-06 Thread Joonsoo Kim
The per-cpu pages structure, aka pcp, has high and batch values that
control how many pages we cache. These values can be updated
asynchronously, and the updater must ensure the update cannot cause any
problem. For this purpose, pageset_update() is implemented and does some
memory synchronization. But it turns out to be wrong, as I found while
implementing a new feature on top of it: there is no corresponding
smp_rmb() on the read side, so the write barriers guarantee nothing.
Without a correct update, the system can hang in free_pcppages_bulk()
because a CPU sees a batch value larger than high. Updating these values
properly would require synchronization primitives on the read side, but
that hurts the allocator's fastpath.
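
To make the hazard concrete, here is a minimal user-space model of the
race (illustrative only, not kernel code; names and values are
invented). The writer follows the old pageset_update() protocol. The
reader, like the allocator fastpath, issues no smp_rmb() and does not
read the pair atomically, so it can observe batch > high; note it can
trip even on strongly ordered CPUs, because the two loads can straddle
an update, which is why write barriers alone cannot fix this:

/* Build: cc -O2 -pthread race_model.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic int high = 186;
static _Atomic int batch = 31;

static void *writer(void *unused)
{
        (void)unused;
        for (int i = 0; i < 10000000; i++) {
                int nh = (i & 1) ? 186 : 6;     /* alternate big/small high */

                /* old protocol: batch = 1, wmb, high, wmb, batch */
                atomic_store_explicit(&batch, 1, memory_order_relaxed);
                atomic_thread_fence(memory_order_release);      /* smp_wmb() */
                atomic_store_explicit(&high, nh, memory_order_relaxed);
                atomic_thread_fence(memory_order_release);      /* smp_wmb() */
                atomic_store_explicit(&batch, nh / 6, memory_order_relaxed);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, writer, NULL);
        for (int i = 0; i < 10000000; i++) {
                /* fastpath-style reads: no smp_rmb(), pair not atomic */
                int b = atomic_load_explicit(&batch, memory_order_relaxed);
                int h = atomic_load_explicit(&high, memory_order_relaxed);

                if (b > h) {
                        printf("observed batch=%d > high=%d\n", b, h);
                        break;
                }
        }
        pthread_join(t, NULL);
        return 0;
}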

There is another choice for synchronization, namely sending an IPI. This
is somewhat expensive, but updates are really rare, so I guess it is not
a problem here. Still, reducing the number of IPIs is very helpful. The
current logic handles each CPU's pcp update one by one; to reduce the
IPIs, this patch reorganizes the code to handle all CPUs' pcp updates in
one go.
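
The diff below is authoritative; this user-space sketch just isolates
the idea (sync_all() is a stub standing in for kick_all_cpus_sync(); in
the kernel, pcp users run with interrupts disabled, so once every CPU
has taken the sync IPI, none of them can still be using a value from an
earlier phase). Two sync rounds cover all CPUs, instead of one round
per CPU:

#include <stdio.h>

#define NR_CPUS 4

struct pcp { int high; int batch; };
static struct pcp pcp[NR_CPUS];

/* stand-in for kick_all_cpus_sync() */
static void sync_all(void) { }

static void pageset_update_model(int high, int batch)
{
        int cpu;

        /* phase 1: fail-safe batch, safe against any current high */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                pcp[cpu].batch = 1;
        sync_all();

        /* phase 2: publish the new high; batch is 1 everywhere */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                pcp[cpu].high = high;
        sync_all();

        /* phase 3: publish the new batch, already <= the new high */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                pcp[cpu].batch = batch;
}

int main(void)
{
        pageset_update_model(186, 31);
        printf("cpu0: high=%d batch=%d\n", pcp[0].high, pcp[0].batch);
        return 0;
}

Whenever a CPU reads the (high, batch) pair, batch <= high holds, which
is the invariant free_pcppages_bulk() depends on.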

Signed-off-by: Joonsoo Kim 
---
 mm/page_alloc.c |  140 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 81 insertions(+), 59 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e6fee4b..3e1e344 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3800,7 +3800,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static void setup_pageset(struct per_cpu_pageset __percpu *pcp);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
 static void setup_zone_pageset(struct zone *zone);
 
@@ -3846,9 +3846,9 @@ static int __build_all_zonelists(void *data)
 * needs the percpu allocator in order to allocate its pagesets
 * (a chicken-egg dilemma).
 */
-   for_each_possible_cpu(cpu) {
-   setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+   setup_pageset(&boot_pageset);
 
+   for_each_possible_cpu(cpu) {
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * We now know the "local memory node" for each node--
@@ -4230,24 +4230,60 @@ static int zone_batchsize(struct zone *zone)
  * outside of boot time (or some other assurance that no concurrent updaters
  * exist).
  */
-static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
-   unsigned long batch)
+static void pageset_update(struct zone *zone, int high, int batch)
 {
-   /* start with a fail safe value for batch */
-   pcp->batch = 1;
-   smp_wmb();
+   int cpu;
+   struct per_cpu_pages *pcp;
+
+   /* start with a fail safe value for batch */
+   for_each_possible_cpu(cpu) {
+   pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
+   pcp->batch = 1;
+   }
+   kick_all_cpus_sync();
+
+   /* Update high, then batch, in order */
+   for_each_possible_cpu(cpu) {
+   pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
+   pcp->high = high;
+   }
+   kick_all_cpus_sync();
 
-   /* Update high, then batch, in order */
-   pcp->high = high;
-   smp_wmb();
+   for_each_possible_cpu(cpu) {
+   pcp = &per_cpu_ptr(zone->pageset, cpu)->pcp;
+   pcp->batch = batch;
+   }
+}
+
+/*
+ * pageset_get_values_by_high() derives the batch value to pair with a
+ * given per-cpu-pagelist high, and passes the high value through.
+ */
+static void pageset_get_values_by_high(int input_high,
+   int *output_high, int *output_batch)
+{
+   *output_high = input_high;
+   *output_batch = max(1, input_high / 4);
+   if ((input_high / 4) > (PAGE_SHIFT * 8))
+       *output_batch = PAGE_SHIFT * 8;
+}
 
-   pcp->batch = batch;
+/* a companion to pageset_get_values_by_high() */
+static void pageset_get_values_by_batch(int input_batch,
+   int *output_high, int *output_batch)
+{
+   *output_high = 6 * input_batch;
+   *output_batch = max(1, 1 * input_batch);
 }
 
-/* a companion to pageset_set_high() */
-static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
+static void pageset_get_values(struct zone *zone, int *high, int *batch)
 {
-   pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+   if (percpu_pagelist_fraction) {
+   pageset_get_values_by_high(
+   (zone->managed_pages / percpu_pagelist_fraction),
+   high, batch);
+   } else
+   pageset_get_values_by_batch(zone_batchsize(zone), high, batch);
 }
 
 static void pageset_init(struct per_cpu_pageset *p)
@@ -4263,51 +4299,38 @@ static void pageset_init(struct per_cpu_pageset *p)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
 
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)