Re: [PATCH v18 00/32] per memcg lru_lock

2020-09-08 Thread Aaron Lu
On Thu, Aug 27, 2020 at 09:40:22PM -0400, Daniel Jordan wrote:
> I went back to your v1 post to see what motivated you originally, and you had
> some results from aim9 but nothing about where this reared its head in the
> first place.  How did you discover the bottleneck?  I'm just curious about how
> lru_lock hurts in practice.

I think making lru_lock per-memcg helps in colocated environments where
some workloads are of high priority while others are of low priority.

For the low priority workloads, we may even want to use some swap for
them to save memory, and that can cause frequent alloc/reclaim depending
on their working set etc. These alloc/reclaim operations need to hold the
global lru lock and zone lock. Then, when the high priority workloads
take page faults, their performance can be adversely affected and that is
not acceptable since these high priority workloads normally have strict
SLA requirements.
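
To make the contention point concrete, below is a hedged sketch of the
locking pattern before and after the series (assuming the
lock_page_lruvec_*() helpers this patchset introduces; pgdat and page
stand for the usual node and page at hand):

	struct lruvec *lruvec;
	unsigned long flags;

	/* before: every lru add/del on this node serializes on one lock */
	spin_lock_irqsave(&pgdat->lru_lock, flags);
	/* ... isolate or putback the page on a node wide lru list ... */
	spin_unlock_irqrestore(&pgdat->lru_lock, flags);

	/* after: only pages of the same memcg on the same node contend */
	lruvec = lock_page_lruvec_irqsave(page, &flags);
	/* ... isolate or putback the page on the per-lruvec lru list ... */
	unlock_page_lruvec_irqrestore(lruvec, flags);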


Re: [RFC PATCH 09/16] sched/fair: core wide cfs task priority comparison(Internet mail)

2020-07-24 Thread Aaron Lu
On Wed, Jul 22, 2020 at 12:23:44AM +0000, benbjiang(蒋彪) wrote:
> 
> 
> > +/*
> > + * This function takes care of adjusting the min_vruntime of siblings of
> > + * a core during coresched enable/disable.
> > + * This is called in stop machine context so no need to take the rq lock.
> Hi,
> 
> IMHO, it seems that stop machine context cannot guarantee race free. The 
> param *cpu* maybe not *this_cpu*, rq lock should be taken even in stop 
> machine context, and irq should be disabled too, to avoid potential races 
> with other contexts.
> 

In stop machine context, all CPUs except the active ones are spinning
with irqs disabled, and in this invocation of stop_machine() only one
CPU is active, so I don't think a race is possible.
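
For reference, a minimal sketch of the calling pattern being discussed
(the stopper function below is hypothetical; sched_core_normalize_se_vruntime()
is the function from the patch, and stop_machine() runs it with every
other online CPU spinning with interrupts disabled):

	static int normalize_se_vruntime_stopper(void *data)
	{
		int cpu = *(int *)data;

		/*
		 * Only this CPU runs the callback; the siblings cannot touch
		 * their runqueues while they spin in the stopper with irqs
		 * off, so no rq lock is taken here.
		 */
		sched_core_normalize_se_vruntime(cpu);
		return 0;
	}

	/* ... */
	stop_machine(normalize_se_vruntime_stopper, &cpu, NULL);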


Re: [RFC PATCH 1/3 v2] futex: introduce FUTEX_SWAP operation

2020-06-23 Thread Aaron Lu
On Tue, Jun 16, 2020 at 10:22:26AM -0700, Peter Oskolkov wrote:
>  static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q 
> *q,
> - struct hrtimer_sleeper *timeout)
> + struct hrtimer_sleeper *timeout,
> + struct task_struct *next)
>  {
>   /*
>* The task state is guaranteed to be set before another task can
> @@ -2627,10 +2644,27 @@ static void futex_wait_queue_me(struct 
> futex_hash_bucket *hb, struct futex_q *q,
>* flagged for rescheduling. Only call schedule if there
>* is no timeout, or if it has yet to expire.
>*/
> - if (!timeout || timeout->task)
> + if (!timeout || timeout->task) {
> + if (next) {
> + /*
> +  * wake_up_process() below will be replaced
> +  * in the next patch with
> +  * wake_up_process_prefer_current_cpu().
> +  */
> + wake_up_process(next);
> + put_task_struct(next);
> + next = NULL;
> + }

So in the futex_swap case, the wake up occurs in futex_wait_queue_me().
I personally think it's more natural to do the wakeup in futex_swap()
instead.

>   freezable_schedule();
> + }
>   }
>   __set_current_state(TASK_RUNNING);
> +
> + if (next) {
> + /* Maybe call wake_up_process_prefer_current_cpu()? */
> + wake_up_process(next);
> + put_task_struct(next);
> + }
>  }
> 
>  /**

> +static int futex_swap(u32 __user *uaddr, unsigned int flags, u32 val,
> +   ktime_t *abs_time, u32 __user *uaddr2)
> +{
> + u32 bitset = FUTEX_BITSET_MATCH_ANY;
> + struct task_struct *next = NULL;
> + DEFINE_WAKE_Q(wake_q);
> + int ret;
> +
> + ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q);
> + if (!wake_q_empty(&wake_q)) {
> + /* Pull the first wakee out of the queue to swap into. */
> + next = container_of(wake_q.first, struct task_struct, wake_q);
> + wake_q.first = wake_q.first->next;
> + next->wake_q.next = NULL;
> + /*
> +  * Note that wake_up_q does not touch wake_q.last, so we
> +  * do not bother with it here.
> +  */
> + wake_up_q(&wake_q);

wake_up_q() doesn't seem to serve any purpose here: the above assignment
of wake_q.first has already made it an empty queue. Also, I don't see a
need to touch wake_q.first at all, so I think we can get rid of wake_q
altogether here.
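
Something like this, perhaps (a sketch only, not tested):

	ret = prepare_wake_q(uaddr2, flags, 1, bitset, &wake_q);
	if (!wake_q_empty(&wake_q)) {
		/*
		 * Only one wakee was asked for, so just take it; there is
		 * nothing left for wake_up_q() to do once it is pulled out.
		 */
		next = container_of(wake_q.first, struct task_struct, wake_q);
		next->wake_q.next = NULL;
	}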

> + }
> + if (ret < 0)
> + return ret;
> +
> + return futex_wait(uaddr, flags, val, abs_time, bitset, next);
> +}

I've cooked the below diff, on top of your patchset. It survived your
self test and schbench. Feel free to ignore it if you don't like it, or
merge it into your patchset if you think it looks better.

do wake up in futex_swap()

---
 kernel/futex.c | 43 +++
 1 file changed, 11 insertions(+), 32 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index a426671e4bbb..995bc881059c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2618,8 +2618,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q 
*q, int locked)
  * prefer to execute it locally.
  */
 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q 
*q,
-   struct hrtimer_sleeper *timeout,
-   struct task_struct *next)
+   struct hrtimer_sleeper *timeout)
 {
/*
 * The task state is guaranteed to be set before another task can
@@ -2644,22 +2643,11 @@ static void futex_wait_queue_me(struct 
futex_hash_bucket *hb, struct futex_q *q,
 * flagged for rescheduling. Only call schedule if there
 * is no timeout, or if it has yet to expire.
 */
-   if (!timeout || timeout->task) {
-   if (next) {
-   wake_up_process_prefer_current_cpu(next);
-   put_task_struct(next);
-   next = NULL;
-   }
+   if (!timeout || timeout->task)
freezable_schedule();
-   }
}
-   __set_current_state(TASK_RUNNING);
 
-   if (next) {
-   /* Maybe call wake_up_process_prefer_current_cpu()? */
-   wake_up_process(next);
-   put_task_struct(next);
-   }
+   __set_current_state(TASK_RUNNING);
 }
 
 /**
@@ -2739,7 +2727,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, 
unsigned int flags,
 }
 
 static int futex_wait(u32 __user 

Re: [RFC PATCH 0/3 v2] futex: introduce FUTEX_SWAP operation

2020-06-22 Thread Aaron Lu
On Tue, Jun 16, 2020 at 10:22:11AM -0700, Peter Oskolkov wrote:
> From 7b091e46de4f9227b5a943e6d78283564e8c1c72 Mon Sep 17 00:00:00 2001
> From: Peter Oskolkov 
> Date: Tue, 16 Jun 2020 10:13:58 -0700
> Subject: [RFC PATCH 0/3 v2] futex: introduce FUTEX_SWAP operation
> 
> This is an RFC!
> 
> As Paul Turner presented at LPC in 2013 ...
> - pdf: 
> http://pdxplumbers.osuosl.org/2013/ocw//system/presentations/1653/original/LPC%20-%20User%20Threading.pdf
> - video: https://www.youtube.com/watch?v=KXuZi9aeGTw
> 
> ... Google has developed an M:N userspace threading subsystem backed
> by Google-private SwitchTo Linux Kernel API (page 17 in the pdf referenced
> above). This subsystem provides latency-sensitive services at Google with
> fine-grained user-space control/scheduling over what is running when,
> and this subsystem is used widely internally (called schedulers or fibers).
> 
> This RFC patchset is the first step to open-source this work. As explained
> in the linked pdf and video, SwitchTo API has three core operations: wait,
> resume, and swap (=switch). So this patchset adds a FUTEX_SWAP operation
> that, in addition to FUTEX_WAIT and FUTEX_WAKE, will provide a foundation
> on top of which user-space threading libraries can be built.
> 
> Another common use case for FUTEX_SWAP is message passing a-la RPC
> between tasks: task/thread T1 prepares a message,
> wakes T2 to work on it, and waits for the results; when T2 is done, it
> wakes T1 and waits for more work to arrive. Currently the simplest
> way to implement this is
> 
> a. T1: futex-wake T2, futex-wait
> b. T2: wakes, does what it has been woken to do
> c. T2: futex-wake T1, futex-wait
> 
> With FUTEX_SWAP, steps a and c above can be reduced to one futex operation
> that runs 5-10 times faster.

schbench uses futex wait/wake to do sleep/wakeup between the message
thread and the worker thread, and when there is one worker thread per
message thread, the message/worker pair also operates in a flip-call
style.

So I modified schbench to make use of futex_swap and did a comparison.
In the not overloaded case, both runs are roughly the same, with
futex_swap performing slightly better. In the overloaded case, futex_swap
performs better than futex wait/wake in all metrics, with the 90th
percentile seeing the largest difference: 2556us (futex wait/wake) vs
6us (futex_swap).

I guess more latency gain can be expected once the scheduler change is
in place.
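
For reference, the schbench change boils down to replacing the
futex-wake + futex-wait pair with a single call roughly like the
following (a userspace sketch, not the exact schbench diff; the
FUTEX_SWAP opcode value is taken from this RFC and is not in mainline
headers):

	#include <linux/futex.h>
	#include <stdint.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef FUTEX_SWAP
	#define FUTEX_SWAP 13	/* opcode value used by this RFC */
	#endif

	/*
	 * Wake the waiter on @wake_uaddr and go to sleep on @wait_uaddr
	 * (expecting *wait_uaddr == val) in one syscall, so the wakee can
	 * be placed on the CPU the waker is vacating.
	 */
	static long futex_swap(uint32_t *wait_uaddr, uint32_t val,
			       uint32_t *wake_uaddr)
	{
		return syscall(SYS_futex, wait_uaddr,
			       FUTEX_SWAP | FUTEX_PRIVATE_FLAG, val,
			       NULL, wake_uaddr, 0);
	}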

Here is the log of the schbench runs (on a 16core/32cpu x86_64 machine):

overloaded case

original schbench(aka futex wait/wake)
$./schbench -m 64 -t 1 -r 30

Latency percentiles (usec)
50.0000th: 7
75.0000th: 9
90.0000th: 2556
95.0000th: 7112
*99.0000th: 14160
99.5000th: 17504
99.9000th: 22688
min=0, max=30351

with futex swap
$./schbench -m 64 -t 1 -r 30

Latency percentiles (usec)
50.0th: 4
75.0th: 5
90.0th: 6
95.0th: 4568
*99.0th: 12912
99.5th: 15152
99.9th: 20384
min=0, max=30723


not overloaded case

original schbench(aka futex wait/wake)

$./schbench -m 32 -t 1 -r 30
Latency percentiles (usec)
50.0000th: 6
75.0000th: 7
90.0000th: 8
95.0000th: 9
*99.0000th: 10
99.5000th: 12
99.9000th: 18
min=0, max=398


with futex swap

$./schbench -m 32 -t 1 -r 30
Latency percentiles (usec)
50.0th: 4
75.0th: 5
90.0th: 5
95.0th: 6
*99.0th: 8
99.5th: 9
99.9th: 12
min=0, max=245


Re: [PATCH updated v2] sched/fair: core wide cfs task priority comparison

2020-05-22 Thread Aaron Lu
On Sat, May 16, 2020 at 11:42:30AM +0800, Aaron Lu wrote:
> On Thu, May 14, 2020 at 03:02:48PM +0200, Peter Zijlstra wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -4476,6 +4473,16 @@ next_class:;
> > WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
> > }
> >  
> > +   /* XXX SMT2 only */
> > +   if (new_active == 1 && old_active > 1) {
> 
> There is a case where an incompatible task appears but we fail to 'drop
> into single-rq mode' per the above condition check. The TLDR is: when
> there is a task that sits on the sibling rq with the same cookie as
> 'max', new_active will be 2 instead of 1 and that causes us to miss
> the chance to do a sync of core min_vruntime.

FWIW: when I disable the feature of running the cookie_pick task on the
sibling and thus enforce a strict single-rq mode, Peter's patch works
well for the scenario described below.

> This is how it happens:
> 1) 2 tasks of the same cgroup with different weight running on 2 siblings,
>    say cg0_A with weight 1024 bound at cpu0 and cg0_B with weight 2 bound
>    at cpu1(assume cpu0 and cpu1 are siblings);
> 2) Since new_active == 2, we didn't trigger min_vruntime sync. For
>    simplicity, let's assume both siblings' root cfs_rq's min_vruntime and
>    core_vruntime are all at 0 now;
> 3) let the two tasks run a while;
> 4) a new task cg1_C of another cgroup gets queued on cpu1. Since cpu1's
>    existing task has a very small weight, its cfs_rq's min_vruntime can
>    be much larger than cpu0's cfs_rq min_vruntime. So cg1_C's vruntime is
>    much larger than cg0_A's and the 'max' of the core wide task
>    selection goes to cg0_A;
> 5) Now I suppose we should drop into single-rq mode and, by doing a sync
>    of core min_vruntime, cg1_C's turn shall come. But the problem is, our
>    current selection logic prefers not to waste CPU time, so after deciding
>    cg0_A as the 'max', the sibling will also do a cookie_pick() and
>    get cg0_B to run. This is where the problem arises: new_active is 2
>    instead of the expected 1.
> 6) Because we didn't do the sync of core min_vruntime, the newly queued
>    cg1_C shall wait a long time before cg0_A's vruntime catches up.

P.S. this is what I did to enforce a strict single-rq mode:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1fa5b48b742a..0f5580bc7e96 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4411,7 +4411,7 @@ pick_task(struct rq *rq, const struct sched_class *class, 
struct task_struct *ma
(!max || prio_less(max, class_pick)))
return class_pick;
 
-   return cookie_pick;
+   return NULL;
 }
 
 static struct task_struct *


Re: [RFC PATCH 07/13] sched: Add core wide task selection and scheduling.

2020-05-21 Thread Aaron Lu
On Thu, May 21, 2020 at 10:35:56PM -0400, Joel Fernandes wrote:
> Discussed a lot with Vineeth. Below is an improved version of the pick_task()
> similification.
> 
> It also handles the following "bug" in the existing code as well that Vineeth
> brought up in OSPM: Suppose 2 siblings of a core: rq 1 and rq 2.
> 
> In priority order (high to low), say we have the tasks:
> A - untagged  (rq 1)
> B - tagged(rq 2)
> C - untagged  (rq 2)
> 
> Say, B and C are in the same scheduling class.
> 
> When the pick_next_task() loop runs, it looks at rq 1 and max is A, A is
> tenantively selected for rq 1. Then it looks at rq 2 and the class_pick is B.
> But that's not compatible with A. So rq 2 gets forced idle.
> 
> In reality, rq 2 could have run C instead of idle. The fix is to add C to the
> tag tree as Peter suggested in OSPM.

I like the idea of adding untagged tasks to the core tree.

> Updated diff below:
> 
> ---8<---
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 005d7f7323e2d..625377f393ed3 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -182,9 +182,6 @@ static void sched_core_enqueue(struct rq *rq, struct 
> task_struct *p)
>  
>   rq->core->core_task_seq++;
>  
> - if (!p->core_cookie)
> - return;
> -
> >   node = &rq->core_tree.rb_node;
>   parent = *node;
>  
> @@ -215,7 +212,7 @@ static void sched_core_dequeue(struct rq *rq, struct 
> task_struct *p)
>  
>  void sched_core_add(struct rq *rq, struct task_struct *p)
>  {
> - if (p->core_cookie && task_on_rq_queued(p))
> + if (task_on_rq_queued(p))
>   sched_core_enqueue(rq, p);
>  }

It appears there are other call sites of sched_core_enqueue() where
core_cookie is checked: cpu_cgroup_fork() and __sched_write_tag().


Re: [PATCH updated v2] sched/fair: core wide cfs task priority comparison

2020-05-15 Thread Aaron Lu
On Thu, May 14, 2020 at 03:02:48PM +0200, Peter Zijlstra wrote:
> On Fri, May 08, 2020 at 08:34:57PM +0800, Aaron Lu wrote:
> > With this said, I realized a workaround for the issue described above:
> > when the core went from 'compatible mode'(step 1-3) to 'incompatible
> > mode'(step 4), reset all root level sched entities' vruntime to be the
> > same as the core wide min_vruntime. After all, the core is transforming
> > from two runqueue mode to single runqueue mode... I think this can solve
> > the issue to some extent but I may miss other scenarios.
> 
> A little something like so, this syncs min_vruntime when we switch to
> single queue mode. This is very much SMT2 only, I got my head in twist
> when thikning about more siblings, I'll have to try again later.

Thanks a lot for the patch, I now see that "there is no need to adjust
every se's vruntime". :-)

> This very much retains the horrible approximation of S we always do.
> 
> Also, it is _completely_ untested...

I've been testing it.

One problem below.

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4293,10 +4281,11 @@ static struct task_struct *
>  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  {
>   struct task_struct *next, *max = NULL;
> + int old_active = 0, new_active = 0;
>   const struct sched_class *class;
>   const struct cpumask *smt_mask;
> - int i, j, cpu;
>   bool need_sync = false;
> + int i, j, cpu;
>  
>   cpu = cpu_of(rq);
>   if (cpu_is_offline(cpu))
> @@ -4349,10 +4338,14 @@ pick_next_task(struct rq *rq, struct tas
>   rq_i->core_pick = NULL;
>  
>   if (rq_i->core_forceidle) {
> + // XXX is_idle_task(rq_i->curr) && rq_i->nr_running ??
>   need_sync = true;
>   rq_i->core_forceidle = false;
>   }
>  
> + if (!is_idle_task(rq_i->curr))
> + old_active++;
> +
>   if (i != cpu)
>   update_rq_clock(rq_i);
>   }
> @@ -4463,8 +4456,12 @@ next_class:;
>  
>   WARN_ON_ONCE(!rq_i->core_pick);
>  
> - if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
> - rq_i->core_forceidle = true;
> + if (is_idle_task(rq_i->core_pick)) {
> + if (rq_i->nr_running)
> + rq_i->core_forceidle = true;
> + } else {
> + new_active++;
> + }
>  
>   if (i == cpu)
>   continue;
> @@ -4476,6 +4473,16 @@ next_class:;
>   WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
>   }
>  
> + /* XXX SMT2 only */
> + if (new_active == 1 && old_active > 1) {

There is a case where an incompatible task appears but we fail to 'drop
into single-rq mode' per the above condition check. The TLDR is: when
there is a task that sits on the sibling rq with the same cookie as
'max', new_active will be 2 instead of 1 and that causes us to miss the
chance to do a sync of core min_vruntime.

This is how it happens:
1) 2 tasks of the same cgroup with different weight running on 2 siblings,
   say cg0_A with weight 1024 bound at cpu0 and cg0_B with weight 2 bound
   at cpu1(assume cpu0 and cpu1 are siblings);
2) Since new_active == 2, we didn't trigger min_vruntime sync. For
   simplicity, let's assume both siblings' root cfs_rq's min_vruntime and
   core_vruntime are all at 0 now;
3) let the two tasks run a while;
4) a new task cg1_C of another cgroup gets queued on cpu1. Since cpu1's
   existing task has a very small weight, its cfs_rq's min_vruntime can
   be much larger than cpu0's cfs_rq min_vruntime. So cg1_C's vruntime is
   much larger than cg0_A's and the 'max' of the core wide task
   selection goes to cg0_A;
5) Now I suppose we should drop into single-rq mode and, by doing a sync
   of core min_vruntime, cg1_C's turn shall come. But the problem is, our
   current selection logic prefers not to waste CPU time, so after deciding
   cg0_A as the 'max', the sibling will also do a cookie_pick() and
   get cg0_B to run. This is where the problem arises: new_active is 2
   instead of the expected 1.
6) Because we didn't do the sync of core min_vruntime, the newly queued
   cg1_C shall wait a long time before cg0_A's vruntime catches up.

One naive way of precisely determining when to drop into single-rq mode
is to track how many tasks of a particular tag exist and use that to
decide if the core is in compatible mode (all tasks belong to the same
cgroup, IOW, have the same core_cookie) or not and act accordingly,
except that: does this sound too compl

Re: [PATCH updated v2] sched/fair: core wide cfs task priority comparison

2020-05-08 Thread Aaron Lu
On Fri, May 08, 2020 at 11:09:25AM +0200, Peter Zijlstra wrote:
> On Fri, May 08, 2020 at 04:44:19PM +0800, Aaron Lu wrote:
> > On Wed, May 06, 2020 at 04:35:06PM +0200, Peter Zijlstra wrote:
> 
> > > Aside from this being way to complicated for what it does -- you
> > > could've saved the min_vruntime for each rq and compared them with
> > > subtraction -- it is also terminally broken afaict.
> > >
> > > Consider any infeasible weight scenario. Take for instance two tasks,
> > > each bound to their respective sibling, one with weight 1 and one with
> > > weight 2. Then the lower weight task will run ahead of the higher weight
> > > task without bound.
> > 
> > I don't follow how this could happen. Even if the lower weight task runs
> > first, after some time the higher weight task will get its turn and
> > from then on, the higher weight task will get more chances to run (due to
> > its higher weight and thus slower accumulation of vruntime).
> 
> That seems to assume they're mutually exclusive. In that case, as I
> argued, we only have a single runqueue and then yes it works. But if
> they're not exclusive, and can run concurrently, it comes apart.

Ah right, now I see what you mean. Sorry for misunderstanding.

And yes, that 'utterly destroys the concept of a shared time base' and
then bad things can happen:
1) two same tagged tasks (t1 and t2) running on two siblings, with t1's
   weight lower than t2's;
2) both tasks are cpu intensive;
3) over time, the lower weight task (t1)'s vruntime becomes bigger and
   bigger than t2's vruntime and the core wide min_vruntime is the
   same as t1's vruntime per this patch;
4) a new task is enqueued on the same sibling as t1; if the new task has
   an incompatible tag, it will be starved by t2 because t2's vruntime
   is way smaller than the core wide min_vruntime.

With this said, I realized a workaround for the issue described above:
when the core went from 'compatible mode'(step 1-3) to 'incompatible
mode'(step 4), reset all root level sched entities' vruntime to be the
same as the core wide min_vruntime. After all, the core is transforming
from two runqueue mode to single runqueue mode... I think this can solve
the issue to some extent but I may miss other scenarios.
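
A sketch of that workaround, SMT2 only and assuming a safe context
(e.g. both siblings' rq locks held); field names follow the quoted
patches:

	static void reset_root_se_vruntime(struct rq *rq, u64 core_min_vruntime)
	{
		struct cfs_rq *cfs_rq = &rq->cfs;
		struct sched_entity *se, *next;

		/*
		 * Start every queued root level se from the core wide
		 * min_vruntime: relative order among them collapses, but from
		 * here on the core behaves like a single runqueue again.
		 */
		rbtree_postorder_for_each_entry_safe(se, next,
				&cfs_rq->tasks_timeline.rb_root, run_node)
			se->vruntime = core_min_vruntime;

		cfs_rq->min_vruntime = core_min_vruntime;
	}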

I'll also re-read your last email about the 'lag' idea.


Re: [PATCH updated v2] sched/fair: core wide cfs task priority comparison

2020-05-08 Thread Aaron Lu
On Wed, May 06, 2020 at 04:35:06PM +0200, Peter Zijlstra wrote:
> 
> Sorry for being verbose; I've been procrastinating replying, and in
> doing so the things I wanted to say kept growing.
> 
> On Fri, Apr 24, 2020 at 10:24:43PM +0800, Aaron Lu wrote:
> 
> > To make this work, the root level sched entities' vruntime of the two
> > threads must be directly comparable. So one of the hyperthread's root
> > cfs_rq's min_vruntime is chosen as the core wide one and all root level
> > sched entities' vruntime is normalized against it.
> 
> > +/*
> > + * This is called in stop machine context so no need to take the rq lock.
> > + *
> > + * Core scheduling is going to be enabled and the root level sched entities
> > + * of both siblings will use cfs_rq->min_vruntime as the common cfs_rq
> > + * min_vruntime, so it's necessary to normalize vruntime of existing root
> > + * level sched entities in sibling_cfs_rq.
> > + *
> > + * Update of sibling_cfs_rq's min_vruntime isn't necessary as we will be
> > + * only using cfs_rq->min_vruntime during the entire run of core 
> > scheduling.
> > + */
> > +void sched_core_normalize_se_vruntime(int cpu)
> > +{
> > +   struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
> > +   int i;
> > +
> > +   for_each_cpu(i, cpu_smt_mask(cpu)) {
> > +   struct sched_entity *se, *next;
> > +   struct cfs_rq *sibling_cfs_rq;
> > +   s64 delta;
> > +
> > +   if (i == cpu)
> > +   continue;
> > +
> > +   sibling_cfs_rq = &cpu_rq(i)->cfs;
> > +   if (!sibling_cfs_rq->nr_running)
> > +   continue;
> > +
> > +   delta = cfs_rq->min_vruntime - sibling_cfs_rq->min_vruntime;
> > +   rbtree_postorder_for_each_entry_safe(se, next,
> > +   &sibling_cfs_rq->tasks_timeline.rb_root,
> > +   run_node) {
> > +   se->vruntime += delta;
> > +   }
> > +   }
> > +}
> 
> Aside from this being way to complicated for what it does -- you
> could've saved the min_vruntime for each rq and compared them with
> subtraction -- it is also terminally broken afaict.
>
> Consider any infeasible weight scenario. Take for instance two tasks,
> each bound to their respective sibling, one with weight 1 and one with
> weight 2. Then the lower weight task will run ahead of the higher weight
> task without bound.

I don't follow how this could happen. Even if the lower weight task runs
first, after some time the higher weight task will get its turn and
from then on, the higher weight task will get more chances to run (due to
its higher weight and thus slower accumulation of vruntime).

We used to have the following patch as a standalone one in v4:
sched/fair : Wake up forced idle siblings if needed
https://lore.kernel.org/lkml/cover.1572437285.git.vpil...@digitalocean.com/T/#md22d25d0e2932d059013e9b56600d8a847b02a13
Which originates from:
https://lore.kernel.org/lkml/20190725143344.GD992@aaronlu/

And in this series, it seems to be merged in:
[RFC PATCH 07/13] sched: Add core wide task selection and scheduling
https://lore.kernel.org/lkml/e942da7fd881977923463f19648085c1bfaa37f8.1583332765.git.vpil...@digitalocean.com/

My local test shows that when two cgroups' shares are both set to 1024
and each is bound to one sibling of a core, and a cpu intensive task is
started in each cgroup, then each cpu intensive task consumes 50% cpu.
When one cgroup's share is set to 512, it consumes about 33% while the
other consumes 67%, as expected.

I think the current patch works fine when 2 differently tagged tasks are
competing for CPU, but when there are 3 tasks or more, things can get
less fair.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-10-14 Thread Aaron Lu
On Sun, Oct 13, 2019 at 08:44:32AM -0400, Vineeth Remanan Pillai wrote:
> On Fri, Oct 11, 2019 at 11:55 PM Aaron Lu  wrote:
> 
> >
> > I don't think we need to do the normalization afterwards and it appears
> > we are on the same page regarding core wide vruntime.

Should be "we are not on the same page..."

[...]
> > The weird thing about my patch is, the min_vruntime is often increased,
> > it doesn't point to the smallest value as in a traditional cfs_rq. This
> > probably can be changed to follow the tradition; I don't quite remember
> > why I did this and will need to check it some time later.
> 
> Yeah, I noticed this. In my patch, I had already accounted for this and 
> changed
> to min() instead of max() which is more logical that min_vruntime should be 
> the
> minimum of both the run queue.

I now remember why I used max().

Assume rq1's and rq2's min_vruntime are both at 2000 and the core wide
min_vruntime is also 2000. Also assume both runqueues are empty at the
moment. Then task t1 is queued to rq1 and runs for a long time while rq2
stays empty. rq1's min_vruntime will be incremented all the time while
the core wide min_vruntime stays at 2000 if min() is used. Then when
another task gets queued to rq2, it will get a really large unfair boost
by using a much smaller min_vruntime as its base.

To fix this, either max() is used, as is done in my patch, or rq2's
min_vruntime is adjusted to be the same as rq1's on each
update_core_cfs_min_vruntime() when rq2 is found empty, and then min()
can be used to get the core wide min_vruntime. It doesn't look worth the
trouble to use min().
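
A hedged sketch of what the max() choice amounts to (not the actual
patch; rq->core is the core scheduling field, the rest of the naming is
assumed):

	static void update_core_cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
	{
		struct cfs_rq *core_cfs_rq = &rq_of(cfs_rq)->core->cfs;

		if (cfs_rq == core_cfs_rq)
			return;

		/*
		 * Only move the core wide value forward: a sibling rq that sat
		 * empty must not drag it back and hand the next task enqueued
		 * there an unfairly small vruntime base.
		 */
		core_cfs_rq->min_vruntime = max(core_cfs_rq->min_vruntime,
						cfs_rq->min_vruntime);
	}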


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-10-11 Thread Aaron Lu
On Fri, Oct 11, 2019 at 08:10:30AM -0400, Vineeth Remanan Pillai wrote:
> > Thanks for the clarification.
> >
> > Yes, this is the initialization issue I mentioned before when core
> > scheduling is initially enabled. rq1's vruntime is bumped the first time
> > update_core_cfs_rq_min_vruntime() is called and if there are already
> > some tasks queued, new tasks queued on rq1 will be starved to some extent.
> >
> > Agree that this needs fix. But we shouldn't need do this afterwards.
> >
> > So do I understand correctly that patch1 is meant to solve the
> > initialization issue?
> 
> I think we need this update logic even after initialization. I mean, core
> runqueue's min_vruntime can get updated every time when the core
> runqueue's min_vruntime changes with respect to the sibling's min_vruntime.
> So, whenever this update happens, we would need to propagate the changes
> down the tree right? Please let me know if I am visualizing it wrong.

I don't think we need to do the normalization afterwards and it appears
we are on the same page regarding core wide vruntime.

The intent of my patch is to treat all the root level sched entities of
the two siblings as if they are in a single cfs_rq of the core. With a
core wide min_vruntime, the core scheduler can decide which sched entity
to run next. And the individual sched entity's vruntime shouldn't be
changed based on the change of the core wide min_vruntime, or fairness
can be hurt (if we add or reduce the vruntime of a sched entity, its
credit will change).

The weird thing about my patch is, the min_vruntime is often increased,
it doesn't point to the smallest value as in a traditional cfs_rq. This
probably can be changed to follow the tradition; I don't quite remember
why I did this and will need to check it some time later.

All those sub cfs_rq's sched entities are not interesting, because once
we have decided which sched entity in the root level cfs_rq should run
next, we can then pick the final next task from there (using the usual
way). In other words, to make the scheduler choose the correct candidate
for the core, we only need to worry about the sched entities on both
CPUs' root level cfs_rqs.

Does this make sense?


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-10-11 Thread Aaron Lu
On Fri, Oct 11, 2019 at 07:32:48AM -0400, Vineeth Remanan Pillai wrote:
> > > The reason we need to do this is because, new tasks that gets created will
> > > have a vruntime based on the new min_vruntime and old tasks will have it
> > > based on the old min_vruntime
> >
> > I think this is expected behaviour.
> >
> I don't think this is the expected behavior. If we hadn't changed the root
> cfs->min_vruntime for the core rq, then it would have been the expected
> behaviour. But now, we are updating the core rq's root cfs, min_vruntime
> without changing the the vruntime down to the tree. To explain, consider
> this example based on your patch. Let cpu 1 and 2 be siblings. And let 
> rq(cpu1)
> be the core rq. Let rq1->cfs->min_vruntime=1000 and 
> rq2->cfs->min_vruntime=2000.
> So in update_core_cfs_rq_min_vruntime, you update rq1->cfs->min_vruntime
> to 2000 because that is the max. So new tasks enqueued on rq1 starts with
> vruntime of 2000 while the tasks in that runqueue are still based on the old
> min_vruntime(1000). So the new tasks gets enqueued some where to the right
> of the tree and has to wait until already existing tasks catch up the
> vruntime to
> 2000. This is what I meant by starvation. This happens always when we update
> the core rq's cfs->min_vruntime. Hope this clarifies.

Thanks for the clarification.

Yes, this is the initialization issue I mentioned before when core
scheduling is initially enabled. rq1's vruntime is bumped the first time
update_core_cfs_rq_min_vruntime() is called, and if there are already
some tasks queued, new tasks queued on rq1 will be starved to some extent.

Agree that this needs a fix. But we shouldn't need to do this afterwards.

So do I understand correctly that patch1 is meant to solve the
initialization issue?


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-10-11 Thread Aaron Lu
On Thu, Oct 10, 2019 at 10:29:47AM -0400, Vineeth Remanan Pillai wrote:
> > I didn't see why we need do this.
> >
> > We only need to have the root level sched entities' vruntime become core
> > wide since we will compare vruntime for them across hyperthreads. For
> > sched entities on sub cfs_rqs, we never(at least, not now) compare their
> > vruntime outside their cfs_rqs.
> >
> The reason we need to do this is because, new tasks that gets created will
> have a vruntime based on the new min_vruntime and old tasks will have it
> based on the old min_vruntime

I think this is expected behaviour.

> and it can cause starvation based on how
> you set the min_vruntime.

Care to elaborate on the starvation problem?

> With this new patch, we normalize the whole
> tree so that new tasks and old tasks compare with the same min_vruntime.

Again, what's the point of normalizing sched entities' vruntime in
sub-cfs_rqs? Their vruntime comparisons only happen inside their own
cfs_rq; we don't do cross-CPU vruntime comparison for them.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-10-10 Thread Aaron Lu
On Wed, Oct 02, 2019 at 04:48:14PM -0400, Vineeth Remanan Pillai wrote:
> On Mon, Sep 30, 2019 at 7:53 AM Vineeth Remanan Pillai
>  wrote:
> >
> > >
> > Sorry, I misunderstood the fix and I did not initially see the core wide
> > min_vruntime that you tried to maintain in the rq->core. This approach
> > seems reasonable. I think we can fix the potential starvation that you
> > mentioned in the comment by adjusting for the difference in all the children
> > cfs_rq when we set the minvruntime in rq->core. Since we take the lock for
> > both the queues, it should be doable and I am trying to see how we can best
> > do that.
> >
> Attaching here with, the 2 patches I was working on in preparation of v4.
> 
> Patch 1 is an improvement of patch 2 of Aaron where I am propagating the
> vruntime changes to the whole tree.

I didn't see why we need to do this.

We only need to have the root level sched entities' vruntime become core
wide, since we will compare vruntime for them across hyperthreads. For
sched entities on sub cfs_rqs, we never (at least, not now) compare their
vruntime outside their cfs_rqs.

Thanks,
Aaron

> Patch 2 is an improvement for patch 3 of Aaron where we do resched_curr
> only when the sibling is forced idle.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-15 Thread Aaron Lu
On Fri, Sep 13, 2019 at 07:12:52AM +0800, Aubrey Li wrote:
> On Thu, Sep 12, 2019 at 8:04 PM Aaron Lu  wrote:
> >
> > On Wed, Sep 11, 2019 at 09:19:02AM -0700, Tim Chen wrote:
> > > On 9/11/19 7:02 AM, Aaron Lu wrote:
> > > I think Julien's result show that my patches did not do as well as
> > > your patches for fairness. Aubrey did some other testing with the same
> > > conclusion.  So I think keeping the forced idle time balanced is not
> > > enough for maintaining fairness.
> >
> > Well, I have done following tests:
> > 1 Julien's test script: https://paste.debian.net/plainh/834cf45c
> > 2 start two tagged will-it-scale/page_fault1, see how each performs;
> > 3 Aubrey's mysql test: https://github.com/aubreyli/coresched_bench.git
> >
> > They all show your patchset performs equally well...And consider what
> > the patch does, I think they are really doing the same thing in
> > different ways.
> 
> It looks like we are not on the same page, if you don't mind, can both of
> you rebase your patchset onto v5.3-rc8 and provide a public branch so I
> can fetch and test it at least by my benchmark?

I'm using the following branch as base which is v5.1.5 based:
https://github.com/digitalocean/linux-coresched coresched-v3-v5.1.5-test

And I have pushed Tim's branch to:
https://github.com/aaronlu/linux coresched-v3-v5.1.5-test-tim

Mine:
https://github.com/aaronlu/linux coresched-v3-v5.1.5-test-core_vruntime

The two branches both have the two patches I sent previously:
https://lore.kernel.org/lkml/20190810141556.GA73644@aaronlu/
Although it has some potential performance loss, as pointed out by
Vineeth, I haven't had time to rework it yet.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-13 Thread Aaron Lu
On Thu, Sep 12, 2019 at 10:29:13AM -0700, Tim Chen wrote:
> On 9/12/19 5:35 AM, Aaron Lu wrote:
> > On Wed, Sep 11, 2019 at 12:47:34PM -0400, Vineeth Remanan Pillai wrote:
> 
> > 
> > core wide vruntime makes sense when there are multiple tasks of
> > different cgroups queued on the same core. e.g. when there are two
> > tasks of cgroupA and one task of cgroupB are queued on the same core,
> > assume cgroupA's one task is on one hyperthread and its other task is on
> > the other hyperthread with cgroupB's task. With my current
> > implementation or Tim's, cgroupA will get more time than cgroupB. 
> 
> I think that's expected because cgroup A has two tasks and cgroup B
> has one task, so cgroup A should get twice the cpu time than cgroup B
> to maintain fairness.

Like you said below, the ideal run time for each cgroup should depend on
their individual weights. The fact that cgroupA has two tasks doesn't
mean it has twice the weight. Both cgroups can have the same cpu.share
setting, and then the more tasks a cgroup has, the less weight each task
can get from the cgroup's per-cpu se.

I now realize one thing that's different between your idle_allowance
implementation and my core_vruntime implementation. In your
implementation, the idle_allowance is absolute time while vruntime is
adjusted by the se's weight; that's probably one area where your
implementation can make things less fair than mine.
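
To spell out the weight adjustment (a simplified sketch of what
calc_delta_fair() does; the real code uses fixed point inverse weights):

	/* a heavier se accrues vruntime more slowly for the same runtime */
	static u64 weight_scaled_delta(u64 delta_exec, struct sched_entity *se)
	{
		if (se->load.weight == NICE_0_LOAD)
			return delta_exec;

		return div64_u64(delta_exec * NICE_0_LOAD, se->load.weight);
	}

	/*
	 * se->vruntime += weight_scaled_delta(delta_exec, se);
	 * an absolute-time idle allowance gets no such scaling.
	 */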

> > If we
> > maintain core wide vruntime for cgroupA and cgroupB, we should be able
> > to maintain fairness between cgroups on this core. 
> 
> I don't think the right thing to do is to give cgroupA and cgroupB equal
> time on a core.  The time they get should still depend on their 
> load weight.

Agree.

> The better thing to do is to move one task from cgroupA to another core,
> that has only one cgroupA task so it can be paired up
> with that lonely cgroupA task.  This will eliminate the forced idle time
> for cgropuA both on current core and also the migrated core.

I'm not sure if this is always possible.

Say on a 16core/32thread machine there are 3 cgroups, each with 16 cpu
intensive tasks: will it be possible to make things perfectly balanced?

Don't get me wrong, I think this kind of load balancing is good and
needed, but I'm not sure if we can always make things perfectly
balanced. And if not, do we care about those few cores where cgroup tasks
are not balanced, and do we then need to implement the core wide cgroup
fairness functionality, or do we not care since those cores are supposed
to be few and it isn't a big deal?

> > Tim propose to solve
> > this problem by doing some kind of load balancing if I'm not mistaken, I
> > haven't taken a look at this yet.
> > 
> 
> My new patchset is trying to solve a different problem.  It is
> not trying to maintain fairness between cgroup on a core, but tries to
> even out the load of a cgroup between threads, and even out general
> load between cores. This will minimize the forced idle time.

Understood.

> 
> The fairness between cgroup relies still on
> proper vruntime accounting and proper comparison of vruntime between
> threads.  So for now, I am still using Aaron's patchset for this purpose
> as it has better fairness property than my other proposed patchsets
> for fairness purpose.
> 
> With just Aaron's current patchset we may have a lot of forced idle time
> due to the uneven distribution of tasks of different cgroup among the
> threads and cores, even though scheduling fairness is maintained.
> My new patches try to remove those forced idle time by moving the
> tasks around, to minimize cgroup unevenness between sibling threads
> and general load unevenness between the CPUs.

Yes I think this is definitely a good thing to do.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-13 Thread Aaron Lu
On Thu, Sep 12, 2019 at 10:05:43AM -0700, Tim Chen wrote:
> On 9/12/19 5:04 AM, Aaron Lu wrote:
> 
> > Well, I have done following tests:
> > 1 Julien's test script: https://paste.debian.net/plainh/834cf45c
> > 2 start two tagged will-it-scale/page_fault1, see how each performs;
> > 3 Aubrey's mysql test: https://github.com/aubreyli/coresched_bench.git
> > 
> > They all show your patchset performs equally well...And consider what
> > the patch does, I think they are really doing the same thing in
> > different ways.
> > 
> 
> Aaron,
> 
> The new feature of my new patches attempt to load balance between cores,
> and remove imbalance of cgroup load on a core that causes forced idle.
> Whereas previous patches attempt for fairness of cgroup between sibling 
> threads,
> so I think the goals are kind of orthogonal and complementary.
> 
> The premise is this, say cgroup1 is occupying 50% of cpu on cpu thread 1
> and 25% of cpu on cpu thread 2, that means we have a 25% cpu imbalance
> and cpu is force idled 25% of the time.  So ideally we need to remove
> 12.5% of cgroup 1 load from cpu thread 1 to sibling thread 2, so they
> both run at 37.5% on both thread for cgroup1 load without causing
> any force idled time.  Otherwise we will try to remove 25% of cgroup1
> load from cpu thread 1 to another core that has cgroup1 load to match.
> 
> This load balance is done in the regular load balance paths.
> 
> Previously for v3, only sched_core_balance made an attempt to pull a cookie 
> task, and only
> in the idle balance path. So if the cpu is kept busy, the cgroup load 
> imbalance
> between sibling threads could last a long time.  And the thread fairness
> patches for v3 don't help to balance load for such cases.
> 
> The new patches take into actual consideration of the amount of load imbalance
> of the same group between sibling threads when selecting task to pull, 
> and it also prevent task migration that creates
> more load imbalance. So hopefully this feature will help when we have
> more cores and need load balance across the cores.  This tries to help
> even cgroup workload between threads to minimize forced idle time, and also
> even out load across cores.

Will take a look at your new patches, thanks for the explanation.

> In your test, how many cores are on your machine and how many threads did
> each page_fault1 spawn off?

The test VM has 16 cores and 32 threads.
I created 2 tagged cgroups to run page_fault1 and each page_fault1 has
16 processes, like this:
$ ./src/will-it-scale/page_fault1_processes -t 16 -s 60


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-12 Thread Aaron Lu
On Wed, Sep 11, 2019 at 12:47:34PM -0400, Vineeth Remanan Pillai wrote:
> > > So both of you are working on top of my 2 patches that deal with the
> > > fairness issue, but I had the feeling Tim's alternative patches[1] are
> > > simpler than mine and achieves the same result(after the force idle tag
> >
> > I think Julien's result show that my patches did not do as well as
> > your patches for fairness. Aubrey did some other testing with the same
> > conclusion.  So I think keeping the forced idle time balanced is not
> > enough for maintaining fairness.
> >
> There are two main issues - vruntime comparison issue and the
> forced idle issue.  coresched_idle thread patch is addressing
> the forced idle issue as scheduler is no longer overloading idle
> thread for forcing idle. If I understand correctly, Tim's patch
> also tries to fix the forced idle issue. On top of fixing forced

Er... I don't think so. Tim's patch is meant to solve the fairness issue
as mine is; it doesn't attempt to address the forced idle issue.

> idle issue, we also need to fix that vruntime comparison issue
> and I think thats where Aaron's patch helps.
> 
> I think comparing parent's runtime also will have issues once
> the task group has a lot more threads with different running
> patterns. One example is a task group with lot of active threads
> and a thread with fairly less activity. So when this less active
> thread is competing with a thread in another group, there is a
> chance that it loses continuously for a while until the other
> group catches up on its vruntime.

I actually think this is expected behaviour.

Without core scheduling, when deciding which task to run, we will first
decide which "se" to run from the CPU's root level cfs runqueue and then
go downwards. Let's call the chosen se on the root level cfs runqueue
the winner se. Then with core scheduling, we will also need to compare
the two winner "se"s of each hyperthread and choose the core wide winner
"se".

> 
> As discussed during LPC, probably start thinking along the lines
> of global vruntime or core wide vruntime to fix the vruntime
> comparison issue?

core wide vruntime makes sense when there are multiple tasks of
different cgroups queued on the same core, e.g. when two tasks of
cgroupA and one task of cgroupB are queued on the same core; assume
cgroupA's one task is on one hyperthread and its other task is on the
other hyperthread with cgroupB's task. With my current implementation
or Tim's, cgroupA will get more time than cgroupB. If we maintain core
wide vruntime for cgroupA and cgroupB, we should be able to maintain
fairness between the cgroups on this core. Tim proposes to solve this
problem by doing some kind of load balancing, if I'm not mistaken; I
haven't taken a look at that yet.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-12 Thread Aaron Lu
On Wed, Sep 11, 2019 at 09:19:02AM -0700, Tim Chen wrote:
> On 9/11/19 7:02 AM, Aaron Lu wrote:
> > Hi Tim & Julien,
> > 
> > On Fri, Sep 06, 2019 at 11:30:20AM -0700, Tim Chen wrote:
> >> On 8/7/19 10:10 AM, Tim Chen wrote:
> >>
> >>> 3) Load balancing between CPU cores
> >>> ---
> >>> Say if one CPU core's sibling threads get forced idled
> >>> a lot as it has mostly incompatible tasks between the siblings,
> >>> moving the incompatible load to other cores and pulling
> >>> compatible load to the core could help CPU utilization.
> >>>
> >>> So just considering the load of a task is not enough during
> >>> load balancing, task compatibility also needs to be considered.
> >>> Peter has put in mechanisms to balance compatible tasks between
> >>> CPU thread siblings, but not across cores.
> >>>
> >>> Status:
> >>> I have not seen patches on this issue.  This issue could lead to
> >>> large variance in workload performance based on your luck
> >>> in placing the workload among the cores.
> >>>
> >>
> >> I've made an attempt in the following two patches to address
> >> the load balancing of mismatched load between the siblings.
> >>
> >> It is applied on top of Aaron's patches:
> >> - sched: Fix incorrect rq tagged as forced idle
> >> - wrapper for cfs_rq->min_vruntime
> >>   https://lore.kernel.org/lkml/20190725143127.GB992@aaronlu/
> >> - core vruntime comparison
> >>   https://lore.kernel.org/lkml/20190725143248.GC992@aaronlu/
> > 
> > So both of you are working on top of my 2 patches that deal with the
> > fairness issue, but I had the feeling Tim's alternative patches[1] are
> > simpler than mine and achieves the same result(after the force idle tag
> 
> I think Julien's result show that my patches did not do as well as
> your patches for fairness. Aubrey did some other testing with the same
> conclusion.  So I think keeping the forced idle time balanced is not
> enough for maintaining fairness.

Well, I have done the following tests:
1 Julien's test script: https://paste.debian.net/plainh/834cf45c
2 start two tagged will-it-scale/page_fault1, see how each performs;
3 Aubrey's mysql test: https://github.com/aubreyli/coresched_bench.git

They all show your patchset performs equally well... And considering what
the patch does, I think they are really doing the same thing in
different ways.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-09-11 Thread Aaron Lu
Hi Tim & Julien,

On Fri, Sep 06, 2019 at 11:30:20AM -0700, Tim Chen wrote:
> On 8/7/19 10:10 AM, Tim Chen wrote:
> 
> > 3) Load balancing between CPU cores
> > ---
> > Say if one CPU core's sibling threads get forced idled
> > a lot as it has mostly incompatible tasks between the siblings,
> > moving the incompatible load to other cores and pulling
> > compatible load to the core could help CPU utilization.
> > 
> > So just considering the load of a task is not enough during
> > load balancing, task compatibility also needs to be considered.
> > Peter has put in mechanisms to balance compatible tasks between
> > CPU thread siblings, but not across cores.
> > 
> > Status:
> > I have not seen patches on this issue.  This issue could lead to
> > large variance in workload performance based on your luck
> > in placing the workload among the cores.
> > 
> 
> I've made an attempt in the following two patches to address
> the load balancing of mismatched load between the siblings.
> 
> It is applied on top of Aaron's patches:
> - sched: Fix incorrect rq tagged as forced idle
> - wrapper for cfs_rq->min_vruntime
>   https://lore.kernel.org/lkml/20190725143127.GB992@aaronlu/
> - core vruntime comparison
>   https://lore.kernel.org/lkml/20190725143248.GC992@aaronlu/

So both of you are working on top of my 2 patches that deal with the
fairness issue, but I had the feeling Tim's alternative patches[1] are
simpler than mine and achieve the same result (after the force idle tag
fix), so unless there is something I missed, I think we should go with
the simpler one?

[1]: 
https://lore.kernel.org/lkml/b7a83fcb-5c34-9794-5688-55c52697f...@linux.intel.com/


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-15 Thread Aaron Lu
On Thu, Aug 15, 2019 at 06:09:28PM +0200, Dario Faggioli wrote:
> On Wed, 2019-08-07 at 10:10 -0700, Tim Chen wrote:
> > On 8/7/19 1:58 AM, Dario Faggioli wrote:
> > 
> > > Since I see that, in this thread, there are various patches being
> > > proposed and discussed... should I rerun my benchmarks with them
> > > applied? If yes, which ones? And is there, by any chance, one (or
> > > maybe
> > > more than one) updated git branch(es)?
> > > 
> > Hi Dario,
> > 
> Hi Tim!
> 
> > Having an extra set of eyes are certainly welcomed.
> > I'll give my 2 cents on the issues with v3.
> > 
> Ok, and thanks a lot for this.
> 
> > 1) Unfairness between the sibling threads
> > -
> > One sibling thread could be suppressing and force idling
> > the sibling thread over proportionally.  Resulting in
> > the force idled CPU not getting run and stall tasks on
> > suppressed CPU.
> > 
> > 
> > [...]
> >
> > 2) Not rescheduling forced idled CPU
> > 
> > The forced idled CPU does not get a chance to re-schedule
> > itself, and will stall for a long time even though it
> > has eligible tasks to run.
> > 
> > [...]
> > 
> > 3) Load balancing between CPU cores
> > ---
> > Say if one CPU core's sibling threads get forced idled
> > a lot as it has mostly incompatible tasks between the siblings,
> > moving the incompatible load to other cores and pulling
> > compatible load to the core could help CPU utilization.
> > 
> > So just considering the load of a task is not enough during
> > load balancing, task compatibility also needs to be considered.
> > Peter has put in mechanisms to balance compatible tasks between
> > CPU thread siblings, but not across cores.
> > 
> > [...]
> >
> Ok. Yes, as said, I've been trying to follow the thread, but thanks a
> lot again for this summary.
> 
> As said, I'm about to have numbers for the repo/branch I mentioned.
> 
> I was considering whether to also re-run the benchmarking campaign with
> some of the patches that floated around within this thread. Now, thanks
> to your summary, I have an even clearer picture about which patch does
> what, and that is indeed very useful.
> 
> I'll see about putting something together. I'm thinking of picking:
> 
> https://lore.kernel.org/lkml/b7a83fcb-5c34-9794-5688-55c52697f...@linux.intel.com/
> https://lore.kernel.org/lkml/20190725143344.GD992@aaronlu/
> 
> And maybe even (part of):
> https://lore.kernel.org/lkml/20190810141556.GA73644@aaronlu/#t
> 
> If anyone has ideas or suggestions about whether or not this choice
> makes sense, feel free to share. :-)

Makes sense to me.
patch3 in the last link is slightly better than the one in the 2nd link,
so just use that instead.

Thanks,
Aaron

> Also, I only have another week before leaving, so let's see what I
> manage to actually run, and then share here, by then.
> 
> Thanks and Regards
> -- 
> Dario Faggioli, Ph.D
> http://about.me/dario.faggioli
> Virtualization Software Engineer
> SUSE Labs, SUSE https://www.suse.com/
> ---
> 


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-12 Thread Aaron Lu
On 2019/8/12 23:38, Vineeth Remanan Pillai wrote:
>> I have two other small changes that I think are worth sending out.
>>
>> The first simplify logic in pick_task() and the 2nd avoid task pick all
>> over again when max is preempted. I also refined the previous hack patch to
>> make schedule always happen only for root cfs rq. Please see below for
>> details, thanks.
>>
> I see a potential issue here. With the simplification in pick_task,
> you might introduce a livelock where the match logic spins for ever.
> But you avoid that with the patch 2, by removing the loop if a pick
> preempts max. The potential problem is that, you miss a case where
> the newly picked task might have a match in the sibling on which max
> was selected before. By selecting idle, you ignore the potential match.

Oh that's right, I missed this.

> As of now, the potential match check does not really work because,
> sched_core_find will always return the same task and we do not check
> the whole core_tree for a next match. This is in my TODO list to have
> sched_core_find to return the best next match, if match was preempted.
> But its a bit complex and needs more thought.

Sounds worth doing :-)


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-10 Thread Aaron Lu
On Thu, Aug 08, 2019 at 09:39:45AM -0700, Tim Chen wrote:
> On 8/8/19 5:55 AM, Aaron Lu wrote:
> > On Mon, Aug 05, 2019 at 08:55:28AM -0700, Tim Chen wrote:
> >> On 8/2/19 8:37 AM, Julien Desfossez wrote:
> >>> We tested both Aaron's and Tim's patches and here are our results.
> 
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 26fea68f7f54..542974a8da18 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -3888,7 +3888,7 @@ next_class:;
> > WARN_ON_ONCE(!rq_i->core_pick);
> >  
> > if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
> > -   rq->core_forceidle = true;
> > +   rq_i->core_forceidle = true;
> 
> Good catch!
> 
> >  
> > rq_i->core_pick->core_occupation = occ;
> > 
> > With this fixed and together with the patch to let schedule always
> > happen, your latest 2 patches work well for the 10s cpuhog test I
> > described previously:
> > https://lore.kernel.org/lkml/20190725143003.GA992@aaronlu/
> 
> That's encouraging.  You are talking about my patches
> that try to keep the force idle time between sibling threads
> balanced, right?

Yes.

> > 
> > overloaded workload without any cpu binding doesn't work well though, I
> > haven't taken a closer look yet.
> > 
> 
> I think we need a load balancing scheme among the cores that will try
> to minimize force idle.

Agree.

> 
> One possible metric to measure load compatibility imbalance that leads to
> force idle is 
> 
> Say i, j are sibling threads of a cpu core
> imbalanace = \sum_tagged_cgroup  abs(Load_cgroup_cpui - Load_cgroup_cpuj)
> 
> This gives us a metric to decide if migrating a task will improve
> load compatability imbalance.  As we already track cgroup load on a CPU,
> it should be doable without adding too much overhead.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-10 Thread Aaron Lu
On Thu, Aug 08, 2019 at 02:42:57PM -0700, Tim Chen wrote:
> On 8/8/19 10:27 AM, Tim Chen wrote:
> > On 8/7/19 11:47 PM, Aaron Lu wrote:
> >> On Tue, Aug 06, 2019 at 02:19:57PM -0700, Tim Chen wrote:
> >>> +void account_core_idletime(struct task_struct *p, u64 exec)
> >>> +{
> >>> + const struct cpumask *smt_mask;
> >>> + struct rq *rq;
> >>> + bool force_idle, refill;
> >>> + int i, cpu;
> >>> +
> >>> + rq = task_rq(p);
> >>> + if (!sched_core_enabled(rq) || !p->core_cookie)
> >>> + return;
> >>
> >> I don't see why return here for untagged task. Untagged task can also
> >> preempt tagged task and force a CPU thread enter idle state.
> >> Untagged is just another tag to me, unless we want to allow untagged
> >> task to coschedule with a tagged task.
> > 
> > You are right.  This needs to be fixed.
> > 
> 
> Here's the updated patchset, including Aaron's fix and also
> added accounting of force idle time by deadline and rt tasks.

I have two other small changes that I think are worth sending out.

The first simplifies the logic in pick_task() and the 2nd avoids doing
the task pick all over again when max is preempted. I also refined the
previous hack patch to make schedule always happen only for the root cfs
rq. Please see below for details, thanks.

patch1:

>From cea56db35fe9f393c357cdb1bdcb2ef9b56cfe97 Mon Sep 17 00:00:00 2001
From: Aaron Lu 
Date: Mon, 5 Aug 2019 21:21:25 +0800
Subject: [PATCH 1/3] sched/core: simplify pick_task()

No need to special case the !cookie case in pick_task(); we just need to
make it possible to return idle in sched_core_find() for a !cookie query.
And cookie_pick will always have lower priority than class_pick, so
remove the redundant check of prio_less(cookie_pick, class_pick).

Signed-off-by: Aaron Lu 
---
 kernel/sched/core.c | 19 ---
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90655c9ad937..84fec9933b74 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -186,6 +186,8 @@ static struct task_struct *sched_core_find(struct rq *rq, 
unsigned long cookie)
 * The idle task always matches any cookie!
 */
match = idle_sched_class.pick_task(rq);
+   if (!cookie)
+   goto out;
 
while (node) {
node_task = container_of(node, struct task_struct, core_node);
@@ -199,7 +201,7 @@ static struct task_struct *sched_core_find(struct rq *rq, 
unsigned long cookie)
node = node->rb_left;
}
}
-
+out:
return match;
 }
 
@@ -3657,18 +3659,6 @@ pick_task(struct rq *rq, const struct sched_class 
*class, struct task_struct *ma
if (!class_pick)
return NULL;
 
-   if (!cookie) {
-   /*
-* If class_pick is tagged, return it only if it has
-* higher priority than max.
-*/
-   if (max && class_pick->core_cookie &&
-   prio_less(class_pick, max))
-   return idle_sched_class.pick_task(rq);
-
-   return class_pick;
-   }
-
/*
 * If class_pick is idle or matches cookie, return early.
 */
@@ -3682,8 +3672,7 @@ pick_task(struct rq *rq, const struct sched_class *class, 
struct task_struct *ma
 * the core (so far) and it must be selected, otherwise we must go with
 * the cookie pick in order to satisfy the constraint.
 */
-   if (prio_less(cookie_pick, class_pick) &&
-   (!max || prio_less(max, class_pick)))
+   if (!max || prio_less(max, class_pick))
return class_pick;
 
return cookie_pick;
-- 
2.19.1.3.ge56e4f7

patch2:

>From 487950dc53a40d5c566602f775ce46a0bab7a412 Mon Sep 17 00:00:00 2001
From: Aaron Lu 
Date: Fri, 9 Aug 2019 14:48:01 +0800
Subject: [PATCH 2/3] sched/core: no need to pick again after max is preempted

When a sibling's task preempts the current max, there is no need to do the
pick all over again - the preempted cpu can just pick idle and be done.

Signed-off-by: Aaron Lu 
---
 kernel/sched/core.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84fec9933b74..e88583860abe 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3756,7 +3756,6 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 * order.
 */
for_each_class(class) {
-again:
for_each_cpu_wrap(i, smt_mask, cpu) {
struct rq *rq_i = cpu_rq(i);
struct task_struct *p;
@@ -3828,10 +3827,10 @@ pick_next_task(struct rq *rq, st

Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-08 Thread Aaron Lu
On Mon, Aug 05, 2019 at 08:55:28AM -0700, Tim Chen wrote:
> On 8/2/19 8:37 AM, Julien Desfossez wrote:
> > We tested both Aaron's and Tim's patches and here are our results.
> > 
> > Test setup:
> > - 2 1-thread sysbench, one running the cpu benchmark, the other one the
> >   mem benchmark
> > - both started at the same time
> > - both are pinned on the same core (2 hardware threads)
> > - 10 30-seconds runs
> > - test script: https://paste.debian.net/plainh/834cf45c
> > - only showing the CPU events/sec (higher is better)
> > - tested 4 tag configurations:
> >   - no tag
> >   - sysbench mem untagged, sysbench cpu tagged
> >   - sysbench mem tagged, sysbench cpu untagged
> >   - both tagged with a different tag
> > - "Alone" is the sysbench CPU running alone on the core, no tag
> > - "nosmt" is both sysbench pinned on the same hardware thread, no tag
> > - "Tim's full patchset + sched" is an experiment with Tim's patchset
> >   combined with Aaron's "hack patch" to get rid of the remaining deep
> >   idle cases
> > - In all test cases, both tasks can run simultaneously (which was not
> >   the case without those patches), but the standard deviation is a
> >   pretty good indicator of the fairness/consistency.
> 
> Thanks for testing the patches and giving such detailed data.
> 
> I came to realize that for my scheme, the accumulated deficit of forced idle 
> could be wiped
> out in one execution of a task on the forced idle cpu, with the update of the 
> min_vruntime,
> even if the execution time could be far less than the accumulated deficit.
> That's probably one reason my scheme didn't achieve fairness.

Turns out there is a typo in v3 when setting rq's core_forceidle:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26fea68f7f54..542974a8da18 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3888,7 +3888,7 @@ next_class:;
WARN_ON_ONCE(!rq_i->core_pick);
 
if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
-   rq->core_forceidle = true;
+   rq_i->core_forceidle = true;
 
rq_i->core_pick->core_occupation = occ;

With this fixed and together with the patch to let schedule always
happen, your latest 2 patches work well for the 10s cpuhog test I
described previously:
https://lore.kernel.org/lkml/20190725143003.GA992@aaronlu/

An overloaded workload without any cpu binding doesn't work well though; I
haven't taken a closer look yet.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-08 Thread Aaron Lu
On Tue, Aug 06, 2019 at 02:19:57PM -0700, Tim Chen wrote:
> +void account_core_idletime(struct task_struct *p, u64 exec)
> +{
> + const struct cpumask *smt_mask;
> + struct rq *rq;
> + bool force_idle, refill;
> + int i, cpu;
> +
> + rq = task_rq(p);
> + if (!sched_core_enabled(rq) || !p->core_cookie)
> + return;

I don't see why we return here for an untagged task. An untagged task can
also preempt a tagged task and force a CPU thread to enter the idle state.
Untagged is just another tag to me, unless we want to allow an untagged
task to coschedule with a tagged task.

> + cpu = task_cpu(p);
> + force_idle = false;
> + refill = true;
> + smt_mask = cpu_smt_mask(cpu);
> +
> + for_each_cpu(i, smt_mask) {
> + if (cpu == i)
> + continue;
> +
> + if (cpu_rq(i)->core_forceidle)
> + force_idle = true;
> +
> + /* Only refill if everyone has run out of allowance */
> + if (cpu_rq(i)->core_idle_allowance > 0)
> + refill = false;
> + }
> +
> + if (force_idle)
> + rq->core_idle_allowance -= (s64) exec;
> +
> + if (rq->core_idle_allowance < 0 && refill) {
> + for_each_cpu(i, smt_mask) {
> + cpu_rq(i)->core_idle_allowance += (s64) 
> SCHED_IDLE_ALLOWANCE;
> + }
> + }
> +}


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-06 Thread Aaron Lu
On 2019/8/6 22:17, Phil Auld wrote:
> On Tue, Aug 06, 2019 at 09:54:01PM +0800 Aaron Lu wrote:
>> On Mon, Aug 05, 2019 at 04:09:15PM -0400, Phil Auld wrote:
>>> Hi,
>>>
>>> On Fri, Aug 02, 2019 at 11:37:15AM -0400 Julien Desfossez wrote:
>>>> We tested both Aaron's and Tim's patches and here are our results.
>>>>
>>>> Test setup:
>>>> - 2 1-thread sysbench, one running the cpu benchmark, the other one the
>>>>   mem benchmark
>>>> - both started at the same time
>>>> - both are pinned on the same core (2 hardware threads)
>>>> - 10 30-seconds runs
>>>> - test script: https://paste.debian.net/plainh/834cf45c
>>>> - only showing the CPU events/sec (higher is better)
>>>> - tested 4 tag configurations:
>>>>   - no tag
>>>>   - sysbench mem untagged, sysbench cpu tagged
>>>>   - sysbench mem tagged, sysbench cpu untagged
>>>>   - both tagged with a different tag
>>>> - "Alone" is the sysbench CPU running alone on the core, no tag
>>>> - "nosmt" is both sysbench pinned on the same hardware thread, no tag
>>>> - "Tim's full patchset + sched" is an experiment with Tim's patchset
>>>>   combined with Aaron's "hack patch" to get rid of the remaining deep
>>>>   idle cases
>>>> - In all test cases, both tasks can run simultaneously (which was not
>>>>   the case without those patches), but the standard deviation is a
>>>>   pretty good indicator of the fairness/consistency.
>>>>
>>>> No tag
>>>> --
>>>> Test                           Average  Stdev
>>>> Alone                          1306.90  0.94
>>>> nosmt                          649.95   1.44
>>>> Aaron's full patchset:         828.15   32.45
>>>> Aaron's first 2 patches:       832.12   36.53
>>>> Aaron's 3rd patch alone:       864.21   3.68
>>>> Tim's full patchset:           852.50   4.11
>>>> Tim's full patchset + sched:   852.59   8.25
>>>>
>>>> Sysbench mem untagged, sysbench cpu tagged
>>>> --
>>>> Test                           Average  Stdev
>>>> Alone                          1306.90  0.94
>>>> nosmt                          649.95   1.44
>>>> Aaron's full patchset:         586.06   1.77
>>>> Aaron's first 2 patches:       630.08   47.30
>>>> Aaron's 3rd patch alone:       1086.65  246.54
>>>> Tim's full patchset:           852.50   4.11
>>>> Tim's full patchset + sched:   390.49   15.76
>>>>
>>>> Sysbench mem tagged, sysbench cpu untagged
>>>> --
>>>> Test                           Average  Stdev
>>>> Alone                          1306.90  0.94
>>>> nosmt                          649.95   1.44
>>>> Aaron's full patchset:         583.77   3.52
>>>> Aaron's first 2 patches:       513.63   63.09
>>>> Aaron's 3rd patch alone:       1171.23  3.35
>>>> Tim's full patchset:           564.04   58.05
>>>> Tim's full patchset + sched:   1026.16  49.43
>>>>
>>>> Both sysbench tagged
>>>> 
>>>> Test                           Average  Stdev
>>>> Alone                          1306.90  0.94
>>>> nosmt                          649.95   1.44
>>>> Aaron's full patchset:         582.15   3.75
>>>> Aaron's first 2 patches:       561.07   91.61
>>>> Aaron's 3rd patch alone:       638.49   231.06
>>>> Tim's full patchset:           679.43   70.07
>>>> Tim's full patchset + sched:   664.34   210.14
>>>>
>>>
>>> Sorry if I'm missing something obvious here but with only 2 processes 
>>> of interest shouldn't one tagged and one untagged be about the same
>>> as both tagged?  
>>
>> It should.
>>
>>> In both cases the 2 sysbenches should not be running on the core at 
>>> the same time. 
>>
>> Agree.
>>
>>> There will be times when oher non-related threads could share the core
>>> with the untagged one. Is that enough to account for this difference?
>>
>> What difference do you mean?
> 
> 
> I was looking at the above posted numbers. For example:
> 
>>>> Sysbench mem untagged, sysbench cpu tagged
>>>> Aaron's 3rd patch alone:1086.65 246.54
> 
>>>> Sysbench mem tagged, sysbench cpu untagged
>>>> Aaron's 3rd patch alone:1171.23 3.35
> 
>>>> Both sysbench tagged
>>>> Aaron's 3rd patch alone:638.49  231.06
> 
> 
> Admittedly, there's some high variance on some of those numbers. 

The high variance suggests the code has some fairness issues :-)

For the test here, I didn't expect the 3rd patch to be used alone
since the fairness is solved by patch2 and patch3 together.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-06 Thread Aaron Lu
On Mon, Aug 05, 2019 at 04:09:15PM -0400, Phil Auld wrote:
> Hi,
> 
> On Fri, Aug 02, 2019 at 11:37:15AM -0400 Julien Desfossez wrote:
> > We tested both Aaron's and Tim's patches and here are our results.
> > 
> > Test setup:
> > - 2 1-thread sysbench, one running the cpu benchmark, the other one the
> >   mem benchmark
> > - both started at the same time
> > - both are pinned on the same core (2 hardware threads)
> > - 10 30-seconds runs
> > - test script: https://paste.debian.net/plainh/834cf45c
> > - only showing the CPU events/sec (higher is better)
> > - tested 4 tag configurations:
> >   - no tag
> >   - sysbench mem untagged, sysbench cpu tagged
> >   - sysbench mem tagged, sysbench cpu untagged
> >   - both tagged with a different tag
> > - "Alone" is the sysbench CPU running alone on the core, no tag
> > - "nosmt" is both sysbench pinned on the same hardware thread, no tag
> > - "Tim's full patchset + sched" is an experiment with Tim's patchset
> >   combined with Aaron's "hack patch" to get rid of the remaining deep
> >   idle cases
> > - In all test cases, both tasks can run simultaneously (which was not
> >   the case without those patches), but the standard deviation is a
> >   pretty good indicator of the fairness/consistency.
> > 
> > No tag
> > --
> > Test                           Average  Stdev
> > Alone                          1306.90  0.94
> > nosmt                          649.95   1.44
> > Aaron's full patchset:         828.15   32.45
> > Aaron's first 2 patches:       832.12   36.53
> > Aaron's 3rd patch alone:       864.21   3.68
> > Tim's full patchset:           852.50   4.11
> > Tim's full patchset + sched:   852.59   8.25
> > 
> > Sysbench mem untagged, sysbench cpu tagged
> > --
> > Test                           Average  Stdev
> > Alone                          1306.90  0.94
> > nosmt                          649.95   1.44
> > Aaron's full patchset:         586.06   1.77
> > Aaron's first 2 patches:       630.08   47.30
> > Aaron's 3rd patch alone:       1086.65  246.54
> > Tim's full patchset:           852.50   4.11
> > Tim's full patchset + sched:   390.49   15.76
> > 
> > Sysbench mem tagged, sysbench cpu untagged
> > --
> > Test                           Average  Stdev
> > Alone                          1306.90  0.94
> > nosmt                          649.95   1.44
> > Aaron's full patchset:         583.77   3.52
> > Aaron's first 2 patches:       513.63   63.09
> > Aaron's 3rd patch alone:       1171.23  3.35
> > Tim's full patchset:           564.04   58.05
> > Tim's full patchset + sched:   1026.16  49.43
> > 
> > Both sysbench tagged
> > 
> > Test                           Average  Stdev
> > Alone                          1306.90  0.94
> > nosmt                          649.95   1.44
> > Aaron's full patchset:         582.15   3.75
> > Aaron's first 2 patches:       561.07   91.61
> > Aaron's 3rd patch alone:       638.49   231.06
> > Tim's full patchset:           679.43   70.07
> > Tim's full patchset + sched:   664.34   210.14
> > 
> 
> Sorry if I'm missing something obvious here but with only 2 processes 
> of interest shouldn't one tagged and one untagged be about the same
> as both tagged?  

It should.

> In both cases the 2 sysbenches should not be running on the core at 
> the same time. 

Agree.

> There will be times when oher non-related threads could share the core
> with the untagged one. Is that enough to account for this difference?

What difference do you mean?

Thanks,
Aaron

> > So in terms of fairness, Aaron's full patchset is the most consistent, but 
> > only
> > Tim's patchset performs better than nosmt in some conditions.
> > 
> > Of course, this is one of the worst case scenario, as soon as we have
> > multithreaded applications on overcommitted systems, core scheduling 
> > performs
> > better than nosmt.
> > 
> > Thanks,
> > 
> > Julien
> 
> -- 


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-06 Thread Aaron Lu
On Tue, Aug 06, 2019 at 08:24:17AM -0400, Vineeth Remanan Pillai wrote:
> > >
> > > I also think a way to make fairness per cookie per core, is this what you
> > > want to propose?
> >
> > Yes, that's what I meant.
> 
> I think that would hurt some kinds of workloads badly, especially if
> one tenant has way more tasks than the other. The tenant with more tasks
> on the same core might have more immediate requirements from some threads
> than the other, and we would fail to take that into account. With some
> hierarchical management we can alleviate this, but as Aaron said, it
> would be a bit messy.

I think each tenant will have a per core weight, similar to a sched
entity's per cpu weight. The tenant's per core weight could be derived
from its corresponding taskgroup's per cpu sched entities' weights (sum
them up, perhaps). A tenant with higher weight will have its core wide
vruntime advance slower than a tenant with lower weight. Does this
address the issue here?
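
To make the above a bit more concrete, here is a minimal userspace sketch
(all names and the weight derivation are my own assumptions for
illustration, not code from the patchset): each tenant keeps a core wide
vruntime that advances inversely proportional to its core wide weight, so
a heavier tenant's vruntime advances slower and it wins the comparison
more often.

#include <stdio.h>

#define NICE_0_WEIGHT 1024ULL

/* Toy model of a tenant's core wide accounting (hypothetical names). */
struct tenant {
	unsigned long long core_weight;   /* e.g. sum of its per-cpu se weights */
	unsigned long long core_vruntime; /* core wide virtual runtime */
};

/* Charge @delta_exec ns of real run time to @t, scaled by its weight. */
static void tenant_charge(struct tenant *t, unsigned long long delta_exec)
{
	t->core_vruntime += delta_exec * NICE_0_WEIGHT / t->core_weight;
}

/* The tenant with the smaller core wide vruntime is preferred. */
static int tenant_prio_less(const struct tenant *a, const struct tenant *b)
{
	return (long long)(a->core_vruntime - b->core_vruntime) < 0;
}

int main(void)
{
	struct tenant a = { .core_weight = 2 * NICE_0_WEIGHT }; /* A1 + A2 */
	struct tenant b = { .core_weight = 1 * NICE_0_WEIGHT }; /* B1 */

	/* both tenants get 10ms of core time */
	tenant_charge(&a, 10000000ULL);
	tenant_charge(&b, 10000000ULL);

	printf("prefer tenant %s next\n", tenant_prio_less(&a, &b) ? "A" : "B");
	return 0;
}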

> Peter's rebalance logic actually takes care of most of the runq
> imbalance caused
> due to cookie tagging. What we have found from our testing is, fairness issue 
> is
> caused mostly due to a Hyperthread going idle and not waking up. Aaron's 3rd
> patch works around that. As Julien mentioned, we are working on a per thread
> coresched idle thread concept. The problem that we found was, idle thread 
> causes
> accounting issues and wakeup issues as it was not designed to be used in this
> context. So if we can have a low priority thread which looks like any other 
> task
> to the scheduler, things becomes easy for the scheduler and we achieve 
> security
> as well. Please share your thoughts on this idea.

Care to elaborate on the coresched idle thread concept?
How does it solve the hyperthread going idle problem, and what are the
accounting issues and wakeup issues, etc.?

Thanks,
Aaron

> The results are encouraging, but we do not yet have the coresched idle
> to not spin
> 100%. We will soon post the patch once it is a bit more stable for
> running the tests
> that we all have done so far.
> 
> Thanks,
> Vineeth


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-06 Thread Aaron Lu
On 2019/8/6 14:56, Aubrey Li wrote:
> On Tue, Aug 6, 2019 at 11:24 AM Aaron Lu  wrote:
>> I've been thinking if we should consider core wide tenent fairness?
>>
>> Let's say there are 3 tasks on 2 threads' rq of the same core, 2 tasks
>> (e.g. A1, A2) belong to tenent A and the 3rd B1 belong to another tenent
>> B. Assume A1 and B1 are queued on the same thread and A2 on the other
>> thread, when we decide priority for A1 and B1, shall we also consider
>> A2's vruntime? i.e. shall we consider A1 and A2 as a whole since they
>> belong to the same tenent? I tend to think we should make fairness per
>> core per tenent, instead of per thread(cpu) per task(sched entity). What
>> do you guys think?
>>
> 
> I also think a way to make fairness per cookie per core, is this what you
> want to propose?

Yes, that's what I meant.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-08-05 Thread Aaron Lu
On Mon, Aug 05, 2019 at 08:55:28AM -0700, Tim Chen wrote:
> On 8/2/19 8:37 AM, Julien Desfossez wrote:
> > We tested both Aaron's and Tim's patches and here are our results.
> > 
> > Test setup:
> > - 2 1-thread sysbench, one running the cpu benchmark, the other one the
> >   mem benchmark
> > - both started at the same time
> > - both are pinned on the same core (2 hardware threads)
> > - 10 30-seconds runs
> > - test script: https://paste.debian.net/plainh/834cf45c
> > - only showing the CPU events/sec (higher is better)
> > - tested 4 tag configurations:
> >   - no tag
> >   - sysbench mem untagged, sysbench cpu tagged
> >   - sysbench mem tagged, sysbench cpu untagged
> >   - both tagged with a different tag
> > - "Alone" is the sysbench CPU running alone on the core, no tag
> > - "nosmt" is both sysbench pinned on the same hardware thread, no tag
> > - "Tim's full patchset + sched" is an experiment with Tim's patchset
> >   combined with Aaron's "hack patch" to get rid of the remaining deep
> >   idle cases
> > - In all test cases, both tasks can run simultaneously (which was not
> >   the case without those patches), but the standard deviation is a
> >   pretty good indicator of the fairness/consistency.
> 
> Thanks for testing the patches and giving such detailed data.

Thanks Julien.

> I came to realize that for my scheme, the accumulated deficit of forced idle 
> could be wiped
> out in one execution of a task on the forced idle cpu, with the update of the 
> min_vruntime,
> even if the execution time could be far less than the accumulated deficit.
> That's probably one reason my scheme didn't achieve fairness.

I've been thinking: should we consider core wide tenant fairness?

Let's say there are 3 tasks on 2 threads' rq of the same core, 2 tasks
(e.g. A1, A2) belong to tenant A and the 3rd, B1, belongs to another tenant
B. Assume A1 and B1 are queued on the same thread and A2 on the other
thread; when we decide priority for A1 and B1, shall we also consider
A2's vruntime? i.e. shall we consider A1 and A2 as a whole since they
belong to the same tenant? I tend to think we should make fairness per
core per tenant, instead of per thread(cpu) per task(sched entity). What
do you guys think?

Implementation of the idea is a mess to me, as I feel I'm duplicating the
existing per cpu per sched_entity enqueue/update vruntime/dequeue logic
for the per core per tenant stuff.


[PATCH 3/3] temp hack to make tick based schedule happen

2019-07-25 Thread Aaron Lu
When a hyperthread is forced idle and the other hyperthread has a single
CPU intensive task running, the running task can occupy the hyperthread
for a long time with no scheduling point and starve the other
hyperthread.

Fix this temporarily by always checking if the task has exceeded its
timeslice and if so, do a schedule.

Signed-off-by: Aaron Lu 
---
 kernel/sched/fair.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 43babc2a12a5..730c9359e9c9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4093,6 +4093,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct 
sched_entity *curr)
return;
}
 
+   if (cfs_rq->nr_running <= 1)
+   return;
+
/*
 * Ensure that a task that missed wakeup preemption by a
 * narrow margin doesn't have to wait for a full slice.
@@ -4261,8 +4264,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity 
*curr, int queued)
return;
 #endif
 
-   if (cfs_rq->nr_running > 1)
-   check_preempt_tick(cfs_rq, curr);
+   check_preempt_tick(cfs_rq, curr);
 }
 
 
-- 
2.19.1.3.ge56e4f7



[PATCH 2/3] core vruntime comparison

2019-07-25 Thread Aaron Lu
This patch provides a vruntime based way to compare two cfs tasks'
priority, be it on the same cpu or on different threads of the same core.

When the two tasks are on the same CPU, we just need to find a common
cfs_rq both sched_entities are on and then do the comparison.

When the two tasks are on different threads of the same core, the root
level sched_entities to which the two tasks belong will be used to do
the comparison.

An ugly illustration for the cross CPU case:

            cpu0                    cpu1
          /  |   \                /  |   \
       se1  se2  se3           se4  se5  se6
            /  \                    /  \
         se21    se22            se61    se62

Assume CPU0 and CPU1 are smt siblings and task A's se is se21 while
task B's se is se61. To compare the priority of task A and B, we compare
the priority of se2 and se6. The one with the smaller vruntime wins.

To make this work, the root level se should have a common cfs_rq min
vruntime, which I call the core cfs_rq min vruntime.

Potential issues: when core scheduling is enabled, if there are tasks
already in some CPU's rq, then new tasks will be queued with the per-core
cfs_rq min vruntime while the old tasks are using the original root
level cfs_rq's min_vruntime. The two values can differ greatly and can
cause tasks with a large vruntime to starve. So enable core scheduling
early when the system is still kind of idle for the time being to avoid
this problem.

Signed-off-by: Aaron Lu 
---
 kernel/sched/core.c  | 15 ++---
 kernel/sched/fair.c  | 79 +++-
 kernel/sched/sched.h |  2 ++
 3 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 90655c9ad937..bc746ea4cc82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -105,19 +105,8 @@ static inline bool prio_less(struct task_struct *a, struct 
task_struct *b)
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-   if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-   u64 vruntime = b->se.vruntime;
-
-   /*
-* Normalize the vruntime if tasks are in different cpus.
-*/
-   if (task_cpu(a) != task_cpu(b)) {
-   vruntime -= task_cfs_rq(b)->min_vruntime;
-   vruntime += task_cfs_rq(a)->min_vruntime;
-   }
-
-   return !((s64)(a->se.vruntime - vruntime) <= 0);
-   }
+   if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+   return cfs_prio_less(a, b);
 
return false;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a7b26c96f46b..43babc2a12a5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,9 +431,85 @@ find_matching_se(struct sched_entity **se, struct 
sched_entity **pse)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline struct cfs_rq *root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+   return &rq_of(cfs_rq)->cfs;
+}
+
+static inline bool is_root_cfs_rq(struct cfs_rq *cfs_rq)
+{
+   return cfs_rq == root_cfs_rq(cfs_rq);
+}
+
+static inline struct cfs_rq *core_cfs_rq(struct cfs_rq *cfs_rq)
+{
+   return &rq_of(cfs_rq)->core->cfs;
+}
+
 static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
 {
-   return cfs_rq->min_vruntime;
+   if (!sched_core_enabled(rq_of(cfs_rq)))
+   return cfs_rq->min_vruntime;
+
+   if (is_root_cfs_rq(cfs_rq))
+   return core_cfs_rq(cfs_rq)->min_vruntime;
+   else
+   return cfs_rq->min_vruntime;
+}
+
+static void update_core_cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
+{
+   struct cfs_rq *cfs_rq_core;
+
+   if (!sched_core_enabled(rq_of(cfs_rq)))
+   return;
+
+   if (!is_root_cfs_rq(cfs_rq))
+   return;
+
+   cfs_rq_core = core_cfs_rq(cfs_rq);
+   cfs_rq_core->min_vruntime = max(cfs_rq_core->min_vruntime,
+   cfs_rq->min_vruntime);
+}
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+   struct sched_entity *sea = &a->se;
+   struct sched_entity *seb = &b->se;
+   bool samecpu = task_cpu(a) == task_cpu(b);
+   struct task_struct *p;
+   s64 delta;
+
+   if (samecpu) {
+   /* vruntime is per cfs_rq */
+   while (!is_same_group(sea, seb)) {
+   int sea_depth = sea->depth;
+   int seb_depth = seb->depth;
+
+   if (sea_depth >= seb_depth)
+   sea = parent_entity(sea);
+   if (sea_depth <= seb_depth)
+   seb = parent_entity(seb);
+   }
+
+   delta = (s64)(sea->vruntime - seb->vruntime);
+   goto out;
+   }
+
+   /* crosscpu: compare root level se's vruntime to decide priority */
+   while (sea->parent)
+ 

[RFC PATCH 1/3] wrapper for cfs_rq->min_vruntime

2019-07-25 Thread Aaron Lu
Add a wrapper function cfs_rq_min_vruntime(cfs_rq) to
return cfs_rq->min_vruntime.

It will be used in the following patch, no functionality
change.

Signed-off-by: Aaron Lu 
---
 kernel/sched/fair.c | 27 ---
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 26d29126d6a5..a7b26c96f46b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,6 +431,11 @@ find_matching_se(struct sched_entity **se, struct 
sched_entity **pse)
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
+static inline u64 cfs_rq_min_vruntime(struct cfs_rq *cfs_rq)
+{
+   return cfs_rq->min_vruntime;
+}
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
@@ -467,7 +472,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
struct sched_entity *curr = cfs_rq->curr;
struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
 
-   u64 vruntime = cfs_rq->min_vruntime;
+   u64 vruntime = cfs_rq_min_vruntime(cfs_rq);
 
if (curr) {
if (curr->on_rq)
@@ -487,7 +492,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
 
/* ensure we never gain time by being placed backwards. */
-   cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+   cfs_rq->min_vruntime = max_vruntime(cfs_rq_min_vruntime(cfs_rq), 
vruntime);
 #ifndef CONFIG_64BIT
smp_wmb();
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -3742,7 +3747,7 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq) {}
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHED_DEBUG
-   s64 d = se->vruntime - cfs_rq->min_vruntime;
+   s64 d = se->vruntime - cfs_rq_min_vruntime(cfs_rq);
 
if (d < 0)
d = -d;
@@ -3755,7 +3760,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
-   u64 vruntime = cfs_rq->min_vruntime;
+   u64 vruntime = cfs_rq_min_vruntime(cfs_rq);
 
/*
 * The 'current' period is already promised to the current tasks,
@@ -3848,7 +3853,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
 * update_curr().
 */
if (renorm && curr)
-   se->vruntime += cfs_rq->min_vruntime;
+   se->vruntime += cfs_rq_min_vruntime(cfs_rq);
 
update_curr(cfs_rq);
 
@@ -3859,7 +3864,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
 * fairness detriment of existing tasks.
 */
if (renorm && !curr)
-   se->vruntime += cfs_rq->min_vruntime;
+   se->vruntime += cfs_rq_min_vruntime(cfs_rq);
 
/*
 * When enqueuing a sched_entity, we must:
@@ -3972,7 +3977,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
 * can move min_vruntime forward still more.
 */
if (!(flags & DEQUEUE_SLEEP))
-   se->vruntime -= cfs_rq->min_vruntime;
+   se->vruntime -= cfs_rq_min_vruntime(cfs_rq);
 
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
@@ -6722,7 +6727,7 @@ static void migrate_task_rq_fair(struct task_struct *p, 
int new_cpu)
min_vruntime = cfs_rq->min_vruntime;
} while (min_vruntime != min_vruntime_copy);
 #else
-   min_vruntime = cfs_rq->min_vruntime;
+   min_vruntime = cfs_rq_min_vruntime(cfs_rq);
 #endif
 
se->vruntime -= min_vruntime;
@@ -10215,7 +10220,7 @@ static void task_fork_fair(struct task_struct *p)
resched_curr(rq);
}
 
-   se->vruntime -= cfs_rq->min_vruntime;
+   se->vruntime -= cfs_rq_min_vruntime(cfs_rq);
rq_unlock(rq, &rf);
 }
 
@@ -10335,7 +10340,7 @@ static void detach_task_cfs_rq(struct task_struct *p)
 * cause 'unlimited' sleep bonus.
 */
place_entity(cfs_rq, se, 0);
-   se->vruntime -= cfs_rq->min_vruntime;
+   se->vruntime -= cfs_rq_min_vruntime(cfs_rq);
}
 
detach_entity_cfs_rq(se);
@@ -10349,7 +10354,7 @@ static void attach_task_cfs_rq(struct task_struct *p)
attach_entity_cfs_rq(se);
 
if (!vruntime_normalized(p))
-   se->vruntime += cfs_rq->min_vruntime;
+   se->vruntime += cfs_rq_min_vruntime(cfs_rq);
 }
 
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
-- 
2.19.1.3.ge56e4f7



Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-07-25 Thread Aaron Lu
On Mon, Jul 22, 2019 at 06:26:46PM +0800, Aubrey Li wrote:
> The granularity period of util_avg seems too large to decide task priority
> during pick_task(), at least it is in my case, cfs_prio_less() always picked
> core max task, so pick_task() eventually picked idle, which causes this change
> not very helpful for my case.
> 
>  -0 [057] dN..83.716973: __schedule: max: sysbench/2578
> 889050f68600
>  -0 [057] dN..83.716974: __schedule:
> (swapper/5/0;140,0,0) ?< (mysqld/2511;119,1042118143,0)
>  -0 [057] dN..83.716975: __schedule:
> (sysbench/2578;119,96449836,0) ?< (mysqld/2511;119,1042118143,0)
>  -0 [057] dN..83.716975: cfs_prio_less: picked
> sysbench/2578 util_avg: 20 527 -507 <=== here===
>  -0 [057] dN..83.716976: __schedule: pick_task cookie
> pick swapper/5/0 889050f68600

I tried a different approach based on vruntime with 3 patches following.

When the two tasks are on the same CPU, no change is made: I still route
the two sched entities up till they are in the same group (cfs_rq) and
then do the vruntime comparison.

When the two tasks are on different threads of the same core, the root
level sched_entities to which the two tasks belong will be used to do
the comparison.

An ugly illustration for the cross CPU case:

            cpu0                    cpu1
          /  |   \                /  |   \
       se1  se2  se3           se4  se5  se6
            /  \                    /  \
         se21    se22            se61    se62

Assume CPU0 and CPU1 are smt siblings and task A's se is se21 while
task B's se is se61. To compare priority of task A and B, we compare
priority of se2 and se6. The smaller vruntime wins.

To make this work, the root level ses on both CPUs should have a common
cfs_rq min vruntime, which I call the core cfs_rq min vruntime.

This is mostly done in patch2/3.
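
As an aside, a rough userspace model of the comparison described above
(this is only my illustration of the idea, heavily simplified compared to
the actual patches; the struct and function names are made up):

#include <stdio.h>

/* Simplified sched_entity: only what the comparison needs. */
struct se {
	unsigned long long vruntime;
	struct se *parent;	/* NULL for a root level se */
};

/* Walk up to the root level se of the hierarchy the task sits in. */
static struct se *root_se(struct se *s)
{
	while (s->parent)
		s = s->parent;
	return s;
}

/*
 * Cross-cpu case: compare the root level ses' vruntime.  Both roots are
 * assumed to be normalized against the same core wide min vruntime,
 * which is what patch 2/3 arranges.
 */
static int core_prio_less(struct se *a, struct se *b)
{
	struct se *ra = root_se(a), *rb = root_se(b);

	return (long long)(ra->vruntime - rb->vruntime) < 0;
}

int main(void)
{
	/* se2 -> se21 on cpu0, se6 -> se61 on cpu1, as in the illustration */
	struct se se2  = { .vruntime = 1000 };
	struct se se21 = { .vruntime = 400, .parent = &se2 };
	struct se se6  = { .vruntime = 1200 };
	struct se se61 = { .vruntime = 300, .parent = &se6 };

	printf("task A (se21) wins: %d\n", core_prio_less(&se21, &se61));
	return 0;
}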

Test:
1 wrote a cpu intensive program that does nothing but while(1) in
  main(), let's call it cpuhog (a minimal version is sketched right
  after this list);
2 start 2 cgroups, with one cgroup's cpuset binding to CPU2 and the
  other binding to cpu3. cpu2 and cpu3 are smt siblings on the test VM;
3 enable cpu.tag for the two cgroups;
4 start one cpuhog task in each cgroup;
5 kill both cpuhog tasks after 10 seconds;
6 check each cgroup's cpu usage.
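
The cpuhog mentioned in step 1 can be as simple as the following (my own
minimal version, not a file from the patchset):

/* cpuhog.c: burn one cpu forever, used to check core wide fairness. */
int main(void)
{
	while (1)
		;
	return 0;
}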

If the task is scheduled fairly, then each cgroup's cpu usage should be
around 5s.

With v3, the cpu usages of the two cgroups are sometimes 3s and 7s,
sometimes 1s and 9s.

With the 3 patches applied, the numbers are mostly around 5s, 5s.

Another test is starting two cgroups simultaneously with cpu.tag set,
with one cgroup running: will-it-scale/page_fault1_processes -t 16 -s 30,
the other running: will-it-scale/page_fault2_processes -t 16 -s 30.
With v3, like I said last time, the later started page_fault processes
can't start running. With the 3 patches applied, both run at the
same time with each CPU having a relatively fair score:

output line of 16 page_fault1 processes in 1 second interval:
min:105225 max:131716 total:1872322

output line of 16 page_fault2 processes in 1 second interval:
min:86797 max:110554 total:1581177

Note the values of min and max: the smaller the gap is, the better the
fairness is.

Aubrey,

I haven't been able to run your workload yet...


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-07-22 Thread Aaron Lu
On 2019/7/22 18:26, Aubrey Li wrote:
> The granularity period of util_avg seems too large to decide task priority
> during pick_task(), at least it is in my case, cfs_prio_less() always picked
> core max task, so pick_task() eventually picked idle, which causes this change
> not very helpful for my case.
> 
>  -0 [057] dN..83.716973: __schedule: max: sysbench/2578
> 889050f68600
>  -0 [057] dN..83.716974: __schedule:
> (swapper/5/0;140,0,0) ?< (mysqld/2511;119,1042118143,0)
>  -0 [057] dN..83.716975: __schedule:
> (sysbench/2578;119,96449836,0) ?< (mysqld/2511;119,1042118143,0)
>  -0 [057] dN..83.716975: cfs_prio_less: picked
> sysbench/2578 util_avg: 20 527 -507 <=== here===
>  -0 [057] dN..83.716976: __schedule: pick_task cookie
> pick swapper/5/0 889050f68600

Can you share your setup of the test? I would like to try it locally.
Thanks.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-07-18 Thread Aaron Lu
On Thu, Jul 18, 2019 at 04:27:19PM -0700, Tim Chen wrote:
> 
> 
> On 7/18/19 3:07 AM, Aaron Lu wrote:
> > On Wed, Jun 19, 2019 at 02:33:02PM -0400, Julien Desfossez wrote:
> 
> > 
> > With the below patch on top of v3 that makes use of util_avg to decide
> > which task win, I can do all 8 steps and the final scores of the 2
> > workloads are: 1796191 and 2199586. The score number are not close,
> > suggesting some unfairness, but I can finish the test now...
> 
> Aaron,
> 
> Do you still see high variance in terms of workload throughput that
> was a problem with the previous version?

Any suggestion on how to measure this?
It's not clear to me how Aubrey did his test; I will need to take a look
at sysbench.

> >
> >  
> >  }
> > +
> > +bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
> > +{
> > +   struct sched_entity *sea = &a->se;
> > +   struct sched_entity *seb = &b->se;
> > +   bool samecore = task_cpu(a) == task_cpu(b);
> 
> 
> Probably "samecpu" instead of "samecore" will be more accurate.
> I think task_cpu(a) and task_cpu(b)
> can be different, but still belong to the same cpu core.

Right, definitely, guess I'm brain damaged.

> 
> > +   struct task_struct *p;
> > +   s64 delta;
> > +
> > +   if (samecore) {
> > +   /* vruntime is per cfs_rq */
> > +   while (!is_same_group(sea, seb)) {
> > +   int sea_depth = sea->depth;
> > +   int seb_depth = seb->depth;
> > +
> > +   if (sea_depth >= seb_depth)
> 
> Should this be strictly ">" instead of ">=" ?

Same depth doesn't necessarily mean same group while the purpose here is
to make sure they are in the same cfs_rq. When they are of the same
depth but in different cfs_rqs, we will continue to go up till we reach
rq->cfs.

> 
> > +   sea = parent_entity(sea);
> > +   if (sea_depth <= seb_depth)
> 
> Should use "<" ?

Ditto here.
When they are of the same depth but not in the same cfs_rq, both ses will
move up.

> > +   seb = parent_entity(seb);
> > +   }
> > +
> > +   delta = (s64)(sea->vruntime - seb->vruntime);
> > +   }
> > +

Thanks.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-07-18 Thread Aaron Lu
On Wed, Jun 19, 2019 at 02:33:02PM -0400, Julien Desfossez wrote:
> On 17-Jun-2019 10:51:27 AM, Aubrey Li wrote:
> > The result looks still unfair, and particularly, the variance is too high,
> 
> I just want to confirm that I am also seeing the same issue with a
> similar setup. I also tried with the priority boost fix we previously
> posted, the results are slightly better, but we are still seeing a very
> high variance.
> 
> On average, the results I get for 10 30-seconds runs are still much
> better than nosmt (both sysbench pinned on the same sibling) for the
> memory benchmark, and pretty similar for the CPU benchmark, but the high
> variance between runs is indeed concerning.

I was thinking of using the util_avg signal to decide which task wins in
__prio_less() in the cross cpu case. The reason util_avg is chosen is
that it represents how cpu intensive the task is, so the end result is
that the less cpu intensive task will preempt the more cpu intensive
task.
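
The intent boils down to something like the sketch below (only my
illustration; the struct is a simplified stand-in for the kernel's
se.avg.util_avg, and the comparison function is my paraphrase, not the
actual change in the POC diff further down):

#include <stdio.h>

/* Simplified stand-in for a task's load tracking. */
struct task {
	unsigned long util_avg;	/* how cpu intensive the task has been */
};

/* The less cpu intensive task is preferred in the cross-cpu comparison. */
static int prefer_by_util(const struct task *a, const struct task *b)
{
	return a->util_avg <= b->util_avg;
}

int main(void)
{
	struct task shell = { .util_avg = 20 };
	struct task hog   = { .util_avg = 527 };

	printf("prefer shell over hog: %d\n", prefer_by_util(&shell, &hog));
	return 0;
}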

Here is the test I have done to see how util_avg works
(on a single node, 16 cores, 32 cpus vm):
1 Start tmux and then start 3 windows with each running bash;
2 Place two shells into two different cgroups and both have cpu.tag set;
3 Switch to the 1st tmux window, start
  will-it-scale/page_fault1_processes -t 16 -s 30
  in the first tagged shell;
4 Switch to the 2nd tmux window;
5 Start
  will-it-scale/page_fault1_processes -t 16 -s 30
  in the 2nd tagged shell;
6 Switch to the 3rd tmux window;
7 Do some simple things in the 3rd untagged shell like ls to see if
  untagged task is able to proceed;
8 Wait for the two page_fault workloads to finish.

With v3 here, I can not do step 4 and later steps, i.e. the 16
page_fault1 processes started in step 3 will occupy all 16 cores and
other tasks do not have a chance to run, including tmux, which made
switching tmux window impossible.

With the below patch on top of v3 that makes use of util_avg to decide
which task wins, I can do all 8 steps and the final scores of the 2
workloads are: 1796191 and 2199586. The score numbers are not close,
suggesting some unfairness, but I can finish the test now...

Here is the diff (consider it a POC):

---
 kernel/sched/core.c  | 35 ++-
 kernel/sched/fair.c  | 36 
 kernel/sched/sched.h |  2 ++
 3 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 26fea68f7f54..7557a7bbb481 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -105,25 +105,8 @@ static inline bool prio_less(struct task_struct *a, struct 
task_struct *b)
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-   if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-   u64 a_vruntime = a->se.vruntime;
-   u64 b_vruntime = b->se.vruntime;
-
-   /*
-* Normalize the vruntime if tasks are in different cpus.
-*/
-   if (task_cpu(a) != task_cpu(b)) {
-   b_vruntime -= task_cfs_rq(b)->min_vruntime;
-   b_vruntime += task_cfs_rq(a)->min_vruntime;
-
-   trace_printk("(%d:%Lu,%Lu,%Lu) <> (%d:%Lu,%Lu,%Lu)\n",
-a->pid, a_vruntime, a->se.vruntime, 
task_cfs_rq(a)->min_vruntime,
-b->pid, b_vruntime, b->se.vruntime, 
task_cfs_rq(b)->min_vruntime);
-
-   }
-
-   return !((s64)(a_vruntime - b_vruntime) <= 0);
-   }
+   if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
+   return cfs_prio_less(a, b);
 
return false;
 }
@@ -3663,20 +3646,6 @@ pick_task(struct rq *rq, const struct sched_class 
*class, struct task_struct *ma
if (!class_pick)
return NULL;
 
-   if (!cookie) {
-   /*
-* If class_pick is tagged, return it only if it has
-* higher priority than max.
-*/
-   bool max_is_higher = sched_feat(CORESCHED_STALL_FIX) ?
-max && !prio_less(max, class_pick) :
-max && prio_less(class_pick, max);
-   if (class_pick->core_cookie && max_is_higher)
-   return idle_sched_class.pick_task(rq);
-
-   return class_pick;
-   }
-
/*
 * If class_pick is idle or matches cookie, return early.
 */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 26d29126d6a5..06fb00689db1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10740,3 +10740,39 @@ __init void init_sched_fair_class(void)
 #endif /* SMP */
 
 }
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+   struct sched_entity *sea = &a->se;
+   struct sched_entity *seb = &b->se;
+   bool samecore = task_cpu(a) == task_cpu(b);

Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-05-31 Thread Aaron Lu
On Fri, May 31, 2019 at 02:53:21PM +0800, Aubrey Li wrote:
> On Fri, May 31, 2019 at 2:09 PM Aaron Lu  wrote:
> >
> > On 2019/5/31 13:12, Aubrey Li wrote:
> > > On Fri, May 31, 2019 at 11:01 AM Aaron Lu  
> > > wrote:
> > >>
> > >> This feels like "date" failed to schedule on some CPU
> > >> on time.
> > >>
> > >> My first reaction is: when shell wakes up from sleep, it will
> > >> fork date. If the script is untagged and those workloads are
> > >> tagged and all available cores are already running workload
> > >> threads, the forked date can lose to the running workload
> > >> threads due to __prio_less() can't properly do vruntime comparison
> > >> for tasks on different CPUs. So those idle siblings can't run
> > >> date and are idled instead. See my previous post on this:
> > >> https://lore.kernel.org/lkml/20190429033620.GA128241@aaronlu/
> > >> (Now that I re-read my post, I see that I didn't make it clear
> > >> that se_bash and se_hog are assigned different tags(e.g. hog is
> > >> tagged and bash is untagged).
> > >
> > > Yes, script is untagged. This looks like exactly the problem in you
> > > previous post. I didn't follow that, does that discussion lead to a 
> > > solution?
> >
> > No immediate solution yet.
> >
> > >>
> > >> Siblings being forced idle is expected due to the nature of core
> > >> scheduling, but when two tasks belonging to two siblings are
> > >> fighting for schedule, we should let the higher priority one win.
> > >>
> > >> It used to work on v2 is probably due to we mistakenly
> > >> allow different tagged tasks to schedule on the same core at
> > >> the same time, but that is fixed in v3.
> > >
> > > I have 64 threads running on a 104-CPU server, that is, when the
> >
> > 104-CPU means 52 cores I guess.
> > 64 threads may(should?) spread on all the 52 cores and that is enough
> > to make 'date' suffer.
> 
> 64 threads should spread onto all the 52 cores, but why they can get
> scheduled while untagged "date" can not? Is it because in the current

If 'date' didn't get scheduled, there would be no output at all until
all those workload threads finished :-)

I guess the workload you used is not entirely CPU intensive, or 'date'
can be totally blocked due to START_DEBIT. But note that START_DEBIT
isn't the problem here, cross CPU vruntime comparison is.

> implementation the task with cookie always has higher priority than the
> task without a cookie?

No.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-05-31 Thread Aaron Lu
On 2019/5/31 13:12, Aubrey Li wrote:
> On Fri, May 31, 2019 at 11:01 AM Aaron Lu  wrote:
>>
>> This feels like "date" failed to schedule on some CPU
>> on time.
>>
>> My first reaction is: when shell wakes up from sleep, it will
>> fork date. If the script is untagged and those workloads are
>> tagged and all available cores are already running workload
>> threads, the forked date can lose to the running workload
>> threads due to __prio_less() can't properly do vruntime comparison
>> for tasks on different CPUs. So those idle siblings can't run
>> date and are idled instead. See my previous post on this:
>> https://lore.kernel.org/lkml/20190429033620.GA128241@aaronlu/
>> (Now that I re-read my post, I see that I didn't make it clear
>> that se_bash and se_hog are assigned different tags(e.g. hog is
>> tagged and bash is untagged).
> 
> Yes, script is untagged. This looks like exactly the problem in you
> previous post. I didn't follow that, does that discussion lead to a solution?

No immediate solution yet.

>>
>> Siblings being forced idle is expected due to the nature of core
>> scheduling, but when two tasks belonging to two siblings are
>> fighting for schedule, we should let the higher priority one win.
>>
>> It used to work on v2 is probably due to we mistakenly
>> allow different tagged tasks to schedule on the same core at
>> the same time, but that is fixed in v3.
> 
> I have 64 threads running on a 104-CPU server, that is, when the

104-CPU means 52 cores I guess.
64 threads may(should?) spread on all the 52 cores and that is enough
to make 'date' suffer.

> system has ~40% idle time, and "date" is still failed to be picked
> up onto CPU on time. This may be the nature of core scheduling,
> but it seems to be far from fairness.

Exactly.

> Shouldn't we share the core between (sysbench+gemmbench)
> and (date)? I mean core level sharing instead of  "date" starvation?

We need to make core scheduling fair, but since there is no
immediate solution for cross-CPU vruntime comparison, it's not
done yet.


Re: [RFC PATCH v3 00/16] Core scheduling v3

2019-05-30 Thread Aaron Lu
On 2019/5/30 22:04, Aubrey Li wrote:
> On Thu, May 30, 2019 at 4:36 AM Vineeth Remanan Pillai
>  wrote:
>>
>> Third iteration of the Core-Scheduling feature.
>>
>> This version fixes mostly correctness related issues in v2 and
>> addresses performance issues. Also, addressed some crashes related
>> to cgroups and cpu hotplugging.
>>
>> We have tested and verified that incompatible processes are not
>> selected during schedule. In terms of performance, the impact
>> depends on the workload:
>> - on CPU intensive applications that use all the logical CPUs with
>>   SMT enabled, enabling core scheduling performs better than nosmt.
>> - on mixed workloads with considerable io compared to cpu usage,
>>   nosmt seems to perform better than core scheduling.
> 
> My testing scripts can not be completed on this version. I figured out the
> number of cpu utilization report entry didn't reach my minimal requirement.
> Then I wrote a simple script to verify.
> 
> $ cat test.sh
> #!/bin/sh
> 
> for i in `seq 1 10`
> do
> echo `date`, $i
> sleep 1
> done
> 

Is the shell put into some cgroup and assigned some tag, or simply untagged?

> 
> Normally it works as below:
> 
> Thu May 30 14:13:40 CST 2019, 1
> Thu May 30 14:13:41 CST 2019, 2
> Thu May 30 14:13:42 CST 2019, 3
> Thu May 30 14:13:43 CST 2019, 4
> Thu May 30 14:13:44 CST 2019, 5
> Thu May 30 14:13:45 CST 2019, 6
> Thu May 30 14:13:46 CST 2019, 7
> Thu May 30 14:13:47 CST 2019, 8
> Thu May 30 14:13:48 CST 2019, 9
> Thu May 30 14:13:49 CST 2019, 10
> 
> When the system was running 32 sysbench threads and
> 32 gemmbench threads, it worked as below(the system
> has ~38% idle time)

Are the two workloads assigned different tags?
And how many cores/threads do you have?

> Thu May 30 14:14:20 CST 2019, 1
> Thu May 30 14:14:21 CST 2019, 2
> Thu May 30 14:14:22 CST 2019, 3
> Thu May 30 14:14:24 CST 2019, 4 <===x=
> Thu May 30 14:14:25 CST 2019, 5
> Thu May 30 14:14:26 CST 2019, 6
> Thu May 30 14:14:28 CST 2019, 7 <===x=
> Thu May 30 14:14:29 CST 2019, 8
> Thu May 30 14:14:31 CST 2019, 9 <===x=
> Thu May 30 14:14:34 CST 2019, 10 <===x=

This feels like "date" failed to schedule on some CPU
on time.

> And it got worse when the system was running 64/64 case,
> the system still had ~3% idle time
> Thu May 30 14:26:40 CST 2019, 1
> Thu May 30 14:26:46 CST 2019, 2
> Thu May 30 14:26:53 CST 2019, 3
> Thu May 30 14:27:01 CST 2019, 4
> Thu May 30 14:27:03 CST 2019, 5
> Thu May 30 14:27:11 CST 2019, 6
> Thu May 30 14:27:31 CST 2019, 7
> Thu May 30 14:27:32 CST 2019, 8
> Thu May 30 14:27:41 CST 2019, 9
> Thu May 30 14:27:56 CST 2019, 10
> 
> Any thoughts?

My first reaction is: when the shell wakes up from sleep, it will
fork date. If the script is untagged while those workloads are
tagged and all available cores are already running workload
threads, the forked date can lose to the running workload
threads because __prio_less() can't properly do vruntime comparison
for tasks on different CPUs. So those idle siblings can't run
date and are idled instead. See my previous post on this:

https://lore.kernel.org/lkml/20190429033620.GA128241@aaronlu/
(Now that I re-read my post, I see that I didn't make it clear
that se_bash and se_hog are assigned different tags, e.g. hog is
tagged and bash is untagged.)

Siblings being forced idle is expected due to the nature of core
scheduling, but when two tasks belonging to two siblings are
fighting to be scheduled, we should let the higher priority one win.

That it used to work on v2 is probably because we mistakenly
allowed differently tagged tasks to be scheduled on the same core at
the same time, but that is fixed in v3.


Re: [RFC PATCH v2 00/17] Core scheduling v2

2019-05-08 Thread Aaron Lu
On Wed, May 08, 2019 at 01:49:09PM -0400, Julien Desfossez wrote:
> On 08-May-2019 10:30:09 AM, Aaron Lu wrote:
> > On Mon, May 06, 2019 at 03:39:37PM -0400, Julien Desfossez wrote:
> > > On 29-Apr-2019 11:53:21 AM, Aaron Lu wrote:
> > > > This is what I have used to make sure no two unmatched tasks being
> > > > scheduled on the same core: (on top of v1, I thinks it's easier to just
> > > > show the diff instead of commenting on various places of the patches :-)
> > > 
> > > We imported this fix in v2 and made some small changes and optimizations
> > > (with and without Peter’s fix from https://lkml.org/lkml/2019/4/26/658)
> > > and in both cases, the performance problem where the core can end up
> > 
> > By 'core', do you mean a logical CPU(hyperthread) or the entire core?
> No I really meant the entire core.
> 
> I’m sorry, I should have added a little bit more context. This relates
> to a performance issue we saw in v1 and discussed here:
> https://lore.kernel.org/lkml/20190410150116.gi2...@worktop.programming.kicks-ass.net/T/#mb9f1f54a99bac468fc5c55b06a9da306ff48e90b
> 
> We proposed a fix that solved this, Peter came up with a better one
> (https://lkml.org/lkml/2019/4/26/658), but if we add your isolation fix
> as posted above, the same problem reappears. Hope this clarifies your
> ask.

It's clear now, thanks.
I don't immediately see how my isolation fix would make your fix stop
working; I will need to check. But I'm busy with other stuff, so it will
take a while.

> 
> I hope that we did not miss anything crucial while integrating your fix
> on top of v2 + Peter’s fix. The changes are conceptually similar, but we
> refactored it slightly to make the logic clear. Please have a look and
> let us know

I suppose you already have a branch that has all the bits there? I
wonder if you can share that branch somewhere so I can start working on
top of it, to make sure we are on the same page.

Also, it would be good if you could share the workload, cmdline options,
how many workers to start, etc., to reproduce this issue.

Thanks.


Re: [RFC PATCH v2 00/17] Core scheduling v2

2019-05-07 Thread Aaron Lu
On Mon, May 06, 2019 at 03:39:37PM -0400, Julien Desfossez wrote:
> On 29-Apr-2019 11:53:21 AM, Aaron Lu wrote:
> > This is what I have used to make sure no two unmatched tasks being
> > scheduled on the same core: (on top of v1, I thinks it's easier to just
> > show the diff instead of commenting on various places of the patches :-)
> 
> We imported this fix in v2 and made some small changes and optimizations
> (with and without Peter’s fix from https://lkml.org/lkml/2019/4/26/658)
> and in both cases, the performance problem where the core can end up

By 'core', do you mean a logical CPU(hyperthread) or the entire core?

> idle with tasks in its runqueues came back.

Assuming you meant a hyperthread, the question is: when a hyperthread
is idle with tasks sitting in its runqueue, do these tasks match with the
other hyperthread's rq->curr? If so, then it is a problem that needs to
be addressed; if not, then this is due to the constraint imposed by the
L1TF mitigation.

Thanks.


Re: [RFC PATCH v2 13/17] sched: Add core wide task selection and scheduling.

2019-04-29 Thread Aaron Lu
On Tue, Apr 23, 2019 at 04:18:18PM +, Vineeth Remanan Pillai wrote:
> +// XXX fairness/fwd progress conditions
> +static struct task_struct *
> +pick_task(struct rq *rq, const struct sched_class *class, struct task_struct 
> *max)
> +{
> + struct task_struct *class_pick, *cookie_pick;
> + unsigned long cookie = 0UL;
> +
> + /*
> +  * We must not rely on rq->core->core_cookie here, because we fail to 
> reset
> +  * rq->core->core_cookie on new picks, such that we can detect if we 
> need
> +  * to do single vs multi rq task selection.
> +  */
> +
> + if (max && max->core_cookie) {
> + WARN_ON_ONCE(rq->core->core_cookie != max->core_cookie);
> + cookie = max->core_cookie;
> + }
> +
> + class_pick = class->pick_task(rq);
> + if (!cookie)
> + return class_pick;
> +
> + cookie_pick = sched_core_find(rq, cookie);
> + if (!class_pick)
> + return cookie_pick;
> +
> + /*
> +  * If class > max && class > cookie, it is the highest priority task on
> +  * the core (so far) and it must be selected, otherwise we must go with
> +  * the cookie pick in order to satisfy the constraint.
> +  */
> + if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, 
> class_pick))

It appears to me that the cpu_prio_less(cookie_pick, class_pick) check
isn't needed.

If cookie_pick is the idle task, then cpu_prio_less(cookie_pick, class_pick)
is always true;
If cookie_pick is not the idle task and has the same sched class as
class_pick, then class_pick is the best candidate to run according to
their sched class. In this case, cpu_prio_less(cookie_pick, class_pick)
shouldn't return false or it feels like a bug;
If cookie_pick is not the idle task and has a different sched class than
class_pick:
 - if cookie_pick's sched class has higher priority than class_pick's
   sched class, then cookie_pick should have been selected in a previous
   sched class iteration; and since its cookie matches with max,
   everything should have been finished already;
 - if cookie_pick's sched class has lower priority than class_pick's
   sched class, then cpu_prio_less(cookie_pick, class_pick) will still
   return true.

So it looks like cpu_prio_less(cookie_pick, class_pick) should always
return true and is thus not needed.

> + return class_pick;
> +
> + return cookie_pick;
> +}


Re: [RFC PATCH v2 11/17] sched: Basic tracking of matching tasks

2019-04-29 Thread Aaron Lu
On Tue, Apr 23, 2019 at 04:18:16PM +, Vineeth Remanan Pillai wrote:
> +/*
> + * Find left-most (aka, highest priority) task matching @cookie.
> + */
> +struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
> +{
> + struct rb_node *node = rq->core_tree.rb_node;
> + struct task_struct *node_task, *match;
> +
> + /*
> +  * The idle task always matches any cookie!
> +  */
> + match = idle_sched_class.pick_task(rq);
> +
> + while (node) {
> + node_task = container_of(node, struct task_struct, core_node);
> +
> + if (node_task->core_cookie < cookie) {
> + node = node->rb_left;

Should go right here?

> + } else if (node_task->core_cookie > cookie) {
> + node = node->rb_right;

And left here?

> + } else {
> + match = node_task;
> + node = node->rb_left;
> + }
> + }
> +
> + return match;
> +}


Re: [RFC PATCH v2 09/17] sched: Introduce sched_class::pick_task()

2019-04-28 Thread Aaron Lu
On Tue, Apr 23, 2019 at 04:18:14PM +, Vineeth Remanan Pillai wrote:
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c055bad249a9..45d86b862750 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4132,7 +4132,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct 
> sched_entity *curr)
>* Avoid running the skip buddy, if running something else can
>* be done without getting too unfair.
>*/
> - if (cfs_rq->skip == se) {
> + if (cfs_rq->skip && cfs_rq->skip == se) {
>   struct sched_entity *second;
>  
>   if (se == curr) {
> @@ -4150,13 +4150,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct 
> sched_entity *curr)
>   /*
>* Prefer last buddy, try to return the CPU to a preempted task.
>*/
> - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
> + if (left && cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 
> 1)
>   se = cfs_rq->last;
>  
>   /*
>* Someone really wants this to run. If it's not unfair, run it.
>*/
> - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
> + if (left && cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 
> 1)
>   se = cfs_rq->next;
>  
>   clear_buddies(cfs_rq, se);
> @@ -6937,6 +6937,37 @@ static void check_preempt_wakeup(struct rq *rq, struct 
> task_struct *p, int wake_
>   set_last_buddy(se);
>  }
>  
> +static struct task_struct *
> +pick_task_fair(struct rq *rq)
> +{
> + struct cfs_rq *cfs_rq = &rq->cfs;
> + struct sched_entity *se;
> +
> + if (!cfs_rq->nr_running)
> + return NULL;
> +
> + do {
> + struct sched_entity *curr = cfs_rq->curr;
> +
> + se = pick_next_entity(cfs_rq, NULL);
> +
> + if (!(se || curr))
> + return NULL;

I think you have already avoided the null pointer access bug in
the above pick_next_entity() by doing multiple checks for null pointers:
cfs_rq->skip and left.

An alternative way to fix the null pointer access bug: if curr is the
only runnable entity in this cfs_rq, there is no need to call
pick_next_entity(cfs_rq, NULL) since the rbtree is empty. This way
pick_next_entity() doesn't need to change. Something like:

do {
struct sched_entity *curr = cfs_rq->curr;

if (curr && curr->on_rq && cfs_rq->nr_running == 1)
se = NULL;
else
se = pick_next_entity(cfs_rq, NULL);

/* the following code doesn't change */
> +
> + if (curr) {
> + if (se && curr->on_rq)
> + update_curr(cfs_rq);
> +
> + if (!se || entity_before(curr, se))
> + se = curr;
> + }
> +
> + cfs_rq = group_cfs_rq(se);
> + } while (cfs_rq);
> +
> + return task_of(se);
> +}

There is another problem I'm thinking about: suppose cpu0 and cpu1 are
siblings, tasks A and B are runnable on cpu0 and curr is A. When cpu1
schedules, pick_task_fair() will also be called for cpu0 to decide which
CPU's task gets to preempt the other.

When pick_task_fair() is called for cpu0 because cpu1 schedules: curr
(i.e. A) may have only run a few nanoseconds, and thus can have a higher
vruntime than B. So we choose B to fight with the task chosen from cpu1.
If B wins, we will schedule B on cpu0. If B loses, we will probably
schedule idle on cpu0 (if the cookies don't match). Either way, A didn't
get its share. We probably want to make sure a task has run for at least
some time before it is considered for preemption.
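
One way to express that (just a sketch of the idea in plain C, with made-up
names like MIN_RUN_NS; not a tested kernel patch): only treat curr as a
candidate for the core wide comparison after it has accumulated some
minimum run time since it was picked.

#include <stdio.h>

#define MIN_RUN_NS 1000000ULL	/* e.g. 1ms before curr may be preempted */

struct task {
	unsigned long long sum_exec;	/* total exec time so far, in ns */
	unsigned long long pick_exec;	/* sum_exec when it was last picked */
};

/* Has curr run long enough to be considered for preemption? */
static int preemptible(const struct task *curr)
{
	return curr->sum_exec - curr->pick_exec >= MIN_RUN_NS;
}

int main(void)
{
	/* A was picked very recently and has only run ~2us */
	struct task a = { .sum_exec = 5000000ULL, .pick_exec = 4998000ULL };

	printf("consider preempting A: %d\n", preemptible(&a));
	return 0;
}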


Re: [RFC PATCH v2 00/17] Core scheduling v2

2019-04-28 Thread Aaron Lu
On Tue, Apr 23, 2019 at 06:45:27PM +, Vineeth Remanan Pillai wrote:
> >> - Processes with different tags can still share the core
> 
> > I may have missed something... Could you explain this statement?
> 
> > This, to me, is the whole point of the patch series. If it's not
> > doing this then ... what?
> 
> What I meant was, the patch needs some more work to be accurate.
> There are some race conditions where the core violation can still
> happen. In our testing, we saw around 1 to 5% of the time being
> shared with incompatible processes. One example of this happening
> is as follows(let cpu 0 and 1 be siblings):
> - cpu 0 selects a process with a cookie
> - cpu 1 selects a higher priority process without cookie
> - Selection process restarts for cpu 0 and it might select a
>   process with cookie but with lesser priority.
> - Since it is lesser priority, the logic in pick_next_task
>   doesn't compare again for the cookie(trusts pick_task) and
>   proceeds.
> 
> This is one of the scenarios that we saw from traces, but there
> might be other race conditions as well. Fix seems a little
> involved and We are working on that.

This is what I have used to make sure no two unmatched tasks are
scheduled on the same core (on top of v1; I think it's easier to just
show the diff instead of commenting on various places of the patches :-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cb24a0141e57..0cdb1c6a00a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -186,6 +186,10 @@ struct task_struct *sched_core_find(struct rq *rq, 
unsigned long cookie)
 */
match = idle_sched_class.pick_task(rq);
 
+   /* TODO: untagged tasks are not in the core tree */
+   if (!cookie)
+   goto out;
+
while (node) {
node_task = container_of(node, struct task_struct, core_node);
 
@@ -199,6 +203,7 @@ struct task_struct *sched_core_find(struct rq *rq, unsigned 
long cookie)
}
}
 
+out:
return match;
 }
 
@@ -3634,6 +3639,8 @@ static inline bool cookie_match(struct task_struct *a, 
struct task_struct *b)
 }
 
 // XXX fairness/fwd progress conditions
+// when max is unset, return class_pick;
+// when max is set, return cookie_pick unless class_pick has higher priority.
 static struct task_struct *
 pick_task(struct rq *rq, const struct sched_class *class, struct task_struct 
*max)
 {
@@ -3652,7 +3659,19 @@ pick_task(struct rq *rq, const struct sched_class 
*class, struct task_struct *ma
}
 
class_pick = class->pick_task(rq);
-   if (!cookie)
+   /*
+* we can only return class_pick here when max is not set.
+*
+* when max is set and cookie is 0, we still have to check if
+* class_pick's cookie matches with max, or we can end up picking
+* an unmatched task, e.g. max is untagged and class_pick here
+* is tagged.
+*/
+   if (!cookie && !max)
+   return class_pick;
+
+   /* in case class_pick matches with max, no need to check priority */
+   if (class_pick && cookie_match(class_pick, max))
return class_pick;
 
cookie_pick = sched_core_find(rq, cookie);
@@ -3663,8 +3682,11 @@ pick_task(struct rq *rq, const struct sched_class 
*class, struct task_struct *ma
 * If class > max && class > cookie, it is the highest priority task on
 * the core (so far) and it must be selected, otherwise we must go with
 * the cookie pick in order to satisfy the constraint.
+*
+* class_pick and cookie_pick are on the same cpu so use cpu_prio_less()
+* max and class_pick are on different cpus so use core_prio_less()
 */
-   if (cpu_prio_less(cookie_pick, class_pick) && cpu_prio_less(max, 
class_pick))
+   if (cpu_prio_less(cookie_pick, class_pick) && core_prio_less(max, 
class_pick))
return class_pick;
 
return cookie_pick;
@@ -3731,8 +3753,17 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 
rq_i->core_pick = NULL;
 
-   if (i != cpu)
+   if (i != cpu) {
update_rq_clock(rq_i);
+   /*
+* we are going to pick tasks for both cpus, if our
+* sibling is idle and we have core_cookie set, now
+* is the time to clear/reset it so that we can do
+* an unconstrained pick.
+*/
+   if (is_idle_task(rq_i->curr) && rq_i->core->core_cookie)
+   rq_i->core->core_cookie = 0;
+   }
}
 
/*
@@ -3794,20 +3825,42 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 *
 * NOTE: this is a linear max-filter and is thus bounded
 * in execution time.

Re: [RFC PATCH v2 11/17] sched: Basic tracking of matching tasks

2019-04-28 Thread Aaron Lu
On Tue, Apr 23, 2019 at 04:18:16PM +, Vineeth Remanan Pillai wrote:
> +/*
> + * l(a,b)
> + * le(a,b) := !l(b,a)
> + * g(a,b)  := l(b,a)
> + * ge(a,b) := !l(a,b)
> + */
> +
> +/* real prio, less is less */
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, 
> bool core_cmp)
> +{
> + u64 vruntime;
> +
> + int pa = __task_prio(a), pb = __task_prio(b);
> +
> + if (-pa < -pb)
> + return true;
> +
> + if (-pb < -pa)
> + return false;
> +
> + if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
> + return !dl_time_before(a->dl.deadline, b->dl.deadline);
> +
> + vruntime = b->se.vruntime;
> + if (core_cmp) {
> + vruntime -= task_cfs_rq(b)->min_vruntime;
> + vruntime += task_cfs_rq(a)->min_vruntime;
> + }
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) <= 0);
> +
> + return false;
> +}

This unfortunately still doesn't work.

Consider the following task layout on two sibling CPUs(cpu0 and cpu1):

rq0.cfs_rq        rq1.cfs_rq
    |                 |
 se_bash           se_hog

se_hog is the sched_entity for a cpu intensive task and se_bash is the
sched_entity for bash.

There are two problems:
1 START_DEBIT
When the user executes a command through bash, say ls, bash will fork.
The newly forked ls' vruntime is set in the future due to START_DEBIT.
This makes 'ls' lose in __prio_less() when compared with hog, whose
vruntime is very likely the same as its cfs_rq's min_vruntime.

This is OK since we do not want a forked process to starve already
running ones. The problem is, since hog keeps running, its vruntime
stays in sync with its cfs_rq's min_vruntime. OTOH, 'ls' cannot run, so
its cfs_rq's min_vruntime doesn't advance, making 'ls' always lose to
hog.

2 who schedules, who wins
So I disabled START_DEBIT for testing purposes. When cpu0 schedules,
ls can win when both sched_entities' vruntime equals their cfs_rq's
min_vruntime. So does hog: when cpu1 schedules, hog can preempt ls in
the same way. The end result is that an interactive task can lose to a
cpu-intensive task and ls can feel "dead".

I haven't figured out a way to solve this yet. A core wide cfs_rq's
min_vruntime can probably solve this. Your suggestions are appreciated.


Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

2019-04-16 Thread Aaron Lu
On Tue, Apr 02, 2019 at 10:28:12AM +0200, Peter Zijlstra wrote:
> On Tue, Apr 02, 2019 at 02:46:13PM +0800, Aaron Lu wrote:
...
> > Perhaps we can test if max is on the same cpu as class_pick and then
> > use cpu_prio_less() or core_prio_less() accordingly here, or just
> > replace core_prio_less(max, p) with cpu_prio_less(max, p) in
> > pick_next_task(). The 2nd obviously breaks the comment of
> > core_prio_less() though: /* cannot compare vruntime across CPUs */.
> 
> Right, so as the comment states, you cannot directly compare vruntime
> across CPUs, doing that is completely buggered.
> 
> That also means that the cpu_prio_less(max, class_pick) in pick_task()
> is buggered, because there is no saying @max is on this CPU to begin
> with.

I find it difficult to decide which of two fair_sched_class tasks has
the higher priority when they belong to different CPUs.

Please see below.

> Another approach would be something like the below:
> 
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
>   */
>  
>  /* real prio, less is less */
> -static inline bool __prio_less(struct task_struct *a, struct task_struct *b, 
> bool runtime)
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, 
> u64 vruntime)
>  {
>   int pa = __task_prio(a), pb = __task_prio(b);
>  
> @@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
>   if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
>   return !dl_time_before(a->dl.deadline, b->dl.deadline);
>  
> - if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
> - return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) < 0);
>  
>   return false;
>  }
>  
>  static inline bool cpu_prio_less(struct task_struct *a, struct task_struct 
> *b)
>  {
> - return __prio_less(a, b, true);
> + return __prio_less(a, b, b->se.vruntime);
>  }
>  
>  static inline bool core_prio_less(struct task_struct *a, struct task_struct 
> *b)
>  {
> - /* cannot compare vruntime across CPUs */
> - return __prio_less(a, b, false);
> + u64 vruntime = b->se.vruntime;
> +
> + vruntime -= task_rq(b)->cfs.min_vruntime;
> + vruntime += task_rq(a)->cfs.min_vruntime

(I used task_cfs_rq() instead of task_rq() above.)

Consider the following scenario:
(assume cpu0 and cpu1 are siblings of core0)

1 a cpu-intensive task belonging to cgroupA running on cpu0;
2 launch 'ls' from a shell (bash) which belongs to cgroupB;
3 'ls' is blocked for a long time (if not forever).

Per my limited understanding: the launch of 'ls' causes bash to fork,
and the newly forked process' vruntime will be about 6ms (probably not
precise) ahead of its cfs_rq's min_vruntime due to START_DEBIT. Since
there is no other running task on that cfs_rq, the cfs_rq's
min_vruntime has no chance to advance, so the newly forked process will
always stay 6ms ahead of its cfs_rq and will always 'lose' to the
cpu-intensive task belonging to cgroupA in core_prio_less().

No idea how to solve this...

> +
> + return __prio_less(a, b, vruntime);
>  }
>  
>  static inline bool __sched_core_less(struct task_struct *a, struct 
> task_struct *b)


Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

2019-04-10 Thread Aaron Lu
On Wed, Apr 10, 2019 at 04:44:18PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 10, 2019 at 12:36:33PM +0800, Aaron Lu wrote:
> > On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> > > Now that we have accumulated quite a number of different fixes to your 
> > > orginal
> > > posted patches.  Would you like to post a v2 of the core scheduler with 
> > > the fixes?
> > 
> > One more question I'm not sure: should a task with cookie=0, i.e. tasks
> > that are untagged, be allowed to scheduled on the the same core with
> > another tagged task?
> 
> That was not meant to be possible.

Good to know this.

> > The current patch seems to disagree on this, e.g. in pick_task(),
> > if max is already chosen but max->core_cookie == 0, then we didn't care
> > about cookie and simply use class_pick for the other cpu. This means we
> > could schedule two tasks with different cookies(one is zero and the
> > other can be tagged).
> 
> When core_cookie==0 we shouldn't schedule the other siblings at all.

Not even with another untagged task?

I was thinking of leaving host side tasks untagged (kernel threads,
init and other system daemons or utilities, etc.) and tagging tenant
tasks. Then at least two untagged tasks can be scheduled on the same
core.

Kindly let me know if you see a problem with this.

> > But then sched_core_find() only allow idle task to match with any tagged
> > tasks(we didn't place untagged tasks to the core tree of course :-).
> > 
> > Thoughts? Do I understand this correctly? If so, I think we probably
> > want to make this clear before v2. I personally feel, we shouldn't allow
> > untagged tasks(like kernel threads) to match with tagged tasks.
> 
> Agreed, cookie should always match or idle.

Thanks a lot for the clarification.


Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

2019-04-10 Thread Aaron Lu
On Wed, Apr 10, 2019 at 10:18:10PM +0800, Aubrey Li wrote:
> On Wed, Apr 10, 2019 at 12:36 PM Aaron Lu  wrote:
> >
> > On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> > > Now that we have accumulated quite a number of different fixes to your 
> > > orginal
> > > posted patches.  Would you like to post a v2 of the core scheduler with 
> > > the fixes?
> >
> > One more question I'm not sure: should a task with cookie=0, i.e. tasks
> > that are untagged, be allowed to scheduled on the the same core with
> > another tagged task?
> >
> > The current patch seems to disagree on this, e.g. in pick_task(),
> > if max is already chosen but max->core_cookie == 0, then we didn't care
> > about cookie and simply use class_pick for the other cpu. This means we
> > could schedule two tasks with different cookies(one is zero and the
> > other can be tagged).
> >
> > But then sched_core_find() only allow idle task to match with any tagged
> > tasks(we didn't place untagged tasks to the core tree of course :-).
> >
> > Thoughts? Do I understand this correctly? If so, I think we probably
> > want to make this clear before v2. I personally feel, we shouldn't allow
> > untagged tasks(like kernel threads) to match with tagged tasks.
> 
> Does it make sense if we take untagged tasks as hypervisor, and different
> cookie tasks as different VMs? Isolation is done between VMs, not between
> VM and hypervisor.
> 
> Did you see anything harmful if an untagged task and a tagged task
> run simultaneously on the same core?

The VM can see the hypervisor's data then, I think.
We probably do not want that to happen.


Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

2019-04-09 Thread Aaron Lu
On Tue, Apr 09, 2019 at 11:09:45AM -0700, Tim Chen wrote:
> Now that we have accumulated quite a number of different fixes to your orginal
> posted patches.  Would you like to post a v2 of the core scheduler with the 
> fixes?

One more question I'm not sure about: should a task with cookie=0,
i.e. an untagged task, be allowed to be scheduled on the same core as a
tagged task?

The current patch seems to disagree on this, e.g. in pick_task(),
if max is already chosen but max->core_cookie == 0, then we don't care
about the cookie and simply use class_pick for the other cpu. This
means we could schedule two tasks with different cookies (one is zero
and the other tagged).

But then sched_core_find() only allows the idle task to match with
tagged tasks (we don't place untagged tasks in the core tree, of
course :-).

Thoughts? Do I understand this correctly? If so, I think we probably
want to make this clear before v2. I personally feel we shouldn't allow
untagged tasks (like kernel threads) to match with tagged tasks.


Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

2019-04-05 Thread Aaron Lu
On Tue, Apr 02, 2019 at 10:28:12AM +0200, Peter Zijlstra wrote:
> Another approach would be something like the below:
> 
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
>   */
>  
>  /* real prio, less is less */
> -static inline bool __prio_less(struct task_struct *a, struct task_struct *b, 
> bool runtime)
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, 
> u64 vruntime)
>  {
>   int pa = __task_prio(a), pb = __task_prio(b);
>  
> @@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
>   if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
>   return !dl_time_before(a->dl.deadline, b->dl.deadline);
>  
> - if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
> - return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) < 0);
 ~~~
I think <= should be used here, so that two tasks with the same
vruntime compare as false. Otherwise we could bounce between two tasks
with different tags, with one set as max in the first round and the
other set as max in the next round, and the CPU would get stuck in
__schedule() with irqs disabled.
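
To be explicit, the change I'm suggesting is just this (the rest of the
hunk stays the same):

	if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
		return !((s64)(a->se.vruntime - vruntime) <= 0);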

>  
>   return false;
>  }
>  
>  static inline bool cpu_prio_less(struct task_struct *a, struct task_struct 
> *b)
>  {
> - return __prio_less(a, b, true);
> + return __prio_less(a, b, b->se.vruntime);
>  }
>  
>  static inline bool core_prio_less(struct task_struct *a, struct task_struct 
> *b)
>  {
> - /* cannot compare vruntime across CPUs */
> - return __prio_less(a, b, false);
> + u64 vruntime = b->se.vruntime;
> +
> + vruntime -= task_rq(b)->cfs.min_vruntime;
> + vruntime += task_rq(a)->cfs.min_vruntime

After some testing, I figured task_cfs_rq() should be used instead of
task_rq() :-)

With the two changes (and some other minor ones that still need more
time to sort out), I'm now able to run 2 full-CPU kbuilds in 2 tagged
cgroups. Previously, the system would hang pretty soon after I started
a kbuild in any tagged cgroup (presumably CPUs stuck in __schedule()
with irqs disabled).

And no warning appeared due to two tasks with different tags getting
scheduled on the same CPU.

Thanks,
Aaron

> +
> + return __prio_less(a, b, vruntime);
>  }
>  
>  static inline bool __sched_core_less(struct task_struct *a, struct 
> task_struct *b)


Re: [RFC][PATCH 13/16] sched: Add core wide task selection and scheduling.

2019-04-02 Thread Aaron Lu
On Tue, Apr 02, 2019 at 10:28:12AM +0200, Peter Zijlstra wrote:
> Another approach would be something like the below:
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -87,7 +87,7 @@ static inline int __task_prio(struct tas
>   */
>  
>  /* real prio, less is less */
> -static inline bool __prio_less(struct task_struct *a, struct task_struct *b, 
> bool runtime)
> +static inline bool __prio_less(struct task_struct *a, struct task_struct *b, 
> u64 vruntime)
>  {
>   int pa = __task_prio(a), pb = __task_prio(b);
>  
> @@ -104,21 +104,25 @@ static inline bool __prio_less(struct ta
>   if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
>   return !dl_time_before(a->dl.deadline, b->dl.deadline);
>  
> - if (pa == MAX_RT_PRIO + MAX_NICE && runtime) /* fair */
> - return !((s64)(a->se.vruntime - b->se.vruntime) < 0);
> + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
> + return !((s64)(a->se.vruntime - vruntime) < 0);
>  
>   return false;
>  }
>  
>  static inline bool cpu_prio_less(struct task_struct *a, struct task_struct 
> *b)
>  {
> - return __prio_less(a, b, true);
> + return __prio_less(a, b, b->se.vruntime);
>  }
>  
>  static inline bool core_prio_less(struct task_struct *a, struct task_struct 
> *b)
>  {
> - /* cannot compare vruntime across CPUs */
> - return __prio_less(a, b, false);
> + u64 vruntime = b->se.vruntime;
> +
> + vruntime -= task_rq(b)->cfs.min_vruntime;
> + vruntime += task_rq(a)->cfs.min_vruntime
> +
> + return __prio_less(a, b, vruntime);
>  }
>  
>  static inline bool __sched_core_less(struct task_struct *a, struct 
> task_struct *b)

Brilliant, I like this approach: it makes core_prio_less() work across
CPUs. I tested it together with changing
cpu_prio_less(max, class_pick) to core_prio_less(max, class_pick) in
pick_task(), and the problem is gone :-)

I verified with the debug code below:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cb24a0141e57..50658e79363f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3832,6 +3832,14 @@ next_class:;
 
WARN_ON_ONCE(!rq_i->core_pick);
 
+   if (rq->core->core_cookie && rq_i->core_pick->core_cookie &&
+   rq->core->core_cookie != rq_i->core_pick->core_cookie) {
+   trace_printk("expect 0x%lx, cpu%d got 0x%lx\n",
+   rq->core->core_cookie, i,
+   rq_i->core_pick->core_cookie);
+   WARN_ON_ONCE(1);
+   }
+
rq_i->core_pick->core_occupation = occ;
 
if (i == cpu)
-- 
2.19.1.3.ge56e4f7



Re: [RFC][PATCH 00/16] sched: Core scheduling

2019-03-26 Thread Aaron Lu
On Tue, Mar 26, 2019 at 03:32:12PM +0800, Aaron Lu wrote:
> On Fri, Mar 08, 2019 at 11:44:01AM -0800, Subhra Mazumdar wrote:
> > 
> > On 2/22/19 4:45 AM, Mel Gorman wrote:
> > >On Mon, Feb 18, 2019 at 09:49:10AM -0800, Linus Torvalds wrote:
> > >>On Mon, Feb 18, 2019 at 9:40 AM Peter Zijlstra  
> > >>wrote:
> > >>>However; whichever way around you turn this cookie; it is expensive and 
> > >>>nasty.
> > >>Do you (or anybody else) have numbers for real loads?
> > >>
> > >>Because performance is all that matters. If performance is bad, then
> > >>it's pointless, since just turning off SMT is the answer.
> > >>
> > >I tried to do a comparison between tip/master, ht disabled and this series
> > >putting test workloads into a tagged cgroup but unfortunately it failed
> > >
> > >[  156.978682] BUG: unable to handle kernel NULL pointer dereference at 
> > >0058
> > >[  156.986597] #PF error: [normal kernel read fault]
> > >[  156.991343] PGD 0 P4D 0
> > >[  156.993905] Oops:  [#1] SMP PTI
> > >[  156.997438] CPU: 15 PID: 0 Comm: swapper/15 Not tainted 
> > >5.0.0-rc7-schedcore-v1r1 #1
> > >[  157.005161] Hardware name: SGI.COM C2112-4GP3/X10DRT-P-Series, BIOS 
> > >2.0a 05/09/2016
> > >[  157.012896] RIP: 0010:wakeup_preempt_entity.isra.70+0x9/0x50
> > >[  157.018613] Code: 00 be c0 82 60 00 e9 86 02 1a 00 66 0f 1f 44 00 00 48 
> > >c1 e7 03 be c0 80 60 00 e9 72 02 1a 00 66 90 0f 1f 44 00 00
> > >  53 48 89 fb <48> 2b 5e 58 48 85 db 7e 2c 48 81 3e 00 00 10 00 8b 05 a9 
> > > b7 19 01
> > >[  157.037544] RSP: 0018:c9000c5bbde8 EFLAGS: 00010086
> > >[  157.042819] RAX: 88810f5f6a00 RBX: 0001547f175c RCX: 
> > >0001
> > >[  157.050015] RDX: 88bf3bdb0a40 RSI:  RDI: 
> > >0001547f175c
> > >[  157.057215] RBP: 88bf7fae32c0 R08: 0001e358 R09: 
> > >88810fb9f000
> > >[  157.064410] R10: c9000c5bbe08 R11: 88810fb9f5c4 R12: 
> > >
> > >[  157.071611] R13: 88bf4e3ea0c0 R14:  R15: 
> > >88bf4e3ea7a8
> > >[  157.078814] FS:  () GS:88bf7f5c() 
> > >knlGS:
> > >[  157.086977] CS:  0010 DS:  ES:  CR0: 80050033
> > >[  157.092779] CR2: 0058 CR3: 0220e005 CR4: 
> > >003606e0
> > >[  157.099979] DR0:  DR1:  DR2: 
> > >
> > >[  157.109529] DR3:  DR6: fffe0ff0 DR7: 
> > >0400
> > >[  157.119058] Call Trace:
> > >[  157.123865]  pick_next_entity+0x61/0x110
> > >[  157.130137]  pick_task_fair+0x4b/0x90
> > >[  157.136124]  __schedule+0x365/0x12c0
> > >[  157.141985]  schedule_idle+0x1e/0x40
> > >[  157.147822]  do_idle+0x166/0x280
> > >[  157.153275]  cpu_startup_entry+0x19/0x20
> > >[  157.159420]  start_secondary+0x17a/0x1d0
> > >[  157.165568]  secondary_startup_64+0xa4/0xb0
> > >[  157.171985] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs 
> > >msr intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp 
> > >kvm_intel kvm ipmi_ssif irqbypass crc32_pclmul ghash_clmulni_intel ixgbe 
> > >aesni_intel xfrm_algo iTCO_wdt joydev iTCO_vendor_support libphy igb 
> > >aes_x86_64 crypto_simd ptp cryptd mei_me mdio pps_core ioatdma glue_helper 
> > >pcspkr ipmi_si lpc_ich i2c_i801 mei dca ipmi_devintf ipmi_msghandler 
> > >acpi_pad pcc_cpufreq button btrfs libcrc32c xor zstd_decompress 
> > >zstd_compress raid6_pq hid_generic usbhid ast i2c_algo_bit drm_kms_helper 
> > >syscopyarea sysfillrect sysimgblt fb_sys_fops xhci_pci crc32c_intel 
> > >ehci_pci ttm xhci_hcd ehci_hcd drm ahci usbcore mpt3sas libahci raid_class 
> > >scsi_transport_sas wmi sg nbd dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc 
> > >scsi_dh_alua
> > >[  157.258990] CR2: 0058
> > >[  157.264961] ---[ end trace a301ac5e3ee86fde ]---
> > >[  157.283719] RIP: 0010:wakeup_preempt_entity.isra.70+0x9/0x50
> > >[  157.291967] Code: 00 be c0 82 60 00 e9 86 02 1a 00 66 0f 1f 44 00 00 48 
> > >c1 e7 03 be c0 80 60 00 e9 72 02 1a 00 66 90 0f 1f 44 00 00 53 48 89 fb 
> > ><48> 2b 5e 58 48 85 db 7e 2c 48 81 3e 00 00 10 00 8b 05 a9 b7 19 01
> > >[  157.316121] RSP: 0018:c9000c5bbde8 EFLAGS: 00010086
> > >[  157.324060] RA

Re: [RFC][PATCH 00/16] sched: Core scheduling

2019-03-26 Thread Aaron Lu
On Fri, Mar 08, 2019 at 11:44:01AM -0800, Subhra Mazumdar wrote:
> 
> On 2/22/19 4:45 AM, Mel Gorman wrote:
> >On Mon, Feb 18, 2019 at 09:49:10AM -0800, Linus Torvalds wrote:
> >>On Mon, Feb 18, 2019 at 9:40 AM Peter Zijlstra  wrote:
> >>>However; whichever way around you turn this cookie; it is expensive and 
> >>>nasty.
> >>Do you (or anybody else) have numbers for real loads?
> >>
> >>Because performance is all that matters. If performance is bad, then
> >>it's pointless, since just turning off SMT is the answer.
> >>
> >I tried to do a comparison between tip/master, ht disabled and this series
> >putting test workloads into a tagged cgroup but unfortunately it failed
> >
> >[  156.978682] BUG: unable to handle kernel NULL pointer dereference at 
> >0058
> >[  156.986597] #PF error: [normal kernel read fault]
> >[  156.991343] PGD 0 P4D 0
> >[  156.993905] Oops:  [#1] SMP PTI
> >[  156.997438] CPU: 15 PID: 0 Comm: swapper/15 Not tainted 
> >5.0.0-rc7-schedcore-v1r1 #1
> >[  157.005161] Hardware name: SGI.COM C2112-4GP3/X10DRT-P-Series, BIOS 2.0a 
> >05/09/2016
> >[  157.012896] RIP: 0010:wakeup_preempt_entity.isra.70+0x9/0x50
> >[  157.018613] Code: 00 be c0 82 60 00 e9 86 02 1a 00 66 0f 1f 44 00 00 48 
> >c1 e7 03 be c0 80 60 00 e9 72 02 1a 00 66 90 0f 1f 44 00 00
> >  53 48 89 fb <48> 2b 5e 58 48 85 db 7e 2c 48 81 3e 00 00 10 00 8b 05 a9 b7 
> > 19 01
> >[  157.037544] RSP: 0018:c9000c5bbde8 EFLAGS: 00010086
> >[  157.042819] RAX: 88810f5f6a00 RBX: 0001547f175c RCX: 
> >0001
> >[  157.050015] RDX: 88bf3bdb0a40 RSI:  RDI: 
> >0001547f175c
> >[  157.057215] RBP: 88bf7fae32c0 R08: 0001e358 R09: 
> >88810fb9f000
> >[  157.064410] R10: c9000c5bbe08 R11: 88810fb9f5c4 R12: 
> >
> >[  157.071611] R13: 88bf4e3ea0c0 R14:  R15: 
> >88bf4e3ea7a8
> >[  157.078814] FS:  () GS:88bf7f5c() 
> >knlGS:
> >[  157.086977] CS:  0010 DS:  ES:  CR0: 80050033
> >[  157.092779] CR2: 0058 CR3: 0220e005 CR4: 
> >003606e0
> >[  157.099979] DR0:  DR1:  DR2: 
> >
> >[  157.109529] DR3:  DR6: fffe0ff0 DR7: 
> >0400
> >[  157.119058] Call Trace:
> >[  157.123865]  pick_next_entity+0x61/0x110
> >[  157.130137]  pick_task_fair+0x4b/0x90
> >[  157.136124]  __schedule+0x365/0x12c0
> >[  157.141985]  schedule_idle+0x1e/0x40
> >[  157.147822]  do_idle+0x166/0x280
> >[  157.153275]  cpu_startup_entry+0x19/0x20
> >[  157.159420]  start_secondary+0x17a/0x1d0
> >[  157.165568]  secondary_startup_64+0xa4/0xb0
> >[  157.171985] Modules linked in: af_packet iscsi_ibft iscsi_boot_sysfs msr 
> >intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel 
> >kvm ipmi_ssif irqbypass crc32_pclmul ghash_clmulni_intel ixgbe aesni_intel 
> >xfrm_algo iTCO_wdt joydev iTCO_vendor_support libphy igb aes_x86_64 
> >crypto_simd ptp cryptd mei_me mdio pps_core ioatdma glue_helper pcspkr 
> >ipmi_si lpc_ich i2c_i801 mei dca ipmi_devintf ipmi_msghandler acpi_pad 
> >pcc_cpufreq button btrfs libcrc32c xor zstd_decompress zstd_compress 
> >raid6_pq hid_generic usbhid ast i2c_algo_bit drm_kms_helper syscopyarea 
> >sysfillrect sysimgblt fb_sys_fops xhci_pci crc32c_intel ehci_pci ttm 
> >xhci_hcd ehci_hcd drm ahci usbcore mpt3sas libahci raid_class 
> >scsi_transport_sas wmi sg nbd dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc 
> >scsi_dh_alua
> >[  157.258990] CR2: 0058
> >[  157.264961] ---[ end trace a301ac5e3ee86fde ]---
> >[  157.283719] RIP: 0010:wakeup_preempt_entity.isra.70+0x9/0x50
> >[  157.291967] Code: 00 be c0 82 60 00 e9 86 02 1a 00 66 0f 1f 44 00 00 48 
> >c1 e7 03 be c0 80 60 00 e9 72 02 1a 00 66 90 0f 1f 44 00 00 53 48 89 fb <48> 
> >2b 5e 58 48 85 db 7e 2c 48 81 3e 00 00 10 00 8b 05 a9 b7 19 01
> >[  157.316121] RSP: 0018:c9000c5bbde8 EFLAGS: 00010086
> >[  157.324060] RAX: 88810f5f6a00 RBX: 0001547f175c RCX: 
> >0001
> >[  157.333932] RDX: 88bf3bdb0a40 RSI:  RDI: 
> >0001547f175c
> >[  157.343795] RBP: 88bf7fae32c0 R08: 0001e358 R09: 
> >88810fb9f000
> >[  157.353634] R10: c9000c5bbe08 R11: 88810fb9f5c4 R12: 
> >
> >[  157.363506] R13: 88bf4e3ea0c0 R14:  R15: 
> >88bf4e3ea7a8
> >[  157.373395] FS:  () GS:88bf7f5c() 
> >knlGS:
> >[  157.384238] CS:  0010 DS:  ES:  CR0: 80050033
> >[  157.392709] CR2: 0058 CR3: 0220e005 CR4: 
> >003606e0
> >[  157.402601] DR0:  DR1:  DR2: 
> >
> >[  157.412488] DR3:  DR6: fffe0ff0 DR7: 
> >0400
> >[  157.422334] Kernel panic - not syncing: Attempted to kill the idle task!
> >[  158.529804] Shutting down cpus 

Re: [RFC][PATCH 00/16] sched: Core scheduling

2019-03-12 Thread Aaron Lu
On Mon, Mar 11, 2019 at 05:20:19PM -0700, Greg Kerr wrote:
> On Mon, Mar 11, 2019 at 4:36 PM Subhra Mazumdar
>  wrote:
> >
> >
> > On 3/11/19 11:34 AM, Subhra Mazumdar wrote:
> > >
> > > On 3/10/19 9:23 PM, Aubrey Li wrote:
> > >> On Sat, Mar 9, 2019 at 3:50 AM Subhra Mazumdar
> > >>  wrote:
> > >>> expected. Most of the performance recovery happens in patch 15 which,
> > >>> unfortunately, is also the one that introduces the hard lockup.
> > >>>
> > >> After applied Subhra's patch, the following is triggered by enabling
> > >> core sched when a cgroup is
> > >> under heavy load.
> > >>
> > > It seems you are facing some other deadlock where printk is involved.
> > > Can you
> > > drop the last patch (patch 16 sched: Debug bits...) and try?
> > >
> > > Thanks,
> > > Subhra
> > >
> > Never Mind, I am seeing the same lockdep deadlock output even w/o patch
> > 16. Btw
> > the NULL fix had something missing, following works.
> 
> Is this panic below, which occurs when I tag the first process,
> related or known? If not, I will debug it tomorrow.
> 
> [   46.831828] BUG: unable to handle kernel NULL pointer dereference
> at 
> [   46.831829] core sched enabled
> [   46.834261] #PF error: [WRITE]
> [   46.834899] PGD 0 P4D 0
> [   46.835438] Oops: 0002 [#1] SMP PTI
> [   46.836158] CPU: 0 PID: 11 Comm: migration/0 Not tainted
> 5.0.0everyday-glory-03949-g2d8fdbb66245-dirty #7
> [   46.838206] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> BIOS 1.10.2-1 04/01/2014

Probably because SMT is not enabled in this qemu setup.

rq->core can be NULL for cpu0: sched_cpu_starting() won't be called for
cpu0 and, since it doesn't have any siblings, its rq->core remains
uninitialized (NULL).
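
One way to avoid the NULL pointer dereference on such setups might be
to give every rq a valid core pointer by default, e.g. during rq setup
in sched_init() (an untested sketch, just to illustrate the idea):

	/*
	 * Every rq is its own core until sched_cpu_starting() groups
	 * SMT siblings together.
	 */
	rq->core = rq;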

> [   46.839844] RIP: 0010:_raw_spin_lock+0x7/0x20
> [   46.840448] Code: 00 00 00 65 81 05 25 ca 5c 51 00 02 00 00 31 c0
> ba ff 00 00 00 f0 0f b1 17 74 05 e9 93 80 46 ff f3 c3 90 31 c0 ba 01
> 00 00 00  0f b1 17 74 07 89 c6 e9 1c 6e 46 ff f3 c3 66 2e 0f 1f 84
> 00 00
> [   46.843000] RSP: 0018:b9d300cabe38 EFLAGS: 00010046
> [   46.843744] RAX:  RBX:  RCX: 
> 0004
> [   46.844709] RDX: 0001 RSI: aea435ae RDI: 
> 
> [   46.845689] RBP: b9d300cabed8 R08:  R09: 
> 00020800
> [   46.846651] R10: af603ea0 R11: 0001 R12: 
> af6576c0
> [   46.847619] R13: 9a57366c8000 R14: 9a5737401300 R15: 
> ade868f0
> [   46.848584] FS:  () GS:9a5737a0()
> knlGS:
> [   46.849680] CS:  0010 DS:  ES:  CR0: 80050033
> [   46.850455] CR2:  CR3: 0001d36fa000 CR4: 
> 06f0
> [   46.851415] DR0:  DR1:  DR2: 
> 
> [   46.852371] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [   46.853326] Call Trace:
> [   46.853678]  __schedule+0x139/0x11f0
> [   46.854167]  ? cpumask_next+0x16/0x20
> [   46.854668]  ? cpu_stop_queue_work+0xc0/0xc0
> [   46.855252]  ? sort_range+0x20/0x20
> [   46.855742]  schedule+0x4e/0x60
> [   46.856171]  smpboot_thread_fn+0x12a/0x160
> [   46.856725]  kthread+0x112/0x120
> [   46.857164]  ? kthread_stop+0xf0/0xf0
> [   46.857661]  ret_from_fork+0x35/0x40
> [   46.858146] Modules linked in:
> [   46.858562] CR2: 
> [   46.859022] ---[ end trace e9fff08f17bfd2be ]---


Re: [RFC PATCH 4/4] mm: Add merge page notifier

2019-02-11 Thread Aaron Lu
On 2019/2/11 23:58, Alexander Duyck wrote:
> On Mon, 2019-02-11 at 14:40 +0800, Aaron Lu wrote:
>> On 2019/2/5 2:15, Alexander Duyck wrote:
>>> From: Alexander Duyck 
>>>
>>> Because the implementation was limiting itself to only providing hints on
>>> pages huge TLB order sized or larger we introduced the possibility for free
>>> pages to slip past us because they are freed as something less then
>>> huge TLB in size and aggregated with buddies later.
>>>
>>> To address that I am adding a new call arch_merge_page which is called
>>> after __free_one_page has merged a pair of pages to create a higher order
>>> page. By doing this I am able to fill the gap and provide full coverage for
>>> all of the pages huge TLB order or larger.
>>>
>>> Signed-off-by: Alexander Duyck 
>>> ---
>>>  arch/x86/include/asm/page.h |   12 
>>>  arch/x86/kernel/kvm.c   |   28 
>>>  include/linux/gfp.h |4 
>>>  mm/page_alloc.c |2 ++
>>>  4 files changed, 46 insertions(+)
>>>
>>> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
>>> index 4487ad7a3385..9540a97c9997 100644
>>> --- a/arch/x86/include/asm/page.h
>>> +++ b/arch/x86/include/asm/page.h
>>> @@ -29,6 +29,18 @@ static inline void arch_free_page(struct page *page, 
>>> unsigned int order)
>>> if (static_branch_unlikely(_free_page_hint_enabled))
>>> __arch_free_page(page, order);
>>>  }
>>> +
>>> +struct zone;
>>> +
>>> +#define HAVE_ARCH_MERGE_PAGE
>>> +void __arch_merge_page(struct zone *zone, struct page *page,
>>> +  unsigned int order);
>>> +static inline void arch_merge_page(struct zone *zone, struct page *page,
>>> +  unsigned int order)
>>> +{
>>> +   if (static_branch_unlikely(_free_page_hint_enabled))
>>> +   __arch_merge_page(zone, page, order);
>>> +}
>>>  #endif
>>>  
>>>  #include 
>>> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
>>> index 09c91641c36c..957bb4f427bb 100644
>>> --- a/arch/x86/kernel/kvm.c
>>> +++ b/arch/x86/kernel/kvm.c
>>> @@ -785,6 +785,34 @@ void __arch_free_page(struct page *page, unsigned int 
>>> order)
>>>PAGE_SIZE << order);
>>>  }
>>>  
>>> +void __arch_merge_page(struct zone *zone, struct page *page,
>>> +  unsigned int order)
>>> +{
>>> +   /*
>>> +* The merging logic has merged a set of buddies up to the
>>> +* KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER. Since that is the case, take
>>> +* advantage of this moment to notify the hypervisor of the free
>>> +* memory.
>>> +*/
>>> +   if (order != KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
>>> +   return;
>>> +
>>> +   /*
>>> +* Drop zone lock while processing the hypercall. This
>>> +* should be safe as the page has not yet been added
>>> +* to the buddy list as of yet and all the pages that
>>> +* were merged have had their buddy/guard flags cleared
>>> +* and their order reset to 0.
>>> +*/
>>> +   spin_unlock(>lock);
>>> +
>>> +   kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
>>> +  PAGE_SIZE << order);
>>> +
>>> +   /* reacquire lock and resume freeing memory */
>>> +   spin_lock(>lock);
>>> +}
>>> +
>>>  #ifdef CONFIG_PARAVIRT_SPINLOCKS
>>>  
>>>  /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
>>> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
>>> index fdab7de7490d..4746d5560193 100644
>>> --- a/include/linux/gfp.h
>>> +++ b/include/linux/gfp.h
>>> @@ -459,6 +459,10 @@ static inline struct zonelist *node_zonelist(int nid, 
>>> gfp_t flags)
>>>  #ifndef HAVE_ARCH_FREE_PAGE
>>>  static inline void arch_free_page(struct page *page, int order) { }
>>>  #endif
>>> +#ifndef HAVE_ARCH_MERGE_PAGE
>>> +static inline void
>>> +arch_merge_page(struct zone *zone, struct page *page, int order) { }
>>> +#endif
>>>  #ifndef HAVE_ARCH_ALLOC_PAGE
>>>  static inline void arch_alloc_page(struct page *page, int order) { }
>>>  #endif
>>> diff --git a/mm/page_alloc.c b/mm/page_allo

Re: [RFC PATCH 4/4] mm: Add merge page notifier

2019-02-10 Thread Aaron Lu
On 2019/2/5 2:15, Alexander Duyck wrote:
> From: Alexander Duyck 
> 
> Because the implementation was limiting itself to only providing hints on
> pages huge TLB order sized or larger we introduced the possibility for free
> pages to slip past us because they are freed as something less then
> huge TLB in size and aggregated with buddies later.
> 
> To address that I am adding a new call arch_merge_page which is called
> after __free_one_page has merged a pair of pages to create a higher order
> page. By doing this I am able to fill the gap and provide full coverage for
> all of the pages huge TLB order or larger.
> 
> Signed-off-by: Alexander Duyck 
> ---
>  arch/x86/include/asm/page.h |   12 
>  arch/x86/kernel/kvm.c   |   28 
>  include/linux/gfp.h |4 
>  mm/page_alloc.c |2 ++
>  4 files changed, 46 insertions(+)
> 
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 4487ad7a3385..9540a97c9997 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -29,6 +29,18 @@ static inline void arch_free_page(struct page *page, 
> unsigned int order)
>   if (static_branch_unlikely(_free_page_hint_enabled))
>   __arch_free_page(page, order);
>  }
> +
> +struct zone;
> +
> +#define HAVE_ARCH_MERGE_PAGE
> +void __arch_merge_page(struct zone *zone, struct page *page,
> +unsigned int order);
> +static inline void arch_merge_page(struct zone *zone, struct page *page,
> +unsigned int order)
> +{
> + if (static_branch_unlikely(_free_page_hint_enabled))
> + __arch_merge_page(zone, page, order);
> +}
>  #endif
>  
>  #include 
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 09c91641c36c..957bb4f427bb 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -785,6 +785,34 @@ void __arch_free_page(struct page *page, unsigned int 
> order)
>  PAGE_SIZE << order);
>  }
>  
> +void __arch_merge_page(struct zone *zone, struct page *page,
> +unsigned int order)
> +{
> + /*
> +  * The merging logic has merged a set of buddies up to the
> +  * KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER. Since that is the case, take
> +  * advantage of this moment to notify the hypervisor of the free
> +  * memory.
> +  */
> + if (order != KVM_PV_UNUSED_PAGE_HINT_MIN_ORDER)
> + return;
> +
> + /*
> +  * Drop zone lock while processing the hypercall. This
> +  * should be safe as the page has not yet been added
> +  * to the buddy list as of yet and all the pages that
> +  * were merged have had their buddy/guard flags cleared
> +  * and their order reset to 0.
> +  */
> + spin_unlock(>lock);
> +
> + kvm_hypercall2(KVM_HC_UNUSED_PAGE_HINT, page_to_phys(page),
> +PAGE_SIZE << order);
> +
> + /* reacquire lock and resume freeing memory */
> + spin_lock(>lock);
> +}
> +
>  #ifdef CONFIG_PARAVIRT_SPINLOCKS
>  
>  /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index fdab7de7490d..4746d5560193 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -459,6 +459,10 @@ static inline struct zonelist *node_zonelist(int nid, 
> gfp_t flags)
>  #ifndef HAVE_ARCH_FREE_PAGE
>  static inline void arch_free_page(struct page *page, int order) { }
>  #endif
> +#ifndef HAVE_ARCH_MERGE_PAGE
> +static inline void
> +arch_merge_page(struct zone *zone, struct page *page, int order) { }
> +#endif
>  #ifndef HAVE_ARCH_ALLOC_PAGE
>  static inline void arch_alloc_page(struct page *page, int order) { }
>  #endif
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index c954f8c1fbc4..7a1309b0b7c5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -913,6 +913,8 @@ static inline void __free_one_page(struct page *page,
>   page = page + (combined_pfn - pfn);
>   pfn = combined_pfn;
>   order++;
> +
> + arch_merge_page(zone, page, order);

Not a proper place AFAICS.

Assume we have an order-8 page being sent here for merging and its
order-8 buddy is also free: order++ becomes 9 and arch_merge_page()
will hint the host about this page as an order-9 page, no problem so
far. In the next round, assume the now order-9 page's buddy is also
free: order++ becomes 10 and arch_merge_page() will again hint the host
about this page, now as an order-10 page. The first hint to the host
becomes redundant.

I think the proper place is after the done_merging tag.

BTW, with arch_merge_page() at the proper place, I don't think patch
3/4 is necessary - any freed page will go through merging anyway, so we
won't lose any hint opportunity. Or am I missing something?
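
To be concrete about the done_merging placement, something like this is
what I have in mind (an untested sketch against __free_one_page(); the
host gets hinted only once, on the final merged order):

done_merging:
	set_page_order(page, order);
	/* the page has reached its final order, hint the host once */
	arch_merge_page(zone, page, order);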

>   }
>   if (max_order < MAX_ORDER) {
>   /* If we are here, it means order is >= pageblock_order.
> 


[PATCH v2 RESEND 1/2] mm/page_alloc: free order-0 pages through PCP in page_frag_free()

2018-11-19 Thread Aaron Lu
page_frag_free() calls __free_pages_ok() to free the page back to
Buddy. This is OK for high order pages, but for order-0 pages it
misses the optimization opportunity of using Per-Cpu-Pages and can
cause zone lock contention when called frequently.

Paweł Staszewski recently shared his result of 'how Linux kernel
handles normal traffic'[1] and from perf data, Jesper Dangaard Brouer
found the lock contention comes from page allocator:

  mlx5e_poll_tx_cq
  |
   --16.34%--napi_consume_skb
 |
 |--12.65%--__free_pages_ok
 |  |
 |   --11.86%--free_one_page
 | |
 | |--10.10%--queued_spin_lock_slowpath
 | |
 |  --0.65%--_raw_spin_lock
 |
 |--1.55%--page_frag_free
 |
  --1.44%--skb_release_data

Jesper explained how it happened: mlx5 driver RX-page recycle
mechanism is not effective in this workload and pages have to go
through the page allocator. The lock contention happens during
mlx5 DMA TX completion cycle. And the page allocator cannot keep
up at these speeds.[2]

I thought that __free_pages_ok() was mostly freeing high order
pages and that this was lock contention on high order pages,
but Jesper explained in detail that __free_pages_ok() here is
actually freeing order-0 pages because mlx5 is using order-0 pages
to satisfy its page pool allocation requests.[3]

The free path as pointed out by Jesper is:
skb_free_head()
  -> skb_free_frag()
-> page_frag_free()
And the pages being freed on this path are order-0 pages.

Fix this by doing the same thing as __page_frag_cache_drain() -
send the page being freed to the PCP if it's an order-0 page, or
directly to Buddy if it is a high order page.

With this change, Paweł hasn't noticed lock contention yet in
his workload and Jesper has noticed a 7% performance improvement
using a micro benchmark, with the lock contention gone. Ilias' test
on a 'low' speed 1Gbit interface on a cortex-a53 shows an ~11%
performance boost testing with 64-byte packets, and __free_pages_ok()
disappeared from perf top.

[1]: https://www.spinics.net/lists/netdev/msg531362.html
[2]: https://www.spinics.net/lists/netdev/msg531421.html
[3]: https://www.spinics.net/lists/netdev/msg531556.html

Reported-by: Paweł Staszewski 
Analysed-by: Jesper Dangaard Brouer 
Acked-by: Vlastimil Babka 
Acked-by: Mel Gorman 
Acked-by: Jesper Dangaard Brouer 
Acked-by: Ilias Apalodimas 
Tested-by: Ilias Apalodimas 
Acked-by: Alexander Duyck 
Acked-by: Tariq Toukan 
---
 mm/page_alloc.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 421c5b652708..8f8c6b33b637 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4677,8 +4677,14 @@ void page_frag_free(void *addr)
 {
struct page *page = virt_to_head_page(addr);
 
-   if (unlikely(put_page_testzero(page)))
-   __free_pages_ok(page, compound_order(page));
+   if (unlikely(put_page_testzero(page))) {
+   unsigned int order = compound_order(page);
+
+   if (order == 0)
+   free_unref_page(page);
+   else
+   __free_pages_ok(page, order);
+   }
 }
 EXPORT_SYMBOL(page_frag_free);
 
-- 
2.17.2



[PATCH v3 RESEND 2/2] mm/page_alloc: use a single function to free page

2018-11-19 Thread Aaron Lu
There are multiple places that free a page and they all do the same
things, so a common function can be used to reduce code duplication.

It also avoids a bug being fixed in one function but left in another.

Acked-by: Vlastimil Babka 
Signed-off-by: Aaron Lu 
---
 mm/page_alloc.c | 37 ++---
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8f8c6b33b637..93cc8e686eca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4547,16 +4547,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __free_pages(struct page *page, unsigned int order)
+static inline void free_the_page(struct page *page, unsigned int order)
 {
-   if (put_page_testzero(page)) {
-   if (order == 0)
-   free_unref_page(page);
-   else
-   __free_pages_ok(page, order);
-   }
+   if (order == 0)
+   free_unref_page(page);
+   else
+   __free_pages_ok(page, order);
 }
 
+void __free_pages(struct page *page, unsigned int order)
+{
+   if (put_page_testzero(page))
+   free_the_page(page, order);
+}
 EXPORT_SYMBOL(__free_pages);
 
 void free_pages(unsigned long addr, unsigned int order)
@@ -4605,14 +4608,8 @@ void __page_frag_cache_drain(struct page *page, unsigned 
int count)
 {
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
 
-   if (page_ref_sub_and_test(page, count)) {
-   unsigned int order = compound_order(page);
-
-   if (order == 0)
-   free_unref_page(page);
-   else
-   __free_pages_ok(page, order);
-   }
+   if (page_ref_sub_and_test(page, count))
+   free_the_page(page, compound_order(page));
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
@@ -4677,14 +4674,8 @@ void page_frag_free(void *addr)
 {
struct page *page = virt_to_head_page(addr);
 
-   if (unlikely(put_page_testzero(page))) {
-   unsigned int order = compound_order(page);
-
-   if (order == 0)
-   free_unref_page(page);
-   else
-   __free_pages_ok(page, order);
-   }
+   if (unlikely(put_page_testzero(page)))
+   free_the_page(page, compound_order(page));
 }
 EXPORT_SYMBOL(page_frag_free);
 
-- 
2.17.2



[PATCH RESEND 0/2] free order-0 pages through PCP in page_frag_free() and cleanup

2018-11-19 Thread Aaron Lu
This is a resend of the two patches.

Patch 1 is the same as:
[PATCH v2 1/2] mm/page_alloc: free order-0 pages through PCP in page_frag_free()
https://lkml.kernel.org/r/20181106052833.gc6...@intel.com
With one more ack from Tariq Toukan.

Patch 2 is the same as:
[PATCH v3 2/2] mm/page_alloc: use a single function to free page
https://lkml.kernel.org/r/20181106113149.gc24...@intel.com
With some changelog rewording.

Applies on top of v4.20-rc2-mmotm-2018-11-16-14-52.

Aaron Lu (2):
  mm/page_alloc: free order-0 pages through PCP in page_frag_free()
  mm/page_alloc: use a single function to free page

 mm/page_alloc.c | 29 +
 1 file changed, 13 insertions(+), 16 deletions(-)

-- 
2.17.2



[PATCH] mm/swap: use nr_node_ids for avail_lists in swap_info_struct

2018-11-15 Thread Aaron Lu
Since commit a2468cc9bfdf ("swap: choose swap device according to
numa node"), the avail_lists field of swap_info_struct has been an
array with MAX_NUMNODES elements. This increased the size of
swap_info_struct to 40KiB, which needs an order-4 page to hold it.

This is not optimal in that:
1 Most systems have way fewer than MAX_NUMNODES (1024) nodes, so it
  is a waste of memory;
2 It could cause swapon failure if the swap device is swapped on
  after the system has been running for a while and no order-4
  page is available, as pointed out by Vasily Averin.

Solve the above two issues by using nr_node_ids (the actual number of
possible nodes on the running system) for avail_lists instead of
MAX_NUMNODES.

nr_node_ids is unknown at compile time so it can't be used directly
when declaring this array. What I did here is to declare avail_lists
as a zero-element array and allocate space for it when allocating
space for swap_info_struct. The reason for keeping an array rather
than a pointer is that plist_for_each_entry needs the field to be part
of the struct, so a pointer will not work.
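
For reference, the consumer in get_swap_pages() iterates the avail
list roughly like below (simplified from mm/swapfile.c);
plist_for_each_entry_safe() resolves each entry through the member
name, which only works when avail_lists is embedded in the struct:

	plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
				  avail_lists[node]) {
		/* try to allocate swap slots from si */
	}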

This patch is on top of Vasily Averin's fix commit. I think the
use of kvzalloc for swap_info_struct is still needed in case
nr_node_ids is really big on some systems.

Cc: Vasily Averin 
Cc: Michal Hocko 
Cc: Huang Ying 
Signed-off-by: Aaron Lu 
---
 include/linux/swap.h | 11 ++-
 mm/swapfile.c|  3 ++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d8a07a4f171d..3d3630b3f63d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -233,7 +233,6 @@ struct swap_info_struct {
unsigned long   flags;  /* SWP_USED etc: see above */
signed shortprio;   /* swap priority of this type */
struct plist_node list; /* entry in swap_active_head */
-   struct plist_node avail_lists[MAX_NUMNODES];/* entry in 
swap_avail_heads */
signed char type;   /* strange name for an index */
unsigned intmax;/* extent of the swap_map */
unsigned char *swap_map;/* vmalloc'ed array of usage counts */
@@ -274,6 +273,16 @@ struct swap_info_struct {
 */
struct work_struct discard_work; /* discard worker */
struct swap_cluster_list discard_clusters; /* discard clusters list */
+   struct plist_node avail_lists[0]; /*
+  * entries in swap_avail_heads, one
+  * entry per node.
+  * Must be last as the number of the
+  * array is nr_node_ids, which is not
+  * a fixed value so have to allocate
+  * dynamically.
+  * And it has to be an array so that
+  * plist_for_each_* can work.
+  */
 };
 
 #ifdef CONFIG_64BIT
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8688ae65ef58..6e06821623f6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2812,8 +2812,9 @@ static struct swap_info_struct *alloc_swap_info(void)
struct swap_info_struct *p;
unsigned int type;
int i;
+   int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
 
-   p = kvzalloc(sizeof(*p), GFP_KERNEL);
+   p = kvzalloc(size, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
 
-- 
2.17.2



Re: [LKP] [bpf] fd978bf7fd: will-it-scale.per_process_ops -4.0% regression

2018-11-08 Thread Aaron Lu
On Fri, Nov 09, 2018 at 08:19:54AM +0800, Rong Chen wrote:
> 
> 
> On 11/02/2018 04:36 PM, Daniel Borkmann wrote:
> > Hi Rong,
> > 
> > On 11/02/2018 03:14 AM, kernel test robot wrote:
> > > Greeting,
> > > 
> > > FYI, we noticed a -4.0% regression of will-it-scale.per_process_ops due 
> > > to commit:
> > > 
> > > 
> > > commit: fd978bf7fd312581a7ca454a991f0ffb34c4204b ("bpf: Add reference 
> > > tracking to verifier")
> > > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
> > > 
> > > in testcase: will-it-scale
> > > on test machine: 80 threads Skylake with 64G memory
> > > with following parameters:
> > > 
> > >   nr_task: 100%
> > >   mode: process
> > >   test: mmap1
> > >   cpufreq_governor: performance
> > Hmm, so the test cases you are running are these ones:
> > 
> >https://github.com/antonblanchard/will-it-scale/blob/master/tests/mmap1.c
> >https://github.com/antonblanchard/will-it-scale/blob/master/tests/mmap2.c
> > 
> > The commit from Joe referenced above only adds a feature to the (eBPF) 
> > verifier. Looking
> > through will-it-scale test suite, looks like there's neither cBPF nor eBPF 
> > in use and if
> > it would have been the former (e.g. via seccomp BPF), then also this has no 
> > effect on it
> > since this doesn't load through bpf(2); meaning if so then something must 
> > use eBPF here,
> > but then it's also unclear right now how this would even remotely affect 
> > mmap() test
> > performance by -4%. Hm, are you certain it's not a false bisection? If so, 
> > what else is
> > loading eBPF on your machine in parallel when you run the tests?
> 
> Please accept my apologies for taking your time, It's a false bisection.

It shouldn't be a false bisection - I reran the two kernels (this commit
and its parent) with this workload on the same test box, and the
performance gap is there.

> Something strange happened, we're trying to figure out the root cause.

Yeah, it's not clear what it is :-)

Daniel & Joe,

We will see if we can find something and will let you know if we do.
Until then, I think you can ignore this report. Thanks for your time.

Regards,
Aaron

> > > test-description: Will It Scale takes a testcase and runs it from 1 
> > > through to n parallel copies to see if the testcase will scale. It builds 
> > > both a process and threads based test in order to see any differences 
> > > between the two.
> > > test-url: https://github.com/antonblanchard/will-it-scale
> > > 
> > > In addition to that, the commit also has significant impact on the 
> > > following tests:
> > > 
> > > +--+---+
> > > | testcase: change | will-it-scale: will-it-scale.per_process_ops -3.8% 
> > > regression |
> > > | test machine | 80 threads Skylake with 64G memory   
> > >  |
> > > | test parameters  | cpufreq_governor=performance 
> > >  |
> > > |  | mode=process 
> > >  |
> > > |  | nr_task=100% 
> > >  |
> > > |  | test=mmap2   
> > >  |
> > > +--+---+
> > > 
> > > 
> > > Details are as below:
> > > -->
> > > 
> > > 
> > > To reproduce:
> > > 
> > >  git clone https://github.com/intel/lkp-tests.git
> > >  cd lkp-tests
> > >  bin/lkp install job.yaml  # job file is attached in this email
> > >  bin/lkp run job.yaml
> > > 
> > > =
> > > compiler/cpufreq_governor/kconfig/mode/nr_task/rootfs/tbox_group/test/testcase:
> > >
> > > gcc-7/performance/x86_64-rhel-7.2/process/100%/debian-x86_64-2018-04-03.cgz/lkp-skl-2sp2/mmap1/will-it-scale
> > > 
> > > commit:
> > >84dbf35073 ("bpf: Macrofy stack state copy")
> > >fd978bf7fd ("bpf: Add reference tracking to verifier")
> > > 
> > > 84dbf3507349696b fd978bf7fd312581a7ca454a99
> > >  --
> > >   %stddev %change %stddev
> > >   \  |\
> > >   16811-4.0%  16140
> > > will-it-scale.per_process_ops
> > > 1344946-4.0%1291230will-it-scale.workload
> > >  107.75 ± 38%+252.4% 379.75 ± 93%  cpuidle.POLL.usage
> > >  121.70 ± 18% +18.9% 144.70 ±  4%  
> > > sched_debug.cfs_rq:/.exec_clock.stddev
> > >4933+2.0%   5031
> > > proc-vmstat.nr_inactive_anon
> > >4933+2.0%   5031
> > > proc-vmstat.nr_zone_inactive_anon
> > >9874+9.0%  10765 ±  7%  
> > > 


Re: [PATCH v2 2/2] mm/page_alloc: use a single function to free page

2018-11-06 Thread Aaron Lu
On Tue, Nov 06, 2018 at 10:32:00AM +0100, Vlastimil Babka wrote:
> On 11/6/18 9:47 AM, Aaron Lu wrote:
> > On Tue, Nov 06, 2018 at 09:16:20AM +0100, Vlastimil Babka wrote:
> >> On 11/6/18 6:30 AM, Aaron Lu wrote:
> >>> We have multiple places of freeing a page, most of them doing similar
> >>> things and a common function can be used to reduce code duplicate.
> >>>
> >>> It also avoids bug fixed in one function but left in another.
> >>>
> >>> Signed-off-by: Aaron Lu 
> >>
> >> Acked-by: Vlastimil Babka 
> > 
> > Thanks.
> > 
> >> I assume there's no arch that would run page_ref_sub_and_test(1) slower
> >> than put_page_testzero(), for the critical __free_pages() case?
> > 
> > Good question.
> > 
> > I followed the non-arch specific calls and found that:
> > page_ref_sub_and_test() ends up calling atomic_sub_return(i, v) while
> > put_page_testzero() ends up calling atomic_sub_return(1, v). So they
> > should be same for archs that do not have their own implementations.
> 
> x86 seems to distinguish between DECL and SUBL, see

Ah right.

> arch/x86/include/asm/atomic.h although I could not figure out where does
> e.g. arch_atomic_dec_and_test become atomic_dec_and_test to override the
> generic implementation.

I didn't check that either but I think it will :-)

> I don't know if the CPU e.g. executes DECL faster, but objectively it
> has one parameter less. Maybe it doesn't matter?

No immediate idea.

> > Back to your question: I don't know either.
> > If this is deemed unsafe, we can probably keep the ref modify part in
> > their original functions and only take the free part into a common
> > function.
> 
> I guess you could also employ  if (__builtin_constant_p(nr)) in
> free_the_page(), but the result will be ugly I guess, and maybe not
> worth it :)

Right, I can't make it clean.
I think I'll just move the free part into a common function and leave the
ref decreasing part as is, to be safe.
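
For illustration, a minimal sketch of that direction (keep the refcount
handling at the call sites and only share the freeing part; not the
actual patch) could look like:

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (order == 0)		/* send to PCP */
		free_unref_page(page);
	else			/* high order: directly send to Buddy */
		__free_pages_ok(page, order);
}

void __free_pages(struct page *page, unsigned int order)
{
	if (put_page_testzero(page))
		free_the_page(page, order);
}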

Regards,
Aaron
 
> >>> ---
> >>> v2: move comments close to code as suggested by Dave.
> >>>
> >>>  mm/page_alloc.c | 36 
> >>>  1 file changed, 16 insertions(+), 20 deletions(-)
> >>>
> >>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> >>> index 91a9a6af41a2..4faf6b7bf225 100644
> >>> --- a/mm/page_alloc.c
> >>> +++ b/mm/page_alloc.c
> >>> @@ -4425,9 +4425,17 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
> >>>  }
> >>>  EXPORT_SYMBOL(get_zeroed_page);
> >>>  
> >>> -void __free_pages(struct page *page, unsigned int order)
> >>> +static inline void free_the_page(struct page *page, unsigned int order, 
> >>> int nr)
> >>>  {
> >>> - if (put_page_testzero(page)) {
> >>> + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> >>> +
> >>> + /*
> >>> +  * Free a page by reducing its ref count by @nr.
> >>> +  * If its refcount reaches 0, then according to its order:
> >>> +  * order0: send to PCP;
> >>> +  * high order: directly send to Buddy.
> >>> +  */
> >>> + if (page_ref_sub_and_test(page, nr)) {
> >>>   if (order == 0)
> >>>   free_unref_page(page);
> >>>   else
> >>> @@ -4435,6 +4443,10 @@ void __free_pages(struct page *page, unsigned int 
> >>> order)
> >>>   }
> >>>  }
> >>>  
> >>> +void __free_pages(struct page *page, unsigned int order)
> >>> +{
> >>> + free_the_page(page, order, 1);
> >>> +}
> >>>  EXPORT_SYMBOL(__free_pages);
> >>>  
> >>>  void free_pages(unsigned long addr, unsigned int order)
> >>> @@ -4481,16 +4493,7 @@ static struct page 
> >>> *__page_frag_cache_refill(struct page_frag_cache *nc,
> >>>  
> >>>  void __page_frag_cache_drain(struct page *page, unsigned int count)
> >>>  {
> >>> - VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
> >>> -
> >>> - if (page_ref_sub_and_test(page, count)) {
> >>> - unsigned int order = compound_order(page);
> >>> -
> >>> - if (order == 0)
> >>> - free_unref_page(page);
> >>> - else
> >>> - __free_pages_ok(page, order);
> >>> - }
> >>> + free_the_page(page, compound_order(page), count);
> >>>  }
> >>>  EXPORT_SYMBOL(__page_frag_cache_drain);
> >>>  
> >>> @@ -4555,14 +4558,7 @@ void page_frag_free(void *addr)
> >>>  {
> >>>   struct page *page = virt_to_head_page(addr);
> >>>  
> >>> - if (unlikely(put_page_testzero(page))) {
> >>> - unsigned int order = compound_order(page);
> >>> -
> >>> - if (order == 0)
> >>> - free_unref_page(page);
> >>> - else
> >>> - __free_pages_ok(page, order);
> >>> - }
> >>> + free_the_page(page, compound_order(page), 1);
> >>>  }
> >>>  EXPORT_SYMBOL(page_frag_free);
> >>>  
> >>>
> >>
> 


Re: [PATCH v2] mm: use kvzalloc for swap_info_struct allocation

2018-11-05 Thread Aaron Lu
On Mon, Nov 05, 2018 at 03:11:56PM +0100, Michal Hocko wrote:
> On Mon 05-11-18 14:17:01, Vasily Averin wrote:
> > commit a2468cc9bfdf ("swap: choose swap device according to numa node")
> > changed 'avail_lists' field of 'struct swap_info_struct' to an array.
> > In popular linux distros it increased size of swap_info_struct up to
> > 40 Kbytes and now swap_info_struct allocation requires order-4 page.
> > Switch to kvzmalloc allows to avoid unexpected allocation failures.
> 
> While this fixes the most visible issue is this a good long term
> solution? Aren't we wasting memory without a good reason? IIRC our limit

That's right, we need a better way of handling this in the long term.

> for swap files/devices is much smaller than potential NUMA nodes numbers
> so we can safely expect that would be only few numa affine nodes. I am
> not really familiar with the rework which has added numa node awareness
> but I wouls assueme that we should either go with one global table with
> a linked list of possible swap_info structure per numa node or use a
> sparse array.

There is a per-numa-node plist of available swap devices, so every swap
device needs an entry on each of those per-numa-node plists.

I think we can convert avail_lists from an array to a pointer and use
vzalloc to allocate the needed memory. MAX_NUMNODES can be used for a
simple implementation, or we could use the precise online node number,
but then we would need to handle node online/offline events.
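
For illustration only, the pointer variant would look roughly like the
below (untested sketch). One catch is that plist_for_each_entry() needs
the field to be part of the struct, which makes an embedded array more
convenient than a bare pointer:

	/* in struct swap_info_struct */
	struct plist_node *avail_lists;	/* one entry per possible node id */

	/* in alloc_swap_info(), after allocating p */
	p->avail_lists = vzalloc(nr_node_ids * sizeof(struct plist_node));
	if (!p->avail_lists) {
		kvfree(p);
		return ERR_PTR(-ENOMEM);
	}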

A sparse array sounds promising, I'll take a look, thanks for the pointer.

> That being said I am not really objecting to this patch as it is simple
> and backportable to older (stable kernels).
>  
> I would even dare to add
> Fixes: a2468cc9bfdf ("swap: choose swap device according to numa node")
> 
> because not being able to add a swap space on a fragmented system looks
> like a regression to me.

Agree, especially since it used to work.

Regards,
Aaron

> > Acked-by: Aaron Lu 
> > Signed-off-by: Vasily Averin 
> 
> Acked-by: Michal Hocko 
> > ---
> >  mm/swapfile.c | 6 +++---
> >  1 file changed, 3 insertions(+), 3 deletions(-)
> > 
> > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > index 644f746e167a..8688ae65ef58 100644
> > --- a/mm/swapfile.c
> > +++ b/mm/swapfile.c
> > @@ -2813,7 +2813,7 @@ static struct swap_info_struct *alloc_swap_info(void)
> > unsigned int type;
> > int i;
> >  
> > -   p = kzalloc(sizeof(*p), GFP_KERNEL);
> > +   p = kvzalloc(sizeof(*p), GFP_KERNEL);
> > if (!p)
> > return ERR_PTR(-ENOMEM);
> >  
> > @@ -2824,7 +2824,7 @@ static struct swap_info_struct *alloc_swap_info(void)
> > }
> > if (type >= MAX_SWAPFILES) {
> > spin_unlock(_lock);
> > -   kfree(p);
> > +   kvfree(p);
> > return ERR_PTR(-EPERM);
> > }
> > if (type >= nr_swapfiles) {
> > @@ -2838,7 +2838,7 @@ static struct swap_info_struct *alloc_swap_info(void)
> > smp_wmb();
> > nr_swapfiles++;
> > } else {
> > -   kfree(p);
> > +   kvfree(p);
> > p = swap_info[type];
> > /*
> >  * Do not memset this entry: a racing procfs swap_next()
> > -- 
> > 2.17.1
> 
> -- 
> Michal Hocko
> SUSE Labs
> 


Re: [PATCH 1/2] mm: use kvzalloc for swap_info_struct allocation

2018-11-04 Thread Aaron Lu
On Mon, Nov 05, 2018 at 07:59:13AM +0300, Vasily Averin wrote:
> 
> 
> On 11/5/18 3:50 AM, Huang, Ying wrote:
> > Vasily Averin  writes:
> > 
> >> commit a2468cc9bfdf ("swap: choose swap device according to numa node")
> >> increased size of swap_info_struct up to 44 Kbytes, now it requires
> >> 4th order page.
> > 
> > Why swap_info_struct could be so large?  Because MAX_NUMNODES could be
> > thousands so that 'avail_lists' field could be tens KB?  If so, I think
> > it's fair to use kvzalloc().  Can you add one line comment?  Because
> > struct swap_info_struct is quite small in default configuration.
> 
> I was incorrect not 44Kb but 40kb should be here.
> We have found CONFIG_NODES_SHIFT=10 in new RHEL7 update 6 kernel,
> default ubuntu kernels have the same setting too.
> 
> crash> struct swap_info_struct -o
> struct swap_info_struct {
>   [0] unsigned long flags;
>   [8] short prio;
>...
> [140] spinlock_t lock;
> [144] struct plist_node list;
> [184] struct plist_node avail_lists[1024]; <<<< here

So every 'struct plist_node' takes 40 bytes and 1024 of them take a
total of 40960 bytes, which is 10 pages and needs an order-4 page to host
them. That looks like a little too much, especially considering most of
the space will be left unused since most systems have <= 4 nodes. I
didn't realize this problem when developing this patch, thanks for
pointing this out.
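
To spell out the arithmetic (assuming 4KiB pages):

    sizeof(struct plist_node)   =    40 bytes
    40 * 1024 (MAX_NUMNODES)    = 40960 bytes for avail_lists alone
    whole swap_info_struct      ~ 41232 bytes
    get_order(41232)            =     4, i.e. 16 contiguous pages (64KiB)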

I think using kvzalloc() as your patch does is better here, as it can
avoid a possible swapon failure.

Acked-by: Aaron Lu 

BTW, for systems with few swap devices this may not be a big deal, but
according to your description, your workload will create a lot of swap
devices and each of them will likely cause an order-4 unmovable page to
be allocated (when kvzalloc() doesn't fall back). I was thinking maybe we
should convert avail_lists to a pointer in swap_info_struct and use
vzalloc() for it.

Thanks,
Aaron

>   [41144] struct swap_cluster_info *cluster_info;
>   [41152] struct swap_cluster_list free_clusters;
>   ...
>   [41224] spinlock_t cont_lock;
> }
> SIZE: 41232
> 
> struct swap_info_struct {
> ...
> RH_KABI_EXTEND(struct plist_node avail_lists[MAX_NUMNODES]) /* entry 
> in swap_avail_head */
> ...
> }
> 
> #define MAX_NUMNODES(1 << NODES_SHIFT)
> 
> #ifdef CONFIG_NODES_SHIFT 
> #define NODES_SHIFT CONFIG_NODES_SHIFT
> #else
> #define NODES_SHIFT 0
> #endif
> 
> /boot/config-4.15.0-38-generic:CONFIG_NODES_SHIFT=10
> 


Re: [LKP] [lkp-robot] [sched/fair] d519329f72: unixbench.score -9.9% regression

2018-10-25 Thread Aaron Lu
On Wed, Oct 24, 2018 at 06:01:37PM +0100, Patrick Bellasi wrote:
> On 24-Oct 14:41, Aaron Lu wrote:
> > On Mon, Apr 02, 2018 at 11:20:00AM +0800, Ye, Xiaolong wrote:
> > > 
> > > Greeting,
> > > 
> > > FYI, we noticed a -9.9% regression of unixbench.score due to commit:
> > > 
> > > 
> > > commit: d519329f72a6f36bc4f2b85452640cfe583b4f81 ("sched/fair: Update 
> > > util_est only on util_avg updates")
> > > https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master
> > > 
> > > in testcase: unixbench
> > > on test machine: 8 threads Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz with 6G 
> > > memory
> > > with following parameters:
> > > 
> > > runtime: 300s
> > > nr_task: 100%
> > > test: execl
> > > 
> > > test-description: UnixBench is the original BYTE UNIX benchmark suite 
> > > aims to test performance of Unix-like system.
> > > test-url: https://github.com/kdlucas/byte-unixbench
> 
> Hi Aaron,
>  
> > I tested this workload on different machines with this commit
> > d519329f72a6f36bc4f2b85452 and its parent a07630b8b2c16f82, I also
> > tested with v4.19-rc8 to see if the regression is gone -
> > the performance drop is there with v4.19-rc8 and with different
> > machines so I assume this regression is not solved yet.
> >
> > Here are detailed data:
> > 
> > cmdline used to run this workload:
> > ./Run execl -c $nr_cpu -i 30
> 
> I had a better look into this issue and found that something like this
> could be the cure for the execl throughput regression:

Good news, yes they are!

> ---8<---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 908c9cdae2f0..c34d41b542fc 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6258,8 +6258,17 @@ static unsigned long cpu_util_wake(int cpu, struct 
> task_struct *p)
>* covered by the following code when estimated utilization is
>* enabled.
>*/
> - if (sched_feat(UTIL_EST))
> - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
> + if (sched_feat(UTIL_EST)) {
> + unsigned int estimated =
> + READ_ONCE(cfs_rq->avg.util_est.enqueued);
> +
> + if (unlikely(current == p)) {
> + estimated -= min_t(unsigned int, estimated,
> + (_task_util_est(p) | UTIL_AVG_UNCHANGED));
> + }
> +
> + util = max(util, estimated);
> + }
> 
>   /*
>* Utilization (estimated) can exceed the CPU capacity, thus let's
> ---8<---
> 
> I'll test this better on a machine on my side and send out a proper
> patch by tomorrow.
> 
> > Please let me know if you need other information, thanks.
> 
> Would be nice if you can test the above on your side too.
> 

Commit cbcb74a95c5af32f9127a102feca323139ba2c49 is the commit I made
from your diff, and it restored performance for the two desktops. The
result on the Skylake server isn't quite stable, so I think the
performance gap there is due to noise.

lkp-ivb-d04:
cbcb74a95c5af32f9127a102feca323139ba2c49/avg.json:  "unixbench.score": 2946.0,
d519329f72a6f36bc4f2b85452640cfe583b4f81/avg.json:  "unixbench.score": 2669.5,
a07630b8b2c16f82fd5b71d890079f4dd7599c1d/avg.json:  "unixbench.score": 2924.5,

lkp-hsw-d01:
cbcb74a95c5af32f9127a102feca323139ba2c49/avg.json:  "unixbench.score": 7013.5333,
d519329f72a6f36bc4f2b85452640cfe583b4f81/avg.json:  "unixbench.score": 6421.2333,
a07630b8b2c16f82fd5b71d890079f4dd7599c1d/avg.json:  "unixbench.score": 7090.4001,

lkp-skl-2sp2:
cbcb74a95c5af32f9127a102feca323139ba2c49/avg.json:  "unixbench.score": 9347.02,
d519329f72a6f36bc4f2b85452640cfe583b4f81/avg.json:  "unixbench.score": 9362.76,
a07630b8b2c16f82fd5b71d890079f4dd7599c1d/avg.json:  "unixbench.score": 9520.86,


Re: [LKP] [lkp-robot] [sched/fair] d519329f72: unixbench.score -9.9% regression

2018-10-24 Thread Aaron Lu
On Mon, Apr 02, 2018 at 11:20:00AM +0800, Ye, Xiaolong wrote:
> 
> Greeting,
> 
> FYI, we noticed a -9.9% regression of unixbench.score due to commit:
> 
> 
> commit: d519329f72a6f36bc4f2b85452640cfe583b4f81 ("sched/fair: Update 
> util_est only on util_avg updates")
> https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master
> 
> in testcase: unixbench
> on test machine: 8 threads Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz with 6G 
> memory
> with following parameters:
> 
> runtime: 300s
> nr_task: 100%
> test: execl
> 
> test-description: UnixBench is the original BYTE UNIX benchmark suite aims to 
> test performance of Unix-like system.
> test-url: https://github.com/kdlucas/byte-unixbench

I tested this workload on different machines with this commit
(d519329f72a6f36bc4f2b85452) and its parent (a07630b8b2c16f82); I also
tested with v4.19-rc8 to see if the regression is gone. The performance
drop is still there with v4.19-rc8 and on different machines, so I assume
this regression is not solved yet.

Here is the detailed data:

cmdline used to run this workload:
./Run execl -c $nr_cpu -i 30

lkp-ivb-d04(Ivybridge desktop, 4 CPUs)
a07630b8b2c16f82  d519329f72a6f36bc4f2b85452
----------------  --------------------------
         %stddev     %change         %stddev
            2924       -8.7%           2669  unixbench.score

a07630b8b2c16f82fd5b71d890079f4dd7599c1d/avg.json:  "unixbench.score": 2924.5,
d519329f72a6f36bc4f2b85452640cfe583b4f81/avg.json:  "unixbench.score": 2669.5,
v4.19-rc8/avg.json:  "unixbench.score": 2611.7333,

lkp-hsw-d01(Haswell desktop, 8 CPUs)
a07630b8b2c16f82  d519329f72a6f36bc4f2b85452
----------------  --------------------------
         %stddev     %change         %stddev
            7090       -9.4%           6421  unixbench.score

a07630b8b2c16f82fd5b71d890079f4dd7599c1d/avg.json:  "unixbench.score": 7090.4001,
d519329f72a6f36bc4f2b85452640cfe583b4f81/avg.json:  "unixbench.score": 6421.2333,
v4.19-rc8/avg.json:  "unixbench.score": 6431.9333,

lkp-skl-2sp2(2 sockets Skylake, 80 CPUs)
a07630b8b2c16f82  d519329f72a6f36bc4f2b85452
----------------  --------------------------
         %stddev     %change         %stddev
            9657 ±  2%   -3.3%          9334  unixbench.score

a07630b8b2c16f82fd5b71d890079f4dd7599c1d/avg.json:  "unixbench.score": 9657.6,
d519329f72a6f36bc4f2b85452640cfe583b4f81/avg.json:  "unixbench.score": 9334.9,
v4.19-rc8/avg.json:  "unixbench.score": 9278.3334,

Please let me know if you need other information, thanks.


Re: [RFC v4 PATCH 3/5] mm/rmqueue_bulk: alloc without touching individual page structure

2018-10-22 Thread Aaron Lu
On Mon, Oct 22, 2018 at 11:37:53AM +0200, Vlastimil Babka wrote:
> On 10/17/18 8:33 AM, Aaron Lu wrote:
> > Profile on Intel Skylake server shows the most time consuming part
> > under zone->lock on allocation path is accessing those to-be-returned
> > page's "struct page" on the free_list inside zone->lock. One explanation
> > is, different CPUs are releasing pages to the head of free_list and
> > those page's 'struct page' may very well be cache cold for the allocating
> > CPU when it grabs these pages from free_list' head. The purpose here
> > is to avoid touching these pages one by one inside zone->lock.
> 
> What about making the pages cache-hot first, without zone->lock, by
> traversing via page->lru. It would need some safety checks obviously
> (maybe based on page_to_pfn + pfn_valid, or something) to make sure we
> only read from real struct pages in case there's some update racing. The
> worst case would be not populating enough due to race, and thus not
> gaining the performance when doing the actual rmqueueing under lock.

Yes, there are the 2 potential problems you have pointed out:
1 we may be prefetching something that isn't a page, because page->lru
  can be reused as different things under different scenarios;
2 we may not be able to prefetch much, because another CPU may be doing
  allocation inside the lock and it's possible we end up prefetching
  pages that are on that CPU's pcp list.

Considering the above 2 problems, I feel prefetching outside the lock is
a little risky and troublesome.

The allocation path is the hard part of improving page allocator
performance - in the free path, we can prefetch pages safely outside the
lock and even pre-merge them outside the lock to reduce the pressure on
the zone lock; but in the allocation path, there is pretty much nothing
we can do before acquiring the lock, except taking the risk of
prefetching without the lock as you mentioned here.
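
To make the free path part concrete, the prefetch done there is roughly
the following (simplified sketch of the idea, order-0 case only): touch
the buddy's struct page while draining the pcp list, before zone->lock
is taken, so the merging done later under the lock hits cache-hot data.

static inline void prefetch_buddy(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);
	unsigned long buddy_pfn = pfn ^ 1;	/* order-0 buddy's pfn */
	struct page *buddy = page + (buddy_pfn - pfn);

	prefetch(buddy);
}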

We can come back to this if 'address space range' lock doesn't work out.


Re: [RFC v4 PATCH 2/5] mm/__free_one_page: skip merge for order-0 page unless compaction failed

2018-10-20 Thread Aaron Lu
On Fri, Oct 19, 2018 at 08:00:53AM -0700, Daniel Jordan wrote:
> On Fri, Oct 19, 2018 at 09:54:35AM +0100, Mel Gorman wrote:
> > On Fri, Oct 19, 2018 at 01:57:03PM +0800, Aaron Lu wrote:
> > > > 
> > > > I don't think this is the right way of thinking about it because it's
> > > > possible to have the system split in such a way so that the migration
> > > > scanner only encounters unmovable pages before it meets the free scanner
> > > > where unmerged buddies were in the higher portion of the address space.
> > > 
> > > Yes it is possible unmerged pages are in the higher portion.
> > > 
> > > My understanding is, when the two scanners meet, all unmerged pages will
> > > be either used by the free scanner as migrate targets or sent to merge
> > > by the migration scanner.
> > > 
> > 
> > It's not guaranteed if the lower portion of the address space consisted
> > entirely of pages that cannot migrate (because they are unmovable or because
> > migration failed due to pins). It's actually a fundamental limitation
> > of compaction that it can miss migration and compaction opportunities
> > due to how the scanners are implemented. It was designed that way to
> > avoid pageblocks being migrated unnecessarily back and forth but the
> > downside is missed opportunities.
> > 
> > > > You either need to keep unmerged buddies on a separate list or search
> > > > the order-0 free list for merge candidates prior to compaction.
> > > > 
> > > > > > It's needed to form them efficiently but excessive reclaim or 
> > > > > > writing 3
> > > > > > to drop_caches can also do it. Be careful of tying lazy buddy too
> > > > > > closely to compaction.
> > > > > 
> > > > > That's the current design of this patchset, do you see any immediate
> > > > > problem of this? Is it that you are worried about high-order 
> > > > > allocation
> > > > > success rate using this design?
> > > > 
> > > > I've pointed out what I see are the design flaws but yes, in general, 
> > > > I'm
> > > > worried about the high order allocation success rate using this design,
> > > > the reliance on compaction and the fact that the primary motivation is
> > > > when THP is disabled.
> > > 
> > > When THP is in use, zone lock contention is pretty much nowhere :-)
> > > 
> > > I'll see what I can get with 'address space range' lock first and will
> > > come back to 'lazy buddy' if it doesn't work out.
> 
> With the address space range idea, wouldn't the zone free_area require changes
> too?  I can't see how locking by address range could synchronize it as it
> exists now otherwise, with per order/mt list heads.
> 
> One idea is to further subdivide the free area according to how the locking
> works and find some reasonable way to handle having to search for pages of a
> given order/mt in multiple places.

I plan to create one free_area per 'address space range'. The challenge
will be how to quickly locate a free_area that has the required free
page on the allocation path. Other details, like how big the address
space range should be, will need to be explored with testing.
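
Purely as an illustration of the direction (names are made up, nothing
like this exists yet), the per-range structure could look something
like:

struct range_free_area {
	spinlock_t		lock;		/* replaces zone->lock for this range */
	unsigned long		start_pfn;	/* first pfn covered by this range */
	unsigned long		end_pfn;	/* one past the last covered pfn */
	struct free_area	free_area[MAX_ORDER];	/* one entry per order */
};

The allocation path would then need some cheap way to pick a range that
still has a free page of the wanted order/migratetype.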

I think this approach is worth a try because it wouldn't cause
fragmentation.



Re: [RFC v4 PATCH 2/5] mm/__free_one_page: skip merge for order-0 page unless compaction failed

2018-10-18 Thread Aaron Lu
On Thu, Oct 18, 2018 at 12:16:32PM +0100, Mel Gorman wrote:
> On Wed, Oct 17, 2018 at 10:59:04PM +0800, Aaron Lu wrote:
> > > Any particuular reason why? I assume it's related to the number of zone
> > > locks with the increase number of zones and the number of threads used
> > > for the test.
> > 
> > I think so too.
> > 
> > The 4 sockets server has 192 CPUs in total while the 2 sockets server
> > has 112 CPUs in total. Assume only ZONE_NORMAL are used, for the 4
> > sockets server it would be 192/4=48(CPUs per zone) while for the 2
> > sockets server it is 112/2=56(CPUs per zone). The test is started with
> > nr_task=nr_cpu so for the 2 sockets servers, it ends up having more CPUs
> > consuming one zone.
> > 
> 
> Nice that the prediction is accurate. It brings us to another option --
> breaking up the zone lock by either hash or address space ranges. The
> address space ranges would probably be easier to implement. Where it
> gets hairy is that PFN walkers would need different zone locks. However,
> overall it might be a better option because it's not order-0 specific.

I think the 'address space range' lock is worth a try.

> It would be a lot of legwork because all uses of the zone lock would
> have to be audited to see which ones protect the free lists and which
> ones protect "something else".

Yes, a lot of details.

> > > That's important to know. It does reduce the utility of the patch
> > > somewhat but not all arches support THP and THP is not always enabled on
> > > x86.
> > 
> > I always wondered how systems are making use of THP.
> > After all, when system has been runing a while(days or months), file
> > cache should consumed a lot of memory and high order pages will become
> > more and more scare. If order9 page can't be reliably allocated, will
> > workload rely on it?
> > Just a thought.
> > 
> 
> File cache can usually be trivially reclaimed and moved. It's a "how
> long is a piece of string" to determine at what point a system can get
> fragmented and whether than can be prevented. It's somewhat outside the
> scope of this patch but anecdotally I'm looking at a machine with 20 days
> uptime and it still has 2390GB worth of THPs free after a large amount
> of reclaim activity over the system lifetime so fragmentation avoidance
> does work in some cases.

Good to know, thanks.

> 
> > THP is of course pretty neat that it reduced TLB cost, needs fewer page
> > table etc. I just wondered if people really rely on it, or using it
> > after their system has been up for a long time.
> > 
> 
> If people didn't rely on it then we might as well delete THP and the
> declare the whole tmpfs-backed-THP as worthless.
> 
> > > Yes, but note that the concept is still problematic.
> > > isolate_migratepages_block is not guaranteed to find a pageblock with
> > > unmerged buddies in it. If there are pageblocks towards the end of the
> > > zone with unmerged pages, they may never be found. This will be very hard
> > > to detect at runtime because it's heavily dependant on the exact state
> > > of the system.
> > 
> > Quite true.
> > 
> > The intent here though, is not to have compaction merge back all
> > unmerged pages, but did the merge for these unmerged pages in a
> > piggyback way, i.e. since isolate_migratepages_block() is doing the
> > scan, why don't we let it handle these unmerged pages when it meets
> > them?
> > 
> > If for some reason isolate_migratepages_block() didn't meet a single
> > unmerged page before compaction succeed, we probably do not need worry
> > much yet since compaction succeeded anyway.
> > 
> 
> I don't think this is the right way of thinking about it because it's
> possible to have the system split in such a way so that the migration
> scanner only encounters unmovable pages before it meets the free scanner
> where unmerged buddies were in the higher portion of the address space.

Yes, it is possible that unmerged pages are in the higher portion.

My understanding is that when the two scanners meet, all unmerged pages
will either have been used by the free scanner as migration targets or
sent for merging by the migration scanner.

> 
> You either need to keep unmerged buddies on a separate list or search
> the order-0 free list for merge candidates prior to compaction.
> 
> > > It's needed to form them efficiently but excessive reclaim or writing 3
> > > to drop_caches can also do it. Be careful of tying lazy buddy too
> > > closely to compaction.
> > 
> > That's the current design of this patchset, do you see any immediate


Re: [RFC v4 PATCH 3/5] mm/rmqueue_bulk: alloc without touching individual page structure

2018-10-18 Thread Aaron Lu
On Thu, Oct 18, 2018 at 12:20:55PM +0100, Mel Gorman wrote:
> On Wed, Oct 17, 2018 at 10:23:27PM +0800, Aaron Lu wrote:
> > > RT has had problems with cpu_relax in the past but more importantly, as
> > > this delay for parallel compactions and allocations of contig ranges,
> > > we could be stuck here for very long periods of time with interrupts
> > 
> > The longest possible time is one CPU accessing pcp->batch number cold
> > cachelines. Reason:
> > When zone_wait_cluster_alloc() is called, we already held zone lock so
> > no more allocations are possible. Waiting in_progress to become zero
> > means waiting any CPU that increased in_progress to finish processing
> > their allocated pages. Since they will at most allocate pcp->batch pages
> > and the worst case is that all these page structures are cache cold, so the
> > longest wait time is one CPU accessing pcp->batch number cold cache lines.
> > 
> > I have no idea if this time is too long though.
> > 
> 
> But compact_zone calls zone_wait_and_disable_cluster_alloc so how is the
> disabled time there bound by pcp->batch?

My mistake, I misunderstood spin_lock_irqsave() and thought the lock would
need to be acquired before irq is disabled...

So yeah, your concern about a possibly excessively long irq-disabled time
here is valid.


Re: [RFC v4 PATCH 2/5] mm/__free_one_page: skip merge for order-0 page unless compaction failed

2018-10-18 Thread Aaron Lu
On Thu, Oct 18, 2018 at 10:23:22AM +0200, Vlastimil Babka wrote:
> On 10/18/18 8:48 AM, Aaron Lu wrote:
> > On Wed, Oct 17, 2018 at 07:03:30PM +0200, Vlastimil Babka wrote:
> >> On 10/17/18 3:58 PM, Mel Gorman wrote:
> >>> Again, as compaction is not guaranteed to find the pageblocks, it would
> >>> be important to consider whether a) that matters or b) find an
> >>> alternative way of keeping unmerged buddies on separate lists so they
> >>> can be quickly discovered when a high-order allocation fails.
> >>
> >> Agree, unmerged buddies could be on separate freelist from regular
> >> order-0 freelist. That list could be also preferred to allocations
> >> before the regular one. Then one could e.g. try "direct merging" via
> >> this list when compaction fails, or prefer direct merging to compaction
> >> for non-costly-order allocations, do direct merging when allocation
> >> context doesn't even allow compaction (atomic etc).
> > 
> > One concern regarding "direct merging" these unmerged pages via this
> > separate freelist(let's call it unmerged_free_list) is: adjacent
> > unmerged pages on the unmerged_free_list could be far away from each
> > other regarding their physical positions, so during the process of
> > merging them, the needed high order page may not be able to be formed
> > in a short time. Actually, the time could be unbounded in a bad condition
> > when:
> > 1 unmerged pages adjacent on the unmerged_free_list happen to be far
> >   away from each other regarding their physical positions; and
> 
> I'm not sure I understand. Why should it matter for merging if pages are
> adjacent on the unmerged_free_list? The buddy for merging is found the
> usual way, no?

Yes it's found the usual way. I probably didn't state it clearly, let me try
again.

Consider a pageblock, initially a free order-9 page. Let's assume
this order-9 page is expand()ed into 512 order-0 pages during different
allocation requests and they go to different applications running on
different CPUs. After some time, all of them are freed back, but each
of them is freed back at a different time, so they are not adjacent on
the unmerged_free_list (they could be far away from each other).

In the above scenario, merging pages on the unmerged_free_list one by one
may not be an efficient way to form a high-order page, but scanning a
pageblock PFN-wise could be (see the sketch below).

Of course, the above scenario is imagined by me as a worst case; the
normal case could be much better.
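
Roughly, the PFN-ordered walk argued for above would look like the
sketch below -- page_merge_was_skipped() is an illustrative stand-in
for the buddy_merge_skipped tracking in this series, and zone->lock
handling is omitted:

static void merge_skipped_pages_in_block(unsigned long start_pfn,
					 unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page;

		if (!pfn_valid_within(pfn))
			continue;

		page = pfn_to_page(pfn);
		/* Only buddy pages that skipped merging need work. */
		if (PageBuddy(page) && page_merge_was_skipped(page))
			try_to_merge_page(page);
	}
}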

> 
> > 2 there are a lot of unmerged pages on unmerged_free_list.
> 
> That will affect allocation latency, yeah. Still might be faster than
> direct compaction. And possible to do in GFP_ATOMIC context, unlike
> direct compaction.

I see, but I'm not sure if it is OK to do 'direct merging' in GFP_ATOMIC
context - it is better for cases where failure to have the high-order
page allocated is very bad, but it might not be a good idea if the caller
has a fallback mechanism, i.e. if high-order page allocation fails, they
can work with order-0.

> 
> > That's the reason I hooked the merging of unmerged pages in compaction
> > when isolate_migratepages_block() is scanning every page of a pageblock
> > in PFN order.
> > 
> > OTOH, if there is a kernel thread trying to reduce fragmentation by
> > doing merges for these unmerged pages, I think it's perfectly fine to let
> > it iterate all unmerged pages of that list and do_merge() for all of
> > them.
> > 
> > So what about this: if kcompactd is running, let it handle these
> > unmerged pages on the list and after that, do its usual job of
> > compaction. If direct compaction is running, do not handle unmerged
> > pages on that list but rely on isolate_migratepages_block() to do the
> > merging as is done in this patchset.
> > 
> > This of course has the effect of tying compaction with 'lazy merging'.
> > If it is not desirable, what about creating a new kernel thread to do
> > the merging of unmerged pages on the list while keeping the behaviour of
> > isolate_migratepages_block() in this patchset to improve compaction
> > success rate.
> 
> Note that anything based on daemons will seem like reducing latency for
> allocations, but if we delay merging and then later do it from a daemon,
> the overall zone lock times will be essentially the same, right? The
> reduced zone lock benefits only happen when the unmerged pages get
> reallocated.

Agree.
 
> >> Also I would definitely consider always merging pages freed to
> >> non-MOVABLE pageblocks. We really don't want to increase the
> >> fragmentation in those. However that means it probably won't help the
> >> netperf case?
> > 
> > Yes, that would be unfortunate for all in-kernel users of page
> > allocator...
> 
> In that case there should definitely be a direct merging possibility
> IMHO, even if only as a last resort before stealing from another pageblock.
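
A rough sketch of such a 'direct merging' fallback, assuming the
deferred pages sit on a separate unmerged_free_list as discussed above
(the list and this helper are illustrative only, not from the posted
series, and the caller is assumed to hold zone->lock):

static bool try_direct_merge(struct zone *zone, unsigned int order,
			     int migratetype)
{
	struct page *page, *tmp;
	unsigned int o;

	/* Merge every deferred order-0 page back into its buddy. */
	list_for_each_entry_safe(page, tmp, &zone->unmerged_free_list, lru)
		try_to_merge_page(page);

	/* Did that produce a block of the requested order (or larger)? */
	for (o = order; o < MAX_ORDER; o++)
		if (!list_empty(&zone->free_area[o].free_list[migratetype]))
			return true;

	return false;
}

Only if this fails would the allocator go on to steal from another
pageblock.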


Re: [RFC v4 PATCH 2/5] mm/__free_one_page: skip merge for order-0 page unless compaction failed

2018-10-18 Thread Aaron Lu
On Wed, Oct 17, 2018 at 07:03:30PM +0200, Vlastimil Babka wrote:
> On 10/17/18 3:58 PM, Mel Gorman wrote:
> > Again, as compaction is not guaranteed to find the pageblocks, it would
> > be important to consider whether a) that matters or b) find an
> > alternative way of keeping unmerged buddies on separate lists so they
> > can be quickly discovered when a high-order allocation fails.
> 
> Agree, unmerged buddies could be on separate freelist from regular
> order-0 freelist. That list could be also preferred to allocations
> before the regular one. Then one could e.g. try "direct merging" via
> this list when compaction fails, or prefer direct merging to compaction
> for non-costly-order allocations, do direct merging when allocation
> context doesn't even allow compaction (atomic etc).

One concern regarding "direct merging" these unmerged pages via this
separate freelist(let's call it unmerged_free_list) is: adjacent
unmerged pages on the unmerged_free_list could be far away from each
other regarding their physical positions, so during the process of
merging them, the needed high order page may not be able to be formed
in a short time. Actually, the time could be unbounded in a bad condition
when:
1 unmerged pages adjacent on the unmerged_free_list happen to be far
  away from each other regarding their physical positions; and
2 there are a lot of unmerged pages on unmerged_free_list.

That's the reason I hooked the merging of unmerged pages in compaction
when isolate_migratepages_block() is scanning every page of a pageblock
in PFN order.

OTOH, if there is a kernel thread trying to reduce fragmentation by
doing merges for these unmerged pages, I think it's perfectly fine to let
it iterate all unmerged pages of that list and do_merge() for all of
them.

So what about this: if kcompactd is running, let it handle these
unmerged pages on the list and after that, do its usual job of
compaction. If direct compaction is running, do not handle unmerged
pages on that list but rely on isolate_migratepages_block() to do the
merging as is done in this patchset.

This of course has the effect of tying compaction with 'lazy merging'.
If it is not desirable, what about creating a new kernel thread to do
the merging of unmerged pages on the list while keeping the behaviour of
isolate_migratepages_block() in this patchset to improve compaction
success rate.

> Also I would definitely consider always merging pages freed to
> non-MOVABLE pageblocks. We really don't want to increase the
> fragmentation in those. However that means it probably won't help the
> netperf case?

Yes, that would be unfortunate for all in-kernel users of page
allocator...
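
If we went that way, a hypothetical variant of this series'
can_skip_merge() could take the pageblock's migratetype into account --
the extra parameter is an assumption for illustration, not part of the
posted patches:

static inline bool can_skip_merge(struct zone *zone, int order,
				  int migratetype)
{
	/* Only consider skipping the merge for order-0 pages. */
	if (order)
		return false;

	/* Always merge back into non-MOVABLE pageblocks. */
	if (migratetype != MIGRATE_MOVABLE)
		return false;

	return true;
}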


Re: [RFC v4 PATCH 2/5] mm/__free_one_page: skip merge for order-0 page unless compaction failed

2018-10-17 Thread Aaron Lu
On Wed, Oct 17, 2018 at 02:58:07PM +0100, Mel Gorman wrote:
> On Wed, Oct 17, 2018 at 09:10:59PM +0800, Aaron Lu wrote:
> > On Wed, Oct 17, 2018 at 11:44:27AM +0100, Mel Gorman wrote:
> > > On Wed, Oct 17, 2018 at 02:33:27PM +0800, Aaron Lu wrote:
> > > > Running will-it-scale/page_fault1 process mode workload on a 2 sockets
> > > > Intel Skylake server showed severe lock contention of zone->lock, as
> > > > high as about 80%(42% on allocation path and 35% on free path) CPU
> > > > cycles are burnt spinning. With perf, the most time consuming part 
> > > > inside
> > > > that lock on free path is cache missing on page structures, mostly on
> > > > the to-be-freed page's buddy due to merging.
> > > > 
> > > 
> > > This confuses me slightly. The commit log for d8a759b57035 ("mm,
> > > page_alloc: double zone's batchsize") indicates that the contention for
> > > will-it-scale moved from the zone lock to the LRU lock. This appears to
> > > contradict that although the exact test case is different (page_fault_1
> > > vs page_fault2). Can you clarify why commit d8a759b57035 is
> > > insufficient?
> > 
> > commit d8a759b57035 helps zone lock scalability and while it reduced
> > zone lock scalability to some extent(but not entirely eliminated it),
> > the lock contention shifted to LRU lock in the meantime.
> > 
> 
> I assume you meant "zone lock contention" in the second case.

Yes, that's right.

> 
> > e.g. from commit d8a759b57035's changelog, with the same test case
> > will-it-scale/page_fault1:
> > 
> > 4 sockets Skylake:
> > batch   score change   zone_contention   lru_contention   
> > total_contention
> >  31   15345900+0.00%   64% 8%   72%
> >  63   17992886   +17.25%   24%45%   69%
> > 
> > 4 sockets Broadwell:
> > batch   score change   zone_contention   lru_contention   
> > total_contention
> >  31   16703983+0.00%   67% 7%   74%
> >  63   1825+9.49%   38%33%   71%
> > 
> > 2 sockets Skylake:
> > batch   score change   zone_contention   lru_contention   
> > total_contention
> >  31   9554867 +0.00%   66% 3%   69%
> >  63   9980145 +4.45%   62% 4%   66%
> > 
> > Please note that though zone lock contention for the 4 sockets server
> > reduced a lot with commit d8a759b57035, 2 sockets Skylake still suffered
> > a lot from zone lock contention even after we doubled batch size.
> > 
> 
> Any particular reason why? I assume it's related to the number of zone
> locks with the increased number of zones and the number of threads used
> for the test.

I think so too.

The 4 sockets server has 192 CPUs in total while the 2 sockets server
has 112 CPUs in total. Assuming only ZONE_NORMAL is used, for the 4
sockets server it would be 192/4=48 (CPUs per zone) while for the 2
sockets server it is 112/2=56 (CPUs per zone). The test is started with
nr_task=nr_cpu so for the 2 sockets servers, it ends up having more CPUs
consuming one zone.

> 
> > Also, the reduced zone lock contention will again get worse if LRU lock
> > is optimized away by Daniel's work, or in cases there are no LRU in the
> > picture, e.g. an in-kernel user of page allocator like Tariq Toukan
> > demonstrated with netperf.
> > 
> 
> Vaguely understood, I never looked at the LRU lock patches.
> 
> > > I'm wondering is this really about reducing the number of dirtied cache
> > > lines due to struct page updates and less about the actual zone lock.
> > 
> > Hmm...if we reduce the time it takes under the zone lock, aren't we
> > helping the zone lock? :-)
> > 
> 
> Indirectly yes but reducing cache line dirtying is useful in itself so
> they should be at least considered separately as independent
> optimisations.
> 
> > > 
> > > > One way to avoid this overhead is not do any merging at all for order-0
> > > > pages. With this approach, the lock contention for zone->lock on free
> > > > path dropped to 1.1% but allocation side still has as high as 42% lock
> > > > contention. In the meantime, the dropped lock contention on free side
> > > > doesn't translate to performance increase, instead, it's consumed by
> > > > increased lock contention of the per node lru_lock(rose from 5% to 37%)
> > > > and the final performance slightly dropped about 1%.


Re: [RFC v4 PATCH 3/5] mm/rmqueue_bulk: alloc without touching individual page structure

2018-10-17 Thread Aaron Lu
On Wed, Oct 17, 2018 at 12:20:42PM +0100, Mel Gorman wrote:
> On Wed, Oct 17, 2018 at 02:33:28PM +0800, Aaron Lu wrote:
> > Profile on Intel Skylake server shows the most time consuming part
> > under zone->lock on allocation path is accessing those to-be-returned
> > page's "struct page" on the free_list inside zone->lock. One explanation
> > is, different CPUs are releasing pages to the head of free_list and
> > those page's 'struct page' may very well be cache cold for the allocating
> > CPU when it grabs these pages from free_list' head. The purpose here
> > is to avoid touching these pages one by one inside zone->lock.
> > 
> 
> I didn't read this one in depth because it's somewhat ortogonal to the
> lazy buddy merging which I think would benefit from being finalised and
> ensuring that there are no reductions in high-order allocation success
> rates.  Pages being allocated on one CPU and freed on another is not that
> unusual -- ping-pong workloads or things like netperf used to exhibit
> this sort of pattern.
> 
> However, this part stuck out
> 
> > +static inline void zone_wait_cluster_alloc(struct zone *zone)
> > +{
> > +   while (atomic_read(&zone->cluster.in_progress))
> > +   cpu_relax();
> > +}
> > +
> 
> RT has had problems with cpu_relax in the past but more importantly, as
> this delay for parallel compactions and allocations of contig ranges,
> we could be stuck here for very long periods of time with interrupts

The longest possible time is one CPU accessing pcp->batch number cold
cachelines. Reason:
When zone_wait_cluster_alloc() is called, we already held zone lock so
no more allocations are possible. Waiting in_progress to become zero
means waiting any CPU that increased in_progress to finish processing
their allocated pages. Since they will at most allocate pcp->batch pages
and the worst case is that all these page structures are cache cold, so the
longest wait time is one CPU accessing pcp->batch number cold cache lines.

I have no idea if this time is too long though.
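
For a rough sense of scale: assuming pcp->batch is 63 (after commit
d8a759b57035 doubled it) and a cold cacheline miss costs on the order
of 100ns, that bounds the spin at roughly 63 * 100ns ~= 6.3us in the
worst case -- ballpark assumptions on my part, not measurements.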

> disabled. It gets even worse if it's from an interrupt context such as
> jumbo frame allocation or a high-order slab allocation that is atomic.

My understanding is atomic allocation won't trigger compaction, no?

> These potentially large periods of time with interrupts disabled is very
> hazardous.

I see and agree, thanks for pointing this out.
Hopefully, the above-mentioned worst-case time won't be regarded as
unbounded or too long.

> It may be necessary to consider instead minimising the number
> of struct page update when merging to PCP and then either increasing the
> size of the PCP or allowing it to exceed pcp->high for short periods of
> time to batch the struct page updates.

I don't quite follow this part. It doesn't seem possible that we can exceed
pcp->high in the allocation path, or are you talking about the free path?

And thanks a lot for the review!


Re: [RFC v4 PATCH 2/5] mm/__free_one_page: skip merge for order-0 page unless compaction failed

2018-10-17 Thread Aaron Lu
On Wed, Oct 17, 2018 at 11:44:27AM +0100, Mel Gorman wrote:
> On Wed, Oct 17, 2018 at 02:33:27PM +0800, Aaron Lu wrote:
> > Running will-it-scale/page_fault1 process mode workload on a 2 sockets
> > Intel Skylake server showed severe lock contention of zone->lock, as
> > high as about 80%(42% on allocation path and 35% on free path) CPU
> > cycles are burnt spinning. With perf, the most time consuming part inside
> > that lock on free path is cache missing on page structures, mostly on
> > the to-be-freed page's buddy due to merging.
> > 
> 
> This confuses me slightly. The commit log for d8a759b57035 ("mm,
> page_alloc: double zone's batchsize") indicates that the contention for
> will-it-scale moved from the zone lock to the LRU lock. This appears to
> contradict that although the exact test case is different (page_fault_1
> vs page_fault2). Can you clarify why commit d8a759b57035 is
> insufficient?

commit d8a759b57035 helps zone lock scalability and while it reduced
zone lock scalability to some extent(but not entirely eliminated it),
the lock contention shifted to LRU lock in the meantime.

e.g. from commit d8a759b57035's changelog, with the same test case
will-it-scale/page_fault1:

4 sockets Skylake:
batch   score      change    zone_contention   lru_contention   total_contention
 31     15345900   +0.00%    64%               8%               72%
 63     17992886   +17.25%   24%               45%              69%

4 sockets Broadwell:
batch   score      change    zone_contention   lru_contention   total_contention
 31     16703983   +0.00%    67%               7%               74%
 63     1825       +9.49%    38%               33%              71%

2 sockets Skylake:
batch   score      change    zone_contention   lru_contention   total_contention
 31     9554867    +0.00%    66%               3%               69%
 63     9980145    +4.45%    62%               4%               66%

Please note that though zone lock contention for the 4 sockets server
reduced a lot with commit d8a759b57035, 2 sockets Skylake still suffered
a lot from zone lock contention even after we doubled batch size.

Also, the reduced zone lock contention will again get worse if LRU lock
is optimized away by Daniel's work, or in cases there are no LRU in the
picture, e.g. an in-kernel user of page allocator like Tariq Toukan
demonstrated with netperf.

> I'm wondering is this really about reducing the number of dirtied cache
> lines due to struct page updates and less about the actual zone lock.

Hmm...if we reduce the time it takes under the zone lock, aren't we
helping the zone lock? :-)

> 
> > One way to avoid this overhead is not do any merging at all for order-0
> > pages. With this approach, the lock contention for zone->lock on free
> > path dropped to 1.1% but allocation side still has as high as 42% lock
> > contention. In the meantime, the dropped lock contention on free side
> > doesn't translate to performance increase, instead, it's consumed by
> > increased lock contention of the per node lru_lock(rose from 5% to 37%)
> > and the final performance slightly dropped about 1%.
> > 
> 
> Although this implies it's really about contention.
> 
> > Though performance dropped a little, it almost eliminated zone lock
> > contention on free path and it is the foundation for the next patch
> > that eliminates zone lock contention for allocation path.
> > 
> 
> Can you clarify whether THP was enabled or not? As this is order-0 focused,
> it would imply the series should have minimal impact due to limited merging.

Sorry about this, I should have mentioned THP is not used here.

> 
> > Suggested-by: Dave Hansen 
> > Signed-off-by: Aaron Lu 
> > ---
> >  include/linux/mm_types.h |  9 +++-
> >  mm/compaction.c  | 13 +-
> >  mm/internal.h| 27 
> >  mm/page_alloc.c  | 88 ++--
> >  4 files changed, 121 insertions(+), 16 deletions(-)
> > 
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 5ed8f6292a53..aed93053ef6e 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -179,8 +179,13 @@ struct page {
> > int units;  /* SLOB */
> > };
> >  
> > -   /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
> > -   atomic_t _refcount;
> > +   union {
> > +   /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
> > +   atomic_t _refcount;
> > +
> > +   /* For pages in Buddy: if skipped merging when added to Buddy */
> > +   bool buddy_merge_skipped;
> > +   };
> 


[RFC v4 PATCH 5/5] mm/can_skip_merge(): make it more aggressive to attempt cluster alloc/free

2018-10-17 Thread Aaron Lu
After the system runs for a long time, it's easy for a zone to have no
suitable high order page available, and that will stop cluster alloc
and free in the current implementation due to compact_considered > 0.

To make it favour order-0 alloc/free, relax the condition to only
disallow cluster alloc/free when a problem would occur, e.g. when
compaction is in progress.

Signed-off-by: Aaron Lu 
---
 mm/internal.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index fb4e8f7976e5..309a3f43e613 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -538,10 +538,6 @@ void try_to_merge_page(struct page *page);
 #ifdef CONFIG_COMPACTION
 static inline bool can_skip_merge(struct zone *zone, int order)
 {
-   /* Compaction has failed in this zone, we shouldn't skip merging */
-   if (zone->compact_considered)
-   return false;
-
/* Only consider no_merge for order 0 pages */
if (order)
return false;
-- 
2.17.2



[RFC v4 PATCH 1/5] mm/page_alloc: use helper functions to add/remove a page to/from buddy

2018-10-17 Thread Aaron Lu
There are multiple places that add/remove a page to/from buddy;
introduce helper functions for them.

This also makes it easier to add code when a page is added/removed
to/from buddy.

No functionality change.

Acked-by: Vlastimil Babka 
Signed-off-by: Aaron Lu 
---
 mm/page_alloc.c | 65 +
 1 file changed, 39 insertions(+), 26 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 89d2a2ab3fe6..14c20bb3a3da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -697,12 +697,41 @@ static inline void set_page_order(struct page *page, unsigned int order)
__SetPageBuddy(page);
 }
 
+static inline void add_to_buddy_common(struct page *page, struct zone *zone,
+   unsigned int order)
+{
+   set_page_order(page, order);
+   zone->free_area[order].nr_free++;
+}
+
+static inline void add_to_buddy_head(struct page *page, struct zone *zone,
+   unsigned int order, int mt)
+{
+   add_to_buddy_common(page, zone, order);
+   list_add(&page->lru, &zone->free_area[order].free_list[mt]);
+}
+
+static inline void add_to_buddy_tail(struct page *page, struct zone *zone,
+   unsigned int order, int mt)
+{
+   add_to_buddy_common(page, zone, order);
+   list_add_tail(&page->lru, &zone->free_area[order].free_list[mt]);
+}
+
 static inline void rmv_page_order(struct page *page)
 {
__ClearPageBuddy(page);
set_page_private(page, 0);
 }
 
+static inline void remove_from_buddy(struct page *page, struct zone *zone,
+   unsigned int order)
+{
+   list_del(&page->lru);
+   zone->free_area[order].nr_free--;
+   rmv_page_order(page);
+}
+
 /*
  * This function checks whether a page is free && is the buddy
  * we can coalesce a page and its buddy if
@@ -803,13 +832,10 @@ static inline void __free_one_page(struct page *page,
 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
 * merge with it and move up one order.
 */
-   if (page_is_guard(buddy)) {
+   if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
-   } else {
-   list_del(&buddy->lru);
-   zone->free_area[order].nr_free--;
-   rmv_page_order(buddy);
-   }
+   else
+   remove_from_buddy(buddy, zone, order);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -841,8 +867,6 @@ static inline void __free_one_page(struct page *page,
}
 
 done_merging:
-   set_page_order(page, order);
-
/*
 * If this is not the largest possible page, check if the buddy
 * of the next-highest order is free. If it is, it's possible
@@ -859,15 +883,12 @@ static inline void __free_one_page(struct page *page,
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
-   list_add_tail(&page->lru,
-   &zone->free_area[order].free_list[migratetype]);
-   goto out;
+   add_to_buddy_tail(page, zone, order, migratetype);
+   return;
}
}
 
-   list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
-out:
-   zone->free_area[order].nr_free++;
+   add_to_buddy_head(page, zone, order, migratetype);
 }
 
 /*
@@ -1805,9 +1826,7 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
 
-   list_add(&page[size].lru, &area->free_list[migratetype]);
-   area->nr_free++;
-   set_page_order(&page[size], high);
+   add_to_buddy_head(&page[size], zone, high, migratetype);
}
 }
 
@@ -1951,9 +1970,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
struct page, lru);
if (!page)
continue;
-   list_del(&page->lru);
-   rmv_page_order(page);
-   area->nr_free--;
+   remove_from_buddy(page, zone, current_order);
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
@@ -2871,9 +2888,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
 
/* Remove page from free list */
-   list_del(&page->lru);
-   zone->free_area[order].nr_f
