Hello Peter, thank you for your reply.
On Tue, 2016-08-02 at 12:37 +0200, Peter Zijlstra wrote: > On Tue, Jul 26, 2016 at 04:07:14PM +0200, Giovanni Gherdovich wrote: > > > Signed-off-by: Mike Galbraith <mgalbra...@suse.de> > > Signed-off-by: Giovanni Gherdovich <ggherdov...@suse.cz> > > SoB chain is borken. Either Mike wrote the patch in which case you're > missing a From: Mike header someplace, or you wrote it and Mike needs > to be a Ack/Reviewed or somesuch. Right. As Mike already explained, this patch is the result of him correcting a much more involved/complicated solution I prepared to solve the problem. I will put the "From: Mike" in v2. > > > --- > > kernel/sched/core.c | 4 ++++ > > 1 file changed, 4 insertions(+) > > > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > > index 51d7105..0ef1e69 100644 > > --- a/kernel/sched/core.c > > +++ b/kernel/sched/core.c > > @@ -2998,6 +2998,10 @@ unsigned long long task_sched_runtime(struct > > task_struct *p) > > * thread, breaking clock_gettime(). > > */ > > if (task_current(rq, p) && task_on_rq_queued(p)) { > > +#if defined(CONFIG_FAIR_GROUP_SCHED) > > This here wants a comment on why we're doing this. Because I'm sure > that if someone were to read this code in a few weeks they'd go > WTF!? I had that config variable set on the machine I was testing on, and thought that for some reason it was related to my observations. I will repeat the experiment without it, and if I obtain the same results I will drop the conditional. Otherwise I will explain why it is necessary. I will submit a v2 early next week, rebasing the patch on the forthcoming 4.8-rc1 tag and updating the experimental data. > > Also, is there a possibility of manual CSE we should do? > > > + prefetch((&p->se)->cfs_rq->curr); > > + prefetch(&(&p->se)->cfs_rq->curr->exec_start); > > +#endif > > update_rq_clock(rq); > > p->sched_class->update_curr(rq); > > } Good point. I verified that GCC 4.8.5 already performs this CSE on its own, without needing any hints.
This is the alternative code with the CSE that I compiled: -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51d7105..5d676db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2998,6 +2998,11 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { +#if defined(CONFIG_FAIR_GROUP_SCHED) + struct sched_entity *curr = (&p->se)->cfs_rq->curr; + prefetch(curr); + prefetch(&curr->exec_start); +#endif update_rq_clock(rq); p->sched_class->update_curr(rq); } -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 I post below the snippets of generated code with and without CSE that I obtained by running 'disassemble /m task_sched_runtime' in gdb; you'll see they're identical. If you prefer the explicit hint I'll include it in v2, but it's probably safe to say it isn't needed. Regards, Giovanni with CSE: -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 3001#if defined(CONFIG_FAIR_GROUP_SCHED) 3002 struct sched_entity *curr = (&p->se)->cfs_rq->curr; <+117>: mov 0x1d0(%rbx),%rdx <+124>: mov 0x38(%rdx),%rdx 3003 prefetch(curr); 3004 prefetch(&curr->exec_start); 3005#endif 3006 update_rq_clock(rq); 3007 p->sched_class->update_curr(rq); <+144>: mov 0x58(%rbx),%rdx <+148>: mov %rax,%rdi <+151>: mov %rax,-0x20(%rbp) <+155>: callq *0xb0(%rdx) <+161>: mov -0x20(%rbp),%rax <+165>: jmp <task_sched_runtime+66> <+167>: mov %rax,%rdi <+170>: mov %rax,-0x20(%rbp) <+174>: callq <update_rq_clock> <+179>: mov -0x20(%rbp),%rax <+183>: jmp <task_sched_runtime+144> : nopl 0x0(%rax) 3008 } 3009 ns = p->se.sum_exec_runtime; <+66>: mov 0xc8(%rbx),%r12 3010 task_rq_unlock(rq, p, &rf); 3011 3012 return ns; <+103>: mov %r12,%rax without CSE: -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 -- -- >8 3001#if defined(CONFIG_FAIR_GROUP_SCHED) 3002 prefetch((&p->se)->cfs_rq->curr); <+117>: mov 0x1d0(%rbx),%rdx <+124>: mov
0x38(%rdx),%rdx 3003 prefetch(&(&p->se)->cfs_rq->curr->exec_start); 3004#endif 3005 update_rq_clock(rq); 3006 p->sched_class->update_curr(rq); <+144>: mov 0x58(%rbx),%rdx <+148>: mov %rax,%rdi <+151>: mov %rax,-0x20(%rbp) <+155>: callq *0xb0(%rdx) <+161>: mov -0x20(%rbp),%rax <+165>: jmp <task_sched_runtime+66> <+167>: mov %rax,%rdi <+170>: mov %rax,-0x20(%rbp) <+174>: callq <update_rq_clock> <+179>: mov -0x20(%rbp),%rax <+183>: jmp <task_sched_runtime+144> : nopl 0x0(%rax) 3007 } 3008 ns = p->se.sum_exec_runtime; <+66>: mov 0xc8(%rbx),%r12 3009 task_rq_unlock(rq, p, &rf); 3010 3011 return ns; <+103>: mov %r12,%rax