Re: [PATCH v3 03/16] arm64: Allow IPIs to be handled as normal interrupts

2020-10-27 Thread Vincent Guittot
On Tue, 27 Oct 2020 at 13:06, Marc Zyngier  wrote:
>
> On 2020-10-27 11:21, Vincent Guittot wrote:
> > On Tue, 27 Oct 2020 at 11:50, Vincent Guittot
> >  wrote:
> >>
> >> On Tue, 27 Oct 2020 at 11:37, Marc Zyngier  wrote:
> >> >
> >> > On 2020-10-27 10:12, Vincent Guittot wrote:
> >> > > Hi Marc,
> >> > >
> >> > > On Mon, 19 Oct 2020 at 17:43, Vincent Guittot
> >> > >  wrote:
> >> > >>
> >> > >> On Mon, 19 Oct 2020 at 15:04, Marc Zyngier  wrote:
> >> > >> >
> >> > >
> >> > > ...
> >> > >
> >> > >> > >>
> >> > >> > >> One of the major differences is that, in some cases
> >> > >> > >> (such as when performing IRQ time accounting on the scheduler
> >> > >> > >> IPI), we end up with nested irq_enter()/irq_exit() pairs.
> >> > >> > >> Other than the (relatively small) overhead, there should be
> >> > >> > >> no consequences to it (these pairs are designed to nest
> >> > >> > >> correctly, and the accounting shouldn't be off).
> >> > >> > >
> >> > >> > > While rebasing on mainline, I have faced a performance regression for
> >> > >> > > the benchmark:
> >> > >> > > perf bench sched pipe
> >> > >> > > on my arm64 dual quad core (hikey) and my 2 nodes x 112 CPUs (thx2)
> >> > >> > >
> >> > >> > > The regression comes from:
> >> > >> > > commit: d3afc7f12987 ("arm64: Allow IPIs to be handled as normal
> >> > >> > > interrupts")
> >> > >> >
> >> > >> > That's interesting, as this patch doesn't really change anything (most
> >> > >> > of the potential overhead comes in later). The only potential overhead
> >> > >> > I can see is that the scheduler_ipi() call is now wrapped in
> >> > >> > irq_enter()/irq_exit().
> >> > >> >
> >> > >> > >
> >> > >> > >   v5.9  + this patch
> >> > >> > > hikey :   48818(+/- 0.31%)   37503(+/- 0.15%)  -23.2%
> >> > >> > > thx2  :  132410(+/- 1.72%)  122646(+/- 1.92%)   -7.4%
> >> > >> > >
> >> > >> > > By "+ this patch" I mean merging the branch up to this patch;
> >> > >> > > merging only up to the previous
> >> > >> > > commit: 83cfac95c018 ("genirq: Allow interrupts to be excluded from
> >> > >> > > /proc/interrupts") doesn't show any regression.
> >> > >> >
> >> > >> > Since you are running perf, can you spot where the overhead occurs?
> >> > >
> >> > > Any idea about the root cause of the regression?
> >> > > I have since seen it on more arm64 platforms.
> >> >
> >> > Two possible causes:
> >> >
> >> > (1) irq_enter/exit on the rescheduling IPI means we reschedule much more often
> >> > (2) irq_domain lookups add some overhead.
> >> >
> >> > For (1), I have this series[1] which is ugly as sin and needs much more testing.
> >>
> >> Ok, I'm going to test this series to see if it fixes the perf
> >> regression
> >
> > You have spotted the root cause of the regression. We are back to ~1%
> > performance diff on the hikey
>
> Yeah. Only thing is that I can't look at this hack without vomiting...

At least, we know the root cause and the impact of irq_enter/exit
>
>  M.
> --
> Jazz is not dead. It just smells funny...
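
As background on the nesting point in the quoted commit message: irq_enter()/irq_exit()
pairs nest safely because hardirq context is a counter rather than a flag, and only the
outermost exit triggers softirq processing. A toy userspace model of that bookkeeping
(a simplified sketch, not the kernel implementation; all toy_* names are illustrative):

#include <assert.h>
#include <stdio.h>

#define HARDIRQ_OFFSET (1 << 16)        /* one hardirq nesting level */

static unsigned int preempt_count;      /* toy stand-in for the per-CPU count */

static void toy_irq_enter(void)
{
        preempt_count += HARDIRQ_OFFSET;
}

static void toy_irq_exit(void)
{
        preempt_count -= HARDIRQ_OFFSET;
        if (preempt_count == 0)         /* only the outermost exit gets here */
                printf("outermost exit: softirqs would run at this point\n");
}

static void toy_scheduler_ipi(void)
{
        toy_irq_enter();                /* nested pair, e.g. IRQ time accounting */
        /* ... scheduler work ... */
        toy_irq_exit();
}

int main(void)
{
        toy_irq_enter();                /* outer pair: the IPI handled as a normal IRQ */
        toy_scheduler_ipi();
        assert(preempt_count == HARDIRQ_OFFSET); /* inner pair balanced */
        toy_irq_exit();
        assert(preempt_count == 0);     /* fully unwound */
        return 0;
}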


Re: [PATCH v3 03/16] arm64: Allow IPIs to be handled as normal interrupts

2020-10-27 Thread Vincent Guittot
On Tue, 27 Oct 2020 at 11:50, Vincent Guittot
 wrote:
>
> On Tue, 27 Oct 2020 at 11:37, Marc Zyngier  wrote:
> >
> > On 2020-10-27 10:12, Vincent Guittot wrote:
> > > Hi Marc,
> > >
> > > On Mon, 19 Oct 2020 at 17:43, Vincent Guittot
> > >  wrote:
> > >>
> > >> On Mon, 19 Oct 2020 at 15:04, Marc Zyngier  wrote:
> > >> >
> > >
> > > ...
> > >
> > >> > >>
> > >> > >> One of the major differences is that, in some cases
> > >> > >> (such as when performing IRQ time accounting on the scheduler
> > >> > >> IPI), we end up with nested irq_enter()/irq_exit() pairs.
> > >> > >> Other than the (relatively small) overhead, there should be
> > >> > >> no consequences to it (these pairs are designed to nest
> > >> > >> correctly, and the accounting shouldn't be off).
> > >> > >
> > >> > > While rebasing on mainline, I have faced a performance regression for
> > >> > > the benchmark:
> > >> > > perf bench sched pipe
> > >> > > on my arm64 dual quad core (hikey) and my 2 nodes x 112 CPUs (thx2)
> > >> > >
> > >> > > The regression comes from:
> > >> > > commit: d3afc7f12987 ("arm64: Allow IPIs to be handled as normal
> > >> > > interrupts")
> > >> >
> > >> > That's interesting, as this patch doesn't really change anything (most
> > >> > of the potential overhead comes in later). The only potential overhead
> > >> > I can see is that the scheduler_ipi() call is now wrapped in
> > >> > irq_enter()/irq_exit().
> > >> >
> > >> > >
> > >> > >   v5.9  + this patch
> > >> > > hikey :   48818(+/- 0.31%)   37503(+/- 0.15%)  -23.2%
> > >> > > thx2  :  132410(+/- 1.72%)  122646(+/- 1.92%)   -7.4%
> > >> > >
> > >> > > By "+ this patch" I mean merging the branch up to this patch;
> > >> > > merging only up to the previous
> > >> > > commit: 83cfac95c018 ("genirq: Allow interrupts to be excluded from
> > >> > > /proc/interrupts") doesn't show any regression.
> > >> >
> > >> > Since you are running perf, can you spot where the overhead occurs?
> > >
> > > Any idea about the root cause of the regression?
> > > I have since seen it on more arm64 platforms.
> >
> > Two possible causes:
> >
> > (1) irq_enter/exit on the rescheduling IPI means we reschedule much more often
> > (2) irq_domain lookups add some overhead.
> >
> > For (1), I have this series[1] which is ugly as sin and needs much more testing.
>
> Ok, I'm going to test this series to see if it fixes the perf regression

You have spotted the root cause of the regression. We are back to ~1%
performance diff on the hikey

>
> >
> > For (2), I have some ideas which need more work (let the irq domain resolve
> > to an irq_desc instead of an interrupt number, avoiding another radix-tree
> > lookup).
> >
> >  M.
> >
> > [1]
> > https://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git/log/?h=irq/ipi-fixes
> > --
> > Jazz is not dead. It just smells funny...
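
To illustrate point (2) above: today an interrupt is resolved in two steps, hwirq to
Linux IRQ number through the irq domain's radix tree, then Linux IRQ number to irq_desc
through a second radix tree; resolving straight to the descriptor would fold the two
walks into one. A toy sketch of both paths, with flat arrays standing in for the radix
trees (only irq_desc is a real kernel name here; the toy_* tables are illustrative):

#include <stdio.h>

struct irq_desc {
        int hwirq;
        /* handler, per-CPU statistics, ... */
};

#define NR 64

static int toy_hwirq_to_virq[NR];               /* stands in for the irq_domain tree */
static struct irq_desc *toy_virq_to_desc[NR];   /* stands in for irq_to_desc() */
static struct irq_desc *toy_hwirq_to_desc[NR];  /* proposed: one direct mapping */

/* Current path: two lookups on every interrupt. */
static struct irq_desc *resolve_two_step(int hwirq)
{
        int virq = toy_hwirq_to_virq[hwirq];    /* walk #1: hwirq -> virq */
        return toy_virq_to_desc[virq];          /* walk #2: virq -> desc */
}

/* The idea above: let the domain hand back the descriptor directly. */
static struct irq_desc *resolve_direct(int hwirq)
{
        return toy_hwirq_to_desc[hwirq];        /* single walk */
}

int main(void)
{
        static struct irq_desc d = { .hwirq = 3 };

        toy_hwirq_to_virq[3] = 10;
        toy_virq_to_desc[10] = &d;
        toy_hwirq_to_desc[3] = &d;

        printf("two-step: %p, direct: %p\n",
               (void *)resolve_two_step(3), (void *)resolve_direct(3));
        return 0;
}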


Re: [PATCH v3 03/16] arm64: Allow IPIs to be handled as normal interrupts

2020-10-27 Thread Vincent Guittot
Hi Marc,

On Mon, 19 Oct 2020 at 17:43, Vincent Guittot
 wrote:
>
> On Mon, 19 Oct 2020 at 15:04, Marc Zyngier  wrote:
> >

...

> > >>
> > >> One of the major differences is that, in some cases
> > >> (such as when performing IRQ time accounting on the scheduler
> > >> IPI), we end up with nested irq_enter()/irq_exit() pairs.
> > >> Other than the (relatively small) overhead, there should be
> > >> no consequences to it (these pairs are designed to nest
> > >> correctly, and the accounting shouldn't be off).
> > >
> > > While rebasing on mainline, I have faced a performance regression for
> > > the benchmark:
> > > perf bench sched pipe
> > >> > > on my arm64 dual quad core (hikey) and my 2 nodes x 112 CPUs (thx2)
> > >
> > > The regression comes from:
> > > commit: d3afc7f12987 ("arm64: Allow IPIs to be handled as normal
> > > interrupts")
> >
> > That's interesting, as this patch doesn't really change anything (most
> > of the potential overhead comes in later). The only potential overhead
> > >> > I can see is that the scheduler_ipi() call is now wrapped in
> > irq_enter()/irq_exit().
> >
> > >
> > >   v5.9  + this patch
> > >> > > hikey :   48818(+/- 0.31%)   37503(+/- 0.15%)  -23.2%
> > >> > > thx2  :  132410(+/- 1.72%)  122646(+/- 1.92%)   -7.4%
> > >
> > >> > > By "+ this patch" I mean merging the branch up to this patch;
> > >> > > merging only up to the previous
> > >> > > commit: 83cfac95c018 ("genirq: Allow interrupts to be excluded from
> > >> > > /proc/interrupts") doesn't show any regression.
> >
> > Since you are running perf, can you spot where the overhead occurs?

Any idea about the root cause of the regression?
I have since seen it on more arm64 platforms.

>
> Hmm... difficult to say, because tracing the benchmark lowers the
> result a lot. I have pasted the perf reports.
>
> With this patch:
>
> # Samples: 634  of event 'cpu-clock'
> # Event count (approx.): 15850
> #
> # Overhead  Command Shared Object   Symbol
> #   ..  ..  ..
> #
> 31.86%  sched-pipe  [kernel.kallsyms]   [k] _raw_spin_unlock_irqrestore
>  8.68%  sched-pipe  [kernel.kallsyms]   [k] _raw_spin_unlock_irq
>  6.31%  sched-pipe  [kernel.kallsyms]   [k] __schedule
>  5.21%  sched-pipe  [kernel.kallsyms]   [k] schedule
>  4.73%  sched-pipe  [kernel.kallsyms]   [k] pipe_read
>  3.31%  sched-pipe  [kernel.kallsyms]   [k] el0_svc_common.constprop.3
>  2.84%  sched-pipe  [kernel.kallsyms]   [k] ww_mutex_lock_interruptible
>  2.52%  sched-pipe  [kernel.kallsyms]   [k] init_wait_entry
>  2.37%  sched-pipe  [kernel.kallsyms]   [k] mutex_unlock
>  2.21%  sched-pipe  [kernel.kallsyms]   [k] new_sync_read
>  1.89%  sched-pipe  [kernel.kallsyms]   [k] new_sync_write
>  1.74%  sched-pipe  [kernel.kallsyms]   [k] security_file_permission
>  1.74%  sched-pipe  [kernel.kallsyms]   [k] vfs_read
>  1.58%  sched-pipe  [kernel.kallsyms]   [k] __my_cpu_offset
>  1.26%  sched-pipe  libpthread-2.24.so  [.] 0x00010a2c
>  1.10%  sched-pipe  [kernel.kallsyms]   [k] mutex_lock
>  1.10%  sched-pipe  [kernel.kallsyms]   [k] vfs_write
>
> After reverting this patch which gives a result similar to v5.9:
>
> # Samples: 659  of event 'cpu-clock'
> # Event count (approx.): 16475
> #
> # Overhead  Command Shared Object   Symbol
> #   ..  ..  ...
> #
> 29.29%  sched-pipe  [kernel.kallsyms]   [k] _raw_spin_unlock_irqrestore
> 21.40%  sched-pipe  [kernel.kallsyms]   [k] _raw_spin_unlock_irq
>  4.86%  sched-pipe  [kernel.kallsyms]   [k] pipe_read
>  4.55%  sched-pipe  [kernel.kallsyms]   [k] ww_mutex_lock_interruptible
>  2.88%  sched-pipe  [kernel.kallsyms]   [k] __schedule
>  2.88%  sched-pipe  [kernel.kallsyms]   [k] _raw_spin_lock_irqsave
>  2.88%  sched-pipe  [kernel.kallsyms]   [k] schedule
>  2.12%  sched-pipe  [kernel.kallsyms]   [k] new_sync_read
>  1.82%  sched-pipe  [kernel.kallsyms]   [k] mutex_lock
>  1.67%  sched-pipe  [kernel.kallsyms]   [k] el0_svc_common.constprop.3
>  1.67%  sched-pipe  [kernel.kallsyms]   [k] pipe_write
>  1.21%  sched-pipe  [kernel.kallsyms]   [k] rw_verify_area
>  1.21%  sched-pipe  [kernel.kallsyms]   [k] security_file_permission
>  1.06%  sched-pipe  [kernel.kallsyms]   [k] fsnotify
>
> I have only listed symbols with overhead above 1%
>
> so _raw_spin_unlock_irq, schedule and __schedule seem the most
> impacted, but I can't draw any conclusion.
>
> I can send you the perf.data files if you want.
>
>
> >
> > Thanks,
> >
> >  M.
> > --
> > Jazz is not dead. It just smells funny...
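
Turning the reported throughput into per-operation cost makes the hit concrete
(assuming the figures are ops/sec, which matches the direction of the regression):
each hikey pipe round trip goes from about 20.5 to 26.7 usec, i.e. roughly 6 usec of
extra work per operation, against about 0.6 usec on thx2. A trivial program to redo
the arithmetic:

#include <stdio.h>

int main(void)
{
        /* ops/sec from the table above (unit assumed from the regression sign) */
        const double hikey_before = 48818, hikey_after = 37503;
        const double thx2_before = 132410, thx2_after = 122646;

        printf("hikey: %.1f -> %.1f usec/op (+%.1f)\n",
               1e6 / hikey_before, 1e6 / hikey_after,
               1e6 / hikey_after - 1e6 / hikey_before);
        printf("thx2 : %.2f -> %.2f usec/op (+%.2f)\n",
               1e6 / thx2_before, 1e6 / thx2_after,
               1e6 / thx2_after - 1e6 / thx2_before);
        return 0;
}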


Re: [PATCH v3 03/16] arm64: Allow IPIs to be handled as normal interrupts

2020-10-19 Thread Valentin Schneider
Hi,

On 19/10/20 16:43, Vincent Guittot wrote:
> On Mon, 19 Oct 2020 at 15:04, Marc Zyngier  wrote:
>> Since you are running perf, can you spot where the overhead occurs?
>
> Hmm... difficult to say, because tracing the benchmark lowers the
> result a lot. I have pasted the perf reports.
>
> I have only listed symbols with overhead above 1%
>
> so _raw_spin_unlock_irq, schedule and __schedule seem the most
> impacted, but I can't draw any conclusion.
>

AFAICT on TX2 you should be able to run these and get some more details
within IRQ-disabled regions:

https://lore.kernel.org/linux-arm-kernel/20200924110706.254996-1-alexandru.eli...@arm.com/

(they should be on linux-next)

> I can send you the perf.data files if you want.
>
>
>>
>> Thanks,
>>
>>  M.
>> --
>> Jazz is not dead. It just smells funny...


Re: [PATCH v3 03/16] arm64: Allow IPIs to be handled as normal interrupts

2020-10-19 Thread Vincent Guittot
Hi Marc,

On Tue, 1 Sep 2020 at 16:44, Marc Zyngier  wrote:
>
> In order to deal with IPIs as normal interrupts, let's add
> a new way to register them with the architecture code.
>
> set_smp_ipi_range() takes a range of interrupts, and allows
> the arch code to request them as if they were normal interrupts.
> A standard handler is then called by the core IRQ code to deal
> with the IPI.
>
> This means that we don't need to call irq_enter/irq_exit, and
> that we don't need to deal with set_irq_regs either. So let's
> move the dispatcher into its own function, and leave handle_IPI()
> as a compatibility function.
>
> On the sending side, let's make use of ipi_send_mask, which
> already exists for this purpose.
>
> One of the major differences is that, in some cases
> (such as when performing IRQ time accounting on the scheduler
> IPI), we end up with nested irq_enter()/irq_exit() pairs.
> Other than the (relatively small) overhead, there should be
> no consequences to it (these pairs are designed to nest
> correctly, and the accounting shouldn't be off).

While rebasing on mainline, I have faced a performance regression for
the benchmark:
perf bench sched pipe
on my arm64 dual quad core (hikey) and my 2 nodes x 112 CPUs (thx2)

The regression comes from:
commit: d3afc7f12987 ("arm64: Allow IPIs to be handled as normal interrupts")

  v5.9  + this patch
hikey :   48818(+/- 0.31%)   37503(+/- 0.15%)  -23.2%
thx2  :  132410(+/- 1.72%)  122646(+/- 1.92%)   -7.4%

By "+ this patch" I mean merging the branch up to this patch;
merging only up to the previous
commit: 83cfac95c018 ("genirq: Allow interrupts to be excluded from
/proc/interrupts") doesn't show any regression.

Vincent
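
For anyone unfamiliar with the workload: perf bench sched pipe is essentially two
tasks ping-ponging a token over a pair of pipes, so every iteration is a sleep/wakeup
pair, and a cross-CPU wakeup raises a rescheduling IPI, which is why the benchmark is
so sensitive to IPI-path overhead. A stripped-down sketch of the pattern (a simplified
model, not the actual perf source):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>

#define LOOPS 100000    /* round trips; perf uses a similar default */

int main(void)
{
        int ping[2], pong[2];
        char c = 0;

        if (pipe(ping) || pipe(pong)) {
                perror("pipe");
                return 1;
        }

        if (fork() == 0) {              /* child: echo every byte back */
                for (int i = 0; i < LOOPS; i++) {
                        if (read(ping[0], &c, 1) != 1)
                                exit(1);
                        if (write(pong[1], &c, 1) != 1)
                                exit(1);
                }
                exit(0);
        }

        for (int i = 0; i < LOOPS; i++) {       /* parent: one round trip per loop */
                if (write(ping[1], &c, 1) != 1)
                        return 1;
                if (read(pong[0], &c, 1) != 1)
                        return 1;
        }
        wait(NULL);
        return 0;
}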


Re: [PATCH v3 03/16] arm64: Allow IPIs to be handled as normal interrupts

2020-09-11 Thread Catalin Marinas
On Tue, Sep 01, 2020 at 03:43:11PM +0100, Marc Zyngier wrote:
> In order to deal with IPIs as normal interrupts, let's add
> a new way to register them with the architecture code.
> 
> set_smp_ipi_range() takes a range of interrupts, and allows
> the arch code to request them as if they were normal interrupts.
> A standard handler is then called by the core IRQ code to deal
> with the IPI.
> 
> This means that we don't need to call irq_enter/irq_exit, and
> that we don't need to deal with set_irq_regs either. So let's
> move the dispatcher into its own function, and leave handle_IPI()
> as a compatibility function.
> 
> On the sending side, let's make use of ipi_send_mask, which
> already exists for this purpose.
> 
> One of the major differences is that, in some cases
> (such as when performing IRQ time accounting on the scheduler
> IPI), we end up with nested irq_enter()/irq_exit() pairs.
> Other than the (relatively small) overhead, there should be
> no consequences to it (these pairs are designed to nest
> correctly, and the accounting shouldn't be off).
> 
> Reviewed-by: Valentin Schneider 
> Signed-off-by: Marc Zyngier 

In case you need an ack for the arm64 part:

Acked-by: Catalin Marinas 
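
To make the flow in the quoted commit message concrete: the irqchip hands the arch
code a base and a count via set_smp_ipi_range(), and the standard handler invoked by
the core IRQ code turns the Linux IRQ number back into an IPI index before dispatching.
A userspace toy of that dispatch (only set_smp_ipi_range() and do_handle_IPI() are
names from the patch; everything else here is illustrative):

#include <stdio.h>

#define NR_IPI 7        /* arm64 has seven IPI types */

static int ipi_irq_base;
static int nr_ipi;

static void do_handle_IPI(int ipinr)    /* stand-in for the arch dispatcher */
{
        printf("dispatching IPI %d of %d\n", ipinr, nr_ipi);
}

/* Toy version of the standard handler the core IRQ code would invoke. */
static void toy_ipi_handler(int irq)
{
        do_handle_IPI(irq - ipi_irq_base);      /* Linux IRQ -> IPI index */
}

static void set_smp_ipi_range(int ipi_base, int n)      /* same shape as the patch */
{
        ipi_irq_base = ipi_base;
        nr_ipi = n;
}

int main(void)
{
        set_smp_ipi_range(64, NR_IPI);  /* pretend the irqchip allocated IRQs 64..70 */
        toy_ipi_handler(64);            /* e.g. the rescheduling IPI */
        toy_ipi_handler(66);            /* e.g. a cross-call IPI */
        return 0;
}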


[PATCH v3 03/16] arm64: Allow IPIs to be handled as normal interrupts

2020-09-01 Thread Marc Zyngier
In order to deal with IPIs as normal interrupts, let's add
a new way to register them with the architecture code.

set_smp_ipi_range() takes a range of interrupts, and allows
the arch code to request them as if they were normal interrupts.
A standard handler is then called by the core IRQ code to deal
with the IPI.

This means that we don't need to call irq_enter/irq_exit, and
that we don't need to deal with set_irq_regs either. So let's
move the dispatcher into its own function, and leave handle_IPI()
as a compatibility function.

On the sending side, let's make use of ipi_send_mask, which
already exists for this purpose.

One of the major differences is that, in some cases
(such as when performing IRQ time accounting on the scheduler
IPI), we end up with nested irq_enter()/irq_exit() pairs.
Other than the (relatively small) overhead, there should be
no consequences to it (these pairs are designed to nest
correctly, and the accounting shouldn't be off).

Reviewed-by: Valentin Schneider 
Signed-off-by: Marc Zyngier 
---
 arch/arm64/Kconfig   |  1 +
 arch/arm64/include/asm/smp.h |  5 ++
 arch/arm64/kernel/smp.c  | 93 +++-
 3 files changed, 87 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 6d232837cbee..d0fdbe5fb32f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -106,6 +106,7 @@ config ARM64
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_IDLE_POLL_SETUP
+   select GENERIC_IRQ_IPI
select GENERIC_IRQ_MULTI_HANDLER
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index 0eadbf933e35..57c5db15f6b7 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -78,6 +78,11 @@ extern void set_smp_cross_call(void (*)(const struct cpumask *, unsigned int));
 
 extern void (*__smp_cross_call)(const struct cpumask *, unsigned int);
 
+/*
+ * Register IPI interrupts with the arch SMP code
+ */
+extern void set_smp_ipi_range(int ipi_base, int nr_ipi);
+
 /*
  * Called from the secondary holding pen, this is the secondary CPU entry point.
  */
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 355ee9eed4dd..00c9db1b61b5 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -75,6 +75,13 @@ enum ipi_msg_type {
IPI_WAKEUP
 };
 
+static int ipi_irq_base __read_mostly;
+static int nr_ipi __read_mostly = NR_IPI;
+static struct irq_desc *ipi_desc[NR_IPI] __read_mostly;
+
+static void ipi_setup(int cpu);
+static void ipi_teardown(int cpu);
+
 #ifdef CONFIG_HOTPLUG_CPU
 static int op_cpu_kill(unsigned int cpu);
 #else
@@ -237,6 +244,8 @@ asmlinkage notrace void secondary_start_kernel(void)
 */
notify_cpu_starting(cpu);
 
+   ipi_setup(cpu);
+
store_cpu_topology(cpu);
numa_add_cpu(cpu);
 
@@ -302,6 +311,7 @@ int __cpu_disable(void)
 * and we must not schedule until we're ready to give up the cpu.
 */
set_cpu_online(cpu, false);
+   ipi_teardown(cpu);
 
/*
 * OK - migrate IRQs away from this CPU
@@ -890,10 +900,9 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
 /*
  * Main handler for inter-processor interrupts
  */
-void handle_IPI(int ipinr, struct pt_regs *regs)
+static void do_handle_IPI(int ipinr)
 {
unsigned int cpu = smp_processor_id();
-   struct pt_regs *old_regs = set_irq_regs(regs);
 
if ((unsigned)ipinr < NR_IPI) {
trace_ipi_entry_rcuidle(ipi_types[ipinr]);
@@ -906,21 +915,16 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
break;
 
case IPI_CALL_FUNC:
-   irq_enter();
generic_smp_call_function_interrupt();
-   irq_exit();
break;
 
case IPI_CPU_STOP:
-   irq_enter();
local_cpu_stop();
-   irq_exit();
break;
 
case IPI_CPU_CRASH_STOP:
if (IS_ENABLED(CONFIG_KEXEC_CORE)) {
-   irq_enter();
-   ipi_cpu_crash_stop(cpu, regs);
+   ipi_cpu_crash_stop(cpu, get_irq_regs());
 
unreachable();
}
@@ -928,17 +932,13 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
case IPI_TIMER:
-   irq_enter();
tick_receive_broadcast();
-   irq_exit();
break;
 #endif
 
 #ifdef CONFIG_IRQ_WORK
case IPI_IRQ_WORK:
-   irq_enter();
irq_work_run();
-   irq_exit();
break;
 #endif
 
@@ -957,9 +957,78 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
 
if ((unsigned)ipinr < NR_IPI)
trace_ipi_exit_rcuidle(ipi_types[ipinr]);
+}
+
+/*