Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-21 Thread kernel test robot
Hi Alexey,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/irq/core]
[also build test ERROR on linus/master linux/master v5.10-rc4 next-20201120]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Alexey-Kardashevskiy/genirq-irqdomain-Add-reference-counting-to-IRQs/20201109-175020
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
d315c627a18249930750fe4eb2b21f3fe9b32ea4
config: m68k-randconfig-m031-20201122 (attached as .config)
compiler: m68k-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/3fe0622aa0aeca70507a5e71b599bed6be0be581
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Alexey-Kardashevskiy/genirq-irqdomain-Add-reference-counting-to-IRQs/20201109-175020
git checkout 3fe0622aa0aeca70507a5e71b599bed6be0be581
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=m68k 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   kernel/irq/irqdomain.c: In function 'irq_create_mapping':
>> kernel/irq/irqdomain.c:672:20: error: 'struct irq_desc' has no member named 
>> 'kobj'
 672 |   kobject_get(&desc->kobj);
 |^~
   kernel/irq/irqdomain.c: In function 'irq_create_fwspec_mapping':
   kernel/irq/irqdomain.c:807:21: error: 'struct irq_desc' has no member named 
'kobj'
 807 |kobject_get(&desc->kobj);
 | ^~
   kernel/irq/irqdomain.c:822:21: error: 'struct irq_desc' has no member named 
'kobj'
 822 |kobject_get(&desc->kobj);
 | ^~
   kernel/irq/irqdomain.c: In function 'irq_dispose_mapping':
   kernel/irq/irqdomain.c:880:19: error: 'struct irq_desc' has no member named 
'kobj'
 880 |  kobject_put(&desc->kobj);
 |   ^~
   kernel/irq/irqdomain.c: In function '__irq_domain_alloc_irqs':
   kernel/irq/irqdomain.c:1473:21: error: 'struct irq_desc' has no member named 
'kobj'
1473 |kobject_get(&desc->kobj);
 | ^~

vim +672 kernel/irq/irqdomain.c

   636  
   637  /**
   638   * irq_create_mapping() - Map a hardware interrupt into linux irq space
   639   * @domain: domain owning this hardware interrupt or NULL for default 
domain
   640   * @hwirq: hardware irq number in that domain space
   641   *
   642   * Only one mapping per hardware interrupt is permitted. Returns a linux
   643   * irq number.
   644   * If the sense/trigger is to be specified, set_irq_type() should be 
called
   645   * on the number returned from that call.
   646   */
   647  unsigned int irq_create_mapping(struct irq_domain *domain,
   648  irq_hw_number_t hwirq)
   649  {
   650  struct device_node *of_node;
   651  int virq;
   652  struct irq_desc *desc;
   653  
   654  pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
   655  
   656  /* Look for default domain if nececssary */
   657  if (domain == NULL)
   658  domain = irq_default_domain;
   659  if (domain == NULL) {
   660  WARN(1, "%s(, %lx) called with NULL domain\n", 
__func__, hwirq);
   661  return 0;
   662  }
   663  pr_debug("-> using domain @%p\n", domain);
   664  
   665  of_node = irq_domain_get_of_node(domain);
   666  
   667  /* Check if mapping already exists */
   668  virq = irq_find_mapping(domain, hwirq);
   669  if (virq) {
   670  desc = irq_to_desc(virq);
   671  pr_debug("-> existing mapping on virq %d\n", virq);
 > 672  kobject_get(&desc->kobj);
   673  return virq;
   674  }
   675  
   676  /* Allocate a virtual interrupt number */
   677  virq = irq_domain_alloc_descs(-1, 1, hwirq, 
of_node_to_nid(of_node), NULL);
   678  if (virq <= 0) {
   679  pr_debug("-> virq allocation failed\n");
   680  return 0;
   681  }
   682  
   683  if (irq_domain_associate(domain, virq, hwirq)) {
   684  irq_free_desc(virq);
   685  return 0;
   686  }
   687  
   688  pr_debug("irq %lu on domain %s mapped to virtual irq %u\n",
   689  hwirq, of_node_full_name(of_node), virq);
   690  
   691  return virq;
   692  }
   693  EXPORT_SYMBOL_GPL(ir

Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-14 Thread Thomas Gleixner
On Mon, Nov 09 2020 at 20:46, Alexey Kardashevskiy wrote:
> PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
> Device drivers map/unmap hardware interrupts via irq_create_mapping()/
> irq_dispose_mapping(). The problem with that these interrupts are
> shared and when performing hot unplug, we need to unmap the interrupt
> only when the last device is released.
>
> There was a comment about whether hierarchical IRQ domains should
> contribute to this reference counter and I need some help here as
> I cannot see why.
> It is reverse now - IRQs contribute to domain->mapcount and
> irq_domain_associate/irq_domain_disassociate take necessary steps to
> keep this counter in order.

I'm not yet convinced that this change is correct under all
circumstances. See below.

> What might be missing is that if we have cascade of IRQs (as in the
> IOAPIC example from Documentation/core-api/irq/irq-domain.rst ), then
> a parent IRQ should contribute to the children IRQs and it is up to

No. Hierarchical irq domains handle ONE interrupt and have nothing to do
with parent/child interrupts. They represent the various layers of
hardware involved in delivering one particular interrupt. Just looking
at this example:

  Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU

There are 3 interrupt chips involved: IOAPIC - REMAP - APIC and each of
them has to do configuration and/or resource allocation when setting up
an interrupt. Also during handling the various chip levels might be
involved.

So now if you remove interrupt remapping (config, commandline, lack of
HW) the delivery chain looks like this:

  Device --> IOAPIC -> Local APIC -> CPU

So we only have two chips involved.

Hierarchical domains handle that at boot time by associating the
relevant parent domains to the involved chips.

> Documentation/core-api/irq/irq-domain.rst also suggests there is a lot
> to see in debugfs about IRQs but on my thinkpad there nothing about
> hierarchy.

Enable GENERIC_IRQ_DEBUGFS and surf around in
/sys/kernel/debug/irq/domains.

cat /sys/kernel/debug/irq/domains/IO-APIC-240 
name:   IO-APIC-240
 size:   24
 mapped: 15
 flags:  0x0003
 parent: AMD-IR-3
name:   AMD-IR-3
 size:   0
 mapped: 28
 flags:  0x0003
 parent: VECTOR
name:   VECTOR
 size:   0
 mapped: 295
 flags:  0x0003

You will find something like this on your thinkpad as well. It might be
a two level hierarchy if there is no remapping unit.

name:   IO-APIC-240
 size:   24
 mapped: 15
 flags:  0x0003
 parent: VECTOR
name:   VECTOR
 size:   0
 mapped: 292
 flags:  0x0003

and if you look at an interrupt:

# cat /sys/kernel/debug/irq/irqs/4
handler:  handle_edge_irq
device:   (null)
status:   0x4000
istate:   0x
ddepth:   0
wdepth:   0
dstate:   0x35408200
IRQD_ACTIVATED
IRQD_IRQ_STARTED
IRQD_SINGLE_TARGET
IRQD_MOVE_PCNTXT
IRQD_AFFINITY_ON_ACTIVATE
IRQD_CAN_RESERVE
IRQD_HANDLE_ENFORCE_IRQCTX
node: 0
affinity: 0-63,128-191
effectiv: 130
pending:  
domain:  IO-APIC-240
 hwirq:   0x4
 chip:IR-IO-APIC
  flags:   0x10
 IRQCHIP_SKIP_SET_WAKE
 parent:
domain:  AMD-IR-3
 hwirq:   0xa0
 chip:AMD-IR
  flags:   0x0
 parent:
domain:  VECTOR
 hwirq:   0x4
 chip:APIC
  flags:   0x0
 Vector:33
 Target:   130
 move_in_progress: 0
 is_managed:   0
 can_reserve:  1
 has_reserved: 0
 cleanup_pending:  0

you can see the domain hierarchy as well and the relevant information
per domain. So on the IO-APIC it's hwirq 4, i.e. PIN 4. In the remap
domain it's 0xa0 which is the relevant table IIRC and the vector
domain has extra information about the target vector (33) and the target
CPU (130).

> So I'll ask again :)
>
> What is the easiest way to get irq-hierarchical hardware?
> I have a bunch of powerpc boxes (no good) but also a raspberry pi,
> a bunch of 32/64bit orange pi's, an "armada" arm box,
> thinkpads - is any of this good for the task?

You have it already. Everything you listed except PPC uses hierarchical
interrupt domains.

> +static void delayed_free_desc(struct rcu_head *rhp);
>  static void irq_kobj_release(struct kobject *kobj)
>  {
>   struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
> +#ifdef CONFIG_IRQ_DOMAIN
> + struct irq_domain *domain;
> + unsigned int virq = desc->irq_data.irq;
>  
> - free_masks(desc);
> - free_percpu(desc->kstat_irqs);
> - kfree(desc);
> + domain = desc->irq_data.domain;
> + if (domain) {
> + if (irq_domain_is_hierarchy(domain)) {
> + irq_domain_free_irqs(virq, 1);
> + } else {
> + irq_domain_disassociate(domain, virq);
> + irq_free_desc(virq);
> + 

Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-14 Thread Marc Zyngier

On 2020-11-14 03:37, Alexey Kardashevskiy wrote:


What is the easiest way to get irq-hierarchical hardware?
I have a bunch of powerpc boxes (no good) but also a raspberry pi,
a bunch of 32/64bit orange pi's, an "armada" arm box,
thinkpads - is any of this good for the task?


If your HW doesn't require an interrupt hierarchy, run VMs!
Booting an arm64 guest with virtual PCI devices will result in
hierarchies being created (PCI-MSI -> GIC MSI widget -> GIC).


Absolutely :) But the beauty of ARM is that one can buy an actual ARM
device for 20$, I have "opi one+ allwinner h6 64bit cortex a53 1GB
RAM", is it worth using KVM on this device, or is it too small for
that?


I've run VMs on smaller machines. 256MB of guest RAM is enough to boot
a full blown Debian system with PCI devices, and your AW box should be
up to the task as long as you run a mainline kernel on it. Please don't
add to the pile of junk!


You can use KVM, or even bare QEMU on x86 if you are so inclined.


Have a QEMU command line handy for x86/tcg?


/me digs, as my x86 boxes are overspec'd X terminals these days:

Here you go, courtesy of Will:
http://cdn.kernel.org/pub/linux/kernel/people/will/docs/qemu/qemu-arm64-howto.html

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-14 Thread Frederic Barrat

On 14/11/2020 04:37, Alexey Kardashevskiy wrote:


I'll try to go through this patch over the week-end (or more probably
early next week), and try to understand where our understandings
differ.


Great, thanks! Fred spotted a problem with irq_free_descs() not doing 
kobject_put() anymore and this is a problem for sa.c and the likes 
and I will go though these places anyway.


So there are callers out there which don't care about mapping the 
interrupt. Wouldn't it be easier to leave alone the kobject from the irq 
descriptor (my understanding is that it's there to handle the sysfs 
representation) and add a simple kref counter, just to handle the 
mapping part?


  Fred




Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-13 Thread Alexey Kardashevskiy




On 14/11/2020 05:19, Cédric Le Goater wrote:

On 11/9/20 10:46 AM, Alexey Kardashevskiy wrote:

PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
Device drivers map/unmap hardware interrupts via irq_create_mapping()/
irq_dispose_mapping(). The problem with that these interrupts are
shared and when performing hot unplug, we need to unmap the interrupt
only when the last device is released.


The background context for such a need is that the POWER9 and POWER10
processors have a new XIVE interrupt controller which uses MMIO pages
for interrupt management. Each interrupt has a pair of pages which are
required to be unmapped in some environment, like PHB removal. And so,
all interrupts need to be unmmaped.



This reuses already existing irq_desc::kobj for this purpose.
The refcounter is naturally 1 when the descriptor is allocated already;
this adds kobject_get() in places where already existing mapped virq
is returned.

This reorganizes irq_dispose_mapping() to release the kobj and let
the release callback do the cleanup.

As kobject_put() is called directly now (not via RCU), it can also handle
the early boot case (irq_kobj_base==NULL) with the help of
the kobject::state_in_sysfs flag and without additional irq_sysfs_del().


Could this change be done in a following patch ?


No. Before this patch, we remove from sysfs -  call kobject_del() - 
before calling kobject_put() which we do via RCU. After the patch, this 
kobject_del() is called from the very last kobject_put() and when we get 
to this release handler - the sysfs node is already removed and we get a 
message about the missing parent.




While at this, clean up the comment at where irq_sysfs_del() was called.>
Quick grep shows no sign of irq reference counting in drivers. Drivers
typically request mapping when probing and dispose it when removing;


Some ARM drivers call directly irq_alloc_descs() and irq_free_descs().
Is  that a problem ?


Kind of, I'll need to go through these places and replace 
irq_free_descs() with kobject_put() (may be some wrapper or may be 
change irq_free_descs() to do kobject_put()).




platforms tend to dispose only if setup failed and the rest seems
calling one dispose per one mapping. Except (at least) PPC/pseries
which needs https://lkml.org/lkml/2020/10/27/259

Cc: Cédric Le Goater 
Cc: Marc Zyngier 
Cc: Michael Ellerman 
Cc: Qian Cai 
Cc: Rob Herring 
Cc: Frederic Barrat 
Cc: Michal Suchánek 
Cc: Thomas Gleixner 
Signed-off-by: Alexey Kardashevskiy 


I used this patch and the ppc one doing the LSI removal:

   
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20201027090655.14118-3-...@ozlabs.ru/

on different P10 and P9 systems, on a large system (>1K HW theads),
KVM guests and pSeries machines. Checked that PHB removal was OK.
  
Tested-by: Cédric Le Goater 


But IRQ subsystem covers much more than these systems.


Indeed. But doing our own powerpc-only reference counting on top of 
irs_desc is just ugly.





Some comments below,


---

This is what it is fixing for powerpc:

There was a comment about whether hierarchical IRQ domains should
contribute to this reference counter and I need some help here as
I cannot see why.
It is reverse now - IRQs contribute to domain->mapcount and
irq_domain_associate/irq_domain_disassociate take necessary steps to
keep this counter in order. What might be missing is that if we have
cascade of IRQs (as in the IOAPIC example from
Documentation/core-api/irq/irq-domain.rst ), then a parent IRQ should
contribute to the children IRQs and it is up to
irq_domain_ops::alloc/free hooks, and they all seem to be eventually
calling irq_domain_alloc_irqs_xxx/irq_domain_free_irqs_xxx which seems
right.

Documentation/core-api/irq/irq-domain.rst also suggests there is a lot
to see in debugfs about IRQs but on my thinkpad there nothing about
hierarchy.

So I'll ask again :)

What is the easiest way to get irq-hierarchical hardware?
I have a bunch of powerpc boxes (no good) but also a raspberry pi,
a bunch of 32/64bit orange pi's, an "armada" arm box,
thinkpads - is any of this good for the task?



---
Changes:
v3:
* removed very wrong kobject_get/_put from irq_domain_associate/
irq_domain_disassociate as these are called from kobject_release so
irq_descs were never actually released
* removed irq_sysfs_del as 1) we do not seem to need it with changed
counting  2) produces a "no parent" warning as it would be called from
kobject_release which removes sysfs nodes itself

v2:
* added more get/put, including irq_domain_associate/irq_domain_disassociate
---
  kernel/irq/irqdesc.c   | 55 ++
  kernel/irq/irqdomain.c | 37 
  2 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1a7723604399..79c904ebfd5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -295,18 +295,6 @@ static void irq_sysfs_add(int irq, struct irq_desc *

Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-13 Thread Alexey Kardashevskiy




On 14/11/2020 05:34, Marc Zyngier wrote:

Hi Alexey,

On 2020-11-09 09:46, Alexey Kardashevskiy wrote:

PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
Device drivers map/unmap hardware interrupts via irq_create_mapping()/
irq_dispose_mapping(). The problem with that these interrupts are
shared and when performing hot unplug, we need to unmap the interrupt
only when the last device is released.

This reuses already existing irq_desc::kobj for this purpose.
The refcounter is naturally 1 when the descriptor is allocated already;
this adds kobject_get() in places where already existing mapped virq
is returned.

This reorganizes irq_dispose_mapping() to release the kobj and let
the release callback do the cleanup.

As kobject_put() is called directly now (not via RCU), it can also handle
the early boot case (irq_kobj_base==NULL) with the help of
the kobject::state_in_sysfs flag and without additional irq_sysfs_del().
While at this, clean up the comment at where irq_sysfs_del() was called.

Quick grep shows no sign of irq reference counting in drivers. Drivers
typically request mapping when probing and dispose it when removing;
platforms tend to dispose only if setup failed and the rest seems
calling one dispose per one mapping. Except (at least) PPC/pseries
which needs https://lkml.org/lkml/2020/10/27/259

Cc: Cédric Le Goater 
Cc: Marc Zyngier 
Cc: Michael Ellerman 
Cc: Qian Cai 
Cc: Rob Herring 
Cc: Frederic Barrat 
Cc: Michal Suchánek 
Cc: Thomas Gleixner 
Signed-off-by: Alexey Kardashevskiy 
---

This is what it is fixing for powerpc:

There was a comment about whether hierarchical IRQ domains should
contribute to this reference counter and I need some help here as
I cannot see why.
It is reverse now - IRQs contribute to domain->mapcount and
irq_domain_associate/irq_domain_disassociate take necessary steps to
keep this counter in order. What might be missing is that if we have
cascade of IRQs (as in the IOAPIC example from
Documentation/core-api/irq/irq-domain.rst ), then a parent IRQ should
contribute to the children IRQs and it is up to
irq_domain_ops::alloc/free hooks, and they all seem to be eventually
calling irq_domain_alloc_irqs_xxx/irq_domain_free_irqs_xxx which seems
right.

Documentation/core-api/irq/irq-domain.rst also suggests there is a lot
to see in debugfs about IRQs but on my thinkpad there nothing about
hierarchy.

So I'll ask again :)

What is the easiest way to get irq-hierarchical hardware?
I have a bunch of powerpc boxes (no good) but also a raspberry pi,
a bunch of 32/64bit orange pi's, an "armada" arm box,
thinkpads - is any of this good for the task?


If your HW doesn't require an interrupt hierarchy, run VMs!
Booting an arm64 guest with virtual PCI devices will result in
hierarchies being created (PCI-MSI -> GIC MSI widget -> GIC).


Absolutely :) But the beauty of ARM is that one can buy an actual ARM 
device for 20$, I have "opi one+ allwinner h6 64bit cortex a53 1GB RAM", 
is it worth using KVM on this device, or is it too small for that?



You can use KVM, or even bare QEMU on x86 if you are so inclined.


Have a QEMU command line handy for x86/tcg?


I'll try to go through this patch over the week-end (or more probably
early next week), and try to understand where our understandings
differ.


Great, thanks! Fred spotted a problem with irq_free_descs() not doing 
kobject_put() anymore and this is a problem for sa.c and the likes 
and I will go though these places anyway.



--
Alexey


Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-13 Thread Marc Zyngier

Hi Alexey,

On 2020-11-09 09:46, Alexey Kardashevskiy wrote:
PCI devices share 4 legacy INTx interrupts from the same PCI host 
bridge.

Device drivers map/unmap hardware interrupts via irq_create_mapping()/
irq_dispose_mapping(). The problem with that these interrupts are
shared and when performing hot unplug, we need to unmap the interrupt
only when the last device is released.

This reuses already existing irq_desc::kobj for this purpose.
The refcounter is naturally 1 when the descriptor is allocated already;
this adds kobject_get() in places where already existing mapped virq
is returned.

This reorganizes irq_dispose_mapping() to release the kobj and let
the release callback do the cleanup.

As kobject_put() is called directly now (not via RCU), it can also 
handle

the early boot case (irq_kobj_base==NULL) with the help of
the kobject::state_in_sysfs flag and without additional 
irq_sysfs_del().
While at this, clean up the comment at where irq_sysfs_del() was 
called.


Quick grep shows no sign of irq reference counting in drivers. Drivers
typically request mapping when probing and dispose it when removing;
platforms tend to dispose only if setup failed and the rest seems
calling one dispose per one mapping. Except (at least) PPC/pseries
which needs https://lkml.org/lkml/2020/10/27/259

Cc: Cédric Le Goater 
Cc: Marc Zyngier 
Cc: Michael Ellerman 
Cc: Qian Cai 
Cc: Rob Herring 
Cc: Frederic Barrat 
Cc: Michal Suchánek 
Cc: Thomas Gleixner 
Signed-off-by: Alexey Kardashevskiy 
---

This is what it is fixing for powerpc:

There was a comment about whether hierarchical IRQ domains should
contribute to this reference counter and I need some help here as
I cannot see why.
It is reverse now - IRQs contribute to domain->mapcount and
irq_domain_associate/irq_domain_disassociate take necessary steps to
keep this counter in order. What might be missing is that if we have
cascade of IRQs (as in the IOAPIC example from
Documentation/core-api/irq/irq-domain.rst ), then a parent IRQ should
contribute to the children IRQs and it is up to
irq_domain_ops::alloc/free hooks, and they all seem to be eventually
calling irq_domain_alloc_irqs_xxx/irq_domain_free_irqs_xxx which seems
right.

Documentation/core-api/irq/irq-domain.rst also suggests there is a lot
to see in debugfs about IRQs but on my thinkpad there nothing about
hierarchy.

So I'll ask again :)

What is the easiest way to get irq-hierarchical hardware?
I have a bunch of powerpc boxes (no good) but also a raspberry pi,
a bunch of 32/64bit orange pi's, an "armada" arm box,
thinkpads - is any of this good for the task?


If your HW doesn't require an interrupt hierarchy, run VMs!
Booting an arm64 guest with virtual PCI devices will result in
hierarchies being created (PCI-MSI -> GIC MSI widget -> GIC).
You can use KVM, or even bare QEMU on x86 if you are so inclined.

I'll try to go through this patch over the week-end (or more probably
early next week), and try to understand where our understandings
differ.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


Re: [PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-13 Thread Cédric Le Goater
On 11/9/20 10:46 AM, Alexey Kardashevskiy wrote:
> PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
> Device drivers map/unmap hardware interrupts via irq_create_mapping()/
> irq_dispose_mapping(). The problem with that these interrupts are
> shared and when performing hot unplug, we need to unmap the interrupt
> only when the last device is released.

The background context for such a need is that the POWER9 and POWER10 
processors have a new XIVE interrupt controller which uses MMIO pages 
for interrupt management. Each interrupt has a pair of pages which are
required to be unmapped in some environment, like PHB removal. And so,
all interrupts need to be unmmaped. 

> 
> This reuses already existing irq_desc::kobj for this purpose.
> The refcounter is naturally 1 when the descriptor is allocated already;
> this adds kobject_get() in places where already existing mapped virq
> is returned.
> 
> This reorganizes irq_dispose_mapping() to release the kobj and let
> the release callback do the cleanup.
> 
> As kobject_put() is called directly now (not via RCU), it can also handle
> the early boot case (irq_kobj_base==NULL) with the help of
> the kobject::state_in_sysfs flag and without additional irq_sysfs_del().

Could this change be done in a following patch ? 

> While at this, clean up the comment at where irq_sysfs_del() was called.>
> Quick grep shows no sign of irq reference counting in drivers. Drivers
> typically request mapping when probing and dispose it when removing;

Some ARM drivers call directly irq_alloc_descs() and irq_free_descs(). 
Is  that a problem ? 

> platforms tend to dispose only if setup failed and the rest seems
> calling one dispose per one mapping. Except (at least) PPC/pseries
> which needs https://lkml.org/lkml/2020/10/27/259
> 
> Cc: Cédric Le Goater 
> Cc: Marc Zyngier 
> Cc: Michael Ellerman 
> Cc: Qian Cai 
> Cc: Rob Herring 
> Cc: Frederic Barrat 
> Cc: Michal Suchánek 
> Cc: Thomas Gleixner 
> Signed-off-by: Alexey Kardashevskiy 

I used this patch and the ppc one doing the LSI removal:  

  
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20201027090655.14118-3-...@ozlabs.ru/

on different P10 and P9 systems, on a large system (>1K HW theads), 
KVM guests and pSeries machines. Checked that PHB removal was OK. 
 
Tested-by: Cédric Le Goater 

But IRQ subsystem covers much more than these systems.


Some comments below, 

> ---
> 
> This is what it is fixing for powerpc:
> 
> There was a comment about whether hierarchical IRQ domains should
> contribute to this reference counter and I need some help here as
> I cannot see why.
> It is reverse now - IRQs contribute to domain->mapcount and
> irq_domain_associate/irq_domain_disassociate take necessary steps to
> keep this counter in order. What might be missing is that if we have
> cascade of IRQs (as in the IOAPIC example from
> Documentation/core-api/irq/irq-domain.rst ), then a parent IRQ should
> contribute to the children IRQs and it is up to
> irq_domain_ops::alloc/free hooks, and they all seem to be eventually
> calling irq_domain_alloc_irqs_xxx/irq_domain_free_irqs_xxx which seems
> right.
> 
> Documentation/core-api/irq/irq-domain.rst also suggests there is a lot
> to see in debugfs about IRQs but on my thinkpad there nothing about
> hierarchy.
> 
> So I'll ask again :)
> 
> What is the easiest way to get irq-hierarchical hardware?
> I have a bunch of powerpc boxes (no good) but also a raspberry pi,
> a bunch of 32/64bit orange pi's, an "armada" arm box,
> thinkpads - is any of this good for the task?
> 
> 
> 
> ---
> Changes:
> v3:
> * removed very wrong kobject_get/_put from irq_domain_associate/
> irq_domain_disassociate as these are called from kobject_release so
> irq_descs were never actually released
> * removed irq_sysfs_del as 1) we do not seem to need it with changed
> counting  2) produces a "no parent" warning as it would be called from
> kobject_release which removes sysfs nodes itself
> 
> v2:
> * added more get/put, including irq_domain_associate/irq_domain_disassociate
> ---
>  kernel/irq/irqdesc.c   | 55 ++
>  kernel/irq/irqdomain.c | 37 
>  2 files changed, 46 insertions(+), 46 deletions(-)
> 
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 1a7723604399..79c904ebfd5c 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -295,18 +295,6 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
>   }
>  }
>  
> -static void irq_sysfs_del(struct irq_desc *desc)
> -{
> - /*
> -  * If irq_sysfs_init() has not yet been invoked (early boot), then
> -  * irq_kobj_base is NULL and the descriptor was never added.
> -  * kobject_del() complains about a object with no parent, so make
> -  * it conditional.
> -  */
> - if (irq_kobj_base)
> - kobject_del(&desc->kobj);
> -}
> -
>  static int __init irq_sysfs_init(void)
>  {
>   str

[PATCH kernel v3] genirq/irqdomain: Add reference counting to IRQs

2020-11-09 Thread Alexey Kardashevskiy
PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
Device drivers map/unmap hardware interrupts via irq_create_mapping()/
irq_dispose_mapping(). The problem with that these interrupts are
shared and when performing hot unplug, we need to unmap the interrupt
only when the last device is released.

This reuses already existing irq_desc::kobj for this purpose.
The refcounter is naturally 1 when the descriptor is allocated already;
this adds kobject_get() in places where already existing mapped virq
is returned.

This reorganizes irq_dispose_mapping() to release the kobj and let
the release callback do the cleanup.

As kobject_put() is called directly now (not via RCU), it can also handle
the early boot case (irq_kobj_base==NULL) with the help of
the kobject::state_in_sysfs flag and without additional irq_sysfs_del().
While at this, clean up the comment at where irq_sysfs_del() was called.

Quick grep shows no sign of irq reference counting in drivers. Drivers
typically request mapping when probing and dispose it when removing;
platforms tend to dispose only if setup failed and the rest seems
calling one dispose per one mapping. Except (at least) PPC/pseries
which needs https://lkml.org/lkml/2020/10/27/259

Cc: Cédric Le Goater 
Cc: Marc Zyngier 
Cc: Michael Ellerman 
Cc: Qian Cai 
Cc: Rob Herring 
Cc: Frederic Barrat 
Cc: Michal Suchánek 
Cc: Thomas Gleixner 
Signed-off-by: Alexey Kardashevskiy 
---

This is what it is fixing for powerpc:

There was a comment about whether hierarchical IRQ domains should
contribute to this reference counter and I need some help here as
I cannot see why.
It is reverse now - IRQs contribute to domain->mapcount and
irq_domain_associate/irq_domain_disassociate take necessary steps to
keep this counter in order. What might be missing is that if we have
cascade of IRQs (as in the IOAPIC example from
Documentation/core-api/irq/irq-domain.rst ), then a parent IRQ should
contribute to the children IRQs and it is up to
irq_domain_ops::alloc/free hooks, and they all seem to be eventually
calling irq_domain_alloc_irqs_xxx/irq_domain_free_irqs_xxx which seems
right.

Documentation/core-api/irq/irq-domain.rst also suggests there is a lot
to see in debugfs about IRQs but on my thinkpad there nothing about
hierarchy.

So I'll ask again :)

What is the easiest way to get irq-hierarchical hardware?
I have a bunch of powerpc boxes (no good) but also a raspberry pi,
a bunch of 32/64bit orange pi's, an "armada" arm box,
thinkpads - is any of this good for the task?



---
Changes:
v3:
* removed very wrong kobject_get/_put from irq_domain_associate/
irq_domain_disassociate as these are called from kobject_release so
irq_descs were never actually released
* removed irq_sysfs_del as 1) we do not seem to need it with changed
counting  2) produces a "no parent" warning as it would be called from
kobject_release which removes sysfs nodes itself

v2:
* added more get/put, including irq_domain_associate/irq_domain_disassociate
---
 kernel/irq/irqdesc.c   | 55 ++
 kernel/irq/irqdomain.c | 37 
 2 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1a7723604399..79c904ebfd5c 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -295,18 +295,6 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
}
 }
 
-static void irq_sysfs_del(struct irq_desc *desc)
-{
-   /*
-* If irq_sysfs_init() has not yet been invoked (early boot), then
-* irq_kobj_base is NULL and the descriptor was never added.
-* kobject_del() complains about a object with no parent, so make
-* it conditional.
-*/
-   if (irq_kobj_base)
-   kobject_del(&desc->kobj);
-}
-
 static int __init irq_sysfs_init(void)
 {
struct irq_desc *desc;
@@ -337,7 +325,6 @@ static struct kobj_type irq_kobj_type = {
 };
 
 static void irq_sysfs_add(int irq, struct irq_desc *desc) {}
-static void irq_sysfs_del(struct irq_desc *desc) {}
 
 #endif /* CONFIG_SYSFS */
 
@@ -419,20 +406,40 @@ static struct irq_desc *alloc_desc(int irq, int node, 
unsigned int flags,
return NULL;
 }
 
+static void delayed_free_desc(struct rcu_head *rhp);
 static void irq_kobj_release(struct kobject *kobj)
 {
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
+#ifdef CONFIG_IRQ_DOMAIN
+   struct irq_domain *domain;
+   unsigned int virq = desc->irq_data.irq;
 
-   free_masks(desc);
-   free_percpu(desc->kstat_irqs);
-   kfree(desc);
+   domain = desc->irq_data.domain;
+   if (domain) {
+   if (irq_domain_is_hierarchy(domain)) {
+   irq_domain_free_irqs(virq, 1);
+   } else {
+   irq_domain_disassociate(domain, virq);
+   irq_free_desc(virq);
+   }
+   }
+#endif
+   /*
+* We free the