Re: [PATCH 3/3] x86/pci: Fix racy accesses to MSI-X Control register

2022-12-13 Thread David Vrabel




On 13/12/2022 11:50, Jan Beulich wrote:

On 13.12.2022 12:34, David Vrabel wrote:

On 12/12/2022 17:04, Jan Beulich wrote:

On 10.11.2022 17:59, David Vrabel wrote:


--- a/xen/arch/x86/include/asm/msi.h
+++ b/xen/arch/x86/include/asm/msi.h
@@ -237,7 +237,10 @@ struct arch_msix {
   int table_refcnt[MAX_MSIX_TABLE_PAGES];
   int table_idx[MAX_MSIX_TABLE_PAGES];
   spinlock_t table_lock;
+spinlock_t control_lock;
   bool host_maskall, guest_maskall;
+uint16_t host_enable;


If you want to keep this more narrow than "unsigned int", then please
add a BUILD_BUG_ON() against NR_CPUS, so the need to update the field
can be easily noticed (in some perhaps distant future).


This is only incremented:

- while holding the pci_devs lock, or
- while holding a lock for one of the associated IRQs.


How do the locks held matter here, especially given that - as iirc you say
in the description - neither lock is held uniformly?


Because the usage here is:

   lock()
   host_enable++
   ...
   host_enable--
   unlock()


Since there are at most 4096 MSI-X vectors (and thus at most 4096 IRQs),
the highest value this can be (even with >> 4096 PCPUs) is 4097, thus a
uint16_t is fine.


Where's the 4096 coming from as a limit for MSI-X vectors? DYM 2048, which
is the per-device limit (because the qsize field is 11 bits wide)? If so,
yes, I think that's indeed restricting how large the number can get.


Yes, I did mean 2048 here.


I could only think of less neat ones like host_enable_override or
forced_enable or some such. If I had any good name in mind, I would > certainly 
have said so.


host_enable_override works for me.

David



Re: [PATCH 3/3] x86/pci: Fix racy accesses to MSI-X Control register

2022-12-13 Thread Jan Beulich
On 13.12.2022 12:34, David Vrabel wrote:
> On 12/12/2022 17:04, Jan Beulich wrote:
>> On 10.11.2022 17:59, David Vrabel wrote:
>>>
>>> --- a/xen/arch/x86/include/asm/msi.h
>>> +++ b/xen/arch/x86/include/asm/msi.h
>>> @@ -237,7 +237,10 @@ struct arch_msix {
>>>   int table_refcnt[MAX_MSIX_TABLE_PAGES];
>>>   int table_idx[MAX_MSIX_TABLE_PAGES];
>>>   spinlock_t table_lock;
>>> +spinlock_t control_lock;
>>>   bool host_maskall, guest_maskall;
>>> +uint16_t host_enable;
>>
>> If you want to keep this more narrow than "unsigned int", then please
>> add a BUILD_BUG_ON() against NR_CPUS, so the need to update the field
>> can be easily noticed (in some perhaps distant future).
> 
> This is only incremented:
> 
> - while holding the pci_devs lock, or
> - while holding a lock for one of the associated IRQs.

How do the locks held matter here, especially given that - as iirc you say
in the description - neither lock is held uniformly?

> Since there are at most 4096 MSI-X vectors (and thus at most 4096 IRQs), 
> the highest value this can be (even with >> 4096 PCPUs) is 4097, thus a 
> uint16_t is fine.

Where's the 4096 coming from as a limit for MSI-X vectors? DYM 2048, which
is the per-device limit (because the qsize field is 11 bits wide)? If so,
yes, I think that's indeed restricting how large the number can get.

>>> +static void msix_update_unlock(struct pci_dev *dev, unsigned int pos, 
>>> uint16_t control)
>>> +{
>>> +uint16_t new_control;
>>> +unsigned long flags;
>>> +
>>> +spin_lock_irqsave(>msix->control_lock, flags);
>>> +
>>> +dev->msix->host_enable--;
>>> +
>>> +new_control = control & ~(PCI_MSIX_FLAGS_ENABLE | 
>>> PCI_MSIX_FLAGS_MASKALL);
>>> +
>>> +if ( dev->msix->host_enable || dev->msix->guest_enable )
>>> +new_control |= PCI_MSIX_FLAGS_ENABLE;
>>> +if ( dev->msix->host_maskall || dev->msix->guest_maskall || 
>>> dev->msix->host_enable )
>>> +new_control |= PCI_MSIX_FLAGS_MASKALL;
>>
>> In particular this use of "host_enable" suggests the field wants to be
>> named differently: It makes no sense to derive MASKALL from ENABLE
>> without it being clear (from the name) that the field is an init-time
>> override only.
> 
> I think the name as-is makes sense. It is analogous to the host_maskall 
> and complements guest_enable. I can't think of a better name, so what 
> name do you suggest.

I could only think of less neat ones like host_enable_override or
forced_enable or some such. If I had any good name in mind, I would
certainly have said so.

Jan



Re: [PATCH 3/3] x86/pci: Fix racy accesses to MSI-X Control register

2022-12-13 Thread David Vrabel




On 12/12/2022 17:04, Jan Beulich wrote:

On 10.11.2022 17:59, David Vrabel wrote:


--- a/xen/arch/x86/include/asm/msi.h
+++ b/xen/arch/x86/include/asm/msi.h
@@ -237,7 +237,10 @@ struct arch_msix {
  int table_refcnt[MAX_MSIX_TABLE_PAGES];
  int table_idx[MAX_MSIX_TABLE_PAGES];
  spinlock_t table_lock;
+spinlock_t control_lock;
  bool host_maskall, guest_maskall;
+uint16_t host_enable;


If you want to keep this more narrow than "unsigned int", then please
add a BUILD_BUG_ON() against NR_CPUS, so the need to update the field
can be easily noticed (in some perhaps distant future).


This is only incremented:

- while holding the pci_devs lock, or
- while holding a lock for one of the associated IRQs.

Since there are at most 4096 MSI-X vectors (and thus at most 4096 IRQs), 
the highest value this can be (even with >> 4096 PCPUs) is 4097, thus a 
uint16_t is fine.



+static void msix_update_unlock(struct pci_dev *dev, unsigned int pos, uint16_t 
control)
+{
+uint16_t new_control;
+unsigned long flags;
+
+spin_lock_irqsave(>msix->control_lock, flags);
+
+dev->msix->host_enable--;
+
+new_control = control & ~(PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
+
+if ( dev->msix->host_enable || dev->msix->guest_enable )
+new_control |= PCI_MSIX_FLAGS_ENABLE;
+if ( dev->msix->host_maskall || dev->msix->guest_maskall || 
dev->msix->host_enable )
+new_control |= PCI_MSIX_FLAGS_MASKALL;


In particular this use of "host_enable" suggests the field wants to be
named differently: It makes no sense to derive MASKALL from ENABLE
without it being clear (from the name) that the field is an init-time
override only.


I think the name as-is makes sense. It is analogous to the host_maskall 
and complements guest_enable. I can't think of a better name, so what 
name do you suggest.


David



Re: [PATCH 3/3] x86/pci: Fix racy accesses to MSI-X Control register

2022-12-12 Thread Jan Beulich
On 10.11.2022 17:59, David Vrabel wrote:
> Concurrent access the the MSI-X control register are not serialized
> with a suitable lock. For example, in msix_capability_init() access
> use the pcidevs_lock() but some calls to msi_set_mask_bit() use the
> interrupt descriptor lock.
> 
> This can lead to MSI-X being incorrectly disabled and subsequent
> failures due to msix_memory_decoded() calls that check for MSI-X being
> enabled.
> 
> This was seen with some non-compliant hardware that gated MSI-X
> messages on the per-vector mask bit only (i.e., the MSI-X Enable bit
> and Function Mask bits in the MSI-X Control register were ignored). An
> interrupt (with a pending move) for vector 0 would occur while vector
> 1 was being initialized in msix_capability_init(). Updates the the
> Control register would race and the vector 1 initialization would
> intermittently fail with -ENXIO.
> 
> Typically a race between initializing a vector and another vector
> being moved doesn't occur because:
> 
> 1. Racy Control accesses only occur when MSI-X is (guest) disabled
> 
> 2  Hardware should only raise interrupts when MSI-X is enabled and unmasked.
> 
> 3. Xen always sets Function Mask when temporarily enabling MSI-X.
> 
> But there may be other race conditions depending on hardware and guest
> driver behaviour (e.g., disabling MSI-X when a IRQ has a pending move
> on another PCPU).
> 
> Fix this by:
> 
> 1. Tracking the host and guest enable state in a similar way to the
>host and guest maskall state. Note that since multiple CPUs can be
>updating different vectors concurrently, a counter is needed for
>the host enable state.
> 
> 2. Add a new lock for serialize the Control read/modify/write
>sequence.
> 
> 3. Wrap the above in two helper functions (msix_update_lock(), and
>msix_update_unlock()), which bracket any MSI-X register updates
>that require MSI-X to be (temporarily) enabled.
> 
> Signed-off-by: David Vrabel 

Just a couple of mechanical / style comments, at least for now:

> SIM: https://t.corp.amazon.com/P63914633
> 
> CR: https://code.amazon.com/reviews/CR-79020945

These tags shouldn't be here, unless they have a meaning in the
"upstream" context.

> --- a/xen/arch/x86/include/asm/msi.h
> +++ b/xen/arch/x86/include/asm/msi.h
> @@ -237,7 +237,10 @@ struct arch_msix {
>  int table_refcnt[MAX_MSIX_TABLE_PAGES];
>  int table_idx[MAX_MSIX_TABLE_PAGES];
>  spinlock_t table_lock;
> +spinlock_t control_lock;
>  bool host_maskall, guest_maskall;
> +uint16_t host_enable;

If you want to keep this more narrow than "unsigned int", then please
add a BUILD_BUG_ON() against NR_CPUS, so the need to update the field
can be easily noticed (in some perhaps distant future).

> +static void msix_update_unlock(struct pci_dev *dev, unsigned int pos, 
> uint16_t control)
> +{
> +uint16_t new_control;
> +unsigned long flags;
> +
> +spin_lock_irqsave(>msix->control_lock, flags);
> +
> +dev->msix->host_enable--;
> +
> +new_control = control & ~(PCI_MSIX_FLAGS_ENABLE | 
> PCI_MSIX_FLAGS_MASKALL);
> +
> +if ( dev->msix->host_enable || dev->msix->guest_enable )
> +new_control |= PCI_MSIX_FLAGS_ENABLE;
> +if ( dev->msix->host_maskall || dev->msix->guest_maskall || 
> dev->msix->host_enable )
> +new_control |= PCI_MSIX_FLAGS_MASKALL;

In particular this use of "host_enable" suggests the field wants to be
named differently: It makes no sense to derive MASKALL from ENABLE
without it being clear (from the name) that the field is an init-time
override only.

> @@ -299,11 +350,16 @@ static void msix_set_enable(struct pci_dev *dev, int 
> enable)
>  pos = pci_find_cap_offset(seg, bus, slot, func, PCI_CAP_ID_MSIX);
>  if ( pos )
>  {
> +spin_lock_irq(>msix->control_lock);
> +
> +dev->msix->host_enable = false;

You explicitly say this isn't a boolean, so the initializer would better
be 0.

Jan



[PATCH 3/3] x86/pci: Fix racy accesses to MSI-X Control register

2022-11-10 Thread David Vrabel
Concurrent access the the MSI-X control register are not serialized
with a suitable lock. For example, in msix_capability_init() access
use the pcidevs_lock() but some calls to msi_set_mask_bit() use the
interrupt descriptor lock.

This can lead to MSI-X being incorrectly disabled and subsequent
failures due to msix_memory_decoded() calls that check for MSI-X being
enabled.

This was seen with some non-compliant hardware that gated MSI-X
messages on the per-vector mask bit only (i.e., the MSI-X Enable bit
and Function Mask bits in the MSI-X Control register were ignored). An
interrupt (with a pending move) for vector 0 would occur while vector
1 was being initialized in msix_capability_init(). Updates the the
Control register would race and the vector 1 initialization would
intermittently fail with -ENXIO.

Typically a race between initializing a vector and another vector
being moved doesn't occur because:

1. Racy Control accesses only occur when MSI-X is (guest) disabled

2  Hardware should only raise interrupts when MSI-X is enabled and unmasked.

3. Xen always sets Function Mask when temporarily enabling MSI-X.

But there may be other race conditions depending on hardware and guest
driver behaviour (e.g., disabling MSI-X when a IRQ has a pending move
on another PCPU).

Fix this by:

1. Tracking the host and guest enable state in a similar way to the
   host and guest maskall state. Note that since multiple CPUs can be
   updating different vectors concurrently, a counter is needed for
   the host enable state.

2. Add a new lock for serialize the Control read/modify/write
   sequence.

3. Wrap the above in two helper functions (msix_update_lock(), and
   msix_update_unlock()), which bracket any MSI-X register updates
   that require MSI-X to be (temporarily) enabled.

Signed-off-by: David Vrabel 
SIM: https://t.corp.amazon.com/P63914633

CR: https://code.amazon.com/reviews/CR-79020945
---
 xen/arch/x86/include/asm/msi.h |   3 +
 xen/arch/x86/msi.c | 215 +
 xen/drivers/passthrough/msi.c  |   1 +
 3 files changed, 114 insertions(+), 105 deletions(-)

diff --git a/xen/arch/x86/include/asm/msi.h b/xen/arch/x86/include/asm/msi.h
index fe670895ee..aa36e44f4e 100644
--- a/xen/arch/x86/include/asm/msi.h
+++ b/xen/arch/x86/include/asm/msi.h
@@ -237,7 +237,10 @@ struct arch_msix {
 int table_refcnt[MAX_MSIX_TABLE_PAGES];
 int table_idx[MAX_MSIX_TABLE_PAGES];
 spinlock_t table_lock;
+spinlock_t control_lock;
 bool host_maskall, guest_maskall;
+uint16_t host_enable;
+bool guest_enable;
 domid_t warned;
 };
 
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 6c675d11d1..8e394da07a 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -147,6 +147,57 @@ static bool msix_memory_decoded(const struct pci_dev *dev, 
unsigned int pos)
 return memory_decoded(dev);
 }
 
+
+/*
+ * Ensure MSI-X interrupts are masked during setup. Some devices require
+ * MSI-X to be enabled before we can touch the MSI-X registers. We need
+ * to mask all the vectors to prevent interrupts coming in before they're
+ * fully set up.
+ */
+static uint16_t msix_update_lock(struct pci_dev *dev, unsigned int pos)
+{
+uint16_t control, new_control;
+unsigned long flags;
+
+spin_lock_irqsave(>msix->control_lock, flags);
+
+dev->msix->host_enable++;
+
+control = pci_conf_read16(dev->sbdf, msix_control_reg(pos));
+if ( !(control & PCI_MSIX_FLAGS_ENABLE) )
+{
+new_control = control | PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL;
+pci_conf_write16(dev->sbdf, msix_control_reg(pos), new_control);
+}
+else
+dev->msix->guest_enable = true;
+
+spin_unlock_irqrestore(>msix->control_lock, flags);
+
+return control;
+}
+
+static void msix_update_unlock(struct pci_dev *dev, unsigned int pos, uint16_t 
control)
+{
+uint16_t new_control;
+unsigned long flags;
+
+spin_lock_irqsave(>msix->control_lock, flags);
+
+dev->msix->host_enable--;
+
+new_control = control & ~(PCI_MSIX_FLAGS_ENABLE | PCI_MSIX_FLAGS_MASKALL);
+
+if ( dev->msix->host_enable || dev->msix->guest_enable )
+new_control |= PCI_MSIX_FLAGS_ENABLE;
+if ( dev->msix->host_maskall || dev->msix->guest_maskall || 
dev->msix->host_enable )
+new_control |= PCI_MSIX_FLAGS_MASKALL;
+if ( new_control != control )
+pci_conf_write16(dev->sbdf, msix_control_reg(pos), control);
+
+spin_unlock_irqrestore(>msix->control_lock, flags);
+}
+
 /*
  * MSI message composition
  */
@@ -288,7 +339,7 @@ static void msi_set_enable(struct pci_dev *dev, int enable)
 __msi_set_enable(seg, bus, slot, func, pos, enable);
 }
 
-static void msix_set_enable(struct pci_dev *dev, int enable)
+static void msix_force_disable(struct pci_dev *dev)
 {
 int pos;
 u16 control, seg = dev->seg;
@@ -299,11 +350,16 @@ static void msix_set_enable(struct pci_dev *dev, int 
enable)
 pos =