On Tue, Nov 02, 2010 at 04:49:20PM +0100, Jan Kiszka wrote:
> PCI 2.3 allows to generically disable IRQ sources at device level. This
> enables us to share IRQs of such devices between on the host side when
> passing them to a guest.
> 
> Signed-off-by: Jan Kiszka <jan.kis...@siemens.com>
> ---
>  include/linux/kvm_host.h |    1 +
>  virt/kvm/assigned-dev.c  |  194 
> ++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 180 insertions(+), 15 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 46120da..fdc2bd9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -466,6 +466,7 @@ struct kvm_assigned_dev_kernel {
>       unsigned int entries_nr;
>       int host_irq;
>       bool host_irq_disabled;
> +     bool pci_2_3;
>       struct msix_entry *host_msix_entries;
>       int guest_irq;
>       struct msix_entry *guest_msix_entries;
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ca402ed..91fe9c8 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -55,17 +55,145 @@ static int find_index_from_host_irq(struct 
> kvm_assigned_dev_kernel
>       return index;
>  }
>  
> +/*
> + * Verify that the device supports Interrupt Disable bit in command register,
> + * per PCI 2.3, by flipping this bit and reading it back: this bit was 
> readonly
> + * in PCI 2.2.
> + */
> +static bool pci_2_3_supported(struct pci_dev *pdev)
> +{
> +     bool supported = false;
> +     u16 orig, new;
> +
> +     pci_block_user_cfg_access(pdev);
> +     pci_read_config_word(pdev, PCI_COMMAND, &orig);
> +     pci_write_config_word(pdev, PCI_COMMAND,
> +                           orig ^ PCI_COMMAND_INTX_DISABLE);
> +     pci_read_config_word(pdev, PCI_COMMAND, &new);
> +
> +     /*
> +      * There's no way to protect against
> +      * hardware bugs or detect them reliably, but as long as we know
> +      * what the value should be, let's go ahead and check it.
> +      */
> +     if ((new ^ orig) & ~PCI_COMMAND_INTX_DISABLE) {
> +             dev_err(&pdev->dev, "Command changed from 0x%x to 0x%x: "
> +                     "driver or HW bug?\n", orig, new);
> +             goto out;
> +     }
> +     if (!((new ^ orig) & PCI_COMMAND_INTX_DISABLE)) {
> +             dev_warn(&pdev->dev, "Device does not support "
> +                      "disabling interrupts: unable to bind.\n");
> +             goto out;
> +     }
> +     supported = true;
> +
> +     /* Now restore the original value. */
> +     pci_write_config_word(pdev, PCI_COMMAND, orig);
> +
> +out:
> +     pci_unblock_user_cfg_access(pdev);
> +     return supported;
> +}
> +
> +static unsigned int
> +pci_2_3_set_irq_mask(struct pci_dev *dev, bool mask, bool check_status)
> +{
> +     u32 cmd_status_dword;
> +     u16 origcmd, newcmd;
> +     unsigned int status;
> +
> +     /*
> +      * We do a single dword read to retrieve both command and status.
> +      * Document assumptions that make this possible.
> +      */
> +     BUILD_BUG_ON(PCI_COMMAND % 4);
> +     BUILD_BUG_ON(PCI_COMMAND + 2 != PCI_STATUS);
> +
> +     pci_block_user_cfg_access(dev);
> +
> +     /*
> +      * Read both command and status registers in a single 32-bit operation.
> +      * Note: we could cache the value for command and move the status read
> +      * out of the lock if there was a way to get notified of user changes
> +      * to command register through sysfs. Should be good for shared irqs.
> +      */
> +     pci_read_config_dword(dev, PCI_COMMAND, &cmd_status_dword);
> +     origcmd = cmd_status_dword;
> +     status = cmd_status_dword >> 16;
> +
> +     if (check_status) {
> +             bool irq_pending = status & PCI_STATUS_INTERRUPT;
> +
> +             /*
> +              * Check interrupt status register to see whether our device
> +              * triggered the interrupt (when masking) or the next IRQ is
> +              * already pending (when unmasking).
> +              */
> +             if (!(mask == irq_pending))

Same as mask != irq_pending?

> +                     goto done;
> +     }
> +
> +     newcmd = origcmd & ~PCI_COMMAND_INTX_DISABLE;
> +     if (mask)
> +             newcmd |= PCI_COMMAND_INTX_DISABLE;
> +     if (newcmd != origcmd)
> +             pci_write_config_word(dev, PCI_COMMAND, newcmd);
> +
> +done:
> +     pci_unblock_user_cfg_access(dev);
> +     return status;
> +}
> +
> +static void pci_2_3_irq_mask(struct pci_dev *dev)
> +{
> +     pci_2_3_set_irq_mask(dev, true, false);
> +}
> +
> +static unsigned int pci_2_3_irq_check_and_mask(struct pci_dev *dev)
> +{
> +     return pci_2_3_set_irq_mask(dev, true, true);
> +}
> +
> +static void pci_2_3_irq_unmask(struct pci_dev *dev)
> +{
> +     pci_2_3_set_irq_mask(dev, false, false);
> +}
> +
> +static unsigned int pci_2_3_irq_check_and_unmask(struct pci_dev *dev)
> +{
> +     return pci_2_3_set_irq_mask(dev, false, true);
> +}
> +

IMO this is not a terribly good interface: all users check the pending bit
(PCI_STATUS_INTERRUPT) which is what the function pci_2_3_set_irq_mask
did anyway.  I'd suggest returning irqreturn_t or bool and not unsigned
int.


> +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
> +{
> +     struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
> +     int ret = IRQ_WAKE_THREAD;
> +
> +     spin_lock(&assigned_dev->intx_lock);
> +     if (assigned_dev->host_irq_disabled ||
> +         !(pci_2_3_irq_check_and_mask(assigned_dev->dev) &
> +                     PCI_STATUS_INTERRUPT))
> +             ret = IRQ_NONE;
> +     else
> +             assigned_dev->host_irq_disabled = true;

This is a bug I think.  For pci 2.3 we should never track interrupt
state in kvm IMO.  For example, if userspace unmasks an interrupt
through a config write, we will get an interrupt while host_irq_disabled
is set. If we then fail to mask it, kaboom.

> +     spin_unlock(&assigned_dev->intx_lock);
> +
> +     return ret;
> +}
> +
>  static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id)
>  {
>       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
>       u32 vector;
>       int index;
>  
> -     if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) {
> -             spin_lock(&assigned_dev->intx_lock);
> +     if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX &&
> +         !assigned_dev->pci_2_3) {
> +             spin_lock_irq(&assigned_dev->intx_lock);
>               disable_irq_nosync(irq);
>               assigned_dev->host_irq_disabled = true;
> -             spin_unlock(&assigned_dev->intx_lock);
> +             spin_unlock_irq(&assigned_dev->intx_lock);
>       }
>  
>       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
> @@ -87,6 +215,7 @@ static irqreturn_t kvm_assigned_dev_thread(int irq, void 
> *dev_id)
>  static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
>  {
>       struct kvm_assigned_dev_kernel *dev;
> +     bool reassert = false;
>  
>       if (kian->gsi == -1)
>               return;
> @@ -99,12 +228,23 @@ static void kvm_assigned_dev_ack_irq(struct 
> kvm_irq_ack_notifier *kian)
>       /* The guest irq may be shared so this ack may be
>        * from another device.
>        */
> -     spin_lock(&dev->intx_lock);
> +     spin_lock_irq(&dev->intx_lock);
>       if (dev->host_irq_disabled) {
> -             enable_irq(dev->host_irq);
> +             if (dev->pci_2_3) {
> +                     if (pci_2_3_irq_check_and_unmask(dev->dev) &
> +                         PCI_STATUS_INTERRUPT) {
> +                             reassert = true;
> +                             goto out;
> +                     }
> +             } else
> +                     enable_irq(dev->host_irq);

Or

                if (!dev->pci_2_3)
                        enable_irq(dev->host_irq);
                else if (pci_2_3_irq_check_and_unmask(dev->dev) & 
PCI_STATUS_INTERRUPT) {
                        reassert = true;
                        goto out;
                }

to reduce nesting.

>               dev->host_irq_disabled = false;
>       }
> -     spin_unlock(&dev->intx_lock);
> +out:
> +     spin_unlock_irq(&dev->intx_lock);
> +
> +     if (reassert)
> +             kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 1);

Hmm, I think this still has more overhead than it needs to have.
Instead of setting level to 0 and then back to 1, can't we just
avoid set to 1 in the first place? This would need a different
interface than pci_2_3_irq_check_and_unmask to avoid a race
where interrupt is received while we are acking another one:

        block userspace access
        check pending bit
        if (!pending)
                set irq (0)
        clear pending
        block userspace access

Would be worth it for high volume devices.

>  }
>  
>  static void deassign_guest_irq(struct kvm *kvm,
> @@ -151,7 +291,11 @@ static void deassign_host_irq(struct kvm *kvm,
>               pci_disable_msix(assigned_dev->dev);
>       } else {
>               /* Deal with MSI and INTx */
> -             disable_irq(assigned_dev->host_irq);
> +             if (assigned_dev->pci_2_3) {
> +                     pci_2_3_irq_mask(assigned_dev->dev);
> +                     synchronize_irq(assigned_dev->host_irq);
> +             } else
> +                     disable_irq(assigned_dev->host_irq);
>  
>               free_irq(assigned_dev->host_irq, (void *)assigned_dev);
>  
> @@ -199,6 +343,13 @@ static void kvm_free_assigned_device(struct kvm *kvm,
>  
>       pci_reset_function(assigned_dev->dev);
>  
> +     /*
> +      * Unmask the IRQ at PCI level once the reset is done - the next user
> +      * may not expect the IRQ being masked.
> +      */
> +     if (assigned_dev->pci_2_3)
> +             pci_2_3_irq_unmask(assigned_dev->dev);
> +

Doesn't pci_reset_function clear mask bit? It seems to ...

>       pci_release_regions(assigned_dev->dev);
>       pci_disable_device(assigned_dev->dev);
>       pci_dev_put(assigned_dev->dev);
> @@ -224,15 +375,29 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
>  static int assigned_device_enable_host_intx(struct kvm *kvm,
>                                           struct kvm_assigned_dev_kernel *dev)
>  {
> +     irq_handler_t irq_handler = NULL;
> +     unsigned long flags = 0;
> +
>       dev->host_irq = dev->dev->irq;
> -     /* Even though this is PCI, we don't want to use shared
> -      * interrupts. Sharing host devices with guest-assigned devices
> -      * on the same interrupt line is not a happy situation: there
> -      * are going to be long delays in accepting, acking, etc.
> +
> +     /*
> +      * We can only share the IRQ line with other host devices if we are
> +      * able to disable the IRQ source at device-level - independently of
> +      * the guest driver. Otherwise host devices may suffer from unbounded
> +      * IRQ latencies when the guest keeps the line asserted.
>        */
> -     if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread,
> -                              0, dev->irq_name, (void *)dev))
> +     dev->pci_2_3 = pci_2_3_supported(dev->dev);
> +     if (dev->pci_2_3) {
> +             irq_handler = kvm_assigned_dev_intr;
> +             flags = IRQF_SHARED;
> +     }

I would prefer and else clause here instead of initializing
variables at the top and overwriting here. Makes it easier
to see which value is valid when.

> +     if (request_threaded_irq(dev->host_irq, irq_handler,
> +                              kvm_assigned_dev_thread, flags,
> +                              dev->irq_name, (void *)dev))
>               return -EIO;
> +
> +     if (dev->pci_2_3)
> +             pci_2_3_irq_unmask(dev->dev);
>       return 0;
>  }
>  
> @@ -308,7 +473,6 @@ static int assigned_device_enable_guest_msi(struct kvm 
> *kvm,
>  {
>       dev->guest_irq = irq->guest_irq;
>       dev->ack_notifier.gsi = -1;
> -     dev->host_irq_disabled = false;
>       return 0;
>  }
>  #endif
> @@ -320,7 +484,6 @@ static int assigned_device_enable_guest_msix(struct kvm 
> *kvm,
>  {
>       dev->guest_irq = irq->guest_irq;
>       dev->ack_notifier.gsi = -1;
> -     dev->host_irq_disabled = false;
>       return 0;
>  }
>  #endif
> @@ -354,6 +517,7 @@ static int assign_host_irq(struct kvm *kvm,
>       default:
>               r = -EINVAL;
>       }
> +     dev->host_irq_disabled = false;
>  
>       if (!r)
>               dev->irq_requested_type |= host_irq_type;
> -- 
> 1.7.1
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to