Hi Alex,

On 11/9/18 11:09 PM, Alex Williamson wrote:
> In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
> remove") a mutex was added to freeze the refcnt for a device so that
> we can handle errors and perform bus resets on final close.  However,
> bus resets can be rather slow and a global mutex here is undesirable.
> A per-device mutex provides the best granularity, but then our chances
> of triggering a bus/slot reset with multiple affected devices is slim
> when devices are released in parallel.
Sorry I don't get the above sentence.
  Instead create a reflck object
> shared among all devices under the same bus or slot, allowing devices
> on independent buses to be released in parallel while serializing per
> bus/slot.>
> Reported-by: Christian Ehrhardt <christian.ehrha...@canonical.com>
> Tested-by: Christian Ehrhardt <christian.ehrha...@canonical.com>
> Signed-off-by: Alex Williamson <alex.william...@redhat.com>
> ---
>  drivers/vfio/pci/vfio_pci.c         |  157 
> ++++++++++++++++++++++++++++++-----
>  drivers/vfio/pci/vfio_pci_private.h |    6 +
>  2 files changed, 139 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 50cdedfca9fe..d443fb7a4e75 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -56,8 +56,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
>  MODULE_PARM_DESC(disable_idle_d3,
>                "Disable using the PCI D3 low power state for idle, unused 
> devices");
>  
> -static DEFINE_MUTEX(driver_lock);
> -
>  static inline bool vfio_vga_disabled(void)
>  {
>  #ifdef CONFIG_VFIO_PCI_VGA
> @@ -393,14 +391,14 @@ static void vfio_pci_release(void *device_data)
>  {
>       struct vfio_pci_device *vdev = device_data;
>  
> -     mutex_lock(&driver_lock);
> +     mutex_lock(&vdev->reflck->lock);
>  
>       if (!(--vdev->refcnt)) {
>               vfio_spapr_pci_eeh_release(vdev->pdev);
>               vfio_pci_disable(vdev);
>       }
>  
> -     mutex_unlock(&driver_lock);
> +     mutex_unlock(&vdev->reflck->lock);
>  
>       module_put(THIS_MODULE);
>  }
> @@ -413,7 +411,7 @@ static int vfio_pci_open(void *device_data)
>       if (!try_module_get(THIS_MODULE))
>               return -ENODEV;
>  
> -     mutex_lock(&driver_lock);
> +     mutex_lock(&vdev->reflck->lock);
>  
>       if (!vdev->refcnt) {
>               ret = vfio_pci_enable(vdev);
> @@ -424,7 +422,7 @@ static int vfio_pci_open(void *device_data)
>       }
>       vdev->refcnt++;
>  error:
> -     mutex_unlock(&driver_lock);
> +     mutex_unlock(&vdev->reflck->lock);
>       if (ret)
>               module_put(THIS_MODULE);
>       return ret;
> @@ -1187,6 +1185,9 @@ static const struct vfio_device_ops vfio_pci_ops = {
>       .request        = vfio_pci_request,
>  };
>  
> +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
> +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
> +
>  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
> *id)
>  {
>       struct vfio_pci_device *vdev;
> @@ -1233,6 +1234,14 @@ static int vfio_pci_probe(struct pci_dev *pdev, const 
> struct pci_device_id *id)
>               return ret;
>       }
>  
> +     ret = vfio_pci_reflck_attach(vdev);
> +     if (ret) {
> +             vfio_del_group_dev(&pdev->dev);
> +             vfio_iommu_group_put(group, &pdev->dev);
> +             kfree(vdev);
> +             return ret;
> +     }
> +
>       if (vfio_pci_is_vga(pdev)) {
>               vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
>               vga_set_legacy_decoding(pdev,
> @@ -1264,6 +1273,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
>       if (!vdev)
>               return;
>  
> +     vfio_pci_reflck_put(vdev->reflck);
> +
>       vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
>       kfree(vdev->region);
>       mutex_destroy(&vdev->ioeventfds_lock);
> @@ -1320,16 +1331,97 @@ static struct pci_driver vfio_pci_driver = {
>       .err_handler    = &vfio_err_handlers,
>  };
>  
> +static DEFINE_MUTEX(reflck_lock);
> +
> +static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
> +{
> +     struct vfio_pci_reflck *reflck;
> +
> +     reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
> +     if (!reflck)
> +             return ERR_PTR(-ENOMEM);
> +
> +     kref_init(&reflck->kref);
> +     mutex_init(&reflck->lock);
> +
> +     return reflck;
> +}
> +
> +static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
> +{
> +     kref_get(&reflck->kref);
> +}
> +
> +static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
> +{
> +     struct vfio_pci_reflck **preflck = data;
> +     struct vfio_device *device;
> +     struct vfio_pci_device *vdev;
> +
> +     device = vfio_device_get_from_dev(&pdev->dev);
> +     if (!device)
> +             return 0;
> +
> +     if (pci_dev_driver(pdev) != &vfio_pci_driver) {
> +             vfio_device_put(device);
> +             return 0;
> +     }
> +
> +     vdev = vfio_device_data(device);
> +
> +     if (vdev->reflck) {
> +             vfio_pci_reflck_get(vdev->reflck);
> +             *preflck = vdev->reflck;
> +             vfio_device_put(device);
> +             return 1;
> +     }
> +
> +     vfio_device_put(device);
> +     return 0;
> +}
> +
> +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
> +{
> +     bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
> +
> +     mutex_lock(&reflck_lock);
> +
> +     if (pci_is_root_bus(vdev->pdev->bus) ||
> +         vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
> +                                       &vdev->reflck, slot) <= 0)
!vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
                                          &vdev->reflck, slot)?
I don't think we can have negative returned value.
> +             vdev->reflck = vfio_pci_reflck_alloc();
> +
> +     mutex_unlock(&reflck_lock);
> +
> +     return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0;
> +}
> +
> +static void vfio_pci_reflck_release(struct kref *kref)
> +{
> +     struct vfio_pci_reflck *reflck = container_of(kref,
> +                                                   struct vfio_pci_reflck,
> +                                                   kref);
> +
> +     kfree(reflck);
> +     mutex_unlock(&reflck_lock);
> +}
> +
> +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
> +{
> +     kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
> +}
> +
>  struct vfio_devices {
>       struct vfio_device **devices;
>       int cur_index;
>       int max_index;
>  };
>  
> -static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
> +static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
>  {
>       struct vfio_devices *devs = data;
>       struct vfio_device *device;
> +     struct vfio_pci_device *vdev;
>  
>       if (devs->cur_index == devs->max_index)
>               return -ENOSPC;
> @@ -1343,16 +1435,25 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, 
> void *data)
>               return -EBUSY;
>       }
>  
> +     vdev = vfio_device_data(device);
> +
> +     /* Only collect unused devices */
abort if the slot/bus features a used device?
> +     if (vdev->refcnt) {
> +             vfio_device_put(device);
> +             return -EBUSY;
> +     }
> +
>       devs->devices[devs->cur_index++] = device;
>       return 0;
>  }
>  
>  /*
> - * Attempt to do a bus/slot reset if there are devices affected by a reset 
> for
> - * this device that are needs_reset and all of the affected devices are 
> unused
> - * (!refcnt). Callers are required to hold driver_lock when calling this to
> - * prevent device opens and concurrent bus reset attempts.  We prevent device
> - * unbinds by acquiring and holding a reference to the vfio_device.
> + * Attempt to do a bus/slot reset if there are devices affected by a reset
> + * for this device that are "needs_reset" and all of the affected devices
> + * are unused (!refcnt).

This comment still sounds a bit cryptic to me. Assuming I got the point,
may I suggest something like:

If one of the device of the slot/bus needs a reset (but cannot be reset
at function level) and all the devices of the slot/bus are unused
(!refcount), we attempt to do a bus/slot reset.

  Callers are required to hold vdev->reflck->lock
> + * to prevent concurrent device opens, which is expected to protect all
> + * affected devices.  A vfio_device reference is also acquired for each
> + * affected device to prevent unbinds.
>   *
>   * NB: vfio-core considers a group to be viable even if some devices are
>   * bound to drivers like pci-stub or pcieport.  Here we require all devices
> @@ -1363,7 +1464,7 @@ static void vfio_pci_try_bus_reset(struct 
> vfio_pci_device *vdev)
>  {
>       struct vfio_devices devs = { .cur_index = 0 };
>       int i = 0, ret = -EINVAL;
> -     bool needs_reset = false, slot = false;
> +     bool slot = false;
>       struct vfio_pci_device *tmp;
>  
>       if (!pci_probe_reset_slot(vdev->pdev->slot))
> @@ -1381,28 +1482,36 @@ static void vfio_pci_try_bus_reset(struct 
> vfio_pci_device *vdev)
>               return;
>  
>       if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
> -                                       vfio_pci_get_devs, &devs, slot))
> +                                       vfio_pci_get_unused_devs,
> +                                       &devs, slot))
>               goto put_devs;
>  
> +     /* Does at least one need a reset? */
>       for (i = 0; i < devs.cur_index; i++) {
>               tmp = vfio_device_data(devs.devices[i]);
> -             if (tmp->needs_reset)
> -                     needs_reset = true;
> -             if (tmp->refcnt)
> -                     goto put_devs;
> +             if (tmp->needs_reset) {
> +                     ret = pci_reset_bus(vdev->pdev);
> +                     break;
> +             }
>       }
>  
> -     if (needs_reset)
> -             ret = pci_reset_bus(vdev->pdev);
> -
>  put_devs:
>       for (i = 0; i < devs.cur_index; i++) {
>               tmp = vfio_device_data(devs.devices[i]);
> -             if (!ret)
> +
> +             /*
> +              * If reset was successful, affected devices no longer need
> +              * a reset and we should return all the collateral devices
> +              * to low power.  If not successful, we either didn't reset
> +              * the bus or timed out waiting for it, so let's not touch
> +              * the power state.
> +              */
> +             if (!ret) {
>                       tmp->needs_reset = false;
>  
> -             if (!tmp->refcnt && !disable_idle_d3)
> -                     pci_set_power_state(tmp->pdev, PCI_D3hot);
> +                     if (tmp != vdev && !disable_idle_d3)
> +                             pci_set_power_state(tmp->pdev, PCI_D3hot);
> +             }
>  
>               vfio_device_put(devs.devices[i]);
>       }
> diff --git a/drivers/vfio/pci/vfio_pci_private.h 
> b/drivers/vfio/pci/vfio_pci_private.h
> index cde3b5d3441a..aa2355e67340 100644
> --- a/drivers/vfio/pci/vfio_pci_private.h
> +++ b/drivers/vfio/pci/vfio_pci_private.h
> @@ -76,6 +76,11 @@ struct vfio_pci_dummy_resource {
>       struct list_head        res_next;
>  };
>  
> +struct vfio_pci_reflck {
> +     struct kref             kref;
> +     struct mutex            lock;
> +};
> +
>  struct vfio_pci_device {
>       struct pci_dev          *pdev;
>       void __iomem            *barmap[PCI_STD_RESOURCE_END + 1];
> @@ -104,6 +109,7 @@ struct vfio_pci_device {
>       bool                    needs_reset;
>       bool                    nointx;
>       struct pci_saved_state  *pci_saved_state;
> +     struct vfio_pci_reflck  *reflck;
>       int                     refcnt;
>       int                     ioeventfds_nr;
>       struct eventfd_ctx      *err_trigger;
> 
Reviewed-by: Eric Auger <eric.au...@redhat.com>

Thanks

Eric

Reply via email to