On Tue, 13 Nov 2018 15:42:49 +0100
Auger Eric <eric.au...@redhat.com> wrote:

> Hi Alex,
> 
> On 11/9/18 11:09 PM, Alex Williamson wrote:
> > In commit 61d792562b53 ("vfio-pci: Use mutex around open, release, and
> > remove") a mutex was added to freeze the refcnt for a device so that
> > we can handle errors and perform bus resets on final close.  However,
> > bus resets can be rather slow and a global mutex here is undesirable.
> > A per-device mutex provides the best granularity, but then our chances
> > of triggering a bus/slot reset with multiple affected devices is slim
> > when devices are released in parallel.  
> Sorry I don't get the above sentence.

There's a locking granularity question here, where currently we're
locking at a global level.  If I want to reduce that granularity, it
seems the obvious question is what minimum granularity can we achieve.
A per-device lock it technically that minimum for the purposes of
serializing the device around open/release, but then concurrent
releases don't necessarily provide us the opportunity to perform a bus
reset affecting multiple devices since all the devices are racing each
other.  Therefore I conclude that a bus/slot locking granularity
provides us the best compromise of granularity vs functionality.

>   Instead create a reflck object
> > shared among all devices under the same bus or slot, allowing devices
> > on independent buses to be released in parallel while serializing per  
> > bus/slot.>  
> > Reported-by: Christian Ehrhardt <christian.ehrha...@canonical.com>
> > Tested-by: Christian Ehrhardt <christian.ehrha...@canonical.com>
> > Signed-off-by: Alex Williamson <alex.william...@redhat.com>
> > ---
> >  drivers/vfio/pci/vfio_pci.c         |  157 
> > ++++++++++++++++++++++++++++++-----
> >  drivers/vfio/pci/vfio_pci_private.h |    6 +
> >  2 files changed, 139 insertions(+), 24 deletions(-)
> > 
> > diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> > index 50cdedfca9fe..d443fb7a4e75 100644
> > --- a/drivers/vfio/pci/vfio_pci.c
> > +++ b/drivers/vfio/pci/vfio_pci.c
> > @@ -56,8 +56,6 @@ module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR);
> >  MODULE_PARM_DESC(disable_idle_d3,
> >              "Disable using the PCI D3 low power state for idle, unused 
> > devices");
> >  
> > -static DEFINE_MUTEX(driver_lock);
> > -
> >  static inline bool vfio_vga_disabled(void)
> >  {
> >  #ifdef CONFIG_VFIO_PCI_VGA
> > @@ -393,14 +391,14 @@ static void vfio_pci_release(void *device_data)
> >  {
> >     struct vfio_pci_device *vdev = device_data;
> >  
> > -   mutex_lock(&driver_lock);
> > +   mutex_lock(&vdev->reflck->lock);
> >  
> >     if (!(--vdev->refcnt)) {
> >             vfio_spapr_pci_eeh_release(vdev->pdev);
> >             vfio_pci_disable(vdev);
> >     }
> >  
> > -   mutex_unlock(&driver_lock);
> > +   mutex_unlock(&vdev->reflck->lock);
> >  
> >     module_put(THIS_MODULE);
> >  }
> > @@ -413,7 +411,7 @@ static int vfio_pci_open(void *device_data)
> >     if (!try_module_get(THIS_MODULE))
> >             return -ENODEV;
> >  
> > -   mutex_lock(&driver_lock);
> > +   mutex_lock(&vdev->reflck->lock);
> >  
> >     if (!vdev->refcnt) {
> >             ret = vfio_pci_enable(vdev);
> > @@ -424,7 +422,7 @@ static int vfio_pci_open(void *device_data)
> >     }
> >     vdev->refcnt++;
> >  error:
> > -   mutex_unlock(&driver_lock);
> > +   mutex_unlock(&vdev->reflck->lock);
> >     if (ret)
> >             module_put(THIS_MODULE);
> >     return ret;
> > @@ -1187,6 +1185,9 @@ static const struct vfio_device_ops vfio_pci_ops = {
> >     .request        = vfio_pci_request,
> >  };
> >  
> > +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
> > +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
> > +
> >  static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
> > *id)
> >  {
> >     struct vfio_pci_device *vdev;
> > @@ -1233,6 +1234,14 @@ static int vfio_pci_probe(struct pci_dev *pdev, 
> > const struct pci_device_id *id)
> >             return ret;
> >     }
> >  
> > +   ret = vfio_pci_reflck_attach(vdev);
> > +   if (ret) {
> > +           vfio_del_group_dev(&pdev->dev);
> > +           vfio_iommu_group_put(group, &pdev->dev);
> > +           kfree(vdev);
> > +           return ret;
> > +   }
> > +
> >     if (vfio_pci_is_vga(pdev)) {
> >             vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
> >             vga_set_legacy_decoding(pdev,
> > @@ -1264,6 +1273,8 @@ static void vfio_pci_remove(struct pci_dev *pdev)
> >     if (!vdev)
> >             return;
> >  
> > +   vfio_pci_reflck_put(vdev->reflck);
> > +
> >     vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
> >     kfree(vdev->region);
> >     mutex_destroy(&vdev->ioeventfds_lock);
> > @@ -1320,16 +1331,97 @@ static struct pci_driver vfio_pci_driver = {
> >     .err_handler    = &vfio_err_handlers,
> >  };
> >  
> > +static DEFINE_MUTEX(reflck_lock);
> > +
> > +static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
> > +{
> > +   struct vfio_pci_reflck *reflck;
> > +
> > +   reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
> > +   if (!reflck)
> > +           return ERR_PTR(-ENOMEM);
> > +
> > +   kref_init(&reflck->kref);
> > +   mutex_init(&reflck->lock);
> > +
> > +   return reflck;
> > +}
> > +
> > +static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
> > +{
> > +   kref_get(&reflck->kref);
> > +}
> > +
> > +static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
> > +{
> > +   struct vfio_pci_reflck **preflck = data;
> > +   struct vfio_device *device;
> > +   struct vfio_pci_device *vdev;
> > +
> > +   device = vfio_device_get_from_dev(&pdev->dev);
> > +   if (!device)
> > +           return 0;
> > +
> > +   if (pci_dev_driver(pdev) != &vfio_pci_driver) {
> > +           vfio_device_put(device);
> > +           return 0;
> > +   }
> > +
> > +   vdev = vfio_device_data(device);
> > +
> > +   if (vdev->reflck) {
> > +           vfio_pci_reflck_get(vdev->reflck);
> > +           *preflck = vdev->reflck;
> > +           vfio_device_put(device);
> > +           return 1;
> > +   }
> > +
> > +   vfio_device_put(device);
> > +   return 0;
> > +}
> > +
> > +static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
> > +{
> > +   bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
> > +
> > +   mutex_lock(&reflck_lock);
> > +
> > +   if (pci_is_root_bus(vdev->pdev->bus) ||
> > +       vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
> > +                                     &vdev->reflck, slot) <= 0)  
> !vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
>                                         &vdev->reflck, slot)?
> I don't think we can have negative returned value.

Correct, our callback function does not return a negative, but it's
certainly within the framework of this callback to do so.  I thought it
was more consistent with the framework to handle that case so we don't
overlook it in the future should the callback function change.

> > +           vdev->reflck = vfio_pci_reflck_alloc();
> > +
> > +   mutex_unlock(&reflck_lock);
> > +
> > +   return IS_ERR(vdev->reflck) ? PTR_ERR(vdev->reflck) : 0;
> > +}
> > +
> > +static void vfio_pci_reflck_release(struct kref *kref)
> > +{
> > +   struct vfio_pci_reflck *reflck = container_of(kref,
> > +                                                 struct vfio_pci_reflck,
> > +                                                 kref);
> > +
> > +   kfree(reflck);
> > +   mutex_unlock(&reflck_lock);
> > +}
> > +
> > +static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
> > +{
> > +   kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
> > +}
> > +
> >  struct vfio_devices {
> >     struct vfio_device **devices;
> >     int cur_index;
> >     int max_index;
> >  };
> >  
> > -static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
> > +static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
> >  {
> >     struct vfio_devices *devs = data;
> >     struct vfio_device *device;
> > +   struct vfio_pci_device *vdev;
> >  
> >     if (devs->cur_index == devs->max_index)
> >             return -ENOSPC;
> > @@ -1343,16 +1435,25 @@ static int vfio_pci_get_devs(struct pci_dev *pdev, 
> > void *data)
> >             return -EBUSY;
> >     }
> >  
> > +   vdev = vfio_device_data(device);
> > +
> > +   /* Only collect unused devices */  
> abort if the slot/bus features a used device?

Correct.  Is this a suggestion for a wording change?

> > +   if (vdev->refcnt) {
> > +           vfio_device_put(device);
> > +           return -EBUSY;
> > +   }
> > +
> >     devs->devices[devs->cur_index++] = device;
> >     return 0;
> >  }
> >  
> >  /*
> > - * Attempt to do a bus/slot reset if there are devices affected by a reset 
> > for
> > - * this device that are needs_reset and all of the affected devices are 
> > unused
> > - * (!refcnt). Callers are required to hold driver_lock when calling this to
> > - * prevent device opens and concurrent bus reset attempts.  We prevent 
> > device
> > - * unbinds by acquiring and holding a reference to the vfio_device.
> > + * Attempt to do a bus/slot reset if there are devices affected by a reset
> > + * for this device that are "needs_reset" and all of the affected devices
> > + * are unused (!refcnt).  
> 
> This comment still sounds a bit cryptic to me. Assuming I got the point,
> may I suggest something like:
> 
> If one of the device of the slot/bus needs a reset (but cannot be reset
> at function level) and all the devices of the slot/bus are unused
> (!refcount), we attempt to do a bus/slot reset.

I don't disagree that the original wording is somewhat convoluted, but
I don't see that this is a tremendous improvement.  How about:

  If a bus or slot reset is available for the provided device and:
    - All of the devices affected by that bus or slot reset are unused
      (!refcnt)
    - At least one of the affected devices is marked dirty via
      needs_reset (such as by lack of FLR support)
  Then attempt to perform that bus or slot reset.

>   Callers are required to hold vdev->reflck->lock
> > + * to prevent concurrent device opens, which is expected to protect all
> > + * affected devices.  A vfio_device reference is also acquired for each
> > + * affected device to prevent unbinds.
> >   *
> >   * NB: vfio-core considers a group to be viable even if some devices are
> >   * bound to drivers like pci-stub or pcieport.  Here we require all devices
> > @@ -1363,7 +1464,7 @@ static void vfio_pci_try_bus_reset(struct 
> > vfio_pci_device *vdev)
> >  {
> >     struct vfio_devices devs = { .cur_index = 0 };
> >     int i = 0, ret = -EINVAL;
> > -   bool needs_reset = false, slot = false;
> > +   bool slot = false;
> >     struct vfio_pci_device *tmp;
> >  
> >     if (!pci_probe_reset_slot(vdev->pdev->slot))
> > @@ -1381,28 +1482,36 @@ static void vfio_pci_try_bus_reset(struct 
> > vfio_pci_device *vdev)
> >             return;
> >  
> >     if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
> > -                                     vfio_pci_get_devs, &devs, slot))
> > +                                     vfio_pci_get_unused_devs,
> > +                                     &devs, slot))
> >             goto put_devs;
> >  
> > +   /* Does at least one need a reset? */
> >     for (i = 0; i < devs.cur_index; i++) {
> >             tmp = vfio_device_data(devs.devices[i]);
> > -           if (tmp->needs_reset)
> > -                   needs_reset = true;
> > -           if (tmp->refcnt)
> > -                   goto put_devs;
> > +           if (tmp->needs_reset) {
> > +                   ret = pci_reset_bus(vdev->pdev);
> > +                   break;
> > +           }
> >     }
> >  
> > -   if (needs_reset)
> > -           ret = pci_reset_bus(vdev->pdev);
> > -
> >  put_devs:
> >     for (i = 0; i < devs.cur_index; i++) {
> >             tmp = vfio_device_data(devs.devices[i]);
> > -           if (!ret)
> > +
> > +           /*
> > +            * If reset was successful, affected devices no longer need
> > +            * a reset and we should return all the collateral devices
> > +            * to low power.  If not successful, we either didn't reset
> > +            * the bus or timed out waiting for it, so let's not touch
> > +            * the power state.
> > +            */
> > +           if (!ret) {
> >                     tmp->needs_reset = false;
> >  
> > -           if (!tmp->refcnt && !disable_idle_d3)
> > -                   pci_set_power_state(tmp->pdev, PCI_D3hot);
> > +                   if (tmp != vdev && !disable_idle_d3)
> > +                           pci_set_power_state(tmp->pdev, PCI_D3hot);
> > +           }
> >  
> >             vfio_device_put(devs.devices[i]);
> >     }
> > diff --git a/drivers/vfio/pci/vfio_pci_private.h 
> > b/drivers/vfio/pci/vfio_pci_private.h
> > index cde3b5d3441a..aa2355e67340 100644
> > --- a/drivers/vfio/pci/vfio_pci_private.h
> > +++ b/drivers/vfio/pci/vfio_pci_private.h
> > @@ -76,6 +76,11 @@ struct vfio_pci_dummy_resource {
> >     struct list_head        res_next;
> >  };
> >  
> > +struct vfio_pci_reflck {
> > +   struct kref             kref;
> > +   struct mutex            lock;
> > +};
> > +
> >  struct vfio_pci_device {
> >     struct pci_dev          *pdev;
> >     void __iomem            *barmap[PCI_STD_RESOURCE_END + 1];
> > @@ -104,6 +109,7 @@ struct vfio_pci_device {
> >     bool                    needs_reset;
> >     bool                    nointx;
> >     struct pci_saved_state  *pci_saved_state;
> > +   struct vfio_pci_reflck  *reflck;
> >     int                     refcnt;
> >     int                     ioeventfds_nr;
> >     struct eventfd_ctx      *err_trigger;
> >   
> Reviewed-by: Eric Auger <eric.au...@redhat.com>

Thanks!

Alex

Reply via email to