Hi Alex,

> -----Original Message-----
> From: Alex Williamson [mailto:alex.william...@redhat.com]
> Sent: Thursday, December 07, 2017 4:08 PM
> To: Shameerali Kolothum Thodi <shameerali.kolothum.th...@huawei.com>
> Cc: eric.au...@redhat.com; pmo...@linux.vnet.ibm.com;
> k...@vger.kernel.org; linux-kernel@vger.kernel.org; Linuxarm
> <linux...@huawei.com>
> Subject: Re: [RFC] vfio/type1: Add IOVA_RANGE capability support
> 
> On Wed, 6 Dec 2017 16:07:36 +0000
> Shameer Kolothum <shameerali.kolothum.th...@huawei.com> wrote:
> 
> > This patch allows the user-space to retrieve the supported
> > IOVA range(s), excluding any reserved regions. The implementation
> > is based on capability chains, added to the VFIO_IOMMU_GET_INFO ioctl.
> >
> > This is following the discussions here[1] and is based on the RFC patch[2].
> >
> > ToDo:
> >  - This currently derives the default supported iova range from the first
> >    iommu domain. This needs to be changed to go through the domain_list
> >    instead.
> >  - Sync with Pierre's patch[3].
> >
> > 1.https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg03651.html
> > 2.https://lists.linuxfoundation.org/pipermail/iommu/2016-
> November/019002.html
> > 3.https://patchwork.kernel.org/patch/10084655/
> >
> > Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
> > ---
> >  drivers/vfio/vfio_iommu_type1.c | 172
> +++++++++++++++++++++++++++++++++++++++-
> >  include/uapi/linux/vfio.h       |  13 +++
> >  2 files changed, 184 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c
> b/drivers/vfio/vfio_iommu_type1.c
> > index e30e29a..72ca78a 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -28,6 +28,7 @@
> >  #include <linux/device.h>
> >  #include <linux/fs.h>
> >  #include <linux/iommu.h>
> > +#include <linux/list_sort.h>
> >  #include <linux/module.h>
> >  #include <linux/mm.h>
> >  #include <linux/rbtree.h>
> > @@ -92,6 +93,12 @@ struct vfio_group {
> >     struct list_head        next;
> >  };
> >
> > +struct vfio_iommu_iova {
> > +   struct list_head        list;
> > +   phys_addr_t             start;
> > +   phys_addr_t             end;
> > +};
> > +
> >  /*
> >   * Guest RAM pinning working set or DMA target
> >   */
> > @@ -1537,6 +1544,144 @@ static int
> vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
> >     return ret;
> >  }
> >
> > +static int vfio_add_iova_cap(struct vfio_info_cap *caps, u64 start, u64 
> > end)
> > +{
> > +   struct vfio_iommu_type1_info_cap_iova_range *cap;
> > +   struct vfio_info_cap_header *header;
> > +
> > +   header = vfio_info_cap_add(caps, sizeof(*cap),
> > +                   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
> > +   if (IS_ERR(header))
> > +           return PTR_ERR(header);
> > +
> > +   cap = container_of(header,
> > +                      struct vfio_iommu_type1_info_cap_iova_range,
> > +                      header);
> > +
> > +   cap->start = start;
> > +   cap->end = end;
> > +
> > +   return 0;
> > +}
> > +
> > +static int vfio_insert_iova(phys_addr_t start, phys_addr_t end,
> > +                           struct list_head *head)
> > +{
> > +   struct vfio_iommu_iova *region;
> > +
> > +   region = kzalloc(sizeof(*region), GFP_KERNEL);
> 
> You're initializing every field, so a zero'd allocation is not
> necessary here.

Ok.

> > +   if (!region)
> > +           return -ENOMEM;
> > +
> > +   INIT_LIST_HEAD(&region->list);
> > +   region->start = start;
> > +   region->end = end;
> > +
> > +   list_add_tail(&region->list, head);
> > +   return 0;
> > +}
> > +
> > +/*
> > + * Check and update iova region list in case a reserved region
> > + * overlaps the iommu iova range.
> > + */
> > +static int vfio_update_iommu_iova_range(phys_addr_t start, phys_addr_t
> end,
> > +                                   struct list_head *iova)
> > +{
> > +   struct vfio_iommu_iova *node;
> > +   phys_addr_t a, b;
> > +   int ret = 0;
> > +
> > +   if (list_empty(iova))
> > +           return -ENODEV;
> > +
> > +   node = list_last_entry(iova, struct vfio_iommu_iova, list);
> > +   a = node->start;
> > +   b = node->end;
> > +
> > +   /* No overlap */
> > +   if ((start > b) || (end < a))
> > +           return 0;
> > +
> > +   if (start > a)
> > +           ret = vfio_insert_iova(a, start - 1, &node->list);
> > +   if (ret)
> > +           goto done;
> > +   if (end < b)
> > +           ret = vfio_insert_iova(end + 1, b, &node->list);
> > +
> > +done:
> > +   list_del(&node->list);
> > +   kfree(node);
> > +
> > +   return ret;
> > +}
> > +
> > +static int vfio_resv_cmp(void *priv, struct list_head *a, struct list_head 
> > *b)
> > +{
> > +   struct iommu_resv_region *ra, *rb;
> > +
> > +   ra = container_of(a, struct iommu_resv_region, list);
> > +   rb = container_of(b, struct iommu_resv_region, list);
> > +
> > +   if (ra->start < rb->start)
> > +           return -1;
> > +   if (ra->start > rb->start)
> > +           return 1;
> > +   return 0;
> > +}
> > +
> > +static int vfio_build_iommu_iova_caps(struct vfio_iommu *iommu,
> > +                           struct vfio_info_cap *caps)
> > +{
> > +   struct iommu_resv_region *resv, *resv_next;
> > +   struct vfio_iommu_iova *iova, *iova_next;
> > +   struct list_head group_resv_regions, vfio_iova_regions;
> > +   struct vfio_domain *domain;
> > +   struct vfio_group *g;
> > +   phys_addr_t start, end;
> > +   int ret = 0;
> > +
> > +   domain = list_first_entry(&iommu->domain_list,
> > +                             struct vfio_domain, next);
> 
> How do you resolve that we can have multiple domains in a container and
> each my provide different apertures?  Eric noted that the attach group
> function attempts to do compatibility checks, so we need to figure out
> how we determine IOVA apertures are compatible.  The most obvious
> answer seems to be that we should look through the dma_list on the
> vfio_iommu and determine if there are existing mappings that are
> incompatible with the new domain.  That also suggests that we should
> maintain and update a list of valid iova ranges as we go such that we
> can reject mappings outside of those valid ranges.

Right. I will go through this in detail and see how to accommodate this.

> > +   /* Get the default iova range supported */
> > +   start = domain->domain->geometry.aperture_start;
> > +   end = domain->domain->geometry.aperture_end;
> 
> There's an IOMMU API for this, iommu_domain_get_attr() with
> DOMAIN_ATTR_GEOMETRY.
> 
> > +   INIT_LIST_HEAD(&vfio_iova_regions);
> > +   vfio_insert_iova(start, end, &vfio_iova_regions);
> > +
> > +   /* Get reserved regions if any */
> > +   INIT_LIST_HEAD(&group_resv_regions);
> > +   list_for_each_entry(g, &domain->group_list, next)
> > +           iommu_get_group_resv_regions(g->iommu_group,
> > +                                           &group_resv_regions);
> 
> Reserved ranges also need to be accounted for as groups are added to a
> domain.  Again, if dma list includes any mappings overlapping reserved
> ranges (ie. holes in the iova space), the group attach should fail.

Ok.

> > +   list_sort(NULL, &group_resv_regions, vfio_resv_cmp);
> > +
> > +   /* Update iova range excluding reserved regions */
> > +   list_for_each_entry(resv, &group_resv_regions, list) {
> > +           ret = vfio_update_iommu_iova_range(resv->start,
> > +                           resv->start + resv->length - 1,
> > +                           &vfio_iova_regions);
> > +           if (ret)
> > +                   goto done;
> > +   }
> > +
> > +   list_for_each_entry(iova, &vfio_iova_regions, list) {
> > +           ret = vfio_add_iova_cap(caps, iova->start, iova->end);
> 
> It seems like a fair bit of overhead and nuisance for the user to have
> each iova range added as a separate capability.  I think I'd prefer to
> see the capability size be based on a number of entries field.

Ok. I think the suggestion is to have something similar to the sparse mmap
capability implementation. 
 
> > +           if (ret)
> > +                   goto done;
> > +   }
> > +
> > +done:
> > +   list_for_each_entry_safe(resv, resv_next, &group_resv_regions, list)
> > +           kfree(resv);
> > +
> > +   list_for_each_entry_safe(iova, iova_next, &vfio_iova_regions, list)
> > +           kfree(iova);
> > +
> > +   return ret;
> > +}
> > +
> >  static long vfio_iommu_type1_ioctl(void *iommu_data,
> >                                unsigned int cmd, unsigned long arg)
> >  {
> > @@ -1558,8 +1703,10 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> >             }
> >     } else if (cmd == VFIO_IOMMU_GET_INFO) {
> >             struct vfio_iommu_type1_info info;
> > +           struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
> > +           int ret;
> >
> > -           minsz = offsetofend(struct vfio_iommu_type1_info,
> iova_pgsizes);
> > +           minsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
> 
> This is incompatible, it will break existing userspace.  See
> include/uapi/linux/vfio.h:

Right. 

> /*
>  * Callers of INFO ioctls passing insufficiently sized buffers will see
>  * the capability chain flag bit set, a zero value for the first capability
>  * offset (if available within the provided argsz), and argsz will be
>  * updated to report the necessary buffer size.  For compatibility, the
>  * INFO ioctl will not report error in this case, but the capability chain
>  * will not be available.
>  */
> 
> >
> >             if (copy_from_user(&info, (void __user *)arg, minsz))
> >                     return -EFAULT;
> > @@ -1571,6 +1718,29 @@ static long vfio_iommu_type1_ioctl(void
> *iommu_data,
> >
> >             info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
> >
> > +           ret = vfio_build_iommu_iova_caps(iommu, &caps);
> > +           if (ret)
> > +                   return ret;
> > +
> > +           if (caps.size) {
> > +                   info.flags |= VFIO_IOMMU_INFO_CAPS;
> > +                   if (info.argsz < sizeof(info) + caps.size) {
> > +                           info.argsz = sizeof(info) + caps.size;
> > +                           info.cap_offset = 0;
> > +                   } else {
> > +                           vfio_info_cap_shift(&caps, sizeof(info));
> > +                           if (copy_to_user((void __user *)arg +
> > +                                           sizeof(info), caps.buf,
> > +                                           caps.size)) {
> > +                                   kfree(caps.buf);
> > +                                   return -EFAULT;
> > +                           }
> > +                           info.cap_offset = sizeof(info);
> > +                   }
> > +
> > +                   kfree(caps.buf);
> > +           }
> > +
> >             return copy_to_user((void __user *)arg, &info, minsz) ?
> >                     -EFAULT : 0;
> >
> > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> > index e3301db..c4e338b 100644
> > --- a/include/uapi/linux/vfio.h
> > +++ b/include/uapi/linux/vfio.h
> > @@ -517,7 +517,20 @@ struct vfio_iommu_type1_info {
> >     __u32   argsz;
> >     __u32   flags;
> >  #define VFIO_IOMMU_INFO_PGSIZES (1 << 0)   /* supported page sizes info */
> > +#define VFIO_IOMMU_INFO_CAPS       (1 << 1)        /* Info supports caps */
> >     __u64   iova_pgsizes;           /* Bitmap of supported page sizes */
> > +   __u32   cap_offset;     /* Offset within info struct of first cap */
> > +   __u32   __resv;
> 
> Hmm, I'm not sure why the additional reserved field here is necessary.
> I guess we need 8-byte alignment of this iova range capability, but
> that should probably be accounted for explicitly as the capability is
> constructed in the buffer rather than implicitly by the ending offset
> of the static structure.

Ok.

> > +};
> > +
> > +/*
> > + * The IOVA_RANGE capability allows to report the IOVA range(s),
> > + */
> > +#define VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE  1
> > +struct vfio_iommu_type1_info_cap_iova_range {
> > +   struct vfio_info_cap_header header;
> > +   __u64 start;
> > +   __u64 end;
> >  };
> >
> 
> It should be noted that this is version 1 of this structure.

Ok. Will update that.

Many thanks for going through this and the feedback.

Cheers,
Shameer

Reply via email to