Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-04-26 Thread Tomasz Nowicki

On 26.04.2017 12:08, Jean-Philippe Brucker wrote:

On 26/04/17 07:53, Tomasz Nowicki wrote:

+mutex_lock(>tasks_lock);
+list_for_each_entry(vfio_task, >tasks, list) {
+if (vfio_task->pasid != svm.pasid)
+continue;
+
+ret = iommu_unbind_task(device->dev, svm.pasid, flags);
+if (ret)
+dev_warn(device->dev, "failed to unbind PASID %u\n",
+ vfio_task->pasid);
+
+list_del(_task->list);
+kfree(vfio_task);


Please use list_for_each_entry_safe.


There is:

+break;

right after kfree, so we'd never follow vfio_task->list.next after freeing
vfio_task. The code searches for the _only_ task matching the PASID,
removes it and leaves the loop.



Ah right. Sorry for the noise.

Tomasz
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-04-26 Thread Jean-Philippe Brucker
On 26/04/17 07:53, Tomasz Nowicki wrote:
>> +mutex_lock(>tasks_lock);
>> +list_for_each_entry(vfio_task, >tasks, list) {
>> +if (vfio_task->pasid != svm.pasid)
>> +continue;
>> +
>> +ret = iommu_unbind_task(device->dev, svm.pasid, flags);
>> +if (ret)
>> +dev_warn(device->dev, "failed to unbind PASID %u\n",
>> + vfio_task->pasid);
>> +
>> +list_del(_task->list);
>> +kfree(vfio_task);
> 
> Please use list_for_each_entry_safe.

There is:

+break;

right after kfree, so we'd never follow vfio_task->list.next after freeing
vfio_task. The code searches for the _only_ task matching the PASID,
removes it and leaves the loop.

Thanks,
Jean-Philippe

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-04-26 Thread Tomasz Nowicki

Hi Jean,

On 27.02.2017 20:54, Jean-Philippe Brucker wrote:

Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
between a device and a process address space, identified by a
device-specific ID named PASID. This allows the device to target DMA
transactions at the process virtual addresses without a need for mapping
and unmapping buffers explicitly in the IOMMU. The process page tables are
shared with the IOMMU, and mechanisms such as PCI ATS/PRI may be used to
handle faults. VFIO_DEVICE_UNBIND_TASK removed a bond identified by a
PASID.

Also add a capability flag in device info to detect whether the system and
the device support SVM.

Users need to specify the state of a PASID when unbinding, with flags
VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
PASID invalidation is specific to each device and only partially covered
by the specification:

* Device must have an implementation-defined mechanism for stopping the
  use of a PASID. When this mechanism finishes, the device has stopped
  issuing transactions for this PASID and all transactions for this PASID
  have been flushed to the IOMMU.

* Device may either wait for all outstanding PRI requests for this PASID
  to finish, or issue a Stop Marker message, a barrier that separates PRI
  requests affecting this instance of the PASID from PRI requests
  affecting the next instance. In the first case, we say that the PASID is
  "clean", in the second case it is "flushed" (and the IOMMU has to wait
  for the Stop Marker before reassigning the PASID.)

We expect similar distinctions for platform devices. Ideally there should
be a callback for each PCI device, allowing the IOMMU to ask the device to
stop using a PASID. When the callback returns, the PASID is either flushed
or clean and the return value tells which.

For the moment I don't know how to implement this callback for PCI, so if
the user forgets to call unbind with either "clean" or "flushed", the
PASID is never reused. For platform devices, it might be simpler to
implement since we could associate an invalidate_pasid callback to a DT
compatible string, as is currently done for reset.

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/vfio/pci/vfio_pci.c |  24 ++
 drivers/vfio/vfio.c | 104 
 include/uapi/linux/vfio.h   |  55 +++
 3 files changed, 183 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 324c52e3a1a4..3d7733f94891 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -623,6 +624,26 @@ int vfio_pci_register_dev_region(struct vfio_pci_device 
*vdev,
return 0;
 }



[...]



kfree(device);
@@ -1622,6 +1651,75 @@ static int vfio_device_fops_release(struct inode *inode, 
struct file *filep)
return 0;
 }

+static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
+  unsigned long arg)
+{
+   int ret;
+   unsigned long minsz;
+
+   struct vfio_device_svm svm;
+   struct vfio_task *vfio_task;
+
+   minsz = offsetofend(struct vfio_device_svm, pasid);
+
+   if (copy_from_user(, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (svm.argsz < minsz)
+   return -EINVAL;
+
+   if (cmd == VFIO_DEVICE_BIND_TASK) {
+   struct task_struct *task = current;
+
+   ret = iommu_bind_task(device->dev, task, , 0, NULL);
+   if (ret)
+   return ret;
+
+   vfio_task = kzalloc(sizeof(*vfio_task), GFP_KERNEL);
+   if (!vfio_task) {
+   iommu_unbind_task(device->dev, svm.pasid,
+ IOMMU_PASID_CLEAN);
+   return -ENOMEM;
+   }
+
+   vfio_task->pasid = svm.pasid;
+
+   mutex_lock(>tasks_lock);
+   list_add(_task->list, >tasks);
+   mutex_unlock(>tasks_lock);
+
+   } else {
+   int flags = 0;
+
+   if (svm.flags & ~(VFIO_SVM_PASID_RELEASE_FLUSHED |
+ VFIO_SVM_PASID_RELEASE_CLEAN))
+   return -EINVAL;
+
+   if (svm.flags & VFIO_SVM_PASID_RELEASE_FLUSHED)
+   flags = IOMMU_PASID_FLUSHED;
+   else if (svm.flags & VFIO_SVM_PASID_RELEASE_CLEAN)
+   flags = IOMMU_PASID_CLEAN;
+
+   mutex_lock(>tasks_lock);
+   list_for_each_entry(vfio_task, >tasks, list) {
+   if (vfio_task->pasid != svm.pasid)
+   continue;
+
+   ret = iommu_unbind_task(device->dev, svm.pasid, flags);
+   if (ret)
+   

RE: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-29 Thread Liu, Yi L
> -Original Message-
> From: Jean-Philippe Brucker [mailto:jean-philippe.bruc...@arm.com]
> Sent: Monday, March 27, 2017 6:14 PM
> To: Liu, Yi L <yi.l@intel.com>; Alex Williamson 
> <alex.william...@redhat.com>
> Cc: Shanker Donthineni <shank...@qti.qualcomm.com>; k...@vger.kernel.org;
> Catalin Marinas <catalin.mari...@arm.com>; Sinan Kaya
> <ok...@qti.qualcomm.com>; Will Deacon <will.dea...@arm.com>;
> iommu@lists.linux-foundation.org; Harv Abdulhamid <ha...@qti.qualcomm.com>;
> linux-...@vger.kernel.org; Bjorn Helgaas <bhelg...@google.com>; David
> Woodhouse <dw...@infradead.org>; linux-arm-ker...@lists.infradead.org; Nate
> Watterson <nwatt...@qti.qualcomm.com>; Tian, Kevin <kevin.t...@intel.com>;
> Lan, Tianyu <tianyu@intel.com>; Raj, Ashok <ashok@intel.com>; Pan, 
> Jacob
> jun <jacob.jun....@intel.com>; Joerg Roedel <j...@8bytes.org>; Robin Murphy
> <robin.mur...@arm.com>
> Subject: Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
> 
> On 24/03/17 07:46, Liu, Yi L wrote:
> [...]
> >>>>
> >>>> So we need some kind of high-level classification that the vIOMMU
> >>>> must communicate to the physical one. Each IOMMU flavor would get a
> >>>> unique, global identifier, simply to make sure that vIOMMU and
> >>>> pIOMMU speak
> >> the same language.
> >>>> For example:
> >>>>
> >>>> 0x65776886 "AMDV" AMD IOMMU
> >>>> 0x73788476 "INTL" Intel IOMMU
> >>>> 0x83515748 "S390" s390 IOMMU
> >>>> 0x8385 "SMMU" ARM SMMU
> >>>> etc.
> >>>>
> >>>> It needs to be a global magic number that everyone can recognize.
> >>>> Could be as simple as 32-bit numbers allocated from 0. Once we have
> >>>> a global magic number, we can use it to differentiate 
> >>>> architecture-specific
> details.
> >
> > I prefer simple numbers to stand for each vendor.
> 
> Sure, I don't have any preference. Simple numbers could be easier to allocate.
> 
> >>> I may need to think more on this part.
> >>>
> >>>> struct pasid_table_info {
> >>>>  __u64 ptr;
> >>>>  __u64 size; /* Is it number of entry or size in
> >>>> bytes? */
> >>>
> >>> For Intel platform, it's encoded. But I can make it in bytes. Here,
> >>> I'd like to check with you if whole guest PASID info is also needed on 
> >>> ARM?
> >>
> >> It will be needed on ARM if someone ever emulates the SMMU with SVM.
> >> Though I'm not planning on doing that myself, it is unavoidable. And
> >> it would be a shame for the next SVM virtualization solution to have
> >> to introduce a new flag "VFIO_SVM_BIND_PASIDPT_2" if they could reuse
> >> most of the BIND_PASIDPT interface but simply needed to add one or
> >> two configuration fields specific to their IOMMU.
> >
> > So you are totally fine with putting PASID table ptr and size in the
> > generic part? Maybe we have different usage for it. For me, it's a
> > guest PASID table ptr. For you, it may be different.
> 
> It's the same for SMMU, with some added format specifiers that would go in
> 'opaque[]'. I think that table pointer and size (in bytes, or number of
> entries) is generic enough for a "bind table" call and can be reused by future
> implementations.
> 
> >>>>
> >>>>  __u32 model;/* magic number */
> >>>>  __u32 variant;  /* version of the IOMMU architecture,
> >>>> maybe? IOMMU-specific. */
> >
> > For variant, it will be combined with model to do sanity check. Am I right?
> > Maybe it could be moved to opaque.
> 
> Yes I guess it could be moved to opaque. It would be a version of the model 
> used, so
> we wouldn't have to allocate a new model number whenever an architecture
> updates the fields of its PASID descriptors, but we can let IOMMU drivers 
> decide if
> they need it and what to put in there.
> 
> >>>>  __u8 opaque[];  /* IOMMU-specific details */
> >>>> };
> >>>>
> [...]
> >>
> >> Yes, that seems sensible. I could add an explicit VFIO_BIND_PASID
> >> flags to make it explicit that data[] is "u32 pasid" and avoid having any 
> >> default.
> >
>

Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-27 Thread Jean-Philippe Brucker
On 24/03/17 07:46, Liu, Yi L wrote:
[...]

 So we need some kind of high-level classification that the vIOMMU
 must communicate to the physical one. Each IOMMU flavor would get a
 unique, global identifier, simply to make sure that vIOMMU and pIOMMU speak
>> the same language.
 For example:

 0x65776886 "AMDV" AMD IOMMU
 0x73788476 "INTL" Intel IOMMU
 0x83515748 "S390" s390 IOMMU
 0x8385 "SMMU" ARM SMMU
 etc.

 It needs to be a global magic number that everyone can recognize.
 Could be as simple as 32-bit numbers allocated from 0. Once we have a
 global magic number, we can use it to differentiate architecture-specific 
 details.
> 
> I prefer simple numbers to stand for each vendor.

Sure, I don't have any preference. Simple numbers could be easier to allocate.

>>> I may need to think more on this part.
>>>
 struct pasid_table_info {
__u64 ptr;
__u64 size; /* Is it number of entry or size in
   bytes? */
>>>
>>> For Intel platform, it's encoded. But I can make it in bytes. Here,
>>> I'd like to check with you if whole guest PASID info is also needed on ARM?
>>
>> It will be needed on ARM if someone ever emulates the SMMU with SVM.
>> Though I'm not planning on doing that myself, it is unavoidable. And it 
>> would be a
>> shame for the next SVM virtualization solution to have to introduce a new 
>> flag
>> "VFIO_SVM_BIND_PASIDPT_2" if they could reuse most of the BIND_PASIDPT
>> interface but simply needed to add one or two configuration fields specific 
>> to their
>> IOMMU.
> 
> So you are totally fine with putting PASID table ptr and size in the generic 
> part? Maybe
> we have different usage for it. For me, it's a guest PASID table ptr. For 
> you, it may be
> different.

It's the same for SMMU, with some added format specifiers that would go in
'opaque[]'. I think that table pointer and size (in bytes, or number of
entries) is generic enough for a "bind table" call and can be reused by
future implementations.


__u32 model;/* magic number */
__u32 variant;  /* version of the IOMMU architecture,
   maybe? IOMMU-specific. */
> 
> For variant, it will be combined with model to do sanity check. Am I right?
> Maybe it could be moved to opaque.

Yes I guess it could be moved to opaque. It would be a version of the
model used, so we wouldn't have to allocate a new model number whenever an
architecture updates the fields of its PASID descriptors, but we can let
IOMMU drivers decide if they need it and what to put in there.

__u8 opaque[];  /* IOMMU-specific details */
 };

[...]
>>
>> Yes, that seems sensible. I could add an explicit VFIO_BIND_PASID flags to 
>> make it
>> explicit that data[] is "u32 pasid" and avoid having any default.
> 
> Add it in the comment I suppose. The length is 4 byes, it could be deduced 
> from argsz.
> 
>>

> #define VFIO_SVM_PASSDOWN_INVALIDATE(1 << 1)

 Using the vfio_device_svm structure for invalidate operations is a
 bit odd, it might be nicer to add a new VFIO_SVM_INVALIDATE ioctl,
 that takes the above iommu_svm_tlb_invalidate_info as argument (with
 an added argsz.)
>>>
>>> Agree, would add a separate IOCTL for invalidation.
>>>

> #define VFIO_SVM_PASID_RELEASE_FLUSHED(1 << 2)
> #define VFIO_SVM_PASID_RELEASE_CLEAN(1 << 3)
>__u32   flags;
>__u32   length;

 If length is the size of data[], I guess we can already deduce this info 
 from argsz.
>>>
>>> yes, it is size of data. Maybe remove argsz. How about your opinion?
>>
>> Argsz as first argument is standard across all VFIO ioctls, it allows the 
>> kernel to check
>> that the structure passed by the guest is consistent with the structure it 
>> knows (see
>> the comment at the beginning of
>> include/uapi/linux/vfio.h) So argsz would be the preferred way to 
>> communicate the
>> size of data[]
> 
> yes, it makes sense. BTW. I think it is still possible to leave the "length" 
> since there is
> similar usage. e.g. struct vfio_irq_set, count tells the size of data[] in 
> that structure.
> Anyhow, it's no big deal here.
> 
>__u8data[];
> };

 In general, I think that your proposal would work fine along mine.
 Note that for my next version of this patch, I would like to move the
 BIND/UNBIND SVM operations onto a VFIO container fd, instead of a VFIO 
 device
>> fd, if possible.
>>>
>>> Attach the BIND/UNBIND operation onto VFIO container fd is practical.
>>>
>>> BTW. Before you send out your next version, we'd better have a
>>> consensus on the vfio_device_svm definition. So that we can continue to 
>>> drive our
>> own work separately.
>>>
 ---
 As an aside, it also aligns with the one I'm working on at the
 moment, for virtual SVM at a finer 

RE: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-24 Thread Liu, Yi L
> -Original Message-
> From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf
> Of Jean-Philippe Brucker
> Sent: Thursday, March 23, 2017 9:38 PM
> To: Liu, Yi L <yi.l@intel.com>; Alex Williamson 
> <alex.william...@redhat.com>
> Cc: Shanker Donthineni <shank...@qti.qualcomm.com>; k...@vger.kernel.org;
> Catalin Marinas <catalin.mari...@arm.com>; Sinan Kaya
> <ok...@qti.qualcomm.com>; Will Deacon <will.dea...@arm.com>;
> iommu@lists.linux-foundation.org; Harv Abdulhamid <ha...@qti.qualcomm.com>;
> linux-...@vger.kernel.org; Bjorn Helgaas <bhelg...@google.com>; David
> Woodhouse <dw...@infradead.org>; linux-arm-ker...@lists.infradead.org; Nate
> Watterson <nwatt...@qti.qualcomm.com>; Tian, Kevin <kevin.t...@intel.com>;
> Lan, Tianyu <tianyu@intel.com>; Raj, Ashok <ashok@intel.com>; Pan, 
> Jacob
> jun <jacob.jun....@intel.com>; Joerg Roedel <j...@8bytes.org>; Robin Murphy
> <robin.mur...@arm.com>
> Subject: Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
> 
> On 23/03/17 08:39, Liu, Yi L wrote:
> > Hi Jean,
> >
> > Thx for the excellent ideas. Pls refer to comments inline.
> >
> > [...]
> >
> >>> Hi Jean,
> >>>
> >>> I'm working on virtual SVM, and have some comments on the VFIO
> >>> channel definition.
> >>
> >> Thanks a lot for the comments, this is quite interesting to me. I
> >> just have some concerns about portability so I'm proposing a way to be 
> >> slightly
> more generic below.
> >>
> >
> > yes, portability is what need to consider.
> >
> > [...]
> >
> >>>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >>>> index
> >>>> 519eff362c1c..3fe4197a5ea0 100644
> >>>> --- a/include/uapi/linux/vfio.h
> >>>> +++ b/include/uapi/linux/vfio.h
> >>>> @@ -198,6 +198,7 @@ struct vfio_device_info {
> >>>>  #define VFIO_DEVICE_FLAGS_PCI   (1 << 1)/* vfio-pci device */
> >>>>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device 
> >>>> */
> >>>>  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)/* vfio-amba device */
> >>>> +#define VFIO_DEVICE_FLAGS_SVM   (1 << 4)/* Device supports
> bind/unbind */
> >>>>  __u32   num_regions;/* Max region index + 1 */
> >>>>  __u32   num_irqs;   /* Max IRQ index + 1 */
> >>>>  };
> >>>> @@ -409,6 +410,60 @@ struct vfio_irq_set {
> >>>>   */
> >>>>  #define VFIO_DEVICE_RESET   _IO(VFIO_TYPE, VFIO_BASE + 11)
> >>>>
> >>>> +struct vfio_device_svm {
> >>>> +__u32   argsz;
> >>>> +__u32   flags;
> >>>> +#define VFIO_SVM_PASID_RELEASE_FLUSHED  (1 << 0)
> >>>> +#define VFIO_SVM_PASID_RELEASE_CLEAN(1 << 1)
> >>>> +__u32   pasid;
> >>>> +};
> >>>
> >>> For virtual SVM work, the VFIO channel would be used to passdown
> >>> guest PASID tale PTR and invalidation information. And may have
> >>> further usage except the above.
> >>>
> >>> Here is the virtual SVM design doc which illustrates the VFIO usage.
> >>> https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
> >>>
> >>> For the guest PASID table ptr passdown, I've following message in pseudo 
> >>> code.
> >>> struct pasid_table_info {
> >>> __u64 ptr;
> >>> __u32 size;
> >>>  };
> >>
> >> There should probably be a way to specify the table format, so that
> >> the pIOMMU driver can check that it recognizes the format used by the
> >> vIOMMU before attaching it. This would allow to reuse the structure
> >> for other IOMMU architectures. If, for instance, the host has an
> >> intel IOMMU and someone decides to emulate an ARM SMMU with Qemu
> >> (their loss :), it can certainly use VFIO for passing-through devices
> >> with MAP/UNMAP. But if Qemu then attempts to passdown a PASID table
> >> in SMMU format, the Intel driver should have a way to reject it, as the 
> >> SMMU
> format isn't compatible.
> >
> > Exactly, it would be grt if we can have the API defined as generic as
> > MAP/UNMAP. The case you mentioned to emulate

RE: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-23 Thread Liu, Yi L
Hi Jean,

Thx for the excellent ideas. Pls refer to comments inline.

[...]

> > Hi Jean,
> >
> > I'm working on virtual SVM, and have some comments on the VFIO channel
> > definition.
> 
> Thanks a lot for the comments, this is quite interesting to me. I just have 
> some
> concerns about portability so I'm proposing a way to be slightly more generic 
> below.
> 

yes, portability is what need to consider.

[...]

> >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> >> index
> >> 519eff362c1c..3fe4197a5ea0 100644
> >> --- a/include/uapi/linux/vfio.h
> >> +++ b/include/uapi/linux/vfio.h
> >> @@ -198,6 +198,7 @@ struct vfio_device_info {
> >>  #define VFIO_DEVICE_FLAGS_PCI (1 << 1)/* vfio-pci device */
> >>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)   /* vfio-platform device 
> >> */
> >>  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)  /* vfio-amba device */
> >> +#define VFIO_DEVICE_FLAGS_SVM (1 << 4)/* Device supports 
> >> bind/unbind */
> >>__u32   num_regions;/* Max region index + 1 */
> >>__u32   num_irqs;   /* Max IRQ index + 1 */
> >>  };
> >> @@ -409,6 +410,60 @@ struct vfio_irq_set {
> >>   */
> >>  #define VFIO_DEVICE_RESET _IO(VFIO_TYPE, VFIO_BASE + 11)
> >>
> >> +struct vfio_device_svm {
> >> +  __u32   argsz;
> >> +  __u32   flags;
> >> +#define VFIO_SVM_PASID_RELEASE_FLUSHED(1 << 0)
> >> +#define VFIO_SVM_PASID_RELEASE_CLEAN  (1 << 1)
> >> +  __u32   pasid;
> >> +};
> >
> > For virtual SVM work, the VFIO channel would be used to passdown guest
> > PASID tale PTR and invalidation information. And may have further
> > usage except the above.
> >
> > Here is the virtual SVM design doc which illustrates the VFIO usage.
> > https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
> >
> > For the guest PASID table ptr passdown, I've following message in pseudo 
> > code.
> > struct pasid_table_info {
> > __u64 ptr;
> > __u32 size;
> >  };
> 
> There should probably be a way to specify the table format, so that the pIOMMU
> driver can check that it recognizes the format used by the vIOMMU before 
> attaching
> it. This would allow to reuse the structure for other IOMMU architectures. 
> If, for
> instance, the host has an intel IOMMU and someone decides to emulate an ARM
> SMMU with Qemu (their loss :), it can certainly use VFIO for passing-through 
> devices
> with MAP/UNMAP. But if Qemu then attempts to passdown a PASID table in SMMU
> format, the Intel driver should have a way to reject it, as the SMMU format 
> isn't
> compatible.

Exactly, it would be grt if we can have the API defined as generic as 
MAP/UNMAP. The
case you mentioned to emulate an ARM SMMU on an Intel platform is 
representative.
For such cases, the problem is different vendors may have different PASID table 
format
and also different page table format. In my understanding, these incompatible 
things
may just result in failure if users try such emulation. What's your opinion 
here?
Anyhow, better to listen to different voices.

> 
> I'm tackling a similar problem at the moment, but for passing a single page 
> directory
> instead of full PASID table to the IOMMU.

For, Intel IOMMU, passing the whole guest PASID table is enough and it also 
avoids 
too much pgd passing. However, I'm open on this idea. You may just add a new 
flag
in "struct vfio_device_svm" and pass the single pgd down to host.

> 
> So we need some kind of high-level classification that the vIOMMU must
> communicate to the physical one. Each IOMMU flavor would get a unique, global
> identifier, simply to make sure that vIOMMU and pIOMMU speak the same 
> language.
> For example:
> 
> 0x65776886 "AMDV" AMD IOMMU
> 0x73788476 "INTL" Intel IOMMU
> 0x83515748 "S390" s390 IOMMU
> 0x8385 "SMMU" ARM SMMU
> etc.
> 
> It needs to be a global magic number that everyone can recognize. Could be as
> simple as 32-bit numbers allocated from 0. Once we have a global magic 
> number, we
> can use it to differentiate architecture-specific details.

I may need to think more on this part.
 
> struct pasid_table_info {
>   __u64 ptr;
>   __u64 size; /* Is it number of entry or size in
>  bytes? */

For Intel platform, it's encoded. But I can make it in bytes. Here, I'd like
to check with you if whole guest PASID info is also needed on ARM?

> 
>   __u32 model;/* magic number */
>   __u32 variant;  /* version of the IOMMU architecture,
>  maybe? IOMMU-specific. */
>   __u8 opaque[];  /* IOMMU-specific details */
> };
> 
> And then each IOMMU or page-table code can do low-level validation of the 
> format,
> by reading the details in 'opaque'. I assume that for Intel this would be 
> empty. But

yes, for Intel, if the PASID ptr is in the definition, opaque would be empty.

> for instance on ARM SMMUv3, PASID table can have either one or two levels, and

Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-21 Thread jacob pan
On Tue, 21 Mar 2017 19:37:56 +
Jean-Philippe Brucker  wrote:

> > For invalidation, I've following info in in pseudo code.
> > struct iommu_svm_tlb_invalidate_info
> > {
> >__u32 inv_type;
> > #define IOTLB_INV   (1 << 0)
> > #define EXTENDED_IOTLB_INV  (1 << 1)
> > #define DEVICE_IOTLB_INV(1 << 2)
> > #define EXTENDED_DEVICE_IOTLB_INV   (1 << 3)
> > #define PASID_CACHE_INV (1 << 4)
> >__u32 pasid;
> >__u64 addr;
> >__u64 size;
> >__u8 granularity;
> > #define DEFAULT_INV_GRN0
> > #define PAGE_SELECTIVE_INV (1 << 0)
> > #define PASID_SELECVIVE_INV(1 << 1)
> >__u64 flags;
> > #define INVALIDATE_HINT_BIT(1 << 0)
> > #define GLOBAL_HINT_BIT(1 << 1)
> > #define DRAIN_READ_BIT (1 << 2)
> > #define DRAIN_WRITE_BIT(1 << 3)
> > #define DEVICE_TLB_GLOBAL_BIT  (1 << 4)
> >__u8 mip;
> >__u16 pfsid;
> > };  
> 
> This would also benefit from being split into generic and
> architectural parts. Former would be defined in VFIO, latter would be
> in the IOMMU driver.
> 
> struct tlb_invalidate_info
> {
>   __u8 granularity
> #define DEFAULT_INV_GRN   0   /* What is default? */
> #define PAGE_SELECTIVE_INV(1 << 0)
> #define PASID_SELECTIVE_INV   (1 << 1)
>   __u32 pasid;
>   __u64 addr;
>   __u64 size;
> 
>   /* Since IOMMU format has already been validated for this
> table, the IOMMU driver knows that the following structure is in a
>  format it knows */
>   __u8 opaque[];
> };
> 
> struct tlb_invalidate_info_intel
> {
>   __u32 inv_type;
>   ...
>   __u64 flags;
>   ...
>   __u8 mip;
>   __u16 pfsid;
> };

I presume for Intel/arch specific part, the opaque data can simply be
the descriptor itself. i.e. struct qi_desc. There is no need to
generalize since it has already been validated to be architecture
specific data.

Arch specific IOMMU driver can take further action on the opaque data,
e.g. replace guest DID with host DID based on other info passed down
from the caller.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-21 Thread Jean-Philippe Brucker
On 21/03/17 07:04, Liu, Yi L wrote:
> Hi Jean,
> 
> I'm working on virtual SVM, and have some comments on the VFIO channel
> definition.

Thanks a lot for the comments, this is quite interesting to me. I just
have some concerns about portability so I'm proposing a way to be slightly
more generic below.

>> -Original Message-
>> From: iommu-boun...@lists.linux-foundation.org [mailto:iommu-
>> boun...@lists.linux-foundation.org] On Behalf Of Jean-Philippe Brucker
>> Sent: Tuesday, February 28, 2017 3:55 AM
>> Cc: Shanker Donthineni <shank...@qti.qualcomm.com>; k...@vger.kernel.org;
>> Catalin Marinas <catalin.mari...@arm.com>; Sinan Kaya
>> <ok...@qti.qualcomm.com>; Will Deacon <will.dea...@arm.com>;
>> iommu@lists.linux-foundation.org; Harv Abdulhamid <ha...@qti.qualcomm.com>;
>> linux-...@vger.kernel.org; Bjorn Helgaas <bhelg...@google.com>; David
>> Woodhouse <dw...@infradead.org>; linux-arm-ker...@lists.infradead.org; Nate
>> Watterson <nwatt...@qti.qualcomm.com>
>> Subject: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
>>
>> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
>> between a device and a process address space, identified by a 
>> device-specific ID
>> named PASID. This allows the device to target DMA transactions at the process
>> virtual addresses without a need for mapping and unmapping buffers 
>> explicitly in the
>> IOMMU. The process page tables are shared with the IOMMU, and mechanisms such
>> as PCI ATS/PRI may be used to handle faults. VFIO_DEVICE_UNBIND_TASK removed
>> a bond identified by a PASID.
>>
>> Also add a capability flag in device info to detect whether the system and 
>> the device
>> support SVM.
>>
>> Users need to specify the state of a PASID when unbinding, with flags
>> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
>> PASID invalidation is specific to each device and only partially covered by 
>> the
>> specification:
>>
>> * Device must have an implementation-defined mechanism for stopping the
>>   use of a PASID. When this mechanism finishes, the device has stopped
>>   issuing transactions for this PASID and all transactions for this PASID
>>   have been flushed to the IOMMU.
>>
>> * Device may either wait for all outstanding PRI requests for this PASID
>>   to finish, or issue a Stop Marker message, a barrier that separates PRI
>>   requests affecting this instance of the PASID from PRI requests
>>   affecting the next instance. In the first case, we say that the PASID is
>>   "clean", in the second case it is "flushed" (and the IOMMU has to wait
>>   for the Stop Marker before reassigning the PASID.)
>>
>> We expect similar distinctions for platform devices. Ideally there should be 
>> a callback
>> for each PCI device, allowing the IOMMU to ask the device to stop using a 
>> PASID.
>> When the callback returns, the PASID is either flushed or clean and the 
>> return value
>> tells which.
>>
>> For the moment I don't know how to implement this callback for PCI, so if 
>> the user
>> forgets to call unbind with either "clean" or "flushed", the PASID is never 
>> reused. For
>> platform devices, it might be simpler to implement since we could associate 
>> an
>> invalidate_pasid callback to a DT compatible string, as is currently done 
>> for reset.
>>
>> Signed-off-by: Jean-Philippe Brucker <jean-philippe.bruc...@arm.com>
> 
> [...]
> 
>>  drivers/vfio/pci/vfio_pci.c |  24 ++
>>  drivers/vfio/vfio.c | 104 
>> 
>>  include/uapi/linux/vfio.h   |  55 +++
>>  3 files changed, 183 insertions(+)
>>
> ...
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index
>> 519eff362c1c..3fe4197a5ea0 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -198,6 +198,7 @@ struct vfio_device_info {
>>  #define VFIO_DEVICE_FLAGS_PCI   (1 << 1)/* vfio-pci device */
>>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
>>  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)/* vfio-amba device */
>> +#define VFIO_DEVICE_FLAGS_SVM   (1 << 4)/* Device supports 
>> bind/unbind */
>>  __u32   num_regions;/* Max region index + 1 */
>>  __u32   num_irqs;   /* Max IRQ index + 1 */
>>  };
>> @@ -409,6 +410,60

RE: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-21 Thread Liu, Yi L
Hi Jean,

I'm working on virtual SVM, and have some comments on the VFIO channel
definition.

> -Original Message-
> From: iommu-boun...@lists.linux-foundation.org [mailto:iommu-
> boun...@lists.linux-foundation.org] On Behalf Of Jean-Philippe Brucker
> Sent: Tuesday, February 28, 2017 3:55 AM
> Cc: Shanker Donthineni <shank...@qti.qualcomm.com>; k...@vger.kernel.org;
> Catalin Marinas <catalin.mari...@arm.com>; Sinan Kaya
> <ok...@qti.qualcomm.com>; Will Deacon <will.dea...@arm.com>;
> iommu@lists.linux-foundation.org; Harv Abdulhamid <ha...@qti.qualcomm.com>;
> linux-...@vger.kernel.org; Bjorn Helgaas <bhelg...@google.com>; David
> Woodhouse <dw...@infradead.org>; linux-arm-ker...@lists.infradead.org; Nate
> Watterson <nwatt...@qti.qualcomm.com>
> Subject: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
> 
> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
> between a device and a process address space, identified by a device-specific 
> ID
> named PASID. This allows the device to target DMA transactions at the process
> virtual addresses without a need for mapping and unmapping buffers explicitly 
> in the
> IOMMU. The process page tables are shared with the IOMMU, and mechanisms such
> as PCI ATS/PRI may be used to handle faults. VFIO_DEVICE_UNBIND_TASK removed
> a bond identified by a PASID.
> 
> Also add a capability flag in device info to detect whether the system and 
> the device
> support SVM.
> 
> Users need to specify the state of a PASID when unbinding, with flags
> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
> PASID invalidation is specific to each device and only partially covered by 
> the
> specification:
> 
> * Device must have an implementation-defined mechanism for stopping the
>   use of a PASID. When this mechanism finishes, the device has stopped
>   issuing transactions for this PASID and all transactions for this PASID
>   have been flushed to the IOMMU.
> 
> * Device may either wait for all outstanding PRI requests for this PASID
>   to finish, or issue a Stop Marker message, a barrier that separates PRI
>   requests affecting this instance of the PASID from PRI requests
>   affecting the next instance. In the first case, we say that the PASID is
>   "clean", in the second case it is "flushed" (and the IOMMU has to wait
>   for the Stop Marker before reassigning the PASID.)
> 
> We expect similar distinctions for platform devices. Ideally there should be 
> a callback
> for each PCI device, allowing the IOMMU to ask the device to stop using a 
> PASID.
> When the callback returns, the PASID is either flushed or clean and the 
> return value
> tells which.
> 
> For the moment I don't know how to implement this callback for PCI, so if the 
> user
> forgets to call unbind with either "clean" or "flushed", the PASID is never 
> reused. For
> platform devices, it might be simpler to implement since we could associate an
> invalidate_pasid callback to a DT compatible string, as is currently done for 
> reset.
> 
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.bruc...@arm.com>

[...]

>  drivers/vfio/pci/vfio_pci.c |  24 ++
>  drivers/vfio/vfio.c | 104 
> 
>  include/uapi/linux/vfio.h   |  55 +++
>  3 files changed, 183 insertions(+)
> 
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index
> 519eff362c1c..3fe4197a5ea0 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -198,6 +198,7 @@ struct vfio_device_info {
>  #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */
>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)  /* vfio-platform device */
>  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3) /* vfio-amba device */
> +#define VFIO_DEVICE_FLAGS_SVM(1 << 4)/* Device supports 
> bind/unbind */
>   __u32   num_regions;/* Max region index + 1 */
>   __u32   num_irqs;   /* Max IRQ index + 1 */
>  };
> @@ -409,6 +410,60 @@ struct vfio_irq_set {
>   */
>  #define VFIO_DEVICE_RESET_IO(VFIO_TYPE, VFIO_BASE + 11)
> 
> +struct vfio_device_svm {
> + __u32   argsz;
> + __u32   flags;
> +#define VFIO_SVM_PASID_RELEASE_FLUSHED   (1 << 0)
> +#define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 1)
> + __u32   pasid;
> +};

For virtual SVM work, the VFIO channel would be used to passdown guest
PASID tale PTR and invalidation information. And may have further usage
except the above.

Here is the virtual SVM design doc which illust

Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-02-28 Thread Jean-Philippe Brucker
Hi Alex,

Thanks for the feedback!

On Mon, Feb 27, 2017 at 08:54:09PM -0700, Alex Williamson wrote:
> On Mon, 27 Feb 2017 19:54:40 +
> Jean-Philippe Brucker  wrote:
[...]
> >  
> > +static long vfio_svm_ioctl(struct vfio_device *device, unsigned int cmd,
> > +  unsigned long arg)
> > +{
> > +   int ret;
> > +   unsigned long minsz;
> > +
> > +   struct vfio_device_svm svm;
> > +   struct vfio_task *vfio_task;
> > +
> > +   minsz = offsetofend(struct vfio_device_svm, pasid);
> > +
> > +   if (copy_from_user(, (void __user *)arg, minsz))
> > +   return -EFAULT;
> > +
> > +   if (svm.argsz < minsz)
> > +   return -EINVAL;
> > +
> > +   if (cmd == VFIO_DEVICE_BIND_TASK) {
> > +   struct task_struct *task = current;
> 
> Seems like SVM should be in the name of these ioctls.
> 
> svm.flags needs to be validated here or else we lose the field for
> future use... you add this in the next patch, but see compatibility
> comment there.

Agreed, I'll be more careful with the flags.

> > +
> > +   ret = iommu_bind_task(device->dev, task, , 0, NULL);
> > +   if (ret)
> > +   return ret;
> 
> vfio-pci advertises the device feature, but vfio intercepts the ioctl
> and attempts to handle it regardless of device support.
> 
> We also need to be careful of using, or even referencing iommu_ops
> without regard to the device or IOMMU backend.  SPAPR doesn't fully
> implement IOMMU API, vfio-noiommu devices don't have iommu_ops, mdev
> devices don't either.  I agree with your comments in the cover letter,
> it's not entirely clear that the device fd is the right place to host
> this.

Yes, and I like the idea of moving the ioctl into type1 IOMMU.
Something like VFIO_IOMMU_BIND_TASK (or perhaps VFIO_IOMMU_SVM_BIND?),
applied on the container instead of the device might be better. The
semantics are tricky to define though, both for VFIO and IOMMU, because
devices in a container or an IOMMU group might have different SVM
capabilities.

When this ioctl successfully returns with a PASID, two possibilities:
A. either it implies that all devices attached to the container are now
   able to perform DMA with this PASID,
B. or some devices in the container do not support SVM, but those that
   support it can all use the PASID. The user needs to inspect device
   flags individually to know which can support SVM. When user is a
   userspace device driver, it is familiar with the device it's driving
   and knows whether is supports SVM or not, but a VMM wouldn't.

After binding the container to the task and obtaining a PASID, user
wants to add a group to the container. So we need to replay the binding
on the new group, by telling the IOMMU to use that particular PASID. If
the device supports less PASID bits, I guess we should reject the
attach? If the device doesn't support SVM, for case A we should reject
the attach, for case B we accept it. Alternatively, we could simply
forbid to add groups to containers after a bind.

The problem is similar for adding devices to IOMMU groups. If a group is
bound to an address space, and a less capable device is added to the
group, we probably don't want to reject the device altogether, nor do we
want to unbind the PASID.

> > +
> > +   vfio_task = kzalloc(sizeof(*vfio_task), GFP_KERNEL);
> > +   if (!vfio_task) {
> > +   iommu_unbind_task(device->dev, svm.pasid,
> > + IOMMU_PASID_CLEAN);
> > +   return -ENOMEM;
> > +   }
> > +
> > +   vfio_task->pasid = svm.pasid;
> > +
> > +   mutex_lock(>tasks_lock);
> > +   list_add(_task->list, >tasks);
> > +   mutex_unlock(>tasks_lock);
> > +
> > +   } else {
> > +   int flags = 0;
> > +
> > +   if (svm.flags & ~(VFIO_SVM_PASID_RELEASE_FLUSHED |
> > + VFIO_SVM_PASID_RELEASE_CLEAN))
> > +   return -EINVAL;
> > +
> > +   if (svm.flags & VFIO_SVM_PASID_RELEASE_FLUSHED)
> > +   flags = IOMMU_PASID_FLUSHED;
> > +   else if (svm.flags & VFIO_SVM_PASID_RELEASE_CLEAN)
> > +   flags = IOMMU_PASID_CLEAN;
> > +
> > +   mutex_lock(>tasks_lock);
> > +   list_for_each_entry(vfio_task, >tasks, list) {
> > +   if (vfio_task->pasid != svm.pasid)
> > +   continue;
> > +
> > +   ret = iommu_unbind_task(device->dev, svm.pasid, flags);
> > +   if (ret)
> > +   dev_warn(device->dev, "failed to unbind PASID 
> > %u\n",
> > +vfio_task->pasid);
> > +
> > +   list_del(_task->list);
> > +   kfree(vfio_task);
> > +   break;
> > +   }
> > +   mutex_unlock(>tasks_lock);
> > +   }
> > +
> > +   return copy_to_user((void __user *)arg, , minsz) ? 

Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-02-27 Thread Alex Williamson
On Mon, 27 Feb 2017 19:54:40 +
Jean-Philippe Brucker  wrote:

> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
> between a device and a process address space, identified by a
> device-specific ID named PASID. This allows the device to target DMA
> transactions at the process virtual addresses without a need for mapping
> and unmapping buffers explicitly in the IOMMU. The process page tables are
> shared with the IOMMU, and mechanisms such as PCI ATS/PRI may be used to
> handle faults. VFIO_DEVICE_UNBIND_TASK removed a bond identified by a
> PASID.
> 
> Also add a capability flag in device info to detect whether the system and
> the device support SVM.
> 
> Users need to specify the state of a PASID when unbinding, with flags
> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
> PASID invalidation is specific to each device and only partially covered
> by the specification:
> 
> * Device must have an implementation-defined mechanism for stopping the
>   use of a PASID. When this mechanism finishes, the device has stopped
>   issuing transactions for this PASID and all transactions for this PASID
>   have been flushed to the IOMMU.
> 
> * Device may either wait for all outstanding PRI requests for this PASID
>   to finish, or issue a Stop Marker message, a barrier that separates PRI
>   requests affecting this instance of the PASID from PRI requests
>   affecting the next instance. In the first case, we say that the PASID is
>   "clean", in the second case it is "flushed" (and the IOMMU has to wait
>   for the Stop Marker before reassigning the PASID.)
> 
> We expect similar distinctions for platform devices. Ideally there should
> be a callback for each PCI device, allowing the IOMMU to ask the device to
> stop using a PASID. When the callback returns, the PASID is either flushed
> or clean and the return value tells which.
> 
> For the moment I don't know how to implement this callback for PCI, so if
> the user forgets to call unbind with either "clean" or "flushed", the
> PASID is never reused. For platform devices, it might be simpler to
> implement since we could associate an invalidate_pasid callback to a DT
> compatible string, as is currently done for reset.
> 
> Signed-off-by: Jean-Philippe Brucker 
> ---
>  drivers/vfio/pci/vfio_pci.c |  24 ++
>  drivers/vfio/vfio.c | 104 
> 
>  include/uapi/linux/vfio.h   |  55 +++
>  3 files changed, 183 insertions(+)
> 
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 324c52e3a1a4..3d7733f94891 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -623,6 +624,26 @@ int vfio_pci_register_dev_region(struct vfio_pci_device 
> *vdev,
>   return 0;
>  }
>  
> +static bool vfio_pci_supports_svm(struct vfio_pci_device *vdev)
> +{
> + struct pci_dev *pdev = vdev->pdev;
> +
> + if (!pdev->ats_enabled)
> + return false;
> +
> + if (!pdev->pasid_enabled || pci_max_pasids(pdev) <= 1)
> + return false;
> +
> + if (!pdev->pri_enabled)
> + return false;
> +
> + /*
> +  * If the IOMMU driver enabled all of these, then it supports PCI SVM
> +  * for this device.
> +  */
> + return true;
> +}
> +
>  static long vfio_pci_ioctl(void *device_data,
>  unsigned int cmd, unsigned long arg)
>  {
> @@ -642,6 +663,9 @@ static long vfio_pci_ioctl(void *device_data,
>  
>   info.flags = VFIO_DEVICE_FLAGS_PCI;
>  
> + if (vfio_pci_supports_svm(vdev))
> + info.flags |= VFIO_DEVICE_FLAGS_SVM;
> +
>   if (vdev->reset_works)
>   info.flags |= VFIO_DEVICE_FLAGS_RESET;
>  
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 609f4f982c74..c4505d8f4c61 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -97,6 +97,14 @@ struct vfio_device {
>   struct vfio_group   *group;
>   struct list_headgroup_next;
>   void*device_data;
> +
> + struct mutextasks_lock;
> + struct list_headtasks;
> +};
> +
> +struct vfio_task {
> + int pasid;
> + struct list_headlist;
>  };
>  
>  #ifdef CONFIG_VFIO_NOIOMMU
> @@ -520,6 +528,9 @@ struct vfio_device *vfio_group_create_device(struct 
> vfio_group *group,
>   device->device_data = device_data;
>   dev_set_drvdata(dev, device);
>  
> + mutex_init(>tasks_lock);
> + INIT_LIST_HEAD(>tasks);
> +
>   /* No need to get group_lock, caller has group reference */
>   vfio_group_get(group);
>  
> @@ -532,6 +543,8 @@ struct 

[RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-02-27 Thread Jean-Philippe Brucker
Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
between a device and a process address space, identified by a
device-specific ID named PASID. This allows the device to target DMA
transactions at the process virtual addresses without a need for mapping
and unmapping buffers explicitly in the IOMMU. The process page tables are
shared with the IOMMU, and mechanisms such as PCI ATS/PRI may be used to
handle faults. VFIO_DEVICE_UNBIND_TASK removed a bond identified by a
PASID.

Also add a capability flag in device info to detect whether the system and
the device support SVM.

Users need to specify the state of a PASID when unbinding, with flags
VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
PASID invalidation is specific to each device and only partially covered
by the specification:

* Device must have an implementation-defined mechanism for stopping the
  use of a PASID. When this mechanism finishes, the device has stopped
  issuing transactions for this PASID and all transactions for this PASID
  have been flushed to the IOMMU.

* Device may either wait for all outstanding PRI requests for this PASID
  to finish, or issue a Stop Marker message, a barrier that separates PRI
  requests affecting this instance of the PASID from PRI requests
  affecting the next instance. In the first case, we say that the PASID is
  "clean", in the second case it is "flushed" (and the IOMMU has to wait
  for the Stop Marker before reassigning the PASID.)

We expect similar distinctions for platform devices. Ideally there should
be a callback for each PCI device, allowing the IOMMU to ask the device to
stop using a PASID. When the callback returns, the PASID is either flushed
or clean and the return value tells which.

For the moment I don't know how to implement this callback for PCI, so if
the user forgets to call unbind with either "clean" or "flushed", the
PASID is never reused. For platform devices, it might be simpler to
implement since we could associate an invalidate_pasid callback to a DT
compatible string, as is currently done for reset.

Signed-off-by: Jean-Philippe Brucker 
---
 drivers/vfio/pci/vfio_pci.c |  24 ++
 drivers/vfio/vfio.c | 104 
 include/uapi/linux/vfio.h   |  55 +++
 3 files changed, 183 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 324c52e3a1a4..3d7733f94891 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -623,6 +624,26 @@ int vfio_pci_register_dev_region(struct vfio_pci_device 
*vdev,
return 0;
 }
 
+static bool vfio_pci_supports_svm(struct vfio_pci_device *vdev)
+{
+   struct pci_dev *pdev = vdev->pdev;
+
+   if (!pdev->ats_enabled)
+   return false;
+
+   if (!pdev->pasid_enabled || pci_max_pasids(pdev) <= 1)
+   return false;
+
+   if (!pdev->pri_enabled)
+   return false;
+
+   /*
+* If the IOMMU driver enabled all of these, then it supports PCI SVM
+* for this device.
+*/
+   return true;
+}
+
 static long vfio_pci_ioctl(void *device_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -642,6 +663,9 @@ static long vfio_pci_ioctl(void *device_data,
 
info.flags = VFIO_DEVICE_FLAGS_PCI;
 
+   if (vfio_pci_supports_svm(vdev))
+   info.flags |= VFIO_DEVICE_FLAGS_SVM;
+
if (vdev->reset_works)
info.flags |= VFIO_DEVICE_FLAGS_RESET;
 
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 609f4f982c74..c4505d8f4c61 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -97,6 +97,14 @@ struct vfio_device {
struct vfio_group   *group;
struct list_headgroup_next;
void*device_data;
+
+   struct mutextasks_lock;
+   struct list_headtasks;
+};
+
+struct vfio_task {
+   int pasid;
+   struct list_headlist;
 };
 
 #ifdef CONFIG_VFIO_NOIOMMU
@@ -520,6 +528,9 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
device->device_data = device_data;
dev_set_drvdata(dev, device);
 
+   mutex_init(>tasks_lock);
+   INIT_LIST_HEAD(>tasks);
+
/* No need to get group_lock, caller has group reference */
vfio_group_get(group);
 
@@ -532,6 +543,8 @@ struct vfio_device *vfio_group_create_device(struct 
vfio_group *group,
 
 static void vfio_device_release(struct kref *kref)
 {
+   int ret;
+   struct vfio_task *tmp, *task;
struct vfio_device *device = container_of(kref,
  struct vfio_device,