[PATCH v7 1/9] iommu: Add APIs for multiple domains per device

2019-02-21 Thread Lu Baolu
Sharing a physical PCI device at a finer granularity is
becoming a consensus in the industry, and IOMMU vendors
are working to support such sharing as well as possible.
Among these efforts, the capability to support
finer-granularity DMA isolation is a common requirement
for security reasons. With finer-granularity DMA
isolation, subsets of a PCI function can be isolated from
each other by the IOMMU. As a result, software needs a
way to attach multiple domains to a physical PCI device.
One example of such a usage model is Intel Scalable IOV
[1] [2]. The Intel VT-d 3.0 spec [3] introduces scalable
mode, which enables PASID-granularity DMA isolation.

This adds the APIs to support multiple domains per device.
To ease discussion, we call a domain 'a domain in auxiliary
mode', or simply an 'auxiliary domain', when multiple
domains are attached to a physical device.

The APIs include:

* iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)
  - Detect both IOMMU and PCI endpoint devices supporting
the feature (aux-domain here) without any host driver
dependency.

* iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)
  - Check the enabling status of the feature (aux-domain
here). The aux-domain interfaces are available only
if this returns true.

* iommu_dev_enable/disable_feature(dev, IOMMU_DEV_FEAT_AUX)
  - Enable/disable the device-specific aux-domain feature.

* iommu_aux_attach_device(domain, dev)
  - Attaches @domain to @dev in the auxiliary mode. Multiple
domains could be attached to a single device in the
auxiliary mode with each domain representing an isolated
address space for an assignable subset of the device.

* iommu_aux_detach_device(domain, dev)
  - Detach @domain which has been attached to @dev in the
auxiliary mode.

* iommu_aux_get_pasid(domain, dev)
  - Return the ID used for finer-granularity DMA translation.
For the Intel Scalable IOV usage model, this will be
a PASID. The device which supports Scalable IOV needs
to write this ID to the device register so that DMA
requests can be tagged with the right PASID prefix. A
short usage sketch of these APIs follows below.
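
The sketch is for illustration only (hypothetical driver
code, not part of this patch; error handling abbreviated):

#include <linux/err.h>
#include <linux/iommu.h>

/* Give one assignable subset of 'dev' its own address space. */
static struct iommu_domain *example_setup_aux_domain(struct device *dev)
{
	struct iommu_domain *domain;
	int ret, pasid;

	if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
		return ERR_PTR(-ENODEV);

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ERR_PTR(ret);

	domain = iommu_domain_alloc(dev->bus);
	if (!domain)
		return ERR_PTR(-ENOMEM);

	ret = iommu_aux_attach_device(domain, dev);
	if (ret) {
		iommu_domain_free(domain);
		return ERR_PTR(ret);
	}

	/*
	 * Program this ID into the device so the subset's DMA
	 * requests are tagged with it (device-specific step).
	 */
	pasid = iommu_aux_get_pasid(domain, dev);
	if (pasid < 0) {
		iommu_aux_detach_device(domain, dev);
		iommu_domain_free(domain);
		return ERR_PTR(pasid);
	}

	return domain;
}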

This has been updated with the latest proposal from Joerg
posted here [5].

Many people were involved in the discussions of this design:

Kevin Tian 
Liu Yi L 
Ashok Raj 
Sanjay Kumar 
Jacob Pan 
Alex Williamson 
Jean-Philippe Brucker 
Joerg Roedel 

and some discussions can be found here [4] [5].

[1] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[2] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf
[3] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[4] https://lkml.org/lkml/2018/7/26/4
[5] https://www.spinics.net/lists/iommu/msg31874.html

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Kevin Tian 
Suggested-by: Jean-Philippe Brucker 
Suggested-by: Joerg Roedel 
Signed-off-by: Lu Baolu 
Reviewed-by: Jean-Philippe Brucker 
---
 drivers/iommu/iommu.c | 96 +++
 include/linux/iommu.h | 70 +++
 2 files changed, 166 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3ed4db334341..4ad8ff91d742 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2033,3 +2033,99 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, 
int num_ids)
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
+
+/*
+ * Per device IOMMU features.
+ */
+bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_has_feat)
+   return ops->dev_has_feat(dev, feat);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_has_feature);
+
+int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_enable_feat)
+   return ops->dev_enable_feat(dev, feat);
+
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
+
+/*
+ * The device drivers should do the necessary cleanups before calling this.
+ * For example, before disabling the aux-domain feature, the device driver
+ * should detach all aux-domains. Otherwise, this will return -EBUSY.
+ */
+int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_disable_feat)
+   return ops->dev_disable_feat(dev, feat);
+
+   return -EBUSY;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
+
+bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features 
feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_feat_enabled)
+   return ops->dev_feat_enabled(dev, feat);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_feature_enabled);

[PATCH v7 5/9] iommu/vt-d: Aux-domain specific domain attach/detach

2019-02-21 Thread Lu Baolu
When multiple domains per device have been enabled by the
device driver, the device will tag the domain's default
PASID onto all DMA traffic out of the corresponding subset
of the device, and the IOMMU will translate the DMA
requests at PASID granularity.

This adds the intel_iommu_aux_attach/detach_device() ops
to support managing PASID-granular translation structures
when the device driver has enabled multiple domains per
device.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 152 
 include/linux/intel-iommu.h |  10 +++
 2 files changed, 162 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 3d83451a414d..8ecf09db6047 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2478,6 +2478,7 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
info->iommu = iommu;
info->pasid_table = NULL;
info->auxd_enabled = 0;
+   INIT_LIST_HEAD(&info->auxiliary_domains);
 
if (dev && dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(info->dev);
@@ -5054,6 +5055,131 @@ static void intel_iommu_domain_free(struct iommu_domain 
*domain)
domain_exit(to_dmar_domain(domain));
 }
 
+/*
+ * Check whether a @domain could be attached to the @dev through the
+ * aux-domain attach/detach APIs.
+ */
+static inline bool
+is_aux_domain(struct device *dev, struct iommu_domain *domain)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   return info && info->auxd_enabled &&
+   domain->type == IOMMU_DOMAIN_UNMANAGED;
+}
+
+static void auxiliary_link_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   domain->auxd_refcnt++;
+   list_add(&info->auxd, &domain->auxiliary_domains);
+}
+
+static void auxiliary_unlink_device(struct dmar_domain *domain,
+   struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   list_del(&info->auxd);
+   domain->auxd_refcnt--;
+
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);
+}
+
+static int aux_domain_add_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+   int ret;
+   u8 bus, devfn;
+   unsigned long flags;
+   struct intel_iommu *iommu;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+
+   if (domain->default_pasid <= 0) {
+   int pasid;
+
+   pasid = intel_pasid_alloc_id(domain, PASID_MIN,
+pci_max_pasids(to_pci_dev(dev)),
+GFP_KERNEL);
+   if (pasid <= 0) {
+   pr_err("Can't allocate default pasid\n");
+   return -ENODEV;
+   }
+   domain->default_pasid = pasid;
+   }
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   /*
+* iommu->lock must be held to attach domain to iommu and setup the
+* pasid entry for second level translation.
+*/
+   spin_lock(&iommu->lock);
+   ret = domain_attach_iommu(domain, iommu);
+   if (ret)
+   goto attach_failed;
+
+   /* Setup the PASID entry for mediated devices: */
+   ret = intel_pasid_setup_second_level(iommu, domain, dev,
+domain->default_pasid);
+   if (ret)
+   goto table_failed;
+   spin_unlock(&iommu->lock);
+
+   auxiliary_link_device(domain, dev);
+
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+
+table_failed:
+   domain_detach_iommu(domain, iommu);
+attach_failed:
+   spin_unlock(&iommu->lock);
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);
+
+   return ret;
+}
+
+static void aux_domain_remove_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info;
+   struct intel_iommu *iommu;
+   unsigned long flags;
+
+   if (!is_aux_domain(dev, &domain->domain))
+   return;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   iommu = info->iommu;

Re: [PATCH v6 0/9] vfio/mdev: IOMMU aware mediated device

2019-02-21 Thread Lu Baolu

Hi,

On 2/15/19 4:14 AM, Alex Williamson wrote:

On Wed, 13 Feb 2019 12:02:52 +0800
Lu Baolu  wrote:


Hi,

The Mediated Device is a framework for fine-grained physical
device sharing across isolated domains. Currently the mdev
framework is designed to be independent of the platform IOMMU
support. As a result, the DMA isolation relies on the mdev
parent device in a vendor-specific way.

There are several cases where a mediated device could be protected
and isolated by the platform IOMMU. For example, Intel vt-d rev3.0
[1] introduces a new translation mode called 'scalable mode', which
enables PASID-granular translations. The vt-d scalable mode is the
key ingredient for Scalable I/O Virtualization [2] [3] which allows
sharing a device in minimal possible granularity (ADI - Assignable
Device Interface).

A mediated device backed by an ADI could be protected and
isolated by the IOMMU since 1) the parent device supports
tagging a unique PASID to all DMA traffic out of the mediated
device; and 2) the DMA translation unit (IOMMU) supports the
PASID-granular translation. We can apply IOMMU protection and
isolation to this kind of device just as we do with an
assignable PCI device.

In order to distinguish the IOMMU-capable mediated devices from those
which still need to rely on parent devices, this patch set adds one
new member in struct mdev_device.

* iommu_device
   - This, if set, indicates that the mediated device could
 be fully isolated and protected by the IOMMU via
 attaching an iommu domain to this device. If empty, it
 indicates vendor defined isolation.

Below helpers are added to set and get above iommu device in mdev core
implementation.

* mdev_set/get_iommu_device(dev, iommu_device)
   - Set or get the iommu device which represents this mdev
 in the IOMMU's device scope. Drivers don't need to set
 the iommu device if they use vendor defined isolation.

The mdev parent device driver can opt in to have the mdev
fully isolated and protected by the IOMMU by invoking
mdev_set_iommu_device() in its @create() callback.

In vfio_iommu_type1_attach_group(), a domain allocated
through iommu_domain_alloc() will be attached to the mdev
iommu device if an iommu device has been set. Otherwise, the
dummy external domain will be used, and all the DMA isolation
and protection are routed to the parent driver as a result.

On the IOMMU side, a basic requirement is to allow attaching
multiple domains to a PCI device if the device advertises the
capability and the IOMMU hardware supports finer-granularity
translations than the normal PCI Source ID based translation.

As a result, a PCI device could work in two modes: normal
mode and auxiliary mode. In the normal mode, a PCI device
is isolated at Source ID granularity; the PCI device itself
can be assigned to a user application by attaching a single
domain to it. In the auxiliary mode, a PCI device is isolated
at a finer granularity, hence subsets of the device can be
assigned to different user-level applications by attaching a
different domain to each subset.

Below APIs are introduced in iommu generic layer for aux-domain
purpose:

* iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)
   - Check whether both IOMMU and device support IOMMU aux
 domain feature. Below aux-domain specific interfaces
 are available only after this returns true.

* iommu_dev_enable/disable_feature(dev, IOMMU_DEV_FEAT_AUX)
   - Enable/disable device specific aux-domain feature.

* iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)
   - Check whether the aux domain specific feature is
 enabled or not.

* iommu_aux_attach_device(domain, dev)
   - Attaches @domain to @dev in the auxiliary mode. Multiple
 domains could be attached to a single device in the
 auxiliary mode with each domain representing an isolated
 address space for an assignable subset of the device.

* iommu_aux_detach_device(domain, dev)
   - Detach @domain which has been attached to @dev in the
 auxiliary mode.

* iommu_aux_get_pasid(domain, dev)
   - Return ID used for finer-granularity DMA translation.
 For the Intel Scalable IOV usage model, this will be
 a PASID. The device which supports Scalable IOV needs
 to write this ID to the device register so that DMA
 requests could be tagged with the right PASID prefix.

For ease of discussion, we sometimes say 'a domain in
auxiliary mode' or simply 'an auxiliary domain' when a domain
is attached to a device for finer-granularity translations.
But we need to keep in mind that this doesn't mean there is a
different domain type. The same domain can be bound to one
device for Source ID based translation and bound to another
device for finer-granularity translation at the same time.

This patch series extends both IOMMU and vfio components to
support mdev device passthrough when it can be isolated and
protected by the IOMMU units. The first

Re: [PATCH v6 0/9] vfio/mdev: IOMMU aware mediated device

2019-02-18 Thread Lu Baolu

Hi Jean,

On 2/16/19 2:46 AM, Jean-Philippe Brucker wrote:

On 14/02/2019 20:14, Alex Williamson wrote:

This patch series extends both IOMMU and vfio components to support
mdev device passthrough when it can be isolated and protected
by the IOMMU units. The first part of this series (PATCH 1/09~6/09)
adds the interfaces and implementation of the multiple domains per
device. The second part (PATCH 7/09~9/09) adds the iommu device
attribute to each mdev, determines isolation type according to the
existence of an iommu device when attaching group in vfio type1 iommu
module, and attaches the domain to iommu aware mediated devices.

References:
[1] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[2] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[3] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf

Best regards,
Lu Baolu

Change log:
    v5->v6:


This looks pretty reasonable with Jean-Philippe's nit fixups.  Where do
we go from here?  I think we need an ack from Kirti since they have an
interest here.  Presumably this looks ok to the ARM folks.


Looks great from my point of view. I focused on patch 1 since I'm
planning to reuse iommu_dev_features for SVA. I don't have time to test
auxd and mdev on SMMUv3 at the moment but I had a better look and, if it
helps, for patches 1 and 7-9:

Reviewed-by: Jean-Philippe Brucker 


Thank you! I will add this in the next version.



That said, are you planning to add back the mdev_get_iommu_domain()
function, in a separate patch? Because I think the parent driver still
needs a way to retrieve the PASID for an mdev?


Yes.

At Kirti's suggestion, we removed this since there is
currently no consumer. We will bring it back together with a
real consumer.

https://lkml.org/lkml/2018/11/16/124

Best regards,
Lu Baolu



Thanks,
Jean


Do we have
any consumers of this code yet?  Theoretically I think a vfio-pci-like
meta driver could be written as an mdev vendor driver with this support
(restricted to type1 iommu use cases).  Thanks,

Alex





Re: [PATCH v6 1/9] iommu: Add APIs for multiple domains per device

2019-02-13 Thread Lu Baolu

Hi Jean,

On 2/13/19 7:55 PM, Jean-Philippe Brucker wrote:

Hi,

I have a few boring nits and one question below


Thanks a lot for reviewing my patch.



On 13/02/2019 04:02, Lu Baolu wrote:

Sharing a physical PCI device in a finer-granularity way
is becoming a consensus in the industry. IOMMU vendors
are also engaging efforts to support such sharing as well
as possible. Among the efforts, the capability of support
finer-granularity DMA isolation is a common requirement
due to the security consideration. With finer-granularity
DMA isolation, all DMA requests out of or to a subset of
a physical PCI device can be protected by the IOMMU.


That last sentence seems strange, how about "With finer-granularity DMA
isolation, subsets of a PCI function can be isolated from each others by
the IOMMU."


Yours looks better. Thanks!




As a
result, there is a request in software to attach multiple
domains to a physical PCI device. One example of such use
model is the Intel Scalable IOV [1] [2]. The Intel vt-d
3.0 spec [3] introduces the scalable mode which enables
PASID granularity DMA isolation.

This adds the APIs to support multiple domains per device.
In order to ease the discussions, we call it 'a domain in
auxiliary mode' or simply 'auxiliary domain' when multiple
domains are attached to a physical device.

The APIs include:

* iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)
   - Check whether both IOMMU and device support IOMMU aux
     domain feature. Below aux-domain specific interfaces
     are available only after this returns true.


s/after/if/ since calling has_feature() shouldn't be a prerequisite to
using the aux-domain interface (unlike calling enable_feature()).


After reconsideration, I think my description of this API
wasn't correct. It should be:

* iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)
  - Detect both IOMMU and PCI endpoint devices supporting the feature
(aux-domain here) without host driver dependency.

* iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)
  - Check the enabling status of the feature (aux-domain here). The
aux-domain interfaces are available only if this returns true.





* iommu_dev_enable/disable_feature(dev, IOMMU_DEV_FEAT_AUX)
   - Enable/disable device specific aux-domain feature.

* iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)
   - Check whether the aux domain specific feature enabled or
     not.


"is enabled"


Sure.





* iommu_aux_attach_device(domain, dev)
   - Attaches @domain to @dev in the auxiliary mode. Multiple
     domains could be attached to a single device in the
     auxiliary mode with each domain representing an isolated
     address space for an assignable subset of the device.

* iommu_aux_detach_device(domain, dev)
   - Detach @domain which has been attached to @dev in the
     auxiliary mode.

* iommu_aux_get_pasid(domain, dev)
   - Return ID used for finer-granularity DMA translation.
     For the Intel Scalable IOV usage model, this will be
     a PASID. The device which supports Scalable IOV needs
     to write this ID to the device register so that DMA
     requests could be tagged with a right PASID prefix.

This has been updated with the latest proposal from Joerg
posted here [5].

Many people involved in discussions of this design.

Kevin Tian 
Liu Yi L 
Ashok Raj 
Sanjay Kumar 
Jacob Pan 
Alex Williamson 
Jean-Philippe Brucker 
Joerg Roedel 

and some discussions can be found here [4] [5].

[1]
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[2] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf
[3]
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[4] https://lkml.org/lkml/2018/7/26/4
[5] https://www.spinics.net/lists/iommu/msg31874.html

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Kevin Tian 
Suggested-by: Jean-Philippe Brucker 
Suggested-by: Joerg Roedel 
Signed-off-by: Lu Baolu 
---
  drivers/iommu/iommu.c | 91 +++
  include/linux/iommu.h | 70 +
  2 files changed, 161 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3ed4db334341..d0b323e8357f 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2033,3 +2033,94 @@ int iommu_fwspec_add_ids(struct device *dev, u32
*ids, int num_ids)
  return 0;
  }
  EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
+
+/*
+ * Per device IOMMU features.
+ */
+bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features
feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_has_feat)
+   return ops->dev_has_feat(dev, feat);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_has_feature);
+
+int iommu_dev_enable_feature(struct device *dev, enum
iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;

[PATCH v6 6/9] iommu/vt-d: Return ID associated with an auxiliary domain

2019-02-12 Thread Lu Baolu
This adds support for returning the default PASID associated
with an auxiliary domain. The PCI device which is bound to
this domain should use this value as the PASID for all DMA
requests from the subset of the device which is isolated and
protected by this domain.
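
For illustration, a minimal sketch (hypothetical parent-driver
code, not part of this patch; adi_regs and the register offset
are made-up, device-specific names):

#include <linux/io.h>
#include <linux/iommu.h>

#define EXAMPLE_ADI_PASID_REG	0x0	/* made-up register offset */

/* Program the domain's default PASID into an assignable subset. */
static int example_program_adi_pasid(struct iommu_domain *domain,
				     struct device *dev,
				     void __iomem *adi_regs)
{
	int pasid = iommu_aux_get_pasid(domain, dev);

	if (pasid < 0)
		return pasid;

	/* From now on, the subset's DMA carries this PASID prefix. */
	writel(pasid, adi_regs + EXAMPLE_ADI_PASID_REG);

	return 0;
}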

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 8ecf09db6047..b79c72cc5931 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5684,6 +5684,15 @@ intel_iommu_dev_feat_enabled(struct device *dev, enum 
iommu_dev_features feat)
return false;
 }
 
+static int
+intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
+{
+   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+   return dmar_domain->default_pasid > 0 ?
+   dmar_domain->default_pasid : -EINVAL;
+}
+
 const struct iommu_ops intel_iommu_ops = {
.capable        = intel_iommu_capable,
.domain_alloc   = intel_iommu_domain_alloc,
@@ -5692,6 +5701,7 @@ const struct iommu_ops intel_iommu_ops = {
.detach_dev = intel_iommu_detach_device,
.aux_attach_dev = intel_iommu_aux_attach_device,
.aux_detach_dev = intel_iommu_aux_detach_device,
+   .aux_get_pasid  = intel_iommu_aux_get_pasid,
.map            = intel_iommu_map,
.unmap  = intel_iommu_unmap,
.iova_to_phys   = intel_iommu_iova_to_phys,
-- 
2.17.1



[PATCH v6 7/9] vfio/mdev: Add iommu related member in mdev_device

2019-02-12 Thread Lu Baolu
A parent device might create different types of mediated
devices. For example, a mediated device could be created
by the parent device with full isolation and protection
provided by the IOMMU. One usage case can be found on
Intel platforms where a mediated device is an assignable
subset of a PCI device, and the DMA requests on its
behalf are all tagged with a PASID. Since the IOMMU
supports PASID-granular translations (scalable mode in
VT-d 3.0), this mediated device can be individually
protected and isolated by the IOMMU.

This patch adds a new member in the struct mdev_device to
indicate that the mediated device represented by mdev could
be isolated and protected by attaching a domain to a device
represented by mdev->iommu_device. It also adds helpers to
set and get the iommu device.

* mdev_device->iommu_device
  - This, if set, indicates that the mediated device could
be fully isolated and protected by the IOMMU via attaching
an iommu domain to this device. If empty, it indicates
vendor defined isolation, hence the IOMMU is bypassed.

* mdev_set/get_iommu_device(dev, iommu_device)
  - Set or get the iommu device which represents this mdev
in the IOMMU's device scope. Drivers don't need to set
the iommu device if they use vendor defined isolation.
A usage sketch follows below.
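
A minimal usage sketch (hypothetical parent-driver code, not
part of this patch; my_mdev_create and the ADI allocation are
made up):

#include <linux/mdev.h>

static int my_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
{
	struct device *parent = mdev_parent_dev(mdev);

	/* ... allocate an assignable subset (ADI) of the parent
	 * device for this mdev ... */

	/*
	 * The parent PCI device is what the IOMMU actually sees, so
	 * register it as the iommu device to opt in to IOMMU-based
	 * isolation.
	 */
	return mdev_set_iommu_device(mdev_dev(mdev), parent);
}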

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Kevin Tian 
Suggested-by: Alex Williamson 
Signed-off-by: Lu Baolu 
---
 drivers/vfio/mdev/mdev_core.c| 18 ++
 drivers/vfio/mdev/mdev_private.h |  1 +
 include/linux/mdev.h | 14 ++
 3 files changed, 33 insertions(+)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 0212f0ee8aea..9be58d392d2b 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -390,6 +390,24 @@ int mdev_device_remove(struct device *dev, bool 
force_remove)
return 0;
 }
 
+int mdev_set_iommu_device(struct device *dev, struct device *iommu_device)
+{
+   struct mdev_device *mdev = to_mdev_device(dev);
+
+   mdev->iommu_device = iommu_device;
+
+   return 0;
+}
+EXPORT_SYMBOL(mdev_set_iommu_device);
+
+struct device *mdev_get_iommu_device(struct device *dev)
+{
+   struct mdev_device *mdev = to_mdev_device(dev);
+
+   return mdev->iommu_device;
+}
+EXPORT_SYMBOL(mdev_get_iommu_device);
+
 static int __init mdev_init(void)
 {
return mdev_bus_register();
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index b5819b7d7ef7..891841862ef8 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -34,6 +34,7 @@ struct mdev_device {
struct list_head next;
struct kobject *type_kobj;
bool active;
+   struct device *iommu_device;
 };
 
#define to_mdev_device(dev)	container_of(dev, struct mdev_device, dev)
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index b6e048e1045f..c3ab8a9cfcc7 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -15,6 +15,20 @@
 
 struct mdev_device;
 
+/*
+ * Called by the parent device driver to set the device which represents
+ * this mdev in iommu protection scope. By default, the iommu device is
+ * NULL, which indicates vendor defined isolation.
+ *
+ * @dev: the mediated device that iommu will isolate.
+ * @iommu_device: a pci device which represents the iommu for @dev.
+ *
+ * Return 0 for success, otherwise negative error value.
+ */
+int mdev_set_iommu_device(struct device *dev, struct device *iommu_device);
+
+struct device *mdev_get_iommu_device(struct device *dev);
+
 /**
  * struct mdev_parent_ops - Structure to be registered for each parent device 
to
  * register the device to mdev module.
-- 
2.17.1



[PATCH v6 8/9] vfio/type1: Add domain at(de)taching group helpers

2019-02-12 Thread Lu Baolu
This adds helpers to attach a domain to, or detach it
from, a group. They will replace iommu_attach_group(),
which only works for non-mdev devices.

If a domain is being attached to a group which includes
mediated devices, it should be attached to the iommu device
(a pci device which represents the mdev in iommu scope)
instead. The added helpers support attaching a domain to
groups of both pci and mdev devices.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/vfio/vfio_iommu_type1.c | 84 ++---
 1 file changed, 77 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 73652e21efec..ccc4165474aa 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -91,6 +91,7 @@ struct vfio_dma {
 struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
+   bool                    mdev_group;     /* An mdev group */
 };
 
 /*
@@ -1298,6 +1299,75 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group 
*group, phys_addr_t *base)
return ret;
 }
 
+static struct device *vfio_mdev_get_iommu_device(struct device *dev)
+{
+   struct device *(*fn)(struct device *dev);
+   struct device *iommu_device;
+
+   fn = symbol_get(mdev_get_iommu_device);
+   if (fn) {
+   iommu_device = fn(dev);
+   symbol_put(mdev_get_iommu_device);
+
+   return iommu_device;
+   }
+
+   return NULL;
+}
+
+static int vfio_mdev_attach_domain(struct device *dev, void *data)
+{
+   struct iommu_domain *domain = data;
+   struct device *iommu_device;
+
+   iommu_device = vfio_mdev_get_iommu_device(dev);
+   if (iommu_device) {
+   if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
+   return iommu_aux_attach_device(domain, iommu_device);
+   else
+   return iommu_attach_device(domain, iommu_device);
+   }
+
+   return -EINVAL;
+}
+
+static int vfio_mdev_detach_domain(struct device *dev, void *data)
+{
+   struct iommu_domain *domain = data;
+   struct device *iommu_device;
+
+   iommu_device = vfio_mdev_get_iommu_device(dev);
+   if (iommu_device) {
+   if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
+   iommu_aux_detach_device(domain, iommu_device);
+   else
+   iommu_detach_device(domain, iommu_device);
+   }
+
+   return 0;
+}
+
+static int vfio_iommu_attach_group(struct vfio_domain *domain,
+  struct vfio_group *group)
+{
+   if (group->mdev_group)
+   return iommu_group_for_each_dev(group->iommu_group,
+   domain->domain,
+   vfio_mdev_attach_domain);
+   else
+   return iommu_attach_group(domain->domain, group->iommu_group);
+}
+
+static void vfio_iommu_detach_group(struct vfio_domain *domain,
+   struct vfio_group *group)
+{
+   if (group->mdev_group)
+   iommu_group_for_each_dev(group->iommu_group, domain->domain,
+vfio_mdev_detach_domain);
+   else
+   iommu_detach_group(domain->domain, group->iommu_group);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 struct iommu_group *iommu_group)
 {
@@ -1373,7 +1443,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
goto out_domain;
}
 
-   ret = iommu_attach_group(domain->domain, iommu_group);
+   ret = vfio_iommu_attach_group(domain, group);
if (ret)
goto out_domain;
 
@@ -1405,8 +1475,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
list_for_each_entry(d, &iommu->domain_list, next) {
if (d->domain->ops == domain->domain->ops &&
d->prot == domain->prot) {
-   iommu_detach_group(domain->domain, iommu_group);
-   if (!iommu_attach_group(d->domain, iommu_group)) {
+   vfio_iommu_detach_group(domain, group);
+   if (!vfio_iommu_attach_group(d, group)) {
list_add(&group->next, &d->group_list);
iommu_domain_free(domain->domain);
kfree(domain);
@@ -1414,7 +1484,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
return 0;
}
 
-   ret = iommu_attach_group(domain->domain, iommu_group);
ret = vfio_iommu_attach_group(domain, group);

[PATCH v6 4/9] iommu/vt-d: Move common code out of iommu_attach_device()

2019-02-12 Thread Lu Baolu
This part of the code can be used by both the normal and
the aux-domain-specific attach entries, so move it into a
common function to avoid duplication.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 60 ++---
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index a7e2238f869a..3d83451a414d 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5054,35 +5054,14 @@ static void intel_iommu_domain_free(struct iommu_domain 
*domain)
domain_exit(to_dmar_domain(domain));
 }
 
-static int intel_iommu_attach_device(struct iommu_domain *domain,
-struct device *dev)
+static int prepare_domain_attach_device(struct iommu_domain *domain,
+   struct device *dev)
 {
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu;
int addr_width;
u8 bus, devfn;
 
-   if (device_is_rmrr_locked(dev)) {
-   dev_warn(dev, "Device is ineligible for IOMMU domain attach due 
to platform RMRR requirement.  Contact your platform vendor.\n");
-   return -EPERM;
-   }
-
-   /* normally dev is not mapped */
-   if (unlikely(domain_context_mapped(dev))) {
-   struct dmar_domain *old_domain;
-
-   old_domain = find_domain(dev);
-   if (old_domain) {
-   rcu_read_lock();
-   dmar_remove_one_dev_info(dev);
-   rcu_read_unlock();
-
-   if (!domain_type_is_vm_or_si(old_domain) &&
-    list_empty(&old_domain->devices))
-   domain_exit(old_domain);
-   }
-   }
-
iommu = device_to_iommu(dev, &bus, &devfn);
if (!iommu)
return -ENODEV;
@@ -5115,7 +5094,40 @@ static int intel_iommu_attach_device(struct iommu_domain 
*domain,
dmar_domain->agaw--;
}
 
-   return domain_add_dev_info(dmar_domain, dev);
+   return 0;
+}
+
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+struct device *dev)
+{
+   int ret;
+
+   if (device_is_rmrr_locked(dev)) {
+   dev_warn(dev, "Device is ineligible for IOMMU domain attach due 
to platform RMRR requirement.  Contact your platform vendor.\n");
+   return -EPERM;
+   }
+
+   /* normally dev is not mapped */
+   if (unlikely(domain_context_mapped(dev))) {
+   struct dmar_domain *old_domain;
+
+   old_domain = find_domain(dev);
+   if (old_domain) {
+   rcu_read_lock();
+   dmar_remove_one_dev_info(dev);
+   rcu_read_unlock();
+
+   if (!domain_type_is_vm_or_si(old_domain) &&
+   list_empty(&old_domain->devices))
+   domain_exit(old_domain);
+   }
+   }
+
+   ret = prepare_domain_attach_device(domain, dev);
+   if (ret)
+   return ret;
+
+   return domain_add_dev_info(to_dmar_domain(domain), dev);
 }
 
 static void intel_iommu_detach_device(struct iommu_domain *domain,
-- 
2.17.1



[PATCH v6 9/9] vfio/type1: Handle different mdev isolation type

2019-02-12 Thread Lu Baolu
This adds support for determining the isolation type of a
mediated device group by checking whether it has an iommu
device. If an iommu device exists, an iommu domain will be
allocated and then attached to the iommu device. Otherwise,
the behavior remains the same as before.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/vfio/vfio_iommu_type1.c | 48 -
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index ccc4165474aa..f1392c582a3c 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1368,13 +1368,40 @@ static void vfio_iommu_detach_group(struct vfio_domain 
*domain,
iommu_detach_group(domain->domain, group->iommu_group);
 }
 
+static bool vfio_bus_is_mdev(struct bus_type *bus)
+{
+   struct bus_type *mdev_bus;
+   bool ret = false;
+
+   mdev_bus = symbol_get(mdev_bus_type);
+   if (mdev_bus) {
+   ret = (bus == mdev_bus);
+   symbol_put(mdev_bus_type);
+   }
+
+   return ret;
+}
+
+static int vfio_mdev_iommu_device(struct device *dev, void *data)
+{
+   struct device **old = data, *new;
+
+   new = vfio_mdev_get_iommu_device(dev);
+   if (!new || (*old && *old != new))
+   return -EINVAL;
+
+   *old = new;
+
+   return 0;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 struct iommu_group *iommu_group)
 {
struct vfio_iommu *iommu = iommu_data;
struct vfio_group *group;
struct vfio_domain *domain, *d;
-   struct bus_type *bus = NULL, *mdev_bus;
+   struct bus_type *bus = NULL;
int ret;
bool resv_msi, msi_remap;
phys_addr_t resv_msi_base;
@@ -1409,23 +1436,30 @@ static int vfio_iommu_type1_attach_group(void 
*iommu_data,
if (ret)
goto out_free;
 
-   mdev_bus = symbol_get(mdev_bus_type);
+   if (vfio_bus_is_mdev(bus)) {
+   struct device *iommu_device = NULL;
 
-   if (mdev_bus) {
-   if ((bus == mdev_bus) && !iommu_present(bus)) {
-   symbol_put(mdev_bus_type);
+   group->mdev_group = true;
+
+   /* Determine the isolation type */
+   ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
+  vfio_mdev_iommu_device);
+   if (ret || !iommu_device) {
if (!iommu->external_domain) {
INIT_LIST_HEAD(&domain->group_list);
iommu->external_domain = domain;
-   } else
+   } else {
kfree(domain);
+   }
 
list_add(&group->next,
 &iommu->external_domain->group_list);
mutex_unlock(&iommu->lock);
+
return 0;
}
-   symbol_put(mdev_bus_type);
+
+   bus = iommu_device->bus;
}
 
domain->domain = iommu_domain_alloc(bus);
-- 
2.17.1



[PATCH v6 5/9] iommu/vt-d: Aux-domain specific domain attach/detach

2019-02-12 Thread Lu Baolu
When multiple domains per device have been enabled by the
device driver, the device will tag the domain's default
PASID onto all DMA traffic out of the corresponding subset
of the device, and the IOMMU will translate the DMA
requests at PASID granularity.

This adds the intel_iommu_aux_attach/detach_device() ops
to support managing PASID-granular translation structures
when the device driver has enabled multiple domains per
device.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 152 
 include/linux/intel-iommu.h |  10 +++
 2 files changed, 162 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 3d83451a414d..8ecf09db6047 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2478,6 +2478,7 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
info->iommu = iommu;
info->pasid_table = NULL;
info->auxd_enabled = 0;
+   INIT_LIST_HEAD(&info->auxiliary_domains);
 
if (dev && dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(info->dev);
@@ -5054,6 +5055,131 @@ static void intel_iommu_domain_free(struct iommu_domain 
*domain)
domain_exit(to_dmar_domain(domain));
 }
 
+/*
+ * Check whether a @domain could be attached to the @dev through the
+ * aux-domain attach/detach APIs.
+ */
+static inline bool
+is_aux_domain(struct device *dev, struct iommu_domain *domain)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   return info && info->auxd_enabled &&
+   domain->type == IOMMU_DOMAIN_UNMANAGED;
+}
+
+static void auxiliary_link_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   domain->auxd_refcnt++;
+   list_add(&info->auxd, &domain->auxiliary_domains);
+}
+
+static void auxiliary_unlink_device(struct dmar_domain *domain,
+   struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   list_del(&info->auxd);
+   domain->auxd_refcnt--;
+
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);
+}
+
+static int aux_domain_add_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+   int ret;
+   u8 bus, devfn;
+   unsigned long flags;
+   struct intel_iommu *iommu;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+
+   if (domain->default_pasid <= 0) {
+   int pasid;
+
+   pasid = intel_pasid_alloc_id(domain, PASID_MIN,
+pci_max_pasids(to_pci_dev(dev)),
+GFP_KERNEL);
+   if (pasid <= 0) {
+   pr_err("Can't allocate default pasid\n");
+   return -ENODEV;
+   }
+   domain->default_pasid = pasid;
+   }
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   /*
+* iommu->lock must be held to attach domain to iommu and setup the
+* pasid entry for second level translation.
+*/
+   spin_lock(&iommu->lock);
+   ret = domain_attach_iommu(domain, iommu);
+   if (ret)
+   goto attach_failed;
+
+   /* Setup the PASID entry for mediated devices: */
+   ret = intel_pasid_setup_second_level(iommu, domain, dev,
+domain->default_pasid);
+   if (ret)
+   goto table_failed;
+   spin_unlock(&iommu->lock);
+
+   auxiliary_link_device(domain, dev);
+
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+
+table_failed:
+   domain_detach_iommu(domain, iommu);
+attach_failed:
+   spin_unlock(&iommu->lock);
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);
+
+   return ret;
+}
+
+static void aux_domain_remove_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info;
+   struct intel_iommu *iommu;
+   unsigned long flags;
+
+   if (!is_aux_domain(dev, &domain->domain))
+   return;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   iommu = info->iommu;

[PATCH v6 3/9] iommu/vt-d: Add per-device IOMMU feature ops entries

2019-02-12 Thread Lu Baolu
This adds the iommu ops entries for aux-domain per-device
feature query and enable/disable.
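
For reference, these entries end up wired into intel_iommu_ops
roughly as below (a sketch using the callback names from this
series; the corresponding hunk is cut off in the truncated diff
below):

	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,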

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 159 
 include/linux/intel-iommu.h |   1 +
 2 files changed, 160 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c4e0024c9736..a7e2238f869a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2477,6 +2477,7 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
info->domain = domain;
info->iommu = iommu;
info->pasid_table = NULL;
+   info->auxd_enabled = 0;
 
if (dev && dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(info->dev);
@@ -5211,6 +5212,42 @@ static phys_addr_t intel_iommu_iova_to_phys(struct 
iommu_domain *domain,
return phys;
 }
 
+static inline bool scalable_mode_support(void)
+{
+   struct dmar_drhd_unit *drhd;
+   struct intel_iommu *iommu;
+   bool ret = true;
+
+   rcu_read_lock();
+   for_each_active_iommu(iommu, drhd) {
+   if (!sm_supported(iommu)) {
+   ret = false;
+   break;
+   }
+   }
+   rcu_read_unlock();
+
+   return ret;
+}
+
+static inline bool iommu_pasid_support(void)
+{
+   struct dmar_drhd_unit *drhd;
+   struct intel_iommu *iommu;
+   bool ret = true;
+
+   rcu_read_lock();
+   for_each_active_iommu(iommu, drhd) {
+   if (!pasid_supported(iommu)) {
+   ret = false;
+   break;
+   }
+   }
+   rcu_read_unlock();
+
+   return ret;
+}
+
 static bool intel_iommu_capable(enum iommu_cap cap)
 {
if (cap == IOMMU_CAP_CACHE_COHERENCY)
@@ -5367,6 +5404,124 @@ struct intel_iommu *intel_svm_device_to_iommu(struct 
device *dev)
 }
 #endif /* CONFIG_INTEL_IOMMU_SVM */
 
+static int intel_iommu_enable_auxd(struct device *dev)
+{
+   struct device_domain_info *info;
+   struct intel_iommu *iommu;
+   unsigned long flags;
+   u8 bus, devfn;
+   int ret;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu || dmar_disabled)
+   return -EINVAL;
+
+   if (!sm_supported(iommu) || !pasid_supported(iommu))
+   return -EINVAL;
+
+   ret = intel_iommu_enable_pasid(iommu, dev);
+   if (ret)
+   return -ENODEV;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   info->auxd_enabled = 1;
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+}
+
+static int intel_iommu_disable_auxd(struct device *dev)
+{
+   struct device_domain_info *info;
+   unsigned long flags;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   if (!WARN_ON(!info))
+   info->auxd_enabled = 0;
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+}
+
+/*
+ * A PCI express designated vendor specific extended capability is defined
+ * in section 3.7 of the Intel Scalable I/O Virtualization technical spec,
+ * for system software and tools to detect endpoint devices supporting
+ * Intel Scalable I/O Virtualization without host driver dependency.
+ *
+ * Returns the address of the matching extended capability structure within
+ * the device's PCI configuration space or 0 if the device does not support
+ * it.
+ */
+static int siov_find_pci_dvsec(struct pci_dev *pdev)
+{
+   int pos;
+   u16 vendor, id;
+
+   pos = pci_find_next_ext_capability(pdev, 0, 0x23);
+   while (pos) {
+   pci_read_config_word(pdev, pos + 4, &vendor);
+   pci_read_config_word(pdev, pos + 8, &id);
+   if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
+   return pos;
+
+   pos = pci_find_next_ext_capability(pdev, pos, 0x23);
+   }
+
+   return 0;
+}
+
+static bool
+intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   if (feat == IOMMU_DEV_FEAT_AUX) {
+   int ret;
+
+   if (!dev_is_pci(dev) || dmar_disabled ||
+   !scalable_mode_support() || !iommu_pasid_support())
+   return false;
+
+   ret = pci_pasid_features(to_pci_dev(dev));
+   if (ret < 0)
+   return false;
+
+   return !!siov_find_pci_dvsec(to_pci_dev(dev));
+   }
+
+   return false;
+}
+
+static int
+intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   if (feat == IOMMU_DEV_FEAT_AUX)
+   return intel_iommu_enable_auxd(dev);
+
+   return -ENODEV;
+}
+
+static int
+intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   if (feat == IOMMU_DEV_FEAT_AUX)
+   return intel_iommu_disable_auxd(dev);
+
+   return -ENODEV;
+}

[PATCH v6 1/9] iommu: Add APIs for multiple domains per device

2019-02-12 Thread Lu Baolu
Sharing a physical PCI device at a finer granularity is
becoming a consensus in the industry, and IOMMU vendors
are working to support such sharing as well as possible.
Among these efforts, the capability to support
finer-granularity DMA isolation is a common requirement
for security reasons. With finer-granularity DMA
isolation, all DMA requests out of or to a subset of
a physical PCI device can be protected by the IOMMU. As
a result, software needs a way to attach multiple domains
to a physical PCI device. One example of such a usage
model is Intel Scalable IOV [1] [2]. The Intel VT-d 3.0
spec [3] introduces scalable mode, which enables PASID
granularity DMA isolation.

This adds the APIs to support multiple domains per device.
To ease discussion, we call a domain 'a domain in auxiliary
mode', or simply an 'auxiliary domain', when multiple
domains are attached to a physical device.

The APIs include:

* iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)
  - Check whether both IOMMU and device support IOMMU aux
domain feature. Below aux-domain specific interfaces
are available only after this returns true.

* iommu_dev_enable/disable_feature(dev, IOMMU_DEV_FEAT_AUX)
  - Enable/disable device specific aux-domain feature.

* iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)
  - Check whether the aux domain specific feature enabled or
not.

* iommu_aux_attach_device(domain, dev)
  - Attaches @domain to @dev in the auxiliary mode. Multiple
domains could be attached to a single device in the
auxiliary mode with each domain representing an isolated
address space for an assignable subset of the device.

* iommu_aux_detach_device(domain, dev)
  - Detach @domain which has been attached to @dev in the
auxiliary mode.

* iommu_aux_get_pasid(domain, dev)
  - Return ID used for finer-granularity DMA translation.
For the Intel Scalable IOV usage model, this will be
a PASID. The device which supports Scalable IOV needs
to write this ID to the device register so that DMA
requests could be tagged with the right PASID prefix.

This has been updated with the latest proposal from Joerg
posted here [5].

Many people were involved in the discussions of this design:

Kevin Tian 
Liu Yi L 
Ashok Raj 
Sanjay Kumar 
Jacob Pan 
Alex Williamson 
Jean-Philippe Brucker 
Joerg Roedel 

and some discussions can be found here [4] [5].

[1] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[2] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf
[3] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[4] https://lkml.org/lkml/2018/7/26/4
[5] https://www.spinics.net/lists/iommu/msg31874.html

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Kevin Tian 
Suggested-by: Jean-Philippe Brucker 
Suggested-by: Joerg Roedel 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/iommu.c | 91 +++
 include/linux/iommu.h | 70 +
 2 files changed, 161 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3ed4db334341..d0b323e8357f 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2033,3 +2033,94 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, 
int num_ids)
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
+
+/*
+ * Per device IOMMU features.
+ */
+bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_has_feat)
+   return ops->dev_has_feat(dev, feat);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_has_feature);
+
+int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_enable_feat)
+   return ops->dev_enable_feat(dev, feat);
+
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
+
+int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_disable_feat)
+   return ops->dev_disable_feat(dev, feat);
+
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
+
+bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features 
feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_feat_enabled)
+   return ops->dev_feat_enabled(dev, feat);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_feature_enabled);
+
+/*
+ * Aux-domain specific attach/detach.
+ *
+ * Only works if iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX) returns true.
+ * Also, as long a
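
The remainder of this hunk is cut off in the archive. For
reference, a sketch of the aux-domain wrappers consistent with
the aux_attach_dev/aux_detach_dev/aux_get_pasid ops used
elsewhere in this series (a reconstruction, not the verbatim
patch):

int iommu_aux_attach_device(struct iommu_domain *domain, struct device *dev)
{
	int ret = -ENODEV;

	if (domain->ops->aux_attach_dev)
		ret = domain->ops->aux_attach_dev(domain, dev);

	if (!ret)
		trace_attach_device_to_domain(dev);

	return ret;
}

void iommu_aux_detach_device(struct iommu_domain *domain, struct device *dev)
{
	if (domain->ops->aux_detach_dev) {
		domain->ops->aux_detach_dev(domain, dev);
		trace_detach_device_from_domain(dev);
	}
}

int iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	int ret = -ENODEV;

	if (domain->ops->aux_get_pasid)
		ret = domain->ops->aux_get_pasid(domain, dev);

	return ret;
}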

[PATCH v6 2/9] iommu/vt-d: Move enable pasid out of CONFIG_INTEL_IOMMU_SVM

2019-02-12 Thread Lu Baolu
This moves intel_iommu_enable_pasid() out of the scope of
CONFIG_INTEL_IOMMU_SVM, since more and more features require
the pasid function.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 22 +++---
 drivers/iommu/intel-svm.c   | 19 ++-
 include/linux/intel-iommu.h |  2 +-
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index af23cfc2a05e..c4e0024c9736 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5295,8 +5295,7 @@ static void intel_iommu_put_resv_regions(struct device 
*dev,
}
 }
 
-#ifdef CONFIG_INTEL_IOMMU_SVM
-int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev 
*sdev)
+int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
 {
struct device_domain_info *info;
struct context_entry *context;
@@ -5305,7 +5304,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, 
struct intel_svm_dev *sd
u64 ctx_lo;
int ret;
 
-   domain = get_valid_domain_for_dev(sdev->dev);
+   domain = get_valid_domain_for_dev(dev);
if (!domain)
return -EINVAL;
 
@@ -5313,7 +5312,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, 
struct intel_svm_dev *sd
spin_lock(&iommu->lock);
 
ret = -EINVAL;
-   info = sdev->dev->archdata.iommu;
+   info = dev->archdata.iommu;
if (!info || !info->pasid_supported)
goto out;
 
@@ -5322,15 +5321,13 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, 
struct intel_svm_dev *sd
goto out;
 
ctx_lo = context[0].lo;
-
-   sdev->did = domain->iommu_did[iommu->seq_id];
-   sdev->sid = PCI_DEVID(info->bus, info->devfn);
-
if (!(ctx_lo & CONTEXT_PASIDE)) {
ctx_lo |= CONTEXT_PASIDE;
context[0].lo = ctx_lo;
wmb();
-   iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
+   iommu->flush.flush_context(iommu,
+  domain->iommu_did[iommu->seq_id],
+  PCI_DEVID(info->bus, info->devfn),
   DMA_CCMD_MASK_NOBIT,
   DMA_CCMD_DEVICE_INVL);
}
@@ -5339,12 +5336,6 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, 
struct intel_svm_dev *sd
if (!info->pasid_enabled)
iommu_enable_dev_iotlb(info);
 
-   if (info->ats_enabled) {
-   sdev->dev_iotlb = 1;
-   sdev->qdep = info->ats_qdep;
-   if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
-   sdev->qdep = 0;
-   }
ret = 0;
 
  out:
@@ -5354,6 +5345,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, 
struct intel_svm_dev *sd
return ret;
 }
 
+#ifdef CONFIG_INTEL_IOMMU_SVM
 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
 {
struct intel_iommu *iommu;
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index c79540deaf00..ecc255ddf6ae 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -228,6 +228,7 @@ static LIST_HEAD(global_svm_list);
int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
 {
struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+   struct device_domain_info *info;
struct intel_svm_dev *sdev;
struct intel_svm *svm = NULL;
struct mm_struct *mm = NULL;
@@ -291,13 +292,29 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
}
sdev->dev = dev;
 
-   ret = intel_iommu_enable_pasid(iommu, sdev);
+   ret = intel_iommu_enable_pasid(iommu, dev);
if (ret || !pasid) {
/* If they don't actually want to assign a PASID, this is
 * just an enabling check/preparation. */
kfree(sdev);
goto out;
}
+
+   info = dev->archdata.iommu;
+   if (!info || !info->pasid_supported) {
+   kfree(sdev);
+   goto out;
+   }
+
+   sdev->did = info->domain->iommu_did[iommu->seq_id];
+   sdev->sid = PCI_DEVID(info->bus, info->devfn);
+   if (info->ats_enabled) {
+   sdev->dev_iotlb = 1;
+   sdev->qdep = info->ats_qdep;
+   if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
+   sdev->qdep = 0;
+   }
+
/* Finish the setup now we know we're keeping it */
sdev->users = 1;
sdev->ops = ops;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index fa364de9db18..b7d1e2fbb9ca 100644
--- a/include/linux/intel-iommu.h

[PATCH v6 0/9] vfio/mdev: IOMMU aware mediated device

2019-02-12 Thread Lu Baolu
PATCH 7/09~9/09) adds the iommu device
attribute to each mdev, determines isolation type according to the
existence of an iommu device when attaching group in vfio type1 iommu
module, and attaches the domain to iommu aware mediated devices.

References:
[1] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[2] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[3] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf

Best regards,
Lu Baolu

Change log:
  v5->v6:
  - Add a new API iommu_dev_feature_enabled() to check whether
an IOMMU specific feature is enabled.
  - Rework the vt-d specific per device feature ops according
to Joerg's comments. [https://lkml.org/lkml/2019/1/11/302].
  - PATCH 2/9 is added to move intel_iommu_enable_pasid() out
of the scope of CONFIG_INTEL_IOMMU_SVM without functional
changes.
  - All patches are rebased on top of vt-d branch of Joerg's
iommu tree.

  v4->v5:
  - The iommu APIs have been updated with Joerg's proposal posted
here https://www.spinics.net/lists/iommu/msg31874.html.
  - Some typos in commit message and comments have been fixed.
  - PATCH 3/8 was split from 4/8 to ease code review.
  - mdev->domain was removed and could be brought back when there's a
real consumer.
  - Addressed other code review comments I received during the v4
review period, except the EXPORT_SYMBOL vs. EXPORT_SYMBOL_GPL one
in PATCH 6/8.
  - Rebase all patches to 5.0-rc1.

  v3->v4:
  - Use aux domain specific interfaces for domain attach and detach.
  - Rebase all patches to 4.20-rc1.

  v2->v3:
  - Remove domain type enum and use a pointer on mdev_device instead.
  - Add a generic interface for getting/setting per device iommu
attributes, and use it for querying the aux domain capability,
enabling the aux domain and disabling the aux domain.
  - Reuse iommu_domain_get_attr() to retrieve the id in an aux domain.
  - We discussed the impact of the default domain implementation
on reusing iommu_at(de)tach_device() interfaces. We agreed
that reusing iommu_at(de)tach_device() interfaces is the right
direction and we could tweak the code to remove the impact.
https://www.spinics.net/lists/kvm/msg175285.html  
  - Removed the RFC tag since no objections received.
  - This patch has been submitted separately.
https://www.spinics.net/lists/kvm/msg173936.html

  v1->v2:
  - Rewrite the patches with the concept of auxiliary domains.

Lu Baolu (9):
  iommu: Add APIs for multiple domains per device
  iommu/vt-d: Move enable pasid out of CONFIG_INTEL_IOMMU_SVM
  iommu/vt-d: Add per-device IOMMU feature ops entries
  iommu/vt-d: Move common code out of iommu_attach_device()
  iommu/vt-d: Aux-domain specific domain attach/detach
  iommu/vt-d: Return ID associated with an auxiliary domain
  vfio/mdev: Add iommu related member in mdev_device
  vfio/type1: Add domain at(de)taching group helpers
  vfio/type1: Handle different mdev isolation type

 drivers/iommu/intel-iommu.c  | 399 ---
 drivers/iommu/intel-svm.c|  19 +-
 drivers/iommu/iommu.c|  91 +++
 drivers/vfio/mdev/mdev_core.c|  18 ++
 drivers/vfio/mdev/mdev_private.h |   1 +
 drivers/vfio/vfio_iommu_type1.c  | 132 --
 include/linux/intel-iommu.h  |  13 +-
 include/linux/iommu.h|  70 ++
 include/linux/mdev.h |  14 ++
 9 files changed, 704 insertions(+), 53 deletions(-)

-- 
2.17.1



Re: [PATCH v1 4/7] iommu/vt-d: Remove unnecessary local variable initializations

2019-02-10 Thread Lu Baolu

Hi,

On 2/11/19 11:33 AM, Bjorn Helgaas wrote:

On Sun, Feb 10, 2019 at 8:00 PM Lu Baolu  wrote:


Hi,

On 2/9/19 6:06 AM, Bjorn Helgaas wrote:

From: Bjorn Helgaas 

A local variable initialization is a hint that the variable will be used in
an unusual way.  If the initialization is unnecessary, that hint becomes a
distraction.

Remove unnecessary initializations.  No functional change intended.

Signed-off-by: Bjorn Helgaas 
---
   drivers/iommu/intel-iommu.c |   27 +--
   1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 81077803880f..2acd08c82cdc 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -865,7 +865,7 @@ static void free_context_table(struct intel_iommu *iommu)
   static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 unsigned long pfn, int *target_level)
   {
- struct dma_pte *parent, *pte = NULL;
+ struct dma_pte *parent, *pte;
   int level = agaw_to_level(domain->agaw);
   int offset;

@@ -922,7 +922,7 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain 
*domain,
unsigned long pfn,
int level, int *large_page)
   {
- struct dma_pte *parent, *pte = NULL;
+ struct dma_pte *parent, *pte;
   int total = agaw_to_level(domain->agaw);
   int offset;

@@ -954,7 +954,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
   unsigned long start_pfn,
   unsigned long last_pfn)
   {
- unsigned int large_page = 1;
+ unsigned int large_page;
   struct dma_pte *first_pte, *pte;

   BUG_ON(!domain_pfn_supported(domain, start_pfn));
@@ -1132,7 +1132,7 @@ static struct page *domain_unmap(struct dmar_domain 
*domain,
unsigned long start_pfn,
unsigned long last_pfn)
   {
- struct page *freelist = NULL;
+ struct page *freelist;


I am afraid this change might cause a problem. "freelist" might go through
dma_pte_clear_level() without being touched.


Thanks for your review!  Can you clarify your concern?  "freelist"
isn't passed into dma_pte_clear_level().  Here's the existing code


Oh!

Yes, you are right. I confused it with another function. Sorry about it.

Best regards,
Lu Baolu


Re: [PATCH v1 4/7] iommu/vt-d: Remove unnecessary local variable initializations

2019-02-10 Thread Lu Baolu

Hi,

On 2/9/19 6:06 AM, Bjorn Helgaas wrote:

From: Bjorn Helgaas 

A local variable initialization is a hint that the variable will be used in
an unusual way.  If the initialization is unnecessary, that hint becomes a
distraction.

Remove unnecessary initializations.  No functional change intended.

Signed-off-by: Bjorn Helgaas 
---
  drivers/iommu/intel-iommu.c |   27 +--
  1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 81077803880f..2acd08c82cdc 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -865,7 +865,7 @@ static void free_context_table(struct intel_iommu *iommu)
  static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
  unsigned long pfn, int *target_level)
  {
-   struct dma_pte *parent, *pte = NULL;
+   struct dma_pte *parent, *pte;
int level = agaw_to_level(domain->agaw);
int offset;
  
@@ -922,7 +922,7 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,

 unsigned long pfn,
 int level, int *large_page)
  {
-   struct dma_pte *parent, *pte = NULL;
+   struct dma_pte *parent, *pte;
int total = agaw_to_level(domain->agaw);
int offset;
  
@@ -954,7 +954,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain,

unsigned long start_pfn,
unsigned long last_pfn)
  {
-   unsigned int large_page = 1;
+   unsigned int large_page;
struct dma_pte *first_pte, *pte;
  
  	BUG_ON(!domain_pfn_supported(domain, start_pfn));

@@ -1132,7 +1132,7 @@ static struct page *domain_unmap(struct dmar_domain 
*domain,
 unsigned long start_pfn,
 unsigned long last_pfn)
  {
-   struct page *freelist = NULL;
+   struct page *freelist;


I am afraid this change might cause a problem. "freelist" might go through
dma_pte_clear_level() without being touched.

Best regards,
Lu Baolu



Re: [PATCH 1/1] iommu/vt-d: Leave scalable mode default off

2019-01-24 Thread Lu Baolu

Hi Joerg,

On 1/24/19 9:22 PM, Joerg Roedel wrote:

On Thu, Jan 24, 2019 at 10:31:32AM +0800, Lu Baolu wrote:

Commit 765b6a98c1de3 ("iommu/vt-d: Enumerate the scalable
mode capability") enables VT-d scalable mode if hardware
advertises the capability. As we will bring up different
features and use cases upstream in different patch
series, there will be some intermediate kernel versions
which support only partial features. Hence, end users
might run into problems when they use such kernels on
bare metal or in virtualization environments.


I don't get it, can you be more specific about the problems that users
might run into?


Sorry, I didn't make it clear in the message.

Around VT-d scalable mode, we plan to enable several features.
For example,

(1) basic scalable mode support;
(2) aux domain;
(3) system level pasid allocation;


Since they will be submitted in different patch series for reviewing and
merging, users will face compatibility problems. For example, when users
run kernel v5.0, they might fail to assign an ADI (Assignable Device
Interface) to a VM because the aux domain is not included yet. They will
complain "I have a kernel that claims to support scalable mode, but when
I tried to assign an ADI to a VM, ...".

So we decided to leave it off by default and turn it on by default later
when all the features get merged. Users could try scalable mode features
with "intel_iommu=sm_on" on the kernel command line.


And is this patch needed as a fix for v5.0 or is it just
a precaution because future patches might break something for users?


It would be better if it could be a fix for v5.0.

Best regards,
Lu Baolu


Re: [PATCH v5 2/8] iommu/vt-d: Add per-device IOMMU feature ops entries

2019-01-23 Thread Lu Baolu

Hi Joerg,

On 1/11/19 7:16 PM, Joerg Roedel wrote:

+
+static bool
+intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   if (feat == IOMMU_DEV_FEAT_AUX)
+   return scalable_mode_support() && info && info->auxd_enabled;
+
+   return false;
+}

Why is this checking the auxd_enabled flag? The function should just
return whether the device _supports_ scalable mode, not whether it is
enabled.


Yes, as the API name implies, it should return the device capability
instead of enable/disable status. I misused this API in the IOMMU
driver.

Since we already have iommu_dev_enable/disable_feature() to enable and
disable an iommu specific feature, is it possible to add another API to
query whether a specific feature has been enabled?

How about

bool iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX)?

This is necessary for the third party drivers (like vfio) to determine
which domain attach interface it should use:

if (iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
iommu_aux_attach_device(domain, dev)
else
iommu_attach_device(domain, dev)


Best regards,
Lu Baolu


[PATCH 1/1] iommu/vt-d: Leave scalable mode default off

2019-01-23 Thread Lu Baolu
Commit 765b6a98c1de3 ("iommu/vt-d: Enumerate the scalable
mode capability") enables VT-d scalable mode if hardware
advertises the capability. As we will bring up different
features and use cases upstream in different patch
series, there will be some intermediate kernel versions
which support only partial features. Hence, end users
might run into problems when they use such kernels on
bare metal or in virtualization environments.

This leaves scalable mode off by default, so that end users
can turn it on with "intel_iommu=sm_on" only when they have
a clear idea of which scalable mode features are supported
in the kernel.

Cc: Liu Yi L 
Cc: Jacob Pan 
Suggested-by: Ashok Raj 
Suggested-by: Kevin Tian 
Signed-off-by: Lu Baolu 
---
 Documentation/admin-guide/kernel-parameters.txt | 7 +++
 drivers/iommu/intel-iommu.c | 8 
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index b799bcf67d7b..858b6c0b9a15 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1696,12 +1696,11 @@
By default, super page will be supported if Intel IOMMU
has the capability. With this option, super page will
not be supported.
-   sm_off [Default Off]
-   By default, scalable mode will be supported if the
+   sm_on [Default Off]
+   By default, scalable mode will be disabled even if the
hardware advertises that it has support for the scalable
mode translation. With this option set, scalable mode
-   will not be used even on hardware which claims to support
-   it.
+   will be used on hardware which claims to support it.
tboot_noforce [Default Off]
Do not force the Intel IOMMU enabled under tboot.
By default, tboot will force Intel IOMMU on, which
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 2bd9ac285c0d..8d31d3e25e30 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -363,7 +363,7 @@ static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
 static int intel_iommu_superpage = 1;
-static int intel_iommu_sm = 1;
+static int intel_iommu_sm;
 static int iommu_identity_mapping;
 
 #define IDENTMAP_ALL   1
@@ -456,9 +456,9 @@ static int __init intel_iommu_setup(char *str)
} else if (!strncmp(str, "sp_off", 6)) {
pr_info("Disable supported super page\n");
intel_iommu_superpage = 0;
-   } else if (!strncmp(str, "sm_off", 6)) {
-   pr_info("Intel-IOMMU: disable scalable mode support\n");
-   intel_iommu_sm = 0;
+   } else if (!strncmp(str, "sm_on", 5)) {
+   pr_info("Intel-IOMMU: scalable mode supported\n");
+   intel_iommu_sm = 1;
} else if (!strncmp(str, "tboot_noforce", 13)) {
printk(KERN_INFO
"Intel-IOMMU: not forcing on after tboot. This 
could expose security risk for tboot\n");
-- 
2.17.1



Re: [PATCH v5 4/8] iommu/vt-d: Aux-domain specific domain attach/detach

2019-01-14 Thread Lu Baolu

Hi,

On 1/14/19 8:26 PM, Jonathan Cameron wrote:

On Thu, 10 Jan 2019 11:00:23 +0800
Lu Baolu  wrote:


When multiple domains per device have been enabled by the
device driver, the device will tag the default PASID of
the domain onto all DMA traffic out of the corresponding
subset of the device, and the IOMMU will translate the
DMA requests at PASID granularity.

This adds the intel_iommu_aux_attach/detach_device() ops
to support managing PASID granular translation structures
when the device driver has enabled multiple domains per
device.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 


The following is probably a rather naive review given I don't know
the driver or hardware well at all.  Still, it seems like things
are a lot less balanced than I'd expect, and it isn't totally obvious
to me why that is.


Thank you!




---
  drivers/iommu/intel-iommu.c | 152 
  include/linux/intel-iommu.h |  10 +++
  2 files changed, 162 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e9119d45a29d..b8fb6a4bd447 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2482,6 +2482,7 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
info->iommu = iommu;
info->pasid_table = NULL;
info->auxd_enabled = 0;
+   INIT_LIST_HEAD(&info->auxiliary_domains);
  
  	if (dev && dev_is_pci(dev)) {

struct pci_dev *pdev = to_pci_dev(info->dev);
@@ -5058,6 +5059,131 @@ static void intel_iommu_domain_free(struct iommu_domain 
*domain)
domain_exit(to_dmar_domain(domain));
  }
  
+/*

+ * Check whether a @domain could be attached to the @dev through the
+ * aux-domain attach/detach APIs.
+ */
+static inline bool
+is_aux_domain(struct device *dev, struct iommu_domain *domain)


I'm finding the distinction between an aux domain capability on
a given device and whether one is actually in use to be obscured
slightly in the function naming.

This one for example is actually checking if we have a domain
that is capable of being enabled for aux domain use, but not
yet actually in that mode?

Mind you I'm not sure I have a better answer for the naming.
can_aux_domain_be_enabled?  is_unattached_aux_domain?




device aux mode vs. normal mode
===============================

When we talk about the auxiliary mode (simply aux-mode), it means "the
device works in aux-mode or normal mode". "Normal mode" means that the
device (and its corresponding IOMMU) supports only RID (PCI Request ID)
based DMA translation; while aux-mode means that the device (and its
IOMMU) supports fine-grained DMA translation, like PASID based DMA
translation with Intel VT-d scalable mode.

We are adding the following APIs to switch a device between these two modes:

int iommu_dev_enable/disable_feature(dev, IOMMU_DEV_FEAT_AUX)

And this API (still under discussion) to check which mode the device is
working in:

bool iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)

aux-domain
==========

If a device is working in aux-mode and we are going to attach a domain
to this device, we say "this domain will be attached to the device in
aux mode", or simply "aux domain". So a domain is "normal" when it is
going to be attached to a device in normal mode, and is an "aux-domain"
when it is going to be attached to a device in aux mode.
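
Putting the two concepts together, the intended calling sequence is
roughly the sketch below (per the API semantics proposed in this
series; error handling omitted):

	struct iommu_domain *domain = iommu_domain_alloc(dev->bus);

	if (iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)) {
		/* switch the device to aux-mode, then attach as aux-domain */
		iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
		iommu_aux_attach_device(domain, dev);
	} else {
		/* normal mode: RID based translation */
		iommu_attach_device(domain, dev);
	}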




+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   return info && info->auxd_enabled &&
+   domain->type == IOMMU_DOMAIN_UNMANAGED;
+}
+
+static void auxiliary_link_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   domain->auxd_refcnt++;
+   list_add(&info->auxd, &domain->auxiliary_domains);
+}
+
+static void auxiliary_unlink_device(struct dmar_domain *domain,
+   struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   list_del(&info->auxd);
+   domain->auxd_refcnt--;
+
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);


This seems unbalanced wrt what is happening in auxiliary_link_device.
If this is necessary then it would be good to have comments saying why.
To my uninformed eye, it looks like we could do this at the end of
aux_domain_remove_dev, except that we need to hold the lock.
As such perhaps it makes sense to do the pasid allocation under that
lock in the first place?

I'm not 100% sure, but is there a ra

Re: [PATCH v5 1/8] iommu: Add APIs for multiple domains per device

2019-01-14 Thread Lu Baolu

Hi,

On 1/14/19 7:22 PM, Jonathan Cameron wrote:

On Thu, 10 Jan 2019 11:00:20 +0800
Lu Baolu  wrote:


Sharing a physical PCI device in a finer-granularity way
is becoming a consensus in the industry. IOMMU vendors
are also engaging efforts to support such sharing as well
as possible. Among the efforts, the capability of support
finer-granularity DMA isolation is a common requirement
due to the security consideration. With finer-granularity
DMA isolation, all DMA requests out of or to a subset of
a physical PCI device can be protected by the IOMMU. As a
result, there is a request in software to attach multiple
domains to a physical PCI device. One example of such use
model is the Intel Scalable IOV [1] [2]. The Intel vt-d
3.0 spec [3] introduces the scalable mode which enables
PASID granularity DMA isolation.

This adds the APIs to support multiple domains per device.
In order to ease the discussions, we call it 'a domain in
auxiliary mode' or simply 'auxiliary domain' when multiple
domains are attached to a physical device.

The APIs include:

* iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)
   - Check whether both IOMMU and device support IOMMU aux
 domain feature. Below aux-domain specific interfaces
 are available only after this returns true.

* iommu_dev_enable/disable_feature(dev, IOMMU_DEV_FEAT_AUX)
   - Enable/disable device specific aux-domain feature.

* iommu_aux_attach_device(domain, dev)
   - Attaches @domain to @dev in the auxiliary mode. Multiple
 domains could be attached to a single device in the
 auxiliary mode with each domain representing an isolated
 address space for an assignable subset of the device.

* iommu_aux_detach_device(domain, dev)
   - Detach @domain which has been attached to @dev in the
 auxiliary mode.

* iommu_aux_get_pasid(domain, dev)
   - Return ID used for finer-granularity DMA translation.
 For the Intel Scalable IOV usage model, this will be
 a PASID. The device which supports Scalable IOV needs
 to write this ID to the device register so that DMA
 requests could be tagged with a right PASID prefix.

This has been updated with the latest proposal from Joerg
posted here [5].

Many people involved in discussions of this design.

Kevin Tian 
Liu Yi L 
Ashok Raj 
Sanjay Kumar 
Jacob Pan 
Alex Williamson 
Jean-Philippe Brucker 
Joerg Roedel 

and some discussions can be found here [4] [5].

[1] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[2] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf
[3] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[4] https://lkml.org/lkml/2018/7/26/4
[5] https://www.spinics.net/lists/iommu/msg31874.html

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Kevin Tian 
Suggested-by: Jean-Philippe Brucker 
Suggested-by: Joerg Roedel 
Signed-off-by: Lu Baolu 


One trivial comment inline.


Thank you!




---
  drivers/iommu/iommu.c | 80 +++
  include/linux/iommu.h | 61 +
  2 files changed, 141 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3ed4db334341..9166b6145409 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2033,3 +2033,83 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, 
int num_ids)
return 0;
  }
  EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
+
+/*
+ * Per device IOMMU features.
+ */
+bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_has_feat)
+   return ops->dev_has_feat(dev, feat);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_has_feature);
+
+int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_enable_feat)
+   return ops->dev_enable_feat(dev, feat);
+
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
+
+int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_disable_feat)
+   return ops->dev_disable_feat(dev, feat);
+
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
+
+/*
+ * Aux-domain specific attach/detach.
+ *
+ * Only works if iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX) returns true.
+ * Also, as long as domains are attached to a device through this interface,
+ * any tries to call iommu_attach_device() should fail (iommu_detach_device()
+ * can't fail, so we fail on the tryint to re-attach). This should make us safe


when trying to re-attach. (perhaps?)


Yes. I will fix it.

Best regards,

Re: [PATCH v5 2/8] iommu/vt-d: Add per-device IOMMU feature ops entries

2019-01-13 Thread Lu Baolu

Hi Joerg,

Thanks for reviewing my patch.

On 1/11/19 7:16 PM, Joerg Roedel wrote:

Hi,

this looks a bit confusing to me because I can see no checking whether
the device actually supports scalable mode.


Yes. I should put some checks there. The device scalable mode capability
is exposed in the PCI extended capability list.


More below:

On Thu, Jan 10, 2019 at 11:00:21AM +0800, Lu Baolu wrote:

+static int intel_iommu_enable_auxd(struct device *dev)
+{
+   struct device_domain_info *info;
+   struct dmar_domain *domain;
+   unsigned long flags;
+
+   if (!scalable_mode_support())
+   return -ENODEV;
+
+   domain = get_valid_domain_for_dev(dev);
+   if (!domain)
+   return -ENODEV;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   info->auxd_enabled = 1;
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+}


This code sets a flag to mark scalable mode enabled. Doesn't the device
need some handling too, like enabling the PASID capability and all?


Yes. My design was rough. We should prepare the device for scalable mode
instead of assuming that everything is ready.
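
Something along the lines of the sketch below is what I mean, assuming
a PCI device (a sketch, not the final code):

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		/* actually enable the PASID capability on the device */
		if (pci_enable_pasid(pdev, 0))
			return -ENODEV;
	}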




+
+static bool
+intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   if (feat == IOMMU_DEV_FEAT_AUX)
+   return scalable_mode_support() && info && info->auxd_enabled;
+
+   return false;
+}


Why is this checking the auxd_enabled flag?


We need an API to check whether this feature is enabled. In vfio, it
is used like below,

if (iommu_dev_has_feat(dev, IOMMU_DEV_FEAT_AUX))
	iommu_aux_attach_device(domain, dev)
else
	iommu_attach_device(domain, dev)


The function should just
return whether the device _supports_ scalable mode, not whether it is
enabled.


Do we want to have an API to tell whether a device has the aux-domain
feature? It could be included in the enable API. The enable API returns
failure if the device doesn't support aux-domain.



Regards,

        Joerg



Best regards,
Lu Baolu


[PATCH 1/1] iommu/vt-d: Support page request in scalable mode

2019-01-10 Thread Lu Baolu
From: Jacob Pan 

VT-d Rev3.0 has made a few changes to the page request interface,

1. widened PRQ descriptor from 128 bits to 256 bits;
2. removed streaming response type;
3. introduced private data that requires a page response even
   when the request is not the last request in the group (LPIG).

This is a supplement to commit 1c4f88b7f1f92 ("iommu/vt-d: Shared
virtual address in scalable mode") and makes the svm code compliant
with VT-d Rev3.0.

Cc: Ashok Raj 
Cc: Liu Yi L 
Cc: Kevin Tian 
Signed-off-by: Jacob Pan 
Fixes: 1c4f88b7f1f92 ("iommu/vt-d: Shared virtual address in scalable mode")
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-svm.c   | 77 ++---
 include/linux/intel-iommu.h | 21 +-
 include/linux/intel-svm.h   |  2 +-
 3 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index a2a2aa4439aa..79add5716552 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -470,20 +470,31 @@ EXPORT_SYMBOL_GPL(intel_svm_is_pasid_valid);
 
 /* Page request queue descriptor */
 struct page_req_dsc {
-   u64 srr:1;
-   u64 bof:1;
-   u64 pasid_present:1;
-   u64 lpig:1;
-   u64 pasid:20;
-   u64 bus:8;
-   u64 private:23;
-   u64 prg_index:9;
-   u64 rd_req:1;
-   u64 wr_req:1;
-   u64 exe_req:1;
-   u64 priv_req:1;
-   u64 devfn:8;
-   u64 addr:52;
+   union {
+   struct {
+   u64 type:8;
+   u64 pasid_present:1;
+   u64 priv_data_present:1;
+   u64 rsvd:6;
+   u64 rid:16;
+   u64 pasid:20;
+   u64 exe_req:1;
+   u64 pm_req:1;
+   u64 rsvd2:10;
+   };
+   u64 qw_0;
+   };
+   union {
+   struct {
+   u64 rd_req:1;
+   u64 wr_req:1;
+   u64 lpig:1;
+   u64 prg_index:9;
+   u64 addr:52;
+   };
+   u64 qw_1;
+   };
+   u64 priv_data[2];
 };
 
 #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x10)
@@ -596,7 +607,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
/* Accounting for major/minor faults? */
rcu_read_lock();
list_for_each_entry_rcu(sdev, &svm->devs, list) {
-   if (sdev->sid == PCI_DEVID(req->bus, req->devfn))
+   if (sdev->sid == req->rid)
break;
}
/* Other devices can go away, but the drivers are not permitted
@@ -609,33 +620,35 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 
if (sdev && sdev->ops && sdev->ops->fault_cb) {
int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
-   (req->exe_req << 1) | (req->priv_req);
-   sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, req->private, rwxp, result);
+   (req->exe_req << 1) | (req->pm_req);
+   sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr,
+   req->priv_data, rwxp, result);
}
/* We get here in the error case where the PASID lookup failed,
   and these can be NULL. Do not use them below this point! */
sdev = NULL;
svm = NULL;
no_pasid:
-   if (req->lpig) {
-   /* Page Group Response */
+   if (req->lpig || req->priv_data_present) {
+   /*
+* Per VT-d spec. v3.0 ch7.7, system software must
+* respond with page group response if private data
+* is present (PDP) or last page in group (LPIG) bit
+* is set. This is an additional VT-d feature beyond
+* PCI ATS spec.
+*/
resp.qw0 = QI_PGRP_PASID(req->pasid) |
-   QI_PGRP_DID((req->bus << 8) | req->devfn) |
+   QI_PGRP_DID(req->rid) |
QI_PGRP_PASID_P(req->pasid_present) |
+   QI_PGRP_PDP(req->pasid_present) |
+   QI_PGRP_RESP_CODE(result) |
QI_PGRP_RESP_TYPE;
resp.qw1 = QI_PGRP_IDX(req->prg_index) |
-   QI_PGRP_PRIV(req->private) |
-   QI_PGRP_RESP_CODE(result);
-   } else if (req->srr) {

[PATCH v5 6/8] vfio/mdev: Add iommu related member in mdev_device

2019-01-09 Thread Lu Baolu
A parent device might create different types of mediated
devices. For example, a mediated device could be created
by the parent device with full isolation and protection
provided by the IOMMU. One use case can be found on Intel
platforms, where a mediated device is an assignable subset
of a PCI device and the DMA requests on its behalf are all
tagged with a PASID. Since the IOMMU supports PASID-granular
translations (scalable mode in VT-d 3.0), this mediated
device can be individually protected and isolated by the
IOMMU.

This patch adds a new member in the struct mdev_device to
indicate that the mediated device represented by mdev could
be isolated and protected by attaching a domain to a device
represented by mdev->iommu_device. It also adds helpers to
set or get the iommu device.

* mdev_device->iommu_device
  - This, if set, indicates that the mediated device could
be fully isolated and protected by IOMMU via attaching
an iommu domain to this device. If empty, it indicates
using vendor defined isolation, hence bypass IOMMU.

* mdev_set/get_iommu_device(dev, iommu_device)
  - Set or get the iommu device which represents this mdev
in IOMMU's device scope. Drivers don't need to set the
iommu device if it uses vendor defined isolation.
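
For example, a parent driver whose mediated devices are IOMMU backed
could wire this up in its create callback like the sketch below
(my_parent_pci_dev() is a hypothetical helper of the parent driver):

	static int my_create(struct kobject *kobj, struct mdev_device *mdev)
	{
		struct pci_dev *pdev = my_parent_pci_dev(mdev);

		/* mark this mdev as isolated and protected by the IOMMU */
		return mdev_set_iommu_device(mdev_dev(mdev), &pdev->dev);
	}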

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Kevin Tian 
Suggested-by: Alex Williamson 
Signed-off-by: Lu Baolu 
---
 drivers/vfio/mdev/mdev_core.c| 18 ++
 drivers/vfio/mdev/mdev_private.h |  1 +
 include/linux/mdev.h | 14 ++
 3 files changed, 33 insertions(+)

diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 0212f0ee8aea..9be58d392d2b 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -390,6 +390,24 @@ int mdev_device_remove(struct device *dev, bool 
force_remove)
return 0;
 }
 
+int mdev_set_iommu_device(struct device *dev, struct device *iommu_device)
+{
+   struct mdev_device *mdev = to_mdev_device(dev);
+
+   mdev->iommu_device = iommu_device;
+
+   return 0;
+}
+EXPORT_SYMBOL(mdev_set_iommu_device);
+
+struct device *mdev_get_iommu_device(struct device *dev)
+{
+   struct mdev_device *mdev = to_mdev_device(dev);
+
+   return mdev->iommu_device;
+}
+EXPORT_SYMBOL(mdev_get_iommu_device);
+
 static int __init mdev_init(void)
 {
return mdev_bus_register();
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index b5819b7d7ef7..891841862ef8 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -34,6 +34,7 @@ struct mdev_device {
struct list_head next;
struct kobject *type_kobj;
bool active;
+   struct device *iommu_device;
 };
 
 #define to_mdev_device(dev)container_of(dev, struct mdev_device, dev)
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index b6e048e1045f..c3ab8a9cfcc7 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -15,6 +15,20 @@
 
 struct mdev_device;
 
+/*
+ * Called by the parent device driver to set the device which represents
+ * this mdev in iommu protection scope. By default, the iommu device is
+ * NULL, that indicates using vendor defined isolation.
+ *
+ * @dev: the mediated device that iommu will isolate.
+ * @iommu_device: a pci device which represents the iommu for @dev.
+ *
+ * Return 0 for success, otherwise negative error value.
+ */
+int mdev_set_iommu_device(struct device *dev, struct device *iommu_device);
+
+struct device *mdev_get_iommu_device(struct device *dev);
+
 /**
 * struct mdev_parent_ops - Structure to be registered for each parent device to
  * register the device to mdev module.
-- 
2.17.1



[PATCH v5 8/8] vfio/type1: Handle different mdev isolation type

2019-01-09 Thread Lu Baolu
This adds the support to determine the isolation type
of a mediated device group by checking whether it has
an iommu device. If an iommu device exists, an iommu
domain will be allocated and then attached to the iommu
device. Otherwise, the behavior remains the same as before.
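
The attach path change below boils down to the following sketch, where
handle_as_external_domain() stands in for the existing external-domain
bookkeeping:

	struct device *iommu_device = NULL;

	if (vfio_bus_is_mdev(bus)) {
		ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
					       vfio_mdev_iommu_device);
		if (ret || !iommu_device)
			/* no backing iommu device: vendor defined isolation */
			return handle_as_external_domain();

		/* IOMMU backed mdev: allocate the domain on its bus */
		bus = iommu_device->bus;
	}

	domain->domain = iommu_domain_alloc(bus);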

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/vfio/vfio_iommu_type1.c | 48 -
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 97278ac8da95..140366014a1b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1368,13 +1368,40 @@ static void vfio_iommu_detach_group(struct vfio_domain 
*domain,
iommu_detach_group(domain->domain, group->iommu_group);
 }
 
+static bool vfio_bus_is_mdev(struct bus_type *bus)
+{
+   struct bus_type *mdev_bus;
+   bool ret = false;
+
+   mdev_bus = symbol_get(mdev_bus_type);
+   if (mdev_bus) {
+   ret = (bus == mdev_bus);
+   symbol_put(mdev_bus_type);
+   }
+
+   return ret;
+}
+
+static int vfio_mdev_iommu_device(struct device *dev, void *data)
+{
+   struct device **old = data, *new;
+
+   new = vfio_mdev_get_iommu_device(dev);
+   if (!new || (*old && *old != new))
+   return -EINVAL;
+
+   *old = new;
+
+   return 0;
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 struct iommu_group *iommu_group)
 {
struct vfio_iommu *iommu = iommu_data;
struct vfio_group *group;
struct vfio_domain *domain, *d;
-   struct bus_type *bus = NULL, *mdev_bus;
+   struct bus_type *bus = NULL;
int ret;
bool resv_msi, msi_remap;
phys_addr_t resv_msi_base;
@@ -1409,23 +1436,30 @@ static int vfio_iommu_type1_attach_group(void 
*iommu_data,
if (ret)
goto out_free;
 
-   mdev_bus = symbol_get(mdev_bus_type);
+   if (vfio_bus_is_mdev(bus)) {
+   struct device *iommu_device = NULL;
 
-   if (mdev_bus) {
-   if ((bus == mdev_bus) && !iommu_present(bus)) {
-   symbol_put(mdev_bus_type);
+   group->mdev_group = true;
+
+   /* Determine the isolation type */
+   ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
+  vfio_mdev_iommu_device);
+   if (ret || !iommu_device) {
if (!iommu->external_domain) {
INIT_LIST_HEAD(&domain->group_list);
iommu->external_domain = domain;
-   } else
+   } else {
kfree(domain);
+   }
 
list_add(&group->next,
 &iommu->external_domain->group_list);
mutex_unlock(>lock);
+
return 0;
}
-   symbol_put(mdev_bus_type);
+
+   bus = iommu_device->bus;
}
 
domain->domain = iommu_domain_alloc(bus);
-- 
2.17.1



[PATCH v5 4/8] iommu/vt-d: Aux-domain specific domain attach/detach

2019-01-09 Thread Lu Baolu
When multiple domains per device have been enabled by the
device driver, the device will tag the default PASID of
the domain onto all DMA traffic out of the corresponding
subset of the device, and the IOMMU will translate the
DMA requests at PASID granularity.

This adds the intel_iommu_aux_attach/detach_device() ops
to support managing PASID granular translation structures
when the device driver has enabled multiple domains per
device.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 152 
 include/linux/intel-iommu.h |  10 +++
 2 files changed, 162 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e9119d45a29d..b8fb6a4bd447 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2482,6 +2482,7 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
info->iommu = iommu;
info->pasid_table = NULL;
info->auxd_enabled = 0;
+   INIT_LIST_HEAD(&info->auxiliary_domains);
 
if (dev && dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(info->dev);
@@ -5058,6 +5059,131 @@ static void intel_iommu_domain_free(struct iommu_domain 
*domain)
domain_exit(to_dmar_domain(domain));
 }
 
+/*
+ * Check whether a @domain could be attached to the @dev through the
+ * aux-domain attach/detach APIs.
+ */
+static inline bool
+is_aux_domain(struct device *dev, struct iommu_domain *domain)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   return info && info->auxd_enabled &&
+   domain->type == IOMMU_DOMAIN_UNMANAGED;
+}
+
+static void auxiliary_link_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   domain->auxd_refcnt++;
+   list_add(&info->auxd, &domain->auxiliary_domains);
+}
+
+static void auxiliary_unlink_device(struct dmar_domain *domain,
+   struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   list_del(&info->auxd);
+   domain->auxd_refcnt--;
+
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);
+}
+
+static int aux_domain_add_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+   int ret;
+   u8 bus, devfn;
+   unsigned long flags;
+   struct intel_iommu *iommu;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+
+   if (domain->default_pasid <= 0) {
+   int pasid;
+
+   pasid = intel_pasid_alloc_id(domain, PASID_MIN,
+pci_max_pasids(to_pci_dev(dev)),
+GFP_KERNEL);
+   if (pasid <= 0) {
+   pr_err("Can't allocate default pasid\n");
+   return -ENODEV;
+   }
+   domain->default_pasid = pasid;
+   }
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   /*
+* iommu->lock must be held to attach domain to iommu and setup the
+* pasid entry for second level translation.
+*/
+   spin_lock(&iommu->lock);
+   ret = domain_attach_iommu(domain, iommu);
+   if (ret)
+   goto attach_failed;
+
+   /* Setup the PASID entry for mediated devices: */
+   ret = intel_pasid_setup_second_level(iommu, domain, dev,
+domain->default_pasid);
+   if (ret)
+   goto table_failed;
+   spin_unlock(&iommu->lock);
+
+   auxiliary_link_device(domain, dev);
+
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+
+table_failed:
+   domain_detach_iommu(domain, iommu);
+attach_failed:
+   spin_unlock(&iommu->lock);
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);
+
+   return ret;
+}
+
+static void aux_domain_remove_dev(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info;
+   struct intel_iommu *iommu;
+   unsigned long flags;
+
+   if (!is_aux_domain(dev, &domain->domain))
+   return;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   iommu = info->iommu;

[PATCH v5 3/8] iommu/vt-d: Move common code out of iommu_attach_device()

2019-01-09 Thread Lu Baolu
This part of the code could be used by both the normal and
the aux-domain specific attach entries. Hence move it into
a common function to avoid duplication.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 60 ++---
 1 file changed, 36 insertions(+), 24 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index ee8832d26f7e..e9119d45a29d 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5058,35 +5058,14 @@ static void intel_iommu_domain_free(struct iommu_domain 
*domain)
domain_exit(to_dmar_domain(domain));
 }
 
-static int intel_iommu_attach_device(struct iommu_domain *domain,
-struct device *dev)
+static int prepare_domain_attach_device(struct iommu_domain *domain,
+   struct device *dev)
 {
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu;
int addr_width;
u8 bus, devfn;
 
-   if (device_is_rmrr_locked(dev)) {
-   dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
-   return -EPERM;
-   }
-
-   /* normally dev is not mapped */
-   if (unlikely(domain_context_mapped(dev))) {
-   struct dmar_domain *old_domain;
-
-   old_domain = find_domain(dev);
-   if (old_domain) {
-   rcu_read_lock();
-   dmar_remove_one_dev_info(old_domain, dev);
-   rcu_read_unlock();
-
-   if (!domain_type_is_vm_or_si(old_domain) &&
-    list_empty(&old_domain->devices))
-   domain_exit(old_domain);
-   }
-   }
-
iommu = device_to_iommu(dev, &bus, &devfn);
if (!iommu)
return -ENODEV;
@@ -5119,7 +5098,40 @@ static int intel_iommu_attach_device(struct iommu_domain 
*domain,
dmar_domain->agaw--;
}
 
-   return domain_add_dev_info(dmar_domain, dev);
+   return 0;
+}
+
+static int intel_iommu_attach_device(struct iommu_domain *domain,
+struct device *dev)
+{
+   int ret;
+
+   if (device_is_rmrr_locked(dev)) {
+   dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
+   return -EPERM;
+   }
+
+   /* normally dev is not mapped */
+   if (unlikely(domain_context_mapped(dev))) {
+   struct dmar_domain *old_domain;
+
+   old_domain = find_domain(dev);
+   if (old_domain) {
+   rcu_read_lock();
+   dmar_remove_one_dev_info(old_domain, dev);
+   rcu_read_unlock();
+
+   if (!domain_type_is_vm_or_si(old_domain) &&
+   list_empty(&old_domain->devices))
+   domain_exit(old_domain);
+   }
+   }
+
+   ret = prepare_domain_attach_device(domain, dev);
+   if (ret)
+   return ret;
+
+   return domain_add_dev_info(to_dmar_domain(domain), dev);
 }
 
 static void intel_iommu_detach_device(struct iommu_domain *domain,
-- 
2.17.1



[PATCH v5 7/8] vfio/type1: Add domain at(de)taching group helpers

2019-01-09 Thread Lu Baolu
This adds helpers to attach a domain to, or detach it
from, a group. They will replace iommu_attach_group(),
which only works for non-mdev devices.

If a domain is being attached to a group which includes
mediated devices, it should be attached to the iommu device
(a pci device which represents the mdev in iommu scope)
instead. The added helper supports attaching a domain to
groups of both pci and mdev devices.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/vfio/vfio_iommu_type1.c | 84 ++---
 1 file changed, 77 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 7651cfb14836..97278ac8da95 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -91,6 +91,7 @@ struct vfio_dma {
 struct vfio_group {
struct iommu_group  *iommu_group;
struct list_headnext;
+   boolmdev_group; /* An mdev group */
 };
 
 /*
@@ -1298,6 +1299,75 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group 
*group, phys_addr_t *base)
return ret;
 }
 
+static struct device *vfio_mdev_get_iommu_device(struct device *dev)
+{
+   struct device *(*fn)(struct device *dev);
+   struct device *iommu_device;
+
+   fn = symbol_get(mdev_get_iommu_device);
+   if (fn) {
+   iommu_device = fn(dev);
+   symbol_put(mdev_get_iommu_device);
+
+   return iommu_device;
+   }
+
+   return NULL;
+}
+
+static int vfio_mdev_attach_domain(struct device *dev, void *data)
+{
+   struct iommu_domain *domain = data;
+   struct device *iommu_device;
+
+   iommu_device = vfio_mdev_get_iommu_device(dev);
+   if (iommu_device) {
+   if (iommu_dev_has_feature(iommu_device, IOMMU_DEV_FEAT_AUX))
+   return iommu_aux_attach_device(domain, iommu_device);
+   else
+   return iommu_attach_device(domain, iommu_device);
+   }
+
+   return -EINVAL;
+}
+
+static int vfio_mdev_detach_domain(struct device *dev, void *data)
+{
+   struct iommu_domain *domain = data;
+   struct device *iommu_device;
+
+   iommu_device = vfio_mdev_get_iommu_device(dev);
+   if (iommu_device) {
+   if (iommu_dev_has_feature(iommu_device, IOMMU_DEV_FEAT_AUX))
+   iommu_aux_detach_device(domain, iommu_device);
+   else
+   iommu_detach_device(domain, iommu_device);
+   }
+
+   return 0;
+}
+
+static int vfio_iommu_attach_group(struct vfio_domain *domain,
+  struct vfio_group *group)
+{
+   if (group->mdev_group)
+   return iommu_group_for_each_dev(group->iommu_group,
+   domain->domain,
+   vfio_mdev_attach_domain);
+   else
+   return iommu_attach_group(domain->domain, group->iommu_group);
+}
+
+static void vfio_iommu_detach_group(struct vfio_domain *domain,
+   struct vfio_group *group)
+{
+   if (group->mdev_group)
+   iommu_group_for_each_dev(group->iommu_group, domain->domain,
+vfio_mdev_detach_domain);
+   else
+   iommu_detach_group(domain->domain, group->iommu_group);
+}
+
 static int vfio_iommu_type1_attach_group(void *iommu_data,
 struct iommu_group *iommu_group)
 {
@@ -1373,7 +1443,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
goto out_domain;
}
 
-   ret = iommu_attach_group(domain->domain, iommu_group);
+   ret = vfio_iommu_attach_group(domain, group);
if (ret)
goto out_domain;
 
@@ -1405,8 +1475,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
list_for_each_entry(d, &iommu->domain_list, next) {
if (d->domain->ops == domain->domain->ops &&
d->prot == domain->prot) {
-   iommu_detach_group(domain->domain, iommu_group);
-   if (!iommu_attach_group(d->domain, iommu_group)) {
+   vfio_iommu_detach_group(domain, group);
+   if (!vfio_iommu_attach_group(d, group)) {
list_add(&group->next, &d->group_list);
iommu_domain_free(domain->domain);
kfree(domain);
@@ -1414,7 +1484,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
return 0;
}
 
-   ret = iommu_attach_group(domain->domain, iommu_group);
+   ret = vfio_iommu_attach_group(domain, group);

[PATCH v5 5/8] iommu/vt-d: Return ID associated with an auxiliary domain

2019-01-09 Thread Lu Baolu
This adds support to return the default PASID associated with
an auxiliary domain. The PCI device which is bound to this
domain should use this value as the PASID for all DMA requests
from the subset of the device which is isolated and protected
by this domain.
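
For example, after a successful iommu_aux_attach_device(), a device
driver is expected to do something like the sketch below, where
my_adi_set_pasid() stands for a hypothetical, device specific register
write:

	int pasid = iommu_aux_get_pasid(domain, dev);

	if (pasid < 0)
		return pasid;

	/* tag DMA of this subset of the device with the default PASID */
	my_adi_set_pasid(adi, pasid);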

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index b8fb6a4bd447..614906276bf1 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5624,6 +5624,15 @@ intel_iommu_dev_disable_feat(struct device *dev, enum 
iommu_dev_features feat)
return -ENODEV;
 }
 
+static int
+intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
+{
+   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+
+   return dmar_domain->default_pasid > 0 ?
+   dmar_domain->default_pasid : -EINVAL;
+}
+
 const struct iommu_ops intel_iommu_ops = {
.capable= intel_iommu_capable,
.domain_alloc   = intel_iommu_domain_alloc,
@@ -5632,6 +5641,7 @@ const struct iommu_ops intel_iommu_ops = {
.detach_dev = intel_iommu_detach_device,
.aux_attach_dev = intel_iommu_aux_attach_device,
.aux_detach_dev = intel_iommu_aux_detach_device,
+   .aux_get_pasid  = intel_iommu_aux_get_pasid,
.map= intel_iommu_map,
.unmap  = intel_iommu_unmap,
.iova_to_phys   = intel_iommu_iova_to_phys,
-- 
2.17.1



[PATCH v5 1/8] iommu: Add APIs for multiple domains per device

2019-01-09 Thread Lu Baolu
Sharing a physical PCI device in a finer-granularity way
is becoming a consensus in the industry. IOMMU vendors
are also engaging efforts to support such sharing as well
as possible. Among the efforts, the capability of support
finer-granularity DMA isolation is a common requirement
due to the security consideration. With finer-granularity
DMA isolation, all DMA requests out of or to a subset of
a physical PCI device can be protected by the IOMMU. As a
result, there is a request in software to attach multiple
domains to a physical PCI device. One example of such use
model is the Intel Scalable IOV [1] [2]. The Intel vt-d
3.0 spec [3] introduces the scalable mode which enables
PASID granularity DMA isolation.

This adds the APIs to support multiple domains per device.
In order to ease the discussions, we call it 'a domain in
auxiliary mode' or simply 'auxiliary domain' when multiple
domains are attached to a physical device.

The APIs include:

* iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX)
  - Check whether both IOMMU and device support IOMMU aux
domain feature. Below aux-domain specific interfaces
are available only after this returns true.

* iommu_dev_enable/disable_feature(dev, IOMMU_DEV_FEAT_AUX)
  - Enable/disable device specific aux-domain feature.

* iommu_aux_attach_device(domain, dev)
  - Attaches @domain to @dev in the auxiliary mode. Multiple
domains could be attached to a single device in the
auxiliary mode with each domain representing an isolated
address space for an assignable subset of the device.

* iommu_aux_detach_device(domain, dev)
  - Detach @domain which has been attached to @dev in the
auxiliary mode.

* iommu_aux_get_pasid(domain, dev)
  - Return ID used for finer-granularity DMA translation.
For the Intel Scalable IOV usage model, this will be
a PASID. The device which supports Scalable IOV needs
to write this ID to the device register so that DMA
requests could be tagged with a right PASID prefix.
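
Put together, a consumer of these APIs is expected to follow a
sequence like the sketch below (my_device_set_pasid() stands for the
hypothetical, device specific register write mentioned above):

	if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
		return -ENODEV;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	ret = iommu_aux_attach_device(domain, dev);
	if (ret)
		return ret;

	pasid = iommu_aux_get_pasid(domain, dev);
	my_device_set_pasid(dev, pasid);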

This has been updated with the latest proposal from Joerg
posted here [5].

Many people involved in discussions of this design.

Kevin Tian 
Liu Yi L 
Ashok Raj 
Sanjay Kumar 
Jacob Pan 
Alex Williamson 
Jean-Philippe Brucker 
Joerg Roedel 

and some discussions can be found here [4] [5].

[1] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[2] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf
[3] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[4] https://lkml.org/lkml/2018/7/26/4
[5] https://www.spinics.net/lists/iommu/msg31874.html

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Kevin Tian 
Suggested-by: Jean-Philippe Brucker 
Suggested-by: Joerg Roedel 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/iommu.c | 80 +++
 include/linux/iommu.h | 61 +
 2 files changed, 141 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3ed4db334341..9166b6145409 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2033,3 +2033,83 @@ int iommu_fwspec_add_ids(struct device *dev, u32 *ids, 
int num_ids)
return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
+
+/*
+ * Per device IOMMU features.
+ */
+bool iommu_dev_has_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_has_feat)
+   return ops->dev_has_feat(dev, feat);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_has_feature);
+
+int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_enable_feat)
+   return ops->dev_enable_feat(dev, feat);
+
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
+
+int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
+{
+   const struct iommu_ops *ops = dev->bus->iommu_ops;
+
+   if (ops && ops->dev_disable_feat)
+   return ops->dev_disable_feat(dev, feat);
+
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
+
+/*
+ * Aux-domain specific attach/detach.
+ *
+ * Only works if iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX) returns true.
+ * Also, as long as domains are attached to a device through this interface,
+ * any tries to call iommu_attach_device() should fail (iommu_detach_device()
+ * can't fail, so we fail on the tryint to re-attach). This should make us safe
+ * against a device being attached to a guest as a whole while there are still
+ * pasid users on it (aux and sva).
+ */
+int iommu_aux_attach_device(struct iommu_domain *domain, struct device *dev)
+{
+   int ret = -ENODEV;
+
+   if (domain->ops->aux_attach_dev)
+   ret = domain->ops->aux_attach_dev(domain, dev);
+
+   if (!ret)
+   trace_attach_device_to_domain(dev);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_aux_attach_device);

[PATCH v5 2/8] iommu/vt-d: Add per-device IOMMU feature ops entries

2019-01-09 Thread Lu Baolu
This adds the iommu ops entries for aux-domain per-device
feature query and enable/disable.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 86 +
 include/linux/intel-iommu.h |  1 +
 2 files changed, 87 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 2bd9ac285c0d..ee8832d26f7e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2481,6 +2481,7 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
info->domain = domain;
info->iommu = iommu;
info->pasid_table = NULL;
+   info->auxd_enabled = 0;
 
if (dev && dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(info->dev);
@@ -5215,6 +5216,24 @@ static phys_addr_t intel_iommu_iova_to_phys(struct 
iommu_domain *domain,
return phys;
 }
 
+static inline bool scalable_mode_support(void)
+{
+   struct dmar_drhd_unit *drhd;
+   struct intel_iommu *iommu;
+   bool ret = true;
+
+   rcu_read_lock();
+   for_each_active_iommu(iommu, drhd) {
+   if (!sm_supported(iommu)) {
+   ret = false;
+   break;
+   }
+   }
+   rcu_read_unlock();
+
+   return ret;
+}
+
 static bool intel_iommu_capable(enum iommu_cap cap)
 {
if (cap == IOMMU_CAP_CACHE_COHERENCY)
@@ -5379,6 +5398,70 @@ struct intel_iommu *intel_svm_device_to_iommu(struct 
device *dev)
 }
 #endif /* CONFIG_INTEL_IOMMU_SVM */
 
+static int intel_iommu_enable_auxd(struct device *dev)
+{
+   struct device_domain_info *info;
+   struct dmar_domain *domain;
+   unsigned long flags;
+
+   if (!scalable_mode_support())
+   return -ENODEV;
+
+   domain = get_valid_domain_for_dev(dev);
+   if (!domain)
+   return -ENODEV;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   info->auxd_enabled = 1;
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+}
+
+static int intel_iommu_disable_auxd(struct device *dev)
+{
+   struct device_domain_info *info;
+   unsigned long flags;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   info = dev->archdata.iommu;
+   if (!WARN_ON(!info))
+   info->auxd_enabled = 0;
+   spin_unlock_irqrestore(&device_domain_lock, flags);
+
+   return 0;
+}
+
+static bool
+intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   if (feat == IOMMU_DEV_FEAT_AUX)
+   return scalable_mode_support() && info && info->auxd_enabled;
+
+   return false;
+}
+
+static int
+intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   if (feat == IOMMU_DEV_FEAT_AUX)
+   return intel_iommu_enable_auxd(dev);
+
+   return -ENODEV;
+}
+
+static int
+intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
+{
+   if (feat == IOMMU_DEV_FEAT_AUX)
+   return intel_iommu_disable_auxd(dev);
+
+   return -ENODEV;
+}
+
 const struct iommu_ops intel_iommu_ops = {
.capable= intel_iommu_capable,
.domain_alloc   = intel_iommu_domain_alloc,
@@ -5393,6 +5476,9 @@ const struct iommu_ops intel_iommu_ops = {
.get_resv_regions   = intel_iommu_get_resv_regions,
.put_resv_regions   = intel_iommu_put_resv_regions,
.device_group   = pci_device_group,
+   .dev_has_feat   = intel_iommu_dev_has_feat,
+   .dev_enable_feat= intel_iommu_dev_enable_feat,
+   .dev_disable_feat   = intel_iommu_dev_disable_feat,
.pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
 };
 
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 0605f3bf6e79..7cf9f7f3724a 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -571,6 +571,7 @@ struct device_domain_info {
u8 pri_enabled:1;
u8 ats_supported:1;
u8 ats_enabled:1;
+   u8 auxd_enabled:1;  /* Multiple domains per device */
u8 ats_qdep;
struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
struct intel_iommu *iommu; /* IOMMU used by this device */
-- 
2.17.1



[PATCH v5 0/8] vfio/mdev: IOMMU aware mediated device

2019-01-09 Thread Lu Baolu
iommu device when attaching group in vfio type1 iommu
module, and attaches the domain to iommu aware mediated devices.

References:
[1] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[2] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[3] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf

Best regards,
Lu Baolu

Change log:
  v4->v5:
  - The iommu APIs have been updated with Joerg's proposal posted
here https://www.spinics.net/lists/iommu/msg31874.html.
  - Some typos in commit message and comments have been fixed.
  - PATCH 3/8 was split from 4/8 to ease code review.
  - mdev->domain was removed; it can be brought back when there's a
real consumer.
  - Addressed the other code review comments received during the v4
review period, except the EXPORT_SYMBOL vs. EXPORT_SYMBOL_GPL
question in PATCH 6/8.
  - Rebase all patches to 5.0-rc1.

  v3->v4:
  - Use aux domain specific interfaces for domain attach and detach.
  - Rebase all patches to 4.20-rc1.

  v2->v3:
  - Remove domain type enum and use a pointer on mdev_device instead.
  - Add a generic interface for getting/setting per device iommu
attributes, and use it to query the aux domain capability and to
enable/disable the aux domain.
  - Reuse iommu_domain_get_attr() to retrieve the id of an aux domain.
  - We discussed the impact of the default domain implementation
on reusing iommu_at(de)tach_device() interfaces. We agreed
that reusing iommu_at(de)tach_device() interfaces is the right
direction and we could tweak the code to remove the impact.
https://www.spinics.net/lists/kvm/msg175285.html  
  - Removed the RFC tag since no objections received.
  - This patch has been submitted separately.
https://www.spinics.net/lists/kvm/msg173936.html

  v1->v2:
  - Rewrite the patches with the concept of auxiliary domains.

Lu Baolu (8):
  iommu: Add APIs for multiple domains per device
  iommu/vt-d: Add per-device IOMMU feature ops entries
  iommu/vt-d: Move common code out of iommu_attch_device()
  iommu/vt-d: Aux-domain specific domain attach/detach
  iommu/vt-d: Return ID associated with an auxiliary domain
  vfio/mdev: Add iommu related member in mdev_device
  vfio/type1: Add domain at(de)taching group helpers
  vfio/type1: Handle different mdev isolation type

 drivers/iommu/intel-iommu.c  | 304 ---
 drivers/iommu/iommu.c|  80 
 drivers/vfio/mdev/mdev_core.c|  18 ++
 drivers/vfio/mdev/mdev_private.h |   1 +
 drivers/vfio/vfio_iommu_type1.c  | 132 --
 include/linux/intel-iommu.h  |  11 ++
 include/linux/iommu.h|  61 +++
 include/linux/mdev.h |  14 ++
 8 files changed, 585 insertions(+), 36 deletions(-)

-- 
2.17.1



Re: [RFC PATCH 1/5] iommu: Add APIs for IOMMU PASID management

2018-12-15 Thread Lu Baolu

Hi,

On 12/16/18 6:38 AM, Liu, Yi L wrote:

From: Lu Baolu [mailto:baolu...@linux.intel.com]
Sent: Sunday, November 11, 2018 10:45 PM
Subject: [RFC PATCH 1/5] iommu: Add APIs for IOMMU PASID management

This adds APIs for IOMMU drivers and device drivers to manage the PASIDs used 
for
DMA transfer and translation. It is based on the I/O ASID allocator for PASID
namespace management and relies on vendor specific IOMMU drivers for
paravirtual PASIDs.

Below APIs are added:

* iommu_pasid_init(pasid)
   - Initialize a PASID consumer. The vendor specific IOMMU
 drivers are able to set the PASID range imposed by IOMMU
 hardware through a callback in iommu_ops.

* iommu_pasid_exit(pasid)
   - The PASID consumer stops consuming any PASID.

* iommu_pasid_alloc(pasid, min, max, private, *ioasid)
   - Allocate a PASID and associate @private data with this
 PASID. The PASID value is stored in @ioasid on success.

* iommu_pasid_free(pasid, ioasid)
   - Free a PASID to the pool so that it could be consumed by
 others.

This also adds below helpers to lookup or iterate PASID items associated with a
consumer.

* iommu_pasid_for_each(pasid, func, data)
   - Iterate PASID items of the consumer identified by @pasid,
 and call @func() against each item. An error returned from
 @func() will break the iteration.

* iommu_pasid_find(pasid, ioasid)
   - Retrieve the private data associated with @ioasid.
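
Putting these together, a consumer would look roughly like the sketch
below; the PASID range, the IS_ERR() return convention and the type of
the output ID are illustrative assumptions, not part of this RFC:

        static int example_pasid_consumer(struct bus_type *bus, void *priv)
        {
                struct iommu_pasid *pasid;
                int ioasid, ret;

                pasid = iommu_pasid_init(bus);
                if (IS_ERR(pasid))
                        return PTR_ERR(pasid);

                ret = iommu_pasid_alloc(pasid, 1, 0x100000, priv, &ioasid);
                if (!ret) {
                        /* ... program @ioasid into the device, run DMA ... */
                        iommu_pasid_free(pasid, ioasid);
                }

                iommu_pasid_exit(pasid);
                return ret;
        }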

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Jean-Philippe Brucker 
Signed-off-by: Lu Baolu 
---
  drivers/iommu/Kconfig |  1 +
  drivers/iommu/iommu.c | 89 +++
  include/linux/iommu.h | 73 +++
  3 files changed, 163 insertions(+)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index
d9a25715650e..39f2bb76c7b8 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -1,6 +1,7 @@
  # IOMMU_API always gets selected by whoever wants it.
  config IOMMU_API
bool
+   select IOASID

  menuconfig IOMMU_SUPPORT
bool "IOMMU Hardware Support"
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index
0b7c96d1425e..570b244897bb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2082,3 +2082,92 @@ void iommu_detach_device_aux(struct iommu_domain
*domain, struct device *dev)
}
  }
  EXPORT_SYMBOL_GPL(iommu_detach_device_aux);
+
+/*
+ * APIs for PASID used by IOMMU and the device drivers which depend
+ * on IOMMU.
+ */
+struct iommu_pasid *iommu_pasid_init(struct bus_type *bus)
+{


I'm wondering whether using struct iommu_domain here would be
better than struct bus_type. The major purpose is to pass iommu_ops
in it and route into the iommu sublayer. iommu_domain may be
better since some modules like vfio_iommu_type1 would use an
iommu_domain more than a bus type.


But drivers might call this during initialization when they don't
have any domain yet.

Best regards,
Lu Baolu


[PATCH v6 08/12] iommu/vt-d: Pass pasid table to context mapping

2018-12-09 Thread Lu Baolu
So that the pasid related info, such as the pasid table and the
maximum pasid value, can be used when setting up the scalable mode
context.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Cc: Sanjay Kumar 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
Reviewed-by: Kevin Tian 
---
 drivers/iommu/intel-iommu.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index a077ff3f67a6..55c4ffda1246 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1920,6 +1920,7 @@ static void domain_exit(struct dmar_domain *domain)
 
 static int domain_context_mapping_one(struct dmar_domain *domain,
  struct intel_iommu *iommu,
+ struct pasid_table *table,
  u8 bus, u8 devfn)
 {
u16 did = domain->iommu_did[iommu->seq_id];
@@ -2042,6 +2043,7 @@ static int domain_context_mapping_one(struct dmar_domain 
*domain,
 struct domain_context_mapping_data {
struct dmar_domain *domain;
struct intel_iommu *iommu;
+   struct pasid_table *table;
 };
 
 static int domain_context_mapping_cb(struct pci_dev *pdev,
@@ -2050,25 +2052,31 @@ static int domain_context_mapping_cb(struct pci_dev 
*pdev,
struct domain_context_mapping_data *data = opaque;
 
return domain_context_mapping_one(data->domain, data->iommu,
- PCI_BUS_NUM(alias), alias & 0xff);
+ data->table, PCI_BUS_NUM(alias),
+ alias & 0xff);
 }
 
 static int
 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
 {
+   struct domain_context_mapping_data data;
+   struct pasid_table *table;
struct intel_iommu *iommu;
u8 bus, devfn;
-   struct domain_context_mapping_data data;
 
iommu = device_to_iommu(dev, &bus, &devfn);
if (!iommu)
return -ENODEV;
 
+   table = intel_pasid_get_table(dev);
+
if (!dev_is_pci(dev))
-   return domain_context_mapping_one(domain, iommu, bus, devfn);
+   return domain_context_mapping_one(domain, iommu, table,
+ bus, devfn);
 
data.domain = domain;
data.iommu = iommu;
+   data.table = table;
 
return pci_for_each_dma_alias(to_pci_dev(dev),
  &domain_context_mapping_cb, &data);
-- 
2.17.1



[PATCH v6 03/12] iommu/vt-d: Move page table helpers into header

2018-12-09 Thread Lu Baolu
So that they could also be used in other source files.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Cc: Sanjay Kumar 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
Reviewed-by: Kevin Tian 
---
 drivers/iommu/intel-iommu.c | 43 -
 include/linux/intel-iommu.h | 43 +
 2 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 73d7c15bf737..52cdb0e077f2 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -290,49 +290,6 @@ static inline void context_clear_entry(struct 
context_entry *context)
context->hi = 0;
 }
 
-/*
- * 0: readable
- * 1: writable
- * 2-6: reserved
- * 7: super page
- * 8-10: available
- * 11: snoop behavior
- * 12-63: Host physical address
- */
-struct dma_pte {
-   u64 val;
-};
-
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
-   pte->val = 0;
-}
-
-static inline u64 dma_pte_addr(struct dma_pte *pte)
-{
-#ifdef CONFIG_64BIT
-   return pte->val & VTD_PAGE_MASK;
-#else
-   /* Must have a full atomic 64-bit read */
-   return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
-#endif
-}
-
-static inline bool dma_pte_present(struct dma_pte *pte)
-{
-   return (pte->val & 3) != 0;
-}
-
-static inline bool dma_pte_superpage(struct dma_pte *pte)
-{
-   return (pte->val & DMA_PTE_LARGE_PAGE);
-}
-
-static inline int first_pte_in_page(struct dma_pte *pte)
-{
-   return !((unsigned long)pte & ~VTD_PAGE_MASK);
-}
-
 /*
  * This domain is a statically identity mapping domain.
  * 1. This domain creates a static 1:1 mapping to all usable memory.
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 8c9b6063d275..b4da61385ebf 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -590,6 +590,49 @@ static inline void __iommu_flush_cache(
clflush_cache_range(addr, size);
 }
 
+/*
+ * 0: readable
+ * 1: writable
+ * 2-6: reserved
+ * 7: super page
+ * 8-10: available
+ * 11: snoop behavior
+ * 12-63: Host physical address
+ */
+struct dma_pte {
+   u64 val;
+};
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+   pte->val = 0;
+}
+
+static inline u64 dma_pte_addr(struct dma_pte *pte)
+{
+#ifdef CONFIG_64BIT
+   return pte->val & VTD_PAGE_MASK;
+#else
+   /* Must have a full atomic 64-bit read */
+   return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
+#endif
+}
+
+static inline bool dma_pte_present(struct dma_pte *pte)
+{
+   return (pte->val & 3) != 0;
+}
+
+static inline bool dma_pte_superpage(struct dma_pte *pte)
+{
+   return (pte->val & DMA_PTE_LARGE_PAGE);
+}
+
+static inline int first_pte_in_page(struct dma_pte *pte)
+{
+   return !((unsigned long)pte & ~VTD_PAGE_MASK);
+}
+
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev 
*dev);
 extern int dmar_find_matched_atsr_unit(struct pci_dev *dev);
 
-- 
2.17.1



[PATCH v6 05/12] iommu/vt-d: Reserve a domain id for FL and PT modes

2018-12-09 Thread Lu Baolu
Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
entry for first-level or pass-through translation should be
programmed with a domain id different from those used for
second-level or nested translation. It is recommended that
software use the same domain id for all first-level-only and
pass-through translations.

This reserves a domain id for first-level and pass-through
translations.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Cc: Sanjay Kumar 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/intel-iommu.c | 10 ++
 drivers/iommu/intel-pasid.h |  6 ++
 2 files changed, 16 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 52cdb0e077f2..5e924bc8ebec 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1621,6 +1621,16 @@ static int iommu_init_domains(struct intel_iommu *iommu)
 */
set_bit(0, iommu->domain_ids);
 
+   /*
+* Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
+* entry for first-level or pass-through translation modes should
+* be programmed with a domain id different from those used for
+* second-level or nested translation. We reserve a domain id for
+* this purpose.
+*/
+   if (sm_supported(iommu))
+   set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
+
return 0;
 }
 
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
index 12f480c2bb8b..03c1612d173c 100644
--- a/drivers/iommu/intel-pasid.h
+++ b/drivers/iommu/intel-pasid.h
@@ -17,6 +17,12 @@
 #define PDE_PFN_MASK   PAGE_MASK
 #define PASID_PDE_SHIFT6
 
+/*
+ * Domain ID reserved for pasid entries programmed for first-level
+ * only and pass-through transfer modes.
+ */
+#define FLPT_DEFAULT_DID   1
+
 struct pasid_dir_entry {
u64 val;
 };
-- 
2.17.1



[PATCH v6 10/12] iommu/vt-d: Add first level page table interface

2018-12-09 Thread Lu Baolu
This adds an interface to setup the PASID entries for first
level page table translation.
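
A sketch of the intended call, e.g. from an SVA bind path; the mm,
pasid and ret variables are illustrative, and FLPT_DEFAULT_DID is the
domain id reserved earlier in this series for first-level and
pass-through translations:

        ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid,
                                            FLPT_DEFAULT_DID, 0);
        if (ret)
                pr_err("Failed to set up first level translation\n");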

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
---
 drivers/iommu/intel-pasid.c | 80 +
 drivers/iommu/intel-pasid.h | 11 +
 include/linux/intel-iommu.h |  1 +
 3 files changed, 92 insertions(+)

diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index 6d2b2e87e6fc..c3dcf4dc2496 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -10,6 +10,7 @@
 #define pr_fmt(fmt)    "DMAR: " fmt
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -389,6 +390,26 @@ static inline void pasid_set_page_snoop(struct pasid_entry 
*pe, bool value)
pasid_set_bits(&pe->val[1], 1 << 23, value);
 }
 
+/*
+ * Setup the First Level Page table Pointer field (Bit 140~191)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_flptr(struct pasid_entry *pe, u64 value)
+{
+   pasid_set_bits(&pe->val[2], VTD_PAGE_MASK, value);
+}
+
+/*
+ * Setup the First Level Paging Mode field (Bit 130~131) of a
+ * scalable mode PASID entry.
+ */
+static inline void
+pasid_set_flpm(struct pasid_entry *pe, u64 value)
+{
+   pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
+}
+
 static void
 pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
u16 did, int pasid)
@@ -459,6 +480,65 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
devtlb_invalidation_with_pasid(iommu, dev, pasid);
 }
 
+/*
+ * Set up the scalable mode pasid table entry for first only
+ * translation type.
+ */
+int intel_pasid_setup_first_level(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd,
+ int pasid, u16 did, int flags)
+{
+   struct pasid_entry *pte;
+
+   if (!ecap_flts(iommu->ecap)) {
+   pr_err("No first level translation support on %s\n",
+  iommu->name);
+   return -EINVAL;
+   }
+
+   pte = intel_pasid_get_entry(dev, pasid);
+   if (WARN_ON(!pte))
+   return -EINVAL;
+
+   pasid_clear_entry(pte);
+
+   /* Setup the first level page table pointer: */
+   pasid_set_flptr(pte, (u64)__pa(pgd));
+   if (flags & PASID_FLAG_SUPERVISOR_MODE) {
+   if (!ecap_srs(iommu->ecap)) {
+   pr_err("No supervisor request support on %s\n",
+  iommu->name);
+   return -EINVAL;
+   }
+   pasid_set_sre(pte);
+   }
+
+#ifdef CONFIG_X86
+   if (cpu_feature_enabled(X86_FEATURE_LA57))
+   pasid_set_flpm(pte, 1);
+#endif /* CONFIG_X86 */
+
+   pasid_set_domain_id(pte, did);
+   pasid_set_address_width(pte, iommu->agaw);
+   pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+
+   /* Setup Present and PASID Granular Transfer Type: */
+   pasid_set_translation_type(pte, 1);
+   pasid_set_present(pte);
+
+   if (!ecap_coherent(iommu->ecap))
+   clflush_cache_range(pte, sizeof(*pte));
+
+   if (cap_caching_mode(iommu->cap)) {
+   pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+   iotlb_invalidation_with_pasid(iommu, did, pasid);
+   } else {
+   iommu_flush_write_buffer(iommu);
+   }
+
+   return 0;
+}
+
 /*
  * Set up the scalable mode pasid entry for second only translation type.
  */
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
index 55bb8715329d..512c63ec8a22 100644
--- a/drivers/iommu/intel-pasid.h
+++ b/drivers/iommu/intel-pasid.h
@@ -25,6 +25,14 @@
  */
 #define FLPT_DEFAULT_DID   1
 
+/*
+ * The SUPERVISOR_MODE flag indicates a first level translation which
+ * can be used for access to kernel addresses. It is valid only for
+ * access to the kernel's static 1:1 mapping of physical memory - not
+ * to vmalloc or even module mappings.
+ */
+#define PASID_FLAG_SUPERVISOR_MODE BIT(0)
+
 struct pasid_dir_entry {
u64 val;
 };
@@ -51,6 +59,9 @@ struct pasid_table *intel_pasid_get_table(struct device *dev);
 int intel_pasid_get_dev_max_id(struct device *dev);
 struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid);
 void intel_pasid_clear_entry(struct device *dev, int pasid);
+int intel_pasid_setup_first_level(struct intel_iommu *iommu,
+ struct device *dev, pgd_t *pgd,
+ int pasid, u16 did, int flags);
 int intel_pasid_setup_second_level(struct intel_iommu *iommu,
   struct dmar_domain *domain,
   struct device *dev, int pasid);
diff --git a/include/linux/inte

[PATCH v6 11/12] iommu/vt-d: Shared virtual address in scalable mode

2018-12-09 Thread Lu Baolu
This patch enables the current SVA (Shared Virtual Address)
implementation to work in the scalable mode.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c | 38 
 drivers/iommu/intel-pasid.c |  2 +-
 drivers/iommu/intel-pasid.h |  1 -
 drivers/iommu/intel-svm.c   | 58 -
 include/linux/intel-iommu.h |  9 +-
 5 files changed, 20 insertions(+), 88 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 21e5f1b9a2ca..4552166c553b 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5257,18 +5257,6 @@ static void intel_iommu_put_resv_regions(struct device 
*dev,
 }
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
-static inline unsigned long intel_iommu_get_pts(struct device *dev)
-{
-   int pts, max_pasid;
-
-   max_pasid = intel_pasid_get_dev_max_id(dev);
-   pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
-   if (pts < 5)
-   return 0;
-
-   return pts - 5;
-}
-
 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev 
*sdev)
 {
struct device_domain_info *info;
@@ -5300,33 +5288,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, 
struct intel_svm_dev *sd
sdev->sid = PCI_DEVID(info->bus, info->devfn);
 
if (!(ctx_lo & CONTEXT_PASIDE)) {
-   if (iommu->pasid_state_table)
-   context[1].hi = 
(u64)virt_to_phys(iommu->pasid_state_table);
-   context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
-   intel_iommu_get_pts(sdev->dev);
-
-   wmb();
-   /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
-* extended to permit requests-with-PASID if the PASIDE bit
-* is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
-* however, the PASIDE bit is ignored and requests-with-PASID
-* are unconditionally blocked. Which makes less sense.
-* So convert from CONTEXT_TT_PASS_THROUGH to one of the new
-* "guest mode" translation types depending on whether ATS
-* is available or not. Annoyingly, we can't use the new
-* modes *unless* PASIDE is set. */
-   if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 
2)) {
-   ctx_lo &= ~CONTEXT_TT_MASK;
-   if (info->ats_supported)
-   ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
-   else
-   ctx_lo |= CONTEXT_TT_PT_PASID << 2;
-   }
ctx_lo |= CONTEXT_PASIDE;
-   if (iommu->pasid_state_table)
-   ctx_lo |= CONTEXT_DINVE;
-   if (info->pri_supported)
-   ctx_lo |= CONTEXT_PRS;
context[0].lo = ctx_lo;
wmb();
iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index c3dcf4dc2496..53fe5248d8f1 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -286,7 +286,7 @@ static inline void pasid_clear_entry(struct pasid_entry *pe)
WRITE_ONCE(pe->val[7], 0);
 }
 
-void intel_pasid_clear_entry(struct device *dev, int pasid)
+static void intel_pasid_clear_entry(struct device *dev, int pasid)
 {
struct pasid_entry *pe;
 
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
index 512c63ec8a22..23537b3f34e3 100644
--- a/drivers/iommu/intel-pasid.h
+++ b/drivers/iommu/intel-pasid.h
@@ -58,7 +58,6 @@ void intel_pasid_free_table(struct device *dev);
 struct pasid_table *intel_pasid_get_table(struct device *dev);
 int intel_pasid_get_dev_max_id(struct device *dev);
 struct pasid_entry *intel_pasid_get_entry(struct device *dev, int pasid);
-void intel_pasid_clear_entry(struct device *dev, int pasid);
 int intel_pasid_setup_first_level(struct intel_iommu *iommu,
  struct device *dev, pgd_t *pgd,
  int pasid, u16 did, int flags);
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 9b6771a89207..935712ebda79 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -29,10 +29,6 @@
 
 #include "intel-pasid.h"
 
-#define PASID_ENTRY_P  BIT_ULL(0)
-#define PASID_ENTRY_FLPM_5LP   BIT_ULL(9)
-#define PASID_ENTRY_SRE    BIT_ULL(11)
-
 static irqreturn_t prq_event_thread(int irq, void *d);
 
 struct pasid_state_entry {
@@ -248,20 +244,6 @@ static void intel_invalidate_range(struct mmu_notifier *mn,
  

[PATCH v6 12/12] iommu/vt-d: Remove deferred invalidation

2018-12-09 Thread Lu Baolu
Deferred invalidation is an ECS specific feature. It will not be
supported when the IOMMU works in scalable mode. As the ECS support
has been deprecated, remove deferred invalidation and clean up the
code.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Cc: Sanjay Kumar 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c |  1 -
 drivers/iommu/intel-svm.c   | 45 -
 include/linux/intel-iommu.h |  8 ---
 3 files changed, 54 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 4552166c553b..eb5351e8cde5 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1700,7 +1700,6 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
if (pasid_supported(iommu)) {
if (ecap_prs(iommu->ecap))
intel_svm_finish_prq(iommu);
-   intel_svm_exit(iommu);
}
 #endif
 }
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 935712ebda79..a2a2aa4439aa 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -31,15 +31,8 @@
 
 static irqreturn_t prq_event_thread(int irq, void *d);
 
-struct pasid_state_entry {
-   u64 val;
-};
-
 int intel_svm_init(struct intel_iommu *iommu)
 {
-   struct page *pages;
-   int order;
-
if (cpu_feature_enabled(X86_FEATURE_GBPAGES) &&
!cap_fl1gp_support(iommu->cap))
return -EINVAL;
@@ -48,39 +41,6 @@ int intel_svm_init(struct intel_iommu *iommu)
!cap_5lp_support(iommu->cap))
return -EINVAL;
 
-   /* Start at 2 because it's defined as 2^(1+PSS) */
-   iommu->pasid_max = 2 << ecap_pss(iommu->ecap);
-
-   /* Eventually I'm promised we will get a multi-level PASID table
-* and it won't have to be physically contiguous. Until then,
-* limit the size because 8MiB contiguous allocations can be hard
-* to come by. The limit of 0x20000, which is 1MiB for each of
-* the PASID and PASID-state tables, is somewhat arbitrary. */
-   if (iommu->pasid_max > 0x20000)
-   iommu->pasid_max = 0x20000;
-
-   order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max);
-   if (ecap_dis(iommu->ecap)) {
-   pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
-   if (pages)
-   iommu->pasid_state_table = page_address(pages);
-   else
-   pr_warn("IOMMU: %s: Failed to allocate PASID state 
table\n",
-   iommu->name);
-   }
-
-   return 0;
-}
-
-int intel_svm_exit(struct intel_iommu *iommu)
-{
-   int order = get_order(sizeof(struct pasid_entry) * iommu->pasid_max);
-
-   if (iommu->pasid_state_table) {
-   free_pages((unsigned long)iommu->pasid_state_table, order);
-   iommu->pasid_state_table = NULL;
-   }
-
return 0;
 }
 
@@ -214,11 +174,6 @@ static void intel_flush_svm_range(struct intel_svm *svm, 
unsigned long address,
 {
struct intel_svm_dev *sdev;
 
-   /* Try deferred invalidate if available */
-   if (svm->iommu->pasid_state_table &&
-   !cmpxchg64(&svm->iommu->pasid_state_table[svm->pasid].val, 0, 1ULL 
<< 63))
-   return;
-
rcu_read_lock();
list_for_each_entry_rcu(sdev, >devs, list)
intel_flush_svm_range_dev(svm, sdev, address, pages, ih, gl);
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index cfcf9c1e1872..0605f3bf6e79 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -541,15 +541,8 @@ struct intel_iommu {
struct iommu_flush flush;
 #endif
 #ifdef CONFIG_INTEL_IOMMU_SVM
-   /* These are large and need to be contiguous, so we allocate just
-* one for now. We'll maybe want to rethink that if we truly give
-* devices away to userspace processes (e.g. for DPDK) and don't
-* want to trust that userspace will use *only* the PASID it was
-* told to. But while it's all driver-arbitrated, we're fine. */
-   struct pasid_state_entry *pasid_state_table;
struct page_req_dsc *prq;
unsigned char prq_name[16];/* Name for PRQ interrupt */
-   u32 pasid_max;
 #endif
struct q_inval  *qi;/* Queued invalidation info */
u32 *iommu_state; /* Store iommu states between suspend and resume.*/
@@ -663,7 +656,6 @@ void iommu_flush_write_buffer(struct intel_iommu *iommu);
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
 int intel_svm_init(struct intel_iommu *iommu);
-int intel_svm_exit(struct intel_iommu *iommu);
 extern int intel_svm_enable_prq(struct intel_iommu *iommu);
 extern int intel_svm_finish_prq(struct intel_iommu *iommu);
 
-- 
2.17.1



[PATCH v6 06/12] iommu/vt-d: Add second level page table interface

2018-12-09 Thread Lu Baolu
This adds the interfaces to set up or tear down the structures
for second level page table translations. This includes the
second-level-only translation and pass-through types.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c |   2 +-
 drivers/iommu/intel-pasid.c | 280 
 drivers/iommu/intel-pasid.h |   8 ++
 include/linux/intel-iommu.h |   3 +
 4 files changed, 292 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 5e924bc8ebec..e741238e2326 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1210,7 +1210,7 @@ static void iommu_set_root_entry(struct intel_iommu 
*iommu)
raw_spin_unlock_irqrestore(>register_lock, flag);
 }
 
-static void iommu_flush_write_buffer(struct intel_iommu *iommu)
+void iommu_flush_write_buffer(struct intel_iommu *iommu)
 {
u32 val;
unsigned long flag;
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index fd3ccc0753b0..6d2b2e87e6fc 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -9,6 +9,7 @@
 
 #define pr_fmt(fmt)    "DMAR: " fmt
 
+#include 
 #include 
 #include 
 #include 
@@ -294,3 +295,282 @@ void intel_pasid_clear_entry(struct device *dev, int 
pasid)
 
pasid_clear_entry(pe);
 }
+
+static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
+{
+   u64 old;
+
+   old = READ_ONCE(*ptr);
+   WRITE_ONCE(*ptr, (old & ~mask) | bits);
+}
+
+/*
+ * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
+ * PASID entry.
+ */
+static inline void
+pasid_set_domain_id(struct pasid_entry *pe, u64 value)
+{
+   pasid_set_bits(&pe->val[1], GENMASK_ULL(15, 0), value);
+}
+
+/*
+ * Get domain ID value of a scalable mode PASID entry.
+ */
+static inline u16
+pasid_get_domain_id(struct pasid_entry *pe)
+{
+   return (u16)(READ_ONCE(pe->val[1]) & GENMASK_ULL(15, 0));
+}
+
+/*
+ * Setup the SLPTPTR(Second Level Page Table Pointer) field (Bit 12~63)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_slptr(struct pasid_entry *pe, u64 value)
+{
+   pasid_set_bits(&pe->val[0], VTD_PAGE_MASK, value);
+}
+
+/*
+ * Setup the AW(Address Width) field (Bit 2~4) of a scalable mode PASID
+ * entry.
+ */
+static inline void
+pasid_set_address_width(struct pasid_entry *pe, u64 value)
+{
+   pasid_set_bits(&pe->val[0], GENMASK_ULL(4, 2), value << 2);
+}
+
+/*
+ * Setup the PGTT(PASID Granular Translation Type) field (Bit 6~8)
+ * of a scalable mode PASID entry.
+ */
+static inline void
+pasid_set_translation_type(struct pasid_entry *pe, u64 value)
+{
+   pasid_set_bits(&pe->val[0], GENMASK_ULL(8, 6), value << 6);
+}
+
+/*
+ * Enable fault processing by clearing the FPD(Fault Processing
+ * Disable) field (Bit 1) of a scalable mode PASID entry.
+ */
+static inline void pasid_set_fault_enable(struct pasid_entry *pe)
+{
+   pasid_set_bits(&pe->val[0], 1 << 1, 0);
+}
+
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+   pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
+/*
+ * Setup the P(Present) field (Bit 0) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_present(struct pasid_entry *pe)
+{
+   pasid_set_bits(&pe->val[0], 1 << 0, 1);
+}
+
+/*
+ * Setup Page Walk Snoop bit (Bit 87) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_page_snoop(struct pasid_entry *pe, bool value)
+{
+   pasid_set_bits(&pe->val[1], 1 << 23, value);
+}
+
+static void
+pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
+   u16 did, int pasid)
+{
+   struct qi_desc desc;
+
+   desc.qw0 = QI_PC_DID(did) | QI_PC_PASID_SEL | QI_PC_PASID(pasid);
+   desc.qw1 = 0;
+   desc.qw2 = 0;
+   desc.qw3 = 0;
+
+   qi_submit_sync(&desc, iommu);
+}
+
+static void
+iotlb_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid)
+{
+   struct qi_desc desc;
+
+   desc.qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
+   QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE;
+   desc.qw1 = 0;
+   desc.qw2 = 0;
+   desc.qw3 = 0;
+
+   qi_submit_sync(&desc, iommu);
+}
+
+static void
+devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
+  struct device *dev, int pasid)
+{
+   struct device_domain_info *info;
+   u16 sid, qdep, pfsid;
+
+   info = dev->archdata.iommu;
+   if (!info || !info->ats_enabled)
+   return;
+
+   sid = info->bus << 8 | info->devfn;
+   qdep = info->ats_qdep;
+   pfsid = info->pfsid;
+
+   qi_flu

[PATCH v6 09/12] iommu/vt-d: Setup context and enable RID2PASID support

2018-12-09 Thread Lu Baolu
This patch enables the translation for requests without PASID in
the scalable mode by setting up the root and context entries.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c | 136 
 drivers/iommu/intel-pasid.h |   1 +
 include/linux/intel-iommu.h |   1 +
 3 files changed, 108 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 55c4ffda1246..21e5f1b9a2ca 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1197,6 +1197,8 @@ static void iommu_set_root_entry(struct intel_iommu 
*iommu)
unsigned long flag;
 
addr = virt_to_phys(iommu->root_entry);
+   if (sm_supported(iommu))
+   addr |= DMA_RTADDR_SMT;
 
raw_spin_lock_irqsave(>register_lock, flag);
dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
@@ -1918,6 +1920,56 @@ static void domain_exit(struct dmar_domain *domain)
free_domain_mem(domain);
 }
 
+/*
+ * Get the PASID directory size for scalable mode context entry.
+ * Value of X in the PDTS field of a scalable mode context entry
+ * indicates PASID directory with 2^(X + 7) entries.
+ */
+static inline unsigned long context_get_sm_pds(struct pasid_table *table)
+{
+   int pds, max_pde;
+
+   max_pde = table->max_pasid >> PASID_PDE_SHIFT;
+   pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
+   if (pds < 7)
+   return 0;
+
+   return pds - 7;
+}
+
+/*
+ * Set the RID_PASID field of a scalable mode context entry. The
+ * IOMMU hardware will use the PASID value set in this field for
+ * DMA translations of DMA requests without PASID.
+ */
+static inline void
+context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
+{
+   context->hi |= pasid & ((1 << 20) - 1);
+   context->hi |= (1 << 20);
+}
+
+/*
+ * Set the DTE(Device-TLB Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_set_sm_dte(struct context_entry *context)
+{
+   context->lo |= (1 << 2);
+}
+
+/*
+ * Set the PRE(Page Request Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_set_sm_pre(struct context_entry *context)
+{
+   context->lo |= (1 << 4);
+}
+
+/* Convert value to context PASID directory size field coding. */
+#define context_pdts(pds)  (((pds) & 0x7) << 9)
+
 static int domain_context_mapping_one(struct dmar_domain *domain,
  struct intel_iommu *iommu,
  struct pasid_table *table,
@@ -1928,8 +1980,7 @@ static int domain_context_mapping_one(struct dmar_domain 
*domain,
struct device_domain_info *info = NULL;
struct context_entry *context;
unsigned long flags;
-   struct dma_pte *pgd;
-   int ret, agaw;
+   int ret;
 
WARN_ON(did == 0);
 
@@ -1975,41 +2026,67 @@ static int domain_context_mapping_one(struct 
dmar_domain *domain,
}
}
 
-   pgd = domain->pgd;
-
context_clear_entry(context);
-   context_set_domain_id(context, did);
 
-   /*
-* Skip top levels of page tables for iommu which has less agaw
-* than default.  Unnecessary for PT mode.
-*/
-   if (translation != CONTEXT_TT_PASS_THROUGH) {
-   for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
-   ret = -ENOMEM;
-   pgd = phys_to_virt(dma_pte_addr(pgd));
-   if (!dma_pte_present(pgd))
-   goto out_unlock;
-   }
+   if (sm_supported(iommu)) {
+   unsigned long pds;
 
-   info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
-   if (info && info->ats_supported)
-   translation = CONTEXT_TT_DEV_IOTLB;
-   else
-   translation = CONTEXT_TT_MULTI_LEVEL;
+   WARN_ON(!table);
+
+   /* Setup the PASID DIR pointer: */
+   pds = context_get_sm_pds(table);
+   context->lo = (u64)virt_to_phys(table->table) |
+   context_pdts(pds);
+
+   /* Setup the RID_PASID field: */
+   context_set_sm_rid2pasid(context, PASID_RID2PASID);
 
-   context_set_address_root(context, virt_to_phys(pgd));
-   context_set_address_width(context, agaw);
-   } else {
/*
-* In pass through mode, AW must be programmed to
-* indicate the largest AGAW value supported by
-* hardware. And ASR is ignored by hardware.
+* Setup the Device-TLB enable bit and Page request
+  

[PATCH v6 07/12] iommu/vt-d: Setup pasid entry for RID2PASID support

2018-12-09 Thread Lu Baolu
When the scalable mode is enabled, there is no second level
page translation pointer in the context entry any more (for
DMA requests without PASID). Instead, a new RID2PASID field
is introduced in the context entry. Software can choose any
PASID value to set RID2PASID and then set up the translation
in the corresponding PASID entry. Upon receiving a DMA request
without PASID, hardware will first look at this RID2PASID
field and then treat the request as a request with the pasid
value specified in the RID2PASID field.

Though software is allowed to use any PASID for the RID2PASID,
we will always use PASID 0 as a design decision.
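
Conceptually, the scalable mode lookup for an inbound request works as
in the sketch below (pseudo code only; the helper names are
illustrative, not from this patch):

        u32 pasid = request_has_pasid(req) ? request_pasid(req)
                                           : context_rid2pasid(context);
        pte = pasid_table_lookup(dev, pasid);
        /* translate according to the PGTT field of @pte */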

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c | 20 
 drivers/iommu/intel-pasid.h |  1 +
 2 files changed, 21 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e741238e2326..a077ff3f67a6 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2462,6 +2462,22 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
dmar_remove_one_dev_info(domain, dev);
return NULL;
}
+
+   /* Setup the PASID entry for requests without PASID: */
+   spin_lock(&iommu->lock);
+   if (hw_pass_through && domain_type_is_si(domain))
+   ret = intel_pasid_setup_pass_through(iommu, domain,
+   dev, PASID_RID2PASID);
+   else
+   ret = intel_pasid_setup_second_level(iommu, domain,
+   dev, PASID_RID2PASID);
+   spin_unlock(&iommu->lock);
+   if (ret) {
+   pr_err("Setup RID2PASID for %s failed\n",
+  dev_name(dev));
+   dmar_remove_one_dev_info(domain, dev);
+   return NULL;
+   }
}
 
if (dev && domain_context_mapping(domain, dev)) {
@@ -4825,6 +4841,10 @@ static void __dmar_remove_one_dev_info(struct 
device_domain_info *info)
iommu = info->iommu;
 
if (info->dev) {
+   if (dev_is_pci(info->dev) && sm_supported(iommu))
+   intel_pasid_tear_down_entry(iommu, info->dev,
+   PASID_RID2PASID);
+
iommu_disable_dev_iotlb(info);
domain_context_clear(iommu, info->dev);
intel_pasid_free_table(info->dev);
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
index 3c70522091d3..d6f4fead4491 100644
--- a/drivers/iommu/intel-pasid.h
+++ b/drivers/iommu/intel-pasid.h
@@ -10,6 +10,7 @@
 #ifndef __INTEL_PASID_H
 #define __INTEL_PASID_H
 
+#define PASID_RID2PASID    0x0
+#define PASID_MIN  0x1
+#define PASID_MAX  0x100000
 #define PASID_PTE_MASK 0x3F
-- 
2.17.1



[PATCH v6 01/12] iommu/vt-d: Enumerate the scalable mode capability

2018-12-09 Thread Lu Baolu
The Intel vt-d spec rev3.0 introduces a new translation
mode called scalable mode, which enables PASID-granular
translations for first level, second level, nested and
pass-through modes. At the same time, the previous
Extended Context (ECS) mode is deprecated (no production
ever implements ECS).

This patch adds enumeration for Scalable Mode and removes
the deprecated ECS enumeration. It provides a boot time
option to disable scalable mode even if the hardware claims
to support it.
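
For example, with this patch applied, scalable mode can be turned off
from the kernel command line using the option documented in the hunk
below:

        intel_iommu=on,sm_off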

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
Reviewed-by: Kevin Tian 
---
 .../admin-guide/kernel-parameters.txt | 12 ++--
 drivers/iommu/intel-iommu.c   | 64 +--
 include/linux/intel-iommu.h   |  1 +
 3 files changed, 24 insertions(+), 53 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index aefd358a5ca3..70384d8682ea 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1683,12 +1683,12 @@
By default, super page will be supported if Intel IOMMU
has the capability. With this option, super page will
not be supported.
-   ecs_off [Default Off]
-   By default, extended context tables will be supported if
-   the hardware advertises that it has support both for the
-   extended tables themselves, and also PASID support. With
-   this option set, extended tables will not be used even
-   on hardware which claims to support them.
+   sm_off [Default Off]
+   By default, scalable mode will be supported if the
+   hardware advertises that it has support for the scalable
+   mode translation. With this option set, scalable mode
+   will not be used even on hardware which claims to 
support
+   it.
tboot_noforce [Default Off]
Do not force the Intel IOMMU enabled under tboot.
By default, tboot will force Intel IOMMU on, which
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index f3afab82f3ee..c0ffde6cb21a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -405,38 +405,16 @@ static int dmar_map_gfx = 1;
 static int dmar_forcedac;
 static int intel_iommu_strict;
 static int intel_iommu_superpage = 1;
-static int intel_iommu_ecs = 1;
-static int intel_iommu_pasid28;
+static int intel_iommu_sm = 1;
 static int iommu_identity_mapping;
 
 #define IDENTMAP_ALL   1
 #define IDENTMAP_GFX   2
 #define IDENTMAP_AZALIA4
 
-/* Broadwell and Skylake have broken ECS support — normal so-called "second
- * level" translation of DMA requests-without-PASID doesn't actually happen
- * unless you also set the NESTE bit in an extended context-entry. Which of
- * course means that SVM doesn't work because it's trying to do nested
- * translation of the physical addresses it finds in the process page tables,
- * through the IOVA->phys mapping found in the "second level" page tables.
- *
- * The VT-d specification was retroactively changed to change the definition
- * of the capability bits and pretend that Broadwell/Skylake never happened...
- * but unfortunately the wrong bit was changed. It's ECS which is broken, but
- * for some reason it was the PASID capability bit which was redefined (from
- * bit 28 on BDW/SKL to bit 40 in future).
- *
- * So our test for ECS needs to eschew those implementations which set the old
- * PASID capabiity bit 28, since those are the ones on which ECS is broken.
- * Unless we are working around the 'pasid28' limitations, that is, by putting
- * the device into passthrough mode for normal DMA and thus masking the bug.
- */
-#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
-   (intel_iommu_pasid28 || 
!ecap_broken_pasid(iommu->ecap)))
-/* PASID support is thus enabled if ECS is enabled and *either* of the old
- * or new capability bits are set. */
-#define pasid_enabled(iommu) (ecs_enabled(iommu) &&\
- (ecap_pasid(iommu->ecap) || 
ecap_broken_pasid(iommu->ecap)))
+#define sm_supported(iommu)    (intel_iommu_sm && ecap_smts((iommu)->ecap))
+#define pasid_supported(iommu) (sm_supported(iommu) && \
+ecap_pasid((iommu)->ecap))
 
 int intel_iommu_gfx_mapped;
 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
@@ -516,15 +494,9 @@ static int __init intel_iommu_setup(char *str)

[PATCH v6 00/12] iommu/vt-d: Add scalable mode support

2018-12-09 Thread Lu Baolu
Hi,

Intel vt-d rev3.0 [1] introduces a new translation mode called
'scalable mode', which enables PASID-granular translations for
first level, second level, nested and pass-through modes. The
vt-d scalable mode is the key ingredient to enable Scalable I/O
Virtualization (Scalable IOV) [2] [3], which allows sharing a
device at the minimal possible granularity (ADI - Assignable Device
Interface). It also includes all the capabilities required to
enable Shared Virtual Addressing (SVA). As a result, previous
Extended Context (ECS) mode is deprecated (no production ever
implements ECS).

Each scalable mode pasid table entry is 64 bytes in length, with
fields pointing to the first level page table and the second level
page table. The PGTT (Pasid Granular Translation Type) field is
used by hardware to determine the translation type.


  [Diagram: a scalable mode PASID entry. The FLPTR field points to
   the first level page table, the SLPTR field points to the second
   level page table, and the PGTT field selects among the
   translation types listed below.]

  PASID Granular Translation Type (PGTT):
    001b: 1st level translation only
    101b: 2nd level translation only
    011b: Nested translation
    100b: Pass through
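
A sketch of the entry layout described above; the struct matches the
64-byte (8 quad word) entry used by this series, while the enum names
are illustrative:

        struct pasid_entry {
                u64 val[8];             /* 64 bytes, 8 quad words */
        };

        enum pgtt {                     /* PGTT field, bits 8:6 of val[0] */
                PGTT_FL_ONLY = 1,       /* 001b: 1st level only */
                PGTT_NESTED  = 3,       /* 011b: nested */
                PGTT_PT      = 4,       /* 100b: pass through */
                PGTT_SL_ONLY = 5,       /* 101b: 2nd level only */
        };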

This patch series adds the scalable mode support in the Intel
IOMMU driver. It will make all the Intel IOMMU features work
in scalable mode. The changes are all constrained within the
Intel IOMMU driver, as it's purely internal format change.

References:
[1] 
https://software.intel.com/en-us/download/intel-virtualization-technology-for-directed-io-architecture-specification
[2] 
https://software.intel.com/en-us/download/intel-scalable-io-virtualization-technical-specification
[3] https://schd.ws/hosted_files/lc32018/00/LC3-SIOV-final.pdf

Change log:
v5->v6:
  - [02/12] Move pasid table allocation out of lock range
and replace GFP_ATOMIC with GFP_KERNEL when
allocating the pasid memory.

v4->v5:
  - [04/12] Add a comment to explain why we only print
two QWORDs of an invalid descriptor.
  - [06/12] Fix domain agaw being less than iommu agaw.
  - [09/12] Move data type to local branch.
  - [11/12] Hold iommu lock when setup pasid entry for
SVM.
  - All rebased on top of vt-d branch of iommu tree.

v3->v4:
  - Rebase all patches to 4.20-rc1.
  - Use the right PASID directory table size.
  - Add a function for pass through translation setting up.
  - Refine the parameters passed to first level translation
mode interface.

v2->v3:
  - Rebase all patches on top of vt-d branch of iommu repo.
  - Set the pasid directory table size to 1 page for devices
which have no pasid support.
  - Addressed various comments received during the v2 review
period; all were code style related.

v1->v2:
  - Rebase all patches on top of v4.19-rc1;
  - Add 256-bit invalidation descriptor support;
  - Reserve a domain id for first level and pass-through
usage to make hardware caching of entries more efficient;
  - Various code refinements.

Lu Baolu (12):
  iommu/vt-d: Enumerate the scalable mode capability
  iommu/vt-d: Manage scalable mode PASID tables
  iommu/vt-d: Move page table helpers into header
  iommu/vt-d: Add 256-bit invalidation descriptor support
  iommu/vt-d: Reserve a domain id for FL and PT modes
  iommu/vt-d: Add second level page table interface
  iommu/vt-d: Setup pasid entry for RID2PASID support
  iommu/vt-d: Pass pasid table to context mapping
  iommu/vt-d: Setup context and enable RID2PASID support
  iommu/vt-d: Add first level page table interface
  iommu/vt-d: Shared virtual address in scalable mode
  iommu/vt-d: Remove deferred invalidation

 .../admin-guide/kernel-parameters.txt |  12 +-
 drivers/iommu/dmar.c  |  91 ++--
 drivers/iommu/intel-iommu.c   | 351 +++---
 drivers/iommu/intel-pasid.c   | 449 +-
 drivers/i

[PATCH v6 02/12] iommu/vt-d: Manage scalable mode PASID tables

2018-12-09 Thread Lu Baolu
In scalable mode, the pasid structure is a two level table with
a pasid directory table and a pasid table. Any pasid entry
can be identified by a pasid value as shown below.

  [Diagram: a two level PASID structure. The context entry points to
   a pasid directory table; bits 19:6 of the pasid value select a
   directory entry, which points to a pasid table, and bits 5:0
   select the pasid entry within that table.]

This changes the pasid table APIs to support scalable mode
PASID directory and PASID table. It also adds a helper to
get the PASID table entry according to the pasid value.
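
A minimal sketch of the walk, using the PASID_PDE_SHIFT (6) and
PASID_PTE_MASK (0x3F) definitions from intel-pasid.h; the
pde-dereferencing helper name is illustrative:

        struct pasid_dir_entry *dir = pasid_table->table;
        int dir_index = pasid >> PASID_PDE_SHIFT;       /* pasid[19:6] */
        int index = pasid & PASID_PTE_MASK;             /* pasid[5:0]  */
        struct pasid_entry *entries;

        entries = get_pasid_table_from_pde(&dir[dir_index]);
        return &entries[index];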

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c | 23 ++
 drivers/iommu/intel-pasid.c | 87 ++---
 drivers/iommu/intel-pasid.h | 12 -
 drivers/iommu/intel-svm.c   |  6 +--
 4 files changed, 97 insertions(+), 31 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index c0ffde6cb21a..73d7c15bf737 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -425,21 +425,24 @@ static LIST_HEAD(device_domain_list);
 
 /*
  * Iterate over elements in device_domain_list and call the specified
- * callback @fn against each element. This helper should only be used
- * in the context where the device_domain_lock has already been holden.
+ * callback @fn against each element.
  */
 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
 void *data), void *data)
 {
int ret = 0;
+   unsigned long flags;
struct device_domain_info *info;
 
-   assert_spin_locked(_domain_lock);
+   spin_lock_irqsave(&device_domain_lock, flags);
list_for_each_entry(info, _domain_list, global) {
ret = fn(info, data);
-   if (ret)
+   if (ret) {
+   spin_unlock_irqrestore(&device_domain_lock, flags);
return ret;
+   }
}
+   spin_unlock_irqrestore(&device_domain_lock, flags);
 
return 0;
 }
@@ -2481,16 +2484,18 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
list_add(&info->global, &device_domain_list);
if (dev)
dev->archdata.iommu = info;
+   spin_unlock_irqrestore(&device_domain_lock, flags);
 
-   if (dev && dev_is_pci(dev) && info->pasid_supported) {
+   /* PASID table is mandatory for a PCI device in scalable mode. */
+   if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
ret = intel_pasid_alloc_table(dev);
if (ret) {
-   pr_warn("No pasid table for %s, pasid disabled\n",
-   dev_name(dev));
-   info->pasid_supported = 0;
+   pr_err("PASID table allocation for %s failed\n",
+  dev_name(dev));
+   dmar_remove_one_dev_info(domain, dev);
+   return NULL;
}
}
-   spin_unlock_irqrestore(&device_domain_lock, flags);
 
if (dev && domain_context_mapping(domain, dev)) {
pr_err("Domain context map for %s failed\n", dev_name(dev));
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
index fe95c9bd4d33..fd3ccc0753b0 100644
--- a/drivers/iommu/intel-pasid.c
+++ b/drivers/iommu/intel-pasid.c
@@ -123,12 +123,13 @@ int intel_pasid_alloc_table(struct device *dev)
struct pasid_table *pasid_table;
struct pasid_table_opaque data;
struct page *pages;
-   size_t size, count;
+   int max_pasid = 0;
int ret, order;
+   int size;
 
+   might_sleep();
info = dev->archdata.iommu;
-   if (WARN_ON(!info || !dev_is_pci(dev) ||
-   !info->pasid_supported || info->pasid_table))
+   if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table))
return -EINVAL;
 
/* DMA alias device already has a pasid table, use it: */
@@ -138,23 +139,25 @@ int intel_pasid_alloc_table(struct device *dev)
if (ret)
goto attach_out;
 
-   

[PATCH v6 04/12] iommu/vt-d: Add 256-bit invalidation descriptor support

2018-12-09 Thread Lu Baolu
Intel vt-d spec rev3.0 requires software to use 256-bit
descriptors in invalidation queue. As the spec reads in
section 6.5.2:

Remapping hardware supporting Scalable Mode Translations
(ECAP_REG.SMTS=1) allow software to additionally program
the width of the descriptors (128-bits or 256-bits) that
will be written into the Queue. Software should setup the
Invalidation Queue for 256-bit descriptors before
programming remapping hardware for scalable-mode translation as
128-bit descriptors are treated as invalid descriptors
(see Table 21 in Section 6.5.2.10) in scalable-mode.

This patch adds 256-bit invalidation descriptor support
if the hardware presents scalable mode capability.
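
The core idea is that a queue index is converted to a byte offset with
a width-dependent shift, so the same code handles both descriptor
sizes; a sketch using the qi_shift() helper from this patch:

        int shift = qi_shift(iommu);    /* 4 for 128-bit, 5 for 256-bit */
        int length = 1 << shift;        /* descriptor size in bytes */

        memcpy(qi->desc + (index << shift), desc, length);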

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Liu Yi L 
Signed-off-by: Lu Baolu 
---
 drivers/iommu/dmar.c| 91 +++--
 drivers/iommu/intel-svm.c   | 76 +++-
 drivers/iommu/intel_irq_remapping.c |  6 +-
 include/linux/intel-iommu.h |  9 ++-
 4 files changed, 121 insertions(+), 61 deletions(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index d9c748b6f9e4..9511f9aeb77c 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1160,6 +1160,7 @@ static int qi_check_fault(struct intel_iommu *iommu, int 
index)
int head, tail;
struct q_inval *qi = iommu->qi;
int wait_index = (index + 1) % QI_LENGTH;
+   int shift = qi_shift(iommu);
 
if (qi->desc_status[wait_index] == QI_ABORT)
return -EAGAIN;
@@ -1173,13 +1174,19 @@ static int qi_check_fault(struct intel_iommu *iommu, 
int index)
 */
if (fault & DMA_FSTS_IQE) {
head = readl(iommu->reg + DMAR_IQH_REG);
-   if ((head >> DMAR_IQ_SHIFT) == index) {
-   pr_err("VT-d detected invalid descriptor: "
-   "low=%llx, high=%llx\n",
-   (unsigned long long)qi->desc[index].low,
-   (unsigned long long)qi->desc[index].high);
-   memcpy(>desc[index], >desc[wait_index],
-   sizeof(struct qi_desc));
+   if ((head >> shift) == index) {
+   struct qi_desc *desc = qi->desc + head;
+
+   /*
+* desc->qw2 and desc->qw3 are either reserved or
+* used by software as private data. We won't print
+* out these two qw's for security consideration.
+*/
+   pr_err("VT-d detected invalid descriptor: qw0 = %llx, 
qw1 = %llx\n",
+  (unsigned long long)desc->qw0,
+  (unsigned long long)desc->qw1);
+   memcpy(desc, qi->desc + (wait_index << shift),
+  1 << shift);
writel(DMA_FSTS_IQE, iommu->reg + DMAR_FSTS_REG);
return -EINVAL;
}
@@ -1191,10 +1198,10 @@ static int qi_check_fault(struct intel_iommu *iommu, 
int index)
 */
if (fault & DMA_FSTS_ITE) {
head = readl(iommu->reg + DMAR_IQH_REG);
-   head = ((head >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+   head = ((head >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
head |= 1;
tail = readl(iommu->reg + DMAR_IQT_REG);
-   tail = ((tail >> DMAR_IQ_SHIFT) - 1 + QI_LENGTH) % QI_LENGTH;
+   tail = ((tail >> shift) - 1 + QI_LENGTH) % QI_LENGTH;
 
writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG);
 
@@ -1222,15 +1229,14 @@ int qi_submit_sync(struct qi_desc *desc, struct 
intel_iommu *iommu)
 {
int rc;
struct q_inval *qi = iommu->qi;
-   struct qi_desc *hw, wait_desc;
+   int offset, shift, length;
+   struct qi_desc wait_desc;
int wait_index, index;
unsigned long flags;
 
if (!qi)
return 0;
 
-   hw = qi->desc;
-
 restart:
rc = 0;
 
@@ -1243,16 +1249,21 @@ int qi_submit_sync(struct qi_desc *desc, struct 
intel_iommu *iommu)
 
index = qi->free_head;
wait_index = (index + 1) % QI_LENGTH;
+   shift = qi_shift(iommu);
+   length = 1 << shift;
 
qi->desc_status[index] = qi->desc_status[wait_index] = QI_IN_USE;
 
-   hw[index] = *desc;
-
-   wait_desc.low = QI_IWD_STATUS_DATA(QI_DONE) |
+   offset = index << shift;
+   memcpy(qi->desc + offset, desc, length);
+   wait_desc.qw0 = QI_IWD_STATUS_DATA(QI_DONE) |
QI_IWD_STATUS_WRITE | QI_IWD_TYPE;
-   wait_desc.high = virt_to_phys(&qi->desc_status[wait_index]);
+   wait_

[PATCH v4 4/8] iommu/vt-d: Attach/detach domains in auxiliary mode

2018-11-04 Thread Lu Baolu
When multiple domains per device have been enabled by the
device driver, the device will tag the default PASID for
the domain onto all DMA traffic out of the subset of this
device, and the IOMMU should translate the DMA requests
at PASID granularity.

This extends the intel_iommu_attach/detach_device() ops
to support managing PASID granular translation structures
when the device driver has enabled multiple domains per
device.
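
A minimal sketch of the intended caller flow (names follow the
aux-domain API of the final version of this work, not necessarily this
v4 posting); error handling trimmed:

        ret = iommu_aux_attach_device(domain, dev);
        if (!ret) {
                int pasid = iommu_aux_get_pasid(domain, dev);

                /* program @pasid into the parent device for this subset */
        }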

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Signed-off-by: Sanjay Kumar 
Signed-off-by: Lu Baolu 
Signed-off-by: Liu Yi L 
---
 drivers/iommu/intel-iommu.c | 192 +++-
 include/linux/intel-iommu.h |  10 ++
 2 files changed, 180 insertions(+), 22 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 2c86ac71c774..a61b25ad0d3b 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2477,6 +2477,7 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
info->iommu = iommu;
info->pasid_table = NULL;
info->auxd_enabled = 0;
+   INIT_LIST_HEAD(>auxiliary_domains);
 
if (dev && dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(info->dev);
@@ -5010,35 +5011,134 @@ static void intel_iommu_domain_free(struct 
iommu_domain *domain)
domain_exit(to_dmar_domain(domain));
 }
 
-static int intel_iommu_attach_device(struct iommu_domain *domain,
-struct device *dev)
+/*
+ * Check whether a @domain will be attached to the @dev in the
+ * auxiliary mode.
+ */
+static inline bool
+is_device_attach_aux_domain(struct device *dev, struct iommu_domain *domain)
 {
-   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-   struct intel_iommu *iommu;
-   int addr_width;
-   u8 bus, devfn;
+   struct device_domain_info *info = dev->archdata.iommu;
 
-   if (device_is_rmrr_locked(dev)) {
-   dev_warn(dev, "Device is ineligible for IOMMU domain attach due 
to platform RMRR requirement.  Contact your platform vendor.\n");
-   return -EPERM;
-   }
+   return info && info->auxd_enabled &&
+   domain->type == IOMMU_DOMAIN_UNMANAGED;
+}
 
-   /* normally dev is not mapped */
-   if (unlikely(domain_context_mapped(dev))) {
-   struct dmar_domain *old_domain;
+static void auxiliary_link_device(struct dmar_domain *domain,
+ struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
 
-   old_domain = find_domain(dev);
-   if (old_domain) {
-   rcu_read_lock();
-   dmar_remove_one_dev_info(old_domain, dev);
-   rcu_read_unlock();
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
 
-   if (!domain_type_is_vm_or_si(old_domain) &&
-    list_empty(&old_domain->devices))
-   domain_exit(old_domain);
+   domain->auxd_refcnt++;
+   list_add(&domain->auxd, &info->auxiliary_domains);
+}
+
+static void auxiliary_unlink_device(struct dmar_domain *domain,
+   struct device *dev)
+{
+   struct device_domain_info *info = dev->archdata.iommu;
+
+   assert_spin_locked(&device_domain_lock);
+   if (WARN_ON(!info))
+   return;
+
+   list_del(&domain->auxd);
+   domain->auxd_refcnt--;
+
+   if (!domain->auxd_refcnt && domain->default_pasid > 0)
+   intel_pasid_free_id(domain->default_pasid);
+}
+
+static int domain_add_dev_auxd(struct dmar_domain *domain,
+  struct device *dev)
+{
+   int ret;
+   u8 bus, devfn;
+   unsigned long flags;
+   struct intel_iommu *iommu;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+
+   spin_lock_irqsave(&device_domain_lock, flags);
+   if (domain->default_pasid <= 0) {
+   domain->default_pasid = intel_pasid_alloc_id(domain, PASID_MIN,
+   pci_max_pasids(to_pci_dev(dev)), GFP_ATOMIC);
+   if (domain->default_pasid < 0) {
+   pr_err("Can't allocate default pasid\n");
+   ret = -ENODEV;
+   goto pasid_failed;
}
}
 
+   spin_lock(&iommu->lock);
+   ret = domain_attach_iommu(domain, iommu);
+   if (ret)
+   goto attach_failed;
+
+   /* Setup the PASID entry for mediated devices: */
+   ret = intel_pasid_setup_second_level(iommu, domain, dev,
+domain->default_pasid);
+   if (ret)
+   goto table_failed;
+   spin_unlock(&iommu->lock);

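As a usage illustration only (hypothetical driver code, not part of
this patch): a mediated-device style driver could attach a domain in
auxiliary mode and program the returned default PASID into its device.
The iommu_dev_feature_enabled()/iommu_aux_*() names follow the API
proposed in this series; my_device_set_pasid() is invented.

/* Hypothetical usage sketch; error handling trimmed. */
static int my_mdev_enable_aux(struct device *dev,
			      struct iommu_domain *domain)
{
	int ret, pasid;

	if (!iommu_dev_feature_enabled(dev, IOMMU_DEV_FEAT_AUX))
		return -ENODEV;

	ret = iommu_aux_attach_device(domain, dev);
	if (ret)
		return ret;

	pasid = iommu_aux_get_pasid(domain, dev);
	if (pasid < 0) {
		iommu_aux_detach_device(domain, dev);
		return pasid;
	}

	/* Device specific: tag DMA from this subset with @pasid,
	 * e.g. by writing it to a per-queue register.
	 */
	my_device_set_pasid(dev, pasid);
	return 0;
}
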
Re: [PATCH v2 0/9] iommu/vt-d: Improve PASID id and table management

2018-05-16 Thread Lu Baolu
Hi,

On 05/16/2018 04:56 PM, Tian, Kevin wrote:
>> From: Lu Baolu [mailto:baolu...@linux.intel.com]
>> Sent: Wednesday, May 16, 2018 4:01 PM
>>
>> Hi Joerg,
>>
>> Thank you for looking at my patches.
>>
>> On 05/15/2018 10:11 PM, Joerg Roedel wrote:
>>> On Fri, May 04, 2018 at 09:41:15AM +0800, Lu Baolu wrote:
>>>> PATCH 4~9 implement per domain PASID table. Current per IOMMU
>>>> PASID table implementation is insecure in the cases where
>>>> multiple devices under one single IOMMU unit support PASID
>>>> feature. With per domain PASID table, we can achieve finer
>>>> protection and isolation granularity.
>>> Hold on, we had discussions in the past about doing a system-wide pasid
>>> space, so that every mm_struct with devices attached gets the same pasid
>>> across all devices it is talking to. Reason was that some devices (will)
>>> require this to work correctly. This goes into the opposite direction,
>>> so I am a bit confused here. Please explain, is this no longer
>>> necessary?
>> You are right. System-wide pasid space is necessary, hence PATCH
>> 1~3 implement it. But PATCH 4~9 don't go in the opposite direction;
>> they are designed to address another potential issue.
> one thing you may want to highlight is, even with PATCH 4-9 it's
> still doing system-wide PASID space allocation. Just PASID table
> itself is kept per-device for isolation purpose as you described
> below, i.e. each device can access only those PASIDs which are
> allocated to itself while the allocation happens system-wide...

Yes, exactly.

Best regards,
Lu Baolu

>
>> With system-wide pasid space, we can use a system-wide pasid table,
>> or just keep what we have now (per iommu unit pasid table). Both
>> system-wide and per iommu unit pasid tables mean that two devices
>> might share a single pasid table. That will result in an issue.
>>
>> For example, device A is assigned to access the memory space of
>> process A, and device B is assigned to access the memory space of
>> process B. The dma remapping infrastructure looks like:
>>
>> [ASCII diagram, garbled in the archive: the Intel IOMMU context
>>  table entries for Dev_A and Dev_B both point to one system-wide
>>  pasid table; its PASID X entry points to the paging structure
>>  for Process A and its PASID Y entry to the paging structure
>>  for Process B.]
>>
>>
>> Since dev_A and dev_B share a pasid table, the side effect is that a
>> flawed dev_A might access the memory space of process B (with pasid y)
>> and, vice versa, a flawed dev_B might access the memory space of
>> process A (with pasid x).
>>
>> What PATCH 4~9 do is remove this possibility by assigning a pasid
>> table to each pci device. Hence, the remapping infrastructure looks like:
>>
>>
>> [ASCII diagram, garbled and truncated in the archive: each device
>>  context entry now points to its own per-device pasid table, which
>>  in turn points to the paging structures of the process(es)
>>  assigned to that device.]

Re: [PATCH v2 0/9] iommu/vt-d: Improve PASID id and table management

2018-05-16 Thread Lu Baolu
Hi Joerg,

Thank you for looking at my patches.

On 05/15/2018 10:11 PM, Joerg Roedel wrote:
> On Fri, May 04, 2018 at 09:41:15AM +0800, Lu Baolu wrote:
>> PATCH 4~9 implement per domain PASID table. Current per IOMMU
>> PASID table implementation is insecure in the cases where
>> multiple devices under one single IOMMU unit support PASID
>> feature. With per domain PASID table, we can achieve finer
>> protection and isolation granularity.
>
> Hold on, we had discussions in the past about doing a system-wide pasid
> space, so that every mm_struct with devices attached gets the same pasid
> across all devices it is talking to. Reason was that some devices (will)
> require this to work correctly. This goes into the opposite direction,
> so I am a bit confused here. Please explain, is this no longer
> necessary?

You are right. System-wide pasid space is necessary, hence PATCH
1~3 implement it. But PATCH 4~9 don't go in the opposite direction;
they are designed to address another potential issue.

With system-wide pasid space, we can use a system-wide pasid table,
or just keep what we have now (per iommu unit pasid table). Both
system-wide and per iommu unit pasid tables mean that two devices
might share a single pasid table. That will result in an issue.

For example, device A is assigned to access the memory space of
process A, and device B is assigned to access the memory space of
process B. The dma remapping infrastructure looks like:

[ASCII diagram, garbled in the archive: the Intel IOMMU context table
entries for Dev_A and Dev_B both point to one system-wide pasid table;
its PASID X entry points to the paging structure for Process A and its
PASID Y entry to the paging structure for Process B.]


Since dev_A and dev_B share a pasid table, the side effect is that a flawed
dev_A might access the memory space of process B (with pasid y) and, vice
versa, a flawed dev_B might access the memory space of process A (with
pasid x).

What PATCH 4~9 do is remove this possibility by assigning a pasid table
to each pci device. Hence, the remapping infrastructure looks like:


[ASCII diagram, garbled and truncated in the archive: each device
context entry now points to its own per-device pasid table (e.g. a
"pasid table for Dev_A"), which in turn points to the paging
structures of the process(es) assigned to that device.]

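To make the isolation property concrete, here is a simplified model
(illustration only, not the actual vt-d structures) of the two layouts.
PASID values still come from one system-wide number space; only the
table a device is allowed to walk differs:

struct pasid_entry_sketch {
	void *paging_structure;		/* per-process page table root */
};

/* Shared layout: one table reachable from every device context,
 * so any device can use any installed PASID entry.
 */
struct pasid_entry_sketch system_wide_table[1024];

/* Isolated layout: each device context points at a private table,
 * so a device can only use entries installed for it.
 */
struct device_context_sketch {
	struct pasid_entry_sketch *pasid_table;	/* per-device */
	unsigned int max_pasid;
};
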
Re: [PATCH v5 13/23] iommu: introduce device fault report API

2018-05-15 Thread Lu Baolu
Hi,

On 05/15/2018 04:55 AM, Jacob Pan wrote:
> On Mon, 14 May 2018 14:01:06 +0800
> Lu Baolu <baolu...@linux.intel.com> wrote:
>
>> Hi,
>>
>> On 05/12/2018 04:54 AM, Jacob Pan wrote:
>>> Traditionally, device specific faults are detected and handled
>>> within their own device drivers. When IOMMU is enabled, faults such
>>> as DMA related transactions are detected by IOMMU. There is no
>>> generic reporting mechanism to report faults back to the in-kernel
>>> device driver or the guest OS in case of assigned devices.
>>>
>>> Faults detected by the IOMMU are based on the transaction's source ID,
>>> which can be reported on a per-device basis regardless of whether the
>>> device is a PCI device or not.
>>>
>>> The fault types include recoverable (e.g. page request) and
>>> unrecoverable faults (e.g. access error). In most cases, faults can
>>> be handled by IOMMU drivers internally. The primary use cases are as
>>> follows:
>>> 1. page request fault originated from an SVM capable device that is
>>> assigned to guest via vIOMMU. In this case, the first level page
>>> tables are owned by the guest. Page request must be propagated to
>>> the guest to let guest OS fault in the pages then send page
>>> response. In this mechanism, the direct receiver of IOMMU fault
>>> notification is VFIO, which can relay notification events to QEMU
>>> or other user space software.
>>>
>>> 2. faults that need more subtle handling by device drivers. Rather
>>> than simply invoking a reset function, there is a need to let the
>>> device driver handle the fault with a smaller impact.
>>>
>>> This patchset is intended to create a generic fault report API such
>>> that it can scale as follows:
>>> - all IOMMU types
>>> - PCI and non-PCI devices
>>> - recoverable and unrecoverable faults
>>> - VFIO and other in-kernel users
>>> - DMA & IRQ remapping (TBD)
>>> The original idea was brought up by David Woodhouse and discussions
>>> summarized at https://lwn.net/Articles/608914/.
>>>
>>> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
>>> Signed-off-by: Ashok Raj <ashok@intel.com>
>>> Signed-off-by: Jean-Philippe Brucker <jean-philippe.bruc...@arm.com>
>>> ---
>>>  drivers/iommu/iommu.c | 149
>>> +-
>>> include/linux/iommu.h |  35 +++- 2 files changed, 181
>>> insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
>>> index 3a49b96..b3f9daf 100644
>>> --- a/drivers/iommu/iommu.c
>>> +++ b/drivers/iommu/iommu.c
>>> @@ -609,6 +609,13 @@ int iommu_group_add_device(struct iommu_group
>>> *group, struct device *dev) goto err_free_name;
>>> }
>>>  
>>> +   dev->iommu_param = kzalloc(sizeof(*dev->iommu_param),
>>> GFP_KERNEL);
>>> +   if (!dev->iommu_param) {
>>> +   ret = -ENOMEM;
>>> +   goto err_free_name;
>>> +   }
>>> +   mutex_init(&dev->iommu_param->lock);
>>> +
>>> kobject_get(group->devices_kobj);
>>>  
>>> dev->iommu_group = group;
>>> @@ -639,6 +646,7 @@ int iommu_group_add_device(struct iommu_group
>>> *group, struct device *dev) mutex_unlock(&group->mutex);
>>> dev->iommu_group = NULL;
>>> kobject_put(group->devices_kobj);
>>> +   kfree(dev->iommu_param);
>>>  err_free_name:
>>> kfree(device->name);
>>>  err_remove_link:
>>> @@ -685,7 +693,7 @@ void iommu_group_remove_device(struct device
>>> *dev) sysfs_remove_link(&dev->kobj, "iommu_group");
>>>  
>>> trace_remove_device_from_group(group->id, dev);
>>> -
>>> +   kfree(dev->iommu_param);
>>> kfree(device->name);
>>> kfree(device);
>>> dev->iommu_group = NULL;
>>> @@ -820,6 +828,145 @@ int iommu_group_unregister_notifier(struct
>>> iommu_group *group,
>>> EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier); 
>>>  /**
>>> + * iommu_register_device_fault_handler() - Register a device fault
>>> handler
>>> + * @dev: the device
>>> + * @handler: the fault handler
>>> + * @data: private data passed as argument to the handler
>>> + *
>>> + * When an IOMMU fault event is received, call this handler with
>>> the fault e

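As a usage illustration of the registration API quoted above
(hypothetical consumer code: the handler body, its assumed
int (*)(struct iommu_fault_event *, void *) signature, and struct
my_ctx are invented):

struct my_ctx {
	struct device *dev;
};

/* Invented handler: log the fault and tell the core it was consumed. */
static int my_fault_handler(struct iommu_fault_event *evt, void *data)
{
	struct my_ctx *ctx = data;

	dev_warn(ctx->dev, "IOMMU fault: type %d addr 0x%llx\n",
		 evt->type, evt->addr);
	return 0;
}

static int my_probe(struct device *dev, struct my_ctx *ctx)
{
	return iommu_register_device_fault_handler(dev, my_fault_handler,
						   ctx);
}

static void my_remove(struct device *dev)
{
	iommu_unregister_device_fault_handler(dev);
}
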
Re: [PATCH v5 17/23] iommu/vt-d: report non-recoverable faults to device

2018-05-14 Thread Lu Baolu
*/
> @@ -1748,10 +1896,11 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
>   pr_err("No free IRQ vectors\n");
>   return -EINVAL;
>   }
> -
>   ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
> - if (ret)
> + if (ret) {
>   pr_err("Can't request irq\n");
> + dmar_free_fault_wq(iommu);
> + }
>   return ret;
>  }
>  
> @@ -1765,7 +1914,7 @@ int __init enable_drhd_fault_handling(void)
>*/
>   for_each_iommu(iommu, drhd) {
>   u32 fault_status;
> - int ret = dmar_set_interrupt(iommu);
> + int ret = dmar_set_interrupt(iommu, false);
>  
>   if (ret) {
>   pr_err("DRHD %Lx: failed to enable fault, interrupt, 
> ret %d\n",
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index 684bd98..3949b3cf 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -3401,10 +3401,10 @@ static int __init init_dmars(void)
>   goto free_iommu;
>   }
>  #endif
> - ret = dmar_set_interrupt(iommu);
> + ret = dmar_set_interrupt(iommu, true);
> +
>   if (ret)
>   goto free_iommu;
> -
>   if (!translation_pre_enabled(iommu))
>   iommu_enable_translation(iommu);
>  
> @@ -4291,7 +4291,7 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
>   goto disable_iommu;
>   }
>  #endif
> - ret = dmar_set_interrupt(iommu);
> + ret = dmar_set_interrupt(iommu, true);
>   if (ret)
>   goto disable_iommu;
>  
> diff --git a/include/linux/dmar.h b/include/linux/dmar.h
> index e2433bc..21f2162 100644
> --- a/include/linux/dmar.h
> +++ b/include/linux/dmar.h
> @@ -278,7 +278,7 @@ extern void dmar_msi_unmask(struct irq_data *data);
>  extern void dmar_msi_mask(struct irq_data *data);
>  extern void dmar_msi_read(int irq, struct msi_msg *msg);
>  extern void dmar_msi_write(int irq, struct msi_msg *msg);
> -extern int dmar_set_interrupt(struct intel_iommu *iommu);
> +extern int dmar_set_interrupt(struct intel_iommu *iommu, bool queue_fault);
>  extern irqreturn_t dmar_fault(int irq, void *dev_id);
>  extern int dmar_alloc_hwirq(int id, int node, void *arg);
>  extern void dmar_free_hwirq(int irq);
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 5ac0c28..b3a26c7 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -472,6 +472,7 @@ struct intel_iommu {
>   struct iommu_device iommu;  /* IOMMU core code handle */
>   int node;
>   u32 flags;  /* Software defined flags */
> + struct workqueue_struct *fault_wq; /* Reporting IOMMU fault to device */
>  };
>  
>  /* PCI domain-device relationship */

Best regards,
Lu Baolu


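To illustrate the fault_wq flow discussed in this thread, here is a
rough sketch of deferring fault reporting to a per-IOMMU workqueue.
Only iommu->fault_wq and iommu_report_device_fault() come from the
posted patches; struct fault_work, fault_work_fn() and queue_fault()
are invented for illustration:

struct fault_work {
	struct work_struct work;
	struct device *dev;
	struct iommu_fault_event evt;
};

static void fault_work_fn(struct work_struct *work)
{
	struct fault_work *fw = container_of(work, struct fault_work, work);

	/* Runs in process context, so a handler that sleeps is fine. */
	iommu_report_device_fault(fw->dev, &fw->evt);
	kfree(fw);
}

/* Called from the fault interrupt path (atomic context). */
static void queue_fault(struct intel_iommu *iommu, struct device *dev,
			struct iommu_fault_event *evt)
{
	struct fault_work *fw = kzalloc(sizeof(*fw), GFP_ATOMIC);

	if (!fw)
		return;
	INIT_WORK(&fw->work, fault_work_fn);
	fw->dev = dev;
	fw->evt = *evt;
	queue_work(iommu->fault_wq, &fw->work);
}
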
Re: [PATCH v5 15/23] iommu: handle page response timeout

2018-05-14 Thread Lu Baolu
Hi,

On 05/12/2018 04:54 AM, Jacob Pan wrote:
> When IO page faults are reported outside IOMMU subsystem, the page
> request handler may fail for various reasons. E.g. a guest received
> page requests but did not have a chance to run for a long time. The
> unresponsive behavior could tie up limited resources on the pending
> device.
> There can be hardware or credit based software solutions as suggested
> in the PCI ATS Ch-4. To provide a basic safety net this patch
> introduces a per device deferrable timer which monitors the longest
> pending page fault that requires a response. Proper action such as
> sending failure response code could be taken when timer expires but not
> included in this patch. We need to consider the life cycle of page
> group IDs to prevent confusion with group IDs reused by a device.
> For now, a warning message provides a clue of such failures.
>
> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
> Signed-off-by: Ashok Raj <ashok@intel.com>
> ---
>  drivers/iommu/iommu.c | 53 
> +++
>  include/linux/iommu.h |  4 
>  2 files changed, 57 insertions(+)
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 02fed3e..1f2f49e 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -827,6 +827,37 @@ int iommu_group_unregister_notifier(struct iommu_group 
> *group,
>  }
>  EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
>  
> +static void iommu_dev_fault_timer_fn(struct timer_list *t)
> +{
> + struct iommu_fault_param *fparam = from_timer(fparam, t, timer);
> + struct iommu_fault_event *evt;
> +
> + u64 now;
> +
> + now = get_jiffies_64();
> +
> + /* The goal is to ensure driver or guest page fault handler(via vfio)
> +  * send page response on time. Otherwise, limited queue resources
> +  * may be occupied by some irresponsive guests or drivers.
> +  * When the per device pending fault list is not empty, we periodically
> check
> +  * if any anticipated page response time has expired.
> +  *
> +  * TODO:
> +  * We could do the following if response time expires:
> +  * 1. send page response code FAILURE to all pending PRQ
> +  * 2. inform device driver or vfio
> +  * 3. drain in-flight page requests and responses for this device
> +  * 4. clear pending fault list such that driver can unregister fault
> +  *handler(otherwise blocked when pending faults are present).
> +  */
> + list_for_each_entry(evt, &fparam->faults, list) {
> + if (time_after64(now, evt->expire))
> + pr_err("Page response time expired!, pasid %d gid %d 
> exp %llu now %llu\n",
> + evt->pasid, evt->page_req_group_id, 
> evt->expire, now);
> + }
> + mod_timer(t, now + prq_timeout);
> +}
> +

This timer scheme is very rough.

The timer expires every 10 seconds (by default).

0       10      20      30      40
+-------+-------+-------+-------+
^  ^  ^  ^      ^
|  |  |  |      |
F0 F1 F2 F3     (F1, F2, F3 will not be handled until here!)

F0, F1, F2 and F3 are four page faults that happen during the [0, 10s)
time window. The F1, F2 and F3 timeouts won't be handled until the timer
expires again at 20s. That means a fault might stay pending for up to
about (2 * prq_timeout) seconds.

Out of curiosity, why not add a timer to iommu_fault_event, start it in
iommu_report_device_fault() and remove it in iommu_page_response()?
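
Something along these lines (a rough sketch of that suggestion,
assuming a timer field is added to struct iommu_fault_event; this is
not code from the series):

static void iommu_fault_event_timer_fn(struct timer_list *t)
{
	struct iommu_fault_event *evt = from_timer(evt, t, timer);

	pr_err("Page response time expired, pasid %d gid %d\n",
	       evt->pasid, evt->page_req_group_id);
	/* TODO: send a FAILURE response and drop the event. */
}

/* In iommu_report_device_fault(), arm one timer per pending fault: */
timer_setup(&evt_pending->timer, iommu_fault_event_timer_fn,
	    TIMER_DEFERRABLE);
mod_timer(&evt_pending->timer, jiffies + prq_timeout);

/* In iommu_page_response(), disarm it when the response arrives: */
del_timer_sync(&evt->timer);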

Best regards,
Lu Baolu


>  /**
>   * iommu_register_device_fault_handler() - Register a device fault handler
>   * @dev: the device
> @@ -879,6 +910,9 @@ int iommu_register_device_fault_handler(struct device 
> *dev,
>   param->fault_param->data = data;
>   INIT_LIST_HEAD(&param->fault_param->faults);
>  
> + if (prq_timeout)
> + timer_setup(&param->fault_param->timer, 
> iommu_dev_fault_timer_fn,
> + TIMER_DEFERRABLE);
>  done_unlock:
>   mutex_unlock(&param->lock);
>  
> @@ -935,6 +969,8 @@ int iommu_report_device_fault(struct device *dev, struct 
> iommu_fault_event *evt)
>  {
>   int ret = 0;
>   struct iommu_fault_event *evt_pending;
> + struct timer_list *tmr;
> + u64 exp;
>   struct iommu_fault_param *fparam;
>  
>   /* iommu_param is allocated when device is added to group */
> @@ -955,7 +991,17 @@ int iommu_report_device_fault(struct device *dev, struct 
> iommu_fault_event *evt)
>   ret = -ENOMEM;
>  

Re: [PATCH v5 14/23] iommu: introduce page response function

2018-05-14 Thread Lu Baolu
Hi,

On 05/12/2018 04:54 AM, Jacob Pan wrote:
> IO page faults can be handled outside the IOMMU subsystem. For example,
> when nested translation is turned on and the guest owns the
> first-level page tables, a device page request can be forwarded
> to the guest for fault handling. When the page response is returned
> by the guest, the IOMMU driver on the host needs to process the
> response, which informs the device and completes the page request
> transaction.
>
> This patch introduces a generic API function for passing page
> responses from the guest or other in-kernel users. The definition of
> the generic data is based on the PCI ATS specification and is not
> limited to any vendor.
>
> Signed-off-by: Jean-Philippe Brucker <jean-philippe.bruc...@arm.com>
> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
> Link: https://lkml.org/lkml/2017/12/7/1725
> ---
>  drivers/iommu/iommu.c | 45 +
>  include/linux/iommu.h | 43 +++
>  2 files changed, 88 insertions(+)
>
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index b3f9daf..02fed3e 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1533,6 +1533,51 @@ int iommu_sva_invalidate(struct iommu_domain *domain,
>  }
>  EXPORT_SYMBOL_GPL(iommu_sva_invalidate);
>  
> +int iommu_page_response(struct device *dev,
> + struct page_response_msg *msg)
> +{
> + struct iommu_param *param = dev->iommu_param;
> + int ret = -EINVAL;
> + struct iommu_fault_event *evt;
> + struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> +
> + if (!domain || !domain->ops->page_response)
> + return -ENODEV;
> +
> + /*
> +  * Device iommu_param should have been allocated when device is
> +  * added to its iommu_group.
> +  */
> + if (!param || !param->fault_param)
> + return -EINVAL;
> +
> + /* Only send response if there is a fault report pending */
> + mutex_lock(&param->fault_param->lock);
> + if (list_empty(&param->fault_param->faults)) {
> + pr_warn("no pending PRQ, drop response\n");
> + goto done_unlock;
> + }
> + /*
> +  * Check if we have a matching page request pending to respond,
> +  * otherwise return -EINVAL
> +  */
> + list_for_each_entry(evt, &param->fault_param->faults, list) {
> + if (evt->pasid == msg->pasid &&
> + msg->page_req_group_id == evt->page_req_group_id) {
> + msg->private_data = evt->iommu_private;
> + ret = domain->ops->page_response(dev, msg);
> + list_del(&evt->list);
> + kfree(evt);
> + break;
> + }
> + }

Are the above two checks duplicated? We won't find a matching
request if the list is empty. And we should print a message
if we can't find the matching request.
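
Something like this hypothetical rework would fold the two checks into
one (iterating an empty list simply finds no match) and warn either
way:

ret = -EINVAL;
mutex_lock(&param->fault_param->lock);
list_for_each_entry(evt, &param->fault_param->faults, list) {
	if (evt->pasid == msg->pasid &&
	    msg->page_req_group_id == evt->page_req_group_id) {
		msg->private_data = evt->iommu_private;
		ret = domain->ops->page_response(dev, msg);
		list_del(&evt->list);
		kfree(evt);
		break;
	}
}
if (ret == -EINVAL)
	pr_warn("no matching pending PRQ, drop response\n");
mutex_unlock(&param->fault_param->lock);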

Best regards,
Lu Baolu

> +
> +done_unlock:
> + mutex_unlock(&param->fault_param->lock);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(iommu_page_response);
> +
>  static void __iommu_detach_device(struct iommu_domain *domain,
> struct device *dev)
>  {
> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> index b3312ee..722b90f 100644
> --- a/include/linux/iommu.h
> +++ b/include/linux/iommu.h
> @@ -163,6 +163,41 @@ struct iommu_resv_region {
>  #ifdef CONFIG_IOMMU_API
>  
>  /**
> + * enum page_response_code - Return status of fault handlers, telling the 
> IOMMU
> + * driver how to proceed with the fault.
> + *
> + * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables
> + *   populated, retry the access. This is "Success" in PCI PRI.
> + * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from
> + *   this device if possible. This is "Response Failure" in PCI PRI.
> + * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the
> + *   access. This is "Invalid Request" in PCI PRI.
> + */
> +enum page_response_code {
> + IOMMU_PAGE_RESP_SUCCESS = 0,
> + IOMMU_PAGE_RESP_INVALID,
> + IOMMU_PAGE_RESP_FAILURE,
> +};
> +
> +/**
> + * Generic page response information based on PCI ATS and PASID spec.
> + * @addr: servicing page address
> + * @pasid: contains process address space ID
> + * @resp_code: response code
> + * @page_req_group_id: page request group index
> + * @private_data: uniquely identify device-specific private data for an
> + *individual page response
> + 

Re: [PATCH v5 13/23] iommu: introduce device fault report API

2018-05-14 Thread Lu Baolu
event {
> + struct list_head list;
>   enum iommu_fault_type type;
>   enum iommu_fault_reason reason;
>   u64 addr;
> @@ -340,10 +342,13 @@ struct iommu_fault_event {
>   * struct iommu_fault_param - per-device IOMMU fault data
>   * @dev_fault_handler: Callback function to handle IOMMU faults at device 
> level
>   * @data: handler private data
> - *
> + * @faults: holds the pending faults which need a response, e.g. page
> response.
> + * @lock: protect pending PRQ event list
>   */
>  struct iommu_fault_param {
>   iommu_dev_fault_handler_t handler;
> + struct list_head faults;
> + struct mutex lock;
>   void *data;
>  };
>  
> @@ -357,6 +362,7 @@ struct iommu_fault_param {
>   *   struct iommu_fwspec *iommu_fwspec;
>   */
>  struct iommu_param {
> + struct mutex lock;
>   struct iommu_fault_param *fault_param;
>  };
>  
> @@ -456,6 +462,14 @@ extern int iommu_group_register_notifier(struct 
> iommu_group *group,
>struct notifier_block *nb);
>  extern int iommu_group_unregister_notifier(struct iommu_group *group,
>  struct notifier_block *nb);
> +extern int iommu_register_device_fault_handler(struct device *dev,
> + iommu_dev_fault_handler_t handler,
> + void *data);
> +
> +extern int iommu_unregister_device_fault_handler(struct device *dev);
> +
> +extern int iommu_report_device_fault(struct device *dev, struct 
> iommu_fault_event *evt);
> +
>  extern int iommu_group_id(struct iommu_group *group);
>  extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
>  extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
> @@ -727,6 +741,23 @@ static inline int iommu_group_unregister_notifier(struct 
> iommu_group *group,
>   return 0;
>  }
>  
> +static inline int iommu_register_device_fault_handler(struct device *dev,
> + iommu_dev_fault_handler_t 
> handler,
> + void *data)
> +{
> + return -ENODEV;
> +}
> +
> +static inline int iommu_unregister_device_fault_handler(struct device *dev)
> +{
> + return 0;
> +}
> +
> +static inline int iommu_report_device_fault(struct device *dev, struct 
> iommu_fault_event *evt)
> +{
> + return -ENODEV;
> +}
> +
>  static inline int iommu_group_id(struct iommu_group *group)
>  {
>   return -ENODEV;

Best regards,
Lu Baolu


Re: [PATCH v5 11/23] driver core: add per device iommu param

2018-05-13 Thread Lu Baolu
Hi,

On 05/12/2018 04:54 AM, Jacob Pan wrote:
> DMA faults can be detected by IOMMU at device level. Adding a pointer
> to struct device allows IOMMU subsystem to report relevant faults
> back to the device driver for further handling.
> For direct assigned device (or user space drivers), guest OS holds
> responsibility to handle and respond per device IOMMU fault.
> Therefore we need fault reporting mechanism to propagate faults beyond
> IOMMU subsystem.
>
> There are two other IOMMU data pointers under struct device today, here
> we introduce iommu_param as a parent pointer such that all device IOMMU
> data can be consolidated here. The idea was suggested here by Greg KH
> and Joerg. The name iommu_param is chosen here since iommu_data has been used.

This doesn't match what you've done in the patch. Maybe you
forgot to clean up? :-)

The idea is to create a parent pointer under the device struct and
move the existing iommu_group and iommu_fwspec, together with the
iommu fault related data, into it.

Best regards,
Lu Baolu

>
> Suggested-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
> Reviewed-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
> Link: https://lkml.org/lkml/2017/10/6/81
> ---
>  include/linux/device.h | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/include/linux/device.h b/include/linux/device.h
> index 4779569..c1b1796 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -41,6 +41,7 @@ struct iommu_ops;
>  struct iommu_group;
>  struct iommu_fwspec;
>  struct dev_pin_info;
> +struct iommu_param;
>  
>  struct bus_attribute {
>   struct attribute attr;
> @@ -899,6 +900,7 @@ struct dev_links_info {
>   *   device (i.e. the bus driver that discovered the device).
>   * @iommu_group: IOMMU group the device belongs to.
>   * @iommu_fwspec: IOMMU-specific properties supplied by firmware.
> + * @iommu_param: Per device generic IOMMU runtime data
>   *
>   * @offline_disabled: If set, the device is permanently online.
>   * @offline: Set after successful invocation of bus type's .offline().
> @@ -988,6 +990,7 @@ struct device {
>   void (*release)(struct device *dev);
>   struct iommu_group  *iommu_group;
>   struct iommu_fwspec *iommu_fwspec;
> + struct iommu_param  *iommu_param;
>  
>   bool offline_disabled:1;
>   bool offline:1;



Re: [PATCH v5 09/23] iommu/vt-d: add svm/sva invalidate function

2018-05-13 Thread Lu Baolu
ty,
> + );
> + if (ret) {
> + pr_err("Invalid range type %d, granu %d\n", inv_info->hdr.type,
> + inv_info->granularity);
> + return ret;
> + }
> +
> + spin_lock(&iommu->lock);
> + spin_lock_irqsave(&device_domain_lock, flags);
> +
> + switch (inv_info->hdr.type) {
> + case IOMMU_INV_TYPE_TLB:
> + if (inv_info->size &&
> + (inv_info->addr & ((1 << (VTD_PAGE_SHIFT + 
> inv_info->size)) - 1))) {
> + pr_err("Addr out of range, addr 0x%llx, size order 
> %d\n",
> + inv_info->addr, inv_info->size);
> + ret = -ERANGE;
> + goto out_unlock;
> + }
> +
> + qi_flush_eiotlb(iommu, did, mm_to_dma_pfn(inv_info->addr),
> + inv_info->pasid,
> + inv_info->size, granu,
> + inv_info->flags & IOMMU_INVALIDATE_GLOBAL_PAGE);
> + /**
> +  * Always flush device IOTLB if ATS is enabled since guest
> +  * vIOMMU exposes CM = 1, no device IOTLB flush will be passed
> +  * down.
> +  */
> + info = iommu_support_dev_iotlb(dmar_domain, iommu, bus, devfn);
> + if (info && info->ats_enabled) {
> + qi_flush_dev_eiotlb(iommu, sid,
> + inv_info->pasid, info->ats_qdep,
> + inv_info->addr, inv_info->size,
> + granu);
> + }
> + break;
> + case IOMMU_INV_TYPE_PASID:
> + qi_flush_pasid(iommu, did, granu, inv_info->pasid);
> +
> + break;
> + default:
> + dev_err(dev, "Unknown IOMMU invalidation type %d\n",
> + inv_info->hdr.type);

There are three types of invalidation:

enum iommu_inv_type {
	IOMMU_INV_TYPE_DTLB,
	IOMMU_INV_TYPE_TLB,
	IOMMU_INV_TYPE_PASID,
	IOMMU_INV_NR_TYPE
};

So "unsupported" looks better than "unknown" in the message.

> + ret = -EINVAL;
> + }
> +out_unlock:
> + spin_unlock(&iommu->lock);
> + spin_unlock_irqrestore(&device_domain_lock, flags);
> +
> + return ret;
> +}
> +
>  static int intel_iommu_map(struct iommu_domain *domain,
>  unsigned long iova, phys_addr_t hpa,
>  size_t size, int iommu_prot)
> @@ -5401,6 +5529,7 @@ const struct iommu_ops intel_iommu_ops = {
>  #ifdef CONFIG_INTEL_IOMMU_SVM
>   .bind_pasid_table   = intel_iommu_bind_pasid_table,
>   .unbind_pasid_table = intel_iommu_unbind_pasid_table,
> + .sva_invalidate = intel_iommu_sva_invalidate,
>  #endif
>   .map= intel_iommu_map,
>   .unmap  = intel_iommu_unmap,

Best regards,
Lu Baolu




Re: [PATCH v5 08/23] iommu/vt-d: support flushing more translation cache types

2018-05-13 Thread Lu Baolu
  0xf
>  
> +/* QI EIOTLB inv granu */
>  #define QI_GRAN_ALL_ALL  0
>  #define QI_GRAN_NONG_ALL 1
>  #define QI_GRAN_NONG_PASID   2
> @@ -504,8 +514,15 @@ extern void qi_flush_context(struct intel_iommu *iommu, 
> u16 did, u16 sid,
>u8 fm, u64 type);
>  extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> unsigned int size_order, u64 type);
> +extern void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> + u32 pasid, unsigned int size_order, u64 type, bool 
> global);
>  extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
>   u16 qdep, u64 addr, unsigned mask);
> +
> +extern void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid,
> + u32 pasid, u16 qdep, u64 addr, unsigned size, u64 
> granu);
> +extern void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, 
> int pasid);
> +
>  extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
>  
>  extern int dmar_ir_support(void);

Best regards,
Lu Baolu




Re: [PATCH v5 07/23] iommu/vt-d: fix dev iotlb pfsid use

2018-05-13 Thread Lu Baolu
Hi,

On 05/12/2018 04:53 AM, Jacob Pan wrote:
> PFSID should be used in the invalidation descriptor for flushing
> device IOTLBs on SRIOV VFs.

This patch could be submitted separately.

>
> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
> ---
>  drivers/iommu/dmar.c|  6 +++---
>  drivers/iommu/intel-iommu.c | 16 +++-
>  include/linux/intel-iommu.h |  5 ++---
>  3 files changed, 20 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
> index 460bed4..7852678 100644
> --- a/drivers/iommu/dmar.c
> +++ b/drivers/iommu/dmar.c
> @@ -1339,8 +1339,8 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, 
> u64 addr,
>   qi_submit_sync(&desc, iommu);
>  }
>  
> -void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> - u64 addr, unsigned mask)
> +void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u16 qdep, u64 addr, unsigned mask)
>  {
>   struct qi_desc desc;
>  
> @@ -1355,7 +1355,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 
> sid, u16 qdep,
>   qdep = 0;
>  
>   desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
> -QI_DIOTLB_TYPE;
> +QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid);
>  
>   qi_submit_sync(&desc, iommu);
>  }
> diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
> index 4623294..732a10f 100644
> --- a/drivers/iommu/intel-iommu.c
> +++ b/drivers/iommu/intel-iommu.c
> @@ -1459,6 +1459,19 @@ static void iommu_enable_dev_iotlb(struct 
> device_domain_info *info)
>   return;
>  
>   pdev = to_pci_dev(info->dev);
> + /* For IOMMU that supports device IOTLB throttling (DIT), we assign
> +  * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
> +  * queue depth at PF level. If DIT is not set, PFSID will be treated as
> +  * reserved, which should be set to 0.
> +  */
> + if (!ecap_dit(info->iommu->ecap))
> + info->pfsid = 0;
> + else if (pdev && pdev->is_virtfn) {
> + if (ecap_dit(info->iommu->ecap))
> + dev_warn(&pdev->dev, "SRIOV VF device IOTLB enabled 
> without flow control\n");

I can't understand these two lines.

Isn't the condition always true? What does the error message mean?
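
If the inner ecap_dit() check really is always true on this path,
the assignment could collapse to something like this (a sketch;
where the warning belongs is a separate question):

	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else if (pdev && pdev->is_virtfn)
		info->pfsid = PCI_DEVID(pdev->physfn->bus->number,
					pdev->physfn->devfn);
	else
		info->pfsid = PCI_DEVID(info->bus, info->devfn);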

> + info->pfsid = PCI_DEVID(pdev->physfn->bus->number, 
> pdev->physfn->devfn);
> + } else
> + info->pfsid = PCI_DEVID(info->bus, info->devfn);
>  
>  #ifdef CONFIG_INTEL_IOMMU_SVM
>   /* The PCIe spec, in its wisdom, declares that the behaviour of
> @@ -1524,7 +1537,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain 
> *domain,
>  
>   sid = info->bus << 8 | info->devfn;
>   qdep = info->ats_qdep;
> - qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
> + qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
> + qdep, addr, mask);

Alignment should match open parenthesis.
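
That is, continuation lines indented to the opening parenthesis:

	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
			   qdep, addr, mask);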

>   }
>   spin_unlock_irqrestore(&device_domain_lock, flags);
>  }
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index dfacd49..678a0f4 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -504,9 +504,8 @@ extern void qi_flush_context(struct intel_iommu *iommu, 
> u16 did, u16 sid,
>u8 fm, u64 type);
>  extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
> unsigned int size_order, u64 type);
> -extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
> -        u64 addr, unsigned mask);
> -
> +extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
> + u16 qdep, u64 addr, unsigned mask);

Alignment should match open parenthesis.

>  extern int qi_submit_sync(struct qi_desc *desc, struct intel_iommu *iommu);
>  
>  extern int dmar_ir_support(void);

Best regards,
Lu Baolu




Re: [PATCH v5 06/23] iommu/vt-d: add definitions for PFSID

2018-05-13 Thread Lu Baolu
Hi,

On 05/12/2018 04:53 AM, Jacob Pan wrote:
> When SRIOV VF device IOTLB is invalidated, we need to provide
> the PF source ID such that IOMMU hardware can gauge the depth
> of invalidation queue which is shared among VFs. This is needed
> when device invalidation throttle (DIT) capability is supported.
>
> This patch adds bit definitions for checking and tracking PFSID.

Patches 6 and 7 could be posted in a separate patch series.

>
> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
> ---
>  include/linux/intel-iommu.h | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index ddc7d79..dfacd49 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -114,6 +114,7 @@
>   * Extended Capability Register
>   */
>  
> +#define ecap_dit(e)  ((e >> 41) & 0x1)
>  #define ecap_pasid(e)((e >> 40) & 0x1)
>  #define ecap_pss(e)  ((e >> 35) & 0x1f)
>  #define ecap_eafs(e) ((e >> 34) & 0x1)
> @@ -284,6 +285,7 @@ enum {
>  #define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
>  #define QI_DEV_IOTLB_QDEP(qdep)  (((qdep) & 0x1f) << 16)
>  #define QI_DEV_IOTLB_ADDR(addr)  ((u64)(addr) & VTD_PAGE_MASK)
> +#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid 
> & 0xff0) << 48))
>  #define QI_DEV_IOTLB_SIZE	1
>  #define QI_DEV_IOTLB_MAX_INVS	32
>  
> @@ -308,6 +310,7 @@ enum {
>  #define QI_DEV_EIOTLB_PASID(p)   (((u64)p) << 32)
>  #define QI_DEV_EIOTLB_SID(sid)   ((u64)((sid) & 0xffff) << 16)
>  #define QI_DEV_EIOTLB_QDEP(qd)   ((u64)((qd) & 0x1f) << 4)
> +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | 
> ((u64)(pfsid & 0xff0) << 48))

PFSID[15:4] are stored in Descriptor [63:52], hence it should look like:

+#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid 
& 0xfff0) << 48))
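
To see the layout: PFSID[3:0] goes to descriptor bits 15:12 and
PFSID[15:4] to bits 63:52. A quick userspace sanity check of the
corrected encoding (PFSID_ENC is a hypothetical stand-in for the
macro above):

	#include <stdint.h>
	#include <assert.h>

	#define PFSID_ENC(pfsid) (((uint64_t)((pfsid) & 0xf) << 12) | \
				  ((uint64_t)((pfsid) & 0xfff0) << 48))

	int main(void)
	{
		uint64_t desc = PFSID_ENC(0xabcd);

		assert(((desc >> 12) & 0xf) == 0xd);	 /* PFSID[3:0]  -> 15:12 */
		assert(((desc >> 52) & 0xfff) == 0xabc); /* PFSID[15:4] -> 63:52 */
		return 0;
	}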



>  #define QI_DEV_EIOTLB_MAX_INVS   32
>  
>  #define QI_PGRP_IDX(idx) (((u64)(idx)) << 55)
> @@ -467,6 +470,7 @@ struct device_domain_info {
>   struct list_head global; /* link to global list */
>   u8 bus; /* PCI bus number */
>   u8 devfn;   /* PCI devfn number */
> + u16 pfsid;  /* SRIOV physical function source ID */
>   u8 pasid_supported:3;
>   u8 pasid_enabled:1;
>   u8 pri_supported:1;

Best regards,
Lu Baolu




Re: [PATCH v5 04/23] iommu/vt-d: add bind_pasid_table function

2018-05-13 Thread Lu Baolu
mutex_lock()
Setup context
pasid_table_bound = 1
mutex_unlock()

   mutex_lock()
   Setup context
   pasid_table_bound = 1
   mutex_unlock()
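
One way to close this race is to test and set the flag while
holding the lock that serializes the context setup, e.g. (a
sketch; 'bind_lock' stands in for whichever lock applies here):

	mutex_lock(&bind_lock);
	if (info->pasid_table_bound) {
		mutex_unlock(&bind_lock);
		return -EBUSY;
	}
	/* ... set up the context ... */
	info->pasid_table_bound = 1;
	mutex_unlock(&bind_lock);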


> + dev_err(dev, "Device PASID table already bound\n");
> + ret = -EBUSY;
> + goto out;
> + }
> + if (!info->pasid_enabled) {
> + ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
> + if (ret) {
> + dev_err(dev, "Failed to enable PASID\n");
> + goto out;
> + }
> + }

I prefer a blank line here.

> + spin_lock_irqsave(&iommu->lock, flags);
> + context = iommu_context_addr(iommu, bus, devfn, 0);
> + if (!context_present(context)) {
> + dev_err(dev, "Context not present\n");
> + ret = -EINVAL;
> + goto out_unlock;
> + }
> +
> + /* Anticipate that the guest uses SVM and owns the first level,
> +  * so we turn nested mode on
> +  */
> + ctx_lo = context[0].lo;
> + ctx_lo |= CONTEXT_NESTE | CONTEXT_PRS | CONTEXT_PASIDE;
> + ctx_lo &= ~CONTEXT_TT_MASK;
> + ctx_lo |= CONTEXT_TT_DEV_IOTLB << 2;
> + context[0].lo = ctx_lo;
> +
> + /* Assign guest PASID table pointer and size order */
> + ctx_lo = (pasidt_binfo->base_ptr & VTD_PAGE_MASK) |
> + (pasidt_binfo->pasid_bits - MIN_NR_PASID_BITS);
> + context[1].lo = ctx_lo;
> + /* make sure context entry is updated before flushing */
> + wmb();
> + did = dmar_domain->iommu_did[iommu->seq_id];
> + iommu->flush.flush_context(iommu, did,
> + (((u16)bus) << 8) | devfn,
> + DMA_CCMD_MASK_NOBIT,
> + DMA_CCMD_DEVICE_INVL);
> + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
> + info->pasid_table_bound = 1;
> +out_unlock:
> + spin_unlock_irqrestore(&iommu->lock, flags);
> +out:
> + return ret;
> +}
> +
> +static void intel_iommu_unbind_pasid_table(struct iommu_domain *domain,
> + struct device *dev)
> +{
> + struct intel_iommu *iommu;
> + struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> + struct device_domain_info *info;
> + u8 bus, devfn;
> +
> + info = dev->archdata.iommu;
> + if (!info) {
> + dev_err(dev, "Invalid device domain info\n");
> + return;
> + }
> + iommu = device_to_iommu(dev, &bus, &devfn);
> + if (!iommu) {
> + dev_err(dev, "No IOMMU for device to unbind PASID table\n");
> + return;
> + }
> +
> + domain_context_clear(iommu, dev);
> +
> + domain_context_mapping_one(dmar_domain, iommu, bus, devfn);
> + info->pasid_table_bound = 0;
> +}
>  #endif /* CONFIG_INTEL_IOMMU_SVM */
>  
>  const struct iommu_ops intel_iommu_ops = {
> @@ -5266,6 +5384,10 @@ const struct iommu_ops intel_iommu_ops = {
>   .domain_free= intel_iommu_domain_free,
>   .attach_dev = intel_iommu_attach_device,
>   .detach_dev = intel_iommu_detach_device,
> +#ifdef CONFIG_INTEL_IOMMU_SVM
> + .bind_pasid_table   = intel_iommu_bind_pasid_table,
> + .unbind_pasid_table = intel_iommu_unbind_pasid_table,
> +#endif
>   .map= intel_iommu_map,
>   .unmap  = intel_iommu_unmap,
>   .map_sg = default_iommu_map_sg,
> diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
> index 21b3e7d..db290b2 100644
> --- a/include/linux/dma_remapping.h
> +++ b/include/linux/dma_remapping.h
> @@ -28,6 +28,7 @@
>  
>  #define CONTEXT_DINVE	(1ULL << 8)
>  #define CONTEXT_PRS  (1ULL << 9)
> +#define CONTEXT_NESTE	(1ULL << 10)
>  #define CONTEXT_PASIDE   (1ULL << 11)
>  
>  struct intel_iommu;

Best regards,
Lu Baolu




Re: [PATCH v5 03/23] iommu/vt-d: add a flag for pasid table bound status

2018-05-13 Thread Lu Baolu
Hi again,

On 05/12/2018 04:53 AM, Jacob Pan wrote:
> Adding a flag in device domain into to track whether a guest or
> user PASID table is bound to a device.
>
> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
> ---
>  include/linux/intel-iommu.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 304afae..ddc7d79 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -473,6 +473,7 @@ struct device_domain_info {
>   u8 pri_enabled:1;
>   u8 ats_supported:1;
>   u8 ats_enabled:1;
> + u8 pasid_table_bound:1;

Can you please add a comment here, so that people can
understand exactly what this bit is for?
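
For example (the wording is only a suggestion):

	u8 pasid_table_bound:1;	/* a guest/user PASID table is bound */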

Best regards,
Lu Baolu

>   u8 ats_qdep;
>   u64 fault_mask; /* selected IOMMU faults to be reported */
>   struct device *dev; /* it's NULL for PCIe-to-PCI bridge */






Re: [PATCH v5 03/23] iommu/vt-d: add a flag for pasid table bound status

2018-05-13 Thread Lu Baolu
Hi,

On 05/12/2018 04:53 AM, Jacob Pan wrote:
> Adding a flag in device domain into to track whether a guest or
typo: "into" should be "info"

Best regards,
Lu Baolu

> user PASID table is bound to a device.
>
> Signed-off-by: Jacob Pan <jacob.jun@linux.intel.com>
> ---
>  include/linux/intel-iommu.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
> index 304afae..ddc7d79 100644
> --- a/include/linux/intel-iommu.h
> +++ b/include/linux/intel-iommu.h
> @@ -473,6 +473,7 @@ struct device_domain_info {
>   u8 pri_enabled:1;
>   u8 ats_supported:1;
>   u8 ats_enabled:1;
> + u8 pasid_table_bound:1;
>   u8 ats_qdep;
>   u64 fault_mask; /* selected IOMMU faults to be reported */
>   struct device *dev; /* it's NULL for PCIe-to-PCI bridge */






Re: [PATCH v3 0/2] iommu/vt-d: Fix mapping PSI missing for iommu_map()

2018-05-04 Thread Lu Baolu
Hi,

On 05/04/2018 10:34 AM, Peter Xu wrote:
> v3:
> - drop the pr_debug patch [Joerg]
> - rename all the subjects as suggested [Joerg]
> - rebase
>
> v2:
> - cc correct people and iommu list
>
> (PSI stands for: Page Selective Invalidations)
>
> Intel IOMMU has the caching mode to ease emulation of the device.
> When that bit is set, we need to send PSIs even for newly mapped
> pages.  However, the current driver does not fully obey the rule.
> E.g., the iommu_map() API only does the mapping but never sends the
> PSIs.  That can be problematic for emulated IOMMU devices, since
> they'll never be able to build up the shadow page tables without
> such information.  This patchset tries to fix the problem.
>
> Patch 1 introduces a helper to notify the MAP PSIs.
>
> Patch 2 fixes the real problem by making sure every domain mapping
> will trigger the MAP PSI notifications.
>
> Without the patchset, nested device assignment (assign one device
> firstly to L1 guest, then to L2 guest) won't work for QEMU.  After
> applying the patchset, it works.
>
> Please review.  Thanks.

Both patches look good to me.
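
For reference, the helper introduced by patch 1 is roughly of this
shape (reconstructed from the description; the posted code may
differ in detail):

	static void __mapping_notify_one(struct intel_iommu *iommu,
					 struct dmar_domain *domain,
					 unsigned long pfn, unsigned int pages)
	{
		/* A non-present to present mapping: flush only in caching mode */
		if (cap_caching_mode(iommu->cap))
			iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
		else
			iommu_flush_write_buffer(iommu);
	}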

Best regards,
Lu Baolu

>
> Peter Xu (2):
>   iommu/vt-d: Introduce __mapping_notify_one()
>   iommu/vt-d: Fix iotlb psi missing for mappings
>
>  drivers/iommu/intel-iommu.c | 65 ++---
>  1 file changed, 46 insertions(+), 19 deletions(-)
>






[PATCH 0/4] iommu/vt-d: Several cleanup patches

2018-05-03 Thread Lu Baolu
Hi,

This includes several cleanup patches which aim to make the
code more concise and easier to read. There are no functional
changes.

Best regards,
Lu Baolu

Lu Baolu (4):
  iommu: Clean up the comments for iommu_group_alloc
  iommu/vt-d: Clean up unused variable in find_or_alloc_domain
  iommu/vt-d: Clean up pasid quirk for pre-production devices
  iommu/vt-d: Remove unnecessary parentheses

 drivers/iommu/intel-iommu.c | 36 +++-
 drivers/iommu/intel-svm.c   |  2 +-
 drivers/iommu/iommu.c   |  1 -
 include/linux/intel-iommu.h |  1 -
 4 files changed, 4 insertions(+), 36 deletions(-)

-- 
2.7.4






[PATCH 2/4] iommu/vt-d: Clean up unused variable in find_or_alloc_domain

2018-05-03 Thread Lu Baolu
The req_id variable in find_or_alloc_domain() is computed but
never used. Remove it to make the code more concise.

Signed-off-by: Lu Baolu <baolu...@linux.intel.com>
---
 drivers/iommu/intel-iommu.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 749d8f2..9064607 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2533,7 +2533,7 @@ static struct dmar_domain *find_or_alloc_domain(struct 
device *dev, int gaw)
struct device_domain_info *info = NULL;
struct dmar_domain *domain = NULL;
struct intel_iommu *iommu;
-   u16 req_id, dma_alias;
+   u16 dma_alias;
unsigned long flags;
u8 bus, devfn;
 
@@ -2541,8 +2541,6 @@ static struct dmar_domain *find_or_alloc_domain(struct 
device *dev, int gaw)
if (!iommu)
return NULL;
 
-   req_id = ((u16)bus << 8) | devfn;
-
if (dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(dev);
 
-- 
2.7.4






[PATCH 4/4] iommu/vt-d: Remove unnecessary parentheses

2018-05-03 Thread Lu Baolu
Remove unnecessary parentheses to comply with preferred coding
style.

Signed-off-by: Lu Baolu <baolu...@linux.intel.com>
---
 drivers/iommu/intel-svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index e8cd984..45f6e58 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -319,7 +319,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
} else
pasid_max = 1 << 20;
 
-   if ((flags & SVM_FLAG_SUPERVISOR_MODE)) {
+   if (flags & SVM_FLAG_SUPERVISOR_MODE) {
if (!ecap_srs(iommu->ecap))
return -EINVAL;
} else if (pasid) {
-- 
2.7.4






[PATCH 3/4] iommu/vt-d: Clean up pasid quirk for pre-production devices

2018-05-03 Thread Lu Baolu
The pasid28 quirk is needed only for some pre-production devices.
Remove it to make the code concise.

Signed-off-by: Ashok Raj <ashok@intel.com>
Signed-off-by: Lu Baolu <baolu...@linux.intel.com>
---
 drivers/iommu/intel-iommu.c | 32 ++--
 include/linux/intel-iommu.h |  1 -
 2 files changed, 2 insertions(+), 31 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 9064607..10bce33 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -485,37 +485,14 @@ static int dmar_forcedac;
 static int intel_iommu_strict;
 static int intel_iommu_superpage = 1;
 static int intel_iommu_ecs = 1;
-static int intel_iommu_pasid28;
 static int iommu_identity_mapping;
 
 #define IDENTMAP_ALL   1
 #define IDENTMAP_GFX   2
>  #define IDENTMAP_AZALIA	4
 
-/* Broadwell and Skylake have broken ECS support — normal so-called "second
- * level" translation of DMA requests-without-PASID doesn't actually happen
- * unless you also set the NESTE bit in an extended context-entry. Which of
- * course means that SVM doesn't work because it's trying to do nested
- * translation of the physical addresses it finds in the process page tables,
- * through the IOVA->phys mapping found in the "second level" page tables.
- *
- * The VT-d specification was retroactively changed to change the definition
- * of the capability bits and pretend that Broadwell/Skylake never happened...
- * but unfortunately the wrong bit was changed. It's ECS which is broken, but
- * for some reason it was the PASID capability bit which was redefined (from
- * bit 28 on BDW/SKL to bit 40 in future).
- *
- * So our test for ECS needs to eschew those implementations which set the old
- * PASID capabiity bit 28, since those are the ones on which ECS is broken.
- * Unless we are working around the 'pasid28' limitations, that is, by putting
- * the device into passthrough mode for normal DMA and thus masking the bug.
- */
-#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
-   (intel_iommu_pasid28 || 
!ecap_broken_pasid(iommu->ecap)))
-/* PASID support is thus enabled if ECS is enabled and *either* of the old
- * or new capability bits are set. */
-#define pasid_enabled(iommu) (ecs_enabled(iommu) &&\
- (ecap_pasid(iommu->ecap) || 
ecap_broken_pasid(iommu->ecap)))
+#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap))
+#define pasid_enabled(iommu)   (ecs_enabled(iommu) && ecap_pasid(iommu->ecap))
 
 int intel_iommu_gfx_mapped;
 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
@@ -578,11 +555,6 @@ static int __init intel_iommu_setup(char *str)
printk(KERN_INFO
"Intel-IOMMU: disable extended context table 
support\n");
intel_iommu_ecs = 0;
-   } else if (!strncmp(str, "pasid28", 7)) {
-   printk(KERN_INFO
-   "Intel-IOMMU: enable pre-production PASID 
support\n");
-   intel_iommu_pasid28 = 1;
-   iommu_identity_mapping |= IDENTMAP_GFX;
} else if (!strncmp(str, "tboot_noforce", 13)) {
printk(KERN_INFO
"Intel-IOMMU: not forcing on after tboot. This 
could expose security risk for tboot\n");
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index ef169d6..1df9401 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -121,7 +121,6 @@
 #define ecap_srs(e)((e >> 31) & 0x1)
 #define ecap_ers(e)((e >> 30) & 0x1)
 #define ecap_prs(e)((e >> 29) & 0x1)
-#define ecap_broken_pasid(e)   ((e >> 28) & 0x1)
 #define ecap_dis(e)((e >> 27) & 0x1)
 #define ecap_nest(e)   ((e >> 26) & 0x1)
 #define ecap_mts(e)((e >> 25) & 0x1)
-- 
2.7.4





[PATCH 1/4] iommu: Clean up the comments for iommu_group_alloc

2018-05-03 Thread Lu Baolu
@name parameter has been removed.

Signed-off-by: Lu Baolu <baolu...@linux.intel.com>
---
 drivers/iommu/iommu.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d2aa2320..d87e7c2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -322,7 +322,6 @@ static struct kobj_type iommu_group_ktype = {
 
 /**
  * iommu_group_alloc - Allocate a new group
- * @name: Optional name to associate with group, visible in sysfs
  *
  * This function is called by an iommu driver to allocate a new iommu
  * group.  The iommu group represents the minimum granularity of the iommu.
-- 
2.7.4






[PATCH v2 1/9] iommu/vt-d: Global PASID name space

2018-05-03 Thread Lu Baolu
This adds the system wide PASID name space for the PASID
allocation. Currently we are using per IOMMU PASID name
spaces, which are not suitable for some use cases. For
example, one application (associated with a PASID) might
talk to two physical devices simultaneously while the two
devices could reside behind two different IOMMU units.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Suggested-by: Ashok Raj 
Signed-off-by: Lu Baolu 
Reviewed-by: Kevin Tian 
Reviewed-by: Liu Yi L 
---
 drivers/iommu/Makefile  |  2 +-
 drivers/iommu/intel-iommu.c | 13 ++
 drivers/iommu/intel-pasid.c | 60 +
 drivers/iommu/intel-pasid.h | 30 +++
 4 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 drivers/iommu/intel-pasid.c
 create mode 100644 drivers/iommu/intel-pasid.h

diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 1fb6958..0a190b4 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
 obj-$(CONFIG_ARM_SMMU) += arm-smmu.o
 obj-$(CONFIG_ARM_SMMU_V3) += arm-smmu-v3.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
-obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o
+obj-$(CONFIG_INTEL_IOMMU) += intel-iommu.o intel-pasid.o
 obj-$(CONFIG_INTEL_IOMMU_SVM) += intel-svm.o
 obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o
 obj-$(CONFIG_IRQ_REMAP) += intel_irq_remapping.o irq_remapping.o
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 749d8f2..98c5ae9 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -53,6 +53,7 @@
 #include 
 
 #include "irq_remapping.h"
+#include "intel-pasid.h"
 
 #define ROOT_SIZE  VTD_PAGE_SIZE
 #define CONTEXT_SIZE   VTD_PAGE_SIZE
@@ -3265,6 +3266,18 @@ static int __init init_dmars(void)
}
 
for_each_active_iommu(iommu, drhd) {
+   /*
+* Find the max pasid size of all IOMMU's in the system.
+* We need to ensure the system pasid table is no bigger
+* than the smallest supported.
+*/
+   if (pasid_enabled(iommu)) {
+   u32 temp = 2 << ecap_pss(iommu->ecap);
+
+   intel_pasid_max_id = min_t(u32, temp,
+  intel_pasid_max_id);
+   }
+
g_iommus[iommu->seq_id] = iommu;
 
intel_iommu_init_qi(iommu);
diff --git a/drivers/iommu/intel-pasid.c b/drivers/iommu/intel-pasid.c
new file mode 100644
index 000..0690f39
--- /dev/null
+++ b/drivers/iommu/intel-pasid.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/**
+ * intel-pasid.c - PASID idr, table and entry manipulation
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu 
+ */
+
+#define pr_fmt(fmt)"DMAR: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "intel-pasid.h"
+
+/*
+ * Intel IOMMU global PASID pool:
+ */
+static DEFINE_SPINLOCK(pasid_lock);
+u32 intel_pasid_max_id = PASID_MAX;
+static DEFINE_IDR(pasid_idr);
+
+int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp)
+{
+   int ret, min, max;
+
+   min = max_t(int, start, PASID_MIN);
+   max = min_t(int, end, intel_pasid_max_id);
+
+   WARN_ON(in_interrupt());
+   idr_preload(gfp);
+   spin_lock(&pasid_lock);
+   ret = idr_alloc(&pasid_idr, ptr, min, max, GFP_ATOMIC);
+   spin_unlock(&pasid_lock);
+   idr_preload_end();
+
+   return ret;
+}
+
+void intel_pasid_free_id(int pasid)
+{
+   spin_lock(&pasid_lock);
+   idr_remove(&pasid_idr, pasid);
+   spin_unlock(&pasid_lock);
+}
+
+void *intel_pasid_lookup_id(int pasid)
+{
+   void *p;
+
+   spin_lock(&pasid_lock);
+   p = idr_find(&pasid_idr, pasid);
+   spin_unlock(&pasid_lock);
+
+   return p;
+}
diff --git a/drivers/iommu/intel-pasid.h b/drivers/iommu/intel-pasid.h
new file mode 100644
index 000..0c36af0
--- /dev/null
+++ b/drivers/iommu/intel-pasid.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * intel-pasid.h - PASID idr, table and entry header
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Lu Baolu 
+ */
+
+#ifndef __INTEL_PASID_H
+#define __INTEL_PASID_H
+
+/*
+ * Eventually I'm promised we will get a multi-level PASID table
+ * and it won't have to be physically contiguous. Until then,
+ * limit the size because 8MiB contiguous allocations can be hard
+ * to come by. The limit of 0x20000, which is 1MiB for each of
+ * the PASID and PASID-state tables, is somewhat arbitrary.
+ *
+ * PASID 0 is reserved in caching mode (virtualised IOMMU).
+ */
+#define PASID_MIN  0x1
+#define PASID_MAX  0x20000
+
+extern u32 intel_pasid_max_id;
+int intel_pasid_alloc_id(void *ptr, int start, int end, gfp_t gfp);
+void intel_pasid_free_id(int pasid);
+void *intel_pasid_lookup_id(int pasid);
+
+#endif /* __INTEL_PASID_H */
-- 
2.7.4



[PATCH v2 2/9] iommu/vt-d: Decouple idr bond pointer from svm

2018-05-03 Thread Lu Baolu
As we move the PASID idr out of SVM code and make it serving
as a global PASID name space, the consumer can specify a ptr
to bind it with a PASID. We shouldn't assume that each PASID
will be bond with a ptr of struct intel_svm anymore.

This patch cleans up a idr_for_each_entry() usage in the SVM
code. It's required to replace the SVM-specific idr with the
global PASID idr.

Cc: Ashok Raj <ashok@intel.com>
Cc: Jacob Pan <jacob.jun@linux.intel.com>
Cc: Kevin Tian <kevin.t...@intel.com>
Cc: Liu Yi L <yi.l@intel.com>
Signed-off-by: Lu Baolu <baolu...@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.t...@intel.com>
Reviewed-by: Liu Yi L <yi.l@intel.com>
---
 drivers/iommu/intel-svm.c   | 14 ++
 include/linux/intel-iommu.h |  1 +
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index e8cd984..983af0c 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -298,6 +298,7 @@ static const struct mmu_notifier_ops intel_mmuops = {
 };
 
 static DEFINE_MUTEX(pasid_mutex);
+static LIST_HEAD(global_svm_list);
 
 int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct 
svm_dev_ops *ops)
 {
@@ -329,13 +330,13 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
 
mutex_lock(_mutex);
if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
-   int i;
+   struct intel_svm *t;
 
-   idr_for_each_entry(>pasid_idr, svm, i) {
-   if (svm->mm != mm ||
-   (svm->flags & SVM_FLAG_PRIVATE_PASID))
+   list_for_each_entry(t, _svm_list, list) {
+   if (t->mm != mm || (t->flags & SVM_FLAG_PRIVATE_PASID))
continue;
 
+   svm = t;
if (svm->pasid >= pasid_max) {
dev_warn(dev,
 "Limited PASID width. Cannot use 
existing PASID %d\n",
@@ -404,6 +405,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
svm->mm = mm;
svm->flags = flags;
INIT_LIST_HEAD_RCU(>devs);
+   INIT_LIST_HEAD(>list);
ret = -ENOMEM;
if (mm) {
ret = mmu_notifier_register(>notifier, mm);
@@ -430,6 +432,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
 */
if (cap_caching_mode(iommu->cap))
intel_flush_pasid_dev(svm, sdev, svm->pasid);
+
+   list_add_tail(&svm->list, &global_svm_list);
}
	list_add_rcu(&sdev->list, &svm->devs);
 
@@ -485,6 +489,8 @@ int intel_svm_unbind_mm(struct device *dev, int pasid)
if (svm->mm)

mmu_notifier_unregister(&svm->notifier, svm->mm);
 
+   list_del(&svm->list);
+
/* We mandate that no page faults may be outstanding
 * for the PASID when intel_svm_unbind_mm() is called.
 * If that is not obeyed, subtle errors will happen.
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index ef169d6..795717e 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -486,6 +486,7 @@ struct intel_svm {
int flags;
int pasid;
struct list_head devs;
+   struct list_head list;
 };
 
 extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct 
intel_svm_dev *sdev);
-- 
2.7.4



[PATCH v2 6/9] iommu/vt-d: Allocate and free pasid table

2018-05-03 Thread Lu Baolu
This patch allocates a PASID table for a domain at the time
it is created (if any device using this domain supports the
PASID feature), and frees it when the domain is freed. The
sizing arithmetic is sketched below.
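
The table is sized from the smaller of what the PCI endpoint
advertises (pci_max_pasids()) and the global cap. A standalone
illustration of the worst-case arithmetic, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define PASID_MAX 0x20000       /* global cap from intel-pasid.h */

int main(void)
{
        /* struct pasid_entry is a single u64: 8 bytes per entry. */
        size_t entry_size = sizeof(uint64_t);
        size_t table_size = (size_t)PASID_MAX * entry_size;

        /* Prints 1048576 bytes, i.e. 1 MiB per full table. */
        printf("worst-case PASID table: %zu bytes (%zu KiB)\n",
               table_size, table_size >> 10);
        return 0;
}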

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Liu Yi L 
---
 drivers/iommu/intel-iommu.c | 19 +++
 drivers/iommu/intel-svm.c   |  8 
 include/linux/intel-iommu.h | 10 --
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index caa0b5c..f86302d 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -2460,6 +2460,24 @@ static struct dmar_domain 
*dmar_insert_one_dev_info(struct intel_iommu *iommu,
dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);
 
+   if (dev && dev_is_pci(dev) && info->pasid_supported) {
+   if (pasid_enabled(iommu)) {
+   size_t size, count;
+
+   size = sizeof(struct pasid_entry);
+   count = min_t(int,
+ pci_max_pasids(to_pci_dev(dev)),
+ intel_pasid_max_id);
+   ret = intel_pasid_alloc_table(dev, size, count);
+   if (ret) {
+   pr_err("PASID table allocation for %s failed\n",
+  dev_name(dev));
+   dmar_remove_one_dev_info(domain, dev);
+   return NULL;
+   }
+   }
+   }
+
if (dev && domain_context_mapping(domain, dev)) {
pr_err("Domain context map for %s failed\n", dev_name(dev));
dmar_remove_one_dev_info(domain, dev);
@@ -4826,6 +4844,7 @@ static void dmar_remove_one_dev_info(struct dmar_domain 
*domain,
unsigned long flags;
 
	spin_lock_irqsave(&device_domain_lock, flags);
+   intel_pasid_free_table(dev);
info = dev->archdata.iommu;
__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 24d0ea1..3abc94f 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -34,14 +34,6 @@
 
 static irqreturn_t prq_event_thread(int irq, void *d);
 
-struct pasid_entry {
-   u64 val;
-};
-
-struct pasid_state_entry {
-   u64 val;
-};
-
 int intel_svm_alloc_pasid_tables(struct intel_iommu *iommu)
 {
struct page *pages;
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index bee7a3f..08e5811 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -382,8 +382,14 @@ enum {
 #define VTD_FLAG_TRANS_PRE_ENABLED (1 << 0)
 #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1)
 
-struct pasid_entry;
-struct pasid_state_entry;
+struct pasid_entry {
+   u64 val;
+};
+
+struct pasid_state_entry {
+   u64 val;
+};
+
 struct page_req_dsc;
 
 struct dmar_domain {
-- 
2.7.4



[PATCH v2 8/9] iommu/vt-d: Use per-domain pasid table

2018-05-03 Thread Lu Baolu
This patch replaces the current per-IOMMU PASID table with
the newly added per-domain PASID table. Each SVM-capable
PCI device will be configured with a PASID table that it
shares with the other SVM-capable devices in its IOMMU
domain (see the condensed sketch below).
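
Condensed restatement of the resulting tear-down path; the
helper names are the ones introduced earlier in this series,
and example_clear_pasid() is a hypothetical wrapper that
mirrors the intel_mm_release() hunk below rather than adding
new code:

static void example_clear_pasid(struct intel_svm *svm)
{
        struct intel_svm_dev *sdev;
        struct pasid_entry *pasid_table;

        rcu_read_lock();
        list_for_each_entry_rcu(sdev, &svm->devs, list) {
                /* Each device now resolves its own table. */
                pasid_table = intel_pasid_get_table(sdev->dev);
                if (!pasid_table)
                        continue;

                pasid_table[svm->pasid].val = 0;
                wmb();  /* entry update visible before the flush */

                intel_flush_pasid_dev(svm, sdev, svm->pasid);
        }
        rcu_read_unlock();
}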

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Kevin Tian 
Cc: Liu Yi L 
Signed-off-by: Lu Baolu 
Reviewed-by: Liu Yi L 
---
 drivers/iommu/intel-iommu.c |  6 +++---
 drivers/iommu/intel-svm.c   | 37 +
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 5602ccd..fa6052a 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5197,7 +5197,7 @@ int intel_iommu_enable_pasid(struct intel_iommu *iommu, 
struct intel_svm_dev *sd
if (!(ctx_lo & CONTEXT_PASIDE)) {
if (iommu->pasid_state_table)
context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
-   context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
+   context[1].lo = (u64)virt_to_phys(domain->pasid_table) |
intel_iommu_get_pts(domain);
 
wmb();
@@ -5265,8 +5265,8 @@ struct intel_iommu *intel_svm_device_to_iommu(struct 
device *dev)
return NULL;
}
 
-   if (!iommu->pasid_table) {
-   dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
+   if (!intel_pasid_get_table(dev)) {
+   dev_err(dev, "No PASID table for device; cannot enable SVM\n");
return NULL;
}
 
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index 3abc94f..3b14819 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -256,6 +256,7 @@ static void intel_flush_pasid_dev(struct intel_svm *svm, 
struct intel_svm_dev *s
 static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
+   struct pasid_entry *pasid_table;
struct intel_svm_dev *sdev;
 
/* This might end up being called from exit_mmap(), *before* the page
@@ -270,11 +271,16 @@ static void intel_mm_release(struct mmu_notifier *mn, 
struct mm_struct *mm)
 * page) so that we end up taking a fault that the hardware really
 * *has* to handle gracefully without affecting other processes.
 */
-   svm->iommu->pasid_table[svm->pasid].val = 0;
-   wmb();
-
rcu_read_lock();
	list_for_each_entry_rcu(sdev, &svm->devs, list) {
+   pasid_table = intel_pasid_get_table(sdev->dev);
+   if (!pasid_table)
+   continue;
+
+   pasid_table[svm->pasid].val = 0;
+   /* Make sure the entry update is visible before translation. */
+   wmb();
+
intel_flush_pasid_dev(svm, sdev, svm->pasid);
intel_flush_svm_range_dev(svm, sdev, 0, -1, 0, !svm->mm);
}
@@ -295,6 +301,7 @@ static LIST_HEAD(global_svm_list);
 int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct 
svm_dev_ops *ops)
 {
struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
+   struct pasid_entry *pasid_table;
struct intel_svm_dev *sdev;
struct intel_svm *svm = NULL;
struct mm_struct *mm = NULL;
@@ -302,7 +309,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
int pasid_max;
int ret;
 
-   if (WARN_ON(!iommu || !iommu->pasid_table))
+   pasid_table = intel_pasid_get_table(dev);
+   if (WARN_ON(!iommu || !pasid_table))
return -EINVAL;
 
if (dev_is_pci(dev)) {
@@ -380,8 +388,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
}
svm->iommu = iommu;
 
-   if (pasid_max > iommu->pasid_max)
-   pasid_max = iommu->pasid_max;
+   if (pasid_max > intel_pasid_max_id)
+   pasid_max = intel_pasid_max_id;
 
/* Do not use PASID 0 in caching mode (virtualised IOMMU) */
ret = intel_pasid_alloc_id(svm,
@@ -414,7 +422,7 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int 
flags, struct svm_dev_
if (cpu_feature_enabled(X86_FEATURE_LA57))
pasid_entry_val |= PASID_ENTRY_FLPM_5LP;
 
-   iommu->pasid_table[svm->pasid].val = pasid_entry_val;
+   pasid_table[svm->pasid].val = pasid_entry_val;
 
wmb();
 
@@ -442,6 +450,7 @@ EXPORT_SYMBOL_GPL(intel_svm_bind_mm);
 
 int intel_svm_unbind_mm(struct device *dev, int pasid)
 {
+   struct pasid_entry *pasid_table;
struct intel_svm_dev *sdev;
struct intel_iommu *iommu;
struct intel_svm *svm;
@@ -449,7 +458,8 @@ int inte
