Re: [PATCH v2 09/16] driver core: add iommu device fault reporting data

2017-10-05 Thread Greg Kroah-Hartman
On Thu, Oct 05, 2017 at 04:03:37PM -0700, Jacob Pan wrote:
> DMA faults can be detected by the IOMMU at the device level. Adding a
> pointer to struct device allows the IOMMU subsystem to report relevant
> faults back to the device driver for further handling.
> For directly assigned devices (or user space drivers), the guest OS
> holds the responsibility to handle and respond to per-device IOMMU
> faults. Therefore we need a fault reporting mechanism to propagate
> faults beyond the IOMMU subsystem.
> 
> Signed-off-by: Jacob Pan 

Acked-by: Greg Kroah-Hartman 


[PATCH] iommu/vt-d: only attempt to cleanup svm page request irq if one assigned

2017-10-05 Thread Jerry Snitselaar
Only try to clean up the svm page request irq if one has
been assigned. Also clear pr_irq in the error path if irq request
fails.

Signed-off-by: Jerry Snitselaar 
---
 drivers/iommu/intel-svm.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index f6697e55c2d4..003b4a4d4b78 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -129,6 +129,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
pr_err("IOMMU: %s: Failed to request IRQ for page request 
queue\n",
   iommu->name);
dmar_free_hwirq(irq);
+   iommu->pr_irq = 0;
goto err;
}
dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL);
@@ -144,9 +145,11 @@ int intel_svm_finish_prq(struct intel_iommu *iommu)
dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL);
dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL);
 
-   free_irq(iommu->pr_irq, iommu);
-   dmar_free_hwirq(iommu->pr_irq);
-   iommu->pr_irq = 0;
+   if (iommu->pr_irq) {
+   free_irq(iommu->pr_irq, iommu);
+   dmar_free_hwirq(iommu->pr_irq);
+   iommu->pr_irq = 0;
+   }
 
free_pages((unsigned long)iommu->prq, PRQ_ORDER);
iommu->prq = NULL;
-- 
2.13.0.rc0.45.ge2cb6ab84



[PATCH v2 08/16] iommu: introduce device fault data

2017-10-05 Thread Jacob Pan
Device faults detected by the IOMMU can be reported outside the IOMMU
subsystem. This patch intends to provide generic device fault data
such that device drivers can communicate IOMMU faults without model
specific knowledge.

The assumption is that the model specific IOMMU driver can filter and
handle most of the IOMMU faults if the cause is within IOMMU driver
control. Therefore, the fault reasons that can be reported are grouped
and generalized based on common specifications such as PCI ATS.
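
A sketch of the intended producer side (illustrative only, not part of
this patch): a model specific IOMMU driver fills in the generic event
and hands it to the reporting API added later in this series (patch 10).
The variables addr, pasid and group are assumed inputs from a
hypothetical page request:

    struct iommu_fault_event event;

    /* a recoverable page request fault; field values are illustrative */
    event.type = IOMMU_FAULT_PAGE_REQ;
    event.paddr = addr;              /* offending page address */
    event.pasid = pasid;
    event.pasid_valid = 1;
    event.page_req_group_id = group; /* PRG index from the device */
    event.last_req = 1;              /* last request in the group */
    event.prot = IOMMU_READ;         /* access type that faulted */
    iommu_report_device_fault(dev, &event);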

Signed-off-by: Jacob Pan 
---
 include/linux/iommu.h | 69 +++
 1 file changed, 69 insertions(+)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 4af1820..3f9b367 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -49,6 +49,7 @@ struct bus_type;
 struct device;
 struct iommu_domain;
 struct notifier_block;
+struct iommu_fault_event;
 
 /* iommu fault flags */
 #define IOMMU_FAULT_READ   0x0
@@ -56,6 +57,7 @@ struct notifier_block;
 
 typedef int (*iommu_fault_handler_t)(struct iommu_domain *,
struct device *, unsigned long, int, void *);
typedef int (*iommu_dev_fault_handler_t)(struct device *, struct iommu_fault_event *);
 
 struct iommu_domain_geometry {
dma_addr_t aperture_start; /* First address that can be mapped*/
@@ -264,6 +266,60 @@ struct iommu_device {
struct device *dev;
 };
 
+enum iommu_model {
+   IOMMU_MODEL_INTEL = 1,
+   IOMMU_MODEL_AMD,
+   IOMMU_MODEL_SMMU3,
+};
+
+/* Generic fault types, can be expanded to cover e.g. IRQ remapping faults */
+enum iommu_fault_type {
+   IOMMU_FAULT_DMA_UNRECOV = 1,/* unrecoverable fault */
+   IOMMU_FAULT_PAGE_REQ,   /* page request fault */
+};
+
+enum iommu_fault_reason {
+   IOMMU_FAULT_REASON_CTX = 1,
+   IOMMU_FAULT_REASON_ACCESS,
+   IOMMU_FAULT_REASON_INVALIDATE,
+   IOMMU_FAULT_REASON_UNKNOWN,
+};
+
+/**
+ * struct iommu_fault_event - Generic per device fault data
+ *
+ * - PCI and non-PCI devices
+ * - Recoverable faults (e.g. page request), information based on PCI ATS
+ * and PASID spec.
+ * - Un-recoverable faults of device interest
+ * - DMA remapping and IRQ remapping faults
+ *
+ * @type: fault type
+ * @reason: fault reason, if relevant outside the IOMMU driver; IOMMU driver
+ *	internal faults are not reported
+ * @paddr: the offending page address
+ * @pasid: process address space ID, used in shared virtual memory (SVM)
+ * @rid: requestor ID
+ * @page_req_group_id: page request group index
+ * @last_req: last request in a page request group
+ * @pasid_valid: indicates if the PRQ has a valid PASID
+ * @prot: page access protection flags, e.g. IOMMU_READ, IOMMU_WRITE
+ * @private_data: uniquely identifies device-specific private data for an
+ *	individual page request
+ */
+struct iommu_fault_event {
+   enum iommu_fault_type type;
+   enum iommu_fault_reason reason;
+   u64 paddr;
+   u32 pasid;
+   u32 rid:16;
+   u32 page_req_group_id : 9;
+   u32 last_req : 1;
+   u32 pasid_valid : 1;
+   u32 prot;
+   u32 private_data;
+};
+
 int  iommu_device_register(struct iommu_device *iommu);
 void iommu_device_unregister(struct iommu_device *iommu);
 int  iommu_device_sysfs_add(struct iommu_device *iommu,
@@ -425,6 +481,18 @@ struct iommu_fwspec {
u32 ids[1];
 };
 
+/**
+ * struct iommu_fault_param - per-device IOMMU runtime data
+ * @dev_fault_handler: Callback function to handle IOMMU faults at device level
+ * @pasid_tbl_bound: Device PASID table is bound to a guest
+ */
+struct iommu_fault_param {
+   iommu_dev_fault_handler_t dev_fault_handler;
+   bool pasid_tbl_bound:1;
+   bool pasid_tbl_shadowed:1;
+};
+
 int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode,
  const struct iommu_ops *ops);
 void iommu_fwspec_free(struct device *dev);
@@ -437,6 +505,7 @@ struct iommu_ops {};
 struct iommu_group {};
 struct iommu_fwspec {};
 struct iommu_device {};
+struct iommu_fault_param {};
 
 static inline bool iommu_present(struct bus_type *bus)
 {
-- 
2.7.4



[PATCH v2 05/16] iommu/vt-d: add iommu invalidate function

2017-10-05 Thread Jacob Pan
This patch adds an Intel VT-d specific function to implement the
generic IOMMU passdown invalidate API.

The use case is to support caching structure invalidation
of assigned SVM capable devices. The emulated IOMMU exposes queued
invalidation capability and passes down all descriptors from the guest
to the physical IOMMU.

The assumption is that the guest to host device ID mapping is
resolved prior to calling the IOMMU driver. Based on the device handle,
the host IOMMU driver can replace certain fields before submitting to
the invalidation queue.
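
As a worked example of the granularity lookup this patch adds (the
indices follow the 3D table comment in the hunk below; this snippet is
illustrative, not part of the patch):

    u64 granu;
    int ret;

    /*
     * IOTLB type, request with PASID, page selective:
     * inv_type_granu_table[IOMMU_INV_TYPE_TLB][1][IOMMU_INV_GRANU_PAGE_PASID]
     * i.e. [1][1][8], which yields QI_GRAN_PSI_PASID.
     */
    ret = to_vtd_granularity(IOMMU_INV_TYPE_TLB, IOMMU_INV_GRANU_PAGE_PASID,
                             1 /* with PASID */, &granu);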

Signed-off-by: Liu, Yi L 
Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c | 148 
 include/linux/intel-iommu.h |  10 +++
 2 files changed, 158 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 6832f73..81e27eb 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5023,6 +5023,153 @@ static void intel_iommu_detach_device(struct iommu_domain *domain,
dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
 }
 
+/*
+ * 3D array for converting IOMMU generic type-granularity to VT-d granularity
+ * X indexed by enum iommu_inv_type
+ * Y indicates request without and with PASID
+ * Z indexed by enum iommu_inv_granularity
+ *
+ * For example, to find the VT-d granularity encoding for IOTLB
+ * type, a DMA request with PASID, and page selective, the lookup indices are:
+ * [1][1][8], where
+ * 1: IOMMU_INV_TYPE_TLB
+ * 1: with PASID
+ * 8: IOMMU_INV_GRANU_PAGE_PASID
+ */
+static const u64 inv_type_granu_table[IOMMU_INV_NR_TYPE][2][IOMMU_INV_NR_GRANU] = {
+   /* extended dev IOTLBs, only global is valid */
+   {
+   {1}
+   },
+   /* IOTLB and EIOTLB */
+   {
+   {DMA_TLB_GLOBAL_FLUSH, DMA_TLB_DSI_FLUSH, 0, DMA_TLB_PSI_FLUSH},
+	{0, 0, 0, 0, QI_GRAN_ALL_ALL, 0, QI_GRAN_NONG_ALL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID}
+   },
+   /* PASID cache */
+   {
+   {0, 0, 0, 0, 1}
+   },
+   /* context cache */
+   {
+	{DMA_CCMD_GLOBAL_INVL, DMA_CCMD_DOMAIN_INVL, DMA_CCMD_DEVICE_INVL}
+   }
+};
+
+static inline int to_vtd_granularity(int type, int granu, int with_pasid, u64 *vtd_granu)
+{
+	if (type >= IOMMU_INV_NR_TYPE || granu >= IOMMU_INV_NR_GRANU || with_pasid > 1)
+   return -EINVAL;
+   *vtd_granu = inv_type_granu_table[type][with_pasid][granu];
+
+   return 0;
+}
+
+static int intel_iommu_invalidate(struct iommu_domain *domain,
+   struct device *dev, struct tlb_invalidate_info *inv_info)
+{
+   struct intel_iommu *iommu;
+   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+   struct device_domain_info *info;
+   struct pci_dev *pdev;
+   u16 did, sid, pfsid;
+   u8 bus, devfn;
+   int ret = 0;
+   u64 granu;
+   unsigned long flags;
+
+   if (!inv_info || !dmar_domain)
+   return -EINVAL;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+
+   if (!dev || !dev_is_pci(dev))
+   return -ENODEV;
+
+   did = dmar_domain->iommu_did[iommu->seq_id];
+   sid = PCI_DEVID(bus, devfn);
+   ret = to_vtd_granularity(inv_info->hdr.type, inv_info->granularity,
+			!!(inv_info->flags & IOMMU_INVALIDATE_DMA_PASID), &granu);
+   if (ret) {
+   pr_err("Invalid range type %d, granu %d\n", inv_info->hdr.type,
+   inv_info->granularity);
+   return ret;
+   }
+
+   spin_lock(&iommu->lock);
+   spin_lock_irqsave(&device_domain_lock, flags);
+
+   switch (inv_info->hdr.type) {
+   case IOMMU_INV_TYPE_CONTEXT:
+   iommu->flush.flush_context(iommu, did, sid,
+   DMA_CCMD_MASK_NOBIT, granu);
+   break;
+   case IOMMU_INV_TYPE_TLB:
+   /* We need to deal with two scenarios:
+* - IOTLB for request w/o PASID
+* - extended IOTLB for request with PASID.
+*/
+   if (inv_info->size &&
+	    (inv_info->addr & ((1 << (VTD_PAGE_SHIFT + inv_info->size)) - 1))) {
+		pr_err("Addr out of range, addr 0x%llx, size order %d\n",
+			inv_info->addr, inv_info->size);
+   ret = -ERANGE;
+   goto out_unlock;
+   }
+
+   if (inv_info->flags & IOMMU_INVALIDATE_DMA_PASID)
+			qi_flush_eiotlb(iommu, did, mm_to_dma_pfn(inv_info->addr),
+					inv_info->pasid, inv_info->size, granu,
+					inv_info->flags & IOMMU_INVALIDATE_GLOBAL_PAGE);
+   else
+   qi_flush_iotlb(io

[PATCH v2 15/16] iommu: introduce page response function

2017-10-05 Thread Jacob Pan
When nested translation is turned on and the guest owns the
first level page tables, device page requests can be forwarded
to the guest for fault handling. When the page response is returned
by the guest, the IOMMU driver on the host needs to process the
response, which informs the device and completes the page request
transaction.

This patch introduces a generic API function for page response
passing from the guest or other in-kernel users. The definitions of
the generic data are based on the PCI ATS specification and are not
limited to any vendor.
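
As a rough usage sketch (not part of this patch; the local variables
and the source of evt are assumptions), a consumer such as VFIO that
received a page request fault event could complete it once the guest
has resolved the fault:

    struct page_response_msg msg = {
        .type = IOMMU_PAGE_GROUP_RESP,
        .pasid = evt->pasid,                        /* from the fault event */
        .pasid_present = evt->pasid_valid,
        .page_req_group_id = evt->page_req_group_id,
        .last_req = 1,
        .resp_code = IOMMU_PAGE_RESP_SUCCESS,
    };

    ret = iommu_page_response(domain, dev, &msg);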

Signed-off-by: Jacob Pan 
---
 drivers/iommu/iommu.c | 14 ++
 include/linux/iommu.h | 42 ++
 2 files changed, 56 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 0b058e2..b1c9a0e 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1409,6 +1409,20 @@ int iommu_invalidate(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_invalidate);
 
+int iommu_page_response(struct iommu_domain *domain, struct device *dev,
+   struct page_response_msg *msg)
+{
+   int ret = 0;
+
+   if (unlikely(!domain->ops->page_response))
+   return -ENODEV;
+
+   ret = domain->ops->page_response(domain, dev, msg);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_page_response);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index a675775..c289760 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -161,6 +161,43 @@ struct iommu_resv_region {
 
 #ifdef CONFIG_IOMMU_API
 
+enum page_response_type {
+   IOMMU_PAGE_STREAM_RESP = 1,
+   IOMMU_PAGE_GROUP_RESP,
+};
+
+/**
+ * struct page_response_msg - Generic page response information, based on
+ *	the PCI ATS and PASID specs
+ * @paddr: servicing page address
+ * @pasid: process address space ID, used in shared virtual memory (SVM)
+ * @rid: requestor ID
+ * @did: destination device ID
+ * @last_req: last request in a page request group
+ * @resp_code: response code
+ * @pasid_present: indicates the PASID field is valid
+ * @page_req_group_id: page request group index
+ * @prot: page access protection flags, e.g. IOMMU_READ, IOMMU_WRITE
+ * @type: group or stream response
+ * @private_data: uniquely identifies device-specific private data for an
+ *	individual page response
+ */
+struct page_response_msg {
+   u64 paddr;
+   u32 pasid;
+   u32 rid:16;
+   u32 did:16;
+   u32 resp_code:4;
+   u32 last_req:1;
+   u32 pasid_present:1;
+#define IOMMU_PAGE_RESP_SUCCESS	0
+#define IOMMU_PAGE_RESP_INVALID	1
+#define IOMMU_PAGE_RESP_FAILURE	0xF
+   u32 page_req_group_id : 9;
+   u32 prot;
+   enum page_response_type type;
+   u32 private_data;
+};
+
 /**
  * struct iommu_ops - iommu ops and capabilities
  * @capable: check capability
@@ -194,6 +231,7 @@ struct iommu_resv_region {
  * @bind_pasid_table: bind pasid table pointer for guest SVM
  * @unbind_pasid_table: unbind pasid table pointer and restore defaults
  * @invalidate: invalidate translation caches
+ * @page_response: handle page request response
  */
 struct iommu_ops {
bool (*capable)(enum iommu_cap);
@@ -249,6 +287,8 @@ struct iommu_ops {
struct device *dev);
int (*invalidate)(struct iommu_domain *domain,
struct device *dev, struct tlb_invalidate_info *inv_info);
+   int (*page_response)(struct iommu_domain *domain, struct device *dev,
+   struct page_response_msg *msg);
 
unsigned long pgsize_bitmap;
 };
@@ -424,6 +464,8 @@ extern int iommu_unregister_device_fault_handler(struct device *dev);
 
 extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt);
 
+extern int iommu_page_response(struct iommu_domain *domain, struct device *dev,
+   struct page_response_msg *msg);
 extern int iommu_group_id(struct iommu_group *group);
 extern struct iommu_group *iommu_group_get_for_dev(struct device *dev);
 extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *);
-- 
2.7.4



[PATCH v2 16/16] iommu/vt-d: add intel iommu page response function

2017-10-05 Thread Jacob Pan
This patch adds page response support for Intel VT-d.
The generic response data is taken from the IOMMU API and parsed
into the VT-d specific response descriptor format.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-iommu.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index ede0f2e..61ad26b 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5165,6 +5165,35 @@ static int intel_iommu_invalidate(struct iommu_domain *domain,
return ret;
 }
 
+int intel_iommu_page_response(struct iommu_domain *domain, struct device *dev,
+   struct page_response_msg *msg)
+{
+   struct qi_desc resp;
+   struct intel_iommu *iommu = dev_to_intel_iommu(dev);
+
+   /* TODO: sanitize response message */
+   if (msg->last_req) {
+   /* Page Group Response */
+   resp.low = QI_PGRP_PASID(msg->pasid) |
+   QI_PGRP_DID(msg->did) |
+   QI_PGRP_PASID_P(msg->pasid_present) |
+   QI_PGRP_RESP_TYPE;
+   /* REVISIT: allow private data passing from device prq */
+   resp.high = QI_PGRP_IDX(msg->page_req_group_id) |
+			QI_PGRP_PRIV(msg->private_data) | QI_PGRP_RESP_CODE(msg->resp_code);
+   } else {
+   /* Page Stream Response */
+   resp.low = QI_PSTRM_IDX(msg->page_req_group_id) |
+			QI_PSTRM_PRIV(msg->private_data) | QI_PSTRM_BUS(PCI_BUS_NUM(msg->did)) |
+			QI_PSTRM_PASID(msg->pasid) | QI_PSTRM_RESP_TYPE;
+		resp.high = QI_PSTRM_ADDR(msg->paddr) | QI_PSTRM_DEVFN(msg->did & 0xff) |
+			QI_PSTRM_RESP_CODE(msg->resp_code);
+   }
+   qi_submit_sync(&resp, iommu);
+
+   return 0;
+}
+
 static int intel_iommu_map(struct iommu_domain *domain,
   unsigned long iova, phys_addr_t hpa,
   size_t size, int iommu_prot)
@@ -5594,6 +5623,7 @@ const struct iommu_ops intel_iommu_ops = {
.bind_pasid_table   = intel_iommu_bind_pasid_table,
.unbind_pasid_table = intel_iommu_unbind_pasid_table,
.invalidate = intel_iommu_invalidate,
+   .page_response  = intel_iommu_page_response,
 #endif
.map= intel_iommu_map,
.unmap  = intel_iommu_unmap,
-- 
2.7.4



[PATCH v2 09/16] driver core: add iommu device fault reporting data

2017-10-05 Thread Jacob Pan
DMA faults can be detected by the IOMMU at the device level. Adding a
pointer to struct device allows the IOMMU subsystem to report relevant
faults back to the device driver for further handling.
For directly assigned devices (or user space drivers), the guest OS
holds the responsibility to handle and respond to per-device IOMMU
faults. Therefore we need a fault reporting mechanism to propagate
faults beyond the IOMMU subsystem.

Signed-off-by: Jacob Pan 
---
 include/linux/device.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/device.h b/include/linux/device.h
index 1d26079..4e3d543 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -42,6 +42,7 @@ struct fwnode_handle;
 struct iommu_ops;
 struct iommu_group;
 struct iommu_fwspec;
+struct iommu_fault_param;
 
 struct bus_attribute {
struct attributeattr;
@@ -873,6 +874,7 @@ struct dev_links_info {
  * device (i.e. the bus driver that discovered the device).
  * @iommu_group: IOMMU group the device belongs to.
  * @iommu_fwspec: IOMMU-specific properties supplied by firmware.
+ * @iommu_fault_param: Per device generic IOMMU runtime parameters
  *
  * @offline_disabled: If set, the device is permanently online.
  * @offline:   Set after successful invocation of bus type's .offline().
@@ -962,6 +964,7 @@ struct device {
void(*release)(struct device *dev);
struct iommu_group  *iommu_group;
struct iommu_fwspec *iommu_fwspec;
+   struct iommu_fault_param*iommu_fault_param;
 
booloffline_disabled:1;
booloffline:1;
-- 
2.7.4



[PATCH v2 13/16] iommu/intel-svm: notify page request to guest

2017-10-05 Thread Jacob Pan
If the source device of a page request has its PASID table pointer
bound to a guest, the first level page tables are owned by the guest.
In this case, we shall let the guest OS manage the page fault.

This patch uses the IOMMU fault notification API to send notifications,
possibly via VFIO, to the guest OS. Once the guest pages are faulted in,
the guest will issue a page response which will be passed down via the
invalidation passdown APIs.

Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
---
 drivers/iommu/intel-svm.c | 87 +++
 include/linux/iommu.h |  1 +
 2 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index f6697e5..ea7c455 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -555,6 +555,78 @@ static bool is_canonical_address(u64 addr)
return (((saddr << shift) >> shift) == saddr);
 }
 
+static int prq_to_iommu_prot(struct page_req_dsc *req)
+{
+   int prot = 0;
+
+   if (req->rd_req)
+   prot |= IOMMU_READ;
+   if (req->wr_req)
+   prot |= IOMMU_WRITE;
+   if (req->exe_req)
+   prot |= IOMMU_EXEC;
+   if (req->priv_req)
+   prot |= IOMMU_PRIV;
+
+   return prot;
+}
+
+static int intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc)
+{
+   int ret = 0;
+   struct iommu_fault_event event;
+   struct pci_dev *pdev;
+
+   /**
+* If caller does not provide struct device, this is the case where
+* guest PASID table is bound to the device. So we need to retrieve
+* struct device from the page request descriptor then proceed.
+*/
+   if (!dev) {
+   pdev = pci_get_bus_and_slot(desc->bus, desc->devfn);
+   if (!pdev) {
+   pr_err("No PCI device found for PRQ [%02x:%02x.%d]\n",
+   desc->bus, PCI_SLOT(desc->devfn),
+   PCI_FUNC(desc->devfn));
+   return -ENODEV;
+   }
+   dev = &pdev->dev;
+   } else if (dev_is_pci(dev)) {
+   pdev = to_pci_dev(dev);
+   pci_dev_get(pdev);
+   } else
+   return -ENODEV;
+
+   pr_debug("Notify PRQ device [%02x:%02x.%d]\n",
+   desc->bus, PCI_SLOT(desc->devfn),
+   PCI_FUNC(desc->devfn));
+
+   /**
+* Make sure PASID table pointer is bound to guest, if yes notify
+* handler in the guest, e.g. via VFIO.
+*/
+   if (!dev->iommu_fault_param->pasid_tbl_bound) {
+   pr_debug("PRQ device pasid table not bound.\n");
+   ret = -EINVAL;
+   goto exit_put_dev;
+   }
+   /* Fill in event data for device specific processing */
+   event.type = IOMMU_FAULT_PAGE_REQ;
+   event.paddr = desc->addr;
+   event.pasid = desc->pasid;
+   event.page_req_group_id = desc->prg_index;
+   event.prot = prq_to_iommu_prot(desc);
+   event.last_req = desc->lpig;
+   event.pasid_valid = 1;
+   event.private_data = desc->private;
+   ret = iommu_report_device_fault(&pdev->dev, &event);
+
+exit_put_dev:
+   pci_dev_put(pdev);
+
+   return ret;
+}
+
 static irqreturn_t prq_event_thread(int irq, void *d)
 {
struct intel_iommu *iommu = d;
@@ -578,7 +650,12 @@ static irqreturn_t prq_event_thread(int irq, void *d)
handled = 1;
 
req = &iommu->prq[head / sizeof(*req)];
-
+   /**
+* If prq is to be handled outside iommu driver via receiver of
+* the fault notifiers, we skip the page response here.
+*/
+   if (!intel_svm_prq_report(NULL, req))
+   goto prq_advance;
result = QI_RESP_FAILURE;
address = (u64)req->addr << VTD_PAGE_SHIFT;
if (!req->pasid_present) {
@@ -649,11 +726,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
if (WARN_ON(&sdev->list == &svm->devs))
sdev = NULL;
 
-   if (sdev && sdev->ops && sdev->ops->fault_cb) {
-   int rwxp = (req->rd_req << 3) | (req->wr_req << 2) |
-   (req->exe_req << 1) | (req->priv_req);
-		sdev->ops->fault_cb(sdev->dev, req->pasid, req->addr, req->private, rwxp, result);
-   }
+   intel_svm_prq_report(sdev->dev, req);
/* We get here in the error case where the PASID lookup failed,
   and these can be NULL. Do not use them below this point! */
sdev = NULL;
@@ -679,7 +752,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
 
qi_submit_sync(&resp, iommu);
}
-
+   prq_advance:
head = (head + sizeof(*req)) & PRQ_RING_MASK;
}
 
diff --git a

[PATCH v2 11/16] iommu/vt-d: use threaded irq for dmar_fault

2017-10-05 Thread Jacob Pan
Currently, the dmar fault IRQ handler does nothing more than a rate
limited printk; no critical hardware handling needs to be done in IRQ
context.
Converting it to a threaded IRQ allows fault processing that
requires process context, e.g. finding the offending device based
on the source ID in the fault reasons.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/dmar.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 2fbff8b..ae33d61 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1748,7 +1748,8 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
return -EINVAL;
}
 
-   ret = request_irq(irq, dmar_fault, IRQF_NO_THREAD, iommu->name, iommu);
+   ret = request_threaded_irq(irq, NULL, dmar_fault,
+   IRQF_ONESHOT, iommu->name, iommu);
if (ret)
pr_err("Can't request irq\n");
return ret;
-- 
2.7.4



[PATCH v2 14/16] iommu/intel-svm: replace dev ops with fault report API

2017-10-05 Thread Jacob Pan
With the introduction of the generic IOMMU device fault reporting API,
we can replace the private fault callback functions with the standard
function and event data.
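
For reference, a sketch of the effect on a hypothetical caller
(my_svm_ops and my_fault_handler are illustrative names, not from this
patch):

    /* before: private per-device callbacks passed at bind time */
    ret = intel_svm_bind_mm(dev, &pasid, 0, &my_svm_ops);

    /* after: bind without ops; faults arrive via the generic handler */
    ret = intel_svm_bind_mm(dev, &pasid, 0);
    ret = iommu_register_device_fault_handler(dev, my_fault_handler);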

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-svm.c |  7 +--
 include/linux/intel-svm.h | 20 +++-
 2 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index ea7c455..ea37d3e 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -283,7 +283,7 @@ static const struct mmu_notifier_ops intel_mmuops = {
 
 static DEFINE_MUTEX(pasid_mutex);
 
-int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
+int intel_svm_bind_mm(struct device *dev, int *pasid, int flags)
 {
struct intel_iommu *iommu = intel_svm_device_to_iommu(dev);
struct intel_svm_dev *sdev;
@@ -329,10 +329,6 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 
list_for_each_entry(sdev, &svm->devs, list) {
if (dev == sdev->dev) {
-   if (sdev->ops != ops) {
-   ret = -EBUSY;
-   goto out;
-   }
sdev->users++;
goto success;
}
@@ -358,7 +354,6 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
}
/* Finish the setup now we know we're keeping it */
sdev->users = 1;
-   sdev->ops = ops;
init_rcu_head(&sdev->rcu);
 
if (!svm) {
diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 99bc5b3..a39a502 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -18,18 +18,6 @@
 
 struct device;
 
-struct svm_dev_ops {
-   void (*fault_cb)(struct device *dev, int pasid, u64 address,
-u32 private, int rwxp, int response);
-};
-
-/* Values for rxwp in fault_cb callback */
-#define SVM_REQ_READ   (1<<3)
-#define SVM_REQ_WRITE  (1<<2)
-#define SVM_REQ_EXEC   (1<<1)
-#define SVM_REQ_PRIV   (1<<0)
-
-
 /*
  * The SVM_FLAG_PRIVATE_PASID flag requests a PASID which is *not* the "main"
  * PASID for the current process. Even if a PASID already exists, a new one
@@ -60,7 +48,6 @@ struct svm_dev_ops {
  * @dev:   Device to be granted access
  * @pasid: Address for allocated PASID
  * @flags: Flags. Later for requesting supervisor mode, etc.
- * @ops:   Callbacks to device driver
  *
  * This function attempts to enable PASID support for the given device.
  * If the @pasid argument is non-%NULL, a PASID is allocated for access
@@ -82,8 +69,7 @@ struct svm_dev_ops {
  * Multiple calls from the same process may result in the same PASID
  * being re-used. A reference count is kept.
  */
-extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags,
-struct svm_dev_ops *ops);
+extern int intel_svm_bind_mm(struct device *dev, int *pasid, int flags);
 
 /**
  * intel_svm_unbind_mm() - Unbind a specified PASID
@@ -120,7 +106,7 @@ extern int intel_svm_is_pasid_valid(struct device *dev, int pasid);
 #else /* CONFIG_INTEL_IOMMU_SVM */
 
 static inline int intel_svm_bind_mm(struct device *dev, int *pasid,
-   int flags, struct svm_dev_ops *ops)
+   int flags)
 {
return -ENOSYS;
 }
@@ -136,6 +122,6 @@ static int intel_svm_is_pasid_valid(struct device *dev, int pasid)
 }
 #endif /* CONFIG_INTEL_IOMMU_SVM */
 
-#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0, NULL))
+#define intel_svm_available(dev) (!intel_svm_bind_mm((dev), NULL, 0))
 
 #endif /* __INTEL_SVM_H__ */
-- 
2.7.4



[PATCH v2 07/16] iommu/vt-d: assign PFSID in device TLB invalidation

2017-10-05 Thread Jacob Pan
When an SRIOV VF device's IOTLB is invalidated, we need to provide
the PF source ID such that the IOMMU hardware can gauge the depth
of the invalidation queue which is shared among VFs. This is needed
when the device invalidation throttle (DIT) capability is supported.
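
In other words, the invalidation descriptor for a VF carries the PF's
source ID. A minimal sketch of the derivation (mirroring the hunk
below; pdev is the VF's pci_dev):

    if (pdev->is_virtfn)
        /* VF: use the physical function's bus/devfn as PFSID */
        pfsid = PCI_DEVID(pdev->physfn->bus->number,
                          pdev->physfn->devfn);
    else
        /* PF or non-SRIOV device: PFSID is its own source ID */
        pfsid = PCI_DEVID(info->bus, info->devfn);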

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-iommu.c | 13 +
 include/linux/intel-iommu.h |  3 +++
 2 files changed, 16 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index e5a5209..ede0f2e 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1484,6 +1484,19 @@ static void iommu_enable_dev_iotlb(struct device_domain_info *info)
return;
 
pdev = to_pci_dev(info->dev);
+   /* For IOMMU that supports device IOTLB throttling (DIT), we assign
+* PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
+* queue depth at PF level. If DIT is not set, PFSID will be treated as
+* reserved, which should be set to 0.
+*/
+   if (!ecap_dit(info->iommu->ecap))
+   info->pfsid = 0;
+   else if (pdev && pdev->is_virtfn) {
+   if (ecap_dit(info->iommu->ecap))
+			dev_warn(&pdev->dev, "SRIOV VF device IOTLB enabled without flow control\n");
+		info->pfsid = PCI_DEVID(pdev->physfn->bus->number, pdev->physfn->devfn);
+   } else
+   info->pfsid = PCI_DEVID(info->bus, info->devfn);
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
/* The PCIe spec, in its wisdom, declares that the behaviour of
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index f42b46c..c8ac5c6 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -112,6 +112,7 @@
  * Extended Capability Register
  */
 
+#define ecap_dit(e)	((e >> 41) & 0x1)
 #define ecap_pasid(e)	((e >> 40) & 0x1)
 #define ecap_pss(e)	((e >> 35) & 0x1f)
 #define ecap_eafs(e)	((e >> 34) & 0x1)
@@ -285,6 +286,7 @@ enum {
#define QI_DEV_IOTLB_SID(sid)	((u64)((sid) & 0xffff) << 32)
 #define QI_DEV_IOTLB_QDEP(qdep)(((qdep) & 0x1f) << 16)
 #define QI_DEV_IOTLB_ADDR(addr)((u64)(addr) & VTD_PAGE_MASK)
+#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
 #define QI_DEV_IOTLB_SIZE  1
 #define QI_DEV_IOTLB_MAX_INVS  32
 
@@ -442,6 +444,7 @@ struct device_domain_info {
struct list_head global; /* link to global list */
u8 bus; /* PCI bus number */
u8 devfn;   /* PCI devfn number */
+   u16 pfsid;  /* SRIOV physical function source ID */
u8 pasid_supported:3;
u8 pasid_enabled:1;
u8 pri_supported:1;
-- 
2.7.4



[PATCH v2 10/16] iommu: introduce device fault report API

2017-10-05 Thread Jacob Pan
Traditionally, device specific faults are detected and handled within
their own device drivers. When an IOMMU is enabled, faults such as DMA
related transactions are detected by the IOMMU. There is no generic
reporting mechanism to report faults back to the in-kernel device
driver or the guest OS in case of assigned devices.

Faults detected by the IOMMU are based on the transaction's source ID,
which can be reported on a per-device basis, regardless of whether the
device is a PCI device or not.

The fault types include recoverable (e.g. page request) and
unrecoverable faults (e.g. access error). In most cases, faults can be
handled by IOMMU drivers internally. The primary use cases are as
follows:
1. A page request fault originates from an SVM capable device that is
assigned to a guest via vIOMMU. In this case, the first level page tables
are owned by the guest. The page request must be propagated to the guest
to let the guest OS fault in the pages, then send a page response. In
this mechanism, the direct receiver of the IOMMU fault notification is
VFIO, which can relay notification events to QEMU or other user space
software.

2. Faults need more subtle handling by device drivers. Rather than
simply invoking a reset function, there is a need to let the device
driver handle the fault with a smaller impact.

This patchset is intended to create a generic fault report API such
that it can scale as follows:
- all IOMMU types
- PCI and non-PCI devices
- recoverable and unrecoverable faults
- VFIO and other in-kernel users
- DMA & IRQ remapping (TBD)
The original idea was brought up by David Woodhouse and discussions
summarized at https://lwn.net/Articles/608914/.
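
A minimal consumer sketch, assuming a hypothetical driver
(my_fault_handler is an illustrative name; the handler type and the
registration calls are the ones added by this patch):

    static int my_fault_handler(struct device *dev,
                                struct iommu_fault_event *evt)
    {
        if (evt->type == IOMMU_FAULT_PAGE_REQ) {
            /* e.g. relay to the guest via VFIO, respond later */
            return 0;
        }
        dev_err(dev, "unrecoverable IOMMU fault at 0x%llx\n", evt->paddr);
        return -EFAULT;
    }

    /* at probe time */
    ret = iommu_register_device_fault_handler(dev, my_fault_handler);

    /* at teardown */
    iommu_unregister_device_fault_handler(dev);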

Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
---
 drivers/iommu/iommu.c | 56 ++-
 include/linux/iommu.h | 23 +
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5a14154..0b058e2 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -554,9 +554,15 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
 
device->dev = dev;
 
+	dev->iommu_fault_param = kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+   if (!dev->iommu_fault_param) {
+   ret = -ENOMEM;
+   goto err_free_device;
+   }
+
ret = sysfs_create_link(&dev->kobj, &group->kobj, "iommu_group");
if (ret)
-   goto err_free_device;
+   goto err_free_device_iommu_fault_param;
 
device->name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj));
 rename:
@@ -615,6 +621,8 @@ int iommu_group_add_device(struct iommu_group *group, struct device *dev)
kfree(device->name);
 err_remove_link:
sysfs_remove_link(&dev->kobj, "iommu_group");
+err_free_device_iommu_fault_param:
+   kfree(dev->iommu_fault_param);
 err_free_device:
kfree(device);
pr_err("Failed to add device %s to group %d: %d\n", dev_name(dev), 
group->id, ret);
@@ -791,6 +799,52 @@ int iommu_group_unregister_notifier(struct iommu_group *group,
 }
 EXPORT_SYMBOL_GPL(iommu_group_unregister_notifier);
 
+int iommu_register_device_fault_handler(struct device *dev,
+   iommu_dev_fault_handler_t handler)
+{
+   if (dev->iommu_fault_param)
+   return -EBUSY;
+   get_device(dev);
+   dev->iommu_fault_param =
+   kzalloc(sizeof(struct iommu_fault_param), GFP_KERNEL);
+   if (!dev->iommu_fault_param)
+   return -ENOMEM;
+   dev->iommu_fault_param->dev_fault_handler = handler;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler);
+
+int iommu_unregister_device_fault_handler(struct device *dev)
+{
+   if (!dev->iommu_fault_param)
+   return -EINVAL;
+
+   kfree(dev->iommu_fault_param);
+   dev->iommu_fault_param = NULL;
+   put_device(dev);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler);
+
+
+int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt)
+{
+   /* we only report device fault if there is a handler registered */
+   if (!dev->iommu_fault_param ||
+   !dev->iommu_fault_param->dev_fault_handler)
+   return -ENOSYS;
+   if (evt->type == IOMMU_FAULT_PAGE_REQ &&
+   !dev->iommu_fault_param->pasid_tbl_bound) {
+   dev_warn(dev, "PRQ not propaged, PASID table not bound\n");
+   return -EPERM;
+   }
+
+   return dev->iommu_fault_param->dev_fault_handler(dev, evt);
+}
+EXPORT_SYMBOL_GPL(iommu_report_device_fault);
+
 /**
  * iommu_group_id - Return ID for a group
  * @group: the group to ID
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 3f9b367..44d2ada 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -416,6 +416,13 @@ extern int iommu_group_register_notifier(struct iommu_g

[PATCH v2 01/16] iommu: introduce bind_pasid_table API function

2017-10-05 Thread Jacob Pan
Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
use in the guest:
https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html

As part of the proposed architecture, when an SVM capable PCI
device is assigned to a guest, nested mode is turned on. The guest owns
the first level page tables (requests with PASID) which perform GVA->GPA
translation. Second level page tables are owned by the host for GPA->HPA
translation for both requests with and without PASID.

A new IOMMU driver interface is therefore needed to perform tasks as
follows:
* Enable nested translation and the appropriate translation type
* Assign the guest PASID table pointer (in GPA) and size to the host IOMMU

This patch introduces new API functions to perform bind/unbind of guest
PASID tables. Based on the common data, model specific IOMMU drivers can
be extended to perform the specific steps for binding PASID tables of
assigned devices.
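
An illustrative call sequence (e.g. from VFIO; the local variable names
are assumptions, while base_ptr and pasid_bits match the fields used by
the VT-d implementation in patch 02):

    struct pasid_table_config pasidt_binfo = {
        .base_ptr   = gpasidt_gpa, /* guest PA of the PASID table */
        .pasid_bits = bits,        /* table size, per VT-d spec 9.4 */
    };

    ret = iommu_bind_pasid_table(domain, dev, &pasidt_binfo);
    if (ret)
        return ret;
    /* ... guest SVM runs with nested translation ... */
    iommu_unbind_pasid_table(domain, dev);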

Signed-off-by: Jacob Pan 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
---
 drivers/iommu/iommu.c  | 19 
 include/linux/iommu.h  | 25 +
 include/uapi/linux/iommu.h | 55 ++
 3 files changed, 99 insertions(+)
 create mode 100644 include/uapi/linux/iommu.h

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3de5c0b..761cf50 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1322,6 +1322,25 @@ int iommu_attach_device(struct iommu_domain *domain, 
struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_attach_device);
 
+int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
+   struct pasid_table_config *pasidt_binfo)
+{
+   if (unlikely(!domain->ops->bind_pasid_table))
+   return -ENODEV;
+
+   return domain->ops->bind_pasid_table(domain, dev, pasidt_binfo);
+}
+EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
+
+int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
+{
+   if (unlikely(!domain->ops->unbind_pasid_table))
+   return -EINVAL;
+
+   return domain->ops->unbind_pasid_table(domain, dev);
+}
+EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 41b8c57..672cc06 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define IOMMU_READ (1 << 0)
 #define IOMMU_WRITE(1 << 1)
@@ -187,6 +188,8 @@ struct iommu_resv_region {
  * @domain_get_windows: Return the number of windows for a domain
  * @of_xlate: add OF master IDs to iommu grouping
  * @pgsize_bitmap: bitmap of all possible supported page sizes
+ * @bind_pasid_table: bind pasid table pointer for guest SVM
+ * @unbind_pasid_table: unbind pasid table pointer and restore defaults
  */
 struct iommu_ops {
bool (*capable)(enum iommu_cap);
@@ -233,8 +236,14 @@ struct iommu_ops {
u32 (*domain_get_windows)(struct iommu_domain *domain);
 
int (*of_xlate)(struct device *dev, struct of_phandle_args *args);
+
bool (*is_attach_deferred)(struct iommu_domain *domain, struct device 
*dev);
 
+   int (*bind_pasid_table)(struct iommu_domain *domain, struct device *dev,
+   struct pasid_table_config *pasidt_binfo);
+   int (*unbind_pasid_table)(struct iommu_domain *domain,
+   struct device *dev);
+
unsigned long pgsize_bitmap;
 };
 
@@ -296,6 +305,10 @@ extern int iommu_attach_device(struct iommu_domain *domain,
   struct device *dev);
 extern void iommu_detach_device(struct iommu_domain *domain,
struct device *dev);
+extern int iommu_bind_pasid_table(struct iommu_domain *domain,
+   struct device *dev, struct pasid_table_config *pasidt_binfo);
+extern int iommu_unbind_pasid_table(struct iommu_domain *domain,
+   struct device *dev);
 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
 phys_addr_t paddr, size_t size, int prot);
@@ -696,6 +709,18 @@ const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode)
return NULL;
 }
 
+static inline
+int iommu_bind_pasid_table(struct iommu_domain *domain, struct device *dev,
+   struct pasid_table_config *pasidt_binfo)
+{
+   return -EINVAL;
+}
+static inline
+int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
+{
+   return -EINVAL;
+}
+
 #endif /* CONFIG_IOMMU_API */
 
 #endif /* __LINUX_IOMMU_H */
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
new file mode 100644
index 000..aeeaf0e
--- /dev/null
+++ b/include/uapi/linux/iommu.h
@@ -0,0 +1,55 

[PATCH v2 12/16] iommu/vt-d: report unrecoverable device faults

2017-10-05 Thread Jacob Pan
Currently, when device DMA faults are detected by the IOMMU, the fault
reasons are printed but the driver of the offending device is not
involved in fault handling.
This patch uses the per device fault reporting API to send fault event
data for further processing.
The offending device is identified by the source ID in the VT-d fault
reason report registers.

Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
---
 drivers/iommu/dmar.c | 95 +++-
 1 file changed, 94 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index ae33d61..43ea7ab 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1554,6 +1554,31 @@ static const char *irq_remap_fault_reasons[] =
"Blocked an interrupt request due to source-id verification failure",
 };
 
+/* fault data and status */
+enum intel_iommu_fault_reason {
+   INTEL_IOMMU_FAULT_REASON_SW,
+   INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT,
+   INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT,
+   INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID,
+   INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH,
+   INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS,
+   INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS,
+   INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID,
+   INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID,
+   INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID,
+   INTEL_IOMMU_FAULT_REASON_NONE_ZERO_RTP,
+   INTEL_IOMMU_FAULT_REASON_NONE_ZERO_CTP,
+   INTEL_IOMMU_FAULT_REASON_NONE_ZERO_PTE,
+   NR_INTEL_IOMMU_FAULT_REASON,
+};
+
+/* fault reasons that are allowed to be reported outside IOMMU subsystem */
+#define INTEL_IOMMU_FAULT_REASON_ALLOWED   \
+   ((1ULL << INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH) | \
+   (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS) |   \
+   (1ULL << INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS))
+
+
 static const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
 {
if (fault_reason >= 0x20 && (fault_reason - 0x20 <
@@ -1634,6 +1659,70 @@ void dmar_msi_read(int irq, struct msi_msg *msg)
raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 }
 
+static enum iommu_fault_reason to_iommu_fault_reason(u8 reason)
+{
+   if (reason >= NR_INTEL_IOMMU_FAULT_REASON) {
+   pr_warn("unknown DMAR fault reason %d\n", reason);
+   return IOMMU_FAULT_REASON_UNKNOWN;
+   }
+   switch (reason) {
+   case INTEL_IOMMU_FAULT_REASON_SW:
+   case INTEL_IOMMU_FAULT_REASON_ROOT_NOT_PRESENT:
+   case INTEL_IOMMU_FAULT_REASON_CONTEXT_NOT_PRESENT:
+   case INTEL_IOMMU_FAULT_REASON_CONTEXT_INVALID:
+   case INTEL_IOMMU_FAULT_REASON_BEYOND_ADDR_WIDTH:
+   case INTEL_IOMMU_FAULT_REASON_ROOT_ADDR_INVALID:
+   case INTEL_IOMMU_FAULT_REASON_CONTEXT_PTR_INVALID:
+   return IOMMU_FAULT_REASON_CTX;
+   case INTEL_IOMMU_FAULT_REASON_NEXT_PT_INVALID:
+   case INTEL_IOMMU_FAULT_REASON_PTE_WRITE_ACCESS:
+   case INTEL_IOMMU_FAULT_REASON_PTE_READ_ACCESS:
+   return IOMMU_FAULT_REASON_ACCESS;
+   default:
+   return IOMMU_FAULT_REASON_UNKNOWN;
+   }
+}
+
+static void report_fault_to_device(struct intel_iommu *iommu, u64 addr, int type,
+				int fault_type, enum intel_iommu_fault_reason reason, u16 sid)
+{
+   struct iommu_fault_event event;
+   struct pci_dev *pdev;
+   u8 bus, devfn;
+
+   /* check if fault reason is worth reporting outside IOMMU */
+   if (!((1 << reason) & INTEL_IOMMU_FAULT_REASON_ALLOWED)) {
+   pr_debug("Fault reason %d not allowed to report to device\n",
+   reason);
+   return;
+   }
+
+   bus = PCI_BUS_NUM(sid);
+   devfn = PCI_DEVFN(PCI_SLOT(sid), PCI_FUNC(sid));
+   /*
+* we need to check if the fault reporting is requested for the
+* offending device.
+*/
+   pdev = pci_get_bus_and_slot(bus, devfn);
+   if (!pdev) {
+   pr_warn("No PCI device found for source ID %x\n", sid);
+   return;
+   }
+   /*
+* unrecoverable fault is reported per IOMMU, notifier handler can
+* resolve PCI device based on source ID.
+*/
+   event.reason = to_iommu_fault_reason(reason);
+   event.paddr = addr;
+   event.rid = sid;
+   event.type = IOMMU_FAULT_DMA_UNRECOV;
+   event.prot = type ? IOMMU_READ : IOMMU_WRITE;
+   dev_warn(&pdev->dev, "report device unrecoverable fault: %d, %x, %d\n",
+   event.reason, event.rid, event.type);
+   iommu_report_device_fault(&pdev->dev, &event);
+   pci_dev_put(pdev);
+}
+
 static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
u8 fault_reason, u16 source_id, unsigned long long addr)
 {
@@ -1647,11 +1736,15 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
sou

[PATCH v2 06/16] iommu/vt-d: move device_domain_info to header

2017-10-05 Thread Jacob Pan
Allow both intel-iommu.c and dmar.c to access device_domain_info.
Prepare for additional per device arch data used in the TLB flush
functions.

Signed-off-by: Jacob Pan 
---
 drivers/iommu/intel-iommu.c | 18 --
 include/linux/intel-iommu.h | 19 +++
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 81e27eb..e5a5209 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -416,24 +416,6 @@ struct dmar_domain {
   iommu core */
 };
 
-/* PCI domain-device relationship */
-struct device_domain_info {
-   struct list_head link;  /* link to domain siblings */
-   struct list_head global; /* link to global list */
-   u8 bus; /* PCI bus number */
-   u8 devfn;   /* PCI devfn number */
-   u8 pasid_supported:3;
-   u8 pasid_enabled:1;
-   u8 pri_supported:1;
-   u8 pri_enabled:1;
-   u8 ats_supported:1;
-   u8 ats_enabled:1;
-   u8 ats_qdep;
-   struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
-   struct intel_iommu *iommu; /* IOMMU used by this device */
-   struct dmar_domain *domain; /* pointer to domain */
-};
-
 struct dmar_rmrr_unit {
struct list_head list;  /* list of rmrr units   */
struct acpi_dmar_header *hdr;   /* ACPI header  */
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 5c734bd..f42b46c 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -436,6 +436,25 @@ struct intel_iommu {
u32 flags;  /* Software defined flags */
 };
 
+/* PCI domain-device relationship */
+struct device_domain_info {
+   struct list_head link;  /* link to domain siblings */
+   struct list_head global; /* link to global list */
+   u8 bus; /* PCI bus number */
+   u8 devfn;   /* PCI devfn number */
+   u8 pasid_supported:3;
+   u8 pasid_enabled:1;
+   u8 pri_supported:1;
+   u8 pri_enabled:1;
+   u8 ats_supported:1;
+   u8 ats_enabled:1;
+   u8 ats_qdep;
+   u64 fault_mask; /* selected IOMMU faults to be reported */
+   struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
+   struct intel_iommu *iommu; /* IOMMU used by this device */
+   struct dmar_domain *domain; /* pointer to domain */
+};
+
 static inline void __iommu_flush_cache(
struct intel_iommu *iommu, void *addr, int size)
 {
-- 
2.7.4



[PATCH v2 03/16] iommu: introduce iommu invalidate API function

2017-10-05 Thread Jacob Pan
From: "Liu, Yi L" 

When an SVM capable device is assigned to a guest, the first level page
tables are owned by the guest and the guest PASID table pointer is
linked to the device context entry of the physical IOMMU.

The host IOMMU driver has no knowledge of caching structure updates
unless the guest invalidation activities are passed down to the host.
The primary usage is derived from the emulated IOMMU in the guest, where
QEMU can trap invalidation activities before passing them down to the
host/physical IOMMU.
Since the invalidation data are obtained from user space and will be
written into the physical IOMMU, we must allow security checks at
various layers. Therefore, generic invalidation data formats are
proposed here; model specific IOMMU drivers need to convert them into
their own format.
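
A sketch of the intended flow (field names follow the usage in patch
05; values and variable names are illustrative): QEMU traps a guest
queued invalidation descriptor and VFIO passes it down as

    struct tlb_invalidate_info inv_info = {
        .hdr.type    = IOMMU_INV_TYPE_TLB,
        .granularity = IOMMU_INV_GRANU_PAGE_PASID,
        .flags       = IOMMU_INVALIDATE_DMA_PASID,
        .addr        = gva,   /* guest virtual address */
        .size        = 0,     /* order 0, i.e. a single page */
        .pasid       = pasid,
    };

    ret = iommu_invalidate(domain, dev, &inv_info);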

Signed-off-by: Liu, Yi L 
Signed-off-by: Jacob Pan 
Signed-off-by: Ashok Raj 
---
 drivers/iommu/iommu.c  | 14 +++
 include/linux/iommu.h  | 12 +
 include/uapi/linux/iommu.h | 62 ++
 3 files changed, 88 insertions(+)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 761cf50..5a14154 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1341,6 +1341,20 @@ int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
 }
 EXPORT_SYMBOL_GPL(iommu_unbind_pasid_table);
 
+int iommu_invalidate(struct iommu_domain *domain,
+   struct device *dev, struct tlb_invalidate_info *inv_info)
+{
+   int ret = 0;
+
+   if (unlikely(!domain->ops->invalidate))
+   return -ENODEV;
+
+   ret = domain->ops->invalidate(domain, dev, inv_info);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_invalidate);
+
 static void __iommu_detach_device(struct iommu_domain *domain,
  struct device *dev)
 {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 672cc06..4af1820 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -190,6 +190,7 @@ struct iommu_resv_region {
  * @pgsize_bitmap: bitmap of all possible supported page sizes
  * @bind_pasid_table: bind pasid table pointer for guest SVM
  * @unbind_pasid_table: unbind pasid table pointer and restore defaults
+ * @invalidate: invalidate translation caches
  */
 struct iommu_ops {
bool (*capable)(enum iommu_cap);
@@ -243,6 +244,8 @@ struct iommu_ops {
struct pasid_table_config *pasidt_binfo);
int (*unbind_pasid_table)(struct iommu_domain *domain,
struct device *dev);
+   int (*invalidate)(struct iommu_domain *domain,
+   struct device *dev, struct tlb_invalidate_info *inv_info);
 
unsigned long pgsize_bitmap;
 };
@@ -309,6 +312,9 @@ extern int iommu_bind_pasid_table(struct iommu_domain *domain,
struct device *dev, struct pasid_table_config *pasidt_binfo);
 extern int iommu_unbind_pasid_table(struct iommu_domain *domain,
struct device *dev);
+extern int iommu_invalidate(struct iommu_domain *domain,
+   struct device *dev, struct tlb_invalidate_info *inv_info);
+
 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
 extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
 phys_addr_t paddr, size_t size, int prot);
@@ -721,6 +727,12 @@ int iommu_unbind_pasid_table(struct iommu_domain *domain, struct device *dev)
return -EINVAL;
 }
 
+static inline int iommu_invalidate(struct iommu_domain *domain,
+   struct device *dev, struct tlb_invalidate_info *inv_info)
+{
+   return -EINVAL;
+}
+
 #endif /* CONFIG_IOMMU_API */
 
 #endif /* __LINUX_IOMMU_H */
diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
index aeeaf0e..ee11aae 100644
--- a/include/uapi/linux/iommu.h
+++ b/include/uapi/linux/iommu.h
@@ -52,4 +52,66 @@ struct pasid_table_config {
};
 };
 
+enum iommu_inv_granularity {
+   IOMMU_INV_GRANU_GLOBAL, /* all TLBs invalidated */
+   IOMMU_INV_GRANU_DOMAIN, /* all TLBs associated with a domain */
+   IOMMU_INV_GRANU_DEVICE, /* caching structure associated with a
+* device ID
+*/
+	IOMMU_INV_GRANU_DOMAIN_PAGE, /* address range within a domain */
+   IOMMU_INV_GRANU_ALL_PASID,  /* cache of a given PASID */
+   IOMMU_INV_GRANU_PASID_SEL,  /* only invalidate specified PASID */
+
+   IOMMU_INV_GRANU_NG_ALL_PASID,   /* non-global within all PASIDs */
+	IOMMU_INV_GRANU_NG_PASID,	/* non-global within a PASID */
+   IOMMU_INV_GRANU_PAGE_PASID, /* page-selective within a PASID */
+   IOMMU_INV_NR_GRANU,
+};
+
+enum iommu_inv_type {
+   IOMMU_INV_TYPE_DTLB,/* device IOTLB */
+   IOMMU_INV_TYPE_TLB, /* IOMMU paging structure cache */
+   IOMMU_INV_TYPE_PASID,   /* PASID

[PATCH v2 04/16] iommu/vt-d: support flushing more TLB types

2017-10-05 Thread Jacob Pan
Signed-off-by: Jacob Pan 
---
 drivers/iommu/dmar.c| 53 ++---
 drivers/iommu/intel-iommu.c |  3 ++-
 include/linux/intel-iommu.h | 10 +++--
 3 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c
index 57c920c..2fbff8b 100644
--- a/drivers/iommu/dmar.c
+++ b/drivers/iommu/dmar.c
@@ -1336,11 +1336,25 @@ void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
qi_submit_sync(&desc, iommu);
 }
 
-void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
-   u64 addr, unsigned mask)
+void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr, u32 pasid,
+   unsigned int size_order, u64 granu, bool global)
 {
struct qi_desc desc;
 
+   desc.low = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) |
+   QI_EIOTLB_GRAN(granu) | QI_EIOTLB_TYPE;
+   desc.high = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_GL(global) |
+   QI_EIOTLB_IH(0) | QI_EIOTLB_AM(size_order);
+   qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+   u16 qdep, u64 addr, unsigned mask)
+{
+   struct qi_desc desc;
+
+	pr_debug_ratelimited("%s: sid %d, pfsid %d, qdep %d, addr %llx, mask %d\n",
+			__func__, sid, pfsid, qdep, addr, mask);
if (mask) {
BUG_ON(addr & ((1 << (VTD_PAGE_SHIFT + mask)) - 1));
addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1;
@@ -1352,7 +1366,40 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
qdep = 0;
 
desc.low = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) |
-  QI_DIOTLB_TYPE;
+  QI_DIOTLB_TYPE | QI_DEV_IOTLB_SID(pfsid);
+
+   qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+   u32 pasid,  u16 qdep, u64 addr, unsigned size)
+{
+   struct qi_desc desc;
+
+   desc.low = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) |
+   QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE |
+   QI_DEV_EIOTLB_PFSID(pfsid);
+
+   /* If S bit is 0, we only flush a single page. If S bit is set,
+* The least significant zero bit indicates the size. VT-d spec
+* 6.5.2.6
+*/
+   if (!size)
+   desc.high = QI_DEV_EIOTLB_ADDR(addr) & ~QI_DEV_EIOTLB_SIZE;
+   else {
+   unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size);
+
+		desc.high = QI_DEV_EIOTLB_ADDR(addr & ~mask) | QI_DEV_EIOTLB_SIZE;
+   }
+   qi_submit_sync(&desc, iommu);
+}
+
+void qi_flush_pasid(struct intel_iommu *iommu, u16 did, u64 granu, int pasid)
+{
+   struct qi_desc desc;
+
+   desc.high = 0;
+	desc.low = QI_PC_TYPE | QI_PC_DID(did) | QI_PC_GRAN(granu) | QI_PC_PASID(pasid);
 
qi_submit_sync(&desc, iommu);
 }
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 7ae569c..6832f73 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -1567,7 +1567,8 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
 
sid = info->bus << 8 | info->devfn;
qdep = info->ats_qdep;
-   qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
+   qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
+   qdep, addr, mask);
}
spin_unlock_irqrestore(&device_domain_lock, flags);
 }
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 485a5b4..e42d317 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -305,6 +305,7 @@ enum {
 #define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32)
#define QI_DEV_EIOTLB_SID(sid)	((u64)((sid) & 0xffff) << 16)
 #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4)
+#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xff0) << 48))
 #define QI_DEV_EIOTLB_MAX_INVS 32
 
 #define QI_PGRP_IDX(idx)   (((u64)(idx)) << 55)
@@ -450,8 +451,13 @@ extern void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid,
 u8 fm, u64 type);
 extern void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
  unsigned int size_order, u64 type);
-extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 qdep,
-  u64 addr, unsigned mask);
+extern void qi_flush_eiotlb(struct intel_iommu *iommu, u16 did, u64 addr,
+			u32 pasid, unsigned int size_order, u64 type, bool global);
+extern void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+   u16 qdep, u64 addr, unsigned mask);
+extern void qi_flush_dev_eiotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid,
+  

[PATCH v2 02/16] iommu/vt-d: add bind_pasid_table function

2017-10-05 Thread Jacob Pan
Add Intel VT-d ops to the generic iommu_bind_pasid_table API
functions.

The primary use case is direct assignment of an SVM capable
device. Originating from the emulated IOMMU in the guest, the request
goes through many layers (e.g. VFIO). Upon calling the host IOMMU
driver, the caller passes the guest PASID table pointer (GPA) and size.

The device context table entry is modified by the Intel IOMMU specific
bind_pasid_table function. This turns on nesting mode and the matching
translation type.

The unbind operation restores the default context mapping.

Signed-off-by: Jacob Pan 
Signed-off-by: Liu, Yi L 
Signed-off-by: Ashok Raj 
---
 drivers/iommu/intel-iommu.c   | 117 ++
 include/linux/dma_remapping.h |   1 +
 2 files changed, 118 insertions(+)

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 209d99a..7ae569c 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -5200,6 +5200,7 @@ static void intel_iommu_put_resv_regions(struct device *dev,
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
 #define MAX_NR_PASID_BITS (20)
+#define MIN_NR_PASID_BITS (5)
 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
 {
/*
@@ -5326,6 +5327,118 @@ struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
 
return iommu;
 }
+
+static int intel_iommu_bind_pasid_table(struct iommu_domain *domain,
+   struct device *dev, struct pasid_table_config *pasidt_binfo)
+{
+   struct intel_iommu *iommu;
+   struct context_entry *context;
+   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+   struct device_domain_info *info;
+   struct pci_dev *pdev;
+   u8 bus, devfn, host_table_pasid_bits;
+   u16 did, sid;
+   int ret = 0;
+   unsigned long flags;
+   u64 ctx_lo;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+   /* VT-d spec 9.4 says pasid table size is encoded as 2^(x+5) */
+   host_table_pasid_bits = intel_iommu_get_pts(iommu) + MIN_NR_PASID_BITS;
+   if (!pasidt_binfo || pasidt_binfo->pasid_bits > host_table_pasid_bits ||
+   pasidt_binfo->pasid_bits < MIN_NR_PASID_BITS) {
+   pr_err("Invalid gPASID bits %d, host range %d - %d\n",
+   pasidt_binfo->pasid_bits,
+   MIN_NR_PASID_BITS, host_table_pasid_bits);
+   return -ERANGE;
+   }
+
+   pdev = to_pci_dev(dev);
+   if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
+   return -EINVAL;
+   sid = PCI_DEVID(bus, devfn);
+
+   info = dev->archdata.iommu;
+   if (!info || !info->pasid_supported) {
+   dev_err(dev, "No PASID support\n");
+   ret = -EINVAL;
+   goto out;
+   }
+   if (!info->pasid_enabled) {
+   ret = pci_enable_pasid(pdev, info->pasid_supported & ~1);
+   if (ret)
+   goto out;
+   }
+   if (!device_context_mapped(iommu, bus, devfn)) {
+   pr_warn("ctx not mapped for bus devfn %x:%x\n", bus, devfn);
+   ret = -EINVAL;
+   goto out;
+   }
+   spin_lock_irqsave(&iommu->lock, flags);
+   context = iommu_context_addr(iommu, bus, devfn, 0);
+   if (!context) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   /* Anticipate that the guest will use SVM and own the first level,
+    * so we turn nesting mode on
+    */
+   ctx_lo = context[0].lo;
+   ctx_lo |= CONTEXT_NESTE | CONTEXT_PRS | CONTEXT_PASIDE;
+   ctx_lo &= ~CONTEXT_TT_MASK;
+   ctx_lo |= CONTEXT_TT_DEV_IOTLB << 2;
+   context[0].lo = ctx_lo;
+
+   /* Assign guest PASID table pointer and size order */
+   ctx_lo = (pasidt_binfo->base_ptr & VTD_PAGE_MASK) |
+   (pasidt_binfo->pasid_bits - MIN_NR_PASID_BITS);
+   context[1].lo = ctx_lo;
+   /* make sure context entry is updated before flushing */
+   wmb();
+   did = dmar_domain->iommu_did[iommu->seq_id];
+   iommu->flush.flush_context(iommu, did,
+   (((u16)bus) << 8) | devfn,
+   DMA_CCMD_MASK_NOBIT,
+   DMA_CCMD_DEVICE_INVL);
+   iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+out_unlock:
+   spin_unlock_irqrestore(&iommu->lock, flags);
+out:
+   return ret;
+}
+
+static int intel_iommu_unbind_pasid_table(struct iommu_domain *domain,
+   struct device *dev)
+{
+   struct intel_iommu *iommu;
+   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+   struct device_domain_info *info;
+   u8 bus, devfn;
+
+   iommu = device_to_iommu(dev, &bus, &devfn);
+   if (!iommu)
+   return -ENODEV;
+   /*
+* REVISIT: we might want to clear the PASID table pointer
+* as part of context cl

[PATCH v2 00/16] IOMMU driver support for SVM virtualization

2017-10-05 Thread Jacob Pan
Hi All,

Shared virtual memory (SVM) between devices and applications can
reduce programming complexity and enhance security. To enable SVM in
the guest, i.e. to share a guest application address space with a
physical device for DMA, the IOMMU driver must provide some new
functionality.

This patchset is a follow-up on the discussions held at LPC 2017
VFIO/IOMMU/PCI track. Slides and notes can be found here:
https://linuxplumbersconf.org/2017/ocw/events/LPC2017/tracks/636

The complete guest SVM support also involves changes in QEMU and VFIO,
which have been posted earlier.
https://www.spinics.net/lists/kvm/msg148798.html

This is the IOMMU portion of the more complete series of kernel changes
to support vSVM. Please refer to the link below for more details:
https://www.spinics.net/lists/kvm/msg148819.html

Generic APIs are introduced in addition to Intel VT-d specific changes;
the goal is to have common interfaces across IOMMU and device types for
both VFIO and other in-kernel users.

At the top level, new IOMMU interfaces are introduced as follows:
 - bind guest PASID table
 - passdown invalidations of translation caches
 - IOMMU device fault reporting including page request/response and
   non-recoverable faults.

For IOMMU-detected device fault reporting, struct device is extended to
provide a callback and tracking at the device level. The original
proposal was discussed here: "Error handling for I/O memory management
units" (https://lwn.net/Articles/608914/). I have experimented with two
alternative solutions:
1. use a shared group notifier; this does not scale well and also causes
unwanted notification traffic when a sibling device in the group reports
faults.
2. place the fault callback in the device's IOMMU arch data, e.g.
device_domain_info in the Intel/FSL IOMMU drivers; this causes code
duplication, since per-device fault reporting is generic.
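
As a rough illustration of the per-device model, a hedged sketch of the
driver-side usage (the registration helper's name and signature are
assumptions based on patch 09 of this series; the handler body is purely
illustrative):

	/* Driver-supplied handler for IOMMU-reported faults (sketch). */
	static int mydrv_iommu_fault_handler(struct device *dev,
					     struct iommu_fault_event *evt)
	{
		/* e.g. forward recoverable page requests to the guest,
		 * or log/abort on unrecoverable faults. */
		return 0;
	}

	/* At probe time (assumed name and signature): */
	ret = iommu_register_device_fault_handler(dev,
						  mydrv_iommu_fault_handler);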

The remaining patches are Intel VT-d specific; they either implement
the generic interfaces or replace existing private interfaces with the
generic ones.

Changelog:

V2
- Replaced the hybrid interface data model (generic data + vendor-specific
data) with all-generic data. This has the security benefit that data
passed from user space can be sanitized by all software layers if
needed.
- Addressed review comments from V1
- Use per device fault report data
- Support page request/response communications between host IOMMU and
guest or other in-kernel users.
- Added unrecoverable fault reporting to DMAR
- Use threaded IRQ function for DMAR fault interrupt and fault
reporting

Jacob Pan (15):
  iommu: introduce bind_pasid_table API function
  iommu/vt-d: add bind_pasid_table function
  iommu/vt-d: support flushing more TLB types
  iommu/vt-d: add iommu invalidate function
  iommu/vt-d: move device_domain_info to header
  iommu/vt-d: assign PFSID in device TLB invalidation
  iommu: introduce device fault data
  driver core: add iommu device fault reporting data
  iommu: introduce device fault report API
  iommu/vt-d: use threaded irq for dmar_fault
  iommu/vt-d: report unrecoverable device faults
  iommu/intel-svm: notify page request to guest
  iommu/intel-svm: replace dev ops with fault report API
  iommu: introduce page response function
  iommu/vt-d: add intel iommu page response function

Liu, Yi L (1):
  iommu: introduce iommu invalidate API function

 drivers/iommu/dmar.c  | 151 ++-
 drivers/iommu/intel-iommu.c   | 329 +++---
 drivers/iommu/intel-svm.c |  94 ++--
 drivers/iommu/iommu.c | 103 -
 include/linux/device.h|   3 +
 include/linux/dma_remapping.h |   1 +
 include/linux/intel-iommu.h   |  42 +-
 include/linux/intel-svm.h |  20 +--
 include/linux/iommu.h | 172 ++
 include/uapi/linux/iommu.h| 117 +++
 10 files changed, 975 insertions(+), 57 deletions(-)
 create mode 100644 include/uapi/linux/iommu.h

-- 
2.7.4

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v8 1/5] Doc: iommu/arm-smmu-v3: Add workaround for HiSilicon erratum 161010801

2017-10-05 Thread Rob Herring
On Wed, Sep 27, 2017 at 02:32:37PM +0100, Shameer Kolothum wrote:
> From: John Garry 
> 
> The HiSilicon erratum 161010801 describes a limitation of the HiSilicon
> hip06/hip07 platforms in supporting SMMU mappings for MSI transactions.
> 
> On these platforms, the GICv3 ITS translator is presented with the
> deviceID by extending the MSI payload data to 64 bits to include the
> deviceID. Hence, the PCIe controller on these platforms has to
> differentiate the MSI payload from other DMA payloads and modify the
> MSI payload accordingly. This basically makes it difficult for these
> platforms to have SMMU translation for MSI.
> 
> This patch adds a compatible string to implement this erratum for the
> HiSilicon Hi161x SMMUv3 model on hip06/hip07 platforms.
> 
> Also, the arm64 silicon errata document is updated with this same
> erratum.
> 
> Signed-off-by: John Garry 
> [Shameer: Modified to use compatible string for errata]
> Signed-off-by: Shameer Kolothum 
> ---
>  Documentation/arm64/silicon-errata.txt  | 1 +
>  Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt | 9 -
>  2 files changed, 9 insertions(+), 1 deletion(-)

Acked-by: Rob Herring 
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] iommu/arm-smmu-v3: Ensure we sync STE when only changing config field

2017-10-05 Thread Robin Murphy
On 05/10/17 17:49, Will Deacon wrote:
> The SMMUv3 architecture permits caching of data structures deemed to be
> "reachable" by the SMU, which includes STEs marked as invalid. When
> transitioning an STE to a bypass/fault configuration at init or detach
> time, we mistakenly elide the CMDQ_OP_CFGI_STE operation in some cases,
> therefore potentially leaving the old STE state cached in the SMMU.
> 
> This patch fixes the problem by ensuring that we perform the
> CMDQ_OP_CFGI_STE operation irrespective of the validity of the previous
> STE.

Reviewed-by: Robin Murphy 

> Cc: Robin Murphy 
> Reported-by: Eric Auger 
> Signed-off-by: Will Deacon 
> ---
>  drivers/iommu/arm-smmu-v3.c | 7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 47f52b1ab838..91fdabdb4de6 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -1085,8 +1085,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
>   dst[1] = cpu_to_le64(STRTAB_STE_1_SHCFG_INCOMING
><< STRTAB_STE_1_SHCFG_SHIFT);
>   dst[2] = 0; /* Nuke the VMID */
> - if (ste_live)
> - arm_smmu_sync_ste_for_sid(smmu, sid);
> + /*
> +  * The SMMU can perform negative caching, so we must sync
> +  * the STE regardless of whether the old value was live.
> +  */
> + arm_smmu_sync_ste_for_sid(smmu, sid);
>   return;
>   }
>  
> 

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH] iommu/arm-smmu-v3: Ensure we sync STE when only changing config field

2017-10-05 Thread Will Deacon
The SMMUv3 architecture permits caching of data structures deemed to be
"reachable" by the SMU, which includes STEs marked as invalid. When
transitioning an STE to a bypass/fault configuration at init or detach
time, we mistakenly elide the CMDQ_OP_CFGI_STE operation in some cases,
therefore potentially leaving the old STE state cached in the SMMU.

This patch fixes the problem by ensuring that we perform the
CMDQ_OP_CFGI_STE operation irrespective of the validity of the previous
STE.

Cc: Robin Murphy 
Reported-by: Eric Auger 
Signed-off-by: Will Deacon 
---
 drivers/iommu/arm-smmu-v3.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 47f52b1ab838..91fdabdb4de6 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1085,8 +1085,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_device *smmu, u32 sid,
dst[1] = cpu_to_le64(STRTAB_STE_1_SHCFG_INCOMING
 << STRTAB_STE_1_SHCFG_SHIFT);
dst[2] = 0; /* Nuke the VMID */
-   if (ste_live)
-   arm_smmu_sync_ste_for_sid(smmu, sid);
+   /*
+* The SMMU can perform negative caching, so we must sync
+* the STE regardless of whether the old value was live.
+*/
+   arm_smmu_sync_ste_for_sid(smmu, sid);
return;
}
 
-- 
2.1.4

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC v2 2/4] iommu/arm-smmu-v3: Add tlbi_on_map option

2017-10-05 Thread Auger Eric
Hi Will,

On 23/08/2017 18:42, Will Deacon wrote:
> Hi Eric,
> 
> On Wed, Aug 23, 2017 at 02:36:53PM +0200, Auger Eric wrote:
>> On 23/08/2017 12:25, Will Deacon wrote:
>>> On Tue, Aug 22, 2017 at 10:09:15PM +0300, Michael S. Tsirkin wrote:
 On Fri, Aug 18, 2017 at 05:49:42AM +0300, Michael S. Tsirkin wrote:
> On Thu, Aug 17, 2017 at 05:34:25PM +0100, Will Deacon wrote:
>> On Fri, Aug 11, 2017 at 03:45:28PM +0200, Eric Auger wrote:
>>> When running a virtual SMMU on a guest we sometimes need to trap
>>> all changes to the translation structures. This is especially useful
>>> to integrate with VFIO. This patch adds a new option that forces
>>> the IO_PGTABLE_QUIRK_TLBI_ON_MAP to be applied on LPAE page tables.
>>>
>>> TLBI commands then can be trapped.
>>>
>>> Signed-off-by: Eric Auger 
>>>
>>> ---
>>> v1 -> v2:
>>> - rebase on v4.13-rc2
>>> ---
>>>  Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt | 4 
>>>  drivers/iommu/arm-smmu-v3.c | 5 +
>>>  2 files changed, 9 insertions(+)
>>>
>>> diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt 
>>> b/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt
>>> index c9abbf3..ebb85e9 100644
>>> --- a/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt
>>> +++ b/Documentation/devicetree/bindings/iommu/arm,smmu-v3.txt
>>> @@ -52,6 +52,10 @@ the PCIe specification.
>>>  
>>> devicetree/bindings/interrupt-controller/msi.txt
>>>for a description of the msi-parent property.
>>>  
>>> +- tlbi-on-map   : invalidate caches whenever there is an update of
>>> +  any remapping structure (updates to not-present 
>>> or
>>> +  present entries).
>>> +
>>
>> My position on this hasn't changed, so NAK for this patch. If you want to
>> emulate something outside of the SMMUv3 architecture, please do so, but
>> don't pretend that it's an SMMUv3.
>>
>> Will
>
> What if the emulated device does not list arm,smmu-v3, listing
> qemu,ssmu-v3 as compatible? Would that address the concern?

 Will, can you comment on this please? Are you open to reusing the code
 in drivers/iommu/arm-smmu-v3.c to support a paravirtual device that does
 not claim to be compatible with smmuv3 but does try to behave very close to
 it except it can cache non-present structures? Or would you rather
 the code to support this is forked to qemu-smmu-v3.c?
>>>
>>> I still don't understand why this is preferable to a PV IOMMU
>>> implementation. Not only is this proposing to issue TLB maintenance on
>>> map, but the maintenance command itself is entirely made up. Why not just
>>> have a map command? Anyway, I'm reluctant to add this hack to the driver 
>>> until:
>>>
>>>   1. There is a compelling reason to pursue this approach instead of a
>>>  PV approach (including performance measurements).
>>>
>>>   2. There is a specification for the QEMU fork of the ARM SMMUv3
>>>  architecture, including the semantics of the new command being proposed
>>>  and what exactly the TLB maintenance requirements are on map (for
>>>  example, what if I change an STE or a CD -- are they cached too?).
>> I am not sure I catch this last point. At the moment whenever the smmuv3
>> driver issues data structure invalidation commands (CMD_CFGI_*), those
>> are trapped and I replay the mappings on host side. I have not changed
>> anything on that side.
> 
> But STEs and CDs have very similar rules to TLBs: you don't need to issue
> invalidation if the data structure is transitioning from invalid to valid.

While looking at chapter "4.8 Virtualisation" of the SMMUv3 spec, I
understand that if we were to use the two stages we would need to trap
on STE updates, since those are owned by the hypervisor.

The spec says "updates to a guest STE are accompanied by a CMD_CFGI_STE
(or similar) issued from the guest". So I understand that invalidation
of CDs is not mandated by the spec, but invalidation of STEs would be
required when the data structure transitions from invalid to valid. Is
that correct? I fail to see whether this is currently done by the
smmuv3 driver.


> If you're caching those in QEMU, how do you keep them up-to-date? I can
> also guarantee you that there will be additional data structures added
> in future versions of the architecture, so you'll need to consider how
> you want to operate when running on newer hardware.
> 
>> I introduced a new map implementation defined command because the per
>> page CMD_TLBI_NH_VA IOVA invalidation command was not efficient/usable
>> with use cases such as DPDK on guest. I understood the spec provisions
>> for such implementation defined commands.

Also if we were to use dual stage, command queue accesses still would be
trapped. So if a guest
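
For context, a hedged sketch of what the option plumbing described in
this thread might look like in arm_smmu_domain_finalise() (the option
flag name is an assumption for illustration; IO_PGTABLE_QUIRK_TLBI_ON_MAP
comes from the commit message, and the actual diff is not shown here):

	/* Assumed shape, not the posted patch: apply the quirk when the
	 * "tlbi-on-map" DT option was parsed into a device option flag. */
	if (smmu->options & ARM_SMMU_OPT_TLBI_ON_MAP)
		pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_TLBI_ON_MAP;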

Re: [PATCH v8 3/5] iommu/of: Add msi address regions reservation helper

2017-10-05 Thread Robin Murphy
On 04/10/17 14:50, Lorenzo Pieralisi wrote:
> On Wed, Oct 04, 2017 at 12:22:04PM +0100, Marc Zyngier wrote:
>> On 27/09/17 14:32, Shameer Kolothum wrote:
>>> From: John Garry 
>>>
>>> On some platforms msi-controller address regions have to be excluded
>>> from normal IOVA allocation in that they are detected and decoded in
>>> a HW specific way by system components and so they cannot be considered
>>> normal IOVA address space.
>>>
>>> Add a helper function that retrieves msi address regions through device
>>> tree msi mapping, so that these regions will not be translated by IOMMU
>>> and will be excluded from IOVA allocations.
>>>
>>> Signed-off-by: John Garry 
>>> [Shameer: Modified msi-parent retrieval logic]
>>> Signed-off-by: Shameer Kolothum 
>>> ---
>>>  drivers/iommu/of_iommu.c | 95 
>>> 
>>>  include/linux/of_iommu.h | 10 +
>>>  2 files changed, 105 insertions(+)
>>>
>>> diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
>>> index e60e3db..ffd7fb7 100644
>>> --- a/drivers/iommu/of_iommu.c
>>> +++ b/drivers/iommu/of_iommu.c
>>> @@ -21,6 +21,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>  #include 
>>>  #include 
>>>  #include 
>>> @@ -138,6 +139,14 @@ static int of_iommu_xlate(struct device *dev,
>>> return ops->of_xlate(dev, iommu_spec);
>>>  }
>>>  
>>> +static int __get_pci_rid(struct pci_dev *pdev, u16 alias, void *data)
>>> +{
>>> +   u32 *rid = data;
>>> +
>>> +   *rid = alias;
>>> +   return 0;
>>> +}
>>> +
>>>  struct of_pci_iommu_alias_info {
>>> struct device *dev;
>>> struct device_node *np;
>>> @@ -163,6 +172,73 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)
>>> return info->np == pdev->bus->dev.of_node;
>>>  }
>>>  
>>> +static int of_iommu_alloc_resv_region(struct device_node *np,
>>> + struct list_head *head)
>>> +{
>>> +   int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
>>> +   struct iommu_resv_region *region;
>>> +   struct resource res;
>>> +   int err;
>>> +
>>> +   err = of_address_to_resource(np, 0, &res);
>>
>> What is the rational for registering the first region only? Surely you
>> can have more than one region in an MSI controller (yes, your particular
>> case is the ITS which has only one, but we're trying to do something
>> generic here).
>>
>> Another issue I have with this code is that it inserts all of the ITS
>> MMIO in the RESV_MSI range. As long as we don't generate any page tables
>> for this, we're fine. But if we ever change this, we'll end-up with the
>> ITS programming interface being exposed to a device, which wouldn't be
>> acceptable.
>>
>> Why can't we have some generic property instead, that would describe the
>> actual ranges that cannot be translated? That way, no random insertion
>> of a random range, and relatively future proof.

Indeed. Furthermore, IORT has the advantage of being limited to a few
distinct device types and SBSA-compliant system topologies, so the
ITS-chasing is reasonable there (modulo only reserving GITS_TRANSLATER).
The scope of DT covers more embedded things as well like PCI host
controllers with internal MSI doorbells, and potentially even
direct-mapped memory regions for things like bootloader framebuffers to
prevent display glitches - a generic address/size/flags description of a
region could work for just about everything.

> IORT code has the same issue (ie it reserves all ITS regions) and I do
> not know where a property can be added to describe those ranges (IORT
> specs ? I'd rather not) in ACPI other than the IORT tables entries
> themselves.
> 
>> I'm not sure where to declare it (PCIe node? IOMMU node?), but it'd feel
>> like a much nicer and simpler solution to this problem.
> 
> It could be but if we throw ACPI into the picture we have to knock
> together Hisilicon specific namespace bindings to handle this and
> quickly.

As above I'm happy with the ITS-specific solution for ACPI given the
limits of IORT. What I had in mind for DT was something tied in with the
generic IOMMU bindings. Something like this is probably easiest to
handle, but does rather spread the information around:


  pci {
...
iommu-map = <0 0 &iommu1 0x1>;
iommu-reserved-ranges = <0x1234 0x1000 IOMMU_MSI>,
<0x3456 0x1000 IOMMU_MSI>;
  };

  display {
...
iommus = <&iommu1 0x2>;
/* simplefb region */
iommu-reserved-ranges = <0x8008 0x8 IOMMU_DIRECT>,
  };


Alternatively, something inspired by reserved-memory might perhaps be
conceptually neater, at the risk of being more complicated:


  iommu1: iommu@acbd {
...
#iommu-cells = <1>;

iommu-reserved-ranges {
#address-cells = <1>;
#size-cells = <1>;

its0_resv: msi@1234 {
compatible = "iommu-msi-region";

Re: [PATCH] dt-bindings: iommu: ipmmu-vmsa: Use generic node name

2017-10-05 Thread Simon Horman
On Wed, Oct 04, 2017 at 02:33:08PM +0200, Geert Uytterhoeven wrote:
> Use the preferred generic node name in the example.
> 
> Signed-off-by: Geert Uytterhoeven 

Reviewed-by: Simon Horman 

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: Task based virtual address spaces

2017-10-05 Thread Jean-Philippe Brucker
Hi Jordan,

On 04/10/17 20:43, Jordan Crouse wrote:
> Trying to start back up the conversation about multiple address
> spaces for IOMMU devices. If you will remember Jean-Philippe posted
> some patches back in February for SVM on arm-smmu-v3.
> 
> For quite some time the downstream Snapdragon kernels have supported
> something we call "per-process" page tables for the GPU. As with full SVM
> this involves creating a virtual address space per task but unlike SVM
> we don't automatically share the page table from the CPU. Instead we
> want to create a new page table and explicitly map/unmap address ranges
> into it. We provide the physical address of the page table to the GPU and
> it goes through the mechanics of programming the TTBR0 and invalidating
> the TLB when it starts executing a submission for a given task.

Why does the GPU need the pgd? Does it implement its own MMU specifically
for process contexts? I understand you don't use PASIDs/SSIDs to isolate
process page tables but context-switch them instead?

> As with all things IOMMU this discussion needs to be split into two parts -
> the API and the implementation. I want to focus on the generic API for this
> email. Shortly after Jean-Philippe posted his patches I sent out a rough
> prototype of how the downstream solution worked [1]:
> 
> +-----------------+        +------------------+
> | "master" domain | -----> | "dynamic" domain |
> +-----------------+   \    +------------------+
>                        \
>                         \  +------------------+
>                          - | "dynamic" domain |
>                            +------------------+

I also considered using hierarchical domains in my first prototype, but it
didn't seem to fit the IOMMU API. In the RFC that I intend to post this
week, I propose an iommu_process structure for everything process related.

I'm not sure if my new proposal fits your model since I didn't intend
iommu_process to be controllable externally with an IOMMU map/unmap
interface (the meat of the bind/unbind API is really page table sharing).
In v2 bind/unbind still only returns a PASID, not the process structure,
but I'll Cc you so we can work something out.

> Given a "master" domain (created in the normal way) we can create any number
> of "dynamic" domains which share the same configuration as the master (table
> format, context bank, quirks, etc). When the dynamic domain is allocated/
> attached it creates a new page table - for all intents and purposes this is
> a "real" domain except that it doesn't actually touch the hardware. We can
> use this domain with iommu_map() / iommu_unmap() as usual and then pass the
> physical address (acquired through a IOMMU domain attribute) to the GPU and
> everything works.
> 
> The main goal for this approach was to try to use the iommu API instead of
> teaching the GPU driver how to deal with several generations of page table
> formats and IOMMU devices. Shoehorning it into the domain struct was a
> path of least resistance that has served Snapdragon well but it was never
> really anything we considered to be a generic solution.
> 
> In the SVM patches, Jean-Philippe introduces iommu_bind_task():
> https://patchwork.codeaurora.org/patch/184777/. Given a struct task and
> a bit of other stuff it goes off and does SVM magic.
> 
> My proposal would be to extend this slightly and return a iommu_task
> struct from iommu_bind_task:
> 
> struct iommu_task *iommu_bind_task(struct device *dev, struct task_struct *task,
>   int *pasid, int flags, void *priv);

Since the GPU driver acts as a proxy and the page tables are not shared,
I suppose you don't need the task_struct at all? Or maybe just for
cleaning up on process exit?

Thanks,
Jean

> For implementations that did real SVM the rest would behave the same, but for
> targets with explicit map requirements the iommu_task token could then be
> used for map/unmap:
> 
> iommu_task_map(struct iommu_task *task, unsigned long iova, phys_addr_t paddr,
>   size_t size, int prot);
> 
> iommu_task_unmap(struct iommu_task *task, unsigned long iova, size_t size);
> 
> int iommu_unbind_task(struct device *dev, struct iommu_task *task, int pasid,
>   int flags);
> 
> (Note that I'm typing these up on the fly - don't take these prototypes as
> gospel. This is mostly just a representation of the hooks we would need).
> 
> Internally this would come down into the device drivers with code similar
> to https://patchwork.codeaurora.org/patch/184751/. At the device level
> the biggest question figuring out if this is the "full SVM" or "explicit"
> model - we could handle that with compatible strings or dt quirks or even
> a flag to iommu_bind_task(). On Snapdragon we have the additional
> requirement to get the physical address for the page table. That could
> be encoded into pasid or returned in the iommu_task struct or a task
> attribute function.
>
> Comments welcome of course. I think that if we can focus on getting a good
> generic API n
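
For illustration, a hedged sketch of how a driver might use the proposed
hooks (the prototypes above are explicitly provisional, so this is a
shape sketch under those assumptions, with error handling elided and
iova/paddr/size as placeholder variables):

	struct iommu_task *itask;
	int pasid, ret;

	/* Create a per-task address space on the device's IOMMU. */
	itask = iommu_bind_task(dev, current, &pasid, 0, NULL);
	if (IS_ERR(itask))
		return PTR_ERR(itask);

	/* Explicit-map model: populate the task's IOVA space. */
	ret = iommu_task_map(itask, iova, paddr, size,
			     IOMMU_READ | IOMMU_WRITE);

	/* ... hand the page table address to the GPU, submit work ... */

	iommu_task_unmap(itask, iova, size);
	iommu_unbind_task(dev, itask, pasid, 0);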