[PATCH 7/7] iommu/iova: fix iovad->dma_32bit_pfn as the last pfn of dma32

2017-03-21 Thread Zhen Lei
Make sure iovad->cached32_node and iovad->cached64_node can exactly
control the dma32 and dma64 areas. This also helps us remove the
parameter pfn_32bit of init_iova_domain.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/amd_iommu.c|  7 ++-
 drivers/iommu/dma-iommu.c| 22 +-
 drivers/iommu/intel-iommu.c  | 11 +++
 drivers/iommu/iova.c |  4 ++--
 drivers/misc/mic/scif/scif_rma.c |  3 +--
 include/linux/iova.h |  2 +-
 6 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 98940d1..78c8b93 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -61,7 +61,6 @@
 /* IO virtual address start page frame number */
 #define IOVA_START_PFN (1)
 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
-#define DMA_32BIT_PFN  IOVA_PFN(DMA_BIT_MASK(32))
 
 /* Reserved IOVA ranges */
 #define MSI_RANGE_START(0xfee0)
@@ -1776,8 +1775,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(void)
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
 
-   init_iova_domain(&dma_dom->iovad, PAGE_SIZE,
-IOVA_START_PFN, DMA_32BIT_PFN);
+   init_iova_domain(&dma_dom->iovad, PAGE_SIZE, IOVA_START_PFN);
 
/* Initialize reserved ranges */
copy_reserved_iova(&reserved_iova_ranges, &dma_dom->iovad);
@@ -2747,8 +2745,7 @@ static int init_reserved_iova_ranges(void)
struct pci_dev *pdev = NULL;
struct iova *val;
 
-   init_iova_domain(&reserved_iova_ranges, PAGE_SIZE,
-IOVA_START_PFN, DMA_32BIT_PFN);
+   init_iova_domain(&reserved_iova_ranges, PAGE_SIZE, IOVA_START_PFN);
 
lockdep_set_class(&reserved_iova_ranges.iova_rbtree_lock,
  &reserved_rbtree_key);
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 48d36ce..7064d32 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -223,18 +223,7 @@ int iommu_dma_init_domain(struct iommu_domain *domain, 
dma_addr_t base,
/* ...then finally give it a kicking to make sure it fits */
base_pfn = max_t(unsigned long, base_pfn,
domain->geometry.aperture_start >> order);
-   end_pfn = min_t(unsigned long, end_pfn,
-   domain->geometry.aperture_end >> order);
}
-   /*
-* PCI devices may have larger DMA masks, but still prefer allocating
-* within a 32-bit mask to avoid DAC addressing. Such limitations don't
-* apply to the typical platform device, so for those we may as well
-* leave the cache limit at the top of their range to save an rb_last()
-* traversal on every allocation.
-*/
-   if (pci)
-   end_pfn &= DMA_BIT_MASK(32) >> order;
 
/* start_pfn is always nonzero for an already-initialised domain */
if (iovad->start_pfn) {
@@ -243,16 +232,15 @@ int iommu_dma_init_domain(struct iommu_domain *domain, 
dma_addr_t base,
pr_warn("Incompatible range for DMA domain\n");
return -EFAULT;
}
-   /*
-* If we have devices with different DMA masks, move the free
-* area cache limit down for the benefit of the smaller one.
-*/
-   iovad->dma_32bit_pfn = min(end_pfn, iovad->dma_32bit_pfn);
} else {
-   init_iova_domain(iovad, 1UL << order, base_pfn, end_pfn);
+   init_iova_domain(iovad, 1UL << order, base_pfn);
if (pci)
iova_reserve_pci_windows(to_pci_dev(dev), iovad);
}
+
+   if (end_pfn < iovad->dma_32bit_pfn)
+   dev_dbg(dev, "ancient device or dma range missed some bits?");
+
return 0;
 }
 EXPORT_SYMBOL(iommu_dma_init_domain);
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 238ad34..de467c1 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -82,8 +82,6 @@
 #define IOVA_START_PFN (1)
 
 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
-#define DMA_32BIT_PFN  IOVA_PFN(DMA_BIT_MASK(32))
-#define DMA_64BIT_PFN  IOVA_PFN(DMA_BIT_MASK(64))
 
 /* page table handling */
 #define LEVEL_STRIDE   (9)
@@ -1869,8 +1867,7 @@ static int dmar_init_reserved_ranges(void)
struct iova *iova;
int i;
 
-   init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
-   DMA_32BIT_PFN);
+   init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
 
lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
&reserved_rbtree_key);
@@ -1928,8 +1925,7 @@ static int domain_init(struct dmar_domain *domain, struct 
intel_iommu *iommu,
int adjust_width,

[PATCH 1/7] iommu/iova: fix incorrect variable types

2017-03-21 Thread Zhen Lei
Keep these four variables' types consistent with the parameters of the
function __alloc_and_insert_iova_range and the members of struct iova:

1. static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,

2. struct iova {
unsigned long   pfn_hi;
unsigned long   pfn_lo;

In fact, limit_pfn is most likely larger than 32 bits on DMA64.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b7268a1..8ba8b496 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -104,8 +104,8 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, 
struct iova *free)
  * Computes the padding size required, to make the start address
  * naturally aligned on the power-of-two order of its size
  */
-static unsigned int
-iova_get_pad_size(unsigned int size, unsigned int limit_pfn)
+static unsigned long
+iova_get_pad_size(unsigned long size, unsigned long limit_pfn)
 {
return (limit_pfn + 1 - size) & (__roundup_pow_of_two(size) - 1);
 }
@@ -117,7 +117,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain 
*iovad,
struct rb_node *prev, *curr = NULL;
unsigned long flags;
unsigned long saved_pfn;
-   unsigned int pad_size = 0;
+   unsigned long pad_size = 0;
 
/* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 5/7] iommu/iova: to optimize the allocation performance of dma64

2017-03-21 Thread Zhen Lei
Currently we always search free iova space for dma64 beginning at the last
node of the iovad rb-tree. In the worst case, there may be too many nodes
at the tail, so that we have to traverse many times in the first loop of
__alloc_and_insert_iova_range. As we traced, more than 10K times for the
case of iperf.

__alloc_and_insert_iova_range:
..
curr = __get_cached_rbnode(iovad, &limit_pfn);
//--> return rb_last(&iovad->rbroot);
while (curr) {
..
curr = rb_prev(curr);
}

So add cached64_node to have the same effect as cached32_node, and add
a start_pfn boundary for dma64, to prevent an iova from crossing both the
dma32 and dma64 areas.
|---|--|
|<--cached32_node-->||
|   |
start_pfn dma_32bit_pfn + 1

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 46 +++---
 include/linux/iova.h |  5 +++--
 2 files changed, 30 insertions(+), 21 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 87a9332..23abe84 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -37,10 +37,15 @@ insert_iova_boundary(struct iova_domain *iovad)
 {
struct iova *iova;
unsigned long start_pfn_32bit = iovad->start_pfn;
+   unsigned long start_pfn_64bit = iovad->dma_32bit_pfn + 1;
 
iova = reserve_iova(iovad, start_pfn_32bit, start_pfn_32bit);
BUG_ON(!iova);
iovad->cached32_node = &iova->node;
+
+   iova = reserve_iova(iovad, start_pfn_64bit, start_pfn_64bit);
+   BUG_ON(!iova);
+   iovad->cached64_node = &iova->node;
 }
 
 void
@@ -62,8 +67,8 @@ init_iova_domain(struct iova_domain *iovad, unsigned long 
granule,
init_iova_rcaches(iovad);
 
/*
-* Insert boundary nodes for dma32. So cached32_node can not be NULL in
-* future.
+* Insert boundary nodes for dma32 and dma64. So cached32_node and
+* cached64_node can not be NULL in future.
 */
insert_iova_boundary(iovad);
 }
@@ -75,10 +80,10 @@ __get_cached_rbnode(struct iova_domain *iovad, unsigned 
long *limit_pfn)
struct rb_node *cached_node;
struct rb_node *next_node;
 
-   if (*limit_pfn > iovad->dma_32bit_pfn)
-   return rb_last(&iovad->rbroot);
-   else
+   if (*limit_pfn <= iovad->dma_32bit_pfn)
cached_node = iovad->cached32_node;
+   else
+   cached_node = iovad->cached64_node;
 
next_node = rb_next(cached_node);
if (next_node) {
@@ -94,29 +99,32 @@ static void
 __cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
 {
struct iova *cached_iova;
+   struct rb_node **cached_node;
 
-   if (new->pfn_hi > iovad->dma_32bit_pfn)
-   return;
+   if (new->pfn_hi <= iovad->dma_32bit_pfn)
+   cached_node = &iovad->cached32_node;
+   else
+   cached_node = &iovad->cached64_node;
 
-   cached_iova = rb_entry(iovad->cached32_node, struct iova, node);
+   cached_iova = rb_entry(*cached_node, struct iova, node);
if (new->pfn_lo <= cached_iova->pfn_lo)
-   iovad->cached32_node = rb_prev(&new->node);
+   *cached_node = rb_prev(&new->node);
 }
 
 static void
 __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free)
 {
struct iova *cached_iova;
-   struct rb_node *curr;
+   struct rb_node **cached_node;
 
-   curr = iovad->cached32_node;
-   cached_iova = rb_entry(curr, struct iova, node);
+   if (free->pfn_hi <= iovad->dma_32bit_pfn)
+   cached_node = &iovad->cached32_node;
+   else
+   cached_node = &iovad->cached64_node;
 
-   if (free->pfn_lo >= cached_iova->pfn_lo) {
-   /* only cache if it's below 32bit pfn */
-   if (free->pfn_hi <= iovad->dma_32bit_pfn)
-   iovad->cached32_node = rb_prev(&free->node);
-   }
+   cached_iova = rb_entry(*cached_node, struct iova, node);
+   if (free->pfn_lo >= cached_iova->pfn_lo)
+   *cached_node = rb_prev(&free->node);
 }
 
 /*
@@ -283,7 +291,7 @@ EXPORT_SYMBOL_GPL(iova_cache_put);
  * alloc_iova - allocates an iova
  * @iovad: - iova domain in question
  * @size: - size of page frames to allocate
- * @limit_pfn: - max limit address
+ * @limit_pfn: - max limit address(included)
  * @size_aligned: - set if size_aligned address range is required
  * This function allocates an iova in the range iovad->start_pfn to limit_pfn,
  * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned
@@ -402,7 +410,7 @@ EXPORT_SYMBOL_GPL(free_iova);
  * alloc_iova_fast - allocates an iova from rcache
  * @iovad: - iova domain in question
  * @size: - size of page frames to allocate
- * @limit_pfn: - max limit address
+ * @limit_pfn: 

[PATCH 3/7] iommu/iova: insert start_pfn boundary of dma32

2017-03-21 Thread Zhen Lei
Reserve the first granule of memory (starting at start_pfn) as a boundary
iova, to make sure that iovad->cached32_node cannot be NULL in future.
Meanwhile, change the assignment of iovad->cached32_node from rb_next to
rb_prev of &free->node in function __cached_rbnode_delete_update.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 63 ++--
 1 file changed, 37 insertions(+), 26 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 1c49969..b5a148e 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -32,6 +32,17 @@ static unsigned long iova_rcache_get(struct iova_domain 
*iovad,
 static void init_iova_rcaches(struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 
+static void
+insert_iova_boundary(struct iova_domain *iovad)
+{
+   struct iova *iova;
+   unsigned long start_pfn_32bit = iovad->start_pfn;
+
+   iova = reserve_iova(iovad, start_pfn_32bit, start_pfn_32bit);
+   BUG_ON(!iova);
+   iovad->cached32_node = &iova->node;
+}
+
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
unsigned long start_pfn, unsigned long pfn_32bit)
@@ -45,27 +56,38 @@ init_iova_domain(struct iova_domain *iovad, unsigned long 
granule,
 
spin_lock_init(&iovad->iova_rbtree_lock);
iovad->rbroot = RB_ROOT;
-   iovad->cached32_node = NULL;
iovad->granule = granule;
iovad->start_pfn = start_pfn;
iovad->dma_32bit_pfn = pfn_32bit;
init_iova_rcaches(iovad);
+
+   /*
+* Insert boundary nodes for dma32. So cached32_node can not be NULL in
+* future.
+*/
+   insert_iova_boundary(iovad);
 }
 EXPORT_SYMBOL_GPL(init_iova_domain);
 
 static struct rb_node *
 __get_cached_rbnode(struct iova_domain *iovad, unsigned long *limit_pfn)
 {
-   if ((*limit_pfn > iovad->dma_32bit_pfn) ||
-   (iovad->cached32_node == NULL))
+   struct rb_node *cached_node;
+   struct rb_node *next_node;
+
+   if (*limit_pfn > iovad->dma_32bit_pfn)
return rb_last(&iovad->rbroot);
-   else {
-   struct rb_node *prev_node = rb_prev(iovad->cached32_node);
-   struct iova *curr_iova =
-   rb_entry(iovad->cached32_node, struct iova, node);
-   *limit_pfn = curr_iova->pfn_lo - 1;
-   return prev_node;
+   else
+   cached_node = iovad->cached32_node;
+
+   next_node = rb_next(cached_node);
+   if (next_node) {
+   struct iova *next_iova = rb_entry(next_node, struct iova, node);
+
+   *limit_pfn = min(*limit_pfn, next_iova->pfn_lo - 1);
}
+
+   return cached_node;
 }
 
 static void
@@ -83,20 +105,13 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, 
struct iova *free)
struct iova *cached_iova;
struct rb_node *curr;
 
-   if (!iovad->cached32_node)
-   return;
curr = iovad->cached32_node;
cached_iova = rb_entry(curr, struct iova, node);
 
if (free->pfn_lo >= cached_iova->pfn_lo) {
-   struct rb_node *node = rb_next(&free->node);
-   struct iova *iova = rb_entry(node, struct iova, node);
-
/* only cache if it's below 32bit pfn */
-   if (node && iova->pfn_lo < iovad->dma_32bit_pfn)
-   iovad->cached32_node = node;
-   else
-   iovad->cached32_node = NULL;
+   if (free->pfn_hi <= iovad->dma_32bit_pfn)
+   iovad->cached32_node = rb_prev(&free->node);
}
 }
 
@@ -114,7 +129,7 @@ static int __alloc_and_insert_iova_range(struct iova_domain 
*iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, bool size_aligned)
 {
-   struct rb_node *prev, *curr = NULL;
+   struct rb_node *prev, *curr;
unsigned long flags;
unsigned long saved_pfn;
unsigned long pad_size = 0;
@@ -144,13 +159,9 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
curr = rb_prev(curr);
}
 
-   if (!curr) {
-   if (size_aligned)
-   pad_size = iova_get_pad_size(size, limit_pfn);
-   if ((iovad->start_pfn + size + pad_size) > limit_pfn) {
-   spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
-   return -ENOMEM;
-   }
+   if (unlikely(!curr)) {
+   spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+   return -ENOMEM;
}
 
/* pfn_lo will point to size aligned address if size_aligned is set */
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 2/7] iommu/iova: cut down judgement times

2017-03-21 Thread Zhen Lei
The judgement below can only be satisfied on the last iteration, which
produced up to 2N judgements in vain (suppose N iterations failed, and 0 or
1 succeeded).

if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
return iova;
}

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 8ba8b496..1c49969 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -312,15 +312,12 @@ private_find_iova(struct iova_domain *iovad, unsigned 
long pfn)
while (node) {
struct iova *iova = rb_entry(node, struct iova, node);
 
-   /* If pfn falls within iova's range, return iova */
-   if ((pfn >= iova->pfn_lo) && (pfn <= iova->pfn_hi)) {
-   return iova;
-   }
-
if (pfn < iova->pfn_lo)
node = node->rb_left;
-   else if (pfn > iova->pfn_lo)
+   else if (pfn > iova->pfn_hi)
node = node->rb_right;
+   else
+   return iova;/* pfn falls within iova's range */
}
 
return NULL;
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 6/7] iommu/iova: move the calculation of pad mask out of loop

2017-03-21 Thread Zhen Lei
I'm not sure whether the compiler can optimize it, but moving it out will
be better. At least, it does not require lock protection.

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 23abe84..68754e4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -127,23 +127,16 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, 
struct iova *free)
*cached_node = rb_prev(&free->node);
 }
 
-/*
- * Computes the padding size required, to make the start address
- * naturally aligned on the power-of-two order of its size
- */
-static unsigned long
-iova_get_pad_size(unsigned long size, unsigned long limit_pfn)
-{
-   return (limit_pfn + 1 - size) & (__roundup_pow_of_two(size) - 1);
-}
-
 static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
unsigned long size, unsigned long limit_pfn,
struct iova *new, bool size_aligned)
 {
struct rb_node *prev, *curr;
unsigned long flags;
-   unsigned long pad_size = 0;
+   unsigned long pad_mask, pad_size = 0;
+
+   if (size_aligned)
+   pad_mask = __roundup_pow_of_two(size) - 1;
 
/* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
@@ -157,8 +150,13 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
else if (limit_pfn < curr_iova->pfn_hi)
goto adjust_limit_pfn;
else {
+   /*
+* Computes the padding size required, to make the start
+* address naturally aligned on the power-of-two order
+* of its size
+*/
if (size_aligned)
-   pad_size = iova_get_pad_size(size, limit_pfn);
+   pad_size = (limit_pfn + 1 - size) & pad_mask;
if ((curr_iova->pfn_hi + size + pad_size) <= limit_pfn)
break;  /* found a free slot */
}
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 0/7] iommu/iova: improve the allocation performance of dma64

2017-03-21 Thread Zhen Lei
64-bit devices are very common now. But currently we only define a
cached32_node
to optimize the allocation performance of dma32, and I saw some dma64 drivers
choose
to allocate iova from dma32 space first, maybe because of the current dma64
performance
problem or some other reasons.

For example:(in drivers/iommu/amd_iommu.c)
static unsigned long dma_ops_alloc_iova(..
{
..
if (dma_mask > DMA_BIT_MASK(32))
pfn = alloc_iova_fast(&dma_dom->iovad, pages,
  IOVA_PFN(DMA_BIT_MASK(32)));
if (!pfn)
pfn = alloc_iova_fast(&dma_dom->iovad, pages, 
IOVA_PFN(dma_mask));

For the details of why dma64 iova allocation performance is very bad, please
refer to the
description of patch 5.

In this patch series, I added a cached64_node to manage the dma64 iova
space (iova >= 4G); it
has the same effect as cached32_node (iova < 4G).

Below it's the performance data before and after my patch series:
(before)$ iperf -s

Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)

[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 35898
[ ID] Interval   Transfer Bandwidth
[  4]  0.0-10.2 sec  7.88 MBytes  6.48 Mbits/sec
[  5] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 35900
[  5]  0.0-10.3 sec  7.88 MBytes  6.43 Mbits/sec
[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 35902
[  4]  0.0-10.3 sec  7.88 MBytes  6.43 Mbits/sec

(after)$ iperf -s

Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)

[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 36330
[ ID] Interval   Transfer Bandwidth
[  4]  0.0-10.0 sec  1.09 GBytes   933 Mbits/sec
[  5] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 36332
[  5]  0.0-10.0 sec  1.10 GBytes   939 Mbits/sec
[  4] local 192.168.1.106 port 5001 connected with 192.168.1.198 port 36334
[  4]  0.0-10.0 sec  1.10 GBytes   938 Mbits/sec


Zhen Lei (7):
  iommu/iova: fix incorrect variable types
  iommu/iova: cut down judgement times
  iommu/iova: insert start_pfn boundary of dma32
  iommu/iova: adjust __cached_rbnode_insert_update
  iommu/iova: to optimize the allocation performance of dma64
  iommu/iova: move the calculation of pad mask out of loop
  iommu/iova: fix iovad->dma_32bit_pfn as the last pfn of dma32

 drivers/iommu/amd_iommu.c|   7 +-
 drivers/iommu/dma-iommu.c|  22 ++
 drivers/iommu/intel-iommu.c  |  11 +--
 drivers/iommu/iova.c | 143 +--
 drivers/misc/mic/scif/scif_rma.c |   3 +-
 include/linux/iova.h |   7 +-
 6 files changed, 94 insertions(+), 99 deletions(-)

-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 4/7] iommu/iova: adjust __cached_rbnode_insert_update

2017-03-21 Thread Zhen Lei
For cases 2 and 3, adjust cached32_node to the new place; case 1 needs no
change.

For example:
case1: (the right part was allocated)
|--|
|<-free>|<--new_iova-->|
|
|
   cached32_node

case2: (all was allocated)
|--|
|<-new_iova--->|
|
|
   cached32_node

case3:
|---|..|-|
|..free..|<--new_iova-->|
|  |
|  |
   cached32_node(new) cached32_node(old)

Signed-off-by: Zhen Lei 
---
 drivers/iommu/iova.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index b5a148e..87a9332 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -91,12 +91,16 @@ __get_cached_rbnode(struct iova_domain *iovad, unsigned 
long *limit_pfn)
 }
 
 static void
-__cached_rbnode_insert_update(struct iova_domain *iovad,
-   unsigned long limit_pfn, struct iova *new)
+__cached_rbnode_insert_update(struct iova_domain *iovad, struct iova *new)
 {
-   if (limit_pfn != iovad->dma_32bit_pfn)
+   struct iova *cached_iova;
+
+   if (new->pfn_hi > iovad->dma_32bit_pfn)
return;
-   iovad->cached32_node = &new->node;
+
+   cached_iova = rb_entry(iovad->cached32_node, struct iova, node);
+   if (new->pfn_lo <= cached_iova->pfn_lo)
+   iovad->cached32_node = rb_prev(&new->node);
 }
 
 static void
@@ -131,12 +135,10 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
 {
struct rb_node *prev, *curr;
unsigned long flags;
-   unsigned long saved_pfn;
unsigned long pad_size = 0;
 
/* Walk the tree backwards */
spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
-   saved_pfn = limit_pfn;
curr = __get_cached_rbnode(iovad, &limit_pfn);
prev = curr;
while (curr) {
@@ -197,11 +199,10 @@ static int __alloc_and_insert_iova_range(struct 
iova_domain *iovad,
rb_link_node(&new->node, parent, entry);
rb_insert_color(&new->node, &iovad->rbroot);
}
-   __cached_rbnode_insert_update(iovad, saved_pfn, new);
+   __cached_rbnode_insert_update(iovad, new);
 
spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
 
-
return 0;
 }
 
-- 
2.5.0


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-21 Thread jacob pan
On Tue, 21 Mar 2017 19:37:56 +
Jean-Philippe Brucker  wrote:

> > For invalidation, I've following info in in pseudo code.
> > struct iommu_svm_tlb_invalidate_info
> > {
> >__u32 inv_type;
> > #define IOTLB_INV   (1 << 0)
> > #define EXTENDED_IOTLB_INV  (1 << 1)
> > #define DEVICE_IOTLB_INV(1 << 2)
> > #define EXTENDED_DEVICE_IOTLB_INV   (1 << 3)
> > #define PASID_CACHE_INV (1 << 4)
> >__u32 pasid;
> >__u64 addr;
> >__u64 size;
> >__u8 granularity;
> > #define DEFAULT_INV_GRN0
> > #define PAGE_SELECTIVE_INV (1 << 0)
> > #define PASID_SELECVIVE_INV(1 << 1)
> >__u64 flags;
> > #define INVALIDATE_HINT_BIT(1 << 0)
> > #define GLOBAL_HINT_BIT(1 << 1)
> > #define DRAIN_READ_BIT (1 << 2)
> > #define DRAIN_WRITE_BIT(1 << 3)
> > #define DEVICE_TLB_GLOBAL_BIT  (1 << 4)
> >__u8 mip;
> >__u16 pfsid;
> > };  
> 
> This would also benefit from being split into generic and
> architectural parts. Former would be defined in VFIO, latter would be
> in the IOMMU driver.
> 
> struct tlb_invalidate_info
> {
>   __u8 granularity
> #define DEFAULT_INV_GRN   0   /* What is default? */
> #define PAGE_SELECTIVE_INV(1 << 0)
> #define PASID_SELECTIVE_INV   (1 << 1)
>   __u32 pasid;
>   __u64 addr;
>   __u64 size;
> 
>   /* Since IOMMU format has already been validated for this
> table, the IOMMU driver knows that the following structure is in a
>  format it knows */
>   __u8 opaque[];
> };
> 
> struct tlb_invalidate_info_intel
> {
>   __u32 inv_type;
>   ...
>   __u64 flags;
>   ...
>   __u8 mip;
>   __u16 pfsid;
> };

I presume for Intel/arch specific part, the opaque data can simply be
the descriptor itself. i.e. struct qi_desc. There is no need to
generalize since it has already been validated to be architecture
specific data.

Arch specific IOMMU driver can take further action on the opaque data,
e.g. replace guest DID with host DID based on other info passed down
from the caller.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH 04/30] iommu/arm-smmu-v3: Add support for PCI ATS

2017-03-21 Thread Jean-Philippe Brucker
On 08/03/17 15:26, Sinan Kaya wrote:
> On 2/27/2017 2:54 PM, Jean-Philippe Brucker wrote:
>> +ats_enabled = !arm_smmu_enable_ats(master);
>> +
> 
> You should make ats_supported field in IORT table part of the decision
> process for when to enable ATS.
> 

Agreed. I will also draft a proposal for adding the ATS property to PCI
host controller DT bindings, it seems to be missing.

Thanks,
Jean-Philippe
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-21 Thread Jean-Philippe Brucker
On 21/03/17 07:04, Liu, Yi L wrote:
> Hi Jean,
> 
> I'm working on virtual SVM, and have some comments on the VFIO channel
> definition.

Thanks a lot for the comments, this is quite interesting to me. I just
have some concerns about portability so I'm proposing a way to be slightly
more generic below.

>> -Original Message-
>> From: iommu-boun...@lists.linux-foundation.org [mailto:iommu-
>> boun...@lists.linux-foundation.org] On Behalf Of Jean-Philippe Brucker
>> Sent: Tuesday, February 28, 2017 3:55 AM
>> Cc: Shanker Donthineni ; k...@vger.kernel.org;
>> Catalin Marinas ; Sinan Kaya
>> ; Will Deacon ;
>> iommu@lists.linux-foundation.org; Harv Abdulhamid ;
>> linux-...@vger.kernel.org; Bjorn Helgaas ; David
>> Woodhouse ; linux-arm-ker...@lists.infradead.org; Nate
>> Watterson 
>> Subject: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
>>
>> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
>> between a device and a process address space, identified by a 
>> device-specific ID
>> named PASID. This allows the device to target DMA transactions at the process
>> virtual addresses without a need for mapping and unmapping buffers 
>> explicitly in the
>> IOMMU. The process page tables are shared with the IOMMU, and mechanisms such
>> as PCI ATS/PRI may be used to handle faults. VFIO_DEVICE_UNBIND_TASK removed
>> a bond identified by a PASID.
>>
>> Also add a capability flag in device info to detect whether the system and 
>> the device
>> support SVM.
>>
>> Users need to specify the state of a PASID when unbinding, with flags
>> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
>> PASID invalidation is specific to each device and only partially covered by 
>> the
>> specification:
>>
>> * Device must have an implementation-defined mechanism for stopping the
>>   use of a PASID. When this mechanism finishes, the device has stopped
>>   issuing transactions for this PASID and all transactions for this PASID
>>   have been flushed to the IOMMU.
>>
>> * Device may either wait for all outstanding PRI requests for this PASID
>>   to finish, or issue a Stop Marker message, a barrier that separates PRI
>>   requests affecting this instance of the PASID from PRI requests
>>   affecting the next instance. In the first case, we say that the PASID is
>>   "clean", in the second case it is "flushed" (and the IOMMU has to wait
>>   for the Stop Marker before reassigning the PASID.)
>>
>> We expect similar distinctions for platform devices. Ideally there should be 
>> a callback
>> for each PCI device, allowing the IOMMU to ask the device to stop using a 
>> PASID.
>> When the callback returns, the PASID is either flushed or clean and the 
>> return value
>> tells which.
>>
>> For the moment I don't know how to implement this callback for PCI, so if 
>> the user
>> forgets to call unbind with either "clean" or "flushed", the PASID is never 
>> reused. For
>> platform devices, it might be simpler to implement since we could associate 
>> an
>> invalidate_pasid callback to a DT compatible string, as is currently done 
>> for reset.
>>
>> Signed-off-by: Jean-Philippe Brucker 
> 
> [...]
> 
>>  drivers/vfio/pci/vfio_pci.c |  24 ++
>>  drivers/vfio/vfio.c | 104 
>> 
>>  include/uapi/linux/vfio.h   |  55 +++
>>  3 files changed, 183 insertions(+)
>>
> ...
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index
>> 519eff362c1c..3fe4197a5ea0 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -198,6 +198,7 @@ struct vfio_device_info {
>>  #define VFIO_DEVICE_FLAGS_PCI   (1 << 1)/* vfio-pci device */
>>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */
>>  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)/* vfio-amba device */
>> +#define VFIO_DEVICE_FLAGS_SVM   (1 << 4)/* Device supports 
>> bind/unbind */
>>  __u32   num_regions;/* Max region index + 1 */
>>  __u32   num_irqs;   /* Max IRQ index + 1 */
>>  };
>> @@ -409,6 +410,60 @@ struct vfio_irq_set {
>>   */
>>  #define VFIO_DEVICE_RESET   _IO(VFIO_TYPE, VFIO_BASE + 11)
>>
>> +struct vfio_device_svm {
>> +__u32   argsz;
>> +__u32   flags;
>> +#define VFIO_SVM_PASID_RELEASE_FLUSHED  (1 << 0)
>> +#define VFIO_SVM_PASID_RELEASE_CLEAN(1 << 1)
>> +__u32   pasid;
>> +};
> 
> For virtual SVM work, the VFIO channel would be used to passdown guest
> PASID tale PTR and invalidation information. And may have further usage
> except the above.
> 
> Here is the virtual SVM design doc which illustrates the VFIO usage.
> https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
> 
> For the guest PASID table ptr passdown, I've following message in pseudo code.
> struct pasid_table_info {
> __u64 ptr;
> __u32 size;
>  };

There should probably be a way to specify the table format, so tha

Re: [PATCH v2 5/5] iommu: Allow default domain type to be set on the kernel command line

2017-03-21 Thread Will Deacon
On Tue, Mar 21, 2017 at 05:46:29PM +, Robin Murphy wrote:
> On 21/03/17 17:21, Will Deacon wrote:
> > On Tue, Mar 21, 2017 at 04:45:27PM +0100, Joerg Roedel wrote:
> >> On Fri, Mar 10, 2017 at 08:49:36PM +, Will Deacon wrote:
> >>> @@ -1014,8 +1027,8 @@ struct iommu_group *iommu_group_get_for_dev(struct 
> >>> device *dev)
> >>>* IOMMU driver.
> >>>*/
> >>>   if (!group->default_domain) {
> >>> - group->default_domain = __iommu_domain_alloc(dev->bus,
> >>> -  IOMMU_DOMAIN_DMA);
> >>> + group->default_domain =
> >>> + __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
> >>
> >> It would be good to have a fall-back here if we are talking to an IOMMU
> >> driver that uses default domains, but does not support identity-mapped
> >> domains (yet). Exynos and Rockchip IOMMU drivers seem to fall into this
> >> category. A dev_warn() also makes sense in case allocating a identity
> >> domain fails.
> > 
> > Sure, something like the diff below?
> > 
> > Will
> > 
> > --->8
> > 
> > 
> > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > index 42a842e3f95f..f787626a745d 100644
> > --- a/drivers/iommu/iommu.c
> > +++ b/drivers/iommu/iommu.c
> > @@ -1027,10 +1027,19 @@ struct iommu_group *iommu_group_get_for_dev(struct 
> > device *dev)
> >  * IOMMU driver.
> >  */
> > if (!group->default_domain) {
> > -   group->default_domain =
> > -   __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
> > +   struct iommu_domain *dom;
> > +
> > +   dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
> > +   if (!dom) {
> > +   dev_warn(dev,
> > +"failed to allocate default IOMMU domain of 
> > type %u; falling back to IOMMU_DOMAIN_DMA",
> > +iommu_def_domain_type);
> 
> Conversely, that's going to be noisy if iommu_def_domain_type was
> IOMMU_DOMAIN_DMA to begin with. I think it makes sense to warn if the
> user asked for a specific default domain type on the command line and
> that didn't work, but maybe not to bother otherwise. Plus, if they asked
> for passthrough, then not allocating a default domain at all is probably
> closer to the desired result than installing a DMA ops domain would be.

You're right -- I'll hack this about to check if the default type isn't
DOMAIN_DMA before warning about the allocation failure.

Cheers,

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 5/5] iommu: Allow default domain type to be set on the kernel command line

2017-03-21 Thread Robin Murphy
On 21/03/17 17:21, Will Deacon wrote:
> On Tue, Mar 21, 2017 at 04:45:27PM +0100, Joerg Roedel wrote:
>> On Fri, Mar 10, 2017 at 08:49:36PM +, Will Deacon wrote:
>>> @@ -1014,8 +1027,8 @@ struct iommu_group *iommu_group_get_for_dev(struct 
>>> device *dev)
>>>  * IOMMU driver.
>>>  */
>>> if (!group->default_domain) {
>>> -   group->default_domain = __iommu_domain_alloc(dev->bus,
>>> -IOMMU_DOMAIN_DMA);
>>> +   group->default_domain =
>>> +   __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
>>
>> It would be good to have a fall-back here if we are talking to an IOMMU
>> driver that uses default domains, but does not support identity-mapped
>> domains (yet). Exynos and Rockchip IOMMU drivers seem to fall into this
>> category. A dev_warn() also makes sense in case allocating an identity
>> domain fails.
> 
> Sure, something like the diff below?
> 
> Will
> 
> --->8
> 
> 
> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> index 42a842e3f95f..f787626a745d 100644
> --- a/drivers/iommu/iommu.c
> +++ b/drivers/iommu/iommu.c
> @@ -1027,10 +1027,19 @@ struct iommu_group *iommu_group_get_for_dev(struct 
> device *dev)
>* IOMMU driver.
>*/
>   if (!group->default_domain) {
> - group->default_domain =
> - __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
> + struct iommu_domain *dom;
> +
> + dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
> + if (!dom) {
> + dev_warn(dev,
> +  "failed to allocate default IOMMU domain of 
> type %u; falling back to IOMMU_DOMAIN_DMA",
> +  iommu_def_domain_type);

Conversely, that's going to be noisy if iommu_def_domain_type was
IOMMU_DOMAIN_DMA to begin with. I think it makes sense to warn if the
user asked for a specific default domain type on the command line and
that didn't work, but maybe not to bother otherwise. Plus, if they asked
for passthrough, then not allocating a default domain at all is probably
closer to the desired result than installing a DMA ops domain would be.

Robin.

> + dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA);
> + }
> +
> + group->default_domain = dom;
>   if (!group->domain)
> - group->domain = group->default_domain;
> + group->domain = dom;
>   }
>  
>   ret = iommu_group_add_device(group, dev);
> 

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 4/5] iommu/arm-smmu-v3: Install bypass STEs for IOMMU_DOMAIN_IDENTITY domains

2017-03-21 Thread Robin Murphy
On 21/03/17 17:08, Will Deacon wrote:
> Hi Robin,
> 
> On Thu, Mar 16, 2017 at 06:19:48PM +, Robin Murphy wrote:
>> On 16/03/17 16:24, Nate Watterson wrote:
>>> On 2017-03-10 15:49, Will Deacon wrote:
 In preparation for allowing the default domain type to be overridden,
 this patch adds support for IOMMU_DOMAIN_IDENTITY domains to the
 ARM SMMUv3 driver.

 An identity domain is created by placing the corresponding stream table
 entries into "bypass" mode, which allows transactions to flow through
 the SMMU without any translation.

>>>
>>> What about masters that require SMMU intervention to override their
>>> native memory attributes to make them consistent with the CCA (acpi)
>>> or dma-coherent (dt) values specified in FW?
>>
>> Well, we've already broken them ;) My interpretation of "dma-coherent"
>> is as the equivalent of DACS=1,CPM=1, i.e. not dependent on SMMU
>> override. For the CCA=1,DACS=0 case (I'm going to pretend the DT
>> equivalent will never exist...) the first problem to solve is how to
>> inherit the appropriate configuration from the firmware, because right
>> now we're not even pretending to support that.
> 
> Indeed, and that would need to be added as a separate patch series when
> the need arises.
> 
  /* Nuke the existing STE_0 value, as we're going to rewrite it */
 -val = ste->valid ? STRTAB_STE_0_V : 0;
 +val = STRTAB_STE_0_V;
 +
 +/* Bypass/fault */
 +if (!ste->assigned || !(ste->s1_cfg || ste->s2_cfg)) {
 +if (!ste->assigned && disable_bypass)
>>
>> ...yuck. After about 5 minutes of staring at that, I've convinced myself
>> that it would make much more sense to always clear the strtab_ent
>> configs on detach, such that you never need the outer !ste->assigned
>> check here...
> 
> I was deliberately keeping the strtab_ent intact in case we ever grow
> support for nested translation, where we might well want to detach a
> stage 1 but keep the stage 2 installed. I don't think the code is that
> bad, so I'd like to leave it like it is for now.

Sure, it would certainly be more awkward to recreate this logic from
scratch in future if we need it again. I suggested the cleanup since it
looked like an oversight, but if it's a conscious decision then that's
fine by me.

Robin.

> 
> Will
> 

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 5/5] iommu: Allow default domain type to be set on the kernel command line

2017-03-21 Thread Will Deacon
On Tue, Mar 21, 2017 at 04:45:27PM +0100, Joerg Roedel wrote:
> On Fri, Mar 10, 2017 at 08:49:36PM +, Will Deacon wrote:
> > @@ -1014,8 +1027,8 @@ struct iommu_group *iommu_group_get_for_dev(struct 
> > device *dev)
> >  * IOMMU driver.
> >  */
> > if (!group->default_domain) {
> > -   group->default_domain = __iommu_domain_alloc(dev->bus,
> > -IOMMU_DOMAIN_DMA);
> > +   group->default_domain =
> > +   __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
> 
> It would be good to have a fall-back here if we are talking to an IOMMU
> driver that uses default domains, but does not support identity-mapped
> domains (yet). Exynos and Rockchip IOMMU drivers seem to fall into this
> category. A dev_warn() also makes sense in case allocating an identity
> domain fails.

Sure, something like the diff below?

Will

--->8


diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 42a842e3f95f..f787626a745d 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1027,10 +1027,19 @@ struct iommu_group *iommu_group_get_for_dev(struct 
device *dev)
 * IOMMU driver.
 */
if (!group->default_domain) {
-   group->default_domain =
-   __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
+   struct iommu_domain *dom;
+
+   dom = __iommu_domain_alloc(dev->bus, iommu_def_domain_type);
+   if (!dom) {
+   dev_warn(dev,
+"failed to allocate default IOMMU domain of 
type %u; falling back to IOMMU_DOMAIN_DMA",
+iommu_def_domain_type);
+   dom = __iommu_domain_alloc(dev->bus, IOMMU_DOMAIN_DMA);
+   }
+
+   group->default_domain = dom;
if (!group->domain)
-   group->domain = group->default_domain;
+   group->domain = dom;
}
 
ret = iommu_group_add_device(group, dev);
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 4/5] iommu/arm-smmu-v3: Install bypass STEs for IOMMU_DOMAIN_IDENTITY domains

2017-03-21 Thread Will Deacon
Hi Robin,

On Thu, Mar 16, 2017 at 06:19:48PM +, Robin Murphy wrote:
> On 16/03/17 16:24, Nate Watterson wrote:
> > On 2017-03-10 15:49, Will Deacon wrote:
> >> In preparation for allowing the default domain type to be overridden,
> >> this patch adds support for IOMMU_DOMAIN_IDENTITY domains to the
> >> ARM SMMUv3 driver.
> >>
> >> An identity domain is created by placing the corresponding stream table
> >> entries into "bypass" mode, which allows transactions to flow through
> >> the SMMU without any translation.
> >>
> > 
> > What about masters that require SMMU intervention to override their
> > native memory attributes to make them consistent with the CCA (acpi)
> > or dma-coherent (dt) values specified in FW?
> 
> Well, we've already broken them ;) My interpretation of "dma-coherent"
> is as the equivalent of DACS=1,CPM=1, i.e. not dependent on SMMU
> override. For the CCA=1,DACS=0 case (I'm going to pretend the DT
> equivalent will never exist...) the first problem to solve is how to
> inherit the appropriate configuration from the firmware, because right
> now we're not even pretending to support that.

Indeed, and that would need to be added as a separate patch series when
the need arises.

> >>  /* Nuke the existing STE_0 value, as we're going to rewrite it */
> >> -val = ste->valid ? STRTAB_STE_0_V : 0;
> >> +val = STRTAB_STE_0_V;
> >> +
> >> +/* Bypass/fault */
> >> +if (!ste->assigned || !(ste->s1_cfg || ste->s2_cfg)) {
> >> +if (!ste->assigned && disable_bypass)
> 
> ...yuck. After about 5 minutes of staring at that, I've convinced myself
> that it would make much more sense to always clear the strtab_ent
> configs on detach, such that you never need the outer !ste->assigned
> check here...

I was deliberately keeping the strtab_ent intact in case we ever grow
support for nested translation, where we might well want to detach a
stage 1 but keep the stage 2 installed. I don't think the code is that
bad, so I'd like to leave it like it is for now.

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 0/5] Implement SMMU passthrough using the default domain

2017-03-21 Thread Joerg Roedel
On Tue, Mar 21, 2017 at 04:42:41PM +, Will Deacon wrote:
> Hi Joerg,
> 
> On Tue, Mar 21, 2017 at 04:46:24PM +0100, Joerg Roedel wrote:
> > On Fri, Mar 10, 2017 at 08:49:31PM +, Will Deacon wrote:
> > > Will Deacon (5):
> > >   iommu/arm-smmu: Restrict domain attributes to UNMANAGED domains
> > >   iommu/arm-smmu: Install bypass S2CRs for IOMMU_DOMAIN_IDENTITY domains
> > >   iommu/arm-smmu-v3: Make arm_smmu_install_ste_for_dev return void
> > >   iommu/arm-smmu-v3: Install bypass STEs for IOMMU_DOMAIN_IDENTITY
> > > domains
> > >   iommu: Allow default domain type to be set on the kernel command line
> > > 
> > >  Documentation/admin-guide/kernel-parameters.txt |  6 ++
> > >  drivers/iommu/arm-smmu-v3.c | 76 
> > > +++--
> > >  drivers/iommu/arm-smmu.c| 26 -
> > >  drivers/iommu/iommu.c   | 17 +-
> > >  4 files changed, 90 insertions(+), 35 deletions(-)
> > 
> > Besides my one comment on the last patch this series looks good to me.
> > Do you plan to include it (with the fall-back) into your pull-request?
> 
> Yes, that would certainly be easiest for me, given the changes to the
> SMMU drivers. However, if you prefer it separately then I can do that too.

No, just send it with your usual bulk of other smmu patches.


Thanks,

Joerg
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 0/5] Implement SMMU passthrough using the default domain

2017-03-21 Thread Will Deacon
Hi Joerg,

On Tue, Mar 21, 2017 at 04:46:24PM +0100, Joerg Roedel wrote:
> On Fri, Mar 10, 2017 at 08:49:31PM +, Will Deacon wrote:
> > Will Deacon (5):
> >   iommu/arm-smmu: Restrict domain attributes to UNMANAGED domains
> >   iommu/arm-smmu: Install bypass S2CRs for IOMMU_DOMAIN_IDENTITY domains
> >   iommu/arm-smmu-v3: Make arm_smmu_install_ste_for_dev return void
> >   iommu/arm-smmu-v3: Install bypass STEs for IOMMU_DOMAIN_IDENTITY
> > domains
> >   iommu: Allow default domain type to be set on the kernel command line
> > 
> >  Documentation/admin-guide/kernel-parameters.txt |  6 ++
> >  drivers/iommu/arm-smmu-v3.c | 76 
> > +++--
> >  drivers/iommu/arm-smmu.c| 26 -
> >  drivers/iommu/iommu.c   | 17 +-
> >  4 files changed, 90 insertions(+), 35 deletions(-)
> 
> Besides my one comment on the last patch this series looks good to me.
> Do you plan to include it (with the fall-back) into your pull-request?

Yes, that would certainly be easiest for me, given the changes to the
SMMU drivers. However, if you prefer it separately then I can do that too.

Cheers,

Will
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait loop timed out

2017-03-21 Thread Deucher, Alexander
> -Original Message-
> From: 'j...@8bytes.org' [mailto:j...@8bytes.org]
> Sent: Tuesday, March 21, 2017 12:26 PM
> To: Deucher, Alexander
> Cc: Bridgman, John; Alex Deucher; Daniel Drake; Chris Chiu; amd-
> g...@lists.freedesktop.org; Nath, Arindam; iommu@lists.linux-
> foundation.org; Suthikulpanit, Suravee; Linux Upstreaming Team
> Subject: Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait
> loop timed out
> 
> On Tue, Mar 21, 2017 at 04:17:40PM +, Deucher, Alexander wrote:
> > > -Original Message-
> > > From: 'j...@8bytes.org' [mailto:j...@8bytes.org]
> > > Sent: Tuesday, March 21, 2017 12:11 PM
> > > To: Deucher, Alexander
> > > Cc: Alex Deucher; Daniel Drake; Chris Chiu; amd-
> g...@lists.freedesktop.org;
> > > Nath, Arindam; iommu@lists.linux-foundation.org; Suthikulpanit,
> Suravee;
> > > Linux Upstreaming Team
> > > Subject: Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-
> Wait
> > > loop timed out
> > >
> > > On Tue, Mar 21, 2017 at 04:01:53PM +, Deucher, Alexander wrote:
> > > > It seems to only affect Stoney systems, but not others (Carrizo,
> > > > Bristol, etc.).  Maybe we could just disable it on Stoney until we
> > > > root cause it.
> > >
> > > Completion-wait loop timeouts indicate something is seriously wrong.
> How
> > > can I detect whether I am running on a 'Stoney' system?
> >
> > + John
> >
> > I'm not sure if the iommu ids are different on stoney systems compared to
> Carrizo/Bristol systems.  The pci ids of the GPUs are different.  Stoney parts
> have 0x98E4 as the pci id for the GPU.
> >
> > >
> > > Other question, a shot into the dark, does the GPU on these systems
> have
> > > ATS? Probably yes, as they are likely HSA compatible.
> >
> > Stoney is a small APU.  Kind of a mini Carrizo.  While it may claim to 
> > support
> ATS, I don't think it was ever validated on Stoney, only Carrizo/Bristol.
> 
> Okay, so maybe ATS is broken in some way on these chips. When queue
> flushes happen it will also send the ATS-invalidates, and a queue flush
> can cause a storm of those. This may be the issue.
> 
> I am preparing a debug-patch that disables ATS for these GPUs so someone
> with such a chip can test it.

Thanks Joerg.

Alex

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait loop timed out

2017-03-21 Thread 'j...@8bytes.org'
On Tue, Mar 21, 2017 at 04:17:40PM +, Deucher, Alexander wrote:
> > -Original Message-
> > From: 'j...@8bytes.org' [mailto:j...@8bytes.org]
> > Sent: Tuesday, March 21, 2017 12:11 PM
> > To: Deucher, Alexander
> > Cc: Alex Deucher; Daniel Drake; Chris Chiu; amd-...@lists.freedesktop.org;
> > Nath, Arindam; iommu@lists.linux-foundation.org; Suthikulpanit, Suravee;
> > Linux Upstreaming Team
> > Subject: Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait
> > loop timed out
> > 
> > On Tue, Mar 21, 2017 at 04:01:53PM +, Deucher, Alexander wrote:
> > > It seems to only affect Stoney systems, but not others (Carrizo,
> > > Bristol, etc.).  Maybe we could just disable it on Stoney until we
> > > root cause it.
> > 
> > Completion-wait loop timeouts indicate something is seriously wrong. How
> > can I detect whether I am running on a 'Stoney' system?
> 
> + John
> 
> I'm not sure if the iommu ids are different on stoney systems compared to 
> Carrizo/Bristol systems.  The pci ids of the GPUs are different.  Stoney 
> parts have 0x98E4 as the pci id for the GPU.
> 
> > 
> > Other question, a shot into the dark, does the GPU on these systems have
> > ATS? Probably yes, as they are likely HSA compatible.
> 
> Stoney is a small APU.  Kind of a mini Carrizo.  While it may claim to 
> support ATS, I don't think it was ever validated on Stoney, only 
> Carrizo/Bristol.

Okay, so maybe ATS is broken in some way on these chips. When queue
flushes happen it will also send the ATS-invalidates, and a queue flush
can cause a storm of those. This may be the issue.

I am preparing a debug-patch that disables ATS for these GPUs so someone
with such a chip can test it.


Joerg

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Mason
On 21/03/2017 15:23, Liviu Dudau wrote:

> On Tue, Mar 21, 2017 at 02:01:57PM +0100, Mason wrote:
>
>> My PCIe controller is b/d/f 0/0/0.
>> It ignores all PCI addresses that do not fall within the range
>> defined in BAR0 of b/d/f 0/0/0.
>>
>> BAR0 has a configurable width of 2^(23+i) bytes, i < 8
>> It is implicitly split in 8 regions, which can map to
>> anywhere in CPU space. However, my understanding is that,
>> by default, the DMA framework expects PCI address X to
>> map to CPU address X...
>> (My understanding of that part is still a bit hazy.)
> 
> It looks very much like your PCIe controller (host bridge) has the 
> configuration
> registers on the wrong side of the bridge (PCIe one vs the more normal host 
> side).
> But otherwise it does look very much like an ATS system, where you program 
> how the
> PCIe bus addresses map into the system (you call them CPU) addresses. Do you 
> also
> have a way of controlling the direction of the mapping? (in other words, 
> those 8
> regions are only for translating request coming out of the PCIe bus into 
> system
> addresses, or can you also set the direct mapping of system address to PCIe 
> address?).

There are two (asymmetric) features for these mappings.

For bus-to-system accesses, the scheme described above
of "bus addresses within BAR0 fall into 1 of 8 regions,
and 8 registers define 8 target system addresses".

For system-to-bus accesses, there is a single "offset"
register which gets added to the system address, to form
the bus address. I just use a 0 offset.

(This is made slightly more complex on later chips because
someone in marketing thought it would look good on the
data sheet to support 4 GB of RAM in a SoC using 32-bit
processors. There is an additional cpu-to-foo mapping.)

> As for DMA framework expectations, I'm not sure how that plays a role, unless 
> you
> have some DMA engines on the PCIe bus.

I think the USB3 PCIe card I'm using is a bus master, so
it's able to push data to RAM without any help from the
ARM core.

Regards.

P.S. something in your message is confusing my MUA, likely
your Mail-Followup-To field. My MUA thinks it should not
send you my reply. Is that really what you want?
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait loop timed out

2017-03-21 Thread Deucher, Alexander
> -Original Message-
> From: 'j...@8bytes.org' [mailto:j...@8bytes.org]
> Sent: Tuesday, March 21, 2017 12:11 PM
> To: Deucher, Alexander
> Cc: Alex Deucher; Daniel Drake; Chris Chiu; amd-...@lists.freedesktop.org;
> Nath, Arindam; iommu@lists.linux-foundation.org; Suthikulpanit, Suravee;
> Linux Upstreaming Team
> Subject: Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait
> loop timed out
> 
> On Tue, Mar 21, 2017 at 04:01:53PM +, Deucher, Alexander wrote:
> > It seems to only affect Stoney systems, but not others (Carrizo,
> > Bristol, etc.).  Maybe we could just disable it on Stoney until we
> > root cause it.
> 
> Completion-wait loop timeouts indicate something is seriously wrong. How
> can I detect whether I am running on a 'Stoney' system?

+ John

I'm not sure if the iommu ids are different on stoney systems compared to 
Carrizo/Bristol systems.  The pci ids of the GPUs are different.  Stoney parts 
have 0x98E4 as the pci id for the GPU.

> 
> Other question, a shot into the dark, does the GPU on these systems have
> ATS? Probably yes, as they are likely HSA compatible.

Stoney is a small APU.  Kind of a mini Carrizo.  While it may claim to support 
ATS, I don't think it was ever validated on Stoney, only Carrizo/Bristol.

Alex


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait loop timed out

2017-03-21 Thread Nath, Arindam
>-Original Message-
>From: 'j...@8bytes.org' [mailto:j...@8bytes.org]
>Sent: Tuesday, March 21, 2017 9:41 PM
>To: Deucher, Alexander
>Cc: Alex Deucher; Daniel Drake; Chris Chiu; amd-...@lists.freedesktop.org;
>Nath, Arindam; iommu@lists.linux-foundation.org; Suthikulpanit, Suravee;
>Linux Upstreaming Team
>Subject: Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait
>loop timed out
>
>On Tue, Mar 21, 2017 at 04:01:53PM +, Deucher, Alexander wrote:
>> It seems to only affect Stoney systems, but not others (Carrizo,
>> Bristol, etc.).  Maybe we could just disable it on Stoney until we
>> root cause it.
>
>Completion-wait loop timeouts indicate something is seriously wrong. How
>can I detect whether I am running on a 'Stoney' system?
>
>Other question, a shot into the dark, does the GPU on these systems have
>ATS? Probably yes, as they are likely HSA compatible.

The GPU on Stoney supports ATS.

Thanks,
Arindam

>
>
>   Joerg

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait loop timed out

2017-03-21 Thread 'j...@8bytes.org'
On Tue, Mar 21, 2017 at 04:01:53PM +, Deucher, Alexander wrote:
> It seems to only affect Stoney systems, but not others (Carrizo,
> Bristol, etc.).  Maybe we could just disable it on Stoney until we
> root cause it.

Completion-wait loop timeouts indicate something is seriously wrong. How
can I detect whether I am running on a 'Stoney' system?

Other question, a shot into the dark, does the GPU on these systems have
ATS? Probably yes, as they are likely HSA compatible.


Joerg

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait loop timed out

2017-03-21 Thread Deucher, Alexander
> -Original Message-
> From: j...@8bytes.org [mailto:j...@8bytes.org]
> Sent: Tuesday, March 21, 2017 11:57 AM
> To: Alex Deucher
> Cc: Daniel Drake; Deucher, Alexander; Chris Chiu; amd-
> g...@lists.freedesktop.org; Nath, Arindam; iommu@lists.linux-
> foundation.org; Suthikulpanit, Suravee; Linux Upstreaming Team
> Subject: Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait
> loop timed out
> 
> On Fri, Mar 17, 2017 at 11:53:09AM -0400, Alex Deucher wrote:
> > On Fri, Mar 17, 2017 at 8:15 AM, Daniel Drake 
> wrote:
> > > Hi,
> > >
> > > On Mon, Mar 13, 2017 at 2:01 PM, Deucher, Alexander
> > >  wrote:
> > >> > We are unable to boot Acer Aspire E5-553G (AMD FX-9800P RADEON
> R7) nor
> > >> > Acer Aspire E5-523 with standard configurations because during boot
> > >> > the screen is flooded with the following error message over and over:
> > >> >
> > >> >   AMD-Vi: Completion-Wait loop timed out
> > >>
> > >> We ran into similar issues and bisected it to commit
> b1516a14657acf81a587e9a6e733a881625eee53.  I'm not too familiar with the
> IOMMU hardware to know if this is an iommu or display driver issue yet.
> > >
> > > We can confirm that reverting this commit solves the issue.
> > >
> > > Given that that commit is an optimization, but it has introduced a
> > > regression on multiple platforms, and has been like this for 8 months,
> > > it would be common practice to now revert this patch upstream until
> > > the regression is fixed. Could you please send a new patch to do this?
> > >
> > > Also, we would be happy to test any real solutions to this issue while
> > > we still have the affected units in hand.
> >
> > No objections to a revert here.
> 
> Big objection here. Since this only happens with amdgpu so far we
> shouldn't rule out a display-driver issue.
> 
> Reverting that patch basically destroys iommu-performance on AMD
> systems. Doing this for all devices just to make amdgpu working is
> overkill at this stage of the debugging.

It seems to only affect Stoney systems, but not others (Carrizo, Bristol, 
etc.).  Maybe we could just disable it on Stoney until we root cause it.

Alex

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: amd-iommu: can't boot with amdgpu, AMD-Vi: Completion-Wait loop timed out

2017-03-21 Thread j...@8bytes.org
On Fri, Mar 17, 2017 at 11:53:09AM -0400, Alex Deucher wrote:
> On Fri, Mar 17, 2017 at 8:15 AM, Daniel Drake  wrote:
> > Hi,
> >
> > On Mon, Mar 13, 2017 at 2:01 PM, Deucher, Alexander
> >  wrote:
> >> > We are unable to boot Acer Aspire E5-553G (AMD FX-9800P RADEON R7) nor
> >> > Acer Aspire E5-523 with standard configurations because during boot
> >> > the screen is flooded with the following error message over and over:
> >> >
> >> >   AMD-Vi: Completion-Wait loop timed out
> >>
> >> We ran into similar issues and bisected it to commit 
> >> b1516a14657acf81a587e9a6e733a881625eee53.  I'm not too familiar with the 
> >> IOMMU hardware to know if this is an iommu or display driver issue yet.
> >
> > We can confirm that reverting this commit solves the issue.
> >
> > Given that that commit is an optimization, but it has introduced a
> > regression on multiple platforms, and has been like this for 8 months,
> > it would be common practice to now revert this patch upstream until
> > the regression is fixed. Could you please send a new patch to do this?
> >
> > Also, we would be happy to test any real solutions to this issue while
> > we still have the affected units in hand.
> 
> No objections to a revert here.

Big objection here. Since this only happens with amdgpu so far we
shouldn't rule out a display-driver issue.

Reverting that patch basically destroys iommu-performance on AMD
systems. Doing this for all devices just to make amdgpu working is
overkill at this stage of the debugging.



Joerg

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v5] arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to IOMMU

2017-03-21 Thread Catalin Marinas
On Tue, Mar 07, 2017 at 06:43:32PM +0100, Geert Uytterhoeven wrote:
> Add support for allocating physically contiguous DMA buffers on arm64
> systems with an IOMMU.  This can be useful when two or more devices
> with different memory requirements are involved in buffer sharing.
> 
> Note that as this uses the CMA allocator, setting the
> DMA_ATTR_FORCE_CONTIGUOUS attribute has a runtime-dependency on
> CONFIG_DMA_CMA, just like on arm32.
> 
> For arm64 systems using swiotlb, no changes are needed to support the
> allocation of physically contiguous DMA buffers:
>   - swiotlb always uses physically contiguous buffers (up to
> IO_TLB_SEGSIZE = 128 pages),
>   - arm64's __dma_alloc_coherent() already calls
> dma_alloc_from_contiguous() when CMA is available.
> 
> Signed-off-by: Geert Uytterhoeven 
> Acked-by: Laurent Pinchart 
> Reviewed-by: Robin Murphy 

Queued for 4.12. Thanks.

-- 
Catalin
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 0/5] Implement SMMU passthrough using the default domain

2017-03-21 Thread Joerg Roedel
On Fri, Mar 10, 2017 at 08:49:31PM +, Will Deacon wrote:
> Will Deacon (5):
>   iommu/arm-smmu: Restrict domain attributes to UNMANAGED domains
>   iommu/arm-smmu: Install bypass S2CRs for IOMMU_DOMAIN_IDENTITY domains
>   iommu/arm-smmu-v3: Make arm_smmu_install_ste_for_dev return void
>   iommu/arm-smmu-v3: Install bypass STEs for IOMMU_DOMAIN_IDENTITY
> domains
>   iommu: Allow default domain type to be set on the kernel command line
> 
>  Documentation/admin-guide/kernel-parameters.txt |  6 ++
>  drivers/iommu/arm-smmu-v3.c | 76 
> +++--
>  drivers/iommu/arm-smmu.c| 26 -
>  drivers/iommu/iommu.c   | 17 +-
>  4 files changed, 90 insertions(+), 35 deletions(-)

Besides my one comment on the last patch this series looks good to me.
Do you plan to include it (with the fall-back) into your pull-request?



Joerg
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v2 5/5] iommu: Allow default domain type to be set on the kernel command line

2017-03-21 Thread Joerg Roedel
Hi Will,

On Fri, Mar 10, 2017 at 08:49:36PM +, Will Deacon wrote:
> @@ -1014,8 +1027,8 @@ struct iommu_group *iommu_group_get_for_dev(struct 
> device *dev)
>* IOMMU driver.
>*/
>   if (!group->default_domain) {
> - group->default_domain = __iommu_domain_alloc(dev->bus,
> -  IOMMU_DOMAIN_DMA);
> + group->default_domain =
> + __iommu_domain_alloc(dev->bus, iommu_def_domain_type);

It would be good to have a fall-back here if we are talking to an IOMMU
driver that uses default domains, but does not support identity-mapped
domains (yet). Exynos and Rockchip IOMMU drivers seem to fall into this
category. A dev_warn() also makes sense in case allocating an identity
domain fails.


Joerg

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Liviu Dudau
On Tue, Mar 21, 2017 at 02:01:57PM +0100, Mason wrote:
> On 21/03/2017 13:31, Liviu Dudau wrote:
> 
> > On Tue, Mar 21, 2017 at 11:15:16AM +0100, Mason wrote:
> >>
> >> On 17/03/2017 17:11, Mason wrote:
> >>
> >>> + * QUIRK #5
> >>> + * Only transfers within the BAR are forwarded to the host.
> >>> + * By default, the DMA framework expects that
> >>> + * PCI address 0x8000_ -> CPU address 0x8000_
> >>> + * which is where DRAM0 is mapped.
> >>
> >> I have an additional question on this topic.
> >>
> >> In a typical system, PCI bus masters are able to access all of RAM,
> >> unless there is some kind of IOMMU, right?
> > 
> > Depends where the ATS (Address translation service) lives and what gets
> > programmed there, but usually one can restrict the RAM (or system address
> > space, to be more precise) that can be accessed from the PCI bus masters
> > by only providing PCI(e)-to-system bus mappings that make sense. Remember
> > that on any PCI bus you can only see addresses that are a subset of your
> > parent bus addresses. If the root PCI bus only allows access to a predefined
> > range of addresses, then that is all you can access in the system.
> 
> Hmmm, where do I configure the range of addresses which the root
> PCI bus allows access to? In the DT?

No, that should be the firmware's job. Or the secure world OS.

> 
> >> I suppose one may consider the above limitation ("Only transfers
> >> within the BAR are forwarded to the host") as some form of weird
> >> IOMMU? (There is, in fact, some remapping logic in the controller
> >> setup which I haven't discussed so far.)
> > 
> > Not sure I understand the quirk. What BAR are you referring to when you say
> > "within the BAR"? The bus master's? Are you saying that only if you write
> > to a PCI bus address XXX that is in the range assigned by the parent
> > bus then you can escape your bus hierarchy and target the system RAM? That
> > sounds backwards to me.
> 
> My PCIe controller is b/d/f 0/0/0.
> It ignores all PCI addresses that do not fall within the range
> defined in BAR0 of b/d/f 0/0/0.
> 
> BAR0 has a configurable width of 2^(23+i) bytes, i < 8
> It is implicitly split in 8 regions, which can map to
> anywhere in CPU space. However, my understanding is that,
> by default, the DMA framework expects PCI address X to
> map to CPU address X...
> (My understanding of that part is still a bit hazy.)

It looks very much like your PCIe controller (host bridge) has the configuration
registers on the wrong side of the bridge (PCIe one vs the more normal host 
side).
But otherwise it does look very much like an ATS system, where you program how 
the
PCIe bus addresses map into the system (you call them CPU) addresses. Do you 
also
have a way of controlling the direction of the mapping? (in other words, those 8
regions are only for translating request coming out of the PCIe bus into system
addresses, or can you also set the direct mapping of system address to PCIe 
address?).

As for DMA framework expectations, I'm not sure how that plays a role, unless 
you
have some DMA engines on the PCIe bus.

Best regards,
Liviu

> 
> >> Since this SoC is used for TV, the media cartels mandate some way
> >> to limit where PCI bus masters can peek/poke in RAM.
> > 
> > And that is usually done by either having a TrustZone controller on top of
> > your PCI host bridge or by putting an IOMMU.
> 
> It seems we don't have this HW, therefore some kind of custom
> HW quirk was required.
> 
> Regards.

-- 

| I would like to |
| fix the world,  |
| but they're not |
| giving me the   |
 \ source code!  /
  ---
¯\_(ツ)_/¯
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH v2] iommu: iova: Consolidate code for adding new node to iovad domain rbtree

2017-03-21 Thread Joerg Roedel
On Fri, Feb 24, 2017 at 12:13:37PM +0100, Marek Szyprowski wrote:
> This patch consolidates almost the same code used in iova_insert_rbtree()
> and __alloc_and_insert_iova_range() functions. While touching this code,
> replace BUG() with WARN_ON(1) to avoid taking down the whole system in
> case of corrupted iova tree or incorrect calls.
> 
> Signed-off-by: Marek Szyprowski 
> ---
> v2:
> - replaced BUG() with WARN_ON(1) as suggested by Robin Murphy
> 
> v1:
> - initial version

Applied, thanks.

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Mason
On 21/03/2017 13:31, Liviu Dudau wrote:

> On Tue, Mar 21, 2017 at 11:15:16AM +0100, Mason wrote:
>>
>> On 17/03/2017 17:11, Mason wrote:
>>
>>> + * QUIRK #5
>>> + * Only transfers within the BAR are forwarded to the host.
>>> + * By default, the DMA framework expects that
>>> + * PCI address 0x8000_0000 -> CPU address 0x8000_0000
>>> + * which is where DRAM0 is mapped.
>>
>> I have an additional question on this topic.
>>
>> In a typical system, PCI bus masters are able to access all of RAM,
>> unless there is some kind of IOMMU, right?
> 
> Depends where the ATS (Address translation service) lives and what gets
> programmed there, but usually one can restrict the RAM (or system address
> space, to be more precise) that can be accessed from the PCI bus masters
> by only providing PCI(e)-to-system bus mappings that make sense. Remember
> that on any PCI bus you can only see addresses that are a subset of your
> parent bus addresses. If the root PCI bus only allows access to a predefined
> range of addresses, then that is all you can access in the system.

Hmmm, where do I configure the range of addresses which the root
PCI bus allows access to? In the DT?

>> I suppose one may consider the above limitation ("Only transfers
>> within the BAR are forwarded to the host") as some form of weird
>> IOMMU? (There is, in fact, some remapping logic in the controller
>> setup which I haven't discussed so far.)
> 
> Not sure I understand the quirk. What BAR are you referring to when you say
> "within the BAR"? The bus master's? Are you saying that only if you write
> to a PCI bus address XXX that is in the range assigned by the parent
> bus then you can escape your bus hierarchy and target the system RAM? That
> sounds backwards to me.

My PCIe controller is b/d/f 0/0/0.
It ignores all PCI addresses that do not fall within the range
defined in BAR0 of b/d/f 0/0/0.

BAR0 has a configurable width of 2^(23+i) bytes, i < 8
It is implicitly split in 8 regions, which can map to
anywhere in CPU space. However, my understanding is that,
by default, the DMA framework expects PCI address X to
map to CPU address X...
(My understanding of that part is still a bit hazy.)

>> Since this SoC is used for TV, the media cartels mandate some way
>> to limit where PCI bus masters can peek/poke in RAM.
> 
> And that is usually done by either having a TrustZone controller on top of
> your PCI host bridge or by putting an IOMMU.

It seems we don't have this HW, therefore some kind of custom
HW quirk was required.

Regards.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Liviu Dudau
On Tue, Mar 21, 2017 at 11:15:16AM +0100, Mason wrote:
> [ Adding iommu ML ]
> 
> On 17/03/2017 17:11, Mason wrote:
> 
> > + * QUIRK #5
> > + * Only transfers within the BAR are forwarded to the host.
> > + * By default, the DMA framework expects that
> > + * PCI address 0x8000_0000 -> CPU address 0x8000_0000
> > + * which is where DRAM0 is mapped.
> 
> I have an additional question on this topic.
> 
> In a typical system, PCI bus masters are able to access all of RAM,
> unless there is some kind of IOMMU, right?

Depends where the ATS (Address translation service) lives and what gets
programmed there, but usually one can restrict the RAM (or system address
space, to be more precise) that can be accessed from the PCI bus masters
by only providing PCI(e)-to-system bus mappings that make sense. Remember
that on any PCI bus you can only see addresses that are a subset of your
parent bus addresses. If the root PCI bus only allows access to a predefined
range of addresses, then that is all you can access in the system.

> 
> I suppose one may consider the above limitation ("Only transfers
> within the BAR are forwarded to the host") as some form of weird
> IOMMU? (There is, in fact, some remapping logic in the controller
> setup which I haven't discussed so far.)

Not sure I understand the quirk. What BAR are you referring to when you say
"within the BAR"? The bus master's? Are you saying that only if you write
to a PCI bus address XXX that is in the range assigned by the parent
bus then you can escape your bus hierarchy and target the system RAM? That
sounds backwards to me.

> 
> Since this SoC is used for TV, the media cartels mandate some way
> to limit where PCI bus masters can peek/poke in RAM.

And that is usually done by either having a TrustZone controller on top of
your PCI host bridge or by putting an IOMMU.

Best regards,
Liviu

> 
> Regards.

-- 

| I would like to |
| fix the world,  |
| but they're not |
| giving me the   |
 \ source code!  /
  ---
¯\_(ツ)_/¯
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Thibaud Cornic
On 21/03/2017 13:43, Mason wrote:
> On 21/03/2017 12:36, Robin Murphy wrote:
>
>> On 21/03/17 10:15, Mason wrote:
>>
>>> I suppose one may consider the above limitation ("Only transfers
>>> within the BAR are forwarded to the host") as some form of weird
>>> IOMMU? (There is, in fact, some remapping logic in the controller
>>> setup which I haven't discussed so far.)
>> Not really. If it's the 8x128MB region thing mentioned elsewhere, that's
>> far too coarse a granularity to be much use with the existing IOMMU API
>> (this vaguely reminds me of a similar discussion about programmable
>> interconnects ages ago). Unless it's actually got some sort of GART-type
>> thing or better capable of page-granularity mappings within a
>> significantly-sized region, I'd put that idea to bed.
> I had a feeling my use-case was too quirky for a sane API :-)
>
>>> Since this SoC is used for TV, the media cartels mandate some way
>>> to limit where PCI bus masters can peek/poke in RAM.
>> FWIW, since that sounds like more of a box-ticking exercise than a real
>> practical concern, I'd point out that content protection is more or less
>> the poster child for TrustZone, and your TZASC should help tick that box
>> regardless.
> Interesting. In fact, our Linux runs as the non-secure OS,
> and we do have a custom "secure" OS running in parallel.
>
> My knowledge of TZ is limited to "call this SMC to offline
> that CPU", but our local TZ expert is CCed :-)
>
> TZASC = TrustZone Address Space Controller
>
> http://infocenter.arm.com/help/topic/com.arm.doc.ddi0431c/CHDBBGIC.html
>
> Is there a TZASC embedded in every ARM core?
> We're using Cortex-A9 MPCore r3p0 + PL310 r3p2
We don't have the TZASC, but a proprietary solution, on which the PCIe
can't be distinguished from other I/Os (e.g. SATA, USB).
But the requirement of limiting device accesses to only certain regions
of the DRAM is only for PCIe (it's special in the sense the device can
initiate accesses without any action on our chip's side).
Since we already had those 8x128MB remaps (which we now understand seem
wrong, as the root complex is not supposed to expose a BAR, but rather
forward all accesses to the internal bus), we simply put a lock on them,
and our partners were satisfied.

Regards.

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Mason
On 21/03/2017 12:36, Robin Murphy wrote:

> On 21/03/17 10:15, Mason wrote:
>
>> I suppose one may consider the above limitation ("Only transfers
>> within the BAR are forwarded to the host") as some form of weird
>> IOMMU? (There is, in fact, some remapping logic in the controller
>> setup which I haven't discussed so far.)
> 
> Not really. If it's the 8x128MB region thing mentioned elsewhere, that's
> far too coarse a granularity to be much use with the existing IOMMU API
> (this vaguely reminds me of a similar discussion about programmable
> interconnects ages ago). Unless it's actually got some sort of GART-type
> thing or better capable of page-granularity mappings within a
> significantly-sized region, I'd put that idea to bed.

I had a feeling my use-case was too quirky for a sane API :-)

>> Since this SoC is used for TV, the media cartels mandate some way
>> to limit where PCI bus masters can peek/poke in RAM.
> 
> FWIW, since that sounds like more of a box-ticking exercise than a real
> practical concern, I'd point out that content protection is more or less
> the poster child for TrustZone, and your TZASC should help tick that box
> regardless.

Interesting. In fact, our Linux runs as the non-secure OS,
and we do have a custom "secure" OS running in parallel.

My knowledge of TZ is limited to "call this SMC to offline
that CPU", but our local TZ expert is CCed :-)

TZASC = TrustZone Address Space Controller

http://infocenter.arm.com/help/topic/com.arm.doc.ddi0431c/CHDBBGIC.html

Is there a TZASC embedded in every ARM core?
We're using Cortex-A9 MPCore r3p0 + PL310 r3p2

Regards.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Robin Murphy
On 21/03/17 10:15, Mason wrote:
> [ Adding iommu ML ]
> 
> On 17/03/2017 17:11, Mason wrote:
> 
>> + * QUIRK #5
>> + * Only transfers within the BAR are forwarded to the host.
>> + * By default, the DMA framework expects that
>> + * PCI address 0x8000_0000 -> CPU address 0x8000_0000
>> + * which is where DRAM0 is mapped.
> 
> I have an additional question on this topic.
> 
> In a typical system, PCI bus masters are able to access all of RAM,
> unless there is some kind of IOMMU, right?

Typically, yes (as always, exceptions exist, but it happens that we
still need some work to actually support those cases properly).

> I suppose one may consider the above limitation ("Only transfers
> within the BAR are forwarded to the host") as some form of weird
> IOMMU? (There is, in fact, some remapping logic in the controller
> setup which I haven't discussed so far.)

Not really. If it's the 8x128MB region thing mentioned elsewhere, that's
far too coarse a granularity to be much use with the existing IOMMU API
(this vaguely reminds me of a similar discussion about programmable
interconnects ages ago). Unless it's actually got some sort of GART-type
thing or better capable of page-granularity mappings within a
significantly-sized region, I'd put that idea to bed.

> Since this SoC is used for TV, the media cartels mandate some way
> to limit where PCI bus masters can peek/poke in RAM.

FWIW, since that sounds like more of a box-ticking exercise than a real
practical concern, I'd point out that content protection is more or less
the poster child for TrustZone, and your TZASC should help tick that box
regardless.

Robin.

> 
> Regards.
> 

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v0.001] PCI: Add support for tango PCIe controller

2017-03-21 Thread Mason
[ Adding iommu ML ]

On 17/03/2017 17:11, Mason wrote:

> + * QUIRK #5
> + * Only transfers within the BAR are forwarded to the host.
> + * By default, the DMA framework expects that
> + * PCI address 0x8000_0000 -> CPU address 0x8000_0000
> + * which is where DRAM0 is mapped.

I have an additional question on this topic.

In a typical system, PCI bus masters are able to access all of RAM,
unless there is some kind of IOMMU, right?

I suppose one may consider the above limitation ("Only transfers
within the BAR are forwarded to the host") as some form of weird
IOMMU? (There is, in fact, some remapping logic in the controller
setup which I haven't discussed so far.)

Since this SoC is used for TV, the media cartels mandate some way
to limit where PCI bus masters can peek/poke in RAM.

Regards.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory

2017-03-21 Thread Liu, Yi L
Hi Jean,

I'm working on virtual SVM, and have some comments on the VFIO channel
definition.

> -Original Message-
> From: iommu-boun...@lists.linux-foundation.org [mailto:iommu-
> boun...@lists.linux-foundation.org] On Behalf Of Jean-Philippe Brucker
> Sent: Tuesday, February 28, 2017 3:55 AM
> Cc: Shanker Donthineni ; k...@vger.kernel.org;
> Catalin Marinas ; Sinan Kaya
> ; Will Deacon ;
> iommu@lists.linux-foundation.org; Harv Abdulhamid ;
> linux-...@vger.kernel.org; Bjorn Helgaas ; David
> Woodhouse ; linux-arm-ker...@lists.infradead.org; Nate
> Watterson 
> Subject: [RFC PATCH 29/30] vfio: Add support for Shared Virtual Memory
> 
> Add two new ioctl for VFIO devices. VFIO_DEVICE_BIND_TASK creates a bond
> between a device and a process address space, identified by a device-specific 
> ID
> named PASID. This allows the device to target DMA transactions at the process
> virtual addresses without a need for mapping and unmapping buffers explicitly 
> in the
> IOMMU. The process page tables are shared with the IOMMU, and mechanisms such
> as PCI ATS/PRI may be used to handle faults. VFIO_DEVICE_UNBIND_TASK removes
> a bond identified by a PASID.
> 
> Also add a capability flag in device info to detect whether the system and 
> the device
> support SVM.
> 
> Users need to specify the state of a PASID when unbinding, with flags
> VFIO_PASID_RELEASE_FLUSHED and VFIO_PASID_RELEASE_CLEAN. Even for PCI,
> PASID invalidation is specific to each device and only partially covered by 
> the
> specification:
> 
> * Device must have an implementation-defined mechanism for stopping the
>   use of a PASID. When this mechanism finishes, the device has stopped
>   issuing transactions for this PASID and all transactions for this PASID
>   have been flushed to the IOMMU.
> 
> * Device may either wait for all outstanding PRI requests for this PASID
>   to finish, or issue a Stop Marker message, a barrier that separates PRI
>   requests affecting this instance of the PASID from PRI requests
>   affecting the next instance. In the first case, we say that the PASID is
>   "clean", in the second case it is "flushed" (and the IOMMU has to wait
>   for the Stop Marker before reassigning the PASID.)
> 
> We expect similar distinctions for platform devices. Ideally there should be 
> a callback
> for each PCI device, allowing the IOMMU to ask the device to stop using a 
> PASID.
> When the callback returns, the PASID is either flushed or clean and the 
> return value
> tells which.
> 
> For the moment I don't know how to implement this callback for PCI, so if the 
> user
> forgets to call unbind with either "clean" or "flushed", the PASID is never 
> reused. For
> platform devices, it might be simpler to implement since we could associate an
> invalidate_pasid callback to a DT compatible string, as is currently done for 
> reset.
> 
> Signed-off-by: Jean-Philippe Brucker 

[...]

>  drivers/vfio/pci/vfio_pci.c |  24 ++
>  drivers/vfio/vfio.c | 104 
> 
>  include/uapi/linux/vfio.h   |  55 +++
>  3 files changed, 183 insertions(+)
> 
...
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index
> 519eff362c1c..3fe4197a5ea0 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -198,6 +198,7 @@ struct vfio_device_info {
>  #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */
>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)  /* vfio-platform device */
>  #define VFIO_DEVICE_FLAGS_AMBA  (1 << 3) /* vfio-amba device */
> +#define VFIO_DEVICE_FLAGS_SVM(1 << 4)/* Device supports 
> bind/unbind */
>   __u32   num_regions;/* Max region index + 1 */
>   __u32   num_irqs;   /* Max IRQ index + 1 */
>  };
> @@ -409,6 +410,60 @@ struct vfio_irq_set {
>   */
>  #define VFIO_DEVICE_RESET_IO(VFIO_TYPE, VFIO_BASE + 11)
> 
> +struct vfio_device_svm {
> + __u32   argsz;
> + __u32   flags;
> +#define VFIO_SVM_PASID_RELEASE_FLUSHED   (1 << 0)
> +#define VFIO_SVM_PASID_RELEASE_CLEAN (1 << 1)
> + __u32   pasid;
> +};

For virtual SVM work, the VFIO channel would be used to pass down the guest
PASID table pointer and invalidation information. And may have further usage
except the above.

Here is the virtual SVM design doc which illustrates the VFIO usage.
https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html

For the guest PASID table ptr passdown, I've following message in pseudo code.
struct pasid_table_info {
__u64 ptr;
__u32 size;
 };

For invalidation, I've following info in in pseudo code.
struct iommu_svm_tlb_invalidate_info
{
   __u32 inv_type;
#define IOTLB_INV   (1 << 0)
#define EXTENDED_IOTLB_INV  (1 << 1)
#define DEVICE_IOTLB_INV(1 << 2)
#define EXTENDED_DEVICE_IOTLB_INV   (1 << 3)
#define PASID_CACHE_INV (1 << 4)
   __u32 pasid;
   __u64 addr