[PATCH v1 3/3] iommu/amd: Optimize the IOMMU queue flush

2017-06-05 Thread Tom Lendacky
After reducing the amount of MMIO performed by the IOMMU during operation,
perf data shows that flushing the TLB for all protection domains during
DMA unmapping is a performance issue. It is not necessary to flush the
TLBs for all protection domains, only for the protection domains associated
with the IOVAs on the flush queue.

Create a separate queue that tracks the protection domains associated with
the IOVAs on the flush queue. This new queue limits TLB flushing to the
protection domains that actually require it.
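
The core of the change is a small per-CPU record of the protection domains
touched since the last flush. A rough sketch of the idea (simplified for
illustration only; the actual patch below uses a per-CPU flush_pd_queue of
flush_pd_queue_entry structures sized to match the flush queue):

	/* Sketch only: remember each protection domain once per flush cycle. */
	struct pd_track_queue {
		unsigned next;
		struct protection_domain *pds[FLUSH_QUEUE_SIZE];
	};

	static void pd_track_add(struct pd_track_queue *q,
				 struct protection_domain *pd)
	{
		unsigned idx;

		/* Skip domains that are already queued for flushing. */
		for (idx = 0; idx < q->next; ++idx)
			if (q->pds[idx] == pd)
				return;
		q->pds[q->next++] = pd;
	}

	static void pd_track_flush(struct pd_track_queue *q)
	{
		unsigned idx;

		/* Flush only the domains recorded since the last flush.
		 * (The completion wait is omitted here for brevity.)
		 */
		for (idx = 0; idx < q->next; ++idx)
			domain_flush_tlb(q->pds[idx]);
		q->next = 0;
	}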

Reviewed-by: Arindam Nath 
Signed-off-by: Tom Lendacky 
---
 drivers/iommu/amd_iommu.c |   56 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 856103b..a5e77f0 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -103,7 +103,18 @@ struct flush_queue {
struct flush_queue_entry *entries;
 };
 
+struct flush_pd_queue_entry {
+   struct protection_domain *pd;
+};
+
+struct flush_pd_queue {
+   /* No lock needed, protected by flush_queue lock */
+   unsigned next;
+   struct flush_pd_queue_entry *entries;
+};
+
 static DEFINE_PER_CPU(struct flush_queue, flush_queue);
+static DEFINE_PER_CPU(struct flush_pd_queue, flush_pd_queue);
 
 static atomic_t queue_timer_on;
 static struct timer_list queue_timer;
@@ -2227,16 +2238,20 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev)
  *
  */
 
-static void __queue_flush(struct flush_queue *queue)
+static void __queue_flush(struct flush_queue *queue,
+ struct flush_pd_queue *pd_queue)
 {
-   struct protection_domain *domain;
unsigned long flags;
int idx;
 
/* First flush TLB of all known domains */
spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-   list_for_each_entry(domain, &amd_iommu_pd_list, list)
-   domain_flush_tlb(domain);
+   for (idx = 0; idx < pd_queue->next; ++idx) {
+   struct flush_pd_queue_entry *entry;
+
+   entry = pd_queue->entries + idx;
+   domain_flush_tlb(entry->pd);
+   }
spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
 
/* Wait until flushes have completed */
@@ -2255,6 +2270,7 @@ static void __queue_flush(struct flush_queue *queue)
entry->dma_dom = NULL;
}
 
+   pd_queue->next = 0;
queue->next = 0;
 }
 
@@ -2263,13 +2279,15 @@ static void queue_flush_all(void)
int cpu;
 
for_each_possible_cpu(cpu) {
+   struct flush_pd_queue *pd_queue;
struct flush_queue *queue;
unsigned long flags;
 
queue = per_cpu_ptr(&flush_queue, cpu);
+   pd_queue = per_cpu_ptr(&flush_pd_queue, cpu);
spin_lock_irqsave(&queue->lock, flags);
if (queue->next > 0)
-   __queue_flush(queue);
+   __queue_flush(queue, pd_queue);
spin_unlock_irqrestore(&queue->lock, flags);
}
 }
@@ -2283,6 +2301,8 @@ static void queue_flush_timeout(unsigned long unsused)
 static void queue_add(struct dma_ops_domain *dma_dom,
  unsigned long address, unsigned long pages)
 {
+   struct flush_pd_queue_entry *pd_entry;
+   struct flush_pd_queue *pd_queue;
struct flush_queue_entry *entry;
struct flush_queue *queue;
unsigned long flags;
@@ -2292,10 +2312,22 @@ static void queue_add(struct dma_ops_domain *dma_dom,
address >>= PAGE_SHIFT;
 
queue = get_cpu_ptr(&flush_queue);
+   pd_queue = get_cpu_ptr(&flush_pd_queue);
spin_lock_irqsave(&queue->lock, flags);
 
if (queue->next == FLUSH_QUEUE_SIZE)
-   __queue_flush(queue);
+   __queue_flush(queue, pd_queue);
+
+   for (idx = 0; idx < pd_queue->next; ++idx) {
+   pd_entry = pd_queue->entries + idx;
+   if (pd_entry->pd == &dma_dom->domain)
+   break;
+   }
+   if (idx == pd_queue->next) {
+   /* New protection domain, add it to the list */
+   pd_entry = pd_queue->entries + pd_queue->next++;
+   pd_entry->pd = &dma_dom->domain;
+   }
 
idx   = queue->next++;
entry = queue->entries + idx;
@@ -2309,6 +2341,7 @@ static void queue_add(struct dma_ops_domain *dma_dom,
if (atomic_cmpxchg(&queue_timer_on, 0, 1) == 0)
mod_timer(&queue_timer, jiffies + msecs_to_jiffies(10));
 
+   put_cpu_ptr(&flush_pd_queue);
put_cpu_ptr(&flush_queue);
 }
 
@@ -2810,6 +2843,8 @@ int __init amd_iommu_init_api(void)
return ret;
 
for_each_possible_cpu(cpu) {
+   struct flush_pd_queue *pd_queue = per_cpu_ptr(&flush_pd_queue,
+ cpu);
struct flush_queue *queue = per_cpu_ptr(&flush_queue, cpu);

[PATCH v1 2/3] iommu/amd: Reduce delay waiting for command buffer space

2017-06-05 Thread Tom Lendacky
Currently if there is no room to add a command to the command buffer, the
driver performs a "completion wait" which only returns when all commands
on the queue have been processed. There is no need to wait for the entire
command queue to be executed before adding the next command.

Update the driver to perform the same udelay() loop that the "completion
wait" performs, but re-read the head pointer on each iteration to determine
whether sufficient space is available. The very first time no space is
found, the udelay() is skipped so that the head pointer is re-read
immediately. If there is still not sufficient space, the udelay() is
performed on subsequent iterations.
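
In outline, the new wait loop behaves like the sketch below (simplified and
restructured for readability; the actual hunk follows further down):

	/* Sketch only: wait for command buffer space, skipping the first delay. */
	unsigned int count = 0;
	u32 left, next_tail;

	next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;

	for (;;) {
		left = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
		if (left > 0x20)
			break;			/* enough room for the command */

		if (count++) {			/* no delay on the very first retry */
			if (count == LOOP_TIMEOUT)
				return -EIO;	/* buffer never drained */
			udelay(1);
		}

		/* Opportunistically re-read the head pointer and try again. */
		iommu->cmd_buf_head = readl(iommu->mmio_base +
					    MMIO_CMD_HEAD_OFFSET);
	}

	copy_cmd_to_buffer(iommu, cmd);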

Signed-off-by: Leo Duran 
Signed-off-by: Tom Lendacky 
---
 drivers/iommu/amd_iommu.c |   33 +++++++++++++--------------------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index faf0ddf..856103b 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1019,7 +1019,7 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu,
  struct iommu_cmd *cmd,
  bool sync)
 {
-   bool read_head = true;
+   unsigned int count = 0;
u32 left, next_tail;
 
next_tail = (iommu->cmd_buf_tail + sizeof(*cmd)) % CMD_BUFFER_SIZE;
@@ -1027,33 +1027,26 @@ static int __iommu_queue_command_sync(struct amd_iommu *iommu,
left  = (iommu->cmd_buf_head - next_tail) % CMD_BUFFER_SIZE;
 
if (left <= 0x20) {
-   struct iommu_cmd sync_cmd;
-   int ret;
-
-   if (read_head) {
-   /* Update head and recheck remaining space */
-   iommu->cmd_buf_head = readl(iommu->mmio_base +
-   MMIO_CMD_HEAD_OFFSET);
-   read_head = false;
-   goto again;
-   }
-
-   read_head = true;
-
-   iommu->cmd_sem = 0;
+   /* Skip udelay() the first time around */
+   if (count++) {
+   if (count == LOOP_TIMEOUT) {
+   pr_err("AMD-Vi: Command buffer timeout\n");
+   return -EIO;
+   }
 
-   build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem);
-   copy_cmd_to_buffer(iommu, &sync_cmd);
+   udelay(1);
+   }
 
-   if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0)
-   return ret;
+   /* Update head and recheck remaining space */
+   iommu->cmd_buf_head = readl(iommu->mmio_base +
+   MMIO_CMD_HEAD_OFFSET);
 
goto again;
}
 
copy_cmd_to_buffer(iommu, cmd);
 
-   /* We need to sync now to make sure all commands are processed */
+   /* Do we need to make sure all commands are processed? */
iommu->need_sync = sync;
 
return 0;



[PATCH v1 0/3] iommu/amd: AMD IOMMU performance updates 2017-06-05

2017-06-05 Thread Tom Lendacky
This patch series addresses some performance issues in the AMD IOMMU
driver:

- Reduce the amount of MMIO performed during command submission
- When the command queue is (near) full, wait only until there is enough
  room for the command rather than waiting for the whole queue to be empty
- Limit the flushing of protection domain TLBs to only the protection
  domains associated with the IOVA data being freed

This patch series is based on the master branch of the iommu tree.

---

Tom Lendacky (3):
  iommu/amd: Reduce amount of MMIO when submitting commands
  iommu/amd: Reduce delay waiting for command buffer space
  iommu/amd: Optimize the IOMMU queue flush


 drivers/iommu/amd_iommu.c   |  100 ++++++++++++++++++++++++++++++++++++--------------
 drivers/iommu/amd_iommu_init.c  |2 +
 drivers/iommu/amd_iommu_types.h |2 +
 3 files changed, 77 insertions(+), 27 deletions(-)

-- 
Tom Lendacky


Re: [PATCH 2/7] dt-bindings: PCI: Describe ATS property for root complex nodes

2017-06-05 Thread Rob Herring
On Thu, Jun 01, 2017 at 01:28:01PM +0100, Jean-Philippe Brucker wrote:
> On 31/05/17 18:23, Rob Herring wrote:
> > On Wed, May 24, 2017 at 07:01:38PM +0100, Jean-Philippe Brucker wrote:
> >> Address Translation Service (ATS) is an extension to PCIe allowing
> >> endpoints to manage their own IOTLB, called Address Translation Cache
> >> (ATC). Instead of having every memory transaction processed by the IOMMU,
> >> the endpoint can first send an Address Translation Request for an IOVA,
> >> obtain the corresponding Physical Address from the IOMMU and store it in
> >> its ATC. Subsequent transactions to this memory region can be performed on
> >> the PA, in which case they are marked 'translated' and (partially) bypass
> >> the IOMMU.
> >>
> >> Since the extension uses fields that were previously reserved in the
> >> PCIe Transaction Layer Packet, it seems ill-advised to enable it on a
> >> system that doesn't fully support ATS.
> >>
> >> To "old" root complexes that simply ignored the new AT field, an Address
> >> Translation Request will look exactly like a Memory Read Request, so the
> >> root bridge will forward a memory read to the IOMMU instead of a
> >> translation request. If the access succeeds, the RC will send a Read
> >> Completion, which looks like a Translation Completion, back to the
> >> endpoint. As a result the endpoint might end up storing the content of
> >> memory instead of a physical address in its ATC. In reality, it's more
> >> likely that the size fields will be invalid and either end will detect the
> >> error, but in any case, it is undesirable.
> >>
> >> Add a way for firmware to tell the OS that ATS is supported by the PCI
> >> root complex.
> > 
> > Can't firmware have already enabled ATS? Often for things like this, not 
> > present means "use firmware setting".
> 
> I don't think it's up to firmware to enable ATS in endpoints, because it
> depends on IOMMU properties (e.g. configured page size). It must also be
> enabled after the PASID capability, which the OS may or may not want to
> enable.
> 
> While endpoints have ATS capability and config register, there is no
> architected mechanism in root complexes as far as I know. So firmware may
> have a mechanism outside the OS scope to toggle ATS in the root complex.
> If there is a bug and firmware cannot enable ATS, then the OS must be made
> aware of it, so that it doesn't enable ATS in endpoints, or else we might
> end up with silent memory corruption as described above. (Lack of ATS may
> slow the system down but shouldn't be fatal.)
> 
> If the SMMU supports ATS, then the root complex attached to it will most
> likely support ATS. The switch in this patch simply allows firmware to
> confirm that.
> 
> >> Signed-off-by: Jean-Philippe Brucker 
> >> ---
> >>  Documentation/devicetree/bindings/pci/pci-iommu.txt | 8 
> >>  1 file changed, 8 insertions(+)
> >>
> >> diff --git a/Documentation/devicetree/bindings/pci/pci-iommu.txt 
> >> b/Documentation/devicetree/bindings/pci/pci-iommu.txt
> >> index 0def586fdcdf..f21a68ec471a 100644
> >> --- a/Documentation/devicetree/bindings/pci/pci-iommu.txt
> >> +++ b/Documentation/devicetree/bindings/pci/pci-iommu.txt
> >> @@ -44,6 +44,14 @@ Optional properties
> >>  - iommu-map-mask: A mask to be applied to each Requester ID prior to being
> >>mapped to an IOMMU specifier per the iommu-map property.
> >>  
> >> +- ats-supported: if present, the root complex supports the Address
> >> +  Translation Service (ATS). It is able to interpret the AT field in PCIe
> >> +  Transaction Layer Packets, and forward Translation Completions or
> >> +  Invalidation Requests to endpoints.
> > 
> > Why can't this be based on the compatible strings?
> 
> Host controllers like the generic ECAM one should be able to advertise
> ATS, if for instance the virtual IOMMU in Qemu offers a channel for ATS
> invalidation. In that case we would have pci-host-{e,}cam-generic{-ats,}
> compatible strings and double the number of compatible strings each time
> we add a similar capability.

It would not double the compatibles. A given SoC will either support ATS 
or not, right? A given compatible will imply whether ATS is supported or 
not.

pci-host-{e,}cam-generic is a special case. I'm okay with having a
property for that, I suppose. We should not require this property, though,
and should allow for it to be implied by the SoC-specific compatible as well.
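
For illustration, an OS consumer of such a property might gate endpoint ATS
on it roughly as in the sketch below (the helper is hypothetical and shown
only to make the intended use concrete; of_property_read_bool() and
pci_enable_ats() are existing kernel interfaces, bridge_np is assumed to be
the host bridge's DT node and ps the smallest translation unit):

	/* Sketch: only enable ATS on an endpoint when the host bridge says so. */
	static int maybe_enable_ats(struct pci_dev *pdev,
				    struct device_node *bridge_np, int ps)
	{
		if (!of_property_read_bool(bridge_np, "ats-supported"))
			return -ENODEV;	/* no ATS: slower, but safe */

		return pci_enable_ats(pdev, ps);
	}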

Rob