On Sat, 2013-09-21 at 21:59 -0500, David Woodhouse wrote:
> Here's a completely untested work-in-progress that attempts to fix that.
> I'll be able to test it myself on about Tuesday when I'm home from New
> Orleans and awake...

Or I might have been, if my laptop's hard drive hadn't died. Here's an
updated version which extends the use of domain_unmap() to the other places
where it needs to happen. It's similarly untested by anything but the compiler.

I'll be able to test it myself on about Monday when I'm home from
Portland and awake...
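
For anyone skimming the diff, the core idea is that unmapping no longer
clears and frees page-table pages in place. Each page that drops out of the
tree is instead chained through page->freelist, and the whole chain is only
handed back for freeing once the IOTLB flush has completed and the hardware
page-walk can no longer touch it. Below is a rough, standalone userspace
sketch of that pattern; struct fake_page, defer_free() and iotlb_flush() are
made-up stand-ins for struct page, dma_pte_list_pagetables() and the real
flush, not code from the patch:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct page; only the freelist chaining matters. */
struct fake_page {
	struct fake_page *freelist;	/* next page awaiting release */
	void *table;			/* the (fake) page-table page itself */
};

/* Chain a page onto the list of pages to free later; nothing is freed yet. */
static struct fake_page *defer_free(struct fake_page *pg, struct fake_page *freelist)
{
	pg->freelist = freelist;
	return pg;
}

/* Placeholder for the hardware IOTLB invalidation. */
static void iotlb_flush(void)
{
	printf("IOTLB flushed; page-table pages may now be freed\n");
}

/* Counterpart of dma_free_pagelist(): walk the chain and free everything. */
static void free_pagelist(struct fake_page *freelist)
{
	struct fake_page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free(pg->table);
		free(pg);
	}
}

int main(void)
{
	struct fake_page *freelist = NULL;
	int i;

	/* Pretend an unmap just unlinked three page-table pages. */
	for (i = 0; i < 3; i++) {
		struct fake_page *pg = malloc(sizeof(*pg));

		pg->table = malloc(4096);
		freelist = defer_free(pg, freelist);
	}

	iotlb_flush();		 /* hardware can no longer walk those pages */
	free_pagelist(freelist); /* only now is it safe to return them */
	return 0;
}

In the patch itself, the non-strict unmap path additionally parks the chain
in deferred_flush[] so it can be released from flush_unmaps() after the
batched invalidation.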

diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c
index 15e9b57..cd6e568 100644
--- a/drivers/iommu/intel-iommu.c
+++ b/drivers/iommu/intel-iommu.c
@@ -413,6 +413,7 @@ struct deferred_flush_tables {
        int next;
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
+       struct page *freelist[HIGH_WATER_MARK];
 };
 
 static struct deferred_flush_tables *deferred_flush;
@@ -945,6 +946,123 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
        }
 }
 
+/* When a page at a given level is being unlinked from its parent, we don't
+   need to *modify* it at all. All we need to do is make a list of all the
+   pages which can be freed just as soon as we've flushed the IOTLB and we
+   know the hardware page-walk will no longer touch them.
+   The 'pte' argument is the *parent* PTE, pointing to the page that is to
+   be freed. */
+static struct page *dma_pte_list_pagetables(struct dmar_domain *domain, int level,
+                                           struct dma_pte *pte, struct page *freelist)
+{
+       struct page *pg;
+
+       pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
+       pg->freelist = freelist;
+       freelist = pg;
+
+       if (level > 1) {
+               pte = page_address(pg);
+
+               do {
+                       if (dma_pte_present(pte) && !dma_pte_superpage(pte))
+                               freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+               } while (!first_pte_in_page(++pte));
+       }
+
+       return freelist;
+}
+
+static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
+                                       struct dma_pte *pte, unsigned long pfn,
+                                       unsigned long start_pfn, unsigned long last_pfn,
+                                       struct page *freelist)
+{
+       struct dma_pte *first_pte = NULL, *last_pte = NULL;
+
+       pfn = max(start_pfn, pfn);
+       pte = &pte[pfn_level_offset(pfn, level)];
+
+       do {
+               unsigned long level_pfn;
+               struct dma_pte *level_pte;
+
+               if (!dma_pte_present(pte))
+                       goto next;
+
+               level_pfn = pfn & level_mask(level - 1);
+               level_pte = phys_to_virt(dma_pte_addr(pte));
+
+               /* If range covers entire pagetable, free it */
+               if (start_pfn <= level_pfn &&
+                   last_pfn >= level_pfn + level_size(level)) {
+
+                       /* These subordinate page tables are going away entirely. Don't
+                          bother to clear them; we're just going to *free* them. */
+                       if (level > 1 && !dma_pte_superpage(pte))
+                               freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+
+                       dma_clear_pte(pte);
+                       if (!first_pte)
+                               first_pte = pte;
+                       last_pte = pte;
+               } else {
+                       /* Recurse down into a level that isn't *entirely* obsolete */
+                       freelist = dma_pte_clear_level(domain, level - 1, level_pte,
+                                                      level_pfn, start_pfn, last_pfn, freelist);
+               }
+next:
+               pfn += level_size(level);
+       } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
+
+       if (first_pte)
+               domain_flush_cache(domain, first_pte,
+                                  (void *)++last_pte - (void *)first_pte);
+
+       return freelist;
+}
+
+/* We can't just free the pages because the IOMMU may still be walking
+   the page tables, and may have cached the intermediate levels. The
+   pages can only be freed after the IOTLB flush has been done. */
+struct page *domain_unmap(struct dmar_domain *domain,
+                         unsigned long start_pfn,
+                         unsigned long last_pfn)
+{
+       int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       struct page *freelist = NULL;
+
+       BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
+       BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
+       BUG_ON(start_pfn > last_pfn);
+
+       /* we don't need lock here; nobody else touches the iova range */
+       freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
+                                      domain->pgd, 0, start_pfn, last_pfn, NULL);
+
+       /* free pgd */
+       if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
+               struct page *pgd_page = virt_to_page(domain->pgd);
+               pgd_page->freelist = freelist;
+               freelist = pgd_page;
+
+               domain->pgd = NULL;
+       }
+
+       return freelist;
+}
+
+void dma_free_pagelist(struct page *freelist)
+{
+       struct page *pg;
+
+       while ((pg = freelist)) {
+               freelist = pg->freelist;
+               free_pgtable_page(pg);
+       }
+}
+
 /* iommu handling */
 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 {
@@ -1054,7 +1172,7 @@ static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
-               /* Note: always flush non-leaf currently */
+               /* IH bit is passed in as part of address */
                val_iva = size_order | addr;
                break;
        default:
@@ -1165,13 +1283,15 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
 }
 
 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
-                                 unsigned long pfn, unsigned int pages, int map)
+                                 unsigned long pfn, unsigned int pages, int ih, int map)
 {
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
 
        BUG_ON(pages == 0);
 
+       if (ih)
+               ih = 1 << 6;
        /*
         * Fallback to domain selective flush if no PSI support or the size is
         * too big.
@@ -1182,7 +1302,7 @@ static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
-               iommu->flush.flush_iotlb(iommu, did, addr, mask,
+               iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
                                                DMA_TLB_PSI_FLUSH);
 
        /*
@@ -1517,6 +1637,7 @@ static void domain_exit(struct dmar_domain *domain)
 {
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;
+       struct page *freelist = NULL;
 
        /* Domain 0 is reserved, so dont process it */
        if (!domain)
@@ -1530,16 +1651,14 @@ static void domain_exit(struct dmar_domain *domain)
        /* destroy iovas */
        put_iova_domain(&domain->iovad);
 
-       /* clear ptes */
-       dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+       freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
        for_each_active_iommu(iommu, drhd)
                if (test_bit(iommu->seq_id, domain->iommu_bmp))
                        iommu_detach_domain(domain, iommu);
 
+       dma_free_pagelist(freelist);
+
        free_domain_mem(domain);
 }
 
@@ -2850,7 +2969,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
+               iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -2902,13 +3021,16 @@ static void flush_unmaps(void)
                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
                                iommu_flush_iotlb_psi(iommu, domain->id,
-                               iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
+                                       iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1,
+                                       !deferred_flush[i].freelist[j], 0);
                        else {
                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
                        }
                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
+                       if (deferred_flush[i].freelist[j])
+                               dma_free_pagelist(deferred_flush[i].freelist[j]);
                }
                deferred_flush[i].next = 0;
        }
@@ -2925,7 +3047,7 @@ static void flush_unmaps_timeout(unsigned long data)
        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
 }
 
-static void add_unmap(struct dmar_domain *dom, struct iova *iova)
+static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
 {
        unsigned long flags;
        int next, iommu_id;
@@ -2941,6 +3063,7 @@ static void add_unmap(struct dmar_domain *dom, struct iova *iova)
        next = deferred_flush[iommu_id].next;
        deferred_flush[iommu_id].domain[next] = dom;
        deferred_flush[iommu_id].iova[next] = iova;
+       deferred_flush[iommu_id].freelist[next] = freelist;
        deferred_flush[iommu_id].next++;
 
        if (!timer_on) {
@@ -2960,6 +3083,7 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;
+       struct page *freelist;
 
        if (iommu_no_mapping(dev))
                return;
@@ -2980,19 +3104,16 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
                 pci_name(pdev), start_pfn, last_pfn);
 
-       /*  clear the whole page */
-       dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       freelist = domain_unmap(domain, start_pfn, last_pfn);
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1, 0);
+                                     last_pfn - start_pfn + 1, !freelist, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
+               dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova);
+               add_unmap(domain, iova, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
@@ -3054,6 +3175,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;
+       struct page *freelist;
 
        if (iommu_no_mapping(hwdev))
                return;
@@ -3071,19 +3193,16 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
 
-       /*  clear the whole page */
-       dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, start_pfn, last_pfn);
+       freelist = domain_unmap(domain, start_pfn, last_pfn);
 
        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
-                                     last_pfn - start_pfn + 1, 0);
+                                     last_pfn - start_pfn + 1, !freelist, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
+               dma_free_pagelist(freelist);
        } else {
-               add_unmap(domain, iova);
+               add_unmap(domain, iova, freelist);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
@@ -3166,7 +3285,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne
 
        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
-               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
+               iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
        else
                iommu_flush_write_buffer(iommu);
 
@@ -3952,6 +4071,8 @@ static void iommu_free_vm_domain(struct dmar_domain *domain)
 
 static void vm_domain_exit(struct dmar_domain *domain)
 {
+       struct page *freelist;
+
        /* Domain 0 is reserved, so dont process it */
        if (!domain)
                return;
@@ -3960,13 +4081,12 @@ static void vm_domain_exit(struct dmar_domain *domain)
        /* destroy iovas */
        put_iova_domain(&domain->iovad);
 
-       /* clear ptes */
-       dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
-
-       /* free page tables */
-       dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
+       freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
 
        iommu_free_vm_domain(domain);
+
+       dma_free_pagelist(freelist);
+
        free_domain_mem(domain);
 }
 
@@ -4110,18 +4230,43 @@ static int intel_iommu_map(struct iommu_domain *domain,
 }
 
 static size_t intel_iommu_unmap(struct iommu_domain *domain,
-                            unsigned long iova, size_t size)
+                               unsigned long iova, size_t size)
 {
        struct dmar_domain *dmar_domain = domain->priv;
-       int order;
+       struct page *freelist = NULL;
+       struct intel_iommu *iommu;
+       unsigned long start_pfn, last_pfn;
+       unsigned int npages;
+       int iommu_id, num, ndomains;
+
+       start_pfn = iova >> VTD_PAGE_SHIFT;
+       last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
+
+       freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
+
+       npages = last_pfn - start_pfn + 1;
+
+       for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
+               iommu = g_iommus[iommu_id];
+
+               /*
+                * find bit position of dmar_domain
+                */
+               ndomains = cap_ndoms(iommu->cap);
+               for_each_set_bit(num, iommu->domain_ids, ndomains) {
+                       if (iommu->domains[num] == dmar_domain)
+                               iommu_flush_iotlb_psi(iommu, num, start_pfn,
+                                                    npages, !freelist, 0);
+               }
+
+       }
 
-       order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
-                           (iova + size - 1) >> VTD_PAGE_SHIFT);
+       dma_free_pagelist(freelist);
 
        if (dmar_domain->max_addr == iova + size)
                dmar_domain->max_addr = iova;
 
-       return PAGE_SIZE << order;
+       return size;
 }
 
 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,


-- 
dwmw2
