On Fri Jun 23, 2017 at 11:23:26AM +0530, Linu Cherian wrote:
> 
> Robin,
> Was trying to understand the new changes. Had few questions on 
> arm_lpae_install_table. 
> 
> On Thu Jun 22, 2017 at 04:53:54PM +0100, Robin Murphy wrote:
> > For parallel I/O with multiple concurrent threads servicing the same
> > device (or devices, if several share a domain), serialising page table
> > updates becomes a massive bottleneck. On reflection, though, we don't
> > strictly need to do that - for valid IOMMU API usage, there are in fact
> > only two races that we need to guard against: multiple map requests for
> > different blocks within the same region, when the intermediate-level
> > table for that region does not yet exist; and multiple unmaps of
> > different parts of the same block entry. Both of those are fairly easily
> > solved by using a cmpxchg to install the new table, such that if we then
> > find that someone else's table got there first, we can simply free ours
> > and continue.
> > 
> > Make the requisite changes such that we can withstand being called
> > without the caller maintaining a lock. In theory, this opens up a few
> > corners in which wildly misbehaving callers making nonsensical
> > overlapping requests might lead to crashes instead of just unpredictable
> > results, but correct code really does not deserve to pay a significant
> > performance cost for the sake of masking bugs in theoretical broken code.
> > 
> > Signed-off-by: Robin Murphy <robin.mur...@arm.com>
> > ---
> > 
> > v2:
> >  - Fix barriers in install_table
> >  - Make a few more PTE accesses with {READ,WRITE}_ONCE() just in case
> >  - Minor cosmetics
> > 
> >  drivers/iommu/io-pgtable-arm.c | 72 
> > +++++++++++++++++++++++++++++++++---------
> >  1 file changed, 57 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
> > index 6334f51912ea..52700fa958c2 100644
> > --- a/drivers/iommu/io-pgtable-arm.c
> > +++ b/drivers/iommu/io-pgtable-arm.c
> > @@ -20,6 +20,7 @@
> >  
> >  #define pr_fmt(fmt)        "arm-lpae io-pgtable: " fmt
> >  
> > +#include <linux/atomic.h>
> >  #include <linux/iommu.h>
> >  #include <linux/kernel.h>
> >  #include <linux/sizes.h>
> > @@ -99,6 +100,8 @@
> >  #define ARM_LPAE_PTE_ATTR_HI_MASK  (((arm_lpae_iopte)6) << 52)
> >  #define ARM_LPAE_PTE_ATTR_MASK             (ARM_LPAE_PTE_ATTR_LO_MASK |    
> > \
> >                                      ARM_LPAE_PTE_ATTR_HI_MASK)
> > +/* Software bit for solving coherency races */
> > +#define ARM_LPAE_PTE_SW_SYNC               (((arm_lpae_iopte)1) << 55)
> >  
> >  /* Stage-1 PTE */
> >  #define ARM_LPAE_PTE_AP_UNPRIV             (((arm_lpae_iopte)1) << 6)
> > @@ -249,15 +252,20 @@ static void __arm_lpae_free_pages(void *pages, size_t 
> > size,
> >     free_pages_exact(pages, size);
> >  }
> >  
> > +static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep,
> > +                           struct io_pgtable_cfg *cfg)
> > +{
> > +   dma_sync_single_for_device(cfg->iommu_dev, __arm_lpae_dma_addr(ptep),
> > +                              sizeof(*ptep), DMA_TO_DEVICE);
> > +}
> > +
> >  static void __arm_lpae_set_pte(arm_lpae_iopte *ptep, arm_lpae_iopte pte,
> >                            struct io_pgtable_cfg *cfg)
> >  {
> >     *ptep = pte;
> >  
> >     if (!(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA))
> > -           dma_sync_single_for_device(cfg->iommu_dev,
> > -                                      __arm_lpae_dma_addr(ptep),
> > -                                      sizeof(pte), DMA_TO_DEVICE);
> > +           __arm_lpae_sync_pte(ptep, cfg);
> >  }
> >  
> >  static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
> > @@ -314,16 +322,30 @@ static int arm_lpae_init_pte(struct 
> > arm_lpae_io_pgtable *data,
> >  
> >  static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
> >                                          arm_lpae_iopte *ptep,
> > +                                        arm_lpae_iopte curr,
> >                                          struct io_pgtable_cfg *cfg)
> >  {
> > -   arm_lpae_iopte new;
> > +   arm_lpae_iopte old, new;
> >  
> >     new = __pa(table) | ARM_LPAE_PTE_TYPE_TABLE;
> >     if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
> >             new |= ARM_LPAE_PTE_NSTABLE;
> >  
> > -   __arm_lpae_set_pte(ptep, new, cfg);
> > -   return new;
> > +   /* Ensure the table itself is visible before its PTE can be */
> > +   wmb();
> 
> Could you please give more hints on why this is required.
> 
> 
> > +
> > +   old = cmpxchg64_relaxed(ptep, curr, new);
> > +
> > +   if ((cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA) ||
> > +       (old & ARM_LPAE_PTE_SW_SYNC))
> > +           return old;
> > +
> > +   /* Even if it's not ours, there's no point waiting; just kick it */
> > +   __arm_lpae_sync_pte(ptep, cfg);
> > +   if (old == curr)
> > +           WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);
> 
> How is it ensured that the cache operations are completed before we flag them 
> with
>       ARM_LPAE_PTE_SW_SYNC. The previous version had a wmb() after the sync 
> operation.
> 
> 
> > +
> > +   return old;
> >  }
> >
> 


Observed a performance drop of close to 1G, 
while testing on the 10G network interface with this patch series compared to 
v1.
Moving the wmb() as in v1 restores it back.

@@ -332,7 +332,6 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte 
*table,
                new |= ARM_LPAE_PTE_NSTABLE;
 
        /* Ensure the table itself is visible before its PTE can be */
-       wmb();
 
        old = cmpxchg64_relaxed(ptep, curr, new);
 
@@ -342,6 +341,7 @@ static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte 
*table,
 
        /* Even if it's not ours, there's no point waiting; just kick it */
        __arm_lpae_sync_pte(ptep, cfg);
+       wmb();
        if (old == curr)
                WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);
 



-- 
Linu cherian
_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Reply via email to