On Thu, 10 Dec 2020, Rahul Singh wrote:
> The Linux SMMUv3 code implements command-queue insertion using atomic
> operations provided by Linux. The atomic functions used by the
> command-queue insertion are not implemented in XEN, therefore revert
> the patch that implemented command-queue insertion based on atomic
> operations.
> 
> Also revert the other patches that are implemented on top of the code
> that introduced the atomic operations.
> 
> The atomic operations were introduced in the patch "iommu/arm-smmu-v3:
> Reduce contention during command-queue insertion", which fixed the
> bottleneck of the SMMU command-queue insertion operation. That patch
> introduced a new algorithm for inserting commands into the queue which
> is lock-free on the fast path.
> 
> The consequence of reverting the patch is that command-queue insertion
> will be slow on large systems, as a spinlock is used to serialize
> accesses from all CPUs to the single queue supported by the hardware.
> 
> Once the proper atomic operations are available in XEN, the driver can
> be updated.
> 
> The following commits are reverted in this patch:
> 1. "iommu/arm-smmu-v3: Add SMMUv3.2 range invalidation support"
>     commit 6a481a95d4c198a2dd0a61f8877b92a375757db8.
> 2. "iommu/arm-smmu-v3: Batch ATC invalidation commands"
>     commit 9e773aee8c3e1b3ba019c5c7f8435aaa836c6130.
> 3. "iommu/arm-smmu-v3: Batch context descriptor invalidation"
>     commit edd0351e7bc49555d8b5ad8438a65a7ca262c9f0.
> 4. "iommu/arm-smmu-v3: Add command queue batching helpers"
>     commit 4ce8da453640147101bda418640394637c1a7cfc.
> 5. "iommu/arm-smmu-v3: Fix ATC invalidation ordering wrt main TLBs"
>     commit 353e3cf8590cf182a9f42e67993de3aca91e8090.
> 6. "iommu/arm-smmu-v3: Defer TLB invalidation until ->iotlb_sync()"
>     commit 2af2e72b18b499fa36d3f7379fd010ff25d2a984.
> 7. "iommu/arm-smmu-v3: Reduce contention during command-queue insertion"
>     commit 587e6c10a7ce89a5924fdbeff2ec524fbd6a124b.
> 
> Signed-off-by: Rahul Singh <rahul.si...@arm.com>

Acked-by: Stefano Stabellini <sstabell...@kernel.org>
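
The practical effect is easy to see in the restored code. Abridged from
the hunks below (arm_smmu_cmdq_issue_cmd() and arm_smmu_cmdq_insert_cmd()
are the helpers this patch brings back), command insertion becomes:

    static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
                                        struct arm_smmu_cmdq_ent *ent)
    {
        u64 cmd[CMDQ_ENT_DWORDS];
        unsigned long flags;

        if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
            dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
                     ent->opcode);
            return;
        }

        /* A single lock serializes every CPU's access to the shared queue. */
        spin_lock_irqsave(&smmu->cmdq.lock, flags);
        arm_smmu_cmdq_insert_cmd(smmu, cmd);
        spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
    }

which is where the commit message's warning about slow insertion on
large systems comes from: all CPUs contend on smmu->cmdq.lock instead of
taking the reverted lock-free fast path.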


> ---
> Changes in v3:
> - Added the consequences of reverting this patch to the commit message.
> - Listed all the commits that are reverted in this patch in the
>   commit message.
> 
> ---
>  xen/drivers/passthrough/arm/smmu-v3.c | 878 ++++++--------------------
>  1 file changed, 186 insertions(+), 692 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/arm/smmu-v3.c b/xen/drivers/passthrough/arm/smmu-v3.c
> index f578677a5c..8b7747ed38 100644
> --- a/xen/drivers/passthrough/arm/smmu-v3.c
> +++ b/xen/drivers/passthrough/arm/smmu-v3.c
> @@ -69,9 +69,6 @@
>  #define IDR1_SSIDSIZE                        GENMASK(10, 6)
>  #define IDR1_SIDSIZE                 GENMASK(5, 0)
>  
> -#define ARM_SMMU_IDR3                        0xc
> -#define IDR3_RIL                     (1 << 10)
> -
>  #define ARM_SMMU_IDR5                        0x14
>  #define IDR5_STALL_MAX                       GENMASK(31, 16)
>  #define IDR5_GRAN64K                 (1 << 6)
> @@ -187,7 +184,7 @@
>  
>  #define Q_IDX(llq, p)                        ((p) & ((1 << 
> (llq)->max_n_shift) - 1))
>  #define Q_WRP(llq, p)                        ((p) & (1 << 
> (llq)->max_n_shift))
> -#define Q_OVERFLOW_FLAG                      (1U << 31)
> +#define Q_OVERFLOW_FLAG                      (1 << 31)
>  #define Q_OVF(p)                     ((p) & Q_OVERFLOW_FLAG)
>  #define Q_ENT(q, p)                  ((q)->base +                    \
>                                        Q_IDX(&((q)->llq), p) *        \
> @@ -330,15 +327,6 @@
>  #define CMDQ_ERR_CERROR_ABT_IDX              2
>  #define CMDQ_ERR_CERROR_ATC_INV_IDX  3
>  
> -#define CMDQ_PROD_OWNED_FLAG         Q_OVERFLOW_FLAG
> -
> -/*
> - * This is used to size the command queue and therefore must be at least
> - * BITS_PER_LONG so that the valid_map works correctly (it relies on the
> - * total number of queue entries being a multiple of BITS_PER_LONG).
> - */
> -#define CMDQ_BATCH_ENTRIES           BITS_PER_LONG
> -
>  #define CMDQ_0_OP                    GENMASK_ULL(7, 0)
>  #define CMDQ_0_SSV                   (1UL << 11)
>  
> @@ -351,14 +339,9 @@
>  #define CMDQ_CFGI_1_LEAF             (1UL << 0)
>  #define CMDQ_CFGI_1_RANGE            GENMASK_ULL(4, 0)
>  
> -#define CMDQ_TLBI_0_NUM                      GENMASK_ULL(16, 12)
> -#define CMDQ_TLBI_RANGE_NUM_MAX              31
> -#define CMDQ_TLBI_0_SCALE            GENMASK_ULL(24, 20)
>  #define CMDQ_TLBI_0_VMID             GENMASK_ULL(47, 32)
>  #define CMDQ_TLBI_0_ASID             GENMASK_ULL(63, 48)
>  #define CMDQ_TLBI_1_LEAF             (1UL << 0)
> -#define CMDQ_TLBI_1_TTL                      GENMASK_ULL(9, 8)
> -#define CMDQ_TLBI_1_TG                       GENMASK_ULL(11, 10)
>  #define CMDQ_TLBI_1_VA_MASK          GENMASK_ULL(63, 12)
>  #define CMDQ_TLBI_1_IPA_MASK         GENMASK_ULL(51, 12)
>  
> @@ -407,8 +390,9 @@
>  #define PRIQ_1_ADDR_MASK             GENMASK_ULL(63, 12)
>  
>  /* High-level queue structures */
> -#define ARM_SMMU_POLL_TIMEOUT_US     1000000 /* 1s! */
> -#define ARM_SMMU_POLL_SPIN_COUNT     10
> +#define ARM_SMMU_POLL_TIMEOUT_US     100
> +#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US        1000000 /* 1s! */
> +#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT        10
>  
>  #define MSI_IOVA_BASE                        0x8000000
>  #define MSI_IOVA_LENGTH                      0x100000
> @@ -483,13 +467,9 @@ struct arm_smmu_cmdq_ent {
>               #define CMDQ_OP_TLBI_S2_IPA     0x2a
>               #define CMDQ_OP_TLBI_NSNH_ALL   0x30
>               struct {
> -                     u8                      num;
> -                     u8                      scale;
>                       u16                     asid;
>                       u16                     vmid;
>                       bool                    leaf;
> -                     u8                      ttl;
> -                     u8                      tg;
>                       u64                     addr;
>               } tlbi;
>  
> @@ -513,24 +493,15 @@ struct arm_smmu_cmdq_ent {
>  
>               #define CMDQ_OP_CMD_SYNC        0x46
>               struct {
> +                     u32                     msidata;
>                       u64                     msiaddr;
>               } sync;
>       };
>  };
>  
>  struct arm_smmu_ll_queue {
> -     union {
> -             u64                     val;
> -             struct {
> -                     u32             prod;
> -                     u32             cons;
> -             };
> -             struct {
> -                     atomic_t        prod;
> -                     atomic_t        cons;
> -             } atomic;
> -             u8                      __pad[SMP_CACHE_BYTES];
> -     } ____cacheline_aligned_in_smp;
> +     u32                             prod;
> +     u32                             cons;
>       u32                             max_n_shift;
>  };
>  
> @@ -548,23 +519,9 @@ struct arm_smmu_queue {
>       u32 __iomem                     *cons_reg;
>  };
>  
> -struct arm_smmu_queue_poll {
> -     ktime_t                         timeout;
> -     unsigned int                    delay;
> -     unsigned int                    spin_cnt;
> -     bool                            wfe;
> -};
> -
>  struct arm_smmu_cmdq {
>       struct arm_smmu_queue           q;
> -     atomic_long_t                   *valid_map;
> -     atomic_t                        owner_prod;
> -     atomic_t                        lock;
> -};
> -
> -struct arm_smmu_cmdq_batch {
> -     u64                             cmds[CMDQ_BATCH_ENTRIES * 
> CMDQ_ENT_DWORDS];
> -     int                             num;
> +     spinlock_t                      lock;
>  };
>  
>  struct arm_smmu_evtq {
> @@ -647,7 +604,6 @@ struct arm_smmu_device {
>  #define ARM_SMMU_FEAT_HYP            (1 << 12)
>  #define ARM_SMMU_FEAT_STALL_FORCE    (1 << 13)
>  #define ARM_SMMU_FEAT_VAX            (1 << 14)
> -#define ARM_SMMU_FEAT_RANGE_INV              (1 << 15)
>       u32                             features;
>  
>  #define ARM_SMMU_OPT_SKIP_PREFETCH   (1 << 0)
> @@ -660,6 +616,8 @@ struct arm_smmu_device {
>  
>       int                             gerr_irq;
>       int                             combined_irq;
> +     u32                             sync_nr;
> +     u8                              prev_cmd_opcode;
>  
>       unsigned long                   ias; /* IPA */
>       unsigned long                   oas; /* PA */
> @@ -677,6 +635,12 @@ struct arm_smmu_device {
>  
>       struct arm_smmu_strtab_cfg      strtab_cfg;
>  
> +     /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
> +     union {
> +             u32                     sync_count;
> +             u64                     padding;
> +     };
> +
>       /* IOMMU core code handle */
>       struct iommu_device             iommu;
>  };
> @@ -763,21 +727,6 @@ static void parse_driver_options(struct arm_smmu_device 
> *smmu)
>  }
>  
>  /* Low-level queue manipulation functions */
> -static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
> -{
> -     u32 space, prod, cons;
> -
> -     prod = Q_IDX(q, q->prod);
> -     cons = Q_IDX(q, q->cons);
> -
> -     if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
> -             space = (1 << q->max_n_shift) - (prod - cons);
> -     else
> -             space = cons - prod;
> -
> -     return space >= n;
> -}
> -
>  static bool queue_full(struct arm_smmu_ll_queue *q)
>  {
>       return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
> @@ -790,12 +739,9 @@ static bool queue_empty(struct arm_smmu_ll_queue *q)
>              Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
>  }
>  
> -static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
> +static void queue_sync_cons_in(struct arm_smmu_queue *q)
>  {
> -     return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
> -             (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
> -            ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
> -             (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
> +     q->llq.cons = readl_relaxed(q->cons_reg);
>  }
>  
>  static void queue_sync_cons_out(struct arm_smmu_queue *q)
> @@ -826,34 +772,46 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q)
>       return ret;
>  }
>  
> -static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
> +static void queue_sync_prod_out(struct arm_smmu_queue *q)
>  {
> -     u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
> -     return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
> +     writel(q->llq.prod, q->prod_reg);
>  }
>  
> -static void queue_poll_init(struct arm_smmu_device *smmu,
> -                         struct arm_smmu_queue_poll *qp)
> +static void queue_inc_prod(struct arm_smmu_ll_queue *q)
>  {
> -     qp->delay = 1;
> -     qp->spin_cnt = 0;
> -     qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> -     qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
> +     u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
> +     q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
>  }
>  
> -static int queue_poll(struct arm_smmu_queue_poll *qp)
> +/*
> + * Wait for the SMMU to consume items. If sync is true, wait until the queue
> + * is empty. Otherwise, wait until there is at least one free slot.
> + */
> +static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
>  {
> -     if (ktime_compare(ktime_get(), qp->timeout) > 0)
> -             return -ETIMEDOUT;
> +     ktime_t timeout;
> +     unsigned int delay = 1, spin_cnt = 0;
>  
> -     if (qp->wfe) {
> -             wfe();
> -     } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
> -             cpu_relax();
> -     } else {
> -             udelay(qp->delay);
> -             qp->delay *= 2;
> -             qp->spin_cnt = 0;
> +     /* Wait longer if it's a CMD_SYNC */
> +     timeout = ktime_add_us(ktime_get(), sync ?
> +                                         ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
> +                                         ARM_SMMU_POLL_TIMEOUT_US);
> +
> +     while (queue_sync_cons_in(q),
> +           (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
> +             if (ktime_compare(ktime_get(), timeout) > 0)
> +                     return -ETIMEDOUT;
> +
> +             if (wfe) {
> +                     wfe();
> +             } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
> +                     cpu_relax();
> +                     continue;
> +             } else {
> +                     udelay(delay);
> +                     delay *= 2;
> +                     spin_cnt = 0;
> +             }
>       }
>  
>       return 0;
> @@ -867,6 +825,17 @@ static void queue_write(__le64 *dst, u64 *src, size_t 
> n_dwords)
>               *dst++ = cpu_to_le64(*src++);
>  }
>  
> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
> +{
> +     if (queue_full(&q->llq))
> +             return -ENOSPC;
> +
> +     queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
> +     queue_inc_prod(&q->llq);
> +     queue_sync_prod_out(q);
> +     return 0;
> +}
> +
>  static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
>  {
>       int i;
> @@ -916,22 +885,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
> arm_smmu_cmdq_ent *ent)
>               cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31);
>               break;
>       case CMDQ_OP_TLBI_NH_VA:
> -             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> -             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
>               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid);
>               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> -             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> -             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>               cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK;
>               break;
>       case CMDQ_OP_TLBI_S2_IPA:
> -             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num);
> -             cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale);
>               cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid);
>               cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf);
> -             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl);
> -             cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg);
>               cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK;
>               break;
>       case CMDQ_OP_TLBI_NH_ASID:
> @@ -964,14 +925,20 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
> arm_smmu_cmdq_ent *ent)
>               cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
>               break;
>       case CMDQ_OP_CMD_SYNC:
> -             if (ent->sync.msiaddr) {
> +             if (ent->sync.msiaddr)
>                       cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, 
> CMDQ_SYNC_0_CS_IRQ);
> -                     cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
> -             } else {
> +             else
>                       cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, 
> CMDQ_SYNC_0_CS_SEV);
> -             }
>               cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
>               cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, 
> ARM_SMMU_MEMATTR_OIWB);
> +             /*
> +              * Commands are written little-endian, but we want the SMMU to
> +              * receive MSIData, and thus write it back to memory, in CPU
> +              * byte order, so big-endian needs an extra byteswap here.
> +              */
> +             cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
> +                                  cpu_to_le32(ent->sync.msidata));
> +             cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
>               break;
>       default:
>               return -ENOENT;
> @@ -980,27 +947,6 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
> arm_smmu_cmdq_ent *ent)
>       return 0;
>  }
>  
> -static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device 
> *smmu,
> -                                      u32 prod)
> -{
> -     struct arm_smmu_queue *q = &smmu->cmdq.q;
> -     struct arm_smmu_cmdq_ent ent = {
> -             .opcode = CMDQ_OP_CMD_SYNC,
> -     };
> -
> -     /*
> -      * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
> -      * payload, so the write will zero the entire command on that platform.
> -      */
> -     if (smmu->features & ARM_SMMU_FEAT_MSI &&
> -         smmu->features & ARM_SMMU_FEAT_COHERENCY) {
> -             ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
> -                                q->ent_dwords * 8;
> -     }
> -
> -     arm_smmu_cmdq_build_cmd(cmd, &ent);
> -}
> -
>  static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
>  {
>       static const char *cerror_str[] = {
> @@ -1059,474 +1005,109 @@ static void arm_smmu_cmdq_skip_err(struct 
> arm_smmu_device *smmu)
>       queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
>  }
>  
> -/*
> - * Command queue locking.
> - * This is a form of bastardised rwlock with the following major changes:
> - *
> - * - The only LOCK routines are exclusive_trylock() and shared_lock().
> - *   Neither have barrier semantics, and instead provide only a control
> - *   dependency.
> - *
> - * - The UNLOCK routines are supplemented with shared_tryunlock(), which
> - *   fails if the caller appears to be the last lock holder (yes, this is
> - *   racy). All successful UNLOCK routines have RELEASE semantics.
> - */
> -static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
> -{
> -     int val;
> -
> -     /*
> -      * We can try to avoid the cmpxchg() loop by simply incrementing the
> -      * lock counter. When held in exclusive state, the lock counter is set
> -      * to INT_MIN so these increments won't hurt as the value will remain
> -      * negative.
> -      */
> -     if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
> -             return;
> -
> -     do {
> -             val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
> -     } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
> -}
> -
> -static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
> -{
> -     (void)atomic_dec_return_release(&cmdq->lock);
> -}
> -
> -static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
> +static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
>  {
> -     if (atomic_read(&cmdq->lock) == 1)
> -             return false;
> -
> -     arm_smmu_cmdq_shared_unlock(cmdq);
> -     return true;
> -}
> -
> -#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)         \
> -({                                                                   \
> -     bool __ret;                                                     \
> -     local_irq_save(flags);                                          \
> -     __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN);       \
> -     if (!__ret)                                                     \
> -             local_irq_restore(flags);                               \
> -     __ret;                                                          \
> -})
> -
> -#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags)               
> \
> -({                                                                   \
> -     atomic_set_release(&cmdq->lock, 0);                             \
> -     local_irq_restore(flags);                                       \
> -})
> -
> -
> -/*
> - * Command queue insertion.
> - * This is made fiddly by our attempts to achieve some sort of scalability
> - * since there is one queue shared amongst all of the CPUs in the system.  If
> - * you like mixed-size concurrency, dependency ordering and relaxed atomics,
> - * then you'll *love* this monstrosity.
> - *
> - * The basic idea is to split the queue up into ranges of commands that are
> - * owned by a given CPU; the owner may not have written all of the commands
> - * itself, but is responsible for advancing the hardware prod pointer when
> - * the time comes. The algorithm is roughly:
> - *
> - *   1. Allocate some space in the queue. At this point we also discover
> - *      whether the head of the queue is currently owned by another CPU,
> - *      or whether we are the owner.
> - *
> - *   2. Write our commands into our allocated slots in the queue.
> - *
> - *   3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
> - *
> - *   4. If we are an owner:
> - *           a. Wait for the previous owner to finish.
> - *           b. Mark the queue head as unowned, which tells us the range
> - *              that we are responsible for publishing.
> - *           c. Wait for all commands in our owned range to become valid.
> - *           d. Advance the hardware prod pointer.
> - *           e. Tell the next owner we've finished.
> - *
> - *   5. If we are inserting a CMD_SYNC (we may or may not have been an
> - *      owner), then we need to stick around until it has completed:
> - *           a. If we have MSIs, the SMMU can write back into the CMD_SYNC
> - *              to clear the first 4 bytes.
> - *           b. Otherwise, we spin waiting for the hardware cons pointer to
> - *              advance past our command.
> - *
> - * The devil is in the details, particularly the use of locking for handling
> - * SYNC completion and freeing up space in the queue before we think that it 
> is
> - * full.
> - */
> -static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
> -                                            u32 sprod, u32 eprod, bool set)
> -{
> -     u32 swidx, sbidx, ewidx, ebidx;
> -     struct arm_smmu_ll_queue llq = {
> -             .max_n_shift    = cmdq->q.llq.max_n_shift,
> -             .prod           = sprod,
> -     };
> -
> -     ewidx = BIT_WORD(Q_IDX(&llq, eprod));
> -     ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
> -
> -     while (llq.prod != eprod) {
> -             unsigned long mask;
> -             atomic_long_t *ptr;
> -             u32 limit = BITS_PER_LONG;
> -
> -             swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
> -             sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
> -
> -             ptr = &cmdq->valid_map[swidx];
> -
> -             if ((swidx == ewidx) && (sbidx < ebidx))
> -                     limit = ebidx;
> -
> -             mask = GENMASK(limit - 1, sbidx);
> -
> -             /*
> -              * The valid bit is the inverse of the wrap bit. This means
> -              * that a zero-initialised queue is invalid and, after marking
> -              * all entries as valid, they become invalid again when we
> -              * wrap.
> -              */
> -             if (set) {
> -                     atomic_long_xor(mask, ptr);
> -             } else { /* Poll */
> -                     unsigned long valid;
> +     struct arm_smmu_queue *q = &smmu->cmdq.q;
> +     bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>  
> -                     valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
> -                     atomic_long_cond_read_relaxed(ptr, (VAL & mask) == 
> valid);
> -             }
> +     smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
>  
> -             llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
> +     while (queue_insert_raw(q, cmd) == -ENOSPC) {
> +             if (queue_poll_cons(q, false, wfe))
> +                     dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>       }
>  }
>  
> -/* Mark all entries in the range [sprod, eprod) as valid */
> -static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
> -                                     u32 sprod, u32 eprod)
> -{
> -     __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
> -}
> -
> -/* Wait for all entries in the range [sprod, eprod) to become valid */
> -static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
> -                                      u32 sprod, u32 eprod)
> -{
> -     __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
> -}
> -
> -/* Wait for the command queue to become non-full */
> -static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
> -                                          struct arm_smmu_ll_queue *llq)
> +static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> +                                 struct arm_smmu_cmdq_ent *ent)
>  {
> +     u64 cmd[CMDQ_ENT_DWORDS];
>       unsigned long flags;
> -     struct arm_smmu_queue_poll qp;
> -     struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
> -     int ret = 0;
>  
> -     /*
> -      * Try to update our copy of cons by grabbing exclusive cmdq access. If
> -      * that fails, spin until somebody else updates it for us.
> -      */
> -     if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
> -             WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
> -             arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
> -             llq->val = READ_ONCE(cmdq->q.llq.val);
> -             return 0;
> +     if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
> +             dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
> +                      ent->opcode);
> +             return;
>       }
>  
> -     queue_poll_init(smmu, &qp);
> -     do {
> -             llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
> -             if (!queue_full(llq))
> -                     break;
> -
> -             ret = queue_poll(&qp);
> -     } while (!ret);
> -
> -     return ret;
> +     spin_lock_irqsave(&smmu->cmdq.lock, flags);
> +     arm_smmu_cmdq_insert_cmd(smmu, cmd);
> +     spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  }
>  
>  /*
> - * Wait until the SMMU signals a CMD_SYNC completion MSI.
> - * Must be called with the cmdq lock held in some capacity.
> + * The difference between val and sync_idx is bounded by the maximum size of
> + * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
>   */
> -static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
> -                                       struct arm_smmu_ll_queue *llq)
> -{
> -     int ret = 0;
> -     struct arm_smmu_queue_poll qp;
> -     struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
> -     u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
> -
> -     queue_poll_init(smmu, &qp);
> -
> -     /*
> -      * The MSI won't generate an event, since it's being written back
> -      * into the command queue.
> -      */
> -     qp.wfe = false;
> -     smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
> -     llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
> -     return ret;
> -}
> -
> -/*
> - * Wait until the SMMU cons index passes llq->prod.
> - * Must be called with the cmdq lock held in some capacity.
> - */
> -static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
> -                                            struct arm_smmu_ll_queue *llq)
> -{
> -     struct arm_smmu_queue_poll qp;
> -     struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
> -     u32 prod = llq->prod;
> -     int ret = 0;
> -
> -     queue_poll_init(smmu, &qp);
> -     llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
> -     do {
> -             if (queue_consumed(llq, prod))
> -                     break;
> -
> -             ret = queue_poll(&qp);
> -
> -             /*
> -              * This needs to be a readl() so that our subsequent call
> -              * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
> -              *
> -              * Specifically, we need to ensure that we observe all
> -              * shared_lock()s by other CMD_SYNCs that share our owner,
> -              * so that a failing call to tryunlock() means that we're
> -              * the last one out and therefore we can safely advance
> -              * cmdq->q.llq.cons. Roughly speaking:
> -              *
> -              * CPU 0                CPU1                    CPU2 (us)
> -              *
> -              * if (sync)
> -              *      shared_lock();
> -              *
> -              * dma_wmb();
> -              * set_valid_map();
> -              *
> -              *                      if (owner) {
> -              *                              poll_valid_map();
> -              *                              <control dependency>
> -              *                              writel(prod_reg);
> -              *
> -              *                                              readl(cons_reg);
> -              *                                              tryunlock();
> -              *
> -              * Requires us to see CPU 0's shared_lock() acquisition.
> -              */
> -             llq->cons = readl(cmdq->q.cons_reg);
> -     } while (!ret);
> -
> -     return ret;
> -}
> -
> -static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
> -                                      struct arm_smmu_ll_queue *llq)
> +static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 
> sync_idx)
>  {
> -     if (smmu->features & ARM_SMMU_FEAT_MSI &&
> -         smmu->features & ARM_SMMU_FEAT_COHERENCY)
> -             return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
> -
> -     return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
> -}
> -
> -static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 
> *cmds,
> -                                     u32 prod, int n)
> -{
> -     int i;
> -     struct arm_smmu_ll_queue llq = {
> -             .max_n_shift    = cmdq->q.llq.max_n_shift,
> -             .prod           = prod,
> -     };
> +     ktime_t timeout;
> +     u32 val;
>  
> -     for (i = 0; i < n; ++i) {
> -             u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
> +     timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
> +     val = smp_cond_load_acquire(&smmu->sync_count,
> +                                 (int)(VAL - sync_idx) >= 0 ||
> +                                 !ktime_before(ktime_get(), timeout));
>  
> -             prod = queue_inc_prod_n(&llq, i);
> -             queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
> -     }
> +     return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
>  }
>  
> -/*
> - * This is the actual insertion function, and provides the following
> - * ordering guarantees to callers:
> - *
> - * - There is a dma_wmb() before publishing any commands to the queue.
> - *   This can be relied upon to order prior writes to data structures
> - *   in memory (such as a CD or an STE) before the command.
> - *
> - * - On completion of a CMD_SYNC, there is a control dependency.
> - *   This can be relied upon to order subsequent writes to memory (e.g.
> - *   freeing an IOVA) after completion of the CMD_SYNC.
> - *
> - * - Command insertion is totally ordered, so if two CPUs each race to
> - *   insert their own list of commands then all of the commands from one
> - *   CPU will appear before any of the commands from the other CPU.
> - */
> -static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
> -                                    u64 *cmds, int n, bool sync)
> +static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>  {
> -     u64 cmd_sync[CMDQ_ENT_DWORDS];
> -     u32 prod;
> +     u64 cmd[CMDQ_ENT_DWORDS];
>       unsigned long flags;
> -     bool owner;
> -     struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
> -     struct arm_smmu_ll_queue llq = {
> -             .max_n_shift = cmdq->q.llq.max_n_shift,
> -     }, head = llq;
> -     int ret = 0;
> -
> -     /* 1. Allocate some space in the queue */
> -     local_irq_save(flags);
> -     llq.val = READ_ONCE(cmdq->q.llq.val);
> -     do {
> -             u64 old;
> -
> -             while (!queue_has_space(&llq, n + sync)) {
> -                     local_irq_restore(flags);
> -                     if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
> -                             dev_err_ratelimited(smmu->dev, "CMDQ 
> timeout\n");
> -                     local_irq_save(flags);
> -             }
> -
> -             head.cons = llq.cons;
> -             head.prod = queue_inc_prod_n(&llq, n + sync) |
> -                                          CMDQ_PROD_OWNED_FLAG;
> -
> -             old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
> -             if (old == llq.val)
> -                     break;
> -
> -             llq.val = old;
> -     } while (1);
> -     owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
> -     head.prod &= ~CMDQ_PROD_OWNED_FLAG;
> -     llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
> -
> -     /*
> -      * 2. Write our commands into the queue
> -      * Dependency ordering from the cmpxchg() loop above.
> -      */
> -     arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
> -     if (sync) {
> -             prod = queue_inc_prod_n(&llq, n);
> -             arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
> -             queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
> -
> -             /*
> -              * In order to determine completion of our CMD_SYNC, we must
> -              * ensure that the queue can't wrap twice without us noticing.
> -              * We achieve that by taking the cmdq lock as shared before
> -              * marking our slot as valid.
> -              */
> -             arm_smmu_cmdq_shared_lock(cmdq);
> -     }
> -
> -     /* 3. Mark our slots as valid, ensuring commands are visible first */
> -     dma_wmb();
> -     arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
> -
> -     /* 4. If we are the owner, take control of the SMMU hardware */
> -     if (owner) {
> -             /* a. Wait for previous owner to finish */
> -             atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
> -
> -             /* b. Stop gathering work by clearing the owned flag */
> -             prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
> -                                                &cmdq->q.llq.atomic.prod);
> -             prod &= ~CMDQ_PROD_OWNED_FLAG;
> -
> -             /*
> -              * c. Wait for any gathered work to be written to the queue.
> -              * Note that we read our own entries so that we have the control
> -              * dependency required by (d).
> -              */
> -             arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
> +     struct arm_smmu_cmdq_ent ent = {
> +             .opcode = CMDQ_OP_CMD_SYNC,
> +             .sync   = {
> +                     .msiaddr = virt_to_phys(&smmu->sync_count),
> +             },
> +     };
>  
> -             /*
> -              * d. Advance the hardware prod pointer
> -              * Control dependency ordering from the entries becoming valid.
> -              */
> -             writel_relaxed(prod, cmdq->q.prod_reg);
> +     spin_lock_irqsave(&smmu->cmdq.lock, flags);
>  
> -             /*
> -              * e. Tell the next owner we're done
> -              * Make sure we've updated the hardware first, so that we don't
> -              * race to update prod and potentially move it backwards.
> -              */
> -             atomic_set_release(&cmdq->owner_prod, prod);
> +     /* Piggy-back on the previous command if it's a SYNC */
> +     if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
> +             ent.sync.msidata = smmu->sync_nr;
> +     } else {
> +             ent.sync.msidata = ++smmu->sync_nr;
> +             arm_smmu_cmdq_build_cmd(cmd, &ent);
> +             arm_smmu_cmdq_insert_cmd(smmu, cmd);
>       }
>  
> -     /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
> -     if (sync) {
> -             llq.prod = queue_inc_prod_n(&llq, n);
> -             ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
> -             if (ret) {
> -                     dev_err_ratelimited(smmu->dev,
> -                                         "CMD_SYNC timeout at 0x%08x [hwprod 
> 0x%08x, hwcons 0x%08x]\n",
> -                                         llq.prod,
> -                                         readl_relaxed(cmdq->q.prod_reg),
> -                                         readl_relaxed(cmdq->q.cons_reg));
> -             }
> +     spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  
> -             /*
> -              * Try to unlock the cmq lock. This will fail if we're the last
> -              * reader, in which case we can safely update cmdq->q.llq.cons
> -              */
> -             if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
> -                     WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
> -                     arm_smmu_cmdq_shared_unlock(cmdq);
> -             }
> -     }
> -
> -     local_irq_restore(flags);
> -     return ret;
> +     return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
>  }
>  
> -static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
> -                                struct arm_smmu_cmdq_ent *ent)
> +static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>       u64 cmd[CMDQ_ENT_DWORDS];
> +     unsigned long flags;
> +     bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> +     struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
> +     int ret;
>  
> -     if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
> -             dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
> -                      ent->opcode);
> -             return -EINVAL;
> -     }
> +     arm_smmu_cmdq_build_cmd(cmd, &ent);
>  
> -     return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
> -}
> +     spin_lock_irqsave(&smmu->cmdq.lock, flags);
> +     arm_smmu_cmdq_insert_cmd(smmu, cmd);
> +     ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
> +     spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  
> -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
> -{
> -     return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
> +     return ret;
>  }
>  
> -static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
> -                                 struct arm_smmu_cmdq_batch *cmds,
> -                                 struct arm_smmu_cmdq_ent *cmd)
> +static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
> -     if (cmds->num == CMDQ_BATCH_ENTRIES) {
> -             arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
> -             cmds->num = 0;
> -     }
> -     arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
> -     cmds->num++;
> -}
> +     int ret;
> +     bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> +                (smmu->features & ARM_SMMU_FEAT_COHERENCY);
>  
> -static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
> -                                   struct arm_smmu_cmdq_batch *cmds)
> -{
> -     return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
> +     ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
> +               : __arm_smmu_cmdq_issue_sync(smmu);
> +     if (ret)
> +             dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
> +     return ret;
>  }
>  
>  /* Context descriptor manipulation functions */
> @@ -1536,7 +1117,6 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain 
> *smmu_domain,
>       size_t i;
>       unsigned long flags;
>       struct arm_smmu_master *master;
> -     struct arm_smmu_cmdq_batch cmds = {};
>       struct arm_smmu_device *smmu = smmu_domain->smmu;
>       struct arm_smmu_cmdq_ent cmd = {
>               .opcode = CMDQ_OP_CFGI_CD,
> @@ -1550,12 +1130,12 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain 
> *smmu_domain,
>       list_for_each_entry(master, &smmu_domain->devices, domain_head) {
>               for (i = 0; i < master->num_sids; i++) {
>                       cmd.cfgi.sid = master->sids[i];
> -                     arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
> +                     arm_smmu_cmdq_issue_cmd(smmu, &cmd);
>               }
>       }
>       spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>  
> -     arm_smmu_cmdq_batch_submit(smmu, &cmds);
> +     arm_smmu_cmdq_issue_sync(smmu);
>  }
>  
>  static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
> @@ -2190,16 +1770,17 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, 
> size_t size,
>       cmd->atc.size   = log2_span;
>  }
>  
> -static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
> +static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
> +                                struct arm_smmu_cmdq_ent *cmd)
>  {
>       int i;
> -     struct arm_smmu_cmdq_ent cmd;
>  
> -     arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
> +     if (!master->ats_enabled)
> +             return 0;
>  
>       for (i = 0; i < master->num_sids; i++) {
> -             cmd.atc.sid = master->sids[i];
> -             arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
> +             cmd->atc.sid = master->sids[i];
> +             arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
>       }
>  
>       return arm_smmu_cmdq_issue_sync(master->smmu);
> @@ -2208,11 +1789,10 @@ static int arm_smmu_atc_inv_master(struct 
> arm_smmu_master *master)
>  static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>                                  int ssid, unsigned long iova, size_t size)
>  {
> -     int i;
> +     int ret = 0;
>       unsigned long flags;
>       struct arm_smmu_cmdq_ent cmd;
>       struct arm_smmu_master *master;
> -     struct arm_smmu_cmdq_batch cmds = {};
>  
>       if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
>               return 0;
> @@ -2237,18 +1817,11 @@ static int arm_smmu_atc_inv_domain(struct 
> arm_smmu_domain *smmu_domain,
>       arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
>  
>       spin_lock_irqsave(&smmu_domain->devices_lock, flags);
> -     list_for_each_entry(master, &smmu_domain->devices, domain_head) {
> -             if (!master->ats_enabled)
> -                     continue;
> -
> -             for (i = 0; i < master->num_sids; i++) {
> -                     cmd.atc.sid = master->sids[i];
> -                     arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
> -             }
> -     }
> +     list_for_each_entry(master, &smmu_domain->devices, domain_head)
> +             ret |= arm_smmu_atc_inv_master(master, &cmd);
>       spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>  
> -     return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
> +     return ret ? -ETIMEDOUT : 0;
>  }
>  
>  /* IO_PGTABLE API */
> @@ -2270,26 +1843,23 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>       /*
>        * NOTE: when io-pgtable is in non-strict mode, we may get here with
>        * PTEs previously cleared by unmaps on the current CPU not yet visible
> -      * to the SMMU. We are relying on the dma_wmb() implicit during cmd
> -      * insertion to guarantee those are observed before the TLBI. Do be
> -      * careful, 007.
> +      * to the SMMU. We are relying on the DSB implicit in
> +      * queue_sync_prod_out() to guarantee those are observed before the
> +      * TLBI. Do be careful, 007.
>        */
>       arm_smmu_cmdq_issue_cmd(smmu, &cmd);
>       arm_smmu_cmdq_issue_sync(smmu);
> -     arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
>  }
>  
> -static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
> -                                size_t granule, bool leaf,
> -                                struct arm_smmu_domain *smmu_domain)
> +static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
> +                                       size_t granule, bool leaf, void 
> *cookie)
>  {
> +     struct arm_smmu_domain *smmu_domain = cookie;
>       struct arm_smmu_device *smmu = smmu_domain->smmu;
> -     unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
> -     size_t inv_range = granule;
> -     struct arm_smmu_cmdq_batch cmds = {};
>       struct arm_smmu_cmdq_ent cmd = {
>               .tlbi = {
>                       .leaf   = leaf,
> +                     .addr   = iova,
>               },
>       };
>  
> @@ -2304,78 +1874,37 @@ static void arm_smmu_tlb_inv_range(unsigned long 
> iova, size_t size,
>               cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
>       }
>  
> -     if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> -             /* Get the leaf page size */
> -             tg = __ffs(smmu_domain->domain.pgsize_bitmap);
> -
> -             /* Convert page size of 12,14,16 (log2) to 1,2,3 */
> -             cmd.tlbi.tg = (tg - 10) / 2;
> -
> -             /* Determine what level the granule is at */
> -             cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
> -
> -             num_pages = size >> tg;
> -     }
> -
> -     while (iova < end) {
> -             if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
> -                     /*
> -                      * On each iteration of the loop, the range is 5 bits
> -                      * worth of the aligned size remaining.
> -                      * The range in pages is:
> -                      *
> -                      * range = (num_pages & (0x1f << __ffs(num_pages)))
> -                      */
> -                     unsigned long scale, num;
> -
> -                     /* Determine the power of 2 multiple number of pages */
> -                     scale = __ffs(num_pages);
> -                     cmd.tlbi.scale = scale;
> -
> -                     /* Determine how many chunks of 2^scale size we have */
> -                     num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
> -                     cmd.tlbi.num = num - 1;
> -
> -                     /* range is num * 2^scale * pgsize */
> -                     inv_range = num << (scale + tg);
> -
> -                     /* Clear out the lower order bits for the next 
> iteration */
> -                     num_pages -= num << scale;
> -             }
> -
> -             cmd.tlbi.addr = iova;
> -             arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
> -             iova += inv_range;
> -     }
> -     arm_smmu_cmdq_batch_submit(smmu, &cmds);
> -
> -     /*
> -      * Unfortunately, this can't be leaf-only since we may have
> -      * zapped an entire table.
> -      */
> -     arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
> +     do {
> +             arm_smmu_cmdq_issue_cmd(smmu, &cmd);
> +             cmd.tlbi.addr += granule;
> +     } while (size -= granule);
>  }
>  
>  static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
>                                        unsigned long iova, size_t granule,
>                                        void *cookie)
>  {
> -     struct arm_smmu_domain *smmu_domain = cookie;
> -     struct iommu_domain *domain = &smmu_domain->domain;
> -
> -     iommu_iotlb_gather_add_page(domain, gather, iova, granule);
> +     arm_smmu_tlb_inv_range_nosync(iova, granule, granule, true, cookie);
>  }
>  
>  static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
>                                 size_t granule, void *cookie)
>  {
> -     arm_smmu_tlb_inv_range(iova, size, granule, false, cookie);
> +     struct arm_smmu_domain *smmu_domain = cookie;
> +     struct arm_smmu_device *smmu = smmu_domain->smmu;
> +
> +     arm_smmu_tlb_inv_range_nosync(iova, size, granule, false, cookie);
> +     arm_smmu_cmdq_issue_sync(smmu);
>  }
>  
>  static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
>                                 size_t granule, void *cookie)
>  {
> -     arm_smmu_tlb_inv_range(iova, size, granule, true, cookie);
> +     struct arm_smmu_domain *smmu_domain = cookie;
> +     struct arm_smmu_device *smmu = smmu_domain->smmu;
> +
> +     arm_smmu_tlb_inv_range_nosync(iova, size, granule, true, cookie);
> +     arm_smmu_cmdq_issue_sync(smmu);
>  }
>  
>  static const struct iommu_flush_ops arm_smmu_flush_ops = {
> @@ -2701,6 +2230,7 @@ static void arm_smmu_enable_ats(struct arm_smmu_master 
> *master)
>  
>  static void arm_smmu_disable_ats(struct arm_smmu_master *master)
>  {
> +     struct arm_smmu_cmdq_ent cmd;
>       struct arm_smmu_domain *smmu_domain = master->domain;
>  
>       if (!master->ats_enabled)
> @@ -2712,7 +2242,8 @@ static void arm_smmu_disable_ats(struct arm_smmu_master 
> *master)
>        * ATC invalidation via the SMMU.
>        */
>       wmb();
> -     arm_smmu_atc_inv_master(master);
> +     arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
> +     arm_smmu_atc_inv_master(master, &cmd);
>       atomic_dec(&smmu_domain->nr_ats_masters);
>  }
>  
> @@ -2856,13 +2387,18 @@ static int arm_smmu_map(struct iommu_domain *domain, 
> unsigned long iova,
>  static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>                            size_t size, struct iommu_iotlb_gather *gather)
>  {
> +     int ret;
>       struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>       struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>  
>       if (!ops)
>               return 0;
>  
> -     return ops->unmap(ops, iova, size, gather);
> +     ret = ops->unmap(ops, iova, size, gather);
> +     if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size))
> +             return 0;
> +
> +     return ret;
>  }
>  
>  static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain)
> @@ -2876,10 +2412,10 @@ static void arm_smmu_flush_iotlb_all(struct 
> iommu_domain *domain)
>  static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
>                               struct iommu_iotlb_gather *gather)
>  {
> -     struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> +     struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
>  
> -     arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start,
> -                            gather->pgsize, true, smmu_domain);
> +     if (smmu)
> +             arm_smmu_cmdq_issue_sync(smmu);
>  }
>  
>  static phys_addr_t
> @@ -3177,49 +2713,18 @@ static int arm_smmu_init_one_queue(struct 
> arm_smmu_device *smmu,
>       return 0;
>  }
>  
> -static void arm_smmu_cmdq_free_bitmap(void *data)
> -{
> -     unsigned long *bitmap = data;
> -     bitmap_free(bitmap);
> -}
> -
> -static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
> -{
> -     int ret = 0;
> -     struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
> -     unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
> -     atomic_long_t *bitmap;
> -
> -     atomic_set(&cmdq->owner_prod, 0);
> -     atomic_set(&cmdq->lock, 0);
> -
> -     bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
> -     if (!bitmap) {
> -             dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
> -             ret = -ENOMEM;
> -     } else {
> -             cmdq->valid_map = bitmap;
> -             devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
> -     }
> -
> -     return ret;
> -}
> -
>  static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
>  {
>       int ret;
>  
>       /* cmdq */
> +     spin_lock_init(&smmu->cmdq.lock);
>       ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
>                                     ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
>                                     "cmdq");
>       if (ret)
>               return ret;
>  
> -     ret = arm_smmu_cmdq_init(smmu);
> -     if (ret)
> -             return ret;
> -
>       /* evtq */
>       ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
>                                     ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
> @@ -3800,15 +3305,9 @@ static int arm_smmu_device_hw_probe(struct 
> arm_smmu_device *smmu)
>       /* Queue sizes, capped to ensure natural alignment */
>       smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
>                                            FIELD_GET(IDR1_CMDQS, reg));
> -     if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) {
> -             /*
> -              * We don't support splitting up batches, so one batch of
> -              * commands plus an extra sync needs to fit inside the command
> -              * queue. There's also no way we can handle the weird alignment
> -              * restrictions on the base pointer for a unit-length queue.
> -              */
> -             dev_err(smmu->dev, "command queue size <= %d entries not 
> supported\n",
> -                     CMDQ_BATCH_ENTRIES);
> +     if (!smmu->cmdq.q.llq.max_n_shift) {
> +             /* Odd alignment restrictions on the base, so ignore for now */
> +             dev_err(smmu->dev, "unit-length command queue not supported\n");
>               return -ENXIO;
>       }
>  
> @@ -3828,11 +3327,6 @@ static int arm_smmu_device_hw_probe(struct 
> arm_smmu_device *smmu)
>       if (smmu->sid_bits <= STRTAB_SPLIT)
>               smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB;
>  
> -     /* IDR3 */
> -     reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3);
> -     if (FIELD_GET(IDR3_RIL, reg))
> -             smmu->features |= ARM_SMMU_FEAT_RANGE_INV;
> -
>       /* IDR5 */
>       reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5);
>  
> -- 
> 2.17.1
> 
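
One related detail worth noting, abridged from the hunks above: when the
SMMU cannot signal CMD_SYNC completion via an MSI write-back, the
restored __arm_smmu_cmdq_issue_sync() waits for the queue to drain while
still holding the command-queue spinlock, so every other CPU's insertion
stalls behind the sync:

    static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
    {
        u64 cmd[CMDQ_ENT_DWORDS];
        unsigned long flags;
        bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
        struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
        int ret;

        arm_smmu_cmdq_build_cmd(cmd, &ent);

        spin_lock_irqsave(&smmu->cmdq.lock, flags);
        arm_smmu_cmdq_insert_cmd(smmu, cmd);
        /* Wait under the lock until the SMMU has consumed the queue. */
        ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
        spin_unlock_irqrestore(&smmu->cmdq.lock, flags);

        return ret;
    }

That seems acceptable as an interim state; once XEN gains the required
atomic primitives the lock-free insertion path can be reinstated.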
