On Thu, 10 Dec 2020, Rahul Singh wrote:
> Linux SMMUv3 code implements the command-queue insertion based on
> atomic operations implemented in Linux. The atomic functions used by
> the command-queue insertion are not implemented in XEN, therefore
> revert the patch that implemented the command-queue insertion based
> on atomic operations.
>
> Also revert the other patches that are implemented based on the code
> that introduced the atomic operations.
>
> Atomic operations were introduced in the patch "iommu/arm-smmu-v3:
> Reduce contention during command-queue insertion", which fixed the
> bottleneck of the SMMU command queue insertion operation. That patch
> introduced a new algorithm for inserting commands into the queue,
> which is lock-free on the fast path.
>
> The consequence of reverting the patch is that command queue insertion
> will be slow for large systems, as a spinlock will be used to serialize
> accesses from all CPUs to the single queue supported by the hardware.
>
> Once the proper atomic operations are available in XEN, the driver
> can be updated.
>
> The following commits are reverted in this patch:
> 1. "iommu/arm-smmu-v3: Add SMMUv3.2 range invalidation support"
>    commit 6a481a95d4c198a2dd0a61f8877b92a375757db8.
> 2. "iommu/arm-smmu-v3: Batch ATC invalidation commands"
>    commit 9e773aee8c3e1b3ba019c5c7f8435aaa836c6130.
> 3. "iommu/arm-smmu-v3: Batch context descriptor invalidation"
>    commit edd0351e7bc49555d8b5ad8438a65a7ca262c9f0.
> 4. "iommu/arm-smmu-v3: Add command queue batching helpers"
>    commit 4ce8da453640147101bda418640394637c1a7cfc.
> 5. "iommu/arm-smmu-v3: Fix ATC invalidation ordering wrt main TLBs"
>    commit 353e3cf8590cf182a9f42e67993de3aca91e8090.
> 6. "iommu/arm-smmu-v3: Defer TLB invalidation until ->iotlb_sync()"
>    commit 2af2e72b18b499fa36d3f7379fd010ff25d2a984.
> 7. "iommu/arm-smmu-v3: Reduce contention during command-queue insertion"
>    commit 587e6c10a7ce89a5924fdbeff2ec524fbd6a124b.
>
> Signed-off-by: Rahul Singh <rahul.si...@arm.com>
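
For anyone skimming, the serialized slow path described above boils down to the
pattern below. This is a stand-alone C sketch with invented names (toy_cmdq,
toy_issue_cmd, TOY_SHIFT), not the driver code, but it shows why funnelling
every CPU through one lock in front of the single hardware queue is the
bottleneck that the reverted commit 587e6c10a7ce was written to remove:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define TOY_SHIFT    4                       /* 2^4 = 16 entries */
    #define TOY_IDX_MASK ((1u << TOY_SHIFT) - 1)

    struct toy_cmdq {
        uint64_t ent[1u << TOY_SHIFT][2];        /* 16-byte commands */
        uint32_t prod, cons;                     /* index | wrap-bit encoding */
        atomic_flag lock;                        /* stands in for spinlock_t */
    };

    static bool toy_queue_full(const struct toy_cmdq *q)
    {
        /* Full: same index, opposite wrap bit (what Q_IDX()/Q_WRP() encode). */
        return (q->prod & TOY_IDX_MASK) == (q->cons & TOY_IDX_MASK) &&
               ((q->prod ^ q->cons) & (1u << TOY_SHIFT));
    }

    /*
     * Every CPU funnels through the same lock, so insertions are fully
     * serialized.
     */
    static void toy_issue_cmd(struct toy_cmdq *q, const uint64_t cmd[2])
    {
        while (atomic_flag_test_and_set_explicit(&q->lock, memory_order_acquire))
            ;                                    /* spin_lock_irqsave() analogue */

        while (toy_queue_full(q))
            ;                 /* the real driver polls the hardware CONS register */

        q->ent[q->prod & TOY_IDX_MASK][0] = cmd[0];
        q->ent[q->prod & TOY_IDX_MASK][1] = cmd[1];
        q->prod = (q->prod + 1) & ((1u << (TOY_SHIFT + 1)) - 1);
        /* real driver: writel(q->llq.prod, q->prod_reg) publishes to the SMMU */

        atomic_flag_clear_explicit(&q->lock, memory_order_release);
    }

With the revert applied, arm_smmu_cmdq_issue_cmd() behaves like toy_issue_cmd()
above, with spin_lock_irqsave() on smmu->cmdq.lock playing the role of the
atomic_flag in the sketch.
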
Acked-by: Stefano Stabellini <sstabell...@kernel.org> > --- > Changes in v3: > - Added consequences of reverting this patch in commit message. > - List all the commits that are reverted in this patch in commit > message. > > --- > xen/drivers/passthrough/arm/smmu-v3.c | 878 ++++++-------------------- > 1 file changed, 186 insertions(+), 692 deletions(-) > > diff --git a/xen/drivers/passthrough/arm/smmu-v3.c > b/xen/drivers/passthrough/arm/smmu-v3.c > index f578677a5c..8b7747ed38 100644 > --- a/xen/drivers/passthrough/arm/smmu-v3.c > +++ b/xen/drivers/passthrough/arm/smmu-v3.c > @@ -69,9 +69,6 @@ > #define IDR1_SSIDSIZE GENMASK(10, 6) > #define IDR1_SIDSIZE GENMASK(5, 0) > > -#define ARM_SMMU_IDR3 0xc > -#define IDR3_RIL (1 << 10) > - > #define ARM_SMMU_IDR5 0x14 > #define IDR5_STALL_MAX GENMASK(31, 16) > #define IDR5_GRAN64K (1 << 6) > @@ -187,7 +184,7 @@ > > #define Q_IDX(llq, p) ((p) & ((1 << > (llq)->max_n_shift) - 1)) > #define Q_WRP(llq, p) ((p) & (1 << > (llq)->max_n_shift)) > -#define Q_OVERFLOW_FLAG (1U << 31) > +#define Q_OVERFLOW_FLAG (1 << 31) > #define Q_OVF(p) ((p) & Q_OVERFLOW_FLAG) > #define Q_ENT(q, p) ((q)->base + \ > Q_IDX(&((q)->llq), p) * \ > @@ -330,15 +327,6 @@ > #define CMDQ_ERR_CERROR_ABT_IDX 2 > #define CMDQ_ERR_CERROR_ATC_INV_IDX 3 > > -#define CMDQ_PROD_OWNED_FLAG Q_OVERFLOW_FLAG > - > -/* > - * This is used to size the command queue and therefore must be at least > - * BITS_PER_LONG so that the valid_map works correctly (it relies on the > - * total number of queue entries being a multiple of BITS_PER_LONG). > - */ > -#define CMDQ_BATCH_ENTRIES BITS_PER_LONG > - > #define CMDQ_0_OP GENMASK_ULL(7, 0) > #define CMDQ_0_SSV (1UL << 11) > > @@ -351,14 +339,9 @@ > #define CMDQ_CFGI_1_LEAF (1UL << 0) > #define CMDQ_CFGI_1_RANGE GENMASK_ULL(4, 0) > > -#define CMDQ_TLBI_0_NUM GENMASK_ULL(16, 12) > -#define CMDQ_TLBI_RANGE_NUM_MAX 31 > -#define CMDQ_TLBI_0_SCALE GENMASK_ULL(24, 20) > #define CMDQ_TLBI_0_VMID GENMASK_ULL(47, 32) > #define CMDQ_TLBI_0_ASID GENMASK_ULL(63, 48) > #define CMDQ_TLBI_1_LEAF (1UL << 0) > -#define CMDQ_TLBI_1_TTL GENMASK_ULL(9, 8) > -#define CMDQ_TLBI_1_TG GENMASK_ULL(11, 10) > #define CMDQ_TLBI_1_VA_MASK GENMASK_ULL(63, 12) > #define CMDQ_TLBI_1_IPA_MASK GENMASK_ULL(51, 12) > > @@ -407,8 +390,9 @@ > #define PRIQ_1_ADDR_MASK GENMASK_ULL(63, 12) > > /* High-level queue structures */ > -#define ARM_SMMU_POLL_TIMEOUT_US 1000000 /* 1s! */ > -#define ARM_SMMU_POLL_SPIN_COUNT 10 > +#define ARM_SMMU_POLL_TIMEOUT_US 100 > +#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US 1000000 /* 1s! 
*/ > +#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT 10 > > #define MSI_IOVA_BASE 0x8000000 > #define MSI_IOVA_LENGTH 0x100000 > @@ -483,13 +467,9 @@ struct arm_smmu_cmdq_ent { > #define CMDQ_OP_TLBI_S2_IPA 0x2a > #define CMDQ_OP_TLBI_NSNH_ALL 0x30 > struct { > - u8 num; > - u8 scale; > u16 asid; > u16 vmid; > bool leaf; > - u8 ttl; > - u8 tg; > u64 addr; > } tlbi; > > @@ -513,24 +493,15 @@ struct arm_smmu_cmdq_ent { > > #define CMDQ_OP_CMD_SYNC 0x46 > struct { > + u32 msidata; > u64 msiaddr; > } sync; > }; > }; > > struct arm_smmu_ll_queue { > - union { > - u64 val; > - struct { > - u32 prod; > - u32 cons; > - }; > - struct { > - atomic_t prod; > - atomic_t cons; > - } atomic; > - u8 __pad[SMP_CACHE_BYTES]; > - } ____cacheline_aligned_in_smp; > + u32 prod; > + u32 cons; > u32 max_n_shift; > }; > > @@ -548,23 +519,9 @@ struct arm_smmu_queue { > u32 __iomem *cons_reg; > }; > > -struct arm_smmu_queue_poll { > - ktime_t timeout; > - unsigned int delay; > - unsigned int spin_cnt; > - bool wfe; > -}; > - > struct arm_smmu_cmdq { > struct arm_smmu_queue q; > - atomic_long_t *valid_map; > - atomic_t owner_prod; > - atomic_t lock; > -}; > - > -struct arm_smmu_cmdq_batch { > - u64 cmds[CMDQ_BATCH_ENTRIES * > CMDQ_ENT_DWORDS]; > - int num; > + spinlock_t lock; > }; > > struct arm_smmu_evtq { > @@ -647,7 +604,6 @@ struct arm_smmu_device { > #define ARM_SMMU_FEAT_HYP (1 << 12) > #define ARM_SMMU_FEAT_STALL_FORCE (1 << 13) > #define ARM_SMMU_FEAT_VAX (1 << 14) > -#define ARM_SMMU_FEAT_RANGE_INV (1 << 15) > u32 features; > > #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) > @@ -660,6 +616,8 @@ struct arm_smmu_device { > > int gerr_irq; > int combined_irq; > + u32 sync_nr; > + u8 prev_cmd_opcode; > > unsigned long ias; /* IPA */ > unsigned long oas; /* PA */ > @@ -677,6 +635,12 @@ struct arm_smmu_device { > > struct arm_smmu_strtab_cfg strtab_cfg; > > + /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */ > + union { > + u32 sync_count; > + u64 padding; > + }; > + > /* IOMMU core code handle */ > struct iommu_device iommu; > }; > @@ -763,21 +727,6 @@ static void parse_driver_options(struct arm_smmu_device > *smmu) > } > > /* Low-level queue manipulation functions */ > -static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n) > -{ > - u32 space, prod, cons; > - > - prod = Q_IDX(q, q->prod); > - cons = Q_IDX(q, q->cons); > - > - if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons)) > - space = (1 << q->max_n_shift) - (prod - cons); > - else > - space = cons - prod; > - > - return space >= n; > -} > - > static bool queue_full(struct arm_smmu_ll_queue *q) > { > return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) && > @@ -790,12 +739,9 @@ static bool queue_empty(struct arm_smmu_ll_queue *q) > Q_WRP(q, q->prod) == Q_WRP(q, q->cons); > } > > -static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod) > +static void queue_sync_cons_in(struct arm_smmu_queue *q) > { > - return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) && > - (Q_IDX(q, q->cons) > Q_IDX(q, prod))) || > - ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) && > - (Q_IDX(q, q->cons) <= Q_IDX(q, prod))); > + q->llq.cons = readl_relaxed(q->cons_reg); > } > > static void queue_sync_cons_out(struct arm_smmu_queue *q) > @@ -826,34 +772,46 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q) > return ret; > } > > -static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n) > +static void queue_sync_prod_out(struct arm_smmu_queue *q) > { > - u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n; > - return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); > + 
writel(q->llq.prod, q->prod_reg); > } > > -static void queue_poll_init(struct arm_smmu_device *smmu, > - struct arm_smmu_queue_poll *qp) > +static void queue_inc_prod(struct arm_smmu_ll_queue *q) > { > - qp->delay = 1; > - qp->spin_cnt = 0; > - qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > - qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US); > + u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1; > + q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod); > } > > -static int queue_poll(struct arm_smmu_queue_poll *qp) > +/* > + * Wait for the SMMU to consume items. If sync is true, wait until the queue > + * is empty. Otherwise, wait until there is at least one free slot. > + */ > +static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe) > { > - if (ktime_compare(ktime_get(), qp->timeout) > 0) > - return -ETIMEDOUT; > + ktime_t timeout; > + unsigned int delay = 1, spin_cnt = 0; > > - if (qp->wfe) { > - wfe(); > - } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) { > - cpu_relax(); > - } else { > - udelay(qp->delay); > - qp->delay *= 2; > - qp->spin_cnt = 0; > + /* Wait longer if it's a CMD_SYNC */ > + timeout = ktime_add_us(ktime_get(), sync ? > + ARM_SMMU_CMDQ_SYNC_TIMEOUT_US : > + ARM_SMMU_POLL_TIMEOUT_US); > + > + while (queue_sync_cons_in(q), > + (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) { > + if (ktime_compare(ktime_get(), timeout) > 0) > + return -ETIMEDOUT; > + > + if (wfe) { > + wfe(); > + } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) { > + cpu_relax(); > + continue; > + } else { > + udelay(delay); > + delay *= 2; > + spin_cnt = 0; > + } > } > > return 0; > @@ -867,6 +825,17 @@ static void queue_write(__le64 *dst, u64 *src, size_t > n_dwords) > *dst++ = cpu_to_le64(*src++); > } > > +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent) > +{ > + if (queue_full(&q->llq)) > + return -ENOSPC; > + > + queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords); > + queue_inc_prod(&q->llq); > + queue_sync_prod_out(q); > + return 0; > +} > + > static void queue_read(__le64 *dst, u64 *src, size_t n_dwords) > { > int i; > @@ -916,22 +885,14 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct > arm_smmu_cmdq_ent *ent) > cmd[1] |= FIELD_PREP(CMDQ_CFGI_1_RANGE, 31); > break; > case CMDQ_OP_TLBI_NH_VA: > - cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num); > - cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale); > cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); > cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); > cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf); > - cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl); > - cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg); > cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_VA_MASK; > break; > case CMDQ_OP_TLBI_S2_IPA: > - cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_NUM, ent->tlbi.num); > - cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_SCALE, ent->tlbi.scale); > cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); > cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_LEAF, ent->tlbi.leaf); > - cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TTL, ent->tlbi.ttl); > - cmd[1] |= FIELD_PREP(CMDQ_TLBI_1_TG, ent->tlbi.tg); > cmd[1] |= ent->tlbi.addr & CMDQ_TLBI_1_IPA_MASK; > break; > case CMDQ_OP_TLBI_NH_ASID: > @@ -964,14 +925,20 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct > arm_smmu_cmdq_ent *ent) > cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp); > break; > case CMDQ_OP_CMD_SYNC: > - if (ent->sync.msiaddr) { > + if (ent->sync.msiaddr) > cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, > 
CMDQ_SYNC_0_CS_IRQ); > - cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; > - } else { > + else > cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, > CMDQ_SYNC_0_CS_SEV); > - } > cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH); > cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, > ARM_SMMU_MEMATTR_OIWB); > + /* > + * Commands are written little-endian, but we want the SMMU to > + * receive MSIData, and thus write it back to memory, in CPU > + * byte order, so big-endian needs an extra byteswap here. > + */ > + cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA, > + cpu_to_le32(ent->sync.msidata)); > + cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK; > break; > default: > return -ENOENT; > @@ -980,27 +947,6 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct > arm_smmu_cmdq_ent *ent) > return 0; > } > > -static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device > *smmu, > - u32 prod) > -{ > - struct arm_smmu_queue *q = &smmu->cmdq.q; > - struct arm_smmu_cmdq_ent ent = { > - .opcode = CMDQ_OP_CMD_SYNC, > - }; > - > - /* > - * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI > - * payload, so the write will zero the entire command on that platform. > - */ > - if (smmu->features & ARM_SMMU_FEAT_MSI && > - smmu->features & ARM_SMMU_FEAT_COHERENCY) { > - ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) * > - q->ent_dwords * 8; > - } > - > - arm_smmu_cmdq_build_cmd(cmd, &ent); > -} > - > static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) > { > static const char *cerror_str[] = { > @@ -1059,474 +1005,109 @@ static void arm_smmu_cmdq_skip_err(struct > arm_smmu_device *smmu) > queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); > } > > -/* > - * Command queue locking. > - * This is a form of bastardised rwlock with the following major changes: > - * > - * - The only LOCK routines are exclusive_trylock() and shared_lock(). > - * Neither have barrier semantics, and instead provide only a control > - * dependency. > - * > - * - The UNLOCK routines are supplemented with shared_tryunlock(), which > - * fails if the caller appears to be the last lock holder (yes, this is > - * racy). All successful UNLOCK routines have RELEASE semantics. > - */ > -static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq) > -{ > - int val; > - > - /* > - * We can try to avoid the cmpxchg() loop by simply incrementing the > - * lock counter. When held in exclusive state, the lock counter is set > - * to INT_MIN so these increments won't hurt as the value will remain > - * negative. 
> - */ > - if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0) > - return; > - > - do { > - val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0); > - } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val); > -} > - > -static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq) > -{ > - (void)atomic_dec_return_release(&cmdq->lock); > -} > - > -static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq) > +static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd) > { > - if (atomic_read(&cmdq->lock) == 1) > - return false; > - > - arm_smmu_cmdq_shared_unlock(cmdq); > - return true; > -} > - > -#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags) \ > -({ \ > - bool __ret; \ > - local_irq_save(flags); \ > - __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN); \ > - if (!__ret) \ > - local_irq_restore(flags); \ > - __ret; \ > -}) > - > -#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) > \ > -({ \ > - atomic_set_release(&cmdq->lock, 0); \ > - local_irq_restore(flags); \ > -}) > - > - > -/* > - * Command queue insertion. > - * This is made fiddly by our attempts to achieve some sort of scalability > - * since there is one queue shared amongst all of the CPUs in the system. If > - * you like mixed-size concurrency, dependency ordering and relaxed atomics, > - * then you'll *love* this monstrosity. > - * > - * The basic idea is to split the queue up into ranges of commands that are > - * owned by a given CPU; the owner may not have written all of the commands > - * itself, but is responsible for advancing the hardware prod pointer when > - * the time comes. The algorithm is roughly: > - * > - * 1. Allocate some space in the queue. At this point we also discover > - * whether the head of the queue is currently owned by another CPU, > - * or whether we are the owner. > - * > - * 2. Write our commands into our allocated slots in the queue. > - * > - * 3. Mark our slots as valid in arm_smmu_cmdq.valid_map. > - * > - * 4. If we are an owner: > - * a. Wait for the previous owner to finish. > - * b. Mark the queue head as unowned, which tells us the range > - * that we are responsible for publishing. > - * c. Wait for all commands in our owned range to become valid. > - * d. Advance the hardware prod pointer. > - * e. Tell the next owner we've finished. > - * > - * 5. If we are inserting a CMD_SYNC (we may or may not have been an > - * owner), then we need to stick around until it has completed: > - * a. If we have MSIs, the SMMU can write back into the CMD_SYNC > - * to clear the first 4 bytes. > - * b. Otherwise, we spin waiting for the hardware cons pointer to > - * advance past our command. > - * > - * The devil is in the details, particularly the use of locking for handling > - * SYNC completion and freeing up space in the queue before we think that it > is > - * full. 
> - */ > -static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq, > - u32 sprod, u32 eprod, bool set) > -{ > - u32 swidx, sbidx, ewidx, ebidx; > - struct arm_smmu_ll_queue llq = { > - .max_n_shift = cmdq->q.llq.max_n_shift, > - .prod = sprod, > - }; > - > - ewidx = BIT_WORD(Q_IDX(&llq, eprod)); > - ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG; > - > - while (llq.prod != eprod) { > - unsigned long mask; > - atomic_long_t *ptr; > - u32 limit = BITS_PER_LONG; > - > - swidx = BIT_WORD(Q_IDX(&llq, llq.prod)); > - sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG; > - > - ptr = &cmdq->valid_map[swidx]; > - > - if ((swidx == ewidx) && (sbidx < ebidx)) > - limit = ebidx; > - > - mask = GENMASK(limit - 1, sbidx); > - > - /* > - * The valid bit is the inverse of the wrap bit. This means > - * that a zero-initialised queue is invalid and, after marking > - * all entries as valid, they become invalid again when we > - * wrap. > - */ > - if (set) { > - atomic_long_xor(mask, ptr); > - } else { /* Poll */ > - unsigned long valid; > + struct arm_smmu_queue *q = &smmu->cmdq.q; > + bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > > - valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask; > - atomic_long_cond_read_relaxed(ptr, (VAL & mask) == > valid); > - } > + smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]); > > - llq.prod = queue_inc_prod_n(&llq, limit - sbidx); > + while (queue_insert_raw(q, cmd) == -ENOSPC) { > + if (queue_poll_cons(q, false, wfe)) > + dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); > } > } > > -/* Mark all entries in the range [sprod, eprod) as valid */ > -static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq, > - u32 sprod, u32 eprod) > -{ > - __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true); > -} > - > -/* Wait for all entries in the range [sprod, eprod) to become valid */ > -static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq, > - u32 sprod, u32 eprod) > -{ > - __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false); > -} > - > -/* Wait for the command queue to become non-full */ > -static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, > - struct arm_smmu_ll_queue *llq) > +static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, > + struct arm_smmu_cmdq_ent *ent) > { > + u64 cmd[CMDQ_ENT_DWORDS]; > unsigned long flags; > - struct arm_smmu_queue_poll qp; > - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; > - int ret = 0; > > - /* > - * Try to update our copy of cons by grabbing exclusive cmdq access. If > - * that fails, spin until somebody else updates it for us. > - */ > - if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) { > - WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg)); > - arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags); > - llq->val = READ_ONCE(cmdq->q.llq.val); > - return 0; > + if (arm_smmu_cmdq_build_cmd(cmd, ent)) { > + dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", > + ent->opcode); > + return; > } > > - queue_poll_init(smmu, &qp); > - do { > - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); > - if (!queue_full(llq)) > - break; > - > - ret = queue_poll(&qp); > - } while (!ret); > - > - return ret; > + spin_lock_irqsave(&smmu->cmdq.lock, flags); > + arm_smmu_cmdq_insert_cmd(smmu, cmd); > + spin_unlock_irqrestore(&smmu->cmdq.lock, flags); > } > > /* > - * Wait until the SMMU signals a CMD_SYNC completion MSI. > - * Must be called with the cmdq lock held in some capacity. 
> + * The difference between val and sync_idx is bounded by the maximum size of > + * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic. > */ > -static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, > - struct arm_smmu_ll_queue *llq) > -{ > - int ret = 0; > - struct arm_smmu_queue_poll qp; > - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; > - u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); > - > - queue_poll_init(smmu, &qp); > - > - /* > - * The MSI won't generate an event, since it's being written back > - * into the command queue. > - */ > - qp.wfe = false; > - smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp))); > - llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1); > - return ret; > -} > - > -/* > - * Wait until the SMMU cons index passes llq->prod. > - * Must be called with the cmdq lock held in some capacity. > - */ > -static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, > - struct arm_smmu_ll_queue *llq) > -{ > - struct arm_smmu_queue_poll qp; > - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; > - u32 prod = llq->prod; > - int ret = 0; > - > - queue_poll_init(smmu, &qp); > - llq->val = READ_ONCE(smmu->cmdq.q.llq.val); > - do { > - if (queue_consumed(llq, prod)) > - break; > - > - ret = queue_poll(&qp); > - > - /* > - * This needs to be a readl() so that our subsequent call > - * to arm_smmu_cmdq_shared_tryunlock() can fail accurately. > - * > - * Specifically, we need to ensure that we observe all > - * shared_lock()s by other CMD_SYNCs that share our owner, > - * so that a failing call to tryunlock() means that we're > - * the last one out and therefore we can safely advance > - * cmdq->q.llq.cons. Roughly speaking: > - * > - * CPU 0 CPU1 CPU2 (us) > - * > - * if (sync) > - * shared_lock(); > - * > - * dma_wmb(); > - * set_valid_map(); > - * > - * if (owner) { > - * poll_valid_map(); > - * <control dependency> > - * writel(prod_reg); > - * > - * readl(cons_reg); > - * tryunlock(); > - * > - * Requires us to see CPU 0's shared_lock() acquisition. > - */ > - llq->cons = readl(cmdq->q.cons_reg); > - } while (!ret); > - > - return ret; > -} > - > -static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, > - struct arm_smmu_ll_queue *llq) > +static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 > sync_idx) > { > - if (smmu->features & ARM_SMMU_FEAT_MSI && > - smmu->features & ARM_SMMU_FEAT_COHERENCY) > - return __arm_smmu_cmdq_poll_until_msi(smmu, llq); > - > - return __arm_smmu_cmdq_poll_until_consumed(smmu, llq); > -} > - > -static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 > *cmds, > - u32 prod, int n) > -{ > - int i; > - struct arm_smmu_ll_queue llq = { > - .max_n_shift = cmdq->q.llq.max_n_shift, > - .prod = prod, > - }; > + ktime_t timeout; > + u32 val; > > - for (i = 0; i < n; ++i) { > - u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS]; > + timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US); > + val = smp_cond_load_acquire(&smmu->sync_count, > + (int)(VAL - sync_idx) >= 0 || > + !ktime_before(ktime_get(), timeout)); > > - prod = queue_inc_prod_n(&llq, i); > - queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS); > - } > + return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0; > } > > -/* > - * This is the actual insertion function, and provides the following > - * ordering guarantees to callers: > - * > - * - There is a dma_wmb() before publishing any commands to the queue. 
> - * This can be relied upon to order prior writes to data structures > - * in memory (such as a CD or an STE) before the command. > - * > - * - On completion of a CMD_SYNC, there is a control dependency. > - * This can be relied upon to order subsequent writes to memory (e.g. > - * freeing an IOVA) after completion of the CMD_SYNC. > - * > - * - Command insertion is totally ordered, so if two CPUs each race to > - * insert their own list of commands then all of the commands from one > - * CPU will appear before any of the commands from the other CPU. > - */ > -static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, > - u64 *cmds, int n, bool sync) > +static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu) > { > - u64 cmd_sync[CMDQ_ENT_DWORDS]; > - u32 prod; > + u64 cmd[CMDQ_ENT_DWORDS]; > unsigned long flags; > - bool owner; > - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; > - struct arm_smmu_ll_queue llq = { > - .max_n_shift = cmdq->q.llq.max_n_shift, > - }, head = llq; > - int ret = 0; > - > - /* 1. Allocate some space in the queue */ > - local_irq_save(flags); > - llq.val = READ_ONCE(cmdq->q.llq.val); > - do { > - u64 old; > - > - while (!queue_has_space(&llq, n + sync)) { > - local_irq_restore(flags); > - if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) > - dev_err_ratelimited(smmu->dev, "CMDQ > timeout\n"); > - local_irq_save(flags); > - } > - > - head.cons = llq.cons; > - head.prod = queue_inc_prod_n(&llq, n + sync) | > - CMDQ_PROD_OWNED_FLAG; > - > - old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val); > - if (old == llq.val) > - break; > - > - llq.val = old; > - } while (1); > - owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG); > - head.prod &= ~CMDQ_PROD_OWNED_FLAG; > - llq.prod &= ~CMDQ_PROD_OWNED_FLAG; > - > - /* > - * 2. Write our commands into the queue > - * Dependency ordering from the cmpxchg() loop above. > - */ > - arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); > - if (sync) { > - prod = queue_inc_prod_n(&llq, n); > - arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod); > - queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); > - > - /* > - * In order to determine completion of our CMD_SYNC, we must > - * ensure that the queue can't wrap twice without us noticing. > - * We achieve that by taking the cmdq lock as shared before > - * marking our slot as valid. > - */ > - arm_smmu_cmdq_shared_lock(cmdq); > - } > - > - /* 3. Mark our slots as valid, ensuring commands are visible first */ > - dma_wmb(); > - arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod); > - > - /* 4. If we are the owner, take control of the SMMU hardware */ > - if (owner) { > - /* a. Wait for previous owner to finish */ > - atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod); > - > - /* b. Stop gathering work by clearing the owned flag */ > - prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG, > - &cmdq->q.llq.atomic.prod); > - prod &= ~CMDQ_PROD_OWNED_FLAG; > - > - /* > - * c. Wait for any gathered work to be written to the queue. > - * Note that we read our own entries so that we have the control > - * dependency required by (d). > - */ > - arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod); > + struct arm_smmu_cmdq_ent ent = { > + .opcode = CMDQ_OP_CMD_SYNC, > + .sync = { > + .msiaddr = virt_to_phys(&smmu->sync_count), > + }, > + }; > > - /* > - * d. Advance the hardware prod pointer > - * Control dependency ordering from the entries becoming valid. 
> - */ > - writel_relaxed(prod, cmdq->q.prod_reg); > + spin_lock_irqsave(&smmu->cmdq.lock, flags); > > - /* > - * e. Tell the next owner we're done > - * Make sure we've updated the hardware first, so that we don't > - * race to update prod and potentially move it backwards. > - */ > - atomic_set_release(&cmdq->owner_prod, prod); > + /* Piggy-back on the previous command if it's a SYNC */ > + if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) { > + ent.sync.msidata = smmu->sync_nr; > + } else { > + ent.sync.msidata = ++smmu->sync_nr; > + arm_smmu_cmdq_build_cmd(cmd, &ent); > + arm_smmu_cmdq_insert_cmd(smmu, cmd); > } > > - /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */ > - if (sync) { > - llq.prod = queue_inc_prod_n(&llq, n); > - ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); > - if (ret) { > - dev_err_ratelimited(smmu->dev, > - "CMD_SYNC timeout at 0x%08x [hwprod > 0x%08x, hwcons 0x%08x]\n", > - llq.prod, > - readl_relaxed(cmdq->q.prod_reg), > - readl_relaxed(cmdq->q.cons_reg)); > - } > + spin_unlock_irqrestore(&smmu->cmdq.lock, flags); > > - /* > - * Try to unlock the cmq lock. This will fail if we're the last > - * reader, in which case we can safely update cmdq->q.llq.cons > - */ > - if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) { > - WRITE_ONCE(cmdq->q.llq.cons, llq.cons); > - arm_smmu_cmdq_shared_unlock(cmdq); > - } > - } > - > - local_irq_restore(flags); > - return ret; > + return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata); > } > > -static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, > - struct arm_smmu_cmdq_ent *ent) > +static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) > { > u64 cmd[CMDQ_ENT_DWORDS]; > + unsigned long flags; > + bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV); > + struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC }; > + int ret; > > - if (arm_smmu_cmdq_build_cmd(cmd, ent)) { > - dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", > - ent->opcode); > - return -EINVAL; > - } > + arm_smmu_cmdq_build_cmd(cmd, &ent); > > - return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); > -} > + spin_lock_irqsave(&smmu->cmdq.lock, flags); > + arm_smmu_cmdq_insert_cmd(smmu, cmd); > + ret = queue_poll_cons(&smmu->cmdq.q, true, wfe); > + spin_unlock_irqrestore(&smmu->cmdq.lock, flags); > > -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) > -{ > - return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); > + return ret; > } > > -static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, > - struct arm_smmu_cmdq_batch *cmds, > - struct arm_smmu_cmdq_ent *cmd) > +static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) > { > - if (cmds->num == CMDQ_BATCH_ENTRIES) { > - arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false); > - cmds->num = 0; > - } > - arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd); > - cmds->num++; > -} > + int ret; > + bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) && > + (smmu->features & ARM_SMMU_FEAT_COHERENCY); > > -static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, > - struct arm_smmu_cmdq_batch *cmds) > -{ > - return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); > + ret = msi ? 
__arm_smmu_cmdq_issue_sync_msi(smmu) > + : __arm_smmu_cmdq_issue_sync(smmu); > + if (ret) > + dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n"); > + return ret; > } > > /* Context descriptor manipulation functions */ > @@ -1536,7 +1117,6 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain > *smmu_domain, > size_t i; > unsigned long flags; > struct arm_smmu_master *master; > - struct arm_smmu_cmdq_batch cmds = {}; > struct arm_smmu_device *smmu = smmu_domain->smmu; > struct arm_smmu_cmdq_ent cmd = { > .opcode = CMDQ_OP_CFGI_CD, > @@ -1550,12 +1130,12 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain > *smmu_domain, > list_for_each_entry(master, &smmu_domain->devices, domain_head) { > for (i = 0; i < master->num_sids; i++) { > cmd.cfgi.sid = master->sids[i]; > - arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); > + arm_smmu_cmdq_issue_cmd(smmu, &cmd); > } > } > spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); > > - arm_smmu_cmdq_batch_submit(smmu, &cmds); > + arm_smmu_cmdq_issue_sync(smmu); > } > > static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, > @@ -2190,16 +1770,17 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, > size_t size, > cmd->atc.size = log2_span; > } > > -static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) > +static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, > + struct arm_smmu_cmdq_ent *cmd) > { > int i; > - struct arm_smmu_cmdq_ent cmd; > > - arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd); > + if (!master->ats_enabled) > + return 0; > > for (i = 0; i < master->num_sids; i++) { > - cmd.atc.sid = master->sids[i]; > - arm_smmu_cmdq_issue_cmd(master->smmu, &cmd); > + cmd->atc.sid = master->sids[i]; > + arm_smmu_cmdq_issue_cmd(master->smmu, cmd); > } > > return arm_smmu_cmdq_issue_sync(master->smmu); > @@ -2208,11 +1789,10 @@ static int arm_smmu_atc_inv_master(struct > arm_smmu_master *master) > static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, > int ssid, unsigned long iova, size_t size) > { > - int i; > + int ret = 0; > unsigned long flags; > struct arm_smmu_cmdq_ent cmd; > struct arm_smmu_master *master; > - struct arm_smmu_cmdq_batch cmds = {}; > > if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) > return 0; > @@ -2237,18 +1817,11 @@ static int arm_smmu_atc_inv_domain(struct > arm_smmu_domain *smmu_domain, > arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); > > spin_lock_irqsave(&smmu_domain->devices_lock, flags); > - list_for_each_entry(master, &smmu_domain->devices, domain_head) { > - if (!master->ats_enabled) > - continue; > - > - for (i = 0; i < master->num_sids; i++) { > - cmd.atc.sid = master->sids[i]; > - arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd); > - } > - } > + list_for_each_entry(master, &smmu_domain->devices, domain_head) > + ret |= arm_smmu_atc_inv_master(master, &cmd); > spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); > > - return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds); > + return ret ? -ETIMEDOUT : 0; > } > > /* IO_PGTABLE API */ > @@ -2270,26 +1843,23 @@ static void arm_smmu_tlb_inv_context(void *cookie) > /* > * NOTE: when io-pgtable is in non-strict mode, we may get here with > * PTEs previously cleared by unmaps on the current CPU not yet visible > - * to the SMMU. We are relying on the dma_wmb() implicit during cmd > - * insertion to guarantee those are observed before the TLBI. Do be > - * careful, 007. > + * to the SMMU. 
We are relying on the DSB implicit in > + * queue_sync_prod_out() to guarantee those are observed before the > + * TLBI. Do be careful, 007. > */ > arm_smmu_cmdq_issue_cmd(smmu, &cmd); > arm_smmu_cmdq_issue_sync(smmu); > - arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0); > } > > -static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size, > - size_t granule, bool leaf, > - struct arm_smmu_domain *smmu_domain) > +static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size, > + size_t granule, bool leaf, void > *cookie) > { > + struct arm_smmu_domain *smmu_domain = cookie; > struct arm_smmu_device *smmu = smmu_domain->smmu; > - unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0; > - size_t inv_range = granule; > - struct arm_smmu_cmdq_batch cmds = {}; > struct arm_smmu_cmdq_ent cmd = { > .tlbi = { > .leaf = leaf, > + .addr = iova, > }, > }; > > @@ -2304,78 +1874,37 @@ static void arm_smmu_tlb_inv_range(unsigned long > iova, size_t size, > cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; > } > > - if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { > - /* Get the leaf page size */ > - tg = __ffs(smmu_domain->domain.pgsize_bitmap); > - > - /* Convert page size of 12,14,16 (log2) to 1,2,3 */ > - cmd.tlbi.tg = (tg - 10) / 2; > - > - /* Determine what level the granule is at */ > - cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3)); > - > - num_pages = size >> tg; > - } > - > - while (iova < end) { > - if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { > - /* > - * On each iteration of the loop, the range is 5 bits > - * worth of the aligned size remaining. > - * The range in pages is: > - * > - * range = (num_pages & (0x1f << __ffs(num_pages))) > - */ > - unsigned long scale, num; > - > - /* Determine the power of 2 multiple number of pages */ > - scale = __ffs(num_pages); > - cmd.tlbi.scale = scale; > - > - /* Determine how many chunks of 2^scale size we have */ > - num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX; > - cmd.tlbi.num = num - 1; > - > - /* range is num * 2^scale * pgsize */ > - inv_range = num << (scale + tg); > - > - /* Clear out the lower order bits for the next > iteration */ > - num_pages -= num << scale; > - } > - > - cmd.tlbi.addr = iova; > - arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); > - iova += inv_range; > - } > - arm_smmu_cmdq_batch_submit(smmu, &cmds); > - > - /* > - * Unfortunately, this can't be leaf-only since we may have > - * zapped an entire table. 
> - */ > - arm_smmu_atc_inv_domain(smmu_domain, 0, start, size); > + do { > + arm_smmu_cmdq_issue_cmd(smmu, &cmd); > + cmd.tlbi.addr += granule; > + } while (size -= granule); > } > > static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather, > unsigned long iova, size_t granule, > void *cookie) > { > - struct arm_smmu_domain *smmu_domain = cookie; > - struct iommu_domain *domain = &smmu_domain->domain; > - > - iommu_iotlb_gather_add_page(domain, gather, iova, granule); > + arm_smmu_tlb_inv_range_nosync(iova, granule, granule, true, cookie); > } > > static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, > size_t granule, void *cookie) > { > - arm_smmu_tlb_inv_range(iova, size, granule, false, cookie); > + struct arm_smmu_domain *smmu_domain = cookie; > + struct arm_smmu_device *smmu = smmu_domain->smmu; > + > + arm_smmu_tlb_inv_range_nosync(iova, size, granule, false, cookie); > + arm_smmu_cmdq_issue_sync(smmu); > } > > static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size, > size_t granule, void *cookie) > { > - arm_smmu_tlb_inv_range(iova, size, granule, true, cookie); > + struct arm_smmu_domain *smmu_domain = cookie; > + struct arm_smmu_device *smmu = smmu_domain->smmu; > + > + arm_smmu_tlb_inv_range_nosync(iova, size, granule, true, cookie); > + arm_smmu_cmdq_issue_sync(smmu); > } > > static const struct iommu_flush_ops arm_smmu_flush_ops = { > @@ -2701,6 +2230,7 @@ static void arm_smmu_enable_ats(struct arm_smmu_master > *master) > > static void arm_smmu_disable_ats(struct arm_smmu_master *master) > { > + struct arm_smmu_cmdq_ent cmd; > struct arm_smmu_domain *smmu_domain = master->domain; > > if (!master->ats_enabled) > @@ -2712,7 +2242,8 @@ static void arm_smmu_disable_ats(struct arm_smmu_master > *master) > * ATC invalidation via the SMMU. 
> */ > wmb(); > - arm_smmu_atc_inv_master(master); > + arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd); > + arm_smmu_atc_inv_master(master, &cmd); > atomic_dec(&smmu_domain->nr_ats_masters); > } > > @@ -2856,13 +2387,18 @@ static int arm_smmu_map(struct iommu_domain *domain, > unsigned long iova, > static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > size_t size, struct iommu_iotlb_gather *gather) > { > + int ret; > struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > > if (!ops) > return 0; > > - return ops->unmap(ops, iova, size, gather); > + ret = ops->unmap(ops, iova, size, gather); > + if (ret && arm_smmu_atc_inv_domain(smmu_domain, 0, iova, size)) > + return 0; > + > + return ret; > } > > static void arm_smmu_flush_iotlb_all(struct iommu_domain *domain) > @@ -2876,10 +2412,10 @@ static void arm_smmu_flush_iotlb_all(struct > iommu_domain *domain) > static void arm_smmu_iotlb_sync(struct iommu_domain *domain, > struct iommu_iotlb_gather *gather) > { > - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > + struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu; > > - arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start, > - gather->pgsize, true, smmu_domain); > + if (smmu) > + arm_smmu_cmdq_issue_sync(smmu); > } > > static phys_addr_t > @@ -3177,49 +2713,18 @@ static int arm_smmu_init_one_queue(struct > arm_smmu_device *smmu, > return 0; > } > > -static void arm_smmu_cmdq_free_bitmap(void *data) > -{ > - unsigned long *bitmap = data; > - bitmap_free(bitmap); > -} > - > -static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) > -{ > - int ret = 0; > - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; > - unsigned int nents = 1 << cmdq->q.llq.max_n_shift; > - atomic_long_t *bitmap; > - > - atomic_set(&cmdq->owner_prod, 0); > - atomic_set(&cmdq->lock, 0); > - > - bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL); > - if (!bitmap) { > - dev_err(smmu->dev, "failed to allocate cmdq bitmap\n"); > - ret = -ENOMEM; > - } else { > - cmdq->valid_map = bitmap; > - devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap); > - } > - > - return ret; > -} > - > static int arm_smmu_init_queues(struct arm_smmu_device *smmu) > { > int ret; > > /* cmdq */ > + spin_lock_init(&smmu->cmdq.lock); > ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD, > ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS, > "cmdq"); > if (ret) > return ret; > > - ret = arm_smmu_cmdq_init(smmu); > - if (ret) > - return ret; > - > /* evtq */ > ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD, > ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS, > @@ -3800,15 +3305,9 @@ static int arm_smmu_device_hw_probe(struct > arm_smmu_device *smmu) > /* Queue sizes, capped to ensure natural alignment */ > smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, > FIELD_GET(IDR1_CMDQS, reg)); > - if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) { > - /* > - * We don't support splitting up batches, so one batch of > - * commands plus an extra sync needs to fit inside the command > - * queue. There's also no way we can handle the weird alignment > - * restrictions on the base pointer for a unit-length queue. 
> - */ > - dev_err(smmu->dev, "command queue size <= %d entries not > supported\n", > - CMDQ_BATCH_ENTRIES); > + if (!smmu->cmdq.q.llq.max_n_shift) { > + /* Odd alignment restrictions on the base, so ignore for now */ > + dev_err(smmu->dev, "unit-length command queue not supported\n"); > return -ENXIO; > } > > @@ -3828,11 +3327,6 @@ static int arm_smmu_device_hw_probe(struct > arm_smmu_device *smmu) > if (smmu->sid_bits <= STRTAB_SPLIT) > smmu->features &= ~ARM_SMMU_FEAT_2_LVL_STRTAB; > > - /* IDR3 */ > - reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3); > - if (FIELD_GET(IDR3_RIL, reg)) > - smmu->features |= ARM_SMMU_FEAT_RANGE_INV; > - > /* IDR5 */ > reg = readl_relaxed(smmu->base + ARM_SMMU_IDR5); > > -- > 2.17.1 >
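
For reference, the reverted fast path claims queue space with a cmpxchg loop
over a combined 64-bit prod/cons word before any command is written. The
sketch below is a paraphrase with invented names (toy_llq, toy_alloc_slots),
not the exact arm_smmu_cmdq_issue_cmdlist() code, but it illustrates the kind
of primitives involved -- a 64-bit compare-and-swap plus the relaxed atomic
loads, conditional reads and fetch-ops used later in the algorithm -- which
are exactly what the commit message notes Xen does not provide yet:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define TOY_SHIFT      4                              /* 2^4 = 16 entries */
    #define TOY_WRAP_MASK  ((1u << (TOY_SHIFT + 1)) - 1)  /* index + wrap bit */
    #define TOY_OWNED_FLAG (1u << 31)           /* CMDQ_PROD_OWNED_FLAG analogue */

    union toy_llq {
        uint64_t val;
        struct { uint32_t prod, cons; };
    };

    static bool toy_has_space(union toy_llq q, uint32_t n)
    {
        uint32_t used = ((q.prod & TOY_WRAP_MASK) - (q.cons & TOY_WRAP_MASK))
                        & TOY_WRAP_MASK;
        return used + n <= (1u << TOY_SHIFT);
    }

    /*
     * Claim n slots without taking a lock.  Returns our first slot; *owner
     * says whether we later publish PROD to the hardware on behalf of every
     * CPU that piggy-backed on our range.
     */
    static uint32_t toy_alloc_slots(_Atomic uint64_t *llq, uint32_t n, bool *owner)
    {
        union toy_llq old, head;

        old.val = atomic_load_explicit(llq, memory_order_relaxed);
        do {
            while (!toy_has_space(old, n))        /* real code polls CONS here */
                old.val = atomic_load_explicit(llq, memory_order_relaxed);

            head = old;
            head.prod = ((old.prod + n) & TOY_WRAP_MASK) | TOY_OWNED_FLAG;
        } while (!atomic_compare_exchange_weak_explicit(llq, &old.val, head.val,
                                                        memory_order_relaxed,
                                                        memory_order_relaxed));

        *owner = !(old.prod & TOY_OWNED_FLAG);    /* first claimant becomes owner */
        return old.prod & ~TOY_OWNED_FLAG;
    }

Once equivalents of cmpxchg_relaxed(), atomic_cond_read_relaxed() and friends
exist in Xen, updating the driver by re-applying the reverted commits should be
possible, as the commit message notes.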