Hi Robin,

This mostly looks good. Just a few comments below.

On Thu, Aug 31, 2017 at 02:44:27PM +0100, Robin Murphy wrote:
> As an IRQ, the CMD_SYNC interrupt is not particularly useful, not least
> because we often need to wait for sync completion within someone else's
> IRQ handler anyway. However, when the SMMU is both coherent and supports
> MSIs, we can have a lot more fun by not using it as an interrupt at all.
> Following the example suggested in the architecture and using a write
> targeting normal memory, we can let callers wait on a status variable
> outside the lock instead of having to stall the entire queue or even
> touch MMIO registers. Since multiple sync commands are guaranteed to
> complete in order, a simple incrementing sequence count is all we need
> to unambiguously support any realistic number of overlapping waiters.
>
> Signed-off-by: Robin Murphy <robin.mur...@arm.com>
> ---
>
> v2: Remove redundant 'bool msi' command member, other cosmetic tweaks
>
>  drivers/iommu/arm-smmu-v3.c | 47 +++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index f066725298cd..311f482b93d5 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -377,7 +377,16 @@
>
>  #define CMDQ_SYNC_0_CS_SHIFT           12
>  #define CMDQ_SYNC_0_CS_NONE            (0UL << CMDQ_SYNC_0_CS_SHIFT)
> +#define CMDQ_SYNC_0_CS_IRQ             (1UL << CMDQ_SYNC_0_CS_SHIFT)
>  #define CMDQ_SYNC_0_CS_SEV             (2UL << CMDQ_SYNC_0_CS_SHIFT)
> +#define CMDQ_SYNC_0_MSH_SHIFT          22
> +#define CMDQ_SYNC_0_MSH_ISH            (3UL << CMDQ_SYNC_0_MSH_SHIFT)
> +#define CMDQ_SYNC_0_MSIATTR_SHIFT      24
> +#define CMDQ_SYNC_0_MSIATTR_OIWB       (0xfUL << CMDQ_SYNC_0_MSIATTR_SHIFT)
> +#define CMDQ_SYNC_0_MSIDATA_SHIFT      32
> +#define CMDQ_SYNC_0_MSIDATA_MASK       0xffffffffUL
> +#define CMDQ_SYNC_1_MSIADDR_SHIFT      0
> +#define CMDQ_SYNC_1_MSIADDR_MASK       0xffffffffffffcUL
>
>  /* Event queue */
>  #define EVTQ_ENT_DWORDS                4
> @@ -409,6 +418,7 @@
>  /* High-level queue structures */
>  #define ARM_SMMU_POLL_TIMEOUT_US       100
>  #define ARM_SMMU_CMDQ_DRAIN_TIMEOUT_US 1000000 /* 1s! */
> +#define ARM_SMMU_SYNC_TIMEOUT_US       1000000 /* 1s! */

We only ever do this when waiting for the queue to drain, so we may as
well just reuse the drain timeout.

>  #define MSI_IOVA_BASE                  0x8000000
>  #define MSI_IOVA_LENGTH                0x100000
> @@ -504,6 +514,10 @@ struct arm_smmu_cmdq_ent {
>                 } pri;
>
>                 #define CMDQ_OP_CMD_SYNC        0x46
> +               struct {
> +                       u32                     msidata;
> +                       u64                     msiaddr;
> +               } sync;
>         };
>  };
>
> @@ -617,6 +631,9 @@ struct arm_smmu_device {
>         int                             gerr_irq;
>         int                             combined_irq;
>
> +       atomic_t                        sync_nr;
> +       u32                             sync_count;

It's probably worth sticking these in separate cachelines so we don't
get spurious wakeups when sync_nr is incremented. (Yes, I know it should
really be the ERG, but that can be unreasonably huge!)
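
Something like the below is all I mean -- untested, and ____cacheline_aligned
is just one way of getting the separation:

        atomic_t                        sync_nr;
        /*
         * Keep the word that waiters spin on in arm_smmu_sync_poll_msi()
         * off the cacheline that atomic_inc_return(&smmu->sync_nr) keeps
         * dirtying, so issuing a new CMD_SYNC doesn't wake existing
         * waiters for nothing.
         */
        u32                             sync_count ____cacheline_aligned;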
> +
>         unsigned long                   ias; /* IPA */
>         unsigned long                   oas; /* PA */
>         unsigned long                   pgsize_bitmap;
> @@ -878,7 +895,13 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent)
>                 }
>                 break;
>         case CMDQ_OP_CMD_SYNC:
> -               cmd[0] |= CMDQ_SYNC_0_CS_SEV;
> +               if (ent->sync.msiaddr)
> +                       cmd[0] |= CMDQ_SYNC_0_CS_IRQ;
> +               else
> +                       cmd[0] |= CMDQ_SYNC_0_CS_SEV;
> +               cmd[0] |= CMDQ_SYNC_0_MSH_ISH | CMDQ_SYNC_0_MSIATTR_OIWB;
> +               cmd[0] |= (u64)ent->sync.msidata << CMDQ_SYNC_0_MSIDATA_SHIFT;
> +               cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
>                 break;
>         default:
>                 return -ENOENT;
> @@ -964,21 +987,40 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>         spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>  }
>
> +static int arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 sync_idx)
> +{
> +       ktime_t timeout = ktime_add_us(ktime_get(), ARM_SMMU_SYNC_TIMEOUT_US);
> +       u32 val = smp_cond_load_acquire(&smmu->sync_count,
> +                                       (int)(VAL - sync_idx) >= 0 ||
> +                                       !ktime_before(ktime_get(), timeout));
> +
> +       return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;

There are some theoretical overflow issues here which I don't think will
ever occur in practice, but they deserve at least a comment explaining
why they're safe.

> +}
> +
>  static void arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>  {
>         u64 cmd[CMDQ_ENT_DWORDS];
>         unsigned long flags;
>         bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> +       bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
> +                  (smmu->features & ARM_SMMU_FEAT_COHERENCY);

I don't think this is sufficient for the case where we fail to set up
MSIs and fall back on legacy IRQs.

>         struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>         int ret;
>
> +       if (msi) {
> +               ent.sync.msidata = atomic_inc_return(&smmu->sync_nr);

I don't think you need barrier semantics here.

> +               ent.sync.msiaddr = virt_to_phys(&smmu->sync_count);
> +       }
>         arm_smmu_cmdq_build_cmd(cmd, &ent);
>
>         spin_lock_irqsave(&smmu->cmdq.lock, flags);
>         arm_smmu_cmdq_insert_cmd(smmu, cmd);
> -       ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
> +       if (!msi)
> +               ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>         spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>
> +       if (msi)
> +               ret = arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);

This looks like the queue polling should be wrapped up in a function
that returns with the spinlock released. There's a rough (and completely
untested) sketch of the kind of thing I mean at the end of this mail.

Will
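
Roughly what I had in mind -- the helper name and exact signature below
are made up, and this is untested:

        /*
         * Insert a command and, for the non-MSI flavour of CMD_SYNC, poll
         * for consumption while still holding the queue lock. Either way,
         * the lock is released before returning, so the MSI-based sync can
         * poll sync_count outside it.
         */
        static int arm_smmu_cmdq_insert_and_poll(struct arm_smmu_device *smmu,
                                                 u64 *cmd, bool poll, bool wfe)
        {
                unsigned long flags;
                int ret = 0;

                spin_lock_irqsave(&smmu->cmdq.lock, flags);
                arm_smmu_cmdq_insert_cmd(smmu, cmd);
                if (poll)
                        ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
                spin_unlock_irqrestore(&smmu->cmdq.lock, flags);

                return ret;
        }

arm_smmu_cmdq_issue_sync() would then just call this and, in the MSI
case, do the arm_smmu_sync_poll_msi() afterwards.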