Altix supports "posted DMA", meaning that DMA writes may complete out of order. In some cases it is necessary for a driver to ensure that all in-flight DMA has been flushed to host memory before it proceeds, for correct operation.
In particular this can be a problem with Infiniband, where writes to Completion Queues can race with DMA of data. The following patch addresses this problem by allowing a memory region to be mapped with a "barrier" attribute. (On Altix, writes to memory regions with the barrier attribute have the side effect that in-flight DMA gets flushed to host memory.) The only change to core code is the addition of a no-op stub function "dma_flags_set_dmaflush()" in linux/dma-mapping.h. Everything else is handled in architecture-specific or driver code. Signed-off-by: Arthur Kepner <[EMAIL PROTECTED]> -- arch/ia64/sn/pci/pci_dma.c | 35 ++++++++++++++++++++------- drivers/infiniband/core/umem.c | 8 ++++-- drivers/infiniband/hw/mthca/mthca_provider.c | 11 +++++++- drivers/infiniband/hw/mthca/mthca_user.h | 10 ++++++- include/asm-ia64/dma-mapping.h | 0 include/asm-ia64/sn/io.h | 26 ++++++++++++++++++++ include/linux/dma-mapping.h | 7 +++++ include/rdma/ib_umem.h | 4 +-- 8 files changed, 86 insertions(+), 15 deletions(-) diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d79ddac..754240b 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(sn_dma_free_coherent); * @dev: device to map for * @cpu_addr: kernel virtual address of the region to map * @size: size of the region - * @direction: DMA direction + * @flags: DMA direction, and arch-specific attributes * * Map the region pointed to by @cpu_addr for DMA and return the * DMA address. @@ -167,17 +167,23 @@ EXPORT_SYMBOL(sn_dma_free_coherent); * figure out how to save dmamap handle so can use two step. 
*/ dma_addr_t sn_dma_map_single(struct device *dev, void *cpu_addr, size_t size, - int direction) + int flags) { dma_addr_t dma_addr; unsigned long phys_addr; struct pci_dev *pdev = to_pci_dev(dev); struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); + int dmaflush = dma_flags_get_dmaflush(flags); BUG_ON(dev->bus != &pci_bus_type); phys_addr = __pa(cpu_addr); - dma_addr = provider->dma_map(pdev, phys_addr, size, SN_DMA_ADDR_PHYS); + if (dmaflush) + dma_addr = provider->dma_map_consistent(pdev, phys_addr, size, + SN_DMA_ADDR_PHYS); + else + dma_addr = provider->dma_map(pdev, phys_addr, size, + SN_DMA_ADDR_PHYS); if (!dma_addr) { printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__); return 0; @@ -240,18 +246,20 @@ EXPORT_SYMBOL(sn_dma_unmap_sg); * @dev: device to map for * @sg: scatterlist to map * @nhwentries: number of entries - * @direction: direction of the DMA transaction + * @flags: direction of the DMA transaction, and arch-specific attributes * * Maps each entry of @sg for DMA. */ int sn_dma_map_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - int direction) + int flags) { unsigned long phys_addr; struct scatterlist *saved_sg = sg; struct pci_dev *pdev = to_pci_dev(dev); struct sn_pcibus_provider *provider = SN_PCIDEV_BUSPROVIDER(pdev); int i; + int dmaflush = dma_flags_get_dmaflush(flags); + int direction = dma_flags_get_direction(flags); BUG_ON(dev->bus != &pci_bus_type); @@ -259,12 +267,21 @@ int sn_dma_map_sg(struct device *dev, struct scatterlist *sg, int nhwentries, * Setup a DMA address for each entry in the scatterlist. 
*/ for (i = 0; i < nhwentries; i++, sg++) { + dma_addr_t dma_addr; phys_addr = SG_ENT_PHYS_ADDRESS(sg); - sg->dma_address = provider->dma_map(pdev, - phys_addr, sg->length, - SN_DMA_ADDR_PHYS); - if (!sg->dma_address) { + if (dmaflush) { + dma_addr = provider->dma_map_consistent(pdev, + phys_addr, + sg->length, + SN_DMA_ADDR_PHYS); + } else { + dma_addr = provider->dma_map(pdev, + phys_addr, sg->length, + SN_DMA_ADDR_PHYS); + } + + if (!(sg->dma_address = dma_addr)) { printk(KERN_ERR "%s: out of ATEs\n", __FUNCTION__); /* diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 26d0470..c626d2c 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -64,9 +64,11 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d * @addr: userspace virtual address to start at * @size: length of region to pin * @access: IB_ACCESS_xxx flags for memory being pinned + * @dmaflush: map this memory "coherently", if necessary + * (for architectures that support posted DMA) */ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access) + size_t size, int access, int dmaflush) { struct ib_umem *umem; struct page **page_list; @@ -78,6 +80,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, int ret; int off; int i; + int flags = dmaflush ? 
dma_flags_set_dmaflush(DMA_BIDIRECTIONAL): + DMA_BIDIRECTIONAL; if (!can_do_mlock()) return ERR_PTR(-EPERM); @@ -155,7 +159,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, chunk->nmap = ib_dma_map_sg(context->device, &chunk->page_list[0], chunk->nents, - DMA_BIDIRECTIONAL); + flags); if (chunk->nmap <= 0) { for (i = 0; i < chunk->nents; ++i) put_page(chunk->page_list[i].page); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 6bcde1c..a94d4cf 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1017,6 +1017,8 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct mthca_dev *dev = to_mdev(pd->device); struct ib_umem_chunk *chunk; struct mthca_mr *mr; + struct mthca_reg_mr ucmd; + int dmaflush; u64 *pages; int shift, n, len; int i, j, k; @@ -1027,7 +1029,14 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(pd->uobject->context, start, length, acc); + if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + err = -EFAULT; + goto err; + } + dmaflush = (int) ucmd.mr_attrs & MTHCA_MR_DMAFLUSH; + + mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, + dmaflush); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err; diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h index 02cc0a7..fa8c339 100644 --- a/drivers/infiniband/hw/mthca/mthca_user.h +++ b/drivers/infiniband/hw/mthca/mthca_user.h @@ -41,7 +41,7 @@ * Increment this value if any changes that break userspace ABI * compatibility are made. 
*/ -#define MTHCA_UVERBS_ABI_VERSION 1 +#define MTHCA_UVERBS_ABI_VERSION 2 /* * Make sure that all structs defined in this file remain laid out so @@ -61,6 +61,14 @@ struct mthca_alloc_pd_resp { __u32 reserved; }; +struct mthca_reg_mr { + __u32 mr_attrs; +#define MTHCA_MR_DMAFLUSH 0x1 /* flush in-flight DMA on a write to + * memory region (IA64_SGI_SN2 only) */ + __u32 reserved; +}; + + struct mthca_create_cq { __u32 lkey; __u32 pdn; diff --git a/include/asm-ia64/dma-mapping.h b/include/asm-ia64/dma-mapping.h diff --git a/include/asm-ia64/sn/io.h b/include/asm-ia64/sn/io.h index 41c73a7..c82eb90 100644 --- a/include/asm-ia64/sn/io.h +++ b/include/asm-ia64/sn/io.h @@ -271,4 +271,30 @@ sn_pci_set_vchan(struct pci_dev *pci_dev, unsigned long *addr, int vchan) return 0; } +#define ARCH_DOES_POSTED_DMA +/* here we steal some upper bits from the "direction" argument to the + * dma_map_* routines */ +#define DMA_ATTR_SHIFT 8 +/* bottom 8 bits for direction, remaining bits for additional "attributes" */ +#define DMA_FLUSH_ATTR 0x1 +/* For now the only attribute is "flush in-flight dma when writing to + * this DMA mapped memory" */ +#define DMA_DIR_MASK ((1 << DMA_ATTR_SHIFT) - 1) +#define DMA_ATTR_MASK ~DMA_DIR_MASK + +static inline int +dma_flags_set_dmaflush(int dir) { + return (dir | (DMA_FLUSH_ATTR<< DMA_ATTR_SHIFT)); +} + +static inline int +dma_flags_get_direction(int dir) { + return (dir & DMA_DIR_MASK); +} + +static inline int +dma_flags_get_dmaflush(int dir) { + return (((dir & DMA_ATTR_MASK) >> DMA_ATTR_SHIFT) & DMA_FLUSH_ATTR); +} + #endif /* _ASM_SN_IO_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2dc21cb..594a651 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -99,4 +99,11 @@ static inline void dmam_release_declared_memory(struct device *dev) } #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ +#ifndef ARCH_DOES_POSTED_DMA +static inline int +dma_flags_set_dmaflush(int dir) { + return (dir); +} 
+#endif /* ARCH_DOES_POSTED_DMA */ + #endif diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index c533d6c..b7aaeb0 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -61,7 +61,7 @@ struct ib_umem_chunk { #ifdef CONFIG_INFINIBAND_USER_MEM struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, - size_t size, int access); + size_t size, int access, int dmaflush); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); @@ -71,7 +71,7 @@ int ib_umem_page_count(struct ib_umem *umem); static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, - int access) { + int access, int dmaflush) { return ERR_PTR(-EINVAL); } static inline void ib_umem_release(struct ib_umem *umem) { } -- Arthur - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/