Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region
On Tue, Oct 27, 2020 at 08:33:52PM +0000, Xiong, Jianxin wrote: > > > @@ -801,6 +816,52 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr > > > *imr, > > > * Returns: > > > * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages > > > that are > > > * not accessible, or the MR is no longer valid. > > > + * -EAGAIN: The operation should be retried > > > + * > > > + * >0: Number of pages mapped > > > + */ > > > +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, struct ib_umem > > > *umem, > > > +u64 io_virt, size_t bcnt, u32 *bytes_mapped, > > > +u32 flags) > > > +{ > > > + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem); > > > + u64 user_va; > > > + u64 end; > > > + int npages; > > > + int err; > > > + > > > + if (unlikely(io_virt < mr->mmkey.iova)) > > > + return -EFAULT; > > > + if (check_add_overflow(io_virt - mr->mmkey.iova, > > > +(u64)umem->address, &user_va)) > > > + return -EFAULT; > > > + /* Overflow has already been checked at the umem creation time */ > > > + end = umem->address + umem->length; > > > + if (unlikely(user_va >= end || end - user_va < bcnt)) > > > + return -EFAULT; > > > > Why duplicate this sequence? Caller does it > > The sequence in the caller is for umem_odp only. Nothing about umem_odp in this code though?? > > > /* prefetch with write-access must be supported by the MR */ > > > if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && > > > - !odp->umem.writable) > > > + !mr->umem->writable) > > > > ?? > There is no need to use umem_odp here, mr->umem is the same as &odp->umem. > This change makes the code work for both umem_odp and umem_dmabuf. Ok Can you please also think about how to test this? I very much prefer to see new pyverbs tests for new APIs. Distros are running the rdma-core test suite, if you want this to work widely we need a public test for it. Thanks, Jason ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
RE: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region
> -Original Message- > From: Jason Gunthorpe > Sent: Wednesday, October 28, 2020 9:36 AM > To: Xiong, Jianxin > Cc: linux-r...@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford > ; Leon Romanovsky > ; Sumit Semwal ; Christian Koenig > ; Vetter, Daniel > > Subject: Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory > region > > On Tue, Oct 27, 2020 at 08:33:52PM +, Xiong, Jianxin wrote: > > > > @@ -801,6 +816,52 @@ static int pagefault_implicit_mr(struct mlx5_ib_mr > > > > *imr, > > > > * Returns: > > > > * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages > > > > that are > > > > * not accessible, or the MR is no longer valid. > > > > + * -EAGAIN: The operation should be retried > > > > + * > > > > + * >0: Number of pages mapped > > > > + */ > > > > +static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, struct ib_umem > > > > *umem, > > > > + u64 io_virt, size_t bcnt, u32 > > > > *bytes_mapped, > > > > + u32 flags) > > > > +{ > > > > + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(umem); > > > > + u64 user_va; > > > > + u64 end; > > > > + int npages; > > > > + int err; > > > > + > > > > + if (unlikely(io_virt < mr->mmkey.iova)) > > > > + return -EFAULT; > > > > + if (check_add_overflow(io_virt - mr->mmkey.iova, > > > > + (u64)umem->address, &user_va)) > > > > + return -EFAULT; > > > > + /* Overflow has alreddy been checked at the umem creation time > > > > */ > > > > + end = umem->address + umem->length; > > > > + if (unlikely(user_va >= end || end - user_va < bcnt)) > > > > + return -EFAULT; > > > > > > Why duplicate this sequence? Caller does it > > > > The sequence in the caller is for umem_odp only. > > Nothing about umem_odp in this code though?? The code in the caller uses ib_umem_end(odp) instead of the 'end' here, but we can consolidate that with some minor changes. 
> > > > > /* prefetch with write-access must be supported by the MR */ > > > > if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && > > > > - !odp->umem.writable) > > > > + !mr->umem->writable) > > > > > > ?? > > > There is no need to use umem_odp here, mr->umem is the same as &odp->umem. > > This change makes the code works for both umem_odp and umem_dmabuf. > > Ok > > Can you please also think about how to test this? I very much prefer to see > new pyverbs tests for new APIs. > > Distros are running the rdma-core test suite, if you want this to work widely > we need a public test for it. > Will look into that. > Thanks, > Jason ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region
On Fri, Oct 23, 2020 at 09:40:01AM -0700, Jianxin Xiong wrote: > diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c > index b261797..3bc412b 100644 > +++ b/drivers/infiniband/hw/mlx5/mr.c > @@ -1,5 +1,6 @@ > /* > * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. > + * Copyright (c) 2020, Intel Corporation. All rights reserved. > * > * This software is available to you under a choice of one of two > * licenses. You may choose to be licensed under the terms of the GNU > @@ -36,6 +37,8 @@ > #include > #include > #include > +#include > +#include > #include > #include > #include > @@ -1113,6 +1116,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, > int npages, > dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); > if (mr->umem->is_odp) { > mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); > + } else if (mr->umem->is_dmabuf && (flags & > MLX5_IB_UPD_XLT_ZAP)) { > + memset(xlt, 0, size); > } else { > __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx, > npages, xlt, > @@ -1462,6 +1467,111 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, > u64 start, u64 length, > return ERR_PTR(err); > } > > +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) > +{ > + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; > + struct mlx5_ib_mr *mr = umem_dmabuf->device_context; > + > + mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP); > + ib_umem_dmabuf_unmap_pages(umem_dmabuf); > +} > + > +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { > + .allow_peer2peer = 1, > + .move_notify = mlx5_ib_dmabuf_invalidate_cb, > +}; > + > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, > + u64 length, u64 virt_addr, > + int fd, int access_flags, > + struct ib_udata *udata) > +{ > + struct mlx5_ib_dev *dev = to_mdev(pd->device); > + struct mlx5_ib_mr *mr = NULL; > + struct ib_umem *umem; > + struct ib_umem_dmabuf *umem_dmabuf; > + int 
npages; > + int order; > + int err; > + > + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) > + return ERR_PTR(-EOPNOTSUPP); > + > + mlx5_ib_dbg(dev, > + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, > access_flags 0x%x\n", > + offset, virt_addr, length, fd, access_flags); > + > + if (!mlx5_ib_can_load_pas_with_umr(dev, length)) > + return ERR_PTR(-EINVAL); > + > + umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, > access_flags, > + &mlx5_ib_dmabuf_attach_ops); > + if (IS_ERR(umem)) { > + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); > + return ERR_PTR(PTR_ERR(umem)); > + } > + > + npages = ib_umem_num_pages(umem); > + if (!npages) { ib_umem_get should reject invalid umems like this > + mlx5_ib_warn(dev, "avoid zero region\n"); > + ib_umem_release(umem); > + return ERR_PTR(-EINVAL); > + } > + > + order = ilog2(roundup_pow_of_two(npages)); Must always call ib_umem_find_best_pgsz(), specify PAGE_SIZE as the argument for this scenario > + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", > + npages, npages, order, PAGE_SHIFT); > + > + mr = alloc_mr_from_cache(pd, umem, virt_addr, length, npages, > + PAGE_SHIFT, order, access_flags); > + if (IS_ERR(mr)) > + mr = NULL; > + > + if (!mr) { > + mutex_lock(&dev->slow_path_mutex); > + mr = reg_create(NULL, pd, virt_addr, length, umem, npages, > + PAGE_SHIFT, access_flags, false); > + mutex_unlock(&dev->slow_path_mutex); > + } Rebase on the mlx5 series just posted and use their version of this code sequence, this is just not quite right > + err = mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, > + MLX5_IB_UPD_XLT_ENABLE | MLX5_IB_UPD_XLT_ZAP); > + > + if (err) { > + dereg_mr(dev, mr); > + return ERR_PTR(err); > + } The current mlx5 code preloads the buffer with the right data, zapping is fairly expensive, mapping and loading is the same cost > @@ -1536,7 +1646,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int > flags, u64 start, > if (!mr->umem) > return -EINVAL; > > - 
if (is_odp_mr(mr)) > + if (is_odp_mr(mr) || is_dmabuf_mr(mr)) > return -EOPNOTSUPP; > > if (flags & IB_MR_REREG_TRANS) { > @@ -1695,7 +1805,7 @@ static void dereg_mr(struct mlx5_ib_dev *de
RE: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region
> -Original Message- > From: Jason Gunthorpe > Sent: Tuesday, October 27, 2020 1:08 PM > To: Xiong, Jianxin > Cc: linux-r...@vger.kernel.org; dri-devel@lists.freedesktop.org; Doug Ledford > ; Leon Romanovsky > ; Sumit Semwal ; Christian Koenig > ; Vetter, Daniel > > Subject: Re: [PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory > region > > On Fri, Oct 23, 2020 at 09:40:01AM -0700, Jianxin Xiong wrote: > > > diff --git a/drivers/infiniband/hw/mlx5/mr.c > > b/drivers/infiniband/hw/mlx5/mr.c index b261797..3bc412b 100644 > > +++ b/drivers/infiniband/hw/mlx5/mr.c > > @@ -1,5 +1,6 @@ > > /* > > * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. > > + * Copyright (c) 2020, Intel Corporation. All rights reserved. > > * > > * This software is available to you under a choice of one of two > > * licenses. You may choose to be licensed under the terms of the > > GNU @@ -36,6 +37,8 @@ #include #include > > #include > > +#include > > +#include > > #include > > #include > > #include > > @@ -1113,6 +1116,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 > > idx, int npages, > > dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); > > if (mr->umem->is_odp) { > > mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); > > + } else if (mr->umem->is_dmabuf && (flags & > > MLX5_IB_UPD_XLT_ZAP)) { > > + memset(xlt, 0, size); > > } else { > > __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx, > >npages, xlt, > > @@ -1462,6 +1467,111 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, > > u64 start, u64 length, > > return ERR_PTR(err); > > } > > > > +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment > > +*attach) { > > + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; > > + struct mlx5_ib_mr *mr = umem_dmabuf->device_context; > > + > > + mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP); > > + ib_umem_dmabuf_unmap_pages(umem_dmabuf); > > +} > > + > > +static struct dma_buf_attach_ops 
mlx5_ib_dmabuf_attach_ops = { > > + .allow_peer2peer = 1, > > + .move_notify = mlx5_ib_dmabuf_invalidate_cb, }; > > + > > +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, > > +u64 length, u64 virt_addr, > > +int fd, int access_flags, > > +struct ib_udata *udata) > > +{ > > + struct mlx5_ib_dev *dev = to_mdev(pd->device); > > + struct mlx5_ib_mr *mr = NULL; > > + struct ib_umem *umem; > > + struct ib_umem_dmabuf *umem_dmabuf; > > + int npages; > > + int order; > > + int err; > > + > > + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) > > + return ERR_PTR(-EOPNOTSUPP); > > + > > + mlx5_ib_dbg(dev, > > + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, > > access_flags 0x%x\n", > > + offset, virt_addr, length, fd, access_flags); > > + > > + if (!mlx5_ib_can_load_pas_with_umr(dev, length)) > > + return ERR_PTR(-EINVAL); > > + > > + umem = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, > > access_flags, > > + &mlx5_ib_dmabuf_attach_ops); > > + if (IS_ERR(umem)) { > > + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem)); > > + return ERR_PTR(PTR_ERR(umem)); > > + } > > + > > + npages = ib_umem_num_pages(umem); > > + if (!npages) { > > ib_umem_get should reject invalid umems like this Will move the check to ib_umem_dmabuf_get(). > > > + mlx5_ib_warn(dev, "avoid zero region\n"); > > + ib_umem_release(umem); > > + return ERR_PTR(-EINVAL); > > + } > > + > > + order = ilog2(roundup_pow_of_two(npages)); > > Must always call ib_umem_find_best_pgsz(), specify PAGE_SIZE as the argument > for this scenario Will fix. > > > + mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n", > > + npages, npages, order, PAGE_SHIFT); > > + > > + mr = alloc_mr_from_cache(pd, umem, virt_addr, length, npages, > > +PAGE
[PATCH v6 4/4] RDMA/mlx5: Support dma-buf based userspace memory region
Implement the new driver method 'reg_user_mr_dmabuf'. Utilize the core functions to import dma-buf based memory region and update the mappings. Add code to handle dma-buf related page fault. Signed-off-by: Jianxin Xiong Reviewed-by: Sean Hefty Acked-by: Michael J. Ruhl Acked-by: Christian Koenig Acked-by: Daniel Vetter --- drivers/infiniband/hw/mlx5/main.c| 2 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 11 drivers/infiniband/hw/mlx5/mr.c | 114 ++- drivers/infiniband/hw/mlx5/odp.c | 70 +++-- 4 files changed, 191 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 89e04ca..ec4ad2f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. */ #include @@ -4060,6 +4061,7 @@ static int mlx5_ib_enable_driver(struct ib_device *dev) .query_srq = mlx5_ib_query_srq, .query_ucontext = mlx5_ib_query_ucontext, .reg_user_mr = mlx5_ib_reg_user_mr, + .reg_user_mr_dmabuf = mlx5_ib_reg_user_mr_dmabuf, .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b1f2b34..4b72ff9 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. 
*/ #ifndef MLX5_IB_H @@ -639,6 +640,12 @@ static inline bool is_odp_mr(struct mlx5_ib_mr *mr) mr->umem->is_odp; } +static inline bool is_dmabuf_mr(struct mlx5_ib_mr *mr) +{ + return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && + mr->umem->is_dmabuf; +} + struct mlx5_ib_mw { struct ib_mwibmw; struct mlx5_core_mkey mmkey; @@ -1174,6 +1181,10 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, +u64 length, u64 virt_addr, +int fd, int access_flags, +struct ib_udata *udata); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index b261797..3bc412b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. + * Copyright (c) 2020, Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -36,6 +37,8 @@ #include #include #include +#include +#include #include #include #include @@ -1113,6 +1116,8 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); if (mr->umem->is_odp) { mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); + } else if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) { + memset(xlt, 0, size); } else { __mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx, npages, xlt, @@ -1462,6 +1467,111 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_PTR(err); } +static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) +{ + struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; + struct mlx5_ib_mr *mr = umem_dmabuf->device_context; + + mlx5_ib_update_xlt(mr, 0, mr->npages, PAGE_SHIFT, MLX5_IB_UPD_XLT_ZAP); + ib_umem_dmabuf_unmap_pages(umem_dmabuf); +} + +static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { + .allow_peer2peer = 1, + .move_notify = mlx5_ib_dmabuf_invalidate_cb, +}; + +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, +u64 length, u64 virt_addr, +int fd, int access_flag