Until there is a solution to the dma-to-dax vs truncate problem it is
not safe to allow RDMA to create long standing memory registrations
against filesystem-dax vmas. Device-dax vmas do not have this problem and
are explicitly allowed.

This is temporary until a "memory registration with layout-lease"
mechanism can be implemented, and is limited to non-ODP (On Demand
Paging) capable RDMA devices.

Cc: Sean Hefty <sean.he...@intel.com>
Cc: Doug Ledford <dledf...@redhat.com>
Cc: Hal Rosenstock <hal.rosenst...@gmail.com>
Cc: Jeff Moyer <jmo...@redhat.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
Cc: Jason Gunthorpe <jguntho...@obsidianresearch.com>
Cc: <linux-r...@vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
---
 drivers/infiniband/core/umem.c |   49 +++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 21e60b1e2ff4..c30d286c1f24 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -147,19 +147,21 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
        umem->hugetlb   = 1;
 
        page_list = (struct page **) __get_free_page(GFP_KERNEL);
-       if (!page_list) {
-               put_pid(umem->pid);
-               kfree(umem);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!page_list)
+               goto err_pagelist;
 
        /*
-        * if we can't alloc the vma_list, it's not so bad;
-        * just assume the memory is not hugetlb memory
+        * If DAX is enabled we need the vma to protect against
+        * registering filesystem-dax memory. Otherwise we can tolerate
+        * a failure to allocate the vma_list and just assume that all
+        * vmas are not hugetlb-vmas.
         */
        vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
-       if (!vma_list)
+       if (!vma_list) {
+               if (IS_ENABLED(CONFIG_FS_DAX))
+                       goto err_vmalist;
                umem->hugetlb = 0;
+       }
 
        npages = ib_umem_num_pages(umem);
 
@@ -199,15 +201,34 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
                if (ret < 0)
                        goto out;
 
-               umem->npages += ret;
                cur_base += ret * PAGE_SIZE;
                npages   -= ret;
 
                for_each_sg(sg_list_start, sg, ret, i) {
-                       if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
-                               umem->hugetlb = 0;
+                       struct vm_area_struct *vma;
+                       struct inode *inode;
 
                        sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+                       umem->npages++;
+
+                       if (!vma_list)
+                               continue;
+                       vma = vma_list[i];
+
+                       if (!is_vm_hugetlb_page(vma))
+                               umem->hugetlb = 0;
+
+                       if (!vma_is_dax(vma))
+                               continue;
+
+                       /* device-dax is safe for rdma... */
+                       inode = file_inode(vma->vm_file);
+                       if (inode->i_mode == S_IFCHR)
+                               continue;
+
+                       /* ...filesystem-dax is not. */
+                       ret = -EOPNOTSUPP;
+                       goto out;
                }
 
                /* preparing for next loop */
@@ -242,6 +263,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
        free_page((unsigned long) page_list);
 
        return ret < 0 ? ERR_PTR(ret) : umem;
+err_vmalist:
+       free_page((unsigned long) page_list);
+err_pagelist:
+       put_pid(umem->pid);
+       kfree(umem);
+       return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(ib_umem_get);
 

Reply via email to