On 07/18/23 01:26, Vivek Kasireddy wrote:
> A user or admin can configure a VMM (Qemu) Guest's memory to be
> backed by hugetlb pages for various reasons. However, a Guest OS
> would still allocate (and pin) buffers that are backed by regular
> 4k-sized pages. In order to map these buffers and create dma-bufs
> for them on the Host, we first need to find the hugetlb pages where
> the buffer allocations are located, determine the offsets of the
> individual chunks within those pages, and use this information to
> populate a scatterlist.
> 
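
Aside, to picture the representation the patch builds (names from the patch,
numbers made up): a 16k buffer that starts 8k into a 2 MB huge page H is
recorded as four PAGE_SIZE chunks, all referencing the head page:

	pages[]   = { H,    H,     H,     H     }
	offsets[] = { 8192, 12288, 16384, 20480 }

Each scatterlist entry then becomes (H, PAGE_SIZE, offsets[i]).
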
> Testcase: default_hugepagesz=2M hugepagesz=2M hugepages=2500 options
> were passed to the Host kernel and Qemu was launched with these
> relevant options: qemu-system-x86_64 -m 4096m....
> -device virtio-gpu-pci,max_outputs=1,blob=true,xres=1920,yres=1080
> -display gtk,gl=on
> -object memory-backend-memfd,hugetlb=on,id=mem1,size=4096M
> -machine memory-backend=mem1
> 
> Replacing -display gtk,gl=on with -display gtk,gl=off above would
> exercise the mmap handler.
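
For anyone wanting to poke at the hugetlb path without Qemu, a minimal
userspace sequence along these lines should work (a sketch only, with error
handling omitted; note the driver wants F_SEAL_SHRINK set and F_SEAL_WRITE
clear on the memfd):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/udmabuf.h>

	int main(void)
	{
		size_t size = 2 * 2048 * 1024;	/* two 2 MB huge pages */
		int memfd, devfd, dmabuf;

		memfd = memfd_create("test", MFD_HUGETLB | MFD_ALLOW_SEALING);
		/* hugetlb pages must already be in the page cache, so
		 * allocate them up front (cf. find_get_page_flags() below) */
		fallocate(memfd, 0, 0, size);
		fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);

		devfd = open("/dev/udmabuf", O_RDWR);
		struct udmabuf_create create = {
			.memfd  = memfd,
			.flags  = UDMABUF_FLAGS_CLOEXEC,
			.offset = 0,
			.size   = size,
		};
		dmabuf = ioctl(devfd, UDMABUF_CREATE, &create);

		/* faulting this mapping exercises udmabuf_vm_fault() */
		mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
		     dmabuf, 0);
		return 0;
	}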
> 
> v2: Updated get_sg_table() to manually populate the scatterlist for
>     both huge page and non-huge-page cases.
> 
> Cc: David Hildenbrand <da...@redhat.com>
> Cc: Mike Kravetz <mike.krav...@oracle.com>
> Cc: Hugh Dickins <hu...@google.com>
> Cc: Peter Xu <pet...@redhat.com>
> Cc: Jason Gunthorpe <j...@nvidia.com>
> Cc: Gerd Hoffmann <kra...@redhat.com>
> Cc: Dongwon Kim <dongwon....@intel.com>
> Cc: Junxiao Chang <junxiao.ch...@intel.com>
> Signed-off-by: Vivek Kasireddy <vivek.kasire...@intel.com>
> ---
>  drivers/dma-buf/udmabuf.c | 84 +++++++++++++++++++++++++++++++++------
>  1 file changed, 71 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> index 820c993c8659..10c47bf77fb5 100644
> --- a/drivers/dma-buf/udmabuf.c
> +++ b/drivers/dma-buf/udmabuf.c
> @@ -10,6 +10,7 @@
>  #include <linux/miscdevice.h>
>  #include <linux/module.h>
>  #include <linux/shmem_fs.h>
> +#include <linux/hugetlb.h>
>  #include <linux/slab.h>
>  #include <linux/udmabuf.h>
>  #include <linux/vmalloc.h>
> @@ -28,6 +29,7 @@ struct udmabuf {
>       struct page **pages;
>       struct sg_table *sg;
>       struct miscdevice *device;
> +     pgoff_t *offsets;
>  };
>  
>  static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> @@ -41,6 +43,10 @@ static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
>               return VM_FAULT_SIGBUS;
>  
>       pfn = page_to_pfn(ubuf->pages[pgoff]);
> +     if (ubuf->offsets) {
> +             pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
> +     }
> +
>       return vmf_insert_pfn(vma, vmf->address, pfn);
>  }
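
The arithmetic is worth spelling out once (illustrative, with n being a 4k
chunk's index within its huge page): offsets[] holds byte offsets, and a
compound huge page is physically contiguous, so for a head page at pfn P:

	offsets[pgoff] = n << PAGE_SHIFT;	/* stored by udmabuf_create */

	pfn = page_to_pfn(head_page);		/* P                       */
	pfn += offsets[pgoff] >> PAGE_SHIFT;	/* P + n, the 4k sub-page  */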
>  
> @@ -90,23 +96,31 @@ static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf,
>  {
>       struct udmabuf *ubuf = buf->priv;
>       struct sg_table *sg;
> +     struct scatterlist *sgl;
> +     pgoff_t offset;
> +     unsigned long i = 0;
>       int ret;
>  
>       sg = kzalloc(sizeof(*sg), GFP_KERNEL);
>       if (!sg)
>               return ERR_PTR(-ENOMEM);
> -     ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
> -                                     0, ubuf->pagecount << PAGE_SHIFT,
> -                                     GFP_KERNEL);
> +
> +     ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
>       if (ret < 0)
> -             goto err;
> +             goto err_alloc;
> +
> +     for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) {
> +             offset = ubuf->offsets ? ubuf->offsets[i] : 0;
> +             sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, offset);
> +     }
>       ret = dma_map_sgtable(dev, sg, direction, 0);
>       if (ret < 0)
> -             goto err;
> +             goto err_map;
>       return sg;
>  
> -err:
> +err_map:
>       sg_free_table(sg);
> +err_alloc:
>       kfree(sg);
>       return ERR_PTR(ret);
>  }
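
Side note on the v2 approach: sg_alloc_table_from_pages() only takes a
single starting offset and coalesces contiguous pages, so it can't carry a
per-chunk offset; hence the manual population. For a chunk that sits 5
pages into a 2 MB huge page, an entry ends up as (illustrative):

	/* head page, one 4k chunk, byte offset into the compound page */
	sg_set_page(sgl, hpage, PAGE_SIZE, 5 << PAGE_SHIFT);
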
> @@ -143,6 +157,7 @@ static void release_udmabuf(struct dma_buf *buf)
>  
>       for (pg = 0; pg < ubuf->pagecount; pg++)
>               put_page(ubuf->pages[pg]);
> +     kfree(ubuf->offsets);
>       kfree(ubuf->pages);
>       kfree(ubuf);
>  }
> @@ -206,7 +221,9 @@ static long udmabuf_create(struct miscdevice *device,
>       struct udmabuf *ubuf;
>       struct dma_buf *buf;
>       pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
> -     struct page *page;
> +     struct page *page, *hpage = NULL;
> +     pgoff_t hpoff, chunkoff, maxchunks;
> +     struct hstate *hpstate;
>       int seals, ret = -EINVAL;
>       u32 i, flags;
>  
> @@ -242,7 +259,7 @@ static long udmabuf_create(struct miscdevice *device,
>               if (!memfd)
>                       goto err;
>               mapping = memfd->f_mapping;
> -             if (!shmem_mapping(mapping))
> +             if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
>                       goto err;
>               seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
>               if (seals == -EINVAL)
> @@ -253,16 +270,56 @@ static long udmabuf_create(struct miscdevice *device,
>                       goto err;
>               pgoff = list[i].offset >> PAGE_SHIFT;
>               pgcnt = list[i].size   >> PAGE_SHIFT;
> +             if (is_file_hugepages(memfd)) {
> +                     if (!ubuf->offsets) {
> +                             ubuf->offsets = kmalloc_array(ubuf->pagecount,
> +                                                           sizeof(*ubuf->offsets),
> +                                                           GFP_KERNEL);
> +                             if (!ubuf->offsets) {
> +                                     ret = -ENOMEM;
> +                                     goto err;
> +                             }
> +                     }
> +                     hpstate = hstate_file(memfd);
> +                     hpoff = list[i].offset >> huge_page_shift(hpstate);
> +                     chunkoff = (list[i].offset &
> +                                 ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
> +                     maxchunks = huge_page_size(hpstate) >> PAGE_SHIFT;
> +             }
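
To make the index math concrete (made-up numbers): with 2 MB huge pages and
list[i].offset = 5 MB,

	hpoff     = 5M >> 21              = 2	/* third huge page       */
	chunkoff  = (5M & (2M - 1)) >> 12 = 256	/* 1 MB in, in 4k units  */
	maxchunks = 2M >> 12              = 512	/* 4k chunks per 2 MB    */

so the first chunk comes from the middle of huge page 2, and the loop below
advances to huge page 3 once chunkoff reaches 512.
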
>               for (pgidx = 0; pgidx < pgcnt; pgidx++) {
> -                     page = shmem_read_mapping_page(mapping, pgoff + pgidx);
> -                     if (IS_ERR(page)) {
> -                             ret = PTR_ERR(page);
> -                             goto err;
> +                     if (is_file_hugepages(memfd)) {
> +                             if (!hpage) {
> +                                     hpage = find_get_page_flags(mapping, hpoff,
> +                                                                 FGP_ACCESSED);
> +                                     if (!hpage) {
> +                                             ret = -EINVAL;
> +                                             goto err;
> +                                     }
> +                             }
> +                             get_page(hpage);

Is the intention to increase the ref count of the hugetlb page once for
each 'sub-page' added? Or am I reading that incorrectly?

> +                             ubuf->pages[pgbuf] = hpage;

Ah, answering my own question: since the 'head page' is added to the
array, the ref count of the head page will be decremented in
release_udmabuf(). (The extra reference taken by find_get_page_flags()
is dropped by the put_page() calls when chunkoff wraps or after the loop.)

> +                             ubuf->offsets[pgbuf++] = chunkoff << PAGE_SHIFT;
> +                             if (++chunkoff == maxchunks) {
> +                                     put_page(hpage);
> +                                     hpage = NULL;
> +                                     chunkoff = 0;
> +                                     hpoff++;
> +                             }
> +                     } else {
> +                             page = shmem_read_mapping_page(mapping, pgoff + pgidx);

It may not matter to your users, but the semantics for hugetlb and shmem
pages differ: hugetlb requires that the pages already exist in the page
cache, while shmem will create/add pages to the cache if necessary.
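
Spelled out side by side (comments mine):

	/* shmem: page-cache lookup that allocates and inserts the page
	 * if it isn't already present */
	page = shmem_read_mapping_page(mapping, pgoff + pgidx);

	/* hugetlb path above: pure lookup, returns NULL (-EINVAL here)
	 * if the huge page was never allocated/faulted in */
	hpage = find_get_page_flags(mapping, hpoff, FGP_ACCESSED);

So users have to populate the hugetlb pages up front, e.g. with fallocate()
on the memfd, before creating the udmabuf.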

> +                             if (IS_ERR(page)) {
> +                                     ret = PTR_ERR(page);
> +                                     goto err;
> +                             }
> +                             ubuf->pages[pgbuf++] = page;
>                       }
> -                     ubuf->pages[pgbuf++] = page;
>               }
>               fput(memfd);
>               memfd = NULL;
> +             if (hpage) {
> +                     put_page(hpage);
> +                     hpage = NULL;
> +             }
>       }
>  
>       exp_info.ops  = &udmabuf_ops;
> @@ -287,6 +344,7 @@ static long udmabuf_create(struct miscdevice *device,
>               put_page(ubuf->pages[--pgbuf]);
>       if (memfd)
>               fput(memfd);
> +     kfree(ubuf->offsets);
>       kfree(ubuf->pages);
>       kfree(ubuf);
>       return ret;
> -- 
> 2.39.2
> 

Nothing else stands out,

Acked-by: Mike Kravetz <mike.krav...@oracle.com>

I see there is an RFC for the coherency issue with hole punch.
-- 
Mike Kravetz
