Hi Mike,

> 
> On 07/18/23 01:26, Vivek Kasireddy wrote:
> > A user or admin can configure a VMM (Qemu) Guest's memory to be
> > backed by hugetlb pages for various reasons. However, a Guest OS
> > would still allocate (and pin) buffers that are backed by regular
> > 4k sized pages. In order to map these buffers and create dma-bufs
> > for them on the Host, we first need to find the hugetlb pages where
> > the buffer allocations are located and then determine the offsets
> > of individual chunks (within those pages) and use this information
> > to eventually populate a scatterlist.
> >
> > Testcase: default_hugepagesz=2M hugepagesz=2M hugepages=2500
> options
> > were passed to the Host kernel and Qemu was launched with these
> > relevant options: qemu-system-x86_64 -m 4096m....
> > -device virtio-gpu-pci,max_outputs=1,blob=true,xres=1920,yres=1080
> > -display gtk,gl=on
> > -object memory-backend-memfd,hugetlb=on,id=mem1,size=4096M
> > -machine memory-backend=mem1
> >
> > Replacing -display gtk,gl=on with -display gtk,gl=off above would
> > exercise the mmap handler.
> >
> > v2: Updated get_sg_table() to manually populate the scatterlist for
> >     both huge page and non-huge-page cases.
> >
> > Cc: David Hildenbrand <da...@redhat.com>
> > Cc: Mike Kravetz <mike.krav...@oracle.com>
> > Cc: Hugh Dickins <hu...@google.com>
> > Cc: Peter Xu <pet...@redhat.com>
> > Cc: Jason Gunthorpe <j...@nvidia.com>
> > Cc: Gerd Hoffmann <kra...@redhat.com>
> > Cc: Dongwon Kim <dongwon....@intel.com>
> > Cc: Junxiao Chang <junxiao.ch...@intel.com>
> > Signed-off-by: Vivek Kasireddy <vivek.kasire...@intel.com>
> > ---
> >  drivers/dma-buf/udmabuf.c | 84 +++++++++++++++++++++++++++++++++--
> ----
> >  1 file changed, 71 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> > index 820c993c8659..10c47bf77fb5 100644
> > --- a/drivers/dma-buf/udmabuf.c
> > +++ b/drivers/dma-buf/udmabuf.c
> > @@ -10,6 +10,7 @@
> >  #include <linux/miscdevice.h>
> >  #include <linux/module.h>
> >  #include <linux/shmem_fs.h>
> > +#include <linux/hugetlb.h>
> >  #include <linux/slab.h>
> >  #include <linux/udmabuf.h>
> >  #include <linux/vmalloc.h>
> > @@ -28,6 +29,7 @@ struct udmabuf {
> >     struct page **pages;
> >     struct sg_table *sg;
> >     struct miscdevice *device;
> > +   pgoff_t *offsets;
> >  };
> >
> >  static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> > @@ -41,6 +43,10 @@ static vm_fault_t udmabuf_vm_fault(struct vm_fault
> *vmf)
> >             return VM_FAULT_SIGBUS;
> >
> >     pfn = page_to_pfn(ubuf->pages[pgoff]);
> > +   if (ubuf->offsets) {
> > +           pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
> > +   }
> > +
> >     return vmf_insert_pfn(vma, vmf->address, pfn);
> >  }
> >
> > @@ -90,23 +96,31 @@ static struct sg_table *get_sg_table(struct device
> *dev, struct dma_buf *buf,
> >  {
> >     struct udmabuf *ubuf = buf->priv;
> >     struct sg_table *sg;
> > +   struct scatterlist *sgl;
> > +   pgoff_t offset;
> > +   unsigned long i = 0;
> >     int ret;
> >
> >     sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> >     if (!sg)
> >             return ERR_PTR(-ENOMEM);
> > -   ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
> > -                                   0, ubuf->pagecount << PAGE_SHIFT,
> > -                                   GFP_KERNEL);
> > +
> > +   ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
> >     if (ret < 0)
> > -           goto err;
> > +           goto err_alloc;
> > +
> > +   for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) {
> > +           offset = ubuf->offsets ? ubuf->offsets[i] : 0;
> > +           sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, offset);
> > +   }
> >     ret = dma_map_sgtable(dev, sg, direction, 0);
> >     if (ret < 0)
> > -           goto err;
> > +           goto err_map;
> >     return sg;
> >
> > -err:
> > +err_map:
> >     sg_free_table(sg);
> > +err_alloc:
> >     kfree(sg);
> >     return ERR_PTR(ret);
> >  }
> > @@ -143,6 +157,7 @@ static void release_udmabuf(struct dma_buf *buf)
> >
> >     for (pg = 0; pg < ubuf->pagecount; pg++)
> >             put_page(ubuf->pages[pg]);
> > +   kfree(ubuf->offsets);
> >     kfree(ubuf->pages);
> >     kfree(ubuf);
> >  }
> > @@ -206,7 +221,9 @@ static long udmabuf_create(struct miscdevice
> *device,
> >     struct udmabuf *ubuf;
> >     struct dma_buf *buf;
> >     pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
> > -   struct page *page;
> > +   struct page *page, *hpage = NULL;
> > +   pgoff_t hpoff, chunkoff, maxchunks;
> > +   struct hstate *hpstate;
> >     int seals, ret = -EINVAL;
> >     u32 i, flags;
> >
> > @@ -242,7 +259,7 @@ static long udmabuf_create(struct miscdevice
> *device,
> >             if (!memfd)
> >                     goto err;
> >             mapping = memfd->f_mapping;
> > -           if (!shmem_mapping(mapping))
> > +           if (!shmem_mapping(mapping) &&
> !is_file_hugepages(memfd))
> >                     goto err;
> >             seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
> >             if (seals == -EINVAL)
> > @@ -253,16 +270,56 @@ static long udmabuf_create(struct miscdevice
> *device,
> >                     goto err;
> >             pgoff = list[i].offset >> PAGE_SHIFT;
> >             pgcnt = list[i].size   >> PAGE_SHIFT;
> > +           if (is_file_hugepages(memfd)) {
> > +                   if (!ubuf->offsets) {
> > +                           ubuf->offsets = kmalloc_array(ubuf-
> >pagecount,
> > +                                                         sizeof(*ubuf-
> >offsets),
> > +                                                         GFP_KERNEL);
> > +                           if (!ubuf->offsets) {
> > +                                   ret = -ENOMEM;
> > +                                   goto err;
> > +                           }
> > +                   }
> > +                   hpstate = hstate_file(memfd);
> > +                   hpoff = list[i].offset >> huge_page_shift(hpstate);
> > +                   chunkoff = (list[i].offset &
> > +                               ~huge_page_mask(hpstate)) >>
> PAGE_SHIFT;
> > +                   maxchunks = huge_page_size(hpstate) >>
> PAGE_SHIFT;
> > +           }
> >             for (pgidx = 0; pgidx < pgcnt; pgidx++) {
> > -                   page = shmem_read_mapping_page(mapping, pgoff
> + pgidx);
> > -                   if (IS_ERR(page)) {
> > -                           ret = PTR_ERR(page);
> > -                           goto err;
> > +                   if (is_file_hugepages(memfd)) {
> > +                           if (!hpage) {
> > +                                   hpage =
> find_get_page_flags(mapping, hpoff,
> > +
> FGP_ACCESSED);
> > +                                   if (!hpage) {
> > +                                           ret = -EINVAL;
> > +                                           goto err;
> > +                                   }
> > +                           }
> > +                           get_page(hpage);
> 
> Is the intention to increase the ref count of the hugetlb page once for
> each 'sub-page' added?  Or, am I reading that incorrectly?
Yes, that's the intention; I figured it makes sense to do it that way.

> 
> > +                           ubuf->pages[pgbuf] = hpage;
> 
> Ah!, answering my own question.  Since the 'head page' is added to the
> array the ref count of the head page will decremented in
> release_udmabuf.
Yes, that's right.

> 
> > +                           ubuf->offsets[pgbuf++] = chunkoff <<
> PAGE_SHIFT;
> > +                           if (++chunkoff == maxchunks) {
> > +                                   put_page(hpage);
> > +                                   hpage = NULL;
> > +                                   chunkoff = 0;
> > +                                   hpoff++;
> > +                           }
> > +                   } else {
> > +                           page =
> shmem_read_mapping_page(mapping, pgoff + pgidx);
> 
> It may not matter to your users, but the semantics for hugetlb and shmem
> pages is different.  hugetlb requires the pages exist in the page cache
> while shmem will create/add pages to the cache if necessary.
Ok; got it.

> 
> > +                           if (IS_ERR(page)) {
> > +                                   ret = PTR_ERR(page);
> > +                                   goto err;
> > +                           }
> > +                           ubuf->pages[pgbuf++] = page;
> >                     }
> > -                   ubuf->pages[pgbuf++] = page;
> >             }
> >             fput(memfd);
> >             memfd = NULL;
> > +           if (hpage) {
> > +                   put_page(hpage);
> > +                   hpage = NULL;
> > +           }
> >     }
> >
> >     exp_info.ops  = &udmabuf_ops;
> > @@ -287,6 +344,7 @@ static long udmabuf_create(struct miscdevice
> *device,
> >             put_page(ubuf->pages[--pgbuf]);
> >     if (memfd)
> >             fput(memfd);
> > +   kfree(ubuf->offsets);
> >     kfree(ubuf->pages);
> >     kfree(ubuf);
> >     return ret;
> > --
> > 2.39.2
> >
> 
> Nothing else stands out,
> 
> Acked-by: Mike Kravetz <mike.krav...@oracle.com>
Thank you for the quick review!

> 
> I see there is a RFC for the coherency issue with hole punch.
Yes.

Thanks,
Vivek

> --
> Mike Kravetz

Reply via email to