On 10/12/2017 07:16 AM, Mike Kravetz wrote:
> Add new MAP_CONTIG flag to mmap system call.  Check for flag in normal
> mmap flag processing.  If present, pre-allocate a contiguous set of
> pages to back the mapping.  These pages will be used a fault time, and
> the MAP_CONTIG flag implies populating the mapping at the mmap time.
> 
> Signed-off-by: Mike Kravetz <mike.krav...@oracle.com>
> ---
>  include/uapi/asm-generic/mman.h |  1 +
>  mm/mmap.c                       | 94 
> +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 95 insertions(+)
> 
> diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
> index 7162cd4cca73..e8046b4c4ac4 100644
> --- a/include/uapi/asm-generic/mman.h
> +++ b/include/uapi/asm-generic/mman.h
> @@ -12,6 +12,7 @@
>  #define MAP_NONBLOCK 0x10000         /* do not block on IO */
>  #define MAP_STACK    0x20000         /* give out an address that is best 
> suited for process/thread stacks */
>  #define MAP_HUGETLB  0x40000         /* create a huge page mapping */
> +#define MAP_CONTIG   0x80000         /* back with contiguous pages */
>  
>  /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
>  
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 680506faceae..aee7917ee073 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -167,6 +167,16 @@ static struct vm_area_struct *remove_vma(struct 
> vm_area_struct *vma)
>  {
>       struct vm_area_struct *next = vma->vm_next;
>  
> +     if (vma->vm_flags & VM_CONTIG) {
> +             /*
> +              * Do any necessary clean up when freeing a vma backed
> +              * by a contiguous allocation.
> +              *
> +              * Not very useful in it's present form.
> +              */
> +             VM_BUG_ON(!vma->vm_private_data);
> +             vma->vm_private_data = NULL;
> +     }
>       might_sleep();
>       if (vma->vm_ops && vma->vm_ops->close)
>               vma->vm_ops->close(vma);
> @@ -1378,6 +1388,18 @@ unsigned long do_mmap(struct file *file, unsigned long 
> addr,
>       vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
>                       mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
>  
> +     /*
> +      * MAP_CONTIG has some restrictions,
> +      * and also implies additional mmap and vma flags.
> +      */
> +     if (flags & MAP_CONTIG) {
> +             if (!(flags & MAP_ANONYMOUS))
> +                     return -EINVAL;
> +
> +             flags |= MAP_POPULATE | MAP_LOCKED;
> +             vm_flags |= (VM_CONTIG | VM_LOCKED | VM_DONTEXPAND);
> +     }
> +
>       if (flags & MAP_LOCKED)
>               if (!can_do_mlock())
>                       return -EPERM;
> @@ -1547,6 +1569,71 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct 
> __user *, arg)
>  #endif /* __ARCH_WANT_SYS_OLD_MMAP */
>  
>  /*
> + * Attempt to allocate a contiguous range of pages to back the
> + * specified vma.  vm_private_data is used as a 'pointer' to the
> + * allocated pages.  Larger requests and more fragmented memory
> + * make the allocation more likely to fail.  So, caller must deal
> + * with this situation.
> + */
> +static long __alloc_vma_contig_range(struct vm_area_struct *vma)
> +{
> +     gfp_t gfp = GFP_HIGHUSER | __GFP_ZERO;

Would it be GFP_HIGHUSER_MOVABLE instead ? Why __GFP_ZERO ? If its
coming from Buddy, every thing should have already been zeroed out
in there. Am I missing something ?

> +     unsigned long order;
> +
> +     VM_BUG_ON_VMA(vma->vm_private_data != NULL, vma);
> +     order = get_order(vma->vm_end - vma->vm_start);
> +
> +     /*
> +      * FIXME - Incomplete implementation.  For now, just handle
> +      * allocations < MAX_ORDER in size.  However, this should really
> +      * handle arbitrary size allocations.
> +      */
> +     if (order >= MAX_ORDER)
> +             return -ENOMEM;
> +
> +     vma->vm_private_data = alloc_pages_vma(gfp, order, vma, vma->vm_start,
> +                                             numa_node_id(), false);

This is where I was experimenting for requests beyond MAX_ORDER
with alloc_contig_range().

> +     if (!vma->vm_private_data)
> +             return -ENOMEM;
> +
> +     /*
> +      * split large allocation so it can be treated as individual
> +      * pages when populating the mapping and at unmap time.
> +      */
> +     if (order) {
> +             unsigned long vma_pages = (vma->vm_end - vma->vm_start) /
> +                                                             PAGE_SIZE;
> +             unsigned long order_pages = 1 << order;
> +             unsigned long i;
> +             struct page *page = vma->vm_private_data;
> +
> +             split_page((struct page *)vma->vm_private_data, order);
> +
> +             /*
> +              * 'order' rounds up size of vma to next power of 2.  We
> +              * will not need/use the extra pages so free them now.
> +              */
> +             for (i = vma_pages; i < order_pages; i++)
> +                     put_page(page + i);

Interesting and this should be kept there a little longer if we would like
to support expansion of this VMA to the next power of 2 which we originally
allocated for any way.

> +     }
> +
> +     return 0;
> +}
> +
> +static void __free_vma_contig_range(struct vm_area_struct *vma)
> +{
> +     struct page *page = vma->vm_private_data;
> +     unsigned long n_pages = (vma->vm_end - vma->vm_start) / PAGE_SIZE;
> +     unsigned long i;
> +
> +     if (!page)
> +             return;
> +
> +     for (i = 0; i < n_pages; i++)
> +             put_page(page + i);
> +}
> +
> +/*
>   * Some shared mappigns will want the pages marked read-only
>   * to track write events. If so, we'll downgrade vm_page_prot
>   * to the private version (using protection_map[] without the
> @@ -1669,6 +1756,12 @@ unsigned long mmap_region(struct file *file, unsigned 
> long addr,
>       vma->vm_pgoff = pgoff;
>       INIT_LIST_HEAD(&vma->anon_vma_chain);
>  
> +     if (vm_flags & VM_CONTIG) {
> +             error = __alloc_vma_contig_range(vma);
> +             if (error)
> +                     goto free_vma;
> +     }
> +

You wanted to have this outside of mmap_sem lock right ?

Reply via email to