> 
> Page faults can race with fallocate hole punch.  If a page fault happens
> between the unmap and remove operations, the page is not removed and
> remains within the hole.  This is not the desired behavior.  The race
> is difficult to detect in user level code as even in the non-race
> case, a page within the hole could be faulted back in before fallocate
> returns.  If userfaultfd is expanded to support hugetlbfs in the future,
> this race will be easier to observe.
> 
> If this race is detected and a page is mapped, the remove operation
> (remove_inode_hugepages) will unmap the page before removing.  The unmap
> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
> so that no other faults will be processed until the page is removed.
> 
> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
> remove_inode_hugepages to satisfy the new reference.
> 
> Signed-off-by: Mike Kravetz <mike.krav...@oracle.com>
> ---
>  fs/hugetlbfs/inode.c | 139 
> +++++++++++++++++++++++++++------------------------
>  1 file changed, 73 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 0444760..0871d70 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
>       delete_from_page_cache(page);
>  }
> 
> +static inline void
> +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
> +{
> +     struct vm_area_struct *vma;
> +
> +     /*
> +      * end == 0 indicates that the entire range after
> +      * start should be unmapped.
> +      */
> +     vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {

[1] perhaps end can be reused.

> +             unsigned long v_offset;
> +
> +             /*
> +              * Can the expression below overflow on 32-bit arches?
> +              * No, because the interval tree returns us only those vmas
> +              * which overlap the truncated area starting at pgoff,
> +              * and no vma on a 32-bit arch can span beyond the 4GB.
> +              */
> +             if (vma->vm_pgoff < start)
> +                     v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
> +             else
> +                     v_offset = 0;
> +
> +             if (end) {
> +                     end = ((end - start) << PAGE_SHIFT) +
> +                            vma->vm_start + v_offset;

[2] end is input to be pgoff_t, but changed to be the type of v_offset.
Further we cannot handle the case that end is input to be zero.
See the diff below please.

> +                     if (end > vma->vm_end)
> +                             end = vma->vm_end;
> +             } else
> +                     end = vma->vm_end;
> +
> +             unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> +     }
> +}
> +
> 
>  /*
>   * remove_inode_hugepages handles two distinct cases: truncation and hole
>   * punch.  There are subtle differences in operation for each case.
> -
> + *
>   * truncation is indicated by end of range being LLONG_MAX
>   *   In this case, we first scan the range and release found pages.
>   *   After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
> @@ -379,6 +414,7 @@ static void remove_inode_hugepages(struct inode *inode, 
> loff_t lstart,
> 
>               for (i = 0; i < pagevec_count(&pvec); ++i) {
>                       struct page *page = pvec.pages[i];
> +                     bool rsv_on_error;
>                       u32 hash;
> 
>                       /*
> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, 
> loff_t lstart,
>                                                       mapping, next, 0);
>                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
> 
> -                     lock_page(page);
> -                     if (likely(!page_mapped(page))) {
> -                             bool rsv_on_error = !PagePrivate(page);
> -                             /*
> -                              * We must free the huge page and remove
> -                              * from page cache (remove_huge_page) BEFORE
> -                              * removing the region/reserve map
> -                              * (hugetlb_unreserve_pages).  In rare out
> -                              * of memory conditions, removal of the
> -                              * region/reserve map could fail.  Before
> -                              * free'ing the page, note PagePrivate which
> -                              * is used in case of error.
> -                              */
> -                             remove_huge_page(page);
> -                             freed++;
> -                             if (!truncate_op) {
> -                                     if (unlikely(hugetlb_unreserve_pages(
> -                                                     inode, next,
> -                                                     next + 1, 1)))
> -                                             hugetlb_fix_reserve_counts(
> -                                                     inode, rsv_on_error);
> -                             }
> -                     } else {
> -                             /*
> -                              * If page is mapped, it was faulted in after
> -                              * being unmapped.  It indicates a race between
> -                              * hole punch and page fault.  Do nothing in
> -                              * this case.  Getting here in a truncate
> -                              * operation is a bug.
> -                              */
> +                     /*
> +                      * If page is mapped, it was faulted in after being
> +                      * unmapped in caller.  Unmap (again) now after taking
> +                      * the fault mutex.  The mutex will prevent faults
> +                      * until we finish removing the page.
> +                      *
> +                      * This race can only happen in the hole punch case.
> +                      * Getting here in a truncate operation is a bug.
> +                      */
> +                     if (unlikely(page_mapped(page))) {
>                               BUG_ON(truncate_op);
> +
> +                             i_mmap_lock_write(mapping);
> +                             hugetlb_vmdelete_list(&mapping->i_mmap,
> +                                     next * pages_per_huge_page(h),
> +                                     (next + 1) * pages_per_huge_page(h));
> +                             i_mmap_unlock_write(mapping);
> +                     }
> +
> +                     lock_page(page);
> +                     /*
> +                      * We must free the huge page and remove from page
> +                      * cache (remove_huge_page) BEFORE removing the
> +                      * region/reserve map (hugetlb_unreserve_pages).  In
> +                      * rare out of memory conditions, removal of the
> +                      * region/reserve map could fail.  Before free'ing
> +                      * the page, note PagePrivate which is used in case
> +                      * of error.
> +                      */
> +                     rsv_on_error = !PagePrivate(page);
> +                     remove_huge_page(page);
> +                     freed++;
> +                     if (!truncate_op) {
> +                             if (unlikely(hugetlb_unreserve_pages(inode,
> +                                                     next, next + 1, 1)))
> +                                     hugetlb_fix_reserve_counts(inode,
> +                                                             rsv_on_error);
>                       }
> 
>                       unlock_page(page);
> @@ -452,41 +494,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
>       clear_inode(inode);
>  }
> 
> -static inline void
> -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
> -{
> -     struct vm_area_struct *vma;
> -
> -     /*
> -      * end == 0 indicates that the entire range after
> -      * start should be unmapped.
> -      */
> -     vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> -             unsigned long v_offset;
> -
> -             /*
> -              * Can the expression below overflow on 32-bit arches?
> -              * No, because the interval tree returns us only those vmas
> -              * which overlap the truncated area starting at pgoff,
> -              * and no vma on a 32-bit arch can span beyond the 4GB.
> -              */
> -             if (vma->vm_pgoff < start)
> -                     v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
> -             else
> -                     v_offset = 0;
> -
> -             if (end) {
> -                     end = ((end - start) << PAGE_SHIFT) +
> -                            vma->vm_start + v_offset;
> -                     if (end > vma->vm_end)
> -                             end = vma->vm_end;
> -             } else
> -                     end = vma->vm_end;
> -
> -             unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> -     }
> -}
> -
>  static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
>  {
>       pgoff_t pgoff;
> --
> 2.4.3
> 

--- a/fs/hugetlbfs/inode.c      Thu Jan  7 15:04:35 2016
+++ b/fs/hugetlbfs/inode.c      Thu Jan  7 15:31:03 2016
@@ -461,8 +461,11 @@ hugetlb_vmdelete_list(struct rb_root *ro
         * end == 0 indicates that the entire range after
         * start should be unmapped.
         */
-       vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+       if (!end)
+               end = ULONG_MAX;
+       vma_interval_tree_foreach(vma, root, start, end) {
                unsigned long v_offset;
+               unsigned long v_end;
 
                /*
                 * Can the expression below overflow on 32-bit arches?
@@ -475,15 +478,12 @@ hugetlb_vmdelete_list(struct rb_root *ro
                else
                        v_offset = 0;
 
-               if (end) {
-                       end = ((end - start) << PAGE_SHIFT) +
+               v_end = ((end - start) << PAGE_SHIFT) +
                               vma->vm_start + v_offset;
-                       if (end > vma->vm_end)
-                               end = vma->vm_end;
-               } else
-                       end = vma->vm_end;
+               if (v_end > vma->vm_end)
+                       v_end = vma->vm_end;
 
-               unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
+               unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, 
NULL);
        }
 }
 
--

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to