On 01/07/2016 12:06 AM, Hillf Danton wrote:
>>
>> Page faults can race with fallocate hole punch.  If a page fault happens
>> between the unmap and remove operations, the page is not removed and
>> remains within the hole.  This is not the desired behavior.  The race
>> is difficult to detect in user-level code as, even in the non-race
>> case, a page within the hole could be faulted back in before fallocate
>> returns.  If userfaultfd is expanded to support hugetlbfs in the future,
>> this race will be easier to observe.
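
To make the race concrete: an (untested) userspace sketch along these
lines should exercise the window, assuming a kernel with the hugetlbfs
fallocate support this series builds on.  The hugetlbfs mount point,
file name, and 2MB huge page size below are assumptions; adjust for
the local configuration.  Build with: cc race.c -lpthread

#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)

static volatile char *map;

/* keep faulting the page back in while the main thread punches holes */
static void *toucher(void *arg)
{
	int i;

	for (i = 0; i < 1000000; i++)
		map[0] = 1;
	return NULL;
}

int main(void)
{
	int fd = open("/dev/hugepages/race-test", O_CREAT | O_RDWR, 0600);
	pthread_t t;
	int i;

	if (fd < 0 || ftruncate(fd, HPAGE_SIZE))
		return 1;
	map = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	map[0] = 1;		/* populate the huge page */
	pthread_create(&t, NULL, toucher, NULL);

	/* each punch can race with a fault from the toucher thread */
	for (i = 0; i < 1000000; i++)
		fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			  0, HPAGE_SIZE);

	pthread_join(t, NULL);
	unlink("/dev/hugepages/race-test");
	return 0;
}

Note that, as described above, the toucher can also legitimately fault
the page back in after fallocate() returns, so seeing a mapped page
afterwards does not by itself prove the race; it only makes the window
easier to hit.
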
>>
>> If this race is detected and a page is mapped, the remove operation
>> (remove_inode_hugepages) will unmap the page before removing.  The unmap
>> within remove_inode_hugepages occurs with the hugetlb_fault_mutex held
>> so that no other faults will be processed until the page is removed.
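
Schematically, the fix makes both paths contend on the same per-page
mutex, so a fault can no longer slip in between the unmap and the
removal.  The following is just an (untested) userspace model of that
ordering, not kernel code; a plain pthread mutex stands in for the
hugetlb_fault_mutex_table entry, and fault_path/remove_path are
hypothetical names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the page state; not the real kernel structures */
static pthread_mutex_t fault_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool page_in_cache = true;
static bool page_mapped = true;		/* faulted in after the unmap */

static void *fault_path(void *arg)
{
	/* models the fault path taking hugetlb_fault_mutex_table[hash] */
	pthread_mutex_lock(&fault_mutex);
	if (page_in_cache)
		page_mapped = true;	/* page faulted back in */
	pthread_mutex_unlock(&fault_mutex);
	return NULL;
}

static void remove_path(void)
{
	/* models remove_inode_hugepages() holding the same mutex */
	pthread_mutex_lock(&fault_mutex);
	if (page_mapped)
		page_mapped = false;	/* hugetlb_vmdelete_list() */
	page_in_cache = false;		/* remove_huge_page() */
	pthread_mutex_unlock(&fault_mutex);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, fault_path, NULL);
	remove_path();
	pthread_join(t, NULL);
	/*
	 * Whichever side wins the mutex, the page cannot end up both
	 * removed from the cache and still mapped.
	 */
	printf("in_cache=%d mapped=%d\n", page_in_cache, page_mapped);
	return 0;
}
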
>>
>> The (unmodified) routine hugetlb_vmdelete_list was moved ahead of
>> remove_inode_hugepages to satisfy the new reference.
>>
>> Signed-off-by: Mike Kravetz <mike.krav...@oracle.com>
>> ---
>>  fs/hugetlbfs/inode.c | 139 +++++++++++++++++++++++++++------------------------
>>  1 file changed, 73 insertions(+), 66 deletions(-)
>>
>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>> index 0444760..0871d70 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -324,11 +324,46 @@ static void remove_huge_page(struct page *page)
>>      delete_from_page_cache(page);
>>  }
>>
>> +static inline void
>> +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
>> +{
>> +    struct vm_area_struct *vma;
>> +
>> +    /*
>> +     * end == 0 indicates that the entire range after
>> +     * start should be unmapped.
>> +     */
>> +    vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> 
> [1] Perhaps end can be set once up front and reused here; see the diff below.
> 
>> +            unsigned long v_offset;
>> +
>> +            /*
>> +             * Can the expression below overflow on 32-bit arches?
>> +             * No, because the interval tree returns us only those vmas
>> +             * which overlap the truncated area starting at pgoff,
>> +             * and no vma on a 32-bit arch can span beyond the 4GB.
>> +             */
>> +            if (vma->vm_pgoff < start)
>> +                    v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
>> +            else
>> +                    v_offset = 0;
>> +
>> +            if (end) {
>> +                    end = ((end - start) << PAGE_SHIFT) +
>> +                           vma->vm_start + v_offset;
> 
> [2] end is passed in as a pgoff_t, but here it is overwritten with a
> virtual address (the type of v_offset), corrupting it for subsequent
> loop iterations.  Further, once overwritten we can no longer handle
> the case where end was passed in as zero.
> See the diff below, please.

Thanks Hillf.

This bug is part of the existing code.  I did not modify the current
hugetlb_vmdelete_list code, just moved it as part of this patch.
Therefore, I will create a separate patch to fix this issue.

-- 
Mike Kravetz

> 
>> +                    if (end > vma->vm_end)
>> +                            end = vma->vm_end;
>> +            } else
>> +                    end = vma->vm_end;
>> +
>> +            unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
>> +    }
>> +}
>> +
>>
>>  /*
>>   * remove_inode_hugepages handles two distinct cases: truncation and hole
>>   * punch.  There are subtle differences in operation for each case.
>> -
>> + *
>>   * truncation is indicated by end of range being LLONG_MAX
>>   *  In this case, we first scan the range and release found pages.
>>   *  After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
>> @@ -379,6 +414,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>>
>>              for (i = 0; i < pagevec_count(&pvec); ++i) {
>>                      struct page *page = pvec.pages[i];
>> +                    bool rsv_on_error;
>>                      u32 hash;
>>
>>                      /*
>> @@ -395,37 +431,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
>>                                                      mapping, next, 0);
>>                      mutex_lock(&hugetlb_fault_mutex_table[hash]);
>>
>> -                    lock_page(page);
>> -                    if (likely(!page_mapped(page))) {
>> -                            bool rsv_on_error = !PagePrivate(page);
>> -                            /*
>> -                             * We must free the huge page and remove
>> -                             * from page cache (remove_huge_page) BEFORE
>> -                             * removing the region/reserve map
>> -                             * (hugetlb_unreserve_pages).  In rare out
>> -                             * of memory conditions, removal of the
>> -                             * region/reserve map could fail.  Before
>> -                             * free'ing the page, note PagePrivate which
>> -                             * is used in case of error.
>> -                             */
>> -                            remove_huge_page(page);
>> -                            freed++;
>> -                            if (!truncate_op) {
>> -                                    if (unlikely(hugetlb_unreserve_pages(
>> -                                                    inode, next,
>> -                                                    next + 1, 1)))
>> -                                            hugetlb_fix_reserve_counts(
>> -                                                    inode, rsv_on_error);
>> -                            }
>> -                    } else {
>> -                            /*
>> -                             * If page is mapped, it was faulted in after
>> -                             * being unmapped.  It indicates a race between
>> -                             * hole punch and page fault.  Do nothing in
>> -                             * this case.  Getting here in a truncate
>> -                             * operation is a bug.
>> -                             */
>> +                    /*
>> +                     * If page is mapped, it was faulted in after being
>> +                     * unmapped in caller.  Unmap (again) now after taking
>> +                     * the fault mutex.  The mutex will prevent faults
>> +                     * until we finish removing the page.
>> +                     *
>> +                     * This race can only happen in the hole punch case.
>> +                     * Getting here in a truncate operation is a bug.
>> +                     */
>> +                    if (unlikely(page_mapped(page))) {
>>                              BUG_ON(truncate_op);
>> +
>> +                            i_mmap_lock_write(mapping);
>> +                            hugetlb_vmdelete_list(&mapping->i_mmap,
>> +                                    next * pages_per_huge_page(h),
>> +                                    (next + 1) * pages_per_huge_page(h));
>> +                            i_mmap_unlock_write(mapping);
>> +                    }
>> +
>> +                    lock_page(page);
>> +                    /*
>> +                     * We must free the huge page and remove from page
>> +                     * cache (remove_huge_page) BEFORE removing the
>> +                     * region/reserve map (hugetlb_unreserve_pages).  In
>> +                     * rare out of memory conditions, removal of the
>> +                     * region/reserve map could fail.  Before free'ing
>> +                     * the page, note PagePrivate which is used in case
>> +                     * of error.
>> +                     */
>> +                    rsv_on_error = !PagePrivate(page);
>> +                    remove_huge_page(page);
>> +                    freed++;
>> +                    if (!truncate_op) {
>> +                            if (unlikely(hugetlb_unreserve_pages(inode,
>> +                                                    next, next + 1, 1)))
>> +                                    hugetlb_fix_reserve_counts(inode,
>> +                                                            rsv_on_error);
>>                      }
>>
>>                      unlock_page(page);
>> @@ -452,41 +494,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
>>      clear_inode(inode);
>>  }
>>
>> -static inline void
>> -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
>> -{
>> -    struct vm_area_struct *vma;
>> -
>> -    /*
>> -     * end == 0 indicates that the entire range after
>> -     * start should be unmapped.
>> -     */
>> -    vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
>> -            unsigned long v_offset;
>> -
>> -            /*
>> -             * Can the expression below overflow on 32-bit arches?
>> -             * No, because the interval tree returns us only those vmas
>> -             * which overlap the truncated area starting at pgoff,
>> -             * and no vma on a 32-bit arch can span beyond the 4GB.
>> -             */
>> -            if (vma->vm_pgoff < start)
>> -                    v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
>> -            else
>> -                    v_offset = 0;
>> -
>> -            if (end) {
>> -                    end = ((end - start) << PAGE_SHIFT) +
>> -                           vma->vm_start + v_offset;
>> -                    if (end > vma->vm_end)
>> -                            end = vma->vm_end;
>> -            } else
>> -                    end = vma->vm_end;
>> -
>> -            unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
>> -    }
>> -}
>> -
>>  static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
>>  {
>>      pgoff_t pgoff;
>> --
>> 2.4.3
>>
> 
> --- a/fs/hugetlbfs/inode.c    Thu Jan  7 15:04:35 2016
> +++ b/fs/hugetlbfs/inode.c    Thu Jan  7 15:31:03 2016
> @@ -461,8 +461,11 @@ hugetlb_vmdelete_list(struct rb_root *ro
>        * end == 0 indicates that the entire range after
>        * start should be unmapped.
>        */
> -     vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
> +     if (!end)
> +             end = ULONG_MAX;
> +     vma_interval_tree_foreach(vma, root, start, end) {
>               unsigned long v_offset;
> +             unsigned long v_end;
>  
>               /*
>                * Can the expression below overflow on 32-bit arches?
> @@ -475,15 +478,12 @@ hugetlb_vmdelete_list(struct rb_root *ro
>               else
>                       v_offset = 0;
>  
> -             if (end) {
> -                     end = ((end - start) << PAGE_SHIFT) +
> +             v_end = ((end - start) << PAGE_SHIFT) +
>                              vma->vm_start + v_offset;
> -                     if (end > vma->vm_end)
> -                             end = vma->vm_end;
> -             } else
> -                     end = vma->vm_end;
> +             if (v_end > vma->vm_end)
> +                     v_end = vma->vm_end;
>  
> -             unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL);
> +             unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);
>       }
>  }
>  
> --
> 