On Wed 30-01-19 13:44:20, Vlastimil Babka wrote:
> After "mm/mincore: make mincore() more conservative" we sometimes restrict the
> information about page cache residency, which we have to do without breaking
> existing userspace, if possible. We thus fake the resulting values as 1, which
> should be safer than faking them as 0, as there might theoretically exist code
> that would try to fault in the page(s) until mincore() returns 1.
> 
> Faking 1 however means that such code would not fault in a page even if it was
> not in page cache, with unwanted performance implications. We can improve the
> situation by revisting the approach of 574823bfab82 ("Change mincore() to 
> count
> "mapped" pages rather than "cached" pages") but only applying it to cases 
> where
> page cache residency check is restricted. Thus mincore() will return 0 for an
> unmapped page (which may or may not be resident in a pagecache), and 1 after
> the process faults it in.
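
To spell out the expected semantics, a minimal userspace sketch (the file
path is a placeholder, the hard-coded 4096 assumes the usual x86 PAGE_SIZE,
and the mapping is assumed to fail the new can_do_mincore() check):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	unsigned char vec;
	char *p;
	int fd = open("/path/to/file-we-cannot-write", O_RDONLY);

	if (fd < 0)
		return 1;
	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	mincore(p, 4096, &vec);
	printf("before fault: %d\n", vec & 1);	/* 0, even if the page is cached */

	(void)*(volatile char *)p;		/* fault the page in */

	mincore(p, 4096, &vec);
	printf("after fault:  %d\n", vec & 1);	/* 1, the page is mapped now */

	munmap(p, 4096);
	close(fd);
	return 0;
}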
> 
> One potential downside is that mincore() will again be able to recognize
> when a previously mapped page was reclaimed. While that might be useful for
> some attack scenarios, it's not as crucial as recognizing that somebody
> else faulted the page in, and there are also other ways to recognize
> reclaimed pages anyway.

Is this really worth it? Do we know about any specific usecase that
would benefit from this change? TBH I would rather wait for the report
than add a hard-to-evaluate side channel.
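
For completeness, the channel in question would look roughly like this
(illustrative sketch only; the hard-coded 4096 again assumes x86 PAGE_SIZE
and the helper name is made up):

#include <sys/mman.h>

/*
 * After a page has been faulted in by this process, a later mincore()
 * that reports 0 for it reveals that it was reclaimed in the meantime.
 */
static int page_was_reclaimed(void *addr)
{
	unsigned char vec = 0;

	if (mincore(addr, 4096, &vec) != 0)
		return -1;
	return !(vec & 1);
}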

> Cc: Jiri Kosina <[email protected]>
> Cc: Dominique Martinet <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Dave Chinner <[email protected]>
> Cc: Kevin Easton <[email protected]>
> Cc: Matthew Wilcox <[email protected]>
> Cc: Cyril Hrubis <[email protected]>
> Cc: Tejun Heo <[email protected]>
> Cc: Kirill A. Shutemov <[email protected]>
> Cc: Daniel Gruss <[email protected]>
> Signed-off-by: Vlastimil Babka <[email protected]>
> ---
>  mm/mincore.c | 49 +++++++++++++++++++++++++++++++++----------------
>  1 file changed, 33 insertions(+), 16 deletions(-)
> 
> diff --git a/mm/mincore.c b/mm/mincore.c
> index 747a4907a3ac..d6784a803ae7 100644
> --- a/mm/mincore.c
> +++ b/mm/mincore.c
> @@ -21,12 +21,18 @@
>  #include <linux/uaccess.h>
>  #include <asm/pgtable.h>
>  
> +struct mincore_walk_private {
> +     unsigned char *vec;
> +     bool can_check_pagecache;
> +};
> +
>  static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
>                       unsigned long end, struct mm_walk *walk)
>  {
>  #ifdef CONFIG_HUGETLB_PAGE
>       unsigned char present;
> -     unsigned char *vec = walk->private;
> +     struct mincore_walk_private *walk_private = walk->private;
> +     unsigned char *vec = walk_private->vec;
>  
>       /*
>        * Hugepages under user process are always in RAM and never
> @@ -35,7 +41,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
>       present = pte && !huge_pte_none(huge_ptep_get(pte));
>       for (; addr != end; vec++, addr += PAGE_SIZE)
>               *vec = present;
> -     walk->private = vec;
> +     walk_private->vec = vec;
>  #else
>       BUG();
>  #endif
> @@ -85,7 +91,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
>  }
>  
>  static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
> -                             struct vm_area_struct *vma, unsigned char *vec)
> +                             struct vm_area_struct *vma, unsigned char *vec,
> +                             bool can_check_pagecache)
>  {
>       unsigned long nr = (end - addr) >> PAGE_SHIFT;
>       int i;
> @@ -95,7 +102,9 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
>  
>               pgoff = linear_page_index(vma, addr);
>               for (i = 0; i < nr; i++, pgoff++)
> -                     vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
> +                     vec[i] = can_check_pagecache ?
> +                              mincore_page(vma->vm_file->f_mapping, pgoff)
> +                              : 0;
>       } else {
>               for (i = 0; i < nr; i++)
>                       vec[i] = 0;
> @@ -106,8 +115,11 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
>  static int mincore_unmapped_range(unsigned long addr, unsigned long end,
>                                  struct mm_walk *walk)
>  {
> -     walk->private += __mincore_unmapped_range(addr, end,
> -                                               walk->vma, walk->private);
> +     struct mincore_walk_private *walk_private = walk->private;
> +     unsigned char *vec = walk_private->vec;
> +
> +     walk_private->vec += __mincore_unmapped_range(addr, end, walk->vma,
> +                             vec, walk_private->can_check_pagecache);
>       return 0;
>  }
>  
> @@ -117,7 +129,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>       spinlock_t *ptl;
>       struct vm_area_struct *vma = walk->vma;
>       pte_t *ptep;
> -     unsigned char *vec = walk->private;
> +     struct mincore_walk_private *walk_private = walk->private;
> +     unsigned char *vec = walk_private->vec;
>       int nr = (end - addr) >> PAGE_SHIFT;
>  
>       ptl = pmd_trans_huge_lock(pmd, vma);
> @@ -128,7 +141,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>       }
>  
>       if (pmd_trans_unstable(pmd)) {
> -             __mincore_unmapped_range(addr, end, vma, vec);
> +             __mincore_unmapped_range(addr, end, vma, vec,
> +                                     walk_private->can_check_pagecache);
>               goto out;
>       }
>  
> @@ -138,7 +152,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>  
>               if (pte_none(pte))
>                       __mincore_unmapped_range(addr, addr + PAGE_SIZE,
> -                                              vma, vec);
> +                              vma, vec, walk_private->can_check_pagecache);
>               else if (pte_present(pte))
>                       *vec = 1;
>               else { /* pte is a swap entry */
> @@ -152,8 +166,12 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
>                               *vec = 1;
>                       } else {
>  #ifdef CONFIG_SWAP
> -                             *vec = mincore_page(swap_address_space(entry),
> +                             if (walk_private->can_check_pagecache)
> +                                     *vec = mincore_page(
> +                                                 swap_address_space(entry),
>                                                   swp_offset(entry));
> +                             else
> +                                     *vec = 0;
>  #else
>                               WARN_ON(1);
>                               *vec = 1;
> @@ -187,22 +205,21 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
>       struct vm_area_struct *vma;
>       unsigned long end;
>       int err;
> +     struct mincore_walk_private walk_private = {
> +             .vec = vec
> +     };
>       struct mm_walk mincore_walk = {
>               .pmd_entry = mincore_pte_range,
>               .pte_hole = mincore_unmapped_range,
>               .hugetlb_entry = mincore_hugetlb,
> -             .private = vec,
> +             .private = &walk_private
>       };
>  
>       vma = find_vma(current->mm, addr);
>       if (!vma || addr < vma->vm_start)
>               return -ENOMEM;
>       end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
> -     if (!can_do_mincore(vma)) {
> -             unsigned long pages = (end - addr) >> PAGE_SHIFT;
> -             memset(vec, 1, pages);
> -             return pages;
> -     }
> +     walk_private.can_check_pagecache = can_do_mincore(vma);
>       mincore_walk.mm = vma->vm_mm;
>       err = walk_page_range(addr, end, &mincore_walk);
>       if (err < 0)
> -- 
> 2.20.1

-- 
Michal Hocko
SUSE Labs
