On Wed Oct 23, 2024 at 7:24 PM EEST, Lorenzo Stoakes wrote:
> Implement a new lightweight guard page feature, that is regions of userland
> virtual memory that, when accessed, cause a fatal signal to arise.
>
> Currently users must establish PROT_NONE ranges to achieve this.

A bit off-topic, but another hack with PROT_NONE is to allocate naturally
aligned ranges:

1. mmap() of 2*N size with PROT_NONE.
2. mmap(MAP_FIXED) of N size from that range.


>
> However this is very costly memory-wise - we need a VMA for each and every
> one of these regions AND they become unmergeable with surrounding VMAs.
>
> In addition repeated mmap() calls require repeated kernel context switches
> and contention of the mmap lock to install these ranges, potentially also
> having to unmap memory if installed over existing ranges.
>
> The lightweight guard approach eliminates the VMA cost altogether - rather
> than establishing a PROT_NONE VMA, it operates at the level of page table
> entries - establishing PTE markers such that accesses to them cause a fault
> followed by a SIGSEGV signal being raised.
>
> This is achieved through the PTE marker mechanism, which we have already
> extended to provide PTE_MARKER_GUARD, which we installed via the generic
> page walking logic which we have extended for this purpose.
>
> These guard ranges are established with MADV_GUARD_INSTALL. If the range in
> which they are installed contains any existing mappings, they will be
> zapped, i.e. free the range and unmap memory (thus mimicking the behaviour
> of MADV_DONTNEED in this respect).
>
> Any existing guard entries will be left untouched. There is therefore no
> nesting of guarded pages.
>
> Guarded ranges are NOT cleared by MADV_DONTNEED nor MADV_FREE (in both
> instances the memory range may be reused at which point a user would expect
> guards to still be in place), but they are cleared via MADV_GUARD_REMOVE,
> process teardown or unmapping of memory ranges.
>
> The guard property can be removed from ranges via MADV_GUARD_REMOVE. The
> ranges over which this is applied, should they contain non-guard entries,
> will be untouched, with only guard entries being cleared.
>
> We permit this operation on anonymous memory only, and only VMAs which are
> non-special, non-huge and not mlock()'d (if we permitted this we'd have to
> drop locked pages which would be rather counterintuitive).
>
> Racing page faults can cause repeated attempts to install guard pages that
> are interrupted, resulting in a zap, and this process can end up being
> repeated. If this happens more than would be expected in normal operation,
> we rescind locks and retry the whole thing, which avoids lock contention in
> this scenario.
>
> Suggested-by: Vlastimil Babka <[email protected]>
> Suggested-by: Jann Horn <[email protected]>
> Suggested-by: David Hildenbrand <[email protected]>
> Signed-off-by: Lorenzo Stoakes <[email protected]>
> ---
>  arch/alpha/include/uapi/asm/mman.h     |   3 +
>  arch/mips/include/uapi/asm/mman.h      |   3 +
>  arch/parisc/include/uapi/asm/mman.h    |   3 +
>  arch/xtensa/include/uapi/asm/mman.h    |   3 +
>  include/uapi/asm-generic/mman-common.h |   3 +
>  mm/internal.h                          |   6 +
>  mm/madvise.c                           | 225 +++++++++++++++++++++++++
>  mm/mseal.c                             |   1 +
>  8 files changed, 247 insertions(+)
>
> diff --git a/arch/alpha/include/uapi/asm/mman.h 
> b/arch/alpha/include/uapi/asm/mman.h
> index 763929e814e9..1e700468a685 100644
> --- a/arch/alpha/include/uapi/asm/mman.h
> +++ b/arch/alpha/include/uapi/asm/mman.h
> @@ -78,6 +78,9 @@
>  
>  #define MADV_COLLAPSE        25              /* Synchronous hugepage 
> collapse */
>  
> +#define MADV_GUARD_INSTALL 102               /* fatal signal on access to 
> range */
> +#define MADV_GUARD_REMOVE 103                /* unguard range */
> +
>  /* compatibility flags */
>  #define MAP_FILE     0
>  
> diff --git a/arch/mips/include/uapi/asm/mman.h 
> b/arch/mips/include/uapi/asm/mman.h
> index 9c48d9a21aa0..b700dae28c48 100644
> --- a/arch/mips/include/uapi/asm/mman.h
> +++ b/arch/mips/include/uapi/asm/mman.h
> @@ -105,6 +105,9 @@
>  
>  #define MADV_COLLAPSE        25              /* Synchronous hugepage 
> collapse */
>  
> +#define MADV_GUARD_INSTALL 102               /* fatal signal on access to 
> range */
> +#define MADV_GUARD_REMOVE 103                /* unguard range */
> +
>  /* compatibility flags */
>  #define MAP_FILE     0
>  
> diff --git a/arch/parisc/include/uapi/asm/mman.h 
> b/arch/parisc/include/uapi/asm/mman.h
> index 68c44f99bc93..b6a709506987 100644
> --- a/arch/parisc/include/uapi/asm/mman.h
> +++ b/arch/parisc/include/uapi/asm/mman.h
> @@ -75,6 +75,9 @@
>  #define MADV_HWPOISON     100                /* poison a page for testing */
>  #define MADV_SOFT_OFFLINE 101                /* soft offline page for 
> testing */
>  
> +#define MADV_GUARD_INSTALL 102               /* fatal signal on access to 
> range */
> +#define MADV_GUARD_REMOVE 103                /* unguard range */
> +
>  /* compatibility flags */
>  #define MAP_FILE     0
>  
> diff --git a/arch/xtensa/include/uapi/asm/mman.h 
> b/arch/xtensa/include/uapi/asm/mman.h
> index 1ff0c858544f..99d4ccee7f6e 100644
> --- a/arch/xtensa/include/uapi/asm/mman.h
> +++ b/arch/xtensa/include/uapi/asm/mman.h
> @@ -113,6 +113,9 @@
>  
>  #define MADV_COLLAPSE        25              /* Synchronous hugepage 
> collapse */
>  
> +#define MADV_GUARD_INSTALL 102               /* fatal signal on access to 
> range */
> +#define MADV_GUARD_REMOVE 103                /* unguard range */
> +
>  /* compatibility flags */
>  #define MAP_FILE     0
>  
> diff --git a/include/uapi/asm-generic/mman-common.h 
> b/include/uapi/asm-generic/mman-common.h
> index 6ce1f1ceb432..1ea2c4c33b86 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -79,6 +79,9 @@
>  
>  #define MADV_COLLAPSE        25              /* Synchronous hugepage 
> collapse */
>  
> +#define MADV_GUARD_INSTALL 102               /* fatal signal on access to 
> range */
> +#define MADV_GUARD_REMOVE 103                /* unguard range */
> +
>  /* compatibility flags */
>  #define MAP_FILE     0
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index fb1fb0c984e4..fcf08b5e64dc 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -423,6 +423,12 @@ extern unsigned long highest_memmap_pfn;
>   */
>  #define MAX_RECLAIM_RETRIES 16
>  
> +/*
> + * Maximum number of attempts we make to install guard pages before we give 
> up
> + * and return -ERESTARTNOINTR to have userspace try again.
> + */
> +#define MAX_MADVISE_GUARD_RETRIES 3
> +
>  /*
>   * in mm/vmscan.c:
>   */
> diff --git a/mm/madvise.c b/mm/madvise.c
> index e871a72a6c32..48eba25e25fe 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -60,6 +60,8 @@ static int madvise_need_mmap_write(int behavior)
>       case MADV_POPULATE_READ:
>       case MADV_POPULATE_WRITE:
>       case MADV_COLLAPSE:
> +     case MADV_GUARD_INSTALL:
> +     case MADV_GUARD_REMOVE:
>               return 0;
>       default:
>               /* be safe, default to 1. list exceptions explicitly */
> @@ -1017,6 +1019,214 @@ static long madvise_remove(struct vm_area_struct *vma,
>       return error;
>  }
>  
> +static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
> +{
> +     vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
> +
> +     /*
> +      * A user could lock after setting a guard range but that's fine, as
> +      * they'd not be able to fault in. The issue arises when we try to zap
> +      * existing locked VMAs. We don't want to do that.
> +      */
> +     if (!allow_locked)
> +             disallowed |= VM_LOCKED;
> +
> +     if (!vma_is_anonymous(vma))
> +             return false;
> +
> +     if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE)
> +             return false;
> +
> +     return true;
> +}
> +
> +static bool is_guard_pte_marker(pte_t ptent)
> +{
> +     return is_pte_marker(ptent) &&
> +             is_guard_swp_entry(pte_to_swp_entry(ptent));
> +}
> +
> +static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
> +                                unsigned long next, struct mm_walk *walk)
> +{
> +     pud_t pudval = pudp_get(pud);
> +
> +     /* If huge return >0 so we abort the operation + zap. */
> +     return pud_trans_huge(pudval) || pud_devmap(pudval);
> +}
> +
> +static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
> +                                unsigned long next, struct mm_walk *walk)
> +{
> +     pmd_t pmdval = pmdp_get(pmd);
> +
> +     /* If huge return >0 so we abort the operation + zap. */
> +     return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
> +}
> +
> +static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
> +                                unsigned long next, struct mm_walk *walk)
> +{
> +     pte_t pteval = ptep_get(pte);
> +     unsigned long *nr_pages = (unsigned long *)walk->private;
> +
> +     /* If there is already a guard page marker, we have nothing to do. */
> +     if (is_guard_pte_marker(pteval)) {
> +             (*nr_pages)++;
> +
> +             return 0;
> +     }
> +
> +     /* If populated return >0 so we abort the operation + zap. */
> +     return 1;
> +}
> +
> +static int guard_install_set_pte(unsigned long addr, unsigned long next,
> +                              pte_t *ptep, struct mm_walk *walk)
> +{
> +     unsigned long *nr_pages = (unsigned long *)walk->private;
> +
> +     /* Simply install a PTE marker, this causes segfault on access. */
> +     *ptep = make_pte_marker(PTE_MARKER_GUARD);
> +     (*nr_pages)++;
> +
> +     return 0;
> +}
> +
> +static const struct mm_walk_ops guard_install_walk_ops = {
> +     .pud_entry              = guard_install_pud_entry,
> +     .pmd_entry              = guard_install_pmd_entry,
> +     .pte_entry              = guard_install_pte_entry,
> +     .install_pte            = guard_install_set_pte,
> +     .walk_lock              = PGWALK_RDLOCK,
> +};
> +
> +static long madvise_guard_install(struct vm_area_struct *vma,
> +                              struct vm_area_struct **prev,
> +                              unsigned long start, unsigned long end)
> +{
> +     long err;
> +     int i;
> +
> +     *prev = vma;
> +     if (!is_valid_guard_vma(vma, /* allow_locked = */false))
> +             return -EINVAL;
> +
> +     /*
> +      * If we install guard markers, then the range is no longer
> +      * empty from a page table perspective and therefore it's
> +      * appropriate to have an anon_vma.
> +      *
> +      * This ensures that on fork, we copy page tables correctly.
> +      */
> +     err = anon_vma_prepare(vma);
> +     if (err)
> +             return err;
> +
> +     /*
> +      * Optimistically try to install the guard marker pages first. If any
> +      * non-guard pages are encountered, give up and zap the range before
> +      * trying again.
> +      *
> +      * We try a few times before giving up and releasing back to userland to
> +      * loop around, releasing locks in the process to avoid contention. This
> +      * would only happen if there was a great many racing page faults.
> +      *
> +      * In most cases we should simply install the guard markers immediately
> +      * with no zap or looping.
> +      */
> +     for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
> +             unsigned long nr_pages = 0;
> +
> +             /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
> +             err = walk_page_range_mm(vma->vm_mm, start, end,
> +                                      &guard_install_walk_ops, &nr_pages);
> +             if (err < 0)
> +                     return err;
> +
> +             if (err == 0) {
> +                     unsigned long nr_expected_pages = PHYS_PFN(end - start);
> +
> +                     VM_WARN_ON(nr_pages != nr_expected_pages);
> +                     return 0;
> +             }
> +
> +             /*
> +              * OK some of the range have non-guard pages mapped, zap
> +              * them. This leaves existing guard pages in place.
> +              */
> +             zap_page_range_single(vma, start, end - start, NULL);
> +     }
> +
> +     /*
> +      * We were unable to install the guard pages due to being raced by page
> +      * faults. This should not happen ordinarily. We return to userspace and
> +      * immediately retry, relieving lock contention.
> +      */
> +     return -ERESTARTNOINTR;
> +}
> +
> +static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
> +                               unsigned long next, struct mm_walk *walk)
> +{
> +     pud_t pudval = pudp_get(pud);
> +
> +     /* If huge, cannot have guard pages present, so no-op - skip. */
> +     if (pud_trans_huge(pudval) || pud_devmap(pudval))
> +             walk->action = ACTION_CONTINUE;
> +
> +     return 0;
> +}
> +
> +static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
> +                               unsigned long next, struct mm_walk *walk)
> +{
> +     pmd_t pmdval = pmdp_get(pmd);
> +
> +     /* If huge, cannot have guard pages present, so no-op - skip. */
> +     if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
> +             walk->action = ACTION_CONTINUE;
> +
> +     return 0;
> +}
> +
> +static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
> +                               unsigned long next, struct mm_walk *walk)
> +{
> +     pte_t ptent = ptep_get(pte);
> +
> +     if (is_guard_pte_marker(ptent)) {
> +             /* Simply clear the PTE marker. */
> +             pte_clear_not_present_full(walk->mm, addr, pte, false);
> +             update_mmu_cache(walk->vma, addr, pte);
> +     }
> +
> +     return 0;
> +}
> +
> +static const struct mm_walk_ops guard_remove_walk_ops = {
> +     .pud_entry              = guard_remove_pud_entry,
> +     .pmd_entry              = guard_remove_pmd_entry,
> +     .pte_entry              = guard_remove_pte_entry,
> +     .walk_lock              = PGWALK_RDLOCK,
> +};
> +
> +static long madvise_guard_remove(struct vm_area_struct *vma,
> +                              struct vm_area_struct **prev,
> +                              unsigned long start, unsigned long end)
> +{
> +     *prev = vma;
> +     /*
> +      * We're ok with removing guards in mlock()'d ranges, as this is a
> +      * non-destructive action.
> +      */
> +     if (!is_valid_guard_vma(vma, /* allow_locked = */true))
> +             return -EINVAL;
> +
> +     return walk_page_range(vma->vm_mm, start, end,
> +                            &guard_remove_walk_ops, NULL);
> +}
> +
>  /*
>   * Apply an madvise behavior to a region of a vma.  madvise_update_vma
>   * will handle splitting a vm area into separate areas, each area with its 
> own
> @@ -1098,6 +1308,10 @@ static int madvise_vma_behavior(struct vm_area_struct 
> *vma,
>               break;
>       case MADV_COLLAPSE:
>               return madvise_collapse(vma, prev, start, end);
> +     case MADV_GUARD_INSTALL:
> +             return madvise_guard_install(vma, prev, start, end);
> +     case MADV_GUARD_REMOVE:
> +             return madvise_guard_remove(vma, prev, start, end);
>       }
>  
>       anon_name = anon_vma_name(vma);
> @@ -1197,6 +1411,8 @@ madvise_behavior_valid(int behavior)
>       case MADV_DODUMP:
>       case MADV_WIPEONFORK:
>       case MADV_KEEPONFORK:
> +     case MADV_GUARD_INSTALL:
> +     case MADV_GUARD_REMOVE:
>  #ifdef CONFIG_MEMORY_FAILURE
>       case MADV_SOFT_OFFLINE:
>       case MADV_HWPOISON:
> @@ -1490,6 +1706,15 @@ static ssize_t vector_madvise(struct mm_struct *mm, 
> struct iov_iter *iter,
>       while (iov_iter_count(iter)) {
>               ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
>                                iter_iov_len(iter), behavior);
> +             /*
> +              * We cannot return this, as we instead return the number of
> +              * successful operations. Since all this would achieve in a
> +              * single madvise() invocation is to re-enter the syscall, and
> +              * we have already rescinded locks, it should be no problem to
> +              * simply try again.
> +              */
> +             if (ret == -ERESTARTNOINTR)
> +                     continue;
>               if (ret < 0)
>                       break;
>               iov_iter_advance(iter, iter_iov_len(iter));
> diff --git a/mm/mseal.c b/mm/mseal.c
> index ece977bd21e1..81d6e980e8a9 100644
> --- a/mm/mseal.c
> +++ b/mm/mseal.c
> @@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior)
>       case MADV_REMOVE:
>       case MADV_DONTFORK:
>       case MADV_WIPEONFORK:
> +     case MADV_GUARD_INSTALL:
>               return true;
>       }
>  

Acked-by: Jarkko Sakkinen <[email protected]>
Tested-by: Jarkko Sakkinen <[email protected]>

BR, Jarkko

Reply via email to