* Suren Baghdasaryan <sur...@google.com> [250704 02:07]:
> With maple_tree supporting vma tree traversal under RCU and per-vma
> locks, /proc/pid/maps can be read while holding individual vma locks
> instead of locking the entire address space.
> A completely lockless approach (walking the vma tree under RCU) would
> be quite complex, the main issue being get_vma_name() using callbacks
> which might not work correctly with a stable vma copy, requiring the
> original (unstable) vma - see special_mapping_name() for an example.
> When per-vma lock acquisition fails, we take the mmap_lock for reading,
> lock the vma, release the mmap_lock and continue. This fallback to the
> mmap read lock guarantees that the reader makes forward progress even
> during lock contention. This will interfere with the writer, but only
> for a very short time while we are acquiring the per-vma lock, and only
> when there was contention on the vma the reader is interested in. We
> shouldn't see repeated fallbacks to mmap read locks in practice, as
> this requires a very unlikely series of lock contentions (for instance
> due to repeated vma split operations). However, even if this did
> somehow happen, we would still make progress.
> One case requiring special handling is when a vma changes between the
> time it was found and the time it got locked. A problematic case would
> be if the vma got shrunk so that its start moved higher in the address
> space and a new vma was installed at the beginning:
>
>     reader found:          |--------VMA A--------|
>     VMA is modified:       |-VMA B-|----VMA A----|
>     reader locks modified VMA A
>     reader reports VMA A:  |  gap  |----VMA A----|
>
> This would result in reporting a gap in the address space that does not
> exist. To prevent this we retry the lookup after locking the vma;
> however, we do that only when we identify a gap and detect that the
> address space was changed after we found the vma.
> This change is designed to reduce mmap_lock contention and prevent a
> process reading /proc/pid/maps files (often a low priority task, such
> as monitoring/data collection services) from blocking address space
> updates. Note that this change has a userspace-visible disadvantage:
> it allows for sub-page data tearing, as opposed to the previous
> mechanism where data tearing could happen only between pages of
> generated output data. Since current userspace considers data tearing
> between pages to be acceptable, we assume it will be able to handle
> sub-page data tearing as well.
>
> Signed-off-by: Suren Baghdasaryan <sur...@google.com>
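
The sub-page tearing note above is worth spelling out for userspace:
previously every record in one page of output was generated under the
same mmap_lock hold, so inconsistency was only possible at page
boundaries; now each record is emitted under its own vma lock, so even
two adjacent lines returned by a single read() can come from different
states of the address space. From the consumer side nothing structural
changes; a reader still just loops over read() calls, and each call is
roughly where the walk resumes via the new last_pos bookkeeping. A
minimal sketch of such a reader (nothing here is from the patch, just
ordinary userspace; the tiny buffer is only there to force many read()
calls):

	/* Read /proc/self/maps in deliberately small chunks. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[256];	/* small on purpose: one resume per read() */
		ssize_t n;
		int fd = open("/proc/self/maps", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return EXIT_FAILURE;
		}
		while ((n = read(fd, buf, sizeof(buf))) > 0) {
			if (write(STDOUT_FILENO, buf, n) != n) {
				perror("write");
				break;
			}
		}
		close(fd);
		return 0;
	}

Before this series each of those read() calls took the mmap read lock
for the whole fill; with it, only the vmas actually reported are
read-locked, one at a time (smaps and numa_maps still take the mmap
read lock, per lock_vma_range() below).
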
Reviewed-by: Liam R. Howlett <liam.howl...@oracle.com>

> ---
>  fs/proc/internal.h        |   5 ++
>  fs/proc/task_mmu.c        | 118 ++++++++++++++++++++++++++++++++++----
>  include/linux/mmap_lock.h |  11 ++++
>  mm/madvise.c              |   3 +-
>  mm/mmap_lock.c            |  88 ++++++++++++++++++++++++++++
>  5 files changed, 214 insertions(+), 11 deletions(-)
>
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 3d48ffe72583..7c235451c5ea 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -384,6 +384,11 @@ struct proc_maps_private {
>  	struct task_struct *task;
>  	struct mm_struct *mm;
>  	struct vma_iterator iter;
> +	loff_t last_pos;
> +#ifdef CONFIG_PER_VMA_LOCK
> +	bool mmap_locked;
> +	struct vm_area_struct *locked_vma;
> +#endif
>  #ifdef CONFIG_NUMA
>  	struct mempolicy *task_mempolicy;
>  #endif
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index b8bc06d05a72..ff3fe488ce51 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
>  }
>  #endif
>
> -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> -						loff_t *ppos)
> +#ifdef CONFIG_PER_VMA_LOCK
> +
> +static void unlock_vma(struct proc_maps_private *priv)
> +{
> +	if (priv->locked_vma) {
> +		vma_end_read(priv->locked_vma);
> +		priv->locked_vma = NULL;
> +	}
> +}
> +
> +static const struct seq_operations proc_pid_maps_op;
> +
> +static inline bool lock_vma_range(struct seq_file *m,
> +				  struct proc_maps_private *priv)
> +{
> +	/*
> +	 * smaps and numa_maps perform page table walk, therefore require
> +	 * mmap_lock but maps can be read with locking just the vma.
> +	 */
> +	if (m->op != &proc_pid_maps_op) {
> +		if (mmap_read_lock_killable(priv->mm))
> +			return false;
> +
> +		priv->mmap_locked = true;
> +	} else {
> +		rcu_read_lock();
> +		priv->locked_vma = NULL;
> +		priv->mmap_locked = false;
> +	}
> +
> +	return true;
> +}
> +
> +static inline void unlock_vma_range(struct proc_maps_private *priv)
> +{
> +	if (priv->mmap_locked) {
> +		mmap_read_unlock(priv->mm);
> +	} else {
> +		unlock_vma(priv);
> +		rcu_read_unlock();
> +	}
> +}
> +
> +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> +					   loff_t last_pos)
> +{
> +	struct vm_area_struct *vma;
> +
> +	if (priv->mmap_locked)
> +		return vma_next(&priv->iter);
> +
> +	unlock_vma(priv);
> +	vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
> +	if (!IS_ERR_OR_NULL(vma))
> +		priv->locked_vma = vma;
> +
> +	return vma;
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +static inline bool lock_vma_range(struct seq_file *m,
> +				  struct proc_maps_private *priv)
>  {
> -	struct vm_area_struct *vma = vma_next(&priv->iter);
> +	return mmap_read_lock_killable(priv->mm) == 0;
> +}
> +
> +static inline void unlock_vma_range(struct proc_maps_private *priv)
> +{
> +	mmap_read_unlock(priv->mm);
> +}
> +
> +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> +					   loff_t last_pos)
> +{
> +	return vma_next(&priv->iter);
> +}
>
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
> +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
> +{
> +	struct proc_maps_private *priv = m->private;
> +	struct vm_area_struct *vma;
> +
> +	vma = get_next_vma(priv, *ppos);
> +	/* EINTR is possible */
> +	if (IS_ERR(vma))
> +		return vma;
> +
> +	/* Store previous position to be able to restart if needed */
> +	priv->last_pos = *ppos;
>  	if (vma) {
> -		*ppos = vma->vm_start;
> +		/*
> +		 * Track the end of the reported vma to ensure position changes
> +		 * even if previous vma was merged with the next vma and we
> +		 * found the extended vma with the same vm_start.
> +		 */
> +		*ppos = vma->vm_end;
>  	} else {
> -		*ppos = -2;
> +		*ppos = -2;	/* -2 indicates gate vma */
>  		vma = get_gate_vma(priv->mm);
>  	}
>
> @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>  		return NULL;
>  	}
>
> -	if (mmap_read_lock_killable(mm)) {
> +	if (!lock_vma_range(m, priv)) {
>  		mmput(mm);
>  		put_task_struct(priv->task);
>  		priv->task = NULL;
>  		return ERR_PTR(-EINTR);
>  	}
>
> +	/*
> +	 * Reset current position if last_addr was set before
> +	 * and it's not a sentinel.
> +	 */
> +	if (last_addr > 0)
> +		*ppos = last_addr = priv->last_pos;
>  	vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
>  	hold_task_mempolicy(priv);
>  	if (last_addr == -2)
>  		return get_gate_vma(mm);
>
> -	return proc_get_vma(priv, ppos);
> +	return proc_get_vma(m, ppos);
>  }
>
>  static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
>  {
>  	if (*ppos == -2) {
> -		*ppos = -1;
> +		*ppos = -1;	/* -1 indicates no more vmas */
>  		return NULL;
>  	}
> -	return proc_get_vma(m->private, ppos);
> +	return proc_get_vma(m, ppos);
>  }
>
>  static void m_stop(struct seq_file *m, void *v)
> @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v)
>  		return;
>
>  	release_task_mempolicy(priv);
> -	mmap_read_unlock(mm);
> +	unlock_vma_range(priv);
>  	mmput(mm);
>  	put_task_struct(priv->task);
>  	priv->task = NULL;
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index 5da384bd0a26..1f4f44951abe 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma);
>  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  					  unsigned long address);
>
> +/*
> + * Locks next vma pointed by the iterator. Confirms the locked vma has not
> + * been modified and will retry under mmap_lock protection if modification
> + * was detected. Should be called from read RCU section.
> + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
> + * process was interrupted.
> + */
> +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> +				     struct vma_iterator *iter,
> +				     unsigned long address);
> +
>  #else /* CONFIG_PER_VMA_LOCK */
>
>  static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
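
The comment block above describes the contract well. For anyone reading
along, the intended usage (condensed from the fs/proc side of this
patch; a sketch only, not meant to build stand-alone) is roughly:

	struct vma_iterator vmi;
	struct vm_area_struct *vma;
	unsigned long addr = 0;

	rcu_read_lock();
	vma_iter_init(&vmi, mm, addr);
	for (;;) {
		vma = lock_next_vma(mm, &vmi, addr);
		if (IS_ERR(vma))
			break;		/* -EINTR from the mmap_lock fallback */
		if (!vma)
			break;		/* no more vmas */
		/* vma is read-locked here, safe to report its fields */
		addr = vma->vm_end;
		vma_end_read(vma);
	}
	rcu_read_unlock();

Worth keeping in mind that the helper drops and re-takes the RCU read
lock internally when it falls back to the mmap_lock, so a caller must
not assume anything else it looked up under RCU is still valid once the
call returns.
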
> diff --git a/mm/madvise.c b/mm/madvise.c
> index a34c2c89a53b..e61e32b2cd91 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref)
>
>  struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
>  {
> -	mmap_assert_locked(vma->vm_mm);
> +	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
> +		vma_assert_locked(vma);
>
>  	return vma->anon_name;
>  }
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index 5f725cc67334..ed0e5e2171cd 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  	count_vm_vma_lock_event(VMA_LOCK_ABORT);
>  	return NULL;
>  }
> +
> +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm,
> +							struct vma_iterator *iter,
> +							unsigned long address)
> +{
> +	struct vm_area_struct *vma;
> +	int ret;
> +
> +	ret = mmap_read_lock_killable(mm);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	/* Lookup the vma at the last position again under mmap_read_lock */
> +	vma_iter_init(iter, mm, address);
> +	vma = vma_next(iter);
> +	if (vma)
> +		vma_start_read_locked(vma);
> +
> +	mmap_read_unlock(mm);
> +
> +	return vma;
> +}
> +
> +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> +				     struct vma_iterator *iter,
> +				     unsigned long address)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned int mm_wr_seq;
> +	bool mmap_unlocked;
> +
> +	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
> +retry:
> +	/* Start mmap_lock speculation in case we need to verify the vma later */
> +	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
> +	vma = vma_next(iter);
> +	if (!vma)
> +		return NULL;
> +
> +	vma = vma_start_read(mm, vma);
> +
> +	if (IS_ERR_OR_NULL(vma)) {
> +		/*
> +		 * Retry immediately if the vma gets detached from under us.
> +		 * Infinite loop should not happen because the vma we find will
> +		 * have to be constantly knocked out from under us.
> +		 */
> +		if (PTR_ERR(vma) == -EAGAIN) {
> +			vma_iter_init(iter, mm, address);
> +			goto retry;
> +		}
> +
> +		goto out;
> +	}
> +
> +	/*
> +	 * Verify the vma we locked belongs to the same address space and it's
> +	 * not behind of the last search position.
> +	 */
> +	if (unlikely(vma->vm_mm != mm || address >= vma->vm_end))
> +		goto out_unlock;
> +
> +	/*
> +	 * vma can be ahead of the last search position but we need to verify
> +	 * it was not shrunk after we found it and another vma has not been
> +	 * installed ahead of it. Otherwise we might observe a gap that should
> +	 * not be there.
> +	 */
> +	if (address < vma->vm_start) {
> +		/* Verify only if the address space might have changed since vma lookup. */
> +		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
> +			vma_iter_init(iter, mm, address);
> +			if (vma != vma_next(iter))
> +				goto out_unlock;
> +		}
> +	}
> +
> +	return vma;
> +
> +out_unlock:
> +	vma_end_read(vma);
> +out:
> +	rcu_read_unlock();
> +	vma = lock_vma_under_mmap_lock(mm, iter, address);
> +	rcu_read_lock();
> +
> +	return vma;
> +}
>  #endif /* CONFIG_PER_VMA_LOCK */
>
>  #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
> --
> 2.50.0.727.gbf7dc18ff4-goog
>
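
One last, entirely optional note: for anyone who wants to poke at the
new path from userspace, something along these lines should exercise it
(my own sketch, arbitrary sizes and iteration counts, build with
-pthread). One thread keeps splitting and unmapping vmas while the main
thread re-reads /proc/self/maps; before this change every pass contends
with the churn thread on mmap_lock, with it the reader mostly takes
per-vma locks and falls back to the mmap read lock only when a per-vma
lock attempt fails.

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	/* Keep modifying the address space to race with the maps reader. */
	static void *churn(void *arg)
	{
		(void)arg;
		for (;;) {
			void *p = mmap(NULL, 16 * 4096, PROT_READ,
				       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

			if (p == MAP_FAILED)
				continue;
			/* Punch a hole to force a vma split, then drop it all. */
			munmap((char *)p + 4096, 4096);
			munmap(p, 16 * 4096);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t;
		char line[512];

		if (pthread_create(&t, NULL, churn, NULL))
			return EXIT_FAILURE;

		for (int pass = 0; pass < 1000; pass++) {
			FILE *f = fopen("/proc/self/maps", "r");

			if (!f)
				return EXIT_FAILURE;
			/*
			 * Just walk the file; lines from different snapshots
			 * of the address space may show up in one pass, which
			 * was already permitted across output pages and is
			 * now permitted within a single read() as well.
			 */
			while (fgets(line, sizeof(line), f))
				;
			fclose(f);
		}
		return 0;	/* exiting main() also ends the churn thread */
	}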