* Suren Baghdasaryan <sur...@google.com> [250704 02:07]:
> With maple_tree supporting vma tree traversal under RCU and per-vma
> locks, /proc/pid/maps can be read while holding individual vma locks
> instead of locking the entire address space.
> A completely lockless approach (walking the vma tree under RCU) would
> be quite complex, the main issue being get_vma_name() using callbacks
> which might not work correctly with a stable vma copy, requiring the
> original (unstable) vma - see special_mapping_name() for an example.
> When per-vma lock acquisition fails, we take the mmap_lock for reading,
> lock the vma, release the mmap_lock and continue. This fallback to the
> mmap read lock guarantees that the reader makes forward progress even
> during lock contention. This will interfere with the writer, but only
> for a very short time while we are acquiring the per-vma lock, and only
> when there was contention on the vma the reader is interested in. We
> shouldn't see repeated fallbacks to mmap read locks in practice, as
> this requires a very unlikely series of lock contentions (for instance
> due to repeated vma split operations). However, even if this did
> somehow happen, we would still make progress.
> One case requiring special handling is when a vma changes between the
> time it was found and the time it got locked. A problematic case would
> be if the vma got shrunk so that its start moved higher in the address
> space and a new vma was installed at the beginning:
>
>     reader found:          |--------VMA A--------|
>     VMA is modified:       |-VMA B-|----VMA A----|
>     reader locks modified VMA A
>     reader reports VMA A:  |  gap  |----VMA A----|
>
> This would result in reporting a gap in the address space that does not
> exist. To prevent this we retry the lookup after locking the vma;
> however, we do that only when we identify a gap and detect that the
> address space was changed after we found the vma.
> This change is designed to reduce mmap_lock contention and prevent a
> process reading /proc/pid/maps files (often a low priority task, such
> as monitoring/data collection services) from blocking address space
> updates. Note that this change has a userspace-visible disadvantage:
> it allows for sub-page data tearing, as opposed to the previous
> mechanism where data tearing could happen only between pages of
> generated output data. Since current userspace considers data tearing
> between pages to be acceptable, we assume it will be able to handle
> sub-page data tearing as well.
>
> Signed-off-by: Suren Baghdasaryan <sur...@google.com>
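
The sub-page tearing note above is worth spelling out for userspace:
previously every record in one page of output was generated under the
same mmap_lock hold, so inconsistency was only possible at page
boundaries; now each record is emitted under its own vma lock, so even
two adjacent lines returned by a single read() can come from different
states of the address space. From the consumer side nothing structural
changes; a reader still just loops over read() calls, and each call is
roughly where the walk resumes via the new last_pos bookkeeping. A
minimal sketch of such a reader (nothing here is from the patch, just
ordinary userspace; the tiny buffer is only there to force many read()
calls):

	/* Read /proc/self/maps in deliberately small chunks. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[256];	/* small on purpose: one resume per read() */
		ssize_t n;
		int fd = open("/proc/self/maps", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return EXIT_FAILURE;
		}
		while ((n = read(fd, buf, sizeof(buf))) > 0) {
			if (write(STDOUT_FILENO, buf, n) != n) {
				perror("write");
				break;
			}
		}
		close(fd);
		return 0;
	}

Before this series each of those read() calls took the mmap read lock
for the whole fill; with it, only the vmas actually reported are
read-locked, one at a time (smaps and numa_maps still take the mmap
read lock, per lock_vma_range() below).
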
Reviewed-by: Liam R. Howlett <liam.howl...@oracle.com>

> ---
>  fs/proc/internal.h        |   5 ++
>  fs/proc/task_mmu.c        | 118 ++++++++++++++++++++++++++++++++++----
>  include/linux/mmap_lock.h |  11 ++++
>  mm/madvise.c              |   3 +-
>  mm/mmap_lock.c            |  88 ++++++++++++++++++++++++++++
>  5 files changed, 214 insertions(+), 11 deletions(-)
>
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 3d48ffe72583..7c235451c5ea 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -384,6 +384,11 @@ struct proc_maps_private {
>  	struct task_struct *task;
>  	struct mm_struct *mm;
>  	struct vma_iterator iter;
> +	loff_t last_pos;
> +#ifdef CONFIG_PER_VMA_LOCK
> +	bool mmap_locked;
> +	struct vm_area_struct *locked_vma;
> +#endif
>  #ifdef CONFIG_NUMA
>  	struct mempolicy *task_mempolicy;
>  #endif
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index b8bc06d05a72..ff3fe488ce51 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -127,15 +127,107 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
>  }
>  #endif
>
> -static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
> -						loff_t *ppos)
> +#ifdef CONFIG_PER_VMA_LOCK
> +
> +static void unlock_vma(struct proc_maps_private *priv)
> +{
> +	if (priv->locked_vma) {
> +		vma_end_read(priv->locked_vma);
> +		priv->locked_vma = NULL;
> +	}
> +}
> +
> +static const struct seq_operations proc_pid_maps_op;
> +
> +static inline bool lock_vma_range(struct seq_file *m,
> +				  struct proc_maps_private *priv)
> +{
> +	/*
> +	 * smaps and numa_maps perform page table walk, therefore require
> +	 * mmap_lock but maps can be read with locking just the vma.
> +	 */
> +	if (m->op != &proc_pid_maps_op) {
> +		if (mmap_read_lock_killable(priv->mm))
> +			return false;
> +
> +		priv->mmap_locked = true;
> +	} else {
> +		rcu_read_lock();
> +		priv->locked_vma = NULL;
> +		priv->mmap_locked = false;
> +	}
> +
> +	return true;
> +}
> +
> +static inline void unlock_vma_range(struct proc_maps_private *priv)
> +{
> +	if (priv->mmap_locked) {
> +		mmap_read_unlock(priv->mm);
> +	} else {
> +		unlock_vma(priv);
> +		rcu_read_unlock();
> +	}
> +}
> +
> +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> +					   loff_t last_pos)
> +{
> +	struct vm_area_struct *vma;
> +
> +	if (priv->mmap_locked)
> +		return vma_next(&priv->iter);
> +
> +	unlock_vma(priv);
> +	vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
> +	if (!IS_ERR_OR_NULL(vma))
> +		priv->locked_vma = vma;
> +
> +	return vma;
> +}
> +
> +#else /* CONFIG_PER_VMA_LOCK */
> +
> +static inline bool lock_vma_range(struct seq_file *m,
> +				  struct proc_maps_private *priv)
>  {
> -	struct vm_area_struct *vma = vma_next(&priv->iter);
> +	return mmap_read_lock_killable(priv->mm) == 0;
> +}
> +
> +static inline void unlock_vma_range(struct proc_maps_private *priv)
> +{
> +	mmap_read_unlock(priv->mm);
> +}
> +
> +static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
> +					   loff_t last_pos)
> +{
> +	return vma_next(&priv->iter);
> +}
>
> +#endif /* CONFIG_PER_VMA_LOCK */
> +
> +static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
> +{
> +	struct proc_maps_private *priv = m->private;
> +	struct vm_area_struct *vma;
> +
> +	vma = get_next_vma(priv, *ppos);
> +	/* EINTR is possible */
> +	if (IS_ERR(vma))
> +		return vma;
> +
> +	/* Store previous position to be able to restart if needed */
> +	priv->last_pos = *ppos;
>  	if (vma) {
> -		*ppos = vma->vm_start;
> +		/*
> +		 * Track the end of the reported vma to ensure position changes
> +		 * even if previous vma was merged with the next vma and we
> +		 * found the extended vma with the same vm_start.
> +		 */
> +		*ppos = vma->vm_end;
>  	} else {
> -		*ppos = -2;
> +		*ppos = -2;	/* -2 indicates gate vma */
>  		vma = get_gate_vma(priv->mm);
>  	}
>
> @@ -163,28 +255,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
>  		return NULL;
>  	}
>
> -	if (mmap_read_lock_killable(mm)) {
> +	if (!lock_vma_range(m, priv)) {
>  		mmput(mm);
>  		put_task_struct(priv->task);
>  		priv->task = NULL;
>  		return ERR_PTR(-EINTR);
>  	}
>
> +	/*
> +	 * Reset current position if last_addr was set before
> +	 * and it's not a sentinel.
> +	 */
> +	if (last_addr > 0)
> +		*ppos = last_addr = priv->last_pos;
>  	vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
>  	hold_task_mempolicy(priv);
>  	if (last_addr == -2)
>  		return get_gate_vma(mm);
>
> -	return proc_get_vma(priv, ppos);
> +	return proc_get_vma(m, ppos);
>  }
>
>  static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
>  {
>  	if (*ppos == -2) {
> -		*ppos = -1;
> +		*ppos = -1;	/* -1 indicates no more vmas */
>  		return NULL;
>  	}
> -	return proc_get_vma(m->private, ppos);
> +	return proc_get_vma(m, ppos);
>  }
>
>  static void m_stop(struct seq_file *m, void *v)
> @@ -196,7 +294,7 @@ static void m_stop(struct seq_file *m, void *v)
>  		return;
>
>  	release_task_mempolicy(priv);
> -	mmap_read_unlock(mm);
> +	unlock_vma_range(priv);
>  	mmput(mm);
>  	put_task_struct(priv->task);
>  	priv->task = NULL;
> diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
> index 5da384bd0a26..1f4f44951abe 100644
> --- a/include/linux/mmap_lock.h
> +++ b/include/linux/mmap_lock.h
> @@ -309,6 +309,17 @@ void vma_mark_detached(struct vm_area_struct *vma);
>  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  					  unsigned long address);
>
> +/*
> + * Locks next vma pointed by the iterator. Confirms the locked vma has not
> + * been modified and will retry under mmap_lock protection if modification
> + * was detected. Should be called from read RCU section.
> + * Returns either a valid locked VMA, NULL if no more VMAs or -EINTR if the
> + * process was interrupted.
> + */
> +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> +				     struct vma_iterator *iter,
> +				     unsigned long address);
> +
>  #else /* CONFIG_PER_VMA_LOCK */
>
>  static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
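
The comment block above describes the contract well. For anyone reading
along, the intended usage (condensed from the fs/proc side of this
patch; a sketch only, not meant to build stand-alone) is roughly:

	struct vma_iterator vmi;
	struct vm_area_struct *vma;
	unsigned long addr = 0;

	rcu_read_lock();
	vma_iter_init(&vmi, mm, addr);
	for (;;) {
		vma = lock_next_vma(mm, &vmi, addr);
		if (IS_ERR(vma))
			break;		/* -EINTR from the mmap_lock fallback */
		if (!vma)
			break;		/* no more vmas */
		/* vma is read-locked here, safe to report its fields */
		addr = vma->vm_end;
		vma_end_read(vma);
	}
	rcu_read_unlock();

Worth keeping in mind that the helper drops and re-takes the RCU read
lock internally when it falls back to the mmap_lock, so a caller must
not assume anything else it looked up under RCU is still valid once the
call returns.
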
> diff --git a/mm/madvise.c b/mm/madvise.c
> index a34c2c89a53b..e61e32b2cd91 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -108,7 +108,8 @@ void anon_vma_name_free(struct kref *kref)
>
>  struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
>  {
> -	mmap_assert_locked(vma->vm_mm);
> +	if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
> +		vma_assert_locked(vma);
>
>  	return vma->anon_name;
>  }
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index 5f725cc67334..ed0e5e2171cd 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -178,6 +178,94 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>  	count_vm_vma_lock_event(VMA_LOCK_ABORT);
>  	return NULL;
>  }
> +
> +static struct vm_area_struct *lock_vma_under_mmap_lock(struct mm_struct *mm,
> +							struct vma_iterator *iter,
> +							unsigned long address)
> +{
> +	struct vm_area_struct *vma;
> +	int ret;
> +
> +	ret = mmap_read_lock_killable(mm);
> +	if (ret)
> +		return ERR_PTR(ret);
> +
> +	/* Lookup the vma at the last position again under mmap_read_lock */
> +	vma_iter_init(iter, mm, address);
> +	vma = vma_next(iter);
> +	if (vma)
> +		vma_start_read_locked(vma);
> +
> +	mmap_read_unlock(mm);
> +
> +	return vma;
> +}
> +
> +struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
> +				     struct vma_iterator *iter,
> +				     unsigned long address)
> +{
> +	struct vm_area_struct *vma;
> +	unsigned int mm_wr_seq;
> +	bool mmap_unlocked;
> +
> +	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
> +retry:
> +	/* Start mmap_lock speculation in case we need to verify the vma later */
> +	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
> +	vma = vma_next(iter);
> +	if (!vma)
> +		return NULL;
> +
> +	vma = vma_start_read(mm, vma);
> +
> +	if (IS_ERR_OR_NULL(vma)) {
> +		/*
> +		 * Retry immediately if the vma gets detached from under us.
> +		 * Infinite loop should not happen because the vma we find will
> +		 * have to be constantly knocked out from under us.
> +		 */
> +		if (PTR_ERR(vma) == -EAGAIN) {
> +			vma_iter_init(iter, mm, address);
> +			goto retry;
> +		}
> +
> +		goto out;
> +	}
> +
> +	/*
> +	 * Verify the vma we locked belongs to the same address space and it's
> +	 * not behind of the last search position.
> +	 */
> +	if (unlikely(vma->vm_mm != mm || address >= vma->vm_end))
> +		goto out_unlock;
> +
> +	/*
> +	 * vma can be ahead of the last search position but we need to verify
> +	 * it was not shrunk after we found it and another vma has not been
> +	 * installed ahead of it. Otherwise we might observe a gap that should
> +	 * not be there.
> +	 */
> +	if (address < vma->vm_start) {
> +		/* Verify only if the address space might have changed since vma lookup. */
> +		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
> +			vma_iter_init(iter, mm, address);
> +			if (vma != vma_next(iter))
> +				goto out_unlock;
> +		}
> +	}
> +
> +	return vma;
> +
> +out_unlock:
> +	vma_end_read(vma);
> +out:
> +	rcu_read_unlock();
> +	vma = lock_vma_under_mmap_lock(mm, iter, address);
> +	rcu_read_lock();
> +
> +	return vma;
> +}
>  #endif /* CONFIG_PER_VMA_LOCK */
>
>  #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
> --
> 2.50.0.727.gbf7dc18ff4-goog
>
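
One last, entirely optional note: for anyone who wants to poke at the
new path from userspace, something along these lines should exercise it
(my own sketch, arbitrary sizes and iteration counts, build with
-pthread). One thread keeps splitting and unmapping vmas while the main
thread re-reads /proc/self/maps; before this change every pass contends
with the churn thread on mmap_lock, with it the reader mostly takes
per-vma locks and falls back to the mmap read lock only when a per-vma
lock attempt fails.

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	/* Keep modifying the address space to race with the maps reader. */
	static void *churn(void *arg)
	{
		(void)arg;
		for (;;) {
			void *p = mmap(NULL, 16 * 4096, PROT_READ,
				       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

			if (p == MAP_FAILED)
				continue;
			/* Punch a hole to force a vma split, then drop it all. */
			munmap((char *)p + 4096, 4096);
			munmap(p, 16 * 4096);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t;
		char line[512];

		if (pthread_create(&t, NULL, churn, NULL))
			return EXIT_FAILURE;

		for (int pass = 0; pass < 1000; pass++) {
			FILE *f = fopen("/proc/self/maps", "r");

			if (!f)
				return EXIT_FAILURE;
			/*
			 * Just walk the file; lines from different snapshots
			 * of the address space may show up in one pass, which
			 * was already permitted across output pages and is
			 * now permitted within a single read() as well.
			 */
			while (fgets(line, sizeof(line), f))
				;
			fclose(f);
		}
		return 0;	/* exiting main() also ends the churn thread */
	}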