On Thu, Dec 14, 2017 at 01:30:56PM -0800, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory when the
> oom victim's mm had mmu notifiers registered.
>
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block, and the oom
> killer will then stall forever waiting for the victim to exit, which may
> not be possible without reaping.
>
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops in which a bit can be set to indicate that these
> callbacks do not block.
>
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still-alive oom victim.
>
> Signed-off-by: David Rientjes <rient...@google.com>
Acked-by: Dimitri Sivanich <sivan...@hpe.com>

> ---
>  v2:
>   - specifically exclude mmu_notifiers without invalidate callbacks
>   - move flags to mmu_notifier_ops per Paolo
>   - reverse flag from blockable -> not blockable per Christian
>
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
>
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>  static void handle_remove(struct work_struct *work);
>
>  static const struct mmu_notifier_ops mn_opts = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start = mmu_notifier_range_start,
>  };
>
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>
>  static const struct mmu_notifier_ops iommu_mn = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release = mn_release,
>  	.clear_flush_young = mn_clear_flush_young,
>  	.invalidate_range = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>
>  static const struct mmu_notifier_ops intel_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.release = intel_mm_release,
>  	.change_pte = intel_change_pte,
>  	.invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>
>
>  static const struct mmu_notifier_ops gru_mmuops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start = gru_invalidate_range_start,
>  	.invalidate_range_end = gru_invalidate_range_end,
>  	.release = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>
>  struct mmu_notifier_ops {
> +	/*
> +	 * Flags to specify behavior of callbacks for this MMU notifier.
> +	 * Used to determine which context an operation may be called.
> +	 *
> +	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_{start,end} does not
> +	 *	block
> +	 */
> +	int flags;
> +
>  	/*
>  	 * Called either by mmu_notifier_unregister or when the mm is
>  	 * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>  	 * page. Pages will no longer be referenced by the linux
>  	 * address space but may still be referenced by sptes until
>  	 * the last refcount is dropped.
> +	 *
> +	 * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +	 * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>  	 */
>  	void (*invalidate_range_start)(struct mmu_notifier *mn,
>  				       struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>  				  bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  				  unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>  {
>  }
>
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	return 0;
> +}
> +
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>  {
>  }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +	struct mmu_notifier *mn;
> +	int id;
> +	int ret = 0;
> +
> +	WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +	if (!mm_has_notifiers(mm))
> +		return ret;
> +
> +	id = srcu_read_lock(&srcu);
> +	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +		if (!mn->ops->invalidate_range &&
> +		    !mn->ops->invalidate_range_start &&
> +		    !mn->ops->invalidate_range_end)
> +			continue;
> +
> +		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +			ret = 1;
> +			break;
> +		}
> +	}
> +	srcu_read_unlock(&srcu, id);
> +	return ret;
> +}
> +
>  static int do_mmu_notifier_register(struct mmu_notifier *mn,
>  				    struct mm_struct *mm,
>  				    int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>  	.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
>  	.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
>  	.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
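
For reference, the "expensive slowpath" consumer mentioned in the
changelog is not part of this patch.  A minimal sketch of how the oom
reaper side might use the new helper follows; __oom_reap_task_mm() and
the bail-out policy shown here are illustrative, modeled on the
existing reaper loop in mm/oom_kill.c rather than taken from a posted
patch:

	#include <linux/mm.h>
	#include <linux/mmu_notifier.h>
	#include <linux/sched.h>

	static bool __oom_reap_task_mm(struct task_struct *tsk,
				       struct mm_struct *mm)
	{
		if (!down_read_trylock(&mm->mmap_sem))
			return false;		/* contended, retry later */

		/*
		 * mm_has_blockable_invalidate_notifiers() must be called
		 * with mmap_sem held; its answer is then stable until the
		 * lock is dropped.  If any registered notifier may block
		 * in invalidate_range_{start,end}(), skip reaping rather
		 * than risk stalling inside a notifier.
		 */
		if (mm_has_blockable_invalidate_notifiers(mm)) {
			up_read(&mm->mmap_sem);
			return true;		/* give up on this mm */
		}

		/* ... unmap_page_range() over private anonymous vmas ... */

		up_read(&mm->mmap_sem);
		return true;
	}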
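
Worth noting the polarity chosen in v2: because the bit is opt-in, any
notifier that has not been audited (including out-of-tree users)
defaults to "may block", so unconverted drivers keep the conservative
no-reap behavior of commit 4d4bbd8526a8.  And notifiers that implement
no invalidate callbacks at all are skipped by the explicit callback
check in mm_has_blockable_invalidate_notifiers(), so they never inhibit
reaping regardless of the flag.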