On Thu, Dec 14, 2017 at 01:30:56PM -0800, David Rientjes wrote:
> Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers")
> prevented the oom reaper from unmapping private anonymous memory when the
> oom victim mm had mmu notifiers registered.
> 
> The rationale is that doing mmu_notifier_invalidate_range_{start,end}()
> around the unmap_page_range(), which is needed, can block and the oom
> killer will stall forever waiting for the victim to exit, which may not
> be possible without reaping.
> 
> That concern is real, but only true for mmu notifiers that have blockable
> invalidate_range_{start,end}() callbacks.  This patch adds a "flags" field
> to mmu notifier ops that can set a bit to indicate that these callbacks do
> not block.
> 
> The implementation is steered toward an expensive slowpath, such as after
> the oom reaper has grabbed mm->mmap_sem of a still alive oom victim.
> 
> Signed-off-by: David Rientjes <rient...@google.com>

Acked-by: Dimitri Sivanich <sivan...@hpe.com>

> ---
>  v2:
>    - specifically exclude mmu_notifiers without invalidate callbacks
>    - move flags to mmu_notifier_ops per Paolo
>    - reverse flag from blockable -> not blockable per Christian
> 
>  drivers/infiniband/hw/hfi1/mmu_rb.c |  1 +
>  drivers/iommu/amd_iommu_v2.c        |  1 +
>  drivers/iommu/intel-svm.c           |  1 +
>  drivers/misc/sgi-gru/grutlbpurge.c  |  1 +
>  include/linux/mmu_notifier.h        | 21 +++++++++++++++++++++
>  mm/mmu_notifier.c                   | 31 +++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c                 |  1 +
>  7 files changed, 57 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
> --- a/drivers/infiniband/hw/hfi1/mmu_rb.c
> +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
> @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
>  static void handle_remove(struct work_struct *work);
>  
>  static const struct mmu_notifier_ops mn_opts = {
> +     .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>       .invalidate_range_start = mmu_notifier_range_start,
>  };
>  
> diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
> --- a/drivers/iommu/amd_iommu_v2.c
> +++ b/drivers/iommu/amd_iommu_v2.c
> @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops iommu_mn = {
> +     .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
>       .release                = mn_release,
>       .clear_flush_young      = mn_clear_flush_young,
>       .invalidate_range       = mn_invalidate_range,
> diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
> --- a/drivers/iommu/intel-svm.c
> +++ b/drivers/iommu/intel-svm.c
> @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  }
>  
>  static const struct mmu_notifier_ops intel_mmuops = {
> +     .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
>       .release = intel_mm_release,
>       .change_pte = intel_change_pte,
>       .invalidate_range = intel_invalidate_range,
> diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
> --- a/drivers/misc/sgi-gru/grutlbpurge.c
> +++ b/drivers/misc/sgi-gru/grutlbpurge.c
> @@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
>  
>  
>  static const struct mmu_notifier_ops gru_mmuops = {
> +     .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
>       .invalidate_range_start = gru_invalidate_range_start,
>       .invalidate_range_end   = gru_invalidate_range_end,
>       .release                = gru_release,
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -10,6 +10,9 @@
>  struct mmu_notifier;
>  struct mmu_notifier_ops;
>  
> +/* mmu_notifier_ops flags */
> +#define MMU_INVALIDATE_DOES_NOT_BLOCK        (0x01)
> +
>  #ifdef CONFIG_MMU_NOTIFIER
>  
>  /*
> @@ -26,6 +29,15 @@ struct mmu_notifier_mm {
>  };
>  
>  struct mmu_notifier_ops {
> +     /*
> +      * Flags to specify behavior of callbacks for this MMU notifier.
> +      * Used to determine in which context an operation may be called.
> +      *
> +      * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_{start,end} do not
> +      *                                block
> +      */
> +     int flags;
> +
>       /*
>        * Called either by mmu_notifier_unregister or when the mm is
>        * being destroyed by exit_mmap, always before all pages are
> @@ -137,6 +149,9 @@ struct mmu_notifier_ops {
>        * page. Pages will no longer be referenced by the linux
>        * address space but may still be referenced by sptes until
>        * the last refcount is dropped.
> +      *
> +      * If both of these callbacks cannot block, mmu_notifier_ops.flags
> +      * should have MMU_INVALIDATE_DOES_NOT_BLOCK set.
>        */
>       void (*invalidate_range_start)(struct mmu_notifier *mn,
>                                      struct mm_struct *mm,
> @@ -218,6 +233,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
>                                 bool only_end);
>  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>                                 unsigned long start, unsigned long end);
> +extern int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
>  
>  static inline void mmu_notifier_release(struct mm_struct *mm)
>  {
> @@ -457,6 +473,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
>  {
>  }
>  
> +static inline int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +     return 0;
> +}
> +
>  static inline void mmu_notifier_mm_init(struct mm_struct *mm)
>  {
>  }
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
>  }
>  EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
>  
> +/*
> + * Must be called while holding mm->mmap_sem for either read or write.
> + * The result is guaranteed to be valid until mm->mmap_sem is dropped.
> + */
> +int mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
> +{
> +     struct mmu_notifier *mn;
> +     int id;
> +     int ret = 0;
> +
> +     WARN_ON_ONCE(down_write_trylock(&mm->mmap_sem));
> +
> +     if (!mm_has_notifiers(mm))
> +             return ret;
> +
> +     id = srcu_read_lock(&srcu);
> +     hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
> +             if (!mn->ops->invalidate_range &&
> +                 !mn->ops->invalidate_range_start &&
> +                 !mn->ops->invalidate_range_end)
> +                             continue;
> +
> +             if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
> +                     ret = 1;
> +                     break;
> +             }
> +     }
> +     srcu_read_unlock(&srcu, id);
> +     return ret;
> +}
> +
>  static int do_mmu_notifier_register(struct mmu_notifier *mn,
>                                   struct mm_struct *mm,
>                                   int take_mmap_sem)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
>  }
>  
>  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
> +     .flags                  = MMU_INVALIDATE_DOES_NOT_BLOCK,
>       .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
>       .invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
>       .clear_flush_young      = kvm_mmu_notifier_clear_flush_young,

Reply via email to