On 2018/06/15 5:42, David Rientjes wrote:
>  Note: I understand there is an objection based on timeout based delays.
>  This is currently the only possible way to avoid oom killing important
>  processes completely unnecessarily.  If the oom reaper can someday free
>  all memory, including mlocked memory and those mm's with blockable mmu
>  notifiers, and is guaranteed to always be able to grab mm->mmap_sem,
>  this can be removed.  I do not believe any such guarantee is possible
>  and consider the massive killing of additional processes unnecessarily
>  to be a regression introduced by the oom reaper and its very quick
>  setting of MMF_OOM_SKIP to allow additional processes to be oom killed.
> 

Here is my version of your proposal, including my anti-lockup series.
My version uses the OOM badness score as feedback for deciding when to give up.

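As a rough illustration of the feedback idea, here is a minimal userspace
sketch (not the patch itself): the victim's mm score is sampled at most once
per 100ms, the stall counter is reset whenever the score shrinks, and we only
give up after 30 consecutive samples without progress (roughly 3 seconds),
mirroring victim_mm_stalling() below. The score sequence is made up purely for
demonstration.

/* Userspace sketch of the "give up reaping" feedback loop. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_STALLS 30	/* ~3 seconds at one sample per 100ms */

struct victim_state {
	unsigned long last_score;
	unsigned int stall_count;
};

/* True once the score has not decreased for MAX_STALLS samples in a row. */
static bool victim_stalling(struct victim_state *v, unsigned long score)
{
	if (score < v->last_score)
		v->stall_count = 0;	/* reaping is still making progress */
	else
		v->stall_count++;	/* nothing was freed since last sample */
	v->last_score = score;
	return v->stall_count >= MAX_STALLS;
}

int main(void)
{
	struct victim_state v = { .last_score = ~0UL };
	unsigned long score = 100000;
	unsigned int sample;

	for (sample = 0; ; sample++) {
		/* Pretend reaping frees memory for a while, then stalls. */
		if (sample < 20)
			score -= 1000;
		if (victim_stalling(&v, score)) {
			printf("gave up after %u samples\n", sample + 1);
			break;
		}
	}
	return 0;
}
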
---
 drivers/tty/sysrq.c            |   2 -
 include/linux/memcontrol.h     |   9 +-
 include/linux/oom.h            |   7 +-
 include/linux/sched.h          |   7 +-
 include/linux/sched/coredump.h |   1 -
 kernel/fork.c                  |   2 +
 mm/memcontrol.c                |  24 +--
 mm/mmap.c                      |  17 +-
 mm/oom_kill.c                  | 383 +++++++++++++++++------------------------
 mm/page_alloc.c                |  73 +++-----
 10 files changed, 202 insertions(+), 323 deletions(-)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 6364890..c8b66b9 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -376,10 +376,8 @@ static void moom_callback(struct work_struct *ignored)
                .order = -1,
        };
 
-       mutex_lock(&oom_lock);
        if (!out_of_memory(&oc))
                pr_info("OOM request ignored. No task eligible\n");
-       mutex_unlock(&oom_lock);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb11..a82360a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -382,8 +382,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
-int mem_cgroup_scan_tasks(struct mem_cgroup *,
-                         int (*)(struct task_struct *, void *), void *);
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+                          void (*fn)(struct task_struct *, void *), void *arg);
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 {
@@ -850,10 +850,9 @@ static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
 {
 }
 
-static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-               int (*fn)(struct task_struct *, void *), void *arg)
+static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+               void (*fn)(struct task_struct *, void *), void *arg)
 {
-       return 0;
 }
 
 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6adac11..09cfa8e 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -44,8 +44,6 @@ struct oom_control {
        unsigned long chosen_points;
 };
 
-extern struct mutex oom_lock;
-
 static inline void set_current_oom_origin(void)
 {
        current->signal->oom_flag_origin = true;
@@ -68,7 +66,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 
 /*
  * Use this helper if tsk->mm != mm and the victim mm needs a special
- * handling. This is guaranteed to stay true after once set.
+ * handling.
  */
 static inline bool mm_is_oom_victim(struct mm_struct *mm)
 {
@@ -95,7 +93,8 @@ static inline int check_stable_address_space(struct mm_struct *mm)
        return 0;
 }
 
-void __oom_reap_task_mm(struct mm_struct *mm);
+extern void oom_reap_mm(struct mm_struct *mm);
+extern bool try_oom_notifier(void);
 
 extern unsigned long oom_badness(struct task_struct *p,
                struct mem_cgroup *memcg, const nodemask_t *nodemask,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 87bf02d..e23fc7f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1162,9 +1162,10 @@ struct task_struct {
        unsigned long                   task_state_change;
 #endif
        int                             pagefault_disabled;
-#ifdef CONFIG_MMU
-       struct task_struct              *oom_reaper_list;
-#endif
+       struct list_head                oom_victim_list;
+       unsigned long                   last_oom_compared;
+       unsigned long                   last_oom_score;
+       unsigned char                   oom_reap_stall_count;
 #ifdef CONFIG_VMAP_STACK
        struct vm_struct                *stack_vm_area;
 #endif
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index ec912d0..d30615e 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -66,7 +66,6 @@ static inline int get_dumpable(struct mm_struct *mm)
 
 #define MMF_HAS_UPROBES                19      /* has uprobes */
 #define MMF_RECALC_UPROBES     20      /* MMF_HAS_UPROBES can be wrong */
-#define MMF_OOM_SKIP           21      /* mm is of no interest for the OOM killer */
 #define MMF_UNSTABLE           22      /* mm is unstable for copy_from_user */
 #define MMF_HUGE_ZERO_PAGE     23      /* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP                24      /* disable THP for all VMAs */
diff --git a/kernel/fork.c b/kernel/fork.c
index 9440d61..5ad2b19 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -977,6 +977,8 @@ static inline void __mmput(struct mm_struct *mm)
        }
        if (mm->binfmt)
                module_put(mm->binfmt->module);
+       if (unlikely(mm_is_oom_victim(mm)))
+               clear_bit(MMF_OOM_VICTIM, &mm->flags);
        mmdrop(mm);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e6f0d5e..35c33bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -884,17 +884,14 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  * @arg: argument passed to @fn
  *
  * This function iterates over tasks attached to @memcg or to any of its
- * descendants and calls @fn for each task. If @fn returns a non-zero
- * value, the function breaks the iteration loop and returns the value.
- * Otherwise, it will iterate over all tasks and return 0.
+ * descendants and calls @fn for each task.
  *
  * This function must not be called for the root memory cgroup.
  */
-int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
-                         int (*fn)(struct task_struct *, void *), void *arg)
+void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
+                          void (*fn)(struct task_struct *, void *), void *arg)
 {
        struct mem_cgroup *iter;
-       int ret = 0;
 
        BUG_ON(memcg == root_mem_cgroup);
 
@@ -903,15 +900,10 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                struct task_struct *task;
 
                css_task_iter_start(&iter->css, 0, &it);
-               while (!ret && (task = css_task_iter_next(&it)))
-                       ret = fn(task, arg);
+               while ((task = css_task_iter_next(&it)))
+                       fn(task, arg);
                css_task_iter_end(&it);
-               if (ret) {
-                       mem_cgroup_iter_break(memcg, iter);
-                       break;
-               }
        }
-       return ret;
 }
 
 /**
@@ -1206,12 +1198,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                .gfp_mask = gfp_mask,
                .order = order,
        };
-       bool ret;
 
-       mutex_lock(&oom_lock);
-       ret = out_of_memory(&oc);
-       mutex_unlock(&oom_lock);
-       return ret;
+       return out_of_memory(&oc);
 }
 
 #if MAX_NUMNODES > 1
diff --git a/mm/mmap.c b/mm/mmap.c
index d1eb87e..2b422dd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3059,25 +3059,18 @@ void exit_mmap(struct mm_struct *mm)
        if (unlikely(mm_is_oom_victim(mm))) {
                /*
                 * Manually reap the mm to free as much memory as possible.
-                * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
-                * this mm from further consideration.  Taking mm->mmap_sem for
-                * write after setting MMF_OOM_SKIP will guarantee that the oom
-                * reaper will not run on this mm again after mmap_sem is
-                * dropped.
+                * Then, by taking mm->mmap_sem for write, tell
+                * oom_has_pending_victims() to no longer call oom_reap_mm().
                 *
                 * Nothing can be holding mm->mmap_sem here and the above call
                 * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
-                * __oom_reap_task_mm() will not block.
+                * oom_reap_mm() will not block.
                 *
                 * This needs to be done before calling munlock_vma_pages_all(),
-                * which clears VM_LOCKED, otherwise the oom reaper cannot
+                * which clears VM_LOCKED, otherwise oom_reap_mm() cannot
                 * reliably test it.
                 */
-               mutex_lock(&oom_lock);
-               __oom_reap_task_mm(mm);
-               mutex_unlock(&oom_lock);
-
-               set_bit(MMF_OOM_SKIP, &mm->flags);
+               oom_reap_mm(mm);
                down_write(&mm->mmap_sem);
                up_write(&mm->mmap_sem);
        }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 84081e7..36bc02f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,7 +38,6 @@
 #include <linux/freezer.h>
 #include <linux/ftrace.h>
 #include <linux/ratelimit.h>
-#include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
 
@@ -49,11 +48,17 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/oom.h>
 
+static inline unsigned long oom_victim_mm_score(struct mm_struct *mm)
+{
+       return get_mm_rss(mm) + get_mm_counter(mm, MM_SWAPENTS) +
+               mm_pgtables_bytes(mm) / PAGE_SIZE;
+}
+
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 
-DEFINE_MUTEX(oom_lock);
+static DEFINE_MUTEX(oom_lock);
 
 #ifdef CONFIG_NUMA
 /**
@@ -201,19 +206,19 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
        if (oom_unkillable_task(p, memcg, nodemask))
                return 0;
 
+       if (tsk_is_oom_victim(p))
+               return 0;
+
        p = find_lock_task_mm(p);
        if (!p)
                return 0;
 
        /*
         * Do not even consider tasks which are explicitly marked oom
-        * unkillable or have been already oom reaped or the are in
-        * the middle of vfork
+        * unkillable or they are in the middle of vfork
         */
        adj = (long)p->signal->oom_score_adj;
-       if (adj == OOM_SCORE_ADJ_MIN ||
-                       test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
-                       in_vfork(p)) {
+       if (adj == OOM_SCORE_ADJ_MIN || in_vfork(p)) {
                task_unlock(p);
                return 0;
        }
@@ -222,8 +227,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
         * The baseline for the badness score is the proportion of RAM that each
         * task's rss, pagetable and swap space use.
         */
-       points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-               mm_pgtables_bytes(p->mm) / PAGE_SIZE;
+       points = oom_victim_mm_score(p->mm);
        task_unlock(p);
 
        /* Normalize to oom_score_adj units */
@@ -304,25 +308,13 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
        return CONSTRAINT_NONE;
 }
 
-static int oom_evaluate_task(struct task_struct *task, void *arg)
+static void oom_evaluate_task(struct task_struct *task, void *arg)
 {
        struct oom_control *oc = arg;
        unsigned long points;
 
        if (oom_unkillable_task(task, NULL, oc->nodemask))
-               goto next;
-
-       /*
-        * This task already has access to memory reserves and is being killed.
-        * Don't allow any other task to have access to the reserves unless
-        * the task has MMF_OOM_SKIP because chances that it would release
-        * any memory is quite low.
-        */
-       if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
-               if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
-                       goto next;
-               goto abort;
-       }
+               return;
 
        /*
         * If task is allocating a lot of memory and has been marked to be
@@ -335,29 +327,22 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
 
        points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
        if (!points || points < oc->chosen_points)
-               goto next;
+               return;
 
        /* Prefer thread group leaders for display purposes */
        if (points == oc->chosen_points && thread_group_leader(oc->chosen))
-               goto next;
+               return;
 select:
        if (oc->chosen)
                put_task_struct(oc->chosen);
        get_task_struct(task);
        oc->chosen = task;
        oc->chosen_points = points;
-next:
-       return 0;
-abort:
-       if (oc->chosen)
-               put_task_struct(oc->chosen);
-       oc->chosen = (void *)-1UL;
-       return 1;
 }
 
 /*
  * Simple selection loop. We choose the process with the highest number of
- * 'points'. In case scan was aborted, oc->chosen is set to -1.
+ * 'points'.
  */
 static void select_bad_process(struct oom_control *oc)
 {
@@ -368,8 +353,7 @@ static void select_bad_process(struct oom_control *oc)
 
                rcu_read_lock();
                for_each_process(p)
-                       if (oom_evaluate_task(p, oc))
-                               break;
+                       oom_evaluate_task(p, oc);
                rcu_read_unlock();
        }
 
@@ -451,6 +435,29 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+static bool victim_mm_stalling(struct task_struct *p, struct mm_struct *mm)
+{
+       unsigned long score;
+
+       if (time_before(jiffies, p->last_oom_compared + HZ / 10))
+               return false;
+       score = oom_victim_mm_score(mm);
+       if (score < p->last_oom_score)
+               p->oom_reap_stall_count = 0;
+       else
+               p->oom_reap_stall_count++;
+       p->last_oom_score = oom_victim_mm_score(mm);
+       p->last_oom_compared = jiffies;
+       if (p->oom_reap_stall_count < 30)
+               return false;
+       pr_info("Gave up waiting for process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+               task_pid_nr(p), p->comm, K(mm->total_vm),
+               K(get_mm_counter(mm, MM_ANONPAGES)),
+               K(get_mm_counter(mm, MM_FILEPAGES)),
+               K(get_mm_counter(mm, MM_SHMEMPAGES)));
+       return true;
+}
+
 /*
  * task->mm can be NULL if the task is the exited group leader.  So to
  * determine whether the task is using a particular mm, we examine all the
@@ -469,17 +476,10 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
        return false;
 }
 
-#ifdef CONFIG_MMU
-/*
- * OOM Reaper kernel thread which tries to reap the memory used by the OOM
- * victim (if that is possible) to help the OOM killer to move on.
- */
-static struct task_struct *oom_reaper_th;
-static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
-static struct task_struct *oom_reaper_list;
-static DEFINE_SPINLOCK(oom_reaper_lock);
+static LIST_HEAD(oom_victim_list);
 
-void __oom_reap_task_mm(struct mm_struct *mm)
+#ifdef CONFIG_MMU
+void oom_reap_mm(struct mm_struct *mm)
 {
        struct vm_area_struct *vma;
 
@@ -518,152 +518,20 @@ void __oom_reap_task_mm(struct mm_struct *mm)
                }
        }
 }
-
-static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
-{
-       bool ret = true;
-
-       /*
-        * We have to make sure to not race with the victim exit path
-        * and cause premature new oom victim selection:
-        * oom_reap_task_mm             exit_mm
-        *   mmget_not_zero
-        *                                mmput
-        *                                  atomic_dec_and_test
-        *                                exit_oom_victim
-        *                              [...]
-        *                              out_of_memory
-        *                                select_bad_process
-        *                                  # no TIF_MEMDIE task selects new victim
-        *  unmap_page_range # frees some memory
-        */
-       mutex_lock(&oom_lock);
-
-       if (!down_read_trylock(&mm->mmap_sem)) {
-               ret = false;
-               trace_skip_task_reaping(tsk->pid);
-               goto unlock_oom;
-       }
-
-       /*
-        * If the mm has invalidate_{start,end}() notifiers that could block,
-        * sleep to give the oom victim some more time.
-        * TODO: we really want to get rid of this ugly hack and make sure that
-        * notifiers cannot block for unbounded amount of time
-        */
-       if (mm_has_blockable_invalidate_notifiers(mm)) {
-               up_read(&mm->mmap_sem);
-               schedule_timeout_idle(HZ);
-               goto unlock_oom;
-       }
-
-       /*
-        * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
-        * work on the mm anymore. The check for MMF_OOM_SKIP must run
-        * under mmap_sem for reading because it serializes against the
-        * down_write();up_write() cycle in exit_mmap().
-        */
-       if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
-               up_read(&mm->mmap_sem);
-               trace_skip_task_reaping(tsk->pid);
-               goto unlock_oom;
-       }
-
-       trace_start_task_reaping(tsk->pid);
-
-       __oom_reap_task_mm(mm);
-
-       pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-                       task_pid_nr(tsk), tsk->comm,
-                       K(get_mm_counter(mm, MM_ANONPAGES)),
-                       K(get_mm_counter(mm, MM_FILEPAGES)),
-                       K(get_mm_counter(mm, MM_SHMEMPAGES)));
-       up_read(&mm->mmap_sem);
-
-       trace_finish_task_reaping(tsk->pid);
-unlock_oom:
-       mutex_unlock(&oom_lock);
-       return ret;
-}
-
-#define MAX_OOM_REAP_RETRIES 10
-static void oom_reap_task(struct task_struct *tsk)
-{
-       int attempts = 0;
-       struct mm_struct *mm = tsk->signal->oom_mm;
-
-       /* Retry the down_read_trylock(mmap_sem) a few times */
-       while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
-               schedule_timeout_idle(HZ/10);
-
-       if (attempts <= MAX_OOM_REAP_RETRIES ||
-           test_bit(MMF_OOM_SKIP, &mm->flags))
-               goto done;
-
-       pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
-               task_pid_nr(tsk), tsk->comm);
-       debug_show_all_locks();
-
-done:
-       tsk->oom_reaper_list = NULL;
-
-       /*
-        * Hide this mm from OOM killer because it has been either reaped or
-        * somebody can't call up_write(mmap_sem).
-        */
-       set_bit(MMF_OOM_SKIP, &mm->flags);
-
-       /* Drop a reference taken by wake_oom_reaper */
-       put_task_struct(tsk);
-}
-
-static int oom_reaper(void *unused)
-{
-       while (true) {
-               struct task_struct *tsk = NULL;
-
-               wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-               spin_lock(&oom_reaper_lock);
-               if (oom_reaper_list != NULL) {
-                       tsk = oom_reaper_list;
-                       oom_reaper_list = tsk->oom_reaper_list;
-               }
-               spin_unlock(&oom_reaper_lock);
-
-               if (tsk)
-                       oom_reap_task(tsk);
-       }
-
-       return 0;
-}
+#endif
 
 static void wake_oom_reaper(struct task_struct *tsk)
 {
-       /* tsk is already queued? */
-       if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+       if (tsk->oom_victim_list.next)
                return;
 
        get_task_struct(tsk);
-
-       spin_lock(&oom_reaper_lock);
-       tsk->oom_reaper_list = oom_reaper_list;
-       oom_reaper_list = tsk;
-       spin_unlock(&oom_reaper_lock);
-       trace_wake_reaper(tsk->pid);
-       wake_up(&oom_reaper_wait);
-}
-
-static int __init oom_init(void)
-{
-       oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
-       return 0;
-}
-subsys_initcall(oom_init)
-#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
-{
+       tsk->oom_reap_stall_count = 0;
+       tsk->last_oom_compared = jiffies;
+       tsk->last_oom_score = oom_victim_mm_score(tsk->signal->oom_mm);
+       lockdep_assert_held(&oom_lock);
+       list_add_tail(&tsk->oom_victim_list, &oom_victim_list);
 }
-#endif /* CONFIG_MMU */
 
 /**
  * mark_oom_victim - mark the given task as OOM victim
@@ -806,10 +674,11 @@ static bool task_will_free_mem(struct task_struct *task)
                return false;
 
        /*
-        * This task has already been drained by the oom reaper so there are
-        * only small chances it will free some more
+        * If memory reserves granted to this task were not sufficient, allow
+        * killing more processes after oom_has_pending_victims() has finished
+        * reaping this mm.
         */
-       if (test_bit(MMF_OOM_SKIP, &mm->flags))
+       if (tsk_is_oom_victim(task))
                return false;
 
        if (atomic_read(&mm->mm_users) <= 1)
@@ -946,7 +815,6 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
                        continue;
                if (is_global_init(p)) {
                        can_oom_reap = false;
-                       set_bit(MMF_OOM_SKIP, &mm->flags);
                        pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
                                        task_pid_nr(victim), victim->comm,
                                        task_pid_nr(p), p->comm);
@@ -1009,6 +877,72 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+bool try_oom_notifier(void)
+{
+       static DEFINE_MUTEX(lock);
+       unsigned long freed = 0;
+
+       /*
+        * This lock serializes concurrent calls to OOM notifiers, both to
+        * protect notifiers which are not thread safe and to avoid releasing
+        * an excessive amount of memory from notifiers which release memory
+        * every time they are called.
+        */
+       if (!mutex_trylock(&lock))
+               return true;
+       blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
+       mutex_unlock(&lock);
+       return freed > 0;
+}
+
+/*
+ * A reference to "struct task_struct" taken by wake_oom_reaper() remains on
+ * the oom_victim_list until somebody finds that this mm has already completed
+ * __mmput() or has failed to complete it for too long.
+ */
+static bool oom_has_pending_victims(struct oom_control *oc)
+{
+       struct task_struct *p, *tmp;
+       bool ret = false;
+       bool gaveup = false;
+
+       lockdep_assert_held(&oom_lock);
+       list_for_each_entry_safe(p, tmp, &oom_victim_list, oom_victim_list) {
+               struct mm_struct *mm = p->signal->oom_mm;
+
+               /* Forget about mm which already completed __mmput(). */
+               if (!test_bit(MMF_OOM_VICTIM, &mm->flags))
+                       goto remove;
+               /* Skip OOM victims which current thread cannot select. */
+               if (oom_unkillable_task(p, oc->memcg, oc->nodemask))
+                       continue;
+               ret = true;
+#ifdef CONFIG_MMU
+               /*
+                * We need to hold mmap_sem for read, in order to safely test
+                * MMF_UNSTABLE flag and blockable invalidate notifiers.
+                */
+               if (down_read_trylock(&mm->mmap_sem)) {
+                       if (!test_bit(MMF_UNSTABLE, &mm->flags) &&
+                           !mm_has_blockable_invalidate_notifiers(mm))
+                               oom_reap_mm(mm);
+                       up_read(&mm->mmap_sem);
+               }
+#endif
+               /* Give up if this mm failed to complete __mmput() for too long. */
+               if (!victim_mm_stalling(p, mm))
+                       continue;
+               gaveup = true;
+remove:
+               list_del(&p->oom_victim_list);
+               put_task_struct(p);
+       }
+       if (gaveup)
+               debug_show_all_locks();
+
+       return ret && !is_sysrq_oom(oc);
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @oc: pointer to struct oom_control
@@ -1020,18 +954,8 @@ int unregister_oom_notifier(struct notifier_block *nb)
  */
 bool out_of_memory(struct oom_control *oc)
 {
-       unsigned long freed = 0;
        enum oom_constraint constraint = CONSTRAINT_NONE;
-
-       if (oom_killer_disabled)
-               return false;
-
-       if (!is_memcg_oom(oc)) {
-               blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
-               if (freed > 0)
-                       /* Got some memory back in the last second. */
-                       return true;
-       }
+       const char *prompt;
 
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
@@ -1045,15 +969,6 @@ bool out_of_memory(struct oom_control *oc)
        }
 
        /*
-        * The OOM killer does not compensate for IO-less reclaim.
-        * pagefault_out_of_memory lost its gfp context so we have to
-        * make sure exclude 0 mask - all other users should have at least
-        * ___GFP_DIRECT_RECLAIM to get here.
-        */
-       if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
-               return true;
-
-       /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA and memcg) that may require different handling.
         */
@@ -1067,32 +982,46 @@ bool out_of_memory(struct oom_control *oc)
            current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
                get_task_struct(current);
                oc->chosen = current;
-               oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
-               return true;
+               prompt = "Out of memory (oom_kill_allocating_task)";
+       } else {
+               select_bad_process(oc);
+               prompt = !is_memcg_oom(oc) ? "Out of memory" :
+                       "Memory cgroup out of memory";
        }
-
-       select_bad_process(oc);
        /* Found nothing?!?! Either we hang forever, or we panic. */
-       if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
+       if (!oc->chosen) {
+               if (is_sysrq_oom(oc) || is_memcg_oom(oc))
+                       return false;
                dump_header(oc, NULL);
                panic("Out of memory and no killable processes...\n");
        }
-       if (oc->chosen && oc->chosen != (void *)-1UL) {
-               oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
-                                "Memory cgroup out of memory");
-               /*
-                * Give the killed process a good chance to exit before trying
-                * to allocate memory again.
-                */
-               schedule_timeout_killable(1);
-       }
-       return !!oc->chosen;
+       mutex_lock(&oom_lock);
+       /*
+        * If there are OOM victims which current thread can select,
+        * wait for them to reach __mmput().
+        *
+        * If oom_killer_disable() is in progress, we can't select new OOM
+        * victims.
+        *
+        * The OOM killer does not compensate for IO-less reclaim.
+        * pagefault_out_of_memory lost its gfp context so we have to
+        * make sure exclude 0 mask - all other users should have at least
+        * ___GFP_DIRECT_RECLAIM to get here.
+        *
+        * Otherwise, invoke the OOM-killer.
+        */
+       if (oom_has_pending_victims(oc) || oom_killer_disabled ||
+           (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)))
+               put_task_struct(oc->chosen);
+       else
+               oom_kill_process(oc, prompt);
+       mutex_unlock(&oom_lock);
+       return !oom_killer_disabled;
 }
 
 /*
  * The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
- * killing is already in progress so do nothing.
+ * memory-hogging task.
  */
 void pagefault_out_of_memory(void)
 {
@@ -1107,8 +1036,6 @@ void pagefault_out_of_memory(void)
        if (mem_cgroup_oom_synchronize(true))
                return;
 
-       if (!mutex_trylock(&oom_lock))
-               return;
        out_of_memory(&oc);
-       mutex_unlock(&oom_lock);
+       schedule_timeout_killable(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100..cd7f9db 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3460,29 +3460,16 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
        };
        struct page *page;
 
-       *did_some_progress = 0;
-
-       /*
-        * Acquire the oom lock.  If that fails, somebody else is
-        * making progress for us.
-        */
-       if (!mutex_trylock(&oom_lock)) {
-               *did_some_progress = 1;
-               schedule_timeout_uninterruptible(1);
-               return NULL;
-       }
+       *did_some_progress = try_oom_notifier();
 
        /*
         * Go through the zonelist yet one more time, keep very high watermark
         * here, this is only to catch a parallel oom killing, we must fail if
-        * we're still under heavy pressure. But make sure that this reclaim
-        * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
-        * allocation which will never fail due to oom_lock already held.
+        * we're still under heavy pressure.
         */
-       page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
-                                     ~__GFP_DIRECT_RECLAIM, order,
+       page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL), order,
                                      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
-       if (page)
+       if (page || *did_some_progress)
                goto out;
 
        /* Coredumps can quickly deplete all memory reserves */
@@ -3531,7 +3518,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
                                        ALLOC_NO_WATERMARKS, ac);
        }
 out:
-       mutex_unlock(&oom_lock);
        return page;
 }
 
@@ -3863,21 +3849,6 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
        return alloc_flags;
 }
 
-static bool oom_reserves_allowed(struct task_struct *tsk)
-{
-       if (!tsk_is_oom_victim(tsk))
-               return false;
-
-       /*
-        * !MMU doesn't have oom reaper so give access to memory reserves
-        * only to the thread with TIF_MEMDIE set
-        */
-       if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
-               return false;
-
-       return true;
-}
-
 /*
  * Distinguish requests which really need access to full memory
  * reserves from oom victims which can live with a portion of it
@@ -3893,7 +3864,7 @@ static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
        if (!in_interrupt()) {
                if (current->flags & PF_MEMALLOC)
                        return ALLOC_NO_WATERMARKS;
-               else if (oom_reserves_allowed(current))
+               else if (tsk_is_oom_victim(current))
                        return ALLOC_OOM;
        }
 
@@ -3922,6 +3893,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 {
        struct zone *zone;
        struct zoneref *z;
+       bool ret = false;
 
        /*
         * Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +3957,26 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
                                }
                        }
 
-                       /*
-                        * Memory allocation/reclaim might be called from a WQ
-                        * context and the current implementation of the WQ
-                        * concurrency control doesn't recognize that
-                        * a particular WQ is congested if the worker thread is
-                        * looping without ever sleeping. Therefore we have to
-                        * do a short sleep here rather than calling
-                        * cond_resched().
-                        */
-                       if (current->flags & PF_WQ_WORKER)
-                               schedule_timeout_uninterruptible(1);
-                       else
-                               cond_resched();
-
-                       return true;
+                       ret = true;
+                       goto out;
                }
        }
 
-       return false;
+out:
+       /*
+        * Memory allocation/reclaim might be called from a WQ
+        * context and the current implementation of the WQ
+        * concurrency control doesn't recognize that
+        * a particular WQ is congested if the worker thread is
+        * looping without ever sleeping. Therefore we have to
+        * do a short sleep here rather than calling
+        * cond_resched().
+        */
+       if (current->flags & PF_WQ_WORKER)
+               schedule_timeout_uninterruptible(1);
+       else
+               cond_resched();
+       return ret;
 }
 
 static inline bool
-- 
1.8.3.1

