From: Vladimir Davydov <vdavy...@parallels.com>

The logic behind the OOM berserker is the same as in PCS6: if processes
are killed by the OOM killer too often (less than sysctl vm.oom_relaxation
apart, 1 sec by default), we increase "rage" (min -10, max 20) and,
whenever "rage" >= 0, additionally kill up to 1 << "rage" of the youngest
tasks that are as bad as the chosen victim.
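
For example, with the default vm.oom_relaxation of 1 second: a burst of
OOM kills spaced less than a second apart raises "rage" by one per kill
starting from -10, so the 11th consecutive kill brings "rage" to 0 and
the berserker kills 1 extra task, the 12th up to 2, the 13th up to 4,
and so on, capped at 1 << 20. A quiet interval longer than
oom_relaxation resets "rage" back to -10.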

https://jira.sw.ru/browse/PSBM-17930

Signed-off-by: Vladimir Davydov <vdavy...@parallels.com>
[aryabinin: vz8 rebase]
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 include/linux/memcontrol.h |  5 +++
 include/linux/oom.h        |  4 ++
 mm/oom_kill.c              | 97 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+)
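
Note (not part of this patch): sysctl_oom_relaxation is defined in
mm/oom_kill.c below, but the kernel/sysctl.c hunk that exposes it as
vm.oom_relaxation is not included here. Assuming the usual wiring on a
4.18-era kernel, the vm_table entry would look roughly like the sketch
below (the proc handler is a guess; proc_dointvec_jiffies would expose
the value in seconds, proc_dointvec_ms_jiffies in milliseconds):

        /* Sketch only, not from this series: vm_table entry in kernel/sysctl.c */
        {
                .procname       = "oom_relaxation",
                .data           = &sysctl_oom_relaxation,
                .maxlen         = sizeof(sysctl_oom_relaxation),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,  /* assumed: value in seconds */
        },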

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c26041c681f2..0efabad868ce 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -260,6 +260,11 @@ struct mem_cgroup {
        /* OOM-Killer disable */
        int             oom_kill_disable;
 
+       int             oom_rage;
+       spinlock_t      oom_rage_lock;
+       unsigned long   prev_oom_time;
+       unsigned long   oom_time;
+
        /* memory.events */
        struct cgroup_file events_file;
 
diff --git a/include/linux/oom.h b/include/linux/oom.h
index b0ee726c1672..9a6d16a1ace5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -15,6 +15,9 @@ struct notifier_block;
 struct mem_cgroup;
 struct task_struct;
 
+#define OOM_BASE_RAGE  -10
+#define OOM_MAX_RAGE   20
+
 /*
  * Details of the page allocation that triggered the oom killer that are used to
  * determine what should be killed.
@@ -44,6 +47,7 @@ struct oom_control {
        unsigned long totalpages;
        struct task_struct *chosen;
        unsigned long chosen_points;
+       unsigned long overdraft;
 };
 
 extern struct mutex oom_lock;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ab436d94ae5d..e746b41d558c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,6 +53,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;
 
 DEFINE_MUTEX(oom_lock);
 
@@ -947,6 +948,101 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message)
        return 0;
 }
 
+/*
+ * Kill more processes if oom happens too often in this context.
+ */
+static void oom_berserker(struct oom_control *oc)
+{
+       static DEFINE_RATELIMIT_STATE(berserker_rs,
+                               DEFAULT_RATELIMIT_INTERVAL,
+                               DEFAULT_RATELIMIT_BURST);
+       struct task_struct *p;
+       struct mem_cgroup *memcg;
+       unsigned long now = jiffies;
+       int rage;
+       int killed = 0;
+
+       memcg = oc->memcg ?: root_mem_cgroup;
+
+       spin_lock(&memcg->oom_rage_lock);
+       memcg->prev_oom_time = memcg->oom_time;
+       memcg->oom_time = now;
+       /*
+        * Increase rage if oom happened recently in this context, reset
+        * rage otherwise.
+        *
+        * previous oom                            this oom (unfinished)
+        * +++++++++----------------------------++++++++
+        *        ^                                    ^
+        *  prev_oom_time  <<oom_relaxation>>      oom_time
+        */
+       if (time_after(now, memcg->prev_oom_time + sysctl_oom_relaxation))
+               memcg->oom_rage = OOM_BASE_RAGE;
+       else if (memcg->oom_rage < OOM_MAX_RAGE)
+               memcg->oom_rage++;
+       rage = memcg->oom_rage;
+       spin_unlock(&memcg->oom_rage_lock);
+
+       if (rage < 0)
+               return;
+
+       /*
+        * So, we are in rage. Kill (1 << rage) youngest tasks that are
+        * as bad as the victim.
+        */
+       read_lock(&tasklist_lock);
+       list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+               unsigned long tsk_points;
+               unsigned long tsk_overdraft;
+
+               if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+                       fatal_signal_pending(p) || p->flags & PF_EXITING ||
+                       oom_unkillable_task(p, oc->memcg, oc->nodemask))
+                       continue;
+
+               tsk_points = oom_badness(p, oc->memcg, oc->nodemask,
+                                       oc->totalpages, &tsk_overdraft);
+               if (tsk_overdraft < oc->overdraft)
+                       continue;
+
+               /*
+                * oom_badness never returns a negative value, even if
+                * oom_score_adj would make badness so, instead it
+                * returns 1. So we do not kill task with badness 1 if
+                * the victim has badness > 1 so as not to risk killing
+                * protected tasks.
+                */
+               if (tsk_points <= 1 && oc->chosen_points > 1)
+                       continue;
+
+               /*
+                * Consider tasks as equally bad if they have equal
+                * normalized scores.
+                */
+               if (tsk_points * 1000 / oc->totalpages <
+                       oc->chosen_points * 1000 / oc->totalpages)
+                       continue;
+
+               if (__ratelimit(&berserker_rs)) {
+                       task_lock(p);
+                       pr_err("Rage kill process %d (%s)\n",
+                               task_pid_nr(p), p->comm);
+                       task_unlock(p);
+               }
+
+               count_vm_event(OOM_KILL);
+               memcg_memory_event(memcg, MEMCG_OOM_KILL);
+
+               do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
+
+               if (++killed >= 1 << rage)
+                       break;
+       }
+       read_unlock(&tasklist_lock);
+
+       pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+}
+
 static void oom_kill_process(struct oom_control *oc, const char *message)
 {
        struct task_struct *victim = oc->chosen;
@@ -990,6 +1086,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
                                      (void*)message);
                mem_cgroup_put(oom_group);
        }
+       oom_berserker(oc);
 }
 
 /*
-- 
2.26.2
