The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh8-4.18.0-240.1.1.vz8.5.3 ------> commit fd0c0eddf619ad335ed60170bdb7024e6df818d6 Author: Vladimir Davydov <vdavydov....@gmail.com> Date: Mon Dec 21 19:49:32 2020 +0300
oom: resurrect berserker mode The logic behind the OOM berserker is the same as in PCS6: if the oom killer kills processes too often (consecutive kills less than sysctl vm.oom_relaxation apart, 1 sec by default), we increase "rage" (min -10, max 20); otherwise "rage" is reset to its minimum. If "rage" >= 0, we additionally kill up to 1 << "rage" of the youngest processes that are at least as bad as the chosen victim. https://jira.sw.ru/browse/PSBM-17930 Signed-off-by: Vladimir Davydov <vdavy...@parallels.com> [aryabinin: vz8 rebase] Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com> --- include/linux/memcontrol.h | 6 +++ include/linux/oom.h | 5 +++ mm/oom_kill.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 917e6ab9b1ab..d4d49160ee40 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -258,6 +258,12 @@ struct mem_cgroup { /* OOM-Killer disable */ int oom_kill_disable; + int oom_rage; + spinlock_t oom_rage_lock; + unsigned long prev_oom_time; + unsigned long oom_time; + + /* memory.events */ struct cgroup_file events_file; diff --git a/include/linux/oom.h b/include/linux/oom.h index 0dc94a5bad9e..8ae3aaa00a0f 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -22,6 +22,10 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; + +#define OOM_BASE_RAGE -10 +#define OOM_MAX_RAGE 20 + /* * Details of the page allocation that triggered the oom killer that are used to * determine what should be killed. @@ -51,6 +55,7 @@ struct oom_control { unsigned long totalpages; struct task_struct *chosen; unsigned long chosen_points; + unsigned long overdraft; /* Used to print the constraint info. 
*/ enum oom_constraint constraint; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fe34e85f62ec..353fb22da98c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,6 +53,7 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; +int sysctl_oom_relaxation = HZ; DEFINE_MUTEX(oom_lock); @@ -955,6 +956,101 @@ static int oom_kill_memcg_member(struct task_struct *task, void *message) return 0; } +/* + * Kill more processes if oom happens too often in this context. + */ +static void oom_berserker(struct oom_control *oc) +{ + static DEFINE_RATELIMIT_STATE(berserker_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct task_struct *p; + struct mem_cgroup *memcg; + unsigned long now = jiffies; + int rage; + int killed = 0; + + memcg = oc->memcg ?: root_mem_cgroup; + + spin_lock(&memcg->oom_rage_lock); + memcg->prev_oom_time = memcg->oom_time; + memcg->oom_time = now; + /* + * Increase rage if oom happened recently in this context, reset + * rage otherwise. + * + * previous oom this oom (unfinished) + * +++++++++----------------------------++++++++ + * ^ ^ + * prev_oom_time <<oom_relaxation>> oom_time + */ + if (time_after(now, memcg->prev_oom_time + sysctl_oom_relaxation)) + memcg->oom_rage = OOM_BASE_RAGE; + else if (memcg->oom_rage < OOM_MAX_RAGE) + memcg->oom_rage++; + rage = memcg->oom_rage; + spin_unlock(&memcg->oom_rage_lock); + + if (rage < 0) + return; + + /* + * So, we are in rage. Kill (1 << rage) youngest tasks that are + * as bad as the victim. 
+ */ + read_lock(&tasklist_lock); + list_for_each_entry_reverse(p, &init_task.tasks, tasks) { + unsigned long tsk_points; + unsigned long tsk_overdraft; + + if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) || + fatal_signal_pending(p) || p->flags & PF_EXITING || + oom_unkillable_task(p, oc->memcg, oc->nodemask)) + continue; + + tsk_points = oom_badness(p, oc->memcg, oc->nodemask, + oc->totalpages, &tsk_overdraft); + if (tsk_overdraft < oc->overdraft) + continue; + + /* + * oom_badness never returns a negative value, even if + * oom_score_adj would make badness so, instead it + * returns 1. So we do not kill task with badness 1 if + * the victim has badness > 1 so as not to risk killing + * protected tasks. + */ + if (tsk_points <= 1 && oc->chosen_points > 1) + continue; + + /* + * Consider tasks as equally bad if they have equal + * normalized scores. + */ + if (tsk_points * 1000 / oc->totalpages < + oc->chosen_points * 1000 / oc->totalpages) + continue; + + if (__ratelimit(&berserker_rs)) { + task_lock(p); + pr_err("Rage kill process %d (%s)\n", + task_pid_nr(p), p->comm); + task_unlock(p); + } + + count_vm_event(OOM_KILL); + memcg_memory_event(memcg, MEMCG_OOM_KILL); + + do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID); + + if (++killed >= 1 << rage) + break; + } + read_unlock(&tasklist_lock); + + pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed); +} + static void oom_kill_process(struct oom_control *oc, const char *message) { struct task_struct *victim = oc->chosen; @@ -998,6 +1094,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) (void*)message); mem_cgroup_put(oom_group); } + oom_berserker(oc); } /* _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel