The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.8.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-229.7.2.vz7.8.6 ------> commit e651315e4475767b41a7e028c6127b25c5754312 Author: Vladimir Davydov <vdavy...@parallels.com> Date: Thu Oct 15 17:53:03 2015 +0400
oom: resurrect berserker mode Patchset description: oom enhancements - part 2 - Patches 1-2 prepare memcg for upcoming changes in oom design. - Patch 3 reworks oom locking design so that the executioner waits for victim to exit. This is necessary to increase oom kill rate, which is essential for berserker mode. - Patch 4 drops unused OOM_SCAN_ABORT - Patch 5 introduces oom timeout. https://jira.sw.ru/browse/PSBM-38581 - Patch 6 makes oom fairer when it comes to selecting a victim among different containers. https://jira.sw.ru/browse/PSBM-37915 - Patch 7 prepares oom for introducing berserker mode - Patch 8 resurrects oom berserker mode, which is supposed to cope with actively forking processes. https://jira.sw.ru/browse/PSBM-17930 https://jira.sw.ru/browse/PSBM-26973 Changes in v3: - rework oom_trylock (patch 3) - select exiting process instead of aborting oom scan so as not to keep busy-waiting for an exiting process to exit (patches 3, 4) - cleanup oom timeout handling + fix stuck process trace dumped multiple times on timeout (patch 5) - set max_overdraft to ULONG_MAX on selected processes (patch 6) - rework oom berserker process selection logic (patches 7, 8) Changes in v2: - s/time_after/time_after_eq to avoid BUG_ON in oom_trylock (patch 4) - propagate victim to the context that initiated oom in oom_unlock (patch 6) - always set oom_end on releasing oom context (patch 6) Vladimir Davydov (8): memcg: add mem_cgroup_get/put helpers memcg: add lock for protecting memcg->oom_notify list oom: rework locking design oom: introduce oom timeout oom: drop OOM_SCAN_ABORT oom: rework logic behind memory.oom_guarantee oom: pass points and overdraft to oom_kill_process oom: resurrect berserker mode Reviewed-by: Kirill Tkhai <ktk...@odin.com> ========================================= This patch description: The logic behind the OOM berserker is the same as in PCS6: if processes are killed by oom killer too often (< sysctl vm.oom_relaxation, 1 sec by default), we increase "rage" (min -10, max 20) and kill 1 << "rage" youngest worst processes if "rage" >= 0. https://jira.sw.ru/browse/PSBM-17930 Signed-off-by: Vladimir Davydov <vdavy...@parallels.com> --- include/linux/oom.h | 3 ++ kernel/sysctl.c | 7 ++++ mm/oom_kill.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+) diff --git a/include/linux/oom.h b/include/linux/oom.h index 6ea83b2..acf58fc 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -35,7 +35,9 @@ struct oom_context { struct task_struct *victim; bool marked; unsigned long oom_start; + unsigned long oom_end; unsigned long overdraft; + int rage; wait_queue_head_t waitq; }; @@ -126,4 +128,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; extern int sysctl_panic_on_oom; +extern int sysctl_oom_relaxation; #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 976f48c..9c081e3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1184,6 +1184,13 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "oom_relaxation", + .data = &sysctl_oom_relaxation, + .maxlen = sizeof(sysctl_oom_relaxation), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, .maxlen = sizeof(sysctl_overcommit_ratio), diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d8a89c0..6d16154 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -42,13 +42,18 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; +int sysctl_oom_relaxation = HZ; static DEFINE_SPINLOCK(oom_context_lock); #define OOM_TIMEOUT (5 * HZ) +#define OOM_BASE_RAGE -10 +#define OOM_MAX_RAGE 20 + #ifndef CONFIG_MEMCG struct oom_context oom_ctx = { + .rage = OOM_BASE_RAGE, .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(oom_ctx.waitq), }; #endif @@ -59,6 +64,8 @@ void init_oom_context(struct oom_context *ctx) ctx->victim = NULL; ctx->marked = false; ctx->oom_start = 0; + ctx->oom_end = 0; + ctx->rage = OOM_BASE_RAGE; init_waitqueue_head(&ctx->waitq); } @@ -67,6 +74,7 @@ static void __release_oom_context(struct oom_context *ctx) ctx->owner = NULL; ctx->victim = NULL; ctx->marked = false; + ctx->oom_end = jiffies; wake_up_all(&ctx->waitq); } @@ -690,6 +698,102 @@ void oom_unlock(struct mem_cgroup *memcg) mem_cgroup_put(victim_memcg); } +/* + * Kill more processes if oom happens too often in this context. + */ +static void oom_berserker(unsigned long points, unsigned long overdraft, + unsigned long totalpages, struct mem_cgroup *memcg, + nodemask_t *nodemask) +{ + static DEFINE_RATELIMIT_STATE(berserker_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct oom_context *ctx; + struct task_struct *p; + int rage; + int killed = 0; + + spin_lock(&oom_context_lock); + ctx = mem_cgroup_oom_context(memcg); + if (ctx->owner != current) { + /* Lost ownership on timeout */ + spin_unlock(&oom_context_lock); + return; + } + /* + * Increase rage if oom happened recently in this context, reset + * rage otherwise. + * + * previous oom this oom (unfinished) + * ++++++++++++----------------------------++++++++ + * ^ ^ + * oom_end <<oom_relaxation>> oom_start + */ + if (time_after(ctx->oom_start, ctx->oom_end + sysctl_oom_relaxation)) + ctx->rage = OOM_BASE_RAGE; + else if (ctx->rage < OOM_MAX_RAGE) + ctx->rage++; + rage = ctx->rage; + spin_unlock(&oom_context_lock); + + if (rage < 0) + return; + + /* + * So, we are in rage. Kill (1 << rage) youngest tasks that are + * as bad as the victim. + */ + read_lock(&tasklist_lock); + list_for_each_entry_reverse(p, &init_task.tasks, tasks) { + unsigned long tsk_points; + unsigned long tsk_overdraft; + + if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) || + fatal_signal_pending(p) || p->flags & PF_EXITING || + oom_unkillable_task(p, memcg, nodemask)) + continue; + + tsk_points = oom_badness(p, memcg, nodemask, totalpages, + &tsk_overdraft); + if (tsk_overdraft < overdraft) + continue; + + /* + * oom_badness never returns a negative value, even if + * oom_score_adj would make badness so, instead it + * returns 1. So we do not kill task with badness 1 if + * the victim has badness > 1 so as not to risk killing + * protected tasks. + */ + if (tsk_points <= 1 && points > 1) + continue; + + /* + * Consider tasks as equally bad if they have equal + * normalized scores. + */ + if (tsk_points * 1000 / totalpages < + points * 1000 / totalpages) + continue; + + if (__ratelimit(&berserker_rs)) { + task_lock(p); + pr_err("Rage kill process %d (%s)\n", + task_pid_nr(p), p->comm); + task_unlock(p); + } + + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + mem_cgroup_note_oom_kill(memcg, p); + + if (++killed >= 1 << rage) + break; + } + read_unlock(&tasklist_lock); + + pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -805,6 +909,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mem_cgroup_note_oom_kill(memcg, victim); put_task_struct(victim); + + oom_berserker(points, overdraft, totalpages, memcg, nodemask); } #undef K _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel