Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-12-02 Thread Michal Hocko
On Tue 01-12-15 15:43:53, Andrew Morton wrote:
[...]
> > diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> > index 1ecc0bc..bdbf83b 100644
> > --- a/mm/oom_kill.c
> > +++ b/mm/oom_kill.c
> > @@ -42,6 +42,7 @@
> >  int sysctl_panic_on_oom;
> >  int sysctl_oom_kill_allocating_task;
> >  int sysctl_oom_dump_tasks = 1;
> > +int sysctl_oom_dump_stack = 1;
> >  
> >  DEFINE_MUTEX(oom_lock);
> >  
> > @@ -384,7 +385,8 @@ static void dump_header(struct oom_control *oc, struct 
> > task_struct *p,
> > current->signal->oom_score_adj);
> > cpuset_print_task_mems_allowed(current);
> > task_unlock(current);
> > -   dump_stack();
> > +   if (sysctl_oom_dump_stack)
> > +   dump_stack();
> > if (memcg)
> > mem_cgroup_print_oom_info(memcg, p);
> > else
> 
> The patch seems reasonable to me, but it's missing the required update
> to Documentation/sysctl/vm.txt.

I thought we have agreed to go via KERN_DEBUG log level for the
backtrace. Aristeu has posted a patch for that already.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-12-01 Thread David Rientjes
On Tue, 1 Dec 2015, Andrew Morton wrote:

> > --- a/include/linux/oom.h
> > +++ b/include/linux/oom.h
> > @@ -115,6 +115,7 @@ static inline bool task_will_free_mem(struct 
> > task_struct *task)
> >  
> >  /* sysctls */
> >  extern int sysctl_oom_dump_tasks;
> > +extern int sysctl_oom_dump_stack;
> >  extern int sysctl_oom_kill_allocating_task;
> >  extern int sysctl_panic_on_oom;
> >  #endif /* _INCLUDE_LINUX_OOM_H */
> > diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> > index e69201d..c812523 100644
> > --- a/kernel/sysctl.c
> > +++ b/kernel/sysctl.c
> > @@ -1176,6 +1176,13 @@ static struct ctl_table vm_table[] = {
> > .proc_handler   = proc_dointvec,
> > },
> > {
> > +   .procname   = "oom_dump_stack",
> > +   .data   = &sysctl_oom_dump_stack,
> > +   .maxlen = sizeof(sysctl_oom_dump_stack),
> > +   .mode   = 0644,
> > +   .proc_handler   = proc_dointvec,
> > +   },
> > +   {
> > .procname   = "overcommit_ratio",
> > .data   = &sysctl_overcommit_ratio,
> > .maxlen = sizeof(sysctl_overcommit_ratio),
> > diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> > index 1ecc0bc..bdbf83b 100644
> > --- a/mm/oom_kill.c
> > +++ b/mm/oom_kill.c
> > @@ -42,6 +42,7 @@
> >  int sysctl_panic_on_oom;
> >  int sysctl_oom_kill_allocating_task;
> >  int sysctl_oom_dump_tasks = 1;
> > +int sysctl_oom_dump_stack = 1;
> >  
> >  DEFINE_MUTEX(oom_lock);
> >  
> > @@ -384,7 +385,8 @@ static void dump_header(struct oom_control *oc, struct 
> > task_struct *p,
> > current->signal->oom_score_adj);
> > cpuset_print_task_mems_allowed(current);
> > task_unlock(current);
> > -   dump_stack();
> > +   if (sysctl_oom_dump_stack)
> > +   dump_stack();
> > if (memcg)
> > mem_cgroup_print_oom_info(memcg, p);
> > else
> 
> The patch seems reasonable to me, but it's missing the required update
> to Documentation/sysctl/vm.txt.
> 

Not sure why we'd want yet-another-sysctl for something that can trivially 
filtered from the log, but owell.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-12-01 Thread Andrew Morton
On Fri, 23 Oct 2015 17:02:30 -0400 Aristeu Rozanski  wrote:

> One of the largest chunks of log messages in a OOM is from dump_stack() and in
> some cases it isn't even necessary to figure out what's going on. In
> systems with multiple tenants/containers with limited resources each
> OOMs can be way more frequent and being able to reduce the amount of log
> output for each situation is useful.
> 
> This patch adds a sysctl to allow disabling dump_stack() during an OOM while
> keeping the default to behave the same way it behaves today.

Can you get the same effect by using "dmesg -n "?  Probably not, I
didn't look.

> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -115,6 +115,7 @@ static inline bool task_will_free_mem(struct task_struct 
> *task)
>  
>  /* sysctls */
>  extern int sysctl_oom_dump_tasks;
> +extern int sysctl_oom_dump_stack;
>  extern int sysctl_oom_kill_allocating_task;
>  extern int sysctl_panic_on_oom;
>  #endif /* _INCLUDE_LINUX_OOM_H */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..c812523 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1176,6 +1176,13 @@ static struct ctl_table vm_table[] = {
>   .proc_handler   = proc_dointvec,
>   },
>   {
> + .procname   = "oom_dump_stack",
> + .data   = &sysctl_oom_dump_stack,
> + .maxlen = sizeof(sysctl_oom_dump_stack),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec,
> + },
> + {
>   .procname   = "overcommit_ratio",
>   .data   = &sysctl_overcommit_ratio,
>   .maxlen = sizeof(sysctl_overcommit_ratio),
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 1ecc0bc..bdbf83b 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -42,6 +42,7 @@
>  int sysctl_panic_on_oom;
>  int sysctl_oom_kill_allocating_task;
>  int sysctl_oom_dump_tasks = 1;
> +int sysctl_oom_dump_stack = 1;
>  
>  DEFINE_MUTEX(oom_lock);
>  
> @@ -384,7 +385,8 @@ static void dump_header(struct oom_control *oc, struct 
> task_struct *p,
>   current->signal->oom_score_adj);
>   cpuset_print_task_mems_allowed(current);
>   task_unlock(current);
> - dump_stack();
> + if (sysctl_oom_dump_stack)
> + dump_stack();
>   if (memcg)
>   mem_cgroup_print_oom_info(memcg, p);
>   else

The patch seems reasonable to me, but it's missing the required update
to Documentation/sysctl/vm.txt.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-28 Thread David Rientjes
On Tue, 27 Oct 2015, Aristeu Rozanski wrote:

> Hi Michal,
> On Tue, Oct 27, 2015 at 05:20:47PM +0100, Michal Hocko wrote:
> > Yes this is a mess. But I think it is worth cleaning up.
> > dump_stack_print_info (arch independent) has a log level parameter.
> > show_stack_log_lvl (x86) has a loglevel parameter which is unused.
> > 
> > I haven't checked other architectures but the transition doesn't have to
> > be all at once I guess.
> 
> Ok, will keep working on it then.
> 

No objection on changing the loglevel of the stack trace from the oom 
killer and the bonus is that we can avoid yet another tunable, yay!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-27 Thread Aristeu Rozanski
Hi Michal,
On Tue, Oct 27, 2015 at 05:20:47PM +0100, Michal Hocko wrote:
> Yes this is a mess. But I think it is worth cleaning up.
> dump_stack_print_info (arch independent) has a log level parameter.
> show_stack_log_lvl (x86) has a loglevel parameter which is unused.
> 
> I haven't checked other architectures but the transition doesn't have to
> be all at once I guess.

Ok, will keep working on it then.

-- 
Aristeu

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-27 Thread Michal Hocko
On Tue 27-10-15 11:43:42, Aristeu Rozanski wrote:
> Hi Michal,
> On Tue, Oct 27, 2015 at 09:09:21AM +0100, Michal Hocko wrote:
> > On Mon 26-10-15 13:40:49, Aristeu Rozanski wrote:
> > > Hi Michal,
> > > On Mon, Oct 26, 2015 at 06:20:12PM +0100, Michal Hocko wrote:
> > [...]
> > > > Would it make more sense to distinguish different parts of the OOM
> > > > report by loglevel properly?
> > > > pr_err - killed task report
> > > > pr_warning - oom invocation + memory info
> > > > pr_notice - task list
> > > > pr_info - stack trace
> > > 
> > > That'd work, yes, but I'd think the stack trace would be pr_debug. At a
> > > point that you suspect the OOM killer isn't doing the right thing picking
> > > up tasks and you need more information.
> > 
> > Stack trace should be independent on the oom victim selection because
> > the selection should be as much deterministic as possible - so it should
> > only depend on the memory consumption. I do agree that the exact trace
> > is not very useful for the (maybe) majority of OOM reports. I am trying
> > to remember when it was really useful the last time and have trouble to
> > find an example. So I would tend to agree that pr_debug would me more
> > suitable.
> 
> Only problem I see so far with this approach is that it'll require
> reworing show_stack() on all architectures in order to be able to pass
> and use log level and I'm wondering if it's something people will find
> useful for other uses.

Yes this is a mess. But I think it is worth cleaning up.
dump_stack_print_info (arch independent) has a log level parameter.
show_stack_log_lvl (x86) has a loglevel parameter which is unused.

I haven't checked other architectures but the transition doesn't have to
be all at once I guess.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-27 Thread Aristeu Rozanski
Hi Michal,
On Tue, Oct 27, 2015 at 09:09:21AM +0100, Michal Hocko wrote:
> On Mon 26-10-15 13:40:49, Aristeu Rozanski wrote:
> > Hi Michal,
> > On Mon, Oct 26, 2015 at 06:20:12PM +0100, Michal Hocko wrote:
> [...]
> > > Would it make more sense to distinguish different parts of the OOM
> > > report by loglevel properly?
> > > pr_err - killed task report
> > > pr_warning - oom invocation + memory info
> > > pr_notice - task list
> > > pr_info - stack trace
> > 
> > That'd work, yes, but I'd think the stack trace would be pr_debug. At a
> > point that you suspect the OOM killer isn't doing the right thing picking
> > up tasks and you need more information.
> 
> Stack trace should be independent on the oom victim selection because
> the selection should be as much deterministic as possible - so it should
> only depend on the memory consumption. I do agree that the exact trace
> is not very useful for the (maybe) majority of OOM reports. I am trying
> to remember when it was really useful the last time and have trouble to
> find an example. So I would tend to agree that pr_debug would me more
> suitable.

Only problem I see so far with this approach is that it'll require
reworing show_stack() on all architectures in order to be able to pass
and use log level and I'm wondering if it's something people will find
useful for other uses.

-- 
Aristeu

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-27 Thread Michal Hocko
On Mon 26-10-15 13:40:49, Aristeu Rozanski wrote:
> Hi Michal,
> On Mon, Oct 26, 2015 at 06:20:12PM +0100, Michal Hocko wrote:
[...]
> > Would it make more sense to distinguish different parts of the OOM
> > report by loglevel properly?
> > pr_err - killed task report
> > pr_warning - oom invocation + memory info
> > pr_notice - task list
> > pr_info - stack trace
> 
> That'd work, yes, but I'd think the stack trace would be pr_debug. At a
> point that you suspect the OOM killer isn't doing the right thing picking
> up tasks and you need more information.

Stack trace should be independent on the oom victim selection because
the selection should be as much deterministic as possible - so it should
only depend on the memory consumption. I do agree that the exact trace
is not very useful for the (maybe) majority of OOM reports. I am trying
to remember when it was really useful the last time and have trouble to
find an example. So I would tend to agree that pr_debug would me more
suitable.
-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-26 Thread David Rientjes
On Fri, 23 Oct 2015, Aristeu Rozanski wrote:

> One of the largest chunks of log messages in a OOM is from dump_stack() and in
> some cases it isn't even necessary to figure out what's going on. In
> systems with multiple tenants/containers with limited resources each
> OOMs can be way more frequent and being able to reduce the amount of log
> output for each situation is useful.
> 
> This patch adds a sysctl to allow disabling dump_stack() during an OOM while
> keeping the default to behave the same way it behaves today.
> 
> Cc: Greg Thelen 
> Cc: Johannes Weiner 
> Cc: linux...@kvack.org
> Cc: cgro...@vger.kernel.org
> Signed-off-by: Aristeu Rozanski 

There's lots of information in the oom log that is irrelevant depending on 
the context in which the oom condition occurred.  Removing the stack trace 
would have made things like commit 9a185e5861e8 ("/proc/stat: convert to 
single_open_size()") harder to fix.  In that case, we were calling the oom 
killer on large file reads from procfs when we could have easily have 
used vmalloc() instead.

When you have a memcg oom kill, the state of the system's memory can 
usually be suppressed because it only occurred because a memcg hierarchy 
reached its limit and has nothing to do with the exhaustion of RAM.

We already control oom output with global sysctls like vm.oom_dump_tasks 
and memcg tunables like memory.oom_verbose.  I'm not sure that adding more 
and more tunables simply to control the oom killer output is in the best 
interest of either procfs or a long-term maintainable kernel.

I can understand the usefulness of having a very small amount of output to 
the kernel log and then enabling tunables to investigate why oom kills are 
happening, but in many situations I've found to only have the oom killer 
output left behind in a kernel log and the situation is not on-going so I 
can't start diagnosing the problem if I don't know what triggered it.

I think adding additional sysctls to control oom killer output is in the 
wrong direction.  I do agree with removing anything that is irrelevant in 
all situations, however.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-26 Thread Aristeu Rozanski
On Mon, Oct 26, 2015 at 12:01:11PM -0400, Johannes Weiner wrote:
> I think this makes sense.
> 
> The high volume log output is not just annoying, we have also had
> reports from people whose machines locked up as they tried to log
> hundreds of containers through a low-bandwidth serial console.
> 
> Could you include sample output of before and after in the changelog
> to provide an immediate comparison on what we are saving?

Sure, at the end of email.

> Should we make the knob specific to the stack dump or should it be
> more generic, so that we could potentially save even more output?

Perhaps what Michal proposed, to use printk() levels.

Sure:
wc -l on the logs:
  47 /tmp/today.txt
  27 /tmp/without_dump_stack.txt

as is:

[248285.939528] memhog invoked oom-killer: gfp_mask=0x280da, order=0, 
oom_score_adj=0
[248285.939531] memhog cpuset=/ mems_allowed=0
[248285.939535] CPU: 1 PID: 2130 Comm: memhog Not tainted 4.3.0-rc6+ #132
[248285.939536] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[248285.939538]  88007c037d58 812bda53 8800313990c0 
811b2f3a
[248285.939539]  1288 811082a9 0001 
88007fffbb28
[248285.939541]  0003 0206 88007c037d58 

[248285.939542] Call Trace:
[248285.939548]  [] ? dump_stack+0x40/0x5d
[248285.939550]  [] ? dump_header+0x76/0x1e1
[248285.939553]  [] ? delayacct_end+0x39/0x60
[248285.939556]  [] ? oom_kill_process+0x1be/0x380
[248285.939559]  [] ? security_capable_noaudit+0x3e/0x60
[248285.939563]  [] ? out_of_memory+0x44b/0x460
[248285.939565]  [] ? __alloc_pages_nodemask+0x893/0x9e0
[248285.939567]  [] ? alloc_pages_vma+0xc7/0x230
[248285.939570]  [] ? mem_cgroup_try_charge+0x7c/0x1a0
[248285.939572]  [] ? handle_mm_fault+0x130a/0x1680
[248285.939574]  [] ? do_mmap+0x336/0x420
[248285.939575]  [] ? vm_mmap_pgoff+0x9c/0xc0
[248285.939578]  [] ? __do_page_fault+0x186/0x410
[248285.939581]  [] ? async_page_fault+0x28/0x30
[248285.939582] Mem-Info:
[248285.939585] active_anon:501670 inactive_anon:43 isolated_anon:0
 active_file:16 inactive_file:21 isolated_file:0
 unevictable:0 dirty:9 writeback:0 unstable:0
 slab_reclaimable:1780 slab_unreclaimable:2045
 mapped:6 shmem:59 pagetables:1474 bounce:0
 free:3388 free_pcp:0 free_cma:0
[248285.939587] Node 0 DMA free:7988kB min:40kB low:48kB high:60kB 
active_anon:7540kB inactive_anon:4kB active_file:8kB inactive_file:8kB 
unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15992kB 
managed:15908kB mlocked:0kB dirty:0kB writeback:0kB mapped:12kB shmem:12kB 
slab_reclaimable:28kB slab_unreclaimable:56kB kernel_stack:64kB pagetables:56kB 
unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB 
writeback_tmp:0kB pages_scanned:100 all_unreclaimable? yes
[248285.939590] lowmem_reserve[]: 0 1988 1988 1988
[248285.939592] Node 0 DMA32 free:5564kB min:5588kB low:6984kB high:8380kB 
active_anon:1999140kB inactive_anon:168kB active_file:56kB inactive_file:76kB 
unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2080760kB 
managed:2038396kB mlocked:0kB dirty:36kB writeback:0kB mapped:12kB shmem:224kB 
slab_reclaimable:7092kB slab_unreclaimable:8124kB kernel_stack:2448kB 
pagetables:5840kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB 
free_cma:0kB writeback_tmp:0kB pages_scanned:1260 all_unreclaimable? yes
[248285.939595] lowmem_reserve[]: 0 0 0 0
[248285.939597] Node 0 DMA: 6*4kB (UEM) 2*8kB (UE) 5*16kB (UE) 0*32kB 5*64kB 
(UE) 5*128kB (UEM) 3*256kB (UE) 2*512kB (EM) 3*1024kB (UE) 1*2048kB (U) 
0*4096kB = 7992kB
[248285.939604] Node 0 DMA32: 197*4kB (UEM) 46*8kB (UE) 11*16kB (UE) 4*32kB 
(UEM) 0*64kB 3*128kB (UEM) 1*256kB (M) 3*512kB (U) 2*1024kB (UM) 0*2048kB 
0*4096kB = 5684kB
[248285.939610] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 
hugepages_size=2048kB
[248285.939611] 80 total pagecache pages
[248285.939612] 0 pages in swap cache
[248285.939613] Swap cache stats: add 0, delete 0, find 0/0
[248285.939613] Free swap  = 0kB
[248285.939614] Total swap = 0kB
[248285.939614] 524188 pages RAM
[248285.939615] 0 pages HighMem/MovableOnly
[248285.939616] 10612 pages reserved
[248285.939616] 0 pages hwpoisoned
[248285.939617] Out of memory: Kill process 2130 (memhog) score 943 or 
sacrifice child
[248285.939726] Killed process 2130 (memhog) total-vm:1998720kB, 
anon-rss:1994396kB, file-rss:4kB
-

without stack trace:
-
[248310.662881] memhog invoked oom-killer: gfp_mask=0x201da, order=0, 
oom_score_adj=0
[248310.662885] memhog cpuset=/ mems_allowed=0
[248310.662888] Mem-Info:
[248310.662891] active_anon:501678 inactive_anon:43 isolated_anon:0
 active_file:23 inactive_file:13 isolated_file:0
 unevictable:0 dirty:16 writeback:0 unstable:0
 

Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-26 Thread Aristeu Rozanski
Hi Michal,
On Mon, Oct 26, 2015 at 06:20:12PM +0100, Michal Hocko wrote:
> I can see why you want to reduce the amount of information, I guess you
> have tried to reduce the loglevel but this hasn't helped because
> dump_stack uses default log level which is too low to be usable, right?
> Or are there any other reasons?

One would be that the stack trace isn't very useful for users IMHO.

> I am not sure sysctl is a good way to tell this particular restriction
> on the output. What if somebody else doesn't want to see the list of
> eligible tasks? Should we add another knob?
>
> Would it make more sense to distinguish different parts of the OOM
> report by loglevel properly?
> pr_err - killed task report
> pr_warning - oom invocation + memory info
> pr_notice - task list
> pr_info - stack trace

That'd work, yes, but I'd think the stack trace would be pr_debug. At a
point that you suspect the OOM killer isn't doing the right thing picking
up tasks and you need more information.

-- 
Aristeu

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-26 Thread Michal Hocko
On Fri 23-10-15 17:02:30, Aristeu Rozanski wrote:
> One of the largest chunks of log messages in a OOM is from dump_stack() and in
> some cases it isn't even necessary to figure out what's going on. In
> systems with multiple tenants/containers with limited resources each
> OOMs can be way more frequent and being able to reduce the amount of log
> output for each situation is useful.

I can see why you want to reduce the amount of information, I guess you
have tried to reduce the loglevel but this hasn't helped because
dump_stack uses default log level which is too low to be usable, right?
Or are there any other reasons?
 
> This patch adds a sysctl to allow disabling dump_stack() during an OOM while
> keeping the default to behave the same way it behaves today.

I am not sure sysctl is a good way to tell this particular restriction
on the output. What if somebody else doesn't want to see the list of
eligible tasks? Should we add another knob?

Would it make more sense to distinguish different parts of the OOM
report by loglevel properly?
pr_err - killed task report
pr_warning - oom invocation + memory info
pr_notice - task list
pr_info - stack trace

> Cc: Greg Thelen 
> Cc: Johannes Weiner 
> Cc: linux...@kvack.org
> Cc: cgro...@vger.kernel.org
> Signed-off-by: Aristeu Rozanski 
> ---
>  include/linux/oom.h | 1 +
>  kernel/sysctl.c | 7 +++
>  mm/oom_kill.c   | 4 +++-
>  3 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 03e6257..bdd03e5 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -115,6 +115,7 @@ static inline bool task_will_free_mem(struct task_struct 
> *task)
>  
>  /* sysctls */
>  extern int sysctl_oom_dump_tasks;
> +extern int sysctl_oom_dump_stack;
>  extern int sysctl_oom_kill_allocating_task;
>  extern int sysctl_panic_on_oom;
>  #endif /* _INCLUDE_LINUX_OOM_H */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..c812523 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1176,6 +1176,13 @@ static struct ctl_table vm_table[] = {
>   .proc_handler   = proc_dointvec,
>   },
>   {
> + .procname   = "oom_dump_stack",
> + .data   = &sysctl_oom_dump_stack,
> + .maxlen = sizeof(sysctl_oom_dump_stack),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec,
> + },
> + {
>   .procname   = "overcommit_ratio",
>   .data   = &sysctl_overcommit_ratio,
>   .maxlen = sizeof(sysctl_overcommit_ratio),
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 1ecc0bc..bdbf83b 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -42,6 +42,7 @@
>  int sysctl_panic_on_oom;
>  int sysctl_oom_kill_allocating_task;
>  int sysctl_oom_dump_tasks = 1;
> +int sysctl_oom_dump_stack = 1;
>  
>  DEFINE_MUTEX(oom_lock);
>  
> @@ -384,7 +385,8 @@ static void dump_header(struct oom_control *oc, struct 
> task_struct *p,
>   current->signal->oom_score_adj);
>   cpuset_print_task_mems_allowed(current);
>   task_unlock(current);
> - dump_stack();
> + if (sysctl_oom_dump_stack)
> + dump_stack();
>   if (memcg)
>   mem_cgroup_print_oom_info(memcg, p);
>   else
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 

-- 
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] oom_kill: add option to disable dump_stack()

2015-10-26 Thread Johannes Weiner
On Fri, Oct 23, 2015 at 05:02:30PM -0400, Aristeu Rozanski wrote:
> One of the largest chunks of log messages in a OOM is from dump_stack() and in
> some cases it isn't even necessary to figure out what's going on. In
> systems with multiple tenants/containers with limited resources each
> OOMs can be way more frequent and being able to reduce the amount of log
> output for each situation is useful.
> 
> This patch adds a sysctl to allow disabling dump_stack() during an OOM while
> keeping the default to behave the same way it behaves today.
> 
> Cc: Greg Thelen 
> Cc: Johannes Weiner 
> Cc: linux...@kvack.org
> Cc: cgro...@vger.kernel.org
> Signed-off-by: Aristeu Rozanski 

I think this makes sense.

The high volume log output is not just annoying, we have also had
reports from people whose machines locked up as they tried to log
hundreds of containers through a low-bandwidth serial console.

Could you include sample output of before and after in the changelog
to provide an immediate comparison on what we are saving?

Should we make the knob specific to the stack dump or should it be
more generic, so that we could potentially save even more output?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] oom_kill: add option to disable dump_stack()

2015-10-23 Thread Aristeu Rozanski
One of the largest chunks of log messages in a OOM is from dump_stack() and in
some cases it isn't even necessary to figure out what's going on. In
systems with multiple tenants/containers with limited resources each
OOMs can be way more frequent and being able to reduce the amount of log
output for each situation is useful.

This patch adds a sysctl to allow disabling dump_stack() during an OOM while
keeping the default to behave the same way it behaves today.

Cc: Greg Thelen 
Cc: Johannes Weiner 
Cc: linux...@kvack.org
Cc: cgro...@vger.kernel.org
Signed-off-by: Aristeu Rozanski 
---
 include/linux/oom.h | 1 +
 kernel/sysctl.c | 7 +++
 mm/oom_kill.c   | 4 +++-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 03e6257..bdd03e5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -115,6 +115,7 @@ static inline bool task_will_free_mem(struct task_struct 
*task)
 
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
+extern int sysctl_oom_dump_stack;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
 #endif /* _INCLUDE_LINUX_OOM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d..c812523 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1176,6 +1176,13 @@ static struct ctl_table vm_table[] = {
.proc_handler   = proc_dointvec,
},
{
+   .procname   = "oom_dump_stack",
+   .data   = &sysctl_oom_dump_stack,
+   .maxlen = sizeof(sysctl_oom_dump_stack),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+   },
+   {
.procname   = "overcommit_ratio",
.data   = &sysctl_overcommit_ratio,
.maxlen = sizeof(sysctl_overcommit_ratio),
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1ecc0bc..bdbf83b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,6 +42,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
+int sysctl_oom_dump_stack = 1;
 
 DEFINE_MUTEX(oom_lock);
 
@@ -384,7 +385,8 @@ static void dump_header(struct oom_control *oc, struct 
task_struct *p,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
-   dump_stack();
+   if (sysctl_oom_dump_stack)
+   dump_stack();
if (memcg)
mem_cgroup_print_oom_info(memcg, p);
else
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/