Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-12 Thread Don Zickus
On Fri, Sep 25, 2015 at 01:15:56PM +0200, Jiri Kosina wrote:
> From: Jiri Kosina 
> 
> In many cases of hardlockup reports, it's actually not possible to know 
> why it triggered, because the CPU that got stuck is usually waiting on a 
> resource (with IRQs disabled) in posession of some other CPU is holding.
> 
> IOW, we are often looking at the stacktrace of the victim and not the 
> actual offender.
> 
> Introduce sysctl / cmdline parameter that makes it possible to have 
> hardlockup detector perform all-CPU backtrace.

Passed my testing with this on and off.

Acked-by: Don Zickus 

> 
> Signed-off-by: Jiri Kosina 
> ---
>  Documentation/kernel-parameters.txt |  5 +
>  Documentation/sysctl/kernel.txt | 12 
>  include/linux/nmi.h |  1 +
>  kernel/sysctl.c |  9 +
>  kernel/watchdog.c   | 33 -
>  5 files changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index 22a4b68..b4af96e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1246,6 +1246,11 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   Format:  such that (rxsize & ~0x1fffc0) 
> == 0.
>   Default: 1024
>  
> + hardlockup_all_cpu_backtrace=
> + [KNL] Should the hard-lockup detector generate
> + backtraces on all cpus.
> + Format: 
> +
>   hashdist=   [KNL,NUMA] Large hashes allocated during boot
>   are distributed across NUMA nodes.  Defaults on
>   for 64-bit NUMA, off otherwise.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 6fccb69..af70d15 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
>  - domainname
>  - hostname
>  - hotplug
> +- hardlockup_all_cpu_backtrace
>  - hung_task_panic
>  - hung_task_check_count
>  - hung_task_timeout_secs
> @@ -293,6 +294,17 @@ domain names are in general different. For a detailed 
> discussion
>  see the hostname(1) man page.
>  
>  ==
> +hardlockup_all_cpu_backtrace:
> +
> +This value controls the hard lockup detector behavior when a hard
> +lockup condition is detected as to whether or not to gather further
> +debug information. If enabled, arch-specific all-CPU stack dumping
> +will be initiated.
> +
> +0: do nothing. This is the default behavior.
> +
> +1: on detection capture more debug information.
> +==
>  
>  hotplug:
>  
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 78488e0..7ec5b86 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> +extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
>void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..efb0370 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -897,6 +897,15 @@ static struct ctl_table kern_table[] = {
>   .extra1 = ,
>   .extra2 = ,
>   },
> + {
> + .procname   = "hardlockup_all_cpu_backtrace",
> + .data   = _hardlockup_all_cpu_backtrace,
> + .maxlen = sizeof(int),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec_minmax,
> + .extra1 = ,
> + .extra2 = ,
> + },
>  #endif /* CONFIG_SMP */
>  #endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 64ed1c3..202999c 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
>  
>  #ifdef CONFIG_SMP
>  int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> +int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
> +#define sysctl_hardlockup_all_cpu_backtrace 0
>  #endif
>  static struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(_cpumask);
> @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  static int hardlockup_panic =
>   CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
> +static unsigned long hardlockup_allcpu_dumped;
>  /*
>   * We may not want to enable hard lockup detection by 

Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-12 Thread Don Zickus
On Fri, Sep 25, 2015 at 01:15:56PM +0200, Jiri Kosina wrote:
> From: Jiri Kosina 
> 
> In many cases of hardlockup reports, it's actually not possible to know 
> why it triggered, because the CPU that got stuck is usually waiting on a 
> resource (with IRQs disabled) in posession of some other CPU is holding.
> 
> IOW, we are often looking at the stacktrace of the victim and not the 
> actual offender.
> 
> Introduce sysctl / cmdline parameter that makes it possible to have 
> hardlockup detector perform all-CPU backtrace.

Passed my testing with this on and off.

Acked-by: Don Zickus 

> 
> Signed-off-by: Jiri Kosina 
> ---
>  Documentation/kernel-parameters.txt |  5 +
>  Documentation/sysctl/kernel.txt | 12 
>  include/linux/nmi.h |  1 +
>  kernel/sysctl.c |  9 +
>  kernel/watchdog.c   | 33 -
>  5 files changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index 22a4b68..b4af96e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1246,6 +1246,11 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   Format:  such that (rxsize & ~0x1fffc0) 
> == 0.
>   Default: 1024
>  
> + hardlockup_all_cpu_backtrace=
> + [KNL] Should the hard-lockup detector generate
> + backtraces on all cpus.
> + Format: 
> +
>   hashdist=   [KNL,NUMA] Large hashes allocated during boot
>   are distributed across NUMA nodes.  Defaults on
>   for 64-bit NUMA, off otherwise.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 6fccb69..af70d15 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
>  - domainname
>  - hostname
>  - hotplug
> +- hardlockup_all_cpu_backtrace
>  - hung_task_panic
>  - hung_task_check_count
>  - hung_task_timeout_secs
> @@ -293,6 +294,17 @@ domain names are in general different. For a detailed 
> discussion
>  see the hostname(1) man page.
>  
>  ==
> +hardlockup_all_cpu_backtrace:
> +
> +This value controls the hard lockup detector behavior when a hard
> +lockup condition is detected as to whether or not to gather further
> +debug information. If enabled, arch-specific all-CPU stack dumping
> +will be initiated.
> +
> +0: do nothing. This is the default behavior.
> +
> +1: on detection capture more debug information.
> +==
>  
>  hotplug:
>  
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 78488e0..7ec5b86 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> +extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
>void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..efb0370 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -897,6 +897,15 @@ static struct ctl_table kern_table[] = {
>   .extra1 = ,
>   .extra2 = ,
>   },
> + {
> + .procname   = "hardlockup_all_cpu_backtrace",
> + .data   = _hardlockup_all_cpu_backtrace,
> + .maxlen = sizeof(int),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec_minmax,
> + .extra1 = ,
> + .extra2 = ,
> + },
>  #endif /* CONFIG_SMP */
>  #endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 64ed1c3..202999c 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
>  
>  #ifdef CONFIG_SMP
>  int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> +int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
> +#define sysctl_hardlockup_all_cpu_backtrace 0
>  #endif
>  static struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(_cpumask);
> @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  static int hardlockup_panic =
>   CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
> +static unsigned long hardlockup_allcpu_dumped;
>  /*
>   * We 

Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-01 Thread Aaron Tomlin
On Thu 2015-10-01 09:45 +0200, Jiri Kosina wrote:
> That should be fine. Worst case scenario is hardlockup and softlockup 
> trigerring 'in parallel' on different CPUs, and all-cpu backtrace being 
> triggered twice.
> 
> Frankly, I've never seen this happen in practice (hardlockup and 
> softlockup triggering at different CPUs at the very same time).
> 
> We could possibly make a global 'all_cpus_dump_in_progress' flag so that 
> we guarantee only one stream of dumps, but we'd need to convert everybody 
> using this facility (e.g. RCU stall detector, and whoever else) to be 
> aware of it as well, otherwise it wouldn't make too much sense.
> 
> Something to add to TODO I guess.

This could indeed be worth further investigation.


-- 
Aaron Tomlin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-01 Thread Jiri Kosina
On Thu, 1 Oct 2015, Aaron Tomlin wrote:

> > +   /*
> > +* Perform all-CPU dump only once to avoid multiple hardlockups
> > +* generating interleaving traces
> > +*/
> > +   if (sysctl_hardlockup_all_cpu_backtrace &&
> > +   !test_and_set_bit(0, _allcpu_dumped))
> > +   trigger_allbutself_cpu_backtrace();
> 
> How does this play when 'softlockup_all_cpu_backtrace' is enabled too?

That should be fine. Worst case scenario is hardlockup and softlockup 
trigerring 'in parallel' on different CPUs, and all-cpu backtrace being 
triggered twice.

Frankly, I've never seen this happen in practice (hardlockup and 
softlockup triggering at different CPUs at the very same time).

We could possibly make a global 'all_cpus_dump_in_progress' flag so that 
we guarantee only one stream of dumps, but we'd need to convert everybody 
using this facility (e.g. RCU stall detector, and whoever else) to be 
aware of it as well, otherwise it wouldn't make too much sense.

Something to add to TODO I guess.

> 
> > +
> > +   if (hardlockup_panic)
> > +   panic("Hard LOCKUP");
> >  
> > __this_cpu_write(hard_watchdog_warn, true);
> > return;
> 
> This does indeed appear similar to Linus commit ed235875
> ("kernel/watchdog.c: print traces for all cpus on lockup detection");
> albeit for the hardlockup detector.
> 
> Looks fine to me. Thanks!
> 
> Reviewed-by: Aaron Tomlin 

Thanks,

-- 
Jiri Kosina
SUSE Labs

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-01 Thread Aaron Tomlin
On Fri 2015-09-25 13:15 +0200, Jiri Kosina wrote:
> From: Jiri Kosina 
> 
> In many cases of hardlockup reports, it's actually not possible to know 
> why it triggered, because the CPU that got stuck is usually waiting on a 
> resource (with IRQs disabled) in posession of some other CPU is holding.
> 
> IOW, we are often looking at the stacktrace of the victim and not the 
> actual offender.
> 
> Introduce sysctl / cmdline parameter that makes it possible to have 
> hardlockup detector perform all-CPU backtrace.
> 
> Signed-off-by: Jiri Kosina 
> ---
>  Documentation/kernel-parameters.txt |  5 +
>  Documentation/sysctl/kernel.txt | 12 
>  include/linux/nmi.h |  1 +
>  kernel/sysctl.c |  9 +
>  kernel/watchdog.c   | 33 -
>  5 files changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index 22a4b68..b4af96e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1246,6 +1246,11 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   Format:  such that (rxsize & ~0x1fffc0) 
> == 0.
>   Default: 1024
>  
> + hardlockup_all_cpu_backtrace=
> + [KNL] Should the hard-lockup detector generate
> + backtraces on all cpus.
> + Format: 
> +
>   hashdist=   [KNL,NUMA] Large hashes allocated during boot
>   are distributed across NUMA nodes.  Defaults on
>   for 64-bit NUMA, off otherwise.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 6fccb69..af70d15 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
>  - domainname
>  - hostname
>  - hotplug
> +- hardlockup_all_cpu_backtrace
>  - hung_task_panic
>  - hung_task_check_count
>  - hung_task_timeout_secs
> @@ -293,6 +294,17 @@ domain names are in general different. For a detailed 
> discussion
>  see the hostname(1) man page.
>  
>  ==
> +hardlockup_all_cpu_backtrace:
> +
> +This value controls the hard lockup detector behavior when a hard
> +lockup condition is detected as to whether or not to gather further
> +debug information. If enabled, arch-specific all-CPU stack dumping
> +will be initiated.
> +
> +0: do nothing. This is the default behavior.
> +
> +1: on detection capture more debug information.
> +==
>  
>  hotplug:
>  
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 78488e0..7ec5b86 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> +extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
>void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..efb0370 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -897,6 +897,15 @@ static struct ctl_table kern_table[] = {
>   .extra1 = ,
>   .extra2 = ,
>   },
> + {
> + .procname   = "hardlockup_all_cpu_backtrace",
> + .data   = _hardlockup_all_cpu_backtrace,
> + .maxlen = sizeof(int),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec_minmax,
> + .extra1 = ,
> + .extra2 = ,
> + },
>  #endif /* CONFIG_SMP */
>  #endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 64ed1c3..202999c 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
>  
>  #ifdef CONFIG_SMP
>  int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> +int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
> +#define sysctl_hardlockup_all_cpu_backtrace 0
>  #endif
>  static struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(_cpumask);
> @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  static int hardlockup_panic =
>   CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
> +static unsigned long hardlockup_allcpu_dumped;
>  /*
>   * We may not want to enable hard lockup detection by default in all cases,
>   * for example when running the kernel as a guest on a 

Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-01 Thread Aaron Tomlin
On Fri 2015-09-25 13:15 +0200, Jiri Kosina wrote:
> From: Jiri Kosina 
> 
> In many cases of hardlockup reports, it's actually not possible to know 
> why it triggered, because the CPU that got stuck is usually waiting on a 
> resource (with IRQs disabled) in posession of some other CPU is holding.
> 
> IOW, we are often looking at the stacktrace of the victim and not the 
> actual offender.
> 
> Introduce sysctl / cmdline parameter that makes it possible to have 
> hardlockup detector perform all-CPU backtrace.
> 
> Signed-off-by: Jiri Kosina 
> ---
>  Documentation/kernel-parameters.txt |  5 +
>  Documentation/sysctl/kernel.txt | 12 
>  include/linux/nmi.h |  1 +
>  kernel/sysctl.c |  9 +
>  kernel/watchdog.c   | 33 -
>  5 files changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index 22a4b68..b4af96e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1246,6 +1246,11 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   Format:  such that (rxsize & ~0x1fffc0) 
> == 0.
>   Default: 1024
>  
> + hardlockup_all_cpu_backtrace=
> + [KNL] Should the hard-lockup detector generate
> + backtraces on all cpus.
> + Format: 
> +
>   hashdist=   [KNL,NUMA] Large hashes allocated during boot
>   are distributed across NUMA nodes.  Defaults on
>   for 64-bit NUMA, off otherwise.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 6fccb69..af70d15 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
>  - domainname
>  - hostname
>  - hotplug
> +- hardlockup_all_cpu_backtrace
>  - hung_task_panic
>  - hung_task_check_count
>  - hung_task_timeout_secs
> @@ -293,6 +294,17 @@ domain names are in general different. For a detailed 
> discussion
>  see the hostname(1) man page.
>  
>  ==
> +hardlockup_all_cpu_backtrace:
> +
> +This value controls the hard lockup detector behavior when a hard
> +lockup condition is detected as to whether or not to gather further
> +debug information. If enabled, arch-specific all-CPU stack dumping
> +will be initiated.
> +
> +0: do nothing. This is the default behavior.
> +
> +1: on detection capture more debug information.
> +==
>  
>  hotplug:
>  
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 78488e0..7ec5b86 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> +extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
>void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..efb0370 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -897,6 +897,15 @@ static struct ctl_table kern_table[] = {
>   .extra1 = ,
>   .extra2 = ,
>   },
> + {
> + .procname   = "hardlockup_all_cpu_backtrace",
> + .data   = _hardlockup_all_cpu_backtrace,
> + .maxlen = sizeof(int),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec_minmax,
> + .extra1 = ,
> + .extra2 = ,
> + },
>  #endif /* CONFIG_SMP */
>  #endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 64ed1c3..202999c 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
>  
>  #ifdef CONFIG_SMP
>  int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> +int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
> +#define sysctl_hardlockup_all_cpu_backtrace 0
>  #endif
>  static struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(_cpumask);
> @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  static int hardlockup_panic =
>   CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
> +static unsigned long hardlockup_allcpu_dumped;
>  /*
>   * We may not want to enable hard lockup detection by default in all cases,
>   * for example when 

Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-01 Thread Jiri Kosina
On Thu, 1 Oct 2015, Aaron Tomlin wrote:

> > +   /*
> > +* Perform all-CPU dump only once to avoid multiple hardlockups
> > +* generating interleaving traces
> > +*/
> > +   if (sysctl_hardlockup_all_cpu_backtrace &&
> > +   !test_and_set_bit(0, _allcpu_dumped))
> > +   trigger_allbutself_cpu_backtrace();
> 
> How does this play when 'softlockup_all_cpu_backtrace' is enabled too?

That should be fine. Worst case scenario is hardlockup and softlockup 
trigerring 'in parallel' on different CPUs, and all-cpu backtrace being 
triggered twice.

Frankly, I've never seen this happen in practice (hardlockup and 
softlockup triggering at different CPUs at the very same time).

We could possibly make a global 'all_cpus_dump_in_progress' flag so that 
we guarantee only one stream of dumps, but we'd need to convert everybody 
using this facility (e.g. RCU stall detector, and whoever else) to be 
aware of it as well, otherwise it wouldn't make too much sense.

Something to add to TODO I guess.

> 
> > +
> > +   if (hardlockup_panic)
> > +   panic("Hard LOCKUP");
> >  
> > __this_cpu_write(hard_watchdog_warn, true);
> > return;
> 
> This does indeed appear similar to Linus commit ed235875
> ("kernel/watchdog.c: print traces for all cpus on lockup detection");
> albeit for the hardlockup detector.
> 
> Looks fine to me. Thanks!
> 
> Reviewed-by: Aaron Tomlin 

Thanks,

-- 
Jiri Kosina
SUSE Labs

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-10-01 Thread Aaron Tomlin
On Thu 2015-10-01 09:45 +0200, Jiri Kosina wrote:
> That should be fine. Worst case scenario is hardlockup and softlockup 
> trigerring 'in parallel' on different CPUs, and all-cpu backtrace being 
> triggered twice.
> 
> Frankly, I've never seen this happen in practice (hardlockup and 
> softlockup triggering at different CPUs at the very same time).
> 
> We could possibly make a global 'all_cpus_dump_in_progress' flag so that 
> we guarantee only one stream of dumps, but we'd need to convert everybody 
> using this facility (e.g. RCU stall detector, and whoever else) to be 
> aware of it as well, otherwise it wouldn't make too much sense.
> 
> Something to add to TODO I guess.

This could indeed be worth further investigation.


-- 
Aaron Tomlin
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-09-25 Thread Don Zickus
On Fri, Sep 25, 2015 at 01:15:56PM +0200, Jiri Kosina wrote:
> From: Jiri Kosina 
> 
> In many cases of hardlockup reports, it's actually not possible to know 
> why it triggered, because the CPU that got stuck is usually waiting on a 
> resource (with IRQs disabled) in posession of some other CPU is holding.
> 
> IOW, we are often looking at the stacktrace of the victim and not the 
> actual offender.
> 
> Introduce sysctl / cmdline parameter that makes it possible to have 
> hardlockup detector perform all-CPU backtrace.

Hi Jiri,

This seems to mimic the very similar softlockup_all_cpu_backtrace
functionality, so I am ok with this.  Let me try and run some tests on this
to make sure nothing regressed.

Cheers,
Don

> 
> Signed-off-by: Jiri Kosina 
> ---
>  Documentation/kernel-parameters.txt |  5 +
>  Documentation/sysctl/kernel.txt | 12 
>  include/linux/nmi.h |  1 +
>  kernel/sysctl.c |  9 +
>  kernel/watchdog.c   | 33 -
>  5 files changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index 22a4b68..b4af96e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1246,6 +1246,11 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   Format:  such that (rxsize & ~0x1fffc0) 
> == 0.
>   Default: 1024
>  
> + hardlockup_all_cpu_backtrace=
> + [KNL] Should the hard-lockup detector generate
> + backtraces on all cpus.
> + Format: 
> +
>   hashdist=   [KNL,NUMA] Large hashes allocated during boot
>   are distributed across NUMA nodes.  Defaults on
>   for 64-bit NUMA, off otherwise.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 6fccb69..af70d15 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
>  - domainname
>  - hostname
>  - hotplug
> +- hardlockup_all_cpu_backtrace
>  - hung_task_panic
>  - hung_task_check_count
>  - hung_task_timeout_secs
> @@ -293,6 +294,17 @@ domain names are in general different. For a detailed 
> discussion
>  see the hostname(1) man page.
>  
>  ==
> +hardlockup_all_cpu_backtrace:
> +
> +This value controls the hard lockup detector behavior when a hard
> +lockup condition is detected as to whether or not to gather further
> +debug information. If enabled, arch-specific all-CPU stack dumping
> +will be initiated.
> +
> +0: do nothing. This is the default behavior.
> +
> +1: on detection capture more debug information.
> +==
>  
>  hotplug:
>  
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 78488e0..7ec5b86 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> +extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
>void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..efb0370 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -897,6 +897,15 @@ static struct ctl_table kern_table[] = {
>   .extra1 = ,
>   .extra2 = ,
>   },
> + {
> + .procname   = "hardlockup_all_cpu_backtrace",
> + .data   = _hardlockup_all_cpu_backtrace,
> + .maxlen = sizeof(int),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec_minmax,
> + .extra1 = ,
> + .extra2 = ,
> + },
>  #endif /* CONFIG_SMP */
>  #endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 64ed1c3..202999c 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
>  
>  #ifdef CONFIG_SMP
>  int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> +int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
> +#define sysctl_hardlockup_all_cpu_backtrace 0
>  #endif
>  static struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(_cpumask);
> @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  static int hardlockup_panic =
>   

Re: [PATCH] watchdog: perform all-CPU backtrace in case of hard lockup

2015-09-25 Thread Don Zickus
On Fri, Sep 25, 2015 at 01:15:56PM +0200, Jiri Kosina wrote:
> From: Jiri Kosina 
> 
> In many cases of hardlockup reports, it's actually not possible to know 
> why it triggered, because the CPU that got stuck is usually waiting on a 
> resource (with IRQs disabled) in posession of some other CPU is holding.
> 
> IOW, we are often looking at the stacktrace of the victim and not the 
> actual offender.
> 
> Introduce sysctl / cmdline parameter that makes it possible to have 
> hardlockup detector perform all-CPU backtrace.

Hi Jiri,

This seems to mimic the very similar softlockup_all_cpu_backtrace
functionality, so I am ok with this.  Let me try and run some tests on this
to make sure nothing regressed.

Cheers,
Don

> 
> Signed-off-by: Jiri Kosina 
> ---
>  Documentation/kernel-parameters.txt |  5 +
>  Documentation/sysctl/kernel.txt | 12 
>  include/linux/nmi.h |  1 +
>  kernel/sysctl.c |  9 +
>  kernel/watchdog.c   | 33 -
>  5 files changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/Documentation/kernel-parameters.txt 
> b/Documentation/kernel-parameters.txt
> index 22a4b68..b4af96e 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1246,6 +1246,11 @@ bytes respectively. Such letter suffixes can also be 
> entirely omitted.
>   Format:  such that (rxsize & ~0x1fffc0) 
> == 0.
>   Default: 1024
>  
> + hardlockup_all_cpu_backtrace=
> + [KNL] Should the hard-lockup detector generate
> + backtraces on all cpus.
> + Format: 
> +
>   hashdist=   [KNL,NUMA] Large hashes allocated during boot
>   are distributed across NUMA nodes.  Defaults on
>   for 64-bit NUMA, off otherwise.
> diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
> index 6fccb69..af70d15 100644
> --- a/Documentation/sysctl/kernel.txt
> +++ b/Documentation/sysctl/kernel.txt
> @@ -33,6 +33,7 @@ show up in /proc/sys/kernel:
>  - domainname
>  - hostname
>  - hotplug
> +- hardlockup_all_cpu_backtrace
>  - hung_task_panic
>  - hung_task_check_count
>  - hung_task_timeout_secs
> @@ -293,6 +294,17 @@ domain names are in general different. For a detailed 
> discussion
>  see the hostname(1) man page.
>  
>  ==
> +hardlockup_all_cpu_backtrace:
> +
> +This value controls the hard lockup detector behavior when a hard
> +lockup condition is detected as to whether or not to gather further
> +debug information. If enabled, arch-specific all-CPU stack dumping
> +will be initiated.
> +
> +0: do nothing. This is the default behavior.
> +
> +1: on detection capture more debug information.
> +==
>  
>  hotplug:
>  
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index 78488e0..7ec5b86 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -73,6 +73,7 @@ extern int watchdog_user_enabled;
>  extern int watchdog_thresh;
>  extern unsigned long *watchdog_cpumask_bits;
>  extern int sysctl_softlockup_all_cpu_backtrace;
> +extern int sysctl_hardlockup_all_cpu_backtrace;
>  struct ctl_table;
>  extern int proc_watchdog(struct ctl_table *, int ,
>void __user *, size_t *, loff_t *);
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e69201d..efb0370 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -897,6 +897,15 @@ static struct ctl_table kern_table[] = {
>   .extra1 = ,
>   .extra2 = ,
>   },
> + {
> + .procname   = "hardlockup_all_cpu_backtrace",
> + .data   = _hardlockup_all_cpu_backtrace,
> + .maxlen = sizeof(int),
> + .mode   = 0644,
> + .proc_handler   = proc_dointvec_minmax,
> + .extra1 = ,
> + .extra2 = ,
> + },
>  #endif /* CONFIG_SMP */
>  #endif
>  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 64ed1c3..202999c 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
>  
>  #ifdef CONFIG_SMP
>  int __read_mostly sysctl_softlockup_all_cpu_backtrace;
> +int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
>  #else
>  #define sysctl_softlockup_all_cpu_backtrace 0
> +#define sysctl_hardlockup_all_cpu_backtrace 0
>  #endif
>  static struct cpumask watchdog_cpumask __read_mostly;
>  unsigned long *watchdog_cpumask_bits = cpumask_bits(_cpumask);
> @@ -112,6 +114,7 @@ static unsigned long soft_lockup_nmi_warn;
>  #ifdef CONFIG_HARDLOCKUP_DETECTOR
>  static int hardlockup_panic =
>