On 06/07, Peter Zijlstra wrote:
>
> On Thu, Jun 07, 2018 at 02:33:12PM +0200, Peter Zijlstra wrote:
> > +static int softlockup_stop_fn(void *data)
> >  {
> > +	watchdog_disable(smp_processor_id());
> > +	return 0;
> >  }
> >
> > +static void softlockup_stop_all(void)
> >  {
> > +	int cpu;
> > +
> > +	for_each_cpu(cpu, &watchdog_allowed_mask)
> > +		stop_one_cpu(cpu, softlockup_stop_fn, NULL);
> > +
> > +	cpumask_clear(&watchdog_allowed_mask);
> >  }
>
> Bugger, that one doesn't quite work.. watchdog_disable() ends up calling
> a sleeping function. I forgot to enable all the debug cruft when
> testing..
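If I read stop_machine.c correctly, that is by design: cpu_stopper_thread()
does preempt_count_inc() around fn(arg) precisely so that a sleeping callback
trips the debug checks (in_atomic() == T). And watchdog_disable() can sleep;
at least perf_event_disable(), reached via watchdog_nmi_disable(), takes the
event's ctx->mutex. Roughly (the exact chain depends on the config):

	stop_one_cpu(cpu, softlockup_stop_fn, NULL)
	    cpu_stopper_thread()		/* preempt_count_inc() around fn() */
	        softlockup_stop_fn()
	            watchdog_disable()
	                watchdog_nmi_disable()
	                    hardlockup_detector_perf_disable()
	                        perf_event_disable()	/* takes ctx->mutex, sleeps */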
And probably there is another problem. Both watchdog_disable(cpu) and
watchdog_nmi_disable(cpu) assume that cpu == smp_processor_id(); the arg is
simply ignored. But lockup_detector_offline_cpu(cpu) is called by
cpuhp_invoke_callback(), so in this case watchdog_disable(dying_cpu) is
simply wrong.

Maybe we can do something like below? Then softlockup_stop_all() can simply
do

	for_each_cpu(cpu, &watchdog_allowed_mask)
		watchdog_disable(cpu);

watchdog_nmi_disable() is __weak, but at first glance
arch/sparc/kernel/nmi.c does everything correctly.

Oleg.

--- x/kernel/watchdog.c
+++ x/kernel/watchdog.c
@@ -108,13 +108,13 @@ __setup("hardlockup_all_cpu_backtrace=",
  */
 int __weak watchdog_nmi_enable(unsigned int cpu)
 {
-	hardlockup_detector_perf_enable();
+	hardlockup_detector_perf_enable(cpu);
 	return 0;
 }
 
 void __weak watchdog_nmi_disable(unsigned int cpu)
 {
-	hardlockup_detector_perf_disable();
+	hardlockup_detector_perf_disable(cpu);
 }
 
 /* Return 0, if a NMI watchdog is available. Error code otherwise */
@@ -479,7 +479,7 @@ static void watchdog_enable(unsigned int
 
 static void watchdog_disable(unsigned int cpu)
 {
-	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
+	struct hrtimer *hrtimer = per_cpu_ptr(&watchdog_hrtimer, cpu);
 
 	watchdog_set_prio(SCHED_NORMAL, 0);
 	/*
--- x/kernel/watchdog_hld.c
+++ x/kernel/watchdog_hld.c
@@ -162,9 +162,8 @@ static void watchdog_overflow_callback(s
 	return;
 }
 
-static int hardlockup_detector_event_create(void)
+static int hardlockup_detector_event_create(int cpu)
 {
-	unsigned int cpu = smp_processor_id();
 	struct perf_event_attr *wd_attr;
 	struct perf_event *evt;
 
@@ -179,37 +178,37 @@ static int hardlockup_detector_event_cre
 			 PTR_ERR(evt));
 		return PTR_ERR(evt);
 	}
-	this_cpu_write(watchdog_ev, evt);
+	per_cpu(watchdog_ev, cpu) = evt;
 	return 0;
 }
 
 /**
  * hardlockup_detector_perf_enable - Enable the local event
  */
-void hardlockup_detector_perf_enable(void)
+void hardlockup_detector_perf_enable(int cpu)
 {
-	if (hardlockup_detector_event_create())
+	if (hardlockup_detector_event_create(cpu))
 		return;
 
 	/* use original value for check */
 	if (!atomic_fetch_inc(&watchdog_cpus))
 		pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
 
-	perf_event_enable(this_cpu_read(watchdog_ev));
+	perf_event_enable(per_cpu(watchdog_ev, cpu));
 }
 
 /**
  * hardlockup_detector_perf_disable - Disable the local event
  */
-void hardlockup_detector_perf_disable(void)
+void hardlockup_detector_perf_disable(int cpu)
 {
-	struct perf_event *event = this_cpu_read(watchdog_ev);
+	struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
 	if (event) {
 		perf_event_disable(event);
-		this_cpu_write(watchdog_ev, NULL);
-		this_cpu_write(dead_event, event);
-		cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
+		per_cpu(watchdog_ev, cpu) = NULL;
+		per_cpu(dead_event, cpu) = event;
+		cpumask_set_cpu(cpu, &dead_events_mask);
 		atomic_dec(&watchdog_cpus);
 	}
 }
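For completeness, the crux of the change above is the per-cpu accessor swap:
the this_cpu_*()/raw_cpu_*() ops implicitly address the running CPU's copy of
a per-cpu variable and cannot name another CPU's copy, while per_cpu() and
per_cpu_ptr() take an explicit cpu argument. A minimal sketch, reusing
watchdog_ev from above:

	static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);

	/* correct only when running on the CPU that owns the copy */
	this_cpu_write(watchdog_ev, evt);

	/* equivalent, but usable from any CPU */
	per_cpu(watchdog_ev, cpu) = evt;

	/* pointer form, for non-scalar per-cpu objects like watchdog_hrtimer */
	struct hrtimer *t = per_cpu_ptr(&watchdog_hrtimer, cpu);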