Konstantin Baydarov wrote:
> Problem:
> Sometimes (after a remote gdb has connected) an x86 SMP kernel (with KGDB and
> the NMI watchdog enabled) hangs when kernel modules are automatically loaded.
>
> Root Cause:
> A slave CPU hangs in kgdb_wait() when the master CPU leaves KGDB, causing the
> whole system to hang.
> If a watchdog NMI occurs after a slave CPU has already exited kgdb_wait() but
> before the master CPU has cleared debugger_active, the slave CPU can re-enter
> kgdb_wait(). Because procindebug[atomic_read(&debugger_active) - 1] is zero
> (the master CPU set procindebug[MasterCPU] to zero before exiting), the slave
> loops in kgdb_wait():
> ...
> /* Wait till master processor goes completely into the debugger.
> */
> while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) {
> int i = 10; /* an arbitrary number */
>
> while (--i)
> cpu_relax();
> }
> ...
> The slave CPU loops until the master CPU completely exits KGDB and sets
> debugger_active to zero.
> But when debugger_active becomes zero, the slave CPU does not leave the loop;
> instead it hangs in the while loop, because it now checks procindebug[-1]
> (atomic_read(&debugger_active) is 0, so the index becomes 0 - 1 = -1):
> ...
> while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])){...}
> ...
> For me procindebug[-1] is always zero, so the slave CPU hangs in the NMI
> handler and stops accepting NMIs. This leads to a hang of the whole system.
>
> How Solved:
> A new atomic variable, debugger_exiting, was added. It is set when the master
> CPU starts waiting for the slave CPUs, and is reset after debugger_active is
> set to zero. debugger_exiting is checked in kgdb_notify(), and kgdb_nmihook()
> will not be called until debugger_exiting equals zero. So debugger_exiting
> guarantees that a slave CPU won't re-enter kgdb_wait() until the master CPU
> has completely left KGDB.
> Patch against kernel 2.6.24.3.
>
> Signed-off-by: Konstantin Baydarov <[EMAIL PROTECTED]>
Acked-by: Sergei Shtylyov <[EMAIL PROTECTED]>
> Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_32.c
> +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
> @@ -326,14 +326,16 @@ static int kgdb_notify(struct notifier_b
>
> switch (cmd) {
> case DIE_NMI:
> - if (atomic_read(&debugger_active)) {
> + if (atomic_read(&debugger_active) &&
> + !atomic_read(&debugger_exiting)) {
> /* KGDB CPU roundup */
> kgdb_nmihook(raw_smp_processor_id(), regs);
> return NOTIFY_STOP;
> }
> return NOTIFY_DONE;
> case DIE_NMI_IPI:
> - if (atomic_read(&debugger_active)) {
> + if (atomic_read(&debugger_active) &&
> + !atomic_read(&debugger_exiting)) {
> /* KGDB CPU roundup */
> if (kgdb_nmihook(raw_smp_processor_id(), regs))
> return NOTIFY_DONE;
> @@ -341,7 +343,8 @@ static int kgdb_notify(struct notifier_b
> }
> return NOTIFY_DONE;
> case DIE_NMIWATCHDOG:
> - if (atomic_read(&debugger_active)) {
> + if (atomic_read(&debugger_active) &&
> + !atomic_read(&debugger_exiting)) {
> /* KGDB CPU roundup */
> kgdb_nmihook(raw_smp_processor_id(), regs);
> return NOTIFY_STOP;
> Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_64.c
> +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
> @@ -406,14 +406,16 @@ static int kgdb_notify(struct notifier_b
>
> switch (cmd) {
> case DIE_NMI:
> - if (atomic_read(&debugger_active)) {
> + if (atomic_read(&debugger_active) &&
> + !atomic_read(&debugger_exiting)) {
> /* KGDB CPU roundup */
> kgdb_nmihook(raw_smp_processor_id(), regs);
> return NOTIFY_STOP;
> }
> return NOTIFY_DONE;
> case DIE_NMI_IPI:
> - if (atomic_read(&debugger_active)) {
> + if (atomic_read(&debugger_active) &&
> + !atomic_read(&debugger_exiting)) {
> /* KGDB CPU roundup */
> if (kgdb_nmihook(raw_smp_processor_id(), regs))
> return NOTIFY_DONE;
> @@ -421,7 +423,8 @@ static int kgdb_notify(struct notifier_b
> }
> return NOTIFY_DONE;
> case DIE_NMIWATCHDOG:
> - if (atomic_read(&debugger_active)) {
> + if (atomic_read(&debugger_active) &&
> + !atomic_read(&debugger_exiting)) {
> /* KGDB CPU roundup */
> kgdb_nmihook(raw_smp_processor_id(), regs);
> return NOTIFY_STOP;
> Index: ko_2_6_24_3_kgdb/include/linux/kgdb.h
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/include/linux/kgdb.h
> +++ ko_2_6_24_3_kgdb/include/linux/kgdb.h
> @@ -281,6 +281,7 @@ extern int kgdb_handle_exception(int ex_
> extern int kgdb_nmihook(int cpu, void *regs);
> extern int debugger_step;
> extern atomic_t debugger_active;
> +extern atomic_t debugger_exiting;
> #else
> /* Stubs for when KGDB is not set. */
> static const atomic_t debugger_active = ATOMIC_INIT(0);
> Index: ko_2_6_24_3_kgdb/kernel/kgdb.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/kernel/kgdb.c
> +++ ko_2_6_24_3_kgdb/kernel/kgdb.c
> @@ -117,6 +117,8 @@ int debugger_step;
> static atomic_t kgdb_sync = ATOMIC_INIT(-1);
> atomic_t debugger_active;
> EXPORT_SYMBOL(debugger_active);
> +atomic_t debugger_exiting = ATOMIC_INIT(0);
> +EXPORT_SYMBOL(debugger_exiting);
>
> /* Our I/O buffers. */
> static char remcom_in_buffer[BUFMAX];
> @@ -1526,6 +1528,7 @@ default_handle:
> atomic_set(&procindebug[processor], 0);
>
> if (!debugger_step || !kgdb_contthread) {
> + atomic_set(&debugger_exiting, 1);
> for (i = 0; i < NR_CPUS; i++)
> spin_unlock(&slavecpulocks[i]);
> /* Wait till all the processors have quit
> @@ -1557,6 +1560,7 @@ default_handle:
> kgdb_restore:
> /* Free debugger_active */
> atomic_set(&debugger_active, 0);
> + atomic_set(&debugger_exiting, 0);
> atomic_set(&kgdb_sync, -1);
> clocksource_touch_watchdog();
> kgdb_softlock_skip[processor] = 1;
WBR, Sergei
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Kgdb-bugreport mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport