Konstantin Baydarov wrote:

> Problem:
> Sometimes (after a remote gdb has been connected) an x86 SMP kernel (with KGDB
> and the NMI watchdog enabled) hangs when kernel modules are automatically loaded.
> 
> Root Cause: 
>   A slave CPU hangs in kgdb_wait() when the master CPU leaves KGDB, causing the
> whole system to hang.
>   If a watchdog NMI occurs after a slave CPU has already exited kgdb_wait() but
> before the master CPU has cleared debugger_active, the slave CPU can re-enter
> kgdb_wait(). Because procindebug[atomic_read(&debugger_active) - 1] is
> zero (the master CPU set procindebug[MasterCPU] to zero before exiting), the
> slave CPU loops in kgdb_wait():
> ...
>       /* Wait till master processor goes completely into the debugger.
>        */
>       while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) {
>               int i = 10;     /* an arbitrary number */
> 
>               while (--i)
>                       cpu_relax();
>       }
> ...
> The slave CPU loops until the master CPU has completely exited KGDB and set
> debugger_active to zero.
> However, once debugger_active becomes zero, the slave CPU does not leave the
> loop. Instead it hangs in the while loop, because
> atomic_read(&debugger_active) is now 0 and the loop condition therefore starts
> checking procindebug[-1] (an out-of-bounds index):
> ...
> while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])){...}
> ...
> On my system procindebug[-1] is always zero, so the slave CPU hangs in the NMI
> handler and stops accepting NMIs. This leads to a hang of the whole system.
> 
> How Solved:
>   A new atomic variable, debugger_exiting, was added. It is set when the master
> CPU starts waiting for the slave CPUs, and is reset after debugger_active has
> been set to zero. debugger_exiting is checked in kgdb_notify(), and
> kgdb_nmihook() is not called until debugger_exiting is zero again. So
> debugger_exiting guarantees that a slave CPU won't re-enter kgdb_wait() until
> the master CPU has completely left KGDB.
>   The patch is against kernel 2.6.24.3.
> 
> Signed-off-by: Konstantin Baydarov <[EMAIL PROTECTED]>

Acked-by: Sergei Shtylyov <[EMAIL PROTECTED]>

> Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_32.c
> +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_32.c
> @@ -326,14 +326,16 @@ static int kgdb_notify(struct notifier_b
>  
>       switch (cmd) {
>       case DIE_NMI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
>               }
>               return NOTIFY_DONE;
>       case DIE_NMI_IPI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       if (kgdb_nmihook(raw_smp_processor_id(), regs))
>                               return NOTIFY_DONE;
> @@ -341,7 +343,8 @@ static int kgdb_notify(struct notifier_b
>               }
>               return NOTIFY_DONE;
>       case DIE_NMIWATCHDOG:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
> Index: ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/arch/x86/kernel/kgdb_64.c
> +++ ko_2_6_24_3_kgdb/arch/x86/kernel/kgdb_64.c
> @@ -406,14 +406,16 @@ static int kgdb_notify(struct notifier_b
>  
>       switch (cmd) {
>       case DIE_NMI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
>               }
>               return NOTIFY_DONE;
>       case DIE_NMI_IPI:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       if (kgdb_nmihook(raw_smp_processor_id(), regs))
>                               return NOTIFY_DONE;
> @@ -421,7 +423,8 @@ static int kgdb_notify(struct notifier_b
>               }
>               return NOTIFY_DONE;
>       case DIE_NMIWATCHDOG:
> -             if (atomic_read(&debugger_active)) {
> +             if (atomic_read(&debugger_active) &&
> +                 !atomic_read(&debugger_exiting)) {
>                       /* KGDB CPU roundup */
>                       kgdb_nmihook(raw_smp_processor_id(), regs);
>                       return NOTIFY_STOP;
> Index: ko_2_6_24_3_kgdb/include/linux/kgdb.h
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/include/linux/kgdb.h
> +++ ko_2_6_24_3_kgdb/include/linux/kgdb.h
> @@ -281,6 +281,7 @@ extern int kgdb_handle_exception(int ex_
>  extern int kgdb_nmihook(int cpu, void *regs);
>  extern int debugger_step;
>  extern atomic_t debugger_active;
> +extern atomic_t debugger_exiting;
>  #else
>  /* Stubs for when KGDB is not set. */
>  static const atomic_t debugger_active = ATOMIC_INIT(0);
> Index: ko_2_6_24_3_kgdb/kernel/kgdb.c
> ===================================================================
> --- ko_2_6_24_3_kgdb.orig/kernel/kgdb.c
> +++ ko_2_6_24_3_kgdb/kernel/kgdb.c
> @@ -117,6 +117,8 @@ int debugger_step;
>  static atomic_t kgdb_sync = ATOMIC_INIT(-1);
>  atomic_t debugger_active;
>  EXPORT_SYMBOL(debugger_active);
> +atomic_t debugger_exiting = ATOMIC_INIT(0);
> +EXPORT_SYMBOL(debugger_exiting);
>  
>  /* Our I/O buffers. */
>  static char remcom_in_buffer[BUFMAX];
> @@ -1526,6 +1528,7 @@ default_handle:
>       atomic_set(&procindebug[processor], 0);
>  
>       if (!debugger_step || !kgdb_contthread) {
> +             atomic_set(&debugger_exiting, 1);
>               for (i = 0; i < NR_CPUS; i++)
>                       spin_unlock(&slavecpulocks[i]);
>               /* Wait till all the processors have quit
> @@ -1557,6 +1560,7 @@ default_handle:
>   kgdb_restore:
>       /* Free debugger_active */
>       atomic_set(&debugger_active, 0);
> +     atomic_set(&debugger_exiting, 0);
>       atomic_set(&kgdb_sync, -1);
>       clocksource_touch_watchdog();
>       kgdb_softlock_skip[processor] = 1;

WBR, Sergei

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
Kgdb-bugreport mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kgdb-bugreport

Reply via email to