From: Bernd Zeimetz <[EMAIL PROTECTED]> Date: Fri, 02 Nov 2007 16:37:25 +0100
> I've sent g several times to sysrq, output is attached. > According to top the two hanging aptitude processes were running on CPU > 1 + 3. > > 3204 root 20 0 19552 5088 4072 R 100 0.1 6:54.49 1 aptitude > 3203 root 20 0 19552 5088 4072 R 100 0.1 6:56.39 3 aptitude Ok, the key in the trace is: Nov 2 16:25:30 titan kernel: [ 978.134874] CPU[ 1]: TSTATE[0000000080009603] TPC[000000000067d2e0] TNPC[000000000067d2d4] TASK[aptitude:3204] Nov 2 16:25:30 titan kernel: [ 978.257809] TPC[_write_unlock_irq+0x20/0x110] ... Nov 2 16:25:30 titan kernel: [ 978.507778] CPU[ 3]: TSTATE[0000000011009605] TPC[00000000004419f8] TNPC[00000000004419fc] TASK[aptitude:3203] Nov 2 16:25:30 titan kernel: [ 978.630707] TPC[cheetah_xcall_deliver+0x174/0x23c] The first symbol is misleading: it says _write_unlock_irq, but in the assembler the PC is actually in the spinlock read spinning loop section. So it's really hanging in _spin_lock(). CPU #3 is trying to send a cross-call message interrupt, but for some reason that isn't making forward progress. Let's see what's calling these things by adding some more debugging information. Please retry the test with the following patch applied on top of the original sysrq-g debugging patch, and capture new logs when it hangs. Thanks! 
--- arch/sparc64/kernel/process.c.ORIG 2007-11-03 20:53:27.000000000 -0700 +++ arch/sparc64/kernel/process.c 2007-11-03 21:05:47.000000000 -0700 @@ -49,6 +49,7 @@ #include <asm/hypervisor.h> #include <asm/sstate.h> #include <asm/irq_regs.h> +#include <asm/smp.h> /* #define VERBOSE_SHOWREGS */ @@ -394,7 +395,11 @@ struct global_reg_snapshot { unsigned long tstate; unsigned long tpc; unsigned long tnpc; + unsigned long o7; + unsigned long i7; struct thread_info *thread; + unsigned long pad1; + unsigned long pad2; } global_reg_snapshot[NR_CPUS]; static DEFINE_SPINLOCK(global_reg_snapshot_lock); @@ -413,6 +418,8 @@ static void sysrq_handle_globreg(int key global_reg_snapshot[cpu].tstate = regs->tstate; global_reg_snapshot[cpu].tpc = regs->tpc; global_reg_snapshot[cpu].tnpc = regs->tnpc; + global_reg_snapshot[cpu].o7 = regs->u_regs[UREG_I7]; + global_reg_snapshot[cpu].i7 = 0; } else { global_reg_snapshot[cpu].tstate = 0; global_reg_snapshot[cpu].tpc = 0; @@ -432,9 +439,19 @@ static void sysrq_handle_globreg(int key ((tp && tp->task) ? tp->task->comm : "NULL"), ((tp && tp->task) ? 
tp->task->pid : -1)); #ifdef CONFIG_KALLSYMS - if ((gp->tstate & TSTATE_PRIV) && (gp->tpc != 0UL)) { - sprint_symbol(buffer, gp->tpc); - printk(" TPC[%s]\n", buffer); + if (gp->tstate & TSTATE_PRIV) { + if (gp->tpc != 0UL) { + sprint_symbol(buffer, gp->tpc); + printk(" TPC[%s]\n", buffer); + } + if (gp->o7 != 0UL) { + sprint_symbol(buffer, gp->o7); + printk(" O7[%s]\n", buffer); + } + if (gp->i7 != 0UL) { + sprint_symbol(buffer, gp->i7); + printk(" I7[%s]\n", buffer); + } } #endif } --- arch/sparc64/mm/ultra.S.ORIG 2007-11-03 20:53:27.000000000 -0700 +++ arch/sparc64/mm/ultra.S 2007-11-03 20:57:12.000000000 -0700 @@ -528,7 +528,7 @@ xcall_fetch_glob_regs: sethi %hi(global_reg_snapshot), %g1 or %g1, %lo(global_reg_snapshot), %g1 __GET_CPUID(%g2) - sllx %g2, 5, %g3 + sllx %g2, 6, %g3 add %g1, %g3, %g1 rdpr %tstate, %g7 stx %g7, [%g1 + 0x00] @@ -536,12 +536,14 @@ xcall_fetch_glob_regs: stx %g7, [%g1 + 0x08] rdpr %tnpc, %g7 stx %g7, [%g1 + 0x10] + stx %o7, [%g1 + 0x18] + stx %i7, [%g1 + 0x20] sethi %hi(trap_block), %g7 or %g7, %lo(trap_block), %g7 sllx %g2, TRAP_BLOCK_SZ_SHIFT, %g2 add %g7, %g2, %g7 ldx [%g7 + TRAP_PER_CPU_THREAD], %g3 - stx %g3, [%g1 + 0x18] + stx %g3, [%g1 + 0x28] retry #ifdef DCACHE_ALIASING_POSSIBLE - To unsubscribe from this list: send the line "unsubscribe sparclinux" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html