Re: Panic in scheduler code with SCHED_ULE during boot tomulti-user.

Jeff Roberson Fri, 04 Jul 2003 01:55:21 -0700

Excellent debugging.  Can you tell me whats in kseq_cpu[0] and
kseq_cpu[1]?  I think I may know what the problem is.  Do you have some
negative niced processes or some positive nice processes?



On Thu, 3 Jul 2003, Richard Todd wrote:

> Hi.  Last night I upgraded to the most recent -current source and
> rebuilt everything, and decided on building the kernel to try the new
> SCHED_ULE scheduler (I had been using SCHED_4BSD before).  Alas, the
> experiment did not go well; every time I booted the machine, I got a
> panic just as the system was about to put up the login prompt.
> Switching the kernel config back to SCHED_4BSD and building a kernel
> with the same (last night's) sources gave me a working kernel.  This
> is on a dual-processor PII/400 box.  Below I list what I've got from a
> kernel coredump of the SCHED_ULE kernel; I've added my comments on the gdb
> listing preceded by "#" signs.
>
> ichotolot# gdb -k ./kernel.debug ./vmcore.45
> GNU gdb 5.2.1 (FreeBSD)
> Copyright 2002 Free Software Foundation, Inc.
> GDB is free software, covered by the GNU General Public License, and you are
> welcome to change it and/or distribute copies of it under certain conditions.
> Type "show copying" to see the conditions.
> There is absolutely no warranty for GDB.  Type "show warranty" for details.
> This GDB was configured as "i386-undermydesk-freebsd"...
> panic: page fault
> panic messages:
> ---
> Fatal trap 12: page fault while in kernel mode
> cpuid = 1; lapic.id = 01000000
> fault virtual address = 0x38
> fault code            = supervisor read, page not present
> instruction pointer   = 0x8:0xc036835d
> stack pointer         = 0x10:0xe1cfbbbc
> frame pointer         = 0x10:0xe1cfbbcc
> code segment          = base 0x0, limit 0xfffff, type 0x1b
>                       = DPL 0, pres 1, def32 1, gran 1
> processor eflags      = interrupt enabled, resume, IOPL = 0
> current process               = 649 (squid)
> trap number           = 12
> panic: page fault
> cpuid = 1; lapic.id = 01000000
> boot() called on cpu#1
>
> syncing disks, buffers remaining... panic: absolutely cannot call smp_ipi_shootdown 
> with interrupts already disabled
> cpuid = 1; lapic.id = 01000000
> boot() called on cpu#1
> Uptime: 1m12s
> Dumping 638 MB
>  16 32 48 64 80 96 112 128 144 160 176 192 208 224 240 256 272 288 304 320 336 352 
> 368 384 400 416 432 448 464 480 496 512 528 544 560 576 592 608 624
> ---
> Reading symbols from 
> /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/acpi/acpi.ko.debug...done.
> Loaded symbols for 
> /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/acpi/acpi.ko.debug
> Reading symbols from 
> /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linprocfs/linprocfs.ko.debug...done.
> Loaded symbols for 
> /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linprocfs/linprocfs.ko.debug
> Reading symbols from 
> /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linux/linux.ko.debug...done.
> Loaded symbols for 
> /usr/src/sys/i386/compile/ICHOTOLOTSMP/modules/usr/src/sys/modules/linux/linux.ko.debug
> Reading symbols from /boot/kernel/green_saver.ko...done.
> Loaded symbols for /boot/kernel/green_saver.ko
> #0  doadump () at ../../../kern/kern_shutdown.c:240
> 240           dumping++;
> (kgdb) bt
> #0  doadump () at ../../../kern/kern_shutdown.c:240
> #1  0xc03547c0 in boot (howto=260) at ../../../kern/kern_shutdown.c:372
> #2  0xc0354ba6 in panic () at ../../../kern/kern_shutdown.c:550
> #3  0xc050f9db in smp_tlb_shootdown (vector=0, addr1=0, addr2=0)
>     at ../../../i386/i386/mp_machdep.c:2387
> #4  0xc050fc79 in smp_invlpg_range (addr1=0, addr2=0)
>     at ../../../i386/i386/mp_machdep.c:2519
> #5  0xc0511df8 in pmap_invalidate_range (pmap=0xc06dc620, sva=3568271360,
>     eva=1) at ../../../i386/i386/pmap.c:719
> #6  0xc0512118 in pmap_qenter (sva=3568271360, m=0xe1cfb8c0, count=-1)
>     at ../../../i386/i386/pmap.c:943
> #7  0xc03a0448 in vm_hold_load_pages (bp=0xd199d440, from=3568271360,
>     to=3568279552) at ../../../kern/vfs_bio.c:3574
> #8  0xc039ea5c in allocbuf (bp=0xd199d440, size=6144)
>     at ../../../kern/vfs_bio.c:2752
> #9  0xc039e6fe in geteblk (size=6144) at ../../../kern/vfs_bio.c:2634
> #10 0xc039b210 in bwrite (bp=0xd188b8e0) at ../../../kern/vfs_bio.c:818
> #11 0xc039bc6c in bawrite (bp=0x0) at ../../../kern/vfs_bio.c:1153
> #12 0xc03a4860 in vop_stdfsync (ap=0xe1cfba14)
>     at ../../../kern/vfs_default.c:742
> #13 0xc031ba10 in spec_fsync (ap=0xe1cfba14)
>     at ../../../fs/specfs/spec_vnops.c:417
> #14 0xc031ae38 in spec_vnoperate (ap=0x0)
>     at ../../../fs/specfs/spec_vnops.c:122
> #15 0xc04ad2d7 in ffs_sync (mp=0xc5336400, waitfor=2, cred=0xc1c27e80,
>     td=0xc0669ec0) at vnode_if.h:624
> #16 0xc03b0b1b in sync (td=0xc0669ec0, uap=0x0)
>     at ../../../kern/vfs_syscalls.c:142
> #17 0xc03542e2 in boot (howto=256) at ../../../kern/kern_shutdown.c:281
> #18 0xc0354ba6 in panic () at ../../../kern/kern_shutdown.c:550
> #19 0xc0517292 in trap_fatal (frame=0xe1cfbb7c, eva=0)
>     at ../../../i386/i386/trap.c:836
> #20 0xc0516863 in trap (frame=
>       {tf_fs = -1067057128, tf_es = 16, tf_ds = -1067974640, tf_edi = -982159056, 
> tf_esi = -1066987296, tf_ebp = -506479668, tf_isp = -506479704, tf_ebx = 0, tf_edx = 
> 2, tf_ecx = 1, tf_eax = 0, tf_trapno = 12, tf_err = 0, tf_eip = -1070169251, tf_cs = 
> 8, tf_eflags = 66178, tf_esp = -1066987296, tf_ss = 0})
>     at ../../../i386/i386/trap.c:256
> #21 0xc04ff988 in calltrap () at {standard input}:97
> #22 0xc036947b in sched_choose () at ../../../kern/sched_ule.c:1164
> #23 0xc035a636 in choosethread () at ../../../kern/kern_switch.c:140
> #24 0xc035c340 in mi_switch () at ../../../kern/kern_synch.c:524
> #25 0xc034b686 in _mtx_lock_sleep (m=0xc066e680, opts=0,
>     file=0xc058251d "../../../kern/kern_descrip.c", line=826)
>     at ../../../kern/kern_mutex.c:641
> #26 0xc034b0f8 in _mtx_lock_flags (m=0xc066e680, opts=0,
>     file=0xc058251d "../../../kern/kern_descrip.c", line=826)
>     at ../../../kern/kern_mutex.c:331
> #27 0xc0335e11 in close (td=0xc5757130, uap=0x0)
>     at ../../../kern/kern_descrip.c:826
> #28 0xc05175ee in syscall (frame=
>       {tf_fs = 47, tf_es = 47, tf_ds = 47, tf_edi = -1077936744, tf_esi = 0, tf_ebp 
> = -1077936840, tf_isp = -506479244, tf_ebx = 689, tf_edx = -1, tf_ecx = 9, tf_eax = 
> 6, tf_trapno = 12, tf_err = 2, tf_eip = 672626595, tf_cs = 31, tf_eflags = 663, 
> tf_esp = -1077936900, tf_ss = 47}) at ../../../i386/i386/trap.c:1023
> #29 0xc04ff9dd in Xint0x80_syscall () at {standard input}:139
> ---Can't read userspace from dump, or kernel process---
> # The initial panic seems to be around frame 22, so let's go there.
> (kgdb) fr 22
> #22 0xc036947b in sched_choose () at ../../../kern/sched_ule.c:1164
> 1164                  kseq_move(kseq, PCPU_GET(cpuid));
> (kgdb) p kseq
> $1 = (struct kseq *) 0x0
> (kgdb) l 1160
> 1155                   */
> 1156                  if ((kseq = kseq_load_highest()) == NULL)
> 1157                          return (NULL);
> 1158
> 1159                  /*
> 1160                   * Remove this kse from this kseq and runq and then requeue
> 1161                   * on the current processor.  Then we will dequeue it
> 1162                   * normally above.
> 1163                   */
> 1164                  kseq_move(kseq, PCPU_GET(cpuid));
> # Hmm, that's odd.  You'd think that if kseq returned by kseq_load_highest()
> # was null, line 1157 would have returned and we'd never hit line 1164.
> # Weird.  Could be gdb is just being confused here, though.  Hard to say.
> # Anyway, on further inspection, it seems the fatal trap wasn't in
> # sched_choose() at all, but elsewhere; look at the eip in the trap frame:
> (kgdb) fr 20
> #20 0xc0516863 in trap (frame=
>       {tf_fs = -1067057128, tf_es = 16, tf_ds = -1067974640, tf_edi = -982159056, 
> tf_esi = -1066987296, tf_ebp = -506479668, tf_isp = -506479704, tf_ebx = 0, tf_edx = 
> 2, tf_ecx = 1, tf_eax = 0, tf_trapno = 12, tf_err = 0, tf_eip = -1070169251, tf_cs = 
> 8, tf_eflags = 66178, tf_esp = -1066987296, tf_ss = 0})
>     at ../../../i386/i386/trap.c:256
> 256                           trap_fatal(&frame, eva);
> (kgdb) p/x frame.tf_eip
> $2 = 0xc036835d
> (kgdb) disass 0xc036835d
> Dump of assembler code for function kseq_move:
> 0xc0368340 <kseq_move>:       push   %ebp
> 0xc0368341 <kseq_move+1>:     mov    %esp,%ebp
> 0xc0368343 <kseq_move+3>:     sub    $0x10,%esp
> 0xc0368346 <kseq_move+6>:     mov    %ebx,0xfffffff8(%ebp)
> 0xc0368349 <kseq_move+9>:     mov    %esi,0xfffffffc(%ebp)
> 0xc036834c <kseq_move+12>:    mov    0x8(%ebp),%esi
> 0xc036834f <kseq_move+15>:    mov    %esi,(%esp,1)
> 0xc0368352 <kseq_move+18>:    call   0xc03683a0 <kseq_choose>
> 0xc0368357 <kseq_move+23>:    mov    %eax,%ebx
> 0xc0368359 <kseq_move+25>:    mov    %eax,0x4(%esp,1)
> 0xc036835d <kseq_move+29>:    mov    0x38(%eax),%eax
> 0xc0368360 <kseq_move+32>:    mov    0x4(%eax),%eax
> 0xc0368363 <kseq_move+35>:    mov    %eax,(%esp,1)
> 0xc0368366 <kseq_move+38>:    call   0xc035ac50 <runq_remove>
> 0xc036836b <kseq_move+43>:    movl   $0x4,0x30(%ebx)
> 0xc0368372 <kseq_move+50>:    mov    %ebx,0x4(%esp,1)
> 0xc0368376 <kseq_move+54>:    mov    %esi,(%esp,1)
> 0xc0368379 <kseq_move+57>:    call   0xc0367f30 <kseq_rem>
> 0xc036837e <kseq_move+62>:    mov    0x38(%ebx),%edx
> 0xc0368381 <kseq_move+65>:    mov    0xc(%ebp),%eax
> 0xc0368384 <kseq_move+68>:    mov    %al,0x14(%edx)
> 0xc0368387 <kseq_move+71>:    mov    %ebx,(%esp,1)
> 0xc036838a <kseq_move+74>:    call   0xc0369490 <sched_add>
> 0xc036838f <kseq_move+79>:    mov    0xfffffff8(%ebp),%ebx
> 0xc0368392 <kseq_move+82>:    mov    0xfffffffc(%ebp),%esi
> 0xc0368395 <kseq_move+85>:    mov    %ebp,%esp
> 0xc0368397 <kseq_move+87>:    pop    %ebp
> 0xc0368398 <kseq_move+88>:    ret
> End of assembler dump.
> # Aha, the trap occured right at this instruction:
> # 0xc036835d <kseq_move+29>:  mov    0x38(%eax),%eax
> # Note from the trapframe that eax was 0 at that point, hence we're
> # dereferencing a NULL pointer.  Apparently this NULL was returned by
> # kseq_choose, and the actual null deref is the "ke->ke_runq" at line 448:
> (kgdb) l kseq_move
> 439             return (NULL);
> 440     }
> 441
> 442     void
> 443     kseq_move(struct kseq *from, int cpu)
> 444     {
> 445             struct kse *ke;
> 446
> 447             ke = kseq_choose(from);
> 448             runq_remove(ke->ke_runq, ke);
> (kgdb) l
> 449             ke->ke_state = KES_THREAD;
> 450             kseq_rem(from, ke);
> 451
> 452             ke->ke_cpu = cpu;
> 453             sched_add(ke);
> 454     }
> 455     #endif
> 456
> 457     struct kse *
> 458     kseq_choose(struct kseq *kseq)
> (kgdb) q
> ichotolot#
>
> So that's what I found.  Any suggestions on how to track this down further?
> _______________________________________________
> [EMAIL PROTECTED] mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-current
> To unsubscribe, send any mail to "[EMAIL PROTECTED]"
>

_______________________________________________
[EMAIL PROTECTED] mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-current
To unsubscribe, send any mail to "[EMAIL PROTECTED]"

Re: Panic in scheduler code with SCHED_ULE during boot tomulti-user.

Reply via email to