Hi,

After hitting the pvclock-related issue, I recompiled my 2.6.27-rc6 kernel without CONFIG_KVM_CLOCK. The guest now stays up far longer, but I still see the following crash when I stress it by running a source build:
BUG: unable to handle kernel paging request at d97b8000 IP: [<c0496f64>] __slab_alloc+0x1cd/0x3a1 Oops: 0002 [#1] SMP DEBUG_PAGEALLOC Modules linked in: ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 ipt_REJECT iptable_filter ip_tables bridge stp ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi scsi_transport_iscsi nfs lockd nfs_acl sunrpc ip6t_REJECT xt_tcpudp nf_conntrack_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables x_tables ipv6 dm_mirror dm_log dm_multipath dm_mod virtio_net floppy pcspkr virtio_pci i2c_piix4 i2c_core sr_mod cdrom ata_piix pata_acpi ata_generic ext3 jbd mbcache [last unloaded: microcode] Pid: 3020, comm: sed Not tainted (2.6.27-rc6 #1) EIP: 0060:[<c0496f64>] EFLAGS: 00210006 CPU: 0 EIP is at __slab_alloc+0x1cd/0x3a1 EAX: 5a5a5a5a EBX: 00000009 ECX: 00000800 EDX: 00002000 ESI: c15f9f20 EDI: d97b8000 EBP: c096ed0c ESP: c096ece0 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process sed (pid: 3020, ti=c096e000 task=d93d14e0 task.ti=d9024000) Stack: d97b8000 c096ecec ffffffff 00000020 df83cbe0 00000000 d93d14e0 00000020 00200292 000000b8 00000000 c096ed3c c049738d c06196a1 c18abd58 00000000 c06196a1 00000020 df83cbe0 00200286 000005fa 00000020 d8ed4700 c096ed60 Call Trace: [<c049738d>] ? kmem_cache_alloc+0x63/0xd1 [<c06196a1>] ? __alloc_skb+0x2e/0x10c [<c06196a1>] ? __alloc_skb+0x2e/0x10c [<c06196a1>] ? __alloc_skb+0x2e/0x10c [<c061979b>] ? __netdev_alloc_skb+0x1c/0x39 [<e08572ac>] ? try_fill_recv+0x37/0x153 [virtio_net] [<c044ab9a>] ? lock_release_holdtime+0x43/0x48 [<e08579b4>] ? virtnet_poll+0x239/0x2f7 [virtio_net] [<c06209a5>] ? net_rx_action+0xde/0x204 [<c0432f93>] ? __do_softirq+0x89/0xf1 [<c0432f0a>] ? __do_softirq+0x0/0xf1 [<c0406e51>] ? do_softirq+0x7e/0xdf [<c046b8e2>] ? handle_fasteoi_irq+0x0/0xbd [<c0432e6c>] ? irq_exit+0x4c/0x8b [<c0406f57>] ? do_IRQ+0xa5/0xbe [<c040558c>] ? common_interrupt+0x28/0x30 [<c0499b45>] ? css_put+0x20/0x23 [<c0499bfa>] ? 
__mem_cgroup_uncharge_common+0xb2/0xce [<c0499d4d>] ? mem_cgroup_uncharge_page+0x12/0x14 [<c048aa06>] ? page_remove_rmap+0xe4/0xfd [<c0484a8b>] ? unmap_vmas+0x36d/0x50d [<c0487d23>] ? exit_mmap+0x57/0xa2 [<c042c60f>] ? mmput+0x3f/0x90 [<c043012d>] ? exit_mm+0xed/0xf5 [<c0431375>] ? do_exit+0x1cc/0x734 [<c044354b>] ? up_read+0x1b/0x2e [<c0431963>] ? sys_exit_group+0x0/0x16 [<c0431977>] ? sys_exit_group+0x14/0x16 [<c0404b96>] ? syscall_call+0x7/0xb ======================= Code: fe ff 89 45 d4 8b 45 e4 f6 40 01 08 74 2c 89 f0 e8 54 e5 ff ff ba 00 10 00 00 8b 7d d4 89 c1 b8 5a 5a 5a 5a d3 e2 89 d1 c1 e9 02 <f3> ab f6 c2 02 74 02 66 ab f6 c2 01 74 01 aa 8b 5d d4 89 5d f0 EIP: [<c0496f64>] __slab_alloc+0x1cd/0x3a1 SS:ESP 0068:c096ece0 Kernel panic - not syncing: Fatal exception in interrupt ------------[ cut here ]------------ WARNING: at kernel/smp.c:332 smp_call_function_mask+0x38/0x18a() Modules linked in: ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 ipt_REJECT iptable_filter ip_tables bridge stp ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi scsi_transport_iscsi nfs lockd nfs_acl sunrpc ip6t_REJECT xt_tcpudp nf_conntrack_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables x_tables ipv6 dm_mirror dm_log dm_multipath dm_mod virtio_net floppy pcspkr virtio_pci i2c_piix4 i2c_core sr_mod cdrom ata_piix pata_acpi ata_generic ext3 jbd mbcache [last unloaded: microcode] Pid: 3020, comm: sed Tainted: G D 2.6.27-rc6 #1 [<c042e716>] warn_on_slowpath+0x46/0x6a [<c0518b58>] ? __delay+0xe/0x10 [<c04757e0>] ? time_hardirqs_off+0xe/0x1f [<c044b056>] ? trace_hardirqs_off_caller+0x15/0x97 [<c044b0e3>] ? trace_hardirqs_off+0xb/0xd [<c04757e0>] ? time_hardirqs_off+0xe/0x1f [<c04757e0>] ? time_hardirqs_off+0xe/0x1f [<c044b056>] ? trace_hardirqs_off_caller+0x15/0x97 [<c044b0e3>] ? trace_hardirqs_off+0xb/0xd [<c042ecff>] ? release_console_sem+0x1be/0x1c6 [<c0451826>] smp_call_function_mask+0x38/0x18a [<c041512a>] ? stop_this_cpu+0x0/0x4f [<c04757e0>] ? 
time_hardirqs_off+0xe/0x1f [<c044b056>] ? trace_hardirqs_off_caller+0x15/0x97 [<c044b0e3>] ? trace_hardirqs_off+0xb/0xd [<c0692018>] ? _spin_unlock_irqrestore+0x3e/0x55 [<c04152d2>] ? native_smp_send_stop+0x4/0x6e [<c04714f1>] ? ftrace_record_ip+0x1b3/0x1cc [<c0692018>] ? _spin_unlock_irqrestore+0x3e/0x55 [<c045198f>] smp_call_function+0x17/0x19 [<c04152ee>] native_smp_send_stop+0x20/0x6e [<c042e62f>] panic+0x53/0xf4 [<c06924e1>] oops_end+0x87/0x9b [<c040608e>] die+0x5c/0x64 [<c0693ec9>] do_page_fault+0x4f5/0x5af [<c06939d4>] ? do_page_fault+0x0/0x5af [<c06922b2>] error_code+0x72/0x78 [<c048007b>] ? shrink_page_list+0x3ba/0x560 [<c0496f64>] ? __slab_alloc+0x1cd/0x3a1 [<c049738d>] kmem_cache_alloc+0x63/0xd1 [<c06196a1>] ? __alloc_skb+0x2e/0x10c [<c06196a1>] ? __alloc_skb+0x2e/0x10c [<c06196a1>] __alloc_skb+0x2e/0x10c [<c061979b>] __netdev_alloc_skb+0x1c/0x39 [<e08572ac>] try_fill_recv+0x37/0x153 [virtio_net] [<c044ab9a>] ? lock_release_holdtime+0x43/0x48 [<e08579b4>] virtnet_poll+0x239/0x2f7 [virtio_net] [<c06209a5>] net_rx_action+0xde/0x204 [<c0432f93>] __do_softirq+0x89/0xf1 [<c0432f0a>] ? __do_softirq+0x0/0xf1 [<c0406e51>] do_softirq+0x7e/0xdf [<c046b8e2>] ? handle_fasteoi_irq+0x0/0xbd [<c0432e6c>] irq_exit+0x4c/0x8b [<c0406f57>] do_IRQ+0xa5/0xbe [<c040558c>] common_interrupt+0x28/0x30 [<c0499b45>] ? css_put+0x20/0x23 [<c0499bfa>] __mem_cgroup_uncharge_common+0xb2/0xce [<c0499d4d>] mem_cgroup_uncharge_page+0x12/0x14 [<c048aa06>] page_remove_rmap+0xe4/0xfd [<c0484a8b>] unmap_vmas+0x36d/0x50d [<c0487d23>] exit_mmap+0x57/0xa2 [<c042c60f>] mmput+0x3f/0x90 [<c043012d>] exit_mm+0xed/0xf5 [<c0431375>] do_exit+0x1cc/0x734 [<c044354b>] ? up_read+0x1b/0x2e [<c0431963>] sys_exit_group+0x0/0x16 [<c0431977>] sys_exit_group+0x14/0x16 [<c0404b96>] syscall_call+0x7/0xb ======================= ---[ end trace 443e075b33442f93 ]--- I initially thought this might be virtio-related, because the crash ended up near virtio_net. 
To be sure, I tried again without the virtio NIC, and got a different oops which may be more indicative of the root cause: ========================= [ BUG: held lock freed! ] ------------------------- init/1 is freeing memory d6193000-d6193fff, with a lock still held there! (&anon_vma->lock){--..}, at: [<c048a69e>] page_lock_anon_vma+0x3e/0x5d 3 locks held by init/1: #0: (&mm->mmap_sem){----}, at: [<c0693ba0>] do_page_fault+0x1cc/0x5af #1: (rcu_read_lock){..--}, at: [<c048a660>] page_lock_anon_vma+0x0/0x5d #2: (&anon_vma->lock){--..}, at: [<c048a69e>] page_lock_anon_vma+0x3e/0x5d stack backtrace: Pid: 1, comm: init Not tainted 2.6.27-rc6 #1 [<c044c96d>] debug_check_no_locks_freed+0xea/0x13d [<c047b23f>] free_hot_cold_page+0x56/0x14a [<c047b381>] free_hot_page+0xf/0x11 [<c047b4d7>] __free_pages+0x2a/0x35 [<c04969d9>] __free_slab+0xa3/0xab [<c0496b3b>] rcu_free_slab+0x13/0x15 [<c046c97e>] __rcu_process_callbacks+0x112/0x177 [<c046ca03>] rcu_process_callbacks+0x20/0x3a [<c0432f93>] __do_softirq+0x89/0xf1 [<c0432f0a>] ? __do_softirq+0x0/0xf1 [<c0406e51>] do_softirq+0x7e/0xdf [<c0432e6c>] irq_exit+0x4c/0x8b [<c041689f>] smp_apic_timer_interrupt+0x73/0x84 [<c0405691>] apic_timer_interrupt+0x2d/0x34 [<c044d77e>] ? lock_acquire+0x6f/0x81 [<c048a69e>] ? page_lock_anon_vma+0x3e/0x5d [<c0691b21>] _spin_lock+0x23/0x50 [<c048a69e>] ? page_lock_anon_vma+0x3e/0x5d [<c048a69e>] page_lock_anon_vma+0x3e/0x5d [<c048ad98>] page_referenced+0x47/0xfd [<c04218e7>] ? need_resched+0x14/0x1e [<c047fe00>] shrink_page_list+0x13f/0x560 [<c044b0e3>] ? trace_hardirqs_off+0xb/0xd [<c04099c0>] ? native_sched_clock+0x97/0xb5 [<c044c413>] ? mark_lock+0x20/0x2fa [<c044c732>] ? mark_held_locks+0x45/0x5c [<c044c881>] ? trace_hardirqs_on+0xb/0xd [<c044c841>] ? trace_hardirqs_on_caller+0xf8/0x12d [<c0480327>] shrink_inactive_list+0xdd/0x299 [<c04805ca>] shrink_zone+0xe7/0x106 [<c0480958>] do_try_to_free_pages+0x17b/0x2b9 [<c0480b7a>] try_to_free_pages+0x6c/0x74 [<c047f72b>] ? 
isolate_pages_global+0x0/0x43 [<c047bda7>] __alloc_pages_internal+0x22e/0x38a [<c047dbe9>] __do_page_cache_readahead+0xd5/0x19e [<c047dce2>] do_page_cache_readahead+0x30/0x3e [<c047783a>] filemap_fault+0x13b/0x2db [<c0483d1b>] __do_fault+0x40/0x2ff [<c044b056>] ? trace_hardirqs_off_caller+0x15/0x97 [<c04099c0>] ? native_sched_clock+0x97/0xb5 [<c0482f11>] ? page_address+0x1a/0x87 [<c044c413>] ? mark_lock+0x20/0x2fa [<c04854b3>] handle_mm_fault+0x342/0x6f9 [<c0693ba0>] ? do_page_fault+0x1cc/0x5af [<c044363f>] ? down_read_trylock+0x3e/0x48 [<c0693c5a>] do_page_fault+0x286/0x5af [<c06939d4>] ? do_page_fault+0x0/0x5af [<c06922b2>] error_code+0x72/0x78 Thanks! -- Dan Smith IBM Linux Technology Center Open Hypervisor Team email: [EMAIL PROTECTED]
pgpjwvgEySpW1.pgp
Description: PGP signature