On 26. 02. 26, 11:37, Jiri Slaby wrote:
> On 06. 02. 26, 12:54, Matthieu Baerts wrote:
>> Our CI for the MPTCP subsystem is now regularly hitting various stalls
>> before even starting the MPTCP test suite. These issues are visible on
>> top of the latest net and net-next trees, which were synced with
>> Linus' tree yesterday. All these issues have been seen on a "public
>> CI" using GitHub-hosted runners with KVM support, where the tested
>> kernel is launched in a (presumably nested) VM. I can see the issue
>> with or without debug.config. According to the logs, it might have
>> started around v6.19-rc0, but I was unavailable for a few weeks and
>> couldn't react more quickly, sorry about that. Unfortunately, I cannot
>> reproduce this locally, and the CI currently has no way to run
>> bisections.
> Hmm, after the switch of the qemu guest kernels to 6.19, our (openSUSE)
> build service is randomly stalling in smp_call_function_many_cond() too:
> https://bugzilla.suse.com/show_bug.cgi?id=1258936
> The attachment there also contains sysrq-t logs:
> https://bugzilla.suse.com/attachment.cgi?id=888612
A small update, just in case this rings a bell somewhere.

We have a QEMU memory dump from an affected kernel. It shows that both
CPU0 and CPU1 are waiting for CPU2's rq lock, while CPU2 is in userspace.
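The lock address, ffff8d1a6fd35dc0, shows up as RDI/RBP in both
slowpath frames below. To double-check which runqueue it belongs to,
something like the following should work in crash (the command is from
memory, not from this session), using the per-cpu symbol:cpu notation
to print the address of CPU2's rq.__lock and compare it against the
value the waiters are spinning on:

crash> struct -o rq.__lock runqueues:2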
crash> bt -xsc 0
PID: 6483 TASK: ffff8d1759c20000 CPU: 0 COMMAND: "compile"
[exception RIP: native_halt+14]
RIP: ffffffffb9d1124e RSP: ffffcead0696f9a0 RFLAGS: 00000046
RAX: 0000000000000003 RBX: 0000000000040000 RCX: 00000000fffffff8
RDX: ffff8d1a7ffc5140 RSI: 0000000000000003 RDI: ffff8d1a6fd35dc0
RBP: ffff8d1a6fd35dc0 R8: ffff8d1a6fc36dc0 R9: fffffffffffffff8
R10: 0000000000000000 R11: 0000000000000004 R12: ffff8d1a6fc36dc0
R13: 0000000000000000 R14: ffff8d1a7ffc5140 R15: ffffcead0696fad0
CS: 0010 SS: 0018
#0 [ffffcead0696f9a0] kvm_wait+0x44 at ffffffffb9d0fe54
#1 [ffffcead0696f9a8] __pv_queued_spin_lock_slowpath+0x247 at ffffffffbaafb507
#2 [ffffcead0696f9d8] _raw_spin_lock+0x29 at ffffffffbaafadf9
#3 [ffffcead0696f9e0] raw_spin_rq_lock_nested+0x1c at ffffffffb9d8c12c
#4 [ffffcead0696f9f8] _raw_spin_rq_lock_irqsave+0x17 at ffffffffb9d96ca7
#5 [ffffcead0696fa08] sched_balance_rq+0x56d at ffffffffb9da718d
#6 [ffffcead0696fb18] pick_next_task_fair+0x240 at ffffffffb9da7e00
#7 [ffffcead0696fb88] __schedule+0x19e at ffffffffbaaf00de
#8 [ffffcead0696fc40] schedule+0x27 at ffffffffbaaf1697
#9 [ffffcead0696fc50] futex_do_wait+0x4a at ffffffffb9e61c5a
#10 [ffffcead0696fc68] __futex_wait+0x8e at ffffffffb9e6241e
#11 [ffffcead0696fd30] futex_wait+0x6b at ffffffffb9e624fb
#12 [ffffcead0696fdc0] do_futex+0xc5 at ffffffffb9e5e305
#13 [ffffcead0696fdc8] __x64_sys_futex+0x112 at ffffffffb9e5e932
#14 [ffffcead0696fe38] do_syscall_64+0x81 at ffffffffbaae2a61
#15 [ffffcead0696ff40] entry_SYSCALL_64_after_hwframe+0x76 at ffffffffb9a0012f
RIP: 0000000000495303 RSP: 000000c000073c98 RFLAGS: 00000286
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000495303
RDX: 0000000000000000 RSI: 0000000000000080 RDI: 000000c000058958
RBP: 000000c000073ce0 R8: 0000000000000000 R9: 0000000000000000
R10: 0000000000000000 R11: 0000000000000286 R12: 0000000000000024
R13: 0000000000000001 R14: 000000c000002c40 R15: 0000000000000001
ORIG_RAX: 00000000000000ca CS: 0033 SS: 002b
crash> bt -xsc 1
PID: 6481 TASK: ffff8d1759c8b680 CPU: 1 COMMAND: "compile"
[exception RIP: __pv_queued_spin_lock_slowpath+190]
RIP: ffffffffbaafb37e RSP: ffffcead000f8b38 RFLAGS: 00000046
RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001
RDX: 0000000000040003 RSI: 0000000000040003 RDI: ffff8d1a6fd35dc0
RBP: ffff8d1a6fd35dc0 R8: 0000000000000000 R9: 00000001000c3f60
R10: ffffffffbbc75960 R11: ffffcead000f8a48 R12: ffff8d1a6fcb6dc0
R13: 0000000000000001 R14: 0000000000000000 R15: ffffffffbbe65940
CS: 0010 SS: 0000
#0 [ffffcead000f8b60] _raw_spin_lock+0x29 at ffffffffbaafadf9
#1 [ffffcead000f8b68] raw_spin_rq_lock_nested+0x1c at ffffffffb9d8c12c
#2 [ffffcead000f8b80] _raw_spin_rq_lock_irqsave+0x17 at ffffffffb9dc9cc7
#3 [ffffcead000f8b90] print_cfs_rq+0xce at ffffffffb9dd0d8e
#4 [ffffcead000f8c98] print_cfs_stats+0x62 at ffffffffb9da9ee2
#5 [ffffcead000f8cc8] print_cpu+0x243 at ffffffffb9dcbe73
#6 [ffffcead000f8d00] sysrq_sched_debug_show+0x2e at ffffffffb9dd1b7e
#7 [ffffcead000f8d18] show_state_filter+0xcd at ffffffffb9d91f4d
#8 [ffffcead000f8d40] sysrq_handle_showstate+0x10 at ffffffffba60b750
#9 [ffffcead000f8d48] __handle_sysrq.cold+0x9b at ffffffffb9c4f486
#10 [ffffcead000f8d70] sysrq_filter+0xd7 at ffffffffba60c237
#11 [ffffcead000f8d98] input_handle_events_filter+0x45 at ffffffffba766c05
#12 [ffffcead000f8dd0] input_pass_values+0x134 at ffffffffba766ec4
#13 [ffffcead000f8e00] input_event_dispose+0x156 at ffffffffba767046
#14 [ffffcead000f8e20] input_event+0x58 at ffffffffba76ac18
#15 [ffffcead000f8e50] atkbd_receive_byte+0x64d at ffffffffba772e6d
#16 [ffffcead000f8ea8] ps2_interrupt+0x9d at ffffffffba7665ed
#17 [ffffcead000f8ed0] serio_interrupt+0x4f at ffffffffba761e0f
#18 [ffffcead000f8f00] i8042_handle_data+0x11c at ffffffffba76316c
#19 [ffffcead000f8f40] i8042_interrupt+0x11 at ffffffffba763581
#20 [ffffcead000f8f50] __handle_irq_event_percpu+0x55 at ffffffffb9df1e15
#21 [ffffcead000f8f90] handle_irq_event+0x38 at ffffffffb9df2058
#22 [ffffcead000f8fb0] handle_edge_irq+0xc5 at ffffffffb9df7b95
#23 [ffffcead000f8fd0] __common_interrupt+0x44 at ffffffffb9cc2354
#24 [ffffcead000f8ff0] common_interrupt+0x80 at ffffffffbaae6090
--- <IRQ stack> ---
#25 [ffffcead06bcfb98] asm_common_interrupt+0x26 at ffffffffb9a01566
[exception RIP: smp_call_function_many_cond+304]
RIP: ffffffffb9e63080 RSP: ffffcead06bcfc40 RFLAGS: 00000202
RAX: 0000000000000011 RBX: 0000000000000202 RCX: ffff8d1a6fc3f800
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 0000000000000001 R8: ffff8d174009cc30 R9: 0000000000000000
R10: ffff8d174009c0d8 R11: 0000000000000000 R12: 0000000000000001
R13: 0000000000000003 R14: ffff8d1a6fcb7280 R15: 0000000000000001
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0000
#26 [ffffcead06bcfcb0] on_each_cpu_cond_mask+0x24 at ffffffffb9e634f4
#27 [ffffcead06bcfcb8] flush_tlb_mm_range+0x1b1 at ffffffffb9d225d1
#28 [ffffcead06bcfd08] ptep_clear_flush+0x93 at ffffffffba066e13
#29 [ffffcead06bcfd30] do_wp_page+0x6a2 at ffffffffba04c692
#30 [ffffcead06bcfdb8] __handle_mm_fault+0xa49 at ffffffffba055c79
#31 [ffffcead06bcfe98] handle_mm_fault+0xe7 at ffffffffba056297
#32 [ffffcead06bcfed8] do_user_addr_fault+0x21a at ffffffffb9d1db6a
#33 [ffffcead06bcff18] exc_page_fault+0x69 at ffffffffbaae99c9
#34 [ffffcead06bcff40] asm_exc_page_fault+0x26 at ffffffffb9a012a6
RIP: 000000000042351c RSP: 000000c0013aafd0 RFLAGS: 00010246
RAX: 0000000000000002 RBX: 00000000017584c0 RCX: 0000000000000000
RDX: 0000000000000005 RSI: 000000000163edc0 RDI: 0000000000000003
RBP: 000000c0013ab080 R8: 0000000000000001 R9: 00007f0d9853f800
R10: 00007f0d98334e00 R11: 00007f0d98afa020 R12: 00007f0d98afa020
R13: 0000000000000050 R14: 000000c000002380 R15: 0000000000000001
ORIG_RAX: ffffffffffffffff CS: 0033 SS: 002b
crash> bt -xsc 2
PID: 6540 TASK: ffff8d1773ae3680 CPU: 2 COMMAND: "compile"
RIP: 0000000000495372 RSP: 000000c00003e000 RFLAGS: 00000206
RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000495372
RDX: 0000000000000000 RSI: 000000c00003e000 RDI: 00000000000d0f00
RBP: 00007ffcf8a71aa8 R8: 000000c00005a090 R9: 000000c000002700
R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000491580
R13: 000000c00005a008 R14: 00000000017222e0 R15: ffffffffffffffff
ORIG_RAX: 0000000000000038 CS: 0033 SS: 002b
The state of the lock:
crash> struct rq.__lock -x ffff8d1a6fd35dc0
  __lock = {
    raw_lock = {
      {
        val = {
          counter = 0x40003
        },
        {
          locked = 0x3,
          pending = 0x0
        },
        {
          locked_pending = 0x3,
          tail = 0x4
        }
      }
    }
  },
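To make that value easier to read, here is a quick userspace decode
(not from the dump; a minimal sketch assuming the standard qspinlock
layout from include/asm-generic/qspinlock_types.h for NR_CPUS < 16K):

#include <stdio.h>

/*
 * Decode rq.__lock.raw_lock.val = 0x40003. Assumed layout:
 * bits 0-7 locked byte, 8-15 pending byte, 16-17 tail MCS node
 * index, 18-31 tail CPU + 1.
 */
int main(void)
{
    unsigned int val = 0x40003;                /* from the dump */
    unsigned int locked  = val & 0xff;         /* 0x03 */
    unsigned int pending = (val >> 8) & 0xff;  /* 0x00 */
    unsigned int tail    = val >> 16;          /* 0x04 */

    printf("locked  = %#x\n", locked);         /* 0x3 == _Q_SLOW_VAL (PV) */
    printf("pending = %#x\n", pending);
    printf("tail    = %#x -> idx %u, cpu %d\n",
           tail, tail & 0x3, (int)(tail >> 2) - 1);
    return 0;
}

If I read that correctly: the lock byte is _Q_SLOW_VAL (a PV waiter has
hashed the lock, so the holder's unlock has to go through the unlock
slowpath and kick it), pending is clear, and the MCS queue tail is node
0 of CPU0, which is consistent with CPU0 sitting in kvm_wait() above
while CPU1 spins earlier in the slowpath.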
thanks,
--
js
suse labs