Please find below another debug session.
The machine is amd64, 32 CPUs, 128GB RAM. I can now reproduce the hang
every time by doing:
cd /usr/ports/lang/rust
while :
do
make clean=build
make
done
Tested with a -current kernel compiled with WITNESS and MP_LOCKDEBUG,
and with __mp_lock_spinout = 50L * INT_MAX.
I've included traces for all the CPUs in state "stopped". I couldn't
get the "stopping" CPUs to stop with mach stopcpu N, and trying mach
ddbcpu N on a "stopping" CPU freezes ddb.
Please advise if more tests (with examine, for example) are needed.
Thanks,
--Kor
pmap_tlb_shootwait: spun out
Stopped at db_enter+0x14: popq %rbp
ddb{4}> trace
db_enter() at db_enter+0x14
pmap_page_remove(fffffd820275f330) at pmap_page_remove+0x38e
uvm_anfree_list(fffffd9ab1a14c28,0) at uvm_anfree_list+0x63
amap_wipeout(fffffd9acebcd428) at amap_wipeout+0x198
uvm_unmap_detach(ffff800055efd160,0) at uvm_unmap_detach+0xbe
sys_munmap(ffff800055eefc80,ffff800055efd260,ffff800055efd1e0) at sys_munmap+0x185
syscall(ffff800055efd260) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x7241ffaab50, count: -8
ddb{4}> ps /o
TID PID UID PRFLAGS PFLAGS CPU COMMAND
456410 30896 0 0x3 0x4000000 12 rustc
325785 280 0 0x3 0x4000000 7 rustc
124074 32980 0 0x3 0x4000000 6 rustc
282307 73501 0 0x3 0x4000000 3 rustc
277851 95108 0 0x3 0x4000000 14 rustc
32783 29238 0 0x3 0x4000000 10 rustc
57743 21941 0 0x3 0x4000000 11 rustc
294566 82050 0 0x3 0x4000000 15 rustc
485680 73773 0 0x3 0x4000000 0 rustc
408387 61268 0 0x3 0x4000000 8 rustc
*466075 29506 0 0x3 0x4000000 4 rustc
356664 33536 0 0x3 0x4000000 2K rustc
475723 30794 0 0x3 0x4000000 1 rustc
ddb{4}> show witness /b
Number of known direct relationships is 492
Lock order reversal between "&mp->mnt_lock"(rwlock) and "&ip->i_lock"(rrwlock)!
witness: incomplete path, depth 4
ddb{4}> show all locks
Process 30896 (rustc) thread 0xffff8000560599a0 (456410)
exclusive rwlock amaplk r = 0 (0xfffffd9aea13db58)
shared rwlock vmmaplk r = 0 (0xfffffd9a7052e548)
Process 280 (rustc) thread 0xffff8000560457e0 (325785)
exclusive rwlock amaplk r = 0 (0xfffffd9ccf8dd2e8)
shared rwlock vmmaplk r = 0 (0xfffffd9a7052eae8)
Process 32980 (rustc) thread 0xffff800056044da0 (124074)
exclusive rwlock amaplk r = 0 (0xfffffd9aa72f5a48)
shared rwlock vmmaplk r = 0 (0xfffffd9cba272ea0)
Process 73501 (rustc) thread 0xffff800055ff3cd0 (282307)
exclusive rwlock amaplk r = 0 (0xfffffd9aed6be188)
exclusive rwlock vmmaplk r = 0 (0xfffffd9cba272900)
Process 95108 (rustc) thread 0xffff800055f63a00 (277851)
exclusive rwlock amaplk r = 0 (0xfffffd995e22f2a0)
exclusive rwlock vmmaplk r = 0 (0xfffffd9cba272360)
Process 29238 (rustc) thread 0xffff80005603dcf8 (32783)
exclusive rwlock amaplk r = 0 (0xfffffd9aa72f5418)
shared rwlock vmmaplk r = 0 (0xfffffd9cba272720)
Process 21941 (rustc) thread 0xffff800055f622f0 (57743)
exclusive rwlock amaplk r = 0 (0xfffffd9c0a363e30)
exclusive rwlock vmmaplk r = 0 (0xfffffd9c739bd6e0)
Process 82050 (rustc) thread 0xffff800056021020 (294566)
exclusive rwlock amaplk r = 0 (0xfffffd9affc98df8)
shared rwlock vmmaplk r = 0 (0xfffffd9c739bdc80)
Process 61268 (rustc) thread 0xffff800055ff2ae0 (408387)
exclusive rwlock uobjlk r = 0 (0xfffffd9b600e19c0)
shared rwlock vmmaplk r = 0 (0xfffffd9b0eb4dad8)
Process 29506 (rustc) thread 0xffff800055eefc80 (466075)
exclusive rwlock amaplk r = 0 (0xfffffd9acebeb3d8)
Process 33536 (rustc) thread 0xffff800055f62060 (356664)
exclusive rrwlock inode r = 0 (0xfffffd9d738f39b8)
exclusive kernel_lock &kernel_lock r = 0 (0xffffffff82ad8348)
Process 30794 (rustc) thread 0xffff80005603da68 (475723)
exclusive rwlock amaplk r = 0 (0xfffffd9a8b63f4f8)
exclusive rwlock vmmaplk r = 0 (0xfffffd9b0eb4dcb8)
ddb{4}> mach cpuinfo
0: stopping
1: stopping
2: stopped
3: stopped
* 4: ddb
5: stopped
6: stopping
7: stopping
8: stopped
9: stopping
10: stopping
11: stopping
12: stopping
13: stopping
14: stopping
15: stopping
16: stopping
17: stopping
18: stopped
19: stopped
20: stopping
21: stopped
22: stopping
23: stopping
24: stopped
25: stopped
26: stopped
27: stopped
28: stopping
29: stopped
30: stopping
31: stopped
ddb{4}> mach ddbcpu 2
Stopped at x86_ipi_db+0x16: leave
ddb{2}> trace
x86_ipi_db(ffff8000552b3ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
pmap_kremove(ffff80004120f000,10000) at pmap_kremove+0xa3
buf_unmap(fffffd9b9a906e58) at buf_unmap+0xbf
buf_map(fffffd9be2bbc6c8) at buf_map+0x147
buf_get(fffffd9ad87407c8,4e,10000) at buf_get+0x360
getblk(fffffd9ad87407c8,4e,10000,0,ffffffffffffffff) at getblk+0x7b
ffs2_balloc(fffffd9d738f3918,4e0000,10000,fffffda07f7cb680,0,ffff80005608ce08) at ffs2_balloc+0xe93
ffs_write(ffff80005608ce88) at ffs_write+0x21d
VOP_WRITE(fffffd9ad87407c8,ffff80005608cfe8,1,fffffda07f7cb680) at VOP_WRITE+0x45
vn_write(fffffd9a5b6cc090,ffff80005608cfe8,0) at vn_write+0xdb
dofilewritev(ffff800055f62060,9,ffff80005608cfe8,0,ffff80005608d080) at dofilewritev+0x171
sys_write(ffff800055f62060,ffff80005608d100,ffff80005608d080) at sys_write+0x55
syscall(ffff80005608d100) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0xcd471be5ad0, count: -16
ddb{2}> mach ddbcpu 3
Stopped at x86_ipi_db+0x16: leave
ddb{3}> trace
x86_ipi_db(ffff8000552bcff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
pmap_do_remove(fffffd99b72e2800,26013ba1000,26013ba2000,0) at pmap_do_remove+0x5d7
uvm_unmap_kill_entry_withlock(fffffd9cba272800,fffffd9bb369a5a8,1) at uvm_unmap_kill_entry_withlock+0x133
uvm_unmap_remove(fffffd9cba272800,26013ba1000,26013ba2000,ffff80005630b870,0,1,e00485a81190c8ec) at uvm_unmap_remove+0x32f
sys_munmap(ffff800055ff3cd0,ffff80005630b970,ffff80005630b8f0) at sys_munmap+0x10b
syscall(ffff80005630b970) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x2603c8b7300, count: -9
ddb{3}> mach ddbcpu 5
Stopped at x86_ipi_db+0x16: leave
ddb{5}> trace
x86_ipi_db(ffff8000552ceff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff8000552ceff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{5}> mach ddbcpu 8
Stopped at x86_ipi_db+0x16: leave
ddb{8}> trace
x86_ipi_db(ffff8000552e9ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
pmap_enter(fffffd9a9cc3e700,4504ddd0000,1b73fec000,1,20) at pmap_enter+0x703
uvm_fault_lower_lookup(ffff8000560fdbc8,ffff8000560fdc00,ffff8000560fdb40) at uvm_fault_lower_lookup+0x126
uvm_fault_lower(ffff8000560fdbc8,ffff8000560fdc00,ffff8000560fdb40) at uvm_fault_lower+0x5c
uvm_fault(fffffd9b0eb4d9d8,4504ddcf000,0,1) at uvm_fault+0x1c5
upageflttrap(ffff8000560fdd30,4504ddcf215) at upageflttrap+0x6c
usertrap(ffff8000560fdd30) at usertrap+0x204
recall_trap() at recall_trap+0x8
end of kernel
end trace frame: 0x4507e19e4f0, count: -10
ddb{8}> mach ddbcpu 12
Stopped at x86_ipi_db+0x16: leave
ddb{18}> trace
x86_ipi_db(ffff800055343ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055343ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{18}> mach ddbcpu 13
Stopped at x86_ipi_db+0x16: leave
ddb{19}> trace
x86_ipi_db(ffff80005534cff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff80005534cff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{19}> mach ddbcpu 15
Stopped at x86_ipi_db+0x16: leave
ddb{21}> trace
x86_ipi_db(ffff80005535eff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff80005535eff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{21}> mach ddbcpu 18
Stopped at x86_ipi_db+0x16: leave
ddb{24}> trace
x86_ipi_db(ffff800055379ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055379ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{24}> mach ddbcpu 19
Stopped at x86_ipi_db+0x16: leave
ddb{25}> trace
x86_ipi_db(ffff800055382ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055382ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{25}> mach ddbcpu 1A
Stopped at x86_ipi_db+0x16: leave
ddb{26}> trace
x86_ipi_db(ffff80005538bff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff80005538bff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{26}> mach ddbcpu 1B
Stopped at x86_ipi_db+0x16: leave
ddb{27}> trace
x86_ipi_db(ffff800055394ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055394ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{27}> mach ddbcpu 1D
Stopped at x86_ipi_db+0x16: leave
ddb{29}> trace
x86_ipi_db(ffff8000553a6ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff8000553a6ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{29}> mach ddbcpu 1F
Stopped at x86_ipi_db+0x16: leave
ddb{31}> trace
x86_ipi_db(ffff8000553b8ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff8000553b8ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5
ddb{31}> mach stopcpu 1E
ddb{31}> mach cpuinfo
0: stopped
1: stopping
2: stopped
3: stopped
4: stopped
5: stopped
6: stopping
7: stopping
8: stopped
9: stopping
10: stopping
11: stopping
12: stopping
13: stopping
14: stopping
15: stopping
16: stopping
17: stopping
18: stopped
19: stopped
20: stopping
21: stopped
22: stopping
23: stopping
24: stopped
25: stopped
26: stopped
27: stopped
28: stopping
29: stopped
30: stopping
* 31: ddb
On Tue, Jul 8, 2025 at 11:26 AM K R <[email protected]> wrote:
>
> Also hangs when building lang/rust, but on an amd64 machine. It may or
> may not be the same problem here.
>
> hw.machine=amd64
> hw.ncpufound=32
>
> Please see the ddb session below:
>
> pmap_tlb_shootwait: spun out
> Stopped at db_enter+0x14: popq %rbp
>
> ddb{4}> mach cpuinfo
> 0: stopped
> 1: stopping
> 2: stopped
> 3: stopping
> * 4: ddb
> 5: stopped
> 6: stopped
> 7: stopped
> 8: stopping
> 9: stopped
> 10: stopping
> 11: stopped
> 12: stopped
> 13: stopped
> 14: stopping
> 15: stopped
> 16: stopped
> 17: stopping
> 18: stopped
> 19: stopping
> 20: stopping
> 21: stopped
> 22: stopped
> 23: stopped
> 24: stopped
> 25: stopping
> 26: stopping
> 27: stopping
> 28: stopped
> 29: stopped
> 30: stopped
> 31: stopping
>
> ddb{4}> trace
> db_enter() at db_enter+0x14
> pmap_enter(fffffd9ca7996300,7abfca6d000,1b190bd000,3,22) at pmap_enter+0x725
> uvm_fault_lower(ffff800055f9fc68,ffff800055f9fca0,ffff800055f9fbe0) at uvm_fault_lower+0x27f
> uvm_fault(fffffd9ca478cd40,7abfca6d000,0,2) at uvm_fault+0x1c5
> upageflttrap(ffff800055f9fdd0,7abfca6d000) at upageflttrap+0x6c
> usertrap(ffff800055f9fdd0) at usertrap+0x204
> recall_trap() at recall_trap+0x8
> end of kernel
> end trace frame: 0x7abaa855d00, count: -7
>
> ddb{4}> show witness /b
> Number of known direct relationships is 409
>
> Lock order reversal between "&mp->mnt_lock"(rwlock) and
> "&ip->i_lock"(rrwlock)!
>
> witness: incomplete path, depth 4
>
> ddb{4}> show all locks
> Process 59464 (rustc) thread 0xffff800055f225a8 (329198)
> exclusive rwlock uobjlk r = 0 (0xfffffd9c9aaff7c0)
> shared rwlock vmmaplk r = 0 (0xfffffd9bd6ba7318)
> Process 73261 (rustc) thread 0xffff8000560e17c8 (436440)
> exclusive rwlock amaplk r = 0 (0xfffffd9bc55cf9b8)
> exclusive rwlock kmmaplk r = 0 (0xffffffff82aa4fd8)
> exclusive kernel_lock &kernel_lock r = 0 (0xffffffff82ad7218)
> exclusive rwlock sysctllk r = 0 (0xffffffff82a07f28)
> Process 51291 (rustc) thread 0xffff8000560980a0 (388856)
> exclusive rwlock amaplk r = 0 (0xfffffd9a88984150)
> shared rwlock vmmaplk r = 0 (0xfffffd9ca478ce40)
> Process 94708 (rustc) thread 0xffff8000560b3528 (399610)
> exclusive rwlock amaplk r = 0 (0xfffffd9a9db30d98)
> exclusive rwlock vmmaplk r = 0 (0xfffffd9bd6ba78b8)
>
> ddb{4}> ps /o
> TID PID UID PRFLAGS PFLAGS CPU COMMAND
> 134839 3940 0 0x1 0 3 cargo
> 329198 59464 0 0x3 0 0 rustc
> 436440 73261 0 0x3 0x4000000 1K rustc
> *388856 51291 0 0x3 0x4000000 4 rustc
> 281161 5443 0 0x1003 0x4080000 9 rustc
> 314622 5443 0 0x1003 0x4080000 11 rustc
> 399610 94708 0 0x3 0x4000000 6 rustc
>
> ddb{4}> mach ddbcpu 0
> Stopped at x86_ipi_db+0x16: leave
>
> ddb{0}> trace
> x86_ipi_db(ffffffff82a4aff0) at x86_ipi_db+0x16
> x86_ipi_handler() at x86_ipi_handler+0x80
> Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
> __mp_lock(ffffffff82ad7010) at __mp_lock+0xbe
> softintr_dispatch(0) at softintr_dispatch+0xeb
> dosoftint(0) at dosoftint+0x3f
> Xsoftclock() at Xsoftclock+0x27
> pmap_enter(fffffd9a742fef00,45cf03f2000,1be3abf000,4,20) at pmap_enter+0x703
> uvm_fault_lower_lookup(ffff800055e9ada8,ffff800055e9ade0,ffff800055e9ad20) at uvm_fault_lower_lookup+0x126
> uvm_fault_lower(ffff800055e9ada8,ffff800055e9ade0,ffff800055e9ad20) at uvm_fault_lower+0x5c
> uvm_fault(fffffd9bd6ba7218,45cf03ef000,0,4) at uvm_fault+0x1c5
> upageflttrap(ffff800055e9af10,45cf03ef310) at upageflttrap+0x6c
> usertrap(ffff800055e9af10) at usertrap+0x204
> recall_trap() at recall_trap+0x8
> end of kernel
> end trace frame: 0x78859e075000, count: -14
>
> On Mon, Jul 7, 2025 at 10:50 AM Martin Pieuchot <[email protected]> wrote:
> >
> > On 07/07/25(Mon) 14:11, Jeremie Courreges-Anglas wrote:
> > > Thanks folks for your replies,
> > >
> > > On Mon, Jul 07, 2025 at 12:26:21PM +0200, Mark Kettenis wrote:
> > > > > Date: Mon, 7 Jul 2025 08:17:37 +0200
> > > > > From: Martin Pieuchot <[email protected]>
> > > > >
> > > > > On 06/07/25(Sun) 21:15, Jeremie Courreges-Anglas wrote:
> > > > > > On Tue, Jul 01, 2025 at 06:18:37PM +0200, Jeremie Courreges-Anglas
> > > > > > wrote:
> > > > > > > On Tue, Jun 24, 2025 at 05:21:56PM +0200, Jeremie
> > > > > > > Courreges-Anglas wrote:
> > > > > > > >
> > > > > > > > I think it's uvm_purge(): as far as I can see, it happens when
> > > > > > > > building rust with cvs up -D2025/06/04 in /sys, not with
> > > > > > > > -D2025/06/03. Maybe I missed lang/rust when testing the diff.
> > > > > > > >
> > > > > > > > This is with additional MP_LOCKDEBUG support for mutexes, and
> > > > > > > > __mp_lock_spinout = 50L * INT_MAX.
> > > > > > > >
> > > > > > > > Suggested by claudio: tr /t 0t269515 fails.
> > > > > > >
> > > > > > > Should be fixed, at least for CPUs that ddb actually managed to
> > > > > > > stop...
> > > > > > >
> > > > > > > > WITNESS doesn't flag an obvious lock ordering issue. I'm not even
> > > > > > > > sure there is one. It also happens with CPU_MAX_BUSY_CYCLES == 64.
> > > > > > > >
> > > > > > > > Maybe we're still hammering the locks too much? Input and ideas
> > > > > > > > to test are welcome. Right now I'm running with just uvm_purge()
> > > > > > > > reverted.
> > > > > > >
> > > > > > > Reverting uvm_purge() did not help. I've been able to reproduce the
> > > > > > > hangs up to cvs up -D2025/05/18, backporting the arm64 ddb trace fix,
> > > > > > > mpi's mutex backoff diff, and the mtx_enter MP_LOCKDEBUG diff.
> > > > > > > Currently trying to reproduce with cvs up -D2025/05/07 as suggested
> > > > > > > by dlg.
> > > > > >
> > > > > > cvs up -D2025/05/07 also did not help. -current still locks up, but
> > > > > > the db_mutex fix helps get proper stacktraces:
> > > > >
> > > > > Thanks for the report.
> > > > >
> > > > > > --8<--
> > > > > > login: mtf_fnxe0:00nf2ffmrte8 0m21xt2 8xlef_ cf2nktft esl8r_u0f
> > > > > > k00t sxor2f0
> > > > > >
> > > > > > 0tf
> > > > > > 8oeff
> > > > > >
> > > > > > ftff
> > > > > > efStopped at mtx_enter+0x13c: ldr x26, [x25,#2376]
> > > > >
> > > > > What is your value of __mp_lock_spinout?
> > >
> > > 50L * INT_MAX
> > >
> > > > > From the ddb traces I understand that the `sched_lock' mutex is
> > > > > contended. It's not clear to me why, but I believe that's because
> > > > > rustc uses sched_yield(2) in a loop. Could you figure out where this
> > > > > syscall is coming from?
> > >
> > > Frankly, I don't know if I want to try and figure it out. ^^"
> >
> > I understand. However this is IMHO the crucial piece of work that has
> > to be done. Alternatively one can replace all _spinlock() in libc with
> > a primitive that does not call a syscall in a loop.
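> >
> > Something along these lines might do; an untested sketch, where
> > SPINLOCK_SPIN_HOOK is assumed to expand to a cheap CPU pause and the
> > spin threshold is arbitrary:
> >
> > void
> > _spinlock(volatile _atomic_lock_t *lock)
> > {
> > 	int spins = 0;
> >
> > 	while (_atomic_lock(lock)) {
> > 		SPINLOCK_SPIN_HOOK;
> > 		/* fall back to the kernel rarely, not on every spin */
> > 		if (++spins >= 1000) {
> > 			sched_yield();
> > 			spins = 0;
> > 		}
> > 	}
> > 	membar_enter_after_atomic();
> > }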
> >
> > > > > I'm upset that sched_yield() is still used and causing trouble. Now
> > > > > that `sched_lock' is a mutex without guarantee of progress, it is easy
> > > > > to hang the machine by calling it in a loop. A proper solution would be
> > > > > to stop using sched_yield(2). This will bite us as long as it is here.
> > >
> > > Regarding its uses, there's one in librthread/rthread_sched.c which
> > > looks legitimate, and also another in libc that caught my eye recently:
> > >
> > > /*
> > > * internal support functions
> > > */
> > > void
> > > _spinlock(volatile _atomic_lock_t *lock)
> > > {
> > > while (_atomic_lock(lock))
> > > sched_yield();
> > > membar_enter_after_atomic();
> > > }
> > > DEF_STRONG(_spinlock);
> > >
> > > That one looks a bit heavy-handed...
> >
> > It is certainly this one; the question is which of its callers.
> >
> > > Now, if the problem really is that our mutexes aren't fair, then isn't
> > > any mutex (incl. the sched lock) vulnerable to such hammering from
> > > userland?
> >
> > Hammering is the problem here. In the new ddb trace we see 4 rustc
> > threads in the middle of sched_yield(2)... This is really an example
> > of how not to do multi-threading...
> >
> > The one on CPU6 is in the middle of mi_switch() and doesn't appear in
> > ps /o. It holds the `sched_lock' and there's no guarantee that CPU0,
> > which holds the KERNEL_LOCK() and wants to do wakeup(9), will grab it.
> >
> > > > Hmm, well, mutexes might not be fair, but they should guarantee
> > > > forward progress. And the LL/SC primitives on arm64 do guarantee
> > > > forward progress, but there are certain conditions. And I think our
> > > > current MI mutex implementation violates those conditions. And
> > > > WITNESS and MP_LOCKDEBUG might actually be making things worse.
> > >
> > > I thought LL/SC guarantees depended on the length of the section
> > > between the load and store, but... I usually fail to find what I'm
> > > looking for when looking at ARM docs.
> > >
> > > > Now on the M2 we should be using CAS instead of LL/SC.
> > >
> > > Just to rule out an obvious issue:
> > >
> > > ddb{0}> x arm64_has_lse
> > > arm64_has_lse: 1
> > >
> > > > The architecture reference manual isn't explicit about forward-progress
> > > > guarantees for that instruction. So I need to dig a bit deeper into
> > > > this. I doubt there is no forward-progress guarantee for those
> > > > instructions, but maybe there are similar conditions on how they get
> > > > used.
> > > >
> > > > Will dig deeper and see if we can fix the mutex implementation on
> > > > arm64.
> > >
> > > ack, thanks
> >
> > I'm not sure the issue is with the primitive.
> >
> > > > > A workaround would be to use a backoff mechanism inside the loop.
> > > > >
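> > > > > Roughly, as an untested sketch of what the mutex spin loop could do
> > > > > (the 1024 cap is arbitrary, CPU_BUSY_CYCLE() is the existing cheap
> > > > > spin-wait hint):
> > > > >
> > > > > 	/* exponential backoff between attempts to grab the mutex */
> > > > > 	int backoff = 1, i;
> > > > >
> > > > > 	while (mtx->mtx_owner != NULL) {
> > > > > 		for (i = 0; i < backoff; i++)
> > > > > 			CPU_BUSY_CYCLE();
> > > > > 		if (backoff < 1024)
> > > > > 			backoff <<= 1;
> > > > > 	}
> > > > >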
> > > > > Another workaround could be to never spin on the `sched_lock' in
> > > > > sys_sched_yield() and instead sleep.
> > > > >
> > > > > Because of such contention, all wakeup(9)s inside the pmemrange
> > > > > allocator add contention on the `fpageqlock', which is what is
> > > > > "hanging" your machine.
> > > > >
> > > > > Diff below is another workaround that might help.
> > >
> > > Alas it doesn't seem to help enough to avoid hangs. Here's another
> > > fresh report using your diff.
> >
> > It seems to help because the contention is now on the kernel lock and no
> > longer on the global pmemrange mutex.
> >
> > > --8<--
> > > login: __mp_lock_spin: 0xffffff8001326508 lock spun out
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{4}> mach cpuinfo
> > > 0: stopped
> > > 1: stopped
> > > 2: stopped
> > > 3: stopped
> > > * 4: ddb
> > > 5: stopped
> > > 6: stopped
> > > 7: stopped
> > > 8: stopped
> > > 9: stopped
> > > ddb{4}> mach ddb 0
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{0}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __aarch64_cas8_acq_rel+0x1c
> > > __aarch64_cas8_acq_rel() at wakeup_n+0x44
> > > wakeup_n() at task_add+0x6c
> > > task_add() at ifiq_input+0x1f4
> > > ifiq_input() at bge_rxeof+0x32c
> > > bge_rxeof() at bge_intr+0x174
> > > bge_intr() at aplintc_irq_handler+0x170
> > > aplintc_irq_handler() at arm_cpu_irq+0x44
> > > arm_cpu_irq() at handle_el1h_irq+0x68
> > > handle_el1h_irq() at mtx_enter+0x104
> > > mtx_enter() at endtsleep+0x30
> > > endtsleep() at timeout_run+0xb4
> > > timeout_run() at softclock_process_tick_timeout+0xc4
> > > softclock_process_tick_timeout() at softclock+0xe8
> > > softclock() at softintr_dispatch+0xc8
> > > softintr_dispatch() at arm_do_pending_intr+0xf4
> > > arm_do_pending_intr() at mi_switch+0x1b0
> > > mi_switch() at ast+0x98
> > > ast() at handle_el0_sync+0x9c
> > > handle_el0_sync() at 0x19ae4be318
> > > --- trap ---
> > > end of kernel
> > > ddb{0}> mach ddb 1
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{1}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0x104
> > > __mp_lock() at __mp_acquire_count+0x34
> > > __mp_acquire_count() at uiomove+0x12c
> > > uiomove() at ffs_read+0x198
> > > ffs_read() at VOP_READ+0x3c
> > > VOP_READ() at vn_rdwr+0xb0
> > > vn_rdwr() at vmcmd_map_readvn+0x8c
> > > vmcmd_map_readvn() at exec_process_vmcmds+0x78
> > > exec_process_vmcmds() at sys_execve+0x620
> > > sys_execve() at svc_handler+0x430
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x13812aa6d8
> > > --- trap ---
> > > end of kernel
> > > ddb{1}> mach ddb 2
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{2}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0x104
> > > __mp_lock() at doopenat+0x160
> > > doopenat() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x4521566bc
> > > --- trap ---
> > > end of kernel
> > > ddb{2}> mach ddb 3
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{3}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __aarch64_cas8_acq_rel+0x1c
> > > __aarch64_cas8_acq_rel() at sys_sched_yield+0x68
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{3}> mach ddb 4
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{4}> tr
> > > db_enter() at __mp_lock+0x128
> > > __mp_lock() at doopenat+0x160
> > > doopenat() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x43e0b76bc
> > > --- trap ---
> > > end of kernel
> > > ddb{4}> mach ddb 5
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{5}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0xf8
> > > __mp_lock() at vn_statfile+0x2c
> > > vn_statfile() at sys_fstat+0x84
> > > sys_fstat() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x43dcf999c
> > > --- trap ---
> > > end of kernel
> > > ddb{5}> mach ddb 6
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{6}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at agtimer_get_timecount_default+0x18
> > > agtimer_get_timecount_default() at nanouptime+0x44
> > > nanouptime() at tuagg_add_runtime+0x40
> > > tuagg_add_runtime() at mi_switch+0x90
> > > mi_switch() at sys_sched_yield+0x88
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{6}> mach ddb 7
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{7}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0xf8
> > > __mp_lock() at sys___realpath+0x154
> > > sys___realpath() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x4af9eacc8
> > > --- trap ---
> > > end of kernel
> > > ddb{7}> mach ddb 8
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{8}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at aplintc_splx+0x1f0
> > > aplintc_splx() at mtx_enter+0xe8
> > > mtx_enter() at sys_sched_yield+0x68
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{8}> mach ddb 9
> > > Stopped at db_enter+0x18: brk #0x0
> > > ddb{9}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at mtx_enter+0x10c
> > > mtx_enter() at sys_sched_yield+0x68
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{9}> show all locks
> > > CPU 0:
> > > shared mutex timeout r = 0 (0xffffff8001202d88)
> > > CPU 6:
> > > exclusive mutex &sched_lock r = 0 (0xffffff8001272148)
> > > Process 16821 (ninja) thread 0xffffff8113e80068 (92806)
> > > exclusive rrwlock inode r = 0 (0xffffff8055608760)
> > > Process 40072 (rustc) thread 0xffffff8113e814a8 (446746)
> > > exclusive rrwlock inode r = 0 (0xffffff81049e8538)
> > > Process 23063 (rustc) thread 0xffffff8113e819b8 (12581)
> > > exclusive kernel_lock &kernel_lock r = 0 (0xffffff8001326d10)
> > > ddb{9}> show witness /b
> > > Number of known direct relationships is 443
> > >
> > > Lock order reversal between "&mp->mnt_lock"(rwlock) and
> > > "&ip->i_lock"(rrwlock)!
> > >
> > > witness: incomplete path, depth 8
> > > ddb{9}> ps /o
> > > TID PID UID PRFLAGS PFLAGS CPU COMMAND
> > > 92806 16821 55 0x1005 0 1 ninja
> > > 285104 167 55 0x3 0 2 c++
> > > 202504 45511 55 0x3 0 7 c++
> > > 288505 96331 55 0x3 0 4 c++
> > > 134651 73257 55 0x3 0 5 c++
> > > 230543 23063 55 0x2000003 0x4000000 3 rustc
> > > 12581 23063 55 0x2000003 0x4000000 0K rustc
> > > 4934 23063 55 0x2000003 0x4000000 8 rustc
> > > * 89729 23063 55 0x2000003 0x4000000 9 rustc
> > > ddb{9}> ps
> > > PID TID PPID UID S FLAGS WAIT COMMAND
> > > 16821 92806 23175 55 7 0x1005 ninja
> > > 167 285104 63255 55 7 0x3 c++
> > > 63255 492627 23175 55 3 0x10008b sigsusp sh
> > > 40072 77074 38114 55 3 0x2000083 fsleep rustc
> > > 40072 100553 38114 55 3 0x6000083 piperd rustc
> > > 40072 446746 38114 55 2 0x6000003 rustc
> > > 45511 202504 4053 55 7 0x3 c++
> > > 4053 80003 23175 55 3 0x10008b sigsusp sh
> > > 96331 288505 78782 55 7 0x3 c++
> > > 78782 47058 23175 55 3 0x10008b sigsusp sh
> > > 73257 134651 99268 55 7 0x3 c++
> > > 99268 333490 23175 55 3 0x10008b sigsusp sh
> > > 23063 335091 38114 55 3 0x2000083 fsleep rustc
> > > 23063 363979 38114 55 3 0x6000083 piperd rustc
> > > 23063 428745 38114 55 3 0x6000083 fsleep rustc
> > > 23063 279995 38114 55 3 0x6000083 fsleep rustc
> > > 23063 181877 38114 55 3 0x6000083 fsleep rustc
> > > 23063 230543 38114 55 7 0x6000003 rustc
> > > 23063 216080 38114 55 2 0x6000003 rustc
> > > 23063 12581 38114 55 7 0x6000003 rustc
> > > 23063 4934 38114 55 7 0x6000003 rustc
> > > *23063 89729 38114 55 7 0x6000003 rustc
> > > 38114 473886 68745 55 3 0x2000083 fsleep cargo
> > > 38114 327225 68745 55 3 0x6000083 fsleep cargo
> > > 38114 207803 68745 55 3 0x6000083 kqread cargo
> > > 38114 442514 68745 55 3 0x6000083 kqread cargo
> > > 68745 91160 66756 55 3 0x2200083 wait python3.12
> > > 66756 145557 20253 55 3 0x10008b sigsusp make
> > > 23175 305437 95548 55 3 0x8b kqread ninja
> > > 95548 326376 95211 55 3 0x83 kqread cmake
> > > 95211 452507 60695 55 3 0x10008b sigsusp make
> > > 20253 453028 74977 1000 3 0x10008b sigsusp make
> > > 74977 118496 85512 1000 3 0x10008b sigsusp sh
> > > 60695 512137 28928 1000 3 0x10008b sigsusp make
> > > 28928 132313 46768 1000 3 0x10008b sigsusp sh
> > > 46768 510404 31892 1000 3 0x10008b sigsusp make
> > > 85512 168761 5112 1000 3 0x10008b sigsusp make
> > > 31892 41853 33691 1000 3 0x10008b sigsusp ksh
> > > 5112 45762 33691 1000 3 0x10008b sigsusp ksh
> > > 33691 339742 1 1000 3 0x100080 kqread tmux
> > > 50447 258378 72048 1000 3 0x100083 kqread tmux
> > > 52948 303715 1 1000 3 0x1000b8 kqread ssh-agent
> > > 72048 282038 80004 1000 3 0x10008b sigsusp ksh
> > > 80004 478023 75098 1000 3 0x98 kqread sshd-session
> > > 75098 88401 26790 0 3 0x92 kqread sshd-session
> > > 16285 372990 1 0 3 0x100083 ttyin getty
> > > 88950 327084 1 0 3 0x100083 ttyin getty
> > > 32299 412662 1 0 3 0x100083 ttyin getty
> > > 9624 388470 1 0 3 0x100083 ttyin getty
> > > 95942 199073 1 0 3 0x100083 ttyin getty
> > > 13386 235018 1 0 3 0x100083 ttyin getty
> > > 71288 139796 1 0 3 0x100098 kqread cron
> > > 58489 449220 1 99 3 0x1100090 kqread sndiod
> > > 88897 458345 1 110 3 0x100090 kqread sndiod
> > > 21307 145191 57732 95 3 0x1100092 kqread smtpd
> > > 85854 126275 57732 103 3 0x1100092 kqread smtpd
> > > 62845 395981 57732 95 3 0x1100092 kqread smtpd
> > > 68484 261396 57732 95 3 0x100092 kqread smtpd
> > > 96401 408083 57732 95 3 0x1100092 kqread smtpd
> > > 83314 38871 57732 95 3 0x1100092 kqread smtpd
> > > 57732 508617 1 0 3 0x100080 kqread smtpd
> > > 26790 121973 1 0 3 0x88 kqread sshd
> > > 29643 367483 12647 0 3 0x80 nfsd nfsd
> > > 94049 228378 12647 0 3 0x80 nfsd nfsd
> > > 6946 26461 12647 0 3 0x80 nfsd nfsd
> > > 27528 512446 12647 0 3 0x80 nfsd nfsd
> > > 12647 261198 1 0 3 0x80 netacc nfsd
> > > 40776 365439 11876 0 3 0x80 kqread mountd
> > > 11876 241363 1 0 3 0x100080 kqread mountd
> > > 56481 14460 1 28 3 0x1100090 kqread portmap
> > > 59772 105103 17556 83 3 0x100092 kqread ntpd
> > > 17556 51510 78827 83 3 0x1100092 kqread ntpd
> > > 78827 213143 1 0 3 0x100080 kqread ntpd
> > > 58198 68980 84345 74 3 0x1100092 bpf pflogd
> > > 84345 315647 1 0 3 0x80 sbwait pflogd
> > > 4462 325006 88024 73 3 0x1100090 kqread syslogd
> > > 88024 393773 1 0 3 0x100082 sbwait syslogd
> > > 20177 177437 1 0 3 0x100080 kqread resolvd
> > > 1916 512556 22600 77 3 0x100092 kqread dhcpleased
> > > 89301 356030 22600 77 3 0x100092 kqread dhcpleased
> > > 22600 388065 1 0 3 0x80 kqread dhcpleased
> > > 9674 57681 39128 115 3 0x100092 kqread slaacd
> > > 99969 364740 39128 115 3 0x100092 kqread slaacd
> > > 39128 377692 1 0 3 0x100080 kqread slaacd
> > > 7488 127254 0 0 3 0x14200 bored wsdisplay0
> > > 80941 354199 0 0 3 0x14200 bored apple-dcp-afkep%02x
> > > 83845 251298 0 0 3 0x14200 bored apple-dcp-afkep%02x
> > > 48577 323255 0 0 3 0x14200 bored apple-dcp-afkep%02x
> > > 26744 433388 0 0 3 0x14200 bored drmrtk
> > > 23659 215894 0 0 3 0x14200 bored smr
> > > 62145 444779 0 0 3 0x14200 pgzero zerothread
> > > 60948 51173 0 0 3 0x14200 aiodoned aiodoned
> > > 61934 416352 0 0 3 0x14200 syncer update
> > > 60595 230019 0 0 3 0x14200 cleaner cleaner
> > > 59553 366424 0 0 3 0x14200 reaper reaper
> > > 57148 36768 0 0 3 0x14200 pgdaemon pagedaemon
> > > 4700 330347 0 0 3 0x14200 usbtsk usbtask
> > > 37673 384731 0 0 3 0x14200 usbatsk usbatsk
> > > 71799 233642 0 0 3 0x14200 bored drmtskl
> > > 72815 226502 0 0 3 0x14200 bored drmlwq
> > > 22325 377620 0 0 3 0x14200 bored drmlwq
> > > 58569 438671 0 0 3 0x14200 bored drmlwq
> > > 35034 511019 0 0 3 0x14200 bored drmlwq
> > > 51245 343392 0 0 3 0x14200 bored drmubwq
> > > 5539 210307 0 0 3 0x14200 bored drmubwq
> > > 96204 103587 0 0 3 0x14200 bored drmubwq
> > > 99133 343617 0 0 3 0x14200 bored drmubwq
> > > 62111 109225 0 0 3 0x14200 bored drmhpwq
> > > 54844 103964 0 0 3 0x14200 bored drmhpwq
> > > 98273 6361 0 0 3 0x14200 bored drmhpwq
> > > 69125 196587 0 0 3 0x14200 bored drmhpwq
> > > 36820 327811 0 0 3 0x14200 bored drmwq
> > > 685 197752 0 0 3 0x14200 bored drmwq
> > > 12733 475339 0 0 3 0x14200 bored drmwq
> > > 50628 50813 0 0 3 0x14200 bored drmwq
> > > 38742 421411 0 0 3 0x14200 bored sensors
> > > 15252 489393 0 0 3 0x14200 bored sleep
> > > 45280 351306 0 0 3 0x40014200 idle9
> > > 18476 19261 0 0 3 0x40014200 idle8
> > > 73955 129108 0 0 3 0x40014200 idle7
> > > 40102 188571 0 0 3 0x40014200 idle6
> > > 68869 24066 0 0 3 0x40014200 idle5
> > > 16358 379475 0 0 3 0x40014200 idle4
> > > 28248 237855 0 0 3 0x40014200 idle3
> > > 14101 490124 0 0 3 0x40014200 idle2
> > > 48008 167892 0 0 3 0x40014200 idle1
> > > 44662 441379 0 0 3 0x14200 bored softnet3
> > > 18453 428842 0 0 3 0x14200 bored softnet2
> > > 49989 83580 0 0 3 0x14200 bored softnet1
> > > 93898 345769 0 0 3 0x14200 bored softnet0
> > > 64190 206376 0 0 3 0x14200 bored systqmp
> > > 95969 416641 0 0 3 0x14200 bored systq
> > > 38042 457408 0 0 3 0x14200 tmoslp softclockmp
> > > 57671 339967 0 0 3 0x40014200 tmoslp softclock
> > > 32509 359030 0 0 3 0x40014200 idle0
> > > 99051 67429 0 0 3 0x14200 kmalloc kmthread
> > > 1 12407 0 0 3 0x82 wait init
> > > 0 0 -1 0 3 0x10200 scheduler swapper
> > > ddb{9}>
> > > -->8--
> > >
> > > --
> > > jca
> >
> >