Please find below another debug session.

The machine is amd64, 32 CPUs, 128GB RAM.  I can now reproduce the hang
every time by doing:

cd /usr/ports/lang/rust
while :
do
  make clean=build
  make
done

Tested with a -current kernel, compiled with WITNESS and MP_LOCKDEBUG
and __mp_lock_spinout = 50L * INT_MAX.
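
For reference, the spinout bump is just the usual MP_LOCKDEBUG tweak,
along these lines (a sketch only; the stock type and default of the
variable in sys/kern/kern_lock.c may differ):

#ifdef MP_LOCKDEBUG
/* CPU-dependent timing, needs to be settable from ddb. */
long __mp_lock_spinout = 50L * INT_MAX;
#endif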

I've included traces for all the CPUs in state "stopped".  I couldn't
get the "stopping" CPUs to stop with mach stopcpu N, and trying mach
ddbcpu N on a "stopping" CPU freezes ddb.

Please advise if more tests (with examine, for example) are needed.

Thanks,
--Kor

pmap_tlb_shootwait: spun out
Stopped at      db_enter+0x14:  popq    %rbp

ddb{4}> trace
db_enter() at db_enter+0x14
pmap_page_remove(fffffd820275f330) at pmap_page_remove+0x38e
uvm_anfree_list(fffffd9ab1a14c28,0) at uvm_anfree_list+0x63
amap_wipeout(fffffd9acebcd428) at amap_wipeout+0x198
uvm_unmap_detach(ffff800055efd160,0) at uvm_unmap_detach+0xbe
sys_munmap(ffff800055eefc80,ffff800055efd260,ffff800055efd1e0) at sys_munmap+0x185
syscall(ffff800055efd260) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x7241ffaab50, count: -8
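
For context, the "spun out" report comes from the TLB shootdown wait
loop in the amd64 pmap.  As far as I understand it, the loop looks
roughly like this (a sketch from memory around tlb_shoot_wait in
sys/arch/amd64/amd64/pmap.c; the real code may differ):

	long spins = 0;

	/* wait until every other CPU acknowledges the shootdown IPI */
	while (tlb_shoot_wait != 0) {
		CPU_BUSY_CYCLE();
#ifdef MP_LOCKDEBUG
		if (++spins > __mp_lock_spinout) {
			db_printf("%s: spun out", __func__);
			db_enter();
			spins = 0;
		}
#endif
	}

The message lacking a trailing newline would also explain why it ran
into ddb's "Stopped at" line above.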

ddb{4}> ps /o
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
 456410  30896      0         0x3  0x4000000   12  rustc
 325785    280      0         0x3  0x4000000    7  rustc
 124074  32980      0         0x3  0x4000000    6  rustc
 282307  73501      0         0x3  0x4000000    3  rustc
 277851  95108      0         0x3  0x4000000   14  rustc
  32783  29238      0         0x3  0x4000000   10  rustc
  57743  21941      0         0x3  0x4000000   11  rustc
 294566  82050      0         0x3  0x4000000   15  rustc
 485680  73773      0         0x3  0x4000000    0  rustc
 408387  61268      0         0x3  0x4000000    8  rustc
*466075  29506      0         0x3  0x4000000    4  rustc
 356664  33536      0         0x3  0x4000000    2K rustc
 475723  30794      0         0x3  0x4000000    1  rustc

ddb{4}> show witness /b
Number of known direct relationships is 492

Lock order reversal between "&mp->mnt_lock"(rwlock) and "&ip->i_lock"(rrwlock)!

witness: incomplete path, depth 4

ddb{4}> show all locks
Process 30896 (rustc) thread 0xffff8000560599a0 (456410)
exclusive rwlock amaplk r = 0 (0xfffffd9aea13db58)
shared rwlock vmmaplk r = 0 (0xfffffd9a7052e548)
Process 280 (rustc) thread 0xffff8000560457e0 (325785)
exclusive rwlock amaplk r = 0 (0xfffffd9ccf8dd2e8)
shared rwlock vmmaplk r = 0 (0xfffffd9a7052eae8)
Process 32980 (rustc) thread 0xffff800056044da0 (124074)
exclusive rwlock amaplk r = 0 (0xfffffd9aa72f5a48)
shared rwlock vmmaplk r = 0 (0xfffffd9cba272ea0)
Process 73501 (rustc) thread 0xffff800055ff3cd0 (282307)
exclusive rwlock amaplk r = 0 (0xfffffd9aed6be188)
exclusive rwlock vmmaplk r = 0 (0xfffffd9cba272900)
Process 95108 (rustc) thread 0xffff800055f63a00 (277851)
exclusive rwlock amaplk r = 0 (0xfffffd995e22f2a0)
exclusive rwlock vmmaplk r = 0 (0xfffffd9cba272360)
Process 29238 (rustc) thread 0xffff80005603dcf8 (32783)
exclusive rwlock amaplk r = 0 (0xfffffd9aa72f5418)
shared rwlock vmmaplk r = 0 (0xfffffd9cba272720)
Process 21941 (rustc) thread 0xffff800055f622f0 (57743)
exclusive rwlock amaplk r = 0 (0xfffffd9c0a363e30)
exclusive rwlock vmmaplk r = 0 (0xfffffd9c739bd6e0)
Process 82050 (rustc) thread 0xffff800056021020 (294566)
exclusive rwlock amaplk r = 0 (0xfffffd9affc98df8)
shared rwlock vmmaplk r = 0 (0xfffffd9c739bdc80)
Process 61268 (rustc) thread 0xffff800055ff2ae0 (408387)
exclusive rwlock uobjlk r = 0 (0xfffffd9b600e19c0)
shared rwlock vmmaplk r = 0 (0xfffffd9b0eb4dad8)
Process 29506 (rustc) thread 0xffff800055eefc80 (466075)
exclusive rwlock amaplk r = 0 (0xfffffd9acebeb3d8)
Process 33536 (rustc) thread 0xffff800055f62060 (356664)
exclusive rrwlock inode r = 0 (0xfffffd9d738f39b8)
exclusive kernel_lock &kernel_lock r = 0 (0xffffffff82ad8348)
Process 30794 (rustc) thread 0xffff80005603da68 (475723)
exclusive rwlock amaplk r = 0 (0xfffffd9a8b63f4f8)
exclusive rwlock vmmaplk r = 0 (0xfffffd9b0eb4dcb8)

ddb{4}> mach cpuinfo
    0: stopping
    1: stopping
    2: stopped
    3: stopped
*   4: ddb
    5: stopped
    6: stopping
    7: stopping
    8: stopped
    9: stopping
   10: stopping
   11: stopping
   12: stopping
   13: stopping
   14: stopping
   15: stopping
   16: stopping
   17: stopping
   18: stopped
   19: stopped
   20: stopping
   21: stopped
   22: stopping
   23: stopping
   24: stopped
   25: stopped
   26: stopped
   27: stopped
   28: stopping
   29: stopped
   30: stopping
   31: stopped

ddb{4}> mach ddbcpu 2
Stopped at      x86_ipi_db+0x16:        leave

ddb{2}> trace
x86_ipi_db(ffff8000552b3ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
pmap_kremove(ffff80004120f000,10000) at pmap_kremove+0xa3
buf_unmap(fffffd9b9a906e58) at buf_unmap+0xbf
buf_map(fffffd9be2bbc6c8) at buf_map+0x147
buf_get(fffffd9ad87407c8,4e,10000) at buf_get+0x360
getblk(fffffd9ad87407c8,4e,10000,0,ffffffffffffffff) at getblk+0x7b
ffs2_balloc(fffffd9d738f3918,4e0000,10000,fffffda07f7cb680,0,ffff80005608ce08) at ffs2_balloc+0xe93
ffs_write(ffff80005608ce88) at ffs_write+0x21d
VOP_WRITE(fffffd9ad87407c8,ffff80005608cfe8,1,fffffda07f7cb680) at VOP_WRITE+0x45
vn_write(fffffd9a5b6cc090,ffff80005608cfe8,0) at vn_write+0xdb
dofilewritev(ffff800055f62060,9,ffff80005608cfe8,0,ffff80005608d080) at dofilewritev+0x171
sys_write(ffff800055f62060,ffff80005608d100,ffff80005608d080) at sys_write+0x55
syscall(ffff80005608d100) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0xcd471be5ad0, count: -16

ddb{2}> mach ddbcpu 3
Stopped at      x86_ipi_db+0x16:        leave

ddb{3}> trace
x86_ipi_db(ffff8000552bcff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
pmap_do_remove(fffffd99b72e2800,26013ba1000,26013ba2000,0) at pmap_do_remove+0x5d7
uvm_unmap_kill_entry_withlock(fffffd9cba272800,fffffd9bb369a5a8,1) at uvm_unmap_kill_entry_withlock+0x133
uvm_unmap_remove(fffffd9cba272800,26013ba1000,26013ba2000,ffff80005630b870,0,1,e00485a81190c8ec) at uvm_unmap_remove+0x32f
sys_munmap(ffff800055ff3cd0,ffff80005630b970,ffff80005630b8f0) at sys_munmap+0x10b
syscall(ffff80005630b970) at syscall+0x5f9
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x2603c8b7300, count: -9

ddb{3}> mach ddbcpu 5
Stopped at      x86_ipi_db+0x16:        leave

ddb{5}> trace
x86_ipi_db(ffff8000552ceff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff8000552ceff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{5}> mach ddbcpu 8
Stopped at      x86_ipi_db+0x16:        leave

ddb{8}> trace
x86_ipi_db(ffff8000552e9ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
pmap_enter(fffffd9a9cc3e700,4504ddd0000,1b73fec000,1,20) at pmap_enter+0x703
uvm_fault_lower_lookup(ffff8000560fdbc8,ffff8000560fdc00,ffff8000560fdb40) at uvm_fault_lower_lookup+0x126
uvm_fault_lower(ffff8000560fdbc8,ffff8000560fdc00,ffff8000560fdb40) at uvm_fault_lower+0x5c
uvm_fault(fffffd9b0eb4d9d8,4504ddcf000,0,1) at uvm_fault+0x1c5
upageflttrap(ffff8000560fdd30,4504ddcf215) at upageflttrap+0x6c
usertrap(ffff8000560fdd30) at usertrap+0x204
recall_trap() at recall_trap+0x8
end of kernel
end trace frame: 0x4507e19e4f0, count: -10

ddb{8}> mach ddbcpu 12
Stopped at      x86_ipi_db+0x16:        leave

ddb{18}> trace
x86_ipi_db(ffff800055343ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055343ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{18}> mach ddbcpu 13
Stopped at      x86_ipi_db+0x16:        leave

ddb{19}> trace
x86_ipi_db(ffff80005534cff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff80005534cff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{19}> mach ddbcpu 15
Stopped at      x86_ipi_db+0x16:        leave

ddb{21}> trace
x86_ipi_db(ffff80005535eff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff80005535eff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{21}> mach ddbcpu 18
Stopped at      x86_ipi_db+0x16:        leave

ddb{24}> trace
x86_ipi_db(ffff800055379ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055379ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{24}> mach ddbcpu 19
Stopped at      x86_ipi_db+0x16:        leave

ddb{25}> trace
x86_ipi_db(ffff800055382ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055382ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{25}> mach ddbcpu 1A
Stopped at      x86_ipi_db+0x16:        leave

ddb{26}> trace
x86_ipi_db(ffff80005538bff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff80005538bff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{26}> mach ddbcpu 1B
Stopped at      x86_ipi_db+0x16:        leave

ddb{27}> trace
x86_ipi_db(ffff800055394ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff800055394ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{27}> mach ddbcpu 1D
Stopped at      x86_ipi_db+0x16:        leave

ddb{29}> trace
x86_ipi_db(ffff8000553a6ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff8000553a6ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{29}> mach ddbcpu 1F
Stopped at      x86_ipi_db+0x16:        leave

ddb{31}> trace
x86_ipi_db(ffff8000553b8ff0) at x86_ipi_db+0x16
x86_ipi_handler() at x86_ipi_handler+0x80
Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
acpicpu_idle() at acpicpu_idle+0x228
sched_idle(ffff8000553b8ff0) at sched_idle+0x24b
end trace frame: 0x0, count: -5

ddb{31}> mach stopcpu 1E

ddb{31}> mach cpuinfo
    0: stopped
    1: stopping
    2: stopped
    3: stopped
    4: stopped
    5: stopped
    6: stopping
    7: stopping
    8: stopped
    9: stopping
   10: stopping
   11: stopping
   12: stopping
   13: stopping
   14: stopping
   15: stopping
   16: stopping
   17: stopping
   18: stopped
   19: stopped
   20: stopping
   21: stopped
   22: stopping
   23: stopping
   24: stopped
   25: stopped
   26: stopped
   27: stopped
   28: stopping
   29: stopped
   30: stopping
*  31: ddb

On Tue, Jul 8, 2025 at 11:26 AM K R <[email protected]> wrote:
>
> Also hangs when building lang/rust, but on an amd64 machine.  It may or
> may not be the same problem here.
>
> hw.machine=amd64
> hw.ncpufound=32
>
> Please see the ddb session below:
>
> pmap_tlb_shootwait: spun out
> Stopped at      db_enter+0x14:  popq    %rbp
>
> ddb{4}> mach cpuinfo
>     0: stopped
>     1: stopping
>     2: stopped
>     3: stopping
> *   4: ddb
>     5: stopped
>     6: stopped
>     7: stopped
>     8: stopping
>     9: stopped
>    10: stopping
>    11: stopped
>    12: stopped
>    13: stopped
>    14: stopping
>    15: stopped
>    16: stopped
>    17: stopping
>    18: stopped
>    19: stopping
>    20: stopping
>    21: stopped
>    22: stopped
>    23: stopped
>    24: stopped
>    25: stopping
>    26: stopping
>    27: stopping
>    28: stopped
>    29: stopped
>    30: stopped
>    31: stopping
>
> ddb{4}> trace
> db_enter() at db_enter+0x14
> pmap_enter(fffffd9ca7996300,7abfca6d000,1b190bd000,3,22) at pmap_enter+0x725
> uvm_fault_lower(ffff800055f9fc68,ffff800055f9fca0,ffff800055f9fbe0) at uvm_fault_lower+0x27f
> uvm_fault(fffffd9ca478cd40,7abfca6d000,0,2) at uvm_fault+0x1c5
> upageflttrap(ffff800055f9fdd0,7abfca6d000) at upageflttrap+0x6c
> usertrap(ffff800055f9fdd0) at usertrap+0x204
> recall_trap() at recall_trap+0x8
> end of kernel
> end trace frame: 0x7abaa855d00, count: -7
>
> ddb{4}> show witness /b
> Number of known direct relationships is 409
>
> Lock order reversal between "&mp->mnt_lock"(rwlock) and "&ip->i_lock"(rrwlock)!
>
> witness: incomplete path, depth 4
>
> ddb{4}> show all locks
> Process 59464 (rustc) thread 0xffff800055f225a8 (329198)
> exclusive rwlock uobjlk r = 0 (0xfffffd9c9aaff7c0)
> shared rwlock vmmaplk r = 0 (0xfffffd9bd6ba7318)
> Process 73261 (rustc) thread 0xffff8000560e17c8 (436440)
> exclusive rwlock amaplk r = 0 (0xfffffd9bc55cf9b8)
> exclusive rwlock kmmaplk r = 0 (0xffffffff82aa4fd8)
> exclusive kernel_lock &kernel_lock r = 0 (0xffffffff82ad7218)
> exclusive rwlock sysctllk r = 0 (0xffffffff82a07f28)
> Process 51291 (rustc) thread 0xffff8000560980a0 (388856)
> exclusive rwlock amaplk r = 0 (0xfffffd9a88984150)
> shared rwlock vmmaplk r = 0 (0xfffffd9ca478ce40)
> Process 94708 (rustc) thread 0xffff8000560b3528 (399610)
> exclusive rwlock amaplk r = 0 (0xfffffd9a9db30d98)
> exclusive rwlock vmmaplk r = 0 (0xfffffd9bd6ba78b8)
>
> ddb{4}> ps /o
>     TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
>  134839   3940      0         0x1          0    3  cargo
>  329198  59464      0         0x3          0    0  rustc
>  436440  73261      0         0x3  0x4000000    1K rustc
> *388856  51291      0         0x3  0x4000000    4  rustc
>  281161   5443      0      0x1003  0x4080000    9  rustc
>  314622   5443      0      0x1003  0x4080000   11  rustc
>  399610  94708      0         0x3  0x4000000    6  rustc
>
> ddb{4}> mach ddbcpu 0
> Stopped at      x86_ipi_db+0x16:        leave
>
> ddb{0}> trace
> x86_ipi_db(ffffffff82a4aff0) at x86_ipi_db+0x16
> x86_ipi_handler() at x86_ipi_handler+0x80
> Xresume_lapic_ipi() at Xresume_lapic_ipi+0x27
> __mp_lock(ffffffff82ad7010) at __mp_lock+0xbe
> softintr_dispatch(0) at softintr_dispatch+0xeb
> dosoftint(0) at dosoftint+0x3f
> Xsoftclock() at Xsoftclock+0x27
> pmap_enter(fffffd9a742fef00,45cf03f2000,1be3abf000,4,20) at pmap_enter+0x703
> uvm_fault_lower_lookup(ffff800055e9ada8,ffff800055e9ade0,ffff800055e9ad20) at uvm_fault_lower_lookup+0x126
> uvm_fault_lower(ffff800055e9ada8,ffff800055e9ade0,ffff800055e9ad20) at uvm_fault_lower+0x5c
> uvm_fault(fffffd9bd6ba7218,45cf03ef000,0,4) at uvm_fault+0x1c5
> upageflttrap(ffff800055e9af10,45cf03ef310) at upageflttrap+0x6c
> usertrap(ffff800055e9af10) at usertrap+0x204
> recall_trap() at recall_trap+0x8
> end of kernel
> end trace frame: 0x78859e075000, count: -14
>
> On Mon, Jul 7, 2025 at 10:50 AM Martin Pieuchot <[email protected]> wrote:
> >
> > On 07/07/25(Mon) 14:11, Jeremie Courreges-Anglas wrote:
> > > Thanks folks for your replies,
> > >
> > > On Mon, Jul 07, 2025 at 12:26:21PM +0200, Mark Kettenis wrote:
> > > > > Date: Mon, 7 Jul 2025 08:17:37 +0200
> > > > > From: Martin Pieuchot <[email protected]>
> > > > >
> > > > > On 06/07/25(Sun) 21:15, Jeremie Courreges-Anglas wrote:
> > > > > > On Tue, Jul 01, 2025 at 06:18:37PM +0200, Jeremie Courreges-Anglas wrote:
> > > > > > > On Tue, Jun 24, 2025 at 05:21:56PM +0200, Jeremie Courreges-Anglas wrote:
> > > > > > > >
> > > > > > > > I think it's uvm_purge(), as far as I can see it happens when
> > > > > > > > building rust with cvs up -D2025/06/04 in /sys, not with
> > > > > > > > -D2025/06/03.  Maybe I missed lang/rust when testing the diff.
> > > > > > > >
> > > > > > > > This is with additional MP_LOCKDEBUG support for mutexes, and
> > > > > > > > __mp_lock_spinout = 50L * INT_MAX.
> > > > > > > >
> > > > > > > > Suggested by claudio: tr /t 0t269515 fails.
> > > > > > >
> > > > > > > Should be fixed, at least for CPUs that ddb actually managed to
> > > > > > > stop...
> > > > > > >
> > > > > > > > WITNESS doesn't flag an obvious lock ordering issue.  I'm not
> > > > > > > > even sure there is one.  It also happens with
> > > > > > > > CPU_MAX_BUSY_CYCLES == 64.
> > > > > > > >
> > > > > > > > Maybe we're still hammering the locks too much?  Input and
> > > > > > > > ideas to test are welcome.  Right now I'm running with just
> > > > > > > > uvm_purge() reverted.
> > > > > > >
> > > > > > > Reverting uvm_purge() did not help.  I've been able to reproduce
> > > > > > > the hangs up to cvs up -D2025/05/18, backporting the arm64 ddb
> > > > > > > trace fix, mpi's mutex backoff diff and the mtx_enter MP_LOCKDEBUG
> > > > > > > diff.  Currently trying to reproduce with cvs up -D2025/05/07 as
> > > > > > > suggested by dlg.
> > > > > >
> > > > > > cvs up -D2025/05/07 also did not help.  -current still locks up, but
> > > > > > the db_mutex fix helps get proper stacktraces:
> > > > >
> > > > > Thanks for the report.
> > > > >
> > > > > > --8<--
> > > > > > login: mtf_fnxe0:00nf2ffmrte8  0m21xt2  8xlef_ cf2nktft esl8r_u0f  
> > > > > > k00t  sxor2f0
> > > > > >
> > > > > >  0tf
> > > > > >      8oeff
> > > > > >
> > > > > >  ftff
> > > > > > efStopped at    mtx_enter+0x13c:        ldr     x26, [x25,#2376]
> > > > >
> > > > > What is your value of __mp_lock_spinout?
> > >
> > > 50L * INT_MAX
> > >
> > > > > From the ddb traces I understand that the `sched_lock' mutex is
> > > > > contended.  It's not clear to me why, but I believe that's because
> > > > > rustc uses sched_yield(2) in a loop.  Could you figure out where
> > > > > this syscall is coming from?
> > >
> > > Frankly, I don't know if I want to try and figure it out. ^^"
> >
> > I understand.  However this is IMHO the crucial piece of work that has
> > to be done.  Alternatively one can replace all _spinlock() in libc with
> > a primitive that does not call a syscall in a loop.
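> >
> > For instance, one that mostly spins in userland and only falls back to
> > the syscall as a last resort.  An untested sketch, with SPIN_HINT()
> > standing in for a hypothetical per-arch cpu pause hint:
> >
> > void
> > _spinlock(volatile _atomic_lock_t *lock)
> > {
> >         int spins = 0;
> >
> >         while (_atomic_lock(lock)) {
> >                 if (++spins < 1000)
> >                         SPIN_HINT();    /* stay in userland */
> >                 else {
> >                         sched_yield();  /* last resort: give up the cpu */
> >                         spins = 0;
> >                 }
> >         }
> >         membar_enter_after_atomic();
> > }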
> >
> > > > > I'm upset that sched_yield() is still used and causing trouble.  Now
> > > > > that `sched_lock' is a mutex without a guarantee of progress it is
> > > > > easy to hang the machine by calling it in a loop.  A proper solution
> > > > > would be to stop using sched_yield(2).  This will bite us as long as
> > > > > it is here.
> > >
> > > Regarding its uses, there's one in librthread/rthread_sched.c which
> > > looks legitimate, and also another in libc that caught my eye recently:
> > >
> > > /*
> > >  * internal support functions
> > >  */
> > > void
> > > _spinlock(volatile _atomic_lock_t *lock)
> > > {
> > >         while (_atomic_lock(lock))
> > >                 sched_yield();
> > >         membar_enter_after_atomic();
> > > }
> > > DEF_STRONG(_spinlock);
> > >
> > > That one looks a bit heavy-handed...
> >
> > This is certainly this one; the question is which of them?
> >
> > > Now, if the problem really is that our mutexes aren't fair, then isn't
> > > any mutex (incl. the sched lock) vulnerable to such hammering from
> > > userland?
> >
> > Hammering is the problem here.  In the new ddb trace we see 4 rustc
> > threads in the middle of sched_yield(2)...  This is really an example
> > of how not to do multi-threading...
> >
> > The one on CPU6 is in the middle of mi_switch() and doesn't appear in
> > ps /o.  It holds the `sched_lock' and there's no guarantee that CPU0,
> > which holds the KERNEL_LOCK() and wants to do wakeup(9), will grab it.
> >
> > > > Hmm, well, mutexes might not be fair, but they should guarantee
> > > > forward progress.  And the LL/SC primitives on arm64 do guarantee
> > > > forward progress, but there are certain conditions.  And I think our
> > > > current MI mutex implementation violates those conditions.  And
> > > > WITNESS and MP_LOCKDEBUG might actually be making things worse.
> > >
> > > I thought LL/SC guarantees depended on the length of the section
> > > between the load and store, but... I usually fail to find what I'm
> > > looking for when looking at ARM docs.
> > >
> > > > Now on the M2 we should be using CAS instead of LL/SC.
> > >
> > > Just to rule out an obvious issue:
> > >
> > > ddb{0}> x arm64_has_lse
> > > arm64_has_lse:             1
> > >
> > > > The architecture reference manual isn't explicit about forward-progress
> > > > guarantees for that instruction.  So I need to dig a bit deeper into
> > > > this.  I doubt there is no forward-progress guarantee for those
> > > > instructions, but maybe there are similar conditions on how these get
> > > > used.
> > > >
> > > > Will dig deeper and see if we can fix the mutex implementation on
> > > > arm64.
> > >
> > > ack, thanks
> >
> > I'm not sure the issue is with the primitive.
> >
> > > > > A workaround would be to use a backoff mechanism inside the loop.
> > > > >
> > > > > Another workaround could be to never spin on the `sched_lock' in
> > > > > sys_sched_yield() and instead sleep.
> > > > >
> > > > > Because of such contention all wakeup(9)s inside the pmemrange
> > > > > allocator add contention on the `fpageqlock' which is what is
> > > > > "hanging" your machine.
> > > > >
> > > > > Diff below is another workaround that might help.
> > >
> > > Alas it doesn't seem to help enough to avoid hangs.  Here's another
> > > fresh report using your diff.
> >
> > It seems to help because the contention is now on the kernel lock and no
> > longer on the global pmemrange mutex.
> >
> > > --8<--
> > > login: __mp_lock_spin: 0xffffff8001326508 lock spun out
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{4}> mach cpuinfo
> > >     0: stopped
> > >     1: stopped
> > >     2: stopped
> > >     3: stopped
> > > *   4: ddb
> > >     5: stopped
> > >     6: stopped
> > >     7: stopped
> > >     8: stopped
> > >     9: stopped
> > > ddb{4}> mach ddb 0
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{0}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __aarch64_cas8_acq_rel+0x1c
> > > __aarch64_cas8_acq_rel() at wakeup_n+0x44
> > > wakeup_n() at task_add+0x6c
> > > task_add() at ifiq_input+0x1f4
> > > ifiq_input() at bge_rxeof+0x32c
> > > bge_rxeof() at bge_intr+0x174
> > > bge_intr() at aplintc_irq_handler+0x170
> > > aplintc_irq_handler() at arm_cpu_irq+0x44
> > > arm_cpu_irq() at handle_el1h_irq+0x68
> > > handle_el1h_irq() at mtx_enter+0x104
> > > mtx_enter() at endtsleep+0x30
> > > endtsleep() at timeout_run+0xb4
> > > timeout_run() at softclock_process_tick_timeout+0xc4
> > > softclock_process_tick_timeout() at softclock+0xe8
> > > softclock() at softintr_dispatch+0xc8
> > > softintr_dispatch() at arm_do_pending_intr+0xf4
> > > arm_do_pending_intr() at mi_switch+0x1b0
> > > mi_switch() at ast+0x98
> > > ast() at handle_el0_sync+0x9c
> > > handle_el0_sync() at 0x19ae4be318
> > > --- trap ---
> > > end of kernel
> > > ddb{0}> mach ddb 1
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{1}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0x104
> > > __mp_lock() at __mp_acquire_count+0x34
> > > __mp_acquire_count() at uiomove+0x12c
> > > uiomove() at ffs_read+0x198
> > > ffs_read() at VOP_READ+0x3c
> > > VOP_READ() at vn_rdwr+0xb0
> > > vn_rdwr() at vmcmd_map_readvn+0x8c
> > > vmcmd_map_readvn() at exec_process_vmcmds+0x78
> > > exec_process_vmcmds() at sys_execve+0x620
> > > sys_execve() at svc_handler+0x430
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x13812aa6d8
> > > --- trap ---
> > > end of kernel
> > > ddb{1}> mach ddb 2
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{2}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0x104
> > > __mp_lock() at doopenat+0x160
> > > doopenat() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x4521566bc
> > > --- trap ---
> > > end of kernel
> > > ddb{2}> mach ddb 3
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{3}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __aarch64_cas8_acq_rel+0x1c
> > > __aarch64_cas8_acq_rel() at sys_sched_yield+0x68
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{3}> mach ddb 4
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{4}> tr
> > > db_enter() at __mp_lock+0x128
> > > __mp_lock() at doopenat+0x160
> > > doopenat() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x43e0b76bc
> > > --- trap ---
> > > end of kernel
> > > ddb{4}> mach ddb 5
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{5}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0xf8
> > > __mp_lock() at vn_statfile+0x2c
> > > vn_statfile() at sys_fstat+0x84
> > > sys_fstat() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x43dcf999c
> > > --- trap ---
> > > end of kernel
> > > ddb{5}> mach ddb 6
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{6}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at agtimer_get_timecount_default+0x18
> > > agtimer_get_timecount_default() at nanouptime+0x44
> > > nanouptime() at tuagg_add_runtime+0x40
> > > tuagg_add_runtime() at mi_switch+0x90
> > > mi_switch() at sys_sched_yield+0x88
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{6}> mach ddb 7
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{7}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at __mp_lock+0xf8
> > > __mp_lock() at sys___realpath+0x154
> > > sys___realpath() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x4af9eacc8
> > > --- trap ---
> > > end of kernel
> > > ddb{7}> mach ddb 8
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{8}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at aplintc_splx+0x1f0
> > > aplintc_splx() at mtx_enter+0xe8
> > > mtx_enter() at sys_sched_yield+0x68
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{8}> mach ddb 9
> > > Stopped at      db_enter+0x18:  brk     #0x0
> > > ddb{9}> tr
> > > db_enter() at aplintc_fiq_handler+0x6c
> > > aplintc_fiq_handler() at arm_cpu_fiq+0x44
> > > arm_cpu_fiq() at handle_el1h_fiq+0x68
> > > handle_el1h_fiq() at mtx_enter+0x10c
> > > mtx_enter() at sys_sched_yield+0x68
> > > sys_sched_yield() at svc_handler+0x450
> > > svc_handler() at do_el0_sync+0x1cc
> > > do_el0_sync() at handle_el0_sync+0x70
> > > handle_el0_sync() at 0x19ae4b2fdc
> > > --- trap ---
> > > end of kernel
> > > ddb{9}> show all locks
> > > CPU 0:
> > > shared mutex timeout r = 0 (0xffffff8001202d88)
> > > CPU 6:
> > > exclusive mutex &sched_lock r = 0 (0xffffff8001272148)
> > > Process 16821 (ninja) thread 0xffffff8113e80068 (92806)
> > > exclusive rrwlock inode r = 0 (0xffffff8055608760)
> > > Process 40072 (rustc) thread 0xffffff8113e814a8 (446746)
> > > exclusive rrwlock inode r = 0 (0xffffff81049e8538)
> > > Process 23063 (rustc) thread 0xffffff8113e819b8 (12581)
> > > exclusive kernel_lock &kernel_lock r = 0 (0xffffff8001326d10)
> > > ddb{9}> show witness /b
> > > Number of known direct relationships is 443
> > >
> > > Lock order reversal between "&mp->mnt_lock"(rwlock) and "&ip->i_lock"(rrwlock)!
> > >
> > > witness: incomplete path, depth 8
> > > ddb{9}> ps /o
> > >     TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
> > >   92806  16821     55      0x1005          0    1  ninja
> > >  285104    167     55         0x3          0    2  c++
> > >  202504  45511     55         0x3          0    7  c++
> > >  288505  96331     55         0x3          0    4  c++
> > >  134651  73257     55         0x3          0    5  c++
> > >  230543  23063     55   0x2000003  0x4000000    3  rustc
> > >   12581  23063     55   0x2000003  0x4000000    0K rustc
> > >    4934  23063     55   0x2000003  0x4000000    8  rustc
> > > * 89729  23063     55   0x2000003  0x4000000    9  rustc
> > > ddb{9}> ps
> > >    PID     TID   PPID    UID  S       FLAGS  WAIT          COMMAND
> > >  16821   92806  23175     55  7      0x1005                ninja
> > >    167  285104  63255     55  7         0x3                c++
> > >  63255  492627  23175     55  3    0x10008b  sigsusp       sh
> > >  40072   77074  38114     55  3   0x2000083  fsleep        rustc
> > >  40072  100553  38114     55  3   0x6000083  piperd        rustc
> > >  40072  446746  38114     55  2   0x6000003                rustc
> > >  45511  202504   4053     55  7         0x3                c++
> > >   4053   80003  23175     55  3    0x10008b  sigsusp       sh
> > >  96331  288505  78782     55  7         0x3                c++
> > >  78782   47058  23175     55  3    0x10008b  sigsusp       sh
> > >  73257  134651  99268     55  7         0x3                c++
> > >  99268  333490  23175     55  3    0x10008b  sigsusp       sh
> > >  23063  335091  38114     55  3   0x2000083  fsleep        rustc
> > >  23063  363979  38114     55  3   0x6000083  piperd        rustc
> > >  23063  428745  38114     55  3   0x6000083  fsleep        rustc
> > >  23063  279995  38114     55  3   0x6000083  fsleep        rustc
> > >  23063  181877  38114     55  3   0x6000083  fsleep        rustc
> > >  23063  230543  38114     55  7   0x6000003                rustc
> > >  23063  216080  38114     55  2   0x6000003                rustc
> > >  23063   12581  38114     55  7   0x6000003                rustc
> > >  23063    4934  38114     55  7   0x6000003                rustc
> > > *23063   89729  38114     55  7   0x6000003                rustc
> > >  38114  473886  68745     55  3   0x2000083  fsleep        cargo
> > >  38114  327225  68745     55  3   0x6000083  fsleep        cargo
> > >  38114  207803  68745     55  3   0x6000083  kqread        cargo
> > >  38114  442514  68745     55  3   0x6000083  kqread        cargo
> > >  68745   91160  66756     55  3   0x2200083  wait          python3.12
> > >  66756  145557  20253     55  3    0x10008b  sigsusp       make
> > >  23175  305437  95548     55  3        0x8b  kqread        ninja
> > >  95548  326376  95211     55  3        0x83  kqread        cmake
> > >  95211  452507  60695     55  3    0x10008b  sigsusp       make
> > >  20253  453028  74977   1000  3    0x10008b  sigsusp       make
> > >  74977  118496  85512   1000  3    0x10008b  sigsusp       sh
> > >  60695  512137  28928   1000  3    0x10008b  sigsusp       make
> > >  28928  132313  46768   1000  3    0x10008b  sigsusp       sh
> > >  46768  510404  31892   1000  3    0x10008b  sigsusp       make
> > >  85512  168761   5112   1000  3    0x10008b  sigsusp       make
> > >  31892   41853  33691   1000  3    0x10008b  sigsusp       ksh
> > >   5112   45762  33691   1000  3    0x10008b  sigsusp       ksh
> > >  33691  339742      1   1000  3    0x100080  kqread        tmux
> > >  50447  258378  72048   1000  3    0x100083  kqread        tmux
> > >  52948  303715      1   1000  3    0x1000b8  kqread        ssh-agent
> > >  72048  282038  80004   1000  3    0x10008b  sigsusp       ksh
> > >  80004  478023  75098   1000  3        0x98  kqread        sshd-session
> > >  75098   88401  26790      0  3        0x92  kqread        sshd-session
> > >  16285  372990      1      0  3    0x100083  ttyin         getty
> > >  88950  327084      1      0  3    0x100083  ttyin         getty
> > >  32299  412662      1      0  3    0x100083  ttyin         getty
> > >   9624  388470      1      0  3    0x100083  ttyin         getty
> > >  95942  199073      1      0  3    0x100083  ttyin         getty
> > >  13386  235018      1      0  3    0x100083  ttyin         getty
> > >  71288  139796      1      0  3    0x100098  kqread        cron
> > >  58489  449220      1     99  3   0x1100090  kqread        sndiod
> > >  88897  458345      1    110  3    0x100090  kqread        sndiod
> > >  21307  145191  57732     95  3   0x1100092  kqread        smtpd
> > >  85854  126275  57732    103  3   0x1100092  kqread        smtpd
> > >  62845  395981  57732     95  3   0x1100092  kqread        smtpd
> > >  68484  261396  57732     95  3    0x100092  kqread        smtpd
> > >  96401  408083  57732     95  3   0x1100092  kqread        smtpd
> > >  83314   38871  57732     95  3   0x1100092  kqread        smtpd
> > >  57732  508617      1      0  3    0x100080  kqread        smtpd
> > >  26790  121973      1      0  3        0x88  kqread        sshd
> > >  29643  367483  12647      0  3        0x80  nfsd          nfsd
> > >  94049  228378  12647      0  3        0x80  nfsd          nfsd
> > >   6946   26461  12647      0  3        0x80  nfsd          nfsd
> > >  27528  512446  12647      0  3        0x80  nfsd          nfsd
> > >  12647  261198      1      0  3        0x80  netacc        nfsd
> > >  40776  365439  11876      0  3        0x80  kqread        mountd
> > >  11876  241363      1      0  3    0x100080  kqread        mountd
> > >  56481   14460      1     28  3   0x1100090  kqread        portmap
> > >  59772  105103  17556     83  3    0x100092  kqread        ntpd
> > >  17556   51510  78827     83  3   0x1100092  kqread        ntpd
> > >  78827  213143      1      0  3    0x100080  kqread        ntpd
> > >  58198   68980  84345     74  3   0x1100092  bpf           pflogd
> > >  84345  315647      1      0  3        0x80  sbwait        pflogd
> > >   4462  325006  88024     73  3   0x1100090  kqread        syslogd
> > >  88024  393773      1      0  3    0x100082  sbwait        syslogd
> > >  20177  177437      1      0  3    0x100080  kqread        resolvd
> > >   1916  512556  22600     77  3    0x100092  kqread        dhcpleased
> > >  89301  356030  22600     77  3    0x100092  kqread        dhcpleased
> > >  22600  388065      1      0  3        0x80  kqread        dhcpleased
> > >   9674   57681  39128    115  3    0x100092  kqread        slaacd
> > >  99969  364740  39128    115  3    0x100092  kqread        slaacd
> > >  39128  377692      1      0  3    0x100080  kqread        slaacd
> > >   7488  127254      0      0  3     0x14200  bored         wsdisplay0
> > >  80941  354199      0      0  3     0x14200  bored         apple-dcp-afkep%02x
> > >  83845  251298      0      0  3     0x14200  bored         apple-dcp-afkep%02x
> > >  48577  323255      0      0  3     0x14200  bored         apple-dcp-afkep%02x
> > >  26744  433388      0      0  3     0x14200  bored         drmrtk
> > >  23659  215894      0      0  3     0x14200  bored         smr
> > >  62145  444779      0      0  3     0x14200  pgzero        zerothread
> > >  60948   51173      0      0  3     0x14200  aiodoned      aiodoned
> > >  61934  416352      0      0  3     0x14200  syncer        update
> > >  60595  230019      0      0  3     0x14200  cleaner       cleaner
> > >  59553  366424      0      0  3     0x14200  reaper        reaper
> > >  57148   36768      0      0  3     0x14200  pgdaemon      pagedaemon
> > >   4700  330347      0      0  3     0x14200  usbtsk        usbtask
> > >  37673  384731      0      0  3     0x14200  usbatsk       usbatsk
> > >  71799  233642      0      0  3     0x14200  bored         drmtskl
> > >  72815  226502      0      0  3     0x14200  bored         drmlwq
> > >  22325  377620      0      0  3     0x14200  bored         drmlwq
> > >  58569  438671      0      0  3     0x14200  bored         drmlwq
> > >  35034  511019      0      0  3     0x14200  bored         drmlwq
> > >  51245  343392      0      0  3     0x14200  bored         drmubwq
> > >   5539  210307      0      0  3     0x14200  bored         drmubwq
> > >  96204  103587      0      0  3     0x14200  bored         drmubwq
> > >  99133  343617      0      0  3     0x14200  bored         drmubwq
> > >  62111  109225      0      0  3     0x14200  bored         drmhpwq
> > >  54844  103964      0      0  3     0x14200  bored         drmhpwq
> > >  98273    6361      0      0  3     0x14200  bored         drmhpwq
> > >  69125  196587      0      0  3     0x14200  bored         drmhpwq
> > >  36820  327811      0      0  3     0x14200  bored         drmwq
> > >    685  197752      0      0  3     0x14200  bored         drmwq
> > >  12733  475339      0      0  3     0x14200  bored         drmwq
> > >  50628   50813      0      0  3     0x14200  bored         drmwq
> > >  38742  421411      0      0  3     0x14200  bored         sensors
> > >  15252  489393      0      0  3     0x14200  bored         sleep
> > >  45280  351306      0      0  3  0x40014200                idle9
> > >  18476   19261      0      0  3  0x40014200                idle8
> > >  73955  129108      0      0  3  0x40014200                idle7
> > >  40102  188571      0      0  3  0x40014200                idle6
> > >  68869   24066      0      0  3  0x40014200                idle5
> > >  16358  379475      0      0  3  0x40014200                idle4
> > >  28248  237855      0      0  3  0x40014200                idle3
> > >  14101  490124      0      0  3  0x40014200                idle2
> > >  48008  167892      0      0  3  0x40014200                idle1
> > >  44662  441379      0      0  3     0x14200  bored         softnet3
> > >  18453  428842      0      0  3     0x14200  bored         softnet2
> > >  49989   83580      0      0  3     0x14200  bored         softnet1
> > >  93898  345769      0      0  3     0x14200  bored         softnet0
> > >  64190  206376      0      0  3     0x14200  bored         systqmp
> > >  95969  416641      0      0  3     0x14200  bored         systq
> > >  38042  457408      0      0  3     0x14200  tmoslp        softclockmp
> > >  57671  339967      0      0  3  0x40014200  tmoslp        softclock
> > >  32509  359030      0      0  3  0x40014200                idle0
> > >  99051   67429      0      0  3     0x14200  kmalloc       kmthread
> > >      1   12407      0      0  3        0x82  wait          init
> > >      0       0     -1      0  3     0x10200  scheduler     swapper
> > > ddb{9}>
> > > -->8--
> > >
> > > --
> > > jca
> >
> >
