On 29/07/22(Fri) 14:22, Theo Buehler wrote: > On Mon, Jul 11, 2022 at 01:05:19PM +0200, Martin Pieuchot wrote: > > On 11/07/22(Mon) 07:50, Theo Buehler wrote: > > > On Fri, Jun 03, 2022 at 03:02:36PM +0200, Theo Buehler wrote: > > > > > Please do note that this change can introduce/expose other issues. > > > > > > > > It seems that this diff causes occasional hangs when building snapshots > > > > on my mac M1 mini. This happened twice in 10 builds, both times in > > > > xenocara. Unfortunately, both times the machine became entirely > > > > unresponsive and as I don't have serial console, that's all the info I > > > > have... > > > > > > > > This machine has been very reliable and built >50 snaps without any hang > > > > over the last 2.5 months. I'm now trying snap builds in a loop without > > > > the diff to make sure the machine doesn't hang due to another recent > > > > kernel change. > > > > > > > > > > A little bit of info on this. The first three lines were a bit garbled on > > > the screen: > > > > > > panic: kernel diagnostic assertion "uvn->_oppa jai c: ke r > > > el d iag no tic a s rt n " map == UL L | | rw wr > > > k > > > ite held(amap->amap_lock)" failed: file "/ss/uvm/uvm_fault.c", line 846. 
> > > ernel diagnostic assertion "!_kernel_lock_held > > > Stopped at panic+0160: cmp w21, #0x0 ailed: file "/sys/kern/ > > > TID PID UID PRFLAGS PFLAGS CPU COMMAND > > > 411910 44540 21 0x100001 0 3 make > > > *436444 84241 21 0x100003 0 6 sh > > > 227952 53498 21 0x100003 0 5 sh > > > 258925 15765 21 0x101005 0 0 make > > > 128459 9649 21 0x100003 0 1 tradcpp > > > 287213 64216 21 0x100003 0x8 7 make > > > 173587 4617 1000 0x100000 0 2 tmux > > > 126511 69919 0 0x14000 0x200 4 softnet > > > db_enter() at panic+0x15c > > > panic() at __assert+0x24 > > > uvm_fault() at uvm_fault_upper_lookup+0x258 > > > uvm_fault_upper() at uvm_fault+0xec > > > uvm_fault() at udata_abort+0x128 > > > udata_abort() at do_el0_sync+0xdc > > > do_el0_sync() at handle_el0_sync+0x74 > > > https://www.openbsd.org/ddb.html describes the minimum info required in > > > bug > > > reports. Insufficient info makes it difficult to find and fix bugs. > > > ddb{6}> show panic > > > *cpu0: kernel diagnostic assertion "uvn->u_obj.uo_refs == 0" failed: file > > > "/sys/kern/uvn_vnode.c", line 231. > > > cpu6: kernel diagnostic assertion "amap == NULL || > > > rw_write_held(amap->am_lock)" failed: file "/sys/uvm/uvm_fault", line > > > 846. > > > cpu3: kernel diagnostic assertion "!_kernel_lock_held()" failed: file > > > "/sys/kern/kern_fork.c", line 678 > > > ddb{6}> mach ddbcpu 0 > > > > > > After pressing enter here, the machine locked up completely. > > > > It's hard for me to tell what's going on. I believe the interesting > > trace is the one from cpu0 that we don't have. Can you easily reproduce > > this? I'm trying on amd64 without luck. I'd glad if you could gather > > more infos. > > Sorry for the delay. I was only at home intermittently. I hit this three > times: > > panic: kernel diagnostic assertion "uvn->u_obj.uo_refs == 0" failed: file > "/sys/uvm/uvm_vnode.c", line 231
Yesterday gnezdo@ fixed a race in uvn_attach() that led to the same assert. Here's a rebased diff for the bug discussed in this thread; could you try again and let us know? Thanks! Index: uvm/uvm_vnode.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v retrieving revision 1.127 diff -u -p -r1.127 uvm_vnode.c --- uvm/uvm_vnode.c 31 Aug 2022 09:07:35 -0000 1.127 +++ uvm/uvm_vnode.c 1 Sep 2022 12:54:27 -0000 @@ -163,11 +163,8 @@ uvn_attach(struct vnode *vp, vm_prot_t a */ rw_enter(uvn->u_obj.vmobjlock, RW_WRITE); if (uvn->u_flags & UVM_VNODE_VALID) { /* already active? */ + KASSERT(uvn->u_obj.uo_refs > 0); - /* regain vref if we were persisting */ - if (uvn->u_obj.uo_refs == 0) { - vref(vp); - } uvn->u_obj.uo_refs++; /* bump uvn ref! */ rw_exit(uvn->u_obj.vmobjlock); @@ -235,7 +232,7 @@ uvn_attach(struct vnode *vp, vm_prot_t a KASSERT(uvn->u_obj.uo_refs == 0); uvn->u_obj.uo_refs++; oldflags = uvn->u_flags; - uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST; + uvn->u_flags = UVM_VNODE_VALID; uvn->u_nio = 0; uvn->u_size = used_vnode_size; @@ -248,7 +245,7 @@ uvn_attach(struct vnode *vp, vm_prot_t a /* * add a reference to the vnode. this reference will stay as long * as there is a valid mapping of the vnode. dropped when the - * reference count goes to zero [and we either free or persist]. + * reference count goes to zero. */ vref(vp); if (oldflags & UVM_VNODE_WANTED) @@ -321,16 +318,6 @@ uvn_detach(struct uvm_object *uobj) */ vp->v_flag &= ~VTEXT; - /* - * we just dropped the last reference to the uvn. see if we can - * let it "stick around". - */ - if (uvn->u_flags & UVM_VNODE_CANPERSIST) { - /* won't block */ - uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES); - goto out; - } - /* its a goner! 
*/ uvn->u_flags |= UVM_VNODE_DYING; @@ -380,7 +367,6 @@ uvn_detach(struct uvm_object *uobj) /* wake up any sleepers */ if (oldflags & UVM_VNODE_WANTED) wakeup(uvn); -out: rw_exit(uobj->vmobjlock); /* drop our reference to the vnode. */ @@ -496,8 +482,8 @@ uvm_vnp_terminate(struct vnode *vp) } /* - * done. now we free the uvn if its reference count is zero - * (true if we are zapping a persisting uvn). however, if we are + * done. now we free the uvn if its reference count is zero. + * however, if we are * terminating a uvn with active mappings we let it live ... future * calls down to the vnode layer will fail. */ @@ -505,14 +491,14 @@ uvm_vnp_terminate(struct vnode *vp) if (uvn->u_obj.uo_refs) { /* * uvn must live on it is dead-vnode state until all references - * are gone. restore flags. clear CANPERSIST state. + * are gone. restore flags. */ uvn->u_flags &= ~(UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED| - UVM_VNODE_WANTED|UVM_VNODE_CANPERSIST); + UVM_VNODE_WANTED); } else { /* * free the uvn now. note that the vref reference is already - * gone [it is dropped when we enter the persist state]. + * gone. */ if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) panic("uvm_vnp_terminate: io sync wanted bit set"); @@ -1349,46 +1335,14 @@ uvm_vnp_uncache(struct vnode *vp) } /* - * we have a valid, non-blocked uvn. clear persist flag. + * we have a valid, non-blocked uvn. * if uvn is currently active we can return now. */ - uvn->u_flags &= ~UVM_VNODE_CANPERSIST; if (uvn->u_obj.uo_refs) { rw_exit(uobj->vmobjlock); return FALSE; } - /* - * uvn is currently persisting! we have to gain a reference to - * it so that we can call uvn_detach to kill the uvn. - */ - vref(vp); /* seems ok, even with VOP_LOCK */ - uvn->u_obj.uo_refs++; /* value is now 1 */ - rw_exit(uobj->vmobjlock); - -#ifdef VFSLCKDEBUG - /* - * carry over sanity check from old vnode pager: the vnode should - * be VOP_LOCK'd, and we confirm it here. 
- */ - if ((vp->v_flag & VLOCKSWORK) && !VOP_ISLOCKED(vp)) - panic("uvm_vnp_uncache: vnode not locked!"); -#endif - - /* - * now drop our reference to the vnode. if we have the sole - * reference to the vnode then this will cause it to die [as we - * just cleared the persist flag]. we have to unlock the vnode - * while we are doing this as it may trigger I/O. - * - * XXX: it might be possible for uvn to get reclaimed while we are - * unlocked causing us to return TRUE when we should not. we ignore - * this as a false-positive return value doesn't hurt us. - */ - VOP_UNLOCK(vp); - uvn_detach(&uvn->u_obj); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - return TRUE; } @@ -1483,11 +1437,9 @@ uvm_vnp_sync(struct mount *mp) continue; /* - * gain reference. watch out for persisting uvns (need to - * regain vnode REF). + * gain reference. */ - if (uvn->u_obj.uo_refs == 0) - vref(vp); + KASSERT(uvn->u_obj.uo_refs > 0); uvn->u_obj.uo_refs++; SIMPLEQ_INSERT_HEAD(&uvn_sync_q, uvn, u_syncq);