Page faults on vnode-backed objects commonly end up calling VOP_READ(9) or
VOP_WRITE(9) to go through the buffer cache.  This implies grabbing the inode
lock after any UVM locking.  On the other hand, changing the size of a vnode
results in entering UVM, generally via uvm_vnp_setsize(), with the inode
already locked.  In other words, the two paths take the same two locks in
opposite order.
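To make the ordering conflict concrete, here is a minimal userspace sketch of
the two paths (an illustration only, not kernel code; the two pthread mutexes
stand in for `uobj->vmlock' and the inode lock, and the sleeps just widen the
race window):

/*
 * Two threads taking the same two locks in opposite order:
 * the deadlock shape described above.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

pthread_mutex_t vmlock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t inodelock = PTHREAD_MUTEX_INITIALIZER;

/* page fault path: UVM locking first, then the inode lock for I/O */
void *
fault_path(void *arg)
{
	pthread_mutex_lock(&vmlock);
	sleep(1);
	pthread_mutex_lock(&inodelock);	/* VOP_READ()/VOP_WRITE() */
	puts("fault path ran");
	pthread_mutex_unlock(&inodelock);
	pthread_mutex_unlock(&vmlock);
	return NULL;
}

/* truncate path: inode already locked, then enter UVM */
void *
setsize_path(void *arg)
{
	pthread_mutex_lock(&inodelock);
	sleep(1);
	pthread_mutex_lock(&vmlock);	/* uvm_vnp_setsize() */
	puts("setsize path ran");
	pthread_mutex_unlock(&vmlock);
	pthread_mutex_unlock(&inodelock);
	return NULL;
}

int
main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, fault_path, NULL);
	pthread_create(&t2, NULL, setsize_path, NULL);
	pthread_join(t1, NULL);		/* never returns: each thread */
	pthread_join(t2, NULL);		/* blocks on the other's lock */
	return 0;
}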
Syzkaller exposed a deadlock of this kind that anton@ fixed in r1.108 of
uvm/uvm_vnode.c by making the page fault path grab the inode lock earlier.
Sadly, that change isn't compatible with the finer-grained locking required to
unlock the lower part of the UVM fault handler.

UVM makes use of the PG_BUSY flag to ask other threads not to touch a given
page.  This is done to keep ownership of a page after its associated lock has
been released.  This is currently hard to follow because the locking code has
been removed ;)  With the current fix, the PG_BUSY flag is set after grabbing
the inode lock, which creates a lock ordering problem since `uobj->vmlock' is
only released after setting the flag.

So the diff below takes a different approach: if the faulting thread finds
that the inode lock is contended, it stops there and restarts the fault.  This
has the side effect of un-busying the pages (clearing PG_BUSY) and allows the
other thread to make progress.  This is enough to move forward with
`uobj->vmlock' without changing the interaction between the existing buffer
cache and UVM locking (thanks!).
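To illustrate the pattern outside the kernel, here is a rough userspace sketch
of the trylock-and-restart idea (an analogy only; the names and the retry loop
are illustrative, not the actual kernel code).  A failed trylock corresponds
to vn_lock() returning EBUSY under LK_NOWAIT, which uvn_io() turns into
VM_PAGER_AGAIN, and the retry corresponds to restarting the fault after
un-busying the pages:

#include <errno.h>
#include <pthread.h>

enum { PAGER_OK, PAGER_AGAIN, PAGER_ERROR };

pthread_mutex_t inodelock = PTHREAD_MUTEX_INITIALIZER;

int
pager_io(void)
{
	/* like vn_lock(vn, LK_EXCLUSIVE | LK_NOWAIT) */
	if (pthread_mutex_trylock(&inodelock) == EBUSY) {
		/*
		 * Contended: bail out instead of sleeping while
		 * holding PG_BUSY pages, so the lock holder can
		 * enter UVM and make progress.
		 */
		return (PAGER_AGAIN);
	}
	/* ... do the I/O, i.e. VOP_READ()/VOP_WRITE() ... */
	pthread_mutex_unlock(&inodelock);
	return (PAGER_OK);
}

int
handle_fault(void)
{
	int result;

	do {
		/* mark pages PG_BUSY, try the I/O ... */
		result = pager_io();
		/* ... and un-busy them again on PAGER_AGAIN */
	} while (result == PAGER_AGAIN);
	return (result);
}

int
main(void)
{
	return (handle_fault() == PAGER_OK ? 0 : 1);
}

The key property is that between retries the faulting thread holds neither
the pages nor `uobj->vmlock', so whichever thread holds the inode lock can
enter UVM and complete.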
I couldn't trigger the deadlock with regress/sys/uvm/vnode with this diff.

Is the explanation clear enough?  Comments?  Oks?

Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.108
diff -u -p -r1.108 uvm_vnode.c
--- uvm/uvm_vnode.c	26 Oct 2020 19:48:19 -0000	1.108
+++ uvm/uvm_vnode.c	23 Feb 2021 10:46:50 -0000
@@ -90,9 +90,6 @@ int		 uvn_io(struct uvm_vnode *, vm_page
 int		 uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
 void		 uvn_reference(struct uvm_object *);
 
-int		 uvm_vnode_lock(struct uvm_vnode *);
-void		 uvm_vnode_unlock(struct uvm_vnode *);
-
 /*
  * master pager structure
  */
@@ -878,16 +875,11 @@ uvn_cluster(struct uvm_object *uobj, vof
 int
 uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags)
 {
-	struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
 	int retval;
 
 	KERNEL_ASSERT_LOCKED();
 
-	retval = uvm_vnode_lock(uvn);
-	if (retval)
-		return(retval);
-	retval = uvn_io(uvn, pps, npages, flags, UIO_WRITE);
-	uvm_vnode_unlock(uvn);
+	retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE);
 
 	return(retval);
 }
@@ -905,10 +897,9 @@ int
 uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
     int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
 {
-	struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
 	voff_t current_offset;
 	struct vm_page *ptmp;
-	int lcv, result, gotpages, retval;
+	int lcv, result, gotpages;
 	boolean_t done;
 
 	KERNEL_ASSERT_LOCKED();
@@ -983,18 +974,6 @@ uvn_get(struct uvm_object *uobj, voff_t 
 	}
 
 	/*
-	 * Before getting non-resident pages which must be populate with data
-	 * using I/O on the backing vnode, lock the same vnode. Such pages are
-	 * about to be allocated and busied (i.e. PG_BUSY) by the current
-	 * thread. Allocating and busying the page(s) before acquiring the
-	 * vnode lock could cause a deadlock with uvn_flush() which acquires the
-	 * vnode lock before waiting on pages to become unbusy and then flushed.
-	 */
-	retval = uvm_vnode_lock(uvn);
-	if (retval)
-		return(retval);
-
-	/*
 	 * step 2: get non-resident or busy pages.
 	 * data structures are unlocked.
 	 *
@@ -1080,15 +1059,14 @@ uvn_get(struct uvm_object *uobj, voff_t 
 		 * we have a "fake/busy/clean" page that we just allocated.  do
 		 * I/O to fill it with valid data.
 		 */
-		result = uvn_io(uvn, &ptmp, 1, PGO_SYNCIO, UIO_READ);
+		result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1,
+		    PGO_SYNCIO|PGO_NOWAIT, UIO_READ);
 
 		/*
		 * I/O done.  because we used syncio the result can not be
		 * PEND or AGAIN.
		 */
 		if (result != VM_PAGER_OK) {
-			uvm_vnode_unlock(uvn);
-
 			if (ptmp->pg_flags & PG_WANTED)
 				wakeup(ptmp);
@@ -1119,15 +1097,12 @@ uvn_get(struct uvm_object *uobj, voff_t 
 	}
 
-	uvm_vnode_unlock(uvn);
-
 	return (VM_PAGER_OK);
 }
 
 /*
  * uvn_io: do I/O to a vnode
  *
- * => uvn: the backing vnode must be locked
  * => prefer map unlocked (not required)
  * => flags: PGO_SYNCIO -- use sync. I/O
  * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync.
@@ -1145,6 +1120,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
 	int waitf, result, mapinflags;
 	size_t got, wanted;
 	int netunlocked = 0;
+	int lkflags = (flags & PGO_NOWAIT) ? LK_NOWAIT : 0;
 
 	/* init values */
 	waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
@@ -1213,17 +1189,42 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
 	}
 
 	/* do the I/O!  (XXX: curproc?) */
-	if (rw == UIO_READ)
-		result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
-	else
-		result = VOP_WRITE(vn, &uio,
-		    (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
-		    curproc->p_ucred);
+	/*
+	 * This process may already have this vnode locked, if we faulted in
+	 * copyin() or copyout() on a region backed by this vnode
+	 * while doing I/O to the vnode.  If this is the case, don't
+	 * panic.. instead, return the error to the user.
+	 *
+	 * XXX this is a stopgap to prevent a panic.
+	 * Ideally, this kind of operation *should* work.
+	 */
+	result = 0;
+	if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
+		result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL | lkflags);
+	if (result == 0) {
+		/* NOTE: vnode now locked! */
+		if (rw == UIO_READ)
+			result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
+		else
+			result = VOP_WRITE(vn, &uio,
+			    (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
+			    curproc->p_ucred);
+
+		if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
+			VOP_UNLOCK(vn);
+
+	}
 
 	if (netunlocked)
 		NET_LOCK();
 
-	/* zero out rest of buffer (if needed) */
+
+	/* NOTE: vnode now unlocked (unless vnislocked) */
+	/*
+	 * result == unix style errno (0 == OK!)
+	 *
+	 * zero out rest of buffer (if needed)
+	 */
 	if (result == 0) {
 		got = wanted - uio.uio_resid;
@@ -1244,14 +1245,16 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
 		wakeup(&uvn->u_nio);
 	}
 
-	if (result == 0)
+	if (result == 0) {
 		return(VM_PAGER_OK);
-
-	if (result == EIO) {
-		/* Signal back to uvm_vnode_unlock(). */
-		uvn->u_flags |= UVM_VNODE_IOERROR;
+	} else if (result == EBUSY) {
+		KASSERT(flags & PGO_NOWAIT);
+		return(VM_PAGER_AGAIN);
+	} else {
+		while (rebooting)
+			tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
+		return(VM_PAGER_ERROR);
 	}
-	return(VM_PAGER_ERROR);
 }
 
 /*
@@ -1467,53 +1470,4 @@ uvm_vnp_sync(struct mount *mp)
 	}
 
 	rw_exit_write(&uvn_sync_lock);
-}
-
-int
-uvm_vnode_lock(struct uvm_vnode *uvn)
-{
-	int error;
-	int netunlocked = 0;
-
-	if (uvn->u_flags & UVM_VNODE_VNISLOCKED)
-		return(VM_PAGER_OK);
-
-	/*
-	 * This thread may already have the net lock, if we faulted in copyin()
-	 * or copyout() in the network stack.
-	 */
-	if (rw_status(&netlock) == RW_WRITE) {
-		NET_UNLOCK();
-		netunlocked = 1;
-	}
-
-	/*
-	 * This thread may already have this vnode locked, if we faulted in
-	 * copyin() or copyout() on a region backed by this vnode
-	 * while doing I/O to the vnode. If this is the case, don't panic but
-	 * instead return an error; as dictated by the LK_RECURSEFAIL flag.
-	 *
-	 * XXX this is a stopgap to prevent a panic.
-	 * Ideally, this kind of operation *should* work.
-	 */
-	error = vn_lock(uvn->u_vnode, LK_EXCLUSIVE | LK_RECURSEFAIL);
-	if (netunlocked)
-		NET_LOCK();
-	return(error ? VM_PAGER_ERROR : VM_PAGER_OK);
-}
-
-void
-uvm_vnode_unlock(struct uvm_vnode *uvn)
-{
-	int error;
-
-	if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
-		VOP_UNLOCK(uvn->u_vnode);
-
-	error = uvn->u_flags & UVM_VNODE_IOERROR;
-	uvn->u_flags &= ~UVM_VNODE_IOERROR;
-	if (error) {
-		while (rebooting)
-			tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
-	}
 }
Index: uvm/uvm_vnode.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.h,v
retrieving revision 1.15
diff -u -p -r1.15 uvm_vnode.h
--- uvm/uvm_vnode.h	26 Oct 2020 19:48:19 -0000	1.15
+++ uvm/uvm_vnode.h	23 Feb 2021 10:43:25 -0000
@@ -84,7 +84,6 @@ struct uvm_vnode {
 					   i/o sync to clear so it can do i/o */
 #define UVM_VNODE_WRITEABLE	0x200	/* uvn has pages that are writeable */
-#define UVM_VNODE_IOERROR	0x400	/* i/o error occurred in uvn_io() */
 
 /*
  * UVM_VNODE_BLOCKED: any condition that should new processes from
Index: uvm/uvm_pager.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.29
diff -u -p -r1.29 uvm_pager.h
--- uvm/uvm_pager.h	11 Jul 2014 16:35:40 -0000	1.29
+++ uvm/uvm_pager.h	23 Feb 2021 10:46:26 -0000
@@ -111,6 +111,7 @@ struct uvm_pagerops {
 #define PGO_LOCKED	0x040	/* fault data structures are locked [get] */
 #define PGO_PDFREECLUST	0x080	/* daemon's free cluster flag [uvm_pager_put] */
 #define PGO_REALLOCSWAP	0x100	/* reallocate swap area [pager_dropcluster] */
+#define PGO_NOWAIT	0x200	/* do not wait for inode lock */
 
 /* page we are not interested in getting */
 #define PGO_DONTCARE	((struct vm_page *) -1L)	/* [get only] */