Page faults on vnode-backed objects commonly end up calling VOP_READ(9) or
VOP_WRITE(9) to go through the buffer cache.  This implies grabbing the inode
lock after any UVM locking.  On the other hand, changing the size of a vnode
results in entering UVM, generally via uvm_vnp_setsize(), with the inode
already locked.  In other words, the two paths take the same two locks in
opposite order.
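To make the ordering conflict concrete, here is a minimal userspace sketch of
the two paths (an illustration only, not kernel code; the two pthread mutexes
stand in for `uobj->vmlock' and the inode lock, and the sleeps just widen the
race window):

/*
 * Two threads taking the same two locks in opposite order:
 * the deadlock shape described above.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

pthread_mutex_t vmlock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_t inodelock = PTHREAD_MUTEX_INITIALIZER;

/* page fault path: UVM locking first, then the inode lock for I/O */
void *
fault_path(void *arg)
{
	pthread_mutex_lock(&vmlock);
	sleep(1);
	pthread_mutex_lock(&inodelock);	/* VOP_READ()/VOP_WRITE() */
	puts("fault path ran");
	pthread_mutex_unlock(&inodelock);
	pthread_mutex_unlock(&vmlock);
	return NULL;
}

/* truncate path: inode already locked, then enter UVM */
void *
setsize_path(void *arg)
{
	pthread_mutex_lock(&inodelock);
	sleep(1);
	pthread_mutex_lock(&vmlock);	/* uvm_vnp_setsize() */
	puts("setsize path ran");
	pthread_mutex_unlock(&vmlock);
	pthread_mutex_unlock(&inodelock);
	return NULL;
}

int
main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, fault_path, NULL);
	pthread_create(&t2, NULL, setsize_path, NULL);
	pthread_join(t1, NULL);		/* never returns: each thread */
	pthread_join(t2, NULL);		/* blocks on the other's lock */
	return 0;
}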
Syzkaller exposed a deadlock of this kind that anton@ fixed in r1.108 of
uvm/uvm_vnode.c by making the page fault path grab the inode lock earlier.
Sadly, that change isn't compatible with the finer-grained locking required to
unlock the lower part of the UVM fault handler.

UVM makes use of the PG_BUSY flag to ask other threads not to touch a given
page.  This is done to keep ownership of a page after its associated lock has
been released.  This is currently hard to follow because the locking code has
been removed ;)  With the current fix, the PG_BUSY flag is set after grabbing
the inode lock, which creates a lock ordering problem since `uobj->vmlock' is
only released after setting the flag.

So the diff below takes a different approach: if the faulting thread finds
that the inode lock is contended, it stops there and restarts the fault.  This
has the side effect of un-busying the pages (clearing PG_BUSY) and allows the
other thread to make progress.  This is enough to move forward with
`uobj->vmlock' without changing the interaction between the existing buffer
cache and UVM locking (thanks!).
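To illustrate the pattern outside the kernel, here is a rough userspace sketch
of the trylock-and-restart idea (an analogy only; the names and the retry loop
are illustrative, not the actual kernel code).  A failed trylock corresponds
to vn_lock() returning EBUSY under LK_NOWAIT, which uvn_io() turns into
VM_PAGER_AGAIN, and the retry corresponds to restarting the fault after
un-busying the pages:

#include <errno.h>
#include <pthread.h>

enum { PAGER_OK, PAGER_AGAIN, PAGER_ERROR };

pthread_mutex_t inodelock = PTHREAD_MUTEX_INITIALIZER;

int
pager_io(void)
{
	/* like vn_lock(vn, LK_EXCLUSIVE | LK_NOWAIT) */
	if (pthread_mutex_trylock(&inodelock) == EBUSY) {
		/*
		 * Contended: bail out instead of sleeping while
		 * holding PG_BUSY pages, so the lock holder can
		 * enter UVM and make progress.
		 */
		return (PAGER_AGAIN);
	}
	/* ... do the I/O, i.e. VOP_READ()/VOP_WRITE() ... */
	pthread_mutex_unlock(&inodelock);
	return (PAGER_OK);
}

int
handle_fault(void)
{
	int result;

	do {
		/* mark pages PG_BUSY, try the I/O ... */
		result = pager_io();
		/* ... and un-busy them again on PAGER_AGAIN */
	} while (result == PAGER_AGAIN);
	return (result);
}

int
main(void)
{
	return (handle_fault() == PAGER_OK ? 0 : 1);
}

The key property is that between retries the faulting thread holds neither
the pages nor `uobj->vmlock', so whichever thread holds the inode lock can
enter UVM and complete.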
I couldn't trigger the deadlock with regress/sys/uvm/vnode with this diff.

Is the explanation clear enough?  Comments?  Oks?

Index: uvm/uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.108
diff -u -p -r1.108 uvm_vnode.c
--- uvm/uvm_vnode.c	26 Oct 2020 19:48:19 -0000	1.108
+++ uvm/uvm_vnode.c	23 Feb 2021 10:46:50 -0000
@@ -90,9 +90,6 @@ int		 uvn_io(struct uvm_vnode *, vm_page
 int		 uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
 void		 uvn_reference(struct uvm_object *);
 
-int		 uvm_vnode_lock(struct uvm_vnode *);
-void		 uvm_vnode_unlock(struct uvm_vnode *);
-
 /*
  * master pager structure
  */
@@ -878,16 +875,11 @@ uvn_cluster(struct uvm_object *uobj, vof
 int
 uvn_put(struct uvm_object *uobj, struct vm_page **pps, int npages, int flags)
 {
-	struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
 	int retval;
 
 	KERNEL_ASSERT_LOCKED();
 
-	retval = uvm_vnode_lock(uvn);
-	if (retval)
-		return(retval);
-	retval = uvn_io(uvn, pps, npages, flags, UIO_WRITE);
-	uvm_vnode_unlock(uvn);
+	retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE);
 
 	return(retval);
 }
@@ -905,10 +897,9 @@ int
 uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
     int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
 {
-	struct uvm_vnode *uvn = (struct uvm_vnode *)uobj;
 	voff_t current_offset;
 	struct vm_page *ptmp;
-	int lcv, result, gotpages, retval;
+	int lcv, result, gotpages;
 	boolean_t done;
 
 	KERNEL_ASSERT_LOCKED();
@@ -983,18 +974,6 @@ uvn_get(struct uvm_object *uobj, voff_t 
 	}
 
 	/*
-	 * Before getting non-resident pages which must be populate with data
-	 * using I/O on the backing vnode, lock the same vnode. Such pages are
-	 * about to be allocated and busied (i.e. PG_BUSY) by the current
-	 * thread. Allocating and busying the page(s) before acquiring the
-	 * vnode lock could cause a deadlock with uvn_flush() which acquires the
-	 * vnode lock before waiting on pages to become unbusy and then flushed.
-	 */
-	retval = uvm_vnode_lock(uvn);
-	if (retval)
-		return(retval);
-
-	/*
 	 * step 2: get non-resident or busy pages.
 	 * data structures are unlocked.
 	 *
@@ -1080,15 +1059,14 @@ uvn_get(struct uvm_object *uobj, voff_t 
 		 * we have a "fake/busy/clean" page that we just allocated.  do
 		 * I/O to fill it with valid data.
 		 */
-		result = uvn_io(uvn, &ptmp, 1, PGO_SYNCIO, UIO_READ);
+		result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1,
+		    PGO_SYNCIO|PGO_NOWAIT, UIO_READ);
 
 		/*
		 * I/O done.  because we used syncio the result can not be
		 * PEND or AGAIN.
		 */
 		if (result != VM_PAGER_OK) {
-			uvm_vnode_unlock(uvn);
-
 			if (ptmp->pg_flags & PG_WANTED)
 				wakeup(ptmp);
@@ -1119,15 +1097,12 @@ uvn_get(struct uvm_object *uobj, voff_t 
 	}
 
-	uvm_vnode_unlock(uvn);
-
 	return (VM_PAGER_OK);
 }
 
 /*
  * uvn_io: do I/O to a vnode
  *
- * => uvn: the backing vnode must be locked
  * => prefer map unlocked (not required)
  * => flags: PGO_SYNCIO -- use sync. I/O
  * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync.
@@ -1145,6 +1120,7 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
 	int waitf, result, mapinflags;
 	size_t got, wanted;
 	int netunlocked = 0;
+	int lkflags = (flags & PGO_NOWAIT) ? LK_NOWAIT : 0;
 
 	/* init values */
 	waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT;
@@ -1213,17 +1189,42 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
 	}
 
 	/* do the I/O!  (XXX: curproc?) */
-	if (rw == UIO_READ)
-		result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
-	else
-		result = VOP_WRITE(vn, &uio,
-		    (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
-		    curproc->p_ucred);
+	/*
+	 * This process may already have this vnode locked, if we faulted in
+	 * copyin() or copyout() on a region backed by this vnode
+	 * while doing I/O to the vnode.  If this is the case, don't
+	 * panic.. instead, return the error to the user.
+	 *
+	 * XXX this is a stopgap to prevent a panic.
+	 * Ideally, this kind of operation *should* work.
+	 */
+	result = 0;
+	if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
+		result = vn_lock(vn, LK_EXCLUSIVE | LK_RECURSEFAIL | lkflags);
+	if (result == 0) {
+		/* NOTE: vnode now locked! */
+		if (rw == UIO_READ)
+			result = VOP_READ(vn, &uio, 0, curproc->p_ucred);
+		else
+			result = VOP_WRITE(vn, &uio,
+			    (flags & PGO_PDFREECLUST) ? IO_NOCACHE : 0,
+			    curproc->p_ucred);
+
+		if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
+			VOP_UNLOCK(vn);
+
+	}
 
 	if (netunlocked)
 		NET_LOCK();
 
-	/* zero out rest of buffer (if needed) */
+
+	/* NOTE: vnode now unlocked (unless vnislocked) */
+	/*
+	 * result == unix style errno (0 == OK!)
+	 *
+	 * zero out rest of buffer (if needed)
+	 */
 	if (result == 0) {
 		got = wanted - uio.uio_resid;
@@ -1244,14 +1245,16 @@ uvn_io(struct uvm_vnode *uvn, vm_page_t 
 		wakeup(&uvn->u_nio);
 	}
 
-	if (result == 0)
+	if (result == 0) {
 		return(VM_PAGER_OK);
-
-	if (result == EIO) {
-		/* Signal back to uvm_vnode_unlock(). */
-		uvn->u_flags |= UVM_VNODE_IOERROR;
+	} else if (result == EBUSY) {
+		KASSERT(flags & PGO_NOWAIT);
+		return(VM_PAGER_AGAIN);
+	} else {
+		while (rebooting)
+			tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
+		return(VM_PAGER_ERROR);
 	}
-	return(VM_PAGER_ERROR);
 }
 
 /*
@@ -1467,53 +1470,4 @@ uvm_vnp_sync(struct mount *mp)
 	}
 
 	rw_exit_write(&uvn_sync_lock);
-}
-
-int
-uvm_vnode_lock(struct uvm_vnode *uvn)
-{
-	int error;
-	int netunlocked = 0;
-
-	if (uvn->u_flags & UVM_VNODE_VNISLOCKED)
-		return(VM_PAGER_OK);
-
-	/*
-	 * This thread may already have the net lock, if we faulted in copyin()
-	 * or copyout() in the network stack.
-	 */
-	if (rw_status(&netlock) == RW_WRITE) {
-		NET_UNLOCK();
-		netunlocked = 1;
-	}
-
-	/*
-	 * This thread may already have this vnode locked, if we faulted in
-	 * copyin() or copyout() on a region backed by this vnode
-	 * while doing I/O to the vnode. If this is the case, don't panic but
-	 * instead return an error; as dictated by the LK_RECURSEFAIL flag.
-	 *
-	 * XXX this is a stopgap to prevent a panic.
-	 * Ideally, this kind of operation *should* work.
-	 */
-	error = vn_lock(uvn->u_vnode, LK_EXCLUSIVE | LK_RECURSEFAIL);
-	if (netunlocked)
-		NET_LOCK();
-	return(error ? VM_PAGER_ERROR : VM_PAGER_OK);
-}
-
-void
-uvm_vnode_unlock(struct uvm_vnode *uvn)
-{
-	int error;
-
-	if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0)
-		VOP_UNLOCK(uvn->u_vnode);
-
-	error = uvn->u_flags & UVM_VNODE_IOERROR;
-	uvn->u_flags &= ~UVM_VNODE_IOERROR;
-	if (error) {
-		while (rebooting)
-			tsleep_nsec(&rebooting, PVM, "uvndead", INFSLP);
-	}
 }
Index: uvm/uvm_vnode.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.h,v
retrieving revision 1.15
diff -u -p -r1.15 uvm_vnode.h
--- uvm/uvm_vnode.h	26 Oct 2020 19:48:19 -0000	1.15
+++ uvm/uvm_vnode.h	23 Feb 2021 10:43:25 -0000
@@ -84,7 +84,6 @@ struct uvm_vnode {
 					   i/o sync to clear so it can do i/o */
 #define UVM_VNODE_WRITEABLE	0x200	/* uvn has pages that are writeable */
-#define UVM_VNODE_IOERROR	0x400	/* i/o error occurred in uvn_io() */
 
 /*
  * UVM_VNODE_BLOCKED: any condition that should new processes from
Index: uvm/uvm_pager.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.29
diff -u -p -r1.29 uvm_pager.h
--- uvm/uvm_pager.h	11 Jul 2014 16:35:40 -0000	1.29
+++ uvm/uvm_pager.h	23 Feb 2021 10:46:26 -0000
@@ -111,6 +111,7 @@ struct uvm_pagerops {
 #define PGO_LOCKED	0x040	/* fault data structures are locked [get] */
 #define PGO_PDFREECLUST	0x080	/* daemon's free cluster flag [uvm_pager_put] */
 #define PGO_REALLOCSWAP	0x100	/* reallocate swap area [pager_dropcluster] */
+#define PGO_NOWAIT	0x200	/* do not wait for inode lock */
 
 /* page we are not interested in getting */
 #define PGO_DONTCARE	((struct vm_page *) -1L)	/* [get only] */