On Fri, May 22, 2015 at 04:56:57PM +0000, Taylor R Campbell wrote:
>    Date: Fri, 22 May 2015 13:09:58 +0100
>    From: Patrick Welche <pr...@cam.ac.uk>
> 
>    There are a load of locking PRs involving vnd. I thought I would have a
>    stab at converting vnd to use condvars and mutexes (mutices?) as a first
>    step. I cobbled something together which allows me to
>    [...]
>    with a LOCKDEBUG kernel, so I may have guessed right, but I haven't found
>    any documentation on The Right Way, so please review carefully, assume
>    the worst, and tell me how to do it better!

I may have regressed: with the attached patch, I now get a locking
error in vndthread at:

        /*
         * Allocate a header for this transfer and link it to the
         * buffer
         */
        mutex_enter(&vnd->sc_sclock); <------------ XXX already locked?!
        vnx = VND_GETXFER(vnd); /* may sleep */
        mutex_exit(&vnd->sc_sclock);

(That is assuming that coredumps and gdb are accurate...)

> First thought: I'd suggest making a branch for this so you can work on
> it incrementally without breaking the world, or at least a local Git
> branch so you can work on separate commits for different issues.
> 
> Concur with all hannken@'s comments.  Small bug: you almost certainly
> need sc_sclock to be at IPL_BIO, not IPL_NONE, if you want it to
> replace splbio/splx pairs.  Minor nit: diff -pu, please.

Committed the whitespace trivia to remove distractions.
Changed to IPL_BIO - I wondered whether vnd shouldn't have a lower
priority than the filesystem of the file backing in it. I suppose
not as they both do i/o?
 
> You converted the VNF_LOCKED/VNF_WANTED flags into a mutex sc_sclock
> (why not just `sc_lock'?), but the semantics doesn't quite carry over:
> previously, vndlock would wait interruptibly, and the owner of
> VNF_LOCKED was allowed to do I/O.  But you can't do that with a mutex
> -- you can't wait interruptibly or do I/O while holding one.

I put vnd{,un}lock back, and hopefully converted it correctly to
condvar. So, after a call to vndlock, the lock was obtained
interruptibly and released.

> Right now you just drop sc_sclock around vndgetdisklabel, and don't
> take it for DIOCWDINFO &c., but then concurrent callers can stomp all
> over one another.
>
> You will probably have to introduce a new state, e.g. VNF_IO (or maybe
> just VNF_DISKLABELIO), and make anyone trying to use the disklabel or
> any other vnd-internal I/O wait for it to complete.

With vndlock back, the lock isn't held before vndgetdisklabel, so I
don't have to release it, and DIOCWDINFO is back within a vndlock/vndunlock
pair. If it was possible to acquire the lock, then nothing else should
be executing the code concurrently even though one isn't still holding
it? (readdisklabel calls vndstrategy which acquires the lock)

> 
>    - there was some sort of start kernel thread, thread sets variable,
>      meanwhile loop waiting for variable to be set. I removed that. On
>      the other hand I added a MUSTJOIN for the kernel thread exiting
>      side of things.
> 
> I suggest moving the kthread_join around a little bit.  Don't read
> sc_flags without the lock; instead, grab the thread while under the
> lock, and kthread_join it outside.

Is this what you mean:

                mutex_enter(&vnd->sc_sclock);
...
                kt = vnd->sc_kthread;
...
                mutex_exit(&vnd->sc_sclock);
                kthread_join(kt);


> I suspect you will need some way for vndset to wait until any pending
> vndclear is actually complete.  I don't see that immediately.  Note
> that you won't hit any of this in testing, no matter how hard you
> hammer on it, until you add D_MPSAFE to vnd_bdevsw/vnd_cdevsw.

I switched on all the MPSAFEty for all the fallout in this patch...
(Haven't considered vndset/clear yet)

>    - given that vnd is a pseudo-device, do I need an extra lock to care
>      of device creation/destruction? (What am I protecting against?
> 
> There is an MP safety issue with device_lookup_private: if you detach
> a driver just after someone does device_lookup_private, nothing causes
> the detach to block until the caller of device_lookup_private is done.
> But this is a general issue in autoconf, not a vnd-specific issue.  I
> don't, in my cursory glance, see any obvious MP safety issues in the
> attachment goo in your patch.

OK...

Thanks!

Patrick
Index: vndvar.h
===================================================================
RCS file: /cvsroot/src/sys/dev/vndvar.h,v
retrieving revision 1.34
diff -u -p -r1.34 vndvar.h
--- vndvar.h    25 May 2015 20:57:18 -0000      1.34
+++ vndvar.h    31 May 2015 11:52:59 -0000
@@ -107,14 +107,19 @@ struct vnode;
  */
 struct vnd_softc {
        device_t         sc_dev;
+       kmutex_t         sc_sclock;     /* protect this softc */
        int              sc_flags;      /* flags */
+       kcondvar_t       sc_intercv;    /* interruptible lock condvar */
        uint64_t         sc_size;       /* size of vnd */
        struct vnode    *sc_vp;         /* vnode */
        kauth_cred_t     sc_cred;       /* credentials */
-       int              sc_maxactive;  /* max # of active requests */
        struct bufq_state *sc_tab;      /* transfer queue */
+       kcondvar_t       sc_bufqcv;     /* queue's condvar */
        int              sc_pending;    /* number of pending transfers */
+       kcondvar_t       sc_pendingcv;  /* pending transfer condvar */
+       int              sc_maxactive;  /* max # of active requests */
        int              sc_active;     /* number of active transfers */
+       kcondvar_t       sc_activitycv; /* activity condvar */
        struct disk      sc_dkdev;      /* generic disk device info */
        struct vndgeom   sc_geom;       /* virtual geometry */
        struct pool      sc_vxpool;     /* vndxfer pool */
@@ -134,13 +139,11 @@ struct vnd_softc {
 #define        VNF_INITED      0x001   /* unit has been initialized */
 #define        VNF_WLABEL      0x002   /* label area is writable */
 #define        VNF_LABELLING   0x004   /* unit is currently being labelled */
-#define        VNF_WANTED      0x008   /* someone is waiting to obtain a lock 
*/
 #define        VNF_LOCKED      0x010   /* unit is locked */
 #define        VNF_READONLY    0x020   /* unit is read-only */
 #define        VNF_KLABEL      0x040   /* keep label on close */
 #define        VNF_VLABEL      0x080   /* label is valid */
 #define        VNF_KTHREAD     0x100   /* thread is running */
-#define        VNF_VUNCONF     0x200   /* device is unconfiguring */
 #define VNF_COMP       0x400   /* file is compressed */
 #define VNF_CLEARING   0x800   /* unit is being torn down */
 #define VNF_USE_VN_RDWR        0x1000  /* have to use vn_rdwr() */
Index: vnd.c
===================================================================
RCS file: /cvsroot/src/sys/dev/vnd.c,v
retrieving revision 1.245
diff -u -p -r1.245 vnd.c
--- vnd.c       25 May 2015 21:02:37 -0000      1.245
+++ vnd.c       31 May 2015 11:53:00 -0000
@@ -204,7 +204,7 @@ const struct bdevsw vnd_bdevsw = {
        .d_dump = vnddump,
        .d_psize = vndsize,
        .d_discard = nodiscard,
-       .d_flag = D_DISK
+       .d_flag = D_DISK | D_MPSAFE
 };
 
 const struct cdevsw vnd_cdevsw = {
@@ -219,7 +219,7 @@ const struct cdevsw vnd_cdevsw = {
        .d_mmap = nommap,
        .d_kqfilter = nokqfilter,
        .d_discard = nodiscard,
-       .d_flag = D_DISK
+       .d_flag = D_DISK | D_MPSAFE
 };
 
 static int     vnd_match(device_t, cfdata_t, void *);
@@ -241,12 +241,9 @@ static struct      dkdriver vnddkdriver = {
 void
 vndattach(int num)
 {
-       int error;
 
-       error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
-       if (error)
-               aprint_error("%s: unable to register cfattach\n",
-                   vnd_cd.cd_name);
+       if (config_cfattach_attach(vnd_cd.cd_name, &vnd_ca))
+               aprint_error("%s: unable to register\n", vnd_cd.cd_name);
 }
 
 static int
@@ -265,6 +262,11 @@ vnd_attach(device_t parent, device_t sel
        sc->sc_comp_offsets = NULL;
        sc->sc_comp_buff = NULL;
        sc->sc_comp_decombuf = NULL;
+       mutex_init(&sc->sc_sclock, MUTEX_DEFAULT, IPL_BIO);
+       cv_init(&sc->sc_intercv, "vndlck");
+       cv_init(&sc->sc_activitycv, "vndac");
+       cv_init(&sc->sc_bufqcv, "vndbp");
+       cv_init(&sc->sc_pendingcv, "vndpc");
        bufq_alloc(&sc->sc_tab, "disksort", BUFQ_SORT_RAWBLOCK);
        disk_init(&sc->sc_dkdev, device_xname(self), &vnddkdriver);
        if (!pmf_device_register(self, NULL, NULL))
@@ -284,6 +286,11 @@ vnd_detach(device_t self, int flags)
        }
 
        pmf_device_deregister(self);
+       mutex_destroy(&sc->sc_sclock);
+       cv_destroy(&sc->sc_intercv);
+       cv_destroy(&sc->sc_activitycv);
+       cv_destroy(&sc->sc_bufqcv);
+       cv_destroy(&sc->sc_pendingcv);
        bufq_free(sc->sc_tab);
        disk_destroy(&sc->sc_dkdev);
 
@@ -374,7 +381,9 @@ vndopen(dev_t dev, int flags, int mode, 
                         */
                        if ((sc->sc_flags & VNF_VLABEL) == 0) {
                                sc->sc_flags |= VNF_VLABEL;
+                               /* mutex_exit(&sc->sc_sclock); */
                                vndgetdisklabel(dev, sc);
+                               /* mutex_enter(&sc->sc_sclock); */
                        }
                }
        }
@@ -470,7 +479,8 @@ vndstrategy(struct buf *bp)
            device_lookup_private(&vnd_cd, unit);
        struct disklabel *lp;
        daddr_t blkno;
-       int s = splbio();
+
+       mutex_enter(&vnd->sc_sclock);
 
        if (vnd == NULL) {
                bp->b_error = ENXIO;
@@ -545,18 +555,20 @@ vndstrategy(struct buf *bp)
                KASSERT(vnd->sc_pending >= 0 &&
                    vnd->sc_pending <= VND_MAXPENDING(vnd));
                while (vnd->sc_pending == VND_MAXPENDING(vnd))
-                       tsleep(&vnd->sc_pending, PRIBIO, "vndpc", 0);
+                       cv_wait(&vnd->sc_pendingcv, &vnd->sc_sclock);
                vnd->sc_pending++;
        }
+       mutex_exit(&vnd->sc_sclock);
        bufq_put(vnd->sc_tab, bp);
-       wakeup(&vnd->sc_tab);
-       splx(s);
+       mutex_enter(&vnd->sc_sclock);
+       cv_signal(&vnd->sc_bufqcv);
+       mutex_exit(&vnd->sc_sclock);
        return;
 
 done:
+       mutex_exit(&vnd->sc_sclock);
        bp->b_resid = bp->b_bcount;
        biodone(bp);
-       splx(s);
 }
 
 static bool
@@ -601,7 +613,8 @@ static void
 vndthread(void *arg)
 {
        struct vnd_softc *vnd = arg;
-       int s;
+
+       mutex_enter(&vnd->sc_sclock);
 
        /* Determine whether we can *use* VOP_BMAP and VOP_STRATEGY to
         * directly access the backing vnode.  If we can, use these two
@@ -620,31 +633,27 @@ vndthread(void *arg)
                    "using read/write operations");
 #endif
 
-       s = splbio();
-       vnd->sc_flags |= VNF_KTHREAD;
-       wakeup(&vnd->sc_kthread);
-
        /*
         * Dequeue requests and serve them depending on the available
         * vnode operations.
         */
-       while ((vnd->sc_flags & VNF_VUNCONF) == 0) {
+       while ((vnd->sc_flags & VNF_KTHREAD) != 0) {
                struct vndxfer *vnx;
                struct buf *obp;
                struct buf *bp;
 
                obp = bufq_get(vnd->sc_tab);
                if (obp == NULL) {
-                       tsleep(&vnd->sc_tab, PRIBIO, "vndbp", 0);
+                       cv_wait(&vnd->sc_bufqcv, &vnd->sc_sclock);
                        continue;
                };
                if ((vnd->sc_flags & VNF_USE_VN_RDWR)) {
                        KASSERT(vnd->sc_pending > 0 &&
                            vnd->sc_pending <= VND_MAXPENDING(vnd));
                        if (vnd->sc_pending-- == VND_MAXPENDING(vnd))
-                               wakeup(&vnd->sc_pending);
+                               cv_signal(&vnd->sc_pendingcv);
                }
-               splx(s);
+               mutex_exit(&vnd->sc_sclock);
 #ifdef DEBUG
                if (vnddebug & VDB_FOLLOW)
                        printf("vndthread(%p)\n", obp);
@@ -672,17 +681,17 @@ vndthread(void *arg)
                 * Allocate a header for this transfer and link it to the
                 * buffer
                 */
-               s = splbio();
-               vnx = VND_GETXFER(vnd);
-               splx(s);
+               mutex_enter(&vnd->sc_sclock);
+               vnx = VND_GETXFER(vnd); /* may sleep */
+               mutex_exit(&vnd->sc_sclock);
                vnx->vx_vnd = vnd;
 
-               s = splbio();
+               mutex_enter(&vnd->sc_sclock);
                while (vnd->sc_active >= vnd->sc_maxactive) {
-                       tsleep(&vnd->sc_tab, PRIBIO, "vndac", 0);
+                       cv_wait(&vnd->sc_activitycv, &vnd->sc_sclock);
                }
                vnd->sc_active++;
-               splx(s);
+               mutex_exit(&vnd->sc_sclock);
 
                /* Instrumentation. */
                disk_busy(&vnd->sc_dkdev);
@@ -706,17 +715,15 @@ vndthread(void *arg)
                else
                        handle_with_rdwr(vnd, obp, bp);
 
-               s = splbio();
+               mutex_enter(&vnd->sc_sclock);
                continue;
 
 done:
                biodone(obp);
-               s = splbio();
+               mutex_enter(&vnd->sc_sclock);
        }
 
-       vnd->sc_flags &= (~VNF_KTHREAD | VNF_VUNCONF);
-       wakeup(&vnd->sc_kthread);
-       splx(s);
+       mutex_exit(&vnd->sc_sclock);
        kthread_exit(0);
 }
 
@@ -794,7 +801,7 @@ handle_with_rdwr(struct vnd_softc *vnd, 
 }
 
 /*
- * Handes the read/write request given in 'bp' using the vnode's VOP_BMAP
+ * Handles the read/write request given in 'bp' using the vnode's VOP_BMAP
  * and VOP_STRATEGY operations.
  *
  * 'obp' is a pointer to the original request fed to the vnd device.
@@ -917,7 +924,8 @@ vndiodone(struct buf *bp)
        struct vndxfer *vnx = VND_BUFTOXFER(bp);
        struct vnd_softc *vnd = vnx->vx_vnd;
        struct buf *obp = bp->b_private;
-       int s = splbio();
+
+       mutex_enter(&vnd->sc_sclock);
 
        KASSERT(&vnx->vx_buf == bp);
        KASSERT(vnd->sc_active > 0);
@@ -931,9 +939,9 @@ vndiodone(struct buf *bp)
            (bp->b_flags & B_READ));
        vnd->sc_active--;
        if (vnd->sc_active == 0) {
-               wakeup(&vnd->sc_tab);
+               cv_broadcast(&vnd->sc_activitycv);
        }
-       splx(s);
+       mutex_exit(&vnd->sc_sclock);
        obp->b_error = bp->b_error;
        obp->b_resid = bp->b_resid;
        buf_destroy(bp);
@@ -1163,7 +1171,7 @@ vndioctl(dev_t dev, u_long cmd, void *da
                }
                KASSERT(l);
                error = VOP_GETATTR(nd.ni_vp, &vattr, l->l_cred);
-               if (!error && nd.ni_vp->v_type != VREG)
+               if (!error && vattr.va_type != VREG)
                        error = EOPNOTSUPP;
                if (!error && vattr.va_bytes < vattr.va_size)
                        /* File is definitely sparse, use vn_rdwr() */
@@ -1359,13 +1367,14 @@ vndioctl(dev_t dev, u_long cmd, void *da
                        vio->vnd_size = dbtob(vnd->sc_size);
                vnd->sc_flags |= VNF_INITED;
 
-               /* create the kernel thread, wait for it to be up */
-               error = kthread_create(PRI_NONE, 0, NULL, vndthread, vnd,
-                   &vnd->sc_kthread, "%s", device_xname(vnd->sc_dev));
-               if (error)
+               vnd->sc_flags |= VNF_KTHREAD;
+               error = kthread_create(PRI_BIO,
+                   KTHREAD_MPSAFE | KTHREAD_MUSTJOIN, NULL,
+                   vndthread, vnd, &vnd->sc_kthread, "%s",
+                   device_xname(vnd->sc_dev));
+               if (error) {
+                       vnd->sc_flags &= ~VNF_KTHREAD;
                        goto close_and_exit;
-               while ((vnd->sc_flags & VNF_KTHREAD) == 0) {
-                       tsleep(&vnd->sc_kthread, PRIBIO, "vndthr", 0);
                }
 #ifdef DEBUG
                if (vnddebug & VDB_INIT)
@@ -1657,10 +1666,10 @@ vndshutdown(void)
 static void
 vndclear(struct vnd_softc *vnd, int myminor)
 {
+       struct lwp *kt;
        struct vnode *vp = vnd->sc_vp;
        int fflags = FREAD;
        int bmaj, cmaj, i, mn;
-       int s;
 
 #ifdef DEBUG
        if (vnddebug & VDB_FOLLOW)
@@ -1678,17 +1687,25 @@ vndclear(struct vnd_softc *vnd, int mymi
                        vdevgone(cmaj, mn, mn, VCHR);
        }
 
+       mutex_enter(&vnd->sc_sclock);
+
        if ((vnd->sc_flags & VNF_READONLY) == 0)
                fflags |= FWRITE;
 
-       s = splbio();
        bufq_drain(vnd->sc_tab);
-       splx(s);
+       cv_broadcast(&vnd->sc_bufqcv);
 
-       vnd->sc_flags |= VNF_VUNCONF;
-       wakeup(&vnd->sc_tab);
-       while (vnd->sc_flags & VNF_KTHREAD)
-               tsleep(&vnd->sc_kthread, PRIBIO, "vnthr", 0);
+       mutex_exit(&vnd->sc_sclock);
+
+       if (vnd->sc_flags & VNF_KTHREAD) {
+               mutex_enter(&vnd->sc_sclock);
+               vnd->sc_flags &= ~VNF_KTHREAD;
+               kt = vnd->sc_kthread;
+               cv_signal(&vnd->sc_bufqcv);
+               cv_signal(&vnd->sc_activitycv);
+               mutex_exit(&vnd->sc_sclock);
+               kthread_join(kt);
+       }
 
 #ifdef VND_COMPRESSION
        /* free the compressed file buffers */
@@ -1709,7 +1726,7 @@ vndclear(struct vnd_softc *vnd, int mymi
 #endif /* VND_COMPRESSION */
        vnd->sc_flags &=
            ~(VNF_INITED | VNF_READONLY | VNF_KLABEL | VNF_VLABEL
-             | VNF_VUNCONF | VNF_COMP | VNF_CLEARING);
+             | VNF_COMP | VNF_CLEARING);
        if (vp == NULL)
                panic("vndclear: null vp");
        (void) vn_close(vp, fflags, vnd->sc_cred);
@@ -1802,6 +1819,46 @@ vndgetdefaultlabel(struct vnd_softc *sc,
 }
 
 /*
+ * Wait interruptibly for an exclusive lock.
+ *
+ * XXX
+ * Several drivers do this; it should be abstracted and made MP-safe.
+ */
+static int
+vndlock(struct vnd_softc *sc)
+{
+       int error;
+
+       mutex_enter(&sc->sc_sclock);
+
+       error = 0;
+       while ((sc->sc_flags & VNF_LOCKED) != 0) {
+               error = cv_wait_sig(&sc->sc_intercv, &sc->sc_sclock);
+               if (error != 0)
+                       break;
+       }
+       if (error == 0)
+               sc->sc_flags |= VNF_LOCKED;
+
+       mutex_exit(&sc->sc_sclock);
+
+       return error;
+}
+
+/*
+ * Unlock and wake up any waiters.
+ */
+static void
+vndunlock(struct vnd_softc *sc)
+{
+
+       mutex_enter(&sc->sc_sclock);
+       sc->sc_flags &= ~VNF_LOCKED;
+       cv_signal(&sc->sc_intercv);
+       mutex_exit(&sc->sc_sclock);
+}
+
+/*
  * Read the disklabel from a vnd.  If one is not present, create a fake one.
  */
 static void
@@ -1853,40 +1910,6 @@ vndgetdisklabel(dev_t dev, struct vnd_so
        }
 }
 
-/*
- * Wait interruptibly for an exclusive lock.
- *
- * XXX
- * Several drivers do this; it should be abstracted and made MP-safe.
- */
-static int
-vndlock(struct vnd_softc *sc)
-{
-       int error;
-
-       while ((sc->sc_flags & VNF_LOCKED) != 0) {
-               sc->sc_flags |= VNF_WANTED;
-               if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
-                       return error;
-       }
-       sc->sc_flags |= VNF_LOCKED;
-       return 0;
-}
-
-/*
- * Unlock and wake up any waiters.
- */
-static void
-vndunlock(struct vnd_softc *sc)
-{
-
-       sc->sc_flags &= ~VNF_LOCKED;
-       if ((sc->sc_flags & VNF_WANTED) != 0) {
-               sc->sc_flags &= ~VNF_WANTED;
-               wakeup(sc);
-       }
-}
-
 #ifdef VND_COMPRESSION
 /* compressed file read */
 static void
@@ -2051,7 +2074,7 @@ vnd_modcmd(modcmd_t cmd, void *arg)
                error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
                if (error) {
                        config_cfdriver_detach(&vnd_cd);
-                       aprint_error("%s: unable to register cfattach\n",
+                       aprint_error("%s: unable to register\n",
                            vnd_cd.cd_name);
                        break;
                }

Reply via email to