Author: kib
Date: Thu Jun 21 09:19:41 2012
New Revision: 237365
URL: http://svn.freebsd.org/changeset/base/237365

Log:
  Fix locking for f_offset, vn_read() and vn_write() cases only, for now.
  
  It seems that intended locking protocol for struct file f_offset field
  was as follows: f_offset should always be changed under the vnode lock
  (except fcntl(2) and lseek(2) did not follow the rules). Since
  read(2) uses shared vnode lock, FOFFSET_LOCKED block is additionally
  taken to serialize shared vnode lock owners.
  
  This was broken first by enabling shared lock on writes, then by
  fadvise changes, which moved the f_offset assignment from under the vnode lock,
  and last by vn_io_fault() doing chunked i/o. More, due to uio_offset
  not yet valid in vn_io_fault(), the range lock for reads was taken on
  the wrong region.
  
  Change the locking for f_offset to always use FOFFSET_LOCKED block,
  which is placed before rangelocks in the lock order.
  
  Extract foffset_lock() and foffset_unlock() functions which implement the
  FOFFSET_LOCKED lock, and consistently lock f_offset with it in the
  vn_io_fault() both for reads and writes, even if MNTK_NO_IOPF flag is
  not set for the vnode mount. Indicate that f_offset is already valid
  for vn_read() and vn_write() calls from vn_io_fault() with FOF_OFFSET
  flag, and assert that all callers of vn_read() and vn_write() follow
  this protocol.
  
  Extract get_advice() function to calculate the POSIX_FADV_XXX value
  for the i/o region, and use it where appropriate.
  
  Reviewed by:  jhb
  Tested by:    pho
  MFC after:    2 weeks

Modified:
  head/sys/kern/vfs_vnops.c

Modified: head/sys/kern/vfs_vnops.c
==============================================================================
--- head/sys/kern/vfs_vnops.c   Thu Jun 21 09:12:33 2012        (r237364)
+++ head/sys/kern/vfs_vnops.c   Thu Jun 21 09:19:41 2012        (r237365)
@@ -527,6 +527,66 @@ vn_rdwr_inchunks(rw, vp, base, len, offs
        return (error);
 }
 
+static void
+foffset_lock(struct file *fp, struct uio *uio, int flags)
+{
+       struct mtx *mtxp;
+
+       if ((flags & FOF_OFFSET) != 0)
+               return;
+
+       /*
+        * According to McKusick the vn lock was protecting f_offset here.
+        * It is now protected by the FOFFSET_LOCKED flag.
+        */
+       mtxp = mtx_pool_find(mtxpool_sleep, fp);
+       mtx_lock(mtxp);
+       while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+               fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+               msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+                   "vnread offlock", 0);
+       }
+       fp->f_vnread_flags |= FOFFSET_LOCKED;
+       uio->uio_offset = fp->f_offset;
+       mtx_unlock(mtxp);
+}
+
+static int
+get_advice(struct file *fp, struct uio *uio)
+{
+       struct mtx *mtxp;
+       int ret;
+
+       ret = POSIX_FADV_NORMAL;
+       if (fp->f_advice == NULL)
+               return (ret);
+
+       mtxp = mtx_pool_find(mtxpool_sleep, fp);
+       mtx_lock(mtxp);
+       if (uio->uio_offset >= fp->f_advice->fa_start &&
+           uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
+               ret = fp->f_advice->fa_advice;
+       mtx_unlock(mtxp);
+       return (ret);
+}
+
+static void
+foffset_unlock(struct file *fp, struct uio *uio, int flags)
+{
+       struct mtx *mtxp;
+
+       if ((flags & FOF_OFFSET) != 0)
+               return;
+
+       fp->f_offset = uio->uio_offset;
+       mtxp = mtx_pool_find(mtxpool_sleep, fp);
+       mtx_lock(mtxp);
+       if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+               wakeup(&fp->f_vnread_flags);
+       fp->f_vnread_flags = 0;
+       mtx_unlock(mtxp);
+}
+
 /*
  * File table vnode read routine.
  */
@@ -539,44 +599,22 @@ vn_read(fp, uio, active_cred, flags, td)
        struct thread *td;
 {
        struct vnode *vp;
-       int error, ioflag;
        struct mtx *mtxp;
+       int error, ioflag;
        int advice, vfslocked;
        off_t offset, start, end;
 
        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
-       mtxp = NULL;
+       KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
        vp = fp->f_vnode;
        ioflag = 0;
        if (fp->f_flag & FNONBLOCK)
                ioflag |= IO_NDELAY;
        if (fp->f_flag & O_DIRECT)
                ioflag |= IO_DIRECT;
-       advice = POSIX_FADV_NORMAL;
+       advice = get_advice(fp, uio);
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-       /*
-        * According to McKusick the vn lock was protecting f_offset here.
-        * It is now protected by the FOFFSET_LOCKED flag.
-        */
-       if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
-               mtxp = mtx_pool_find(mtxpool_sleep, fp);
-               mtx_lock(mtxp);
-               if ((flags & FOF_OFFSET) == 0) {
-                       while (fp->f_vnread_flags & FOFFSET_LOCKED) {
-                               fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-                               msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-                                   "vnread offlock", 0);
-                       }
-                       fp->f_vnread_flags |= FOFFSET_LOCKED;
-                       uio->uio_offset = fp->f_offset;
-               }
-               if (fp->f_advice != NULL &&
-                   uio->uio_offset >= fp->f_advice->fa_start &&
-                   uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-                       advice = fp->f_advice->fa_advice;
-               mtx_unlock(mtxp);
-       }
        vn_lock(vp, LK_SHARED | LK_RETRY);
 
        switch (advice) {
@@ -596,14 +634,6 @@ vn_read(fp, uio, active_cred, flags, td)
        if (error == 0)
 #endif
                error = VOP_READ(vp, uio, ioflag, fp->f_cred);
-       if ((flags & FOF_OFFSET) == 0) {
-               fp->f_offset = uio->uio_offset;
-               mtx_lock(mtxp);
-               if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
-                       wakeup(&fp->f_vnread_flags);
-               fp->f_vnread_flags = 0;
-               mtx_unlock(mtxp);
-       }
        fp->f_nextoff = uio->uio_offset;
        VOP_UNLOCK(vp, 0);
        if (error == 0 && advice == POSIX_FADV_NOREUSE &&
@@ -625,6 +655,7 @@ vn_read(fp, uio, active_cred, flags, td)
                 */
                start = offset;
                end = uio->uio_offset - 1;
+               mtxp = mtx_pool_find(mtxpool_sleep, fp);
                mtx_lock(mtxp);
                if (fp->f_advice != NULL &&
                    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
@@ -656,13 +687,14 @@ vn_write(fp, uio, active_cred, flags, td
 {
        struct vnode *vp;
        struct mount *mp;
-       int error, ioflag, lock_flags;
        struct mtx *mtxp;
+       int error, ioflag, lock_flags;
        int advice, vfslocked;
        off_t offset, start, end;
 
        KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
            uio->uio_td, td));
+       KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
        vp = fp->f_vnode;
        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
        if (vp->v_type == VREG)
@@ -681,6 +713,8 @@ vn_write(fp, uio, active_cred, flags, td
        if (vp->v_type != VCHR &&
            (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
                goto unlock;
+
+       advice = get_advice(fp, uio);
  
        if ((MNT_SHARED_WRITES(mp) ||
            ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
@@ -691,19 +725,6 @@ vn_write(fp, uio, active_cred, flags, td
        }
 
        vn_lock(vp, lock_flags | LK_RETRY);
-       if ((flags & FOF_OFFSET) == 0)
-               uio->uio_offset = fp->f_offset;
-       advice = POSIX_FADV_NORMAL;
-       mtxp = NULL;
-       if (fp->f_advice != NULL) {
-               mtxp = mtx_pool_find(mtxpool_sleep, fp);
-               mtx_lock(mtxp);
-               if (fp->f_advice != NULL &&
-                   uio->uio_offset >= fp->f_advice->fa_start &&
-                   uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
-                       advice = fp->f_advice->fa_advice;
-               mtx_unlock(mtxp);
-       }
        switch (advice) {
        case POSIX_FADV_NORMAL:
        case POSIX_FADV_SEQUENTIAL:
@@ -721,8 +742,6 @@ vn_write(fp, uio, active_cred, flags, td
        if (error == 0)
 #endif
                error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
-       if ((flags & FOF_OFFSET) == 0)
-               fp->f_offset = uio->uio_offset;
        fp->f_nextoff = uio->uio_offset;
        VOP_UNLOCK(vp, 0);
        if (vp->v_type != VCHR)
@@ -761,6 +780,7 @@ vn_write(fp, uio, active_cred, flags, td
                 */
                start = offset;
                end = uio->uio_offset - 1;
+               mtxp = mtx_pool_find(mtxpool_sleep, fp);
                mtx_lock(mtxp);
                if (fp->f_advice != NULL &&
                    fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
@@ -845,11 +865,15 @@ vn_io_fault(struct file *fp, struct uio 
        else
                doio = vn_write;
        vp = fp->f_vnode;
+       foffset_lock(fp, uio, flags);
+
        if (uio->uio_segflg != UIO_USERSPACE || vp->v_type != VREG ||
            ((mp = vp->v_mount) != NULL &&
            (mp->mnt_kern_flag & MNTK_NO_IOPF) == 0) ||
-           !vn_io_fault_enable)
-               return (doio(fp, uio, active_cred, flags, td));
+           !vn_io_fault_enable) {
+               error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
+               goto out_last;
+       }
 
        /*
         * The UFS follows IO_UNIT directive and replays back both
@@ -882,7 +906,7 @@ vn_io_fault(struct file *fp, struct uio 
        }
 
        save = vm_fault_disable_pagefaults();
-       error = doio(fp, uio, active_cred, flags, td);
+       error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
        if (error != EFAULT)
                goto out;
 
@@ -933,7 +957,8 @@ vn_io_fault(struct file *fp, struct uio 
                td->td_ma = ma;
                td->td_ma_cnt = cnt;
 
-               error = doio(fp, &short_uio, active_cred, flags, td);
+               error = doio(fp, &short_uio, active_cred, flags | FOF_OFFSET,
+                   td);
                vm_page_unhold_pages(ma, cnt);
                adv = len - short_uio.uio_resid;
 
@@ -956,6 +981,8 @@ out:
        vm_fault_enable_pagefaults(save);
        vn_rangelock_unlock(vp, rl_cookie);
        free(uio_clone, M_IOV);
+out_last:
+       foffset_unlock(fp, uio, flags);
        return (error);
 }
 
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to