git: eb60ff1ee16a - main - unionfs: rework locking scheme to only lock a single vnode

Jason A. Harmening Fri, 12 Jul 2024 18:40:04 -0700

The branch main has been updated by jah:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=eb60ff1ee16a7d22415d6065d808ddf0b5c13d7a


commit eb60ff1ee16a7d22415d6065d808ddf0b5c13d7a
Author:     Jason A. Harmening <[email protected]>
AuthorDate: 2024-02-28 17:45:56 +0000
Commit:     Jason A. Harmening <[email protected]>
CommitDate: 2024-07-12 22:05:16 +0000

    unionfs: rework locking scheme to only lock a single vnode
    
    Instead of locking both the lower and upper vnodes, which is both
    complex and deadlock-prone, only lock the upper vnode, or the lower
    vnode if no upper vnode is present.
    
    In most cases this is all that is needed; for the cases in which
    both vnodes do need to be locked, this change also employs deadlock-
    avoiding techniques such as LK_NOWAIT and vn_lock_pair().
    
    There are still some corner cases in which the current implementation
    ends up taking multiple vnode locks across different filesystems
    without taking special steps to avoid deadlock; those cases have
    been noted in the comments.
    
    Differential Revision:  https://reviews.freebsd.org/D45398
    Reviewed by:            olce
    Tested by:              pho
---
 sys/fs/unionfs/union.h        |  32 +-
 sys/fs/unionfs/union_subr.c   | 768 +++++++++++++++++++++--------------
 sys/fs/unionfs/union_vfsops.c |  15 +-
 sys/fs/unionfs/union_vnops.c  | 910 +++++++++++++++++++++++-------------------
 4 files changed, 979 insertions(+), 746 deletions(-)

diff --git a/sys/fs/unionfs/union.h b/sys/fs/unionfs/union.h
index 467db3b29ff8..c535a3b288b8 100644
--- a/sys/fs/unionfs/union.h
+++ b/sys/fs/unionfs/union.h
@@ -97,15 +97,17 @@ struct unionfs_node {
 
        char           *un_path;                /* path */
        int             un_pathlen;             /* strlen of path */
-       int             un_flag;                /* unionfs node flag */
-};
 
-/*
- * unionfs node flags
- * It needs the vnode with exclusive lock, when changing the un_flag variable.
- */
-#define UNIONFS_OPENEXTL       0x01    /* openextattr (lower) */
-#define UNIONFS_OPENEXTU       0x02    /* openextattr (upper) */
+       /*
+        * unionfs node flags
+        * Changing these flags requires the vnode to be locked exclusive.
+        */
+       #define UNIONFS_OPENEXTL                0x01    /* openextattr (lower) 
*/
+       #define UNIONFS_OPENEXTU                0x02    /* openextattr (upper) 
*/
+       #define UNIONFS_COPY_IN_PROGRESS        0x04    /* copy/dir shadow in 
progres */
+       #define UNIONFS_LOOKUP_IN_PROGRESS      0x08
+       unsigned int    un_flag;                /* unionfs node flag */
+};
 
 extern struct vop_vector unionfs_vnodeops;
 
@@ -136,29 +138,25 @@ void      unionfs_get_node_status(struct unionfs_node *, 
struct thread *,
 void   unionfs_tryrem_node_status(struct unionfs_node *,
            struct unionfs_node_status *);
 int    unionfs_check_rmdir(struct vnode *, struct ucred *, struct thread *td);
-int    unionfs_copyfile(struct unionfs_node *, int, struct ucred *,
+int    unionfs_copyfile(struct vnode *, int, struct ucred *,
            struct thread *);
 void   unionfs_create_uppervattr_core(struct unionfs_mount *, struct vattr *,
            struct vattr *, struct thread *);
 int    unionfs_create_uppervattr(struct unionfs_mount *, struct vnode *,
            struct vattr *, struct ucred *, struct thread *);
-int    unionfs_mkshadowdir(struct unionfs_mount *, struct vnode *,
-           struct unionfs_node *, struct componentname *, struct thread *);
+int    unionfs_mkshadowdir(struct vnode *, struct vnode *,
+           struct componentname *, struct thread *);
 int    unionfs_mkwhiteout(struct vnode *, struct vnode *,
            struct componentname *, struct thread *, char *, int);
 int    unionfs_relookup(struct vnode *, struct vnode **,
            struct componentname *, struct componentname *, struct thread *,
            char *, int, u_long);
-int    unionfs_relookup_for_create(struct vnode *, struct componentname *,
-           struct thread *);
-int    unionfs_relookup_for_delete(struct vnode *, struct componentname *,
-           struct thread *);
-int    unionfs_relookup_for_rename(struct vnode *, struct componentname *,
-           struct thread *);
 void   unionfs_forward_vop_start_pair(struct vnode *, int *,
            struct vnode *, int *);
 bool   unionfs_forward_vop_finish_pair(struct vnode *, struct vnode *, int,
            struct vnode *, struct vnode *, int);
+int    unionfs_set_in_progress_flag(struct vnode *, unsigned int);
+void   unionfs_clear_in_progress_flag(struct vnode *, unsigned int);
 
 static inline void
 unionfs_forward_vop_start(struct vnode *basevp, int *lkflags)
diff --git a/sys/fs/unionfs/union_subr.c b/sys/fs/unionfs/union_subr.c
index bb57f3d56ade..671322704dc5 100644
--- a/sys/fs/unionfs/union_subr.c
+++ b/sys/fs/unionfs/union_subr.c
@@ -203,19 +203,19 @@ unionfs_ins_cached_vnode(struct unionfs_node *uncp,
        struct unionfs_node_hashhead *hd;
        struct vnode *vp;
 
-       ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
-       ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
-       KASSERT(uncp->un_uppervp == NULLVP || uncp->un_uppervp->v_type == VDIR,
-           ("%s: v_type != VDIR", __func__));
-       KASSERT(uncp->un_lowervp == NULLVP || uncp->un_lowervp->v_type == VDIR,
-           ("%s: v_type != VDIR", __func__));
-
        vp = NULLVP;
        VI_LOCK(dvp);
-       if (uncp->un_uppervp != NULL)
+       if (uncp->un_uppervp != NULLVP) {
+               ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
+               KASSERT(uncp->un_uppervp->v_type == VDIR,
+                   ("%s: v_type != VDIR", __func__));
                vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
-       else if (uncp->un_lowervp != NULL)
+       } else if (uncp->un_lowervp != NULLVP) {
+               ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
+               KASSERT(uncp->un_lowervp->v_type == VDIR,
+                   ("%s: v_type != VDIR", __func__));
                vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
+       }
        if (vp == NULLVP) {
                hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ?
                    uncp->un_uppervp : uncp->un_lowervp));
@@ -276,9 +276,11 @@ unionfs_nodeget_cleanup(struct vnode *vp, struct 
unionfs_node *unp)
 
        if (unp->un_dvp != NULLVP)
                vrele(unp->un_dvp);
-       if (unp->un_uppervp != NULLVP)
+       if (unp->un_uppervp != NULLVP) {
                vput(unp->un_uppervp);
-       if (unp->un_lowervp != NULLVP)
+               if (unp->un_lowervp != NULLVP)
+                       vrele(unp->un_lowervp);
+       } else if (unp->un_lowervp != NULLVP)
                vput(unp->un_lowervp);
        if (unp->un_hashtbl != NULL)
                hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
@@ -314,7 +316,7 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
        *vpp = NULLVP;
 
        if (uppervp == NULLVP && lowervp == NULLVP)
-               panic("%s: upper and lower is null", __func__);
+               panic("%s: upper and lower are both null", __func__);
 
        vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type);
 
@@ -327,7 +329,9 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
                vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
                if (vp != NULLVP) {
                        *vpp = vp;
-                       goto unionfs_nodeget_out;
+                       if (lkflags != 0)
+                               vn_lock(*vpp, lkflags | LK_RETRY);
+                       return (0);
                }
        }
 
@@ -385,27 +389,47 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
        KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
            ("%s: NULL dvp for non-root vp %p", __func__, vp));
 
-       vn_lock_pair(lowervp, false, LK_EXCLUSIVE, uppervp, false,
-           LK_EXCLUSIVE);
+
+       /*
+        * NOTE: There is still a possibility for cross-filesystem locking here.
+        * If dvp has an upper FS component and is locked, while the new vnode
+        * created here only has a lower-layer FS component, then we will end
+        * up taking a lower-FS lock while holding an upper-FS lock.
+        * That situation could be dealt with here using vn_lock_pair().
+        * However, that would only address one instance out of many in which
+        * a child vnode lock is taken while holding a lock on its parent
+        * directory. This is done in many places in common VFS code, as well as
+        * a few places within unionfs (which could lead to the same cross-FS
+        * locking issue if, for example, the upper FS is another nested unionfs
+        * instance).  Additionally, it is unclear under what circumstances this
+        * specific lock sequence (a directory on one FS followed by a child of
+        * its 'peer' directory on another FS) would present the practical
+        * possibility of deadlock due to some other agent on the system
+        * attempting to lock those two specific vnodes in the opposite order.
+        */
+       if (uppervp != NULLVP)
+               vn_lock(uppervp, LK_EXCLUSIVE | LK_RETRY);
+       else
+               vn_lock(lowervp, LK_EXCLUSIVE | LK_RETRY);
        error = insmntque1(vp, mp);
        if (error != 0) {
                unionfs_nodeget_cleanup(vp, unp);
                return (error);
        }
-       if (lowervp != NULL && VN_IS_DOOMED(lowervp)) {
-               vput(lowervp);
-               unp->un_lowervp = lowervp = NULL;
-       }
-       if (uppervp != NULL && VN_IS_DOOMED(uppervp)) {
-               vput(uppervp);
-               unp->un_uppervp = uppervp = NULL;
-               if (lowervp != NULLVP)
-                       vp->v_vnlock = lowervp->v_vnlock;
-       }
-       if (lowervp == NULL && uppervp == NULL) {
-               unionfs_nodeget_cleanup(vp, unp);
-               return (ENOENT);
-       }
+       /*
+        * lowervp and uppervp should only be doomed by a forced unmount of
+        * their respective filesystems, but that can only happen if the
+        * unionfs instance is first unmounted.  We also effectively hold the
+        * lock on the new unionfs vnode at this point.  Therefore, if a
+        * unionfs umount has not yet reached the point at which the above
+        * insmntque1() would fail, then its vflush() call will end up
+        * blocked on our vnode lock, effectively also preventing unmount
+        * of the underlying filesystems.
+        */
+       VNASSERT(lowervp == NULLVP || !VN_IS_DOOMED(lowervp), vp,
+           ("%s: doomed lowervp %p", __func__, lowervp));
+       VNASSERT(uppervp == NULLVP || !VN_IS_DOOMED(uppervp), vp,
+           ("%s: doomed lowervp %p", __func__, uppervp));
 
        vn_set_state(vp, VSTATE_CONSTRUCTED);
 
@@ -413,18 +437,16 @@ unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
                *vpp = unionfs_ins_cached_vnode(unp, dvp);
        if (*vpp != NULLVP) {
                unionfs_nodeget_cleanup(vp, unp);
-               vp = *vpp;
-       } else {
-               if (uppervp != NULL)
-                       VOP_UNLOCK(uppervp);
-               if (lowervp != NULL)
-                       VOP_UNLOCK(lowervp);
+               if (lkflags != 0)
+                       vn_lock(*vpp, lkflags | LK_RETRY);
+               return (0);
+       } else
                *vpp = vp;
-       }
 
-unionfs_nodeget_out:
-       if (lkflags & LK_TYPE_MASK)
-               vn_lock(vp, lkflags | LK_RETRY);
+       if ((lkflags & LK_SHARED) != 0)
+               vn_lock(vp, LK_DOWNGRADE);
+       else if ((lkflags & LK_EXCLUSIVE) == 0)
+               VOP_UNLOCK(vp);
 
        return (0);
 }
@@ -443,6 +465,7 @@ unionfs_noderem(struct vnode *vp)
        struct vnode   *dvp;
        int             count;
        int             writerefs;
+       bool            unlock_lvp;
 
        /*
         * The root vnode lock may be recursed during unmount, because
@@ -455,18 +478,36 @@ unionfs_noderem(struct vnode *vp)
         */
        KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
            ("%s: vnode %p locked recursively", __func__, vp));
+
+       unp = VTOUNIONFS(vp);
+       VNASSERT(unp != NULL, vp, ("%s: already reclaimed", __func__));
+       lvp = unp->un_lowervp;
+       uvp = unp->un_uppervp;
+       dvp = unp->un_dvp;
+       unlock_lvp = (uvp == NULLVP);
+
+       /*
+        * Lock the lower vnode in addition to the upper vnode lock in order
+        * to synchronize against any unionfs_lock() operation which may still
+        * hold the lower vnode lock.  We do not need to do this for the root
+        * vnode, as the root vnode should always have both upper and lower
+        * base vnodes for its entire lifecycled, so unionfs_lock() should
+        * never attempt to lock its lower vnode in the first place.
+        * Moreover, during unmount of a non-"below" unionfs mount, the lower
+        * root vnode will already be locked as it is the covered vnode.
+        */
+       if (uvp != NULLVP && lvp != NULLVP && (vp->v_vflag & VV_ROOT) == 0) {
+               vn_lock_pair(uvp, true, LK_EXCLUSIVE, lvp, false, LK_EXCLUSIVE);
+               unlock_lvp = true;
+       }
+
        if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                panic("%s: failed to acquire lock for vnode lock", __func__);
-
        /*
         * Use the interlock to protect the clearing of v_data to
         * prevent faults in unionfs_lock().
         */
        VI_LOCK(vp);
-       unp = VTOUNIONFS(vp);
-       lvp = unp->un_lowervp;
-       uvp = unp->un_uppervp;
-       dvp = unp->un_dvp;
        unp->un_lowervp = unp->un_uppervp = NULLVP;
        vp->v_vnlock = &(vp->v_lock);
        vp->v_data = NULL;
@@ -502,18 +543,16 @@ unionfs_noderem(struct vnode *vp)
                    ("%s: write reference without upper vnode", __func__));
                VOP_ADD_WRITECOUNT(uvp, -writerefs);
        }
-       if (lvp != NULLVP)
-               VOP_UNLOCK(lvp);
        if (uvp != NULLVP)
-               VOP_UNLOCK(uvp);
+               vput(uvp);
+       if (unlock_lvp)
+               vput(lvp);
+       else if (lvp != NULLVP)
+               vrele(lvp);
 
        if (dvp != NULLVP)
                unionfs_rem_cached_vnode(unp, dvp);
 
-       if (lvp != NULLVP)
-               vrele(lvp);
-       if (uvp != NULLVP)
-               vrele(uvp);
        if (unp->un_path != NULL) {
                free(unp->un_path, M_UNIONFSPATH);
                unp->un_path = NULL;
@@ -696,110 +735,6 @@ unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
        return (error);
 }
 
-/*
- * relookup for CREATE namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- *
- * If it called 'unionfs_copyfile' function by unionfs_link etc,
- * VOP_LOOKUP information is broken.
- * So it need relookup in order to create link etc.
- */
-int
-unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp,
-    struct thread *td)
-{
-       struct vnode *udvp;
-       struct vnode *vp;
-       struct componentname cn;
-       int error;
-
-       udvp = UNIONFSVPTOUPPERVP(dvp);
-       vp = NULLVP;
-
-       error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
-           cnp->cn_namelen, CREATE);
-       if (error)
-               return (error);
-
-       if (vp != NULLVP) {
-               if (udvp == vp)
-                       vrele(vp);
-               else
-                       vput(vp);
-
-               error = EEXIST;
-       }
-
-       return (error);
-}
-
-/*
- * relookup for DELETE namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- */
-int
-unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp,
-    struct thread *td)
-{
-       struct vnode *udvp;
-       struct vnode *vp;
-       struct componentname cn;
-       int error;
-
-       udvp = UNIONFSVPTOUPPERVP(dvp);
-       vp = NULLVP;
-
-       error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
-           cnp->cn_namelen, DELETE);
-       if (error)
-               return (error);
-
-       if (vp == NULLVP)
-               error = ENOENT;
-       else {
-               if (udvp == vp)
-                       vrele(vp);
-               else
-                       vput(vp);
-       }
-
-       return (error);
-}
-
-/*
- * relookup for RENAME namei operation.
- *
- * dvp is unionfs vnode. dvp should be locked.
- */
-int
-unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp,
-    struct thread *td)
-{
-       struct vnode *udvp;
-       struct vnode *vp;
-       struct componentname cn;
-       int error;
-
-       udvp = UNIONFSVPTOUPPERVP(dvp);
-       vp = NULLVP;
-
-       error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
-           cnp->cn_namelen, RENAME);
-       if (error)
-               return (error);
-
-       if (vp != NULLVP) {
-               if (udvp == vp)
-                       vrele(vp);
-               else
-                       vput(vp);
-       }
-
-       return (error);
-}
-
 /*
  * Update the unionfs_node.
  * 
@@ -836,6 +771,8 @@ unionfs_node_update(struct unionfs_node *unp, struct vnode 
*uvp,
        vp->v_vnlock = uvp->v_vnlock;
        VI_UNLOCK(vp);
 
+       for (count = 0; count < lockrec + 1; count++)
+               VOP_UNLOCK(lvp);
        /*
         * Re-cache the unionfs vnode against the upper vnode
         */
@@ -850,19 +787,88 @@ unionfs_node_update(struct unionfs_node *unp, struct 
vnode *uvp,
        }
 }
 
+/*
+ * Mark a unionfs operation as being in progress, sleeping if the
+ * same operation is already in progress.
+ * This is useful, for example, during copy-up operations in which
+ * we may drop the target vnode lock, but we want to avoid the
+ * possibility of a concurrent copy-up on the same vnode triggering
+ * a spurious failure.
+ */
+int
+unionfs_set_in_progress_flag(struct vnode *vp, unsigned int flag)
+{
+       struct unionfs_node *unp;
+       int error;
+
+       error = 0;
+       ASSERT_VOP_ELOCKED(vp, __func__);
+       VI_LOCK(vp);
+       unp = VTOUNIONFS(vp);
+       while (error == 0 && (unp->un_flag & flag) != 0) {
+               VOP_UNLOCK(vp);
+               error = msleep(vp, VI_MTX(vp), PCATCH | PDROP, "unioncp", 0);
+               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+               VI_LOCK(vp);
+               if (error == 0) {
+                       /*
+                        * If we waited on a concurrent copy-up and that
+                        * copy-up was successful, return a non-fatal
+                        * indication that the desired operation is already
+                        * complete.  If we waited on a concurrent lookup,
+                        * return ERELOOKUP to indicate the VFS cache should
+                        * be re-queried to avoid creating a duplicate unionfs
+                        * vnode.
+                        */
+                       unp = VTOUNIONFS(vp);
+                       if (unp == NULL)
+                               error = ENOENT;
+                       else if (flag == UNIONFS_COPY_IN_PROGRESS &&
+                           unp->un_uppervp != NULLVP)
+                               error = EJUSTRETURN;
+                       else if (flag == UNIONFS_LOOKUP_IN_PROGRESS)
+                               error = ERELOOKUP;
+               }
+       }
+       if (error == 0)
+               unp->un_flag |= flag;
+       VI_UNLOCK(vp);
+
+       return (error);
+}
+
+void
+unionfs_clear_in_progress_flag(struct vnode *vp, unsigned int flag)
+{
+       struct unionfs_node *unp;
+
+       ASSERT_VOP_ELOCKED(vp, __func__);
+       unp = VTOUNIONFS(vp);
+       VI_LOCK(vp);
+       if (unp != NULL) {
+               VNASSERT((unp->un_flag & flag) != 0, vp,
+                   ("%s: copy not in progress", __func__));
+               unp->un_flag &= ~flag;
+       }
+       wakeup(vp);
+       VI_UNLOCK(vp);
+}
+
 /*
  * Create a new shadow dir.
  * 
- * udvp should be locked on entry and will be locked on return.
+ * dvp and vp are unionfs vnodes representing a parent directory and
+ * child file, should be locked on entry, and will be locked on return.
  * 
  * If no error returned, unp will be updated.
  */
 int
-unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
-    struct unionfs_node *unp, struct componentname *cnp, struct thread *td)
+unionfs_mkshadowdir(struct vnode *dvp, struct vnode *vp,
+    struct componentname *cnp, struct thread *td)
 {
        struct vnode   *lvp;
        struct vnode   *uvp;
+       struct vnode   *udvp;
        struct vattr    va;
        struct vattr    lva;
        struct nameidata nd;
@@ -870,10 +876,25 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct 
vnode *udvp,
        struct ucred   *cred;
        struct ucred   *credbk;
        struct uidinfo *rootinfo;
+       struct unionfs_mount *ump;
+       struct unionfs_node *dunp;
+       struct unionfs_node *unp;
        int             error;
 
+       ASSERT_VOP_ELOCKED(dvp, __func__);
+       ASSERT_VOP_ELOCKED(vp, __func__);
+       ump = MOUNTTOUNIONFSMOUNT(vp->v_mount);
+       unp = VTOUNIONFS(vp);
        if (unp->un_uppervp != NULLVP)
                return (EEXIST);
+       dunp = VTOUNIONFS(dvp);
+       udvp = dunp->un_uppervp;
+
+       error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
+       if (error == EJUSTRETURN)
+               return (0);
+       else if (error != 0)
+               return (error);
 
        lvp = unp->un_lowervp;
        uvp = NULLVP;
@@ -897,11 +918,29 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct 
vnode *udvp,
        NDPREINIT(&nd);
 
        if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
-               goto unionfs_mkshadowdir_abort;
+               goto unionfs_mkshadowdir_finish;
 
+       vref(udvp);
+       VOP_UNLOCK(vp);
        if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
-           cnp->cn_nameptr, cnp->cn_namelen, CREATE)))
-               goto unionfs_mkshadowdir_abort;
+           cnp->cn_nameptr, cnp->cn_namelen, CREATE))) {
+               /*
+                * When handling error cases here, we drop udvp's lock and
+                * then jump to exit code that relocks dvp, which in most
+                * cases will effectively relock udvp.  However, this is
+                * not guaranteed to be the case, as various calls made
+                * here (such as unionfs_relookup() above and VOP_MKDIR()
+                * below) may unlock and then relock udvp, allowing dvp to
+                * be reclaimed in the meantime.  In such a situation dvp
+                * will no longer share its lock with udvp.  Since
+                * performance isn't a concern for these error cases, it
+                * makes more sense to reuse the common code that locks
+                * dvp on exit than to explicitly check for reclamation
+                * of dvp.
+                */
+               vput(udvp);
+               goto unionfs_mkshadowdir_relock;
+       }
        if (uvp != NULLVP) {
                if (udvp == uvp)
                        vrele(uvp);
@@ -909,11 +948,14 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct 
vnode *udvp,
                        vput(uvp);
 
                error = EEXIST;
-               goto unionfs_mkshadowdir_abort;
+               vput(udvp);
+               goto unionfs_mkshadowdir_relock;
        }
 
-       if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
-               goto unionfs_mkshadowdir_abort;
+       if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH))) {
+               vput(udvp);
+               goto unionfs_mkshadowdir_relock;
+       }
        unionfs_create_uppervattr_core(ump, &lva, &va, td);
 
        /*
@@ -924,7 +966,7 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode 
*udvp,
         * component.  This *should* be fine, as cn_namelen will still
         * correctly indicate the length of only the current component,
         * but ZFS in particular does not respect cn_namelen in its VOP_MKDIR
-        * implementation
+        * implementation.
         * Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
         * something like a local namei() operation and the temporary
         * NUL-termination will not have an effect on other threads.
@@ -934,27 +976,58 @@ unionfs_mkshadowdir(struct unionfs_mount *ump, struct 
vnode *udvp,
        *pathend = '\0';
        error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
        *pathend = pathterm;
-
-       if (!error) {
-               /*
-                * XXX The bug which cannot set uid/gid was corrected.
-                * Ignore errors.
-                */
-               va.va_type = VNON;
-               VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
-
+       if (error != 0) {
                /*
-                * VOP_SETATTR() may transiently drop uvp's lock, so it's
-                * important to call it before unionfs_node_update() transfers
-                * the unionfs vnode's lock from lvp to uvp; otherwise the
-                * unionfs vnode itself would be transiently unlocked and
-                * potentially doomed.
+                * See the comment after unionfs_relookup() above for an
+                * explanation of why we unlock udvp here only to relock
+                * dvp on exit.
                 */
-               unionfs_node_update(unp, uvp, td);
+               vput(udvp);
+               vn_finished_write(mp);
+               goto unionfs_mkshadowdir_relock;
        }
+
+       /*
+        * XXX The bug which cannot set uid/gid was corrected.
+        * Ignore errors.
+        */
+       va.va_type = VNON;
+       /*
+        * VOP_SETATTR() may transiently drop uvp's lock, so it's
+        * important to call it before unionfs_node_update() transfers
+        * the unionfs vnode's lock from lvp to uvp; otherwise the
+        * unionfs vnode itself would be transiently unlocked and
+        * potentially doomed.
+        */
+       VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);
+
+       /*
+        * uvp may become doomed during VOP_VPUT_PAIR() if the implementation
+        * must temporarily drop uvp's lock.  However, since we hold a
+        * reference to uvp from the VOP_MKDIR() call above, this would require
+        * a forcible unmount of uvp's filesystem, which in turn can only
+        * happen if our unionfs instance is first forcibly unmounted.  We'll
+        * therefore catch this case in the NULL check of unp below.
+        */
+       VOP_VPUT_PAIR(udvp, &uvp, false);
        vn_finished_write(mp);
+       vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
+       unp = VTOUNIONFS(vp);
+       if (unp == NULL) {
+               vput(uvp);
+               error = ENOENT;
+       } else
+               unionfs_node_update(unp, uvp, td);
+       VOP_UNLOCK(vp);
+
+unionfs_mkshadowdir_relock:
+       vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
+       vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+       if (error == 0 && (VN_IS_DOOMED(dvp) || VN_IS_DOOMED(vp)))
+               error = ENOENT;
 
-unionfs_mkshadowdir_abort:
+unionfs_mkshadowdir_finish:
+       unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
        cnp->cn_cred = credbk;
        chgproccnt(cred->cr_ruidinfo, -1, 0);
        crfree(cred);
@@ -1116,23 +1189,31 @@ unionfs_forward_vop_finish_pair(
 /*
  * Create a new whiteout.
  * 
- * udvp and dvp should be locked on entry and will be locked on return.
+ * dvp and vp are unionfs vnodes representing a parent directory and
+ * child file, should be locked on entry, and will be locked on return.
  */
 int
-unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp,
+unionfs_mkwhiteout(struct vnode *dvp, struct vnode *vp,
     struct componentname *cnp, struct thread *td, char *path, int pathlen)
 {
+       struct vnode   *udvp;
        struct vnode   *wvp;
        struct nameidata nd;
        struct mount   *mp;
        int             error;
-       int             lkflags;
+       bool            dvp_locked;
+
+       ASSERT_VOP_ELOCKED(dvp, __func__);
+       ASSERT_VOP_ELOCKED(vp, __func__);
 
+       udvp = VTOUNIONFS(dvp)->un_uppervp;
        wvp = NULLVP;
        NDPREINIT(&nd);
+       vref(udvp);
+       VOP_UNLOCK(vp);
        if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
            pathlen, CREATE))) {
-               return (error);
+               goto unionfs_mkwhiteout_cleanup;
        }
        if (wvp != NULLVP) {
                if (udvp == wvp)
@@ -1140,18 +1221,27 @@ unionfs_mkwhiteout(struct vnode *dvp, struct vnode 
*udvp,
                else
                        vput(wvp);
 
-               return (EEXIST);
+               if (nd.ni_cnd.cn_flags & ISWHITEOUT)
+                       error = 0;
+               else
+                       error = EEXIST;
+               goto unionfs_mkwhiteout_cleanup;
        }
 
        if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
-               goto unionfs_mkwhiteout_free_out;
-       unionfs_forward_vop_start(udvp, &lkflags);
+               goto unionfs_mkwhiteout_cleanup;
        error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
-       unionfs_forward_vop_finish(dvp, udvp, lkflags);
-
        vn_finished_write(mp);
 
-unionfs_mkwhiteout_free_out:
+unionfs_mkwhiteout_cleanup:
+       if (VTOUNIONFS(dvp) == NULL) {
+               vput(udvp);
+               dvp_locked = false;
+       } else {
+               vrele(udvp);
+               dvp_locked = true;
+       }
+       vn_lock_pair(dvp, dvp_locked, LK_EXCLUSIVE, vp, false, LK_EXCLUSIVE);
        return (error);
 }
 
@@ -1165,10 +1255,11 @@ unionfs_mkwhiteout_free_out:
  */
 static int
 unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
-    struct unionfs_node *unp, struct vattr *uvap, struct thread *td)
+    struct vnode *vp, struct vattr *uvap, struct thread *td)
 {
        struct unionfs_mount *ump;
-       struct vnode   *vp;
+       struct unionfs_node *unp;
+       struct vnode   *uvp;
        struct vnode   *lvp;
        struct ucred   *cred;
        struct vattr    lva;
@@ -1176,8 +1267,10 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct 
vnode *udvp,
        int             fmode;
        int             error;
 
+       ASSERT_VOP_ELOCKED(vp, __func__);
+       unp = VTOUNIONFS(vp);
        ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
-       vp = NULLVP;
+       uvp = NULLVP;
        lvp = unp->un_lowervp;
        cred = td->td_ucred;
        fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
@@ -1200,42 +1293,39 @@ unionfs_vn_create_on_upper(struct vnode **vpp, struct 
vnode *udvp,
        NDPREINIT(&nd);
 
        vref(udvp);
-       if ((error = vfs_relookup(udvp, &vp, &nd.ni_cnd, false)) != 0)
-               goto unionfs_vn_create_on_upper_free_out2;
-       vrele(udvp);
+       VOP_UNLOCK(vp);
+       if ((error = vfs_relookup(udvp, &uvp, &nd.ni_cnd, false)) != 0) {
+               vrele(udvp);
+               return (error);
+       }
 
-       if (vp != NULLVP) {
-               if (vp == udvp)
-                       vrele(vp);
+       if (uvp != NULLVP) {
+               if (uvp == udvp)
+                       vrele(uvp);
                else
-                       vput(vp);
+                       vput(uvp);
                error = EEXIST;
-               goto unionfs_vn_create_on_upper_free_out1;
+               goto unionfs_vn_create_on_upper_cleanup;
        }
 
-       if ((error = VOP_CREATE(udvp, &vp, &nd.ni_cnd, uvap)) != 0)
-               goto unionfs_vn_create_on_upper_free_out1;
+       if ((error = VOP_CREATE(udvp, &uvp, &nd.ni_cnd, uvap)) != 0)
+               goto unionfs_vn_create_on_upper_cleanup;
 
-       if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) {
-               vput(vp);
-               goto unionfs_vn_create_on_upper_free_out1;
+       if ((error = VOP_OPEN(uvp, fmode, cred, td, NULL)) != 0) {
+               vput(uvp);
+               goto unionfs_vn_create_on_upper_cleanup;
        }
-       error = VOP_ADD_WRITECOUNT(vp, 1);
-       CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
-           __func__, vp, vp->v_writecount);
+       error = VOP_ADD_WRITECOUNT(uvp, 1);
+       CTR3(KTR_VFS, "%s: newvp %p v_writecount increased to %d",
+           __func__, newvp, newvp->v_writecount);
        if (error == 0) {
-               *vpp = vp;
+               *vpp = uvp;
        } else {
-               VOP_CLOSE(vp, fmode, cred, td);
+               VOP_CLOSE(uvp, fmode, cred, td);
        }
 
-unionfs_vn_create_on_upper_free_out1:
-       VOP_UNLOCK(udvp);
-
-unionfs_vn_create_on_upper_free_out2:
-       KASSERT(nd.ni_cnd.cn_pnbuf == unp->un_path,
-           ("%s: cn_pnbuf changed", __func__));
-
+unionfs_vn_create_on_upper_cleanup:
+       vput(udvp);
        return (error);
 }
 
@@ -1310,13 +1400,18 @@ unionfs_copyfile_core(struct vnode *lvp, struct vnode 
*uvp,
  * 
  * If you need copy of the contents, set 1 to docopy. Otherwise, set 0 to
  * docopy.
+ *
+ * vp is a unionfs vnode that should be locked on entry and will be
+ * locked on return.
  * 
  * If no error returned, unp will be updated.
  */
 int
-unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
+unionfs_copyfile(struct vnode *vp, int docopy, struct ucred *cred,
     struct thread *td)
 {
+       struct unionfs_node *unp;
+       struct unionfs_node *dunp;
        struct mount   *mp;
        struct vnode   *udvp;
        struct vnode   *lvp;
@@ -1324,6 +1419,8 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, 
struct ucred *cred,
        struct vattr    uva;
        int             error;
 
+       ASSERT_VOP_ELOCKED(vp, __func__);
+       unp = VTOUNIONFS(vp);
        lvp = unp->un_lowervp;
        uvp = NULLVP;
 
@@ -1333,22 +1430,51 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, 
struct ucred *cred,
                return (EINVAL);
        if (unp->un_uppervp != NULLVP)
                return (EEXIST);
-       udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp;
+
+       udvp = NULLVP;
+       VI_LOCK(unp->un_dvp);
+       dunp = VTOUNIONFS(unp->un_dvp);
+       if (dunp != NULL)
+               udvp = dunp->un_uppervp;
+       VI_UNLOCK(unp->un_dvp);
+
        if (udvp == NULLVP)
                return (EROFS);
        if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
                return (EROFS);
+       ASSERT_VOP_UNLOCKED(udvp, __func__);
+
+       error = unionfs_set_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
+       if (error == EJUSTRETURN)
+               return (0);
+       else if (error != 0)
+               return (error);
 
        error = VOP_ACCESS(lvp, VREAD, cred, td);
        if (error != 0)
-               return (error);
+               goto unionfs_copyfile_cleanup;
 
        if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
-               return (error);
-       error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td);
+               goto unionfs_copyfile_cleanup;
+       error = unionfs_vn_create_on_upper(&uvp, udvp, vp, &uva, td);
        if (error != 0) {
                vn_finished_write(mp);
-               return (error);
+               vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+               goto unionfs_copyfile_cleanup;
+       }
+
+       /*
+        * Note that it's still possible for e.g. VOP_WRITE to relock
+        * uvp below while holding vp[=lvp] locked.  Replacing
+        * unionfs_copyfile_core with vn_generic_copy_file_range() will
+        * allow us to avoid the problem by moving this vn_lock_pair()
+        * call much later.
+        */
+       vn_lock_pair(vp, false, LK_EXCLUSIVE, uvp, true, LK_EXCLUSIVE);
+       unp = VTOUNIONFS(vp);
+       if (unp == NULL) {
+               error = ENOENT;
+               goto unionfs_copyfile_cleanup;
        }
 
        if (docopy != 0) {
@@ -1369,18 +1495,30 @@ unionfs_copyfile(struct unionfs_node *unp, int docopy, 
struct ucred *cred,
                /* Reset the attributes. Ignore errors. */
                uva.va_type = VNON;
                VOP_SETATTR(uvp, &uva, cred);
+               unionfs_node_update(unp, uvp, td);
        }
 
-       unionfs_node_update(unp, uvp, td);
-
+unionfs_copyfile_cleanup:
+       unionfs_clear_in_progress_flag(vp, UNIONFS_COPY_IN_PROGRESS);
        return (error);
 }
 
 /*
- * It checks whether vp can rmdir. (check empty)
+ * Determine if the unionfs view of a directory is empty such that
+ * an rmdir operation can be permitted.
+ *
+ * We assume the VOP_RMDIR() against the upper layer vnode will take
+ * care of this check for us where the upper FS is concerned, so here
+ * we concentrate on the lower FS.  We need to check for the presence
+ * of files other than "." and ".." in the lower FS directory and
+ * then cross-check any files we find against the upper FS to see if
+ * a whiteout is present (in which case we treat the lower file as
+ * non-present).
  *
- * vp is unionfs vnode.
- * vp should be locked.
+ * The logic here is based heavily on vn_dir_check_empty().
+ *
+ * vp should be a locked unionfs node, and vp's lowervp should also be
+ * locked.
  */
*** 1636 LINES SKIPPED ***

git: eb60ff1ee16a - main - unionfs: rework locking scheme to only lock a single vnode

Reply via email to