Author: mjg
Date: Wed Jul  1 05:56:29 2020
New Revision: 362827
URL: https://svnweb.freebsd.org/changeset/base/362827

Log:
  vfs: protect vnodes with smr
  
  vget_prep_smr and vhold_smr can be used to ref a vnode while within vfs_smr
  section, allowing consumers to get away without locking.
  
  See vhold_smr and vdropl for comments explaining caveats.
  
  Reviewed by:  kib
  Testec by:    pho
  Differential Revision:        https://reviews.freebsd.org/D23913

Modified:
  head/sys/kern/vfs_subr.c
  head/sys/sys/vnode.h

Modified: head/sys/kern/vfs_subr.c
==============================================================================
--- head/sys/kern/vfs_subr.c    Wed Jul  1 04:12:41 2020        (r362826)
+++ head/sys/kern/vfs_subr.c    Wed Jul  1 05:56:29 2020        (r362827)
@@ -76,6 +76,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
+#include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
@@ -238,6 +239,8 @@ static uma_zone_t buf_trie_zone;
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;
 
+__read_frequently smr_t vfs_smr;
+
 /*
  * The workitem queue.
  *
@@ -661,7 +664,8 @@ vntblinit(void *dummy __unused)
        vnode_list_reclaim_marker = vn_alloc_marker(NULL);
        TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
        vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-           vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+           vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
+       vfs_smr = uma_zone_get_smr(vnode_zone);
        vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
        /*
@@ -1603,7 +1607,7 @@ alloc:
        if (vnlru_under(rnumvnodes, vlowat))
                vnlru_kick();
        mtx_unlock(&vnode_list_mtx);
-       return (uma_zalloc(vnode_zone, M_WAITOK));
+       return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 
 static struct vnode *
@@ -1619,7 +1623,7 @@ vn_alloc(struct mount *mp)
                return (vn_alloc_hard(mp));
        }
 
-       return (uma_zalloc(vnode_zone, M_WAITOK));
+       return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }
 
 static void
@@ -1627,7 +1631,7 @@ vn_free(struct vnode *vp)
 {
 
        atomic_subtract_long(&numvnodes, 1);
-       uma_zfree(vnode_zone, vp);
+       uma_zfree_smr(vnode_zone, vp);
 }
 
 /*
@@ -1758,7 +1762,7 @@ freevnode(struct vnode *vp)
        CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
        bo = &vp->v_bufobj;
        VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
-       VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
+       VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
        VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
        VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
        VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
@@ -2848,8 +2852,30 @@ v_decr_devcount(struct vnode *vp)
  *
  * holdcnt can be manipulated using atomics without holding any locks,
  * except when transitioning 1<->0, in which case the interlock is held.
+ *
+ * Consumers which don't guarantee liveness of the vnode can use SMR to
+ * try to get a reference. Note this operation can fail since the vnode
+ * may be awaiting getting freed by the time they get to it.
  */
 enum vgetstate
+vget_prep_smr(struct vnode *vp)
+{
+       enum vgetstate vs;
+
+       VFS_SMR_ASSERT_ENTERED();
+
+       if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
+               vs = VGET_USECOUNT;
+       } else {
+               if (vhold_smr(vp))
+                       vs = VGET_HOLDCNT;
+               else
+                       vs = VGET_NONE;
+       }
+       return (vs);
+}
+
+enum vgetstate
 vget_prep(struct vnode *vp)
 {
        enum vgetstate vs;
@@ -2919,6 +2945,7 @@ vget_finish(struct vnode *vp, int flags, enum vgetstat
                ASSERT_VI_LOCKED(vp, __func__);
        else
                ASSERT_VI_UNLOCKED(vp, __func__);
+       VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
        VNPASS(vp->v_holdcnt > 0, vp);
        VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
 
@@ -3380,7 +3407,8 @@ vhold(struct vnode *vp)
 
        CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
        old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-       VNASSERT(old >= 0, vp, ("%s: wrong hold count %d", __func__, old));
+       VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
+           ("%s: wrong hold count %d", __func__, old));
        if (old != 0)
                return;
        critical_enter();
@@ -3405,12 +3433,40 @@ vholdnz(struct vnode *vp)
        CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
        int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-       VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old));
+       VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
+           ("%s: wrong hold count %d", __func__, old));
 #else
        atomic_add_int(&vp->v_holdcnt, 1);
 #endif
 }
 
+/*
+ * Grab a hold count as long as the vnode is not getting freed.
+ *
+ * Only use this routine if vfs smr is the only protection you have against
+ * freeing the vnode.
+ */
+bool
+vhold_smr(struct vnode *vp)
+{
+       int count;
+
+       VFS_SMR_ASSERT_ENTERED();
+
+       count = atomic_load_int(&vp->v_holdcnt);
+       for (;;) {
+               if (count & VHOLD_NO_SMR) {
+                       VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
+                           ("non-zero hold count with flags %d\n", count));
+                       return (false);
+               }
+
+               VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
+               if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1))
+                       return (true);
+       }
+}
+
 static void __noinline
 vdbatch_process(struct vdbatch *vd)
 {
@@ -3581,11 +3637,25 @@ vdropl(struct vnode *vp)
                VI_UNLOCK(vp);
                return;
        }
-       if (VN_IS_DOOMED(vp)) {
-               freevnode(vp);
+       if (!VN_IS_DOOMED(vp)) {
+               vdrop_deactivate(vp);
                return;
        }
-       vdrop_deactivate(vp);
+       /*
+        * We may be racing against vhold_smr.
+        *
+        * If they win we can just pretend we never got this far, they will
+        * vdrop later.
+        */
+       if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) {
+               /*
+                * We lost the aforementioned race. Note that any subsequent
+                * access is invalid as they might have managed to vdropl on
+                * their own.
+                */
+               return;
+       }
+       freevnode(vp);
 }
 
 /*
@@ -4041,20 +4111,25 @@ static const char * const typename[] =
 {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
  "VMARKER"};
 
+_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
+    "new hold count flag not added to vn_printf");
+
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
        va_list ap;
        char buf[256], buf2[16];
        u_long flags;
+       u_int holdcnt;
 
        va_start(ap, fmt);
        vprintf(fmt, ap);
        va_end(ap);
        printf("%p: ", (void *)vp);
        printf("type %s\n", typename[vp->v_type]);
+       holdcnt = atomic_load_int(&vp->v_holdcnt);
        printf("    usecount %d, writecount %d, refcount %d",
-           vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
+           vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS);
        switch (vp->v_type) {
        case VDIR:
                printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4072,6 +4147,12 @@ vn_printf(struct vnode *vp, const char *fmt, ...)
                printf("\n");
                break;
        }
+       buf[0] = '\0';
+       buf[1] = '\0';
+       if (holdcnt & VHOLD_NO_SMR)
+               strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
+       printf("    hold count flags (%s)\n", buf + 1);
+
        buf[0] = '\0';
        buf[1] = '\0';
        if (vp->v_irflag & VIRF_DOOMED)

Modified: head/sys/sys/vnode.h
==============================================================================
--- head/sys/sys/vnode.h        Wed Jul  1 04:12:41 2020        (r362826)
+++ head/sys/sys/vnode.h        Wed Jul  1 05:56:29 2020        (r362827)
@@ -58,7 +58,7 @@
 enum vtype     { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
                  VMARKER };
 
-enum vgetstate { VGET_HOLDCNT, VGET_USECOUNT };
+enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
@@ -236,6 +236,9 @@ struct xvnode {
  *     VIRF_DOOMED is doubly protected by the interlock and vnode lock.  Both
  *     are required for writing but the status may be checked with either.
  */
+#define        VHOLD_NO_SMR    (1<<29) /* Disable vhold_smr */
+#define VHOLD_ALL_FLAGS (VHOLD_NO_SMR)
+
 #define        VIRF_DOOMED     0x0001  /* This vnode is being recycled */
 
 #define        VI_TEXT_REF     0x0001  /* Text ref grabbed use ref */
@@ -657,12 +660,14 @@ void      vdrop(struct vnode *);
 void   vdropl(struct vnode *);
 int    vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int    vget(struct vnode *vp, int flags, struct thread *td);
+enum vgetstate vget_prep_smr(struct vnode *vp);
 enum vgetstate vget_prep(struct vnode *vp);
 int    vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void   vgone(struct vnode *vp);
 void   vhold(struct vnode *);
 void   vholdl(struct vnode *);
 void   vholdnz(struct vnode *);
+bool   vhold_smr(struct vnode *);
 void   vinactive(struct vnode *vp);
 int    vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int    vtruncbuf(struct vnode *vp, off_t length, int blksize);
@@ -973,6 +978,16 @@ int vn_dir_check_exec(struct vnode *vp, struct compone
 #define VFS_VOP_VECTOR_REGISTER(vnodeops) \
        SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \
            vfs_vector_op_register, &vnodeops)
+
+#define VFS_SMR_DECLARE                                \
+       extern smr_t vfs_smr
+
+#define VFS_SMR()      vfs_smr
+#define vfs_smr_enter()        smr_enter(VFS_SMR())
+#define vfs_smr_exit() smr_exit(VFS_SMR())
+#define VFS_SMR_ASSERT_ENTERED()       SMR_ASSERT_ENTERED(VFS_SMR())
+#define VFS_SMR_ASSERT_NOT_ENTERED()   SMR_ASSERT_NOT_ENTERED(VFS_SMR())
+#define VFS_SMR_ZONE_SET(zone) uma_zone_set_smr((zone), VFS_SMR())
 
 #endif /* _KERNEL */
 
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to