Module Name: src Committed By: ad Date: Sun Dec 1 13:56:29 UTC 2019
Modified Files: src/sys/kern: vfs_subr.c vfs_vnode.c vfs_vnops.c vnode_if.sh src/sys/miscfs/genfs: genfs_vnops.c src/sys/sys: vnode.h vnode_impl.h Log Message: Minor vnode locking changes: - Stop using atomics to manipulate v_usecount. It was a mistake to begin with. It doesn't work as intended unless the XLOCK bit is incorporated in v_usecount and we don't have that any more. When I introduced this 10+ years ago it was to reduce pressure on v_interlock but it doesn't do that, it just makes stuff disappear from lockstat output and introduces problems elsewhere. We could do atomic usecounts on vnodes but there has to be a well thought out scheme. - Resurrect LK_UPGRADE/LK_DOWNGRADE which will be needed to work effectively when there is increased use of shared locks on vnodes. - Allocate the vnode lock using rw_obj_alloc() to reduce false sharing of struct vnode. - Put all of the LRU lists into a single cache line, and do not requeue a vnode if it's already on the correct list and was requeued recently (less than a second ago). Kernel build before and after: 119.63s real 1453.16s user 2742.57s system 115.29s real 1401.52s user 2690.94s system To generate a diff of this commit: cvs rdiff -u -r1.474 -r1.475 src/sys/kern/vfs_subr.c cvs rdiff -u -r1.103 -r1.104 src/sys/kern/vfs_vnode.c cvs rdiff -u -r1.202 -r1.203 src/sys/kern/vfs_vnops.c cvs rdiff -u -r1.67 -r1.68 src/sys/kern/vnode_if.sh cvs rdiff -u -r1.199 -r1.200 src/sys/miscfs/genfs/genfs_vnops.c cvs rdiff -u -r1.283 -r1.284 src/sys/sys/vnode.h cvs rdiff -u -r1.17 -r1.18 src/sys/sys/vnode_impl.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/kern/vfs_subr.c diff -u src/sys/kern/vfs_subr.c:1.474 src/sys/kern/vfs_subr.c:1.475 --- src/sys/kern/vfs_subr.c:1.474 Sat Nov 16 10:05:44 2019 +++ src/sys/kern/vfs_subr.c Sun Dec 1 13:56:29 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_subr.c,v 1.474 2019/11/16 10:05:44 maxv Exp $ */ +/* $NetBSD: vfs_subr.c,v 1.475 2019/12/01 13:56:29 ad Exp $ */ /*- * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. @@ -68,7 +68,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.474 2019/11/16 10:05:44 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.475 2019/12/01 13:56:29 ad Exp $"); #ifdef _KERNEL_OPT #include "opt_ddb.h" @@ -1110,7 +1110,7 @@ vprint_common(struct vnode *vp, const ch vp->v_usecount, vp->v_writecount, vp->v_holdcnt); (*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n", prefix, vp->v_size, vp->v_writesize, vp->v_numoutput); - (*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock); + (*pr)("%sdata %p lock %p\n", prefix, vp->v_data, vip->vi_lock); (*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state), vip->vi_key.vk_mount, vip->vi_key.vk_key_len); @@ -1543,7 +1543,7 @@ vfs_vnode_lock_print(void *vlock, int fu for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) { TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) { - if (&vip->vi_lock != vlock) + if (vip->vi_lock != vlock) continue; vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr); } Index: src/sys/kern/vfs_vnode.c diff -u src/sys/kern/vfs_vnode.c:1.103 src/sys/kern/vfs_vnode.c:1.104 --- src/sys/kern/vfs_vnode.c:1.103 Wed Feb 20 10:07:27 2019 +++ src/sys/kern/vfs_vnode.c Sun Dec 1 13:56:29 2019 @@ -1,7 +1,7 @@ -/* $NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $ */ +/* $NetBSD: vfs_vnode.c,v 1.104 2019/12/01 13:56:29 ad Exp $ */ /*- - * Copyright (c) 1997-2011 The NetBSD Foundation, Inc. + * Copyright (c) 1997-2011, 2019 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation @@ -143,20 +143,10 @@ * as vput(9), routines. Common points holding references are e.g. * file openings, current working directory, mount points, etc. * - * Note on v_usecount and its locking - * - * At nearly all points it is known that v_usecount could be zero, - * the vnode_t::v_interlock will be held. To change v_usecount away - * from zero, the interlock must be held. To change from a non-zero - * value to zero, again the interlock must be held. - * - * Changing the usecount from a non-zero value to a non-zero value can - * safely be done using atomic operations, without the interlock held. - * */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.103 2019/02/20 10:07:27 hannken Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.104 2019/12/01 13:56:29 ad Exp $"); #include <sys/param.h> #include <sys/kernel.h> @@ -181,33 +171,39 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c, #include <uvm/uvm.h> #include <uvm/uvm_readahead.h> +#include <uvm/uvm_stat.h> /* Flags to vrelel. */ -#define VRELEL_ASYNC_RELE 0x0001 /* Always defer to vrele thread. */ -#define VRELEL_FORCE_RELE 0x0002 /* Must always succeed. */ - -u_int numvnodes __cacheline_aligned; +#define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */ +#define VRELEL_FORCE 0x0002 /* Must always succeed. */ +#define VRELEL_NOINACT 0x0004 /* Don't bother calling VOP_INACTIVE(). */ + +#define LRU_VRELE 0 +#define LRU_FREE 1 +#define LRU_HOLD 2 +#define LRU_COUNT 3 /* * There are three lru lists: one holds vnodes waiting for async release, - * one is for vnodes which have no buffer/page references and - * one for those which do (i.e. v_holdcnt is non-zero). + * one is for vnodes which have no buffer/page references and one for those + * which do (i.e. v_holdcnt is non-zero). We put the lists into a single, + * private cache line as vnodes migrate between them while under the same + * lock (vdrain_lock). 
*/ -static vnodelst_t lru_vrele_list __cacheline_aligned; -static vnodelst_t lru_free_list __cacheline_aligned; -static vnodelst_t lru_hold_list __cacheline_aligned; +u_int numvnodes __cacheline_aligned; +static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned; static kmutex_t vdrain_lock __cacheline_aligned; -static kcondvar_t vdrain_cv __cacheline_aligned; +static kcondvar_t vdrain_cv; static int vdrain_gen; static kcondvar_t vdrain_gen_cv; static bool vdrain_retry; static lwp_t * vdrain_lwp; SLIST_HEAD(hashhead, vnode_impl); static kmutex_t vcache_lock __cacheline_aligned; -static kcondvar_t vcache_cv __cacheline_aligned; +static kcondvar_t vcache_cv; static u_int vcache_hashsize; static u_long vcache_hashmask; -static struct hashhead *vcache_hashtab __cacheline_aligned; +static struct hashhead *vcache_hashtab; static pool_cache_t vcache_pool; static void lru_requeue(vnode_t *, vnodelst_t *); static vnodelst_t * lru_which(vnode_t *); @@ -378,17 +374,16 @@ vstate_change(vnode_t *vp, enum vnode_st void vfs_vnode_sysinit(void) { - int error __diagused; + int error __diagused, i; dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL); KASSERT(dead_rootmount != NULL); dead_rootmount->mnt_iflag |= IMNT_MPSAFE; mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE); - TAILQ_INIT(&lru_free_list); - TAILQ_INIT(&lru_hold_list); - TAILQ_INIT(&lru_vrele_list); - + for (i = 0; i < LRU_COUNT; i++) { + TAILQ_INIT(&lru_list[i]); + } vcache_init(); cv_init(&vdrain_cv, "vdrain"); @@ -452,9 +447,9 @@ lru_which(vnode_t *vp) KASSERT(mutex_owned(vp->v_interlock)); if (vp->v_holdcnt > 0) - return &lru_hold_list; + return &lru_list[LRU_HOLD]; else - return &lru_free_list; + return &lru_list[LRU_FREE]; } /* @@ -466,19 +461,39 @@ static void lru_requeue(vnode_t *vp, vnodelst_t *listhd) { vnode_impl_t *vip; + int d; - mutex_enter(&vdrain_lock); + /* + * If the vnode is on the correct list, and was put there recently, + * then leave it be, thus avoiding huge cache and lock contention. 
+ */ vip = VNODE_TO_VIMPL(vp); + if (listhd == vip->vi_lrulisthd && + (hardclock_ticks - vip->vi_lrulisttm) < hz) { + return; + } + + mutex_enter(&vdrain_lock); + d = 0; if (vip->vi_lrulisthd != NULL) TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); else - numvnodes++; + d++; vip->vi_lrulisthd = listhd; + vip->vi_lrulisttm = hardclock_ticks; if (vip->vi_lrulisthd != NULL) TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); else - numvnodes--; - if (numvnodes > desiredvnodes || listhd == &lru_vrele_list) + d--; + if (d != 0) { + /* + * Looks strange? This is not a bug. Don't store + * numvnodes unless there is a change - avoid false + * sharing on MP. + */ + numvnodes += d; + } + if (numvnodes > desiredvnodes || listhd == &lru_list[LRU_VRELE]) cv_broadcast(&vdrain_cv); mutex_exit(&vdrain_lock); } @@ -491,33 +506,37 @@ void vrele_flush(struct mount *mp) { vnode_impl_t *vip, *marker; + vnode_t *vp; KASSERT(fstrans_is_owner(mp)); marker = VNODE_TO_VIMPL(vnalloc_marker(NULL)); mutex_enter(&vdrain_lock); - TAILQ_INSERT_HEAD(&lru_vrele_list, marker, vi_lrulist); + TAILQ_INSERT_HEAD(&lru_list[LRU_VRELE], marker, vi_lrulist); while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { - TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist); - TAILQ_INSERT_AFTER(&lru_vrele_list, vip, marker, vi_lrulist); - if (vnis_marker(VIMPL_TO_VNODE(vip))) + TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist); + TAILQ_INSERT_AFTER(&lru_list[LRU_VRELE], vip, marker, + vi_lrulist); + vp = VIMPL_TO_VNODE(vip); + if (vnis_marker(vp)) continue; - KASSERT(vip->vi_lrulisthd == &lru_vrele_list); + KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]); TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); - vip->vi_lrulisthd = &lru_hold_list; + vip->vi_lrulisthd = &lru_list[LRU_HOLD]; + vip->vi_lrulisttm = hardclock_ticks; TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); mutex_exit(&vdrain_lock); - mutex_enter(VIMPL_TO_VNODE(vip)->v_interlock); - vrelel(VIMPL_TO_VNODE(vip), VRELEL_FORCE_RELE); + 
mutex_enter(vp->v_interlock); + vrelel(vp, VRELEL_FORCE); mutex_enter(&vdrain_lock); } - TAILQ_REMOVE(&lru_vrele_list, marker, vi_lrulist); + TAILQ_REMOVE(&lru_list[LRU_VRELE], marker, vi_lrulist); mutex_exit(&vdrain_lock); vnfree_marker(VIMPL_TO_VNODE(marker)); @@ -555,7 +574,7 @@ vdrain_remove(vnode_t *vp) if (vcache_vget(vp) == 0) { if (!vrecycle(vp)) { mutex_enter(vp->v_interlock); - vrelel(vp, VRELEL_FORCE_RELE); + vrelel(vp, VRELEL_FORCE); } } fstrans_done(mp); @@ -584,16 +603,17 @@ vdrain_vrele(vnode_t *vp) * will put it back onto the right list before * its v_usecount reaches zero. */ - KASSERT(vip->vi_lrulisthd == &lru_vrele_list); + KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]); TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist); - vip->vi_lrulisthd = &lru_hold_list; + vip->vi_lrulisthd = &lru_list[LRU_HOLD]; + vip->vi_lrulisttm = hardclock_ticks; TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist); vdrain_retry = true; mutex_exit(&vdrain_lock); mutex_enter(vp->v_interlock); - vrelel(vp, VRELEL_FORCE_RELE); + vrelel(vp, VRELEL_FORCE); fstrans_done(mp); mutex_enter(&vdrain_lock); @@ -606,9 +626,6 @@ vdrain_vrele(vnode_t *vp) static void vdrain_thread(void *cookie) { - vnodelst_t *listhd[] = { - &lru_vrele_list, &lru_free_list, &lru_hold_list - }; int i; u_int target; vnode_impl_t *vip, *marker; @@ -621,22 +638,22 @@ vdrain_thread(void *cookie) vdrain_retry = false; target = desiredvnodes - desiredvnodes/10; - for (i = 0; i < __arraycount(listhd); i++) { - TAILQ_INSERT_HEAD(listhd[i], marker, vi_lrulist); + for (i = 0; i < LRU_COUNT; i++) { + TAILQ_INSERT_HEAD(&lru_list[i], marker, vi_lrulist); while ((vip = TAILQ_NEXT(marker, vi_lrulist))) { - TAILQ_REMOVE(listhd[i], marker, vi_lrulist); - TAILQ_INSERT_AFTER(listhd[i], vip, marker, + TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist); + TAILQ_INSERT_AFTER(&lru_list[i], vip, marker, vi_lrulist); if (vnis_marker(VIMPL_TO_VNODE(vip))) continue; - if (listhd[i] == &lru_vrele_list) + if (i == LRU_VRELE) 
vdrain_vrele(VIMPL_TO_VNODE(vip)); else if (numvnodes < target) break; else vdrain_remove(VIMPL_TO_VNODE(vip)); } - TAILQ_REMOVE(listhd[i], marker, vi_lrulist); + TAILQ_REMOVE(&lru_list[i], marker, vi_lrulist); } if (vdrain_retry) { @@ -663,35 +680,14 @@ vput(vnode_t *vp) } /* - * Try to drop reference on a vnode. Abort if we are releasing the - * last reference. Note: this _must_ succeed if not the last reference. - */ -static inline bool -vtryrele(vnode_t *vp) -{ - u_int use, next; - - for (use = vp->v_usecount;; use = next) { - if (use == 1) { - return false; - } - KASSERT(use > 1); - next = atomic_cas_uint(&vp->v_usecount, use, use - 1); - if (__predict_true(next == use)) { - return true; - } - } -} - -/* * Vnode release. If reference count drops to zero, call inactive * routine and either return to freelist or free to the pool. */ static void vrelel(vnode_t *vp, int flags) { - const bool async = ((flags & VRELEL_ASYNC_RELE) != 0); - const bool force = ((flags & VRELEL_FORCE_RELE) != 0); + const bool async = ((flags & VRELEL_ASYNC) != 0); + const bool force = ((flags & VRELEL_FORCE) != 0); bool recycle, defer; int error; @@ -706,7 +702,8 @@ vrelel(vnode_t *vp, int flags) * If not the last reference, just drop the reference count * and unlock. */ - if (vtryrele(vp)) { + if (vp->v_usecount > 1) { + vp->v_usecount--; mutex_exit(vp->v_interlock); return; } @@ -728,12 +725,14 @@ vrelel(vnode_t *vp, int flags) */ if ((curlwp == uvm.pagedaemon_lwp) || async) { defer = true; - } else { + } else if (force) { mutex_exit(vp->v_interlock); - error = vn_lock(vp, - LK_EXCLUSIVE | LK_RETRY | (force ? 0 : LK_NOWAIT)); + error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); defer = (error != 0); mutex_enter(vp->v_interlock); + } else { + error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT); + defer = (error != 0); } KASSERT(mutex_owned(vp->v_interlock)); KASSERT(! 
(force && defer)); @@ -742,7 +741,7 @@ vrelel(vnode_t *vp, int flags) * Defer reclaim to the kthread; it's not safe to * clean it here. We donate it our last reference. */ - lru_requeue(vp, &lru_vrele_list); + lru_requeue(vp, &lru_list[LRU_VRELE]); mutex_exit(vp->v_interlock); return; } @@ -751,7 +750,8 @@ vrelel(vnode_t *vp, int flags) * If the node got another reference while we * released the interlock, don't try to inactivate it yet. */ - if (__predict_false(vtryrele(vp))) { + if (vp->v_usecount > 1) { + vp->v_usecount--; VOP_UNLOCK(vp); mutex_exit(vp->v_interlock); return; @@ -782,14 +782,16 @@ vrelel(vnode_t *vp, int flags) mutex_enter(vp->v_interlock); VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED); if (!recycle) { - if (vtryrele(vp)) { + if (vp->v_usecount > 1) { + vp->v_usecount--; mutex_exit(vp->v_interlock); return; } } /* Take care of space accounting. */ - if (vp->v_iflag & VI_EXECMAP) { + if ((vp->v_iflag & VI_EXECMAP) != 0 && + vp->v_uobj.uo_npages != 0) { atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); atomic_add_int(&uvmexp.filepages, @@ -810,7 +812,8 @@ vrelel(vnode_t *vp, int flags) KASSERT(vp->v_usecount > 0); } - if (atomic_dec_uint_nv(&vp->v_usecount) != 0) { + vp->v_usecount--; + if (vp->v_usecount != 0) { /* Gained another reference while being reclaimed. 
*/ mutex_exit(vp->v_interlock); return; @@ -837,9 +840,6 @@ void vrele(vnode_t *vp) { - if (vtryrele(vp)) { - return; - } mutex_enter(vp->v_interlock); vrelel(vp, 0); } @@ -851,11 +851,8 @@ void vrele_async(vnode_t *vp) { - if (vtryrele(vp)) { - return; - } mutex_enter(vp->v_interlock); - vrelel(vp, VRELEL_ASYNC_RELE); + vrelel(vp, VRELEL_ASYNC); } /* @@ -868,7 +865,9 @@ vref(vnode_t *vp) KASSERT(vp->v_usecount != 0); - atomic_inc_uint(&vp->v_usecount); + mutex_enter(vp->v_interlock); + vp->v_usecount++; + mutex_exit(vp->v_interlock); } /* @@ -1006,7 +1005,7 @@ vrevoke(vnode_t *vp) if (VSTATE_GET(vp) == VS_RECLAIMED) { mutex_exit(vp->v_interlock); } else if (vp->v_type != VBLK && vp->v_type != VCHR) { - atomic_inc_uint(&vp->v_usecount); + vp->v_usecount++; mutex_exit(vp->v_interlock); vgone(vp); } else { @@ -1128,7 +1127,7 @@ vcache_alloc(void) vip = pool_cache_get(vcache_pool, PR_WAITOK); memset(vip, 0, sizeof(*vip)); - rw_init(&vip->vi_lock); + vip->vi_lock = rw_obj_alloc(); /* SLIST_INIT(&vip->vi_hash); */ /* LIST_INIT(&vip->vi_nclist); */ /* LIST_INIT(&vip->vi_dnclist); */ @@ -1143,7 +1142,7 @@ vcache_alloc(void) vip->vi_state = VS_LOADING; - lru_requeue(vp, &lru_free_list); + lru_requeue(vp, &lru_list[LRU_FREE]); return vip; } @@ -1192,7 +1191,7 @@ vcache_free(vnode_impl_t *vip) if (vp->v_type == VBLK || vp->v_type == VCHR) spec_node_destroy(vp); - rw_destroy(&vip->vi_lock); + rw_obj_free(vip->vi_lock); uvm_obj_destroy(&vp->v_uobj, true); cv_destroy(&vp->v_cv); pool_cache_put(vcache_pool, vip); @@ -1216,10 +1215,8 @@ vcache_tryvget(vnode_t *vp) error = ENOENT; else if (__predict_false(VSTATE_GET(vp) != VS_LOADED)) error = EBUSY; - else if (vp->v_usecount == 0) - vp->v_usecount = 1; else - atomic_inc_uint(&vp->v_usecount); + vp->v_usecount++; mutex_exit(vp->v_interlock); @@ -1253,11 +1250,7 @@ vcache_vget(vnode_t *vp) return ENOENT; } VSTATE_ASSERT(vp, VS_LOADED); - if (vp->v_usecount == 0) - vp->v_usecount = 1; - else - atomic_inc_uint(&vp->v_usecount); - + 
vp->v_usecount++; mutex_exit(vp->v_interlock); return 0; @@ -1571,7 +1564,7 @@ vcache_reclaim(vnode_t *vp) * while we clean it out. */ VSTATE_CHANGE(vp, VS_LOADED, VS_RECLAIMING); - if (vp->v_iflag & VI_EXECMAP) { + if ((vp->v_iflag & VI_EXECMAP) != 0 && vp->v_uobj.uo_npages != 0) { atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages); atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages); } Index: src/sys/kern/vfs_vnops.c diff -u src/sys/kern/vfs_vnops.c:1.202 src/sys/kern/vfs_vnops.c:1.203 --- src/sys/kern/vfs_vnops.c:1.202 Sun Nov 10 06:47:30 2019 +++ src/sys/kern/vfs_vnops.c Sun Dec 1 13:56:29 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_vnops.c,v 1.202 2019/11/10 06:47:30 mlelstv Exp $ */ +/* $NetBSD: vfs_vnops.c,v 1.203 2019/12/01 13:56:29 ad Exp $ */ /*- * Copyright (c) 2009 The NetBSD Foundation, Inc. @@ -66,7 +66,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.202 2019/11/10 06:47:30 mlelstv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.203 2019/12/01 13:56:29 ad Exp $"); #include "veriexec.h" @@ -1035,8 +1035,9 @@ vn_lock(struct vnode *vp, int flags) #if 0 KASSERT(vp->v_usecount > 0 || (vp->v_iflag & VI_ONWORKLST) != 0); #endif - KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY)) == 0); - KASSERT(!mutex_owned(vp->v_interlock)); + KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY| + LK_UPGRADE|LK_DOWNGRADE)) == 0); + KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock)); #ifdef DIAGNOSTIC if (wapbl_vphaswapbl(vp)) Index: src/sys/kern/vnode_if.sh diff -u src/sys/kern/vnode_if.sh:1.67 src/sys/kern/vnode_if.sh:1.68 --- src/sys/kern/vnode_if.sh:1.67 Fri Oct 11 08:04:52 2019 +++ src/sys/kern/vnode_if.sh Sun Dec 1 13:56:29 2019 @@ -29,7 +29,7 @@ copyright="\ * SUCH DAMAGE. */ " -SCRIPT_ID='$NetBSD: vnode_if.sh,v 1.67 2019/10/11 08:04:52 hannken Exp $' +SCRIPT_ID='$NetBSD: vnode_if.sh,v 1.68 2019/12/01 13:56:29 ad Exp $' # Script to produce VFS front-end sugar. 
# @@ -481,7 +481,7 @@ function bodynorm() { } if (fstrans == "LOCK") printf("\terror = vop_pre(%s, &mp, &mpsafe, %s);\n", - argname[0], "(flags & LK_NOWAIT ? FST_TRY : FST_YES)"); + argname[0], "(flags & (LK_UPGRADE|LK_DOWNGRADE) ? FST_NO : (flags & LK_NOWAIT ? FST_TRY : FST_YES))"); else if (fstrans == "UNLOCK") printf("\terror = vop_pre(%s, &mp, &mpsafe, FST_%s);\n", argname[0], "NO"); @@ -493,7 +493,7 @@ function bodynorm() { argname[0], name); if (fstrans == "LOCK") printf("\tvop_post(%s, mp, mpsafe, %s);\n", - argname[0], "(error ? FST_YES : FST_NO)"); + argname[0], "(flags & (LK_UPGRADE|LK_DOWNGRADE) ? FST_NO : (error ? FST_YES : FST_NO))"); else if (fstrans == "UNLOCK") printf("\tvop_post(%s, mp, mpsafe, FST_%s);\n", argname[0], "YES"); Index: src/sys/miscfs/genfs/genfs_vnops.c diff -u src/sys/miscfs/genfs/genfs_vnops.c:1.199 src/sys/miscfs/genfs/genfs_vnops.c:1.200 --- src/sys/miscfs/genfs/genfs_vnops.c:1.199 Wed Oct 25 08:12:39 2017 +++ src/sys/miscfs/genfs/genfs_vnops.c Sun Dec 1 13:56:29 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: genfs_vnops.c,v 1.199 2017/10/25 08:12:39 maya Exp $ */ +/* $NetBSD: genfs_vnops.c,v 1.200 2019/12/01 13:56:29 ad Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. @@ -57,7 +57,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.199 2017/10/25 08:12:39 maya Exp $"); +__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.200 2019/12/01 13:56:29 ad Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -292,12 +292,23 @@ genfs_deadlock(void *v) if (! ISSET(flags, LK_RETRY)) return ENOENT; - op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER); - if (ISSET(flags, LK_NOWAIT)) { - if (! 
rw_tryenter(&vip->vi_lock, op)) - return EBUSY; + if (ISSET(flags, LK_DOWNGRADE)) { + rw_downgrade(vip->vi_lock); + } else if (ISSET(flags, LK_UPGRADE)) { + if (!rw_tryupgrade(vip->vi_lock)) { + if (ISSET(flags, LK_NOWAIT)) + return EBUSY; + rw_exit(vip->vi_lock); + rw_enter(vip->vi_lock, RW_WRITER); + } } else { - rw_enter(&vip->vi_lock, op); + op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER); + if (ISSET(flags, LK_NOWAIT)) { + if (!rw_tryenter(vip->vi_lock, op)) + return EBUSY; + } else { + rw_enter(vip->vi_lock, op); + } } VSTATE_ASSERT_UNLOCKED(vp, VS_RECLAIMED); return 0; @@ -315,7 +326,7 @@ genfs_deadunlock(void *v) vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); - rw_exit(&vip->vi_lock); + rw_exit(vip->vi_lock); return 0; } @@ -335,12 +346,23 @@ genfs_lock(void *v) int flags = ap->a_flags; krw_t op; - op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER); - if (ISSET(flags, LK_NOWAIT)) { - if (! rw_tryenter(&vip->vi_lock, op)) - return EBUSY; + if (ISSET(flags, LK_DOWNGRADE)) { + rw_downgrade(vip->vi_lock); + } else if (ISSET(flags, LK_UPGRADE)) { + if (!rw_tryupgrade(vip->vi_lock)) { + if (ISSET(flags, LK_NOWAIT)) + return EBUSY; + rw_exit(vip->vi_lock); + rw_enter(vip->vi_lock, RW_WRITER); + } } else { - rw_enter(&vip->vi_lock, op); + op = (ISSET(flags, LK_EXCLUSIVE) ? 
RW_WRITER : RW_READER); + if (ISSET(flags, LK_NOWAIT)) { + if (!rw_tryenter(vip->vi_lock, op)) + return EBUSY; + } else { + rw_enter(vip->vi_lock, op); + } } VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE); return 0; @@ -358,7 +380,7 @@ genfs_unlock(void *v) vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); - rw_exit(&vip->vi_lock); + rw_exit(vip->vi_lock); return 0; } @@ -375,10 +397,10 @@ genfs_islocked(void *v) vnode_t *vp = ap->a_vp; vnode_impl_t *vip = VNODE_TO_VIMPL(vp); - if (rw_write_held(&vip->vi_lock)) + if (rw_write_held(vip->vi_lock)) return LK_EXCLUSIVE; - if (rw_read_held(&vip->vi_lock)) + if (rw_read_held(vip->vi_lock)) return LK_SHARED; return 0; Index: src/sys/sys/vnode.h diff -u src/sys/sys/vnode.h:1.283 src/sys/sys/vnode.h:1.284 --- src/sys/sys/vnode.h:1.283 Sun Nov 10 06:47:30 2019 +++ src/sys/sys/vnode.h Sun Dec 1 13:56:29 2019 @@ -1,4 +1,4 @@ -/* $NetBSD: vnode.h,v 1.283 2019/11/10 06:47:30 mlelstv Exp $ */ +/* $NetBSD: vnode.h,v 1.284 2019/12/01 13:56:29 ad Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. @@ -206,10 +206,13 @@ typedef struct vnode vnode_t; /* * vnode lock flags */ +#define LK_NONE 0x00000000 /* no lock - for VOP_ISLOCKED() */ #define LK_SHARED 0x00000001 /* shared lock */ #define LK_EXCLUSIVE 0x00000002 /* exclusive lock */ -#define LK_NOWAIT 0x00000010 /* do not sleep to await lock */ -#define LK_RETRY 0x00020000 /* vn_lock: retry until locked */ +#define LK_UPGRADE 0x00000010 /* upgrade shared -> exclusive */ +#define LK_DOWNGRADE 0x00000020 /* downgrade exclusive -> shared */ +#define LK_NOWAIT 0x00000100 /* do not sleep to await lock */ +#define LK_RETRY 0x00000200 /* vn_lock: retry until locked */ /* * Vnode attributes. 
A field value of VNOVAL represents a field whose value Index: src/sys/sys/vnode_impl.h diff -u src/sys/sys/vnode_impl.h:1.17 src/sys/sys/vnode_impl.h:1.18 --- src/sys/sys/vnode_impl.h:1.17 Thu Sep 21 18:19:44 2017 +++ src/sys/sys/vnode_impl.h Sun Dec 1 13:56:29 2019 @@ -1,12 +1,9 @@ -/* $NetBSD: vnode_impl.h,v 1.17 2017/09/21 18:19:44 joerg Exp $ */ +/* $NetBSD: vnode_impl.h,v 1.18 2019/12/01 13:56:29 ad Exp $ */ /*- - * Copyright (c) 2016 The NetBSD Foundation, Inc. + * Copyright (c) 2016, 2019 The NetBSD Foundation, Inc. * All rights reserved. * - * This code is derived from software contributed to The NetBSD Foundation - * by - * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -74,10 +71,11 @@ struct vnode_impl { LIST_HEAD(, namecache) vi_dnclist; /* n: namecaches (children) */ LIST_HEAD(, namecache) vi_nclist; /* n: namecaches (parent) */ int vi_synclist_slot; /* s: synclist slot index */ + int vi_lrulisttm; /* i: time of lru enqueue */ TAILQ_ENTRY(vnode_impl) vi_synclist; /* s: vnodes with dirty bufs */ TAILQ_ENTRY(vnode_impl) vi_mntvnodes; /* m: vnodes for mount point */ SLIST_ENTRY(vnode_impl) vi_hash; /* c: vnode cache list */ - krwlock_t vi_lock; /* -: lock for this vnode */ + krwlock_t *vi_lock; /* -: lock for this vnode */ struct vcache_key vi_key; /* c: vnode cache key */ }; typedef struct vnode_impl vnode_impl_t;