Module Name:    src
Committed By:   ad
Date:           Fri Jan 17 22:26:26 UTC 2020

Modified Files:
        src/sys/fs/tmpfs [ad-namecache]: tmpfs_subr.c
        src/sys/kern [ad-namecache]: vfs_cache.c vfs_lookup.c
        src/sys/sys [ad-namecache]: namei.src vnode_impl.h
        src/sys/ufs/ffs [ad-namecache]: ffs_vfsops.c
        src/sys/ufs/ufs [ad-namecache]: ufs_vnops.c

Log Message:
vfs_lookup:

- Do the easy component name lookups directly in the namecache, without
  taking vnode locks or vnode references (between the start and the leaf /
  parent), which seems to largely solve the lock contention problem with
  namei() (see the sketch below).  It needs support from the file system,
  which has to tell the name cache about directory permissions (only ffs
  and tmpfs tried so far), and I'm not sure how or if it can work with
  layered file systems yet.  Work in progress.
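
  A rough sketch of the idea (not the committed code; component parsing
  and the fallback to lookup_once() are elided).  The new
  cache_lookup_linked() hands the per-directory rwlock along via *plock,
  so intermediate components are resolved without vref()/vn_lock():

	krwlock_t *plock = NULL;
	struct vnode *searchdir = startdir, *foundobj = NULL;

	for (;;) {
		/* ... parse the next component into name/namelen ... */
		if (!cache_lookup_linked(searchdir, name, namelen,
		    &foundobj, &plock, cred))
			break;		/* miss/denied: use lookup_once() */
		if (foundobj == NULL)
			break;		/* negative entry */
		/* No reference or vnode lock taken on the way down. */
		searchdir = foundobj;
	}
	if (plock != NULL)
		rw_exit(plock);		/* drop the last lock in the chain */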

vfs_cache:

- Make the rbtree operations more efficient: inline the lookup, and key on a
  64-bit value (a 32-bit name hash plus a 16-bit length) rather than on the
  names themselves.
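
  For reference, the key layout as computed by cache_key() in the diff
  below; tree comparisons become single integer compares, and memcmp() is
  only needed on an exact key match, to exclude hash collisions:

	/* 64-bit key: 32-bit name hash above, 16-bit length below. */
	key = ((int64_t)hash32_buf(name, nlen, HASH32_STR_INIT) << 16) | nlen;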

- Take the namecache stuff out of vnode_impl, along with the rwlocks, and
  put it all together in an nchnode struct which is mapped 1:1 with vnodes.
  Saves memory and gives a nicer cache profile.

- Add a routine to help vfs_lookup do its easy component name lookups.

- Report some more stats.
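
  The totals can be read from userland; a hypothetical sketch, assuming the
  new node can be reached by name as vfs.namecache_stats (the node created
  in nchinit() below) and that struct nchstats is exported by <sys/namei.h>:

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <sys/namei.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct nchstats st;
		size_t len = sizeof(st);

		if (sysctlbyname("vfs.namecache_stats", &st, &len,
		    NULL, 0) == -1)
			return 1;
		printf("goodhits %llu neghits %llu miss %llu denied %llu\n",
		    (unsigned long long)st.ncs_goodhits,
		    (unsigned long long)st.ncs_neghits,
		    (unsigned long long)st.ncs_miss,
		    (unsigned long long)st.ncs_denied);
		return 0;
	}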

- Tidy up the file a bit.


To generate a diff of this commit:
cvs rdiff -u -r1.105 -r1.105.2.1 src/sys/fs/tmpfs/tmpfs_subr.c
cvs rdiff -u -r1.126.2.5 -r1.126.2.6 src/sys/kern/vfs_cache.c
cvs rdiff -u -r1.212.4.2 -r1.212.4.3 src/sys/kern/vfs_lookup.c
cvs rdiff -u -r1.47.2.3 -r1.47.2.4 src/sys/sys/namei.src
cvs rdiff -u -r1.19.2.3 -r1.19.2.4 src/sys/sys/vnode_impl.h
cvs rdiff -u -r1.362.4.1 -r1.362.4.2 src/sys/ufs/ffs/ffs_vfsops.c
cvs rdiff -u -r1.248 -r1.248.2.1 src/sys/ufs/ufs/ufs_vnops.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/fs/tmpfs/tmpfs_subr.c
diff -u src/sys/fs/tmpfs/tmpfs_subr.c:1.105 src/sys/fs/tmpfs/tmpfs_subr.c:1.105.2.1
--- src/sys/fs/tmpfs/tmpfs_subr.c:1.105	Wed Sep 18 17:59:15 2019
+++ src/sys/fs/tmpfs/tmpfs_subr.c	Fri Jan 17 22:26:25 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: tmpfs_subr.c,v 1.105 2019/09/18 17:59:15 christos Exp $	*/
+/*	$NetBSD: tmpfs_subr.c,v 1.105.2.1 2020/01/17 22:26:25 ad Exp $	*/
 
 /*
  * Copyright (c) 2005-2013 The NetBSD Foundation, Inc.
@@ -73,7 +73,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.105 2019/09/18 17:59:15 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.105.2.1 2020/01/17 22:26:25 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/cprng.h>
@@ -147,6 +147,8 @@ tmpfs_init_vnode(struct vnode *vp, tmpfs
 	vp->v_data = node;
 	node->tn_vnode = vp;
 	uvm_vnp_setsize(vp, node->tn_size);
+	KASSERT(node->tn_mode != VNOVAL);
+	cache_set_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
 }
 
 /*
@@ -1037,6 +1039,7 @@ tmpfs_chmod(vnode_t *vp, mode_t mode, ka
 	node->tn_mode = (mode & ALLPERMS);
 	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
 	VN_KNOTE(vp, NOTE_ATTRIB);
+	cache_update_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
 	return 0;
 }
 
@@ -1081,6 +1084,7 @@ tmpfs_chown(vnode_t *vp, uid_t uid, gid_
 	node->tn_gid = gid;
 	tmpfs_update(vp, TMPFS_UPDATE_CTIME);
 	VN_KNOTE(vp, NOTE_ATTRIB);
+	cache_update_id(vp, node->tn_mode, node->tn_uid, node->tn_gid);
 	return 0;
 }
 

Index: src/sys/kern/vfs_cache.c
diff -u src/sys/kern/vfs_cache.c:1.126.2.5 src/sys/kern/vfs_cache.c:1.126.2.6
--- src/sys/kern/vfs_cache.c:1.126.2.5	Tue Jan 14 11:07:40 2020
+++ src/sys/kern/vfs_cache.c	Fri Jan 17 22:26:25 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_cache.c,v 1.126.2.5 2020/01/14 11:07:40 ad Exp $	*/
+/*	$NetBSD: vfs_cache.c,v 1.126.2.6 2020/01/17 22:26:25 ad Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
@@ -61,7 +61,7 @@
  */
 
 /*
- * Name caching works as follows:
+ * Name caching:
  *
  * 	Names found by directory scans are retained in a cache for future
  * 	reference.  It is managed pseudo-LRU, so frequently used names will
@@ -78,17 +78,14 @@
  * Data structures:
  *
  *	The original BSD implementation used a global hash table, which
- *	works very well on a uniprocessor system but causes performance
- *	difficulties on a multiprocessor system.  The global hash table is
- *	also difficult to size dynamically, and can become very large.  To
- *	try and address these problems, the focus of interest in this
- *	implementation is the directory itself.  A per-directory structure
- *	is used to look up names.
- *
- *	XXX Currently this structure is an rbtree, but rbtrees touch many
- *	cache lines during a lookup and so perform badly.  The intent is to
- *	utimately make use of some other data structure, perhaps a Robin
- *	Hood hash.  Insert blurb here when that happens.
+ *	works very well but can cause difficulties for the CPU cache on
+ *	modern CPUs, and concurrency headaches for the kernel hacker on
+ *	multiprocessor systems.  The global hash table is also difficult to
+ *	size dynamically.  To try and address these concerns, the focus of
+ *	interest in this implementation is the directory itself: a
+ *	per-directory red-black tree is used to look up names.  Other than
+ *	the choice of data structure, it works largely the same way as the
+ *	BSD implementation.
  *
  * Replacement:
  *
@@ -109,46 +106,49 @@
  * Lock order:
  *
  *	1) nc_dvp->vi_ncdlock
- *	2) cache_list_lock
- *	3) vp->v_interlock
+ *	2) nc_dvp->vi_ncvlock
+ *	3) cache_lru_lock, vp->v_interlock
  *
  * Ugly ASCII diagram:
  *
  *	XXX replace tabs with spaces, make less ugly
  *
  *          ...
- *	     |
- *	-----o-----
- *	|  VDIR   |
- *	|  vnode  |
- *	-----------
+ *           ^
+ *           |
+ *      -----o-----
+ *      |  VDIR   |
+ *      | nchnode |
+ *      -----------
  *           ^
  *           |- nd_tree
- *	     |		  	       	 		 
- *	+----+----+	  	  -----------		    -----------
- *	|  VDIR   |		  |  VCHR   |		    |  VREG   |
- *	|  vnode  o-----+	  |  vnode  o-----+	    |  vnode  o------+
- *	+---------+     |	  -----------	  |	    -----------      |
- *	     ^	        |	       ^	  |		 ^           |
- *	     |- nc_vp   |- vi_nclist   |- nc_vp	  |- vi_nclist	 |- nc_vp    |
- *	     |	        |	       |	  |		 |           |
- *	-----o-----     |	  -----o-----	  |	    -----o-----      |
- *  +---onamecache|<----+     +---onamecache|<----+	+---onamecache|<-----+
- *  |	-----------	      |	  -----------		|   -----------
- *  |        ^		      |	       ^   		|	 ^
+ *           |                                                           
+ *      -----o-----               -----------               -----------
+ *      |  VDIR   |               |  VCHR   |               |  VREG   |
+ *      | nchnode o-----+         | nchnode o-----+         | nchnode o------+
+ *      -----------     |         -----------     |         -----------      |
+ *           ^          |              ^          |              ^           |
+ *           |- nc_nn   |- nn_list     |- nc_nn   |- nn_list     |- nc_nn    |
+ *           |          |              |          |              |           |
+ *      -----o-----     |         -----o-----     |         -----o-----      |
+ *  +---onamecache|<----+     +---onamecache|<----+     +---onamecache|<-----+
+ *  |   -----------           |   -----------           |   -----------
+ *  |        ^                |        ^                |        ^
  *  |        |                |        |                |        |
- *  |        |  +----------------------+   		|	 |
- *  |-nc_dvp | +-------------------------------------------------+
- *  |	     |/- nd_tree      |		   		|
- *  |	     |		      |- nc_dvp		        |
- *  |	-----o-----	      |		  		|
- *  +-->|  VDIR   |<----------+			        |
- *	|  vnode  |<------------------------------------+
- *	-----------
+ *  |        |  +----------------------+                |        |
+ *  |-nc_dnn | +-------------------------------------------------+
+ *  |        |/- nd_tree      |                         |
+ *  |        |                |- nc_dnn                 |- nc_dnn
+ *  |   -----o-----           |                         |
+ *  +-->|  VDIR   |<----------+                         |
+ *      | nchnode |<------------------------------------+
+ *      -----------
+ *
+ *      START HERE
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.126.2.5 2020/01/14 11:07:40 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.126.2.6 2020/01/17 22:26:25 ad Exp $");
 
 #define __NAMECACHE_PRIVATE
 #ifdef _KERNEL_OPT
@@ -156,14 +156,15 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,
 #include "opt_dtrace.h"
 #endif
 
-#include <sys/param.h>
 #include <sys/cpu.h>
 #include <sys/errno.h>
 #include <sys/evcnt.h>
+#include <sys/hash.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/param.h>
 #include <sys/pool.h>
 #include <sys/sdt.h>
 #include <sys/sysctl.h>
@@ -171,13 +172,48 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,
 #include <sys/time.h>
 #include <sys/vnode_impl.h>
 
+#include <miscfs/genfs/genfs.h>
+
+/*
+ * Per-vnode state for the namecache.  This is allocated apart from struct
+ * vnode to make the best use of memory, and best use of CPU cache.  Field
+ * markings and corresponding locks:
+ *
+ *	-	stable throughout the lifetime of the vnode
+ *	n	protected by nn_lock
+ *	l	protected by nn_listlock
+ */
+struct nchnode {
+	/* First cache line: frequently used stuff. */
+	rb_tree_t	nn_tree;	/* n  namecache tree */
+	TAILQ_HEAD(,namecache) nn_list;	/* l  namecaches (parent) */
+	mode_t		nn_mode;	/* n  cached mode or VNOVAL */
+	uid_t		nn_uid;		/* n  cached UID or VNOVAL */
+	gid_t		nn_gid;		/* n  cached GID or VNOVAL */
+	uint32_t	nn_spare;	/* -  spare (padding) */
+
+	/* Second cache line: locks and infrequently used stuff. */
+	krwlock_t	nn_lock		/* -  lock on node */
+	    __aligned(COHERENCY_UNIT);
+	krwlock_t	nn_listlock;	/* -  lock on nn_list */
+	struct vnode	*nn_vp;		/* -  backpointer to vnode */
+};
+
+static void	cache_activate(struct namecache *);
+static int	cache_compare_key(void *, const void *, const void *);
+static int	cache_compare_nodes(void *, const void *, const void *);
+static void	cache_deactivate(void);
+static void	cache_reclaim(void);
+static int	cache_stat_sysctl(SYSCTLFN_ARGS);
+
 /* Per-CPU counters. */
 struct nchstats_percpu _NAMEI_CACHE_STATS(uintptr_t);
 
 /* Global pool cache. */
 static pool_cache_t cache_pool __read_mostly;
+static pool_cache_t cache_node_pool __read_mostly;
 
-/* LRU replacement state. */
+/* LRU replacement. */
 enum cache_lru_id {
 	LRU_ACTIVE,
 	LRU_INACTIVE,
@@ -194,7 +230,6 @@ static kmutex_t cache_lru_lock __cacheli
 /* Cache effectiveness statistics.  This holds total from per-cpu stats */
 struct nchstats	nchstats __cacheline_aligned;
 
-/* Macro to count an event. */
 #define	COUNT(f)	do { \
 	kpreempt_disable(); \
 	((struct nchstats_percpu *)curcpu()->ci_data.cpu_nch)->f++; \
@@ -206,12 +241,16 @@ static const int cache_lru_maxdeact = 2;
 static const int cache_lru_maxscan = 128;	/* max # to scan/reclaim */
 static int doingcache = 1;			/* 1 => enable the cache */
 
-static void	cache_reclaim(void);
-static void	cache_remove(struct namecache *, const bool);
-
 /* sysctl */
-static struct	sysctllog *sysctllog;
-static void	sysctl_cache_stat_setup(void);
+static struct	sysctllog *cache_sysctllog;
+
+/* Red-black tree */
+static rb_tree_ops_t cache_rbtree_ops __read_mostly = {
+	.rbto_compare_nodes = cache_compare_nodes,
+	.rbto_compare_key = cache_compare_key,
+	.rbto_node_offset = offsetof(struct namecache, nc_tree),
+	.rbto_context = NULL
+};
 
 /* dtrace hooks */
 SDT_PROVIDER_DEFINE(vfs);
@@ -243,13 +282,16 @@ SDT_PROBE_DEFINE3(vfs, namecache, enter,
 static int
 cache_compare_nodes(void *context, const void *n1, const void *n2)
 {
-	const struct namecache *ncp1 = n1;
-	const struct namecache *ncp2 = n2;
+	const struct namecache *nc1 = n1;
+	const struct namecache *nc2 = n2;
 
-	if (ncp1->nc_nlen != ncp2->nc_nlen) {
-		return (int)ncp1->nc_nlen - (int)ncp2->nc_nlen;
+	if (nc1->nc_key < nc2->nc_key) {
+		return -1;
+	}
+	if (nc1->nc_key > nc2->nc_key) {
+		return 1;
 	}
-	return memcmp(ncp1->nc_name, ncp2->nc_name, ncp1->nc_nlen);
+	return 0;
 }
 
 /*
@@ -258,156 +300,135 @@ cache_compare_nodes(void *context, const
 static int
 cache_compare_key(void *context, const void *n, const void *k)
 {
-	const struct namecache *ncp = n;
-	const struct iovec *iov = k;
+	const struct namecache *nc = n;
+	const int64_t key = *(const int64_t *)k;
 
-	if (ncp->nc_nlen != iov->iov_len) {
-		return (int)ncp->nc_nlen - (int)iov->iov_len;
+	if (nc->nc_key < key) {
+		return -1;
+	}
+	if (nc->nc_key > key) {
+		return 1;
 	}
-	return memcmp(ncp->nc_name, iov->iov_base, ncp->nc_nlen);
+	return 0;
 }
 
 /*
- * rbtree definition.
+ * Compute a (with luck) unique key value for the given name.  The name
+ * length is encoded in the key value to try and improve uniqueness, and so
+ * that length doesn't need to be compared separately for subsequent string
+ * comparisons.
  */
-static rb_tree_ops_t cache_rbtree_ops __read_mostly = {
-	.rbto_compare_nodes = cache_compare_nodes,
-	.rbto_compare_key = cache_compare_key,
-	.rbto_node_offset = offsetof(struct namecache, nc_node),
-	.rbto_context = NULL
-};
+static int64_t
+cache_key(const char *name, size_t nlen)
+{
+	int64_t key;
+
+	KASSERT(nlen <= USHRT_MAX);
+
+	key = hash32_buf(name, nlen, HASH32_STR_INIT);
+	return (key << 16) | nlen;
+}
 
 /*
  * Remove an entry from the cache.  The directory lock must be held, and if
- * "dirtovnode" is true (it usually will be), the vnode's cache lock will be
- * acquired when removing the entry from the vnode list.
+ * "dir2node" is true, then we're locking in the conventional direction and
+ * the list lock will be acquired when removing the entry from the vnode
+ * list.
  */
 static void
-cache_remove(struct namecache *ncp, const bool dirtovnode)
+cache_remove(struct namecache *nc, const bool dir2node)
 {
-	krwlock_t *vlock;
+	struct nchnode *dnn = nc->nc_dnn;
+	struct nchnode *nn;
 
-	KASSERT(rw_write_held(VNODE_TO_VIMPL(ncp->nc_dvp)->vi_ncdlock));
+	KASSERT(rw_write_held(&dnn->nn_lock));
+	KASSERT(cache_key(nc->nc_name, nc->nc_nlen) == nc->nc_key);
+	KASSERT(rb_tree_find_node(&dnn->nn_tree, &nc->nc_key) == nc);
 
-	SDT_PROBE(vfs, namecache, invalidate, done, ncp->nc_dvp,
+	SDT_PROBE(vfs, namecache, invalidate, done, nc,
 	    0, 0, 0, 0);
 
 	/* First remove from the directory's rbtree. */
-	rb_tree_remove_node(&VNODE_TO_VIMPL(ncp->nc_dvp)->vi_nctree, ncp);
+	rb_tree_remove_node(&dnn->nn_tree, nc);
 
 	/* Then remove from the LRU lists. */
 	mutex_enter(&cache_lru_lock);
-	TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
-	cache_lru.count[ncp->nc_lrulist]--;
+	TAILQ_REMOVE(&cache_lru.list[nc->nc_lrulist], nc, nc_lru);
+	cache_lru.count[nc->nc_lrulist]--;
 	mutex_exit(&cache_lru_lock);
 
-	/* Then remove from the vnode list. */
-	if (ncp->nc_vp != NULL) {
-		vlock = VNODE_TO_VIMPL(ncp->nc_vp)->vi_ncvlock;
-		if (__predict_true(dirtovnode)) {
-			rw_enter(vlock, RW_WRITER);
-		}
-		TAILQ_REMOVE(&VNODE_TO_VIMPL(ncp->nc_vp)->vi_nclist, ncp,
-		    nc_vlist);
-		if (__predict_true(dirtovnode)) {
-			rw_exit(vlock);
+	/* Then remove from the node's list. */
+	if ((nn = nc->nc_nn) != NULL) {
+		if (__predict_true(dir2node)) {
+			rw_enter(&nn->nn_listlock, RW_WRITER);
+			TAILQ_REMOVE(&nn->nn_list, nc, nc_list);
+			rw_exit(&nn->nn_listlock);
+		} else {
+			TAILQ_REMOVE(&nn->nn_list, nc, nc_list);
 		}
-		ncp->nc_vp = NULL;
 	}
 
 	/* Finally, free it. */
-	if (ncp->nc_nlen > NCHNAMLEN) {
-		size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]);
-		kmem_free(ncp, sz);
+	if (nc->nc_nlen > NCHNAMLEN) {
+		size_t sz = offsetof(struct namecache, nc_name[nc->nc_nlen]);
+		kmem_free(nc, sz);
 	} else {
-		pool_cache_put(cache_pool, ncp);
+		pool_cache_put(cache_pool, nc);
 	}
 }
 
 /*
- * Try to balance the LRU lists.  Pick some victim entries, and re-queue
- * them from the head of the ACTIVE list to the tail of the INACTIVE list. 
+ * Find a single cache entry and return it locked.  The directory lock must
+ * be held.
+ *
+ * Marked __noinline, and with everything unnecessary excluded, the compiler
+ * (gcc 8.3.0 x86_64) does a great job on this.
  */
-static void __noinline
-cache_deactivate(void)
+static struct namecache * __noinline
+cache_lookup_entry(struct nchnode *dnn, const char *name, size_t namelen,
+    int64_t key)
 {
-	struct namecache *ncp;
-	int total, i;
+	struct rb_node *node = dnn->nn_tree.rbt_root;
+	struct namecache *nc;
 
-	KASSERT(mutex_owned(&cache_lru_lock));
+	KASSERT(rw_lock_held(&dnn->nn_lock));
 
-	/* If we're nowhere near budget yet, don't bother. */
-	total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
-	if (total < (desiredvnodes >> 1)) {
-	    	return;
-	}
-
-	/* If not out of balance, don't bother. */
-	if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
-		return;
-	}
-
-	/* Move victim from head of ACTIVE list, to tail of INACTIVE. */
-	for (i = 0; i < cache_lru_maxdeact; i++) {
-		ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
-		if (ncp == NULL) {
+	/*
+	 * Search the RB tree for the key.  This is one of the most
+	 * performance sensitive code paths in the system, so here is an
+	 * inlined version of rb_tree_find_node() tailored for exactly
+	 * what's needed here (64-bit key and so on).  Elsewhere during
+	 * entry/removal the generic functions are used as it doesn't
+	 * matter so much there.
+	 */
+	for (;;) {
+		if (__predict_false(RB_SENTINEL_P(node))) {
+			return NULL;
+		}
+		nc = (struct namecache *)node;
+		KASSERT((void *)&nc->nc_tree == (void *)nc);
+		KASSERT(nc->nc_dnn == dnn);
+		if (nc->nc_key == key) {
 			break;
 		}
-		KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
-		ncp->nc_lrulist = LRU_INACTIVE;
-		TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
-		TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
-		cache_lru.count[LRU_ACTIVE]--;
-		cache_lru.count[LRU_INACTIVE]++;
+		node = node->rb_nodes[nc->nc_key < key];
 	}
-}
-
-/*
- * Re-queue an entry onto the correct LRU list, after it has scored a hit.
- */
-static void __noinline
-cache_activate(struct namecache *ncp)
-{
-
-	mutex_enter(&cache_lru_lock);
-	/* Put on tail of ACTIVE queue, since it just scored a hit. */
-	TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
-	TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
-	cache_lru.count[ncp->nc_lrulist]--;
-	cache_lru.count[LRU_ACTIVE]++;
-	ncp->nc_lrulist = LRU_ACTIVE;
-	mutex_exit(&cache_lru_lock);
-}
-
-/*
- * Find a single cache entry and return it locked.  The directory lock must
- * be held.
- */
-static struct namecache *
-cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen)
-{
-	struct namecache *ncp;
-	struct iovec iov;
-
-	KASSERT(rw_lock_held(VNODE_TO_VIMPL(dvp)->vi_ncdlock));
 
-	iov.iov_base = __UNCONST(name);
-	iov.iov_len = namelen;
-	ncp = rb_tree_find_node(&VNODE_TO_VIMPL(dvp)->vi_nctree, &iov);
+	/* Exclude collisions. */
+	KASSERT(nc->nc_nlen == namelen);
+	if (__predict_false(memcmp(nc->nc_name, name, namelen) != 0)) {
+		return NULL;
+	}
 
-	if (ncp != NULL) {
-		KASSERT(ncp->nc_dvp == dvp);
-		/* If the entry is on the wrong LRU list, requeue it. */
-		if (__predict_false(ncp->nc_lrulist != LRU_ACTIVE)) {
-			cache_activate(ncp);
-		}
-		SDT_PROBE(vfs, namecache, lookup, hit, dvp,
-		    name, namelen, 0, 0);
-	} else {
-		SDT_PROBE(vfs, namecache, lookup, miss, dvp,
-		    name, namelen, 0, 0);
+	/*
+	 * If the entry is on the wrong LRU list, requeue it.  This is an
+	 * unlocked check, but it will rarely be wrong and even then there
+	 * will be no harm caused.
+	 */
+	if (__predict_false(nc->nc_lrulist != LRU_ACTIVE)) {
+		cache_activate(nc);
 	}
-	return ncp;
+	return nc;
 }
 
 /*
@@ -465,9 +486,10 @@ cache_lookup(struct vnode *dvp, const ch
 	     uint32_t nameiop, uint32_t cnflags,
 	     int *iswht_ret, struct vnode **vn_ret)
 {
-	struct namecache *ncp;
+	struct nchnode *dnn = VNODE_TO_VIMPL(dvp)->vi_ncache;
+	struct namecache *nc;
 	struct vnode *vp;
-	krwlock_t *dlock;
+	int64_t key;
 	int error;
 	bool hit;
 	krw_t op;
@@ -486,7 +508,6 @@ cache_lookup(struct vnode *dvp, const ch
 		SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
 		    name, namelen, 0, 0);
 		COUNT(ncs_long);
-		/* found nothing */
 		return false;
 	}
 
@@ -498,18 +519,17 @@ cache_lookup(struct vnode *dvp, const ch
 		op = RW_READER;
 	}
 
-	/* If no directory lock yet, there's nothing in cache. */
-	dlock = VNODE_TO_VIMPL(dvp)->vi_ncdlock;
-	if (__predict_false(dlock == NULL)) {
-		return false;
-	}
+	/* Compute the key up front - don't need the lock. */
+	key = cache_key(name, namelen);
 
-	rw_enter(dlock, op);
-	ncp = cache_lookup_entry(dvp, name, namelen);
-	if (__predict_false(ncp == NULL)) {
-		rw_exit(dlock);
-		/* found nothing */
+	/* Now look for the name. */
+	rw_enter(&dnn->nn_lock, op);
+	nc = cache_lookup_entry(dnn, name, namelen, key);
+	if (__predict_false(nc == NULL)) {
+		rw_exit(&dnn->nn_lock);
 		COUNT(ncs_miss);
+		SDT_PROBE(vfs, namecache, lookup, miss, dvp,
+		    name, namelen, 0, 0);
 		return false;
 	}
 	if (__predict_false((cnflags & MAKEENTRY) == 0)) {
@@ -518,43 +538,43 @@ cache_lookup(struct vnode *dvp, const ch
 		 * the cache entry is invalid, or otherwise don't
 		 * want cache entry to exist.
 		 */
-		cache_remove(ncp, true);
-		rw_exit(dlock);
-		/* found nothing */
+		KASSERT((cnflags & ISLASTCN) != 0);
+		cache_remove(nc, true);
+		rw_exit(&dnn->nn_lock);
 		COUNT(ncs_badhits);
 		return false;
 	}
-	if (__predict_false(ncp->nc_vp == NULL)) {
-		if (__predict_true(nameiop != CREATE ||
-		    (cnflags & ISLASTCN) == 0)) {
-			COUNT(ncs_neghits);
-			/* found neg entry; vn is already null from above */
-			hit = true;
-		} else {
+	if (nc->nc_nn == NULL) {
+		if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
 			/*
 			 * Last component and we are preparing to create
 			 * the named object, so flush the negative cache
 			 * entry.
 			 */
 			COUNT(ncs_badhits);
-			cache_remove(ncp, true);
-			/* found nothing */
+			cache_remove(nc, true);
 			hit = false;
+		} else {
+			COUNT(ncs_neghits);
+			SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
+			    namelen, 0, 0);
+			/* found neg entry; vn is already null from above */
+			hit = true;
 		}
 		if (iswht_ret != NULL) {
 			/*
 			 * Restore the ISWHITEOUT flag saved earlier.
 			 */
-			*iswht_ret = ncp->nc_whiteout;
+			*iswht_ret = nc->nc_whiteout;
 		} else {
-			KASSERT(!ncp->nc_whiteout);
+			KASSERT(!nc->nc_whiteout);
 		}
-		rw_exit(dlock);
+		rw_exit(&dnn->nn_lock);
 		return hit;
 	}
-	vp = ncp->nc_vp;
+	vp = nc->nc_vp;
 	mutex_enter(vp->v_interlock);
-	rw_exit(dlock);
+	rw_exit(&dnn->nn_lock);
 
 	/*
 	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
@@ -567,98 +587,129 @@ cache_lookup(struct vnode *dvp, const ch
 		 * XXX badhits?
 		 */
 		COUNT(ncs_falsehits);
-		/* found nothing */
 		return false;
 	}
 
 	COUNT(ncs_goodhits);
+	SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
 	/* found it */
 	*vn_ret = vp;
 	return true;
 }
 
 /*
- * Cut-'n-pasted version of the above without the nameiop argument.
- *
- * This could also be used for namei's LOOKUP case, i.e. when the directory
- * is not being modified.
+ * Version of the above without the nameiop argument, for NFS.
  */
 bool
 cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
 		 uint32_t cnflags,
 		 int *iswht_ret, struct vnode **vn_ret)
 {
-	struct namecache *ncp;
-	struct vnode *vp;
-	krwlock_t *dlock;
+
+	return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
+	    iswht_ret, vn_ret);
+}
+
+/*
+ * Used by namei() to walk down a path, component by component, by looking up
+ * names in the cache.  The node locks are chained along the way: a parent's
+ * lock is not dropped until the child's is acquired.
+ */
+bool
+cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
+		    struct vnode **vn_ret, krwlock_t **plock,
+		    kauth_cred_t cred)
+{
+	struct nchnode *dnn = VNODE_TO_VIMPL(dvp)->vi_ncache;
+	struct namecache *nc;
+	int64_t key;
 	int error;
 
 	/* Establish default results. */
-	if (iswht_ret != NULL) {
-		*iswht_ret = 0;
-	}
 	*vn_ret = NULL;
 
-	if (__predict_false(!doingcache)) {
-		/* found nothing */
+	/*
+	 * If disabled, or FS doesn't support us, bail out.  This is an
+	 * unlocked check of dnn->nn_mode, but an incorrect answer doesn't
+	 * have any ill consequences.
+	 */
+	if (__predict_false(!doingcache || dnn->nn_mode == VNOVAL)) {
 		return false;
 	}
 
-	/* If no directory lock yet, there's nothing in cache. */
-	dlock = VNODE_TO_VIMPL(dvp)->vi_ncdlock;
-	if (__predict_false(dlock == NULL)) {
+	if (__predict_false(namelen > USHRT_MAX)) {
+		COUNT(ncs_long);
 		return false;
 	}
 
-	rw_enter(dlock, RW_READER);
-	if (__predict_false(namelen > USHRT_MAX)) {
-		rw_exit(dlock);
-		/* found nothing */
-		COUNT(ncs_long);
+	/* Compute the key up front - don't need the lock. */
+	key = cache_key(name, namelen);
+
+	/*
+	 * Acquire the directory lock.  Once we have that, we can drop the
+	 * previous one (if any).
+	 *
+	 * The two lock holds mean that the directory can't go away while
+	 * here: the directory must be purged first, and both parent &
+	 * child's locks must be taken to do that.
+	 *
+	 * However if there's no previous lock, like at the root of the
+	 * chain, then "dvp" must be referenced to prevent it going away
+	 * before we get its lock.
+	 *
+	 * Note that the two locks can be the same if looking up a dot, for
+	 * example: /usr/bin/.
+	 */
+	if (*plock != &dnn->nn_lock) {
+		rw_enter(&dnn->nn_lock, RW_READER);
+		if (*plock != NULL) {
+			rw_exit(*plock);
+		}
+		*plock = &dnn->nn_lock;
+	} else {
+		KASSERT(dvp->v_usecount > 0);
+	}
+
+	/*
+	 * First up check if the user is allowed to look up files in this
+	 * directory.
+	 */
+	error = kauth_authorize_vnode(cred, KAUTH_ACCESS_ACTION(VEXEC,
+	    dvp->v_type, dnn->nn_mode & ALLPERMS), dvp, NULL,
+	    genfs_can_access(dvp->v_type, dnn->nn_mode & ALLPERMS,
+	    dnn->nn_uid, dnn->nn_gid, VEXEC, cred));
+	if (error != 0) {
+		COUNT(ncs_denied);
 		return false;
 	}
-	ncp = cache_lookup_entry(dvp, name, namelen);
-	if (__predict_false(ncp == NULL)) {
-		rw_exit(dlock);
-		/* found nothing */
+
+	/*
+	 * Now look for a matching cache entry.
+	 */
+	nc = cache_lookup_entry(dnn, name, namelen, key);
+	if (__predict_false(nc == NULL)) {
 		COUNT(ncs_miss);
+		SDT_PROBE(vfs, namecache, lookup, miss, dvp,
+		    name, namelen, 0, 0);
 		return false;
 	}
-	vp = ncp->nc_vp;
-	if (vp == NULL) {
-		/*
-		 * Restore the ISWHITEOUT flag saved earlier.
-		 */
-		if (iswht_ret != NULL) {
-			/*cnp->cn_flags |= ncp->nc_flags;*/
-			*iswht_ret = ncp->nc_whiteout;
-		}
-		rw_exit(dlock);
+	if (nc->nc_nn == NULL) {
 		/* found negative entry; vn is already null from above */
 		COUNT(ncs_neghits);
+		SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
 		return true;
 	}
-	mutex_enter(vp->v_interlock);
-	rw_exit(dlock);
+
+	COUNT(ncs_goodhits); /* XXX can be "badhits" */
+	SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
 
 	/*
-	 * Unlocked except for the vnode interlock.  Call vcache_tryvget().
+	 * Return with the directory lock still held.  It will either be
+	 * returned to us with another call to cache_lookup_linked() when
+	 * looking up the next component, or the caller will release it
+	 * manually when finished.
 	 */
-	error = vcache_tryvget(vp);
-	if (error) {
-		KASSERT(error == EBUSY);
-		/*
-		 * This vnode is being cleaned out.
-		 * XXX badhits?
-		 */
-		COUNT(ncs_falsehits);
-		/* found nothing */
-		return false;
-	}
-
-	COUNT(ncs_goodhits); /* XXX can be "badhits" */
-	/* found it */
-	*vn_ret = vp;
+	*vn_ret = nc->nc_vp;
 	return true;
 }
 
@@ -678,9 +729,9 @@ cache_lookup_raw(struct vnode *dvp, cons
 int
 cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
 {
-	struct namecache *ncp;
+	struct nchnode *nn = VNODE_TO_VIMPL(vp)->vi_ncache;
+	struct namecache *nc;
 	struct vnode *dvp;
-	krwlock_t *vlock;
 	char *bp;
 	int error, nlen;
 
@@ -689,26 +740,25 @@ cache_revlookup(struct vnode *vp, struct
 	if (!doingcache)
 		goto out;
 
-	vlock = VNODE_TO_VIMPL(vp)->vi_ncvlock;
-	rw_enter(vlock, RW_READER);
-	TAILQ_FOREACH(ncp, &VNODE_TO_VIMPL(vp)->vi_nclist, nc_vlist) {
-		KASSERT(ncp->nc_vp == vp);
-		KASSERT(ncp->nc_dvp != NULL);
-		nlen = ncp->nc_nlen;
+	rw_enter(&nn->nn_listlock, RW_READER);
+	TAILQ_FOREACH(nc, &nn->nn_list, nc_list) {
+		KASSERT(nc->nc_nn == nn);
+		KASSERT(nc->nc_dnn != NULL);
+		nlen = nc->nc_nlen;
 		/*
 		 * The queue is partially sorted.  Once we hit dots, nothing
 		 * else remains but dots and dotdots, so bail out.
 		 */
-		if (ncp->nc_name[0] == '.') {
+		if (nc->nc_name[0] == '.') {
 			if (nlen == 1 ||
-			    (nlen == 2 && ncp->nc_name[1] == '.')) {
+			    (nlen == 2 && nc->nc_name[1] == '.')) {
 			    	break;
 			}
 		}
 
-		/* Record a hit on the entry. */
-		if (ncp->nc_lrulist != LRU_ACTIVE) {
-			cache_activate(ncp);
+		/* Record a hit on the entry.  This is an unlocked read. */
+		if (nc->nc_lrulist != LRU_ACTIVE) {
+			cache_activate(nc);
 		}
 
 		if (bufp) {
@@ -716,18 +766,18 @@ cache_revlookup(struct vnode *vp, struct
 			bp -= nlen;
 			if (bp <= bufp) {
 				*dvpp = NULL;
-				rw_exit(vlock);
+				rw_exit(&nn->nn_listlock);
 				SDT_PROBE(vfs, namecache, revlookup,
 				    fail, vp, ERANGE, 0, 0, 0);
 				return (ERANGE);
 			}
-			memcpy(bp, ncp->nc_name, nlen);
+			memcpy(bp, nc->nc_name, nlen);
 			*bpp = bp;
 		}
 
-		dvp = ncp->nc_dvp;
+		dvp = nc->nc_dnn->nn_vp;
 		mutex_enter(dvp->v_interlock);
-		rw_exit(vlock);
+		rw_exit(&nn->nn_listlock);
 		error = vcache_tryvget(dvp);
 		if (error) {
 			KASSERT(error == EBUSY);
@@ -744,7 +794,7 @@ cache_revlookup(struct vnode *vp, struct
 		COUNT(ncs_revhits);
 		return (0);
 	}
-	rw_exit(vlock);
+	rw_exit(&nn->nn_listlock);
 	COUNT(ncs_revmiss);
  out:
 	*dvpp = NULL;
@@ -752,45 +802,16 @@ cache_revlookup(struct vnode *vp, struct
 }
 
 /*
- * Allocate and install a directory lock for a vnode.
- * XXX Be better to do this when setting vnode type to VDIR.
- * XXX In the future this could be an "nchdir" - to try other data structs.
- */
-static krwlock_t *
-cache_alloc_dirlock(struct vnode *dvp)
-{
-	vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
-	krwlock_t *dlock;
-
-	/*
-	 * File systems are not generally expected to make concurrent calls
-	 * to cache_enter(), but don't needlessly impose that limitation on
-	 * them.
-	 */
-	for (;;) {
-		dlock = dvi->vi_ncdlock;
-		if (dlock != NULL) {
-			return dlock;
-		}
-		dlock = rw_obj_alloc();
-		if (atomic_cas_ptr(&dvi->vi_ncdlock, NULL, dlock) == NULL) {
-			return dlock;
-		}
-		rw_obj_free(dlock);
-	}
-}
-
-/*
- * Add an entry to the cache
+ * Add an entry to the cache.
  */
 void
 cache_enter(struct vnode *dvp, struct vnode *vp,
 	    const char *name, size_t namelen, uint32_t cnflags)
 {
-	struct namecache *ncp;
-	struct namecache *oncp;
-	vnode_impl_t *vi, *dvi;
-	krwlock_t *dlock, *vlock;
+	struct nchnode *dnn = VNODE_TO_VIMPL(dvp)->vi_ncache;
+	struct nchnode *nn;
+	struct namecache *nc;
+	struct namecache *onc;
 	int total;
 
 	/* First, check whether we can/should add a cache entry. */
@@ -814,58 +835,58 @@ cache_enter(struct vnode *dvp, struct vn
 	}
 
 	/* Now allocate a fresh entry. */
-	if (namelen > NCHNAMLEN) {
-		size_t sz = offsetof(struct namecache, nc_name[namelen]);
-		ncp = kmem_alloc(sz, KM_SLEEP);
+	if (__predict_true(namelen <= NCHNAMLEN)) {
+		nc = pool_cache_get(cache_pool, PR_WAITOK);
 	} else {
-		ncp = pool_cache_get(cache_pool, PR_WAITOK);
+		size_t sz = offsetof(struct namecache, nc_name[namelen]);
+		nc = kmem_alloc(sz, KM_SLEEP);
 	}
 
-	/* If there is no directory lock yet, then allocate one. */
-	dvi = VNODE_TO_VIMPL(dvp);
-	dlock = dvi->vi_ncdlock;
-	if (__predict_false(dlock == NULL)) {
-		dlock = cache_alloc_dirlock(dvp);
-	}
+	/* Fill in cache info. */
+	nc->nc_dnn = dnn;
+	nc->nc_key = cache_key(name, namelen);
+	nc->nc_nlen = namelen;
+	memcpy(nc->nc_name, name, namelen);
 
 	/*
-	 * Concurrent lookups in the same directory may race for a cache
-	 * entry.  If there's a duplicated entry, free it.
+	 * Insert to the directory.  Concurrent lookups in the same
+	 * directory may race for a cache entry.  There can also be hash
+ * value collisions.  If there's an entry there already, free it.
 	 */
-	rw_enter(dlock, RW_WRITER);
-	oncp = cache_lookup_entry(dvp, name, namelen);
-	if (oncp) {
-		cache_remove(oncp, true);
+	rw_enter(&dnn->nn_lock, RW_WRITER);
+	onc = rb_tree_find_node(&dnn->nn_tree, &nc->nc_key);
+	if (onc) {
+		KASSERT(onc->nc_nlen == nc->nc_nlen);
+		if (memcmp(onc->nc_name, nc->nc_name, nc->nc_nlen) != 0) {
+			COUNT(ncs_collisions);
+		}
+		cache_remove(onc, true);
 	}
+	rb_tree_insert_node(&dnn->nn_tree, nc);
 
-	/* Fill in cache info and insert to directory. */
-	ncp->nc_dvp = dvp;
-	KASSERT(namelen <= USHRT_MAX);
-	ncp->nc_nlen = namelen;
-	memcpy(ncp->nc_name, name, (unsigned)ncp->nc_nlen);
-	rb_tree_insert_node(&dvi->vi_nctree, ncp);
-	ncp->nc_vp = vp;
-
-	/* Insert to the vnode. */
+	/* Then insert to the vnode. */
 	if (vp == NULL) {
 		/*
 		 * For negative hits, save the ISWHITEOUT flag so we can
 		 * restore it later when the cache entry is used again.
 		 */
-		ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
+		nc->nc_vp = NULL;
+		nc->nc_nn = NULL;
+		nc->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
 	} else {
-		ncp->nc_whiteout = false;
-		vi = VNODE_TO_VIMPL(vp);
-		vlock = vi->vi_ncvlock;
+		nn = VNODE_TO_VIMPL(vp)->vi_ncache;
 		/* Partially sort the per-vnode list: dots go to back. */
-		rw_enter(vlock, RW_WRITER);
+		rw_enter(&nn->nn_listlock, RW_WRITER);
 		if ((namelen == 1 && name[0] == '.') ||
 		    (namelen == 2 && name[0] == '.' && name[1] == '.')) {
-			TAILQ_INSERT_TAIL(&vi->vi_nclist, ncp, nc_vlist);
+			TAILQ_INSERT_TAIL(&nn->nn_list, nc, nc_list);
 		} else {
-			TAILQ_INSERT_HEAD(&vi->vi_nclist, ncp, nc_vlist);
+			TAILQ_INSERT_HEAD(&nn->nn_list, nc, nc_list);
 		}
-		rw_exit(vlock);
+		rw_exit(&nn->nn_listlock);
+		nc->nc_vp = vp;
+		nc->nc_nn = nn;
+		nc->nc_whiteout = false;
 	}
 
 	/*
@@ -874,31 +895,85 @@ cache_enter(struct vnode *dvp, struct vn
 	 * balance the lists.
 	 */
 	mutex_enter(&cache_lru_lock);
-	ncp->nc_lrulist = LRU_ACTIVE;
+	nc->nc_lrulist = LRU_ACTIVE;
 	cache_lru.count[LRU_ACTIVE]++;
-	TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
+	TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], nc, nc_lru);
 	cache_deactivate();
 	mutex_exit(&cache_lru_lock);
-	rw_exit(dlock);
+	rw_exit(&dnn->nn_lock);
+}
+
+/*
+ * Set the initial identity info in the cache for a vnode.  If a file
+ * system calls this, it means the FS supports in-cache lookup.  We only
+ * care about directories, so ignore other updates.
+ */
+void
+cache_set_id(struct vnode *dvp, mode_t mode, uid_t uid, gid_t gid)
+{
+	struct nchnode *nn = VNODE_TO_VIMPL(dvp)->vi_ncache;
+
+	if (dvp->v_type == VDIR) {
+		rw_enter(&nn->nn_lock, RW_WRITER);
+		KASSERT(nn->nn_mode == VNOVAL);
+		KASSERT(nn->nn_uid == VNOVAL);
+		KASSERT(nn->nn_gid == VNOVAL);
+		nn->nn_mode = mode;
+		nn->nn_uid = uid;
+		nn->nn_gid = gid;
+		rw_exit(&nn->nn_lock);
+	}
+}
+
+/*
+ * Update the identity info in the cache for a vnode.  If we didn't get ID
+ * info to begin with, then the file system doesn't support in-cache lookup,
+ * so ignore the update.  We only care about directories.
+ */
+void
+cache_update_id(struct vnode *dvp, mode_t mode, uid_t uid, gid_t gid)
+{
+	struct nchnode *nn = VNODE_TO_VIMPL(dvp)->vi_ncache;
+
+	if (dvp->v_type == VDIR) {
+		rw_enter(&nn->nn_lock, RW_WRITER);
+		if (nn->nn_mode != VNOVAL) {
+			nn->nn_mode = mode;
+			nn->nn_uid = uid;
+			nn->nn_gid = gid;
+		}
+		rw_exit(&nn->nn_lock);
+	}
 }
 
 /*
- * Name cache initialization, from vfs_init() when we are booting.
+ * Name cache initialization, from vfs_init() when the system is booting.
  */
 void
 nchinit(void)
 {
 
 	cache_pool = pool_cache_init(sizeof(struct namecache),
-	    coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, NULL,
+	    coherency_unit, 0, 0, "nchentry", NULL, IPL_NONE, NULL,
 	    NULL, NULL);
 	KASSERT(cache_pool != NULL);
 
+	cache_node_pool = pool_cache_init(sizeof(struct nchnode),
+	    coherency_unit, 0, 0, "nchnode", NULL, IPL_NONE, NULL,
+	    NULL, NULL);
+	KASSERT(cache_node_pool != NULL);
+
 	mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
 	TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
 	TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);
 
-	sysctl_cache_stat_setup();
+	KASSERT(cache_sysctllog == NULL);
+	sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT,
+		       CTLTYPE_STRUCT, "namecache_stats",
+		       SYSCTL_DESCR("namecache statistics"),
+		       cache_stat_sysctl, 0, NULL, 0,
+		       CTL_VFS, CTL_CREATE, CTL_EOL);
 }
 
 /*
@@ -922,12 +997,19 @@ cache_cpu_init(struct cpu_info *ci)
 void
 cache_vnode_init(struct vnode *vp)
 {
-	vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
+	struct nchnode *nn;
+
+	nn = pool_cache_get(cache_node_pool, PR_WAITOK);
+	rw_init(&nn->nn_lock);
+	rw_init(&nn->nn_listlock);
+	rb_tree_init(&nn->nn_tree, &cache_rbtree_ops);
+	TAILQ_INIT(&nn->nn_list);
+	nn->nn_mode = VNOVAL;
+	nn->nn_uid = VNOVAL;
+	nn->nn_gid = VNOVAL;
+	nn->nn_vp = vp;
 
-	vi->vi_ncdlock = NULL;
-	vi->vi_ncvlock = rw_obj_alloc();
-	rb_tree_init(&vi->vi_nctree, &cache_rbtree_ops);
-	TAILQ_INIT(&vi->vi_nclist);
+	VNODE_TO_VIMPL(vp)->vi_ncache = nn;
 }
 
 /*
@@ -936,133 +1018,219 @@ cache_vnode_init(struct vnode *vp)
 void
 cache_vnode_fini(struct vnode *vp)
 {
-	vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
+	struct nchnode *nn = VNODE_TO_VIMPL(vp)->vi_ncache;
 
-	KASSERT(RB_TREE_MIN(&vi->vi_nctree) == NULL);
-	KASSERT(TAILQ_EMPTY(&vi->vi_nclist));
-	rw_obj_free(vi->vi_ncvlock);
-	if (vi->vi_ncdlock != NULL) {
-		rw_obj_free(vi->vi_ncdlock);
+	KASSERT(nn != NULL);
+	KASSERT(nn->nn_vp == vp);
+	KASSERT(RB_TREE_MIN(&nn->nn_tree) == NULL);
+	KASSERT(TAILQ_EMPTY(&nn->nn_list));
+
+	rw_destroy(&nn->nn_lock);
+	rw_destroy(&nn->nn_listlock);
+	pool_cache_put(cache_node_pool, nn);
+}
+
+/*
+ * Helper for cache_purge1(): purge cache entries for the given vnode from
+ * all directories that the vnode is cached in.
+ */
+static void
+cache_purge_parents(struct vnode *vp)
+{
+	struct nchnode *nn = VNODE_TO_VIMPL(vp)->vi_ncache;
+	struct nchnode *dnn, *blocked;
+	struct namecache *nc;
+	struct vnode *dvp;
+
+	SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
+
+	blocked = NULL;
+
+	rw_enter(&nn->nn_listlock, RW_WRITER);
+	while ((nc = TAILQ_FIRST(&nn->nn_list)) != NULL) {
+		/*
+		 * Locking in the wrong direction.  Try for a hold on the
+		 * directory node's lock, and if we get it then all good,
+		 * nuke the entry and move on to the next.
+		 */
+		dnn = nc->nc_dnn;
+		if (rw_tryenter(&dnn->nn_lock, RW_WRITER)) {
+			cache_remove(nc, false);
+			rw_exit(&dnn->nn_lock);
+			blocked = NULL;
+			continue;
+		}
+
+		/*
+		 * We can't wait on the directory node's lock with our list
+		 * lock held or the system could deadlock.
+		 *
+		 * Take a hold on the directory vnode to prevent it from
+		 * being freed (taking the nchnode & lock with it).  Then
+		 * wait for the lock to become available with no other locks
+		 * held, and retry.
+		 *
+		 * If this happens twice in a row, give the other side a
+		 * breather; we can do nothing until it lets go.
+		 */
+		dvp = dnn->nn_vp;
+		vhold(dvp);
+		rw_exit(&nn->nn_listlock);
+		rw_enter(&dnn->nn_lock, RW_WRITER);
+		/* Do nothing. */
+		rw_exit(&dnn->nn_lock);
+		holdrele(dvp);
+		if (blocked == dnn) {
+			kpause("ncpurge", false, 1, NULL);
+		}
+		rw_enter(&nn->nn_listlock, RW_WRITER);
+		blocked = dnn;
 	}
+	rw_exit(&nn->nn_listlock);
+}
+
+/*
+ * Helper for cache_purge1(): purge all cache entries hanging off the given
+ * directory vnode.
+ */
+static void
+cache_purge_children(struct vnode *dvp)
+{
+	struct nchnode *dnn = VNODE_TO_VIMPL(dvp)->vi_ncache;
+	struct namecache *nc;
+
+	SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);
+
+	rw_enter(&dnn->nn_lock, RW_WRITER);
+	while ((nc = rb_tree_iterate(&dnn->nn_tree, NULL, RB_DIR_RIGHT))
+	    != NULL) {
+		cache_remove(nc, true);
+	}
+	rw_exit(&dnn->nn_lock);
+}
+
+/*
+ * Helper for cache_purge1(): purge cache entry from the given vnode,
+ * finding it by name.
+ */
+static void
+cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
+{
+	struct nchnode *dnn = VNODE_TO_VIMPL(dvp)->vi_ncache;
+	struct namecache *nc;
+	int64_t key;
+
+	SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
+
+	key = cache_key(name, namelen);
+	rw_enter(&dnn->nn_lock, RW_WRITER);
+	nc = cache_lookup_entry(dnn, name, namelen, key);
+	if (nc) {
+		cache_remove(nc, true);
+	}
+	rw_exit(&dnn->nn_lock);
 }
 
 /*
  * Cache flush, a particular vnode; called when a vnode is renamed to
- * hide entries that would now be invalid
+ * hide entries that would now be invalid.
  */
 void
 cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
 {
-	struct namecache *ncp;
-	krwlock_t *dlock, *vlock, *blocked;
-	rb_tree_t *tree;
 
 	if (flags & PURGE_PARENTS) {
-		SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
-		blocked = NULL;
-		vlock = VNODE_TO_VIMPL(vp)->vi_ncvlock;
-		rw_enter(vlock, RW_WRITER);
-		while ((ncp = TAILQ_FIRST(&VNODE_TO_VIMPL(vp)->vi_nclist))
-		    != NULL) {
-			dlock = VNODE_TO_VIMPL(ncp->nc_dvp)->vi_ncdlock;
-			KASSERT(dlock != NULL);
-			/*
-			 * We can't wait on the directory lock with the 
-			 * list lock held or the system could
-			 * deadlock.  In the unlikely event that the
-			 * directory lock is unavailable, take a hold on it
-			 * and wait for it to become available with no other
-			 * locks held.  Then retry.  If this happens twice
-			 * in a row, we'll give the other side a breather to
-			 * prevent livelock.
-			 */
-			if (!rw_tryenter(dlock, RW_WRITER)) {
-				rw_obj_hold(dlock);
-				rw_exit(vlock);
-				rw_enter(dlock, RW_WRITER);
-				/* Do nothing. */
-				rw_exit(dlock);
-				rw_obj_free(dlock);
-				if (blocked == dlock) {
-					kpause("livelock", false, 1, NULL);
-				}
-				rw_enter(dlock, RW_WRITER);
-				blocked = dlock;
-			} else {
-				cache_remove(ncp, false);
-				rw_exit(dlock);
-				blocked = NULL;
-			}
-		}
-		rw_exit(vlock);
+		cache_purge_parents(vp);
 	}
 	if (flags & PURGE_CHILDREN) {
-		SDT_PROBE(vfs, namecache, purge, children, vp, 0, 0, 0, 0);
-		dlock = VNODE_TO_VIMPL(vp)->vi_ncdlock;
-		if (dlock != NULL) {
-			rw_enter(dlock, RW_WRITER);
-			tree = &VNODE_TO_VIMPL(vp)->vi_nctree;
-			while ((ncp = rb_tree_iterate(tree, NULL,
-			    RB_DIR_RIGHT)) != NULL) {
-				cache_remove(ncp, true);
-			}
-			rw_exit(dlock);
-		}
+		cache_purge_children(vp);
 	}
 	if (name != NULL) {
-		SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
-		dlock = VNODE_TO_VIMPL(vp)->vi_ncdlock;
-		if (dlock != NULL) {
-			rw_enter(dlock, RW_WRITER);
-			ncp = cache_lookup_entry(vp, name, namelen);
-			if (ncp) {
-				cache_remove(ncp, true);
-			}
-			rw_exit(dlock);
-		}
+		cache_purge_name(vp, name, namelen);
 	}
 }
 
 /*
  * Cache flush, a whole filesystem; called when filesys is umounted to
  * remove entries that would now be invalid.
+ *
+ * In the BSD implementation this traversed the LRU list.  To avoid locking
+ * difficulties, and avoid scanning a ton of namecache entries that we have
+ * no interest in, scan the mount's vnode list instead.
  */
 void
 cache_purgevfs(struct mount *mp)
 {
 	struct vnode_iterator *marker;
-	struct namecache *ncp;
-	vnode_impl_t *vi;
-	krwlock_t *dlock;
 	vnode_t *vp;
 
-	/*
-	 * In the original BSD implementation this used to traverse the LRU
-	 * list.  To avoid locking difficulties, and avoid scanning a ton of
-	 * namecache entries that we have no interest in, scan the mount
-	 * list instead.
-	 */
 	vfs_vnode_iterator_init(mp, &marker);
 	while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
-		vi = VNODE_TO_VIMPL(vp);
-		if (vp->v_type != VDIR) {
-			KASSERT(RB_TREE_MIN(&vi->vi_nctree) == NULL);
-			continue;
-		}
-		if ((dlock = vi->vi_ncdlock) == NULL) {
-			continue;
+		cache_purge_children(vp);
+		if ((vp->v_vflag & VV_ROOT) != 0) {
+			cache_purge_parents(vp);
 		}
-		rw_enter(dlock, RW_WRITER);
-		while ((ncp = rb_tree_iterate(&vi->vi_nctree, NULL,
-		    RB_DIR_RIGHT)) != NULL) {
-			cache_remove(ncp, true);
-		}
-		rw_exit(dlock);
+		vrele(vp);
 	}
 	vfs_vnode_iterator_destroy(marker);
 }
 
 /*
+ * Re-queue an entry onto the correct LRU list, after it has scored a hit.
+ */
+static void
+cache_activate(struct namecache *nc)
+{
+
+	mutex_enter(&cache_lru_lock);
+	/* Put on tail of ACTIVE list, since it just scored a hit. */
+	TAILQ_REMOVE(&cache_lru.list[nc->nc_lrulist], nc, nc_lru);
+	TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], nc, nc_lru);
+	cache_lru.count[nc->nc_lrulist]--;
+	cache_lru.count[LRU_ACTIVE]++;
+	nc->nc_lrulist = LRU_ACTIVE;
+	mutex_exit(&cache_lru_lock);
+}
+
+/*
+ * Try to balance the LRU lists.  Pick some victim entries, and re-queue
+ * them from the head of the ACTIVE list to the tail of the INACTIVE list. 
+ */
+static void
+cache_deactivate(void)
+{
+	struct namecache *nc;
+	int total, i;
+
+	KASSERT(mutex_owned(&cache_lru_lock));
+
+	/* If we're nowhere near budget yet, don't bother. */
+	total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
+	if (total < (desiredvnodes >> 1)) {
+	    	return;
+	}
+
+	/* If not out of balance, don't bother. */
+	if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
+		return;
+	}
+
+	/* Move victim from head of ACTIVE list, to tail of INACTIVE. */
+	for (i = 0; i < cache_lru_maxdeact; i++) {
+		nc = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
+		if (nc == NULL) {
+			break;
+		}
+		KASSERT(nc->nc_lrulist == LRU_ACTIVE);
+		nc->nc_lrulist = LRU_INACTIVE;
+		TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], nc, nc_lru);
+		TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], nc, nc_lru);
+		cache_lru.count[LRU_ACTIVE]--;
+		cache_lru.count[LRU_INACTIVE]++;
+	}
+}
+
+/*
  * Free some entries from the cache, when we have gone over budget.
  *
  * We don't want to cause too much work for any individual caller, and it
@@ -1074,8 +1242,8 @@ cache_purgevfs(struct mount *mp)
 static void
 cache_reclaim(void)
 {
-	struct namecache *ncp;
-	krwlock_t *dlock;
+	struct namecache *nc;
+	struct nchnode *nn;
 	int toscan, total;
 
 	/* Scan up to a preset maximum number of entries. */
@@ -1088,13 +1256,13 @@ cache_reclaim(void)
 		cache_deactivate();
 
 		/* Now look for a victim on head of inactive list (old). */
-		ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
-		if (ncp == NULL) {
+		nc = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
+		if (nc == NULL) {
 			break;
 		}
-		dlock = VNODE_TO_VIMPL(ncp->nc_dvp)->vi_ncdlock;
-		KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
-		KASSERT(dlock != NULL);
+		nn = nc->nc_nn;
+		KASSERT(nc->nc_lrulist == LRU_INACTIVE);
+		KASSERT(nn != NULL);
 
 		/*
 		 * Locking in the wrong direction.  If we can't get the
@@ -1102,62 +1270,33 @@ cache_reclaim(void)
 		 * cause problems for the next guy in here, so send the
 		 * entry to the back of the list.
 		 */
-		if (!rw_tryenter(dlock, RW_WRITER)) {
+		if (!rw_tryenter(&nn->nn_lock, RW_WRITER)) {
 			TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
-			    ncp, nc_lru);
+			    nc, nc_lru);
 			TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
-			    ncp, nc_lru);
+			    nc, nc_lru);
 			continue;
 		}
 
 		/*
 		 * Now have the victim entry locked.  Drop the LRU list
 		 * lock, purge the entry, and start over.  The hold on the
-		 * directory lock will prevent the directory lock itself
-		 * from vanishing until finished (the directory owner
-		 * will call cache_purge() on dvp before it disappears).
+		 * directory lock will prevent the vnode from vanishing
+		 * until finished (the vnode owner will call cache_purge()
+		 * on dvp before it disappears).
 		 */
 		mutex_exit(&cache_lru_lock);
-		cache_remove(ncp, true);
-		rw_exit(dlock);
+		cache_remove(nc, true);
+		rw_exit(&nn->nn_lock);
 		mutex_enter(&cache_lru_lock);
 	}
 	mutex_exit(&cache_lru_lock);
 }
 
-#ifdef DDB
-void
-namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
-{
-	struct vnode *dvp = NULL;
-	struct namecache *ncp;
-	enum cache_lru_id id;
-
-	for (id = 0; id < LRU_COUNT; id++) {
-		TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
-			if (ncp->nc_vp == vp) {
-				(*pr)("name %.*s\n", ncp->nc_nlen,
-				    ncp->nc_name);
-				dvp = ncp->nc_dvp;
-			}
-		}
-	}
-	if (dvp == NULL) {
-		(*pr)("name not found\n");
-		return;
-	}
-	vp = dvp;
-	for (id = 0; id < LRU_COUNT; id++) {
-		TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
-			if (ncp->nc_vp == vp) {
-				(*pr)("parent %.*s\n", ncp->nc_nlen,
-				    ncp->nc_name);
-			}
-		}
-	}
-}
-#endif
-
+/*
+ * For file system code: count a lookup that required a full re-scan of
+ * directory metadata.
+ */
 void
 namecache_count_pass2(void)
 {
@@ -1165,6 +1304,10 @@ namecache_count_pass2(void)
 	COUNT(ncs_pass2);
 }
 
+/*
+ * For file system code: count a lookup that scored a hit in the directory
+ * metadata near the location of the last lookup.
+ */
 void
 namecache_count_2passes(void)
 {
@@ -1180,55 +1323,80 @@ namecache_count_2passes(void)
 static int
 cache_stat_sysctl(SYSCTLFN_ARGS)
 {
-	struct nchstats stats;
 	CPU_INFO_ITERATOR cii;
 	struct cpu_info *ci;
 
 	if (oldp == NULL) {
-		*oldlenp = sizeof(stats);
+		*oldlenp = sizeof(nchstats);
 		return 0;
 	}
 
-	if (*oldlenp < sizeof(stats)) {
+	if (*oldlenp <= 0) {
 		*oldlenp = 0;
 		return 0;
 	}
 
 	sysctl_unlock();
-	memset(&stats, 0, sizeof(stats));
+	mutex_enter(&cache_lru_lock);
+	memset(&nchstats, 0, sizeof(nchstats));
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		struct nchstats_percpu *np = ci->ci_data.cpu_nch;
 
-		stats.ncs_goodhits += np->ncs_goodhits;
-		stats.ncs_neghits += np->ncs_neghits;
-		stats.ncs_badhits += np->ncs_badhits;
-		stats.ncs_falsehits += np->ncs_falsehits;
-		stats.ncs_miss += np->ncs_miss;
-		stats.ncs_long += np->ncs_long;
-		stats.ncs_pass2 += np->ncs_pass2;
-		stats.ncs_2passes += np->ncs_2passes;
-		stats.ncs_revhits += np->ncs_revhits;
-		stats.ncs_revmiss += np->ncs_revmiss;
+		nchstats.ncs_goodhits += np->ncs_goodhits;
+		nchstats.ncs_neghits += np->ncs_neghits;
+		nchstats.ncs_badhits += np->ncs_badhits;
+		nchstats.ncs_falsehits += np->ncs_falsehits;
+		nchstats.ncs_miss += np->ncs_miss;
+		nchstats.ncs_long += np->ncs_long;
+		nchstats.ncs_pass2 += np->ncs_pass2;
+		nchstats.ncs_2passes += np->ncs_2passes;
+		nchstats.ncs_revhits += np->ncs_revhits;
+		nchstats.ncs_revmiss += np->ncs_revmiss;
+		nchstats.ncs_collisions += np->ncs_collisions;
+		nchstats.ncs_denied += np->ncs_denied;
 	}
-	/* Serialize the update to nchstats, just because. */
-	mutex_enter(&cache_lru_lock);
-	nchstats = stats;
+	nchstats.ncs_active = cache_lru.count[LRU_ACTIVE];
+	nchstats.ncs_inactive = cache_lru.count[LRU_INACTIVE];
 	mutex_exit(&cache_lru_lock);
 	sysctl_relock();
 
-	*oldlenp = sizeof(stats);
-	return sysctl_copyout(l, &stats, oldp, sizeof(stats));
+	*oldlenp = MIN(sizeof(nchstats), *oldlenp);
+	return sysctl_copyout(l, &nchstats, oldp, *oldlenp);
 }
 
-static void
-sysctl_cache_stat_setup(void)
+/*
+ * For the debugger, given the address of a vnode, print all associated
+ * names in the cache.
+ */
+#ifdef DDB
+void
+namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
 {
+	struct vnode *dvp = NULL;
+	struct namecache *nc;
+	struct nchnode *dnn;
+	enum cache_lru_id id;
 
-	KASSERT(sysctllog == NULL);
-	sysctl_createv(&sysctllog, 0, NULL, NULL,
-		       CTLFLAG_PERMANENT,
-		       CTLTYPE_STRUCT, "namecache_stats",
-		       SYSCTL_DESCR("namecache statistics"),
-		       cache_stat_sysctl, 0, NULL, 0,
-		       CTL_VFS, CTL_CREATE, CTL_EOL);
+	for (id = 0; id < LRU_COUNT; id++) {
+		TAILQ_FOREACH(nc, &cache_lru.list[id], nc_lru) {
+			if (nc->nc_vp == vp) {
+				(*pr)("name %.*s\n", nc->nc_nlen,
+				    nc->nc_name);
+				dnn = nc->nc_dnn;
+			}
+		}
+	}
+	if (dvp == NULL) {
+		(*pr)("name not found\n");
+		return;
+	}
+	for (id = 0; id < LRU_COUNT; id++) {
+		TAILQ_FOREACH(nc, &cache_lru.list[id], nc_lru) {
+			if (nc->nc_nn == dnn) {
+				(*pr)("parent %.*s\n", nc->nc_nlen,
+				    nc->nc_name);
+			}
+		}
+	}
 }
+#endif

Index: src/sys/kern/vfs_lookup.c
diff -u src/sys/kern/vfs_lookup.c:1.212.4.2 src/sys/kern/vfs_lookup.c:1.212.4.3
--- src/sys/kern/vfs_lookup.c:1.212.4.2	Fri Jan 17 21:47:35 2020
+++ src/sys/kern/vfs_lookup.c	Fri Jan 17 22:26:25 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_lookup.c,v 1.212.4.2 2020/01/17 21:47:35 ad Exp $	*/
+/*	$NetBSD: vfs_lookup.c,v 1.212.4.3 2020/01/17 22:26:25 ad Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.212.4.2 2020/01/17 21:47:35 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.212.4.3 2020/01/17 22:26:25 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_magiclinks.h"
@@ -50,6 +50,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c
 #include <sys/time.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
+#include <sys/vnode_impl.h>
 #include <sys/mount.h>
 #include <sys/errno.h>
 #include <sys/filedesc.h>
@@ -857,7 +858,7 @@ lookup_parsepath(struct namei_state *sta
 	 * responsibility for freeing the pathname buffer.
 	 *
 	 * At this point, our only vnode state is that the search dir
-	 * is held and locked.
+	 * is held.
 	 */
 	cnp->cn_consume = 0;
 	cnp->cn_namelen = namei_getcomponent(cnp->cn_nameptr);
@@ -914,6 +915,91 @@ lookup_parsepath(struct namei_state *sta
 }
 
 /*
+ * Take care of crossing a mounted-on vnode.  On error, foundobj_ret will be
+ * vrele'd, but searchdir is left alone.
+ */
+static int
+lookup_crossmount(struct namei_state *state,
+		  struct vnode **searchdir_ret,
+		  struct vnode **foundobj_ret,
+		  bool searchdir_locked)
+{
+	struct componentname *cnp = state->cnp;
+	struct vnode *foundobj;
+	struct vnode *searchdir;
+	struct mount *mp;
+	int error;
+
+	searchdir = *searchdir_ret;
+	foundobj = *foundobj_ret;
+
+	KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0);
+	KASSERT(searchdir != NULL);
+
+	/*
+	 * Check to see if the vnode has been mounted on;
+	 * if so find the root of the mounted file system.
+	 */
+	error = vn_lock(foundobj, LK_SHARED);
+	if (error != 0) {
+		vrele(foundobj);
+		*foundobj_ret = NULL;
+		return error;
+	}
+	while (foundobj->v_type == VDIR &&
+	    (mp = foundobj->v_mountedhere) != NULL &&
+	    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+		KASSERTMSG(searchdir != foundobj, "same vn %p", searchdir);
+
+		error = vfs_busy(mp);
+		if (error != 0) {
+			vput(foundobj);
+			*foundobj_ret = NULL;
+			return error;
+		}
+		if (searchdir_locked) {
+			VOP_UNLOCK(searchdir);
+		}
+		vput(foundobj);
+		error = VFS_ROOT(mp, LK_SHARED, &foundobj);
+		vfs_unbusy(mp);
+		if (error) {
+			if (searchdir_locked) {
+				vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
+			}
+			*foundobj_ret = NULL;
+			return error;
+		}
+		/*
+		 * Avoid locking vnodes from two filesystems because
+		 * it's prone to deadlock, e.g. when using puffs.
+		 * Also, it isn't a good idea to propagate slowness of
+		 * a filesystem up to the root directory. For now,
+		 * only handle the common case, where foundobj is
+		 * VDIR.
+		 *
+		 * In this case set searchdir to null to avoid using
+		 * it again. It is not correct to set searchdir ==
+		 * foundobj here as that will confuse the caller.
+		 * (See PR 40740.)
+		 */
+		if (searchdir == NULL) {
+			/* already been here once; do nothing further */
+		} else if (foundobj->v_type == VDIR) {
+			vrele(searchdir);
+			*searchdir_ret = searchdir = NULL;
+			*foundobj_ret = foundobj;
+		} else if (searchdir_locked) {
+			VOP_UNLOCK(foundobj);
+			vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
+			vn_lock(foundobj, LK_SHARED | LK_RETRY);
+		}
+	}
+	VOP_UNLOCK(foundobj);
+	return 0;
+}
+
+/*
  * Call VOP_LOOKUP for a single lookup; return a new search directory
  * (used when crossing mountpoints up or searching union mounts down) and 
  * the found object, which for create operations may be NULL on success.
@@ -934,7 +1020,6 @@ lookup_once(struct namei_state *state,
 {
 	struct vnode *tmpvn;		/* scratch vnode */
 	struct vnode *foundobj;		/* result */
-	struct mount *mp;		/* mount table entry */
 	struct lwp *l = curlwp;
 	bool searchdir_locked = false;
 	int error;
@@ -1083,84 +1168,6 @@ unionlookup:
 			cnp->cn_flags |= ISLASTCN;
 	}
 
-	/*
-	 * Do an unlocked check to see if the vnode has been mounted on; if
-	 * so find the root of the mounted file system.
-	 */
-	KASSERT(searchdir != NULL);
-	if (foundobj->v_type == VDIR && foundobj->v_mountedhere != NULL &&
-	    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
-		/*
-		 * "searchdir" is locked and held, "foundobj" is held,
-		 * they may be the same vnode.
-		 */
-		if (searchdir != foundobj) {
-			if (vn_lock(foundobj, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
-				if (cnp->cn_flags & ISDOTDOT)
-					VOP_UNLOCK(searchdir);
-				error = vn_lock(foundobj, LK_EXCLUSIVE);
-				if (cnp->cn_flags & ISDOTDOT)
-					vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
-				if (error != 0) {
-					vrele(foundobj);
-					goto done;
-				}
-			}
-		}
-		while (foundobj->v_type == VDIR &&
-		       (mp = foundobj->v_mountedhere) != NULL &&
-		       (cnp->cn_flags & NOCROSSMOUNT) == 0) {
-
-			KASSERT(searchdir != foundobj);
-
-			error = vfs_busy(mp);
-			if (error != 0) {
-				vput(foundobj);
-				goto done;
-			}
-			if (searchdir != NULL) {
-				VOP_UNLOCK(searchdir);
-				searchdir_locked = false;
-			}
-			vput(foundobj);
-			error = VFS_ROOT(mp, LK_EXCLUSIVE, &foundobj);
-			vfs_unbusy(mp);
-			if (error) {
-				if (searchdir != NULL) {
-					vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
-					searchdir_locked = true;
-				}
-				goto done;
-			}
-			/*
-			 * Avoid locking vnodes from two filesystems because
-			 * it's prone to deadlock, e.g. when using puffs.
-			 * Also, it isn't a good idea to propagate slowness of
-			 * a filesystem up to the root directory. For now,
-			 * only handle the common case, where foundobj is
-			 * VDIR.
-			 *
-			 * In this case set searchdir to null to avoid using
-			 * it again. It is not correct to set searchdir ==
-			 * foundobj here as that will confuse the caller.
-			 * (See PR 40740.)
-			 */
-			if (searchdir == NULL) {
-				/* already been here once; do nothing further */
-			} else if (foundobj->v_type == VDIR) {
-				vrele(searchdir);
-				*newsearchdir_ret = searchdir = NULL;
-			} else {
-				VOP_UNLOCK(foundobj);
-				vn_lock(searchdir, LK_EXCLUSIVE | LK_RETRY);
-				vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
-				searchdir_locked = true;
-			}
-		}
-		KASSERT(searchdir != foundobj);
-		VOP_UNLOCK(foundobj);
-	}
-
 	/* Unlock, unless the caller needs the parent locked. */
 	if (searchdir != NULL) {
 		KASSERT(searchdir_locked);
@@ -1180,6 +1187,160 @@ done:
 	return error;
 }
 
+/*
+ * Parse out the first path name component that we need to consider.
+ *
+ * While doing this, attempt to use the name cache to fastforward through as
+ * many "easy" to find components of the path as possible.
+ *
+ * We use the namecache's node locks to form a chain, and avoid as many
+ * vnode references and locks as possible.  In the ideal case, only the
+ * final vnode has its reference count adjusted and its lock taken.
+ */
+static int
+lookup_fastforward(struct namei_state *state, struct vnode **searchdir,
+		   struct vnode **foundobj)
+{
+	struct componentname *cnp = state->cnp;
+	struct nameidata *ndp = state->ndp;
+	krwlock_t *plock;
+	struct vnode *vp, *origsearchdir;
+	int error, error2;
+	size_t oldpathlen;
+	const char *oldnameptr;
+
+	/*
+	 * Eat as many path name components as possible before giving up and
+	 * letting lookup_once() handle it.  Remember the starting point in
+	 * case we can't get vnode references and need to roll back.
+	 */
+	plock = NULL;
+	origsearchdir = *searchdir;
+	oldnameptr = cnp->cn_nameptr;
+	oldpathlen = ndp->ni_pathlen;
+	for (;;) {
+		/*
+		 * Get the next component name.  There should be no slashes
+		 * here, and we shouldn't have looped around if we were
+		 * done.
+		 */
+		KASSERT(cnp->cn_nameptr[0] != '/');
+		KASSERT(cnp->cn_nameptr[0] != '\0');
+		if ((error = lookup_parsepath(state)) != 0) {
+			break;
+		}
+
+		/*
+		 * Can't deal with dotdot lookups, because it means lock
+		 * order reversal, and there are checks in lookup_once()
+		 * that need to be made.  Also check for missing mountpoints
+		 * (XXX racy).
+		 */
+		if ((cnp->cn_flags & ISDOTDOT) != 0 ||
+		    (*searchdir)->v_mount == NULL) {
+			error = EOPNOTSUPP;
+			break;
+		}
+
+		/*
+		 * Can't deal with last component when modifying; this needs
+		 * the directory vnode locked and VOP_LOOKUP() called (which
+		 * can and does modify state, despite the name).
+		 */
+		if ((cnp->cn_flags & ISLASTCN) != 0) {
+			if (cnp->cn_nameiop != LOOKUP ||
+			    (cnp->cn_flags & LOCKPARENT) != 0) {
+				error = EOPNOTSUPP;
+				break;
+			}
+		}
+
+		/*
+		 * Good, now look for it in cache.  cache_lookup_linked()
+		 * will fail if there's nothing there, or if there's not
+		 * ownership info for the directory, or if the user doesn't
+		 * have permission to look up files in this directory.
+		 */
+		if (!cache_lookup_linked(*searchdir, cnp->cn_nameptr,
+		    cnp->cn_namelen, &vp, &plock, cnp->cn_cred)) {
+			error = EOPNOTSUPP;
+			break;
+		}
+		KASSERT(plock != NULL && rw_lock_held(plock));
+
+		/* Scored a hit.  Negative is good too (ENOENT). */
+		if (vp == NULL) {
+			*foundobj = vp;
+			error = ENOENT;
+			break;
+		}
+
+		/* Stop if we've reached the last component: get vnode. */
+		if (cnp->cn_flags & ISLASTCN) {
+			mutex_enter(vp->v_interlock);
+			error = vcache_tryvget(vp);
+			/* v_interlock now released */
+			if (error == 0) {
+				*foundobj = vp;
+			}
+			vp = NULL;
+			break;
+		}
+
+		/*
+		 * Not the last component.  If we found something other than
+		 * a directory, or it's a directory with a filesystem
+		 * mounted on it, bail out.
+		 */
+		if (vp->v_type != VDIR || vp->v_mountedhere != NULL) {
+			error = EOPNOTSUPP;
+			vp = NULL;
+			break;
+		}
+
+		/*
+		 * Otherwise, we're still in business.  Set the found VDIR
+		 * vnode as the search dir for the next component and
+		 * continue on to it.
+		 */
+		cnp->cn_nameptr = ndp->ni_next;
+		*searchdir = vp;
+	}
+
+	/*
+	 * If we ended up with a new search dir, ref it before dropping the
+	 * namecache's lock.  The lock prevents both searchdir and foundobj
+	 * from disappearing.  If we can't ref the new searchdir, we have a
+	 * bit of a problem.  Roll back the fastforward to the beginning and
+	 * let lookup_once() take care of it.
+	 */
+	if (*searchdir != origsearchdir) {
+		mutex_enter((*searchdir)->v_interlock);
+		error2 = vcache_tryvget(*searchdir);
+		/* v_interlock now released */
+		if (plock != NULL) {
+			rw_exit(plock);
+		}
+		if (__predict_true(error2 == 0)) {
+			vrele(origsearchdir);
+		} else {
+			if (error == 0) {
+				vrele_async(*foundobj);
+				*foundobj = NULL;
+			}
+			*searchdir = origsearchdir;
+			cnp->cn_nameptr = oldnameptr;
+			ndp->ni_pathlen = oldpathlen;
+			error = lookup_parsepath(state);
+		}
+	} else if (plock != NULL) {
+		/* Drop any namecache lock still held. */
+		rw_exit(plock);
+	}
+
+	return error;
+}
+
 //////////////////////////////
 
 /*
@@ -1237,39 +1398,36 @@ namei_oneroot(struct namei_state *state,
 		KASSERT(!searchdir_locked);
 
 		/*
-		 * If the directory we're on is unmounted, bail out.
-		 * XXX: should this also check if it's unlinked?
-		 * XXX: yes it should... but how?
+		 * Parse out the first path name component that we need
+		 * to consider.  While doing this, attempt to use the name
+		 * cache to fastforward through as many "easy" to find
+		 * components of the path as possible.
 		 */
-		if (searchdir->v_mount == NULL) {
-			vrele(searchdir);
-			ndp->ni_dvp = NULL;
-			ndp->ni_vp = NULL;
-			return (ENOENT);
-		}
+		error = lookup_fastforward(state, &searchdir, &foundobj);
 
 		/*
-		 * Look up the next path component.
-		 * (currently, this may consume more than one)
+		 * If we didn't get a good answer from the namecache, then
+		 * go directly to the file system.
 		 */
+		if (error != 0 && error != ENOENT) {
+			error = lookup_once(state, searchdir, &searchdir,
+			    &foundobj, &searchdir_locked);
+		}
 
-		/* There should be no slashes here. */
-		KASSERT(cnp->cn_nameptr[0] != '/');
-
-		/* and we shouldn't have looped around if we were done */
-		KASSERT(cnp->cn_nameptr[0] != '\0');
-
-		error = lookup_parsepath(state);
-		if (error) {
-			vrele(searchdir);
-			ndp->ni_dvp = NULL;
-			ndp->ni_vp = NULL;
-			state->attempt_retry = 1;
-			return (error);
+		/*
+		 * If the vnode we found is mounted on, then cross the mount
+		 * and get the root vnode in foundobj.  If this encounters
+		 * an error, it will dispose of foundobj, but searchdir is
+		 * untouched.
+		 */
+		if (error == 0 && foundobj != NULL &&
+		    foundobj->v_type == VDIR &&
+		    foundobj->v_mountedhere != NULL &&
+		    (cnp->cn_flags & NOCROSSMOUNT) == 0) {
+			error = lookup_crossmount(state, &searchdir,
+			    &foundobj, searchdir_locked);
 		}
 
-		error = lookup_once(state, searchdir, &searchdir, &foundobj,
-		    &searchdir_locked);
 		if (error) {
 			if (searchdir != NULL) {
 				if (searchdir_locked) {

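The fastforward loop added above amounts to hand-over-hand locking through
the namecache's per-directory rwlocks instead of vnode locks: each hop takes
the next directory's lock before dropping the previous one, so the node
under inspection can never be freed mid-walk.  A stand-alone sketch of the
pattern follows; dir_t, d_lock, dir_lookup() and dir_ref() are hypothetical
stand-ins, while krwlock_t and rw_enter()/rw_exit() are the real kernel
primitives:

	/* Hand-over-hand: at most two locks held, briefly, per hop. */
	dir_t *
	walk(dir_t *d, const char **names, u_int n)
	{
		krwlock_t *held = NULL;
		u_int i;

		for (i = 0; d != NULL && i < n; i++) {
			rw_enter(&d->d_lock, RW_READER);
			if (held != NULL)
				rw_exit(held);	/* drop the previous hop */
			held = &d->d_lock;
			/* The lock we hold pins whatever we find. */
			d = dir_lookup(d, names[i]);
		}
		if (d != NULL)
			dir_ref(d);	/* reference the result before unlocking */
		if (held != NULL)
			rw_exit(held);
		return d;
	}

Because the chain only ever locks downward through the tree, the lock order
stays consistent, which is also why lookup_fastforward() refuses ISDOTDOT
components outright.
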
Index: src/sys/sys/namei.src
diff -u src/sys/sys/namei.src:1.47.2.3 src/sys/sys/namei.src:1.47.2.4
--- src/sys/sys/namei.src:1.47.2.3	Tue Jan 14 11:07:40 2020
+++ src/sys/sys/namei.src	Fri Jan 17 22:26:26 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: namei.src,v 1.47.2.3 2020/01/14 11:07:40 ad Exp $	*/
+/*	$NetBSD: namei.src,v 1.47.2.4 2020/01/17 22:26:26 ad Exp $	*/
 
 /*
  * Copyright (c) 1985, 1989, 1991, 1993
@@ -39,6 +39,7 @@
 
 #ifdef _KERNEL
 #include <sys/kauth.h>
+#include <sys/rwlock.h>
 
 /*
  * Abstraction for a single pathname.
@@ -201,7 +202,8 @@ NAMEIFL	PARAMASK	0x02ee300	/* mask of pa
  *
  * This structure describes the elements in the cache of recent names looked
  * up by namei.  It's carefully sized to take up 128 bytes on _LP64, to make
- * good use of space and the CPU caches.
+ * good use of space and the CPU caches; nc_name is aligned on an 8-byte
+ * boundary to make string comparisons cheaper.
  *
  * Field markings and their corresponding locks:
  *
@@ -211,16 +213,19 @@ NAMEIFL	PARAMASK	0x02ee300	/* mask of pa
  * l  protected by cache_lru_lock
  * u  accesses are unlocked, no serialization applied
  */
+struct nchnode;
 struct namecache {
-	struct rb_node nc_node;		/* d  red-black tree node */
+	struct	rb_node nc_tree;	/* d  red-black tree, must be first */
+	TAILQ_ENTRY(namecache) nc_list;	/* v  vp's list of cache entries */
 	TAILQ_ENTRY(namecache) nc_lru;	/* l  pseudo-lru chain */
-	TAILQ_ENTRY(namecache) nc_vlist;/* v  vp's list of cache entries */
-	struct	vnode *nc_dvp;		/* -  vnode of parent of name */
+	struct	nchnode *nc_dnn;	/* -  nchnode of parent of name */
+	struct	nchnode *nc_nn;		/* -  nchnode the name refers to */
 	struct	vnode *nc_vp;		/* -  vnode the name refers to */
+	int64_t	nc_key;			/* -  hash key */
 	int	nc_lrulist;		/* l  which LRU list it's on */
-	u_short	nc_nlen;		/* -  length of name */
-	bool	nc_whiteout;		/* -  true if a whiteout */
-	char	nc_name[49];		/* -  segment name */
+	short	nc_nlen;		/* -  length of the name */
+	char	nc_whiteout;		/* -  true if a whiteout */
+	char	nc_name[33];		/* -  segment name */
 };
 #endif
 
@@ -284,9 +289,13 @@ bool	cache_lookup(struct vnode *, const 
 			int *, struct vnode **);
 bool	cache_lookup_raw(struct vnode *, const char *, size_t, uint32_t,
 			int *, struct vnode **);
+bool	cache_lookup_linked(struct vnode *, const char *, size_t,
+			    struct vnode **, krwlock_t **, kauth_cred_t);
 int	cache_revlookup(struct vnode *, struct vnode **, char **, char *);
 void	cache_enter(struct vnode *, struct vnode *,
 			const char *, size_t, uint32_t);
+void	cache_set_id(struct vnode *, mode_t, uid_t, gid_t);
+void	cache_update_id(struct vnode *, mode_t, uid_t, gid_t);
 void	cache_vnode_init(struct vnode * );
 void	cache_vnode_fini(struct vnode * );
 void	cache_cpu_init(struct cpu_info *);
@@ -318,6 +327,10 @@ void	namecache_print(struct vnode *, voi
 	type	ncs_2passes;	/* number of times we attempt it (U) */	\
 	type	ncs_revhits;	/* reverse-cache hits */		\
 	type	ncs_revmiss;	/* reverse-cache misses */		\
+	type	ncs_collisions;	/* hash value collisions */		\
+	type	ncs_active;	/* active cache entries */		\
+	type	ncs_inactive;	/* inactive cache entries */		\
+	type	ncs_denied;	/* access denied */			\
 }
 
 /*

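A note on the nc_key field added above: per the log message it packs the
32-bit name hash together with the 16-bit name length, so an rbtree node
comparison is a single 64-bit integer compare rather than a string compare,
and the new ncs_collisions counter tracks distinct names that still collide
on the packed value.  Below is a sketch of one plausible composition; the
exact layout and hash function chosen by vfs_cache.c may differ:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/hash.h>

	/* Hypothetical helper: pack (hash, length) into an rbtree key. */
	static inline int64_t
	nc_makekey(const char *name, size_t nlen)
	{
		uint32_t hash;

		KASSERT(nlen <= USHRT_MAX);
		hash = hash32_buf(name, nlen, HASH32_BUF_INIT);
		return (int64_t)(((uint64_t)hash << 32) | nlen);
	}
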
Index: src/sys/sys/vnode_impl.h
diff -u src/sys/sys/vnode_impl.h:1.19.2.3 src/sys/sys/vnode_impl.h:1.19.2.4
--- src/sys/sys/vnode_impl.h:1.19.2.3	Tue Jan 14 11:07:40 2020
+++ src/sys/sys/vnode_impl.h	Fri Jan 17 22:26:26 2020
@@ -1,7 +1,7 @@
-/*	$NetBSD: vnode_impl.h,v 1.19.2.3 2020/01/14 11:07:40 ad Exp $	*/
+/*	$NetBSD: vnode_impl.h,v 1.19.2.4 2020/01/17 22:26:26 ad Exp $	*/
 
 /*-
- * Copyright (c) 2016, 2019 The NetBSD Foundation, Inc.
+ * Copyright (c) 2016, 2019, 2020 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -32,6 +32,7 @@
 #include <sys/vnode.h>
 
 struct namecache;
+struct nchnode;
 
 enum vnode_state {
 	VS_ACTIVE,	/* Assert only, fs node attached and usecount > 0. */
@@ -60,7 +61,7 @@ struct vcache_key {
  *	d	vdrain_lock
  *	i	v_interlock
  *	m	mnt_vnodelock
- *	n	managed by vfs_cache: look there
+ *	n	managed by vfs_cache: see above
  *	s	syncer_data_lock
  */
 struct vnode_impl {
@@ -68,10 +69,7 @@ struct vnode_impl {
 	enum vnode_state vi_state;		/* i: current state */
 	struct vnodelst *vi_lrulisthd;		/* d: current lru list head */
 	TAILQ_ENTRY(vnode_impl) vi_lrulist;	/* d: lru list */
-	TAILQ_HEAD(, namecache) vi_nclist;	/* n: namecaches (parent) */
-	rb_tree_t vi_nctree;			/* n: namecache tree */
-	krwlock_t *vi_ncdlock;			/* n: namecache lock */
-	krwlock_t *vi_ncvlock;			/* n: namecache lock */
+	struct nchnode *vi_ncache;		/* n: namecache state */
 	int vi_synclist_slot;			/* s: synclist slot index */
 	int vi_lrulisttm;			/* i: time of lru enqueue */
 	TAILQ_ENTRY(vnode_impl) vi_synclist;	/* s: vnodes with dirty bufs */

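The four namecache fields deleted from struct vnode_impl above collapse
into the single vi_ncache pointer.  The real struct nchnode is private to
vfs_cache.c; purely as an illustration, it plausibly gathers those fields
plus the identity published by cache_set_id(), along these lines:

	/* Hypothetical layout; the real definition is in vfs_cache.c. */
	struct nchnode {
		rb_tree_t	nn_tree;	/* namecache tree of children */
		TAILQ_HEAD(, namecache) nn_list;/* entries naming this node */
		krwlock_t	nn_lock;	/* covers nn_tree */
		krwlock_t	nn_listlock;	/* covers nn_list */
		mode_t		nn_mode;	/* cached directory mode */
		uid_t		nn_uid;		/* cached owner */
		gid_t		nn_gid;		/* cached group */
	};

Embedding the rwlocks in the structure, rather than pointing to separately
allocated locks as before, is what yields the memory saving and nicer cache
profile mentioned in the log message.
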
Index: src/sys/ufs/ffs/ffs_vfsops.c
diff -u src/sys/ufs/ffs/ffs_vfsops.c:1.362.4.1 src/sys/ufs/ffs/ffs_vfsops.c:1.362.4.2
--- src/sys/ufs/ffs/ffs_vfsops.c:1.362.4.1	Fri Jan 17 21:47:37 2020
+++ src/sys/ufs/ffs/ffs_vfsops.c	Fri Jan 17 22:26:26 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: ffs_vfsops.c,v 1.362.4.1 2020/01/17 21:47:37 ad Exp $	*/
+/*	$NetBSD: ffs_vfsops.c,v 1.362.4.2 2020/01/17 22:26:26 ad Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.362.4.1 2020/01/17 21:47:37 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.362.4.2 2020/01/17 22:26:26 ad Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
@@ -2082,6 +2082,7 @@ ffs_loadvnode(struct mount *mp, struct v
 		ip->i_gid = ip->i_ffs1_ogid;			/* XXX */
 	}							/* XXX */
 	uvm_vnp_setsize(vp, ip->i_size);
+	cache_set_id(vp, ip->i_mode, ip->i_uid, ip->i_gid);
 	*new_key = &ip->i_number;
 	return 0;
 }
@@ -2203,6 +2204,7 @@ ffs_newvnode(struct mount *mp, struct vn
 	}
 
 	uvm_vnp_setsize(vp, ip->i_size);
+	cache_set_id(vp, ip->i_mode, ip->i_uid, ip->i_gid);
 	*new_key = &ip->i_number;
 	return 0;
 }

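The pattern in ffs above, mirrored by tmpfs and by the ufs hunks below, is:
call cache_set_id() once when the vnode is set up, then cache_update_id()
whenever mode, uid or gid change, so the namecache's copy of the directory
identity never goes stale.  A hedged sketch of what the update half might
do internally, reusing the hypothetical nchnode fields sketched earlier
(the real implementation lives in vfs_cache.c):

	void
	cache_update_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid)
	{
		struct nchnode *nn = VNODE_TO_VIMPL(vp)->vi_ncache;

		/* Publish the new identity under the exclusive lock. */
		rw_enter(&nn->nn_lock, RW_WRITER);
		nn->nn_mode = mode;
		nn->nn_uid = uid;
		nn->nn_gid = gid;
		rw_exit(&nn->nn_lock);
	}
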
Index: src/sys/ufs/ufs/ufs_vnops.c
diff -u src/sys/ufs/ufs/ufs_vnops.c:1.248 src/sys/ufs/ufs/ufs_vnops.c:1.248.2.1
--- src/sys/ufs/ufs/ufs_vnops.c:1.248	Wed Sep 18 17:59:15 2019
+++ src/sys/ufs/ufs/ufs_vnops.c	Fri Jan 17 22:26:26 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_vnops.c,v 1.248 2019/09/18 17:59:15 christos Exp $	*/
+/*	$NetBSD: ufs_vnops.c,v 1.248.2.1 2020/01/17 22:26:26 ad Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -66,7 +66,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.248 2019/09/18 17:59:15 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.248.2.1 2020/01/17 22:26:26 ad Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
@@ -621,6 +621,7 @@ ufs_setattr(void *v)
 	}
 	VN_KNOTE(vp, NOTE_ATTRIB);
 out:
+	cache_update_id(vp, ip->i_mode, ip->i_uid, ip->i_gid);
 	return (error);
 }
 
@@ -648,6 +649,7 @@ ufs_chmod(struct vnode *vp, int mode, ka
 	ip->i_flag |= IN_CHANGE;
 	DIP_ASSIGN(ip, mode, ip->i_mode);
 	UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+	cache_update_id(vp, ip->i_mode, ip->i_uid, ip->i_gid);
 	return (0);
 }
 
@@ -708,6 +710,7 @@ ufs_chown(struct vnode *vp, uid_t uid, g
 #endif /* QUOTA || QUOTA2 */
 	ip->i_flag |= IN_CHANGE;
 	UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+	cache_update_id(vp, ip->i_mode, ip->i_uid, ip->i_gid);
 	return (0);
 }
 

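With the identity cached, cache_lookup_linked() can test search permission
on a directory without locking its vnode or calling into the file system at
all.  Here is a simplified sketch of such a check; it is illustrative only,
and in particular ignores the superuser override and any kauth(9) listeners,
which a real implementation would have to consult before denying:

	#include <sys/types.h>
	#include <sys/stat.h>
	#include <sys/kauth.h>

	static bool
	nchnode_can_search(const struct nchnode *nn, kauth_cred_t cred)
	{
		mode_t mask;

		if (kauth_cred_geteuid(cred) == nn->nn_uid)
			mask = S_IXUSR;
		else if (kauth_cred_groupmember(cred, nn->nn_gid) == 0)
			mask = S_IXGRP;
		else
			mask = S_IXOTH;
		return (nn->nn_mode & mask) != 0;
	}

A lookup that fails this check simply falls back to lookup_once() and the
locked VOP_LOOKUP() path, presumably bumping the new ncs_denied counter.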