Module Name:    src
Committed By:   riastradh
Date:           Sat Mar 18 21:03:28 UTC 2017

Modified Files:
        src/sys/kern: vfs_cache.c
        src/sys/sys: namei.src

Log Message:
Rework namecache locking commentary.

- Annotate struct namecache declaration in namei.src/namei.h.
- Bulleted lists, not walls of text.
- Identify lock order too.
- Note subtle exceptions.
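
To make the new commentary concrete, here is a minimal userspace model
of the documented lock order C -> N -> L -> V.  It is a sketch only:
the kernel uses kmutex_t with mutex_enter()/mutex_exit(), not pthreads,
and ordered_path() is a name made up for the illustration.

	#include <pthread.h>

	static pthread_mutex_t cpu_lock = PTHREAD_MUTEX_INITIALIZER;       /* C */
	static pthread_mutex_t nc_lock = PTHREAD_MUTEX_INITIALIZER;        /* N */
	static pthread_mutex_t namecache_lock = PTHREAD_MUTEX_INITIALIZER; /* L */
	static pthread_mutex_t v_interlock = PTHREAD_MUTEX_INITIALIZER;    /* V */

	/*
	 * A path that needs several of these locks must take them in
	 * C -> N -> L -> V order and release them in reverse; two paths
	 * taking any pair in opposite orders could deadlock.
	 */
	static void
	ordered_path(void)
	{
		pthread_mutex_lock(&cpu_lock);
		pthread_mutex_lock(&nc_lock);
		pthread_mutex_lock(&namecache_lock);
		pthread_mutex_lock(&v_interlock);
		/* ... examine or update a cache entry ... */
		pthread_mutex_unlock(&v_interlock);
		pthread_mutex_unlock(&namecache_lock);
		pthread_mutex_unlock(&nc_lock);
		pthread_mutex_unlock(&cpu_lock);
	}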


To generate a diff of this commit:
cvs rdiff -u -r1.116 -r1.117 src/sys/kern/vfs_cache.c
cvs rdiff -u -r1.38 -r1.39 src/sys/sys/namei.src

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/vfs_cache.c
diff -u src/sys/kern/vfs_cache.c:1.116 src/sys/kern/vfs_cache.c:1.117
--- src/sys/kern/vfs_cache.c:1.116	Sat Mar 18 20:01:44 2017
+++ src/sys/kern/vfs_cache.c	Sat Mar 18 21:03:28 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_cache.c,v 1.116 2017/03/18 20:01:44 riastradh Exp $	*/
+/*	$NetBSD: vfs_cache.c,v 1.117 2017/03/18 21:03:28 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -58,7 +58,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.116 2017/03/18 20:01:44 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.117 2017/03/18 21:03:28 riastradh Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_ddb.h"
@@ -103,71 +103,95 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,
  */
 
 /*
- * The locking in this subsystem works as follows:
+ * Locking.
  *
- * When an entry is added to the cache, via cache_enter(),
- * namecache_lock is taken to exclude other writers.  The new
- * entry is added to the hash list in a way which permits
- * concurrent lookups and invalidations in the cache done on
- * other CPUs to continue in parallel.
- *
- * When a lookup is done in the cache, via cache_lookup() or
- * cache_lookup_raw(), the per-cpu lock below is taken.  This
- * protects calls to cache_lookup_entry() and cache_invalidate()
- * against cache_reclaim() but allows lookups to continue in
- * parallel with cache_enter().
- *
- * cache_revlookup() takes namecache_lock to exclude cache_enter()
- * and cache_reclaim() since the list it operates on is not
- * maintained to allow concurrent reads.
- *
- * When cache_reclaim() is called namecache_lock is held to hold
- * off calls to cache_enter()/cache_revlookup() and each of the
- * per-cpu locks is taken to hold off lookups.  Holding all these
- * locks essentially idles the subsystem, ensuring there are no
- * concurrent references to the cache entries being freed.
- *
- * 32 bit per-cpu statistic counters (struct nchstats_percpu) are
- * incremented when the operations they count are performed while
- * running on the corresponding CPU.  Frequently individual counters
- * are incremented while holding a lock (either a per-cpu lock or
- * namecache_lock) sufficient to preclude concurrent increments
- * being done to the same counter, so non-atomic increments are
- * done using the COUNT() macro.  Counters which are incremented
- * when one of these locks is not held use the COUNT_UNL() macro
- * instead.  COUNT_UNL() could be defined to do atomic increments
- * but currently just does what COUNT() does, on the theory that
- * it is unlikely the non-atomic increment will be interrupted
- * by something on the same CPU that increments the same counter,
- * but even if it does happen the consequences aren't serious.
+ * L namecache_lock		Global lock for namecache table and queues.
+ * C struct nchcpu::cpu_lock	Per-CPU lock to reduce read contention.
+ * N struct namecache::nc_lock	Per-entry lock.
+ * V struct vnode::v_interlock	Vnode interlock.
+ *
+ * Lock order: C -> N -> L -> V
+ *
+ * All use serialized by namecache_lock:
+ *
+ *	nclruhead / struct namecache::nc_lru
+ *	ncvhashtbl / struct namecache::nc_vhash
+ *	struct vnode_impl::vi_dnclist / struct namecache::nc_dvlist
+ *	struct vnode_impl::vi_nclist / struct namecache::nc_vlist
+ *	nchstats
+ *
+ * - Insertion serialized by namecache_lock,
+ * - read protected by per-CPU lock,
+ * - insert/read ordering guaranteed by memory barriers, and
+ * - deletion allowed only under namecache_lock and *all* per-CPU locks
+ *   in CPU_INFO_FOREACH order:
+ *
+ *	nchashtbl / struct namecache::nc_hash
+ *
+ *   The per-CPU locks exist only to reduce the probability of
+ *   contention between readers.  We do not bind to a CPU, so
+ *   contention is still possible.
+ *
+ * All use serialized by struct namecache::nc_lock:
+ *
+ *	struct namecache::nc_dvp
+ *	struct namecache::nc_vp
+ *	struct namecache::nc_gcqueue (*)
+ *	struct namecache::nc_hittime (**)
+ *
+ * (*) Once on the queue, only cache_thread uses nc_gcqueue, unlocked.
+ * (**) cache_prune reads nc_hittime unlocked, since approximate is OK.
+ *
+ * Unlocked because stable after initialization:
+ *
+ *	struct namecache::nc_dvp
+ *	struct namecache::nc_vp
+ *	struct namecache::nc_flags
+ *	struct namecache::nc_nlen
+ *	struct namecache::nc_name
+ *
+ * Unlocked because approximation is OK:
+ *
+ *	struct nchcpu::cpu_stats
+ *	struct nchcpu::cpu_stats_last
+ *
+ * Updates under namecache_lock or any per-CPU lock are marked with
+ * COUNT, while updates outside those locks are marked with COUNT_UNL.
+ *
+ * - The theory seems to have been that you could replace COUNT_UNL by
+ *   atomic operations -- except that doesn't help unless you also
+ *   replace COUNT by atomic operations, because mixing atomics and
+ *   nonatomics is a recipe for failure.
+ * - We use 32-bit per-CPU counters and 64-bit global counters under
+ *   the theory that 32-bit counters are less likely to be hosed by
+ *   nonatomic increment.
+ */
+
+/*
+ * The comment below is preserved for posterity in case it is
+ * important, but it is clear that everywhere the namecache_count_*()
+ * functions are called, other cache_*() functions that take the same
+ * locks are also called, so I can't imagine how this could be a
+ * problem:
  *
  * N.B.: Attempting to protect COUNT_UNL() increments by taking
  * a per-cpu lock in the namecache_count_*() functions causes
  * a deadlock.  Don't do that, use atomic increments instead if
  * the imperfections here bug you.
+ */
+
+/*
+ * struct nchstats_percpu:
  *
- * The 64 bit system-wide statistic counts (struct nchstats) are
- * maintained by sampling the per-cpu counters periodically, adding
- * in the deltas since the last samples and recording the current
- * samples to use to compute the next delta.  The sampling is done
- * as a side effect of cache_reclaim() which is run periodically,
- * for its own purposes, often enough to avoid overflow of the 32
- * bit counters.  While sampling in this fashion requires no locking
- * it is never-the-less done only after all locks have been taken by
- * cache_reclaim() to allow cache_stat_sysctl() to hold off
- * cache_reclaim() with minimal locking.
- *
- * cache_stat_sysctl() takes its CPU's per-cpu lock to hold off
- * cache_reclaim() so that it can copy the subsystem total stats
- * without them being concurrently modified.  If CACHE_STATS_CURRENT
- * is defined it also harvests the per-cpu increments into the total,
- * which again requires cache_reclaim() to be held off.
- *
- * The per-cpu data (a lock and the per-cpu stats structures)
- * are defined next.
+ *	Per-CPU counters.
  */
 struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
 
+/*
+ * struct nchcpu:
+ *
+ *	Per-CPU namecache state: lock and per-CPU counters.
+ */
 struct nchcpu {
 	kmutex_t		cpu_lock;
 	struct nchstats_percpu	cpu_stats;

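The counter scheme the new comment describes -- 32-bit per-CPU counters
bumped non-atomically, periodically folded into 64-bit totals by delta --
can be modeled roughly as follows.  The ncs_* field names are borrowed
from struct nchstats, but the macro bodies and the sample_cpu() helper
are illustrative guesses, not the kernel's definitions.

	#include <stdint.h>

	struct stats_percpu {
		uint32_t ncs_goodhits;	/* hits that we can really use */
		uint32_t ncs_neghits;	/* negative hits that we can use */
	};

	struct stats_global {
		uint64_t ncs_goodhits;
		uint64_t ncs_neghits;
	};

	/*
	 * Non-atomic increment; safe when the caller holds a lock that
	 * excludes other writers of the same per-CPU counter.
	 */
	#define	COUNT(cpup, f)		((cpup)->f++)
	/* The same increment without a lock; a lost tick is tolerated. */
	#define	COUNT_UNL(cpup, f)	((cpup)->f++)

	/*
	 * Fold one CPU's deltas into the 64-bit totals, remembering the
	 * current samples so the next pass adds only what changed since.
	 * Unsigned 32-bit subtraction makes wraparound harmless.
	 */
	static void
	sample_cpu(struct stats_global *g, const struct stats_percpu *cur,
	    struct stats_percpu *last)
	{
		g->ncs_goodhits += (uint32_t)(cur->ncs_goodhits -
		    last->ncs_goodhits);
		g->ncs_neghits += (uint32_t)(cur->ncs_neghits -
		    last->ncs_neghits);
		*last = *cur;
	}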
Index: src/sys/sys/namei.src
diff -u src/sys/sys/namei.src:1.38 src/sys/sys/namei.src:1.39
--- src/sys/sys/namei.src:1.38	Sat Mar 18 19:43:31 2017
+++ src/sys/sys/namei.src	Sat Mar 18 21:03:28 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: namei.src,v 1.38 2017/03/18 19:43:31 riastradh Exp $	*/
+/*	$NetBSD: namei.src,v 1.39 2017/03/18 21:03:28 riastradh Exp $	*/
 
 /*
  * Copyright (c) 1985, 1989, 1991, 1993
@@ -201,21 +201,28 @@ NAMEIFL	PARAMASK	0x02ee300	/* mask of pa
  * accessed and mostly read-only data is toward the front, with
  * infrequently accessed data and the lock towards the rear.  The
 * lock is then more likely to be in a separate cache line.
- */
-struct	namecache {
-	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
-	LIST_ENTRY(namecache) nc_vhash;	/* directory hash chain */
-	struct	vnode *nc_dvp;		/* vnode of parent of name */
-	struct	vnode *nc_vp;		/* vnode the name refers to */
-	int	nc_flags;		/* copy of componentname's ISWHITEOUT */
-	char	nc_nlen;		/* length of name */
-	char	nc_name[NCHNAMLEN];	/* segment name */
-	void	*nc_gcqueue;		/* queue for garbage collection */
-	TAILQ_ENTRY(namecache) nc_lru;	/* psuedo-lru chain */
-	LIST_ENTRY(namecache) nc_dvlist;
-	LIST_ENTRY(namecache) nc_vlist;
-	kmutex_t nc_lock;		/* lock on this entry */
-	int	nc_hittime;		/* last time scored a hit */
+ *
+ * Locking rules:
+ *
+ *      -       stable after initialization
+ *      L       namecache_lock
+ *      C       struct nchcpu::cpu_lock
+ *      N       struct namecache::nc_lock
+ */
+struct namecache {
+	LIST_ENTRY(namecache) nc_hash;	/* L hash chain */
+	LIST_ENTRY(namecache) nc_vhash;	/* L directory hash chain */
+	struct	vnode *nc_dvp;		/* - vnode of parent of name */
+	struct	vnode *nc_vp;		/* - vnode the name refers to */
+	int	nc_flags;		/* - copy of componentname ISWHITEOUT */
+	char	nc_nlen;		/* - length of name */
+	char	nc_name[NCHNAMLEN];	/* - segment name */
+	void	*nc_gcqueue;		/* N queue for garbage collection */
+	TAILQ_ENTRY(namecache) nc_lru;	/* L pseudo-lru chain */
+	LIST_ENTRY(namecache) nc_dvlist;/* L dvp's list of cache entries */
+	LIST_ENTRY(namecache) nc_vlist; /* L vp's list of cache entries */
+	kmutex_t nc_lock;		/*   lock on this entry */
+	int	nc_hittime;		/* N last time scored a hit */
 };
 
 #ifdef _KERNEL

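The hash-chain discipline described in the vfs_cache.c comment above --
readers under one per-CPU lock, deletion only under namecache_lock plus
*all* per-CPU locks taken in a fixed order -- looks roughly like this
in a userspace model.  NMODEL_CPUS, the pthread scaffolding, and both
function names stand in for CPU_INFO_FOREACH and the kernel primitives;
the model takes the per-CPU locks before the global lock to respect the
documented C -> L order, which may not match the exact sequence in
cache_reclaim().

	#include <pthread.h>

	#define	NMODEL_CPUS	4	/* stand-in for the real CPU count */

	static pthread_mutex_t namecache_lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t cpu_locks[NMODEL_CPUS] = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	};

	/* A reader holds only its own CPU's lock while walking a chain. */
	static void
	model_lookup(int cpu)
	{
		pthread_mutex_lock(&cpu_locks[cpu]);
		/* ... search the hash chain ... */
		pthread_mutex_unlock(&cpu_locks[cpu]);
	}

	/*
	 * A deleter holds every per-CPU lock, always taken in the same
	 * order (the model's analogue of CPU_INFO_FOREACH), plus the
	 * global lock, so no reader or inserter can be mid-operation
	 * on the entries being freed.
	 */
	static void
	model_reclaim(void)
	{
		int i;

		for (i = 0; i < NMODEL_CPUS; i++)
			pthread_mutex_lock(&cpu_locks[i]);
		pthread_mutex_lock(&namecache_lock);
		/* ... unlink and free entries; the subsystem is idle ... */
		pthread_mutex_unlock(&namecache_lock);
		for (i = NMODEL_CPUS - 1; i >= 0; i--)
			pthread_mutex_unlock(&cpu_locks[i]);
	}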