After having fast recycle kick my butt for two days while I chased
vnode reuse bugs, here is the new diff. This does appear stable on
my machines.
Note that it temporarily disables some functionality in procmap,
although on closer inspection that functionality may already have been
broken beforehand. miod has promised to revisit this with me later.
Please test this for me, and let me know which architectures you ran it on.
Thanks
-Bob
Index: sys/kern/vfs_cache.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_cache.c,v
retrieving revision 1.30
diff -u -r1.30 vfs_cache.c
--- sys/kern/vfs_cache.c 9 Jul 2009 22:29:56 -0000 1.30
+++ sys/kern/vfs_cache.c 11 Aug 2009 02:34:31 -0000
@@ -68,26 +68,68 @@
/*
* Structures associated with name caching.
*/
-LIST_HEAD(nchashhead, namecache) *nchashtbl;
u_long nchash; /* size of hash table - 1 */
-long numcache; /* number of cache entries allocated */
-TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */
-struct nchstats nchstats; /* cache effectiveness statistics */
-
-LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl;
-u_long ncvhash; /* size of hash table - 1 */
-
-#define NCHASH(dvp, cnp) \
- hash32_buf(&(dvp)->v_id, sizeof((dvp)->v_id), (cnp)->cn_hash) & nchash
+long numcache; /* total number of cache entries allocated */
+long numneg; /* number of negative cache entries */
-#define NCVHASH(vp) (vp)->v_id & ncvhash
+TAILQ_HEAD(, namecache) nclruhead; /* Regular Entry LRU chain */
+TAILQ_HEAD(, namecache) nclruneghead; /* Negative Entry LRU chain */
+struct nchstats nchstats; /* cache effectiveness statistics */
int doingcache = 1; /* 1 => enable the cache */
struct pool nch_pool;
+void cache_zap(struct namecache *);
u_long nextvnodeid;
+static int
+namecache_compare(struct namecache *n1, struct namecache *n2)
+{
+ if (n1->nc_nlen == n2->nc_nlen)
+ return (memcmp(n1->nc_name, n2->nc_name, n1->nc_nlen));
+ else
+ return (n1->nc_nlen - n2->nc_nlen);
+}
+
+RB_GENERATE(namecache_rb_cache, namecache, n_rbcache, namecache_compare);
+
+/*
+ * blow away a namecache entry
+ */
+void
+cache_zap(struct namecache *ncp)
+{
+ struct vnode *dvp = NULL;
+
+ if (ncp->nc_vp != NULL) {
+ TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
+ numcache--;
+ } else {
+ TAILQ_REMOVE(&nclruneghead, ncp, nc_neg);
+ numneg--;
+ }
+ if (ncp->nc_dvp) {
+ RB_REMOVE(namecache_rb_cache, &ncp->nc_dvp->v_nc_tree, ncp);
+ if (RB_EMPTY(&ncp->nc_dvp->v_nc_tree))
+ dvp = ncp->nc_dvp;
+ }
+ if (ncp->nc_vp && (ncp->nc_vpid == ncp->nc_vp->v_id)) {
+ if (ncp->nc_vp != ncp->nc_dvp &&
+ ncp->nc_vp->v_type == VDIR &&
+ (ncp->nc_nlen > 2 ||
+ (ncp->nc_nlen > 1 &&
+ ncp->nc_name[1] != '.') ||
+ (ncp->nc_nlen > 0 &&
+ ncp->nc_name[0] != '.'))) {
+ TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_me);
+ }
+ }
+ pool_put(&nch_pool, ncp);
+ if (dvp)
+ vdrop(dvp);
+}
+
/*
* Look for a name in the cache. We don't do this if the segment name is
* long, simply so the cache can avoid holding long names (which would
@@ -104,10 +146,11 @@
* is returned.
*/
int
-cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
+cache_lookup(struct vnode *dvp, struct vnode **vpp,
+ struct componentname *cnp)
{
struct namecache *ncp;
- struct nchashhead *ncpp;
+ struct namecache n;
struct vnode *vp;
struct proc *p = curproc;
u_long vpid;
@@ -125,14 +168,11 @@
return (-1);
}
- ncpp = &nchashtbl[NCHASH(dvp, cnp)];
- LIST_FOREACH(ncp, ncpp, nc_hash) {
- if (ncp->nc_dvp == dvp &&
- ncp->nc_dvpid == dvp->v_id &&
- ncp->nc_nlen == cnp->cn_namelen &&
- !memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen))
- break;
- }
+ /* lookup in directory vnode's redblack tree */
+ n.nc_nlen = cnp->cn_namelen;
+ memcpy(n.nc_name, cnp->cn_nameptr, n.nc_nlen);
+ ncp = RB_FIND(namecache_rb_cache, &dvp->v_nc_tree, &n);
+
if (ncp == NULL) {
nchstats.ncs_miss++;
return (-1);
@@ -145,12 +185,12 @@
(cnp->cn_flags & ISLASTCN) == 0) {
nchstats.ncs_neghits++;
/*
- * Move this slot to end of LRU chain,
- * if not already there.
+ * Move this slot to end of the negative LRU chain,
*/
- if (TAILQ_NEXT(ncp, nc_lru) != NULL) {
- TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
- TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
+ if (TAILQ_NEXT(ncp, nc_neg) != NULL) {
+ TAILQ_REMOVE(&nclruneghead, ncp, nc_neg);
+ TAILQ_INSERT_TAIL(&nclruneghead, ncp,
+ nc_neg);
}
return (ENOENT);
} else {
@@ -220,7 +260,7 @@
nchstats.ncs_goodhits++;
/*
- * Move this slot to end of LRU chain, if not already there.
+ * Move this slot to end of the regular LRU chain.
*/
if (TAILQ_NEXT(ncp, nc_lru) != NULL) {
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
@@ -235,16 +275,7 @@
* the cache entry is invalid, or otherwise don't
* want cache entry to exist.
*/
- TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
- LIST_REMOVE(ncp, nc_hash);
- ncp->nc_hash.le_prev = NULL;
-
- if (ncp->nc_vhash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_vhash);
- ncp->nc_vhash.le_prev = NULL;
- }
-
- TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru);
+ cache_zap(ncp);
return (-1);
}
@@ -267,62 +298,51 @@
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
{
struct namecache *ncp;
- struct vnode *dvp;
- struct ncvhashhead *nvcpp;
+ struct vnode *dvp = NULL;
char *bp;
if (!doingcache)
goto out;
-
- nvcpp = &ncvhashtbl[NCVHASH(vp)];
-
- LIST_FOREACH(ncp, nvcpp, nc_vhash) {
- if (ncp->nc_vp == vp &&
- ncp->nc_vpid == vp->v_id &&
- (dvp = ncp->nc_dvp) != NULL &&
- /* avoid pesky '.' entries.. */
- dvp != vp && ncp->nc_dvpid == dvp->v_id) {
-
+ TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_me) {
+ dvp = ncp->nc_dvp;
+ if (dvp && dvp != vp && ncp->nc_dvpid == dvp->v_id)
+ goto found;
+ }
+ goto miss;
+found:
#ifdef DIAGNOSTIC
- if (ncp->nc_nlen == 1 &&
- ncp->nc_name[0] == '.')
- panic("cache_revlookup: found entry for .");
-
- if (ncp->nc_nlen == 2 &&
- ncp->nc_name[0] == '.' &&
- ncp->nc_name[1] == '.')
- panic("cache_revlookup: found entry for ..");
+ if (ncp->nc_nlen == 1 &&
+ ncp->nc_name[0] == '.')
+ panic("cache_revlookup: found entry for .");
+ if (ncp->nc_nlen == 2 &&
+ ncp->nc_name[0] == '.' &&
+ ncp->nc_name[1] == '.')
+ panic("cache_revlookup: found entry for ..");
#endif
- nchstats.ncs_revhits++;
-
- if (bufp != NULL) {
- bp = *bpp;
- bp -= ncp->nc_nlen;
- if (bp <= bufp) {
- *dvpp = NULL;
- return (ERANGE);
- }
- memcpy(bp, ncp->nc_name, ncp->nc_nlen);
- *bpp = bp;
- }
-
- *dvpp = dvp;
+ nchstats.ncs_revhits++;
- /*
- * XXX: Should we vget() here to have more
- * consistent semantics with cache_lookup()?
- *
- * For MP safety it might be necessary to do
- * this here, while also protecting hash
- * tables themselves to provide some sort of
- * sane inter locking.
- */
- return (0);
+ if (bufp != NULL) {
+ bp = *bpp;
+ bp -= ncp->nc_nlen;
+ if (bp <= bufp) {
+ *dvpp = NULL;
+ return (ERANGE);
}
+ memcpy(bp, ncp->nc_name, ncp->nc_nlen);
+ *bpp = bp;
}
- nchstats.ncs_revmiss++;
- out:
+ *dvpp = dvp;
+
+ /*
+ * XXX: Should we vget() here to have more
+ * consistent semantics with cache_lookup()?
+ */
+ return (0);
+
+miss:
+ nchstats.ncs_revmiss++;
+out:
*dvpp = NULL;
return (-1);
}
@@ -333,71 +353,83 @@
void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
{
- struct namecache *ncp;
- struct nchashhead *ncpp;
- struct ncvhashhead *nvcpp;
+ struct namecache *ncp, *lncp;
if (!doingcache || cnp->cn_namelen > NCHNAMLEN)
return;
/*
- * Free the cache slot at head of lru chain.
+ * allocate, or recycle (free and allocate) an ncp.
*/
- if (numcache < desiredvnodes) {
- ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO);
- numcache++;
- } else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) {
- TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
- if (ncp->nc_hash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_hash);
- ncp->nc_hash.le_prev = NULL;
- }
- if (ncp->nc_vhash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_vhash);
- ncp->nc_vhash.le_prev = NULL;
- }
- } else
- return;
+ if (numcache >= desiredvnodes) {
+ if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL)
+ cache_zap(ncp);
+ else if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL)
+ cache_zap(ncp);
+ else
+ panic("wtf? leak?");
+ }
+ ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO);
+
/* grab the vnode we just found */
ncp->nc_vp = vp;
if (vp)
ncp->nc_vpid = vp->v_id;
+
/* fill in cache info */
ncp->nc_dvp = dvp;
ncp->nc_dvpid = dvp->v_id;
ncp->nc_nlen = cnp->cn_namelen;
bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen);
- TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
- ncpp = &nchashtbl[NCHASH(dvp, cnp)];
- LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
-
- /*
- * Create reverse-cache entries (used in getcwd) for
- * directories.
- */
-
- ncp->nc_vhash.le_prev = NULL;
- ncp->nc_vhash.le_next = NULL;
-
- if (vp && vp != dvp && vp->v_type == VDIR &&
- (ncp->nc_nlen > 2 ||
- (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
- (ncp->nc_nlen > 0 && ncp->nc_name[0] != '.'))) {
- nvcpp = &ncvhashtbl[NCVHASH(vp)];
- LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
+ if (RB_EMPTY(&dvp->v_nc_tree)) {
+ vhold(dvp);
+ }
+ if ((lncp = RB_INSERT(namecache_rb_cache, &dvp->v_nc_tree, ncp))
+ != NULL) {
+ /* someone has raced us and added a different entry
+ * for the same vnode (different ncp) - we don't need
+ * this entry, so free it and we are done.
+ */
+ pool_put(&nch_pool, ncp);
+ /* we know now dvp->v_nc_tree is not empty, no need
+ * to vdrop here
+ */
+ goto done;
}
+ if (vp) {
+ TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
+ numcache++;
+ /* don't put . or .. in the reverse map */
+ if (vp != dvp && vp->v_type == VDIR &&
+ (ncp->nc_nlen > 2 ||
+ (ncp->nc_nlen > 1 &&
+ ncp->nc_name[1] != '.') ||
+ (ncp->nc_nlen > 0 &&
+ ncp->nc_name[0] != '.')))
+ TAILQ_INSERT_TAIL(&vp->v_cache_dst, ncp,
+ nc_me);
+ } else {
+ TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg);
+ numneg++;
+ }
+ if (numneg > desiredvnodes) {
+ if ((ncp = TAILQ_FIRST(&nclruneghead))
+ != NULL)
+ cache_zap(ncp);
+ }
+done:
+ return;
}
+
/*
* Name cache initialization, from vfs_init() when we are booting
*/
void
nchinit()
{
-
TAILQ_INIT(&nclruhead);
- nchashtbl = hashinit(desiredvnodes, M_CACHE, M_WAITOK, &nchash);
- ncvhashtbl = hashinit(desiredvnodes/8, M_CACHE, M_WAITOK, &ncvhash);
+ TAILQ_INIT(&nclruneghead);
pool_init(&nch_pool, sizeof(struct namecache), 0, 0, 0, "nchpl",
&pool_allocator_nointr);
}
@@ -410,18 +442,16 @@
cache_purge(struct vnode *vp)
{
struct namecache *ncp;
- struct nchashhead *ncpp;
+ while ((ncp = TAILQ_FIRST(&vp->v_cache_dst)))
+ cache_zap(ncp);
+ while ((ncp = RB_ROOT(&vp->v_nc_tree)))
+ cache_zap(ncp);
+
+ /* XXX this blows goats */
vp->v_id = ++nextvnodeid;
- if (nextvnodeid != 0)
- return;
- for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
- LIST_FOREACH(ncp, ncpp, nc_hash) {
- ncp->nc_vpid = 0;
- ncp->nc_dvpid = 0;
- }
- }
- vp->v_id = ++nextvnodeid;
+ if (vp->v_id == 0)
+ vp->v_id = ++nextvnodeid;
}
/*
@@ -437,6 +467,7 @@
{
struct namecache *ncp, *nxtcp;
+ /* whack the regular entries */
for (ncp = TAILQ_FIRST(&nclruhead); ncp != TAILQ_END(&nclruhead);
ncp = nxtcp) {
if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) {
@@ -444,19 +475,20 @@
continue;
}
/* free the resources we had */
- ncp->nc_vp = NULL;
- ncp->nc_dvp = NULL;
- TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
- if (ncp->nc_hash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_hash);
- ncp->nc_hash.le_prev = NULL;
- }
- if (ncp->nc_vhash.le_prev != NULL) {
- LIST_REMOVE(ncp, nc_vhash);
- ncp->nc_vhash.le_prev = NULL;
- }
+ cache_zap(ncp);
/* cause rescan of list, it may have altered */
nxtcp = TAILQ_FIRST(&nclruhead);
- TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru);
+ }
+ /* whack the negative entries */
+ for (ncp = TAILQ_FIRST(&nclruneghead); ncp != TAILQ_END(&nclruneghead);
+ ncp = nxtcp) {
+ if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) {
+ nxtcp = TAILQ_NEXT(ncp, nc_neg);
+ continue;
+ }
+ /* free the resources we had */
+ cache_zap(ncp);
+ /* cause rescan of list, it may have altered */
+ nxtcp = TAILQ_FIRST(&nclruneghead);
}
}
Index: sys/kern/vfs_subr.c
===================================================================
RCS file: /cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.180
diff -u -r1.180 vfs_subr.c
--- sys/kern/vfs_subr.c 2 Aug 2009 16:28:40 -0000 1.180
+++ sys/kern/vfs_subr.c 10 Aug 2009 21:33:48 -0000
@@ -360,6 +360,8 @@
splx(s);
vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
RB_INIT(&vp->v_bufs_tree);
+ RB_INIT(&vp->v_nc_tree);
+ TAILQ_INIT(&vp->v_cache_dst);
numvnodes++;
} else {
for (vp = TAILQ_FIRST(listhd); vp != NULLVP;
Index: sys/sys/namei.h
===================================================================
RCS file: /cvs/src/sys/sys/namei.h,v
retrieving revision 1.22
diff -u -r1.22 namei.h
--- sys/sys/namei.h 29 Aug 2008 08:57:28 -0000 1.22
+++ sys/sys/namei.h 9 Aug 2009 22:29:29 -0000
@@ -36,6 +36,12 @@
#define _SYS_NAMEI_H_
#include <sys/queue.h>
+#include <sys/tree.h>
+#include <sys/uio.h>
+
+struct namecache;
+struct namecache_rb_cache;
+RB_PROTOTYPE(namecache_rb_cache, namecache, n_rbcache, namecache_compare);
/*
* Encapsulation of namei parameters.
@@ -156,9 +162,10 @@
#define NCHNAMLEN 31 /* maximum name segment length we bother with */
struct namecache {
- LIST_ENTRY(namecache) nc_hash; /* hash chain */
- LIST_ENTRY(namecache) nc_vhash; /* (reverse) directory hash chain */
- TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */
+ TAILQ_ENTRY(namecache) nc_lru; /* Regular Entry LRU chain */
+ TAILQ_ENTRY(namecache) nc_neg; /* Negative Entry LRU chain */
+ RB_ENTRY(namecache) n_rbcache; /* Namecache rb tree from vnode */
+ TAILQ_ENTRY(namecache) nc_me; /* ncp's referring to me */
struct vnode *nc_dvp; /* vnode of parent of name */
u_long nc_dvpid; /* capability number of nc_dvp */
struct vnode *nc_vp; /* vnode the name refers to */
Index: sys/sys/vnode.h
===================================================================
RCS file: /cvs/src/sys/sys/vnode.h,v
retrieving revision 1.102
diff -u -r1.102 vnode.h
--- sys/sys/vnode.h 2 Aug 2009 16:28:40 -0000 1.102
+++ sys/sys/vnode.h 9 Aug 2009 18:30:12 -0000
@@ -36,6 +36,7 @@
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/lock.h>
+#include <sys/namei.h>
#include <sys/selinfo.h>
#include <sys/tree.h>
@@ -82,6 +83,7 @@
LIST_HEAD(buflists, buf);
RB_HEAD(buf_rb_bufs, buf);
+RB_HEAD(namecache_rb_cache, namecache);
struct vnode {
struct uvm_vnode v_uvm; /* uvm data */
@@ -110,6 +112,10 @@
struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */
} v_un;
+ /* VFS namecache */
+ struct namecache_rb_cache v_nc_tree;
+ TAILQ_HEAD(, namecache) v_cache_dst; /* cache entries to us */
+
enum vtagtype v_tag; /* type of underlying data */
void *v_data; /* private data for fs */
struct selinfo v_selectinfo; /* identity of poller(s) */
@@ -247,8 +253,9 @@
extern int maxvnodes; /* XXX number of vnodes to allocate */
extern time_t syncdelay; /* time to delay syncing vnodes */
extern int rushjob; /* # of slots syncer should run ASAP */
+extern void vhold(struct vnode *);
+extern void vdrop(struct vnode *);
#endif /* _KERNEL */
-
/*
* Mods for exensibility.
Index: usr.sbin/procmap/procmap.c
===================================================================
RCS file: /cvs/src/usr.sbin/procmap/procmap.c,v
retrieving revision 1.32
diff -u -r1.32 procmap.c
--- usr.sbin/procmap/procmap.c 4 Jun 2009 22:38:53 -0000 1.32
+++ usr.sbin/procmap/procmap.c 11 Aug 2009 02:46:17 -0000
@@ -178,9 +178,11 @@
struct sum *);
char *findname(kvm_t *, struct kbit *, struct kbit *, struct kbit *,
struct kbit *, struct kbit *);
+#if 0
int search_cache(kvm_t *, struct kbit *, char **, char *, size_t);
void load_name_cache(kvm_t *);
void cache_enter(struct namecache *);
+#endif
static void __dead usage(void);
static pid_t strtopid(const char *);
void print_sum(struct sum *, struct sum *);
@@ -783,6 +785,7 @@
if (UVM_ET_ISOBJ(vme)) {
if (A(vfs)) {
l = strlen(D(vfs, mount)->mnt_stat.f_mntonname);
+#if 0
switch (search_cache(kd, vp, &name, buf, sizeof(buf))) {
case 0: /* found something */
if (name - (1 + 11 + l) < buf)
@@ -791,11 +794,13 @@
*name = '/';
/*FALLTHROUGH*/
case 2: /* found nothing */
+#endif
name -= 11;
memcpy(name, " -unknown- ", (size_t)11);
name -= l;
memcpy(name,
D(vfs, mount)->mnt_stat.f_mntonname, l);
+#if 0
break;
case 1: /* all is well */
if (name - (1 + l) < buf)
@@ -809,6 +814,7 @@
}
break;
}
+#endif
} else if (UVM_OBJ_IS_DEVICE(D(uvm_obj, uvm_object))) {
struct kbit kdev;
dev_t dev;
@@ -850,6 +856,7 @@
return (name);
}
+#if 0
int
search_cache(kvm_t *kd, struct kbit *vp, char **name, char *buf, size_t blen)
{
@@ -958,6 +965,7 @@
LIST_INSERT_HEAD(&lcache, ce, ce_next);
}
+#endif
static void __dead
usage(void)