Ok, this *used* to work, but I have some test reports
now that it will likely panic. so please hold off.
Sorry about that
-Bob
* Bob Beck <[email protected]> [2009-08-09 03:41]:
> Ok let's try this again:
>
> >
> > I could use some assistance in testing this, particularly on some of
> > the more odd archetectures.
> >
> > This diff makes a bunch of changes to the vfs name cache:
> >
> > 1) it gets rid of the global hash table and reverse hash table for namecahe
> > entries. namecache entries
> > are now allocated and freed in two global LRU's - one for regular, and
> > one for negative entries.
> >
> > 2) Entries are no longer searched in the global lists, instead they are
> > kept track of in the relevant
> > vnodes. Since each vnode can be either the parent (directory) vnode of a
> > namecache entry, or the target of
> > the entry, we keep track of it in both ways in the vnode. We now use a rb
> > tree to search the namecache from
> > a directory vnode, and keep a list of which entries that we are the target
> > vnode.
> >
> > 3) (most importantly) namecache entries can now be allocated and freed.
> >
> > 4) cache_purge now actually does something rather than depending on vnode
> > horror to work. when recycling a vnode
> > cache_purge will now correctly clear the name cache entries associated with
> > the vnode. (before it basically
> > didn't do anything, and depended on us noticing the vnodes were being
> > recycled underneath us)
> >
> > This has been beat on a bunch, and appears not to slow anything down. I
> > do require some testing
> > and reports on other arch's particularly the likes of sparc, vax, and
> > strange things if possible
> >
> > Thanks,
> > -Bob
>
> I had this ready to go after c2k9, but with the instability in the
> tree I never committed it. I'd like some people to beat on it again please.
>
> This is a first step toward making vnodes something more generic and
> sane where hopefully we can allocate and deallocate them without major
> pain. I really need testing of this. - now updated to apply again.
>
> -Bob
>
> Index: kern/vfs_cache.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/vfs_cache.c,v
> retrieving revision 1.30
> diff -u -r1.30 vfs_cache.c
> --- kern/vfs_cache.c 9 Jul 2009 22:29:56 -0000 1.30
> +++ kern/vfs_cache.c 9 Aug 2009 18:30:12 -0000
> @@ -68,26 +68,59 @@
> /*
> * Structures associated with name caching.
> */
> -LIST_HEAD(nchashhead, namecache) *nchashtbl;
> u_long nchash; /* size of hash table - 1 */
> -long numcache; /* number of cache entries allocated */
> -TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */
> -struct nchstats nchstats; /* cache effectiveness
> statistics */
> -
> -LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl;
> -u_long ncvhash; /* size of hash table - 1 */
> -
> -#define NCHASH(dvp, cnp) \
> - hash32_buf(&(dvp)->v_id, sizeof((dvp)->v_id), (cnp)->cn_hash) & nchash
> +long numcache; /* total number of cache entries
> allocated */
> +long numneg; /* number of negative cache entries */
>
> -#define NCVHASH(vp) (vp)->v_id & ncvhash
> +TAILQ_HEAD(, namecache) nclruhead; /* Regular Entry LRU chain */
> +TAILQ_HEAD(, namecache) nclruneghead; /* Negative Entry LRU
> chain */
> +struct nchstats nchstats; /* cache effectiveness
> statistics */
>
> int doingcache = 1; /* 1 => enable the cache */
>
> struct pool nch_pool;
>
> +void cache_zap(struct namecache *);
> u_long nextvnodeid;
>
> +static int
> +namecache_compare(struct namecache *n1, struct namecache *n2)
> +{
> + if (n1->nc_nlen == n2->nc_nlen)
> + return (memcmp(n1->nc_name, n2->nc_name, n1->nc_nlen));
> + else
> + return (n1->nc_nlen - n2->nc_nlen);
> +}
> +
> +RB_GENERATE(namecache_rb_cache, namecache, n_rbcache, namecache_compare);
> +
> +/*
> + * blow away a namecache entry
> + */
> +void
> +cache_zap(struct namecache *ncp)
> +{
> + struct vnode *dvp = NULL;
> +
> + if (ncp->nc_vp != NULL) {
> + TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
> + numcache--;
> + } else {
> + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg);
> + numneg--;
> + }
> + if (ncp->nc_dvp) {
> + RB_REMOVE(namecache_rb_cache, &ncp->nc_dvp->v_nc_tree, ncp);
> + if (RB_EMPTY(&ncp->nc_dvp->v_nc_tree))
> + dvp = ncp->nc_dvp;
> + }
> + if (ncp->nc_vp)
> + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_me);
> + pool_put(&nch_pool, ncp);
> + if (dvp)
> + vdrop(dvp);
> +}
> +
> /*
> * Look for a name in the cache. We don't do this if the segment name is
> * long, simply so the cache can avoid holding long names (which would
> @@ -107,7 +140,7 @@
> cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname
> *cnp)
> {
> struct namecache *ncp;
> - struct nchashhead *ncpp;
> + struct namecache n;
> struct vnode *vp;
> struct proc *p = curproc;
> u_long vpid;
> @@ -125,14 +158,11 @@
> return (-1);
> }
>
> - ncpp = &nchashtbl[NCHASH(dvp, cnp)];
> - LIST_FOREACH(ncp, ncpp, nc_hash) {
> - if (ncp->nc_dvp == dvp &&
> - ncp->nc_dvpid == dvp->v_id &&
> - ncp->nc_nlen == cnp->cn_namelen &&
> - !memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen))
> - break;
> - }
> + /* lookup in directory vnode's redblack tree */
> + n.nc_nlen = cnp->cn_namelen;
> + memcpy(n.nc_name, cnp->cn_nameptr, n.nc_nlen);
> + ncp = RB_FIND(namecache_rb_cache, &dvp->v_nc_tree, &n);
> +
> if (ncp == NULL) {
> nchstats.ncs_miss++;
> return (-1);
> @@ -145,13 +175,10 @@
> (cnp->cn_flags & ISLASTCN) == 0) {
> nchstats.ncs_neghits++;
> /*
> - * Move this slot to end of LRU chain,
> - * if not already there.
> + * Move this slot to end of the negative LRU chain,
> */
> - if (TAILQ_NEXT(ncp, nc_lru) != NULL) {
> - TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
> - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
> - }
> + TAILQ_REMOVE(&nclruneghead, ncp, nc_neg);
> + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg);
> return (ENOENT);
> } else {
> nchstats.ncs_badhits++;
> @@ -220,12 +247,10 @@
>
> nchstats.ncs_goodhits++;
> /*
> - * Move this slot to end of LRU chain, if not already there.
> + * Move this slot to end of the regular LRU chain.
> */
> - if (TAILQ_NEXT(ncp, nc_lru) != NULL) {
> - TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
> - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
> - }
> + TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
> + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
> *vpp = vp;
> return (0);
>
> @@ -235,16 +260,7 @@
> * the cache entry is invalid, or otherwise don't
> * want cache entry to exist.
> */
> - TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
> - LIST_REMOVE(ncp, nc_hash);
> - ncp->nc_hash.le_prev = NULL;
> -
> - if (ncp->nc_vhash.le_prev != NULL) {
> - LIST_REMOVE(ncp, nc_vhash);
> - ncp->nc_vhash.le_prev = NULL;
> - }
> -
> - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru);
> + cache_zap(ncp);
> return (-1);
> }
>
> @@ -267,62 +283,51 @@
> cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char
> *bufp)
> {
> struct namecache *ncp;
> - struct vnode *dvp;
> - struct ncvhashhead *nvcpp;
> + struct vnode *dvp = NULL;
> char *bp;
>
> if (!doingcache)
> goto out;
> -
> - nvcpp = &ncvhashtbl[NCVHASH(vp)];
> -
> - LIST_FOREACH(ncp, nvcpp, nc_vhash) {
> - if (ncp->nc_vp == vp &&
> - ncp->nc_vpid == vp->v_id &&
> - (dvp = ncp->nc_dvp) != NULL &&
> - /* avoid pesky '.' entries.. */
> - dvp != vp && ncp->nc_dvpid == dvp->v_id) {
> -
> + TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_me) {
> + dvp = ncp->nc_dvp;
> + if (dvp && dvp != vp)
> + goto found;
> + }
> + goto miss;
> +found:
> #ifdef DIAGNOSTIC
> - if (ncp->nc_nlen == 1 &&
> - ncp->nc_name[0] == '.')
> - panic("cache_revlookup: found entry for .");
> -
> - if (ncp->nc_nlen == 2 &&
> - ncp->nc_name[0] == '.' &&
> - ncp->nc_name[1] == '.')
> - panic("cache_revlookup: found entry for ..");
> + if (ncp->nc_nlen == 1 &&
> + ncp->nc_name[0] == '.')
> + panic("cache_revlookup: found entry for .");
> + if (ncp->nc_nlen == 2 &&
> + ncp->nc_name[0] == '.' &&
> + ncp->nc_name[1] == '.')
> + panic("cache_revlookup: found entry for ..");
> #endif
> - nchstats.ncs_revhits++;
> -
> - if (bufp != NULL) {
> - bp = *bpp;
> - bp -= ncp->nc_nlen;
> - if (bp <= bufp) {
> - *dvpp = NULL;
> - return (ERANGE);
> - }
> - memcpy(bp, ncp->nc_name, ncp->nc_nlen);
> - *bpp = bp;
> - }
> + nchstats.ncs_revhits++;
>
> - *dvpp = dvp;
> -
> - /*
> - * XXX: Should we vget() here to have more
> - * consistent semantics with cache_lookup()?
> - *
> - * For MP safety it might be necessary to do
> - * this here, while also protecting hash
> - * tables themselves to provide some sort of
> - * sane inter locking.
> - */
> - return (0);
> + if (bufp != NULL) {
> + bp = *bpp;
> + bp -= ncp->nc_nlen;
> + if (bp <= bufp) {
> + *dvpp = NULL;
> + return (ERANGE);
> }
> + memcpy(bp, ncp->nc_name, ncp->nc_nlen);
> + *bpp = bp;
> }
> - nchstats.ncs_revmiss++;
>
> - out:
> + *dvpp = dvp;
> +
> + /*
> + * XXX: Should we vget() here to have more
> + * consistent semantics with cache_lookup()?
> + */
> + return (0);
> +
> +miss:
> + nchstats.ncs_revmiss++;
> +out:
> *dvpp = NULL;
> return (-1);
> }
> @@ -334,57 +339,49 @@
> cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
> {
> struct namecache *ncp;
> - struct nchashhead *ncpp;
> - struct ncvhashhead *nvcpp;
>
> if (!doingcache || cnp->cn_namelen > NCHNAMLEN)
> return;
>
> /*
> - * Free the cache slot at head of lru chain.
> + * allocate, or recycle (free and allocate) an ncp.
> */
> - if (numcache < desiredvnodes) {
> - ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO);
> - numcache++;
> - } else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) {
> - TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
> - if (ncp->nc_hash.le_prev != NULL) {
> - LIST_REMOVE(ncp, nc_hash);
> - ncp->nc_hash.le_prev = NULL;
> - }
> - if (ncp->nc_vhash.le_prev != NULL) {
> - LIST_REMOVE(ncp, nc_vhash);
> - ncp->nc_vhash.le_prev = NULL;
> - }
> - } else
> - return;
> + if (numcache >= desiredvnodes) {
> + if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL)
> + cache_zap(ncp);
> + else if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL)
> + cache_zap(ncp);
> + else
> + panic("wtf? leak?");
> + }
> + ncp = pool_get(&nch_pool, PR_WAITOK|PR_ZERO);
> +
> /* grab the vnode we just found */
> ncp->nc_vp = vp;
> if (vp)
> ncp->nc_vpid = vp->v_id;
> +
> /* fill in cache info */
> ncp->nc_dvp = dvp;
> ncp->nc_dvpid = dvp->v_id;
> ncp->nc_nlen = cnp->cn_namelen;
> bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen);
> - TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
> - ncpp = &nchashtbl[NCHASH(dvp, cnp)];
> - LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
> -
> - /*
> - * Create reverse-cache entries (used in getcwd) for
> - * directories.
> - */
> -
> - ncp->nc_vhash.le_prev = NULL;
> - ncp->nc_vhash.le_next = NULL;
> -
> - if (vp && vp != dvp && vp->v_type == VDIR &&
> - (ncp->nc_nlen > 2 ||
> - (ncp->nc_nlen > 1 && ncp->nc_name[1] != '.') ||
> - (ncp->nc_nlen > 0 && ncp->nc_name[0] != '.'))) {
> - nvcpp = &ncvhashtbl[NCVHASH(vp)];
> - LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
> + if (RB_EMPTY(&dvp->v_nc_tree)) {
> + vhold(dvp);
> + }
> + if (RB_INSERT(namecache_rb_cache, &dvp->v_nc_tree, ncp) != NULL)
> + panic("Ich habe eine Rotweinflasche in meinem Arsch\n");
> + if (vp) {
> + TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
> + numcache++;
> + TAILQ_INSERT_TAIL(&vp->v_cache_dst, ncp, nc_me);
> + } else {
> + TAILQ_INSERT_TAIL(&nclruneghead, ncp, nc_neg);
> + numneg++;
> + }
> + if (numneg > desiredvnodes) {
> + if ((ncp = TAILQ_FIRST(&nclruneghead)) != NULL)
> + cache_zap(ncp);
> }
> }
>
> @@ -394,10 +391,8 @@
> void
> nchinit()
> {
> -
> TAILQ_INIT(&nclruhead);
> - nchashtbl = hashinit(desiredvnodes, M_CACHE, M_WAITOK, &nchash);
> - ncvhashtbl = hashinit(desiredvnodes/8, M_CACHE, M_WAITOK, &ncvhash);
> + TAILQ_INIT(&nclruneghead);
> pool_init(&nch_pool, sizeof(struct namecache), 0, 0, 0, "nchpl",
> &pool_allocator_nointr);
> }
> @@ -410,18 +405,16 @@
> cache_purge(struct vnode *vp)
> {
> struct namecache *ncp;
> - struct nchashhead *ncpp;
>
> + while ((ncp = TAILQ_FIRST(&vp->v_cache_dst)))
> + cache_zap(ncp);
> + while ((ncp = RB_ROOT(&vp->v_nc_tree)))
> + cache_zap(ncp);
> +
> + /* XXX this blows goats */
> vp->v_id = ++nextvnodeid;
> - if (nextvnodeid != 0)
> - return;
> - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
> - LIST_FOREACH(ncp, ncpp, nc_hash) {
> - ncp->nc_vpid = 0;
> - ncp->nc_dvpid = 0;
> - }
> - }
> - vp->v_id = ++nextvnodeid;
> + if (vp->v_id == 0)
> + vp->v_id = ++nextvnodeid;
> }
>
> /*
> @@ -437,6 +430,7 @@
> {
> struct namecache *ncp, *nxtcp;
>
> + /* whack the regular entries */
> for (ncp = TAILQ_FIRST(&nclruhead); ncp != TAILQ_END(&nclruhead);
> ncp = nxtcp) {
> if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) {
> @@ -444,19 +438,20 @@
> continue;
> }
> /* free the resources we had */
> - ncp->nc_vp = NULL;
> - ncp->nc_dvp = NULL;
> - TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
> - if (ncp->nc_hash.le_prev != NULL) {
> - LIST_REMOVE(ncp, nc_hash);
> - ncp->nc_hash.le_prev = NULL;
> - }
> - if (ncp->nc_vhash.le_prev != NULL) {
> - LIST_REMOVE(ncp, nc_vhash);
> - ncp->nc_vhash.le_prev = NULL;
> - }
> + cache_zap(ncp);
> /* cause rescan of list, it may have altered */
> nxtcp = TAILQ_FIRST(&nclruhead);
> - TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru);
> + }
> + /* whack the negative entries */
> + for (ncp = TAILQ_FIRST(&nclruneghead); ncp != TAILQ_END(&nclruneghead);
> + ncp = nxtcp) {
> + if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) {
> + nxtcp = TAILQ_NEXT(ncp, nc_neg);
> + continue;
> + }
> + /* free the resources we had */
> + cache_zap(ncp);
> + /* cause rescan of list, it may have altered */
> + nxtcp = TAILQ_FIRST(&nclruneghead);
> }
> }
> Index: kern/vfs_subr.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/vfs_subr.c,v
> retrieving revision 1.180
> diff -u -r1.180 vfs_subr.c
> --- kern/vfs_subr.c 2 Aug 2009 16:28:40 -0000 1.180
> +++ kern/vfs_subr.c 9 Aug 2009 18:30:12 -0000
> @@ -360,6 +360,8 @@
> splx(s);
> vp = pool_get(&vnode_pool, PR_WAITOK | PR_ZERO);
> RB_INIT(&vp->v_bufs_tree);
> + RB_INIT(&vp->v_nc_tree);
> + TAILQ_INIT(&vp->v_cache_dst);
> numvnodes++;
> } else {
> for (vp = TAILQ_FIRST(listhd); vp != NULLVP;
> Index: sys/namei.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/namei.h,v
> retrieving revision 1.22
> diff -u -r1.22 namei.h
> --- sys/namei.h 29 Aug 2008 08:57:28 -0000 1.22
> +++ sys/namei.h 9 Aug 2009 18:30:12 -0000
> @@ -36,6 +36,11 @@
> #define _SYS_NAMEI_H_
>
> #include <sys/queue.h>
> +#include <sys/tree.h>
> +
> +struct namecache;
> +struct namecache_rb_cache;
> +RB_PROTOTYPE(namecache_rb_cache, namecache, n_rbcache, namecache_compare);
>
> /*
> * Encapsulation of namei parameters.
> @@ -156,9 +161,10 @@
> #define NCHNAMLEN 31 /* maximum name segment length we
> bother with */
>
> struct namecache {
> - LIST_ENTRY(namecache) nc_hash; /* hash chain */
> - LIST_ENTRY(namecache) nc_vhash; /* (reverse) directory hash chain */
> - TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */
> + TAILQ_ENTRY(namecache) nc_lru; /* Regular Entry LRU chain */
> + TAILQ_ENTRY(namecache) nc_neg; /* Negative Entry LRU chain */
> + RB_ENTRY(namecache) n_rbcache; /* Namecache rb tree from vnode */
> + TAILQ_ENTRY(namecache) nc_me; /* ncp's referring to me */
> struct vnode *nc_dvp; /* vnode of parent of name */
> u_long nc_dvpid; /* capability number of nc_dvp */
> struct vnode *nc_vp; /* vnode the name refers to */
> Index: sys/vnode.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/vnode.h,v
> retrieving revision 1.102
> diff -u -r1.102 vnode.h
> --- sys/vnode.h 2 Aug 2009 16:28:40 -0000 1.102
> +++ sys/vnode.h 9 Aug 2009 18:30:12 -0000
> @@ -36,6 +36,7 @@
> #include <sys/types.h>
> #include <sys/queue.h>
> #include <sys/lock.h>
> +#include <sys/namei.h>
> #include <sys/selinfo.h>
> #include <sys/tree.h>
>
> @@ -82,6 +83,7 @@
> LIST_HEAD(buflists, buf);
>
> RB_HEAD(buf_rb_bufs, buf);
> +RB_HEAD(namecache_rb_cache, namecache);
>
> struct vnode {
> struct uvm_vnode v_uvm; /* uvm data */
> @@ -110,6 +112,10 @@
> struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */
> } v_un;
>
> + /* VFS namecache */
> + struct namecache_rb_cache v_nc_tree;
> + TAILQ_HEAD(, namecache) v_cache_dst; /* cache entries to us */
> +
> enum vtagtype v_tag; /* type of underlying data */
> void *v_data; /* private data for fs */
> struct selinfo v_selectinfo; /* identity of poller(s) */
> @@ -247,8 +253,9 @@
> extern int maxvnodes; /* XXX number of vnodes to
> allocate */
> extern time_t syncdelay; /* time to delay syncing vnodes
> */
> extern int rushjob; /* # of slots syncer should run
> ASAP */
> +extern void vhold(struct vnode *);
> +extern void vdrop(struct vnode *);
> #endif /* _KERNEL */
> -
>
> /*
> * Mods for exensibility.
>
--
#!/usr/bin/perl
if ((not 0 && not 1) != (! 0 && ! 1)) {
print "Larry and Tom must smoke some really primo stuff...\n";
}