Restructure the name cache code to eliminate most lock contention resulting from forward lookups. Discussed on tech-kern@.
Author: ad
Date:   2008-04-11 15:25:24 +00:00
Commit: 1e11b07bfa
Parent: 1c3c41f771
4 changed files with 429 additions and 139 deletions
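
In outline, the change gives each CPU its own lock for forward lookups, so lookups no longer contend with one another; modifications still take a single global lock, and garbage collection briefly takes every per-CPU lock to exclude all lookups. Below is a minimal user-space sketch of that pattern using pthreads; NCPU, the function names, and everything else in it are illustrative assumptions, not kernel interfaces from this commit.

/* Sketch of the locking scheme: lookups ("readers") take only their
 * own CPU's mutex, so they never contend with each other; reclaim
 * takes the global modification lock and then every per-CPU mutex,
 * briefly excluding all lookups.  User-space analogue only. */
#include <pthread.h>
#include <stdio.h>

#define NCPU 4	/* illustrative; the kernel iterates the real CPUs */

static pthread_mutex_t cpu_cachelock[NCPU];
static pthread_mutex_t namecache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Forward lookup: contends only with reclaim, not with other lookups. */
static void
lookup(int cpu)
{
	pthread_mutex_lock(&cpu_cachelock[cpu]);
	/* ... walk the hash chain, take the entry's own lock ... */
	pthread_mutex_unlock(&cpu_cachelock[cpu]);
}

/* Reclaim: serialize against other modifiers, then lock out lookups
 * on every CPU while dead entries are unhashed and freed. */
static void
reclaim(void)
{
	pthread_mutex_lock(&namecache_lock);
	for (int i = 0; i < NCPU; i++)
		pthread_mutex_lock(&cpu_cachelock[i]);
	/* ... unhash and free the entries queued for collection ... */
	for (int i = NCPU - 1; i >= 0; i--)
		pthread_mutex_unlock(&cpu_cachelock[i]);
	pthread_mutex_unlock(&namecache_lock);
}

int
main(void)
{
	for (int i = 0; i < NCPU; i++)
		pthread_mutex_init(&cpu_cachelock[i], NULL);
	lookup(0);
	reclaim();
	puts("lookup and reclaim completed");
	return 0;
}

The point of the asymmetry is that the common operation (lookup) pays only an uncontended per-CPU lock, while the rare operation (reclaim) pays the full cost of visiting every CPU.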

sys/kern/kern_cpu.c

@ -1,4 +1,4 @@
/* $NetBSD: kern_cpu.c,v 1.22 2008/03/22 18:04:42 ad Exp $ */
/* $NetBSD: kern_cpu.c,v 1.23 2008/04/11 15:25:24 ad Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
@ -64,7 +64,7 @@
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.22 2008/03/22 18:04:42 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.23 2008/04/11 15:25:24 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -82,6 +82,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.22 2008/03/22 18:04:42 ad Exp $");
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/select.h>
#include <sys/namei.h>
#include <uvm/uvm_extern.h>
@ -135,6 +136,7 @@ mi_cpu_attach(struct cpu_info *ci)
xc_init_cpu(ci);
pool_cache_cpu_init(ci);
selsysinit(ci);
cache_cpu_init(ci);
TAILQ_INIT(&ci->ci_data.cpu_biodone);
ncpu++;
ncpuonline++;

sys/kern/vfs_cache.c

@ -1,4 +1,37 @@
/* $NetBSD: vfs_cache.c,v 1.72 2007/11/11 23:22:25 matt Exp $ */
/* $NetBSD: vfs_cache.c,v 1.73 2008/04/11 15:25:24 ad Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
@ -32,7 +65,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.72 2007/11/11 23:22:25 matt Exp $");
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.73 2008/04/11 15:25:24 ad Exp $");
#include "opt_ddb.h"
#include "opt_revcache.h"
@ -47,6 +80,11 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.72 2007/11/11 23:22:25 matt Exp $");
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/kthread.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>
#define NAMECACHE_ENTER_REVERSE
/*
@ -75,7 +113,6 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.72 2007/11/11 23:22:25 matt Exp $");
*/
LIST_HEAD(nchashhead, namecache) *nchashtbl;
u_long nchash; /* size of hash table - 1 */
long numcache; /* number of cache entries allocated */
#define NCHASH(cnp, dvp) \
(((cnp)->cn_hash ^ ((uintptr_t)(dvp) >> 3)) & nchash)
@ -83,37 +120,78 @@ LIST_HEAD(ncvhashhead, namecache) *ncvhashtbl;
u_long ncvhash; /* size of hash table - 1 */
#define NCVHASH(vp) (((uintptr_t)(vp) >> 3) & ncvhash)
long numcache; /* number of cache entries allocated */
static u_int cache_gcpend; /* number of entries pending GC */
static void *cache_gcqueue; /* garbage collection queue */
TAILQ_HEAD(, namecache) nclruhead = /* LRU chain */
TAILQ_HEAD_INITIALIZER(nclruhead);
#define COUNT(x) nchstats.x++
struct nchstats nchstats; /* cache effectiveness statistics */
static pool_cache_t namecache_cache;
MALLOC_DEFINE(M_CACHE, "namecache", "Dynamically allocated cache entries");
int cache_lowat = 95;
int cache_hiwat = 98;
int cache_hottime = 5; /* number of seconds */
int doingcache = 1; /* 1 => enable the cache */
/* A single lock to protect cache insertion, removal and lookup */
static kmutex_t namecache_lock;
static struct evcnt cache_ev_scan;
static struct evcnt cache_ev_gc;
static struct evcnt cache_ev_over;
static struct evcnt cache_ev_under;
static struct evcnt cache_ev_forced;
static void cache_remove(struct namecache *);
static void cache_free(struct namecache *);
/* A single lock to serialize modifications. */
static kmutex_t *namecache_lock;
static void cache_invalidate(struct namecache *);
static inline struct namecache *cache_lookup_entry(
const struct vnode *, const struct componentname *);
static void cache_thread(void *);
static void cache_invalidate(struct namecache *);
static void cache_disassociate(struct namecache *);
static void cache_reclaim(void);
static int cache_ctor(void *, void *, int);
static void cache_dtor(void *, void *);
/*
* Invalidate a cache entry and enqueue it for garbage collection.
*/
static void
cache_remove(struct namecache *ncp)
cache_invalidate(struct namecache *ncp)
{
void *head;
KASSERT(mutex_owned(&ncp->nc_lock));
if (ncp->nc_dvp != NULL) {
ncp->nc_vp = NULL;
ncp->nc_dvp = NULL;
do {
head = cache_gcqueue;
ncp->nc_gcqueue = head;
} while (atomic_cas_ptr(&cache_gcqueue, head, ncp) != head);
atomic_inc_uint(&cache_gcpend);
}
}
/*
* Disassociate a namecache entry from any vnodes it is attached to,
* and remove from the global LRU list.
*/
static void
cache_disassociate(struct namecache *ncp)
{
KASSERT(mutex_owned(&namecache_lock));
KASSERT(mutex_owned(namecache_lock));
KASSERT(ncp->nc_dvp == NULL);
ncp->nc_dvp = NULL;
ncp->nc_vp = NULL;
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
if (ncp->nc_hash.le_prev != NULL) {
LIST_REMOVE(ncp, nc_hash);
ncp->nc_hash.le_prev = NULL;
if (ncp->nc_lru.tqe_prev != NULL) {
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
ncp->nc_lru.tqe_prev = NULL;
}
if (ncp->nc_vhash.le_prev != NULL) {
LIST_REMOVE(ncp, nc_vhash);
@ -129,32 +207,62 @@ cache_remove(struct namecache *ncp)
}
}
/*
* Lock all CPUs to prevent any cache lookup activity. Conceptually,
* this locks out all "readers".
*/
static void
cache_free(struct namecache *ncp)
cache_lock_cpus(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
pool_cache_put(namecache_cache, ncp);
numcache--;
for (CPU_INFO_FOREACH(cii, ci)) {
mutex_enter(ci->ci_data.cpu_cachelock);
}
}
static inline struct namecache *
/*
* Release all CPU locks.
*/
static void
cache_unlock_cpus(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
mutex_exit(ci->ci_data.cpu_cachelock);
}
}
/*
* Find a single cache entry and return it locked. 'namecache_lock' or
* at least one of the per-CPU locks must be held.
*/
static struct namecache *
cache_lookup_entry(const struct vnode *dvp, const struct componentname *cnp)
{
struct nchashhead *ncpp;
struct namecache *ncp;
KASSERT(mutex_owned(&namecache_lock));
ncpp = &nchashtbl[NCHASH(cnp, dvp)];
LIST_FOREACH(ncp, ncpp, nc_hash) {
if (ncp->nc_dvp == dvp &&
ncp->nc_nlen == cnp->cn_namelen &&
!memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen))
break;
if (ncp->nc_dvp != dvp ||
ncp->nc_nlen != cnp->cn_namelen ||
memcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen))
continue;
mutex_enter(&ncp->nc_lock);
if (ncp->nc_dvp == dvp) {
ncp->nc_hittime = hardclock_ticks;
return ncp;
}
/* Raced: entry has been nullified. */
mutex_exit(&ncp->nc_lock);
}
return ncp;
return NULL;
}
/*
@ -178,6 +286,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
struct namecache *ncp;
struct vnode *vp;
kmutex_t *cpulock;
int error;
if (!doingcache) {
@ -188,18 +297,19 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
if (cnp->cn_namelen > NCHNAMLEN) {
/* Unlocked, but only for stats. */
nchstats.ncs_long++;
COUNT(ncs_long);
cnp->cn_flags &= ~MAKEENTRY;
goto fail;
}
mutex_enter(&namecache_lock);
cpulock = curcpu()->ci_data.cpu_cachelock;
mutex_enter(cpulock);
ncp = cache_lookup_entry(dvp, cnp);
if (ncp == NULL) {
nchstats.ncs_miss++;
COUNT(ncs_miss);
goto fail_wlock;
}
if ((cnp->cn_flags & MAKEENTRY) == 0) {
nchstats.ncs_badhits++;
COUNT(ncs_badhits);
goto remove;
} else if (ncp->nc_vp == NULL) {
/*
@ -208,41 +318,25 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
cnp->cn_flags |= ncp->nc_flags;
if (cnp->cn_nameiop != CREATE ||
(cnp->cn_flags & ISLASTCN) == 0) {
nchstats.ncs_neghits++;
/*
* Move this slot to end of LRU chain,
* if not already there.
*/
if (TAILQ_NEXT(ncp, nc_lru) != 0) {
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
}
mutex_exit(&namecache_lock);
COUNT(ncs_neghits);
mutex_exit(&ncp->nc_lock);
mutex_exit(cpulock);
return (ENOENT);
} else {
nchstats.ncs_badhits++;
COUNT(ncs_badhits);
goto remove;
}
}
vp = ncp->nc_vp;
/*
* Move this slot to end of LRU chain, if not already there.
*/
if (TAILQ_NEXT(ncp, nc_lru) != 0) {
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
}
error = vget(vp, LK_NOWAIT);
/* Release the name cache mutex while we get reference to the vnode */
mutex_exit(&namecache_lock);
mutex_enter(&vp->v_interlock);
mutex_exit(&ncp->nc_lock);
mutex_exit(cpulock);
error = vget(vp, LK_NOWAIT | LK_INTERLOCK);
#ifdef DEBUG
/*
* since we released namecache_lock,
* since we released ncp->nc_lock,
* we can't use this pointer any more.
*/
ncp = NULL;
@ -253,7 +347,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
/*
* this vnode is being cleaned out.
*/
nchstats.ncs_falsehits++; /* XXX badhits? */
COUNT(ncs_falsehits); /* XXX badhits? */
goto fail;
}
@ -272,13 +366,13 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
*/
if (error) {
/* Unlocked, but only for stats. */
nchstats.ncs_badhits++;
COUNT(ncs_badhits);
*vpp = NULL;
return (-1);
}
/* Unlocked, but only for stats. */
nchstats.ncs_goodhits++;
COUNT(ncs_goodhits);
*vpp = vp;
return (0);
@ -288,11 +382,10 @@ remove:
* the cache entry is invalid, or otherwise don't
* want cache entry to exist.
*/
cache_remove(ncp);
cache_free(ncp);
cache_invalidate(ncp);
mutex_exit(&ncp->nc_lock);
fail_wlock:
mutex_exit(&namecache_lock);
mutex_exit(cpulock);
fail:
*vpp = NULL;
return (-1);
@ -304,6 +397,7 @@ cache_lookup_raw(struct vnode *dvp, struct vnode **vpp,
{
struct namecache *ncp;
struct vnode *vp;
kmutex_t *cpulock;
int error;
if (!doingcache) {
@ -314,47 +408,39 @@ cache_lookup_raw(struct vnode *dvp, struct vnode **vpp,
if (cnp->cn_namelen > NCHNAMLEN) {
/* Unlocked, but only for stats. */
nchstats.ncs_long++;
COUNT(ncs_long);
cnp->cn_flags &= ~MAKEENTRY;
goto fail;
}
mutex_enter(&namecache_lock);
cpulock = curcpu()->ci_data.cpu_cachelock;
mutex_enter(cpulock);
ncp = cache_lookup_entry(dvp, cnp);
if (ncp == NULL) {
nchstats.ncs_miss++;
COUNT(ncs_miss);
goto fail_wlock;
}
/*
* Move this slot to end of LRU chain,
* if not already there.
*/
if (TAILQ_NEXT(ncp, nc_lru) != 0) {
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
}
vp = ncp->nc_vp;
if (vp == NULL) {
/*
* Restore the ISWHITEOUT flag saved earlier.
*/
cnp->cn_flags |= ncp->nc_flags;
nchstats.ncs_neghits++;
mutex_exit(&namecache_lock);
COUNT(ncs_neghits);
mutex_exit(&ncp->nc_lock);
mutex_exit(cpulock);
return (ENOENT);
}
error = vget(vp, LK_NOWAIT);
/* Release the name cache mutex while we get reference to the vnode */
mutex_exit(&namecache_lock);
mutex_enter(&vp->v_interlock);
mutex_exit(&ncp->nc_lock);
mutex_exit(cpulock);
error = vget(vp, LK_NOWAIT | LK_INTERLOCK);
if (error) {
KASSERT(error == EBUSY);
/*
* this vnode is being cleaned out.
*/
nchstats.ncs_falsehits++; /* XXX badhits? */
COUNT(ncs_falsehits); /* XXX badhits? */
goto fail;
}
@ -363,7 +449,7 @@ cache_lookup_raw(struct vnode *dvp, struct vnode **vpp,
return 0;
fail_wlock:
mutex_exit(&namecache_lock);
mutex_exit(cpulock);
fail:
*vpp = NULL;
return -1;
@ -394,8 +480,9 @@ cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
nvcpp = &ncvhashtbl[NCVHASH(vp)];
mutex_enter(&namecache_lock);
mutex_enter(namecache_lock);
LIST_FOREACH(ncp, nvcpp, nc_vhash) {
mutex_enter(&ncp->nc_lock);
if (ncp->nc_vp == vp &&
(dvp = ncp->nc_dvp) != NULL &&
dvp != vp) { /* avoid pesky . entries.. */
@ -410,14 +497,15 @@ cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
ncp->nc_name[1] == '.')
panic("cache_revlookup: found entry for ..");
#endif
nchstats.ncs_revhits++;
COUNT(ncs_revhits);
if (bufp) {
bp = *bpp;
bp -= ncp->nc_nlen;
if (bp <= bufp) {
*dvpp = NULL;
mutex_exit(&namecache_lock);
mutex_exit(&ncp->nc_lock);
mutex_exit(namecache_lock);
return (ERANGE);
}
memcpy(bp, ncp->nc_name, ncp->nc_nlen);
@ -426,12 +514,14 @@ cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp)
/* XXX MP: how do we know dvp won't evaporate? */
*dvpp = dvp;
mutex_exit(&namecache_lock);
mutex_exit(&ncp->nc_lock);
mutex_exit(namecache_lock);
return (0);
}
mutex_exit(&ncp->nc_lock);
}
nchstats.ncs_revmiss++;
mutex_exit(&namecache_lock);
COUNT(ncs_revmiss);
mutex_exit(namecache_lock);
out:
*dvpp = NULL;
return (-1);
@ -454,37 +544,34 @@ cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
#endif
if (!doingcache)
return;
/*
* Free the cache slot at head of lru chain.
*/
mutex_enter(&namecache_lock);
if (numcache < numvnodes) {
numcache++;
mutex_exit(&namecache_lock);
ncp = pool_cache_get(namecache_cache, PR_WAITOK);
memset(ncp, 0, sizeof(*ncp));
mutex_enter(&namecache_lock);
} else if ((ncp = TAILQ_FIRST(&nclruhead)) != NULL) {
cache_remove(ncp);
} else {
mutex_exit(&namecache_lock);
return;
if (numcache > desiredvnodes) {
mutex_enter(namecache_lock);
cache_ev_forced.ev_count++;
cache_reclaim();
mutex_exit(namecache_lock);
}
ncp = pool_cache_get(namecache_cache, PR_WAITOK);
mutex_enter(namecache_lock);
numcache++;
/*
* Concurrent lookups in the same directory may race for a
* cache entry. If there's a duplicate entry, free it.
*/
oncp = cache_lookup_entry(dvp, cnp);
if (oncp) {
cache_remove(oncp);
cache_free(oncp);
cache_invalidate(oncp);
mutex_exit(&oncp->nc_lock);
}
KASSERT(cache_lookup_entry(dvp, cnp) == NULL);
/* Grab the vnode we just found. */
mutex_enter(&ncp->nc_lock);
ncp->nc_vp = vp;
ncp->nc_flags = 0;
ncp->nc_hittime = 0;
ncp->nc_gcqueue = NULL;
if (vp == NULL) {
/*
* For negative hits, save the ISWHITEOUT flag so we can
@ -497,10 +584,21 @@ cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
LIST_INSERT_HEAD(&dvp->v_dnclist, ncp, nc_dvlist);
if (vp)
LIST_INSERT_HEAD(&vp->v_nclist, ncp, nc_vlist);
else {
ncp->nc_vlist.le_prev = NULL;
ncp->nc_vlist.le_next = NULL;
}
ncp->nc_nlen = cnp->cn_namelen;
memcpy(ncp->nc_name, cnp->cn_nameptr, (unsigned)ncp->nc_nlen);
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
memcpy(ncp->nc_name, cnp->cn_nameptr, (unsigned)ncp->nc_nlen);
ncpp = &nchashtbl[NCHASH(cnp, dvp)];
/*
* Flush updates before making visible in table. No need for a
* memory barrier on the other side: to see modifications the
* list must be followed, meaning a dependent pointer load.
*/
membar_producer();
LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
ncp->nc_vhash.le_prev = NULL;
@ -521,7 +619,8 @@ cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
nvcpp = &ncvhashtbl[NCVHASH(vp)];
LIST_INSERT_HEAD(nvcpp, ncp, nc_vhash);
}
mutex_exit(&namecache_lock);
mutex_exit(&ncp->nc_lock);
mutex_exit(namecache_lock);
}
/*
@ -530,12 +629,15 @@ cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
void
nchinit(void)
{
int error;
namecache_cache = pool_cache_init(sizeof(struct namecache), 0, 0, 0,
"ncachepl", NULL, IPL_NONE, NULL, NULL, NULL);
namecache_cache = pool_cache_init(sizeof(struct namecache),
coherency_unit, 0, 0, "ncache", NULL, IPL_NONE, cache_ctor,
cache_dtor, NULL);
KASSERT(namecache_cache != NULL);
mutex_init(&namecache_lock, MUTEX_DEFAULT, IPL_NONE);
namecache_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
nchashtbl =
hashinit(desiredvnodes, HASH_LIST, M_CACHE, M_WAITOK, &nchash);
ncvhashtbl =
@ -544,6 +646,52 @@ nchinit(void)
#else
hashinit(desiredvnodes/8, HASH_LIST, M_CACHE, M_WAITOK, &ncvhash);
#endif
error = kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, cache_thread,
NULL, NULL, "cachegc");
if (error != 0)
panic("nchinit %d", error);
evcnt_attach_dynamic(&cache_ev_scan, EVCNT_TYPE_MISC, NULL,
"namecache", "entries scanned");
evcnt_attach_dynamic(&cache_ev_gc, EVCNT_TYPE_MISC, NULL,
"namecache", "entries collected");
evcnt_attach_dynamic(&cache_ev_over, EVCNT_TYPE_MISC, NULL,
"namecache", "over scan target");
evcnt_attach_dynamic(&cache_ev_under, EVCNT_TYPE_MISC, NULL,
"namecache", "under scan target");
evcnt_attach_dynamic(&cache_ev_forced, EVCNT_TYPE_MISC, NULL,
"namecache", "forced reclaims");
}
static int
cache_ctor(void *arg, void *obj, int flag)
{
struct namecache *ncp;
ncp = obj;
mutex_init(&ncp->nc_lock, MUTEX_DEFAULT, IPL_NONE);
return 0;
}
static void
cache_dtor(void *arg, void *obj)
{
struct namecache *ncp;
ncp = obj;
mutex_destroy(&ncp->nc_lock);
}
/*
* Called once for each CPU in the system as attached.
*/
void
cache_cpu_init(struct cpu_info *ci)
{
ci->ci_data.cpu_cachelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}
/*
@ -564,7 +712,8 @@ nchreinit(void)
#else
hashinit(desiredvnodes/8, HASH_LIST, M_CACHE, M_WAITOK, &mask2);
#endif
mutex_enter(&namecache_lock);
mutex_enter(namecache_lock);
cache_lock_cpus();
oldhash1 = nchashtbl;
oldmask1 = nchash;
nchashtbl = hash1;
@ -585,7 +734,8 @@ nchreinit(void)
ncp->nc_vhash.le_prev = NULL;
}
}
mutex_exit(&namecache_lock);
cache_unlock_cpus();
mutex_exit(namecache_lock);
hashdone(oldhash1, M_CACHE);
hashdone(oldhash2, M_CACHE);
}
@ -599,31 +749,36 @@ cache_purge1(struct vnode *vp, const struct componentname *cnp, int flags)
{
struct namecache *ncp, *ncnext;
mutex_enter(&namecache_lock);
mutex_enter(namecache_lock);
if (flags & PURGE_PARENTS) {
for (ncp = LIST_FIRST(&vp->v_nclist); ncp != NULL;
ncp = ncnext) {
ncnext = LIST_NEXT(ncp, nc_vlist);
cache_remove(ncp);
cache_free(ncp);
mutex_enter(&ncp->nc_lock);
cache_invalidate(ncp);
mutex_exit(&ncp->nc_lock);
cache_disassociate(ncp);
}
}
if (flags & PURGE_CHILDREN) {
for (ncp = LIST_FIRST(&vp->v_dnclist); ncp != NULL;
ncp = ncnext) {
ncnext = LIST_NEXT(ncp, nc_dvlist);
cache_remove(ncp);
cache_free(ncp);
mutex_enter(&ncp->nc_lock);
cache_invalidate(ncp);
mutex_exit(&ncp->nc_lock);
cache_disassociate(ncp);
}
}
if (cnp != NULL) {
ncp = cache_lookup_entry(vp, cnp);
if (ncp) {
cache_remove(ncp);
cache_free(ncp);
cache_invalidate(ncp);
cache_disassociate(ncp);
mutex_exit(&ncp->nc_lock);
}
}
mutex_exit(&namecache_lock);
mutex_exit(namecache_lock);
}
/*
@ -635,17 +790,136 @@ cache_purgevfs(struct mount *mp)
{
struct namecache *ncp, *nxtcp;
mutex_enter(&namecache_lock);
mutex_enter(namecache_lock);
for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
nxtcp = TAILQ_NEXT(ncp, nc_lru);
if (ncp->nc_dvp == NULL || ncp->nc_dvp->v_mount != mp) {
mutex_enter(&ncp->nc_lock);
if (ncp->nc_dvp != NULL && ncp->nc_dvp->v_mount == mp) {
/* Free the resources we had. */
cache_invalidate(ncp);
cache_disassociate(ncp);
}
mutex_exit(&ncp->nc_lock);
}
cache_reclaim();
mutex_exit(namecache_lock);
}
/*
* Scan global list invalidating entries until we meet a preset target.
* Prefer to invalidate entries that have not scored a hit within
* cache_hottime seconds. We sort the LRU list only for this routine's
* benefit.
*/
static void
cache_prune(int incache, int target)
{
struct namecache *ncp, *nxtcp, *sentinel;
int items, recent, tryharder;
KASSERT(mutex_owned(namecache_lock));
items = 0;
tryharder = 0;
recent = hardclock_ticks - hz * cache_hottime;
sentinel = NULL;
for (ncp = TAILQ_FIRST(&nclruhead); ncp != NULL; ncp = nxtcp) {
if (incache <= target)
break;
items++;
nxtcp = TAILQ_NEXT(ncp, nc_lru);
if (ncp->nc_dvp == NULL)
continue;
if (ncp == sentinel) {
/*
* If we looped back on ourself, then ignore
* recent entries and purge whatever we find.
*/
tryharder = 1;
}
if (!tryharder && ncp->nc_hittime > recent) {
if (sentinel == NULL)
sentinel = ncp;
TAILQ_REMOVE(&nclruhead, ncp, nc_lru);
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru);
continue;
}
/* Free the resources we had. */
cache_remove(ncp);
cache_free(ncp);
mutex_enter(&ncp->nc_lock);
if (ncp->nc_dvp != NULL) {
cache_invalidate(ncp);
cache_disassociate(ncp);
incache--;
}
mutex_exit(&ncp->nc_lock);
}
cache_ev_scan.ev_count += items;
}
/*
* Collect dead cache entries from all CPUs and garbage collect.
*/
static void
cache_reclaim(void)
{
struct namecache *ncp, *next;
int items;
KASSERT(mutex_owned(namecache_lock));
/*
* If the number of extant entries not awaiting garbage collection
* exceeds the high water mark, then reclaim stale entries until we
* reach our low water mark.
*/
items = numcache - cache_gcpend;
if (items > (uint64_t)desiredvnodes * cache_hiwat / 100) {
cache_prune(items, (int)((uint64_t)desiredvnodes *
cache_lowat / 100));
cache_ev_over.ev_count++;
} else
cache_ev_under.ev_count++;
/*
* Stop forward lookup activity on all CPUs and garbage collect dead
* entries.
*/
cache_lock_cpus();
ncp = cache_gcqueue;
cache_gcqueue = NULL;
items = cache_gcpend;
cache_gcpend = 0;
while (ncp != NULL) {
next = ncp->nc_gcqueue;
cache_disassociate(ncp);
KASSERT(ncp->nc_dvp == NULL);
if (ncp->nc_hash.le_prev != NULL) {
LIST_REMOVE(ncp, nc_hash);
ncp->nc_hash.le_prev = NULL;
}
pool_cache_put(namecache_cache, ncp);
ncp = next;
}
cache_unlock_cpus();
numcache -= items;
cache_ev_gc.ev_count += items;
}
/*
* Cache maintenance thread, awakening once per second to:
*
* => keep number of entries below the high water mark
* => sort pseudo-LRU list
* => garbage collect dead entries
*/
static void
cache_thread(void *arg)
{
mutex_enter(namecache_lock);
for (;;) {
cache_reclaim();
kpause("cachegc", false, hz, namecache_lock);
}
mutex_exit(&namecache_lock);
}
#ifdef DDB
@ -656,7 +930,7 @@ namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
struct namecache *ncp;
TAILQ_FOREACH(ncp, &nclruhead, nc_lru) {
if (ncp->nc_vp == vp) {
if (ncp->nc_vp == vp && ncp->nc_dvp != NULL) {
(*pr)("name %.*s\n", ncp->nc_nlen, ncp->nc_name);
dvp = ncp->nc_dvp;
}
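
A note on the garbage-collection path above: lookups never free entries directly. cache_invalidate() pushes a dead entry onto cache_gcqueue with a compare-and-swap loop (atomic_cas_ptr), and cache_reclaim() later detaches the whole chain while all per-CPU locks are held. The following is a user-space sketch of that multi-producer, single-consumer push/drain idiom using C11 atomics; the struct and function names are illustrative, not taken from the kernel.

/* Sketch of the deferred-free queue: any thread pushes an invalidated
 * entry with a compare-and-swap loop, and a single collector later
 * takes the whole chain at once.  User-space analogue only. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct dead_entry {
	struct dead_entry *next;
	int id;
};

static _Atomic(struct dead_entry *) gcqueue;

/* Producer side: lock-free push of a dead entry. */
static void
gc_push(struct dead_entry *e)
{
	struct dead_entry *head = atomic_load(&gcqueue);

	do {
		e->next = head;
	} while (!atomic_compare_exchange_weak(&gcqueue, &head, e));
}

/* Consumer side: detach the whole chain in one exchange. */
static struct dead_entry *
gc_drain(void)
{
	return atomic_exchange(&gcqueue, NULL);
}

int
main(void)
{
	struct dead_entry a = { .id = 1 }, b = { .id = 2 };

	gc_push(&a);
	gc_push(&b);
	for (struct dead_entry *e = gc_drain(); e != NULL; e = e->next)
		printf("collected entry %d\n", e->id);
	return 0;
}

Deferring the unhash and free to a single collector is what lets cache_invalidate() run under only the entry's own lock.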

sys/sys/cpu_data.h

@ -1,4 +1,4 @@
/* $NetBSD: cpu_data.h,v 1.19 2008/03/22 18:04:42 ad Exp $ */
/* $NetBSD: cpu_data.h,v 1.20 2008/04/11 15:25:24 ad Exp $ */
/*-
* Copyright (c) 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@ -92,6 +92,7 @@ struct cpu_data {
TAILQ_HEAD(,buf) cpu_biodone; /* finished block xfers */
percpu_cpu_t cpu_percpu; /* per-cpu data */
struct selcpu *cpu_selcpu; /* per-CPU select() info */
void *cpu_cachelock; /* per-cpu vfs_cache lock */
};
/* compat definitions */

sys/sys/namei.src

@ -1,4 +1,4 @@
/* $NetBSD: namei.src,v 1.6 2007/12/08 19:29:52 pooka Exp $ */
/* $NetBSD: namei.src,v 1.7 2008/04/11 15:25:24 ad Exp $ */
/*
* Copyright (c) 1985, 1989, 1991, 1993
@ -35,6 +35,9 @@
#define _SYS_NAMEI_H_
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/kauth.h>
#ifdef _KERNEL
/*
* Encapsulation of namei parameters.
@ -132,11 +135,10 @@ NAMEIFL DOWHITEOUT 0x0040000 /* do whiteouts */
NAMEIFL REQUIREDIR 0x0080000 /* must be a directory */
NAMEIFL CREATEDIR 0x0200000 /* trailing slashes are ok */
NAMEIFL PARAMASK 0x02fff00 /* mask of parameter descriptors */
/*
* Initialization of a nameidata structure.
*/
#include <sys/kauth.h>
#define NDINIT(ndp, op, flags, segflg, namep) { \
(ndp)->ni_cnd.cn_nameiop = op; \
(ndp)->ni_cnd.cn_flags = flags; \
@ -155,17 +157,26 @@ NAMEIFL PARAMASK 0x02fff00 /* mask of parameter descriptors */
#define NCHNAMLEN 31 /* maximum name segment length we bother with */
/*
* Namecache entry. This structure is arranged so that frequently
* accessed and mostly read-only data is toward the front, with
* infrequently accessed data and the lock towards the rear. The
* lock is then more likely to be in a separate cache line.
*/
struct namecache {
LIST_ENTRY(namecache) nc_hash; /* hash chain */
TAILQ_ENTRY(namecache) nc_lru; /* LRU chain */
LIST_ENTRY(namecache) nc_vhash; /* directory hash chain */
LIST_ENTRY(namecache) nc_dvlist;
struct vnode *nc_dvp; /* vnode of parent of name */
LIST_ENTRY(namecache) nc_vlist;
struct vnode *nc_vp; /* vnode the name refers to */
int nc_flags; /* copy of componentname's ISWHITEOUT */
char nc_nlen; /* length of name */
char nc_name[NCHNAMLEN]; /* segment name */
void *nc_gcqueue; /* queue for garbage collection */
TAILQ_ENTRY(namecache) nc_lru; /* pseudo-LRU chain */
LIST_ENTRY(namecache) nc_dvlist;
LIST_ENTRY(namecache) nc_vlist;
kmutex_t nc_lock; /* lock on this entry */
int nc_hittime; /* last time scored a hit */
};
#ifdef _KERNEL
@ -173,6 +184,7 @@ struct namecache {
#include <sys/pool.h>
struct mount;
struct cpu_info;
extern pool_cache_t pnbuf_cache; /* pathname buffer cache */
@ -194,6 +206,7 @@ int cache_revlookup(struct vnode *, struct vnode **, char **, char *);
void cache_enter(struct vnode *, struct vnode *, struct componentname *);
void nchinit(void);
void nchreinit(void);
void cache_cpu_init(struct cpu_info *);
void cache_purgevfs(struct mount *);
void namecache_print(struct vnode *, void (*)(const char *, ...));
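
Returning to the layout comment on struct namecache above: placing the per-entry lock and other write-heavy fields at the rear makes it likely they land on a different cache line from the read-mostly name and vnode fields that every lookup touches. A small user-space check of where such a lock ends up, assuming a 64-byte cache line and purely illustrative field names and sizes (not the kernel's actual layout):

/* Report which cache line the per-entry lock falls on when the
 * write-heavy fields are placed at the rear of the structure.
 * Field names, sizes and the 64-byte line size are assumptions. */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

#define CACHE_LINE 64

struct entry {
	void	*hash_next;		/* read-mostly: hash chain link */
	void	*vp, *dvp;		/* read-mostly: cached vnodes */
	char	nlen;			/* read-mostly: name length */
	char	name[31];		/* read-mostly: name segment */
	int	hittime;		/* written on every hit */
	pthread_mutex_t lock;		/* taken on every lookup of this entry */
};

int
main(void)
{
	size_t off = offsetof(struct entry, lock);

	printf("lock at offset %zu, cache line %zu; name/vnode fields on line 0\n",
	    off, off / CACHE_LINE);
	return 0;
}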