- Shrink namecache entries to 64 bytes on 32-bit platforms and use 32-bit
key values there for speed (remains 128 bytes & 64-bits on _LP64).
- Comments.
ad 2023-09-09 18:27:59 +00:00
parent 7e018a29ea
commit 6e2810dfd3
4 changed files with 93 additions and 60 deletions


@ -1,7 +1,7 @@
/* $NetBSD: vfs_cache.c,v 1.154 2023/04/29 10:07:22 riastradh Exp $ */
/* $NetBSD: vfs_cache.c,v 1.155 2023/09/09 18:27:59 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
* Copyright (c) 2008, 2019, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@ -79,10 +79,9 @@
* we are not sensible, and use a per-directory data structure to index
* names, but the cache otherwise functions the same.
*
* The index is a red-black tree. There are no special concurrency
* requirements placed on it, because it's per-directory and protected
* by the namecache's per-directory locks. It should therefore not be
* difficult to experiment with other types of index.
* The index is a red-black tree. It should not be difficult to
* experiment with other types of index; note, however, that a tree
* can trivially be made to support lockless lookup.
*
* Each cached name is stored in a struct namecache, along with a
* pointer to the associated vnode (nc_vp). Names longer than a
@ -91,6 +90,19 @@
* in struct namecache. If it is a "negative" entry, (i.e. for a name
* that is known NOT to exist) the vnode pointer will be NULL.
*
* In practice this implementation is no slower than the hash table
* that preceded it, and in some cases it significantly outperforms
* the hash table. Some reasons why this might be:
*
* - natural partitioning provided by the file system structure, which
* the prior implementation discarded (global hash table).
* - worst case tree traversal of O(log n), whereas the hash table
* could have many collisions.
* - minimized cache misses & total L2/L3 CPU cache footprint; struct
* namecache and vnode_impl_t are laid out to keep cache footprint
* minimal in the lookup path; no hash table buckets to cache.
* - minimized number of conditionals & string comparisons.
*
* For a directory with 3 cached names for 3 distinct vnodes, the
* various vnodes and namecache structs would be connected like this
* (the root is at the bottom of the diagram):
@ -167,12 +179,10 @@
* used during reverse lookup)
*
* 3) cache_lru_lock (LRU list direction, used during reclaim)
*
* 4) vp->v_interlock (what the cache entry points to)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.154 2023/04/29 10:07:22 riastradh Exp $");
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.155 2023/09/09 18:27:59 ad Exp $");
#define __NAMECACHE_PRIVATE
#ifdef _KERNEL_OPT
@ -202,6 +212,16 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.154 2023/04/29 10:07:22 riastradh Ex
#include <miscfs/genfs/genfs.h>
/*
* Assert that data structure layout hasn't changed unintentionally.
*/
#ifdef _LP64
CTASSERT(sizeof(struct namecache) == 128);
#else
CTASSERT(sizeof(struct namecache) == 64);
#endif
CTASSERT(NC_NLEN_MASK >= MAXPATHLEN);
static void cache_activate(struct namecache *);
static void cache_update_stats(void *);
static int cache_compare_nodes(void *, const void *, const void *);
@ -262,7 +282,7 @@ static kmutex_t cache_stat_lock __cacheline_aligned;
*/
int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */
int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */
int cache_maxlen __read_mostly = USHRT_MAX; /* max name length to cache */
int cache_maxlen __read_mostly = NC_NLEN_MASK; /* max name length to cache */
int cache_stat_interval __read_mostly = 300; /* in seconds */
/*
@ -328,8 +348,8 @@ cache_compare_nodes(void *context, const void *n1, const void *n2)
if (nc1->nc_key > nc2->nc_key) {
return 1;
}
KASSERT(nc1->nc_nlen == nc2->nc_nlen);
return memcmp(nc1->nc_name, nc2->nc_name, nc1->nc_nlen);
KASSERT(NC_NLEN(nc1) == NC_NLEN(nc2));
return memcmp(nc1->nc_name, nc2->nc_name, NC_NLEN(nc1));
}
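The per-directory index is driven through NetBSD's rbtree(3), which takes an ops structure pairing the node comparator above with a key comparator. Neither the key comparator nor the ops table appears in this hunk, so the sketch below is only an illustration of how that wiring typically looks; cache_compare_key_sketch and cache_rbtree_ops_sketch are hypothetical names.

/*
 * Illustrative only; needs <sys/rbtree.h>.  The real key comparator and
 * ops table are not part of this diff and may differ in detail.
 */
static int
cache_compare_key_sketch(void *context, const void *n, const void *k)
{
        const struct namecache *ncp = n;
        const struct namecache *key = k;        /* a template entry */

        if (ncp->nc_key < key->nc_key)
                return -1;
        if (ncp->nc_key > key->nc_key)
                return 1;
        /* Equal keys imply equal lengths, so one memcmp decides. */
        return memcmp(ncp->nc_name, key->nc_name, NC_NLEN(key));
}

static const rb_tree_ops_t cache_rbtree_ops_sketch = {
        .rbto_compare_nodes = cache_compare_nodes,
        .rbto_compare_key = cache_compare_key_sketch,
        .rbto_node_offset = offsetof(struct namecache, nc_tree),
        .rbto_context = NULL,
};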
/*
@ -337,15 +357,15 @@ cache_compare_nodes(void *context, const void *n1, const void *n2)
* the key value to try and improve uniqueness, and so that length doesn't
* need to be compared separately for string comparisons.
*/
static inline uint64_t
static uintptr_t
cache_key(const char *name, size_t nlen)
{
uint64_t key;
uintptr_t key;
KASSERT(nlen <= USHRT_MAX);
KASSERT((nlen & ~NC_NLEN_MASK) == 0);
key = hash32_buf(name, nlen, HASH32_STR_INIT);
return (key << 32) | nlen;
return (key << NC_NLEN_BITS) | (uintptr_t)nlen;
}
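Packing the length into the low NC_NLEN_BITS of the key means two names of different length can never have equal keys, so memcmp() only runs on same-length candidates and no separate length comparison is needed. A minimal userland sketch of the idea, with hypothetical names (pack_key, same_name) and the same 11-bit split assumed:

#include <stdint.h>
#include <string.h>

#define KEY_NLEN_BITS   11
#define KEY_NLEN_MASK   ((1u << KEY_NLEN_BITS) - 1)

static uintptr_t
pack_key(uint32_t hash, size_t nlen)
{
        /* On a 32-bit uintptr_t the top KEY_NLEN_BITS of the hash are lost. */
        return ((uintptr_t)hash << KEY_NLEN_BITS) | (nlen & KEY_NLEN_MASK);
}

static int
same_name(uintptr_t key1, const char *name1, uintptr_t key2, const char *name2)
{
        if (key1 != key2)
                return 0;       /* hash or length differs: cannot match */
        /* Equal keys imply equal lengths, so one memcmp settles it. */
        return memcmp(name1, name2, key1 & KEY_NLEN_MASK) == 0;
}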
/*
@ -358,13 +378,13 @@ cache_remove(struct namecache *ncp, const bool dir2node)
{
struct vnode *vp, *dvp = ncp->nc_dvp;
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
size_t namelen = NC_NLEN(ncp);
KASSERT(rw_write_held(&dvi->vi_nc_lock));
KASSERT(cache_key(ncp->nc_name, ncp->nc_nlen) == ncp->nc_key);
KASSERT(cache_key(ncp->nc_name, namelen) == ncp->nc_key);
KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);
SDT_PROBE(vfs, namecache, invalidate, done, ncp,
0, 0, 0, 0);
SDT_PROBE(vfs, namecache, invalidate, done, ncp, 0, 0, 0, 0);
/*
* Remove from the vnode's list. This excludes cache_revlookup(),
@ -391,8 +411,8 @@ cache_remove(struct namecache *ncp, const bool dir2node)
mutex_exit(&cache_lru_lock);
/* Finally, free it. */
if (ncp->nc_nlen > NCHNAMLEN) {
size_t sz = offsetof(struct namecache, nc_name[ncp->nc_nlen]);
if (namelen > NCHNAMLEN) {
size_t sz = offsetof(struct namecache, nc_name[namelen]);
kmem_free(ncp, sz);
} else {
pool_cache_put(cache_pool, ncp);
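The free above mirrors the allocation: short names come from the pool cache, while longer ones are sized with offsetof() so only the bytes actually needed for nc_name are allocated. The allocation itself lives in cache_enter() and is not shown in this diff; the sketch below is what it is assumed to look like.

/* Assumed shape of the matching allocation in cache_enter(). */
struct namecache *ncp;
size_t sz;

if (namelen > NCHNAMLEN) {
        sz = offsetof(struct namecache, nc_name[namelen]);
        ncp = kmem_alloc(sz, KM_SLEEP);
} else {
        ncp = pool_cache_get(cache_pool, PR_WAITOK);
}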
@ -404,13 +424,15 @@ cache_remove(struct namecache *ncp, const bool dir2node)
*/
static struct namecache * __noinline
cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
uint64_t key)
uintptr_t key)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct rb_node *node = dvi->vi_nc_tree.rbt_root;
struct namecache *ncp;
int lrulist, diff;
enum cache_lru_id lrulist;
int diff;
KASSERT(namelen <= MAXPATHLEN);
KASSERT(rw_lock_held(&dvi->vi_nc_lock));
/*
@ -430,7 +452,7 @@ cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
KASSERT((void *)&ncp->nc_tree == (void *)ncp);
KASSERT(ncp->nc_dvp == dvp);
if (ncp->nc_key == key) {
KASSERT(ncp->nc_nlen == namelen);
KASSERT(NC_NLEN(ncp) == namelen);
diff = memcmp(ncp->nc_name, name, namelen);
if (__predict_true(diff == 0)) {
break;
@ -511,7 +533,7 @@ cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
struct vnode *vp;
uint64_t key;
uintptr_t key;
int error;
bool hit;
krw_t op;
@ -564,7 +586,7 @@ cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
COUNT(ncs_badhits);
return false;
}
if (ncp->nc_vp == NULL) {
if ((vp = ncp->nc_vp) == NULL) {
if (iswht_ret != NULL) {
/*
* Restore the ISWHITEOUT flag saved earlier.
@ -592,7 +614,6 @@ cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
rw_exit(&dvi->vi_nc_lock);
return hit;
}
vp = ncp->nc_vp;
error = vcache_tryvget(vp);
rw_exit(&dvi->vi_nc_lock);
if (error) {
@ -638,7 +659,8 @@ cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
krwlock_t *oldlock, *newlock;
uint64_t key;
struct vnode *vp;
uintptr_t key;
int error;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
@ -727,7 +749,7 @@ cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
name, namelen, 0, 0);
return false;
}
if (ncp->nc_vp == NULL) {
if ((vp = ncp->nc_vp) == NULL) {
/* found negative entry; vn is already null from above */
KASSERT(namelen != cache_mp_nlen);
KASSERT(name != cache_mp_name);
@ -749,7 +771,7 @@ cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
if (newlock) {
*plock = newlock;
}
*vn_ret = ncp->nc_vp;
*vn_ret = vp;
return true;
}
@ -772,8 +794,9 @@ cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
struct namecache *ncp;
enum cache_lru_id lrulist;
struct vnode *dvp;
int error, nlen, lrulist;
int error, nlen;
char *bp;
KASSERT(vp != NULL);
@ -812,12 +835,12 @@ cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
KASSERT(ncp->nc_vp == vp);
KASSERT(ncp->nc_dvp != NULL);
nlen = ncp->nc_nlen;
nlen = NC_NLEN(ncp);
/*
* Ignore mountpoint entries.
*/
if (ncp->nc_nlen == cache_mp_nlen) {
if (nlen == cache_mp_nlen) {
continue;
}
@ -929,7 +952,6 @@ cache_enter(struct vnode *dvp, struct vnode *vp,
ncp->nc_vp = vp;
ncp->nc_dvp = dvp;
ncp->nc_key = cache_key(name, namelen);
ncp->nc_nlen = namelen;
ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
memcpy(ncp->nc_name, name, namelen);
@ -941,7 +963,7 @@ cache_enter(struct vnode *dvp, struct vnode *vp,
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
if (oncp != ncp) {
KASSERT(oncp->nc_key == ncp->nc_key);
KASSERT(oncp->nc_nlen == ncp->nc_nlen);
KASSERT(NC_NLEN(oncp) == NC_NLEN(ncp));
KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
cache_remove(oncp, true);
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
@ -1107,12 +1129,11 @@ nchinit(void)
void
cache_cpu_init(struct cpu_info *ci)
{
void *p;
size_t sz;
sz = roundup2(sizeof(struct nchcpu), coherency_unit) + coherency_unit;
p = kmem_zalloc(sz, KM_SLEEP);
ci->ci_data.cpu_nch = (void *)roundup2((uintptr_t)p, coherency_unit);
sz = roundup2(sizeof(struct nchcpu), coherency_unit);
ci->ci_data.cpu_nch = kmem_zalloc(sz, KM_SLEEP);
KASSERT(((uintptr_t)ci->ci_data.cpu_nch & (coherency_unit - 1)) == 0);
}
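The per-CPU statistics block is now allocated at exactly roundup2(sizeof(struct nchcpu), coherency_unit) bytes, relying on the allocator handing back storage aligned to the coherency unit for such sizes instead of over-allocating and aligning the pointer by hand; the KASSERT checks that assumption holds. A worked example of the rounding (the 64-byte coherency_unit and 48-byte struct size are assumptions for illustration):

/*
 * roundup2(x, m) rounds x up to the next multiple of the power-of-two m,
 * e.g. roundup2(48, 64) == ((48 + 63) & ~63) == 64, so each CPU's counters
 * occupy a whole cache line and never false-share with another CPU's.
 */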
/*
@ -1232,7 +1253,7 @@ cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
uint64_t key;
uintptr_t key;
SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
@ -1524,7 +1545,7 @@ namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == vp) {
(*pr)("name %.*s\n", ncp->nc_nlen,
(*pr)("name %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
dvp = ncp->nc_dvp;
}
@ -1537,7 +1558,7 @@ namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == dvp) {
(*pr)("parent %.*s\n", ncp->nc_nlen,
(*pr)("parent %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
}
}


@ -1,4 +1,4 @@
/* $NetBSD: namei.src,v 1.60 2021/06/29 22:39:21 dholland Exp $ */
/* $NetBSD: namei.src,v 1.61 2023/09/09 18:27:59 ad Exp $ */
/*
* Copyright (c) 1985, 1989, 1991, 1993
@ -199,13 +199,25 @@ NAMEIFL PARAMASK 0x02ff800 /* mask of parameter descriptors */
*/
#define NCHNAMLEN sizeof(((struct namecache *)NULL)->nc_name)
/*
* The uintptr_t-sized key value computed for each name consists of name
* length and a hash value. On 32-bit platforms the top NC_NLEN_BITS of
* the 32-bit hash value are lopped off.
*/
#define NC_NLEN_BITS 11
#define NC_NLEN_MASK ((1 << NC_NLEN_BITS) - 1)
#define NC_NLEN(ncp) ((ncp)->nc_key & NC_NLEN_MASK)
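With NC_NLEN_BITS set to 11 the length field spans 0..2047, which covers MAXPATHLEN (1024 on NetBSD, stated here as an assumption); the CTASSERT(NC_NLEN_MASK >= MAXPATHLEN) added to vfs_cache.c is what actually enforces the relationship.

/* NC_NLEN_MASK = (1 << 11) - 1 = 2047 >= MAXPATHLEN, so any cacheable
   name length fits in the low bits of nc_key. */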
/*
* Namecache entry.
*
* This structure describes the elements in the cache of recent names looked
* up by namei. It's carefully sized to take up 128 bytes on _LP64, to make
* good use of space and the CPU caches. Items used during RB tree lookup
* (nc_tree, nc_key) are clustered at the start of the structure.
* up by namei. It's carefully sized to take up 128 bytes on _LP64 and 64
* bytes on 32-bit machines, to make good use of space and the CPU caches.
*
* Items used during RB tree lookup (nc_tree, nc_key) are clustered at the
* start of the structure to minimise cache misses during lookup.
*
* Field markings and their corresponding locks:
*
@ -216,17 +228,20 @@ NAMEIFL PARAMASK 0x02ff800 /* mask of parameter descriptors */
*/
struct namecache {
struct rb_node nc_tree; /* d red-black tree, must be first */
uint64_t nc_key; /* - hashed key value */
uintptr_t nc_key; /* - hashed key value */
TAILQ_ENTRY(namecache) nc_list; /* v nc_vp's list of cache entries */
TAILQ_ENTRY(namecache) nc_lru; /* l pseudo-lru chain */
struct vnode *nc_dvp; /* - vnode of parent of name */
struct vnode *nc_vp; /* - vnode the name refers to */
int nc_lrulist; /* l which LRU list it's on */
u_short nc_nlen; /* - length of the name */
char nc_whiteout; /* - true if a whiteout */
char nc_name[41]; /* - segment name */
};
u_char nc_lrulist; /* l LRU list the entry is on */
u_char nc_whiteout; /* - whiteout indicator */
#ifdef _LP64
char nc_name[46]; /* - segment name */
#else
char nc_name[22]; /* - segment name */
#endif
};
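A rough byte budget shows how the two targets come out, assuming struct rb_node is three pointer-sized words and each TAILQ_ENTRY is two (per <sys/rbtree.h> and <sys/queue.h>); all members up to nc_name are pointer-aligned, so no padding is introduced:

/*
 * Approximate layout, in bytes (illustrative; exact sizes depend on the
 * definitions of struct rb_node and TAILQ_ENTRY):
 *
 *                    _LP64    32-bit
 *   nc_tree             24        12
 *   nc_key               8         4
 *   nc_list             16         8
 *   nc_lru              16         8
 *   nc_dvp               8         4
 *   nc_vp                8         4
 *   nc_lrulist           1         1
 *   nc_whiteout          1         1
 *   nc_name             46        22
 *                      ----      ----
 *                      128        64
 */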
#endif /* __NAMECACHE_PRIVATE */
#ifdef _KERNEL
#include <sys/pool.h>


@ -1,4 +1,4 @@
/* $NetBSD: main.h,v 1.7 2022/08/21 07:46:52 mlelstv Exp $ */
/* $NetBSD: main.h,v 1.8 2023/09/09 18:27:59 ad Exp $ */
/*
* Copyright (c) 2003 The NetBSD Foundation, Inc.
@ -36,9 +36,6 @@ extern u_long kernel_map_addr;
extern void *uvm_vnodeops, *uvm_deviceops, *aobj_pager, *ubc_pager;
extern rlim_t maxssiz;
LIST_HEAD(nchashhead, namecache);
extern struct nchashhead *nchashtbl;
struct cache_entry {
LIST_ENTRY(cache_entry) ce_next;
struct vnode *ce_vp, *ce_pvp;


@ -1,7 +1,7 @@
/* $NetBSD: pmap.c,v 1.57 2022/08/21 07:46:52 mlelstv Exp $ */
/* $NetBSD: pmap.c,v 1.58 2023/09/09 18:27:59 ad Exp $ */
/*
* Copyright (c) 2002, 2003, 2020 The NetBSD Foundation, Inc.
* Copyright (c) 2002, 2003, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@ -31,7 +31,7 @@
#include <sys/cdefs.h>
#ifndef lint
__RCSID("$NetBSD: pmap.c,v 1.57 2022/08/21 07:46:52 mlelstv Exp $");
__RCSID("$NetBSD: pmap.c,v 1.58 2023/09/09 18:27:59 ad Exp $");
#endif
#include <string.h>
@ -874,7 +874,7 @@ search_cache(kvm_t *kd, struct vnode *vp, char **name, char *buf, size_t blen)
(vi.vi_vnode.v_vflag & VV_ROOT) != 0)
break;
/* Otherwise pull first NCHNAMLEN chars of name. */
nlen = MIN((size_t)nc.nc_nlen, NCHNAMLEN);
nlen = MIN(NC_NLEN(&nc), NCHNAMLEN);
/* too small */
if ((size_t)(o - buf) < nlen + (o != e ? 1 : 0))
break;