Redo the page allocator to perform better, especially on multi-core and
multi-socket systems.  Proposed on tech-kern.  While here:

- add rudimentary NUMA support - needs more work.
- remove now unused "listq" from vm_page.
ad 2019-12-27 12:51:56 +00:00
parent a4a6d53262
commit 9b1e2fa25c
18 changed files with 1327 additions and 505 deletions


@ -1,4 +1,4 @@
/* $NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $ */
/* $NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
@ -46,7 +46,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $");
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $");
#include "opt_multiprocessor.h"
#include "opt_intrdebug.h"
@ -60,9 +60,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $");
#include <machine/pte.h>
#include <machine/cpufunc.h>
#include "acpica.h"
#include "ioapic.h"
#include "lapic.h"
#if NACPICA > 0
#include <dev/acpi/acpi_srat.h>
#endif
#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif
@ -112,6 +117,11 @@ cpu_configure(void)
cpu_init_idle_lwps();
#endif
#if NACPICA > 0
/* Load NUMA memory regions into UVM. */
acpisrat_load_uvm();
#endif
spl0();
lcr8(0);
}


@ -1,4 +1,4 @@
/* $NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $ */
/* $NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
@ -46,7 +46,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $");
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $");
#include "opt_intrdebug.h"
#include "opt_multiprocessor.h"
@ -65,9 +65,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $");
#include <machine/cpufunc.h>
#include <x86/fpu.h>
#include "acpica.h"
#include "ioapic.h"
#include "lapic.h"
#if NACPICA > 0
#include <dev/acpi/acpi_srat.h>
#endif
#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif
@ -132,6 +137,11 @@ cpu_configure(void)
cpu_init_idle_lwps();
#endif
#if NACPICA > 0
/* Load NUMA memory regions into UVM. */
acpisrat_load_uvm();
#endif
spl0();
#if NLAPIC > 0
lapic_write_tpri(0);


@ -1,7 +1,8 @@
/* $NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $ */
/* $NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $ */
/*
* Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009 The NetBSD Foundation, Inc.
* Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@ -60,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $");
__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_aio.h"
@ -193,6 +194,7 @@ static void db_help_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_lock_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_locks(db_expr_t, bool, db_expr_t, const char *);
static void db_show_lockstats(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_freelists(db_expr_t, bool, db_expr_t, const char *);
static void db_mount_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_mount(db_expr_t, bool, db_expr_t, const char *);
static void db_mbuf_print_cmd(db_expr_t, bool, db_expr_t, const char *);
@ -234,6 +236,8 @@ static const struct db_command db_show_cmds[] = {
0 ,"Show all held locks", "[/t]", NULL) },
{ DDB_ADD_CMD("mount", db_show_all_mount, 0,
"Print all mount structures.", "[/f]", NULL) },
{ DDB_ADD_CMD("freelists", db_show_all_freelists,
0 ,"Show all freelists", NULL, NULL) },
#ifdef AIO
/*added from all sub cmds*/
{ DDB_ADD_CMD("aio_jobs", db_show_aio_jobs, 0,
@ -1284,6 +1288,16 @@ db_show_all_locks(db_expr_t addr, bool have_addr,
#endif
}
static void
db_show_all_freelists(db_expr_t addr, bool have_addr,
db_expr_t count, const char *modif)
{
#ifdef _KERNEL /* XXX CRASH(8) */
uvm_page_print_freelists(db_printf);
#endif
}
static void
db_show_lockstats(db_expr_t addr, bool have_addr,
db_expr_t count, const char *modif)
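
With the entry added to db_show_cmds[] and the handler above, the freelist
state can now be inspected from the in-kernel debugger.  A usage sketch
(the prompt format varies by configuration); the output comes from
uvm_page_print_freelists(), presumably part of the large suppressed diff
further down:

	db> show freelists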


@ -1,4 +1,4 @@
/* $NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $ */
/* $NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $");
#include <dev/acpi/acpivar.h>
#include <dev/acpi/acpi_srat.h>
#include <uvm/uvm_extern.h>
static ACPI_TABLE_SRAT *srat;
static uint32_t nnodes; /* Number of NUMA nodes */
@ -472,6 +474,28 @@ acpisrat_dump(void)
}
}
void
acpisrat_load_uvm(void)
{
uint32_t i, j, nn, nm;
struct acpisrat_mem m;
nn = acpisrat_nodes();
aprint_debug("SRAT: %u NUMA nodes\n", nn);
for (i = 0; i < nn; i++) {
nm = acpisrat_node_memoryranges(i);
for (j = 0; j < nm; j++) {
acpisrat_mem(i, j, &m);
aprint_debug("SRAT: node %u memory range %u (0x%"
PRIx64" - 0x%"PRIx64" flags %u)\n",
m.nodeid, j, m.baseaddress,
m.baseaddress + m.length, m.flags);
uvm_page_numa_load(trunc_page(m.baseaddress),
trunc_page(m.length), m.nodeid);
}
}
}
/*
* Get number of NUMA nodes.
*/
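
The loop above feeds SRAT memory ranges to the new uvm_page_numa_load()
hook; judging from that call and the prototype added to uvm_extern.h, the
arguments are a physical base address, a length, and a node id.  A
hypothetical direct use for a two-node machine split at 64 GB (addresses
and node ids invented for illustration):

	uvm_page_numa_load(0x0,          0x1000000000, 0);	/* node 0: 0 - 64 GB */
	uvm_page_numa_load(0x1000000000, 0x1000000000, 1);	/* node 1: 64 - 128 GB */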


@ -1,4 +1,4 @@
/* $NetBSD: acpi_srat.h,v 1.4 2017/12/28 08:49:28 maxv Exp $ */
/* $NetBSD: acpi_srat.h,v 1.5 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@ -68,6 +68,7 @@ int acpisrat_init(void);
int acpisrat_refresh(void);
int acpisrat_exit(void);
void acpisrat_dump(void);
void acpisrat_load_uvm(void);
uint32_t acpisrat_nodes(void);
uint32_t acpisrat_node_cpus(acpisrat_nodeid_t);
uint32_t acpisrat_node_memoryranges(acpisrat_nodeid_t);


@ -1,4 +1,4 @@
/* $NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $ */
/* $NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc.
@ -97,7 +97,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $");
#include "opt_ddb.h"
#include "opt_inet.h"
@ -814,6 +814,10 @@ configure2(void)
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_cpu_attach(ci);
}
/* Decide how to partition free memory. */
uvm_page_rebucket();
mp_online = true;
#if defined(MULTIPROCESSOR)
cpu_boot_secondary_processors();


@ -1,4 +1,4 @@
# $NetBSD: files.uvm,v 1.31 2019/12/15 21:11:35 ad Exp $
# $NetBSD: files.uvm,v 1.32 2019/12/27 12:51:57 ad Exp $
#
# UVM options
@ -42,6 +42,7 @@ file uvm/uvm_pager.c uvm
file uvm/uvm_pdaemon.c uvm
file uvm/uvm_pdpolicy_clock.c !pdpolicy_clockpro
file uvm/uvm_pdpolicy_clockpro.c pdpolicy_clockpro
file uvm/uvm_pgflcache.c uvm
file uvm/uvm_pglist.c uvm
file uvm/uvm_physseg.c uvm
file uvm/uvm_readahead.c uvm


@ -1,4 +1,4 @@
/* $NetBSD: uvm.h,v 1.70 2019/12/13 20:10:22 ad Exp $ */
/* $NetBSD: uvm.h,v 1.71 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -71,21 +71,19 @@
#include <machine/vmparam.h>
struct workqueue;
struct pgflcache;
/*
* per-cpu data
*/
struct uvm_cpu {
struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
int page_free_nextcolor; /* next color to allocate from */
int page_idlezero_next; /* which color to zero next */
bool page_idle_zero; /* TRUE if we should try to zero
pages in the idle loop */
int pages[PGFL_NQUEUES]; /* total of pages in page_free */
u_int emap_gen; /* emap generation number */
krndsource_t rs; /* entropy source */
struct pgflcache *pgflcache[VM_NFREELIST];/* cpu-local cached pages */
void *pgflcachemem; /* pointer to allocated mem */
size_t pgflcachememsz; /* size of allocated memory */
u_int pgflcolor; /* next color to allocate */
u_int pgflbucket; /* where to send our pages */
krndsource_t rs; /* entropy source */
};
/*
@ -98,7 +96,9 @@ struct uvm {
/* vm_page queues */
struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
bool page_init_done; /* TRUE if uvm_page_init() finished */
u_int bucketcount;
bool page_init_done; /* true if uvm_page_init() finished */
bool numa_alloc; /* use NUMA page allocation strategy */
/* page daemon trigger */
int pagedaemon; /* daemon sleeps on this */
@ -123,7 +123,6 @@ extern struct uvm_object *uvm_kernel_object;
* locks (made globals for lockstat).
*/
extern kmutex_t uvm_fpageqlock; /* lock for free page q */
extern kmutex_t uvm_kentry_lock;
#endif /* _KERNEL */


@ -1,4 +1,4 @@
/* $NetBSD: uvm_ddb.h,v 1.15 2011/05/17 04:18:07 mrg Exp $ */
/* $NetBSD: uvm_ddb.h,v 1.16 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -40,6 +40,7 @@ void uvm_object_printit(struct uvm_object *, bool,
void uvm_page_printit(struct vm_page *, bool,
void (*)(const char *, ...));
void uvm_page_printall(void (*)(const char *, ...));
void uvm_page_print_freelists(void (*)(const char *, ...));
void uvmexp_print(void (*)(const char *, ...));
#endif /* DDB || DEBUGPRINT */


@ -1,4 +1,4 @@
/* $NetBSD: uvm_extern.h,v 1.215 2019/12/21 12:58:26 ad Exp $ */
/* $NetBSD: uvm_extern.h,v 1.216 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -210,6 +210,7 @@ b\32UNMAP\0\
#define UVM_PGA_STRAT_NORMAL 0 /* priority (low id to high) walk */
#define UVM_PGA_STRAT_ONLY 1 /* only specified free list */
#define UVM_PGA_STRAT_FALLBACK 2 /* ONLY falls back on NORMAL */
#define UVM_PGA_STRAT_NUMA 3 /* strongly prefer ideal bucket */
/*
* flags for uvm_pagealloc_strat()
@ -736,6 +737,7 @@ void uvm_obj_unwirepages(struct uvm_object *, off_t, off_t);
/* uvm_page.c */
int uvm_free(void);
void uvm_page_numa_load(paddr_t, paddr_t, u_int);
struct vm_page *uvm_pagealloc_strat(struct uvm_object *,
voff_t, struct vm_anon *, int, int, int);
#define uvm_pagealloc(obj, off, anon, flags) \


@ -1,4 +1,4 @@
/* $NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $ */
/* $NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -62,7 +62,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $");
#include "opt_kgdb.h"
#include "opt_kstack.h"
@ -86,6 +86,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $");
#include <sys/asan.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pgflcache.h>
/*
* uvm_kernacc: test if kernel can access a memory region.
@ -500,9 +501,17 @@ uvm_scheduler(void)
lwp_changepri(l, PRI_VM);
lwp_unlock(l);
/* Start the freelist cache. */
uvm_pgflcache_start();
for (;;) {
/* Update legacy stats for post-mortem debugging. */
uvm_update_uvmexp();
/* See if the pagedaemon needs to generate some free pages. */
uvm_kick_pdaemon();
/* Calculate process statistics. */
sched_pstats();
(void)kpause("uvm", false, hz, NULL);
}


@ -1,4 +1,4 @@
/* $NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $ */
/* $NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -64,7 +64,6 @@ const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif
kmutex_t uvm_fpageqlock __cacheline_aligned;
kmutex_t uvm_kentry_lock __cacheline_aligned;
/*

(File diff suppressed because it is too large.)


@ -1,4 +1,4 @@
/* $NetBSD: uvm_page.h,v 1.88 2019/12/21 14:41:44 ad Exp $ */
/* $NetBSD: uvm_page.h,v 1.89 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -119,7 +119,6 @@
*
* o free
* => pageq.list is entry on global free page queue
* => listq.list is entry on per-CPU free page queue
* => uanon is unused (or (void *)0xdeadbeef for DEBUG)
* => uobject is unused (or (void *)0xdeadbeef for DEBUG)
* => PG_FREE is set in flags
@ -129,13 +128,11 @@
* => uobject is owner
* o owned by a vm_anon
* => pageq is unused (XXX correct?)
* => listq is unused (XXX correct?)
* => uanon is owner
* => uobject is NULL
* => PG_ANON is set in flags
* o allocated by uvm_pglistalloc
* => pageq.queue is entry on resulting pglist, owned by caller
* => listq is unused (XXX correct?)
* => uanon is unused
* => uobject is unused
*
@ -153,11 +150,6 @@ struct vm_page {
* or uvm_pglistalloc output */
LIST_ENTRY(vm_page) list; /* f: global free page queue */
} pageq;
union {
LIST_ENTRY(vm_page) list; /* f: CPU free page queue */
} listq;
struct vm_anon *uanon; /* o,i: anon */
struct uvm_object *uobject; /* o,i: object */
voff_t offset; /* o: offset into object */
@ -302,6 +294,7 @@ void uvm_page_own(struct vm_page *, const char *);
bool uvm_page_physget(paddr_t *);
#endif
void uvm_page_recolor(int);
void uvm_page_rebucket(void);
void uvm_pageidlezero(void);
void uvm_pageactivate(struct vm_page *);
@ -318,6 +311,8 @@ void uvm_pagewire(struct vm_page *);
void uvm_pagezero(struct vm_page *);
bool uvm_pageismanaged(paddr_t);
bool uvm_page_locked_p(struct vm_page *);
void uvm_pgfl_lock(void);
void uvm_pgfl_unlock(void);
int uvm_page_lookup_freelist(struct vm_page *);
@ -348,8 +343,12 @@ int uvm_direct_process(struct vm_page **, u_int, voff_t, vsize_t,
#define VM_PGCOLOR(pg) \
(atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask)
#define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa)
/*
* VM_PAGE_IS_FREE() can't tell if the page is on global free list, or a
* per-CPU cache. If you need to be certain, pause caching.
*/
#define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE)
#define VM_FREE_PAGE_TO_CPU(pg) ((struct uvm_cpu *)((uintptr_t)pg->offset))
/*
* Use the lower 10 bits of pg->phys_addr to cache some some locators for

sys/uvm/uvm_pgflcache.c (new file, 471 lines)

@ -0,0 +1,471 @@
/* $NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_pgflcache.c: page freelist cache.
*
* This implements a tiny per-CPU cache of pages that sits between the main
* page allocator and the freelists. By allocating and freeing pages in
* batch, it reduces freelist contention by an order of magnitude.
*
* The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
* uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
* world. On system with one CPU per physical package (e.g. a uniprocessor)
* the cache is not enabled.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $");
#include "opt_uvm.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_pgflcache.h>
/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR
/*
* MAXPGS - maximum pages per color, per bucket.
* FILLPGS - number of pages to allocate at once, per color, per bucket.
*
* Why the chosen values:
*
* (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
* colors. We make the assumption that most of the time allocation activity
* will be centered around one UVM freelist, so most of the time there will
* be no more than 224kB worth of cached pages per-CPU. That's tiny, but
* enough to hugely reduce contention on the freelist locks, and give us a
* small pool of pages which if we're very lucky may have some L1/L2 cache
* locality, and do so without subtracting too much from the L2/L3 cache
* benefits of having per-package free lists in the page allocator.
*
* (2) With the chosen values on _LP64, the data structure for each color
* takes up a single cache line (64 bytes) giving this very low overhead
* even in the "miss" case.
*
* (3) We don't want to cause too much pressure by hiding away memory that
* could otherwise be put to good use.
*/
#define MAXPGS 7
#define FILLPGS 6
/* Variable size, according to # colors. */
struct pgflcache {
struct pccolor {
intptr_t count;
struct vm_page *pages[MAXPGS];
} color[1];
};
static kmutex_t uvm_pgflcache_lock;
static kcondvar_t uvm_pgflcache_cv;
static int uvm_pgflcache_sem;
static bool uvm_pgflcache_draining;
/*
* uvm_pgflcache_fill: fill specified freelist/color from global list
*
* => must be called at IPL_VM
* => must be called with given bucket lock held
* => must only fill from the correct bucket for this CPU
*/
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
struct pgflbucket *pgb;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
struct vm_page *pg;
int count;
KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
KASSERT(ucpu->pgflbucket == b);
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return;
}
/* Fill only to the limit. */
pcc = &pc->color[c];
pgb = uvm.page_free[fl].pgfl_buckets[b];
head = &pgb->pgb_colors[c];
if (pcc->count >= FILLPGS) {
return;
}
/* Pull pages from the bucket until it's empty, or we are full. */
count = pcc->count;
pg = LIST_FIRST(head);
while (__predict_true(pg != NULL && count < FILLPGS)) {
KASSERT(pg->flags & PG_FREE);
KASSERT(uvm_page_get_bucket(pg) == b);
pcc->pages[count++] = pg;
pg = LIST_NEXT(pg, pageq.list);
}
/* Violate LIST abstraction to remove all pages at once. */
head->lh_first = pg;
if (__predict_true(pg != NULL)) {
pg->pageq.list.le_prev = &head->lh_first;
}
pgb->pgb_nfree -= (count - pcc->count);
pcc->count = count;
}
/*
* uvm_pgflcache_spill: spill specified freelist/color to global list
*
* => must be called at IPL_VM
* => mark __noinline so we don't pull it into uvm_pgflcache_free()
*/
static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflbucket *pgb;
struct pgfreelist *pgfl;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
kmutex_t *lock;
int b, adj;
pc = ucpu->pgflcache[fl];
pcc = &pc->color[c];
pgfl = &uvm.page_free[fl];
b = ucpu->pgflbucket;
pgb = pgfl->pgfl_buckets[b];
head = &pgb->pgb_colors[c];
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
for (adj = pcc->count; pcc->count != 0;) {
pcc->count--;
KASSERT(pcc->pages[pcc->count] != NULL);
KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
}
pgb->pgb_nfree += adj;
mutex_spin_exit(lock);
}
/*
* uvm_pgflcache_alloc: try to allocate a cached page.
*
* => must be called at IPL_VM
* => allocate only from the given freelist and given page color
*/
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflcache *pc;
struct pccolor *pcc;
struct vm_page *pg;
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return NULL;
}
/* Very simple: if we have a page then return it. */
pcc = &pc->color[c];
if (__predict_false(pcc->count == 0)) {
return NULL;
}
pg = pcc->pages[--(pcc->count)];
KASSERT(pg != NULL);
KASSERT(pg->flags & PG_FREE);
KASSERT(uvm_page_get_freelist(pg) == fl);
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
pg->flags &= PG_ZERO;
return pg;
}
/*
* uvm_pgflcache_free: cache a page, if possible.
*
* => must be called at IPL_VM
* => must only send pages for the correct bucket for this CPU
*/
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
struct pgflcache *pc;
struct pccolor *pcc;
int fl, c;
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
/* If caching is off, then bail out. */
fl = uvm_page_get_freelist(pg);
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return false;
}
/* If the array is full spill it first, then add page to array. */
c = VM_PGCOLOR(pg);
pcc = &pc->color[c];
KASSERT((pg->flags & PG_FREE) == 0);
if (__predict_false(pcc->count == MAXPGS)) {
uvm_pgflcache_spill(ucpu, fl, c);
}
pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
pcc->pages[pcc->count] = pg;
pcc->count++;
return true;
}
/*
* uvm_pgflcache_init: allocate and initialize per-CPU data structures for
* the free page cache. Don't set anything in motion - that's taken care
* of by uvm_pgflcache_resume().
*/
static void
uvm_pgflcache_init_cpu(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
size_t sz;
ucpu = ci->ci_data.cpu_uvm;
KASSERT(ucpu->pgflcachemem == NULL);
KASSERT(ucpu->pgflcache[0] == NULL);
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
ucpu->pgflcachememsz =
(roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
}
/*
* uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
* and shut down caching on the CPU. Called on each CPU in the system via
* xcall.
*/
static void
uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
{
struct uvm_cpu *ucpu;
int fl, color, s;
ucpu = curcpu()->ci_data.cpu_uvm;
for (fl = 0; fl < VM_NFREELIST; fl++) {
s = splvm();
for (color = 0; color < uvmexp.ncolors; color++) {
uvm_pgflcache_spill(ucpu, fl, color);
}
ucpu->pgflcache[fl] = NULL;
splx(s);
}
}
/*
* uvm_pgflcache_pause: pause operation of the caches
*/
void
uvm_pgflcache_pause(void)
{
uint64_t where;
/* First one in starts draining. Everyone else waits. */
mutex_enter(&uvm_pgflcache_lock);
if (uvm_pgflcache_sem++ == 0) {
uvm_pgflcache_draining = true;
mutex_exit(&uvm_pgflcache_lock);
where = xc_broadcast(0, uvm_pgflcache_fini_cpu, NULL, NULL);
xc_wait(where);
mutex_enter(&uvm_pgflcache_lock);
uvm_pgflcache_draining = false;
cv_broadcast(&uvm_pgflcache_cv);
} else {
while (uvm_pgflcache_draining) {
cv_wait(&uvm_pgflcache_cv, &uvm_pgflcache_lock);
}
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_resume: resume operation of the caches
*/
void
uvm_pgflcache_resume(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct uvm_cpu *ucpu;
uintptr_t addr;
size_t sz;
int fl;
/* Last guy out takes care of business. */
mutex_enter(&uvm_pgflcache_lock);
KASSERT(!uvm_pgflcache_draining);
KASSERT(uvm_pgflcache_sem > 0);
if (uvm_pgflcache_sem-- > 1) {
mutex_exit(&uvm_pgflcache_lock);
return;
}
/*
* Make sure dependant data structure updates are remotely visible.
* Essentially this functions as a global memory barrier.
*/
xc_barrier(XC_HIGHPRI);
/*
* Then set all of the pointers in place on each CPU. As soon as
* each pointer is set, caching is operational in that dimension.
*/
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
for (CPU_INFO_FOREACH(cii, ci)) {
ucpu = ci->ci_data.cpu_uvm;
addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
for (fl = 0; fl < VM_NFREELIST; fl++) {
ucpu->pgflcache[fl] = (struct pgflcache *)addr;
addr += sz;
}
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_start: start operation of the cache.
*
* => called once only, when init(8) is about to be started
*/
void
uvm_pgflcache_start(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(uvm_pgflcache_sem > 0);
/*
* There's not much point doing this if every CPU has its own
* bucket (and that includes the uniprocessor case).
*/
if (ncpu == uvm.bucketcount) {
return;
}
/* Create each CPU's buckets. */
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_pgflcache_init_cpu(ci);
}
/* Kick it into action. */
uvm_pgflcache_resume();
}
/*
* uvm_pgflcache_init: set up data structures for the free page cache.
*/
void
uvm_pgflcache_init(void)
{
uvm_pgflcache_sem = 1;
mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&uvm_pgflcache_cv, "flcache");
}
#else /* MULTIPROCESSOR */
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
return NULL;
}
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
return false;
}
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
}
void
uvm_pgflcache_pause(void)
{
}
void
uvm_pgflcache_resume(void)
{
}
void
uvm_pgflcache_start(void)
{
}
void
uvm_pgflcache_init(void)
{
}
#endif /* MULTIPROCESSOR */
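
For reference, the sizing rationale in the MAXPGS/FILLPGS comment above
works out as follows, taking the stated 4 kB pages and 8 cache colors: a
fully populated cache for one freelist holds at most 8 colors x 7 pages x
4 kB = 224 kB per CPU, and on _LP64 each struct pccolor occupies 8 bytes
(count) plus 7 x 8 bytes (pages[]) = 64 bytes, exactly one cache line.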

sys/uvm/uvm_pgflcache.h (new file, 43 lines)

@ -0,0 +1,43 @@
/* $NetBSD: uvm_pgflcache.h,v 1.1 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(_UVM_PGFLCACHE_H_)
#define _UVM_PGFLCACHE_H_
struct vm_page *uvm_pgflcache_alloc(struct uvm_cpu *, int, int);
void uvm_pgflcache_fill(struct uvm_cpu *, int, int, int);
bool uvm_pgflcache_free(struct uvm_cpu *, struct vm_page *);
void uvm_pgflcache_init(void);
void uvm_pgflcache_pause(void);
void uvm_pgflcache_resume(void);
void uvm_pgflcache_start(void);
#endif /* !_UVM_PGFLCACHE_H_ */
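
Taken together with uvm_pgflcache.c above, this header suggests the
caller-side fast path in the page allocator proper (whose diff, presumably
uvm_page.c, is suppressed in this view).  The sketch below is illustrative
only, not the committed code: example_cached_alloc() is a hypothetical
helper, and it assumes the caller already runs at IPL_VM and has chosen a
freelist fl and color c.

	#include <uvm/uvm.h>		/* struct uvm_cpu, uvm_freelist_locks */
	#include <uvm/uvm_pgflcache.h>

	/*
	 * Hypothetical fast path: try the per-CPU cache, refill this color
	 * from the CPU's own bucket on a miss, then retry.  A NULL return
	 * means the caller must fall back to scanning the global buckets.
	 */
	static struct vm_page *
	example_cached_alloc(struct uvm_cpu *ucpu, int fl, int c)
	{
		struct vm_page *pg;
		int b = ucpu->pgflbucket;

		pg = uvm_pgflcache_alloc(ucpu, fl, c);
		if (__predict_false(pg == NULL)) {
			/* uvm_pgflcache_fill() requires the bucket lock. */
			mutex_spin_enter(&uvm_freelist_locks[b].lock);
			uvm_pgflcache_fill(ucpu, fl, b, c);
			mutex_spin_exit(&uvm_freelist_locks[b].lock);
			pg = uvm_pgflcache_alloc(ucpu, fl, c);
		}
		return pg;
	}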


@ -1,12 +1,12 @@
/* $NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $ */
/* $NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 1997 The NetBSD Foundation, Inc.
* Copyright (c) 1997, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -35,13 +35,14 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define STAT_INCR(v) (v)++
@ -79,34 +80,25 @@ u_long uvm_pglistalloc_npages;
static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
int free_list __unused, color __unused, pgflidx;
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
KASSERT(mutex_owned(&uvm_fpageqlock));
pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
pgb = pgfl->pgfl_buckets[uvm_page_get_bucket(pg)];
#if PGFL_NQUEUES != 2
#error uvm_pglistalloc needs to be updated
#endif
free_list = uvm_page_get_freelist(pg);
color = VM_PGCOLOR(pg);
pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef UVMDEBUG
struct vm_page *tp;
LIST_FOREACH(tp,
&uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
pageq.list) {
LIST_FOREACH(tp, &pgb->pgb_colors[VM_PGCOLOR(pg)], pageq.list) {
if (tp == pg)
break;
}
if (tp == NULL)
panic("uvm_pglistalloc: page not on freelist");
#endif
LIST_REMOVE(pg, pageq.list); /* global */
LIST_REMOVE(pg, listq.list); /* cpu */
uvmexp.free--;
LIST_REMOVE(pg, pageq.list);
pgb->pgb_nfree--;
if (pg->flags & PG_ZERO)
CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
pg->flags = PG_CLEAN;
pg->uobject = NULL;
pg->uanon = NULL;
@ -129,8 +121,6 @@ uvm_pglistalloc_c_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif
KASSERT(mutex_owned(&uvm_fpageqlock));
low = atop(low);
high = atop(high);
alignment = atop(alignment);
@ -316,7 +306,7 @@ uvm_pglistalloc_contig(int num, paddr_t low, paddr_t high, paddr_t alignment,
/*
* Block all memory allocation and lock the free list.
*/
mutex_spin_enter(&uvm_fpageqlock);
uvm_pgfl_lock();
/* Are there even any free pages? */
if (uvm_free() <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
@ -352,7 +342,7 @@ out:
* the pagedaemon.
*/
mutex_spin_exit(&uvm_fpageqlock);
uvm_pgfl_unlock();
uvm_kick_pdaemon();
return (error);
}
@ -368,7 +358,6 @@ uvm_pglistalloc_s_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
printf("pgalloc: simple %d pgs from psi %zd\n", num, psi);
#endif
KASSERT(mutex_owned(&uvm_fpageqlock));
KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_start(psi));
KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_end(psi));
KASSERT(uvm_physseg_get_avail_start(psi) <= uvm_physseg_get_end(psi));
@ -461,7 +450,7 @@ again:
/*
* Block all memory allocation and lock the free list.
*/
mutex_spin_enter(&uvm_fpageqlock);
uvm_pgfl_lock();
count++;
/* Are there even any free pages? */
@ -493,7 +482,7 @@ out:
* the pagedaemon.
*/
mutex_spin_exit(&uvm_fpageqlock);
uvm_pgfl_unlock();
uvm_kick_pdaemon();
if (error) {
@ -539,6 +528,12 @@ uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
TAILQ_INIT(rlist);
/*
* Turn off the caching of free pages - we need everything to be on
* the global freelists.
*/
uvm_pgflcache_pause();
if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
(boundary != 0))
res = uvm_pglistalloc_contig(num, low, high, alignment,
@ -546,6 +541,8 @@ uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
else
res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);
uvm_pgflcache_resume();
return (res);
}
@ -558,45 +555,34 @@ uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
void
uvm_pglistfree(struct pglist *list)
{
struct uvm_cpu *ucpu;
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
struct vm_page *pg;
int index, color, queue;
bool iszero;
int c, b;
/*
* Lock the free list and free each page.
*/
mutex_spin_enter(&uvm_fpageqlock);
ucpu = curcpu()->ci_data.cpu_uvm;
uvm_pgfl_lock();
while ((pg = TAILQ_FIRST(list)) != NULL) {
KASSERT(!uvmpdpol_pageisqueued_p(pg));
TAILQ_REMOVE(list, pg, pageq.queue);
iszero = (pg->flags & PG_ZERO);
pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */
#ifdef DEBUG
if (iszero)
if (pg->flags & PG_ZERO)
uvm_pagezerocheck(pg);
#endif /* DEBUG */
index = uvm_page_get_freelist(pg);
color = VM_PGCOLOR(pg);
queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
pg->offset = (uintptr_t)ucpu;
LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
pgfl_queues[queue], pg, pageq.list);
LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
pgfl_queues[queue], pg, listq.list);
uvmexp.free++;
if (iszero)
c = VM_PGCOLOR(pg);
b = uvm_page_get_bucket(pg);
pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
pgb = pgfl->pgfl_buckets[b];
if (pg->flags & PG_ZERO)
CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
ucpu->pages[queue]++;
pgb->pgb_nfree++;
LIST_INSERT_HEAD(&pgb->pgb_colors[c], pg, pageq.list);
STAT_DECR(uvm_pglistalloc_npages);
}
if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
ucpu->page_idle_zero = vm_page_zero_enable;
mutex_spin_exit(&uvm_fpageqlock);
uvm_pgfl_unlock();
}


@ -1,11 +1,11 @@
/* $NetBSD: uvm_pglist.h,v 1.8 2010/11/06 15:48:00 uebayasi Exp $ */
/* $NetBSD: uvm_pglist.h,v 1.9 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2000, 2001, 2008 The NetBSD Foundation, Inc.
* Copyright (c) 2000, 2001, 2008, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
* by Jason R. Thorpe, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -41,19 +41,51 @@ TAILQ_HEAD(pglist, vm_page);
LIST_HEAD(pgflist, vm_page);
/*
* A page free list consists of free pages of unknown contents and free
* pages of all zeros.
* The global uvm.page_free list (uvm_page.c, uvm_pglist.c). Free pages are
* stored according to freelist, bucket, and cache colour.
*
* pglist = &uvm.page_free[freelist].pgfl_buckets[bucket].pgb_color[color];
*
* Freelists provide a priority ordering of pages for allocation, based upon
* how valuable they are for special uses (e.g. device driver DMA).
*
* Pages are then grouped in buckets according to some common factor, for
* example L2/L3 cache locality. Each bucket has its own lock, and the
* locks are shared among freelists for the same numbered buckets.
*
* Inside each bucket, pages are further distributed by cache color.
*
* We want these data structures to occupy as few cache lines as possible,
* as they will be highly contended.
*/
#define PGFL_UNKNOWN 0
#define PGFL_ZEROS 1
#define PGFL_NQUEUES 2
struct pgflbucket {
struct pgflist pgfl_queues[PGFL_NQUEUES];
uintptr_t pgb_nfree; /* total # free pages, all colors */
struct pgflist pgb_colors[1]; /* variable size array */
};
/*
* At the root, the freelists. MD code decides the number and structure of
* these. They are always arranged in descending order of allocation
* priority.
*
* 8 buckets should be enough to cover most all current x86 systems (2019),
* given the way package/core/smt IDs are structured on x86. For systems
* that report high package counts despite having a single physical CPU
* package (e.g. Ampere eMAG) a little bit of sharing isn't going to hurt
* in the least.
*/
#define PGFL_MAX_BUCKETS 8
struct pgfreelist {
struct pgflbucket *pgfl_buckets;
struct pgflbucket *pgfl_buckets[PGFL_MAX_BUCKETS];
};
/*
* Lock for each bucket.
*/
union uvm_freelist_lock {
kmutex_t lock;
uint8_t padding[COHERENCY_UNIT];
};
extern union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS];
#endif /* _UVM_UVM_PGLIST_H_ */
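
The lookup spelled out in the comment above can be written directly against
these structures.  This mirrors the pattern uvm_pglist_add() and
uvm_pglistfree() use in the uvm_pglist.c diff earlier in this commit; the
helper name is illustrative only.

	#include <uvm/uvm.h>	/* uvm.page_free[], bucket/freelist accessors, VM_PGCOLOR() */

	/* Locate the list a free page lives on: freelist -> bucket -> color. */
	static inline struct pgflist *
	example_pgfl_head(struct vm_page *pg)
	{
		struct pgfreelist *pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
		struct pgflbucket *pgb = pgfl->pgfl_buckets[uvm_page_get_bucket(pg)];

		return &pgb->pgb_colors[VM_PGCOLOR(pg)];
	}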