Redo the page allocator to perform better, especially on multi-core and
multi-socket systems.  Proposed on tech-kern.  While here:

- add rudimentary NUMA support - needs more work.
- remove now unused "listq" from vm_page.
ad 2019-12-27 12:51:56 +00:00
parent a4a6d53262
commit 9b1e2fa25c
18 changed files with 1327 additions and 505 deletions


@ -1,4 +1,4 @@
/* $NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $ */
/* $NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
@ -46,7 +46,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $");
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $");
#include "opt_multiprocessor.h"
#include "opt_intrdebug.h"
@ -60,9 +60,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $");
#include <machine/pte.h>
#include <machine/cpufunc.h>
#include "acpica.h"
#include "ioapic.h"
#include "lapic.h"
#if NACPICA > 0
#include <dev/acpi/acpi_srat.h>
#endif
#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif
@ -112,6 +117,11 @@ cpu_configure(void)
cpu_init_idle_lwps();
#endif
#if NACPICA > 0
/* Load NUMA memory regions into UVM. */
acpisrat_load_uvm();
#endif
spl0();
lcr8(0);
}


@ -1,4 +1,4 @@
/* $NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $ */
/* $NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
@ -46,7 +46,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $");
__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $");
#include "opt_intrdebug.h"
#include "opt_multiprocessor.h"
@ -65,9 +65,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $");
#include <machine/cpufunc.h>
#include <x86/fpu.h>
#include "acpica.h"
#include "ioapic.h"
#include "lapic.h"
#if NACPICA > 0
#include <dev/acpi/acpi_srat.h>
#endif
#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif
@ -132,6 +137,11 @@ cpu_configure(void)
cpu_init_idle_lwps();
#endif
#if NACPICA > 0
/* Load NUMA memory regions into UVM. */
acpisrat_load_uvm();
#endif
spl0();
#if NLAPIC > 0
lapic_write_tpri(0);


@ -1,7 +1,8 @@
/* $NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $ */
/* $NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $ */
/*
* Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009 The NetBSD Foundation, Inc.
* Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@ -60,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $");
__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_aio.h"
@ -193,6 +194,7 @@ static void db_help_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_lock_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_locks(db_expr_t, bool, db_expr_t, const char *);
static void db_show_lockstats(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_freelists(db_expr_t, bool, db_expr_t, const char *);
static void db_mount_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_mount(db_expr_t, bool, db_expr_t, const char *);
static void db_mbuf_print_cmd(db_expr_t, bool, db_expr_t, const char *);
@ -234,6 +236,8 @@ static const struct db_command db_show_cmds[] = {
0 ,"Show all held locks", "[/t]", NULL) },
{ DDB_ADD_CMD("mount", db_show_all_mount, 0,
"Print all mount structures.", "[/f]", NULL) },
{ DDB_ADD_CMD("freelists", db_show_all_freelists,
0 ,"Show all freelists", NULL, NULL) },
#ifdef AIO
/*added from all sub cmds*/
{ DDB_ADD_CMD("aio_jobs", db_show_aio_jobs, 0,
@ -1284,6 +1288,16 @@ db_show_all_locks(db_expr_t addr, bool have_addr,
#endif
}
static void
db_show_all_freelists(db_expr_t addr, bool have_addr,
db_expr_t count, const char *modif)
{
#ifdef _KERNEL /* XXX CRASH(8) */
uvm_page_print_freelists(db_printf);
#endif
}
static void
db_show_lockstats(db_expr_t addr, bool have_addr,
db_expr_t count, const char *modif)
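
With the entry added to db_show_cmds[] and the handler above, the freelist
state can now be inspected from the in-kernel debugger.  A usage sketch
(the prompt format varies by configuration); the output comes from
uvm_page_print_freelists(), presumably part of the large suppressed diff
further down:

	db> show freelists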


@ -1,4 +1,4 @@
/* $NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $ */
/* $NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $");
#include <dev/acpi/acpivar.h>
#include <dev/acpi/acpi_srat.h>
#include <uvm/uvm_extern.h>
static ACPI_TABLE_SRAT *srat;
static uint32_t nnodes; /* Number of NUMA nodes */
@ -472,6 +474,28 @@ acpisrat_dump(void)
}
}
void
acpisrat_load_uvm(void)
{
uint32_t i, j, nn, nm;
struct acpisrat_mem m;
nn = acpisrat_nodes();
aprint_debug("SRAT: %u NUMA nodes\n", nn);
for (i = 0; i < nn; i++) {
nm = acpisrat_node_memoryranges(i);
for (j = 0; j < nm; j++) {
acpisrat_mem(i, j, &m);
aprint_debug("SRAT: node %u memory range %u (0x%"
PRIx64" - 0x%"PRIx64" flags %u)\n",
m.nodeid, j, m.baseaddress,
m.baseaddress + m.length, m.flags);
uvm_page_numa_load(trunc_page(m.baseaddress),
trunc_page(m.length), m.nodeid);
}
}
}
/*
* Get number of NUMA nodes.
*/
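
The loop above feeds SRAT memory ranges to the new uvm_page_numa_load()
hook; judging from that call and the prototype added to uvm_extern.h, the
arguments are a physical base address, a length, and a node id.  A
hypothetical direct use for a two-node machine split at 64 GB (addresses
and node ids invented for illustration):

	uvm_page_numa_load(0x0,          0x1000000000, 0);	/* node 0: 0 - 64 GB */
	uvm_page_numa_load(0x1000000000, 0x1000000000, 1);	/* node 1: 64 - 128 GB */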


@ -1,4 +1,4 @@
/* $NetBSD: acpi_srat.h,v 1.4 2017/12/28 08:49:28 maxv Exp $ */
/* $NetBSD: acpi_srat.h,v 1.5 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@ -68,6 +68,7 @@ int acpisrat_init(void);
int acpisrat_refresh(void);
int acpisrat_exit(void);
void acpisrat_dump(void);
void acpisrat_load_uvm(void);
uint32_t acpisrat_nodes(void);
uint32_t acpisrat_node_cpus(acpisrat_nodeid_t);
uint32_t acpisrat_node_memoryranges(acpisrat_nodeid_t);


@ -1,4 +1,4 @@
/* $NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $ */
/* $NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc.
@ -97,7 +97,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $");
#include "opt_ddb.h"
#include "opt_inet.h"
@ -814,6 +814,10 @@ configure2(void)
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_cpu_attach(ci);
}
/* Decide how to partition free memory. */
uvm_page_rebucket();
mp_online = true;
#if defined(MULTIPROCESSOR)
cpu_boot_secondary_processors();


@ -1,4 +1,4 @@
# $NetBSD: files.uvm,v 1.31 2019/12/15 21:11:35 ad Exp $
# $NetBSD: files.uvm,v 1.32 2019/12/27 12:51:57 ad Exp $
#
# UVM options
@ -42,6 +42,7 @@ file uvm/uvm_pager.c uvm
file uvm/uvm_pdaemon.c uvm
file uvm/uvm_pdpolicy_clock.c !pdpolicy_clockpro
file uvm/uvm_pdpolicy_clockpro.c pdpolicy_clockpro
file uvm/uvm_pgflcache.c uvm
file uvm/uvm_pglist.c uvm
file uvm/uvm_physseg.c uvm
file uvm/uvm_readahead.c uvm


@ -1,4 +1,4 @@
/* $NetBSD: uvm.h,v 1.70 2019/12/13 20:10:22 ad Exp $ */
/* $NetBSD: uvm.h,v 1.71 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -71,21 +71,19 @@
#include <machine/vmparam.h>
struct workqueue;
struct pgflcache;
/*
* per-cpu data
*/
struct uvm_cpu {
struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
int page_free_nextcolor; /* next color to allocate from */
int page_idlezero_next; /* which color to zero next */
bool page_idle_zero; /* TRUE if we should try to zero
pages in the idle loop */
int pages[PGFL_NQUEUES]; /* total of pages in page_free */
u_int emap_gen; /* emap generation number */
krndsource_t rs; /* entropy source */
struct pgflcache *pgflcache[VM_NFREELIST];/* cpu-local cached pages */
void *pgflcachemem; /* pointer to allocated mem */
size_t pgflcachememsz; /* size of allocated memory */
u_int pgflcolor; /* next color to allocate */
u_int pgflbucket; /* where to send our pages */
krndsource_t rs; /* entropy source */
};
/*
@ -98,7 +96,9 @@ struct uvm {
/* vm_page queues */
struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
bool page_init_done; /* TRUE if uvm_page_init() finished */
u_int bucketcount;
bool page_init_done; /* true if uvm_page_init() finished */
bool numa_alloc; /* use NUMA page allocation strategy */
/* page daemon trigger */
int pagedaemon; /* daemon sleeps on this */
@ -123,7 +123,6 @@ extern struct uvm_object *uvm_kernel_object;
* locks (made globals for lockstat).
*/
extern kmutex_t uvm_fpageqlock; /* lock for free page q */
extern kmutex_t uvm_kentry_lock;
#endif /* _KERNEL */


@ -1,4 +1,4 @@
/* $NetBSD: uvm_ddb.h,v 1.15 2011/05/17 04:18:07 mrg Exp $ */
/* $NetBSD: uvm_ddb.h,v 1.16 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -40,6 +40,7 @@ void uvm_object_printit(struct uvm_object *, bool,
void uvm_page_printit(struct vm_page *, bool,
void (*)(const char *, ...));
void uvm_page_printall(void (*)(const char *, ...));
void uvm_page_print_freelists(void (*)(const char *, ...));
void uvmexp_print(void (*)(const char *, ...));
#endif /* DDB || DEBUGPRINT */


@ -1,4 +1,4 @@
/* $NetBSD: uvm_extern.h,v 1.215 2019/12/21 12:58:26 ad Exp $ */
/* $NetBSD: uvm_extern.h,v 1.216 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -210,6 +210,7 @@ b\32UNMAP\0\
#define UVM_PGA_STRAT_NORMAL 0 /* priority (low id to high) walk */
#define UVM_PGA_STRAT_ONLY 1 /* only specified free list */
#define UVM_PGA_STRAT_FALLBACK 2 /* ONLY falls back on NORMAL */
#define UVM_PGA_STRAT_NUMA 3 /* strongly prefer ideal bucket */
/*
* flags for uvm_pagealloc_strat()
@ -736,6 +737,7 @@ void uvm_obj_unwirepages(struct uvm_object *, off_t, off_t);
/* uvm_page.c */
int uvm_free(void);
void uvm_page_numa_load(paddr_t, paddr_t, u_int);
struct vm_page *uvm_pagealloc_strat(struct uvm_object *,
voff_t, struct vm_anon *, int, int, int);
#define uvm_pagealloc(obj, off, anon, flags) \


@ -1,4 +1,4 @@
/* $NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $ */
/* $NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -62,7 +62,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $");
#include "opt_kgdb.h"
#include "opt_kstack.h"
@ -86,6 +86,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $");
#include <sys/asan.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pgflcache.h>
/*
* uvm_kernacc: test if kernel can access a memory region.
@ -500,9 +501,17 @@ uvm_scheduler(void)
lwp_changepri(l, PRI_VM);
lwp_unlock(l);
/* Start the freelist cache. */
uvm_pgflcache_start();
for (;;) {
/* Update legacy stats for post-mortem debugging. */
uvm_update_uvmexp();
/* See if the pagedaemon needs to generate some free pages. */
uvm_kick_pdaemon();
/* Calculate process statistics. */
sched_pstats();
(void)kpause("uvm", false, hz, NULL);
}


@ -1,4 +1,4 @@
/* $NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $ */
/* $NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -64,7 +64,6 @@ const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif
kmutex_t uvm_fpageqlock __cacheline_aligned;
kmutex_t uvm_kentry_lock __cacheline_aligned;
/*

(File diff suppressed because it is too large.)


@ -1,4 +1,4 @@
/* $NetBSD: uvm_page.h,v 1.88 2019/12/21 14:41:44 ad Exp $ */
/* $NetBSD: uvm_page.h,v 1.89 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -119,7 +119,6 @@
*
* o free
* => pageq.list is entry on global free page queue
* => listq.list is entry on per-CPU free page queue
* => uanon is unused (or (void *)0xdeadbeef for DEBUG)
* => uobject is unused (or (void *)0xdeadbeef for DEBUG)
* => PG_FREE is set in flags
@ -129,13 +128,11 @@
* => uobject is owner
* o owned by a vm_anon
* => pageq is unused (XXX correct?)
* => listq is unused (XXX correct?)
* => uanon is owner
* => uobject is NULL
* => PG_ANON is set in flags
* o allocated by uvm_pglistalloc
* => pageq.queue is entry on resulting pglist, owned by caller
* => listq is unused (XXX correct?)
* => uanon is unused
* => uobject is unused
*
@ -153,11 +150,6 @@ struct vm_page {
* or uvm_pglistalloc output */
LIST_ENTRY(vm_page) list; /* f: global free page queue */
} pageq;
union {
LIST_ENTRY(vm_page) list; /* f: CPU free page queue */
} listq;
struct vm_anon *uanon; /* o,i: anon */
struct uvm_object *uobject; /* o,i: object */
voff_t offset; /* o: offset into object */
@ -302,6 +294,7 @@ void uvm_page_own(struct vm_page *, const char *);
bool uvm_page_physget(paddr_t *);
#endif
void uvm_page_recolor(int);
void uvm_page_rebucket(void);
void uvm_pageidlezero(void);
void uvm_pageactivate(struct vm_page *);
@ -318,6 +311,8 @@ void uvm_pagewire(struct vm_page *);
void uvm_pagezero(struct vm_page *);
bool uvm_pageismanaged(paddr_t);
bool uvm_page_locked_p(struct vm_page *);
void uvm_pgfl_lock(void);
void uvm_pgfl_unlock(void);
int uvm_page_lookup_freelist(struct vm_page *);
@ -348,8 +343,12 @@ int uvm_direct_process(struct vm_page **, u_int, voff_t, vsize_t,
#define VM_PGCOLOR(pg) \
(atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask)
#define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa)
/*
* VM_PAGE_IS_FREE() can't tell if the page is on global free list, or a
* per-CPU cache. If you need to be certain, pause caching.
*/
#define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE)
#define VM_FREE_PAGE_TO_CPU(pg) ((struct uvm_cpu *)((uintptr_t)pg->offset))
/*
* Use the lower 10 bits of pg->phys_addr to cache some some locators for

sys/uvm/uvm_pgflcache.c (new file, 471 lines)

@ -0,0 +1,471 @@
/* $NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_pgflcache.c: page freelist cache.
*
* This implements a tiny per-CPU cache of pages that sits between the main
* page allocator and the freelists. By allocating and freeing pages in
* batch, it reduces freelist contention by an order of magnitude.
*
* The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
* uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
* world. On system with one CPU per physical package (e.g. a uniprocessor)
* the cache is not enabled.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $");
#include "opt_uvm.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_pgflcache.h>
/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR
/*
* MAXPGS - maximum pages per color, per bucket.
* FILLPGS - number of pages to allocate at once, per color, per bucket.
*
* Why the chosen values:
*
* (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
* colors. We make the assumption that most of the time allocation activity
* will be centered around one UVM freelist, so most of the time there will
* be no more than 224kB worth of cached pages per-CPU. That's tiny, but
* enough to hugely reduce contention on the freelist locks, and give us a
* small pool of pages which if we're very lucky may have some L1/L2 cache
* locality, and do so without subtracting too much from the L2/L3 cache
* benefits of having per-package free lists in the page allocator.
*
* (2) With the chosen values on _LP64, the data structure for each color
* takes up a single cache line (64 bytes) giving this very low overhead
* even in the "miss" case.
*
* (3) We don't want to cause too much pressure by hiding away memory that
* could otherwise be put to good use.
*/
#define MAXPGS 7
#define FILLPGS 6
/* Variable size, according to # colors. */
struct pgflcache {
struct pccolor {
intptr_t count;
struct vm_page *pages[MAXPGS];
} color[1];
};
static kmutex_t uvm_pgflcache_lock;
static kcondvar_t uvm_pgflcache_cv;
static int uvm_pgflcache_sem;
static bool uvm_pgflcache_draining;
/*
* uvm_pgflcache_fill: fill specified freelist/color from global list
*
* => must be called at IPL_VM
* => must be called with given bucket lock held
* => must only fill from the correct bucket for this CPU
*/
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
struct pgflbucket *pgb;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
struct vm_page *pg;
int count;
KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
KASSERT(ucpu->pgflbucket == b);
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return;
}
/* Fill only to the limit. */
pcc = &pc->color[c];
pgb = uvm.page_free[fl].pgfl_buckets[b];
head = &pgb->pgb_colors[c];
if (pcc->count >= FILLPGS) {
return;
}
/* Pull pages from the bucket until it's empty, or we are full. */
count = pcc->count;
pg = LIST_FIRST(head);
while (__predict_true(pg != NULL && count < FILLPGS)) {
KASSERT(pg->flags & PG_FREE);
KASSERT(uvm_page_get_bucket(pg) == b);
pcc->pages[count++] = pg;
pg = LIST_NEXT(pg, pageq.list);
}
/* Violate LIST abstraction to remove all pages at once. */
head->lh_first = pg;
if (__predict_true(pg != NULL)) {
pg->pageq.list.le_prev = &head->lh_first;
}
pgb->pgb_nfree -= (count - pcc->count);
pcc->count = count;
}
/*
* uvm_pgflcache_spill: spill specified freelist/color to global list
*
* => must be called at IPL_VM
* => mark __noinline so we don't pull it into uvm_pgflcache_free()
*/
static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflbucket *pgb;
struct pgfreelist *pgfl;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
kmutex_t *lock;
int b, adj;
pc = ucpu->pgflcache[fl];
pcc = &pc->color[c];
pgfl = &uvm.page_free[fl];
b = ucpu->pgflbucket;
pgb = pgfl->pgfl_buckets[b];
head = &pgb->pgb_colors[c];
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
for (adj = pcc->count; pcc->count != 0;) {
pcc->count--;
KASSERT(pcc->pages[pcc->count] != NULL);
KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
}
pgb->pgb_nfree += adj;
mutex_spin_exit(lock);
}
/*
* uvm_pgflcache_alloc: try to allocate a cached page.
*
* => must be called at IPL_VM
* => allocate only from the given freelist and given page color
*/
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflcache *pc;
struct pccolor *pcc;
struct vm_page *pg;
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return NULL;
}
/* Very simple: if we have a page then return it. */
pcc = &pc->color[c];
if (__predict_false(pcc->count == 0)) {
return NULL;
}
pg = pcc->pages[--(pcc->count)];
KASSERT(pg != NULL);
KASSERT(pg->flags & PG_FREE);
KASSERT(uvm_page_get_freelist(pg) == fl);
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
pg->flags &= PG_ZERO;
return pg;
}
/*
* uvm_pgflcache_free: cache a page, if possible.
*
* => must be called at IPL_VM
* => must only send pages for the correct bucket for this CPU
*/
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
struct pgflcache *pc;
struct pccolor *pcc;
int fl, c;
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
/* If caching is off, then bail out. */
fl = uvm_page_get_freelist(pg);
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return false;
}
/* If the array is full spill it first, then add page to array. */
c = VM_PGCOLOR(pg);
pcc = &pc->color[c];
KASSERT((pg->flags & PG_FREE) == 0);
if (__predict_false(pcc->count == MAXPGS)) {
uvm_pgflcache_spill(ucpu, fl, c);
}
pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
pcc->pages[pcc->count] = pg;
pcc->count++;
return true;
}
/*
* uvm_pgflcache_init: allocate and initialize per-CPU data structures for
* the free page cache. Don't set anything in motion - that's taken care
* of by uvm_pgflcache_resume().
*/
static void
uvm_pgflcache_init_cpu(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
size_t sz;
ucpu = ci->ci_data.cpu_uvm;
KASSERT(ucpu->pgflcachemem == NULL);
KASSERT(ucpu->pgflcache[0] == NULL);
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
ucpu->pgflcachememsz =
(roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
}
/*
* uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
* and shut down caching on the CPU. Called on each CPU in the system via
* xcall.
*/
static void
uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
{
struct uvm_cpu *ucpu;
int fl, color, s;
ucpu = curcpu()->ci_data.cpu_uvm;
for (fl = 0; fl < VM_NFREELIST; fl++) {
s = splvm();
for (color = 0; color < uvmexp.ncolors; color++) {
uvm_pgflcache_spill(ucpu, fl, color);
}
ucpu->pgflcache[fl] = NULL;
splx(s);
}
}
/*
* uvm_pgflcache_pause: pause operation of the caches
*/
void
uvm_pgflcache_pause(void)
{
uint64_t where;
/* First one in starts draining. Everyone else waits. */
mutex_enter(&uvm_pgflcache_lock);
if (uvm_pgflcache_sem++ == 0) {
uvm_pgflcache_draining = true;
mutex_exit(&uvm_pgflcache_lock);
where = xc_broadcast(0, uvm_pgflcache_fini_cpu, NULL, NULL);
xc_wait(where);
mutex_enter(&uvm_pgflcache_lock);
uvm_pgflcache_draining = false;
cv_broadcast(&uvm_pgflcache_cv);
} else {
while (uvm_pgflcache_draining) {
cv_wait(&uvm_pgflcache_cv, &uvm_pgflcache_lock);
}
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_resume: resume operation of the caches
*/
void
uvm_pgflcache_resume(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct uvm_cpu *ucpu;
uintptr_t addr;
size_t sz;
int fl;
/* Last guy out takes care of business. */
mutex_enter(&uvm_pgflcache_lock);
KASSERT(!uvm_pgflcache_draining);
KASSERT(uvm_pgflcache_sem > 0);
if (uvm_pgflcache_sem-- > 1) {
mutex_exit(&uvm_pgflcache_lock);
return;
}
/*
* Make sure dependant data structure updates are remotely visible.
* Essentially this functions as a global memory barrier.
*/
xc_barrier(XC_HIGHPRI);
/*
* Then set all of the pointers in place on each CPU. As soon as
* each pointer is set, caching is operational in that dimension.
*/
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
for (CPU_INFO_FOREACH(cii, ci)) {
ucpu = ci->ci_data.cpu_uvm;
addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
for (fl = 0; fl < VM_NFREELIST; fl++) {
ucpu->pgflcache[fl] = (struct pgflcache *)addr;
addr += sz;
}
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_start: start operation of the cache.
*
* => called once only, when init(8) is about to be started
*/
void
uvm_pgflcache_start(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(uvm_pgflcache_sem > 0);
/*
* There's not much point doing this if every CPU has its own
* bucket (and that includes the uniprocessor case).
*/
if (ncpu == uvm.bucketcount) {
return;
}
/* Create each CPU's buckets. */
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_pgflcache_init_cpu(ci);
}
/* Kick it into action. */
uvm_pgflcache_resume();
}
/*
* uvm_pgflcache_init: set up data structures for the free page cache.
*/
void
uvm_pgflcache_init(void)
{
uvm_pgflcache_sem = 1;
mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&uvm_pgflcache_cv, "flcache");
}
#else /* MULTIPROCESSOR */
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
return NULL;
}
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
return false;
}
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
}
void
uvm_pgflcache_pause(void)
{
}
void
uvm_pgflcache_resume(void)
{
}
void
uvm_pgflcache_start(void)
{
}
void
uvm_pgflcache_init(void)
{
}
#endif /* MULTIPROCESSOR */
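
For reference, the sizing rationale in the MAXPGS/FILLPGS comment above
works out as follows, taking the stated 4 kB pages and 8 cache colors: a
fully populated cache for one freelist holds at most 8 colors x 7 pages x
4 kB = 224 kB per CPU, and on _LP64 each struct pccolor occupies 8 bytes
(count) plus 7 x 8 bytes (pages[]) = 64 bytes, exactly one cache line.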

sys/uvm/uvm_pgflcache.h (new file, 43 lines)

@ -0,0 +1,43 @@
/* $NetBSD: uvm_pgflcache.h,v 1.1 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(_UVM_PGFLCACHE_H_)
#define _UVM_PGFLCACHE_H_
struct vm_page *uvm_pgflcache_alloc(struct uvm_cpu *, int, int);
void uvm_pgflcache_fill(struct uvm_cpu *, int, int, int);
bool uvm_pgflcache_free(struct uvm_cpu *, struct vm_page *);
void uvm_pgflcache_init(void);
void uvm_pgflcache_pause(void);
void uvm_pgflcache_resume(void);
void uvm_pgflcache_start(void);
#endif /* !_UVM_PGFLCACHE_H_ */
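
Taken together with uvm_pgflcache.c above, this header suggests the
caller-side fast path in the page allocator proper (whose diff, presumably
uvm_page.c, is suppressed in this view).  The sketch below is illustrative
only, not the committed code: example_cached_alloc() is a hypothetical
helper, and it assumes the caller already runs at IPL_VM and has chosen a
freelist fl and color c.

	#include <uvm/uvm.h>		/* struct uvm_cpu, uvm_freelist_locks */
	#include <uvm/uvm_pgflcache.h>

	/*
	 * Hypothetical fast path: try the per-CPU cache, refill this color
	 * from the CPU's own bucket on a miss, then retry.  A NULL return
	 * means the caller must fall back to scanning the global buckets.
	 */
	static struct vm_page *
	example_cached_alloc(struct uvm_cpu *ucpu, int fl, int c)
	{
		struct vm_page *pg;
		int b = ucpu->pgflbucket;

		pg = uvm_pgflcache_alloc(ucpu, fl, c);
		if (__predict_false(pg == NULL)) {
			/* uvm_pgflcache_fill() requires the bucket lock. */
			mutex_spin_enter(&uvm_freelist_locks[b].lock);
			uvm_pgflcache_fill(ucpu, fl, b, c);
			mutex_spin_exit(&uvm_freelist_locks[b].lock);
			pg = uvm_pgflcache_alloc(ucpu, fl, c);
		}
		return pg;
	}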


@ -1,12 +1,12 @@
/* $NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $ */
/* $NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 1997 The NetBSD Foundation, Inc.
* Copyright (c) 1997, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -35,13 +35,14 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define STAT_INCR(v) (v)++
@ -79,34 +80,25 @@ u_long uvm_pglistalloc_npages;
static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
int free_list __unused, color __unused, pgflidx;
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
KASSERT(mutex_owned(&uvm_fpageqlock));
pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
pgb = pgfl->pgfl_buckets[uvm_page_get_bucket(pg)];
#if PGFL_NQUEUES != 2
#error uvm_pglistalloc needs to be updated
#endif
free_list = uvm_page_get_freelist(pg);
color = VM_PGCOLOR(pg);
pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef UVMDEBUG
struct vm_page *tp;
LIST_FOREACH(tp,
&uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
pageq.list) {
LIST_FOREACH(tp, &pgb->pgb_colors[VM_PGCOLOR(pg)], pageq.list) {
if (tp == pg)
break;
}
if (tp == NULL)
panic("uvm_pglistalloc: page not on freelist");
#endif
LIST_REMOVE(pg, pageq.list); /* global */
LIST_REMOVE(pg, listq.list); /* cpu */
uvmexp.free--;
LIST_REMOVE(pg, pageq.list);
pgb->pgb_nfree--;
if (pg->flags & PG_ZERO)
CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
pg->flags = PG_CLEAN;
pg->uobject = NULL;
pg->uanon = NULL;
@ -129,8 +121,6 @@ uvm_pglistalloc_c_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif
KASSERT(mutex_owned(&uvm_fpageqlock));
low = atop(low);
high = atop(high);
alignment = atop(alignment);
@ -316,7 +306,7 @@ uvm_pglistalloc_contig(int num, paddr_t low, paddr_t high, paddr_t alignment,
/*
* Block all memory allocation and lock the free list.
*/
mutex_spin_enter(&uvm_fpageqlock);
uvm_pgfl_lock();
/* Are there even any free pages? */
if (uvm_free() <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
@ -352,7 +342,7 @@ out:
* the pagedaemon.
*/
mutex_spin_exit(&uvm_fpageqlock);
uvm_pgfl_unlock();
uvm_kick_pdaemon();
return (error);
}
@ -368,7 +358,6 @@ uvm_pglistalloc_s_ps(uvm_physseg_t psi, int num, paddr_t low, paddr_t high,
printf("pgalloc: simple %d pgs from psi %zd\n", num, psi);
#endif
KASSERT(mutex_owned(&uvm_fpageqlock));
KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_start(psi));
KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_end(psi));
KASSERT(uvm_physseg_get_avail_start(psi) <= uvm_physseg_get_end(psi));
@ -461,7 +450,7 @@ again:
/*
* Block all memory allocation and lock the free list.
*/
mutex_spin_enter(&uvm_fpageqlock);
uvm_pgfl_lock();
count++;
/* Are there even any free pages? */
@ -493,7 +482,7 @@ out:
* the pagedaemon.
*/
mutex_spin_exit(&uvm_fpageqlock);
uvm_pgfl_unlock();
uvm_kick_pdaemon();
if (error) {
@ -539,6 +528,12 @@ uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
TAILQ_INIT(rlist);
/*
* Turn off the caching of free pages - we need everything to be on
* the global freelists.
*/
uvm_pgflcache_pause();
if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
(boundary != 0))
res = uvm_pglistalloc_contig(num, low, high, alignment,
@ -546,6 +541,8 @@ uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
else
res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);
uvm_pgflcache_resume();
return (res);
}
@ -558,45 +555,34 @@ uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment,
void
uvm_pglistfree(struct pglist *list)
{
struct uvm_cpu *ucpu;
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
struct vm_page *pg;
int index, color, queue;
bool iszero;
int c, b;
/*
* Lock the free list and free each page.
*/
mutex_spin_enter(&uvm_fpageqlock);
ucpu = curcpu()->ci_data.cpu_uvm;
uvm_pgfl_lock();
while ((pg = TAILQ_FIRST(list)) != NULL) {
KASSERT(!uvmpdpol_pageisqueued_p(pg));
TAILQ_REMOVE(list, pg, pageq.queue);
iszero = (pg->flags & PG_ZERO);
pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */
#ifdef DEBUG
if (iszero)
if (pg->flags & PG_ZERO)
uvm_pagezerocheck(pg);
#endif /* DEBUG */
index = uvm_page_get_freelist(pg);
color = VM_PGCOLOR(pg);
queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
pg->offset = (uintptr_t)ucpu;
LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
pgfl_queues[queue], pg, pageq.list);
LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
pgfl_queues[queue], pg, listq.list);
uvmexp.free++;
if (iszero)
c = VM_PGCOLOR(pg);
b = uvm_page_get_bucket(pg);
pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
pgb = pgfl->pgfl_buckets[b];
if (pg->flags & PG_ZERO)
CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
ucpu->pages[queue]++;
pgb->pgb_nfree++;
LIST_INSERT_HEAD(&pgb->pgb_colors[c], pg, pageq.list);
STAT_DECR(uvm_pglistalloc_npages);
}
if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
ucpu->page_idle_zero = vm_page_zero_enable;
mutex_spin_exit(&uvm_fpageqlock);
uvm_pgfl_unlock();
}


@ -1,11 +1,11 @@
/* $NetBSD: uvm_pglist.h,v 1.8 2010/11/06 15:48:00 uebayasi Exp $ */
/* $NetBSD: uvm_pglist.h,v 1.9 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2000, 2001, 2008 The NetBSD Foundation, Inc.
* Copyright (c) 2000, 2001, 2008, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
* by Jason R. Thorpe, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -41,19 +41,51 @@ TAILQ_HEAD(pglist, vm_page);
LIST_HEAD(pgflist, vm_page);
/*
* A page free list consists of free pages of unknown contents and free
* pages of all zeros.
* The global uvm.page_free list (uvm_page.c, uvm_pglist.c). Free pages are
* stored according to freelist, bucket, and cache colour.
*
* pglist = &uvm.page_free[freelist].pgfl_buckets[bucket].pgb_color[color];
*
* Freelists provide a priority ordering of pages for allocation, based upon
* how valuable they are for special uses (e.g. device driver DMA).
*
* Pages are then grouped in buckets according to some common factor, for
* example L2/L3 cache locality. Each bucket has its own lock, and the
* locks are shared among freelists for the same numbered buckets.
*
* Inside each bucket, pages are further distributed by cache color.
*
* We want these data structures to occupy as few cache lines as possible,
* as they will be highly contended.
*/
#define PGFL_UNKNOWN 0
#define PGFL_ZEROS 1
#define PGFL_NQUEUES 2
struct pgflbucket {
struct pgflist pgfl_queues[PGFL_NQUEUES];
uintptr_t pgb_nfree; /* total # free pages, all colors */
struct pgflist pgb_colors[1]; /* variable size array */
};
/*
* At the root, the freelists. MD code decides the number and structure of
* these. They are always arranged in descending order of allocation
* priority.
*
* 8 buckets should be enough to cover most all current x86 systems (2019),
* given the way package/core/smt IDs are structured on x86. For systems
* that report high package counts despite having a single physical CPU
* package (e.g. Ampere eMAG) a little bit of sharing isn't going to hurt
* in the least.
*/
#define PGFL_MAX_BUCKETS 8
struct pgfreelist {
struct pgflbucket *pgfl_buckets;
struct pgflbucket *pgfl_buckets[PGFL_MAX_BUCKETS];
};
/*
* Lock for each bucket.
*/
union uvm_freelist_lock {
kmutex_t lock;
uint8_t padding[COHERENCY_UNIT];
};
extern union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS];
#endif /* _UVM_UVM_PGLIST_H_ */
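
The lookup spelled out in the comment above can be written directly against
these structures.  This mirrors the pattern uvm_pglist_add() and
uvm_pglistfree() use in the uvm_pglist.c diff earlier in this commit; the
helper name is illustrative only.

	#include <uvm/uvm.h>	/* uvm.page_free[], bucket/freelist accessors, VM_PGCOLOR() */

	/* Locate the list a free page lives on: freelist -> bucket -> color. */
	static inline struct pgflist *
	example_pgfl_head(struct vm_page *pg)
	{
		struct pgfreelist *pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
		struct pgflbucket *pgb = pgfl->pgfl_buckets[uvm_page_get_bucket(pg)];

		return &pgb->pgb_colors[VM_PGCOLOR(pg)];
	}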