NetBSD/sys/kern/kern_malloc_debug.c

/*	$NetBSD: kern_malloc_debug.c,v 1.7 2002/08/14 15:21:31 thorpej Exp $	*/

/*
 * Copyright (c) 1999, 2000 Artur Grabowski <art@openbsd.org>
 * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without 
 * modification, are permitted provided that the following conditions 
 * are met: 
 *
 * 1. Redistributions of source code must retain the above copyright 
 *    notice, this list of conditions and the following disclaimer. 
 * 2. Redistributions in binary form must reproduce the above copyright 
 *    notice, this list of conditions and the following disclaimer in the 
 *    documentation and/or other materials provided with the distribution. 
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission. 
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL  DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 * OpenBSD: kern_malloc_debug.c,v 1.10 2001/07/26 13:33:52 art Exp
 */

/*
 * This really belongs in kern/kern_malloc.c, but it was too much pollution.
 */

/*
 * It's only possible to debug one type/size at a time. The question is
 * if this is a limitation or a feature. We never want to run this as the
 * default malloc because we'll run out of memory really fast. Adding
 * more types will also add to the complexity of the code.
 *
 * This is really simple. Every malloc() allocates two virtual pages,
 * the second page is left unmapped, and the value returned is aligned
 * so that it ends at (or very close to) the page boundary to catch overflows.
 * Every free() changes the protection of the first page to VM_PROT_NONE so
 * that we can catch any dangling writes to it.
 * To minimize the risk of writes to recycled chunks we keep an LRU of latest
 * freed chunks. The length of it is controlled by MALLOC_DEBUG_CHUNKS.
 *
 * Don't expect any performance.
 *
 * TODO:
 *  - support for size >= PAGE_SIZE
 *  - add support to the fault handler to give better diagnostics if we fail.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_malloc_debug.c,v 1.7 2002/08/14 15:21:31 thorpej Exp $");

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/pool.h>

#include <uvm/uvm.h>

/*
 * debug_malloc_type and debug_malloc_size define the type and size of
 * memory to be debugged. Use 0 for a wildcard. debug_malloc_size_lo
 * is the lower limit and debug_malloc_size_hi the upper limit of sizes
 * being debugged; 0 will not work as a wildcard for the upper limit.
 * For any debugging to take place, type must be != -1, size must be >= 0,
 * and if the limits are being used, size must be set to 0.
 * See /usr/src/sys/sys/malloc.h and malloc(9) for a list of types.
 *
 * Although those are variables, it's a really bad idea to change the type
 * if any memory chunks of this type are used. It's ok to change the size
 * in runtime.
 */
int debug_malloc_type = -1;	/* malloc type to debug; 0 matches any type */
int debug_malloc_size = -1;	/* exact size to debug; 0 matches any size */
int debug_malloc_size_lo = -1;	/* smallest size to debug; -1 = no lower limit */
int debug_malloc_size_hi = -1;	/* largest size to debug; -1 = no upper limit */

/*
 * MALLOC_DEBUG_CHUNKS is the number of memory chunks we require on the
 * freelist before we reuse them.
 */
#define MALLOC_DEBUG_CHUNKS 16

/* Create one chunk and put it on the freelist; argument is non-zero if sleeping is allowed. */
void debug_malloc_allocate_free(int);

/*
 * Bookkeeping for one debugging chunk.  An entry is always on exactly one
 * of debug_malloc_freelist or debug_malloc_usedlist.
 */
struct debug_malloc_entry {
	TAILQ_ENTRY(debug_malloc_entry) md_list;	/* free/used list linkage */
	vaddr_t md_va;		/* start of the chunk's two-page VA range */
	paddr_t md_pa;		/* physical page entered at md_va while in use */
	size_t md_size;		/* size of the request last served from here */
	int md_type;		/* malloc type of the request last served */
};

/* Chunks available for allocation; debug_malloc() takes from the head. */
TAILQ_HEAD(,debug_malloc_entry) debug_malloc_freelist;
/* Chunks currently handed out by debug_malloc(). */
TAILQ_HEAD(,debug_malloc_entry) debug_malloc_usedlist;

/* Statistics, reported by debug_malloc_printit(). */
int debug_malloc_allocs;		/* successful debug_malloc() calls */
int debug_malloc_frees;			/* chunks returned through debug_free() */
int debug_malloc_pages;			/* chunks created so far */
int debug_malloc_chunks_on_freelist;	/* current length of the freelist */

/* Pool the struct debug_malloc_entry records are allocated from. */
struct pool debug_malloc_pool;

/*
 * debug_malloc:
 *
 *	Try to serve a malloc() request from the guard-page allocator.
 *
 *	size	requested size in bytes
 *	type	malloc type of the request
 *	flags	malloc flags; only M_NOWAIT and M_ZERO are looked at here
 *	addr	out: receives the address of the memory on success
 *
 *	Returns 1 if the request was served and *addr was set.  Returns 0,
 *	leaving *addr untouched, if the request does not match the debugged
 *	type/size, has a size this allocator cannot serve, or no chunk
 *	could be obtained; presumably the caller then falls back to the
 *	regular allocator, as debug_free() expects of its caller.
 */
int
debug_malloc(unsigned long size, int type, int flags, void **addr)
{
	struct debug_malloc_entry *md = NULL;
	int s, wait = !(flags & M_NOWAIT);

	/* Careful not to compare unsigned long to int -1 */
	if ((type != debug_malloc_type && debug_malloc_type != 0) ||
	    (size != debug_malloc_size && debug_malloc_size != 0) ||
	    (debug_malloc_size_lo != -1 && size < debug_malloc_size_lo) ||
	    (debug_malloc_size_hi != -1 && size > debug_malloc_size_hi))
		return (0);

	/*
	 * XXX - fix later: only one data page backs each chunk.
	 *
	 * Zero-sized requests are declined too.  The address computed
	 * below would be md_va + PAGE_SIZE, i.e. the start of the unmapped
	 * guard page, and debug_free() finds chunks by trunc_page(addr),
	 * so it would never recognize that address as ours.
	 */
	if (size == 0 || size > PAGE_SIZE)
		return (0);

	s = splvm();
	/* Keep the freelist topped up so freed chunks are not reused early. */
	if (debug_malloc_chunks_on_freelist < MALLOC_DEBUG_CHUNKS)
		debug_malloc_allocate_free(wait);

	md = TAILQ_FIRST(&debug_malloc_freelist);
	if (md == NULL) {
		splx(s);
		return (0);
	}
	TAILQ_REMOVE(&debug_malloc_freelist, md, md_list);
	debug_malloc_chunks_on_freelist--;

	/* Describe the chunk before it becomes visible on the used list. */
	md->md_size = size;
	md->md_type = type;

	TAILQ_INSERT_HEAD(&debug_malloc_usedlist, md, md_list);
	debug_malloc_allocs++;
	splx(s);

	/*
	 * Map the data page; the second page of the chunk stays unmapped.
	 * NOTE(review): pmap(9) asks for pmap_update(pmap_kernel()) after
	 * pmap_kenter_pa() - confirm whether it is needed here.
	 */
	pmap_kenter_pa(md->md_va, md->md_pa, VM_PROT_READ|VM_PROT_WRITE);

	/*
	 * Align the returned addr so that it ends where the first page
	 * ends. roundup to get decent alignment.
	 */
	*addr = (void *)(md->md_va + PAGE_SIZE - roundup(size, sizeof(long)));
	if (flags & M_ZERO)
		memset(*addr, 0, size);
	return (1);
}

/*
 * debug_free:
 *
 *	Give memory obtained from debug_malloc() back to the debugging
 *	allocator.
 *
 *	addr	address being freed
 *	type	malloc type passed to free()
 *
 *	Returns 1 if the chunk was ours and has been released, 0 if the
 *	normal free should handle it.  Panics if addr lies in a chunk that
 *	is already on the freelist.
 */
int
debug_free(void *addr, int type)
{
	struct debug_malloc_entry *md;
	vaddr_t va;
	int s;

	/* Not the type being debugged (0 is the wildcard). */
	if (debug_malloc_type != 0 && type != debug_malloc_type)
		return (0);

	/* Chunks are identified by the address of their first page. */
	va = trunc_page((vaddr_t)addr);

	s = splvm();
	TAILQ_FOREACH(md, &debug_malloc_usedlist, md_list) {
		if (md->md_va == va)
			break;
	}

	if (md != NULL) {
		/*
		 * Ours.  Requeue at the tail of the freelist, the end
		 * debug_malloc() does not take from, and account for it.
		 */
		TAILQ_REMOVE(&debug_malloc_usedlist, md, md_list);
		TAILQ_INSERT_TAIL(&debug_malloc_freelist, md, md_list);
		debug_malloc_frees++;
		debug_malloc_chunks_on_freelist++;

		/*
		 * Unmap the data page so that later accesses fault.
		 * NOTE(review): pmap(9) asks for pmap_update(pmap_kernel())
		 * after pmap_kremove() - confirm whether it is needed here.
		 */
		pmap_kremove(md->md_va, PAGE_SIZE);
		splx(s);

		return (1);
	}

	/*
	 * Not one of the chunks in use.  Before handing the address to
	 * the normal free, make sure it is not a chunk freed earlier.
	 */
	TAILQ_FOREACH(md, &debug_malloc_freelist, md_list) {
		if (md->md_va == va)
			panic("debug_free: already free");
	}
	splx(s);

	return (0);
}

/*
 * debug_malloc_init:
 *
 *	Reset the statistics, empty both chunk lists and set up the pool
 *	that struct debug_malloc_entry records come from.
 */
void
debug_malloc_init(void)
{

	debug_malloc_allocs = debug_malloc_frees = 0;
	debug_malloc_pages = debug_malloc_chunks_on_freelist = 0;

	TAILQ_INIT(&debug_malloc_usedlist);
	TAILQ_INIT(&debug_malloc_freelist);

	pool_init(&debug_malloc_pool, sizeof(struct debug_malloc_entry),
	    0, 0, 0, "mdbepl", NULL);
}

/*
 * Add one chunk to the freelist.
 *
 * A chunk is two pages of virtual address space taken from kmem_map plus
 * one physical page.  The physical page is only recorded in the entry
 * here; debug_malloc() enters it at the first virtual page, and nothing
 * in this file ever maps the second one.
 *
 * wait is non-zero if the caller is allowed to sleep for resources.  With
 * wait == 0 the function gives up as soon as any allocation fails.
 *
 * Nothing is returned; on failure the freelist is simply left unchanged
 * and everything allocated so far is released again.
 *
 * called at splvm.
 */
void
debug_malloc_allocate_free(int wait)
{
	vaddr_t va, offset;
	struct vm_page *pg;
	struct debug_malloc_entry *md;

	md = pool_get(&debug_malloc_pool, wait ? PR_WAITOK : PR_NOWAIT);
	if (md == NULL)
		return;

	/*
	 * Forbid sleeping only when the caller cannot wait, the same
	 * sense as the pool_get() flags above.
	 */
	va = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE * 2,
	    UVM_KMF_VALLOC | (wait ? 0 : UVM_KMF_NOWAIT));
	if (va == 0) {
		pool_put(&debug_malloc_pool, md);
		return;
	}

	offset = va - vm_map_min(kernel_map);
	for (;;) {
		pg = uvm_pagealloc(NULL, offset, NULL, 0);
		if (pg != NULL) {
			pg->flags &= ~PG_BUSY;  /* new page */
			UVM_PAGE_OWN(pg, NULL);
			break;
		}

		if (wait == 0) {
			/* Cannot sleep for a page: undo and give up. */
			uvm_unmap(kmem_map, va, va + PAGE_SIZE * 2);
			pool_put(&debug_malloc_pool, md);
			return;
		}
		uvm_wait("debug_malloc");
	}

	md->md_va = va;
	md->md_pa = VM_PAGE_TO_PHYS(pg);
	/*
	 * debug_malloc_printit() prints these for chunks on the freelist
	 * too, so do not leave them as uninitialized pool memory.
	 */
	md->md_size = 0;
	md->md_type = 0;

	debug_malloc_pages++;
	TAILQ_INSERT_HEAD(&debug_malloc_freelist, md, md_list);
	debug_malloc_chunks_on_freelist++;
}

/*
 * debug_malloc_print:
 *
 *	Print the allocator statistics and both chunk lists using printf.
 */
void
debug_malloc_print(void)
{

	/*
	 * The second argument is a vaddr_t, not a pointer: 0 selects the
	 * summary output rather than a lookup of a single address.
	 */
	debug_malloc_printit(printf, 0);
}

/*
 * debug_malloc_printit:
 *
 *	Report on the debugging allocator through the printf-like
 *	function pr.
 *
 *	pr	output function
 *	addr	if non-zero, describe where this address lies relative to
 *		the chunks we manage; if zero, print the statistics and
 *		every chunk on the free and used lists
 *
 *	vaddr_t and size_t values are cast to unsigned long and printed
 *	with l-modified conversions so the arguments match the format on
 *	every platform.
 */
void
debug_malloc_printit(void (*pr)(const char *, ...), vaddr_t addr)
{
	struct debug_malloc_entry *md;

	if (addr) {
		/* Anywhere inside a chunk that is currently free? */
		TAILQ_FOREACH(md, &debug_malloc_freelist, md_list) {
			if (addr >= md->md_va &&
			    addr < md->md_va + 2 * PAGE_SIZE) {
				(*pr)("Memory at address 0x%lx is in a freed "
				      "area. type %d, size: %lu\n ",
				      (unsigned long)addr, md->md_type,
				      (unsigned long)md->md_size);
				return;
			}
		}
		/* In the second (never mapped) page of a chunk in use? */
		TAILQ_FOREACH(md, &debug_malloc_usedlist, md_list) {
			if (addr >= md->md_va + PAGE_SIZE &&
			    addr < md->md_va + 2 * PAGE_SIZE) {
				(*pr)("Memory at address 0x%lx is just outside "
				      "an allocated area. type %d, size: %lu\n",
				      (unsigned long)addr, md->md_type,
				      (unsigned long)md->md_size);
				return;
			}
		}
		/* The address conversion needs its argument. */
		(*pr)("Memory at address 0x%lx is outside debugged malloc.\n",
		      (unsigned long)addr);
		return;
	}

	(*pr)("allocs: %d\n", debug_malloc_allocs);
	(*pr)("frees: %d\n", debug_malloc_frees);
	(*pr)("pages used: %d\n", debug_malloc_pages);
	(*pr)("chunks on freelist: %d\n", debug_malloc_chunks_on_freelist);

	(*pr)("\taddr:\tsize:\n");
	(*pr)("free chunks:\n");
	TAILQ_FOREACH(md, &debug_malloc_freelist, md_list)
		(*pr)("\t0x%lx\t0x%lx\t%d\n", (unsigned long)md->md_va,
		      (unsigned long)md->md_size, md->md_type);
	(*pr)("used chunks:\n");
	TAILQ_FOREACH(md, &debug_malloc_usedlist, md_list)
		(*pr)("\t0x%lx\t0x%lx\t%d\n", (unsigned long)md->md_va,
		      (unsigned long)md->md_size, md->md_type);
}
Don't pass VM_PROT_EXEC to pmap_kenter_pa(). 2002-08-14 19:21:31 +04:00			`/* $NetBSD: kern_malloc_debug.c,v 1.7 2002/08/14 15:21:31 thorpej Exp $ */`
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00
			`/*`
			`* Copyright (c) 1999, 2000 Artur Grabowski <art@openbsd.org>`
			`* All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`*`
			`* 1. Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* 2. Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`* 3. The name of the author may not be used to endorse or promote products`
			`* derived from this software without specific prior written permission.`
			`*`
			* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
			`* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY`
			`* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL`
			`* THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,`
			`* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,`
			`* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;`
			`* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,`
			`* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR`
			`* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF`
			`* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.`
			`*`
			`* OpenBSD: kern_malloc_debug.c,v 1.10 2001/07/26 13:33:52 art Exp`
			`*/`

			`/*`
			`* This really belongs in kern/kern_malloc.c, but it was too much pollution.`
			`*/`

			`/*`
			`* It's only possible to debug one type/size at a time. The question is`
			`* if this is a limitation or a feature. We never want to run this as the`
			`* default malloc because we'll run out of memory really fast. Adding`
			`* more types will also add to the complexity of the code.`
			`*`
			`* This is really simple. Every malloc() allocates two virtual pages,`
			`* the second page is left unmapped, and the value returned is aligned`
			`* so that it ends at (or very close to) the page boundary to catch overflows.`
			`* Every free() changes the protection of the first page to VM_PROT_NONE so`
			`* that we can catch any dangling writes to it.`
			`* To minimize the risk of writes to recycled chunks we keep an LRU of latest`
			`* freed chunks. The length of it is controlled by MALLOC_DEBUG_CHUNKS.`
			`*`
			`* Don't expect any performance.`
			`*`
			`* TODO:`
			`* - support for size >= PAGE_SIZE`
			`* - add support to the fault handler to give better diagnostics if we fail.`
			`*/`

add RCSIDs 2001-11-12 18:25:01 +03:00			`#include <sys/cdefs.h>`
Don't pass VM_PROT_EXEC to pmap_kenter_pa(). 2002-08-14 19:21:31 +04:00			`__KERNEL_RCSID(0, "$NetBSD: kern_malloc_debug.c,v 1.7 2002/08/14 15:21:31 thorpej Exp $");`
add RCSIDs 2001-11-12 18:25:01 +03:00
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00			`#include <sys/param.h>`
			`#include <sys/proc.h>`
			`#include <sys/kernel.h>`
			`#include <sys/malloc.h>`
			`#include <sys/systm.h>`
			`#include <sys/pool.h>`

			`#include <uvm/uvm.h>`

			`/*`
			`* debug_malloc_type and debug_malloc_size define the type and size of`
			`* memory to be debugged. Use 0 for a wildcard. debug_malloc_size_lo`
			`* is the lower limit and debug_malloc_size_hi the upper limit of sizes`
			`* being debugged; 0 will not work as a wildcard for the upper limit.`
			`* For any debugging to take place, type must be != -1, size must be >= 0,`
			`* and if the limits are being used, size must be set to 0.`
			`* See /usr/src/sys/sys/malloc.h and malloc(9) for a list of types.`
			`*`
			`* Although those are variables, it's a really bad idea to change the type`
			`* if any memory chunks of this type are used. It's ok to change the size`
			`* in runtime.`
			`*/`
			`int debug_malloc_type = -1;`
			`int debug_malloc_size = -1;`
			`int debug_malloc_size_lo = -1;`
			`int debug_malloc_size_hi = -1;`

			`/*`
			`* MALLOC_DEBUG_CHUNKS is the number of memory chunks we require on the`
			`* freelist before we reuse them.`
			`*/`
			`#define MALLOC_DEBUG_CHUNKS 16`

			`void debug_malloc_allocate_free(int);`

			`struct debug_malloc_entry {`
			`TAILQ_ENTRY(debug_malloc_entry) md_list;`
			`vaddr_t md_va;`
			`paddr_t md_pa;`
			`size_t md_size;`
			`int md_type;`
			`};`

			`TAILQ_HEAD(,debug_malloc_entry) debug_malloc_freelist;`
			`TAILQ_HEAD(,debug_malloc_entry) debug_malloc_usedlist;`

			`int debug_malloc_allocs;`
			`int debug_malloc_frees;`
			`int debug_malloc_pages;`
			`int debug_malloc_chunks_on_freelist;`

			`struct pool debug_malloc_pool;`

			`int`
			`debug_malloc(unsigned long size, int type, int flags, void **addr)`
			`{`
			`struct debug_malloc_entry *md = NULL;`
Fix reversed wait/nowait logic. 2001-11-01 01:23:18 +03:00			`int s, wait = !(flags & M_NOWAIT);`
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00
			`/* Careful not to compare unsigned long to int -1 */`
			`if ((type != debug_malloc_type && debug_malloc_type != 0) \|\|`
			`(size != debug_malloc_size && debug_malloc_size != 0) \|\|`
			`(debug_malloc_size_lo != -1 && size < debug_malloc_size_lo) \|\|`
			`(debug_malloc_size_hi != -1 && size > debug_malloc_size_hi))`
			`return (0);`

			`/* XXX - fix later */`
			`if (size > PAGE_SIZE)`
			`return (0);`

			`s = splvm();`
			`if (debug_malloc_chunks_on_freelist < MALLOC_DEBUG_CHUNKS)`
			`debug_malloc_allocate_free(wait);`

			`md = TAILQ_FIRST(&debug_malloc_freelist);`
			`if (md == NULL) {`
			`splx(s);`
			`return (0);`
			`}`
			`TAILQ_REMOVE(&debug_malloc_freelist, md, md_list);`
			`debug_malloc_chunks_on_freelist--;`

			`TAILQ_INSERT_HEAD(&debug_malloc_usedlist, md, md_list);`
			`debug_malloc_allocs++;`
			`splx(s);`

Don't pass VM_PROT_EXEC to pmap_kenter_pa(). 2002-08-14 19:21:31 +04:00			`pmap_kenter_pa(md->md_va, md->md_pa, VM_PROT_READ\|VM_PROT_WRITE);`
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00
			`md->md_size = size;`
			`md->md_type = type;`

			`/*`
			`* Align the returned addr so that it ends where the first page`
			`* ends. roundup to get decent alignment.`
			`*/`
			`*addr = (void *)(md->md_va + PAGE_SIZE - roundup(size, sizeof(long)));`
Add new malloc(9) flag M_ZERO - zeros memory before returning. From Poul-Henning Kamp's equivalent enhancement in FreeBSD. 2001-11-17 06:50:27 +03:00			`if (*addr != NULL && (flags & M_ZERO))`
			`memset(*addr, 0, size);`
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00			`return (1);`
			`}`

			`int`
			`debug_free(void *addr, int type)`
			`{`
			`struct debug_malloc_entry *md;`
			`vaddr_t va;`
			`int s;`

			`if (type != debug_malloc_type && debug_malloc_type != 0)`
			`return (0);`

			`/*`
			`* trunc_page to get the address of the page.`
			`*/`
			`va = trunc_page((vaddr_t)addr);`

			`s = splvm();`
			`TAILQ_FOREACH(md, &debug_malloc_usedlist, md_list)`
			`if (md->md_va == va)`
			`break;`

			`/*`
			`* If we are not responsible for this entry, let the normal free`
			`* handle it`
			`*/`
			`if (md == NULL) {`
			`/*`
			`* sanity check. Check for multiple frees.`
			`*/`
			`TAILQ_FOREACH(md, &debug_malloc_freelist, md_list)`
			`if (md->md_va == va)`
			`panic("debug_free: already free");`
			`splx(s);`
			`return (0);`
			`}`

			`debug_malloc_frees++;`
			`TAILQ_REMOVE(&debug_malloc_usedlist, md, md_list);`

			`TAILQ_INSERT_TAIL(&debug_malloc_freelist, md, md_list);`
			`debug_malloc_chunks_on_freelist++;`
			`/*`
			`* unmap the page.`
			`*/`
			`pmap_kremove(md->md_va, PAGE_SIZE);`
			`splx(s);`

			`return (1);`
			`}`

			`void`
			`debug_malloc_init(void)`
			`{`

			`TAILQ_INIT(&debug_malloc_freelist);`
			`TAILQ_INIT(&debug_malloc_usedlist);`

			`debug_malloc_allocs = 0;`
			`debug_malloc_frees = 0;`
			`debug_malloc_pages = 0;`
			`debug_malloc_chunks_on_freelist = 0;`

			`pool_init(&debug_malloc_pool, sizeof(struct debug_malloc_entry),`
Pool deals fairly well with physical memory shortage, but it doesn't deal with shortages of the VM maps where the backing pages are mapped (usually kmem_map). Try to deal with this: * Group all information about the backend allocator for a pool in a separate structure. The pool references this structure, rather than the individual fields. * Change the pool_init() API accordingly, and adjust all callers. * Link all pools using the same backend allocator on a list. * The backend allocator is responsible for waiting for physical memory to become available, but will still fail if it cannot callocate KVA space for the pages. If this happens, carefully drain all pools using the same backend allocator, so that some KVA space can be freed. * Change pool_reclaim() to indicate if it actually succeeded in freeing some pages, and use that information to make draining easier and more efficient. * Get rid of PR_URGENT. There was only one use of it, and it could be dealt with by the caller. From art@openbsd.org. 2002-03-08 23:48:27 +03:00			`0, 0, 0, "mdbepl", NULL);`
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00			`}`

			`/*`
			`* Add one chunk to the freelist.`
			`*`
			`* called at splvm.`
			`*/`
			`void`
			`debug_malloc_allocate_free(int wait)`
			`{`
			`vaddr_t va, offset;`
			`struct vm_page *pg;`
			`struct debug_malloc_entry *md;`

			`md = pool_get(&debug_malloc_pool, wait ? PR_WAITOK : PR_NOWAIT);`
			`if (md == NULL)`
			`return;`

a whole bunch of changes to improve performance and robustness under load: - remove special treatment of pager_map mappings in pmaps. this is required now, since I've removed the globals that expose the address range. pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's no longer any need to special-case it. - eliminate struct uvm_vnode by moving its fields into struct vnode. - rewrite the pageout path. the pager is now responsible for handling the high-level requests instead of only getting control after a bunch of work has already been done on its behalf. this will allow us to UBCify LFS, which needs tighter control over its pages than other filesystems do. writing a page to disk no longer requires making it read-only, which allows us to write wired pages without causing all kinds of havoc. - use a new PG_PAGEOUT flag to indicate that a page should be freed on behalf of the pagedaemon when it's unlocked. this flag is very similar to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the pageout fails due to eg. an indirect-block buffer being locked. this allows us to remove the "version" field from struct vm_page, and together with shrinking "loan_count" from 32 bits to 16, struct vm_page is now 4 bytes smaller. - no longer use PG_RELEASED for swap-backed pages. if the page is busy because it's being paged out, we can't release the swap slot to be reallocated until that write is complete, but unlike with vnodes we don't keep a count of in-progress writes so there's no good way to know when the write is done. instead, when we need to free a busy swap-backed page, just sleep until we can get it busy ourselves. - implement a fast-path for extending writes which allows us to avoid zeroing new pages. this substantially reduces cpu usage. - encapsulate the data used by the genfs code in a struct genfs_node, which must be the first element of the filesystem-specific vnode data for filesystems which use genfs_{get,put}pages(). 
- eliminate many of the UVM pagerops, since they aren't needed anymore now that the pager "put" operation is a higher-level operation. - enhance the genfs code to allow NFS to use the genfs_{get,put}pages instead of a modified copy. - clean up struct vnode by removing all the fields that used to be used by the vfs_cluster.c code (which we don't use anymore with UBC). - remove kmem_object and mb_object since they were useless. instead of allocating pages to these objects, we now just allocate pages with no object. such pages are mapped in the kernel until they are freed, so we can use the mapping to find the page to free it. this allows us to remove splvm() protection in several places. The sum of all these changes improves write throughput on my decstation 5000/200 to within 1% of the rate of NetBSD 1.5 and reduces the elapsed time for "make release" of a NetBSD 1.5 source tree on my 128MB pc to 10% less than a 1.5 kernel took. 2001-09-16 00:36:31 +04:00			`va = uvm_km_kmemalloc(kmem_map, NULL, PAGE_SIZE * 2,`
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00			`UVM_KMF_VALLOC \| (wait ? UVM_KMF_NOWAIT : 0));`
			`if (va == 0) {`
			`pool_put(&debug_malloc_pool, md);`
			`return;`
			`}`

			`offset = va - vm_map_min(kernel_map);`
			`for (;;) {`
a whole bunch of changes to improve performance and robustness under load: - remove special treatment of pager_map mappings in pmaps. this is required now, since I've removed the globals that expose the address range. pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's no longer any need to special-case it. - eliminate struct uvm_vnode by moving its fields into struct vnode. - rewrite the pageout path. the pager is now responsible for handling the high-level requests instead of only getting control after a bunch of work has already been done on its behalf. this will allow us to UBCify LFS, which needs tighter control over its pages than other filesystems do. writing a page to disk no longer requires making it read-only, which allows us to write wired pages without causing all kinds of havoc. - use a new PG_PAGEOUT flag to indicate that a page should be freed on behalf of the pagedaemon when it's unlocked. this flag is very similar to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the pageout fails due to eg. an indirect-block buffer being locked. this allows us to remove the "version" field from struct vm_page, and together with shrinking "loan_count" from 32 bits to 16, struct vm_page is now 4 bytes smaller. - no longer use PG_RELEASED for swap-backed pages. if the page is busy because it's being paged out, we can't release the swap slot to be reallocated until that write is complete, but unlike with vnodes we don't keep a count of in-progress writes so there's no good way to know when the write is done. instead, when we need to free a busy swap-backed page, just sleep until we can get it busy ourselves. - implement a fast-path for extending writes which allows us to avoid zeroing new pages. this substantially reduces cpu usage. - encapsulate the data used by the genfs code in a struct genfs_node, which must be the first element of the filesystem-specific vnode data for filesystems which use genfs_{get,put}pages(). 
- eliminate many of the UVM pagerops, since they aren't needed anymore now that the pager "put" operation is a higher-level operation. - enhance the genfs code to allow NFS to use the genfs_{get,put}pages instead of a modified copy. - clean up struct vnode by removing all the fields that used to be used by the vfs_cluster.c code (which we don't use anymore with UBC). - remove kmem_object and mb_object since they were useless. instead of allocating pages to these objects, we now just allocate pages with no object. such pages are mapped in the kernel until they are freed, so we can use the mapping to find the page to free it. this allows us to remove splvm() protection in several places. The sum of all these changes improves write throughput on my decstation 5000/200 to within 1% of the rate of NetBSD 1.5 and reduces the elapsed time for "make release" of a NetBSD 1.5 source tree on my 128MB pc to 10% less than a 1.5 kernel took. 2001-09-16 00:36:31 +04:00			`pg = uvm_pagealloc(NULL, offset, NULL, 0);`
Artur Grabowski's simple brute-force malloc debugger, which uses guard pages. Can only debug one malloc type at a time, and nothing larger than 1 page. But can be useful for debugging certain types of "data modified on freelist" type problems. Modified from code in OpenBSD. 2001-08-17 04:48:29 +04:00			`if (pg) {`
			`pg->flags &= ~PG_BUSY; /* new page */`
			`UVM_PAGE_OWN(pg, NULL);`
			`}`

			`if (pg)`
			`break;`

			`if (wait == 0) {`
			`uvm_unmap(kmem_map, va, va + PAGE_SIZE * 2);`
			`pool_put(&debug_malloc_pool, md);`
			`return;`
			`}`
			`uvm_wait("debug_malloc");`
			`}`

			`md->md_va = va;`
			`md->md_pa = VM_PAGE_TO_PHYS(pg);`

			`debug_malloc_pages++;`
			`TAILQ_INSERT_HEAD(&debug_malloc_freelist, md, md_list);`
			`debug_malloc_chunks_on_freelist++;`
			`}`

			`void`
			`debug_malloc_print(void)`
			`{`

			`debug_malloc_printit(printf, NULL);`
			`}`

			`void`
			`debug_malloc_printit(void (*pr)(const char *, ...), vaddr_t addr)`
			`{`
			`struct debug_malloc_entry *md;`

			`if (addr) {`
			`TAILQ_FOREACH(md, &debug_malloc_freelist, md_list) {`
			`if (addr >= md->md_va &&`
			`addr < md->md_va + 2 * PAGE_SIZE) {`
			`(*pr)("Memory at address 0x%x is in a freed "`
			`"area. type %d, size: %d\n ",`
			`addr, md->md_type, md->md_size);`
			`return;`
			`}`
			`}`
			`TAILQ_FOREACH(md, &debug_malloc_usedlist, md_list) {`
			`if (addr >= md->md_va + PAGE_SIZE &&`
			`addr < md->md_va + 2 * PAGE_SIZE) {`
			`(*pr)("Memory at address 0x%x is just outside "`
			`"an allocated area. type %d, size: %d\n",`
			`addr, md->md_type, md->md_size);`
			`return;`
			`}`
			`}`
			`(*pr)("Memory at address 0x%x is outside debugged malloc.\n");`
			`return;`
			`}`

			`(*pr)("allocs: %d\n", debug_malloc_allocs);`
			`(*pr)("frees: %d\n", debug_malloc_frees);`
			`(*pr)("pages used: %d\n", debug_malloc_pages);`
			`(*pr)("chunks on freelist: %d\n", debug_malloc_chunks_on_freelist);`

			`(*pr)("\taddr:\tsize:\n");`
			`(*pr)("free chunks:\n");`
			`TAILQ_FOREACH(md, &debug_malloc_freelist, md_list)`
			`(*pr)("\t0x%x\t0x%x\t%d\n", md->md_va, md->md_size,`
			`md->md_type);`
			`(*pr)("used chunks:\n");`
			`TAILQ_FOREACH(md, &debug_malloc_usedlist, md_list)`
			`(*pr)("\t0x%x\t0x%x\t%d\n", md->md_va, md->md_size,`
			`md->md_type);`
			`}`