2003-12-19 09:02:50 +03:00
|
|
|
/* $NetBSD: uvm_map.c,v 1.152 2003/12/19 06:02:50 simonb Exp $ */
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
1998-02-05 09:25:08 +03:00
|
|
|
* Copyright (c) 1997 Charles D. Cranor and Washington University.
|
2001-05-25 08:06:11 +04:00
|
|
|
* Copyright (c) 1991, 1993, The Regents of the University of California.
|
1998-02-05 09:25:08 +03:00
|
|
|
*
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* This code is derived from software contributed to Berkeley by
|
|
|
|
* The Mach Operating System project at Carnegie-Mellon University.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
|
|
* must display the following acknowledgement:
|
|
|
|
* This product includes software developed by Charles D. Cranor,
|
2001-05-25 08:06:11 +04:00
|
|
|
* Washington University, the University of California, Berkeley and
|
1998-02-05 09:25:08 +03:00
|
|
|
* its contributors.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)vm_map.c 8.3 (Berkeley) 1/12/94
|
1998-02-07 14:07:38 +03:00
|
|
|
* from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
|
1998-02-05 09:25:08 +03:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
|
|
|
|
* All rights reserved.
|
2001-05-25 08:06:11 +04:00
|
|
|
*
|
1998-02-05 09:25:08 +03:00
|
|
|
* Permission to use, copy, modify and distribute this software and
|
|
|
|
* its documentation is hereby granted, provided that both the copyright
|
|
|
|
* notice and this permission notice appear in all copies of the
|
|
|
|
* software, derivative works or modified versions, and any portions
|
|
|
|
* thereof, and that both notices appear in supporting documentation.
|
2001-05-25 08:06:11 +04:00
|
|
|
*
|
|
|
|
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
|
|
|
|
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
|
1998-02-05 09:25:08 +03:00
|
|
|
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
|
2001-05-25 08:06:11 +04:00
|
|
|
*
|
1998-02-05 09:25:08 +03:00
|
|
|
* Carnegie Mellon requests users of this software to return to
|
|
|
|
*
|
|
|
|
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
|
|
|
|
* School of Computer Science
|
|
|
|
* Carnegie Mellon University
|
|
|
|
* Pittsburgh PA 15213-3890
|
|
|
|
*
|
|
|
|
* any improvements or extensions that they make and grant Carnegie the
|
|
|
|
* rights to redistribute these changes.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map.c: uvm map operations
|
|
|
|
*/
|
|
|
|
|
2001-11-10 10:36:59 +03:00
|
|
|
#include <sys/cdefs.h>
|
2003-12-19 09:02:50 +03:00
|
|
|
__KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.152 2003/12/19 06:02:50 simonb Exp $");
|
2001-11-10 10:36:59 +03:00
|
|
|
|
|
|
|
#include "opt_ddb.h"
|
|
|
|
#include "opt_uvmhist.h"
|
|
|
|
#include "opt_sysv.h"
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <sys/malloc.h>
|
1998-08-31 04:20:26 +04:00
|
|
|
#include <sys/pool.h>
|
2001-09-09 23:38:22 +04:00
|
|
|
#include <sys/kernel.h>
|
2001-10-30 22:05:26 +03:00
|
|
|
#include <sys/mount.h>
|
2001-10-30 02:06:03 +03:00
|
|
|
#include <sys/vnode.h>
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
#ifdef SYSVSHM
|
|
|
|
#include <sys/shm.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define UVM_MAP
|
|
|
|
#include <uvm/uvm.h>
|
2003-11-13 05:44:01 +03:00
|
|
|
#undef RB_AUGMENT
|
|
|
|
#define RB_AUGMENT(x) uvm_rb_augment(x)
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-07-05 02:18:13 +04:00
|
|
|
#ifdef DDB
|
|
|
|
#include <uvm/uvm_ddb.h>
|
|
|
|
#endif
|
|
|
|
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to, e.g., an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
extern struct vm_map *pager_map;
|
1998-07-05 02:18:13 +04:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. E.g.,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
struct uvm_cnt map_ubackmerge, map_uforwmerge;
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
struct uvm_cnt map_ubimerge, map_unomerge;
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
struct uvm_cnt map_kbackmerge, map_kforwmerge;
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
struct uvm_cnt map_kbimerge, map_knomerge;
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
struct uvm_cnt uvm_map_call, uvm_mlk_call, uvm_mlk_hint;
|
2000-12-13 11:06:11 +03:00
|
|
|
const char vmmapbsy[] = "vmmapbsy";
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-08-31 04:20:26 +04:00
|
|
|
/*
|
|
|
|
* pool for vmspace structures.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct pool uvm_vmspace_pool;
|
|
|
|
|
1998-08-31 05:10:15 +04:00
|
|
|
/*
|
|
|
|
* pool for dynamically-allocated map entries.
|
|
|
|
*/
|
|
|
|
|
|
|
|
struct pool uvm_map_entry_pool;
|
2001-09-09 23:38:22 +04:00
|
|
|
struct pool uvm_map_entry_kmem_pool;
|
1998-08-31 04:20:26 +04:00
|
|
|
|
2003-02-01 09:23:35 +03:00
|
|
|
MALLOC_DEFINE(M_VMMAP, "VM map", "VM map structures");
|
|
|
|
MALLOC_DEFINE(M_VMPMAP, "VM pmap", "VM pmap");
|
|
|
|
|
1999-05-21 03:03:23 +04:00
|
|
|
#ifdef PMAP_GROWKERNEL
|
|
|
|
/*
|
|
|
|
* This global represents the end of the kernel virtual address
|
|
|
|
* space. If we want to exceed this, we must grow the kernel
|
|
|
|
* virtual address space dynamically.
|
|
|
|
*
|
|
|
|
* Note, this variable is locked by kernel_map's lock.
|
|
|
|
*/
|
|
|
|
vaddr_t uvm_maxkaddr;
|
|
|
|
#endif
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
|
|
|
* macros
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * uvm_map_entry_link: insert "entry" into "map"'s doubly-linked entry
 * list immediately after "after_where", then add it to the map's
 * red-black tree via uvm_rb_insert().
 *
 * => map must be locked
 * => entry must describe a non-empty range (start < end); this is
 *    checked with KASSERT
 * => every macro argument is parenthesized at each use, so callers may
 *    pass expressions such as "&ent"; the arguments are still evaluated
 *    more than once, so they must not have side effects
 */
#define uvm_map_entry_link(map, after_where, entry) do { \
	KASSERT((entry)->start < (entry)->end); \
	(map)->nentries++; \
	(entry)->prev = (after_where); \
	(entry)->next = (after_where)->next; \
	(entry)->prev->next = (entry); \
	(entry)->next->prev = (entry); \
	uvm_rb_insert((map), (entry)); \
} while (/*CONSTCOND*/ 0)
|
1998-03-09 03:58:55 +03:00
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
 * uvm_map_entry_unlink: take "entry" out of "map".
 *
 * The neighbours in the doubly-linked list are stitched together, the
 * map's entry count is decremented, and finally the entry is removed
 * from the red-black tree with uvm_rb_remove().  The entry's own
 * prev/next pointers are left untouched.
 *
 * => map must be locked
 * => arguments are evaluated more than once
 */
#define uvm_map_entry_unlink(map, entry) do { \
	(entry)->prev->next = (entry)->next; \
	(entry)->next->prev = (entry)->prev; \
	(map)->nentries -= 1; \
	uvm_rb_remove((map), (entry)); \
} while (/*CONSTCOND*/ 0)
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
/*
 * SAVE_HINT: conditionally replace the map's lookup hint.
 *
 * The hint is set to "value" only if it currently equals "check";
 * otherwise it is left alone.  The compare-and-store is done while
 * holding the map's hint_lock.
 *
 * => map need not be locked (protected by hint_lock).
 */
#define SAVE_HINT(map,check,value) do { \
	simple_lock(&(map)->hint_lock); \
	if ((map)->hint == (check)) { \
		(map)->hint = (value); \
	} \
	simple_unlock(&(map)->hint_lock); \
} while (/*CONSTCOND*/ 0)
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
/*
 * VM_MAP_RANGE_CHECK: clamp the range [start, end) to the map's bounds.
 *
 * "start" is raised to vm_map_min(map) if below it, "end" is lowered to
 * vm_map_max(map) if above it, and if start still exceeds end it is
 * pulled back to end (yielding an empty range).  "start" and "end" are
 * modified in place, so both must be lvalues.
 *
 * => map must at least be read locked
 */
#define VM_MAP_RANGE_CHECK(map, start, end) do { \
	if ((start) < vm_map_min(map)) { \
		(start) = vm_map_min(map); \
	} \
	if ((end) > vm_map_max(map)) { \
		(end) = vm_map_max(map); \
	} \
	if ((start) > (end)) { \
		(start) = (end); \
	} \
} while (/*CONSTCOND*/ 0)
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* local prototypes
|
|
|
|
*/
|
|
|
|
|
2003-10-02 02:50:15 +04:00
|
|
|
/*
 * Forward declarations for helpers used in this file.
 * uvm_mapent_alloc/uvm_mapent_copy/uvm_mapent_free allocate, copy and
 * release a struct vm_map_entry; uvm_map_entry_unwire and the
 * uvm_map_{reference,unreference}_amap wrappers act on a single entry;
 * _uvm_tree_sanity and uvm_rb_subtree_space belong to the red-black
 * tree support code.
 * NOTE(review): uvm_map_space_avail is not defined in the part of the
 * file visible here, so its contract is not described.
 */
static struct vm_map_entry *
|
|
|
|
uvm_mapent_alloc(struct vm_map *, int);
|
|
|
|
static void uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *);
|
|
|
|
static void uvm_mapent_free(struct vm_map_entry *);
|
|
|
|
static void uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *);
|
|
|
|
static void uvm_map_reference_amap(struct vm_map_entry *, int);
|
2003-10-02 04:02:10 +04:00
|
|
|
static int uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
|
|
|
|
struct vm_map_entry *);
|
2003-10-02 02:50:15 +04:00
|
|
|
static void uvm_map_unreference_amap(struct vm_map_entry *, int);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
int _uvm_tree_sanity(struct vm_map *, const char *);
|
|
|
|
static vsize_t uvm_rb_subtree_space(const struct vm_map_entry *);
|
|
|
|
|
|
|
|
static __inline int
|
|
|
|
uvm_compare(const struct vm_map_entry *a, const struct vm_map_entry *b)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (a->start < b->start)
|
|
|
|
return (-1);
|
|
|
|
else if (a->start > b->start)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __inline void
|
|
|
|
uvm_rb_augment(struct vm_map_entry *entry)
|
|
|
|
{
|
|
|
|
|
|
|
|
entry->space = uvm_rb_subtree_space(entry);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Declare and generate the red-black tree named "uvm_tree": its nodes
 * are struct vm_map_entry, linked through the "rb_entry" member and
 * ordered by uvm_compare().
 */
RB_PROTOTYPE(uvm_tree, vm_map_entry, rb_entry, uvm_compare);
|
|
|
|
|
|
|
|
RB_GENERATE(uvm_tree, vm_map_entry, rb_entry, uvm_compare);
|
|
|
|
|
|
|
|
static __inline vsize_t
|
|
|
|
uvm_rb_space(const struct vm_map *map, const struct vm_map_entry *entry)
|
|
|
|
{
|
|
|
|
/* XXX map is not used */
|
|
|
|
|
|
|
|
KASSERT(entry->next != NULL);
|
|
|
|
return entry->next->start - entry->end;
|
|
|
|
}
|
|
|
|
|
|
|
|
static vsize_t
|
|
|
|
uvm_rb_subtree_space(const struct vm_map_entry *entry)
|
|
|
|
{
|
|
|
|
vaddr_t space, tmp;
|
|
|
|
|
|
|
|
space = entry->ownspace;
|
|
|
|
if (RB_LEFT(entry, rb_entry)) {
|
|
|
|
tmp = RB_LEFT(entry, rb_entry)->space;
|
|
|
|
if (tmp > space)
|
|
|
|
space = tmp;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (RB_RIGHT(entry, rb_entry)) {
|
|
|
|
tmp = RB_RIGHT(entry, rb_entry)->space;
|
|
|
|
if (tmp > space)
|
|
|
|
space = tmp;
|
|
|
|
}
|
|
|
|
|
|
|
|
return (space);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __inline void
|
|
|
|
uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry)
|
|
|
|
{
|
|
|
|
/* We need to traverse to the very top */
|
|
|
|
do {
|
|
|
|
entry->ownspace = uvm_rb_space(map, entry);
|
|
|
|
entry->space = uvm_rb_subtree_space(entry);
|
|
|
|
} while ((entry = RB_PARENT(entry, rb_entry)) != NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __inline void
|
|
|
|
uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry)
|
|
|
|
{
|
|
|
|
vaddr_t space = uvm_rb_space(map, entry);
|
|
|
|
struct vm_map_entry *tmp;
|
|
|
|
|
|
|
|
entry->ownspace = entry->space = space;
|
|
|
|
tmp = RB_INSERT(uvm_tree, &(map)->rbhead, entry);
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (tmp != NULL)
|
|
|
|
panic("uvm_rb_insert: duplicate entry?");
|
|
|
|
#endif
|
|
|
|
uvm_rb_fixup(map, entry);
|
|
|
|
if (entry->prev != &map->header)
|
|
|
|
uvm_rb_fixup(map, entry->prev);
|
|
|
|
}
|
|
|
|
|
|
|
|
static __inline void
|
|
|
|
uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry)
|
|
|
|
{
|
|
|
|
struct vm_map_entry *parent;
|
|
|
|
|
|
|
|
parent = RB_PARENT(entry, rb_entry);
|
|
|
|
RB_REMOVE(uvm_tree, &(map)->rbhead, entry);
|
|
|
|
if (entry->prev != &map->header)
|
|
|
|
uvm_rb_fixup(map, entry->prev);
|
|
|
|
if (parent)
|
|
|
|
uvm_rb_fixup(map, parent);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * uvm_tree_sanity: invoke the tree consistency checker when DEBUG is
 * defined.  Without DEBUG the invocation expands to nothing, so its
 * arguments are not evaluated.
 */
#if defined(DEBUG)
#define uvm_tree_sanity(map, name) _uvm_tree_sanity(map, name)
#else /* !DEBUG */
#define uvm_tree_sanity(map, name)
#endif /* DEBUG */
|
|
|
|
|
|
|
|
int
|
|
|
|
_uvm_tree_sanity(struct vm_map *map, const char *name)
|
|
|
|
{
|
|
|
|
struct vm_map_entry *tmp, *trtmp;
|
|
|
|
int n = 0, i = 1;
|
|
|
|
|
|
|
|
RB_FOREACH(tmp, uvm_tree, &map->rbhead) {
|
|
|
|
if (tmp->ownspace != uvm_rb_space(map, tmp)) {
|
|
|
|
printf("%s: %d/%d ownspace %lx != %lx %s\n",
|
|
|
|
name, n + 1, map->nentries,
|
|
|
|
(ulong)tmp->ownspace, (ulong)uvm_rb_space(map, tmp),
|
|
|
|
tmp->next == &map->header ? "(last)" : "");
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
trtmp = NULL;
|
|
|
|
RB_FOREACH(tmp, uvm_tree, &map->rbhead) {
|
|
|
|
if (tmp->space != uvm_rb_subtree_space(tmp)) {
|
|
|
|
printf("%s: space %lx != %lx\n",
|
|
|
|
name, (ulong)tmp->space,
|
|
|
|
(ulong)uvm_rb_subtree_space(tmp));
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
if (trtmp != NULL && trtmp->start >= tmp->start) {
|
|
|
|
printf("%s: corrupt: 0x%lx >= 0x%lx\n",
|
|
|
|
name, trtmp->start, tmp->start);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
n++;
|
|
|
|
|
|
|
|
trtmp = tmp;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (n != map->nentries) {
|
|
|
|
printf("%s: nentries: %d vs %d\n",
|
|
|
|
name, n, map->nentries);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (tmp = map->header.next; tmp && tmp != &map->header;
|
|
|
|
tmp = tmp->next, i++) {
|
|
|
|
trtmp = RB_FIND(uvm_tree, &map->rbhead, tmp);
|
|
|
|
if (trtmp != tmp) {
|
|
|
|
printf("%s: lookup: %d: %p - %p: %p\n",
|
|
|
|
name, i, tmp, trtmp,
|
|
|
|
RB_PARENT(tmp, rb_entry));
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
error:
|
|
|
|
#ifdef DDB
|
|
|
|
/* handy breakpoint location for error case */
|
|
|
|
__asm(".globl treesanity_label\ntreesanity_label:");
|
|
|
|
#endif
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
|
|
|
* local inlines
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_mapent_alloc: allocate a map entry
|
|
|
|
*/
|
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
static __inline struct vm_map_entry *
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_mapent_alloc(struct vm_map *map, int flags)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *me;
|
1998-03-09 03:58:55 +03:00
|
|
|
int s;
|
2002-12-11 10:14:28 +03:00
|
|
|
int pflags = (flags & UVM_FLAG_NOWAIT) ? PR_NOWAIT : PR_WAITOK;
|
2001-09-09 23:38:22 +04:00
|
|
|
UVMHIST_FUNC("uvm_mapent_alloc"); UVMHIST_CALLED(maphist);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2001-09-09 23:38:22 +04:00
|
|
|
if (map->flags & VM_MAP_INTRSAFE || cold) {
|
|
|
|
s = splvm();
|
1998-03-09 03:58:55 +03:00
|
|
|
simple_lock(&uvm.kentry_lock);
|
|
|
|
me = uvm.kentry_free;
|
2003-10-02 03:08:32 +04:00
|
|
|
if (me)
|
|
|
|
uvm.kentry_free = me->next;
|
1998-03-09 03:58:55 +03:00
|
|
|
simple_unlock(&uvm.kentry_lock);
|
|
|
|
splx(s);
|
2002-11-30 21:28:04 +03:00
|
|
|
if (__predict_false(me == NULL)) {
|
2001-09-09 23:38:22 +04:00
|
|
|
panic("uvm_mapent_alloc: out of static map entries, "
|
2003-10-02 03:08:32 +04:00
|
|
|
"check MAX_KMAPENT (currently %d)",
|
|
|
|
MAX_KMAPENT);
|
2001-09-09 23:38:22 +04:00
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
me->flags = UVM_MAP_STATIC;
|
2001-09-09 23:38:22 +04:00
|
|
|
} else if (map == kernel_map) {
|
2002-11-30 21:28:04 +03:00
|
|
|
me = pool_get(&uvm_map_entry_kmem_pool, pflags);
|
|
|
|
if (__predict_false(me == NULL))
|
|
|
|
return NULL;
|
2001-09-09 23:38:22 +04:00
|
|
|
me->flags = UVM_MAP_KMEM;
|
|
|
|
} else {
|
2002-11-30 21:28:04 +03:00
|
|
|
me = pool_get(&uvm_map_entry_pool, pflags);
|
|
|
|
if (__predict_false(me == NULL))
|
|
|
|
return NULL;
|
2001-09-09 23:38:22 +04:00
|
|
|
me->flags = 0;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
2001-09-09 23:38:22 +04:00
|
|
|
UVMHIST_LOG(maphist, "<- new entry=0x%x [kentry=%d]", me,
|
|
|
|
((map->flags & VM_MAP_INTRSAFE) != 0 || map == kernel_map), 0, 0);
|
2003-10-02 03:08:32 +04:00
|
|
|
return (me);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_mapent_free: free map entry
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
static __inline void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_mapent_free(struct vm_map_entry *me)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
1998-03-09 03:58:55 +03:00
|
|
|
int s;
|
2001-09-09 23:38:22 +04:00
|
|
|
UVMHIST_FUNC("uvm_mapent_free"); UVMHIST_CALLED(maphist);
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
UVMHIST_LOG(maphist,"<- freeing map entry=0x%x [flags=%d]",
|
1998-02-05 09:25:08 +03:00
|
|
|
me, me->flags, 0, 0);
|
2001-09-09 23:38:22 +04:00
|
|
|
if (me->flags & UVM_MAP_STATIC) {
|
|
|
|
s = splvm();
|
1998-03-09 03:58:55 +03:00
|
|
|
simple_lock(&uvm.kentry_lock);
|
|
|
|
me->next = uvm.kentry_free;
|
|
|
|
uvm.kentry_free = me;
|
|
|
|
simple_unlock(&uvm.kentry_lock);
|
|
|
|
splx(s);
|
2001-09-09 23:38:22 +04:00
|
|
|
} else if (me->flags & UVM_MAP_KMEM) {
|
|
|
|
pool_put(&uvm_map_entry_kmem_pool, me);
|
|
|
|
} else {
|
|
|
|
pool_put(&uvm_map_entry_pool, me);
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_mapent_copy: copy a map entry, preserving flags
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
static __inline void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2003-10-02 03:08:32 +04:00
|
|
|
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
memcpy(dst, src, ((char *)&src->uvm_map_entry_stop_copy) -
|
2003-10-02 03:08:32 +04:00
|
|
|
((char *)src));
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_entry_unwire: unwire a map entry
|
|
|
|
*
|
|
|
|
* => map should be locked by caller
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
static __inline void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2003-10-02 03:08:32 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->wired_count = 0;
|
1999-06-17 02:11:23 +04:00
|
|
|
uvm_fault_unwire_locked(map, entry->start, entry->end);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
2000-11-25 09:27:59 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* wrapper for calling amap_ref()
|
|
|
|
*/
|
|
|
|
static __inline void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_reference_amap(struct vm_map_entry *entry, int flags)
|
2000-11-25 09:27:59 +03:00
|
|
|
{
|
2003-10-02 03:08:32 +04:00
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff,
|
2003-10-02 03:08:32 +04:00
|
|
|
(entry->end - entry->start) >> PAGE_SHIFT, flags);
|
2000-11-25 09:27:59 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* wrapper for calling amap_unref()
|
2000-11-25 09:27:59 +03:00
|
|
|
*/
|
|
|
|
static __inline void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_unreference_amap(struct vm_map_entry *entry, int flags)
|
2000-11-25 09:27:59 +03:00
|
|
|
{
|
2003-10-02 03:08:32 +04:00
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff,
|
2003-10-02 03:08:32 +04:00
|
|
|
(entry->end - entry->start) >> PAGE_SHIFT, flags);
|
2000-11-25 09:27:59 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
|
|
|
* uvm_map_init: init mapping system at boot time. note that we allocate
|
2001-06-02 22:09:08 +04:00
|
|
|
* and init the static pool of struct vm_map_entry *'s for the kernel here.
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_init(void)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
1998-03-09 03:58:55 +03:00
|
|
|
static struct vm_map_entry kernel_map_entry[MAX_KMAPENT];
|
1998-02-05 09:25:08 +03:00
|
|
|
#if defined(UVMHIST)
|
1998-03-09 03:58:55 +03:00
|
|
|
static struct uvm_history_ent maphistbuf[100];
|
|
|
|
static struct uvm_history_ent pdhistbuf[100];
|
1998-02-05 09:25:08 +03:00
|
|
|
#endif
|
1998-03-09 03:58:55 +03:00
|
|
|
int lcv;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* first, init logging system.
|
|
|
|
*/
|
|
|
|
|
|
|
|
UVMHIST_FUNC("uvm_map_init");
|
|
|
|
UVMHIST_INIT_STATIC(maphist, maphistbuf);
|
|
|
|
UVMHIST_INIT_STATIC(pdhist, pdhistbuf);
|
|
|
|
UVMHIST_CALLED(maphist);
|
|
|
|
UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0);
|
2003-10-02 03:08:32 +04:00
|
|
|
UVMCNT_INIT(uvm_map_call, UVMCNT_CNT, 0,
|
1998-03-09 03:58:55 +03:00
|
|
|
"# uvm_map() successful calls", 0);
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
|
|
|
|
UVMCNT_INIT(map_ubackmerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() back umerges", 0);
|
|
|
|
UVMCNT_INIT(map_uforwmerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() forward umerges", 0);
|
|
|
|
UVMCNT_INIT(map_ubimerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() dual umerge", 0);
|
|
|
|
UVMCNT_INIT(map_unomerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() no umerge", 0);
|
|
|
|
|
|
|
|
UVMCNT_INIT(map_kbackmerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() back kmerges", 0);
|
|
|
|
UVMCNT_INIT(map_kforwmerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() forward kmerges", 0);
|
|
|
|
UVMCNT_INIT(map_kbimerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() dual kmerge", 0);
|
|
|
|
UVMCNT_INIT(map_knomerge, UVMCNT_CNT, 0,
|
|
|
|
"# uvm_map() no kmerge", 0);
|
|
|
|
|
2003-10-02 03:08:32 +04:00
|
|
|
UVMCNT_INIT(uvm_mlk_call, UVMCNT_CNT, 0, "# map lookup calls", 0);
|
|
|
|
UVMCNT_INIT(uvm_mlk_hint, UVMCNT_CNT, 0, "# map lookup hint hits", 0);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* now set up static pool of kernel map entrys ...
|
|
|
|
*/
|
|
|
|
|
|
|
|
simple_lock_init(&uvm.kentry_lock);
|
|
|
|
uvm.kentry_free = NULL;
|
|
|
|
for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) {
|
|
|
|
kernel_map_entry[lcv].next = uvm.kentry_free;
|
|
|
|
uvm.kentry_free = &kernel_map_entry[lcv];
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-08-31 04:20:26 +04:00
|
|
|
/*
|
|
|
|
* initialize the map-related pools.
|
|
|
|
*/
|
|
|
|
pool_init(&uvm_vmspace_pool, sizeof(struct vmspace),
|
2002-03-08 23:48:27 +03:00
|
|
|
0, 0, 0, "vmsppl", &pool_allocator_nointr);
|
1998-08-31 05:10:15 +04:00
|
|
|
pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry),
|
2002-03-08 23:48:27 +03:00
|
|
|
0, 0, 0, "vmmpepl", &pool_allocator_nointr);
|
2001-09-09 23:38:22 +04:00
|
|
|
pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry),
|
2002-03-08 23:48:27 +03:00
|
|
|
0, 0, 0, "vmmpekpl", NULL);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* clippers
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_clip_start: ensure that the entry begins at or after
|
|
|
|
* the starting address, if it doesn't we split the entry.
|
2001-05-25 08:06:11 +04:00
|
|
|
*
|
1998-02-05 09:25:08 +03:00
|
|
|
* => caller should use UVM_MAP_CLIP_START macro rather than calling
|
|
|
|
* this directly
|
|
|
|
* => map must be locked by caller
|
|
|
|
*/
|
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry,
|
|
|
|
vaddr_t start)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *new_entry;
|
1998-08-13 06:10:37 +04:00
|
|
|
vaddr_t new_adj;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
/* uvm_map_simplify_entry(map, entry); */ /* XXX */
|
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(map, "clip_start entry");
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* Split off the front portion. note that we must insert the new
|
|
|
|
* entry BEFORE this one, so that this entry has the specified
|
1998-02-05 09:25:08 +03:00
|
|
|
* starting address.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2002-11-30 21:28:04 +03:00
|
|
|
new_entry = uvm_mapent_alloc(map, 0);
|
1998-02-05 09:25:08 +03:00
|
|
|
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
|
2000-11-25 09:27:59 +03:00
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
new_entry->end = start;
|
1998-02-05 09:25:08 +03:00
|
|
|
new_adj = start - new_entry->start;
|
|
|
|
if (entry->object.uvm_obj)
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->offset += new_adj; /* shift start over */
|
2003-11-01 14:09:02 +03:00
|
|
|
|
|
|
|
/* Does not change order for the RB tree */
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->start = start;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
if (new_entry->aref.ar_amap) {
|
1998-03-09 03:58:55 +03:00
|
|
|
amap_splitref(&new_entry->aref, &entry->aref, new_adj);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
uvm_map_entry_link(map, entry->prev, new_entry);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-10-12 03:14:47 +04:00
|
|
|
if (UVM_ET_ISSUBMAP(entry)) {
|
|
|
|
/* ... unlikely to happen, but play it safe */
|
|
|
|
uvm_map_reference(new_entry->object.sub_map);
|
1998-02-05 09:25:08 +03:00
|
|
|
} else {
|
2001-05-25 08:06:11 +04:00
|
|
|
if (UVM_ET_ISOBJ(entry) &&
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->object.uvm_obj->pgops &&
|
|
|
|
entry->object.uvm_obj->pgops->pgo_reference)
|
|
|
|
entry->object.uvm_obj->pgops->pgo_reference(
|
|
|
|
entry->object.uvm_obj);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
2003-11-01 14:09:02 +03:00
|
|
|
|
|
|
|
uvm_tree_sanity(map, "clip_start leave");
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_clip_end: ensure that the entry ends at or before
|
|
|
|
* the ending address, if it does't we split the reference
|
2001-05-25 08:06:11 +04:00
|
|
|
*
|
1998-02-05 09:25:08 +03:00
|
|
|
* => caller should use UVM_MAP_CLIP_END macro rather than calling
|
|
|
|
* this directly
|
|
|
|
* => map must be locked by caller
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry * new_entry;
|
1998-08-13 06:10:37 +04:00
|
|
|
vaddr_t new_adj; /* #bytes we move start forward */
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(map, "clip_end entry");
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
|
|
|
* Create a new entry and insert it
|
|
|
|
* AFTER the specified entry
|
|
|
|
*/
|
|
|
|
|
2002-11-30 21:28:04 +03:00
|
|
|
new_entry = uvm_mapent_alloc(map, 0);
|
1998-02-05 09:25:08 +03:00
|
|
|
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
|
|
|
|
|
|
|
|
new_entry->start = entry->end = end;
|
|
|
|
new_adj = end - entry->start;
|
|
|
|
if (new_entry->object.uvm_obj)
|
|
|
|
new_entry->offset += new_adj;
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (entry->aref.ar_amap)
|
|
|
|
amap_splitref(&entry->aref, &new_entry->aref, new_adj);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_rb_fixup(map, entry);
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
uvm_map_entry_link(map, entry, new_entry);
|
|
|
|
|
1998-10-12 03:14:47 +04:00
|
|
|
if (UVM_ET_ISSUBMAP(entry)) {
|
|
|
|
/* ... unlikely to happen, but play it safe */
|
2003-10-02 03:08:32 +04:00
|
|
|
uvm_map_reference(new_entry->object.sub_map);
|
1998-02-05 09:25:08 +03:00
|
|
|
} else {
|
1998-03-09 03:58:55 +03:00
|
|
|
if (UVM_ET_ISOBJ(entry) &&
|
|
|
|
entry->object.uvm_obj->pgops &&
|
|
|
|
entry->object.uvm_obj->pgops->pgo_reference)
|
|
|
|
entry->object.uvm_obj->pgops->pgo_reference(
|
|
|
|
entry->object.uvm_obj);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
2003-11-01 14:09:02 +03:00
|
|
|
|
|
|
|
uvm_tree_sanity(map, "clip_end leave");
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* M A P - m a i n e n t r y p o i n t
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* uvm_map: establish a valid mapping in a map
|
|
|
|
*
|
|
|
|
* => assume startp is page aligned.
|
|
|
|
* => assume size is a multiple of PAGE_SIZE.
|
|
|
|
* => assume sys_mmap provides enough of a "hint" to have us skip
|
|
|
|
* over text/data/bss area.
|
|
|
|
* => map must be unlocked (we will lock it)
|
|
|
|
* => <uobj,uoffset> value meanings (4 cases):
|
2003-10-02 03:08:32 +04:00
|
|
|
* [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER
|
1998-02-05 09:25:08 +03:00
|
|
|
* [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER
|
|
|
|
* [3] <uobj,uoffset> == normal mapping
|
|
|
|
* [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA
|
2001-05-25 08:06:11 +04:00
|
|
|
*
|
1998-02-05 09:25:08 +03:00
|
|
|
* case [4] is for kernel mappings where we don't know the offset until
|
1998-02-24 18:58:09 +03:00
|
|
|
* we've found a virtual address. note that kernel object offsets are
|
|
|
|
* always relative to vm_map_min(kernel_map).
|
2000-09-13 19:00:15 +04:00
|
|
|
*
|
|
|
|
* => if `align' is non-zero, we try to align the virtual address to
|
|
|
|
* the specified alignment. this is only a hint; if we can't
|
|
|
|
* do it, the address will be unaligned. this is provided as
|
|
|
|
* a mechanism for large pages.
|
|
|
|
*
|
1998-02-05 09:25:08 +03:00
|
|
|
* => XXXCDC: need way to map in external amap?
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size,
|
|
|
|
struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *prev_entry, *new_entry;
|
2002-12-11 10:14:28 +03:00
|
|
|
const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ?
|
2002-11-30 21:28:04 +03:00
|
|
|
AMAP_EXTEND_NOWAIT : 0;
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_prot_t prot = UVM_PROTECTION(flags), maxprot =
|
|
|
|
UVM_MAXPROTECTION(flags);
|
|
|
|
vm_inherit_t inherit = UVM_INHERIT(flags);
|
|
|
|
int advice = UVM_ADVICE(flags);
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
int error, merged = 0, kmap = (vm_map_pmap(map) == pmap_kernel());
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvm_map");
|
|
|
|
UVMHIST_CALLED(maphist);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist, "(map=0x%x, *startp=0x%x, size=%d, flags=0x%x)",
|
|
|
|
map, *startp, size, flags);
|
|
|
|
UVMHIST_LOG(maphist, " uobj/offset 0x%x/%d", uobj, uoffset,0,0);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2001-09-21 11:57:35 +04:00
|
|
|
/*
|
|
|
|
* detect a popular device driver bug.
|
|
|
|
*/
|
|
|
|
|
2003-10-02 03:08:32 +04:00
|
|
|
KASSERT(doing_shutdown || curlwp != NULL ||
|
2003-01-21 03:03:07 +03:00
|
|
|
(map->flags & VM_MAP_INTRSAFE));
|
2001-09-21 11:57:35 +04:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
/*
|
|
|
|
* zero-sized mapping doesn't make any sense.
|
|
|
|
*/
|
|
|
|
KASSERT(size > 0);
|
|
|
|
|
|
|
|
uvm_tree_sanity(map, "map entry");
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
* check sanity of protection code
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if ((prot & maxprot) != prot) {
|
2001-05-25 08:06:11 +04:00
|
|
|
UVMHIST_LOG(maphist, "<- prot. failure: prot=0x%x, max=0x%x",
|
1998-02-05 09:25:08 +03:00
|
|
|
prot, maxprot,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return EACCES;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
* for pager_map, allocate the new entry first to avoid sleeping
|
|
|
|
* for memory while we have the map locked.
|
|
|
|
*/
|
|
|
|
|
|
|
|
new_entry = NULL;
|
|
|
|
if (map == pager_map) {
|
2002-12-11 10:14:28 +03:00
|
|
|
new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
|
2003-10-09 07:12:29 +04:00
|
|
|
if (__predict_false(new_entry == NULL))
|
2002-11-30 21:28:04 +03:00
|
|
|
return ENOMEM;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* figure out where to put new VM range
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
if (vm_map_lock_try(map) == FALSE) {
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if (flags & UVM_FLAG_TRYLOCK) {
|
|
|
|
if (new_entry) {
|
|
|
|
uvm_mapent_free(new_entry);
|
|
|
|
}
|
2001-03-15 09:10:32 +03:00
|
|
|
return EAGAIN;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_lock(map); /* could sleep here */
|
|
|
|
}
|
2001-05-25 08:06:11 +04:00
|
|
|
if ((prev_entry = uvm_map_findspace(map, *startp, size, startp,
|
2000-09-13 19:00:15 +04:00
|
|
|
uobj, uoffset, align, flags)) == NULL) {
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- uvm_map_findspace failed!",0,0,0,0);
|
|
|
|
vm_map_unlock(map);
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if (new_entry) {
|
|
|
|
uvm_mapent_free(new_entry);
|
|
|
|
}
|
2001-03-15 09:10:32 +03:00
|
|
|
return ENOMEM;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1999-05-21 03:03:23 +04:00
|
|
|
#ifdef PMAP_GROWKERNEL
|
2003-12-19 09:02:50 +03:00
|
|
|
/*
|
|
|
|
* If the kernel pmap can't map the requested space,
|
|
|
|
* then allocate more resources for it.
|
|
|
|
*/
|
|
|
|
if (map == kernel_map && uvm_maxkaddr < (*startp + size))
|
|
|
|
uvm_maxkaddr = pmap_growkernel(*startp + size);
|
1998-02-05 09:25:08 +03:00
|
|
|
#endif
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMCNT_INCR(uvm_map_call);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER
|
2001-05-25 08:06:11 +04:00
|
|
|
* [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET. in
|
|
|
|
* either case we want to zero it before storing it in the map entry
|
1998-03-09 03:58:55 +03:00
|
|
|
* (because it looks strange and confusing when debugging...)
|
2001-05-25 08:06:11 +04:00
|
|
|
*
|
|
|
|
* if uobj is not null
|
1998-03-09 03:58:55 +03:00
|
|
|
* if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping
|
|
|
|
* and we do not need to change uoffset.
|
|
|
|
* if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset
|
|
|
|
* now (based on the starting address of the map). this case is
|
|
|
|
* for kernel object mappings where we don't know the offset until
|
|
|
|
* the virtual address is found (with uvm_map_findspace). the
|
|
|
|
* offset is the distance we are from the start of the map.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (uobj == NULL) {
|
|
|
|
uoffset = 0;
|
|
|
|
} else {
|
|
|
|
if (uoffset == UVM_UNKNOWN_OFFSET) {
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
|
1998-03-09 03:58:55 +03:00
|
|
|
uoffset = *startp - vm_map_min(kernel_map);
|
|
|
|
}
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to, e.g., an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
* try and insert in map by extending previous entry, if possible.
|
1998-03-09 03:58:55 +03:00
|
|
|
* XXX: we don't try and pull back the next entry. might be useful
|
|
|
|
* for a stack, but we are currently allocating our stack in advance.
|
|
|
|
*/
|
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (flags & UVM_FLAG_NOMERGE)
|
|
|
|
goto nomerge;
|
|
|
|
|
|
|
|
if (prev_entry->end == *startp &&
|
|
|
|
prev_entry != &map->header &&
|
1998-03-09 03:58:55 +03:00
|
|
|
prev_entry->object.uvm_obj == uobj) {
|
|
|
|
|
|
|
|
if (uobj && prev_entry->offset +
|
|
|
|
(prev_entry->end - prev_entry->start) != uoffset)
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
goto forwardmerge;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
1998-10-12 03:14:47 +04:00
|
|
|
if (UVM_ET_ISSUBMAP(prev_entry))
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
goto forwardmerge;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
if (prev_entry->protection != prot ||
|
1998-03-09 03:58:55 +03:00
|
|
|
prev_entry->max_protection != maxprot)
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
goto forwardmerge;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
if (prev_entry->inheritance != inherit ||
|
|
|
|
prev_entry->advice != advice)
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
goto forwardmerge;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
1999-06-16 23:34:24 +04:00
|
|
|
/* wiring status must match (new area is unwired) */
|
1999-06-16 04:29:04 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(prev_entry))
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
goto forwardmerge;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* can't extend a shared amap. note: no need to lock amap to
|
1999-01-25 02:53:14 +03:00
|
|
|
* look at refs since we don't care about its exact value.
|
1998-03-09 03:58:55 +03:00
|
|
|
* if it is one (i.e. we have only reference) it will stay there
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (prev_entry->aref.ar_amap &&
|
1999-01-25 02:53:14 +03:00
|
|
|
amap_refs(prev_entry->aref.ar_amap) != 1) {
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
goto forwardmerge;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
|
2002-09-15 20:54:26 +04:00
|
|
|
if (prev_entry->aref.ar_amap) {
|
2003-10-02 03:08:32 +04:00
|
|
|
error = amap_extend(prev_entry, size,
|
2002-11-30 21:28:04 +03:00
|
|
|
amapwaitflag | AMAP_EXTEND_FORWARDS);
|
2002-09-15 20:54:26 +04:00
|
|
|
if (error) {
|
|
|
|
vm_map_unlock(map);
|
|
|
|
if (new_entry) {
|
|
|
|
uvm_mapent_free(new_entry);
|
|
|
|
}
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (kmap)
|
|
|
|
UVMCNT_INCR(map_kbackmerge);
|
|
|
|
else
|
|
|
|
UVMCNT_INCR(map_ubackmerge);
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist," starting back merge", 0, 0, 0, 0);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* drop our reference to uobj since we are extending a reference
|
|
|
|
* that we already have (the ref count can not drop to zero).
|
|
|
|
*/
|
2002-09-15 20:54:26 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (uobj && uobj->pgops->pgo_detach)
|
|
|
|
uobj->pgops->pgo_detach(uobj);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
prev_entry->end += size;
|
2003-11-01 22:45:13 +03:00
|
|
|
uvm_rb_fixup(map, prev_entry);
|
|
|
|
|
|
|
|
uvm_tree_sanity(map, "map backmerged");
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0);
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to, e.g., an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if (new_entry) {
|
|
|
|
uvm_mapent_free(new_entry);
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
new_entry = NULL;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to, e.g., an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
}
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. For example,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
merged++;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to, e.g., an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
forwardmerge:
|
|
|
|
if (prev_entry->next->start == (*startp + size) &&
|
|
|
|
prev_entry->next != &map->header &&
|
|
|
|
prev_entry->next->object.uvm_obj == uobj) {
|
1998-02-05 09:25:08 +03:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (uobj && prev_entry->next->offset != uoffset + size)
|
|
|
|
goto nomerge;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (UVM_ET_ISSUBMAP(prev_entry->next))
|
|
|
|
goto nomerge;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (prev_entry->next->protection != prot ||
|
|
|
|
prev_entry->next->max_protection != maxprot)
|
|
|
|
goto nomerge;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (prev_entry->next->inheritance != inherit ||
|
|
|
|
prev_entry->next->advice != advice)
|
|
|
|
goto nomerge;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
/* wiring status must match (new area is unwired) */
|
|
|
|
if (VM_MAPENT_ISWIRED(prev_entry->next))
|
|
|
|
goto nomerge;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
* can't extend a shared amap. note: no need to lock amap to
|
|
|
|
* look at refs since we don't care about its exact value.
|
2002-10-25 00:37:59 +04:00
|
|
|
* if it is one (i.e. we have the only reference) it will stay there.
|
|
|
|
*
|
|
|
|
* note that we also can't merge two amaps, so if we
|
|
|
|
* merged with the previous entry which has an amap,
|
|
|
|
* and the next entry also has an amap, we give up.
|
|
|
|
*
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
* Interesting cases:
|
|
|
|
* amap, new, amap -> give up second merge (single fwd extend)
|
|
|
|
* amap, new, none -> double forward extend (extend again here)
|
|
|
|
* none, new, amap -> double backward extend (done here)
|
|
|
|
* uobj, new, amap -> single backward extend (done here)
|
|
|
|
*
|
2002-10-25 00:37:59 +04:00
|
|
|
* XXX should we attempt to deal with someone refilling
|
|
|
|
* the deallocated region between two entries that are
|
|
|
|
* backed by the same amap (ie, arefs is 2, "prev" and
|
|
|
|
* "next" refer to it, and adding this allocation will
|
|
|
|
* close the hole, thus restoring arefs to 1 and
|
|
|
|
* deallocating the "next" vm_map_entry)? -- @@@
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (prev_entry->next->aref.ar_amap &&
|
2002-10-25 00:37:59 +04:00
|
|
|
(amap_refs(prev_entry->next->aref.ar_amap) != 1 ||
|
|
|
|
(merged && prev_entry->aref.ar_amap))) {
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
goto nomerge;
|
|
|
|
}
|
|
|
|
|
2002-10-25 00:37:59 +04:00
|
|
|
if (merged) {
|
2002-10-25 02:22:28 +04:00
|
|
|
/*
|
|
|
|
* Try to extend the amap of the previous entry to
|
|
|
|
* cover the next entry as well. If it doesn't work
|
|
|
|
* just skip on, don't actually give up, since we've
|
|
|
|
* already completed the back merge.
|
|
|
|
*/
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
if (prev_entry->aref.ar_amap) {
|
|
|
|
if (amap_extend(prev_entry,
|
|
|
|
prev_entry->next->end -
|
|
|
|
prev_entry->next->start,
|
2002-11-30 21:28:04 +03:00
|
|
|
amapwaitflag | AMAP_EXTEND_FORWARDS))
|
2003-10-09 07:12:29 +04:00
|
|
|
goto nomerge;
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to extend the amap of the *next* entry
|
|
|
|
* back to cover the new allocation *and* the
|
|
|
|
* previous entry as well (the previous merge
|
|
|
|
* didn't have an amap already otherwise we
|
|
|
|
* wouldn't be checking here for an amap). If
|
|
|
|
* it doesn't work just skip on, again, don't
|
|
|
|
* actually give up, since we've already
|
|
|
|
* completed the back merge.
|
|
|
|
*/
|
|
|
|
else if (prev_entry->next->aref.ar_amap) {
|
|
|
|
if (amap_extend(prev_entry->next,
|
|
|
|
prev_entry->end -
|
2003-10-09 06:44:54 +04:00
|
|
|
prev_entry->start,
|
2002-11-30 21:28:04 +03:00
|
|
|
amapwaitflag | AMAP_EXTEND_BACKWARDS))
|
2003-10-09 07:12:29 +04:00
|
|
|
goto nomerge;
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Pull the next entry's amap backwards to cover this
|
|
|
|
* new allocation.
|
|
|
|
*/
|
|
|
|
if (prev_entry->next->aref.ar_amap) {
|
|
|
|
error = amap_extend(prev_entry->next, size,
|
2002-11-30 21:28:04 +03:00
|
|
|
amapwaitflag | AMAP_EXTEND_BACKWARDS);
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
if (error) {
|
|
|
|
vm_map_unlock(map);
|
|
|
|
if (new_entry) {
|
|
|
|
uvm_mapent_free(new_entry);
|
|
|
|
}
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
}
|
2002-10-25 00:37:59 +04:00
|
|
|
}
|
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (merged) {
|
|
|
|
if (kmap) {
|
|
|
|
UVMCNT_DECR(map_kbackmerge);
|
|
|
|
UVMCNT_INCR(map_kbimerge);
|
|
|
|
} else {
|
2002-10-25 00:37:59 +04:00
|
|
|
UVMCNT_DECR(map_ubackmerge);
|
|
|
|
UVMCNT_INCR(map_ubimerge);
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
}
|
2002-10-25 00:37:59 +04:00
|
|
|
} else {
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (kmap)
|
|
|
|
UVMCNT_INCR(map_kforwmerge);
|
|
|
|
else
|
|
|
|
UVMCNT_INCR(map_uforwmerge);
|
|
|
|
}
|
|
|
|
UVMHIST_LOG(maphist," starting forward merge", 0, 0, 0, 0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* drop our reference to uobj since we are extending a reference
|
|
|
|
* that we already have (the ref count can not drop to zero).
|
|
|
|
* (if merged, we've already detached)
|
|
|
|
*/
|
|
|
|
if (uobj && uobj->pgops->pgo_detach && !merged)
|
|
|
|
uobj->pgops->pgo_detach(uobj);
|
|
|
|
|
|
|
|
if (merged) {
|
|
|
|
struct vm_map_entry *dead = prev_entry->next;
|
|
|
|
prev_entry->end = dead->end;
|
|
|
|
uvm_map_entry_unlink(map, dead);
|
Implement backwards extension of amaps. There are three cases to deal
with:
Case #1 -- adjust offset: The slot offset in the aref can be
decremented to cover the required size addition.
Case #2 -- move pages and adjust offset: The slot offset is not large
enough, but the amap contains enough inactive space *after* the mapped
pages to make up the difference, so active slots are slid to the "end"
of the amap, and the slot offset is, again, adjusted to cover the
required size addition. This optimizes for hitting case #1 again on
the next small extension.
Case #3 -- reallocate, move pages, and adjust offset: There is not
enough inactive space in the amap, so the arrays are reallocated, and
the active pages are copied again to the "end" of the amap, and the
slot offset is adjusted to cover the required size. This also
optimizes for hitting case #1 on the next backwards extension.
This provides the missing piece in the "forward extension of
vm_map_entries" logic, so the merge failure counters have been
removed.
Not many applications will make any use of this at this time (except
for jvms and perhaps gcc3), but a "top-down" memory allocator will use
it extensively.
2002-11-14 20:58:48 +03:00
|
|
|
if (dead->aref.ar_amap != NULL) {
|
|
|
|
prev_entry->aref = dead->aref;
|
|
|
|
dead->aref.ar_amap = NULL;
|
|
|
|
}
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
uvm_mapent_free(dead);
|
|
|
|
} else {
|
|
|
|
prev_entry->next->start -= size;
|
2003-11-01 22:45:13 +03:00
|
|
|
if (prev_entry != &map->header)
|
|
|
|
uvm_rb_fixup(map, prev_entry);
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
if (uobj)
|
|
|
|
prev_entry->next->offset = uoffset;
|
|
|
|
}
|
|
|
|
|
2003-11-01 22:45:13 +03:00
|
|
|
uvm_tree_sanity(map, "map forwardmerged");
|
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0);
|
|
|
|
if (new_entry) {
|
|
|
|
uvm_mapent_free(new_entry);
|
|
|
|
new_entry = NULL;
|
|
|
|
}
|
|
|
|
merged++;
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
nomerge:
|
|
|
|
if (!merged) {
|
|
|
|
UVMHIST_LOG(maphist," allocating new map entry", 0, 0, 0, 0);
|
|
|
|
if (kmap)
|
|
|
|
UVMCNT_INCR(map_knomerge);
|
|
|
|
else
|
|
|
|
UVMCNT_INCR(map_unomerge);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
/*
|
|
|
|
* allocate new entry and link it in.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (new_entry == NULL) {
|
2002-11-30 21:28:04 +03:00
|
|
|
new_entry = uvm_mapent_alloc(map,
|
2002-12-11 10:14:28 +03:00
|
|
|
(flags & UVM_FLAG_NOWAIT));
|
2002-11-30 21:28:04 +03:00
|
|
|
if (__predict_false(new_entry == NULL)) {
|
|
|
|
vm_map_unlock(map);
|
|
|
|
return ENOMEM;
|
|
|
|
}
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
}
|
|
|
|
new_entry->start = *startp;
|
|
|
|
new_entry->end = new_entry->start + size;
|
|
|
|
new_entry->object.uvm_obj = uobj;
|
|
|
|
new_entry->offset = uoffset;
|
|
|
|
|
|
|
|
if (uobj)
|
|
|
|
new_entry->etype = UVM_ET_OBJ;
|
|
|
|
else
|
|
|
|
new_entry->etype = 0;
|
|
|
|
|
|
|
|
if (flags & UVM_FLAG_COPYONW) {
|
|
|
|
new_entry->etype |= UVM_ET_COPYONWRITE;
|
|
|
|
if ((flags & UVM_FLAG_OVERLAY) == 0)
|
|
|
|
new_entry->etype |= UVM_ET_NEEDSCOPY;
|
|
|
|
}
|
|
|
|
|
|
|
|
new_entry->protection = prot;
|
|
|
|
new_entry->max_protection = maxprot;
|
|
|
|
new_entry->inheritance = inherit;
|
|
|
|
new_entry->wired_count = 0;
|
|
|
|
new_entry->advice = advice;
|
|
|
|
if (flags & UVM_FLAG_OVERLAY) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* to_add: for BSS we overallocate a little since we
|
|
|
|
* are likely to extend
|
|
|
|
*/
|
|
|
|
|
|
|
|
vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ?
|
|
|
|
UVM_AMAP_CHUNK << PAGE_SHIFT : 0;
|
2002-11-30 21:28:04 +03:00
|
|
|
struct vm_amap *amap = amap_alloc(size, to_add,
|
2002-12-11 10:14:28 +03:00
|
|
|
(flags & UVM_FLAG_NOWAIT) ? M_NOWAIT : M_WAITOK);
|
2002-11-30 21:28:04 +03:00
|
|
|
if (__predict_false(amap == NULL)) {
|
|
|
|
vm_map_unlock(map);
|
|
|
|
uvm_mapent_free(new_entry);
|
|
|
|
return ENOMEM;
|
|
|
|
}
|
Add an implementation of forward merging of new map entries. Most new
allocations can be merged either forwards or backwards, meaning no new
entries will be added to the list, and some can even be merged in both
directions, resulting in a surplus entry.
This code typically reduces the number of map entries in the
kernel_map by an order of magnitude or more. It also makes possible
recovery from the pathological case of "5000 processes created and
then killed", which leaves behind a large number of map entries.
The only forward merge case not covered is the instance of an amap
that has to be extended backwards (WIP). Note that this only affects
processes, not the kernel (the kernel doesn't use amaps), and that
merge opportunities like this come up *very* rarely, if at all. Eg,
after being up for eight days, I see only three failures in this
regard, and even those are most likely due to programs I'm developing
to exercise this case.
Code reviewed by thorpej, matt, christos, mrg, chuq, chuck, perry,
tls, and probably others. I'd like to thank my mother, the Hollywood
Foreign Press...
2002-10-18 17:18:42 +04:00
|
|
|
new_entry->aref.ar_pageoff = 0;
|
|
|
|
new_entry->aref.ar_amap = amap;
|
|
|
|
} else {
|
|
|
|
new_entry->aref.ar_pageoff = 0;
|
|
|
|
new_entry->aref.ar_amap = NULL;
|
|
|
|
}
|
|
|
|
uvm_map_entry_link(map, prev_entry, new_entry);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the free space hint
|
|
|
|
*/
|
|
|
|
|
|
|
|
if ((map->first_free == prev_entry) &&
|
|
|
|
(prev_entry->end >= new_entry->start))
|
|
|
|
map->first_free = new_entry;
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 22:56:09 +03:00
|
|
|
map->size += size;
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
|
|
|
|
vm_map_unlock(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_lookup_entry: find map entry at or before an address
|
|
|
|
*
|
|
|
|
* => map must at least be read-locked by caller
|
|
|
|
* => entry is returned in "entry"
|
|
|
|
* => return value is true if address is in the returned entry
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
boolean_t
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
|
|
|
|
struct vm_map_entry **entry /* OUT */)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *cur;
|
2003-11-01 14:09:02 +03:00
|
|
|
boolean_t use_tree = FALSE;
|
1998-02-05 09:25:08 +03:00
|
|
|
UVMHIST_FUNC("uvm_map_lookup_entry");
|
|
|
|
UVMHIST_CALLED(maphist);
|
|
|
|
|
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x,addr=0x%x,ent=0x%x)",
|
1998-03-09 03:58:55 +03:00
|
|
|
map, address, entry, 0);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* start looking either from the head of the
|
|
|
|
* list, or from the hint.
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
simple_lock(&map->hint_lock);
|
|
|
|
cur = map->hint;
|
|
|
|
simple_unlock(&map->hint_lock);
|
|
|
|
|
|
|
|
if (cur == &map->header)
|
|
|
|
cur = cur->next;
|
|
|
|
|
|
|
|
UVMCNT_INCR(uvm_mlk_call);
|
|
|
|
if (address >= cur->start) {
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2003-10-02 03:08:32 +04:00
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* go from hint to end of list.
|
1998-02-05 09:25:08 +03:00
|
|
|
*
|
1998-03-09 03:58:55 +03:00
|
|
|
* but first, make a quick check to see if
|
|
|
|
* we are already looking at the entry we
|
|
|
|
* want (which is usually the case).
|
|
|
|
* note also that we don't need to save the hint
|
|
|
|
* here... it is the same hint (unless we are
|
|
|
|
* at the header, in which case the hint didn't
|
|
|
|
* buy us anything anyway).
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
if (cur != &map->header && cur->end > address) {
|
1998-02-05 09:25:08 +03:00
|
|
|
UVMCNT_INCR(uvm_mlk_hint);
|
|
|
|
*entry = cur;
|
|
|
|
UVMHIST_LOG(maphist,"<- got it via hint (0x%x)",
|
1998-03-09 03:58:55 +03:00
|
|
|
cur, 0, 0, 0);
|
|
|
|
return (TRUE);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
2003-11-01 14:09:02 +03:00
|
|
|
|
|
|
|
if (map->nentries > 30)
|
|
|
|
use_tree = TRUE;
|
1998-03-09 03:58:55 +03:00
|
|
|
} else {
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2003-10-02 03:08:32 +04:00
|
|
|
/*
|
2003-11-01 14:09:02 +03:00
|
|
|
* invalid hint. use tree.
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
2003-11-01 14:09:02 +03:00
|
|
|
use_tree = TRUE;
|
|
|
|
}
|
|
|
|
|
|
|
|
uvm_tree_sanity(map, __func__);
|
|
|
|
|
|
|
|
if (use_tree) {
|
|
|
|
struct vm_map_entry *prev = &map->header;
|
|
|
|
cur = RB_ROOT(&map->rbhead);
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
/*
|
|
|
|
* Simple lookup in the tree. Happens when the hint is
|
|
|
|
* invalid, or nentries reach a threshold.
|
|
|
|
*/
|
|
|
|
while (cur) {
|
|
|
|
if (address >= cur->start) {
|
|
|
|
if (address < cur->end) {
|
|
|
|
*entry = cur;
|
|
|
|
goto got;
|
|
|
|
}
|
|
|
|
prev = cur;
|
|
|
|
cur = RB_RIGHT(cur, rb_entry);
|
|
|
|
} else
|
|
|
|
cur = RB_LEFT(cur, rb_entry);
|
|
|
|
}
|
|
|
|
*entry = prev;
|
|
|
|
goto failed;
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* search linearly
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
while (cur != &map->header) {
|
1998-02-05 09:25:08 +03:00
|
|
|
if (cur->end > address) {
|
|
|
|
if (address >= cur->start) {
|
2003-10-02 03:08:32 +04:00
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* save this lookup for future
|
|
|
|
* hints, and return
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
*entry = cur;
|
2003-11-01 14:09:02 +03:00
|
|
|
got:
|
|
|
|
SAVE_HINT(map, map->hint, *entry);
|
1998-02-05 09:25:08 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- search got it (0x%x)",
|
1998-03-09 03:58:55 +03:00
|
|
|
cur, 0, 0, 0);
|
2003-11-01 14:09:02 +03:00
|
|
|
KDASSERT((*entry)->start <= address);
|
|
|
|
KDASSERT(address < (*entry)->end);
|
1998-03-09 03:58:55 +03:00
|
|
|
return (TRUE);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cur = cur->next;
|
|
|
|
}
|
|
|
|
*entry = cur->prev;
|
2003-11-01 14:09:02 +03:00
|
|
|
failed:
|
2000-10-11 21:21:11 +04:00
|
|
|
SAVE_HINT(map, map->hint, *entry);
|
1998-02-05 09:25:08 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- failed!",0,0,0,0);
|
2003-11-02 10:58:52 +03:00
|
|
|
KDASSERT((*entry) == &map->header || (*entry)->end <= address);
|
2003-11-01 14:09:02 +03:00
|
|
|
KDASSERT((*entry)->next == &map->header ||
|
|
|
|
address < (*entry)->next->start);
|
1998-03-09 03:58:55 +03:00
|
|
|
return (FALSE);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
2003-10-02 04:02:10 +04:00
|
|
|
/*
|
|
|
|
* See if the range between start and start + length fits in the gap
|
|
|
|
* entry->next->start and entry->end. Returns 1 if fits, 0 if doesn't
|
|
|
|
* fit, and -1 address wraps around.
|
|
|
|
*/
|
|
|
|
static __inline int
|
|
|
|
uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset,
|
|
|
|
vsize_t align, int topdown, struct vm_map_entry *entry)
|
|
|
|
{
|
|
|
|
vaddr_t end;
|
|
|
|
|
|
|
|
#ifdef PMAP_PREFER
|
|
|
|
/*
|
|
|
|
* push start address forward as needed to avoid VAC alias problems.
|
|
|
|
* we only do this if a valid offset is specified.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (uoffset != UVM_UNKNOWN_OFFSET)
|
|
|
|
PMAP_PREFER(uoffset, start);
|
|
|
|
#endif
|
|
|
|
if (align != 0) {
|
|
|
|
if ((*start & (align - 1)) != 0) {
|
|
|
|
if (topdown)
|
|
|
|
*start &= ~(align - 1);
|
|
|
|
else
|
|
|
|
*start = roundup(*start, align);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* XXX Should we PMAP_PREFER() here again?
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the end of the proposed new region. Be sure we didn't
|
|
|
|
* wrap around the address; if so, we lose. Otherwise, if the
|
|
|
|
* proposed new region fits before the next entry, we win.
|
|
|
|
*/
|
|
|
|
|
|
|
|
end = *start + length;
|
|
|
|
if (end < *start)
|
|
|
|
return (-1);
|
|
|
|
|
|
|
|
if (entry->next->start >= end && *start >= entry->end)
|
|
|
|
return (1);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
|
|
|
* uvm_map_findspace: find "length" sized space in "map".
|
|
|
|
*
|
2000-09-13 19:00:15 +04:00
|
|
|
* => "hint" is a hint about where we want it, unless FINDSPACE_FIXED is
|
|
|
|
* set (in which case we insist on using "hint").
|
1998-02-05 09:25:08 +03:00
|
|
|
* => "result" is VA returned
|
|
|
|
* => uobj/uoffset are to be used to handle VAC alignment, if required
|
2000-09-13 19:00:15 +04:00
|
|
|
* => if `align' is non-zero, we attempt to align to that value.
|
1998-02-05 09:25:08 +03:00
|
|
|
* => caller must at least have read-locked map
|
|
|
|
* => returns NULL on failure, or pointer to prev. map entry if success
|
|
|
|
* => note this is a cross between the old vm_map_findspace and vm_map_find
|
|
|
|
*/
|
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length,
|
|
|
|
vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset,
|
|
|
|
vsize_t align, int flags)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2003-10-02 04:02:10 +04:00
|
|
|
struct vm_map_entry *entry;
|
2003-11-01 14:09:02 +03:00
|
|
|
struct vm_map_entry *child, *prev, *tmp;
|
2003-10-02 04:02:10 +04:00
|
|
|
vaddr_t orig_hint;
|
2003-02-21 01:16:05 +03:00
|
|
|
const int topdown = map->flags & VM_MAP_TOPDOWN;
|
1998-02-05 09:25:08 +03:00
|
|
|
UVMHIST_FUNC("uvm_map_findspace");
|
|
|
|
UVMHIST_CALLED(maphist);
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
UVMHIST_LOG(maphist, "(map=0x%x, hint=0x%x, len=%d, flags=0x%x)",
|
2003-10-02 04:02:10 +04:00
|
|
|
map, hint, length, flags);
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT((align & (align - 1)) == 0);
|
|
|
|
KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
|
2000-09-13 19:00:15 +04:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(map, "map_findspace entry");
|
|
|
|
|
2000-09-13 19:00:15 +04:00
|
|
|
/*
|
|
|
|
* remember the original hint. if we are aligning, then we
|
|
|
|
* may have to try again with no alignment constraint if
|
|
|
|
* we fail the first time.
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2000-11-25 09:27:59 +03:00
|
|
|
orig_hint = hint;
|
1998-02-05 09:25:08 +03:00
|
|
|
if (hint < map->min_offset) { /* check ranges ... */
|
2000-09-13 19:00:15 +04:00
|
|
|
if (flags & UVM_FLAG_FIXED) {
|
1998-02-05 09:25:08 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0);
|
2003-10-02 03:08:32 +04:00
|
|
|
return (NULL);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
hint = map->min_offset;
|
|
|
|
}
|
|
|
|
if (hint > map->max_offset) {
|
|
|
|
UVMHIST_LOG(maphist,"<- VA 0x%x > range [0x%x->0x%x]",
|
2003-10-02 04:02:10 +04:00
|
|
|
hint, map->min_offset, map->max_offset, 0);
|
2003-10-02 03:08:32 +04:00
|
|
|
return (NULL);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look for the first possible address; if there's already
|
|
|
|
* something at this address, we have to start after it.
|
|
|
|
*/
|
|
|
|
|
2003-02-21 01:16:05 +03:00
|
|
|
/*
|
|
|
|
* @@@: there are four, no, eight cases to consider.
|
|
|
|
*
|
|
|
|
* 0: found, fixed, bottom up -> fail
|
|
|
|
* 1: found, fixed, top down -> fail
|
2003-10-02 04:02:10 +04:00
|
|
|
* 2: found, not fixed, bottom up -> start after entry->end,
|
|
|
|
* loop up
|
|
|
|
* 3: found, not fixed, top down -> start before entry->start,
|
|
|
|
* loop down
|
|
|
|
* 4: not found, fixed, bottom up -> check entry->next->start, fail
|
|
|
|
* 5: not found, fixed, top down -> check entry->next->start, fail
|
|
|
|
* 6: not found, not fixed, bottom up -> check entry->next->start,
|
|
|
|
* loop up
|
|
|
|
* 7: not found, not fixed, top down -> check entry->next->start,
|
|
|
|
* loop down
|
2003-02-21 01:16:05 +03:00
|
|
|
*
|
|
|
|
* as you can see, it reduces to roughly five cases, and that
|
|
|
|
* adding top down mapping only adds one unique case (without
|
|
|
|
* it, there would be four cases).
|
|
|
|
*/
|
|
|
|
|
2000-09-13 19:00:15 +04:00
|
|
|
if ((flags & UVM_FLAG_FIXED) == 0 && hint == map->min_offset) {
|
2003-10-02 04:02:10 +04:00
|
|
|
entry = map->first_free;
|
1998-02-05 09:25:08 +03:00
|
|
|
} else {
|
2003-10-02 04:02:10 +04:00
|
|
|
if (uvm_map_lookup_entry(map, hint, &entry)) {
|
1998-02-05 09:25:08 +03:00
|
|
|
/* "hint" address already in use ... */
|
2000-09-13 19:00:15 +04:00
|
|
|
if (flags & UVM_FLAG_FIXED) {
|
2003-10-02 04:02:10 +04:00
|
|
|
UVMHIST_LOG(maphist, "<- fixed & VA in use",
|
1998-03-09 03:58:55 +03:00
|
|
|
0, 0, 0, 0);
|
2003-10-02 03:08:32 +04:00
|
|
|
return (NULL);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
2003-10-02 04:02:10 +04:00
|
|
|
if (topdown)
|
|
|
|
/* Start from lower gap. */
|
|
|
|
entry = entry->prev;
|
|
|
|
} else if (flags & UVM_FLAG_FIXED) {
|
|
|
|
if (entry->next->start >= hint + length &&
|
|
|
|
hint + length > hint)
|
|
|
|
goto found;
|
|
|
|
|
|
|
|
/* "hint" address is gap but too small */
|
|
|
|
UVMHIST_LOG(maphist, "<- fixed mapping failed",
|
|
|
|
0, 0, 0, 0);
|
|
|
|
return (NULL); /* only one shot at it ... */
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* See if given hint fits in this gap.
|
|
|
|
*/
|
|
|
|
switch (uvm_map_space_avail(&hint, length,
|
|
|
|
uoffset, align, topdown, entry)) {
|
|
|
|
case 1:
|
|
|
|
goto found;
|
|
|
|
case -1:
|
|
|
|
goto wraparound;
|
|
|
|
}
|
|
|
|
|
2003-11-05 18:09:09 +03:00
|
|
|
if (topdown) {
|
2003-10-02 04:02:10 +04:00
|
|
|
/*
|
|
|
|
* Still there is a chance to fit
|
|
|
|
* if hint > entry->end.
|
|
|
|
*/
|
2003-11-05 18:09:09 +03:00
|
|
|
} else {
|
|
|
|
/* Start from higer gap. */
|
|
|
|
entry = entry->next;
|
|
|
|
if (entry == &map->header)
|
|
|
|
goto notfound;
|
2003-10-02 04:02:10 +04:00
|
|
|
goto nextgap;
|
2003-11-05 18:09:09 +03:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
/*
|
|
|
|
* Note that all UVM_FLAGS_FIXED case is already handled.
|
|
|
|
*/
|
|
|
|
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
|
|
|
|
|
|
|
|
/* Try to find the space in the red-black tree */
|
|
|
|
|
|
|
|
/* Check slot before any entry */
|
|
|
|
hint = topdown ? entry->next->start - length : entry->end;
|
|
|
|
switch (uvm_map_space_avail(&hint, length, uoffset, align,
|
|
|
|
topdown, entry)) {
|
|
|
|
case 1:
|
|
|
|
goto found;
|
|
|
|
case -1:
|
|
|
|
goto wraparound;
|
|
|
|
}
|
|
|
|
|
|
|
|
nextgap:
|
2003-11-05 18:09:09 +03:00
|
|
|
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
|
2003-11-01 14:09:02 +03:00
|
|
|
/* If there is not enough space in the whole tree, we fail */
|
|
|
|
tmp = RB_ROOT(&map->rbhead);
|
|
|
|
if (tmp == NULL || tmp->space < length)
|
|
|
|
goto notfound;
|
|
|
|
|
|
|
|
prev = NULL; /* previous candidate */
|
|
|
|
|
|
|
|
/* Find an entry close to hint that has enough space */
|
|
|
|
for (; tmp;) {
|
|
|
|
KASSERT(tmp->next->start == tmp->end + tmp->ownspace);
|
|
|
|
if (topdown) {
|
|
|
|
if (tmp->next->start < hint + length &&
|
|
|
|
(prev == NULL || tmp->end > prev->end)) {
|
|
|
|
if (tmp->ownspace >= length)
|
|
|
|
prev = tmp;
|
|
|
|
else if ((child = RB_LEFT(tmp, rb_entry))
|
|
|
|
!= NULL && child->space >= length)
|
|
|
|
prev = tmp;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (tmp->end >= hint &&
|
|
|
|
(prev == NULL || tmp->end < prev->end)) {
|
|
|
|
if (tmp->ownspace >= length)
|
|
|
|
prev = tmp;
|
|
|
|
else if ((child = RB_RIGHT(tmp, rb_entry))
|
|
|
|
!= NULL && child->space >= length)
|
|
|
|
prev = tmp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (tmp->next->start < hint + length)
|
|
|
|
child = RB_RIGHT(tmp, rb_entry);
|
|
|
|
else if (tmp->end > hint)
|
|
|
|
child = RB_LEFT(tmp, rb_entry);
|
|
|
|
else {
|
|
|
|
if (tmp->ownspace >= length)
|
|
|
|
break;
|
|
|
|
if (topdown)
|
|
|
|
child = RB_LEFT(tmp, rb_entry);
|
|
|
|
else
|
|
|
|
child = RB_RIGHT(tmp, rb_entry);
|
|
|
|
}
|
|
|
|
if (child == NULL || child->space < length)
|
|
|
|
break;
|
|
|
|
tmp = child;
|
|
|
|
}
|
|
|
|
|
2003-11-05 18:09:09 +03:00
|
|
|
if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) {
|
2003-11-01 14:09:02 +03:00
|
|
|
/*
|
|
|
|
* Check if the entry that we found satifies the
|
|
|
|
* space requirement
|
|
|
|
*/
|
2003-11-05 18:09:09 +03:00
|
|
|
if (topdown) {
|
2003-11-05 18:34:50 +03:00
|
|
|
if (hint > tmp->next->start - length)
|
|
|
|
hint = tmp->next->start - length;
|
2003-11-05 18:09:09 +03:00
|
|
|
} else {
|
2003-11-05 18:34:50 +03:00
|
|
|
if (hint < tmp->end)
|
|
|
|
hint = tmp->end;
|
2003-11-05 18:09:09 +03:00
|
|
|
}
|
|
|
|
switch (uvm_map_space_avail(&hint, length, uoffset, align,
|
|
|
|
topdown, tmp)) {
|
|
|
|
case 1:
|
2003-11-01 14:09:02 +03:00
|
|
|
entry = tmp;
|
|
|
|
goto found;
|
2003-11-05 18:09:09 +03:00
|
|
|
case -1:
|
|
|
|
goto wraparound;
|
2003-11-01 14:09:02 +03:00
|
|
|
}
|
|
|
|
if (tmp->ownspace >= length)
|
|
|
|
goto listsearch;
|
|
|
|
}
|
|
|
|
if (prev == NULL)
|
|
|
|
goto notfound;
|
|
|
|
|
2003-11-05 18:09:09 +03:00
|
|
|
if (topdown) {
|
2003-11-06 15:45:26 +03:00
|
|
|
KASSERT(orig_hint >= prev->next->start - length ||
|
2003-11-05 18:09:09 +03:00
|
|
|
prev->next->start - length > prev->next->start);
|
|
|
|
hint = prev->next->start - length;
|
|
|
|
} else {
|
2003-11-06 15:45:26 +03:00
|
|
|
KASSERT(orig_hint <= prev->end);
|
2003-11-05 18:09:09 +03:00
|
|
|
hint = prev->end;
|
|
|
|
}
|
|
|
|
switch (uvm_map_space_avail(&hint, length, uoffset, align,
|
|
|
|
topdown, prev)) {
|
|
|
|
case 1:
|
2003-11-01 14:09:02 +03:00
|
|
|
entry = prev;
|
|
|
|
goto found;
|
2003-11-05 18:09:09 +03:00
|
|
|
case -1:
|
|
|
|
goto wraparound;
|
2003-11-01 14:09:02 +03:00
|
|
|
}
|
|
|
|
if (prev->ownspace >= length)
|
|
|
|
goto listsearch;
|
|
|
|
|
|
|
|
if (topdown)
|
|
|
|
tmp = RB_LEFT(prev, rb_entry);
|
|
|
|
else
|
|
|
|
tmp = RB_RIGHT(prev, rb_entry);
|
|
|
|
for (;;) {
|
|
|
|
KASSERT(tmp && tmp->space >= length);
|
|
|
|
if (topdown)
|
|
|
|
child = RB_RIGHT(tmp, rb_entry);
|
|
|
|
else
|
|
|
|
child = RB_LEFT(tmp, rb_entry);
|
|
|
|
if (child && child->space >= length) {
|
|
|
|
tmp = child;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (tmp->ownspace >= length)
|
|
|
|
break;
|
|
|
|
if (topdown)
|
|
|
|
tmp = RB_LEFT(tmp, rb_entry);
|
|
|
|
else
|
|
|
|
tmp = RB_RIGHT(tmp, rb_entry);
|
|
|
|
}
|
|
|
|
|
2003-11-05 18:09:09 +03:00
|
|
|
if (topdown) {
|
2003-11-06 15:45:26 +03:00
|
|
|
KASSERT(orig_hint >= tmp->next->start - length ||
|
2003-11-05 18:09:09 +03:00
|
|
|
tmp->next->start - length > tmp->next->start);
|
|
|
|
hint = tmp->next->start - length;
|
|
|
|
} else {
|
2003-11-06 15:45:26 +03:00
|
|
|
KASSERT(orig_hint <= tmp->end);
|
2003-11-05 18:09:09 +03:00
|
|
|
hint = tmp->end;
|
|
|
|
}
|
2003-11-01 14:09:02 +03:00
|
|
|
switch (uvm_map_space_avail(&hint, length, uoffset, align,
|
|
|
|
topdown, tmp)) {
|
|
|
|
case 1:
|
|
|
|
entry = tmp;
|
|
|
|
goto found;
|
2003-11-05 18:09:09 +03:00
|
|
|
case -1:
|
|
|
|
goto wraparound;
|
2003-11-01 14:09:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The tree fails to find an entry because of offset or alignment
|
|
|
|
* restrictions. Search the list instead.
|
|
|
|
*/
|
|
|
|
listsearch:
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
|
|
|
* Look through the rest of the map, trying to fit a new region in
|
|
|
|
* the gap between existing regions, or after the very last region.
|
2003-10-02 04:02:10 +04:00
|
|
|
* note: entry->end = base VA of current gap,
|
|
|
|
* entry->next->start = VA of end of current gap
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
2003-10-02 04:02:10 +04:00
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
/* Update hint for current gap. */
|
|
|
|
hint = topdown ? entry->next->start - length : entry->end;
|
|
|
|
|
|
|
|
/* See if it fits. */
|
|
|
|
switch (uvm_map_space_avail(&hint, length, uoffset, align,
|
|
|
|
topdown, entry)) {
|
|
|
|
case 1:
|
|
|
|
goto found;
|
|
|
|
case -1:
|
|
|
|
goto wraparound;
|
|
|
|
}
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2003-10-02 04:02:10 +04:00
|
|
|
/* Advance to next/previous gap */
|
|
|
|
if (topdown) {
|
|
|
|
if (entry == &map->header) {
|
|
|
|
UVMHIST_LOG(maphist, "<- failed (off start)",
|
|
|
|
0,0,0,0);
|
|
|
|
goto notfound;
|
2003-03-02 05:55:03 +03:00
|
|
|
}
|
2003-10-02 04:02:10 +04:00
|
|
|
entry = entry->prev;
|
|
|
|
} else {
|
|
|
|
entry = entry->next;
|
|
|
|
if (entry == &map->header) {
|
|
|
|
UVMHIST_LOG(maphist, "<- failed (off end)",
|
2000-09-13 19:00:15 +04:00
|
|
|
0,0,0,0);
|
2003-10-02 04:02:10 +04:00
|
|
|
goto notfound;
|
2000-09-13 19:00:15 +04:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
}
|
2003-10-02 04:02:10 +04:00
|
|
|
|
|
|
|
found:
|
2000-10-11 21:21:11 +04:00
|
|
|
SAVE_HINT(map, map->hint, entry);
|
1998-02-05 09:25:08 +03:00
|
|
|
*result = hint;
|
|
|
|
UVMHIST_LOG(maphist,"<- got it! (result=0x%x)", hint, 0,0,0);
|
2003-11-05 18:09:09 +03:00
|
|
|
KASSERT( topdown || hint >= orig_hint);
|
|
|
|
KASSERT(!topdown || hint <= orig_hint);
|
2003-11-01 14:09:02 +03:00
|
|
|
KASSERT(entry->end <= hint);
|
|
|
|
KASSERT(hint + length <= entry->next->start);
|
1998-02-05 09:25:08 +03:00
|
|
|
return (entry);
|
2003-10-02 04:02:10 +04:00
|
|
|
|
|
|
|
wraparound:
|
|
|
|
UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0);
|
|
|
|
|
|
|
|
notfound:
|
|
|
|
if (align != 0) {
|
|
|
|
UVMHIST_LOG(maphist, "calling recursively, no align",
|
|
|
|
0,0,0,0);
|
|
|
|
return (uvm_map_findspace(map, orig_hint,
|
|
|
|
length, result, uobj, uoffset, 0, flags));
|
|
|
|
}
|
|
|
|
return (NULL);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* U N M A P - m a i n h e l p e r f u n c t i o n s
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop")
|
|
|
|
*
|
2001-05-25 08:06:11 +04:00
|
|
|
* => caller must check alignment and size
|
1998-02-05 09:25:08 +03:00
|
|
|
* => map must be locked by caller
|
|
|
|
* => we return a list of map entries that we've removed from the map
|
|
|
|
* in "entry_list"
|
|
|
|
*/
|
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
|
|
|
|
struct vm_map_entry **entry_list /* OUT */)
|
1998-03-09 03:58:55 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry, *first_entry, *next;
|
1998-08-13 06:10:37 +04:00
|
|
|
vaddr_t len;
|
2001-06-02 22:09:08 +04:00
|
|
|
UVMHIST_FUNC("uvm_unmap_remove"); UVMHIST_CALLED(maphist);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x, start=0x%x, end=0x%x)",
|
|
|
|
map, start, end, 0);
|
|
|
|
VM_MAP_RANGE_CHECK(map, start, end);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(map, "unmap_remove entry");
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* find first entry
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (uvm_map_lookup_entry(map, start, &first_entry) == TRUE) {
|
1998-10-12 03:14:47 +04:00
|
|
|
/* clip and go... */
|
1998-03-09 03:58:55 +03:00
|
|
|
entry = first_entry;
|
|
|
|
UVM_MAP_CLIP_START(map, entry, start);
|
|
|
|
/* critical! prevents stale hint */
|
2000-10-11 21:21:11 +04:00
|
|
|
SAVE_HINT(map, entry, entry->prev);
|
1998-03-09 03:58:55 +03:00
|
|
|
} else {
|
|
|
|
entry = first_entry->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Save the free space hint
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (map->first_free->start >= start)
|
|
|
|
map->first_free = entry->prev;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* note: we now re-use first_entry for a different task. we remove
|
|
|
|
* a number of map entries from the map and save them in a linked
|
|
|
|
* list headed by "first_entry". once we remove them from the map
|
|
|
|
* the caller should unlock the map and drop the references to the
|
|
|
|
* backing objects [c.f. uvm_unmap_detach]. the object is to
|
2001-07-22 17:33:58 +04:00
|
|
|
* separate unmapping from reference dropping. why?
|
1998-03-09 03:58:55 +03:00
|
|
|
* [1] the map has to be locked for unmapping
|
|
|
|
* [2] the map need not be locked for reference dropping
|
|
|
|
* [3] dropping references may trigger pager I/O, and if we hit
|
|
|
|
* a pager that does synchronous I/O we may have to wait for it.
|
|
|
|
* [4] we would like all waiting for I/O to occur with maps unlocked
|
2001-05-25 08:06:11 +04:00
|
|
|
* so that we don't block other threads.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
first_entry = NULL;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
*entry_list = NULL;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* break up the area into map entry sized regions and unmap. note
|
1998-03-09 03:58:55 +03:00
|
|
|
* that all mappings have to be removed before we can even consider
|
|
|
|
* dropping references to amaps or VM objects (otherwise we could end
|
|
|
|
* up with a mapping to a page on the free list which would be very bad)
|
|
|
|
*/
|
|
|
|
|
|
|
|
while ((entry != &map->header) && (entry->start < end)) {
|
2001-05-25 08:06:11 +04:00
|
|
|
UVM_MAP_CLIP_END(map, entry, end);
|
1998-03-09 03:58:55 +03:00
|
|
|
next = entry->next;
|
|
|
|
len = entry->end - entry->start;
|
2000-09-13 19:00:15 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* unwire before removing addresses from the pmap; otherwise
|
|
|
|
* unwiring will put the entries back into the pmap (XXX).
|
|
|
|
*/
|
|
|
|
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(entry)) {
|
1998-03-09 03:58:55 +03:00
|
|
|
uvm_map_entry_unwire(map, entry);
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
}
|
|
|
|
if ((map->flags & VM_MAP_PAGEABLE) == 0) {
|
1998-03-09 03:58:55 +03:00
|
|
|
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
/*
|
|
|
|
* if the map is non-pageable, any pages mapped there
|
|
|
|
* must be wired and entered with pmap_kenter_pa(),
|
|
|
|
* and we should free any such pages immediately.
|
|
|
|
* this is mostly used for kmem_map and mb_map.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
uvm_km_pgremove_intrsafe(entry->start, entry->end);
|
|
|
|
pmap_kremove(entry->start, len);
|
|
|
|
} else if (UVM_ET_ISOBJ(entry) &&
|
|
|
|
UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT(vm_map_pmap(map) == pmap_kernel());
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* note: kernel object mappings are currently used in
|
|
|
|
* two ways:
|
|
|
|
* [1] "normal" mappings of pages in the kernel object
|
|
|
|
* [2] uvm_km_valloc'd allocations in which we
|
|
|
|
* pmap_enter in some non-kernel-object page
|
|
|
|
* (e.g. vmapbuf).
|
|
|
|
*
|
|
|
|
* for case [1], we need to remove the mapping from
|
|
|
|
* the pmap and then remove the page from the kernel
|
|
|
|
* object (because, once pages in a kernel object are
|
|
|
|
* unmapped they are no longer needed, unlike, say,
|
|
|
|
* a vnode where you might want the data to persist
|
|
|
|
* until flushed out of a queue).
|
|
|
|
*
|
|
|
|
* for case [2], we need to remove the mapping from
|
|
|
|
* the pmap. there shouldn't be any pages at the
|
|
|
|
* specified offset in the kernel object [but it
|
|
|
|
* doesn't hurt to call uvm_km_pgremove just to be
|
|
|
|
* safe?]
|
|
|
|
*
|
2001-05-25 08:06:11 +04:00
|
|
|
* uvm_km_pgremove currently does the following:
|
|
|
|
* for pages in the kernel object in range:
|
1999-05-26 00:30:08 +04:00
|
|
|
* - drops the swap slot
|
1998-03-09 03:58:55 +03:00
|
|
|
* - uvm_pagefree the page
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
1999-05-26 00:30:08 +04:00
|
|
|
* remove mappings from pmap and drop the pages
|
|
|
|
* from the object. offsets are always relative
|
|
|
|
* to vm_map_min(kernel_map).
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
pmap_remove(pmap_kernel(), entry->start,
|
|
|
|
entry->start + len);
|
|
|
|
uvm_km_pgremove(entry->object.uvm_obj,
|
|
|
|
entry->start - vm_map_min(kernel_map),
|
|
|
|
entry->end - vm_map_min(kernel_map));
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* null out kernel_object reference, we've just
|
|
|
|
* dropped it
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->etype &= ~UVM_ET_OBJ;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
entry->object.uvm_obj = NULL;
|
|
|
|
} else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) {
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-10-12 03:14:47 +04:00
|
|
|
/*
|
2003-10-02 03:08:32 +04:00
|
|
|
* remove mappings the standard way.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-10-12 03:14:47 +04:00
|
|
|
pmap_remove(map->pmap, entry->start, entry->end);
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* remove entry from map and put it on our list of entries
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
* that we've nuked. then go to next entry.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist, " removed map entry 0x%x", entry, 0, 0,0);
|
2000-10-11 21:21:11 +04:00
|
|
|
|
|
|
|
/* critical! prevents stale hint */
|
|
|
|
SAVE_HINT(map, entry, entry->prev);
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
uvm_map_entry_unlink(map, entry);
|
2003-11-01 22:56:09 +03:00
|
|
|
KASSERT(map->size >= len);
|
1998-03-09 03:58:55 +03:00
|
|
|
map->size -= len;
|
2003-02-21 01:16:05 +03:00
|
|
|
entry->prev = NULL;
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->next = first_entry;
|
|
|
|
first_entry = entry;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
entry = next;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2002-09-22 11:21:29 +04:00
|
|
|
if ((map->flags & VM_MAP_DYING) == 0) {
|
|
|
|
pmap_update(vm_map_pmap(map));
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(map, "unmap_remove leave");
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* now we've cleaned up the map and are ready for the caller to drop
|
2001-05-25 08:06:11 +04:00
|
|
|
* references to the mapped objects.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
*entry_list = first_entry;
|
|
|
|
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_unmap_detach: drop references in a chain of map entries
|
|
|
|
*
|
|
|
|
* => we will free the map entries as we traverse the list.
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_unmap_detach(struct vm_map_entry *first_entry, int flags)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *next_entry;
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvm_unmap_detach"); UVMHIST_CALLED(maphist);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
while (first_entry) {
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT(!VM_MAPENT_ISWIRED(first_entry));
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,
|
2001-05-25 08:06:11 +04:00
|
|
|
" detach 0x%x: amap=0x%x, obj=0x%x, submap?=%d",
|
|
|
|
first_entry, first_entry->aref.ar_amap,
|
1998-10-12 03:14:47 +04:00
|
|
|
first_entry->object.uvm_obj,
|
|
|
|
UVM_ET_ISSUBMAP(first_entry));
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* drop reference to amap, if we've got one
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (first_entry->aref.ar_amap)
|
2000-11-25 09:27:59 +03:00
|
|
|
uvm_map_unreference_amap(first_entry, flags);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* drop reference to our backing object, if we've got one
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
2002-09-22 11:21:29 +04:00
|
|
|
KASSERT(!UVM_ET_ISSUBMAP(first_entry));
|
|
|
|
if (UVM_ET_ISOBJ(first_entry) &&
|
|
|
|
first_entry->object.uvm_obj->pgops->pgo_detach) {
|
|
|
|
(*first_entry->object.uvm_obj->pgops->pgo_detach)
|
|
|
|
(first_entry->object.uvm_obj);
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
next_entry = first_entry->next;
|
|
|
|
uvm_mapent_free(first_entry);
|
|
|
|
first_entry = next_entry;
|
|
|
|
}
|
|
|
|
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* E X T R A C T I O N F U N C T I O N S
|
|
|
|
*/
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
1998-02-05 09:25:08 +03:00
|
|
|
* uvm_map_reserve: reserve space in a vm_map for future use.
|
|
|
|
*
|
2001-05-25 08:06:11 +04:00
|
|
|
* => we reserve space in a map by putting a dummy map entry in the
|
1998-02-05 09:25:08 +03:00
|
|
|
* map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE)
|
|
|
|
* => map should be unlocked (we will write lock it)
|
|
|
|
* => we return true if we were able to reserve space
|
|
|
|
* => XXXCDC: should be inline?
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_reserve(struct vm_map *map, vsize_t size,
|
|
|
|
vaddr_t offset /* hint for pmap_prefer */,
|
|
|
|
vsize_t align /* alignment hint */,
|
|
|
|
vaddr_t *raddr /* IN:hint, OUT: reserved VA */)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-05-25 08:06:11 +04:00
|
|
|
UVMHIST_FUNC("uvm_map_reserve"); UVMHIST_CALLED(maphist);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist, "(map=0x%x, size=0x%x, offset=0x%x,addr=0x%x)",
|
2003-10-02 03:08:32 +04:00
|
|
|
map,size,offset,raddr);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
size = round_page(size);
|
|
|
|
if (*raddr < vm_map_min(map))
|
2003-10-02 03:08:32 +04:00
|
|
|
*raddr = vm_map_min(map); /* hint */
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* reserve some virtual space.
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
2000-09-13 19:00:15 +04:00
|
|
|
if (uvm_map(map, raddr, size, NULL, offset, 0,
|
1998-03-09 03:58:55 +03:00
|
|
|
UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
|
2001-03-15 09:10:32 +03:00
|
|
|
UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) != 0) {
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0);
|
|
|
|
return (FALSE);
|
2001-05-25 08:06:11 +04:00
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist, "<- done (*raddr=0x%x)", *raddr,0,0,0);
|
|
|
|
return (TRUE);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* uvm_map_replace: replace a reserved (blank) area of memory with
|
1998-02-05 09:25:08 +03:00
|
|
|
* real mappings.
|
|
|
|
*
|
2001-05-25 08:06:11 +04:00
|
|
|
* => caller must WRITE-LOCK the map
|
1998-02-05 09:25:08 +03:00
|
|
|
* => we return TRUE if replacement was a success
|
|
|
|
* => we expect the newents chain to have nnewents entrys on it and
|
|
|
|
* we expect newents->prev to point to the last entry on the list
|
|
|
|
* => note newents is allowed to be NULL
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end,
|
|
|
|
struct vm_map_entry *newents, int nnewents)
|
1998-03-09 03:58:55 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *oldent, *last;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(map, "map_replace entry");
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* first find the blank map entry at the specified address
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (!uvm_map_lookup_entry(map, start, &oldent)) {
|
2003-10-02 03:08:32 +04:00
|
|
|
return (FALSE);
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* check to make sure we have a proper blank entry
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
if (oldent->start != start || oldent->end != end ||
|
1998-03-09 03:58:55 +03:00
|
|
|
oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) {
|
|
|
|
return (FALSE);
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
#ifdef DIAGNOSTIC
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* sanity check the newents chain
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *tmpent = newents;
|
1998-03-09 03:58:55 +03:00
|
|
|
int nent = 0;
|
1998-08-13 06:10:37 +04:00
|
|
|
vaddr_t cur = start;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
while (tmpent) {
|
|
|
|
nent++;
|
|
|
|
if (tmpent->start < cur)
|
|
|
|
panic("uvm_map_replace1");
|
|
|
|
if (tmpent->start > tmpent->end || tmpent->end > end) {
|
|
|
|
printf("tmpent->start=0x%lx, tmpent->end=0x%lx, end=0x%lx\n",
|
|
|
|
tmpent->start, tmpent->end, end);
|
|
|
|
panic("uvm_map_replace2");
|
|
|
|
}
|
|
|
|
cur = tmpent->end;
|
|
|
|
if (tmpent->next) {
|
|
|
|
if (tmpent->next->prev != tmpent)
|
|
|
|
panic("uvm_map_replace3");
|
|
|
|
} else {
|
|
|
|
if (newents->prev != tmpent)
|
|
|
|
panic("uvm_map_replace4");
|
|
|
|
}
|
|
|
|
tmpent = tmpent->next;
|
|
|
|
}
|
|
|
|
if (nent != nnewents)
|
|
|
|
panic("uvm_map_replace5");
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
#endif
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* map entry is a valid blank! replace it. (this does all the
|
|
|
|
* work of map entry link/unlink...).
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (newents) {
|
2001-06-02 22:09:08 +04:00
|
|
|
last = newents->prev;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* critical: flush stale hints out of map */
|
2000-10-11 21:21:11 +04:00
|
|
|
SAVE_HINT(map, map->hint, newents);
|
1998-03-09 03:58:55 +03:00
|
|
|
if (map->first_free == oldent)
|
|
|
|
map->first_free = last;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
last->next = oldent->next;
|
|
|
|
last->next->prev = last;
|
2003-11-01 14:09:02 +03:00
|
|
|
|
|
|
|
/* Fix RB tree */
|
|
|
|
uvm_rb_remove(map, oldent);
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
newents->prev = oldent->prev;
|
|
|
|
newents->prev->next = newents;
|
|
|
|
map->nentries = map->nentries + (nnewents - 1);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
/* Fixup the RB tree */
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct vm_map_entry *tmp;
|
|
|
|
|
|
|
|
tmp = newents;
|
|
|
|
for (i = 0; i < nnewents && tmp; i++) {
|
|
|
|
uvm_rb_insert(map, tmp);
|
|
|
|
tmp = tmp->next;
|
|
|
|
}
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
} else {
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* critical: flush stale hints out of map */
|
2000-10-11 21:21:11 +04:00
|
|
|
SAVE_HINT(map, map->hint, oldent->prev);
|
1998-03-09 03:58:55 +03:00
|
|
|
if (map->first_free == oldent)
|
|
|
|
map->first_free = oldent->prev;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* NULL list of new entries: just remove the old one */
|
|
|
|
uvm_map_entry_unlink(map, oldent);
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(map, "map_replace leave");
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* now we can free the old blank entry, unlock the map and return.
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
uvm_mapent_free(oldent);
|
2003-10-02 03:08:32 +04:00
|
|
|
return (TRUE);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_extract: extract a mapping from a map and put it somewhere
|
|
|
|
* (maybe removing the old mapping)
|
|
|
|
*
|
|
|
|
* => maps should be unlocked (we will write lock them)
|
|
|
|
* => returns 0 on success, error code otherwise
|
|
|
|
* => start must be page aligned
|
|
|
|
* => len must be page sized
|
|
|
|
* => flags:
|
|
|
|
* UVM_EXTRACT_REMOVE: remove mappings from srcmap
|
|
|
|
* UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only)
|
|
|
|
* UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs
|
|
|
|
* UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
|
|
|
|
* >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<<
|
|
|
|
* >>>NOTE: QREF's must be unmapped via the QREF path, thus should only
|
|
|
|
* be used from within the kernel in a kernel level map <<<
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
|
|
|
|
struct vm_map *dstmap, vaddr_t *dstaddrp, int flags)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
1998-08-13 06:10:37 +04:00
|
|
|
vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge,
|
1998-03-09 03:58:55 +03:00
|
|
|
oldstart;
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry,
|
|
|
|
*deadentry, *oldentry;
|
1998-08-13 06:10:37 +04:00
|
|
|
vsize_t elen;
|
1998-03-09 03:58:55 +03:00
|
|
|
int nchain, error, copy_ok;
|
|
|
|
UVMHIST_FUNC("uvm_map_extract"); UVMHIST_CALLED(maphist);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"(srcmap=0x%x,start=0x%x, len=0x%x", srcmap, start,
|
|
|
|
len,0);
|
|
|
|
UVMHIST_LOG(maphist," ...,dstmap=0x%x, flags=0x%x)", dstmap,flags,0,0);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-11-01 14:09:02 +03:00
|
|
|
uvm_tree_sanity(srcmap, "map_extract src enter");
|
|
|
|
uvm_tree_sanity(dstmap, "map_extract dst enter");
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* step 0: sanity check: start must be on a page boundary, length
|
|
|
|
* must be page sized. can't ask for CONTIG/QREF if you asked for
|
|
|
|
* REMOVE.
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT((start & PAGE_MASK) == 0 && (len & PAGE_MASK) == 0);
|
|
|
|
KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 ||
|
|
|
|
(flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* step 1: reserve space in the target map for the extracted area
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2000-06-05 11:28:56 +04:00
|
|
|
dstaddr = vm_map_min(dstmap);
|
2000-09-13 19:00:15 +04:00
|
|
|
if (uvm_map_reserve(dstmap, len, start, 0, &dstaddr) == FALSE)
|
2003-10-02 03:08:32 +04:00
|
|
|
return (ENOMEM);
|
1998-03-09 03:58:55 +03:00
|
|
|
*dstaddrp = dstaddr; /* pass address back to caller */
|
|
|
|
UVMHIST_LOG(maphist, " dstaddr=0x%x", dstaddr,0,0,0);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* step 2: setup for the extraction process loop by init'ing the
|
1998-03-09 03:58:55 +03:00
|
|
|
* map entry chain, locking src map, and looking up the first useful
|
|
|
|
* entry in the map.
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
end = start + len;
|
|
|
|
newend = dstaddr + len;
|
|
|
|
chain = endchain = NULL;
|
|
|
|
nchain = 0;
|
|
|
|
vm_map_lock(srcmap);
|
|
|
|
|
|
|
|
if (uvm_map_lookup_entry(srcmap, start, &entry)) {
|
|
|
|
|
|
|
|
/* "start" is within an entry */
|
|
|
|
if (flags & UVM_EXTRACT_QREF) {
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* for quick references we don't clip the entry, so
|
|
|
|
* the entry may map space "before" the starting
|
|
|
|
* virtual address... this is the "fudge" factor
|
|
|
|
* (which can be non-zero only the first time
|
|
|
|
* through the "while" loop in step 3).
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
fudge = start - entry->start;
|
|
|
|
} else {
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* normal reference: we clip the map to fit (thus
|
|
|
|
* fudge is zero)
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVM_MAP_CLIP_START(srcmap, entry, start);
|
2000-10-11 21:21:11 +04:00
|
|
|
SAVE_HINT(srcmap, srcmap->hint, entry->prev);
|
1998-03-09 03:58:55 +03:00
|
|
|
fudge = 0;
|
|
|
|
}
|
|
|
|
} else {
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* "start" is not within an entry ... skip to next entry */
|
|
|
|
if (flags & UVM_EXTRACT_CONTIG) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad; /* definite hole here ... */
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
entry = entry->next;
|
|
|
|
fudge = 0;
|
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* save values from srcmap for step 6 */
|
|
|
|
orig_entry = entry;
|
|
|
|
orig_fudge = fudge;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* step 3: now start looping through the map entries, extracting
|
|
|
|
* as we go.
|
|
|
|
*/
|
|
|
|
|
|
|
|
while (entry->start < end && entry != &srcmap->header) {
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* if we are not doing a quick reference, clip it */
|
|
|
|
if ((flags & UVM_EXTRACT_QREF) == 0)
|
|
|
|
UVM_MAP_CLIP_END(srcmap, entry, end);
|
|
|
|
|
|
|
|
/* clear needs_copy (allow chunking) */
|
|
|
|
if (UVM_ET_ISNEEDSCOPY(entry)) {
|
|
|
|
if (fudge)
|
|
|
|
oldstart = entry->start;
|
|
|
|
else
|
|
|
|
oldstart = 0; /* XXX: gcc */
|
|
|
|
amap_copy(srcmap, entry, M_NOWAIT, TRUE, start, end);
|
|
|
|
if (UVM_ET_ISNEEDSCOPY(entry)) { /* failed? */
|
|
|
|
error = ENOMEM;
|
|
|
|
goto bad;
|
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* amap_copy could clip (during chunk)! update fudge */
|
|
|
|
if (fudge) {
|
|
|
|
fudge = fudge - (entry->start - oldstart);
|
|
|
|
orig_fudge = fudge;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* calculate the offset of this from "start" */
|
|
|
|
oldoffset = (entry->start + fudge) - start;
|
|
|
|
|
|
|
|
/* allocate a new map entry */
|
2002-11-30 21:28:04 +03:00
|
|
|
newentry = uvm_mapent_alloc(dstmap, 0);
|
1998-03-09 03:58:55 +03:00
|
|
|
if (newentry == NULL) {
|
|
|
|
error = ENOMEM;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set up new map entry */
|
|
|
|
newentry->next = NULL;
|
|
|
|
newentry->prev = endchain;
|
|
|
|
newentry->start = dstaddr + oldoffset;
|
|
|
|
newentry->end =
|
|
|
|
newentry->start + (entry->end - (entry->start + fudge));
|
1999-04-19 18:43:46 +04:00
|
|
|
if (newentry->end > newend || newentry->end < newentry->start)
|
1998-03-09 03:58:55 +03:00
|
|
|
newentry->end = newend;
|
|
|
|
newentry->object.uvm_obj = entry->object.uvm_obj;
|
|
|
|
if (newentry->object.uvm_obj) {
|
|
|
|
if (newentry->object.uvm_obj->pgops->pgo_reference)
|
|
|
|
newentry->object.uvm_obj->pgops->
|
|
|
|
pgo_reference(newentry->object.uvm_obj);
|
|
|
|
newentry->offset = entry->offset + fudge;
|
|
|
|
} else {
|
|
|
|
newentry->offset = 0;
|
|
|
|
}
|
|
|
|
newentry->etype = entry->etype;
|
2001-05-25 08:06:11 +04:00
|
|
|
newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ?
|
|
|
|
entry->max_protection : entry->protection;
|
1998-03-09 03:58:55 +03:00
|
|
|
newentry->max_protection = entry->max_protection;
|
|
|
|
newentry->inheritance = entry->inheritance;
|
|
|
|
newentry->wired_count = 0;
|
|
|
|
newentry->aref.ar_amap = entry->aref.ar_amap;
|
|
|
|
if (newentry->aref.ar_amap) {
|
1999-01-25 02:53:14 +03:00
|
|
|
newentry->aref.ar_pageoff =
|
|
|
|
entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT);
|
2000-11-25 09:27:59 +03:00
|
|
|
uvm_map_reference_amap(newentry, AMAP_SHARED |
|
1998-03-09 03:58:55 +03:00
|
|
|
((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0));
|
|
|
|
} else {
|
1999-01-25 02:53:14 +03:00
|
|
|
newentry->aref.ar_pageoff = 0;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
newentry->advice = entry->advice;
|
|
|
|
|
|
|
|
/* now link it on the chain */
|
|
|
|
nchain++;
|
|
|
|
if (endchain == NULL) {
|
|
|
|
chain = endchain = newentry;
|
|
|
|
} else {
|
|
|
|
endchain->next = newentry;
|
|
|
|
endchain = newentry;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* end of 'while' loop! */
|
2001-05-25 08:06:11 +04:00
|
|
|
if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end &&
|
1998-03-09 03:58:55 +03:00
|
|
|
(entry->next == &srcmap->header ||
|
|
|
|
entry->next->start != entry->end)) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
entry = entry->next;
|
|
|
|
fudge = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* step 4: close off chain (in format expected by uvm_map_replace)
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (chain)
|
|
|
|
chain->prev = endchain;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* step 5: attempt to lock the dest map so we can pmap_copy.
|
2001-05-25 08:06:11 +04:00
|
|
|
* note usage of copy_ok:
|
1998-03-09 03:58:55 +03:00
|
|
|
* 1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5)
|
|
|
|
* 0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7
|
|
|
|
*/
|
|
|
|
|
2000-11-25 09:27:59 +03:00
|
|
|
if (srcmap == dstmap || vm_map_lock_try(dstmap) == TRUE) {
|
1998-03-09 03:58:55 +03:00
|
|
|
copy_ok = 1;
|
|
|
|
if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
|
|
|
|
nchain)) {
|
|
|
|
if (srcmap != dstmap)
|
|
|
|
vm_map_unlock(dstmap);
|
|
|
|
error = EIO;
|
|
|
|
goto bad;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
copy_ok = 0;
|
|
|
|
/* replace defered until step 7 */
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* step 6: traverse the srcmap a second time to do the following:
|
|
|
|
* - if we got a lock on the dstmap do pmap_copy
|
|
|
|
* - if UVM_EXTRACT_REMOVE remove the entries
|
|
|
|
* we make use of orig_entry and orig_fudge (saved in step 2)
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) {
|
|
|
|
|
|
|
|
/* purge possible stale hints from srcmap */
|
|
|
|
if (flags & UVM_EXTRACT_REMOVE) {
|
2000-10-11 21:21:11 +04:00
|
|
|
SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev);
|
1998-03-09 03:58:55 +03:00
|
|
|
if (srcmap->first_free->start >= start)
|
|
|
|
srcmap->first_free = orig_entry->prev;
|
|
|
|
}
|
|
|
|
|
|
|
|
entry = orig_entry;
|
|
|
|
fudge = orig_fudge;
|
|
|
|
deadentry = NULL; /* for UVM_EXTRACT_REMOVE */
|
|
|
|
|
|
|
|
while (entry->start < end && entry != &srcmap->header) {
|
|
|
|
if (copy_ok) {
|
2000-05-19 21:43:55 +04:00
|
|
|
oldoffset = (entry->start + fudge) - start;
|
2001-02-05 14:29:54 +03:00
|
|
|
elen = MIN(end, entry->end) -
|
2000-05-19 21:43:55 +04:00
|
|
|
(entry->start + fudge);
|
|
|
|
pmap_copy(dstmap->pmap, srcmap->pmap,
|
|
|
|
dstaddr + oldoffset, elen,
|
|
|
|
entry->start + fudge);
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
2000-05-19 21:43:55 +04:00
|
|
|
/* we advance "entry" in the following if statement */
|
1998-03-09 03:58:55 +03:00
|
|
|
if (flags & UVM_EXTRACT_REMOVE) {
|
2001-05-25 08:06:11 +04:00
|
|
|
pmap_remove(srcmap->pmap, entry->start,
|
1998-05-22 06:01:54 +04:00
|
|
|
entry->end);
|
2003-10-02 03:08:32 +04:00
|
|
|
oldentry = entry; /* save entry */
|
|
|
|
entry = entry->next; /* advance */
|
1998-05-22 06:01:54 +04:00
|
|
|
uvm_map_entry_unlink(srcmap, oldentry);
|
|
|
|
/* add to dead list */
|
|
|
|
oldentry->next = deadentry;
|
|
|
|
deadentry = oldentry;
|
2003-10-02 03:08:32 +04:00
|
|
|
} else {
|
|
|
|
entry = entry->next; /* advance */
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* end of 'while' loop */
|
|
|
|
fudge = 0;
|
|
|
|
}
|
2001-09-11 01:19:08 +04:00
|
|
|
pmap_update(srcmap->pmap);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* unlock dstmap. we will dispose of deadentry in
|
|
|
|
* step 7 if needed
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (copy_ok && srcmap != dstmap)
|
|
|
|
vm_map_unlock(dstmap);
|
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
} else {
|
|
|
|
deadentry = NULL;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* step 7: we are done with the source map, unlock. if copy_ok
|
|
|
|
* is 0 then we have not replaced the dummy mapping in dstmap yet
|
|
|
|
* and we need to do so now.
|
|
|
|
*/
|
|
|
|
|
|
|
|
vm_map_unlock(srcmap);
|
|
|
|
if ((flags & UVM_EXTRACT_REMOVE) && deadentry)
|
|
|
|
uvm_unmap_detach(deadentry, 0); /* dispose of old entries */
|
|
|
|
|
|
|
|
/* now do the replacement if we didn't do it in step 5 */
|
|
|
|
if (copy_ok == 0) {
|
|
|
|
vm_map_lock(dstmap);
|
|
|
|
error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
|
|
|
|
nchain);
|
|
|
|
vm_map_unlock(dstmap);
|
|
|
|
|
|
|
|
if (error == FALSE) {
|
|
|
|
error = EIO;
|
|
|
|
goto bad2;
|
|
|
|
}
|
|
|
|
}
|
2003-11-01 14:09:02 +03:00
|
|
|
|
|
|
|
uvm_tree_sanity(srcmap, "map_extract src leave");
|
|
|
|
uvm_tree_sanity(dstmap, "map_extract dst leave");
|
|
|
|
|
2003-10-02 03:08:32 +04:00
|
|
|
return (0);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* bad: failure recovery
|
|
|
|
*/
|
|
|
|
bad:
|
|
|
|
vm_map_unlock(srcmap);
|
|
|
|
bad2: /* src already unlocked */
|
|
|
|
if (chain)
|
|
|
|
uvm_unmap_detach(chain,
|
|
|
|
(flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0);
|
2003-11-01 14:09:02 +03:00
|
|
|
|
|
|
|
uvm_tree_sanity(srcmap, "map_extract src err leave");
|
|
|
|
uvm_tree_sanity(dstmap, "map_extract dst err leave");
|
|
|
|
|
1998-10-12 03:14:47 +04:00
|
|
|
uvm_unmap(dstmap, dstaddr, dstaddr+len); /* ??? */
|
2003-10-02 03:08:32 +04:00
|
|
|
return (error);
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/* end of extraction functions */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_submap: punch down part of a map into a submap
|
|
|
|
*
|
|
|
|
* => only the kernel_map is allowed to be submapped
|
|
|
|
* => the purpose of submapping is to break up the locking granularity
|
|
|
|
* of a larger map
|
|
|
|
* => the range specified must have been mapped previously with a uvm_map()
|
|
|
|
* call [with uobj==NULL] to create a blank map entry in the main map.
|
|
|
|
* [And it had better still be blank!]
|
|
|
|
* => maps which contain submaps should never be copied or forked.
|
2001-05-25 08:06:11 +04:00
|
|
|
* => to remove a submap, use uvm_unmap() on the main map
|
1998-03-09 03:58:55 +03:00
|
|
|
* and then uvm_map_deallocate() the submap.
|
|
|
|
* => main map must be unlocked.
|
|
|
|
* => submap must have been init'd and have a zero reference count.
|
|
|
|
* [need not be locked as we don't actually reference it]
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
|
|
|
|
struct vm_map *submap)
|
1998-03-09 03:58:55 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry;
|
2001-03-15 09:10:32 +03:00
|
|
|
int error;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
vm_map_lock(map);
|
|
|
|
VM_MAP_RANGE_CHECK(map, start, end);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (uvm_map_lookup_entry(map, start, &entry)) {
|
|
|
|
UVM_MAP_CLIP_START(map, entry, start);
|
|
|
|
UVM_MAP_CLIP_END(map, entry, end); /* to be safe */
|
2001-03-15 09:10:32 +03:00
|
|
|
} else {
|
1998-03-09 03:58:55 +03:00
|
|
|
entry = NULL;
|
|
|
|
}
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
if (entry != NULL &&
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->start == start && entry->end == end &&
|
|
|
|
entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
|
|
|
|
!UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
|
1998-10-12 03:14:47 +04:00
|
|
|
entry->etype |= UVM_ET_SUBMAP;
|
1998-03-09 03:58:55 +03:00
|
|
|
entry->object.sub_map = submap;
|
|
|
|
entry->offset = 0;
|
|
|
|
uvm_map_reference(submap);
|
2001-03-15 09:10:32 +03:00
|
|
|
error = 0;
|
1998-03-09 03:58:55 +03:00
|
|
|
} else {
|
2001-03-15 09:10:32 +03:00
|
|
|
error = EINVAL;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
vm_map_unlock(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return error;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_protect: change map protection
|
|
|
|
*
|
|
|
|
* => set_max means set max_protection.
|
|
|
|
* => map must be unlocked.
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
|
|
|
|
2003-10-02 03:08:32 +04:00
|
|
|
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
|
1999-03-28 23:53:49 +04:00
|
|
|
~VM_PROT_WRITE : VM_PROT_ALL)
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
|
|
|
|
vm_prot_t new_prot, boolean_t set_max)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *current, *entry;
|
2001-03-15 09:10:32 +03:00
|
|
|
int error = 0;
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvm_map_protect"); UVMHIST_CALLED(maphist);
|
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_prot=0x%x)",
|
2000-11-25 09:27:59 +03:00
|
|
|
map, start, end, new_prot);
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_lock(map);
|
|
|
|
VM_MAP_RANGE_CHECK(map, start, end);
|
|
|
|
if (uvm_map_lookup_entry(map, start, &entry)) {
|
|
|
|
UVM_MAP_CLIP_START(map, entry, start);
|
|
|
|
} else {
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* make a first pass to check for protection violations.
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
current = entry;
|
|
|
|
while ((current != &map->header) && (current->start < end)) {
|
1999-07-18 04:41:56 +04:00
|
|
|
if (UVM_ET_ISSUBMAP(current)) {
|
2001-03-15 09:10:32 +03:00
|
|
|
error = EINVAL;
|
1999-07-18 04:41:56 +04:00
|
|
|
goto out;
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
if ((new_prot & current->max_protection) != new_prot) {
|
2001-03-15 09:10:32 +03:00
|
|
|
error = EACCES;
|
1999-07-18 04:41:56 +04:00
|
|
|
goto out;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2001-10-30 22:05:26 +03:00
|
|
|
/*
|
|
|
|
* Don't allow VM_PROT_EXECUTE to be set on entries that
|
|
|
|
* point to vnodes that are associated with a NOEXEC file
|
|
|
|
* system.
|
|
|
|
*/
|
|
|
|
if (UVM_ET_ISOBJ(current) &&
|
|
|
|
UVM_OBJ_IS_VNODE(current->object.uvm_obj)) {
|
|
|
|
struct vnode *vp =
|
|
|
|
(struct vnode *) current->object.uvm_obj;
|
|
|
|
|
|
|
|
if ((new_prot & VM_PROT_EXECUTE) != 0 &&
|
|
|
|
(vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
|
|
|
|
error = EACCES;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
1999-07-18 04:41:56 +04:00
|
|
|
current = current->next;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* go back and fix up protections (no need to clip this time). */
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
current = entry;
|
|
|
|
while ((current != &map->header) && (current->start < end)) {
|
|
|
|
vm_prot_t old_prot;
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVM_MAP_CLIP_END(map, current, end);
|
|
|
|
old_prot = current->protection;
|
|
|
|
if (set_max)
|
|
|
|
current->protection =
|
|
|
|
(current->max_protection = new_prot) & old_prot;
|
|
|
|
else
|
|
|
|
current->protection = new_prot;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* update physical map if necessary. worry about copy-on-write
|
1998-03-09 03:58:55 +03:00
|
|
|
* here -- CHECK THIS XXX
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (current->protection != old_prot) {
|
1998-10-12 03:14:47 +04:00
|
|
|
/* update pmap! */
|
|
|
|
pmap_protect(map->pmap, current->start, current->end,
|
|
|
|
current->protection & MASK(entry));
|
2001-10-30 02:06:03 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If this entry points at a vnode, and the
|
|
|
|
* protection includes VM_PROT_EXECUTE, mark
|
2001-10-30 21:52:17 +03:00
|
|
|
* the vnode as VEXECMAP.
|
2001-10-30 02:06:03 +03:00
|
|
|
*/
|
|
|
|
if (UVM_ET_ISOBJ(current)) {
|
|
|
|
struct uvm_object *uobj =
|
|
|
|
current->object.uvm_obj;
|
|
|
|
|
|
|
|
if (UVM_OBJ_IS_VNODE(uobj) &&
|
|
|
|
(current->protection & VM_PROT_EXECUTE))
|
2001-10-30 18:32:01 +03:00
|
|
|
vn_markexec((struct vnode *) uobj);
|
2001-10-30 02:06:03 +03:00
|
|
|
}
|
1999-07-18 04:41:56 +04:00
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
|
1999-07-18 04:41:56 +04:00
|
|
|
/*
|
|
|
|
* If the map is configured to lock any future mappings,
|
|
|
|
* wire this entry now if the old protection was VM_PROT_NONE
|
|
|
|
* and the new protection is not VM_PROT_NONE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
|
|
|
|
VM_MAPENT_ISWIRED(entry) == 0 &&
|
|
|
|
old_prot == VM_PROT_NONE &&
|
|
|
|
new_prot != VM_PROT_NONE) {
|
|
|
|
if (uvm_map_pageable(map, entry->start,
|
|
|
|
entry->end, FALSE,
|
2001-03-15 09:10:32 +03:00
|
|
|
UVM_LK_ENTER|UVM_LK_EXIT) != 0) {
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-07-18 04:41:56 +04:00
|
|
|
/*
|
|
|
|
* If locking the entry fails, remember the
|
|
|
|
* error if it's the first one. Note we
|
|
|
|
* still continue setting the protection in
|
2001-03-15 09:10:32 +03:00
|
|
|
* the map, but will return the error
|
|
|
|
* condition regardless.
|
1999-07-18 04:41:56 +04:00
|
|
|
*
|
|
|
|
* XXX Ignore what the actual error is,
|
|
|
|
* XXX just call it a resource shortage
|
|
|
|
* XXX so that it doesn't get confused
|
|
|
|
* XXX what uvm_map_protect() itself would
|
|
|
|
* XXX normally return.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
error = ENOMEM;
|
1999-07-18 04:41:56 +04:00
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
current = current->next;
|
|
|
|
}
|
2001-09-11 01:19:08 +04:00
|
|
|
pmap_update(map->pmap);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-07-18 04:41:56 +04:00
|
|
|
out:
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_unlock(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
UVMHIST_LOG(maphist, "<- done, error=%d",error,0,0,0);
|
|
|
|
return error;
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
#undef MASK
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
1998-02-05 09:25:08 +03:00
|
|
|
* uvm_map_inherit: set inheritance code for range of addrs in map.
|
|
|
|
*
|
|
|
|
* => map must be unlocked
|
|
|
|
* => note that the inherit code is used during a "fork". see fork
|
|
|
|
* code for details.
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
|
|
|
|
vm_inherit_t new_inheritance)
|
1998-03-09 03:58:55 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry, *temp_entry;
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvm_map_inherit"); UVMHIST_CALLED(maphist);
|
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_inh=0x%x)",
|
|
|
|
map, start, end, new_inheritance);
|
|
|
|
|
|
|
|
switch (new_inheritance) {
|
2000-08-01 04:53:07 +04:00
|
|
|
case MAP_INHERIT_NONE:
|
|
|
|
case MAP_INHERIT_COPY:
|
|
|
|
case MAP_INHERIT_SHARE:
|
1998-03-09 03:58:55 +03:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return EINVAL;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_lock(map);
|
|
|
|
VM_MAP_RANGE_CHECK(map, start, end);
|
|
|
|
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
|
|
|
|
entry = temp_entry;
|
|
|
|
UVM_MAP_CLIP_START(map, entry, start);
|
|
|
|
} else {
|
|
|
|
entry = temp_entry->next;
|
|
|
|
}
|
|
|
|
while ((entry != &map->header) && (entry->start < end)) {
|
|
|
|
UVM_MAP_CLIP_END(map, entry, end);
|
|
|
|
entry->inheritance = new_inheritance;
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
|
|
|
vm_map_unlock(map);
|
|
|
|
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
1999-05-23 10:27:13 +04:00
|
|
|
* uvm_map_advice: set advice code for range of addrs in map.
|
|
|
|
*
|
|
|
|
* => map must be unlocked
|
|
|
|
*/
|
|
|
|
|
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
|
1999-05-23 10:27:13 +04:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry, *temp_entry;
|
1999-05-23 10:27:13 +04:00
|
|
|
UVMHIST_FUNC("uvm_map_advice"); UVMHIST_CALLED(maphist);
|
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_adv=0x%x)",
|
|
|
|
map, start, end, new_advice);
|
|
|
|
|
|
|
|
vm_map_lock(map);
|
|
|
|
VM_MAP_RANGE_CHECK(map, start, end);
|
|
|
|
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
|
|
|
|
entry = temp_entry;
|
|
|
|
UVM_MAP_CLIP_START(map, entry, start);
|
|
|
|
} else {
|
|
|
|
entry = temp_entry->next;
|
|
|
|
}
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XXXJRT: disallow holes?
|
|
|
|
*/
|
|
|
|
|
1999-05-23 10:27:13 +04:00
|
|
|
while ((entry != &map->header) && (entry->start < end)) {
|
|
|
|
UVM_MAP_CLIP_END(map, entry, end);
|
|
|
|
|
|
|
|
switch (new_advice) {
|
|
|
|
case MADV_NORMAL:
|
|
|
|
case MADV_RANDOM:
|
|
|
|
case MADV_SEQUENTIAL:
|
|
|
|
/* nothing special here */
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
1999-06-01 03:36:23 +04:00
|
|
|
vm_map_unlock(map);
|
1999-05-23 10:27:13 +04:00
|
|
|
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return EINVAL;
|
1999-05-23 10:27:13 +04:00
|
|
|
}
|
|
|
|
entry->advice = new_advice;
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
vm_map_unlock(map);
|
|
|
|
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1999-05-23 10:27:13 +04:00
|
|
|
}
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
|
|
|
* uvm_map_pageable: sets the pageability of a range in a map.
|
|
|
|
*
|
1999-06-16 23:34:24 +04:00
|
|
|
* => wires map entries. should not be used for transient page locking.
|
|
|
|
* for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()).
|
1998-02-05 09:25:08 +03:00
|
|
|
* => regions sepcified as not pageable require lock-down (wired) memory
|
|
|
|
* and page tables.
|
1999-06-18 09:13:45 +04:00
|
|
|
* => map must never be read-locked
|
|
|
|
* => if islocked is TRUE, map is already write-locked
|
|
|
|
* => we always unlock the map, since we must downgrade to a read-lock
|
|
|
|
* to call uvm_fault_wire()
|
1998-02-05 09:25:08 +03:00
|
|
|
* => XXXCDC: check this and try and clean it up.
|
|
|
|
*/
|
|
|
|
|
1998-05-09 19:05:50 +04:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
|
|
|
|
boolean_t new_pageable, int lockflags)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry, *start_entry, *failed_entry;
|
1998-03-09 03:58:55 +03:00
|
|
|
int rv;
|
1999-07-02 00:07:05 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
u_int timestamp_save;
|
|
|
|
#endif
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvm_map_pageable"); UVMHIST_CALLED(maphist);
|
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,new_pageable=0x%x)",
|
2000-11-25 09:27:59 +03:00
|
|
|
map, start, end, new_pageable);
|
|
|
|
KASSERT(map->flags & VM_MAP_PAGEABLE);
|
1999-05-26 23:16:28 +04:00
|
|
|
|
1999-07-18 01:35:49 +04:00
|
|
|
if ((lockflags & UVM_LK_ENTER) == 0)
|
1999-06-18 09:13:45 +04:00
|
|
|
vm_map_lock(map);
|
1998-03-09 03:58:55 +03:00
|
|
|
VM_MAP_RANGE_CHECK(map, start, end);
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* only one pageability change may take place at one time, since
|
|
|
|
* uvm_fault_wire assumes it will be called only once for each
|
|
|
|
* wiring/unwiring. therefore, we have to make sure we're actually
|
|
|
|
* changing the pageability for the entire region. we do so before
|
2001-05-25 08:06:11 +04:00
|
|
|
* making any changes.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
if (uvm_map_lookup_entry(map, start, &start_entry) == FALSE) {
|
1999-07-18 01:35:49 +04:00
|
|
|
if ((lockflags & UVM_LK_EXIT) == 0)
|
|
|
|
vm_map_unlock(map);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0);
|
|
|
|
return EFAULT;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
entry = start_entry;
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
2001-07-22 17:33:58 +04:00
|
|
|
* handle wiring and unwiring separately.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
|
|
|
|
1999-06-16 23:34:24 +04:00
|
|
|
if (new_pageable) { /* unwire */
|
1998-03-09 03:58:55 +03:00
|
|
|
UVM_MAP_CLIP_START(map, entry, start);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* unwiring. first ensure that the range to be unwired is
|
2001-05-25 08:06:11 +04:00
|
|
|
* really wired down and that there are no holes.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
while ((entry != &map->header) && (entry->start < end)) {
|
|
|
|
if (entry->wired_count == 0 ||
|
|
|
|
(entry->end < end &&
|
1999-06-16 04:29:04 +04:00
|
|
|
(entry->next == &map->header ||
|
|
|
|
entry->next->start > entry->end))) {
|
1999-07-18 01:35:49 +04:00
|
|
|
if ((lockflags & UVM_LK_EXIT) == 0)
|
|
|
|
vm_map_unlock(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0);
|
|
|
|
return EINVAL;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
1999-06-16 23:34:24 +04:00
|
|
|
* POSIX 1003.1b - a single munlock call unlocks a region,
|
|
|
|
* regardless of the number of mlock calls made on that
|
|
|
|
* region.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
entry = start_entry;
|
|
|
|
while ((entry != &map->header) && (entry->start < end)) {
|
|
|
|
UVM_MAP_CLIP_END(map, entry, end);
|
1999-06-16 23:34:24 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(entry))
|
1998-03-09 03:58:55 +03:00
|
|
|
uvm_map_entry_unwire(map, entry);
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
1999-07-18 01:35:49 +04:00
|
|
|
if ((lockflags & UVM_LK_EXIT) == 0)
|
|
|
|
vm_map_unlock(map);
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* wire case: in two passes [XXXCDC: ugly block of code here]
|
|
|
|
*
|
|
|
|
* 1: holding the write lock, we create any anonymous maps that need
|
|
|
|
* to be created. then we clip each map entry to the region to
|
2001-05-25 08:06:11 +04:00
|
|
|
* be wired and increment its wiring count.
|
1998-03-09 03:58:55 +03:00
|
|
|
*
|
|
|
|
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
|
1999-06-16 23:34:24 +04:00
|
|
|
* in the pages for any newly wired area (wired_count == 1).
|
1998-03-09 03:58:55 +03:00
|
|
|
*
|
|
|
|
* downgrading to a read lock for uvm_fault_wire avoids a possible
|
|
|
|
* deadlock with another thread that may have faulted on one of
|
|
|
|
* the pages to be wired (it would mark the page busy, blocking
|
|
|
|
* us, then in turn block on the map lock that we hold). because
|
|
|
|
* of problems in the recursive lock package, we cannot upgrade
|
|
|
|
* to a write lock in vm_map_lookup. thus, any actions that
|
|
|
|
* require the write lock must be done beforehand. because we
|
|
|
|
* keep the read lock on the map, the copy-on-write status of the
|
|
|
|
* entries we modify here cannot change.
|
1998-02-05 09:25:08 +03:00
|
|
|
*/
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
while ((entry != &map->header) && (entry->start < end)) {
|
1999-06-16 04:29:04 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
|
2000-11-25 09:27:59 +03:00
|
|
|
|
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* perform actions of vm_map_lookup that need the
|
|
|
|
* write lock on the map: create an anonymous map
|
|
|
|
* for a copy-on-write region, or an anonymous map
|
1998-10-12 03:14:47 +04:00
|
|
|
* for a zero-fill region. (XXXCDC: submap case
|
|
|
|
* ok?)
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-10-12 03:14:47 +04:00
|
|
|
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
|
2001-05-25 08:06:11 +04:00
|
|
|
if (UVM_ET_ISNEEDSCOPY(entry) &&
|
introduce a new UVM fault type, VM_FAULT_WIREMAX. this is different
from VM_FAULT_WIRE in that when the pages being wired are faulted in,
the simulated fault is at the maximum protection allowed for the mapping
instead of the current protection. use this in uvm_map_pageable{,_all}()
to fix the problem where writing via ptrace() to shared libraries that
are also mapped with wired mappings in another process causes a
diagnostic panic when the wired mapping is removed.
this is a really obscure problem so it deserves some more explanation.
ptrace() writing to another process ends up down in uvm_map_extract(),
which for MAP_PRIVATE mappings (such as shared libraries) will cause
the amap to be copied or created. then the amap is made shared
(ie. the AMAP_SHARED flag is set) between the kernel and the ptrace()d
process so that the kernel can modify pages in the amap and have the
ptrace()d process see the changes. then when the page being modified
is actually faulted on, the object pages (from the shared library vnode)
is copied to a new anon page and inserted into the shared amap.
to make all the processes sharing the amap actually see the new anon
page instead of the vnode page that was there before, we need to
invalidate all the pmap-level mappings of the vnode page in the pmaps
of the processes sharing the amap, but we don't have a good way of
doing this. the amap doesn't keep track of the vm_maps which map it.
so all we can do at this point is to remove all the mappings of the
page with pmap_page_protect(), but this has the unfortunate side-effect
of removing wired mappings as well. removing wired mappings with
pmap_page_protect() is a legitimate operation, it can happen when a file
with a wired mapping is truncated. so the pmap has no way of knowing
whether a request to remove a wired mapping is normal or when it's due to
this weird situation. so the pmap has to remove the weird mapping.
the process being ptrace()d goes away and life continues. then,
much later when we go to unwire or remove the wired vm_map mapping,
we discover that the pmap mapping has been removed when it should
still be there, and we panic.
so where did we go wrong? the problem is that we don't have any way
to update just the pmap mappings that need to be updated in this
scenario. we could invent a mechanism to do this, but that is much
more complicated than this change and it doesn't seem like the right
way to go in the long run either.
the real underlying problem here is that wired pmap mappings just
aren't a good concept. one of the original properties of the pmap
design was supposed to be that all the information in the pmap could
be thrown away at any time and the VM system could regenerate it all
through fault processing, but wired pmap mappings don't allow that.
a better design for UVM would not require wired pmap mappings,
and Chuck C. and I are talking about this, but it won't be done
anytime soon, so this change will do for now.
this change has the effect of causing MAP_PRIVATE mappings to be
copied to anonymous memory when they are mlock()d, so that uvm_fault()
doesn't need to copy these pages later when called from ptrace(), thus
avoiding the call to pmap_page_protect() and the panic that results
from this when the mlock()d region is unlocked or freed. note that
this change doesn't help the case where the wired mapping is MAP_SHARED.
discussed at great length with Chuck Cranor.
fixes PRs 10363, 12554, 12604, 13041, 13487, 14580 and 14853.
2002-01-01 01:34:39 +03:00
|
|
|
((entry->max_protection & VM_PROT_WRITE) ||
|
1999-06-16 03:27:47 +04:00
|
|
|
(entry->object.uvm_obj == NULL))) {
|
1998-03-09 03:58:55 +03:00
|
|
|
amap_copy(map, entry, M_WAITOK, TRUE,
|
2001-05-25 08:06:11 +04:00
|
|
|
start, end);
|
1998-03-09 03:58:55 +03:00
|
|
|
/* XXXCDC: wait OK? */
|
|
|
|
}
|
|
|
|
}
|
1999-06-16 04:29:04 +04:00
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
UVM_MAP_CLIP_START(map, entry, start);
|
|
|
|
UVM_MAP_CLIP_END(map, entry, end);
|
|
|
|
entry->wired_count++;
|
|
|
|
|
|
|
|
/*
|
2001-05-25 08:06:11 +04:00
|
|
|
* Check for holes
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
if (entry->protection == VM_PROT_NONE ||
|
|
|
|
(entry->end < end &&
|
|
|
|
(entry->next == &map->header ||
|
|
|
|
entry->next->start > entry->end))) {
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* found one. amap creation actions do not need to
|
2001-05-25 08:06:11 +04:00
|
|
|
* be undone, but the wired counts need to be restored.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
while (entry != &map->header && entry->end > start) {
|
|
|
|
entry->wired_count--;
|
|
|
|
entry = entry->prev;
|
|
|
|
}
|
1999-07-18 01:35:49 +04:00
|
|
|
if ((lockflags & UVM_LK_EXIT) == 0)
|
|
|
|
vm_map_unlock(map);
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return EINVAL;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pass 2.
|
|
|
|
*/
|
1999-06-03 01:23:08 +04:00
|
|
|
|
1999-07-02 00:07:05 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
timestamp_save = map->timestamp;
|
|
|
|
#endif
|
|
|
|
vm_map_busy(map);
|
1999-06-03 01:23:08 +04:00
|
|
|
vm_map_downgrade(map);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
rv = 0;
|
|
|
|
entry = start_entry;
|
|
|
|
while (entry != &map->header && entry->start < end) {
|
1999-06-03 01:23:08 +04:00
|
|
|
if (entry->wired_count == 1) {
|
1999-05-26 04:36:53 +04:00
|
|
|
rv = uvm_fault_wire(map, entry->start, entry->end,
|
introduce a new UVM fault type, VM_FAULT_WIREMAX. this is different
from VM_FAULT_WIRE in that when the pages being wired are faulted in,
the simulated fault is at the maximum protection allowed for the mapping
instead of the current protection. use this in uvm_map_pageable{,_all}()
to fix the problem where writing via ptrace() to shared libraries that
are also mapped with wired mappings in another process causes a
diagnostic panic when the wired mapping is removed.
this is a really obscure problem so it deserves some more explanation.
ptrace() writing to another process ends up down in uvm_map_extract(),
which for MAP_PRIVATE mappings (such as shared libraries) will cause
the amap to be copied or created. then the amap is made shared
(ie. the AMAP_SHARED flag is set) between the kernel and the ptrace()d
process so that the kernel can modify pages in the amap and have the
ptrace()d process see the changes. then when the page being modified
is actually faulted on, the object pages (from the shared library vnode)
is copied to a new anon page and inserted into the shared amap.
to make all the processes sharing the amap actually see the new anon
page instead of the vnode page that was there before, we need to
invalidate all the pmap-level mappings of the vnode page in the pmaps
of the processes sharing the amap, but we don't have a good way of
doing this. the amap doesn't keep track of the vm_maps which map it.
so all we can do at this point is to remove all the mappings of the
page with pmap_page_protect(), but this has the unfortunate side-effect
of removing wired mappings as well. removing wired mappings with
pmap_page_protect() is a legitimate operation, it can happen when a file
with a wired mapping is truncated. so the pmap has no way of knowing
whether a request to remove a wired mapping is normal or when it's due to
this weird situation. so the pmap has to remove the weird mapping.
the process being ptrace()d goes away and life continues. then,
much later when we go to unwire or remove the wired vm_map mapping,
we discover that the pmap mapping has been removed when it should
still be there, and we panic.
so where did we go wrong? the problem is that we don't have any way
to update just the pmap mappings that need to be updated in this
scenario. we could invent a mechanism to do this, but that is much
more complicated than this change and it doesn't seem like the right
way to go in the long run either.
the real underlying problem here is that wired pmap mappings just
aren't a good concept. one of the original properties of the pmap
design was supposed to be that all the information in the pmap could
be thrown away at any time and the VM system could regenerate it all
through fault processing, but wired pmap mappings don't allow that.
a better design for UVM would not require wired pmap mappings,
and Chuck C. and I are talking about this, but it won't be done
anytime soon, so this change will do for now.
this change has the effect of causing MAP_PRIVATE mappings to be
copied to anonymous memory when they are mlock()d, so that uvm_fault()
doesn't need to copy these pages later when called from ptrace(), thus
avoiding the call to pmap_page_protect() and the panic that results
from this when the mlock()d region is unlocked or freed. note that
this change doesn't help the case where the wired mapping is MAP_SHARED.
discussed at great length with Chuck Cranor.
fixes PRs 10363, 12554, 12604, 13041, 13487, 14580 and 14853.
2002-01-01 01:34:39 +03:00
|
|
|
VM_FAULT_WIREMAX, entry->max_protection);
|
1998-03-09 03:58:55 +03:00
|
|
|
if (rv) {
|
2001-03-15 09:10:32 +03:00
|
|
|
|
1999-06-03 01:23:08 +04:00
|
|
|
/*
|
|
|
|
* wiring failed. break out of the loop.
|
|
|
|
* we'll clean up the map below, once we
|
|
|
|
* have a write lock again.
|
|
|
|
*/
|
2001-03-15 09:10:32 +03:00
|
|
|
|
1999-06-03 01:23:08 +04:00
|
|
|
break;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
|
|
|
|
2003-10-02 03:08:32 +04:00
|
|
|
if (rv) { /* failed? */
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-06-03 02:40:51 +04:00
|
|
|
/*
|
|
|
|
* Get back to an exclusive (write) lock.
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-06-03 02:40:51 +04:00
|
|
|
vm_map_upgrade(map);
|
1999-07-02 00:07:05 +04:00
|
|
|
vm_map_unbusy(map);
|
|
|
|
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (timestamp_save != map->timestamp)
|
|
|
|
panic("uvm_map_pageable: stale map");
|
|
|
|
#endif
|
1999-06-03 02:40:51 +04:00
|
|
|
|
1999-06-03 01:23:08 +04:00
|
|
|
/*
|
|
|
|
* first drop the wiring count on all the entries
|
|
|
|
* which haven't actually been wired yet.
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
failed_entry = entry;
|
|
|
|
while (entry != &map->header && entry->start < end) {
|
1999-06-03 01:23:08 +04:00
|
|
|
entry->wired_count--;
|
1999-06-16 03:27:47 +04:00
|
|
|
entry = entry->next;
|
|
|
|
}
|
1999-06-03 01:23:08 +04:00
|
|
|
|
|
|
|
/*
|
1999-06-16 03:27:47 +04:00
|
|
|
* now, unwire all the entries that were successfully
|
|
|
|
* wired above.
|
1999-06-03 01:23:08 +04:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
entry = start_entry;
|
|
|
|
while (entry != failed_entry) {
|
|
|
|
entry->wired_count--;
|
1999-06-16 04:29:04 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(entry) == 0)
|
1999-06-16 03:27:47 +04:00
|
|
|
uvm_map_entry_unwire(map, entry);
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
1999-07-18 01:35:49 +04:00
|
|
|
if ((lockflags & UVM_LK_EXIT) == 0)
|
|
|
|
vm_map_unlock(map);
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist, "<- done (RV=%d)", rv,0,0,0);
|
2003-10-02 03:08:32 +04:00
|
|
|
return (rv);
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
1999-06-03 01:23:08 +04:00
|
|
|
|
1999-06-03 02:40:51 +04:00
|
|
|
/* We are holding a read lock here. */
|
1999-07-18 01:35:49 +04:00
|
|
|
if ((lockflags & UVM_LK_EXIT) == 0) {
|
|
|
|
vm_map_unbusy(map);
|
|
|
|
vm_map_unlock_read(map);
|
|
|
|
} else {
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-07-18 01:35:49 +04:00
|
|
|
/*
|
|
|
|
* Get back to an exclusive (write) lock.
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-07-18 01:35:49 +04:00
|
|
|
vm_map_upgrade(map);
|
|
|
|
vm_map_unbusy(map);
|
|
|
|
}
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
|
|
|
* uvm_map_pageable_all: special case of uvm_map_pageable - affects
|
|
|
|
* all mapped regions.
|
|
|
|
*
|
|
|
|
* => map must not be locked.
|
|
|
|
* => if no flags are specified, all regions are unwired.
|
|
|
|
* => XXXJRT: has some of the same problems as uvm_map_pageable() above.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
|
1999-06-16 03:27:47 +04:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry, *failed_entry;
|
1999-06-16 03:27:47 +04:00
|
|
|
vsize_t size;
|
|
|
|
int rv;
|
1999-07-02 00:07:05 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
u_int timestamp_save;
|
|
|
|
#endif
|
1999-06-16 03:27:47 +04:00
|
|
|
UVMHIST_FUNC("uvm_map_pageable_all"); UVMHIST_CALLED(maphist);
|
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x,flags=0x%x)", map, flags, 0, 0);
|
|
|
|
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT(map->flags & VM_MAP_PAGEABLE);
|
1999-06-16 03:27:47 +04:00
|
|
|
|
|
|
|
vm_map_lock(map);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* handle wiring and unwiring separately.
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (flags == 0) { /* unwire */
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
1999-06-16 23:34:24 +04:00
|
|
|
* POSIX 1003.1b -- munlockall unlocks all regions,
|
|
|
|
* regardless of how many times mlockall has been called.
|
1999-06-16 03:27:47 +04:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
for (entry = map->header.next; entry != &map->header;
|
|
|
|
entry = entry->next) {
|
1999-06-16 23:34:24 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(entry))
|
|
|
|
uvm_map_entry_unwire(map, entry);
|
1999-06-16 03:27:47 +04:00
|
|
|
}
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
|
1999-06-16 03:27:47 +04:00
|
|
|
vm_map_unlock(map);
|
|
|
|
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1999-06-16 03:27:47 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (flags & MCL_FUTURE) {
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
|
|
|
* must wire all future mappings; remember this.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
vm_map_modflags(map, VM_MAP_WIREFUTURE, 0);
|
1999-06-16 03:27:47 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if ((flags & MCL_CURRENT) == 0) {
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
|
|
|
* no more work to do!
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
|
|
|
|
vm_map_unlock(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1999-06-16 03:27:47 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* wire case: in three passes [XXXCDC: ugly block of code here]
|
|
|
|
*
|
|
|
|
* 1: holding the write lock, count all pages mapped by non-wired
|
|
|
|
* entries. if this would cause us to go over our limit, we fail.
|
|
|
|
*
|
|
|
|
* 2: still holding the write lock, we create any anonymous maps that
|
|
|
|
* need to be created. then we increment its wiring count.
|
|
|
|
*
|
|
|
|
* 3: we downgrade to a read lock, and call uvm_fault_wire to fault
|
1999-06-16 23:34:24 +04:00
|
|
|
* in the pages for any newly wired area (wired_count == 1).
|
1999-06-16 03:27:47 +04:00
|
|
|
*
|
|
|
|
* downgrading to a read lock for uvm_fault_wire avoids a possible
|
|
|
|
* deadlock with another thread that may have faulted on one of
|
|
|
|
* the pages to be wired (it would mark the page busy, blocking
|
|
|
|
* us, then in turn block on the map lock that we hold). because
|
|
|
|
* of problems in the recursive lock package, we cannot upgrade
|
|
|
|
* to a write lock in vm_map_lookup. thus, any actions that
|
|
|
|
* require the write lock must be done beforehand. because we
|
|
|
|
* keep the read lock on the map, the copy-on-write status of the
|
|
|
|
* entries we modify here cannot change.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (size = 0, entry = map->header.next; entry != &map->header;
|
|
|
|
entry = entry->next) {
|
|
|
|
if (entry->protection != VM_PROT_NONE &&
|
1999-06-16 04:29:04 +04:00
|
|
|
VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
|
1999-06-16 03:27:47 +04:00
|
|
|
size += entry->end - entry->start;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
|
|
|
|
vm_map_unlock(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return ENOMEM;
|
1999-06-16 03:27:47 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* XXX non-pmap_wired_count case must be handled by caller */
|
|
|
|
#ifdef pmap_wired_count
|
|
|
|
if (limit != 0 &&
|
|
|
|
(size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
|
|
|
|
vm_map_unlock(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return ENOMEM;
|
1999-06-16 03:27:47 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pass 2.
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (entry = map->header.next; entry != &map->header;
|
|
|
|
entry = entry->next) {
|
|
|
|
if (entry->protection == VM_PROT_NONE)
|
|
|
|
continue;
|
1999-06-16 04:29:04 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
|
|
|
* perform actions of vm_map_lookup that need the
|
|
|
|
* write lock on the map: create an anonymous map
|
|
|
|
* for a copy-on-write region, or an anonymous map
|
|
|
|
* for a zero-fill region. (XXXCDC: submap case
|
|
|
|
* ok?)
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */
|
2001-05-25 08:06:11 +04:00
|
|
|
if (UVM_ET_ISNEEDSCOPY(entry) &&
|
introduce a new UVM fault type, VM_FAULT_WIREMAX. this is different
from VM_FAULT_WIRE in that when the pages being wired are faulted in,
the simulated fault is at the maximum protection allowed for the mapping
instead of the current protection. use this in uvm_map_pageable{,_all}()
to fix the problem where writing via ptrace() to shared libraries that
are also mapped with wired mappings in another process causes a
diagnostic panic when the wired mapping is removed.
this is a really obscure problem so it deserves some more explanation.
ptrace() writing to another process ends up down in uvm_map_extract(),
which for MAP_PRIVATE mappings (such as shared libraries) will cause
the amap to be copied or created. then the amap is made shared
(ie. the AMAP_SHARED flag is set) between the kernel and the ptrace()d
process so that the kernel can modify pages in the amap and have the
ptrace()d process see the changes. then when the page being modified
is actually faulted on, the object pages (from the shared library vnode)
are copied to a new anon page and inserted into the shared amap.
to make all the processes sharing the amap actually see the new anon
page instead of the vnode page that was there before, we need to
invalidate all the pmap-level mappings of the vnode page in the pmaps
of the processes sharing the amap, but we don't have a good way of
doing this. the amap doesn't keep track of the vm_maps which map it.
so all we can do at this point is to remove all the mappings of the
page with pmap_page_protect(), but this has the unfortunate side-effect
of removing wired mappings as well. removing wired mappings with
pmap_page_protect() is a legitimate operation, it can happen when a file
with a wired mapping is truncated. so the pmap has no way of knowing
whether a request to remove a wired mapping is normal or when it's due to
this weird situation. so the pmap has to remove the weird mapping.
the process being ptrace()d goes away and life continues. then,
much later when we go to unwire or remove the wired vm_map mapping,
we discover that the pmap mapping has been removed when it should
still be there, and we panic.
so where did we go wrong? the problem is that we don't have any way
to update just the pmap mappings that need to be updated in this
scenario. we could invent a mechanism to do this, but that is much
more complicated than this change and it doesn't seem like the right
way to go in the long run either.
the real underlying problem here is that wired pmap mappings just
aren't a good concept. one of the original properties of the pmap
design was supposed to be that all the information in the pmap could
be thrown away at any time and the VM system could regenerate it all
through fault processing, but wired pmap mappings don't allow that.
a better design for UVM would not require wired pmap mappings,
and Chuck C. and I are talking about this, but it won't be done
anytime soon, so this change will do for now.
this change has the effect of causing MAP_PRIVATE mappings to be
copied to anonymous memory when they are mlock()d, so that uvm_fault()
doesn't need to copy these pages later when called from ptrace(), thus
avoiding the call to pmap_page_protect() and the panic that results
from this when the mlock()d region is unlocked or freed. note that
this change doesn't help the case where the wired mapping is MAP_SHARED.
discussed at great length with Chuck Cranor.
fixes PRs 10363, 12554, 12604, 13041, 13487, 14580 and 14853.
2002-01-01 01:34:39 +03:00
|
|
|
((entry->max_protection & VM_PROT_WRITE) ||
|
1999-06-16 03:27:47 +04:00
|
|
|
(entry->object.uvm_obj == NULL))) {
|
|
|
|
amap_copy(map, entry, M_WAITOK, TRUE,
|
|
|
|
entry->start, entry->end);
|
|
|
|
/* XXXCDC: wait OK? */
|
|
|
|
}
|
|
|
|
}
|
1999-06-16 04:29:04 +04:00
|
|
|
}
|
1999-06-16 03:27:47 +04:00
|
|
|
entry->wired_count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pass 3.
|
|
|
|
*/
|
|
|
|
|
1999-07-02 00:07:05 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
timestamp_save = map->timestamp;
|
|
|
|
#endif
|
|
|
|
vm_map_busy(map);
|
1999-06-16 03:27:47 +04:00
|
|
|
vm_map_downgrade(map);
|
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
rv = 0;
|
1999-06-16 03:27:47 +04:00
|
|
|
for (entry = map->header.next; entry != &map->header;
|
|
|
|
entry = entry->next) {
|
|
|
|
if (entry->wired_count == 1) {
|
|
|
|
rv = uvm_fault_wire(map, entry->start, entry->end,
|
introduce a new UVM fault type, VM_FAULT_WIREMAX. this is different
from VM_FAULT_WIRE in that when the pages being wired are faulted in,
the simulated fault is at the maximum protection allowed for the mapping
instead of the current protection. use this in uvm_map_pageable{,_all}()
to fix the problem where writing via ptrace() to shared libraries that
are also mapped with wired mappings in another process causes a
diagnostic panic when the wired mapping is removed.
this is a really obscure problem so it deserves some more explanation.
ptrace() writing to another process ends up down in uvm_map_extract(),
which for MAP_PRIVATE mappings (such as shared libraries) will cause
the amap to be copied or created. then the amap is made shared
(ie. the AMAP_SHARED flag is set) between the kernel and the ptrace()d
process so that the kernel can modify pages in the amap and have the
ptrace()d process see the changes. then when the page being modified
is actually faulted on, the object pages (from the shared library vnode)
are copied to a new anon page and inserted into the shared amap.
to make all the processes sharing the amap actually see the new anon
page instead of the vnode page that was there before, we need to
invalidate all the pmap-level mappings of the vnode page in the pmaps
of the processes sharing the amap, but we don't have a good way of
doing this. the amap doesn't keep track of the vm_maps which map it.
so all we can do at this point is to remove all the mappings of the
page with pmap_page_protect(), but this has the unfortunate side-effect
of removing wired mappings as well. removing wired mappings with
pmap_page_protect() is a legitimate operation, it can happen when a file
with a wired mapping is truncated. so the pmap has no way of knowing
whether a request to remove a wired mapping is normal or when it's due to
this weird situation. so the pmap has to remove the weird mapping.
the process being ptrace()d goes away and life continues. then,
much later when we go to unwire or remove the wired vm_map mapping,
we discover that the pmap mapping has been removed when it should
still be there, and we panic.
so where did we go wrong? the problem is that we don't have any way
to update just the pmap mappings that need to be updated in this
scenario. we could invent a mechanism to do this, but that is much
more complicated than this change and it doesn't seem like the right
way to go in the long run either.
the real underlying problem here is that wired pmap mappings just
aren't a good concept. one of the original properties of the pmap
design was supposed to be that all the information in the pmap could
be thrown away at any time and the VM system could regenerate it all
through fault processing, but wired pmap mappings don't allow that.
a better design for UVM would not require wired pmap mappings,
and Chuck C. and I are talking about this, but it won't be done
anytime soon, so this change will do for now.
this change has the effect of causing MAP_PRIVATE mappings to be
copied to anonymous memory when they are mlock()d, so that uvm_fault()
doesn't need to copy these pages later when called from ptrace(), thus
avoiding the call to pmap_page_protect() and the panic that results
from this when the mlock()d region is unlocked or freed. note that
this change doesn't help the case where the wired mapping is MAP_SHARED.
discussed at great length with Chuck Cranor.
fixes PRs 10363, 12554, 12604, 13041, 13487, 14580 and 14853.
2002-01-01 01:34:39 +03:00
|
|
|
VM_FAULT_WIREMAX, entry->max_protection);
|
1999-06-16 03:27:47 +04:00
|
|
|
if (rv) {
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
|
|
|
* wiring failed. break out of the loop.
|
|
|
|
* we'll clean up the map below, once we
|
|
|
|
* have a write lock again.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
if (rv) {
|
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
|
|
|
* Get back an exclusive (write) lock.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
vm_map_upgrade(map);
|
1999-07-02 00:07:05 +04:00
|
|
|
vm_map_unbusy(map);
|
|
|
|
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (timestamp_save != map->timestamp)
|
|
|
|
panic("uvm_map_pageable_all: stale map");
|
|
|
|
#endif
|
1999-06-16 03:27:47 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* first drop the wiring count on all the entries
|
|
|
|
* which haven't actually been wired yet.
|
1999-08-03 04:38:33 +04:00
|
|
|
*
|
|
|
|
* Skip VM_PROT_NONE entries like we did above.
|
1999-06-16 03:27:47 +04:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
failed_entry = entry;
|
|
|
|
for (/* nothing */; entry != &map->header;
|
1999-08-03 04:38:33 +04:00
|
|
|
entry = entry->next) {
|
|
|
|
if (entry->protection == VM_PROT_NONE)
|
|
|
|
continue;
|
1999-06-16 03:27:47 +04:00
|
|
|
entry->wired_count--;
|
1999-08-03 04:38:33 +04:00
|
|
|
}
|
1999-06-16 03:27:47 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* now, unwire all the entries that were successfully
|
|
|
|
* wired above.
|
1999-08-03 04:38:33 +04:00
|
|
|
*
|
|
|
|
* Skip VM_PROT_NONE entries like we did above.
|
1999-06-16 03:27:47 +04:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
for (entry = map->header.next; entry != failed_entry;
|
|
|
|
entry = entry->next) {
|
1999-08-03 04:38:33 +04:00
|
|
|
if (entry->protection == VM_PROT_NONE)
|
|
|
|
continue;
|
1999-06-16 03:27:47 +04:00
|
|
|
entry->wired_count--;
|
1999-08-03 04:38:33 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(entry))
|
1999-06-16 03:27:47 +04:00
|
|
|
uvm_map_entry_unwire(map, entry);
|
|
|
|
}
|
|
|
|
vm_map_unlock(map);
|
|
|
|
UVMHIST_LOG(maphist,"<- done (RV=%d)", rv,0,0,0);
|
|
|
|
return (rv);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We are holding a read lock here. */
|
1999-07-02 00:07:05 +04:00
|
|
|
vm_map_unbusy(map);
|
1999-06-16 03:27:47 +04:00
|
|
|
vm_map_unlock_read(map);
|
|
|
|
|
|
|
|
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
|
2001-03-15 09:10:32 +03:00
|
|
|
return 0;
|
1999-06-16 03:27:47 +04:00
|
|
|
}
|
|
|
|
|
1998-02-05 09:25:08 +03:00
|
|
|
/*
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
* uvm_map_clean: clean out a map range
|
1998-02-05 09:25:08 +03:00
|
|
|
*
|
|
|
|
* => valid flags:
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
* if (flags & PGO_CLEANIT): dirty pages are cleaned first
|
1998-02-05 09:25:08 +03:00
|
|
|
* if (flags & PGO_SYNCIO): dirty pages are written synchronously
|
|
|
|
* if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
|
|
|
|
* if (flags & PGO_FREE): any cached pages are freed after clean
|
|
|
|
* => returns an error if any part of the specified range isn't mapped
|
2001-05-25 08:06:11 +04:00
|
|
|
* => never a need to flush amap layer since the anonymous memory has
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
* no permanent home, but may deactivate pages there
|
|
|
|
* => called from sys_msync() and sys_madvise()
|
1998-02-05 09:25:08 +03:00
|
|
|
* => caller must not write-lock map (read OK).
|
|
|
|
* => we may sleep while cleaning if SYNCIO [with map read-locked]
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
int
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *current, *entry;
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
struct uvm_object *uobj;
|
|
|
|
struct vm_amap *amap;
|
|
|
|
struct vm_anon *anon;
|
|
|
|
struct vm_page *pg;
|
1998-08-13 06:10:37 +04:00
|
|
|
vaddr_t offset;
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
vsize_t size;
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
int error, refs;
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvm_map_clean"); UVMHIST_CALLED(maphist);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2000-11-25 09:27:59 +03:00
|
|
|
UVMHIST_LOG(maphist,"(map=0x%x,start=0x%x,end=0x%x,flags=0x%x)",
|
|
|
|
map, start, end, flags);
|
|
|
|
KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
|
|
|
|
(PGO_FREE|PGO_DEACTIVATE));
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_lock_read(map);
|
|
|
|
VM_MAP_RANGE_CHECK(map, start, end);
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_unlock_read(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return EFAULT;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make a first pass to check for holes.
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
for (current = entry; current->start < end; current = current->next) {
|
|
|
|
if (UVM_ET_ISSUBMAP(current)) {
|
|
|
|
vm_map_unlock_read(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return EINVAL;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2001-02-05 14:29:54 +03:00
|
|
|
if (end <= current->end) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (current->end != current->next->start) {
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_unlock_read(map);
|
2001-03-15 09:10:32 +03:00
|
|
|
return EFAULT;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
error = 0;
|
2001-02-05 14:29:54 +03:00
|
|
|
for (current = entry; start < end; current = current->next) {
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
amap = current->aref.ar_amap; /* top layer */
|
|
|
|
uobj = current->object.uvm_obj; /* bottom layer */
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT(start >= current->start);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
* No amap cleaning necessary if:
|
|
|
|
*
|
|
|
|
* (1) There's no amap.
|
|
|
|
*
|
|
|
|
* (2) We're not deactivating or freeing pages.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
2001-02-05 14:29:54 +03:00
|
|
|
if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
goto flush_object;
|
|
|
|
|
|
|
|
amap_lock(amap);
|
|
|
|
offset = start - current->start;
|
2001-02-05 14:29:54 +03:00
|
|
|
size = MIN(end, current->end) - start;
|
|
|
|
for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) {
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
anon = amap_lookup(&current->aref, offset);
|
|
|
|
if (anon == NULL)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
simple_lock(&anon->an_lock);
|
1999-07-08 01:51:35 +04:00
|
|
|
pg = anon->u.an_page;
|
|
|
|
if (pg == NULL) {
|
|
|
|
simple_unlock(&anon->an_lock);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
|
2000-11-25 09:27:59 +03:00
|
|
|
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
/*
|
2001-12-31 22:21:36 +03:00
|
|
|
* In these first 3 cases, we just deactivate the page.
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
case PGO_CLEANIT|PGO_FREE:
|
|
|
|
case PGO_CLEANIT|PGO_DEACTIVATE:
|
|
|
|
case PGO_DEACTIVATE:
|
1999-08-21 06:19:05 +04:00
|
|
|
deactivate_it:
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
/*
|
2001-12-31 22:21:36 +03:00
|
|
|
* skip the page if it's loaned or wired,
|
|
|
|
* since it shouldn't be on a paging queue
|
|
|
|
* at all in these cases.
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
2001-12-31 22:21:36 +03:00
|
|
|
uvm_lock_pageq();
|
|
|
|
if (pg->loan_count != 0 ||
|
|
|
|
pg->wire_count != 0) {
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
uvm_unlock_pageq();
|
|
|
|
simple_unlock(&anon->an_lock);
|
|
|
|
continue;
|
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
KASSERT(pg->uanon == anon);
|
Page scanner improvements, behavior is actually a bit more like
Mach VM's now. Specific changes:
- Pages now need not have all of their mappings removed before being
put on the inactive list. They only need to have the "referenced"
attribute cleared. This makes putting pages onto the inactive list
much more efficient. In order to eliminate redundant clearings of
"referenced", callers of uvm_pagedeactivate() must now do this
themselves.
- When checking the "modified" attribute for a page (for clearing
PG_CLEAN), make sure to only do it if PG_CLEAN is currently set on
the page (saves a potentially expensive pmap operation).
- When scanning the inactive list, if a page is referenced, reactivate
it (this part was actually added in uvm_pdaemon.c,v 1.27). This
now works properly now that pages on the inactive list are allowed to
have mappings.
- When scanning the inactive list and considering a page for freeing,
remove all mappings, and then check the "modified" attribute if the
page is marked PG_CLEAN.
- When scanning the active list, if the page was referenced since its
last sweep by the scanner, don't deactivate it. (This part was
actually added in uvm_pdaemon.c,v 1.28.)
These changes greatly improve interactive performance during
moderate to high memory and I/O load.
2001-01-29 02:30:42 +03:00
|
|
|
pmap_clear_reference(pg);
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
uvm_pagedeactivate(pg);
|
|
|
|
uvm_unlock_pageq();
|
|
|
|
simple_unlock(&anon->an_lock);
|
|
|
|
continue;
|
|
|
|
|
|
|
|
case PGO_FREE:
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-08-21 06:19:05 +04:00
|
|
|
/*
|
|
|
|
* If there are multiple references to
|
|
|
|
* the amap, just deactivate the page.
|
|
|
|
*/
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1999-08-21 06:19:05 +04:00
|
|
|
if (amap_refs(amap) > 1)
|
|
|
|
goto deactivate_it;
|
|
|
|
|
2001-12-31 22:21:36 +03:00
|
|
|
/* skip the page if it's wired */
|
1999-07-08 01:04:22 +04:00
|
|
|
if (pg->wire_count != 0) {
|
|
|
|
simple_unlock(&anon->an_lock);
|
|
|
|
continue;
|
|
|
|
}
|
1999-07-19 21:45:23 +04:00
|
|
|
amap_unadd(&current->aref, offset);
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
refs = --anon->an_ref;
|
|
|
|
simple_unlock(&anon->an_lock);
|
|
|
|
if (refs == 0)
|
|
|
|
uvm_anfree(anon);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
amap_unlock(amap);
|
|
|
|
|
|
|
|
flush_object:
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
1998-11-15 07:38:19 +03:00
|
|
|
* flush pages if we've got a valid backing object.
|
2001-12-31 23:34:01 +03:00
|
|
|
* note that we must always clean object pages before
|
|
|
|
* freeing them since otherwise we could reveal stale
|
|
|
|
* data from files.
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
|
|
|
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
offset = current->offset + (start - current->start);
|
2001-02-05 14:29:54 +03:00
|
|
|
size = MIN(end, current->end) - start;
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
if (uobj != NULL) {
|
|
|
|
simple_lock(&uobj->vmobjlock);
|
2003-04-10 01:39:29 +04:00
|
|
|
if (uobj->pgops->pgo_put != NULL)
|
|
|
|
error = (uobj->pgops->pgo_put)(uobj, offset,
|
|
|
|
offset + size, flags | PGO_CLEANIT);
|
|
|
|
else
|
|
|
|
error = 0;
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
start += size;
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
vm_map_unlock_read(map);
|
2001-05-25 08:06:11 +04:00
|
|
|
return (error);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_checkprot: check protection in map
|
|
|
|
*
|
|
|
|
* => must allow specified protection in a fully allocated region.
|
|
|
|
* => map must be read or write locked by caller.
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
boolean_t
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
|
|
|
|
vm_prot_t protection)
|
1998-03-09 03:58:55 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry;
|
|
|
|
struct vm_map_entry *tmp_entry;
|
2001-03-15 09:10:32 +03:00
|
|
|
|
|
|
|
if (!uvm_map_lookup_entry(map, start, &tmp_entry)) {
|
2003-10-02 03:08:32 +04:00
|
|
|
return (FALSE);
|
2001-03-15 09:10:32 +03:00
|
|
|
}
|
|
|
|
entry = tmp_entry;
|
|
|
|
while (start < end) {
|
|
|
|
if (entry == &map->header) {
|
2003-10-02 03:08:32 +04:00
|
|
|
return (FALSE);
|
2001-03-15 09:10:32 +03:00
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* no holes allowed
|
|
|
|
*/
|
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
if (start < entry->start) {
|
2003-10-02 03:08:32 +04:00
|
|
|
return (FALSE);
|
2001-03-15 09:10:32 +03:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* check protection associated with entry
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
if ((entry->protection & protection) != protection) {
|
2003-10-02 03:08:32 +04:00
|
|
|
return (FALSE);
|
2001-03-15 09:10:32 +03:00
|
|
|
}
|
|
|
|
start = entry->end;
|
|
|
|
entry = entry->next;
|
|
|
|
}
|
2003-10-02 03:08:32 +04:00
|
|
|
return (TRUE);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvmspace_alloc: allocate a vmspace structure.
|
|
|
|
*
|
|
|
|
* - structure includes vm_map and pmap
|
|
|
|
* - XXX: no locking on this structure
|
|
|
|
* - refcnt set to 1, rest must be init'd by caller
|
|
|
|
*/
|
1998-03-09 03:58:55 +03:00
|
|
|
struct vmspace *
|
2003-10-02 02:50:15 +04:00
|
|
|
uvmspace_alloc(vaddr_t min, vaddr_t max)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
1998-03-09 03:58:55 +03:00
|
|
|
struct vmspace *vm;
|
|
|
|
UVMHIST_FUNC("uvmspace_alloc"); UVMHIST_CALLED(maphist);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-08-31 04:20:26 +04:00
|
|
|
vm = pool_get(&uvm_vmspace_pool, PR_WAITOK);
|
2001-08-16 05:37:50 +04:00
|
|
|
uvmspace_init(vm, NULL, min, max);
|
1998-03-27 04:47:06 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done (vm=0x%x)", vm,0,0,0);
|
|
|
|
return (vm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvmspace_init: initialize a vmspace structure.
|
|
|
|
*
|
|
|
|
* - XXX: no locking on this structure
|
2003-02-21 19:38:44 +03:00
|
|
|
* - refcnt set to 1, rest must be init'd by caller
|
1998-03-27 04:47:06 +03:00
|
|
|
*/
|
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max)
|
1998-03-27 04:47:06 +03:00
|
|
|
{
|
|
|
|
UVMHIST_FUNC("uvmspace_init"); UVMHIST_CALLED(maphist);
|
|
|
|
|
1998-08-10 02:36:37 +04:00
|
|
|
memset(vm, 0, sizeof(*vm));
|
2003-02-21 01:16:05 +03:00
|
|
|
uvm_map_setup(&vm->vm_map, min, max, VM_MAP_PAGEABLE
|
|
|
|
#ifdef __USING_TOPDOWN_VM
|
|
|
|
| VM_MAP_TOPDOWN
|
|
|
|
#endif
|
|
|
|
);
|
1998-03-27 04:47:06 +03:00
|
|
|
if (pmap)
|
|
|
|
pmap_reference(pmap);
|
|
|
|
else
|
|
|
|
pmap = pmap_create();
|
|
|
|
vm->vm_map.pmap = pmap;
|
1998-03-09 03:58:55 +03:00
|
|
|
vm->vm_refcnt = 1;
|
1998-03-27 04:47:06 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvmspace_share: share a vmspace between two proceses
|
|
|
|
*
|
|
|
|
* - XXX: no locking on vmspace
|
|
|
|
* - used for vfork, threads(?)
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvmspace_share(struct proc *p1, struct proc *p2)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2003-10-02 03:08:32 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
p2->p_vmspace = p1->p_vmspace;
|
|
|
|
p1->p_vmspace->vm_refcnt++;
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace
|
|
|
|
*
|
|
|
|
* - XXX: no locking on vmspace
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvmspace_unshare(struct lwp *l)
|
1998-03-09 03:58:55 +03:00
|
|
|
{
|
2003-01-18 11:51:40 +03:00
|
|
|
struct proc *p = l->l_proc;
|
1998-03-09 03:58:55 +03:00
|
|
|
struct vmspace *nvm, *ovm = p->p_vmspace;
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (ovm->vm_refcnt == 1)
|
|
|
|
/* nothing to do: vmspace isn't shared in the first place */
|
|
|
|
return;
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* make a new vmspace, still holding old one */
|
|
|
|
nvm = uvmspace_fork(ovm);
|
|
|
|
|
2003-01-18 11:51:40 +03:00
|
|
|
pmap_deactivate(l); /* unbind old vmspace */
|
2001-05-25 08:06:11 +04:00
|
|
|
p->p_vmspace = nvm;
|
2003-01-18 11:51:40 +03:00
|
|
|
pmap_activate(l); /* switch to new vmspace */
|
1998-03-19 09:37:26 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
uvmspace_free(ovm); /* drop reference to old vmspace */
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvmspace_exec: the process wants to exec a new program
|
|
|
|
*
|
|
|
|
* - XXX: no locking on vmspace
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2003-01-18 11:51:40 +03:00
|
|
|
struct proc *p = l->l_proc;
|
1998-03-09 03:58:55 +03:00
|
|
|
struct vmspace *nvm, *ovm = p->p_vmspace;
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map *map = &ovm->vm_map;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2000-04-10 06:21:26 +04:00
|
|
|
#ifdef __sparc__
|
1998-03-09 03:58:55 +03:00
|
|
|
/* XXX cgd 960926: the sparc #ifdef should be a MD hook */
|
2003-01-18 11:51:40 +03:00
|
|
|
kill_user_windows(l); /* before stack addresses go away */
|
1998-02-05 09:25:08 +03:00
|
|
|
#endif
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* see if more than one process is using this vmspace...
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (ovm->vm_refcnt == 1) {
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* if p is the only process using its vmspace then we can safely
|
|
|
|
* recycle that vmspace for the program that is being exec'd.
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
#ifdef SYSVSHM
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* SYSV SHM semantics require us to kill all segments on an exec
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (ovm->vm_shm)
|
|
|
|
shmexit(ovm);
|
1998-02-05 09:25:08 +03:00
|
|
|
#endif
|
|
|
|
|
1999-06-16 03:27:47 +04:00
|
|
|
/*
|
|
|
|
* POSIX 1003.1b -- "lock future mappings" is revoked
|
|
|
|
* when a process execs another program image.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
Add some more meat to madvise(2):
* Implement MADV_DONTNEED: deactivate pages in the specified range,
semantics similar to Solaris's MADV_DONTNEED.
* Add MADV_FREE: free pages and swap resources associated with the
specified range, causing the range to be reloaded from backing
store (vnodes) or zero-fill (anonymous), semantics like FreeBSD's
MADV_FREE and like Digital UNIX's MADV_DONTNEED (isn't it SO GREAT
that madvise(2) isn't standardized!?)
As part of this, move the non-map-modifying advice handling out of
uvm_map_advise(), and into sys_madvise().
As another part, implement general amap cleaning in uvm_map_clean(), and
change uvm_map_clean() to only push dirty pages to disk if PGO_CLEANIT
is set in its flags (and update sys___msync13() accordingly). XXX Add
a patchable global "amap_clean_works", defaulting to 1, which can disable
the amap cleaning code, just in case problems are unearthed; this gives
a developer/user a quick way to recover and send a bug report (e.g. boot
into DDB and change the value).
XXX Still need to implement a real uao_flush().
XXX Need to update the manual page.
With these changes, rebuilding libc will automatically cause the new
malloc(3) to use MADV_FREE to actually release pages and swap resources
when it decides that can be done.
1999-07-07 10:02:21 +04:00
|
|
|
vm_map_modflags(map, 0, VM_MAP_WIREFUTURE);
|
1999-06-16 03:27:47 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* now unmap the old program
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2002-09-22 11:21:29 +04:00
|
|
|
pmap_remove_all(map->pmap);
|
2001-02-06 20:01:51 +03:00
|
|
|
uvm_unmap(map, map->min_offset, map->max_offset);
|
2003-11-01 14:09:02 +03:00
|
|
|
KASSERT(map->header.prev == &map->header);
|
|
|
|
KASSERT(map->nentries == 0);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2001-02-11 04:34:23 +03:00
|
|
|
/*
|
|
|
|
* resize the map
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2001-02-11 04:34:23 +03:00
|
|
|
map->min_offset = start;
|
|
|
|
map->max_offset = end;
|
1998-03-09 03:58:55 +03:00
|
|
|
} else {
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* p's vmspace is being shared, so we can't reuse it for p since
|
|
|
|
* it is still being used for others. allocate a new vmspace
|
|
|
|
* for p
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2001-08-16 05:37:50 +04:00
|
|
|
nvm = uvmspace_alloc(start, end);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* install new vmspace and drop our ref to the old one.
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2003-01-18 11:51:40 +03:00
|
|
|
pmap_deactivate(l);
|
1998-03-09 03:58:55 +03:00
|
|
|
p->p_vmspace = nvm;
|
2003-01-18 11:51:40 +03:00
|
|
|
pmap_activate(l);
|
1998-03-19 09:37:26 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
uvmspace_free(ovm);
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvmspace_free: free a vmspace data structure
|
|
|
|
*
|
|
|
|
* - XXX: no locking on vmspace
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvmspace_free(struct vmspace *vm)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *dead_entries;
|
2002-09-22 11:21:29 +04:00
|
|
|
struct vm_map *map;
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvmspace_free"); UVMHIST_CALLED(maphist);
|
|
|
|
|
|
|
|
UVMHIST_LOG(maphist,"(vm=0x%x) ref=%d", vm, vm->vm_refcnt,0,0);
|
2002-09-22 11:21:29 +04:00
|
|
|
if (--vm->vm_refcnt > 0) {
|
|
|
|
return;
|
|
|
|
}
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2002-09-22 11:21:29 +04:00
|
|
|
/*
|
|
|
|
* at this point, there should be no other references to the map.
|
|
|
|
* delete all of the mappings, then destroy the pmap.
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2002-09-22 11:21:29 +04:00
|
|
|
map = &vm->vm_map;
|
|
|
|
map->flags |= VM_MAP_DYING;
|
|
|
|
pmap_remove_all(map->pmap);
|
2001-02-10 08:05:27 +03:00
|
|
|
#ifdef SYSVSHM
|
2002-09-22 11:21:29 +04:00
|
|
|
/* Get rid of any SYSV shared memory segments. */
|
|
|
|
if (vm->vm_shm != NULL)
|
|
|
|
shmexit(vm);
|
2001-02-10 08:05:27 +03:00
|
|
|
#endif
|
2002-09-22 11:21:29 +04:00
|
|
|
if (map->nentries) {
|
|
|
|
uvm_unmap_remove(map, map->min_offset, map->max_offset,
|
|
|
|
&dead_entries);
|
|
|
|
if (dead_entries != NULL)
|
|
|
|
uvm_unmap_detach(dead_entries, 0);
|
|
|
|
}
|
2003-11-01 22:56:09 +03:00
|
|
|
KASSERT(map->nentries == 0);
|
|
|
|
KASSERT(map->size == 0);
|
2002-09-22 11:21:29 +04:00
|
|
|
pmap_destroy(map->pmap);
|
|
|
|
pool_put(&uvm_vmspace_pool, vm);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* F O R K - m a i n e n t r y p o i n t
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* uvmspace_fork: fork a process' main map
|
|
|
|
*
|
|
|
|
* => create a new vmspace for child process from parent.
|
|
|
|
* => parent's map must not be locked.
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
struct vmspace *
|
2003-10-02 02:50:15 +04:00
|
|
|
uvmspace_fork(struct vmspace *vm1)
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
1998-03-09 03:58:55 +03:00
|
|
|
struct vmspace *vm2;
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map *old_map = &vm1->vm_map;
|
|
|
|
struct vm_map *new_map;
|
|
|
|
struct vm_map_entry *old_entry;
|
|
|
|
struct vm_map_entry *new_entry;
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_FUNC("uvmspace_fork"); UVMHIST_CALLED(maphist);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
vm_map_lock(old_map);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2001-08-16 05:37:50 +04:00
|
|
|
vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset);
|
1998-08-10 02:36:37 +04:00
|
|
|
memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
|
2003-10-26 02:05:45 +03:00
|
|
|
(caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy);
|
1998-03-09 03:58:55 +03:00
|
|
|
new_map = &vm2->vm_map; /* XXX */
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
old_entry = old_map->header.next;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* go entry-by-entry
|
|
|
|
*/
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
while (old_entry != &old_map->header) {
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* first, some sanity checks on the old entry
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
2001-03-15 09:10:32 +03:00
|
|
|
KASSERT(!UVM_ET_ISSUBMAP(old_entry));
|
|
|
|
KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) ||
|
|
|
|
!UVM_ET_ISNEEDSCOPY(old_entry));
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
switch (old_entry->inheritance) {
|
2000-08-01 04:53:07 +04:00
|
|
|
case MAP_INHERIT_NONE:
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* drop the mapping
|
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
break;
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2000-08-01 04:53:07 +04:00
|
|
|
case MAP_INHERIT_SHARE:
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
|
|
|
* share the mapping: this means we want the old and
|
|
|
|
* new entries to share amaps and backing objects.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* if the old_entry needs a new amap (due to prev fork)
|
|
|
|
* then we need to allocate it now so that we have
|
|
|
|
* something we own to share with the new_entry. [in
|
|
|
|
* other words, we need to clear needs_copy]
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (UVM_ET_ISNEEDSCOPY(old_entry)) {
|
|
|
|
/* get our own amap, clears needs_copy */
|
|
|
|
amap_copy(old_map, old_entry, M_WAITOK, FALSE,
|
2001-05-25 08:06:11 +04:00
|
|
|
0, 0);
|
1998-03-09 03:58:55 +03:00
|
|
|
/* XXXCDC: WAITOK??? */
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
2002-11-30 21:28:04 +03:00
|
|
|
new_entry = uvm_mapent_alloc(new_map, 0);
|
1998-03-09 03:58:55 +03:00
|
|
|
/* old_entry -> new_entry */
|
|
|
|
uvm_mapent_copy(old_entry, new_entry);
|
|
|
|
|
|
|
|
/* new pmap has nothing wired in it */
|
|
|
|
new_entry->wired_count = 0;
|
|
|
|
|
|
|
|
/*
|
1998-10-12 03:14:47 +04:00
|
|
|
* gain reference to object backing the map (can't
|
|
|
|
* be a submap, already checked this case).
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
2001-06-02 22:09:08 +04:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (new_entry->aref.ar_amap)
|
2000-11-25 09:27:59 +03:00
|
|
|
uvm_map_reference_amap(new_entry, AMAP_SHARED);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
if (new_entry->object.uvm_obj &&
|
|
|
|
new_entry->object.uvm_obj->pgops->pgo_reference)
|
|
|
|
new_entry->object.uvm_obj->
|
|
|
|
pgops->pgo_reference(
|
|
|
|
new_entry->object.uvm_obj);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/* insert entry at end of new_map's entry list */
|
|
|
|
uvm_map_entry_link(new_map, new_map->header.prev,
|
|
|
|
new_entry);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
break;
|
|
|
|
|
2000-08-01 04:53:07 +04:00
|
|
|
case MAP_INHERIT_COPY:
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* copy-on-write the mapping (using mmap's
|
|
|
|
* MAP_PRIVATE semantics)
|
1998-10-12 03:14:47 +04:00
|
|
|
*
|
2001-05-25 08:06:11 +04:00
|
|
|
* allocate new_entry, adjust reference counts.
|
1998-10-12 03:14:47 +04:00
|
|
|
* (note that new references are read-only).
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
|
|
|
|
2002-11-30 21:28:04 +03:00
|
|
|
new_entry = uvm_mapent_alloc(new_map, 0);
|
1998-03-09 03:58:55 +03:00
|
|
|
/* old_entry -> new_entry */
|
|
|
|
uvm_mapent_copy(old_entry, new_entry);
|
|
|
|
|
|
|
|
if (new_entry->aref.ar_amap)
|
2000-11-25 09:27:59 +03:00
|
|
|
uvm_map_reference_amap(new_entry, 0);
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
if (new_entry->object.uvm_obj &&
|
|
|
|
new_entry->object.uvm_obj->pgops->pgo_reference)
|
|
|
|
new_entry->object.uvm_obj->pgops->pgo_reference
|
|
|
|
(new_entry->object.uvm_obj);
|
|
|
|
|
|
|
|
/* new pmap has nothing wired in it */
|
|
|
|
new_entry->wired_count = 0;
|
|
|
|
|
|
|
|
new_entry->etype |=
|
|
|
|
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
|
|
|
|
uvm_map_entry_link(new_map, new_map->header.prev,
|
|
|
|
new_entry);
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-19 22:26:50 +03:00
|
|
|
/*
|
1998-03-09 03:58:55 +03:00
|
|
|
* the new entry will need an amap. it will either
|
|
|
|
* need to be copied from the old entry or created
|
1998-03-19 22:26:50 +03:00
|
|
|
* from scratch (if the old entry does not have an
|
|
|
|
* amap). can we defer this process until later
|
|
|
|
* (by setting "needs_copy") or do we need to copy
|
|
|
|
* the amap now?
|
1998-03-09 03:58:55 +03:00
|
|
|
*
|
1998-03-19 22:26:50 +03:00
|
|
|
* we must copy the amap now if any of the following
|
1998-03-09 03:58:55 +03:00
|
|
|
* conditions hold:
|
1998-03-19 22:26:50 +03:00
|
|
|
* 1. the old entry has an amap and that amap is
|
|
|
|
* being shared. this means that the old (parent)
|
2001-05-25 08:06:11 +04:00
|
|
|
* process is sharing the amap with another
|
1998-03-19 22:26:50 +03:00
|
|
|
* process. if we do not clear needs_copy here
|
|
|
|
* we will end up in a situation where both the
|
|
|
|
* parent and child process are refering to the
|
2001-05-25 08:06:11 +04:00
|
|
|
* same amap with "needs_copy" set. if the
|
1998-03-19 22:26:50 +03:00
|
|
|
* parent write-faults, the fault routine will
|
|
|
|
* clear "needs_copy" in the parent by allocating
|
2001-05-25 08:06:11 +04:00
|
|
|
* a new amap. this is wrong because the
|
1998-03-19 22:26:50 +03:00
|
|
|
* parent is supposed to be sharing the old amap
|
|
|
|
* and the new amap will break that.
|
1998-03-09 03:58:55 +03:00
|
|
|
*
|
1998-03-19 22:26:50 +03:00
|
|
|
* 2. if the old entry has an amap and a non-zero
|
|
|
|
* wire count then we are going to have to call
|
2001-05-25 08:06:11 +04:00
|
|
|
* amap_cow_now to avoid page faults in the
|
1998-03-19 22:26:50 +03:00
|
|
|
* parent process. since amap_cow_now requires
|
|
|
|
* "needs_copy" to be clear we might as well
|
|
|
|
* clear it here as well.
|
1998-03-09 03:58:55 +03:00
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
1998-03-19 22:26:50 +03:00
|
|
|
if (old_entry->aref.ar_amap != NULL) {
|
2001-06-02 22:09:08 +04:00
|
|
|
if ((amap_flags(old_entry->aref.ar_amap) &
|
|
|
|
AMAP_SHARED) != 0 ||
|
|
|
|
VM_MAPENT_ISWIRED(old_entry)) {
|
1998-03-19 22:26:50 +03:00
|
|
|
|
2001-06-02 22:09:08 +04:00
|
|
|
amap_copy(new_map, new_entry, M_WAITOK,
|
|
|
|
FALSE, 0, 0);
|
|
|
|
/* XXXCDC: M_WAITOK ... ok? */
|
|
|
|
}
|
1998-03-19 22:26:50 +03:00
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
/*
|
1998-03-19 22:26:50 +03:00
|
|
|
* if the parent's entry is wired down, then the
|
|
|
|
* parent process does not want page faults on
|
|
|
|
* access to that memory. this means that we
|
|
|
|
* cannot do copy-on-write because we can't write
|
|
|
|
* protect the old entry. in this case we
|
|
|
|
* resolve all copy-on-write faults now, using
|
|
|
|
* amap_cow_now. note that we have already
|
|
|
|
* allocated any needed amap (above).
|
1998-03-09 03:58:55 +03:00
|
|
|
*/
|
|
|
|
|
1999-06-16 04:29:04 +04:00
|
|
|
if (VM_MAPENT_ISWIRED(old_entry)) {
|
1998-03-19 22:26:50 +03:00
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
/*
|
1998-03-19 22:26:50 +03:00
|
|
|
* resolve all copy-on-write faults now
|
2001-05-25 08:06:11 +04:00
|
|
|
* (note that there is nothing to do if
|
1998-03-19 22:26:50 +03:00
|
|
|
* the old mapping does not have an amap).
|
|
|
|
*/
|
|
|
|
if (old_entry->aref.ar_amap)
|
|
|
|
amap_cow_now(new_map, new_entry);
|
|
|
|
|
2001-05-25 08:06:11 +04:00
|
|
|
} else {
|
1998-03-19 22:26:50 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* setup mappings to trigger copy-on-write faults
|
|
|
|
* we must write-protect the parent if it has
|
|
|
|
* an amap and it is not already "needs_copy"...
|
|
|
|
* if it is already "needs_copy" then the parent
|
|
|
|
* has already been write-protected by a previous
|
|
|
|
* fork operation.
|
|
|
|
*/
|
|
|
|
|
2001-11-06 08:27:17 +03:00
|
|
|
if (old_entry->aref.ar_amap &&
|
|
|
|
!UVM_ET_ISNEEDSCOPY(old_entry)) {
|
1998-03-19 22:26:50 +03:00
|
|
|
if (old_entry->max_protection & VM_PROT_WRITE) {
|
|
|
|
pmap_protect(old_map->pmap,
|
|
|
|
old_entry->start,
|
|
|
|
old_entry->end,
|
|
|
|
old_entry->protection &
|
|
|
|
~VM_PROT_WRITE);
|
2001-09-11 01:19:08 +04:00
|
|
|
pmap_update(old_map->pmap);
|
1998-03-19 22:26:50 +03:00
|
|
|
}
|
|
|
|
old_entry->etype |= UVM_ET_NEEDSCOPY;
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
|
|
|
break;
|
1998-03-19 22:26:50 +03:00
|
|
|
} /* end of switch statement */
|
1998-03-09 03:58:55 +03:00
|
|
|
old_entry = old_entry->next;
|
|
|
|
}
|
|
|
|
|
|
|
|
new_map->size = old_map->size;
|
2001-05-25 08:06:11 +04:00
|
|
|
vm_map_unlock(old_map);
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
#ifdef SYSVSHM
|
1998-03-09 03:58:55 +03:00
|
|
|
if (vm1->vm_shm)
|
|
|
|
shmfork(vm1, vm2);
|
1998-02-05 09:25:08 +03:00
|
|
|
#endif
|
|
|
|
|
1999-05-12 23:11:23 +04:00
|
|
|
#ifdef PMAP_FORK
|
|
|
|
pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap);
|
|
|
|
#endif
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
|
2003-10-02 03:08:32 +04:00
|
|
|
return (vm2);
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#if defined(DDB)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* DDB hooks
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_map_printit: actually prints the map
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_map_printit(struct vm_map *map, boolean_t full,
|
|
|
|
void (*pr)(const char *, ...))
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2001-06-02 22:09:08 +04:00
|
|
|
struct vm_map_entry *entry;
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
(*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset);
|
1999-06-07 20:31:42 +04:00
|
|
|
(*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=0x%x\n",
|
|
|
|
map->nentries, map->size, map->ref_count, map->timestamp,
|
|
|
|
map->flags);
|
2001-05-25 08:06:11 +04:00
|
|
|
(*pr)("\tpmap=%p(resident=%d)\n", map->pmap,
|
1998-03-30 21:34:58 +04:00
|
|
|
pmap_resident_count(map->pmap));
|
1998-03-09 03:58:55 +03:00
|
|
|
if (!full)
|
|
|
|
return;
|
|
|
|
for (entry = map->header.next; entry != &map->header;
|
|
|
|
entry = entry->next) {
|
2000-03-27 00:54:45 +04:00
|
|
|
(*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n",
|
1998-03-09 03:58:55 +03:00
|
|
|
entry, entry->start, entry->end, entry->object.uvm_obj,
|
2000-11-25 09:27:59 +03:00
|
|
|
(long long)entry->offset, entry->aref.ar_amap,
|
|
|
|
entry->aref.ar_pageoff);
|
1998-03-09 03:58:55 +03:00
|
|
|
(*pr)(
|
2000-11-25 09:27:59 +03:00
|
|
|
"\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
|
|
|
|
"wc=%d, adv=%d\n",
|
1998-03-09 03:58:55 +03:00
|
|
|
(entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
|
2001-05-25 08:06:11 +04:00
|
|
|
(entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
|
1998-03-09 03:58:55 +03:00
|
|
|
(entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
|
|
|
|
entry->protection, entry->max_protection,
|
|
|
|
entry->inheritance, entry->wired_count, entry->advice);
|
|
|
|
}
|
2001-05-25 08:06:11 +04:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_object_printit: actually prints the object
|
|
|
|
*/
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_object_printit(struct uvm_object *uobj, boolean_t full,
|
|
|
|
void (*pr)(const char *, ...))
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
1998-03-09 03:58:55 +03:00
|
|
|
struct vm_page *pg;
|
|
|
|
int cnt = 0;
|
|
|
|
|
2000-04-10 06:21:26 +04:00
|
|
|
(*pr)("OBJECT %p: locked=%d, pgops=%p, npages=%d, ",
|
|
|
|
uobj, uobj->vmobjlock.lock_data, uobj->pgops, uobj->uo_npages);
|
1999-05-25 04:09:00 +04:00
|
|
|
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
|
1998-03-09 03:58:55 +03:00
|
|
|
(*pr)("refs=<SYSTEM>\n");
|
|
|
|
else
|
|
|
|
(*pr)("refs=%d\n", uobj->uo_refs);
|
|
|
|
|
2000-04-10 06:21:26 +04:00
|
|
|
if (!full) {
|
|
|
|
return;
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
(*pr)(" PAGES <pg,offset>:\n ");
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
TAILQ_FOREACH(pg, &uobj->memq, listq) {
|
|
|
|
cnt++;
|
2000-11-25 09:27:59 +03:00
|
|
|
(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if ((cnt % 3) == 0) {
|
2000-04-10 06:21:26 +04:00
|
|
|
(*pr)("\n ");
|
|
|
|
}
|
|
|
|
}
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if ((cnt % 3) != 0) {
|
2000-04-10 06:21:26 +04:00
|
|
|
(*pr)("\n");
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2001-05-25 08:06:11 +04:00
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* uvm_page_printit: actually print the page
|
|
|
|
*/
|
|
|
|
|
2000-11-27 11:39:39 +03:00
|
|
|
/*
 * page_flagbits: names for the bits of a page's flags word.
 *
 * NOTE(review): this looks like the kernel's bit-name format string
 * (a leading output-base byte followed by <bit number><name> pairs);
 * confirm against whatever consumes it.  The adjacent literals below
 * concatenate to exactly the same bytes as a single string.
 */
static const char page_flagbits[] =
	"\20"
	"\1BUSY"
	"\2WANTED"
	"\3TABLED"
	"\4CLEAN"
	"\5PAGEOUT"
	"\6RELEASED"
	"\7FAKE"
	"\10RDONLY"
	"\11ZERO"
	"\15PAGER1";
|
|
|
|
/*
 * Names for the bits of a vm_page's "pqflags" word, in the format taken
 * by bitmask_snprintf(): one leading byte giving the numeric output
 * base, followed by <bit number><name> pairs with bits counted from 1.
 * Bit 4 has no entry and is therefore left unnamed.
 */
static const char page_pqflagbits[] =
	"\20"			/* numeric output base (16) */
	"\1FREE"
	"\2INACTIVE"
	"\3ACTIVE"
	"\5ANON"
	"\6AOBJ";
|
2000-11-27 11:39:39 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
void
|
2003-10-02 02:50:15 +04:00
|
|
|
uvm_page_printit(struct vm_page *pg, boolean_t full,
|
|
|
|
void (*pr)(const char *, ...))
|
1998-02-05 09:25:08 +03:00
|
|
|
{
|
2000-11-25 09:27:59 +03:00
|
|
|
struct vm_page *tpg;
|
1998-03-09 03:58:55 +03:00
|
|
|
struct uvm_object *uobj;
|
|
|
|
struct pglist *pgl;
|
2000-04-10 06:21:26 +04:00
|
|
|
char pgbuf[128];
|
|
|
|
char pqbuf[128];
|
1998-02-05 09:25:08 +03:00
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
(*pr)("PAGE %p:\n", pg);
|
2000-04-10 06:21:26 +04:00
|
|
|
bitmask_snprintf(pg->flags, page_flagbits, pgbuf, sizeof(pgbuf));
|
|
|
|
bitmask_snprintf(pg->pqflags, page_pqflagbits, pqbuf, sizeof(pqbuf));
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
(*pr)(" flags=%s, pqflags=%s, wire_count=%d, pa=0x%lx\n",
|
2003-08-26 19:12:18 +04:00
|
|
|
pgbuf, pqbuf, pg->wire_count, (long)VM_PAGE_TO_PHYS(pg));
|
2000-11-27 11:39:39 +03:00
|
|
|
(*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n",
|
|
|
|
pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count);
|
1998-02-05 09:25:08 +03:00
|
|
|
#if defined(UVM_PAGE_TRKOWN)
|
1998-03-09 03:58:55 +03:00
|
|
|
if (pg->flags & PG_BUSY)
|
|
|
|
(*pr)(" owning process = %d, tag=%s\n",
|
|
|
|
pg->owner, pg->owner_tag);
|
|
|
|
else
|
|
|
|
(*pr)(" page not busy, no owner\n");
|
1998-02-05 09:25:08 +03:00
|
|
|
#else
|
1998-03-09 03:58:55 +03:00
|
|
|
(*pr)(" [page ownership tracking disabled]\n");
|
1998-02-05 09:25:08 +03:00
|
|
|
#endif
|
|
|
|
|
1998-03-09 03:58:55 +03:00
|
|
|
if (!full)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* cross-verify object/anon */
|
|
|
|
if ((pg->pqflags & PQ_FREE) == 0) {
|
|
|
|
if (pg->pqflags & PQ_ANON) {
|
|
|
|
if (pg->uanon == NULL || pg->uanon->u.an_page != pg)
|
2000-11-25 09:27:59 +03:00
|
|
|
(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
|
1998-03-09 03:58:55 +03:00
|
|
|
(pg->uanon) ? pg->uanon->u.an_page : NULL);
|
|
|
|
else
|
|
|
|
(*pr)(" anon backpointer is OK\n");
|
|
|
|
} else {
|
|
|
|
uobj = pg->uobject;
|
|
|
|
if (uobj) {
|
|
|
|
(*pr)(" checking object list\n");
|
2000-11-25 09:27:59 +03:00
|
|
|
TAILQ_FOREACH(tpg, &uobj->memq, listq) {
|
|
|
|
if (tpg == pg) {
|
|
|
|
break;
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
if (tpg)
|
1998-03-09 03:58:55 +03:00
|
|
|
(*pr)(" page found on object list\n");
|
|
|
|
else
|
|
|
|
(*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* cross-verify page queue */
|
2000-04-24 21:12:00 +04:00
|
|
|
if (pg->pqflags & PQ_FREE) {
|
|
|
|
int fl = uvm_page_lookup_freelist(pg);
|
2001-04-29 08:23:20 +04:00
|
|
|
int color = VM_PGCOLOR_BUCKET(pg);
|
|
|
|
pgl = &uvm.page_free[fl].pgfl_buckets[color].pgfl_queues[
|
|
|
|
((pg)->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN];
|
2000-11-25 09:27:59 +03:00
|
|
|
} else if (pg->pqflags & PQ_INACTIVE) {
|
2001-05-22 04:44:44 +04:00
|
|
|
pgl = &uvm.page_inactive;
|
2000-11-25 09:27:59 +03:00
|
|
|
} else if (pg->pqflags & PQ_ACTIVE) {
|
1998-03-09 03:58:55 +03:00
|
|
|
pgl = &uvm.page_active;
|
2003-10-02 03:08:32 +04:00
|
|
|
} else {
|
1998-03-09 03:58:55 +03:00
|
|
|
pgl = NULL;
|
2000-11-25 09:27:59 +03:00
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
|
|
|
|
if (pgl) {
|
|
|
|
(*pr)(" checking pageq list\n");
|
2000-11-25 09:27:59 +03:00
|
|
|
TAILQ_FOREACH(tpg, pgl, pageq) {
|
|
|
|
if (tpg == pg) {
|
|
|
|
break;
|
|
|
|
}
|
1998-03-09 03:58:55 +03:00
|
|
|
}
|
2000-11-25 09:27:59 +03:00
|
|
|
if (tpg)
|
1998-03-09 03:58:55 +03:00
|
|
|
(*pr)(" page found on pageq list\n");
|
|
|
|
else
|
|
|
|
(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
|
|
|
|
}
|
1998-02-05 09:25:08 +03:00
|
|
|
}
|
|
|
|
#endif
|