NetBSD/sys/uvm/uvm_loan.c

1241 lines
30 KiB
C
Raw Normal View History

/* $NetBSD: uvm_loan.c,v 1.100 2020/03/22 18:32:42 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_loan.c,v 1.1.6.4 1998/02/06 05:08:43 chs Exp
*/
/*
* uvm_loan.c: page loanout handler
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_loan.c,v 1.100 2020/03/22 18:32:42 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <uvm/uvm.h>
#ifdef UVMHIST
/*
 * define the "loanhist" history that the UVMHIST_CALLED()/UVMHIST_LOG()
 * invocations in this file refer to (only when UVMHIST is configured).
 */
UVMHIST_DEFINE(loanhist);
#endif
/*
* "loaned" pages are pages which are (read-only, copy-on-write) loaned
* from the VM system to other parts of the kernel. this allows page
* copying to be avoided (e.g. you can loan pages from objs/anons to
* the mbuf system).
*
* there are 3 types of loans possible:
* O->K uvm_object page to wired kernel page (e.g. mbuf data area)
* A->K anon page to wired kernel page (e.g. mbuf data area)
* O->A uvm_object to anon loan (e.g. vnode page to an anon)
* note that it is possible to have an O page loaned to both an A and K
* at the same time.
*
* loans are tracked by pg->loan_count. an O->A page will have both
* a uvm_object and a vm_anon, but PG_ANON will not be set. this sort
* of page is considered "owned" by the uvm_object (not the anon).
*
* each loan of a page to the kernel bumps the pg->wire_count. the
* kernel mappings for these pages will be read-only and wired. since
* the page will also be wired, it will not be a candidate for pageout,
* and thus will never be pmap_page_protect()'d with VM_PROT_NONE. a
* write fault in the kernel to one of these pages will not cause
* copy-on-write. instead, the page fault is considered fatal. this
* is because the kernel mapping will have no way to look up the
* object/anon which the page is owned by. this is a good side-effect,
* since a kernel write to a loaned page is an error.
*
* owners that want to free their pages and discover that they are
* loaned out simply "disown" them (the page becomes an orphan). these
* pages should be freed when the last loan is dropped. in some cases
* an anon may "adopt" an orphaned page.
*
* locking: to read pg->loan_count either the owner or pg->interlock
* must be locked. to modify pg->loan_count, both the owner of the page
* and pg->interlock must be locked. pg->flags is (as always) locked by
* the owner of the page.
*
* note that locking from the "loaned" side is tricky since the object
* getting the loaned page has no reference to the page's owner and thus
* the owner could "die" at any time. in order to prevent the owner
* from dying pg->interlock should be locked. this forces us to sometimes
* use "try" locking.
*
* loans are typically broken by the following events:
* 1. user-level write fault to a loaned page
* 2. pageout of clean+inactive O->A loaned page
* 3. owner frees page (e.g. pager flush)
*
* note that loaning a page causes all mappings of the page to become
* read-only (via pmap_page_protect). this could have an unexpected
* effect on normal "wired" pages if one is not careful (XXX).
*/
/*
 * local prototypes
 *
 * uvm_loananon, uvm_loanuobj and uvm_loanzero are the per-page loan
 * helpers dispatched from uvm_loanentry(); uvm_unloananon and
 * uvm_unloanpage are used by uvm_loan() to back out partial results.
 */

static int	uvm_loananon(struct uvm_faultinfo *, void ***,
			     int, struct vm_anon *);
static int	uvm_loanuobj(struct uvm_faultinfo *, void ***,
			     int, vaddr_t);
static int	uvm_loanzero(struct uvm_faultinfo *, void ***, int);
static void	uvm_unloananon(struct vm_anon **, int);
static void	uvm_unloanpage(struct vm_page **, int);
static int	uvm_loanpage(struct vm_page **, int);
/*
* inlines
*/
/*
* uvm_loanentry: loan out pages in a map entry (helper fn for uvm_loan())
*
* => "ufi" is the result of a successful map lookup (meaning that
* on entry the map is locked by the caller)
* => we may unlock and then relock the map if needed (for I/O)
* => we put our output result in "output"
* => we always return with the map unlocked
* => possible return values:
* -1 == error, map is unlocked
* 0 == map relock error (try again!), map is unlocked
* >0 == number of pages we loaned, map is unlocked
*
* NOTE: We can live with this being an inline, because it is only called
* from one place.
*/
static inline int
2005-06-27 06:19:48 +04:00
uvm_loanentry(struct uvm_faultinfo *ufi, void ***output, int flags)
{
vaddr_t curaddr = ufi->orig_rvaddr;
vsize_t togo = ufi->size;
1998-03-09 03:58:55 +03:00
struct vm_aref *aref = &ufi->entry->aref;
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
struct vm_anon *anon;
int rv, result = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(loanhist);
1998-03-09 03:58:55 +03:00
/*
* lock us the rest of the way down (we unlock before return)
1998-03-09 03:58:55 +03:00
*/
if (aref->ar_amap) {
amap_lock(aref->ar_amap, RW_WRITER);
}
1998-03-09 03:58:55 +03:00
/*
* loop until done
*/
while (togo) {
/*
* find the page we want. check the anon layer first.
*/
if (aref->ar_amap) {
anon = amap_lookup(aref, curaddr - ufi->entry->start);
} else {
anon = NULL;
}
/* locked: map, amap, uobj */
1998-03-09 03:58:55 +03:00
if (anon) {
rv = uvm_loananon(ufi, output, flags, anon);
} else if (uobj) {
rv = uvm_loanuobj(ufi, output, flags, curaddr);
} else if (UVM_ET_ISCOPYONWRITE(ufi->entry)) {
rv = uvm_loanzero(ufi, output, flags);
} else {
uvmfault_unlockall(ufi, aref->ar_amap, uobj);
rv = -1;
1998-03-09 03:58:55 +03:00
}
/* locked: if (rv > 0) => map, amap, uobj [o.w. unlocked] */
KASSERT(rv > 0 || aref->ar_amap == NULL ||
!rw_write_held(aref->ar_amap->am_lock));
2008-01-02 14:48:20 +03:00
KASSERT(rv > 0 || uobj == NULL ||
!rw_write_held(uobj->vmobjlock));
1998-03-09 03:58:55 +03:00
/* total failure */
if (rv < 0) {
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "failure %jd", rv, 0,0,0);
return (-1);
}
1998-03-09 03:58:55 +03:00
/* relock failed, need to do another lookup */
if (rv == 0) {
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "relock failure %jd", result
,0,0,0);
return (result);
}
1998-03-09 03:58:55 +03:00
/*
* got it... advance to next page
*/
1998-03-09 03:58:55 +03:00
result++;
togo -= PAGE_SIZE;
curaddr += PAGE_SIZE;
}
/*
* unlock what we locked, unlock the maps and return
1998-03-09 03:58:55 +03:00
*/
if (aref->ar_amap) {
amap_unlock(aref->ar_amap);
}
2007-02-22 09:05:00 +03:00
uvmfault_unlockmaps(ufi, false);
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "done %jd", result, 0,0,0);
return (result);
}
/*
* normal functions
*/
/*
* uvm_loan: loan pages in a map out to anons or to the kernel
2001-05-25 08:06:11 +04:00
*
* => map should be unlocked
* => start and len should be multiples of PAGE_SIZE
* => result is either an array of anon's or vm_pages (depending on flags)
* => flag values: UVM_LOAN_TOANON - loan to anons
* UVM_LOAN_TOPAGE - loan to wired kernel page
* one and only one of these flags must be set!
* => returns 0 (success), or an appropriate error number
*/
1998-03-09 03:58:55 +03:00
int
2005-06-27 06:19:48 +04:00
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{
1998-03-09 03:58:55 +03:00
struct uvm_faultinfo ufi;
void **result, **output;
int rv, error;
1999-06-03 04:05:45 +04:00
UVMHIST_FUNC(__func__); UVMHIST_CALLED(loanhist);
1998-03-09 03:58:55 +03:00
/*
* ensure that one and only one of the flags is set
*/
KASSERT(((flags & UVM_LOAN_TOANON) == 0) ^
((flags & UVM_LOAN_TOPAGE) == 0));
1998-03-09 03:58:55 +03:00
/*
* "output" is a pointer to the current place to put the loaned page.
1998-03-09 03:58:55 +03:00
*/
result = v;
1998-03-09 03:58:55 +03:00
output = &result[0]; /* start at the beginning ... */
/*
* while we've got pages to do
*/
while (len > 0) {
/*
* fill in params for a call to uvmfault_lookup
*/
ufi.orig_map = map;
ufi.orig_rvaddr = start;
ufi.orig_size = len;
2001-05-25 08:06:11 +04:00
1998-03-09 03:58:55 +03:00
/*
* do the lookup, the only time this will fail is if we hit on
* an unmapped region (an error)
*/
2007-02-22 09:05:00 +03:00
if (!uvmfault_lookup(&ufi, false)) {
error = ENOENT;
1998-03-09 03:58:55 +03:00
goto fail;
}
1998-03-09 03:58:55 +03:00
/*
* map now locked. now do the loanout...
1998-03-09 03:58:55 +03:00
*/
1998-03-09 03:58:55 +03:00
rv = uvm_loanentry(&ufi, &output, flags);
if (rv < 0) {
/* all unlocked due to error */
error = EINVAL;
1998-03-09 03:58:55 +03:00
goto fail;
}
1998-03-09 03:58:55 +03:00
/*
* done! the map is unlocked. advance, if possible.
*
2004-03-24 10:50:48 +03:00
* XXXCDC: could be recoded to hold the map lock with
* smarter code (but it only happens on map entry
* boundaries, so it isn't that bad).
1998-03-09 03:58:55 +03:00
*/
if (rv) {
rv <<= PAGE_SHIFT;
len -= rv;
start += rv;
}
1998-03-09 03:58:55 +03:00
}
UVMHIST_LOG(loanhist, "success", 0,0,0,0);
return 0;
fail:
1998-03-09 03:58:55 +03:00
/*
* failed to complete loans. drop any loans and return failure code.
* map is already unlocked.
1998-03-09 03:58:55 +03:00
*/
1998-03-09 03:58:55 +03:00
if (output - result) {
if (flags & UVM_LOAN_TOANON) {
1998-03-09 03:58:55 +03:00
uvm_unloananon((struct vm_anon **)result,
output - result);
} else {
1998-03-09 03:58:55 +03:00
uvm_unloanpage((struct vm_page **)result,
output - result);
}
1998-03-09 03:58:55 +03:00
}
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "error %jd", error,0,0,0);
return (error);
}
/*
* uvm_loananon: loan a page from an anon out
2001-05-25 08:06:11 +04:00
*
* => called with map, amap, uobj locked
* => return value:
* -1 = fatal error, everything is unlocked, abort.
* 0 = lookup in ufi went stale, everything unlocked, relookup and
* try again
* 1 = got it, everything still locked
*/
1998-03-09 03:58:55 +03:00
int
2005-06-27 06:19:48 +04:00
uvm_loananon(struct uvm_faultinfo *ufi, void ***output, int flags,
struct vm_anon *anon)
{
1998-03-09 03:58:55 +03:00
struct vm_page *pg;
int error;
1998-03-09 03:58:55 +03:00
UVMHIST_FUNC(__func__); UVMHIST_CALLED(loanhist);
1998-03-09 03:58:55 +03:00
/*
* if we are loaning to "another" anon then it is easy, we just
1998-03-09 03:58:55 +03:00
* bump the reference count on the current anon and return a
* pointer to it (it becomes copy-on-write shared).
1998-03-09 03:58:55 +03:00
*/
1998-03-09 03:58:55 +03:00
if (flags & UVM_LOAN_TOANON) {
KASSERT(rw_write_held(anon->an_lock));
pg = anon->an_page;
if (pg && (pg->flags & PG_ANON) != 0 && anon->an_ref == 1) {
if (pg->wire_count > 0) {
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "->A wired %#jx",
(uintptr_t)pg, 0, 0, 0);
uvmfault_unlockall(ufi,
ufi->entry->aref.ar_amap,
ufi->entry->object.uvm_obj);
return (-1);
}
pmap_page_protect(pg, VM_PROT_READ);
}
1998-03-09 03:58:55 +03:00
anon->an_ref++;
**output = anon;
(*output)++;
UVMHIST_LOG(loanhist, "->A done", 0,0,0,0);
return (1);
1998-03-09 03:58:55 +03:00
}
/*
* we are loaning to a kernel-page. we need to get the page
* resident so we can wire it. uvmfault_anonget will handle
* this for us.
*/
KASSERT(rw_write_held(anon->an_lock));
error = uvmfault_anonget(ufi, ufi->entry->aref.ar_amap, anon);
1998-03-09 03:58:55 +03:00
/*
* if we were unable to get the anon, then uvmfault_anonget has
* unlocked everything and returned an error code.
*/
if (error) {
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "error %jd", error,0,0,0);
KASSERT(error != ENOLCK);
1998-03-09 03:58:55 +03:00
/* need to refault (i.e. refresh our lookup) ? */
if (error == ERESTART) {
return (0);
}
1998-03-09 03:58:55 +03:00
/* "try again"? sleep a bit and retry ... */
if (error == EAGAIN) {
kpause("loanagain", false, hz/2, NULL);
return (0);
1998-03-09 03:58:55 +03:00
}
/* otherwise flag it as an error */
return (-1);
1998-03-09 03:58:55 +03:00
}
/*
* we have the page and its owner locked: do the loan now.
*/
pg = anon->an_page;
if (pg->wire_count > 0) {
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "->K wired %#jx", (uintptr_t)pg, 0, 0, 0);
KASSERT(pg->uobject == NULL);
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, NULL);
return (-1);
}
if (pg->loan_count == 0) {
pmap_page_protect(pg, VM_PROT_READ);
}
uvm_pagelock(pg);
1998-03-09 03:58:55 +03:00
pg->loan_count++;
KASSERT(pg->loan_count > 0); /* detect wrap-around */
uvm_pageactivate(pg);
uvm_pageunlock(pg);
1998-03-09 03:58:55 +03:00
**output = pg;
(*output)++;
1998-03-09 03:58:55 +03:00
/* unlock and return success */
if (pg->uobject)
rw_exit(pg->uobject->vmobjlock);
UVMHIST_LOG(loanhist, "->K done", 0,0,0,0);
return (1);
}
/*
* uvm_loanpage: loan out pages to kernel (->K)
*
* => pages should be object-owned and the object should be locked.
* => in the case of error, the object might be unlocked and relocked.
* => caller should busy the pages beforehand.
* => pages will be unbusied.
* => fail with EBUSY if meet a wired page.
*/
static int
2005-06-27 06:19:48 +04:00
uvm_loanpage(struct vm_page **pgpp, int npages)
{
int i;
int error = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(loanhist);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgpp[i];
KASSERT(pg->uobject != NULL);
KASSERT(pg->uobject == pgpp[0]->uobject);
KASSERT(!(pg->flags & (PG_RELEASED|PG_PAGEOUT)));
KASSERT(rw_write_held(pg->uobject->vmobjlock));
KASSERT(pg->flags & PG_BUSY);
if (pg->wire_count > 0) {
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "wired %#jx", (uintptr_t)pg,
0, 0, 0);
error = EBUSY;
break;
}
if (pg->loan_count == 0) {
pmap_page_protect(pg, VM_PROT_READ);
}
uvm_pagelock(pg);
pg->loan_count++;
KASSERT(pg->loan_count > 0); /* detect wrap-around */
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
uvm_page_unbusy(pgpp, npages);
if (error) {
/*
* backout what we've done
*/
krwlock_t *slock = pgpp[0]->uobject->vmobjlock;
rw_exit(slock);
uvm_unloan(pgpp, i, UVM_LOAN_TOPAGE);
rw_enter(slock, RW_WRITER);
}
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "done %jd", error, 0, 0, 0);
return error;
}
/*
 * XXX UBC temp limit
 * upper bound on the number of pages uvm_loanuobjpages() requests from
 * pgo_get in a single call.
 * should be <= MAX_READ_AHEAD in genfs_vnops.c
 */
#define UVM_LOAN_GET_CHUNK 16
/*
2004-01-30 15:01:27 +03:00
* uvm_loanuobjpages: loan pages from a uobj out (O->K)
*
2004-01-30 15:01:27 +03:00
* => uobj shouldn't be locked. (we'll lock it)
* => fail with EBUSY if we meet a wired page.
*/
int
2005-06-27 06:19:48 +04:00
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
struct vm_page **origpgpp)
{
int ndone; /* # of pages loaned out */
struct vm_page **pgpp;
int error;
int i;
krwlock_t *slock;
pgpp = origpgpp;
for (ndone = 0; ndone < orignpages; ) {
int npages;
/* npendloan: # of pages busied but not loand out yet. */
int npendloan = 0xdead; /* XXX gcc */
reget:
npages = MIN(UVM_LOAN_GET_CHUNK, orignpages - ndone);
rw_enter(uobj->vmobjlock, RW_WRITER);
error = (*uobj->pgops->pgo_get)(uobj,
pgoff + (ndone << PAGE_SHIFT), pgpp, &npages, 0,
VM_PROT_READ, 0, PGO_SYNCIO);
if (error == EAGAIN) {
kpause("loanuopg", false, hz/2, NULL);
continue;
}
if (error)
goto fail;
KASSERT(npages > 0);
2004-03-24 10:50:48 +03:00
/* loan and unbusy pages */
slock = NULL;
for (i = 0; i < npages; i++) {
krwlock_t *nextslock; /* slock for next page */
struct vm_page *pg = *pgpp;
/* XXX assuming that the page is owned by uobj */
KASSERT(pg->uobject != NULL);
nextslock = pg->uobject->vmobjlock;
if (slock != nextslock) {
if (slock) {
KASSERT(npendloan > 0);
error = uvm_loanpage(pgpp - npendloan,
npendloan);
rw_exit(slock);
if (error)
goto fail;
ndone += npendloan;
KASSERT(origpgpp + ndone == pgpp);
}
slock = nextslock;
npendloan = 0;
rw_enter(slock, RW_WRITER);
}
if ((pg->flags & PG_RELEASED) != 0) {
/*
* release pages and try again.
*/
rw_exit(slock);
for (; i < npages; i++) {
pg = pgpp[i];
slock = pg->uobject->vmobjlock;
rw_enter(slock, RW_WRITER);
uvm_page_unbusy(&pg, 1);
rw_exit(slock);
}
goto reget;
}
npendloan++;
pgpp++;
KASSERT(origpgpp + ndone + npendloan == pgpp);
}
KASSERT(slock != NULL);
KASSERT(npendloan > 0);
error = uvm_loanpage(pgpp - npendloan, npendloan);
rw_exit(slock);
if (error)
goto fail;
ndone += npendloan;
KASSERT(origpgpp + ndone == pgpp);
}
return 0;
fail:
uvm_unloan(origpgpp, ndone, UVM_LOAN_TOPAGE);
return error;
}
/*
* uvm_loanuobj: loan a page from a uobj out
*
* => called with map, amap, uobj locked
* => return value:
* -1 = fatal error, everything is unlocked, abort.
* 0 = lookup in ufi went stale, everything unlocked, relookup and
* try again
* 1 = got it, everything still locked
*/
static int
2005-06-27 06:19:48 +04:00
uvm_loanuobj(struct uvm_faultinfo *ufi, void ***output, int flags, vaddr_t va)
{
1998-03-09 03:58:55 +03:00
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
struct vm_page *pg;
int error, npages;
bool locked;
1998-03-09 03:58:55 +03:00
UVMHIST_FUNC(__func__); UVMHIST_CALLED(loanhist);
1998-03-09 03:58:55 +03:00
/*
* first we must make sure the page is resident.
*
* XXXCDC: duplicate code with uvm_fault().
*/
2010-02-03 17:02:49 +03:00
/* locked: maps(read), amap(if there) */
rw_enter(uobj->vmobjlock, RW_WRITER);
2010-02-03 17:02:49 +03:00
/* locked: maps(read), amap(if there), uobj */
if (uobj->pgops->pgo_get) { /* try locked pgo_get */
1998-03-09 03:58:55 +03:00
npages = 1;
pg = NULL;
error = (*uobj->pgops->pgo_get)(uobj,
va - ufi->entry->start + ufi->entry->offset,
1998-03-09 03:58:55 +03:00
&pg, &npages, 0, VM_PROT_READ, MADV_NORMAL, PGO_LOCKED);
} else {
error = EIO; /* must have pgo_get op */
1998-03-09 03:58:55 +03:00
}
/*
* check the result of the locked pgo_get. if there is a problem,
* then we fail the loan.
*/
if (error && error != EBUSY) {
uvmfault_unlockall(ufi, amap, uobj);
return (-1);
1998-03-09 03:58:55 +03:00
}
/*
* if we need to unlock for I/O, do so now.
*/
if (error == EBUSY) {
uvmfault_unlockall(ufi, amap, NULL);
1998-03-09 03:58:55 +03:00
/* locked: uobj */
npages = 1;
error = (*uobj->pgops->pgo_get)(uobj,
va - ufi->entry->start + ufi->entry->offset,
&pg, &npages, 0, VM_PROT_READ, MADV_NORMAL, PGO_SYNCIO);
1998-03-09 03:58:55 +03:00
/* locked: <nothing> */
2001-05-25 08:06:11 +04:00
if (error) {
2003-10-26 19:04:00 +03:00
if (error == EAGAIN) {
kpause("fltagain2", false, hz/2, NULL);
return (0);
2001-05-25 08:06:11 +04:00
}
return (-1);
1998-03-09 03:58:55 +03:00
}
/*
* pgo_get was a success. attempt to relock everything.
*/
locked = uvmfault_relock(ufi);
if (locked && amap)
amap_lock(amap, RW_WRITER);
uobj = pg->uobject;
rw_enter(uobj->vmobjlock, RW_WRITER);
1998-03-09 03:58:55 +03:00
/*
* verify that the page has not be released and re-verify
* that amap slot is still free. if there is a problem we
* drop our lock (thus force a lookup refresh/retry).
*/
2001-05-25 08:06:11 +04:00
1998-03-09 03:58:55 +03:00
if ((pg->flags & PG_RELEASED) != 0 ||
(locked && amap && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start))) {
1998-03-09 03:58:55 +03:00
if (locked)
uvmfault_unlockall(ufi, amap, NULL);
2007-02-22 09:05:00 +03:00
locked = false;
2001-05-25 08:06:11 +04:00
}
1998-03-09 03:58:55 +03:00
/*
* didn't get the lock? release the page and retry.
*/
2007-02-22 09:05:00 +03:00
if (locked == false) {
1998-03-09 03:58:55 +03:00
if (pg->flags & PG_RELEASED) {
a whole bunch of changes to improve performance and robustness under load: - remove special treatment of pager_map mappings in pmaps. this is required now, since I've removed the globals that expose the address range. pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's no longer any need to special-case it. - eliminate struct uvm_vnode by moving its fields into struct vnode. - rewrite the pageout path. the pager is now responsible for handling the high-level requests instead of only getting control after a bunch of work has already been done on its behalf. this will allow us to UBCify LFS, which needs tighter control over its pages than other filesystems do. writing a page to disk no longer requires making it read-only, which allows us to write wired pages without causing all kinds of havoc. - use a new PG_PAGEOUT flag to indicate that a page should be freed on behalf of the pagedaemon when it's unlocked. this flag is very similar to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the pageout fails due to eg. an indirect-block buffer being locked. this allows us to remove the "version" field from struct vm_page, and together with shrinking "loan_count" from 32 bits to 16, struct vm_page is now 4 bytes smaller. - no longer use PG_RELEASED for swap-backed pages. if the page is busy because it's being paged out, we can't release the swap slot to be reallocated until that write is complete, but unlike with vnodes we don't keep a count of in-progress writes so there's no good way to know when the write is done. instead, when we need to free a busy swap-backed page, just sleep until we can get it busy ourselves. - implement a fast-path for extending writes which allows us to avoid zeroing new pages. this substantially reduces cpu usage. - encapsulate the data used by the genfs code in a struct genfs_node, which must be the first element of the filesystem-specific vnode data for filesystems which use genfs_{get,put}pages(). 
- eliminate many of the UVM pagerops, since they aren't needed anymore now that the pager "put" operation is a higher-level operation. - enhance the genfs code to allow NFS to use the genfs_{get,put}pages instead of a modified copy. - clean up struct vnode by removing all the fields that used to be used by the vfs_cluster.c code (which we don't use anymore with UBC). - remove kmem_object and mb_object since they were useless. instead of allocating pages to these objects, we now just allocate pages with no object. such pages are mapped in the kernel until they are freed, so we can use the mapping to find the page to free it. this allows us to remove splvm() protection in several places. The sum of all these changes improves write throughput on my decstation 5000/200 to within 1% of the rate of NetBSD 1.5 and reduces the elapsed time for "make release" of a NetBSD 1.5 source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
uvm_pagefree(pg);
rw_exit(uobj->vmobjlock);
1998-03-09 03:58:55 +03:00
return (0);
}
uvm_pagelock(pg);
a whole bunch of changes to improve performance and robustness under load: - remove special treatment of pager_map mappings in pmaps. this is required now, since I've removed the globals that expose the address range. pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's no longer any need to special-case it. - eliminate struct uvm_vnode by moving its fields into struct vnode. - rewrite the pageout path. the pager is now responsible for handling the high-level requests instead of only getting control after a bunch of work has already been done on its behalf. this will allow us to UBCify LFS, which needs tighter control over its pages than other filesystems do. writing a page to disk no longer requires making it read-only, which allows us to write wired pages without causing all kinds of havoc. - use a new PG_PAGEOUT flag to indicate that a page should be freed on behalf of the pagedaemon when it's unlocked. this flag is very similar to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the pageout fails due to eg. an indirect-block buffer being locked. this allows us to remove the "version" field from struct vm_page, and together with shrinking "loan_count" from 32 bits to 16, struct vm_page is now 4 bytes smaller. - no longer use PG_RELEASED for swap-backed pages. if the page is busy because it's being paged out, we can't release the swap slot to be reallocated until that write is complete, but unlike with vnodes we don't keep a count of in-progress writes so there's no good way to know when the write is done. instead, when we need to free a busy swap-backed page, just sleep until we can get it busy ourselves. - implement a fast-path for extending writes which allows us to avoid zeroing new pages. this substantially reduces cpu usage. - encapsulate the data used by the genfs code in a struct genfs_node, which must be the first element of the filesystem-specific vnode data for filesystems which use genfs_{get,put}pages(). 
- eliminate many of the UVM pagerops, since they aren't needed anymore now that the pager "put" operation is a higher-level operation. - enhance the genfs code to allow NFS to use the genfs_{get,put}pages instead of a modified copy. - clean up struct vnode by removing all the fields that used to be used by the vfs_cluster.c code (which we don't use anymore with UBC). - remove kmem_object and mb_object since they were useless. instead of allocating pages to these objects, we now just allocate pages with no object. such pages are mapped in the kernel until they are freed, so we can use the mapping to find the page to free it. this allows us to remove splvm() protection in several places. The sum of all these changes improves write throughput on my decstation 5000/200 to within 1% of the rate of NetBSD 1.5 and reduces the elapsed time for "make release" of a NetBSD 1.5 source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
uvm_pageactivate(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~PG_BUSY;
UVM_PAGE_OWN(pg, NULL);
rw_exit(uobj->vmobjlock);
1998-03-09 03:58:55 +03:00
return (0);
}
}
KASSERT(uobj == pg->uobject);
1998-03-09 03:58:55 +03:00
/*
* at this point we have the page we want ("pg") marked PG_BUSY for us
* and we have all data structures locked. do the loanout. page can
1998-03-09 03:58:55 +03:00
* not be PG_RELEASED (we caught this above).
*/
if ((flags & UVM_LOAN_TOANON) == 0) {
if (uvm_loanpage(&pg, 1)) {
uvmfault_unlockall(ufi, amap, uobj);
return (-1);
}
rw_exit(uobj->vmobjlock);
**output = pg;
(*output)++;
return (1);
1998-03-09 03:58:55 +03:00
}
#ifdef notdef
1998-03-09 03:58:55 +03:00
/*
* must be a loan to an anon. check to see if there is already
* an anon associated with this page. if so, then just return
2001-05-25 08:06:11 +04:00
* a reference to this object. the page should already be
1998-03-09 03:58:55 +03:00
* mapped read-only because it is already on loan.
*/
if (pg->uanon) {
/* XXX: locking */
1998-03-09 03:58:55 +03:00
anon = pg->uanon;
anon->an_ref++;
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~PG_BUSY;
UVM_PAGE_OWN(pg, NULL);
rw_exit(uobj->vmobjlock);
**output = anon;
(*output)++;
return (1);
1998-03-09 03:58:55 +03:00
}
2001-05-25 08:06:11 +04:00
1998-03-09 03:58:55 +03:00
/*
* need to allocate a new anon
*/
anon = uvm_analloc();
if (anon == NULL) {
goto fail;
1998-03-09 03:58:55 +03:00
}
if (pg->wire_count > 0) {
Update the kernhist(9) kernel history code to address issues identified in PR kern/52639, as well as some general cleaning-up... (As proposed on tech-kern@ with additional changes and enhancements.) Details of changes: * All history arguments are now stored as uintmax_t values[1], both in the kernel and in the structures used for exporting the history data to userland via sysctl(9). This avoids problems on some architectures where passing a 64-bit (or larger) value to printf(3) can cause it to process the value as multiple arguments. (This can be particularly problematic when printf()'s format string is not a literal, since in that case the compiler cannot know how large each argument should be.) * Update the data structures used for exporting kernel history data to include a version number as well as the length of history arguments. * All [2] existing users of kernhist(9) have had their format strings updated. Each format specifier now includes an explicit length modifier 'j' to refer to numeric values of the size of uintmax_t. * All [2] existing users of kernhist(9) have had their format strings updated to replace uses of "%p" with "%#jx", and the pointer arguments are now cast to (uintptr_t) before being subsequently cast to (uintmax_t). This is needed to avoid compiler warnings about casting "pointer to integer of a different size." * All [2] existing users of kernhist(9) have had instances of "%s" or "%c" format strings replaced with numeric formats; several instances of mis-match between format string and argument list have been fixed. * vmstat(1) has been modified to handle the new size of arguments in the history data as exported by sysctl(9). * vmstat(1) now provides a warning message if the history requested with the -u option does not exist (previously, this condition was silently ignored, with only a single blank line being printed). 
* vmstat(1) now checks the version and argument length included in the data exported via sysctl(9) and exits if they do not match the values with which vmstat was built. * The kernhist(9) man-page has been updated to note the additional requirements imposed on the format strings, along with several other minor changes and enhancements. [1] It would have been possible to use an explicit length (for example, uint64_t) for the history arguments. But that would require another "rototill" of all the users in the future when we add support for an architecture that supports a larger size. Also, the printf(3) format specifiers for explicitly-sized values, such as "%"PRIu64, are much more verbose (and less aesthetically appealing, IMHO) than simply using "%ju". [2] I've tried very hard to find "all [the] existing users of kernhist(9)" but it is possible that I've missed some of them. I would be glad to update any stragglers that anyone identifies.
2017-10-28 03:37:11 +03:00
UVMHIST_LOG(loanhist, "wired %#jx", (uintptr_t)pg, 0, 0, 0);
goto fail;
}
if (pg->loan_count == 0) {
pmap_page_protect(pg, VM_PROT_READ);
}
uvm_pagelock(pg);
1998-03-09 03:58:55 +03:00
pg->loan_count++;
KASSERT(pg->loan_count > 0); /* detect wrap-around */
pg->uanon = anon;
anon->an_page = pg;
anon->an_lock = /* TODO: share amap lock */
1998-03-09 03:58:55 +03:00
uvm_pageactivate(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~PG_BUSY;
UVM_PAGE_OWN(pg, NULL);
rw_exit(uobj->vmobjlock);
rw_exit(&anon->an_lock);
**output = anon;
(*output)++;
return (1);
fail:
UVMHIST_LOG(loanhist, "fail", 0,0,0,0);
/*
* unlock everything and bail out.
*/
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~PG_BUSY;
UVM_PAGE_OWN(pg, NULL);
uvmfault_unlockall(ufi, amap, uobj, NULL);
if (anon) {
anon->an_ref--;
uvm_anfree(anon);
}
#endif /* notdef */
return (-1);
}
/*
* uvm_loanzero: loan a zero-fill page out
*
* => called with map, amap, uobj locked
* => return value:
* -1 = fatal error, everything is unlocked, abort.
* 0 = lookup in ufi went stale, everything unlocked, relookup and
* try again
* 1 = got it, everything still locked
*/
static struct uvm_object uvm_loanzero_object;
static krwlock_t uvm_loanzero_lock __cacheline_aligned;
static int
2005-06-27 06:19:48 +04:00
uvm_loanzero(struct uvm_faultinfo *ufi, void ***output, int flags)
{
1998-03-09 03:58:55 +03:00
struct vm_page *pg;
struct vm_amap *amap = ufi->entry->aref.ar_amap;
1998-03-09 03:58:55 +03:00
UVMHIST_FUNC(__func__); UVMHIST_CALLED(loanhist);
again:
rw_enter(uvm_loanzero_object.vmobjlock, RW_WRITER);
/*
* first, get ahold of our single zero page.
*/
pg = uvm_pagelookup(&uvm_loanzero_object, 0);
if (__predict_false(pg == NULL)) {
while ((pg = uvm_pagealloc(&uvm_loanzero_object, 0, NULL,
UVM_PGA_ZERO)) == NULL) {
rw_exit(uvm_loanzero_object.vmobjlock);
uvmfault_unlockall(ufi, amap, NULL);
uvm_wait("loanzero");
if (!uvmfault_relock(ufi)) {
return (0);
}
if (amap) {
amap_lock(amap, RW_WRITER);
}
goto again;
1998-03-09 03:58:55 +03:00
}
2001-05-25 08:06:11 +04:00
/* got a zero'd page. */
pg->flags &= ~(PG_BUSY|PG_FAKE);
pg->flags |= PG_RDONLY;
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
UVM_PAGE_OWN(pg, NULL);
}
if ((flags & UVM_LOAN_TOANON) == 0) { /* loaning to kernel-page */
mutex_enter(&pg->interlock);
pg->loan_count++;
KASSERT(pg->loan_count > 0); /* detect wrap-around */
mutex_exit(&pg->interlock);
rw_exit(uvm_loanzero_object.vmobjlock);
1998-03-09 03:58:55 +03:00
**output = pg;
(*output)++;
return (1);
1998-03-09 03:58:55 +03:00
}
#ifdef notdef
/*
* loaning to an anon. check to see if there is already an anon
* associated with this page. if so, then just return a reference
* to this object.
*/
if (pg->uanon) {
anon = pg->uanon;
rw_enter(&anon->an_lock, RW_WRITER);
anon->an_ref++;
rw_exit(&anon->an_lock);
rw_exit(uvm_loanzero_object.vmobjlock);
**output = anon;
(*output)++;
return (1);
}
1998-03-09 03:58:55 +03:00
/*
* need to allocate a new anon
*/
1998-03-09 03:58:55 +03:00
anon = uvm_analloc();
if (anon == NULL) {
/* out of swap causes us to fail */
rw_exit(uvm_loanzero_object.vmobjlock);
uvmfault_unlockall(ufi, amap, NULL, NULL);
return (-1);
1998-03-09 03:58:55 +03:00
}
anon->an_page = pg;
pg->uanon = anon;
uvm_pagelock(pg);
pg->loan_count++;
KASSERT(pg->loan_count > 0); /* detect wrap-around */
1998-03-09 03:58:55 +03:00
uvm_pageactivate(pg);
uvm_pageunlock(pg);
rw_exit(&anon->an_lock);
rw_exit(uvm_loanzero_object.vmobjlock);
1998-03-09 03:58:55 +03:00
**output = anon;
(*output)++;
return (1);
#else
return (-1);
#endif
}
/*
 * uvm_unloananon: kill loans on anons (basically a normal ref drop)
 *
 * => we expect all our resources to be unlocked
 * => the implementation is compiled out (notdef), so this is currently
 *    a no-op.
 */
static void
uvm_unloananon(struct vm_anon **aloans, int nanons)
{
#ifdef notdef
	struct vm_anon *anon, *to_free = NULL;

	/* TODO: locking */
	amap_lock(amap, RW_WRITER);
	for (; nanons > 0; nanons--) {
		anon = *aloans;
		aloans++;
		anon->an_ref--;
		if (anon->an_ref == 0) {
			uvm_anfree(anon);
		}
	}
	amap_unlock(amap);
#endif	/* notdef */
}
/*
* uvm_unloanpage: kill loans on pages loaned out to the kernel
*
* => we expect all our resources to be unlocked
*/
static void
2005-06-27 06:19:48 +04:00
uvm_unloanpage(struct vm_page **ploans, int npages)
{
1998-03-09 03:58:55 +03:00
struct vm_page *pg;
krwlock_t *slock;
1998-03-09 03:58:55 +03:00
while (npages-- > 0) {
pg = *ploans++;
1998-03-09 03:58:55 +03:00
/*
* do a little dance to acquire the object or anon lock
* as appropriate. we are locking in the wrong order,
* so we have to do a try-lock here.
*/
mutex_enter(&pg->interlock);
slock = NULL;
while (pg->uobject != NULL || pg->uanon != NULL) {
if (pg->uobject != NULL) {
slock = pg->uobject->vmobjlock;
} else {
slock = pg->uanon->an_lock;
}
if (rw_tryenter(slock, RW_WRITER)) {
break;
}
2008-01-02 14:48:20 +03:00
/* XXX Better than yielding but inadequate. */
kpause("livelock", false, 1, &pg->interlock);
slock = NULL;
}
/*
* drop our loan. if page is owned by an anon but
* PG_ANON is not set, the page was loaned to the anon
* from an object which dropped ownership, so resolve
* this by turning the anon's loan into real ownership
* (ie. decrement loan_count again and set PG_ANON).
* after all this, if there are no loans left, put the
* page back a paging queue (if the page is owned by
* an anon) or free it (if the page is now unowned).
1998-03-09 03:58:55 +03:00
*/
KASSERT(pg->loan_count > 0);
pg->loan_count--;
if (pg->uobject == NULL && pg->uanon != NULL &&
(pg->flags & PG_ANON) == 0) {
KASSERT(pg->loan_count > 0);
pg->loan_count--;
pg->flags |= PG_ANON;
}
mutex_exit(&pg->interlock);
if (pg->loan_count == 0 && pg->uobject == NULL &&
pg->uanon == NULL) {
KASSERT((pg->flags & PG_BUSY) == 0);
uvm_pagefree(pg);
}
if (slock != NULL) {
rw_exit(slock);
}
1998-03-09 03:58:55 +03:00
}
}
/*
 * uvm_unloan: kill loans on pages or anons.
 *
 * => "v" is an array of "npages" entries: struct vm_anon pointers when
 *    UVM_LOAN_TOANON is set in "flags", struct vm_page pointers
 *    otherwise.
 */
void
uvm_unloan(void *v, int npages, int flags)
{

	if ((flags & UVM_LOAN_TOANON) == 0) {
		uvm_unloanpage(v, npages);
		return;
	}
	uvm_unloananon(v, npages);
}
/*
 * ulz_put: "put" method of the minimal pager for uvm_loanzero_object.
 *
 * The zero page can end up on a paging queue, and the page daemon will
 * want to call pgo_put when it encounters the page on the inactive
 * list, so a method has to exist.
 *
 * => called with the object locked; the lock is released before return
 * => never frees the zero page and always returns 0
 */
static int
ulz_put(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{

	KDASSERT(uobj == &uvm_loanzero_object);

	/*
	 * only a request to free pages needs any work, and even then we
	 * don't actually want to ever free the uvm_loanzero_page: just
	 * reactivate it (still referenced by an anon) or take it off the
	 * paging queues.
	 */
	if ((flags & PGO_FREE) != 0) {
		struct vm_page *pg = uvm_pagelookup(uobj, 0);

		KASSERT(pg != NULL);
		uvm_pagelock(pg);
		if (pg->uanon == NULL) {
			uvm_pagedequeue(pg);
		} else {
			uvm_pageactivate(pg);
		}
		uvm_pageunlock(pg);
	}

	rw_exit(uobj->vmobjlock);
	return 0;
}
/*
 * Pager operations for uvm_loanzero_object.  pgo_put is the only
 * operation supplied; every other member is left zero-initialized.
 */
static const struct uvm_pagerops ulz_pager = {
.pgo_put = ulz_put,
};
/*
 * uvm_loan_init(): initialize the uvm_loan() facility.
 *
 * Sets up the lock and the object that hold the shared zero page used
 * by uvm_loanzero(), and the "loanhist" history.
 */
void
uvm_loan_init(void)
{
/* the lock must be initialized before it is attached to the object */
rw_init(&uvm_loanzero_lock);
/*
 * NOTE(review): the "false, 0" arguments presumably mean "do not
 * allocate a lock" and "no initial references" -- confirm against
 * uvm_obj_init().
 */
uvm_obj_init(&uvm_loanzero_object, &ulz_pager, false, 0);
/* use our statically allocated lock as the object's lock */
uvm_obj_setlock(&uvm_loanzero_object, &uvm_loanzero_lock);
/* NOTE(review): 300 is presumably the number of history entries -- confirm */
UVMHIST_INIT(loanhist, 300);
}
/*
* uvm_loanbreak: break loan on a uobj page
*
* => called with uobj locked
* => the page should be busy
* => return value:
* newly allocated page if succeeded
*/
struct vm_page *
uvm_loanbreak(struct vm_page *uobjpage)
{
struct vm_page *pg;
2017-03-20 02:44:34 +03:00
struct uvm_object *uobj __diagused = uobjpage->uobject;
KASSERT(uobj != NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT(uobjpage->flags & PG_BUSY);
/* alloc new un-owned page */
pg = uvm_pagealloc(NULL, 0, NULL, 0);
if (pg == NULL)
return NULL;
/*
* copy the data from the old page to the new
* one and clear the fake flags on the new page (keep it busy).
* force a reload of the old page by clearing it from all
* pmaps.
* then rename the pages.
*/
uvm_pagecopy(uobjpage, pg); /* old -> new */
pg->flags &= ~PG_FAKE;
KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY);
pmap_page_protect(uobjpage, VM_PROT_NONE);
/* uobj still locked */
uobjpage->flags &= ~PG_BUSY;
UVM_PAGE_OWN(uobjpage, NULL);
/*
* if the page is no longer referenced by
* an anon (i.e. we are breaking an O->K
* loan), then remove it from any pageq's.
*/
uvm_pagelock2(uobjpage, pg);
uvm_pagewakeup(uobjpage);
if (uobjpage->uanon == NULL)
uvm_pagedequeue(uobjpage);
/*
* replace uobjpage with new page.
*/
uvm_pagereplace(uobjpage, pg);
/*
* at this point we have absolutely no
* control over uobjpage
*/
uvm_pageactivate(pg);
uvm_pageunlock2(uobjpage, pg);
/*
* done! loan is broken and "pg" is
* PG_BUSY. it can now replace uobjpage.
*/
return pg;
}
/*
 * uvm_loanbreak_anon: break the loan on an anon's page by giving the
 * anon a private copy of it
 *
 * => anon must be write-locked and its page must have loan_count > 0
 * => uobj == NULL: the anon was the lender (A->K)
 *    uobj != NULL: the anon was the receiver of the loan; uobj must be
 *    write-locked and is unlocked here on success
 *    (NOTE(review): presumably uobj is the object owning the page --
 *    confirm against the callers)
 * => return value:
 *	0 on success, ENOMEM if no replacement page could be allocated
 *	(nothing is changed or unlocked in that case)
 */
int
uvm_loanbreak_anon(struct vm_anon *anon, struct uvm_object *uobj)
{
	struct vm_page *newpg, *oldpg;
	unsigned oldstatus;

	KASSERT(rw_write_held(anon->an_lock));
	KASSERT(uobj == NULL || rw_write_held(uobj->vmobjlock));
	KASSERT(anon->an_page->loan_count > 0);

	/* get new un-owned replacement page */
	newpg = uvm_pagealloc(NULL, 0, NULL, 0);
	if (newpg == NULL) {
		return ENOMEM;
	}

	oldpg = anon->an_page;
	/* copy old -> new */
	uvm_pagecopy(oldpg, newpg);
	KASSERT(uvm_pagegetdirty(newpg) == UVM_PAGE_STATUS_DIRTY);

	/* force reload */
	pmap_page_protect(oldpg, VM_PROT_NONE);
	/* remembered for the statistics update at the end */
	oldstatus = uvm_pagegetdirty(anon->an_page);

	uvm_pagelock2(oldpg, newpg);
	if (uobj == NULL) {
		/*
		 * we were the lender (A->K); need to remove the page from
		 * pageq's.
		 *
		 * the old page stops being anon-owned, so PG_ANON is
		 * cleared on it here.
		 */
		KASSERT((oldpg->flags & PG_ANON) != 0);
		oldpg->flags &= ~PG_ANON;
		uvm_pagedequeue(oldpg);
	}
	oldpg->uanon = NULL;

	if (uobj) {
		/*
		 * if we were receiver of loan, the page was never owned by
		 * the anon.  PG_ANON is a "flags" bit (not a "pqflags"
		 * bit), so that is where it has to be checked.
		 */
		KASSERT((oldpg->flags & PG_ANON) == 0);
		oldpg->loan_count--;
	}

	/* install new page in anon */
	anon->an_page = newpg;
	newpg->uanon = anon;
	newpg->flags |= PG_ANON;

	uvm_pageactivate(newpg);
	uvm_pageunlock2(oldpg, newpg);

	newpg->flags &= ~(PG_BUSY|PG_FAKE);
	UVM_PAGE_OWN(newpg, NULL);

	if (uobj) {
		rw_exit(uobj->vmobjlock);
	}

	/* done!  account for the new (dirty) anon page */
	kpreempt_disable();
	if (uobj != NULL) {
		CPU_COUNT(CPU_COUNT_ANONPAGES, 1);
	} else {
		CPU_COUNT(CPU_COUNT_ANONUNKNOWN + oldstatus, -1);
	}
	CPU_COUNT(CPU_COUNT_ANONDIRTY, 1);
	kpreempt_enable();
	return 0;
}