NetBSD/sys/vm/vm_swap.c

1304 lines
30 KiB
C

/* $NetBSD: vm_swap.c,v 1.46 1997/10/14 08:50:18 pk Exp $ */
/*
* Copyright (c) 1995, 1996, 1997 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/dmap.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/vnode.h>
#include <sys/map.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/extent.h>
#include <vm/vm_swap.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <machine/vmparam.h>
#include <vm/vm_conf.h>
#include <miscfs/specfs/specdev.h>
/*
* The idea here is to provide a single interface for multiple swap devices,
* of any kind and priority in a simple and fast way.
*
* Each swap device has these properties:
* * swap in use.
* * swap enabled.
* * map information in `/dev/drum'.
* * vnode pointer.
* Files have these additional properties:
* * block size.
* * maximum byte count in buffer.
* * buffer.
* * credentials.
*
* The arguments to swapctl(2) are:
* int cmd;
* void *arg;
* int misc;
* The cmd can be one of:
* SWAP_NSWAP - swapctl(2) returns the number of swap devices currently in
* use.
* SWAP_STATS - swapctl(2) takes a struct ent * in (void *arg) and writes
* misc or fewer (to zero) entries of configured swap devices,
* and returns the number of entries written or -1 on error.
* SWAP_ON - swapctl(2) takes a (char *) in arg to be the pathname of a
* device or file to begin swapping on, with it's priority in
* misc, returning 0 on success and -1 on error.
* SWAP_OFF - swapctl(2) takes a (char *) n arg to be the pathname of a
* device or file to stop swapping on. returning 0 or -1.
* XXX unwritten.
* SWAP_CTL - swapctl(2) changes the priority of a swap device, using the
* misc value.
*/
#ifdef SWAPDEBUG
#define VMSDB_SWON 0x0001
#define VMSDB_SWOFF 0x0002
#define VMSDB_SWINIT 0x0004
#define VMSDB_SWALLOC 0x0008
#define VMSDB_SWFLOW 0x0010
#define VMSDB_INFO 0x0020
int vmswapdebug = 0;
#endif
#define SWAP_TO_FILES
struct swapdev {
struct swapent swd_se;
#define swd_dev swd_se.se_dev
#define swd_flags swd_se.se_flags
#define swd_nblks swd_se.se_nblks
#define swd_inuse swd_se.se_inuse
#define swd_priority swd_se.se_priority
daddr_t swd_mapoffset;
int swd_mapsize;
struct extent *swd_ex;
struct vnode *swd_vp;
CIRCLEQ_ENTRY(swapdev) swd_next;
#ifdef SWAP_TO_FILES
int swd_bsize;
int swd_maxactive;
struct buf swd_tab;
struct ucred *swd_cred;
#endif
};
/*
* Swap device priority entry; the list is kept sorted on `spi_priority'.
*/
struct swappri {
int spi_priority;
CIRCLEQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
LIST_ENTRY(swappri) spi_swappri;
};
/*
* Pool resource management helpers for vndxfer & vndbuf below.
*/
struct pool_item {
struct pool_item *pi_next;
};
struct pool {
struct pool_item *pr_freelist; /* Free items in pool */
int pr_size; /* Size of item */
int pr_freecount; /* # of free items */
int pr_hiwat; /* max # of pooled items */
char *pr_wchan; /* tsleep(9) identifier */
int pr_flags;
#define PR_WANTED 1
struct simplelock pr_lock;
};
static void *get_pooled_resource __P((struct pool *));
static void put_pooled_resource __P((struct pool *, void *));
static int prime_pooled_resource __P((struct pool *, int));
/* Grab an item from the pool; must be called at splbio */
static __inline void *
get_pooled_resource(pp)
struct pool *pp;
{
void *v;
struct pool_item *pi;
again:
simple_lock(&pp->pr_lock);
if (pp->pr_freelist == NULL) {
/* if (pp->pr_flags & PR_MALLOC) */
v = (void *)malloc(pp->pr_size, M_DEVBUF, M_NOWAIT);
if (v == NULL) {
pp->pr_flags |= PR_WANTED;
simple_unlock(&pp->pr_lock);
tsleep((caddr_t)pp, PSWP, pp->pr_wchan, 0);
goto again;
}
} else {
v = pi = pp->pr_freelist;
pp->pr_freelist = pi->pi_next;
pp->pr_freecount--;
}
simple_unlock(&pp->pr_lock);
return (v);
}
/* Return resource to the pool; must be called at splbio */
static __inline void
put_pooled_resource(pp, v)
struct pool *pp;
void *v;
{
struct pool_item *pi = v;
simple_lock(&pp->pr_lock);
if ((pp->pr_flags & PR_WANTED) || pp->pr_freecount < pp->pr_hiwat) {
/* Return to pool */
pi->pi_next = pp->pr_freelist;
pp->pr_freelist = pi;
pp->pr_freecount++;
if (pp->pr_flags & PR_WANTED) {
pp->pr_flags &= ~PR_WANTED;
wakeup((caddr_t)pp);
}
} else {
/* Return to system */
free(v, M_DEVBUF);
/*
* Return any excess items allocated during periods of
* contention.
*/
while (pp->pr_freecount > pp->pr_hiwat) {
pi = pp->pr_freelist;
pp->pr_freelist = pi->pi_next;
pp->pr_freecount--;
free(pi, M_DEVBUF);
}
}
simple_unlock(&pp->pr_lock);
}
/* Add N items to the pool */
static int
prime_pooled_resource(pp, n)
struct pool *pp;
int n;
{
struct pool_item *pi;
simple_lock(&pp->pr_lock);
pp->pr_hiwat += n;
while (n--) {
pi = malloc(pp->pr_size, M_DEVBUF, M_NOWAIT);
if (pi == NULL) {
simple_unlock(&pp->pr_lock);
return (ENOMEM);
}
pi->pi_next = pp->pr_freelist;
pp->pr_freelist = pi;
pp->pr_freecount++;
}
simple_unlock(&pp->pr_lock);
return (0);
}
/*
* The following two structures are used to keep track of data transfers
* on swap devices associated with regular files.
* NOTE: this code is more or less a copy of vnd.c; we use the same
* structure names here to ease porting..
*/
struct vndxfer {
struct pool_item vx_pool; /* MUST be first */
struct buf *vx_bp; /* Pointer to parent buffer */
struct swapdev *vx_sdp;
int vx_error;
int vx_pending; /* # of pending aux buffers */
};
struct vndbuf {
struct pool_item vb_pool; /* MUST be first */
struct buf vb_buf;
struct vndxfer *vb_xfer;
};
/* To get from a buffer to the encapsulating vndbuf */
#define BUF_TO_VNDBUF(bp) \
((struct vndbuf *)((long)bp - ((long)&((struct vndbuf *)0)->vb_buf)))
/*
* We keep a pool vndbuf's and vndxfer structures.
*/
struct pool vndxfer_head = { NULL, sizeof(struct vndxfer), 0, 0, "sw vnx", 0 };
struct pool vndbuf_head = { NULL, sizeof(struct vndbuf), 0, 0, "sw vnd", 0 };
#define getvndxfer(vnx) do { \
int s = splbio(); \
(vnx) = (struct vndxfer *)get_pooled_resource(&vndxfer_head); \
splx(s); \
} while (0)
#define putvndxfer(vnx) \
put_pooled_resource(&vndxfer_head, (void *)(vnx));
#define getvndbuf(vbp) do { \
int s = splbio(); \
(vbp) = (struct vndbuf *)get_pooled_resource(&vndbuf_head); \
splx(s); \
} while (0)
#define putvndbuf(vbp) \
put_pooled_resource(&vndbuf_head, (void *)(vbp));
int nswapdev;
int swflags;
struct extent *swapmap;
LIST_HEAD(swap_priority, swappri) swap_priority;
static int swap_on __P((struct proc *, struct swapdev *));
#ifdef SWAP_OFF_WORKS
static int swap_off __P((struct proc *, struct swapdev *));
#endif
static struct swapdev *swap_getsdpfromaddr __P((daddr_t));
static void swap_addmap __P((struct swapdev *, int));
#ifdef SWAP_TO_FILES
static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));
#endif
static void insert_swapdev __P((struct swapdev *, int));
static struct swapdev *find_swapdev __P((struct vnode *, int));
static void swaplist_trim __P((void));
/* XXX - Replace with general locking device when available */
static void _swaplist_lock __P((void));
static void _swaplist_unlock __P((void));
int swaplock = 0;
#define SWP_LOCKED 1
#define SWP_WANT 2
static __inline void
_swaplist_lock()
{
if (swaplock & SWP_LOCKED) {
swaplock |= SWP_WANT;
tsleep((caddr_t)&swaplock, PSWP, "swaplock", 0);
}
swaplock |= SWP_LOCKED;
}
static __inline void
_swaplist_unlock()
{
swaplock &= ~SWP_LOCKED;
if (swaplock & SWP_WANT) {
swaplock &= ~SWP_WANT;
wakeup((caddr_t)&swaplock);
}
}
/*
* Insert a swap device on the priority list.
*/
void
insert_swapdev(sdp, priority)
struct swapdev *sdp;
int priority;
{
struct swappri *spp, *pspp;
again:
_swaplist_lock();
/*
* Find entry at or after which to insert the new device.
*/
for (pspp = NULL, spp = swap_priority.lh_first; spp != NULL;
spp = spp->spi_swappri.le_next) {
if (priority <= spp->spi_priority)
break;
pspp = spp;
}
if (spp == NULL || spp->spi_priority != priority) {
spp = (struct swappri *)
malloc(sizeof *spp, M_VMSWAP, M_NOWAIT);
if (spp == NULL) {
_swaplist_unlock();
tsleep((caddr_t)&lbolt, PSWP, "memory", 0);
goto again;
}
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("sw: had to create a new swappri = %d\n",
priority);
#endif /* SWAPDEBUG */
spp->spi_priority = priority;
CIRCLEQ_INIT(&spp->spi_swapdev);
if (pspp)
LIST_INSERT_AFTER(pspp, spp, spi_swappri);
else
LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
}
/* Onto priority list */
CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
sdp->swd_priority = priority;
_swaplist_unlock();
}
/*
* Find and optionally remove a swap device from the priority list.
*/
struct swapdev *
find_swapdev(vp, remove)
struct vnode *vp;
int remove;
{
struct swapdev *sdp;
struct swappri *spp;
_swaplist_lock();
for (spp = swap_priority.lh_first; spp != NULL;
spp = spp->spi_swappri.le_next) {
for (sdp = spp->spi_swapdev.cqh_first;
sdp != (void *)&spp->spi_swapdev;
sdp = sdp->swd_next.cqe_next)
if (sdp->swd_vp == vp) {
if (remove)
CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp,
swd_next);
_swaplist_unlock();
return(sdp);
}
}
_swaplist_unlock();
return (NULL);
}
/*
* Scan priority list for empty priority entries.
*/
void
swaplist_trim()
{
struct swappri *spp;
_swaplist_lock();
restart:
for (spp = swap_priority.lh_first; spp != NULL;
spp = spp->spi_swappri.le_next) {
if (spp->spi_swapdev.cqh_first != (void *)&spp->spi_swapdev)
continue;
LIST_REMOVE(spp, spi_swappri);
free((caddr_t)spp, M_VMSWAP);
goto restart;
}
_swaplist_unlock();
}
int
sys_swapctl(p, v, retval)
struct proc *p;
void *v;
register_t *retval;
{
struct sys_swapctl_args /* {
syscallarg(int) cmd;
syscallarg(void *) arg;
syscallarg(int) misc;
} */ *uap = (struct sys_swapctl_args *)v;
struct vnode *vp;
struct nameidata nd;
struct swappri *spp;
struct swapdev *sdp;
struct swapent *sep;
int count, error, misc;
int priority;
misc = SCARG(uap, misc);
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("entering sys_swapctl\n");
#endif /* SWAPDEBUG */
/* how many swap devices */
if (SCARG(uap, cmd) == SWAP_NSWAP) {
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("did SWAP_NSWAP: leaving sys_swapctl\n");
#endif /* SWAPDEBUG */
*retval = nswapdev;
return (0);
}
/* stats on the swap devices. */
if (SCARG(uap, cmd) == SWAP_STATS) {
sep = (struct swapent *)SCARG(uap, arg);
count = 0;
for (spp = swap_priority.lh_first; spp != NULL;
spp = spp->spi_swappri.le_next) {
for (sdp = spp->spi_swapdev.cqh_first;
sdp != (void *)&spp->spi_swapdev && misc-- > 0;
sdp = sdp->swd_next.cqe_next) {
error = copyout((caddr_t)&sdp->swd_se,
(caddr_t)sep, sizeof(struct swapent));
if (error)
return (error);
count++;
sep++;
}
}
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("sw: did SWAP_STATS: leaving sys_swapctl\n");
#endif /* SWAPDEBUG */
*retval = count;
return (0);
}
if ((error = suser(p->p_ucred, &p->p_acflag)))
return (error);
if (SCARG(uap, arg) == NULL) {
/* XXX - interface - arg==NULL: miniroot */
vp = rootvp;
if (vget(vp, 1))
return (EBUSY);
} else {
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, UIO_USERSPACE,
SCARG(uap, arg), p);
if ((error = namei(&nd)))
return (error);
vp = nd.ni_vp;
}
switch(SCARG(uap, cmd)) {
case SWAP_CTL:
priority = SCARG(uap, misc);
if ((sdp = find_swapdev(vp, 1)) == NULL) {
error = ENOENT;
break;
}
insert_swapdev(sdp, priority);
swaplist_trim();
break;
case SWAP_ON:
priority = SCARG(uap, misc);
/* Check for duplicates */
if ((sdp = find_swapdev(vp, 0)) != NULL) {
error = EBUSY;
goto bad;
}
sdp = (struct swapdev *)
malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
sdp->swd_inuse = sdp->swd_flags = 0;
sdp->swd_vp = vp;
sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
if ((error = swap_on(p, sdp)) != 0) {
free((caddr_t)sdp, M_VMSWAP);
break;
}
#ifdef SWAP_TO_FILES
/*
* XXX Is NFS elaboration necessary?
*/
if (vp->v_type == VREG)
sdp->swd_cred = crdup(p->p_ucred);
#endif
insert_swapdev(sdp, priority);
/* Keep reference to vnode */
vref(vp);
break;
case SWAP_OFF:
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("doing SWAP_OFF...\n");
#endif /* SWAPDEBUG */
#ifdef SWAP_OFF_WORKS
if ((sdp = find_swapdev(vp, 0)) == NULL) {
error = ENXIO;
break;
}
/*
* If a device isn't in use or enabled, we
* can't stop swapping from it (again).
*/
if ((sdp->swd_flags &
(SWF_INUSE|SWF_ENABLE)) == 0) {
error = EBUSY;
goto bad;
}
if ((error = swap_off(p, sdp)) != 0)
goto bad;
/* Find again and remove this time */
if ((sdp = find_swapdev(vp, 1)) == NULL) {
error = ENXIO;
break;
}
free((caddr_t)sdp, M_VMSWAP);
#else
error = ENODEV;
#endif
break;
default:
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("doing default...\n");
#endif /* SWAPDEBUG */
error = EINVAL;
}
bad:
vput(vp);
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("leaving sys_swapctl: error %d\n", error);
#endif /* SWAPDEBUG */
return (error);
}
/*
* swap_on() attempts to begin swapping on a swapdev. we check that this
* device is OK to swap from, miss the start of any disk (to avoid any
* disk labels that may exist).
*/
static int
swap_on(p, sdp)
struct proc *p;
struct swapdev *sdp;
{
static int count = 0;
struct vnode *vp = sdp->swd_vp;
int error, nblks, size;
long addr;
char *storage;
int storagesize;
#ifdef SWAP_TO_FILES
struct vattr va;
#endif
#ifdef NFS
extern int (**nfsv2_vnodeop_p) __P((void *));
#endif /* NFS */
dev_t dev = sdp->swd_dev;
char *name;
/* If root on swap, then the skip open/close operations. */
if (vp != rootvp) {
if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
return (error);
}
#ifdef SWAPDEBUG /* this wants only for block devices */
if (vmswapdebug & VMSDB_INFO)
printf("swap_on: dev = %d, major(dev) = %d\n", dev, major(dev));
#endif /* SWAPDEBUG */
switch (vp->v_type) {
case VBLK:
if (bdevsw[major(dev)].d_psize == 0 ||
(nblks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
error = ENXIO;
goto bad;
}
break;
#ifdef SWAP_TO_FILES
case VREG:
if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
goto bad;
nblks = (int)btodb(va.va_size);
if ((error =
VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
goto bad;
sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
#ifdef NFS
if (vp->v_op == nfsv2_vnodeop_p)
sdp->swd_maxactive = 2; /* XXX */
else
#endif /* NFS */
sdp->swd_maxactive = 8; /* XXX */
break;
#endif
default:
error = ENXIO;
goto bad;
}
if (nblks == 0) {
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("swap_on: nblks == 0\n");
#endif /* SWAPDEBUG */
error = EINVAL;
goto bad;
}
sdp->swd_flags |= SWF_INUSE;
sdp->swd_nblks = nblks;
/*
* skip over first cluster of a device in case of labels or
* boot blocks.
*/
if (vp->v_type == VBLK) {
size = (int)(nblks - ctod(CLSIZE));
addr = (long)ctod(CLSIZE);
} else {
size = (int)nblks;
addr = (long)0;
}
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWON)
printf("swap_on: dev %x: size %d, addr %ld\n", dev, size, addr);
#endif /* SWAPDEBUG */
name = malloc(12, M_VMSWAP, M_WAITOK);
sprintf(name, "swap0x%04x", count++);
/* XXX make this based on ram as well. */
storagesize = EXTENT_FIXED_STORAGE_SIZE(maxproc * 2);
storage = malloc(storagesize, M_VMSWAP, M_WAITOK);
sdp->swd_ex = extent_create(name, 0, nblks, M_VMSWAP,
storage, storagesize, EX_WAITOK);
if (addr) {
if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
panic("disklabel region");
sdp->swd_inuse += addr;
}
if (vp == rootvp) {
struct mount *mp;
struct statfs *sp;
int rootblks;
/* Get size from root FS (mountroot did statfs) */
mp = rootvnode->v_mount;
sp = &mp->mnt_stat;
rootblks = sp->f_blocks * (sp->f_bsize / DEV_BSIZE);
if (rootblks > nblks)
panic("miniroot size");
if (extent_alloc_region(sdp->swd_ex, addr, rootblks, EX_WAITOK))
panic("miniroot region");
printf("Preserved %d blocks of miniroot leaving %d pages of swap
\n",
rootblks, dtoc(size - rootblks));
}
if (vp->v_type == VREG) {
/* Allocate additional vnx and vnd buffers */
int n, s;
/*
* Allocation Policy:
* (8 * swd_maxactive) vnx headers per swap dev
* (16 * swd_maxactive) vnd buffers per swap dev
*/
s = splbio();
n = 8 * sdp->swd_maxactive;
(void)prime_pooled_resource(&vndxfer_head, n);
n = 16 * sdp->swd_maxactive;
(void)prime_pooled_resource(&vndbuf_head, n);
splx(s);
}
swap_addmap(sdp, size);
nswapdev++;
sdp->swd_flags |= SWF_ENABLE;
return (0);
bad:
if (vp != rootvp)
(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
return (error);
}
#ifdef SWAP_OFF_WORKS
static int
swap_off(p, sdp)
struct proc *p;
struct swapdev *sdp;
{
char *name;
/* turn off the enable flag */
sdp->swd_flags &= ~SWF_ENABLE;
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWOFF)
printf("swap_off: %x\n", sdp->swd_dev);
#endif /* SWAPDEBUG */
/*
* XXX write me
*
* the idea is to find out which processes are using this swap
* device, and page them all in.
*
* eventually, we should try to move them out to other swap areas
* if available.
*
* The alternative is to create a redirection map for this swap
* device. This should work by moving all the pages of data from
* the ex-swap device to another one, and making an entry in the
* redirection map for it. locking is going to be important for
* this!
*/
/* until the above code is written, we must ENODEV */
return ENODEV;
extent_free(swapmap, sdp->swd_mapoffset, sdp->swd_mapsize, EX_WAITOK);
nswapdev--;
name = sdp->swd_ex->ex_name;
extent_destroy(sdp->swd_ex);
free(name, M_VMSWAP);
free((caddr_t)sdp->swd_ex, M_VMSWAP);
if (sdp->swp_vp != rootvp)
(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
if (sdp->swd_vp)
vrele(sdp->swd_vp);
free((caddr_t)sdp, M_VMSWAP);
return (0);
}
#endif
/*
* to decide where to allocate what part of swap, we must "round robin"
* the swap devices in swap_priority of the same priority until they are
* full. we do this with a list of swap priorities that have circle
* queues of swapdevs.
*
* the following functions control allocation and freeing of part of the
* swap area. you call swap_alloc() with a size and it returns an address.
* later you call swap_free() and it frees the use of that swap area.
*
* daddr_t swap_alloc(int size);
* void swap_free(int size, daddr_t addr);
*/
daddr_t
swap_alloc(size)
int size;
{
struct swapdev *sdp;
struct swappri *spp;
u_long result;
if (nswapdev < 1)
return 0;
_swaplist_lock();
for (spp = swap_priority.lh_first; spp != NULL;
spp = spp->spi_swappri.le_next) {
for (sdp = spp->spi_swapdev.cqh_first;
sdp != (void *)&spp->spi_swapdev;
sdp = sdp->swd_next.cqe_next) {
/* if it's not enabled, then we can't swap from it */
if ((sdp->swd_flags & SWF_ENABLE) == 0 ||
/* XXX IS THIS CORRECT ? */
#if 1
(sdp->swd_inuse + size > sdp->swd_nblks) ||
#endif
extent_alloc(sdp->swd_ex, size, EX_NOALIGN,
EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
&result) != 0) {
continue;
}
CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
sdp->swd_inuse += size;
_swaplist_unlock();
return (daddr_t)(result + sdp->swd_mapoffset);
}
}
_swaplist_unlock();
return 0;
}
void
swap_free(size, addr)
int size;
daddr_t addr;
{
struct swapdev *sdp = swap_getsdpfromaddr(addr);
#ifdef DIAGNOSTIC
if (sdp == NULL)
panic("swap_free: unmapped address\n");
if (nswapdev < 1)
panic("swap_free: nswapdev < 1\n");
#endif
extent_free(sdp->swd_ex, addr - sdp->swd_mapoffset, size,
EX_MALLOCOK|EX_NOWAIT);
sdp->swd_inuse -= size;
#ifdef DIAGNOSTIC
if (sdp->swd_inuse < 0)
panic("swap_free: inuse < 0");
#endif
}
/*
* We have a physical -> virtual mapping to address here. There are several
* different physical address spaces (one for each swap partition) that are
* to be mapped onto a single virtual address space.
*/
#define ADDR_IN_MAP(addr, sdp) \
(((addr) >= (sdp)->swd_mapoffset) && \
((addr) < ((sdp)->swd_mapoffset + (sdp)->swd_mapsize)))
struct swapdev *
swap_getsdpfromaddr(addr)
daddr_t addr;
{
struct swapdev *sdp;
struct swappri *spp;
_swaplist_lock();
for (spp = swap_priority.lh_first; spp != NULL;
spp = spp->spi_swappri.le_next)
for (sdp = spp->spi_swapdev.cqh_first;
sdp != (void *)&spp->spi_swapdev;
sdp = sdp->swd_next.cqe_next)
if (ADDR_IN_MAP(addr, sdp)) {
_swaplist_unlock();
return sdp;
}
_swaplist_unlock();
return NULL;
}
void
swap_addmap(sdp, size)
struct swapdev *sdp;
int size;
{
u_long result;
if (extent_alloc(swapmap, size, EX_NOALIGN, EX_NOBOUNDARY,
EX_WAITOK, &result))
panic("swap_addmap");
sdp->swd_mapoffset = result;
sdp->swd_mapsize = size;
}
/*ARGSUSED*/
int
swread(dev, uio, ioflag)
dev_t dev;
struct uio *uio;
int ioflag;
{
return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}
/*ARGSUSED*/
int
swwrite(dev, uio, ioflag)
dev_t dev;
struct uio *uio;
int ioflag;
{
return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}
void
swstrategy(bp)
struct buf *bp;
{
struct swapdev *sdp;
struct vnode *vp;
int bn;
bn = bp->b_blkno;
sdp = swap_getsdpfromaddr(bn);
if (sdp == NULL) {
bp->b_error = EINVAL;
bp->b_flags |= B_ERROR;
biodone(bp);
return;
}
bn -= sdp->swd_mapoffset;
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("swstrategy(%s): mapoff %x, bn %x, bcount %ld\n",
((bp->b_flags & B_READ) == 0) ? "write" : "read",
sdp->swd_mapoffset, bn, bp->b_bcount);
#endif
switch (sdp->swd_vp->v_type) {
default:
panic("swstrategy: vnode type %x", sdp->swd_vp->v_type);
case VBLK:
bp->b_blkno = bn;
vp = sdp->swd_vp;
bp->b_dev = sdp->swd_dev;
VHOLD(vp);
if ((bp->b_flags & B_READ) == 0) {
int s = splbio();
vwakeup(bp);
vp->v_numoutput++;
splx(s);
}
if (bp->b_vp != NULL)
brelvp(bp);
bp->b_vp = vp;
VOP_STRATEGY(bp);
return;
#ifdef SWAP_TO_FILES
case VREG:
sw_reg_strategy(sdp, bp, bn);
return;
#endif
}
/* NOTREACHED */
}
#ifdef SWAP_TO_FILES
static void
sw_reg_strategy(sdp, bp, bn)
struct swapdev *sdp;
struct buf *bp;
int bn;
{
struct vnode *vp;
struct vndbuf *nbp;
struct vndxfer *vnx;
daddr_t nbn;
caddr_t addr;
int s, off, nra, error, sz, resid;
/*
* Translate the device logical block numbers into physical
* block numbers of the underlying filesystem device.
*/
bp->b_resid = bp->b_bcount;
addr = bp->b_data;
bn = dbtob(bn);
/* Allocate a header for this transfer and link it to the buffer */
getvndxfer(vnx);
vnx->vx_error = 0;
vnx->vx_pending = 0;
vnx->vx_bp = bp;
vnx->vx_sdp = sdp;
for (resid = bp->b_resid; resid; resid -= sz) {
nra = 0;
error = VOP_BMAP(sdp->swd_vp, bn / sdp->swd_bsize,
&vp, &nbn, &nra);
if (error == 0 && (long)nbn == -1)
error = EIO;
/*
* If there was an error or a hole in the file...punt.
* Note that we may have to wait for any operations
* that we have already fired off before releasing
* the buffer.
*
* XXX we could deal with holes here but it would be
* a hassle (in the write case).
*/
if (error) {
vnx->vx_error = error;
s = splbio();
if (vnx->vx_pending == 0) {
bp->b_error = error;
bp->b_flags |= B_ERROR;
putvndxfer(vnx);
biodone(bp);
}
splx(s);
return;
}
if ((off = bn % sdp->swd_bsize) != 0)
sz = sdp->swd_bsize - off;
else
sz = (1 + nra) * sdp->swd_bsize;
if (resid < sz)
sz = resid;
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("sw_reg_strategy: vp %p/%p bn 0x%x/0x%x"
" sz 0x%x\n", sdp->swd_vp, vp, bn, nbn, sz);
#endif /* SWAPDEBUG */
getvndbuf(nbp);
nbp->vb_buf.b_flags = bp->b_flags | B_CALL;
nbp->vb_buf.b_bcount = sz;
nbp->vb_buf.b_bufsize = bp->b_bufsize;
nbp->vb_buf.b_error = 0;
nbp->vb_buf.b_dev = vp->v_type == VREG
? NODEV : vp->v_rdev;
nbp->vb_buf.b_data = addr;
nbp->vb_buf.b_blkno = nbn + btodb(off);
nbp->vb_buf.b_proc = bp->b_proc;
nbp->vb_buf.b_iodone = sw_reg_iodone;
nbp->vb_buf.b_vp = NULLVP;
nbp->vb_buf.b_rcred = sdp->swd_cred;
nbp->vb_buf.b_wcred = sdp->swd_cred;
if (bp->b_dirtyend == 0) {
nbp->vb_buf.b_dirtyoff = 0;
nbp->vb_buf.b_dirtyend = sz;
} else {
nbp->vb_buf.b_dirtyoff =
max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
nbp->vb_buf.b_dirtyend =
min(sz,
max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
}
if (bp->b_validend == 0) {
nbp->vb_buf.b_validoff = 0;
nbp->vb_buf.b_validend = sz;
} else {
nbp->vb_buf.b_validoff =
max(0, bp->b_validoff - (bp->b_bcount-resid));
nbp->vb_buf.b_validend =
min(sz,
max(0, bp->b_validend - (bp->b_bcount-resid)));
}
nbp->vb_xfer = vnx;
/*
* Just sort by block number
*/
nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
s = splbio();
vnx->vx_pending++;
bgetvp(vp, &nbp->vb_buf);
disksort(&sdp->swd_tab, &nbp->vb_buf);
if (sdp->swd_tab.b_active < sdp->swd_maxactive) {
sdp->swd_tab.b_active++;
sw_reg_start(sdp);
}
splx(s);
bn += sz;
addr += sz;
}
}
/*
* Feed requests sequentially.
* We do it this way to keep from flooding NFS servers if we are connected
* to an NFS file. This places the burden on the client rather than the
* server.
*/
static void
sw_reg_start(sdp)
struct swapdev *sdp;
{
struct buf *bp;
bp = sdp->swd_tab.b_actf;
sdp->swd_tab.b_actf = bp->b_actf;
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("sw_reg_start: bp %p vp %p blkno %x addr %p cnt %lx\n",
bp, bp->b_vp, bp->b_blkno,bp->b_data, bp->b_bcount);
#endif
if ((bp->b_flags & B_READ) == 0)
bp->b_vp->v_numoutput++;
VOP_STRATEGY(bp);
}
static void
sw_reg_iodone(bp)
struct buf *bp;
{
register struct vndbuf *vbp = BUF_TO_VNDBUF(bp);
register struct vndxfer *vnx = (struct vndxfer *)vbp->vb_xfer;
register struct buf *pbp = vnx->vx_bp;
struct swapdev *sdp = vnx->vx_sdp;
int s, resid;
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("sw_reg_iodone: vbp %p vp %p blkno %x addr %p "
"cnt %lx(%lx)\n",
vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
vbp->vb_buf.b_data, vbp->vb_buf.b_bcount,
vbp->vb_buf.b_resid);
#endif /* SWAPDEBUG */
s = splbio();
resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
pbp->b_resid -= resid;
vnx->vx_pending--;
if (vbp->vb_buf.b_error) {
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("sw_reg_iodone: vbp %p error %d\n", vbp,
vbp->vb_buf.b_error);
#endif /* SWAPDEBUG */
vnx->vx_error = vbp->vb_buf.b_error;
}
if (vbp->vb_buf.b_vp != NULLVP)
brelvp(&vbp->vb_buf);
putvndbuf(vbp);
/*
* Wrap up this transaction if it has run to completion or, in
* case of an error, when all auxiliary buffers have returned.
*/
if (pbp->b_resid == 0 || (vnx->vx_error && vnx->vx_pending == 0)) {
if (vnx->vx_error != 0) {
pbp->b_flags |= B_ERROR;
pbp->b_error = vnx->vx_error;
}
putvndxfer(vnx);
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWFLOW)
printf("swiodone: pbp %p iodone\n", pbp);
#endif
biodone(pbp);
}
if (sdp->swd_tab.b_actf)
sw_reg_start(sdp);
else
sdp->swd_tab.b_active--;
splx(s);
}
#endif /* SWAP_TO_FILES */
void
swapinit()
{
struct buf *sp = swbuf;
struct proc *p = &proc0; /* XXX */
int i;
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWINIT)
printf("swapinit\n");
#endif
nswapdev = 0;
if (bdevvp(swapdev, &swapdev_vp))
panic("swapinit: can setup swapdev_vp");
LIST_INIT(&swap_priority);
/*
* Create swap block resource map. The range [1..INT_MAX] allows
* for a grand total of 2 gigablocks of swap resource.
* (start at 1 because "block #0" will be interpreted as
* an allocation failure).
*/
swapmap = extent_create("swapmap", 1, INT_MAX,
M_VMSWAP, 0, 0, EX_WAITOK);
if (swapmap == 0)
panic("swapinit: extent_create failed");
/*
* Now set up swap buffer headers.
*/
bswlist.b_actf = sp;
for (i = 0; i < nswbuf - 1; i++, sp++) {
sp->b_actf = sp + 1;
sp->b_rcred = sp->b_wcred = p->p_ucred;
sp->b_vnbufs.le_next = NOLIST;
}
sp->b_rcred = sp->b_wcred = p->p_ucred;
sp->b_vnbufs.le_next = NOLIST;
sp->b_actf = NULL;
#ifdef SWAPDEBUG
if (vmswapdebug & VMSDB_SWINIT)
printf("leaving swapinit\n");
#endif
}