physio: make requests with large buffers faster,
by queueing more i/o simultaneously.
This commit is contained in:
parent
8d3549ebb0
commit
428a59477a
|
@ -1,4 +1,4 @@
|
|||
# $NetBSD: files,v 1.738 2005/10/21 04:07:48 dyoung Exp $
|
||||
# $NetBSD: files,v 1.739 2005/10/29 11:23:19 yamt Exp $
|
||||
|
||||
# @(#)files.newconf 7.5 (Berkeley) 5/10/93
|
||||
|
||||
|
@ -1264,7 +1264,9 @@ file kern/subr_pool.c
|
|||
file kern/subr_prf.c
|
||||
file kern/subr_prof.c
|
||||
file kern/subr_prop.c
|
||||
file kern/subr_once.c
|
||||
file kern/subr_userconf.c userconf
|
||||
file kern/subr_workqueue.c
|
||||
file kern/subr_xxx.c
|
||||
file kern/sys_generic.c
|
||||
file kern/sys_pipe.c !pipe_socketpair
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $NetBSD: kern_physio.c,v 1.61 2005/06/23 23:15:12 thorpej Exp $ */
|
||||
/* $NetBSD: kern_physio.c,v 1.62 2005/10/29 11:23:19 yamt Exp $ */
|
||||
|
||||
/*-
|
||||
* Copyright (c) 1982, 1986, 1990, 1993
|
||||
|
@ -71,16 +71,20 @@
|
|||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.61 2005/06/23 23:15:12 thorpej Exp $");
|
||||
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.62 2005/10/29 11:23:19 yamt Exp $");
|
||||
|
||||
#include <sys/param.h>
|
||||
#include <sys/systm.h>
|
||||
#include <sys/buf.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/proc.h>
|
||||
#include <sys/once.h>
|
||||
#include <sys/workqueue.h>
|
||||
|
||||
#include <uvm/uvm_extern.h>
|
||||
|
||||
ONCE_DECL(physio_initialized);
|
||||
struct workqueue *physio_workqueue;
|
||||
|
||||
/*
|
||||
* The routines implemented in this file are described in:
|
||||
* Leffler, et al.: The Design and Implementation of the 4.3BSD
|
||||
|
@ -106,8 +110,8 @@ getphysbuf(void)
|
|||
s = splbio();
|
||||
bp = pool_get(&bufpool, PR_WAITOK);
|
||||
splx(s);
|
||||
memset(bp, 0, sizeof(*bp));
|
||||
BUF_INIT(bp);
|
||||
bp->b_error = 0;
|
||||
return(bp);
|
||||
}
|
||||
|
||||
|
@ -126,6 +130,112 @@ putphysbuf(struct buf *bp)
|
|||
splx(s);
|
||||
}
|
||||
|
||||
/*
 * abuse these members of struct buf
 *
 * Existing struct buf members are reused under physio-specific names:
 *   b_running   - on the master buf, count of child transfers in flight;
 *                 physio() increments it before calling the strategy
 *                 routine and physio_done() decrements it.
 *   b_eomoffset - on the master buf, byte offset (dbtob(b_blkno) of the
 *                 child) recorded by physio_done() under DIAGNOSTIC when
 *                 a child transfer completed with b_bcount != b_bufsize.
 */
|
||||
#define b_running b_freelistindex
|
||||
#define b_eomoffset b_lblkno
|
||||
|
||||
/*
 * physio_done: workqueue handler for one completed child buffer.
 *
 * wk is the b_work member embedded in the child buf (asserted below);
 * dummy is the handler argument and is asserted to be NULL.
 *
 * Calls vunmapbuf() and uvm_vsunlock() for the transfer, folds the
 * child's status into the master buf (bp->b_private) while holding the
 * master's b_interlock, wakes a waiter on the master buf if B_WANTED is
 * set, and finally releases the child buf with putphysbuf().
 */
static void
|
||||
physio_done(struct work *wk, void *dummy)
|
||||
{
|
||||
struct buf *bp = (void *)wk;
|
||||
size_t todo = bp->b_bufsize;
|
||||
struct buf *mbp = bp->b_private;
|
||||

||||
KASSERT(&bp->b_work == wk);
|
||||
KASSERT(bp->b_bcount <= todo);
|
||||
KASSERT(bp->b_resid <= bp->b_bcount);
|
||||
KASSERT((bp->b_flags & B_PHYS) != 0);
|
||||
KASSERT(dummy == NULL);
|
||||

||||
vunmapbuf(bp, todo);
|
||||
uvm_vsunlock(bp->b_proc, bp->b_data, todo);
|
||||

||||
simple_lock(&mbp->b_interlock);
|
||||
/* the master buf already has B_ERROR set: skip the status checks */
if ((mbp->b_flags & B_ERROR) != 0) {
|
||||
goto done;
|
||||
}
|
||||
/* propagate the child's error, using EIO when b_error is unset */
if ((bp->b_flags & B_ERROR) != 0) {
|
||||
if (bp->b_error == 0) {
|
||||
mbp->b_error = EIO; /* XXX */
|
||||
} else {
|
||||
mbp->b_error = bp->b_error;
|
||||
}
|
||||
mbp->b_flags |= B_ERROR;
|
||||
goto done;
|
||||
}
|
||||
KASSERT(bp->b_resid == 0); /* XXX */
|
||||
/*
 * b_bcount differs from the size recorded in b_bufsize: mark the
 * master buf with B_ERROR but leave b_error at 0.
 */
if (bp->b_bcount != todo) {
|
||||
#if defined(DIAGNOSTIC)
|
||||
off_t eomoffset = dbtob(bp->b_blkno);
|
||||

||||
/*
 * NOTE(review): B_ERROR on mbp was already tested above with the
 * interlock held (goto done), so this condition looks unreachable
 * here -- confirm.
 */
if ((mbp->b_flags & B_ERROR) != 0 &&
|
||||
mbp->b_eomoffset != eomoffset) {
|
||||
panic("%s: eom mismatch", __func__);
|
||||
}
|
||||
mbp->b_eomoffset = eomoffset;
|
||||
#endif /* defined(DIAGNOSTIC) */
|
||||
mbp->b_flags |= B_ERROR;
|
||||
mbp->b_error = 0;
|
||||
goto done;
|
||||
}
|
||||
done:
|
||||
/* deduct (b_bcount - b_resid) of the child from the master's residual */
mbp->b_resid -= bp->b_bcount - bp->b_resid;
|
||||
mbp->b_running--;
|
||||
if ((mbp->b_flags & B_WANTED) != 0) {
|
||||
mbp->b_flags &= ~B_WANTED;
|
||||
wakeup(mbp);
|
||||
}
|
||||
simple_unlock(&mbp->b_interlock);
|
||||

||||
putphysbuf(bp);
|
||||
}
|
||||
|
||||
/*
 * physio_biodone: b_iodone callback that physio() installs on each
 * child buf (together with B_CALL).
 *
 * Sanity-checks the child against its master buf (bp->b_private) and
 * enqueues the child's b_work on physio_workqueue, which was created
 * with physio_done() as its handler.
 */
static void
|
||||
physio_biodone(struct buf *bp)
|
||||
{
|
||||
struct buf *mbp = bp->b_private;
|
||||
size_t todo = bp->b_bufsize;
|
||||

||||
KASSERT(mbp->b_running > 0);
|
||||
KASSERT(todo <= mbp->b_resid);
|
||||
KASSERT(bp->b_bcount <= todo);
|
||||
KASSERT(bp->b_resid <= bp->b_bcount);
|
||||

||||
workqueue_enqueue(physio_workqueue, &bp->b_work);
|
||||
}
|
||||
|
||||
/*
 * physio_wait: sleep until bp->b_running is no greater than n.
 *
 * The caller must hold bp->b_interlock (asserted); the lock is handed
 * to ltsleep() as its interlock argument.  B_WANTED is set before each
 * sleep so that physio_done() issues a wakeup on bp.  wchan is passed
 * through as ltsleep()'s third argument.
 *
 * Returns 0 once the condition holds, or the first non-zero value
 * returned by ltsleep().
 */
static int
|
||||
physio_wait(struct buf *bp, int n, const char *wchan)
|
||||
{
|
||||
int error = 0;
|
||||

||||
LOCK_ASSERT(simple_lock_held(&bp->b_interlock));
|
||||

||||
while (bp->b_running > n) {
|
||||
bp->b_flags |= B_WANTED;
|
||||
error = ltsleep(bp, PRIBIO + 1, wchan, 0, &bp->b_interlock);
|
||||
if (error) {
|
||||
break;
|
||||
}
|
||||
}
|
||||

||||
return error;
|
||||
}
|
||||
|
||||
/*
 * physio_init: create the "physiod" workqueue with physio_done() as
 * its handler and store it in physio_workqueue.
 *
 * Invoked through RUN_ONCE() from physio().  Panics if
 * workqueue_create() returns non-zero.
 */
static void
|
||||
physio_init(void)
|
||||
{
|
||||

||||
KASSERT(physio_workqueue == NULL);
|
||||

||||
if (workqueue_create(&physio_workqueue, "physiod",
|
||||
physio_done, NULL, PRIBIO, IPL_BIO, 0)) {
|
||||
panic("physiod create");
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Limit on child transfers in flight per physio() call: physio() calls
 * physio_wait(mbp, PHYSIO_CONCURRENCY - 1, ...) before issuing another.
 */
#define PHYSIO_CONCURRENCY 16 /* XXX tune */
|
||||
|
||||
/*
|
||||
* Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly
|
||||
* from the raw device to user buffers, and bypasses the buffer cache.
|
||||
|
@ -133,59 +243,69 @@ putphysbuf(struct buf *bp)
|
|||
* Comments in brackets are from Leffler, et al.'s pseudo-code implementation.
|
||||
*/
|
||||
int
|
||||
physio(void (*strategy)(struct buf *), struct buf *bp, dev_t dev, int flags,
|
||||
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
|
||||
void (*min_phys)(struct buf *), struct uio *uio)
|
||||
{
|
||||
struct iovec *iovp;
|
||||
struct lwp *l = curlwp;
|
||||
struct proc *p = l->l_proc;
|
||||
int error, done, i, nobuf, s;
|
||||
long todo;
|
||||
int i, s;
|
||||
int error = 0;
|
||||
int error2;
|
||||
size_t todo;
|
||||
struct buf *bp = NULL;
|
||||
struct buf *mbp;
|
||||
|
||||
RUN_ONCE(&physio_initialized, physio_init);
|
||||
|
||||
error = 0;
|
||||
flags &= B_READ | B_WRITE;
|
||||
|
||||
/* Make sure we have a buffer, creating one if necessary. */
|
||||
if ((nobuf = (bp == NULL)) != 0) {
|
||||
|
||||
bp = getphysbuf();
|
||||
/* bp was just malloc'd so can't already be busy */
|
||||
bp->b_flags |= B_BUSY;
|
||||
|
||||
} else {
|
||||
|
||||
if (obp != NULL) {
|
||||
/* [raise the processor priority level to splbio;] */
|
||||
s = splbio();
|
||||
|
||||
/* [while the buffer is marked busy] */
|
||||
while (bp->b_flags & B_BUSY) {
|
||||
while (obp->b_flags & B_BUSY) {
|
||||
/* [mark the buffer wanted] */
|
||||
bp->b_flags |= B_WANTED;
|
||||
obp->b_flags |= B_WANTED;
|
||||
/* [wait until the buffer is available] */
|
||||
tsleep((caddr_t)bp, PRIBIO+1, "physbuf", 0);
|
||||
tsleep(obp, PRIBIO+1, "physbuf", 0);
|
||||
}
|
||||
|
||||
/* Mark it busy, so nobody else will use it. */
|
||||
bp->b_flags |= B_BUSY;
|
||||
obp->b_flags |= B_BUSY;
|
||||
|
||||
/* [lower the priority level] */
|
||||
splx(s);
|
||||
}
|
||||
|
||||
/* [set up the fixed part of the buffer for a transfer] */
|
||||
bp->b_dev = dev;
|
||||
bp->b_error = 0;
|
||||
bp->b_proc = p;
|
||||
LIST_INIT(&bp->b_dep);
|
||||
mbp = getphysbuf();
|
||||
mbp->b_resid = uio->uio_resid;
|
||||
mbp->b_running = 0;
|
||||
mbp->b_flags = 0;
|
||||
|
||||
PHOLD(l);
|
||||
|
||||
/*
|
||||
* [while there are data to transfer and no I/O error]
|
||||
* Note that I/O errors are handled with a 'goto' at the bottom
|
||||
* of the 'while' loop.
|
||||
*/
|
||||
for (i = 0; i < uio->uio_iovcnt; i++) {
|
||||
iovp = &uio->uio_iov[i];
|
||||
while (iovp->iov_len > 0) {
|
||||
simple_lock(&mbp->b_interlock);
|
||||
if ((mbp->b_flags & B_ERROR) != 0) {
|
||||
error = mbp->b_error;
|
||||
goto done_locked;
|
||||
}
|
||||
error = physio_wait(mbp, PHYSIO_CONCURRENCY - 1,
|
||||
"physio1");
|
||||
if (error) {
|
||||
goto done_locked;
|
||||
}
|
||||
simple_unlock(&mbp->b_interlock);
|
||||
bp = getphysbuf();
|
||||
bp->b_dev = dev;
|
||||
bp->b_proc = p;
|
||||
bp->b_private = mbp;
|
||||
bp->b_vp = NULL;
|
||||
|
||||
/*
|
||||
* [mark the buffer busy for physical I/O]
|
||||
|
@ -194,11 +314,16 @@ physio(void (*strategy)(struct buf *), struct buf *bp, dev_t dev, int flags,
|
|||
* "Set by physio for raw transfers.", in addition
|
||||
* to the "busy" and read/write flag.)
|
||||
*/
|
||||
bp->b_flags = B_BUSY | B_PHYS | B_RAW | flags;
|
||||
bp->b_flags = B_BUSY | B_PHYS | B_RAW | B_CALL | flags;
|
||||
bp->b_iodone = physio_biodone;
|
||||
|
||||
/* [set up the buffer for a maximum-sized transfer] */
|
||||
bp->b_blkno = btodb(uio->uio_offset);
|
||||
bp->b_bcount = iovp->iov_len;
|
||||
if (dbtob(bp->b_blkno) != uio->uio_offset) {
|
||||
error = EINVAL;
|
||||
goto done;
|
||||
}
|
||||
bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
|
||||
bp->b_data = iovp->iov_base;
|
||||
|
||||
/*
|
||||
|
@ -207,14 +332,12 @@ physio(void (*strategy)(struct buf *), struct buf *bp, dev_t dev, int flags,
|
|||
* for later comparison.
|
||||
*/
|
||||
(*min_phys)(bp);
|
||||
todo = bp->b_bcount;
|
||||
#ifdef DIAGNOSTIC
|
||||
if (todo <= 0)
|
||||
panic("todo(%ld) <= 0; minphys broken", todo);
|
||||
todo = bp->b_bufsize = bp->b_bcount;
|
||||
#if defined(DIAGNOSTIC)
|
||||
if (todo > MAXPHYS)
|
||||
panic("todo(%ld) > MAXPHYS; minphys broken",
|
||||
todo);
|
||||
#endif
|
||||
panic("todo(%zu) > MAXPHYS; minphys broken",
|
||||
todo);
|
||||
#endif /* defined(DIAGNOSTIC) */
|
||||
|
||||
/*
|
||||
* [lock the part of the user address space involved
|
||||
|
@ -223,100 +346,75 @@ physio(void (*strategy)(struct buf *), struct buf *bp, dev_t dev, int flags,
|
|||
* saves it in b_saveaddr. However, vunmapbuf()
|
||||
* restores it.
|
||||
*/
|
||||
PHOLD(l);
|
||||
error = uvm_vslock(p, bp->b_data, todo,
|
||||
(flags & B_READ) ?
|
||||
VM_PROT_WRITE : VM_PROT_READ);
|
||||
(flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
|
||||
if (error) {
|
||||
bp->b_flags |= B_ERROR;
|
||||
bp->b_error = error;
|
||||
goto after_vsunlock;
|
||||
goto done;
|
||||
}
|
||||
vmapbuf(bp, todo);
|
||||
|
||||
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
|
||||
|
||||
simple_lock(&mbp->b_interlock);
|
||||
mbp->b_running++;
|
||||
simple_unlock(&mbp->b_interlock);
|
||||
|
||||
/* [call strategy to start the transfer] */
|
||||
(*strategy)(bp);
|
||||
bp = NULL;
|
||||
|
||||
/*
|
||||
* Note that the raise/wait/lower/get error
|
||||
* steps below would be done by biowait(), but
|
||||
* we want to unlock the address space before
|
||||
* we lower the priority.
|
||||
*
|
||||
* [raise the priority level to splbio]
|
||||
*/
|
||||
s = splbio();
|
||||
|
||||
/* [wait for the transfer to complete] */
|
||||
while ((bp->b_flags & B_DONE) == 0)
|
||||
tsleep((caddr_t) bp, PRIBIO + 1, "physio", 0);
|
||||
|
||||
/* Mark it busy again, so nobody else will use it. */
|
||||
bp->b_flags |= B_BUSY;
|
||||
|
||||
/* [lower the priority level] */
|
||||
splx(s);
|
||||
|
||||
/*
|
||||
* [unlock the part of the address space previously
|
||||
* locked]
|
||||
*/
|
||||
vunmapbuf(bp, todo);
|
||||
uvm_vsunlock(p, bp->b_data, todo);
|
||||
after_vsunlock:
|
||||
PRELE(l);
|
||||
|
||||
/* remember error value (save a splbio/splx pair) */
|
||||
if (bp->b_flags & B_ERROR)
|
||||
error = (bp->b_error ? bp->b_error : EIO);
|
||||
|
||||
/*
|
||||
* [deduct the transfer size from the total number
|
||||
* of data to transfer]
|
||||
*/
|
||||
done = bp->b_bcount - bp->b_resid;
|
||||
KASSERT(done >= 0);
|
||||
KASSERT(done <= todo);
|
||||
|
||||
iovp->iov_len -= done;
|
||||
iovp->iov_base = (caddr_t)iovp->iov_base + done;
|
||||
uio->uio_offset += done;
|
||||
uio->uio_resid -= done;
|
||||
|
||||
/*
|
||||
* Now, check for an error.
|
||||
* Also, handle weird end-of-disk semantics.
|
||||
*/
|
||||
if (error || done < todo)
|
||||
goto done;
|
||||
iovp->iov_len -= todo;
|
||||
iovp->iov_base = (caddr_t)iovp->iov_base + todo;
|
||||
uio->uio_offset += todo;
|
||||
uio->uio_resid -= todo;
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
simple_lock(&mbp->b_interlock);
|
||||
done_locked:
|
||||
error2 = physio_wait(mbp, 0, "physio2");
|
||||
if (error == 0) {
|
||||
error = error2;
|
||||
}
|
||||
simple_unlock(&mbp->b_interlock);
|
||||
KASSERT((mbp->b_flags & B_ERROR) != 0 ||
|
||||
mbp->b_resid == uio->uio_resid);
|
||||
#if defined(DIAGNOSTIC)
|
||||
if ((mbp->b_flags & B_ERROR) != 0 && mbp->b_error == 0 &&
|
||||
uio->uio_offset - mbp->b_resid != mbp->b_eomoffset) {
|
||||
panic("%s: eom", __func__);
|
||||
}
|
||||
#endif /* defined(DIAGNOSTIC) */
|
||||
uio->uio_resid = mbp->b_resid;
|
||||
if (bp != NULL) {
|
||||
putphysbuf(bp);
|
||||
}
|
||||
if (error == 0) {
|
||||
error = mbp->b_error;
|
||||
}
|
||||
putphysbuf(mbp);
|
||||
|
||||
/*
|
||||
* [clean up the state of the buffer]
|
||||
* Remember if somebody wants it, so we can wake them up below.
|
||||
* Also, if we had to steal it, give it back.
|
||||
*/
|
||||
s = splbio();
|
||||
bp->b_flags &= ~(B_BUSY | B_PHYS | B_RAW);
|
||||
if (nobuf)
|
||||
putphysbuf(bp);
|
||||
else {
|
||||
if (obp != NULL) {
|
||||
s = splbio();
|
||||
/*
|
||||
* [if another process is waiting for the raw I/O buffer,
|
||||
* wake up processes waiting to do physical I/O;
|
||||
*/
|
||||
if (bp->b_flags & B_WANTED) {
|
||||
bp->b_flags &= ~B_WANTED;
|
||||
wakeup(bp);
|
||||
if (obp->b_flags & B_WANTED) {
|
||||
obp->b_flags &= ~B_WANTED;
|
||||
wakeup(obp);
|
||||
}
|
||||
splx(s);
|
||||
}
|
||||
splx(s);
|
||||
PRELE(l);
|
||||
|
||||
return (error);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* $NetBSD: buf.h,v 1.81 2005/05/31 02:57:58 christos Exp $ */
|
||||
/* $NetBSD: buf.h,v 1.82 2005/10/29 11:23:19 yamt Exp $ */
|
||||
|
||||
/*-
|
||||
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
|
||||
|
@ -79,6 +79,7 @@
|
|||
#include <sys/pool.h>
|
||||
#include <sys/queue.h>
|
||||
#include <sys/lock.h>
|
||||
#include <sys/workqueue.h>
|
||||
|
||||
struct buf;
|
||||
struct mount;
|
||||
|
@ -113,7 +114,12 @@ struct bio_ops {
|
|||
* The buffer header describes an I/O operation in the kernel.
|
||||
*/
|
||||
struct buf {
|
||||
TAILQ_ENTRY(buf) b_actq; /* Device driver queue when active. */
|
||||
union {
|
||||
TAILQ_ENTRY(buf) u_actq; /* Device driver queue when active. */
|
||||
struct work u_work;
|
||||
} b_u;
|
||||
#define b_actq b_u.u_actq
|
||||
#define b_work b_u.u_work
|
||||
struct simplelock b_interlock; /* Lock for b_flags changes */
|
||||
volatile int b_flags; /* B_* flags. */
|
||||
int b_error; /* Errno value. */
|
||||
|
|
Loading…
Reference in New Issue