add support for O_DIRECT (I/O directly to application memory,
bypassing any kernel caching for file data).
chs 2006-10-05 14:48:32 +00:00
parent 0b2c7040b5
commit 33c1fd1917
21 changed files with 421 additions and 108 deletions

View File

@ -1,4 +1,4 @@
.\" $NetBSD: open.2,v 1.35 2006/09/23 15:29:09 wiz Exp $
.\" $NetBSD: open.2,v 1.36 2006/10/05 14:48:32 chs Exp $
.\"
.\" Copyright (c) 1980, 1991, 1993
.\" The Regents of the University of California. All rights reserved.
@ -117,18 +117,31 @@ with
.Dv O_DSYNC
only, or specifying it without any other synchronized I/O integrity
completion flag set, has no further effect.
.It O_ALT_IO
.It Dv O_ALT_IO
Alternate I/O semantics will be used for read and write operations
on the file descriptor.
Alternate semantics are defined by the underlying layers and will not
have any alternate effect in most cases.
.It O_NOCTTY
.It Dv O_NOCTTY
If the file is a terminal device, the opened device is not
made the controlling terminal for the session.
This flag has no effect on
.Nx ,
since the system defaults to the abovementioned behaviour.
The flag is present only for standards conformance.
.It Dv O_DIRECT
If set on a regular file, data I/O operations will not buffer the data
being transferred in the kernel's cache, but rather transfer the data
directly between user memory and the underlying device driver if possible.
This flag is advisory; the request may be performed in the normal
buffered fashion if certain conditions are not met, e.g. if the request
is not sufficiently aligned or if the file is mapped.
.Pp
To meet the alignment requirements for direct I/O, the file offset,
the length of the I/O and the address of the buffer in memory must all
be multiples of
.Dv DEV_BSIZE
(512 bytes).
If the I/O request is made using an interface that supports
scatter/gather via struct iovec, each element of the request must meet
the above alignment constraints.
.El
.Pp
Opening a file with

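The alignment rules above are easiest to see in a small userland sketch. The following example is illustrative only, not part of the commit (the path and buffer size are made up, and the GCC aligned attribute is assumed); it opens a file with O_DIRECT and issues a read whose offset, length and buffer address are all multiples of DEV_BSIZE, so the kernel can take the direct path:

	#include <sys/param.h>	/* DEV_BSIZE */
	#include <err.h>
	#include <fcntl.h>
	#include <unistd.h>

	int
	main(void)
	{
		/* Buffer address and length are both multiples of DEV_BSIZE. */
		static char buf[16 * DEV_BSIZE]
		    __attribute__((__aligned__(DEV_BSIZE)));
		ssize_t n;
		int fd;

		fd = open("/var/tmp/data", O_RDONLY | O_DIRECT);
		if (fd == -1)
			err(1, "open");

		/* File offset 0 and sizeof(buf) satisfy the constraints too. */
		n = read(fd, buf, sizeof(buf));
		if (n == -1)
			err(1, "read");

		close(fd);
		return 0;
	}

If any of the constraints were violated, the request would still succeed, just through the normal buffered path, since the flag is only advisory.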
View File

@ -1,4 +1,4 @@
.\" $NetBSD: uvm.9,v 1.75 2006/10/04 11:27:45 pooka Exp $
.\" $NetBSD: uvm.9,v 1.76 2006/10/05 14:48:32 chs Exp $
.\"
.\" Copyright (c) 1998 Matthew R. Green
.\" All rights reserved.
@ -890,11 +890,11 @@ and should match what was used for previous call to
.Pp
.Ft int
.br
.Fn uvm_vslock "struct proc *l" "caddr_t addr" "size_t len" "vm_prot_t prot" ;
.Fn uvm_vslock "struct vmspace *vs" "void *addr" "size_t len" "vm_prot_t prot" ;
.Pp
.Ft void
.br
.Fn uvm_vsunlock "struct proc *p" "caddr_t addr" "size_t len" ;
.Fn uvm_vsunlock "struct vmspace *vs" "void *addr" "size_t len" ;
.Pp
.Ft void
.br

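For callers, the conversion is mechanical: pass the process's vmspace instead of the proc itself. A minimal sketch of the new calling pattern, patterned on the call sites updated later in this commit (variable names are illustrative):

	error = uvm_vslock(l->l_proc->p_vmspace, uaddr, len, VM_PROT_READ);
	if (error)
		return error;
	/* the pages backing [uaddr, uaddr + len) are wired here */
	uvm_vsunlock(l->l_proc->p_vmspace, uaddr, len);

Taking the vmspace directly lets code such as genfs_do_directio() work from a uio's uio_vmspace without needing a proc pointer at all.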
View File

@ -1,4 +1,4 @@
.\" $NetBSD: vnode.9,v 1.33 2006/10/04 11:35:47 pooka Exp $
.\" $NetBSD: vnode.9,v 1.34 2006/10/05 14:48:32 chs Exp $
.\"
.\" Copyright (c) 2001, 2005, 2006 The NetBSD Foundation, Inc.
.\" All rights reserved.
@ -183,9 +183,6 @@ struct vnode {
struct lock *v_vnlock; /* ptr to vnode lock */
void *v_data; /* private data for fs */
struct klist v_klist; /* knotes attached to vnode */
#ifdef VERIFIED_EXEC
char fp_status; /* fingerprint status */
#endif
};
.Ed
.Pp
@ -243,6 +240,8 @@ This vnode is on a layered file system.
This vnode is on syncer work-list.
.It VFREEING
This vnode is being freed.
.It VMAPPED
This vnode might have user mappings.
.El
.Pp
The VXLOCK flag is used to prevent multiple processes from entering

View File

@ -1,4 +1,4 @@
.\" $NetBSD: vnodeops.9,v 1.54 2006/10/04 12:32:53 reinoud Exp $
.\" $NetBSD: vnodeops.9,v 1.55 2006/10/05 14:48:32 chs Exp $
.\"
.\" Copyright (c) 2001, 2005, 2006 The NetBSD Foundation, Inc.
.\" All rights reserved.
@ -672,6 +672,8 @@ use alternate I/O semantics
operate on regular data
.It IO_EXT
operate on extended attributes
.It IO_DIRECT
do not buffer data in the kernel
.El
.Pp
Zero is returned on success, otherwise an error is returned.

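At the file layer, the new bit is derived from the descriptor's FDIRECT flag before calling into the vnode operation. A sketch of the pattern, mirroring the vn_read() change later in this commit (surrounding variables are assumed):

	int ioflag = 0;

	if (fp->f_flag & FDIRECT)
		ioflag |= IO_DIRECT;
	error = VOP_READ(vp, uio, ioflag, cred);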
View File

@ -1,4 +1,4 @@
/* $NetBSD: cpu.c,v 1.20 2006/03/29 04:16:44 thorpej Exp $ */
/* $NetBSD: cpu.c,v 1.21 2006/10/05 14:48:32 chs Exp $ */
/*-
* Copyright (c) 2000, 2001 Ben Harris
@ -32,7 +32,7 @@
#include <sys/param.h>
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.20 2006/03/29 04:16:44 thorpej Exp $");
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.21 2006/10/05 14:48:32 chs Exp $");
#include <sys/device.h>
#include <sys/proc.h>
@ -219,7 +219,7 @@ swp_handler(u_int addr, u_int insn, struct trapframe *tf, int fault_code)
uaddr = (caddr_t)getreg(rn);
/* We want the page wired so we won't sleep */
/* XXX only wire one byte due to weirdness with unaligned words */
err = uvm_vslock(p, uaddr, 1, VM_PROT_READ | VM_PROT_WRITE);
err = uvm_vslock(p->p_vmspace, uaddr, 1, VM_PROT_READ | VM_PROT_WRITE);
if (err != 0) {
ksiginfo_t ksi;
KSI_INIT_TRAP(&ksi);
@ -243,7 +243,7 @@ swp_handler(u_int addr, u_int insn, struct trapframe *tf, int fault_code)
suword(uaddr, getreg(rm));
getreg(rd) = temp;
}
uvm_vsunlock(p, uaddr, 1);
uvm_vsunlock(p->p_vmspace, uaddr, 1);
return 0;
}
#endif

View File

@ -1,4 +1,4 @@
/* $NetBSD: trap.c,v 1.36 2006/07/23 22:06:06 ad Exp $ */
/* $NetBSD: trap.c,v 1.37 2006/10/05 14:48:32 chs Exp $ */
/*
* Copyright 2001 Wasabi Systems, Inc.
@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.36 2006/07/23 22:06:06 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.37 2006/10/05 14:48:32 chs Exp $");
#include "opt_altivec.h"
#include "opt_ddb.h"
@ -490,7 +490,7 @@ bigcopyin(const void *udaddr, void *kaddr, size_t len)
* Stolen from physio():
*/
PHOLD(l);
error = uvm_vslock(p, __UNCONST(udaddr), len, VM_PROT_READ);
error = uvm_vslock(p->p_vmspace, __UNCONST(udaddr), len, VM_PROT_READ);
if (error) {
PRELE(l);
return EFAULT;
@ -499,7 +499,7 @@ bigcopyin(const void *udaddr, void *kaddr, size_t len)
memcpy(kp, up, len);
vunmaprange((vaddr_t)up, len);
uvm_vsunlock(p, __UNCONST(udaddr), len);
uvm_vsunlock(p->p_vmspace, __UNCONST(udaddr), len);
PRELE(l);
return 0;
@ -571,7 +571,7 @@ bigcopyout(const void *kaddr, void *udaddr, size_t len)
* Stolen from physio():
*/
PHOLD(l);
error = uvm_vslock(p, udaddr, len, VM_PROT_WRITE);
error = uvm_vslock(p->p_vmspace, udaddr, len, VM_PROT_WRITE);
if (error) {
PRELE(l);
return EFAULT;
@ -581,7 +581,7 @@ bigcopyout(const void *kaddr, void *udaddr, size_t len)
memcpy(up, kp, len);
vunmaprange((vaddr_t)up, len);
uvm_vsunlock(p, udaddr, len);
uvm_vsunlock(p->p_vmspace, udaddr, len);
PRELE(l);
return 0;

View File

@ -1,4 +1,4 @@
/* $NetBSD: machdep.c,v 1.190 2006/09/13 11:35:53 mrg Exp $ */
/* $NetBSD: machdep.c,v 1.191 2006/10/05 14:48:32 chs Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
@ -78,7 +78,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.190 2006/09/13 11:35:53 mrg Exp $");
__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.191 2006/10/05 14:48:32 chs Exp $");
#include "opt_ddb.h"
#include "opt_multiprocessor.h"
@ -1210,7 +1210,7 @@ _bus_dmamap_load_uio(bus_dma_tag_t t, bus_dmamap_t map, struct uio *uio,
* in the transfer.
*/
PHOLD(p);
if (__predict_false(uvm_vslock(p, vaddr, buflen,
if (__predict_false(uvm_vslock(p->p_vmspace, vaddr, buflen,
(uio->uio_rw == UIO_WRITE) ?
VM_PROT_WRITE : VM_PROT_READ) != 0)) {
goto after_vsunlock;
@ -1242,7 +1242,7 @@ _bus_dmamap_load_uio(bus_dma_tag_t t, bus_dmamap_t map, struct uio *uio,
segs[i]._ds_mlist = NULL;
i++;
}
uvm_vsunlock(p, bp->b_data, todo);
uvm_vsunlock(p->p_vmspace, bp->b_data, todo);
PRELE(p);
if (buflen > 0 && i >= MAX_DMA_SEGS)
/* Exceeded the size of our dmamap */

View File

@ -1,4 +1,4 @@
/* $NetBSD: rrunner.c,v 1.56 2006/09/07 02:40:32 dogcow Exp $ */
/* $NetBSD: rrunner.c,v 1.57 2006/10/05 14:48:32 chs Exp $ */
/*
* Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
@ -42,7 +42,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rrunner.c,v 1.56 2006/09/07 02:40:32 dogcow Exp $");
__KERNEL_RCSID(0, "$NetBSD: rrunner.c,v 1.57 2006/10/05 14:48:32 chs Exp $");
#include "opt_inet.h"
@ -1043,13 +1043,13 @@ esh_fpread(dev, uio, ioflag)
/* Lock down the pages */
for (i = 0; i < uio->uio_iovcnt; i++) {
iovp = &uio->uio_iov[i];
error = uvm_vslock(p, iovp->iov_base, iovp->iov_len,
error = uvm_vslock(p->p_vmspace, iovp->iov_base, iovp->iov_len,
VM_PROT_WRITE);
if (error) {
/* Unlock what we've locked so far. */
for (--i; i >= 0; i--) {
iovp = &uio->uio_iov[i];
uvm_vsunlock(p, iovp->iov_base,
uvm_vsunlock(p->p_vmspace, iovp->iov_base,
iovp->iov_len);
}
goto fpread_done;
@ -1139,7 +1139,7 @@ esh_fpread(dev, uio, ioflag)
uio->uio_resid -= di->ed_read_len;
for (i = 0; i < uio->uio_iovcnt; i++) {
iovp = &uio->uio_iov[i];
uvm_vsunlock(p, iovp->iov_base, iovp->iov_len);
uvm_vsunlock(p->p_vmspace, iovp->iov_base, iovp->iov_len);
}
PRELE(l); /* Release process info */
@ -1204,13 +1204,13 @@ esh_fpwrite(dev, uio, ioflag)
/* Lock down the pages */
for (i = 0; i < uio->uio_iovcnt; i++) {
iovp = &uio->uio_iov[i];
error = uvm_vslock(p, iovp->iov_base, iovp->iov_len,
error = uvm_vslock(p->p_vmspace, iovp->iov_base, iovp->iov_len,
VM_PROT_READ);
if (error) {
/* Unlock what we've locked so far. */
for (--i; i >= 0; i--) {
iovp = &uio->uio_iov[i];
uvm_vsunlock(p, iovp->iov_base,
uvm_vsunlock(p->p_vmspace, iovp->iov_base,
iovp->iov_len);
}
goto fpwrite_done;
@ -1296,7 +1296,7 @@ esh_fpwrite(dev, uio, ioflag)
for (i = 0; i < uio->uio_iovcnt; i++) {
iovp = &uio->uio_iov[i];
uvm_vsunlock(p, iovp->iov_base, iovp->iov_len);
uvm_vsunlock(p->p_vmspace, iovp->iov_base, iovp->iov_len);
}
PRELE(l); /* Release process info */

View File

@ -1,4 +1,4 @@
/* $NetBSD: exec_subr.c,v 1.49 2006/07/23 22:06:10 ad Exp $ */
/* $NetBSD: exec_subr.c,v 1.50 2006/10/05 14:48:32 chs Exp $ */
/*
* Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_subr.c,v 1.49 2006/07/23 22:06:10 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: exec_subr.c,v 1.50 2006/10/05 14:48:32 chs Exp $");
#include "opt_pax.h"
@ -155,11 +155,12 @@ int
vmcmd_map_pagedvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct uvm_object *uobj;
struct vnode *vp = cmd->ev_vp;
struct proc *p = l->l_proc;
int error;
vm_prot_t prot, maxprot;
KASSERT(cmd->ev_vp->v_flag & VTEXT);
KASSERT(vp->v_flag & VTEXT);
/*
* map the vnode in using uvm_map.
@ -178,10 +179,18 @@ vmcmd_map_pagedvn(struct lwp *l, struct exec_vmcmd *cmd)
* first, attach to the object
*/
uobj = uvn_attach(cmd->ev_vp, VM_PROT_READ|VM_PROT_EXECUTE);
uobj = uvn_attach(vp, VM_PROT_READ|VM_PROT_EXECUTE);
if (uobj == NULL)
return(ENOMEM);
VREF(cmd->ev_vp);
VREF(vp);
if ((vp->v_flag & VMAPPED) == 0) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
simple_lock(&vp->v_interlock);
vp->v_flag |= VMAPPED;
simple_unlock(&vp->v_interlock);
VOP_UNLOCK(vp, 0);
}
prot = cmd->ev_prot;
maxprot = UVM_PROT_ALL;

View File

@ -1,4 +1,4 @@
/* $NetBSD: kern_physio.c,v 1.73 2006/04/18 09:54:32 yamt Exp $ */
/* $NetBSD: kern_physio.c,v 1.74 2006/10/05 14:48:32 chs Exp $ */
/*-
* Copyright (c) 1982, 1986, 1990, 1993
@ -71,7 +71,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.73 2006/04/18 09:54:32 yamt Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.74 2006/10/05 14:48:32 chs Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -155,7 +155,7 @@ physio_done(struct work *wk, void *dummy)
KASSERT(dummy == NULL);
vunmapbuf(bp, todo);
uvm_vsunlock(bp->b_proc, bp->b_data, todo);
uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
simple_lock(&mbp->b_interlock);
if (__predict_false(done != todo)) {
@ -401,7 +401,7 @@ physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
* saves it in b_saveaddr. However, vunmapbuf()
* restores it.
*/
error = uvm_vslock(p, bp->b_data, todo,
error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
(flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
if (error) {
goto done;

View File

@ -1,4 +1,4 @@
/* $NetBSD: kern_sysctl.c,v 1.203 2006/09/23 22:01:04 manu Exp $ */
/* $NetBSD: kern_sysctl.c,v 1.204 2006/10/05 14:48:32 chs Exp $ */
/*-
* Copyright (c) 2003 The NetBSD Foundation, Inc.
@ -75,7 +75,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sysctl.c,v 1.203 2006/09/23 22:01:04 manu Exp $");
__KERNEL_RCSID(0, "$NetBSD: kern_sysctl.c,v 1.204 2006/10/05 14:48:32 chs Exp $");
#include "opt_defcorename.h"
#include "opt_ktrace.h"
@ -392,15 +392,18 @@ sysctl_lock(struct lwp *l, void *oldp, size_t savelen)
return (error);
if (l != NULL && oldp != NULL && savelen) {
/*
* be lazy - memory is locked for short time only, so
* just do a basic check against system limit
*/
if (uvmexp.wired + atop(savelen) > uvmexp.wiredmax) {
lockmgr(&sysctl_treelock, LK_RELEASE, NULL);
return (ENOMEM);
}
error = uvm_vslock(l->l_proc, oldp, savelen, VM_PROT_WRITE);
error = uvm_vslock(l->l_proc->p_vmspace, oldp, savelen,
VM_PROT_WRITE);
if (error) {
(void) lockmgr(&sysctl_treelock, LK_RELEASE, NULL);
return (error);
@ -502,7 +505,8 @@ sysctl_unlock(struct lwp *l)
{
if (l != NULL && sysctl_memsize != 0) {
uvm_vsunlock(l->l_proc, sysctl_memaddr, sysctl_memsize);
uvm_vsunlock(l->l_proc->p_vmspace, sysctl_memaddr,
sysctl_memsize);
sysctl_memsize = 0;
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: vfs_subr.c,v 1.269 2006/08/24 01:08:00 jld Exp $ */
/* $NetBSD: vfs_subr.c,v 1.270 2006/10/05 14:48:32 chs Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005 The NetBSD Foundation, Inc.
@ -80,7 +80,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.269 2006/08/24 01:08:00 jld Exp $");
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.270 2006/10/05 14:48:32 chs Exp $");
#include "opt_inet.h"
#include "opt_ddb.h"
@ -1250,7 +1250,7 @@ vput(struct vnode *vp)
uvmexp.execpages -= vp->v_uobj.uo_npages;
uvmexp.filepages += vp->v_uobj.uo_npages;
}
vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP);
vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP|VMAPPED);
simple_unlock(&vp->v_interlock);
VOP_INACTIVE(vp, l);
}
@ -1293,7 +1293,7 @@ vrele(struct vnode *vp)
uvmexp.execpages -= vp->v_uobj.uo_npages;
uvmexp.filepages += vp->v_uobj.uo_npages;
}
vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP);
vp->v_flag &= ~(VTEXT|VEXECMAP|VWRITEMAP|VMAPPED);
if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
VOP_INACTIVE(vp, l);
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: vfs_vnops.c,v 1.124 2006/09/12 08:23:51 elad Exp $ */
/* $NetBSD: vfs_vnops.c,v 1.125 2006/10/05 14:48:32 chs Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
@ -37,7 +37,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.124 2006/09/12 08:23:51 elad Exp $");
__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.125 2006/10/05 14:48:32 chs Exp $");
#include "fs_union.h"
#include "veriexec.h"
@ -491,6 +491,8 @@ vn_read(struct file *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
ioflag |= IO_SYNC;
if (fp->f_flag & FALTIO)
ioflag |= IO_ALTSEMANTICS;
if (fp->f_flag & FDIRECT)
ioflag |= IO_DIRECT;
vn_lock(vp, LK_SHARED | LK_RETRY);
uio->uio_offset = *offset;
count = uio->uio_resid;
@ -524,6 +526,8 @@ vn_write(struct file *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
ioflag |= IO_DSYNC;
if (fp->f_flag & FALTIO)
ioflag |= IO_ALTSEMANTICS;
if (fp->f_flag & FDIRECT)
ioflag |= IO_DIRECT;
mp = NULL;
if (vp->v_type != VCHR &&
(error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0)

View File

@ -1,4 +1,4 @@
/* $NetBSD: genfs_node.h,v 1.11 2006/05/14 21:31:52 elad Exp $ */
/* $NetBSD: genfs_node.h,v 1.12 2006/10/05 14:48:32 chs Exp $ */
/*
* Copyright (c) 2001 Chuck Silvers.
@ -82,5 +82,6 @@ void genfs_size(struct vnode *, off_t, off_t *, int);
void genfs_node_init(struct vnode *, const struct genfs_ops *);
int genfs_gop_write(struct vnode *, struct vm_page **, int, int);
int genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int);
void genfs_directio(struct vnode *, struct uio *, int);
#endif /* _MISCFS_GENFS_GENFS_NODE_H_ */

View File

@ -1,4 +1,4 @@
/* $NetBSD: genfs_vnops.c,v 1.129 2006/09/15 15:51:12 yamt Exp $ */
/* $NetBSD: genfs_vnops.c,v 1.130 2006/10/05 14:48:32 chs Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
@ -31,7 +31,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.129 2006/09/15 15:51:12 yamt Exp $");
__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.130 2006/10/05 14:48:32 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_nfsserver.h"
@ -66,6 +66,12 @@ __KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.129 2006/09/15 15:51:12 yamt Exp $
#include <nfs/nfs_var.h>
#endif
static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *,
off_t, enum uio_rw);
static void genfs_dio_iodone(struct buf *);
static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw,
void (*)(struct buf *));
static inline void genfs_rel_pages(struct vm_page **, int);
static void filt_genfsdetach(struct knote *);
static int filt_genfsread(struct knote *, long);
@ -73,6 +79,8 @@ static int filt_genfsvnode(struct knote *, long);
#define MAX_READ_PAGES 16 /* XXXUBC 16 */
int genfs_maxdio = MAXPHYS;
int
genfs_poll(void *v)
{
@ -1459,21 +1467,51 @@ skip_scan:
int
genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
{
int s, error, run;
int fs_bshift, dev_bshift;
off_t off;
vaddr_t kva;
off_t eof, offset, startoffset;
size_t bytes, iobytes, skipbytes;
daddr_t lbn, blkno;
struct vm_page *pg;
struct buf *mbp, *bp;
struct vnode *devvp;
boolean_t async = (flags & PGO_SYNCIO) == 0;
UVMHIST_FUNC("genfs_gop_write"); UVMHIST_CALLED(ubchist);
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
vp, pgs, npages, flags);
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
len = npages << PAGE_SHIFT;
error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
uvm_aio_biodone);
return error;
}
/*
* Backend routine for doing I/O to vnode pages. Pages are already locked
* and mapped into kernel memory. Here we just look up the underlying
* device block addresses and call the strategy routine.
*/
static int
genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags,
enum uio_rw rw, void (*iodone)(struct buf *))
{
int s, error, run;
int fs_bshift, dev_bshift;
off_t eof, offset, startoffset;
size_t bytes, iobytes, skipbytes;
daddr_t lbn, blkno;
struct buf *mbp, *bp;
struct vnode *devvp;
boolean_t async = (flags & PGO_SYNCIO) == 0;
boolean_t write = rw == UIO_WRITE;
int brw = write ? B_WRITE : B_READ;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %p kva %p len 0x%x flags 0x%x",
vp, kva, len, flags);
GOP_SIZE(vp, vp->v_size, &eof, 0);
if (vp->v_type != VBLK) {
fs_bshift = vp->v_mount->mnt_fs_bshift;
@ -1483,28 +1521,26 @@ genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
dev_bshift = DEV_BSHIFT;
}
error = 0;
pg = pgs[0];
startoffset = pg->offset;
bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
startoffset = off;
bytes = MIN(len, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
s = splbio();
simple_lock(&global_v_numoutput_slock);
vp->v_numoutput += 2;
simple_unlock(&global_v_numoutput_slock);
splx(s);
if (write) {
s = splbio();
simple_lock(&global_v_numoutput_slock);
vp->v_numoutput += 2;
simple_unlock(&global_v_numoutput_slock);
splx(s);
}
mbp = getiobuf();
UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
vp, mbp, vp->v_numoutput, bytes);
mbp->b_bufsize = npages << PAGE_SHIFT;
mbp->b_bufsize = len;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_flags = B_BUSY|B_WRITE|B_AGE| (async ? (B_CALL|B_ASYNC) : 0);
mbp->b_iodone = uvm_aio_biodone;
mbp->b_flags = B_BUSY | brw | B_AGE | (async ? (B_CALL | B_ASYNC) : 0);
mbp->b_iodone = iodone;
mbp->b_vp = vp;
if (curproc == uvm.pagedaemon_proc)
BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
@ -1529,6 +1565,10 @@ genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
if (blkno == (daddr_t)-1) {
if (!write) {
memset((char *)kva + (offset - startoffset), 0,
iobytes);
}
skipbytes += iobytes;
continue;
}
@ -1540,7 +1580,7 @@ genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
vp, bp, vp->v_numoutput, 0);
bp = getiobuf();
nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
}
bp->b_lblkno = 0;
@ -1563,7 +1603,7 @@ genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
}
UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
error = biowait(mbp);
uvm_aio_aiodone(mbp);
(*iodone)(mbp);
UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
return (error);
}
@ -1747,6 +1787,215 @@ genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
return (error);
}
/*
* Process a uio using direct I/O. If we reach a part of the request
* which cannot be processed in this fashion for some reason, just return.
* The caller must handle some additional part of the request using
* buffered I/O before trying direct I/O again.
*/
void
genfs_directio(struct vnode *vp, struct uio *uio, int ioflag)
{
struct vmspace *vs;
struct iovec *iov;
vaddr_t va;
size_t len;
const int mask = DEV_BSIZE - 1;
int error;
/*
* We only support direct I/O to user space for now.
*/
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
return;
}
/*
* If the vnode is mapped, we would need to get the getpages lock
to stabilize the bmap, but then we would get into trouble while
* locking the pages if the pages belong to this same vnode (or a
* multi-vnode cascade to the same effect). Just fall back to
* buffered I/O if the vnode is mapped to avoid this mess.
*/
if (vp->v_flag & VMAPPED) {
return;
}
/*
* Do as much of the uio as possible with direct I/O.
*/
vs = uio->uio_vmspace;
while (uio->uio_resid) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
va = (vaddr_t)iov->iov_base;
len = MIN(iov->iov_len, genfs_maxdio);
len &= ~mask;
/*
* If the next chunk is smaller than DEV_BSIZE or extends past
* the current EOF, then fall back to buffered I/O.
*/
if (len == 0 || uio->uio_offset + len > vp->v_size) {
return;
}
/*
* Check alignment. The file offset must be at least
* sector-aligned. The exact constraint on memory alignment
* is very hardware-dependent, but requiring sector-aligned
* addresses there too is safe.
*/
if (uio->uio_offset & mask || va & mask) {
return;
}
error = genfs_do_directio(vs, va, len, vp, uio->uio_offset,
uio->uio_rw);
if (error) {
break;
}
iov->iov_base = (caddr_t)iov->iov_base + len;
iov->iov_len -= len;
uio->uio_offset += len;
uio->uio_resid -= len;
}
}
/*
* Iodone routine for direct I/O. We don't do much here since the request is
* always synchronous, so the caller will do most of the work after biowait().
*/
static void
genfs_dio_iodone(struct buf *bp)
{
int s;
KASSERT((bp->b_flags & B_ASYNC) == 0);
s = splbio();
if ((bp->b_flags & (B_READ | B_AGE)) == B_AGE) {
vwakeup(bp);
}
putiobuf(bp);
splx(s);
}
/*
* Process one chunk of a direct I/O request.
*/
static int
genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp,
off_t off, enum uio_rw rw)
{
struct vm_map *map;
struct pmap *upm, *kpm;
size_t klen = round_page(uva + len) - trunc_page(uva);
off_t spoff, epoff;
vaddr_t kva, puva;
paddr_t pa;
vm_prot_t prot;
int error, rv, poff, koff;
const int pgoflags = PGO_CLEANIT | PGO_SYNCIO |
(rw == UIO_WRITE ? PGO_FREE : 0);
/*
* For writes, verify that this range of the file already has fully
* allocated backing store. If there are any holes, just punt and
* make the caller take the buffered write path.
*/
if (rw == UIO_WRITE) {
daddr_t lbn, elbn, blkno;
int bsize, bshift, run;
bshift = vp->v_mount->mnt_fs_bshift;
bsize = 1 << bshift;
lbn = off >> bshift;
elbn = (off + len + bsize - 1) >> bshift;
while (lbn < elbn) {
error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
if (error) {
return error;
}
if (blkno == (daddr_t)-1) {
return ENOSPC;
}
lbn += 1 + run;
}
}
/*
* Flush any cached pages for parts of the file that we're about to
* access. If we're writing, invalidate pages as well.
*/
spoff = trunc_page(off);
epoff = round_page(off + len);
simple_lock(&vp->v_interlock);
error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags);
if (error) {
return error;
}
/*
* Wire the user pages and remap them into kernel memory.
*/
prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ;
error = uvm_vslock(vs, (void *)uva, len, prot);
if (error) {
return error;
}
map = &vs->vm_map;
upm = vm_map_pmap(map);
kpm = vm_map_pmap(kernel_map);
kva = uvm_km_alloc(kernel_map, klen, 0,
UVM_KMF_VAONLY | UVM_KMF_WAITVA);
puva = trunc_page(uva);
for (poff = 0; poff < klen; poff += PAGE_SIZE) {
rv = pmap_extract(upm, puva + poff, &pa);
KASSERT(rv);
pmap_enter(kpm, kva + poff, pa, prot, prot | PMAP_WIRED);
}
pmap_update(kpm);
/*
* Do the I/O.
*/
koff = uva - trunc_page(uva);
error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw,
genfs_dio_iodone);
/*
* Tear down the kernel mapping.
*/
pmap_remove(kpm, kva, kva + klen);
pmap_update(kpm);
uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY);
/*
* Unwire the user pages.
*/
uvm_vsunlock(vs, (void *)uva, len);
return error;
}
static void
filt_genfsdetach(struct knote *kn)
{

View File

@ -1,4 +1,4 @@
/* $NetBSD: fcntl.h,v 1.33 2005/11/29 22:52:02 yamt Exp $ */
/* $NetBSD: fcntl.h,v 1.34 2006/10/05 14:48:33 chs Exp $ */
/*-
* Copyright (c) 1983, 1990, 1993
@ -106,6 +106,7 @@
#if defined(_NETBSD_SOURCE)
#define O_ALT_IO 0x00040000 /* use alternate i/o semantics */
#define O_DIRECT 0x00080000 /* direct I/O hint */
#endif
/* defined by POSIX 1003.1; BSD default, but required to be bitwise distinct */
@ -119,17 +120,17 @@
/* all bits settable during open(2) */
#define O_MASK (O_ACCMODE|O_NONBLOCK|O_APPEND|O_SHLOCK|O_EXLOCK|\
O_ASYNC|O_SYNC|O_CREAT|O_TRUNC|O_EXCL|O_DSYNC|\
O_RSYNC|O_NOCTTY|O_ALT_IO|O_NOFOLLOW)
O_RSYNC|O_NOCTTY|O_ALT_IO|O_NOFOLLOW|O_DIRECT)
#define FMARK 0x00001000 /* mark during gc() */
#define FDEFER 0x00002000 /* defer for next gc pass */
#define FHASLOCK 0x00004000 /* descriptor holds advisory lock */
#define FKIOCTL 0x80000000 /* kernel originated ioctl */
/* bits to save after open(2) */
#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|\
FRSYNC|FALTIO)
/* bits settable by fcntl(F_SETFL, ...) */
#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|FRSYNC|FALTIO)
#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FDSYNC|FRSYNC|FALTIO|\
FDIRECT)
/* bits to save after open(2) */
#define FMASK (FREAD|FWRITE|FCNTLFLAGS)
#endif /* _KERNEL */
/*
@ -150,6 +151,7 @@
#define FDSYNC O_DSYNC /* kernel */
#define FRSYNC O_RSYNC /* kernel */
#define FALTIO O_ALT_IO /* kernel */
#define FDIRECT O_DIRECT /* kernel */
#endif
/*

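Since FDIRECT is now included in FCNTLFLAGS, direct I/O can also be enabled or disabled on an already-open descriptor through fcntl(2). A hypothetical userland sketch:

	int flags;

	flags = fcntl(fd, F_GETFL, 0);
	if (flags == -1)
		err(1, "F_GETFL");
	/* turn direct I/O on; clearing the bit again reverts to buffered */
	if (fcntl(fd, F_SETFL, flags | O_DIRECT) == -1)
		err(1, "F_SETFL");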
View File

@ -1,4 +1,4 @@
/* $NetBSD: vnode.h,v 1.155 2006/06/23 14:13:02 yamt Exp $ */
/* $NetBSD: vnode.h,v 1.156 2006/10/05 14:48:33 chs Exp $ */
/*
* Copyright (c) 1989, 1993
@ -154,7 +154,7 @@ struct vnode {
#define VSYSTEM 0x0004 /* vnode being used by kernel */
/* VISTTY used when reading dead vnodes */
#define VISTTY 0x0008 /* vnode represents a tty */
#define VEXECMAP 0x0010 /* vnode has PROT_EXEC mappings */
#define VEXECMAP 0x0010 /* vnode might have PROT_EXEC mappings */
#define VWRITEMAP 0x0020 /* might have PROT_WRITE user mappings */
#define VWRITEMAPDIRTY 0x0040 /* might have dirty pages due to VWRITEMAP */
#define VLOCKSWORK 0x0080 /* FS supports locking discipline */
@ -166,11 +166,12 @@ struct vnode {
#define VLAYER 0x2000 /* vnode is on a layer filesystem */
#define VONWORKLST 0x4000 /* On syncer work-list */
#define VFREEING 0x8000 /* vnode is being freed */
#define VMAPPED 0x10000 /* vnode might have user mappings */
#define VNODE_FLAGBITS \
"\20\1ROOT\2TEXT\3SYSTEM\4ISTTY\5EXECMAP\6WRITEMAP\7WRITEMAPDIRTY" \
"\10LOCKSWORK\11XLOCK\12XWANT\13BWAIT\14ALIASED" \
"\15DIROP\16LAYER\17ONWORKLIST\20FREEING"
"\15DIROP\16LAYER\17ONWORKLIST\20FREEING\21MAPPED"
#define VSIZENOTSET ((voff_t)-1)
@ -233,6 +234,7 @@ struct vattr {
#define IO_ALTSEMANTICS 0x00400 /* use alternate i/o semantics */
#define IO_NORMAL 0x00800 /* operate on regular data */
#define IO_EXT 0x01000 /* operate on extended attributes */
#define IO_DIRECT 0x02000 /* direct I/O hint */
#define IO_ADV_MASK 0x00003 /* access pattern hint */
#define IO_ADV_SHIFT 0

View File

@ -1,4 +1,4 @@
/* $NetBSD: ufs_readwrite.c,v 1.69 2006/10/03 18:24:48 christos Exp $ */
/* $NetBSD: ufs_readwrite.c,v 1.70 2006/10/05 14:48:33 chs Exp $ */
/*-
* Copyright (c) 1993
@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.69 2006/10/03 18:24:48 christos Exp $");
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.70 2006/10/05 14:48:33 chs Exp $");
#ifdef LFS_READWRITE
#define BLKSIZE(a, b, c) blksize(a, b, c)
@ -78,13 +78,14 @@ READ(void *v)
daddr_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
int error, flags;
int error, flags, ioflag;
boolean_t usepc = FALSE;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ip->i_ump;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
error = 0;
#ifdef DIAGNOSTIC
@ -115,6 +116,9 @@ READ(void *v)
const int advice = IO_ADV_DECODE(ap->a_ioflag);
while (uio->uio_resid > 0) {
if (ioflag & IO_DIRECT) {
genfs_directio(vp, uio, ioflag);
}
bytelen = MIN(ip->i_size - uio->uio_offset,
uio->uio_resid);
if (bytelen == 0)
@ -319,9 +323,16 @@ WRITE(void *v)
boolean_t extending; /* if we're extending a whole block */
off_t newoff;
if (ioflag & IO_DIRECT) {
genfs_directio(vp, uio, ioflag);
}
oldoff = uio->uio_offset;
blkoffset = blkoff(fs, uio->uio_offset);
bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
if (bytelen == 0) {
break;
}
/*
* if we're filling in a hole, allocate the blocks now and

View File

@ -1,4 +1,4 @@
/* $NetBSD: uvm_extern.h,v 1.118 2006/09/15 15:51:13 yamt Exp $ */
/* $NetBSD: uvm_extern.h,v 1.119 2006/10/05 14:48:33 chs Exp $ */
/*
*
@ -581,8 +581,8 @@ __dead void uvm_scheduler(void) __attribute__((noreturn));
void uvm_swapin(struct lwp *);
boolean_t uvm_uarea_alloc(vaddr_t *);
void uvm_uarea_drain(boolean_t);
int uvm_vslock(struct proc *, caddr_t, size_t, vm_prot_t);
void uvm_vsunlock(struct proc *, caddr_t, size_t);
int uvm_vslock(struct vmspace *, void *, size_t, vm_prot_t);
void uvm_vsunlock(struct vmspace *, void *, size_t);
/* uvm_init.c */

View File

@ -1,4 +1,4 @@
/* $NetBSD: uvm_glue.c,v 1.96 2006/08/29 23:34:48 matt Exp $ */
/* $NetBSD: uvm_glue.c,v 1.97 2006/10/05 14:48:33 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.96 2006/08/29 23:34:48 matt Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.97 2006/10/05 14:48:33 chs Exp $");
#include "opt_coredump.h"
#include "opt_kgdb.h"
@ -171,13 +171,13 @@ uvm_chgkprot(caddr_t addr, size_t len, int rw)
*/
int
uvm_vslock(struct proc *p, caddr_t addr, size_t len, vm_prot_t access_type)
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access_type)
{
struct vm_map *map;
vaddr_t start, end;
int error;
map = &p->p_vmspace->vm_map;
map = &vs->vm_map;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
error = uvm_fault_wire(map, start, end, access_type, 0);
@ -192,9 +192,9 @@ uvm_vslock(struct proc *p, caddr_t addr, size_t len, vm_prot_t access_type)
*/
void
uvm_vsunlock(struct proc *p, caddr_t addr, size_t len)
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{
uvm_fault_unwire(&p->p_vmspace->vm_map, trunc_page((vaddr_t)addr),
uvm_fault_unwire(&vs->vm_map, trunc_page((vaddr_t)addr),
round_page((vaddr_t)addr + len));
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: uvm_mmap.c,v 1.99 2006/09/30 10:56:31 elad Exp $ */
/* $NetBSD: uvm_mmap.c,v 1.100 2006/10/05 14:48:33 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -51,7 +51,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.99 2006/09/30 10:56:31 elad Exp $");
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.100 2006/10/05 14:48:33 chs Exp $");
#include "opt_compat_netbsd.h"
#include "opt_pax.h"
@ -181,7 +181,7 @@ sys_mincore(l, v, retval)
*/
npgs = len >> PAGE_SHIFT;
error = uvm_vslock(p, vec, npgs, VM_PROT_WRITE);
error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
if (error) {
return error;
}
@ -272,7 +272,7 @@ sys_mincore(l, v, retval)
out:
vm_map_unlock_read(map);
uvm_vsunlock(p, SCARG(uap, vec), npgs);
uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
return (error);
}
@ -1065,6 +1065,7 @@ uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
int error;
int advice = UVM_ADV_NORMAL;
uvm_flag_t uvmflag = 0;
boolean_t needwritemap;
/*
* check params
@ -1181,10 +1182,26 @@ uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
return((vp->v_type == VREG) ? ENOMEM : EINVAL);
if ((flags & MAP_SHARED) == 0) {
uvmflag |= UVM_FLAG_COPYONW;
} else if ((maxprot & VM_PROT_WRITE) != 0) {
}
/*
* Set vnode flags to indicate the new kinds of mapping.
* We take the vnode lock in exclusive mode here to serialize
* with direct I/O.
*/
needwritemap = (vp->v_flag & VWRITEMAP) == 0 &&
(flags & MAP_SHARED) != 0 &&
(maxprot & VM_PROT_WRITE) != 0;
if ((vp->v_flag & VMAPPED) == 0 || needwritemap) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
simple_lock(&vp->v_interlock);
vp->v_flag |= VWRITEMAP;
vp->v_flag |= VMAPPED;
if (needwritemap) {
vp->v_flag |= VWRITEMAP;
}
simple_unlock(&vp->v_interlock);
VOP_UNLOCK(vp, 0);
}
}