diff --git a/sys/adosfs/advnops.c b/sys/adosfs/advnops.c index 90abaac8cc17..2a8180132ff0 100644 --- a/sys/adosfs/advnops.c +++ b/sys/adosfs/advnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: advnops.c,v 1.54 2000/08/03 00:54:23 thorpej Exp $ */ +/* $NetBSD: advnops.c,v 1.55 2000/11/27 08:39:39 chs Exp $ */ /* * Copyright (c) 1994 Christian E. Hopps @@ -143,7 +143,9 @@ struct vnodeopv_entry_desc adosfs_vnodeop_entries[] = { { &vop_truncate_desc, adosfs_truncate }, /* truncate */ { &vop_update_desc, adosfs_update }, /* update */ { &vop_bwrite_desc, adosfs_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_size_desc, genfs_size }, /* size */ + { NULL, NULL } }; struct vnodeopv_desc adosfs_vnodeop_opv_desc = @@ -226,6 +228,7 @@ adosfs_read(v) int a_ioflag; struct ucred *a_cred; } */ *sp = v; + struct vnode *vp = sp->a_vp; struct adosfsmount *amp; struct anode *ap; struct uio *uio; @@ -265,6 +268,28 @@ adosfs_read(v) /* * taken from ufs_read() */ + + if (vp->v_type == VREG) { + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = min(ap->fsize - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) { + break; + } + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } + } + goto out; + } + do { /* * we are only supporting ADosFFS currently @@ -326,6 +351,8 @@ adosfs_read(v) amp->bsize - amp->dbsize, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); + +out: reterr: #ifdef ADOSFS_DIAGNOSTIC printf(" %d)", error); diff --git a/sys/arch/atari/dev/md_root.c b/sys/arch/atari/dev/md_root.c index cca5977f4c85..7d2cee563770 100644 --- a/sys/arch/atari/dev/md_root.c +++ b/sys/arch/atari/dev/md_root.c @@ -1,4 +1,4 @@ -/* $NetBSD: md_root.c,v 1.14 2000/01/21 23:29:02 thorpej Exp $ */ +/* $NetBSD: md_root.c,v 1.15 2000/11/27 08:39:40 chs Exp $ */ /* * Copyright (c) 1996 Leo Weppelman. 
@@ -159,7 +159,6 @@ struct proc *proc; * Initialize our buffer header: */ memset(&buf, 0, sizeof(buf)); - buf.b_rcred = buf.b_wcred = proc->p_ucred; buf.b_vnbufs.le_next = NOLIST; buf.b_flags = B_BUSY; buf.b_dev = ld_dev; diff --git a/sys/coda/coda_subr.c b/sys/coda/coda_subr.c index 1dc53c6d69eb..4099bd6bfa45 100644 --- a/sys/coda/coda_subr.c +++ b/sys/coda/coda_subr.c @@ -1,4 +1,4 @@ -/* $NetBSD: coda_subr.c,v 1.9 2000/03/30 11:24:16 augustss Exp $ */ +/* $NetBSD: coda_subr.c,v 1.10 2000/11/27 08:39:40 chs Exp $ */ /* * @@ -227,7 +227,7 @@ coda_kill(whoIam, dcstat) #endif count++; CODADEBUG(CODA_FLUSH, - myprintf(("Live cnode fid %lx.%lx.%lx flags %d count %ld\n", + myprintf(("Live cnode fid %lx.%lx.%lx flags %d count %d\n", (cp->c_fid).Volume, (cp->c_fid).Vnode, (cp->c_fid).Unique, @@ -277,7 +277,7 @@ coda_testflush(void) for (cp = coda_cache[hash]; cp != NULL; cp = CNODE_NEXT(cp)) { - myprintf(("Live cnode fid %lx.%lx.%lx count %ld\n", + myprintf(("Live cnode fid %lx.%lx.%lx count %d\n", (cp->c_fid).Volume,(cp->c_fid).Vnode, (cp->c_fid).Unique, CTOV(cp)->v_usecount)); } @@ -424,7 +424,7 @@ int handleDownCall(opcode, out) if (CTOV(cp)->v_flag & VTEXT) error = coda_vmflush(cp); CODADEBUG(CODA_ZAPFILE, myprintf(("zapfile: fid = (%lx.%lx.%lx), - refcnt = %ld, error = %d\n", + refcnt = %d, error = %d\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, @@ -452,7 +452,7 @@ int handleDownCall(opcode, out) coda_nc_zapParentfid(&out->coda_zapdir.CodaFid, IS_DOWNCALL); CODADEBUG(CODA_ZAPDIR, myprintf(("zapdir: fid = (%lx.%lx.%lx), - refcnt = %ld\n",cp->c_fid.Volume, + refcnt = %d\n",cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, CTOV(cp)->v_usecount - 1));); @@ -486,7 +486,7 @@ int handleDownCall(opcode, out) error = coda_vmflush(cp); } - CODADEBUG(CODA_PURGEFID, myprintf(("purgefid: fid = (%lx.%lx.%lx), refcnt = %ld, error = %d\n", + CODADEBUG(CODA_PURGEFID, myprintf(("purgefid: fid = (%lx.%lx.%lx), refcnt = %d, error = %d\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, CTOV(cp)->v_usecount - 1, error));); diff --git a/sys/coda/coda_vnops.c b/sys/coda/coda_vnops.c index 0fc5c9e3388b..b46b59e87870 100644 --- a/sys/coda/coda_vnops.c +++ b/sys/coda/coda_vnops.c @@ -6,7 +6,7 @@ mkdir rmdir symlink */ -/* $NetBSD: coda_vnops.c,v 1.21 2000/09/19 22:00:01 fvdl Exp $ */ +/* $NetBSD: coda_vnops.c,v 1.22 2000/11/27 08:39:40 chs Exp $ */ /* * @@ -453,7 +453,7 @@ printf("coda_rdwr: Internally Opening %p\n", vp); } /* Have UFS handle the call. */ - CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = (%lx.%lx.%lx), refcnt = %ld\n", + CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = (%lx.%lx.%lx), refcnt = %d\n", cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, CTOV(cp)->v_usecount)); ) @@ -873,9 +873,9 @@ coda_inactive(v) if (IS_UNMOUNTING(cp)) { #ifdef DEBUG - printf("coda_inactive: IS_UNMOUNTING use %ld: vp %p, cp %p\n", vp->v_usecount, vp, cp); + printf("coda_inactive: IS_UNMOUNTING use %d: vp %p, cp %p\n", vp->v_usecount, vp, cp); if (cp->c_ovp != NULL) - printf("coda_inactive: cp->ovp != NULL use %ld: vp %p, cp %p\n", + printf("coda_inactive: cp->ovp != NULL use %d: vp %p, cp %p\n", vp->v_usecount, vp, cp); #endif lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock); @@ -1727,7 +1727,7 @@ printf("coda_readdir: Internally Opening %p\n", vp); } /* Have UFS handle the call. 
*/ - CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = (%lx.%lx.%lx), refcnt = %ld\n",cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_usecount)); ) + CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = (%lx.%lx.%lx), refcnt = %d\n",cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_usecount)); ) error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, cookies, ncookies); if (error) diff --git a/sys/conf/files b/sys/conf/files index 3d28ce0c0f44..481c3622f888 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $NetBSD: files,v 1.404 2000/11/26 17:44:02 ad Exp $ +# $NetBSD: files,v 1.405 2000/11/27 08:39:41 chs Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -1060,6 +1060,7 @@ file ufs/ufs/ufs_vnops.c ffs | lfs | mfs | ext2fs file uvm/uvm_amap.c file uvm/uvm_anon.c file uvm/uvm_aobj.c +file uvm/uvm_bio.c file uvm/uvm_device.c file uvm/uvm_fault.c file uvm/uvm_glue.c diff --git a/sys/dev/vnd.c b/sys/dev/vnd.c index bb4735e7c1d5..5fb85d49cf98 100644 --- a/sys/dev/vnd.c +++ b/sys/dev/vnd.c @@ -1,4 +1,4 @@ -/* $NetBSD: vnd.c,v 1.68 2000/09/12 08:03:24 enami Exp $ */ +/* $NetBSD: vnd.c,v 1.69 2000/11/27 08:39:41 chs Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. @@ -445,29 +445,7 @@ vndstrategy(bp) nbp->vb_buf.b_proc = bp->b_proc; nbp->vb_buf.b_iodone = vndiodone; nbp->vb_buf.b_vp = NULLVP; - nbp->vb_buf.b_rcred = vnd->sc_cred; /* XXX crdup? */ - nbp->vb_buf.b_wcred = vnd->sc_cred; /* XXX crdup? */ LIST_INIT(&nbp->vb_buf.b_dep); - if (bp->b_dirtyend == 0) { - nbp->vb_buf.b_dirtyoff = 0; - nbp->vb_buf.b_dirtyend = sz; - } else { - nbp->vb_buf.b_dirtyoff = - max(0, bp->b_dirtyoff - (bp->b_bcount - resid)); - nbp->vb_buf.b_dirtyend = - min(sz, - max(0, bp->b_dirtyend - (bp->b_bcount-resid))); - } - if (bp->b_validend == 0) { - nbp->vb_buf.b_validoff = 0; - nbp->vb_buf.b_validend = sz; - } else { - nbp->vb_buf.b_validoff = - max(0, bp->b_validoff - (bp->b_bcount - resid)); - nbp->vb_buf.b_validend = - min(sz, - max(0, bp->b_validend - (bp->b_bcount-resid))); - } nbp->vb_xfer = vnx; diff --git a/sys/filecorefs/filecore_vfsops.c b/sys/filecorefs/filecore_vfsops.c index 91473671314c..a806e670413a 100644 --- a/sys/filecorefs/filecore_vfsops.c +++ b/sys/filecorefs/filecore_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: filecore_vfsops.c,v 1.11 2000/03/16 18:08:22 jdolecek Exp $ */ +/* $NetBSD: filecore_vfsops.c,v 1.12 2000/11/27 08:39:41 chs Exp $ */ /*- * Copyright (c) 1998 Andrew McMurry @@ -324,6 +324,9 @@ filecore_mountfs(devvp, mp, p, argp) mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_FILECORE); mp->mnt_maxsymlinklen = 0; mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = fcdr->log2secsize; + mp->mnt_fs_bshift = fcmp->log2bsize; + fcmp->fc_mountp = mp; fcmp->fc_dev = dev; fcmp->fc_devvp = devvp; diff --git a/sys/filecorefs/filecore_vnops.c b/sys/filecorefs/filecore_vnops.c index 1e8b054de913..1e5db467341d 100644 --- a/sys/filecorefs/filecore_vnops.c +++ b/sys/filecorefs/filecore_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: filecore_vnops.c,v 1.9 2000/08/03 03:38:39 thorpej Exp $ */ +/* $NetBSD: filecore_vnops.c,v 1.10 2000/11/27 08:39:42 chs Exp $ */ /*- * Copyright (c) 1998 Andrew McMurry @@ -162,6 +162,28 @@ filecore_read(v) return (EINVAL); ip->i_flag |= IN_ACCESS; fcmp = ip->i_mnt; + + if (vp->v_type == VREG) { + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = min(ip->i_size - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) { + break; + } + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + 
&bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } + } + goto out; + } + do { lbn = lblkno(fcmp, uio->uio_offset); on = blkoff(fcmp, uio->uio_offset); @@ -213,6 +235,8 @@ filecore_read(v) #endif brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); + +out: return (error); } @@ -571,7 +595,9 @@ struct vnodeopv_entry_desc filecore_vnodeop_entries[] = { { &vop_truncate_desc, filecore_truncate }, /* truncate */ { &vop_update_desc, filecore_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_size_desc, genfs_size }, /* size */ + { NULL, NULL } }; struct vnodeopv_desc filecore_vnodeop_opv_desc = { &filecore_vnodeop_p, filecore_vnodeop_entries }; diff --git a/sys/isofs/cd9660/cd9660_vfsops.c b/sys/isofs/cd9660/cd9660_vfsops.c index c9642a96ad36..9e465136b772 100644 --- a/sys/isofs/cd9660/cd9660_vfsops.c +++ b/sys/isofs/cd9660/cd9660_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: cd9660_vfsops.c,v 1.49 2000/07/15 21:40:44 jdolecek Exp $ */ +/* $NetBSD: cd9660_vfsops.c,v 1.50 2000/11/27 08:39:42 chs Exp $ */ /*- * Copyright (c) 1994 @@ -399,6 +399,8 @@ iso_mountfs(devvp, mp, p, argp) mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_CD9660); mp->mnt_maxsymlinklen = 0; mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = iso_bsize; + mp->mnt_fs_bshift = isomp->im_bshift; isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; diff --git a/sys/isofs/cd9660/cd9660_vnops.c b/sys/isofs/cd9660/cd9660_vnops.c index d04d86a1611d..5203e113c6d2 100644 --- a/sys/isofs/cd9660/cd9660_vnops.c +++ b/sys/isofs/cd9660/cd9660_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: cd9660_vnops.c,v 1.60 2000/11/14 22:26:32 thorpej Exp $ */ +/* $NetBSD: cd9660_vnops.c,v 1.61 2000/11/27 08:39:42 chs Exp $ */ /*- * Copyright (c) 1994 @@ -278,6 +278,26 @@ cd9660_read(v) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; + + if (vp->v_type == VREG) { + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = min(ip->i_size - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) + break; + } + goto out; + } + do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); @@ -315,6 +335,8 @@ cd9660_read(v) error = uiomove(bp->b_data + on, (int)n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); + +out: return (error); } @@ -955,7 +977,9 @@ struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = { { &vop_truncate_desc, cd9660_truncate }, /* truncate */ { &vop_update_desc, cd9660_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_size_desc, genfs_size }, /* size */ + { NULL, NULL } }; struct vnodeopv_desc cd9660_vnodeop_opv_desc = { &cd9660_vnodeop_p, cd9660_vnodeop_entries }; @@ -1009,7 +1033,7 @@ struct vnodeopv_entry_desc cd9660_specop_entries[] = { { &vop_truncate_desc, spec_truncate }, /* truncate */ { &vop_update_desc, cd9660_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc cd9660_specop_opv_desc = { &cd9660_specop_p, cd9660_specop_entries }; @@ 
-1060,7 +1084,7 @@ struct vnodeopv_entry_desc cd9660_fifoop_entries[] = { { &vop_truncate_desc, fifo_truncate }, /* truncate */ { &vop_update_desc, cd9660_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc cd9660_fifoop_opv_desc = { &cd9660_fifoop_p, cd9660_fifoop_entries }; diff --git a/sys/kern/exec_subr.c b/sys/kern/exec_subr.c index 48c84ac376e9..bbc97152cc3f 100644 --- a/sys/kern/exec_subr.c +++ b/sys/kern/exec_subr.c @@ -1,4 +1,4 @@ -/* $NetBSD: exec_subr.c,v 1.25 2000/11/05 22:41:35 tv Exp $ */ +/* $NetBSD: exec_subr.c,v 1.26 2000/11/27 08:39:42 chs Exp $ */ /* * Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou @@ -163,6 +163,7 @@ vmcmd_map_pagedvn(struct proc *p, struct exec_vmcmd *cmd) uobj = uvn_attach((void *) cmd->ev_vp, VM_PROT_READ|VM_PROT_EXECUTE); if (uobj == NULL) return(ENOMEM); + VREF(cmd->ev_vp); /* * do the map diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 4fe93f7cf146..793afcabfd08 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,4 +1,4 @@ -/* $NetBSD: init_main.c,v 1.184 2000/11/21 00:37:56 jdolecek Exp $ */ +/* $NetBSD: init_main.c,v 1.185 2000/11/27 08:39:43 chs Exp $ */ /* * Copyright (c) 1995 Christopher G. Demetriou. All rights reserved. @@ -323,6 +323,8 @@ main(void) /* Configure the system hardware. This will enable interrupts. */ configure(); + ubc_init(); /* must be after autoconfig */ + /* Lock the kernel on behalf of proc0. */ KERNEL_PROC_LOCK(p); @@ -472,6 +474,10 @@ main(void) if (kthread_create1(sched_sync, NULL, NULL, "ioflush")) panic("fork syncer"); + /* Create the aiodone daemon kernel thread. */ + if (kthread_create1(uvm_aiodone_daemon, NULL, NULL, "aiodoned")) + panic("fork aiodoned"); + #if defined(MULTIPROCESSOR) /* Boot the secondary processors. */ cpu_boot_secondary_processors(); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 55320e5ecd3b..c0bcf8829de4 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,4 +1,4 @@ -/* $NetBSD: kern_exec.c,v 1.124 2000/11/21 00:37:56 jdolecek Exp $ */ +/* $NetBSD: kern_exec.c,v 1.125 2000/11/27 08:39:43 chs Exp $ */ /*- * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou @@ -152,6 +152,7 @@ check_exec(struct proc *p, struct exec_package *epp) VOP_UNLOCK(vp, 0); /* now we have the file, get the exec header */ + uvn_attach(vp, VM_PROT_READ); error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0, UIO_SYSSPACE, 0, p->p_ucred, &resid, p); if (error) diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index d538611be760..11c23b97f86c 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -1,4 +1,4 @@ -/* $NetBSD: kern_physio.c,v 1.44 2000/09/29 13:27:12 ad Exp $ */ +/* $NetBSD: kern_physio.c,v 1.45 2000/11/27 08:39:43 chs Exp $ */ /*- * Copyright (c) 1994 Christopher G. Demetriou @@ -290,8 +290,7 @@ getphysbuf() splx(s); memset(bp, 0, sizeof(*bp)); - /* XXXCDC: are the following two lines necessary? */ - bp->b_rcred = bp->b_wcred = NOCRED; + /* XXXCDC: is the following line necessary? */ bp->b_vnbufs.le_next = NOLIST; return(bp); diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index c2acb655e696..3a80417d3590 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_bio.c,v 1.72 2000/11/18 15:58:13 simonb Exp $ */ +/* $NetBSD: vfs_bio.c,v 1.73 2000/11/27 08:39:43 chs Exp $ */ /*- * Copyright (c) 1994 Christopher G. 
Demetriou @@ -59,7 +59,7 @@ #include #include -#include +#include #include @@ -72,7 +72,7 @@ * Definitions for the buffer hash lists. */ #define BUFHASH(dvp, lbn) \ - (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) + (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash]) LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; u_long bufhash; struct bio_ops bioops; /* I/O operation notification */ @@ -134,7 +134,6 @@ bremfree(bp) panic("bremfree: lost tail"); } TAILQ_REMOVE(dp, bp, b_freelist); - splx(s); } @@ -166,8 +165,6 @@ bufinit() bp = &buf[i]; memset((char *)bp, 0, sizeof(*bp)); bp->b_dev = NODEV; - bp->b_rcred = NOCRED; - bp->b_wcred = NOCRED; bp->b_vnbufs.le_next = NOLIST; LIST_INIT(&bp->b_dep); bp->b_data = buffers + i * MAXBSIZE; @@ -201,12 +198,8 @@ bio_doread(vp, blkno, size, cred, async) * Therefore, it's valid if it's I/O has completed or been delayed. */ if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) { - /* Start I/O for the buffer (keeping credentials). */ + /* Start I/O for the buffer. */ SET(bp->b_flags, B_READ | async); - if (cred != NOCRED && bp->b_rcred == NOCRED) { - crhold(cred); - bp->b_rcred = cred; - } VOP_STRATEGY(bp); /* Pay for the read. */ @@ -375,7 +368,6 @@ bwrite(bp) bp->b_vp->v_numoutput++; splx(s); - SET(bp->b_flags, B_WRITEINPROG); VOP_STRATEGY(bp); if (sync) { @@ -509,6 +501,8 @@ brelse(bp) struct bqueues *bufq; int s; + KASSERT(ISSET(bp->b_flags, B_BUSY)); + /* Wake up any processes waiting for any buffer to become free. */ if (needbuffer) { needbuffer = 0; @@ -602,6 +596,7 @@ brelse(bp) already_queued: /* Unlock the buffer. */ CLR(bp->b_flags, B_AGE|B_ASYNC|B_BUSY|B_NOCACHE|B_ORDERED); + SET(bp->b_flags, B_CACHE); /* Allow disk interrupts. */ splx(s); @@ -630,7 +625,7 @@ incore(vp, blkno) return (bp); } - return (0); + return (NULL); } /* @@ -647,56 +642,38 @@ getblk(vp, blkno, size, slpflag, slptimeo) daddr_t blkno; int size, slpflag, slptimeo; { - struct bufhashhdr *bh; struct buf *bp; int s, err; - /* - * XXX - * The following is an inlined version of 'incore()', but with - * the 'invalid' test moved to after the 'busy' test. It's - * necessary because there are some cases in which the NFS - * code sets B_INVAL prior to writing data to the server, but - * in which the buffers actually contain valid data. In this - * case, we can't allow the system to allocate a new buffer for - * the block until the write is finished. 
- */ - bh = BUFHASH(vp, blkno); start: - bp = bh->lh_first; - for (; bp != NULL; bp = bp->b_hash.le_next) { - if (bp->b_lblkno != blkno || bp->b_vp != vp) - continue; - + bp = incore(vp, blkno); + if (bp != NULL) { s = splbio(); if (ISSET(bp->b_flags, B_BUSY)) { + if (curproc == uvm.pagedaemon_proc) { + splx(s); + return NULL; + } SET(bp->b_flags, B_WANTED); err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk", - slptimeo); + slptimeo); splx(s); if (err) return (NULL); goto start; } - - if (!ISSET(bp->b_flags, B_INVAL)) { #ifdef DIAGNOSTIC - if (ISSET(bp->b_flags, B_DONE|B_DELWRI) && - bp->b_bcount < size) - panic("getblk: block size invariant failed"); + if (ISSET(bp->b_flags, B_DONE|B_DELWRI) && bp->b_bcount < size) + panic("getblk: block size invariant failed"); #endif - SET(bp->b_flags, B_BUSY); - bremfree(bp); - splx(s); - break; - } + SET(bp->b_flags, B_BUSY); + bremfree(bp); splx(s); - } - - if (bp == NULL) { + } else { if ((bp = getnewbuf(slpflag, slptimeo)) == NULL) goto start; - binshash(bp, bh); + + binshash(bp, BUFHASH(vp, blkno)); bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno; s = splbio(); bgetvp(vp, bp); @@ -720,7 +697,6 @@ geteblk(size) SET(bp->b_flags, B_INVAL); binshash(bp, &invalhash); allocbuf(bp, size); - return (bp); } @@ -737,9 +713,9 @@ allocbuf(bp, size) struct buf *bp; int size; { - struct buf *nbp; - vsize_t desired_size; - int s; + struct buf *nbp; + vsize_t desired_size; + int s; desired_size = round_page((vsize_t)size); if (desired_size > MAXBSIZE) @@ -759,6 +735,7 @@ allocbuf(bp, size) /* find a buffer */ while ((nbp = getnewbuf(0, 0)) == NULL) ; + SET(nbp->b_flags, B_INVAL); binshash(nbp, &invalhash); @@ -836,7 +813,7 @@ start: needbuffer = 1; tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo); splx(s); - return (0); + return (NULL); } if (ISSET(bp->b_flags, B_VFLUSH)) { @@ -882,18 +859,6 @@ start: bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_validoff = bp->b_validend = 0; - - /* nuke any credentials we were holding */ - if (bp->b_rcred != NOCRED) { - crfree(bp->b_rcred); - bp->b_rcred = NOCRED; - } - if (bp->b_wcred != NOCRED) { - crfree(bp->b_wcred); - bp->b_wcred = NOCRED; - } bremhash(bp); return (bp); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index b4b878da85e0..a4a31b7bd660 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_subr.c,v 1.141 2000/11/24 03:59:09 chs Exp $ */ +/* $NetBSD: vfs_subr.c,v 1.142 2000/11/27 08:39:44 chs Exp $ */ /*- * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc. @@ -413,6 +413,8 @@ getnewvnode(tag, mp, vops, vpp) int (**vops) __P((void *)); struct vnode **vpp; { + extern struct uvm_pagerops uvm_vnodeops; + struct uvm_object *uobj; struct proc *p = curproc; /* XXX */ struct freelst *listhd; static int toggle; @@ -451,6 +453,7 @@ getnewvnode(tag, mp, vops, vpp) * vnode_hold_list because we will lose the identity of all its * referencing buffers. 
*/ + toggle ^= 1; if (numvnodes > 2 * desiredvnodes) toggle = 0; @@ -461,7 +464,7 @@ getnewvnode(tag, mp, vops, vpp) (TAILQ_FIRST(listhd = &vnode_hold_list) == NULL || toggle))) { simple_unlock(&vnode_free_list_slock); vp = pool_get(&vnode_pool, PR_WAITOK); - memset((char *)vp, 0, sizeof(*vp)); + memset(vp, 0, sizeof(*vp)); simple_lock_init(&vp->v_interlock); numvnodes++; } else { @@ -522,6 +525,7 @@ getnewvnode(tag, mp, vops, vpp) vp->v_type = VNON; vp->v_vnlock = &vp->v_lock; lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0); + lockinit(&vp->v_glock, PVFS, "glock", 0, 0); cache_purge(vp); vp->v_tag = tag; vp->v_op = vops; @@ -530,6 +534,16 @@ getnewvnode(tag, mp, vops, vpp) vp->v_usecount = 1; vp->v_data = 0; simple_lock_init(&vp->v_uvm.u_obj.vmobjlock); + + /* + * initialize uvm_object within vnode. + */ + + uobj = &vp->v_uvm.u_obj; + uobj->pgops = &uvm_vnodeops; + TAILQ_INIT(&uobj->memq); + vp->v_uvm.u_size = VSIZENOTSET; + if (mp && error != EDEADLK) vfs_unbusy(mp); return (0); @@ -606,7 +620,6 @@ vwakeup(bp) { struct vnode *vp; - bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp) != NULL) { if (--vp->v_numoutput < 0) panic("vwakeup: neg numoutput, vp %p", vp); @@ -630,9 +643,21 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) struct proc *p; int slpflag, slptimeo; { + struct uvm_object *uobj = &vp->v_uvm.u_obj; struct buf *bp, *nbp; - int s, error; + int s, error, rv; + int flushflags = PGO_ALLPAGES|PGO_FREE|PGO_SYNCIO| + (flags & V_SAVE ? PGO_CLEANIT : 0); + /* XXXUBC this doesn't look at flags or slp* */ + if (vp->v_type == VREG) { + simple_lock(&uobj->vmobjlock); + rv = (uobj->pgops->pgo_flush)(uobj, 0, 0, flushflags); + simple_unlock(&uobj->vmobjlock); + if (!rv) { + return EIO; + } + } if (flags & V_SAVE) { error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p); if (error) @@ -714,10 +739,22 @@ vtruncbuf(vp, lbn, slpflag, slptimeo) daddr_t lbn; int slpflag, slptimeo; { + struct uvm_object *uobj = &vp->v_uvm.u_obj; struct buf *bp, *nbp; - int s, error; + int s, error, rv; s = splbio(); + if (vp->v_type == VREG) { + simple_lock(&uobj->vmobjlock); + rv = (uobj->pgops->pgo_flush)(uobj, + round_page(lbn << vp->v_mount->mnt_fs_bshift), + vp->v_uvm.u_size, PGO_FREE); + simple_unlock(&uobj->vmobjlock); + if (!rv) { + splx(s); + return EIO; + } + } restart: for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { @@ -726,7 +763,7 @@ restart: continue; if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; - error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), + error = tsleep(bp, slpflag | (PRIBIO + 1), "vtruncbuf", slptimeo); if (error) { splx(s); @@ -744,7 +781,7 @@ restart: continue; if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; - error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), + error = tsleep(bp, slpflag | (PRIBIO + 1), "vtruncbuf", slptimeo); if (error) { splx(s); @@ -766,9 +803,18 @@ vflushbuf(vp, sync) struct vnode *vp; int sync; { + struct uvm_object *uobj = &vp->v_uvm.u_obj; struct buf *bp, *nbp; int s; + if (vp->v_type == VREG) { + int flags = PGO_CLEANIT|PGO_ALLPAGES| (sync ? 
PGO_SYNCIO : 0); + + simple_lock(&uobj->vmobjlock); + (uobj->pgops->pgo_flush)(uobj, 0, 0, flags); + simple_unlock(&uobj->vmobjlock); + } + loop: s = splbio(); for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { @@ -850,11 +896,14 @@ brelvp(bp) */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); - if ((vp->v_flag & VONWORKLST) && LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { + + if (vp->v_type != VREG && (vp->v_flag & VONWORKLST) && + LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { vp->v_flag &= ~VONWORKLST; LIST_REMOVE(vp, v_synclist); } - bp->b_vp = (struct vnode *) 0; + + bp->b_vp = NULL; HOLDRELE(vp); splx(s); } @@ -874,11 +923,6 @@ reassignbuf(bp, newvp) struct buflists *listheadp; int delay; - if (newvp == NULL) { - printf("reassignbuf: NULL"); - return; - } - /* * Delete from old vnode list, if on one. */ @@ -890,7 +934,8 @@ reassignbuf(bp, newvp) */ if ((bp->b_flags & B_DELWRI) == 0) { listheadp = &newvp->v_cleanblkhd; - if ((newvp->v_flag & VONWORKLST) && + if (newvp->v_type != VREG && + (newvp->v_flag & VONWORKLST) && LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); @@ -1074,9 +1119,13 @@ vget(vp, flags) * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. */ + if ((flags & LK_INTERLOCK) == 0) simple_lock(&vp->v_interlock); if (vp->v_flag & VXLOCK) { + if (flags & LK_NOWAIT) { + return EBUSY; + } vp->v_flag |= VXWANT; ltsleep((caddr_t)vp, PINOD|PNORELOCK, "vget", 0, &vp->v_interlock); @@ -1167,6 +1216,7 @@ vput(vp) else TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VTEXT; simple_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); } @@ -1194,7 +1244,7 @@ vrele(vp) #ifdef DIAGNOSTIC if (vp->v_usecount < 0 || vp->v_writecount != 0) { vprint("vrele: bad ref count", vp); - panic("vrele: ref cnt"); + panic("vrele: ref cnt vp %p", vp); } #endif /* @@ -1206,6 +1256,7 @@ vrele(vp) else TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VTEXT; if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) VOP_INACTIVE(vp, p); } @@ -1256,6 +1307,7 @@ holdrele(vp) if (vp->v_holdcnt <= 0) panic("holdrele: holdcnt vp %p", vp); vp->v_holdcnt--; + /* * If it is on the holdlist and the hold count drops to * zero, move it to the free list. The test of the back @@ -1269,6 +1321,7 @@ holdrele(vp) * getnewvnode after removing it from a freelist to ensure * that we do not try to move it here. */ + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && vp->v_holdcnt == 0 && vp->v_usecount == 0) { simple_lock(&vnode_free_list_slock); @@ -1427,6 +1480,8 @@ vclean(vp, flags, p) if (vp->v_flag & VXLOCK) panic("vclean: deadlock, vp %p", vp); vp->v_flag |= VXLOCK; + vp->v_flag &= ~VTEXT; + /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK @@ -1437,11 +1492,7 @@ vclean(vp, flags, p) VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK); /* - * clean out any VM data associated with the vnode. - */ - uvm_vnp_terminate(vp); - /* - * Clean out any buffers associated with the vnode. + * Clean out any cached data associated with the vnode. */ if (flags & DOCLOSE) vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); @@ -1467,7 +1518,6 @@ vclean(vp, flags, p) */ if (VOP_RECLAIM(vp, p)) panic("vclean: cannot reclaim, vp %p", vp); - if (active) { /* * Inline copy of vrele() since VOP_INACTIVE @@ -1484,6 +1534,7 @@ vclean(vp, flags, p) /* * Insert at tail of LRU list. 
*/ + simple_unlock(&vp->v_interlock); simple_lock(&vnode_free_list_slock); #ifdef DIAGNOSTIC @@ -1740,7 +1791,7 @@ vprint(label, vp) if (label != NULL) printf("%s: ", label); - printf("tag %d type %s, usecount %ld, writecount %ld, refcount %ld,", + printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,", vp->v_tag, typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; @@ -2365,7 +2416,7 @@ vfs_shutdown() /* avoid coming back this way again if we panic. */ doing_shutdown = 1; - sys_sync(p, (void *)0, (register_t *)0); + sys_sync(p, NULL, NULL); /* Wait for sync to finish. */ dcount = 10000; @@ -2608,10 +2659,10 @@ vfs_detach(vfs) #ifdef DDB const char buf_flagbits[] = - "\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6CACHE\7CALL\10DELWRI" + "\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI" "\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE" - "\21PAGET\22PGIN\23PHYS\24RAW\25READ\26TAPE\27UAREA\30WANTED" - "\31WRITEINPROG\32XXX\33VFLUSH"; + "\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED" + "\32XXX\33VFLUSH"; void vfs_buf_print(bp, full, pr) @@ -2629,15 +2680,9 @@ vfs_buf_print(bp, full, pr) (*pr)(" bufsize 0x%x bcount 0x%x resid 0x%x\n", bp->b_bufsize, bp->b_bcount, bp->b_resid); - (*pr)(" data %p saveaddr %p\n", - bp->b_data, bp->b_saveaddr); + (*pr)(" data %p saveaddr %p dep %p\n", + bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep)); (*pr)(" iodone %p\n", bp->b_iodone); - - (*pr)(" dirtyoff 0x%x dirtyend 0x%x validoff 0x%x validend 0x%x\n", - bp->b_dirtyoff, bp->b_dirtyend, - bp->b_validoff, bp->b_validend); - - (*pr)(" rcred %p wcred %p\n", bp->b_rcred, bp->b_wcred); } @@ -2689,16 +2734,17 @@ vfs_vnode_print(vp, full, pr) int full; void (*pr) __P((const char *, ...)); { - char buf[1024]; + char buf[256]; const char *vtype, *vtag; uvm_object_printit(&vp->v_uvm.u_obj, full, pr); bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf)); (*pr)("\nVNODE flags %s\n", buf); - (*pr)("nio %d size 0x%x wlist %s\n", - vp->v_uvm.u_nio, vp->v_uvm.u_size, - vp->v_uvm.u_wlist.le_next ? "YES" : "NO"); + (*pr)("mp %p nio %d size 0x%x rwlock 0x%x glock 0x%x\n", + vp->v_mount, vp->v_uvm.u_nio, (int)vp->v_uvm.u_size, + vp->v_vnlock ? 
lockstatus(vp->v_vnlock) : 0x999, + lockstatus(&vp->v_glock)); (*pr)("data %p usecount %d writecount %d holdcnt %d numoutput %d\n", vp->v_data, vp->v_usecount, vp->v_writecount, @@ -2723,16 +2769,14 @@ vfs_vnode_print(vp, full, pr) struct buf *bp; (*pr)("clean bufs:\n"); - for (bp = LIST_FIRST(&vp->v_cleanblkhd); - bp != NULL; - bp = LIST_NEXT(bp, b_vnbufs)) { + LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) { + (*pr)(" bp %p\n", bp); vfs_buf_print(bp, full, pr); } (*pr)("dirty bufs:\n"); - for (bp = LIST_FIRST(&vp->v_dirtyblkhd); - bp != NULL; - bp = LIST_NEXT(bp, b_vnbufs)) { + LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) { + (*pr)(" bp %p\n", bp); vfs_buf_print(bp, full, pr); } } diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 58466e983323..bb42edafad56 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_syscalls.c,v 1.163 2000/09/28 06:43:20 enami Exp $ */ +/* $NetBSD: vfs_syscalls.c,v 1.164 2000/11/27 08:39:44 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -571,7 +571,6 @@ sys_sync(p, v, retval) if ((mp->mnt_flag & MNT_RDONLY) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; - uvm_vnp_sync(mp); VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; @@ -1181,6 +1180,11 @@ sys_fhopen(p, v, retval) } if ((error = VOP_OPEN(vp, flags, cred, p)) != 0) goto bad; + if (vp->v_type == VREG && + uvn_attach(vp, flags & FWRITE ? VM_PROT_WRITE : 0) == NULL) { + error = EIO; + goto bad; + } if (flags & FWRITE) vp->v_writecount++; @@ -1583,8 +1587,6 @@ sys_unlink(p, v, retval) goto out; } - (void)uvm_vnp_uncache(vp); - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); @@ -2852,7 +2854,6 @@ out: if (fromnd.ni_dvp != tdvp) VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); if (tvp) { - (void)uvm_vnp_uncache(tvp); VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 20fc480cc176..96b0323938e2 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_vnops.c,v 1.44 2000/08/12 16:43:00 sommerfeld Exp $ */ +/* $NetBSD: vfs_vnops.c,v 1.45 2000/11/27 08:39:44 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -156,8 +156,14 @@ vn_open(ndp, fmode, cmode) } if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0) goto bad; + if (vp->v_type == VREG && + uvn_attach(vp, fmode & FWRITE ? VM_PROT_WRITE : 0) == NULL) { + error = EIO; + goto bad; + } if (fmode & FWRITE) vp->v_writecount++; + return (0); bad: vput(vp); @@ -174,11 +180,10 @@ vn_writechk(vp) { /* - * If there's shared text associated with - * the vnode, try to free it up once. If - * we fail, we can't allow writing. + * If the vnode is in use as a process's text, + * we can't allow writing. */ - if ((vp->v_flag & VTEXT) && !uvm_vnp_uncache(vp)) + if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); } diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index f9ea58cc6c7f..7bf4b624b0d8 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -1,4 +1,4 @@ -# $NetBSD: vnode_if.src,v 1.25 2000/09/19 21:57:14 fvdl Exp $ +# $NetBSD: vnode_if.src,v 1.26 2000/11/27 08:39:45 chs Exp $ # # Copyright (c) 1992, 1993 # The Regents of the University of California. All rights reserved. 
@@ -502,6 +502,17 @@ vop_balloc { OUT struct buf **bpp; }; +# +#% ballocn vp L L L +# +vop_ballocn { + IN struct vnode *vp; + IN off_t offset; + IN off_t length; + IN struct ucred *cred; + IN int flags; +}; + # #% reallocblks vp L L L # @@ -569,3 +580,37 @@ vop_whiteout { #vop_bwrite { # IN struct buf *bp; #}; + +# +#% getpages vp L L L +# +vop_getpages { + IN struct vnode *vp; + IN voff_t offset; + IN vm_page_t *m; + IN int *count; + IN int centeridx; + IN vm_prot_t access_type; + IN int advice; + IN int flags; +}; + +# +#% putpages vp L L L +# +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int flags; + IN int *rtvals; +}; + +# +#% size vp = = = +# +vop_size { + IN struct vnode *vp; + IN off_t size; + OUT off_t *eobp; +}; diff --git a/sys/miscfs/genfs/genfs.h b/sys/miscfs/genfs/genfs.h index cf6ed901048d..85608a2e4abb 100644 --- a/sys/miscfs/genfs/genfs.h +++ b/sys/miscfs/genfs/genfs.h @@ -1,4 +1,4 @@ -/* $NetBSD: genfs.h,v 1.10 1999/08/03 20:19:19 wrstuden Exp $ */ +/* $NetBSD: genfs.h,v 1.11 2000/11/27 08:39:45 chs Exp $ */ int genfs_badop __P((void *)); int genfs_nullop __P((void *)); @@ -22,3 +22,6 @@ int genfs_lease_check __P((void *)); int genfs_lock __P((void *)); int genfs_islocked __P((void *)); int genfs_unlock __P((void *)); +int genfs_getpages __P((void *)); +int genfs_putpages __P((void *)); +int genfs_size __P((void *)); diff --git a/sys/miscfs/genfs/genfs_vnops.c b/sys/miscfs/genfs/genfs_vnops.c index 8b0c220816fe..4b9168c280a6 100644 --- a/sys/miscfs/genfs/genfs_vnops.c +++ b/sys/miscfs/genfs/genfs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: genfs_vnops.c,v 1.20 2000/09/19 22:01:59 fvdl Exp $ */ +/* $NetBSD: genfs_vnops.c,v 1.21 2000/11/27 08:39:45 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -50,6 +50,9 @@ #include #include +#include +#include + #ifdef NFSSERVER #include #include @@ -414,3 +417,659 @@ genfs_lease_check(v) return (0); #endif /* NFSSERVER */ } + +/* + * generic VM getpages routine. + * Return PG_BUSY pages for the given range, + * reading from backing store if necessary. 
+ */ + +int +genfs_getpages(v) + void *v; +{ + struct vop_getpages_args /* { + struct vnode *a_vp; + voff_t a_offset; + vm_page_t *a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; + } */ *ap = v; + + off_t eof, offset, origoffset, startoffset, endoffset, raoffset; + daddr_t lbn, blkno; + int s, i, error, npages, orignpages, npgs, run, ridx, pidx, pcount; + int fs_bshift, fs_bsize, dev_bshift, dev_bsize; + int flags = ap->a_flags; + size_t bytes, iobytes, tailbytes, totalbytes, skipbytes; + vaddr_t kva; + struct buf *bp, *mbp; + struct vnode *vp = ap->a_vp; + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct vm_page *pgs[16]; /* XXXUBC 16 */ + struct ucred *cred = curproc->p_ucred; /* XXXUBC curproc */ + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0; + boolean_t sawhole = FALSE; + UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist); + + /* XXXUBC temp limit */ + if (*ap->a_count > 16) { + return EINVAL; + } + + error = VOP_SIZE(vp, vp->v_uvm.u_size, &eof); + if (error) { + return error; + } + +#ifdef DIAGNOSTIC + if (ap->a_centeridx < 0 || ap->a_centeridx > *ap->a_count) { + panic("genfs_getpages: centeridx %d out of range", + ap->a_centeridx); + } + if (ap->a_offset & (PAGE_SIZE - 1) || ap->a_offset < 0) { + panic("genfs_getpages: offset 0x%x", (int)ap->a_offset); + } + if (*ap->a_count < 0) { + panic("genfs_getpages: count %d < 0", *ap->a_count); + } +#endif + + /* + * Bounds-check the request. + */ + + error = 0; + origoffset = ap->a_offset; + + if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= eof && + (flags & PGO_PASTEOF) == 0) { + if ((flags & PGO_LOCKED) == 0) { + simple_unlock(&uobj->vmobjlock); + } + UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x", + origoffset, *ap->a_count, eof,0); + return EINVAL; + } + + /* + * For PGO_LOCKED requests, just return whatever's in memory. + */ + + if (flags & PGO_LOCKED) { + uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, + UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY); + + return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0; + } + + /* vnode is VOP_LOCKed, uobj is locked */ + + if (write && (vp->v_flag & VONWORKLST) == 0) { + vn_syncer_add_to_worklist(vp, filedelay); + } + + /* + * find the requested pages and make some simple checks. + * leave space in the page array for a whole block. + */ + + fs_bshift = vp->v_mount->mnt_fs_bshift; + fs_bsize = 1 << fs_bshift; + dev_bshift = vp->v_mount->mnt_dev_bshift; + dev_bsize = 1 << dev_bshift; + KASSERT((eof & (dev_bsize - 1)) == 0); + + orignpages = min(*ap->a_count, + round_page(eof - origoffset) >> PAGE_SHIFT); + if (flags & PGO_PASTEOF) { + orignpages = *ap->a_count; + } + npages = orignpages; + startoffset = origoffset & ~(fs_bsize - 1); + endoffset = round_page((origoffset + (npages << PAGE_SHIFT) + + fs_bsize - 1) & ~(fs_bsize - 1)); + endoffset = min(endoffset, round_page(eof)); + ridx = (origoffset - startoffset) >> PAGE_SHIFT; + + memset(pgs, 0, sizeof(pgs)); + uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL); + + /* + * if PGO_OVERWRITE is set, don't bother reading the pages. + * PGO_OVERWRITE also means that the caller guarantees + * that the pages already have backing store allocated. 
+ */ + + if (flags & PGO_OVERWRITE) { + UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0); + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + uvm_pagezero(pg); + pg->flags &= ~(PG_FAKE); + } + pg->flags &= ~(PG_RDONLY); + } + goto out; + } + + /* + * if the pages are already resident, just return them. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if ((pg->flags & PG_FAKE) || + (write && (pg->flags & PG_RDONLY))) { + break; + } + } + if (i == npages) { + UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0); + raoffset = origoffset + (orignpages << PAGE_SHIFT); + goto raout; + } + + /* + * the page wasn't resident and we're not overwriting, + * so we're going to have to do some i/o. + * find any additional pages needed to cover the expanded range. + */ + + if (startoffset != origoffset) { + + /* + * XXXUBC we need to avoid deadlocks caused by locking + * additional pages at lower offsets than pages we + * already have locked. for now, unlock them all and + * start over. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + pg->flags |= PG_RELEASED; + } + } + uvm_page_unbusy(&pgs[ridx], npages); + memset(pgs, 0, sizeof(pgs)); + + UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x", + startoffset, endoffset, 0,0); + npages = (endoffset - startoffset) >> PAGE_SHIFT; + npgs = npages; + uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL); + } + simple_unlock(&uobj->vmobjlock); + + /* + * read the desired page(s). + */ + + totalbytes = npages << PAGE_SHIFT; + bytes = min(totalbytes, eof - startoffset); + tailbytes = totalbytes - bytes; + skipbytes = 0; + + kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK | + UVMPAGER_MAPIN_READ); + + s = splbio(); + mbp = pool_get(&bufpool, PR_WAITOK); + splx(s); + mbp->b_bufsize = totalbytes; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + LIST_INIT(&mbp->b_dep); + + /* + * if EOF is in the middle of the last page, zero the part past EOF. + */ + + if (tailbytes > 0) { + memset((void *)(kva + bytes), 0, tailbytes); + } + + /* + * now loop over the pages, reading as needed. + */ + + if (write) { + lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL); + } else { + lockmgr(&vp->v_glock, LK_SHARED, NULL); + } + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + + /* + * skip pages which don't need to be read. + */ + + pidx = (offset - startoffset) >> PAGE_SHIFT; + while ((pgs[pidx]->flags & PG_FAKE) == 0) { + size_t b; + +#ifdef DEBUG + if (offset & (PAGE_SIZE - 1)) { + panic("genfs_getpages: skipping from middle " + "of page"); + } +#endif + + b = min(PAGE_SIZE, bytes); + offset += b; + bytes -= b; + skipbytes += b; + pidx++; + UVMHIST_LOG(ubchist, "skipping, new offset 0x%x", + offset, 0,0,0); + if (bytes == 0) { + goto loopdone; + } + } + + /* + * bmap the file to find out the blkno to read from and + * how much we can read in one i/o. if bmap returns an error, + * skip the rest of the top-level i/o. + */ + + lbn = offset >> fs_bshift; + error = VOP_BMAP(vp, lbn, NULL, &blkno, &run); + if (error) { + UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n", + lbn, error,0,0); + skipbytes += bytes; + goto loopdone; + } + + /* + * see how many pages can be read with this i/o. 
+ * reduce the i/o size if necessary to avoid + * overwriting pages with valid data. + */ + + iobytes = min(((lbn + 1 + run) << fs_bshift) - offset, bytes); + if (offset + iobytes > round_page(offset)) { + pcount = 1; + while (pidx + pcount < npages && + pgs[pidx + pcount]->flags & PG_FAKE) { + pcount++; + } + iobytes = min(iobytes, (pcount << PAGE_SHIFT) - + (offset - trunc_page(offset))); + } + + /* + * if this block isn't allocated, zero it instead of reading it. + * if this is a read access, mark the pages we zeroed PG_RDONLY. + */ + + if (blkno < 0) { + UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE", lbn,0,0,0); + + sawhole = TRUE; + memset((char *)kva + (offset - startoffset), 0, + iobytes); + skipbytes += iobytes; + + if (!write) { + int holepages = + (round_page(offset + iobytes) - + trunc_page(offset)) >> PAGE_SHIFT; + for (i = 0; i < holepages; i++) { + pgs[pidx + i]->flags |= PG_RDONLY; + } + } + continue; + } + + /* + * allocate a sub-buf for this piece of the i/o + * (or just use mbp if there's only 1 piece), + * and start it going. + */ + + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + bp = pool_get(&bufpool, PR_WAITOK); + splx(s); + bp->b_data = (char *)kva + offset - startoffset; + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_READ|B_CALL; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + LIST_INIT(&bp->b_dep); + } + bp->b_lblkno = 0; + bp->b_private = mbp; + + /* adjust physical blkno for partial blocks */ + bp->b_blkno = blkno + ((offset - (lbn << fs_bshift)) >> + dev_bshift); + + UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x", + bp, offset, iobytes, bp->b_blkno); + + VOP_STRATEGY(bp); + } + +loopdone: + if (skipbytes) { + s = splbio(); + if (error) { + mbp->b_flags |= B_ERROR; + mbp->b_error = error; + } + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + + if (async) { + UVMHIST_LOG(ubchist, "returning PEND",0,0,0,0); + lockmgr(&vp->v_glock, LK_RELEASE, NULL); + return EINPROGRESS; + } + if (bp != NULL) { + error = biowait(mbp); + } + s = splbio(); + pool_put(&bufpool, mbp); + splx(s); + uvm_pagermapout(kva, npages); + raoffset = offset; + + /* + * if this we encountered a hole then we have to do a little more work. + * for read faults, we marked the page PG_RDONLY so that future + * write accesses to the page will fault again. + * for write faults, we must make sure that the backing store for + * the page is completely allocated while the pages are locked. + */ + + if (error == 0 && sawhole && write) { + error = VOP_BALLOCN(vp, startoffset, npages << PAGE_SHIFT, + cred, 0); + if (error) { + UVMHIST_LOG(ubchist, "balloc lbn 0x%x -> %d", + lbn, error,0,0); + lockmgr(&vp->v_glock, LK_RELEASE, NULL); + simple_lock(&uobj->vmobjlock); + goto out; + } + } + lockmgr(&vp->v_glock, LK_RELEASE, NULL); + simple_lock(&uobj->vmobjlock); + + /* + * see if we want to start any readahead. + * XXXUBC for now, just read the next 128k on 64k boundaries. + * this is pretty nonsensical, but it is 50% faster than reading + * just the next 64k. 
+ */ + +raout: + if (!async && !write && ((int)raoffset & 0xffff) == 0 && + PAGE_SHIFT <= 16) { + int racount; + + racount = 1 << (16 - PAGE_SHIFT); + (void) VOP_GETPAGES(vp, raoffset, NULL, &racount, 0, + VM_PROT_READ, 0, 0); + simple_lock(&uobj->vmobjlock); + + racount = 1 << (16 - PAGE_SHIFT); + (void) VOP_GETPAGES(vp, raoffset + 0x10000, NULL, &racount, 0, + VM_PROT_READ, 0, 0); + simple_lock(&uobj->vmobjlock); + } + + /* + * we're almost done! release the pages... + * for errors, we free the pages. + * otherwise we activate them and mark them as valid and clean. + * also, unbusy pages that were not actually requested. + */ + +out: + if (error) { + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if ((pgs[i]->flags & PG_FAKE) == 0) { + continue; + } + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + uvm_pagefree(pgs[i]); + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0); + return error; + } + + UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_FAKE) { + UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x", + pgs[i], pgs[i]->offset,0,0); + pgs[i]->flags &= ~(PG_FAKE); + pmap_clear_modify(pgs[i]); + pmap_clear_reference(pgs[i]); + } + if (write) { + pgs[i]->flags &= ~(PG_RDONLY); + } + if (i < ridx || i >= ridx + orignpages || async) { + UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x", + pgs[i], pgs[i]->offset,0,0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->wire_count == 0) { + uvm_pageactivate(pgs[i]); + } + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + } + simple_unlock(&uobj->vmobjlock); + if (ap->a_m != NULL) { + memcpy(ap->a_m, &pgs[ridx], + orignpages * sizeof(struct vm_page *)); + } + return 0; +} + +/* + * generic VM putpages routine. + * Write the given range of pages to backing store. 
+ */ + +int +genfs_putpages(v) + void *v; +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + struct vm_page **a_m; + int a_count; + int a_flags; + int *a_rtvals; + } */ *ap = v; + + int s, error, error2, npages, run; + int fs_bshift, dev_bshift, dev_bsize; + vaddr_t kva; + off_t eof, offset, startoffset; + size_t bytes, iobytes, skipbytes; + daddr_t lbn, blkno; + struct vm_page *pg; + struct buf *mbp, *bp; + struct vnode *vp = ap->a_vp; + boolean_t async = (ap->a_flags & PGO_SYNCIO) == 0; + UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist); + + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + + error = VOP_SIZE(vp, vp->v_uvm.u_size, &eof); + if (error) { + return error; + } + + error = error2 = 0; + npages = ap->a_count; + fs_bshift = vp->v_mount->mnt_fs_bshift; + dev_bshift = vp->v_mount->mnt_dev_bshift; + dev_bsize = 1 << dev_bshift; + KASSERT((eof & (dev_bsize - 1)) == 0); + + pg = ap->a_m[0]; + startoffset = pg->offset; + bytes = min(npages << PAGE_SHIFT, eof - startoffset); + skipbytes = 0; + KASSERT(bytes != 0); + + kva = uvm_pagermapin(ap->a_m, npages, UVMPAGER_MAPIN_WAITOK); + + s = splbio(); + vp->v_numoutput += 2; + mbp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + splx(s); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_WRITE|B_AGE | + (async ? B_CALL : 0) | + (curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + LIST_INIT(&mbp->b_dep); + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + lbn = offset >> fs_bshift; + error = VOP_BMAP(vp, lbn, NULL, &blkno, &run); + if (error) { + UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0); + skipbytes += bytes; + bytes = 0; + break; + } + + iobytes = min(((lbn + 1 + run) << fs_bshift) - offset, bytes); + if (blkno == (daddr_t)-1) { + skipbytes += iobytes; + continue; + } + + /* if it's really one i/o, don't make a second buf */ + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + vp->v_numoutput++; + bp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + splx(s); + bp->b_data = (char *)kva + + (vaddr_t)(offset - pg->offset); + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + LIST_INIT(&bp->b_dep); + } + bp->b_lblkno = 0; + bp->b_private = mbp; + + /* adjust physical blkno for partial blocks */ + bp->b_blkno = blkno + ((offset - (lbn << fs_bshift)) >> + dev_bshift); + UVMHIST_LOG(ubchist, "vp %p offset 0x%x bcount 0x%x blkno 0x%x", + vp, offset, bp->b_bcount, bp->b_blkno); + VOP_STRATEGY(bp); + } + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", bytes, 0,0,0); + s = splbio(); + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + UVMHIST_LOG(ubchist, "returning PEND", 0,0,0,0); + return EINPROGRESS; + } + if (bp != NULL) { + UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0); + error2 = biowait(mbp); + } + { + /* XXXUBC */ + void softdep_pageiodone(struct buf *); + softdep_pageiodone(mbp); + } + s = splbio(); + vwakeup(mbp); + pool_put(&bufpool, mbp); + splx(s); + uvm_pagermapout(kva, npages); + UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0); + return error ? 
error : error2; +} + +int +genfs_size(v) + void *v; +{ + struct vop_size_args /* { + struct vnode *a_vp; + off_t a_size; + off_t *a_eobp; + } */ *ap = v; + int bsize; + + bsize = 1 << ap->a_vp->v_mount->mnt_fs_bshift; + *ap->a_eobp = (ap->a_size + bsize) & ~(bsize - 1); + return 0; +} diff --git a/sys/miscfs/genfs/layer_subr.c b/sys/miscfs/genfs/layer_subr.c index bbef11b66cc2..f879dde6c105 100644 --- a/sys/miscfs/genfs/layer_subr.c +++ b/sys/miscfs/genfs/layer_subr.c @@ -1,4 +1,4 @@ -/* $NetBSD: layer_subr.c,v 1.6 2000/03/16 18:08:24 jdolecek Exp $ */ +/* $NetBSD: layer_subr.c,v 1.7 2000/11/27 08:39:45 chs Exp $ */ /* * Copyright (c) 1999 National Aeronautics & Space Administration @@ -272,6 +272,7 @@ layer_node_alloc(mp, lowervp, vpp) VREF(lowervp); /* Take into account reference held in layer_node */ hd = LAYER_NHASH(lmp, lowervp); LIST_INSERT_HEAD(hd, xp, layer_hash); + uvm_vnp_setsize(vp, 0); simple_unlock(&lmp->layerm_hashlock); return (0); } diff --git a/sys/miscfs/procfs/procfs_subr.c b/sys/miscfs/procfs/procfs_subr.c index 914d6d324251..d1007949f92a 100644 --- a/sys/miscfs/procfs/procfs_subr.c +++ b/sys/miscfs/procfs/procfs_subr.c @@ -1,4 +1,4 @@ -/* $NetBSD: procfs_subr.c,v 1.33 2000/11/24 18:58:37 chs Exp $ */ +/* $NetBSD: procfs_subr.c,v 1.34 2000/11/27 08:39:46 chs Exp $ */ /* * Copyright (c) 1994 Christopher G. Demetriou. All rights reserved. @@ -167,6 +167,7 @@ procfs_allocvp(mp, vpp, pid, pfs_type) } procfs_hashins(pfs); + uvm_vnp_setsize(vp, 0); lockmgr(&pfs_hashlock, LK_RELEASE, NULL); return (error); diff --git a/sys/miscfs/syncfs/sync_subr.c b/sys/miscfs/syncfs/sync_subr.c index 2e222f1c7eda..2a89c4eb46c4 100644 --- a/sys/miscfs/syncfs/sync_subr.c +++ b/sys/miscfs/syncfs/sync_subr.c @@ -1,4 +1,4 @@ -/* $NetBSD: sync_subr.c,v 1.7 2000/10/06 19:08:00 jdolecek Exp $ */ +/* $NetBSD: sync_subr.c,v 1.8 2000/11/27 08:39:46 chs Exp $ */ /* * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved. @@ -188,16 +188,14 @@ sched_sync(v) } s = splbio(); if (LIST_FIRST(slp) == vp) { - if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL && - vp->v_type != VBLK) - panic("sched_sync: fsync failed vp %p tag %d", - vp, vp->v_tag); + /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ + vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); @@ -235,7 +233,7 @@ sched_sync(v) * filesystem activity. */ if (time.tv_sec == starttime) - tsleep(&lbolt, PPAUSE, "syncer", 0); + tsleep(&rushjob, PPAUSE, "syncer", hz); } } @@ -247,21 +245,12 @@ sched_sync(v) int speedup_syncer() { - int s; - - /* - * XXX Should not be doing this, should be using ltsleep() - * XXX with a timeout, rather than sleeping on lbolt. - */ - SCHED_LOCK(s); - if (updateproc && updateproc->p_wchan == &lbolt) - setrunnable(updateproc); - SCHED_UNLOCK(s); - - if (rushjob < syncdelay / 2) { - rushjob += 1; - stat_rush_requests += 1; - return (1); + if (rushjob >= syncdelay / 2) { + return (0); } - return(0); + + rushjob++; + wakeup(&rushjob); + stat_rush_requests += 1; + return (1); } diff --git a/sys/msdosfs/msdosfs_denode.c b/sys/msdosfs/msdosfs_denode.c index 615364796fe8..cf1fc3a00ac8 100644 --- a/sys/msdosfs/msdosfs_denode.c +++ b/sys/msdosfs/msdosfs_denode.c @@ -1,4 +1,4 @@ -/* $NetBSD: msdosfs_denode.c,v 1.46 2000/11/08 14:28:14 ad Exp $ */ +/* $NetBSD: msdosfs_denode.c,v 1.47 2000/11/27 08:39:46 chs Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. 
@@ -323,6 +323,7 @@ deget(pmp, dirclust, diroffset, depp) nvp->v_type = VREG; VREF(ldep->de_devvp); *depp = ldep; + nvp->v_uvm.u_size = ldep->de_FileSize; return (0); } @@ -427,7 +428,7 @@ detrunc(dep, length, flags, cred, p) #endif return (error); } - uvm_vnp_uncache(DETOV(dep)); /* what's this for? */ + /* * is this the right place for it? */ diff --git a/sys/msdosfs/msdosfs_fat.c b/sys/msdosfs/msdosfs_fat.c index 94c5db40a21b..f778511061a3 100644 --- a/sys/msdosfs/msdosfs_fat.c +++ b/sys/msdosfs/msdosfs_fat.c @@ -1,4 +1,4 @@ -/* $NetBSD: msdosfs_fat.c,v 1.33 2000/05/13 06:04:42 cgd Exp $ */ +/* $NetBSD: msdosfs_fat.c,v 1.34 2000/11/27 08:39:46 chs Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. @@ -965,6 +965,7 @@ fillinusemap(pmp) * the de_flag field of the denode and it does not change the de_FileSize * field. This is left for the caller to do. */ + int extendfile(dep, count, bpp, ncp, flags) struct denode *dep; @@ -974,8 +975,7 @@ extendfile(dep, count, bpp, ncp, flags) int flags; { int error; - u_long frcn; - u_long cn, got; + u_long frcn, cn, got, origcount; struct msdosfsmount *pmp = dep->de_pmp; struct buf *bp; @@ -1002,16 +1002,19 @@ extendfile(dep, count, bpp, ncp, flags) return (error); } + origcount = count; while (count > 0) { + /* * Allocate a new cluster chain and cat onto the end of the - * file. * If the file is empty we make de_StartCluster point + * file. If the file is empty we make de_StartCluster point * to the new block. Note that de_StartCluster being 0 is * sufficient to be sure the file is empty since we exclude * attempts to extend the root directory above, and the root * dir is the only file with a startcluster of 0 that has * blocks allocated (sort of). */ + if (dep->de_StartCluster == 0) cn = 0; else @@ -1046,41 +1049,33 @@ extendfile(dep, count, bpp, ncp, flags) } /* - * Update the "last cluster of the file" entry in the denode's fat - * cache. + * Update the "last cluster of the file" entry in the + * denode's fat cache. */ - fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1); - if (flags & DE_CLEAR) { + fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1); + if ((flags & DE_CLEAR) && + (dep->de_Attributes & ATTR_DIRECTORY)) { while (got-- > 0) { - /* - * Get the buf header for the new block of the file. - */ - if (dep->de_Attributes & ATTR_DIRECTORY) - bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), - pmp->pm_bpcluster, 0, 0); - else { - bp = getblk(DETOV(dep), de_cn2bn(pmp, frcn++), + bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), pmp->pm_bpcluster, 0, 0); - /* - * Do the bmap now, as in msdosfs_write - */ - if (pcbmap(dep, - de_bn2cn(pmp, bp->b_lblkno), - &bp->b_blkno, 0, 0)) - bp->b_blkno = -1; - if (bp->b_blkno == -1) - panic("extendfile: pcbmap"); - } clrbuf(bp); if (bpp) { *bpp = bp; - bpp = NULL; - } else + bpp = NULL; + } else { bdwrite(bp); + } } } } + if ((flags & DE_CLEAR) && !(dep->de_Attributes & ATTR_DIRECTORY)) { + int cnshift = pmp->pm_cnshift; + + uvm_vnp_zerorange(DETOV(dep), frcn << cnshift, + origcount << cnshift); + } + return (0); } diff --git a/sys/msdosfs/msdosfs_vfsops.c b/sys/msdosfs/msdosfs_vfsops.c index 8c7f82fc44f9..4a80da71e875 100644 --- a/sys/msdosfs/msdosfs_vfsops.c +++ b/sys/msdosfs/msdosfs_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: msdosfs_vfsops.c,v 1.70 2000/09/19 22:02:10 fvdl Exp $ */ +/* $NetBSD: msdosfs_vfsops.c,v 1.71 2000/11/27 08:39:47 chs Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. 
@@ -699,6 +699,9 @@ msdosfs_mountfs(devvp, mp, p, argp) mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_MSDOS); mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = pmp->pm_bnshift; + mp->mnt_fs_bshift = pmp->pm_cnshift; + #ifdef QUOTA /* * If we ever do quotas for DOS filesystems this would be a place diff --git a/sys/msdosfs/msdosfs_vnops.c b/sys/msdosfs/msdosfs_vnops.c index d6c758b6f12b..cc364365b0fa 100644 --- a/sys/msdosfs/msdosfs_vnops.c +++ b/sys/msdosfs/msdosfs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: msdosfs_vnops.c,v 1.104 2000/08/03 20:41:29 thorpej Exp $ */ +/* $NetBSD: msdosfs_vnops.c,v 1.105 2000/11/27 08:39:47 chs Exp $ */ /*- * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. @@ -464,11 +464,11 @@ msdosfs_read(v) int error = 0; int64_t diff; int blsize; - int isadir; long n; long on; daddr_t lbn; - daddr_t rablock; + void *win; + vsize_t bytelen; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); @@ -478,12 +478,31 @@ msdosfs_read(v) /* * If they didn't ask for any data, then we are done. */ + if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) return (EINVAL); - isadir = dep->de_Attributes & ATTR_DIRECTORY; + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + bytelen = min(dep->de_FileSize - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) + break; + } + dep->de_flag |= DE_ACCESS; + goto out; + } + + /* this loop is only for directories now */ do { lbn = de_cluster(pmp, uio->uio_offset); on = uio->uio_offset & pmp->pm_crbomask; @@ -494,41 +513,28 @@ msdosfs_read(v) diff = dep->de_FileSize - uio->uio_offset; if (diff < n) n = (long) diff; - /* convert cluster # to block # if a directory */ - if (isadir) { - error = pcbmap(dep, lbn, &lbn, 0, &blsize); - if (error) - return (error); - } + + /* convert cluster # to block # */ + error = pcbmap(dep, lbn, &lbn, 0, &blsize); + if (error) + return (error); + /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. 
*/ - if (isadir) { - error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); - } else { - rablock = lbn + 1; - if (vp->v_lastr + 1 == lbn && - de_cn2off(pmp, rablock) < dep->de_FileSize) - error = breada(vp, de_cn2bn(pmp, lbn), - pmp->pm_bpcluster, de_cn2bn(pmp, rablock), - pmp->pm_bpcluster, NOCRED, &bp); - else - error = bread(vp, de_cn2bn(pmp, lbn), - pmp->pm_bpcluster, NOCRED, &bp); - vp->v_lastr = lbn; - } + error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); n = min(n, pmp->pm_bpcluster - bp->b_resid); if (error) { brelse(bp); return (error); } error = uiomove(bp->b_data + on, (int) n, uio); - if (!isadir) - dep->de_flag |= DE_ACCESS; brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); + +out: if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) error = deupdat(dep, 1); return (error); @@ -547,19 +553,19 @@ msdosfs_write(v) int a_ioflag; struct ucred *a_cred; } */ *ap = v; - int n; - int croffset; int resid; u_long osize; int error = 0; u_long count; - daddr_t bn, lastcn; - struct buf *bp; + daddr_t lastcn; int ioflag = ap->a_ioflag; + void *win; + vsize_t bytelen; + off_t oldoff; + boolean_t rv; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct vnode *vp = ap->a_vp; - struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; @@ -575,7 +581,6 @@ msdosfs_write(v) case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; - thisvp = vp; break; case VDIR: return EISDIR; @@ -630,84 +635,53 @@ msdosfs_write(v) } else lastcn = de_clcount(pmp, osize) - 1; + if (dep->de_FileSize < uio->uio_offset + resid) { + dep->de_FileSize = uio->uio_offset + resid; + uvm_vnp_setsize(vp, dep->de_FileSize); + } + do { - if (de_cluster(pmp, uio->uio_offset) > lastcn) { + oldoff = uio->uio_offset; + if (de_cluster(pmp, oldoff) > lastcn) { error = ENOSPC; break; } + bytelen = min(dep->de_FileSize - oldoff, uio->uio_resid); - bn = de_blk(pmp, uio->uio_offset); - if ((uio->uio_offset & pmp->pm_crbomask) == 0 - && (de_blk(pmp, uio->uio_offset + uio->uio_resid) > de_blk(pmp, uio->uio_offset) - || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { - /* - * If either the whole cluster gets written, - * or we write the cluster from its start beyond EOF, - * then no need to read data from disk. - */ - bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0); - clrbuf(bp); - /* - * Do the bmap now, since pcbmap needs buffers - * for the fat table. (see msdosfs_strategy) - */ - if (bp->b_blkno == bp->b_lblkno) { - error = pcbmap(dep, - de_bn2cn(pmp, bp->b_lblkno), - &bp->b_blkno, 0, 0); - if (error) - bp->b_blkno = -1; - } - if (bp->b_blkno == -1) { - brelse(bp); - if (!error) - error = EIO; /* XXX */ - break; - } - } else { - /* - * The block we need to write into exists, so read it in. - */ - error = bread(thisvp, bn, pmp->pm_bpcluster, - NOCRED, &bp); - if (error) { - brelse(bp); - break; - } + /* + * XXXUBC if file is mapped and this is the last block, + * process one page at a time. + */ + + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, oldoff, &bytelen, UBC_WRITE); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; } - croffset = uio->uio_offset & pmp->pm_crbomask; - n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); - if (uio->uio_offset + n > dep->de_FileSize) { - dep->de_FileSize = uio->uio_offset + n; - uvm_vnp_setsize(vp, dep->de_FileSize);/* why? */ + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing.
+ */ + + if (ioflag & IO_SYNC) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, oldoff, + oldoff + bytelen, PGO_CLEANIT|PGO_SYNCIO); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } else if (oldoff >> 16 != uio->uio_offset >> 16) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); } - (void) uvm_vnp_uncache(vp); /* why not? */ - /* - * Should these vnode_pager_* functions be done on dir - * files? - */ - - /* - * Copy the data from user space into the buf header. - */ - error = uiomove(bp->b_data + croffset, n, uio); - - /* - * If they want this synchronous then write it and wait for - * it. Otherwise, if on a cluster boundary write it - * asynchronously so we can move on to the next block - * without delay. Otherwise do a delayed write because we - * may want to write somemore into the block later. - */ - if (ioflag & IO_SYNC) - (void) bwrite(bp); - else if (n + croffset == pmp->pm_bpcluster) - bawrite(bp); - else - bdwrite(bp); - dep->de_flag |= DE_UPDATE; } while (error == 0 && uio->uio_resid > 0); + dep->de_flag |= DE_UPDATE; /* * If the write failed and they want us to, truncate the file back @@ -720,7 +694,8 @@ errexit: uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { - detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL); + detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, + NULL); if (uio->uio_resid != resid) error = 0; } @@ -1805,12 +1780,12 @@ msdosfs_strategy(v) biodone(bp); return (error); } -#ifdef DIAGNOSTIC -#endif + /* * Read/write the block from/to the disk that contains the desired * file block. */ + vp = dep->de_devvp; bp->b_dev = vp->v_rdev; VOCALL(vp->v_op, VOFFSET(vop_strategy), ap); @@ -1934,7 +1909,10 @@ struct vnodeopv_entry_desc msdosfs_vnodeop_entries[] = { { &vop_reallocblks_desc, msdosfs_reallocblks }, /* reallocblks */ { &vop_update_desc, msdosfs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc *)NULL, (int (*) __P((void *)))NULL } + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_putpages_desc, genfs_putpages }, /* putpages */ + { &vop_size_desc, genfs_size }, /* size */ + { NULL, NULL } }; struct vnodeopv_desc msdosfs_vnodeop_opv_desc = { &msdosfs_vnodeop_p, msdosfs_vnodeop_entries }; diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h index 0bd1750ae2fa..aacd7e98e28a 100644 --- a/sys/nfs/nfs.h +++ b/sys/nfs/nfs.h @@ -1,4 +1,4 @@ -/* $NetBSD: nfs.h,v 1.24 2000/09/19 22:05:55 fvdl Exp $ */ +/* $NetBSD: nfs.h,v 1.25 2000/11/27 08:39:48 chs Exp $ */ /* * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. @@ -84,8 +84,18 @@ extern int nfs_niothreads; /* Number of async_daemons desired */ * DIRBLKSIZ. */ +#if 1 +/* + * XXXUBC temp hack because of the removal of b_validend. + * eventually we'll store NFS VDIR data in the page cache as well, + * we'll fix this at that point. 
+ */ +#define NFS_DIRBLKSIZ PAGE_SIZE +#define NFS_DIRFRAGSIZ PAGE_SIZE +#else #define NFS_DIRBLKSIZ 8192 /* Must be a multiple of DIRBLKSIZ */ #define NFS_DIRFRAGSIZ 512 /* Same as DIRBLKSIZ, generally */ +#endif /* * Maximum number of directory entries cached per NFS node, to avoid @@ -120,10 +130,10 @@ extern int nfs_niothreads; /* Number of async_daemons desired */ #endif /* - * The B_INVAFTERWRITE flag should be set to whatever is required by the - * buffer cache code to say "Invalidate the block after it is written back". + * Use the vm_page flag reserved for pager use to indicate pages + * which have been written to the server but not yet committed. */ -#define B_INVAFTERWRITE B_INVAL +#define PG_NEEDCOMMIT PG_PAGER1 /* * The IO_METASYNC flag should be implemented for local file systems. diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index a09ca6528cc0..cfde347404e4 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_bio.c,v 1.53 2000/09/19 23:26:26 bjh21 Exp $ */ +/* $NetBSD: nfs_bio.c,v 1.54 2000/11/27 08:39:48 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -39,6 +39,7 @@ */ #include "opt_nfs.h" +#include "opt_ddb.h" #include #include @@ -52,8 +53,10 @@ #include #include #include +#include #include +#include #include #include @@ -78,18 +81,17 @@ nfs_bioread(vp, uio, ioflag, cred, cflag) struct ucred *cred; { struct nfsnode *np = VTONFS(vp); - int biosize, diff; + int biosize; struct buf *bp = NULL, *rabp; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct nfsdircache *ndp = NULL, *nndp = NULL; - daddr_t lbn, bn, rabn; caddr_t baddr, ep, edp; - int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin, en, enn; + int got_buf = 0, error = 0, n = 0, on = 0, en, enn; int enough = 0; struct dirent *dp, *pdp; - off_t curoff = 0, offdiff; + off_t curoff = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -109,6 +111,7 @@ nfs_bioread(vp, uio, ioflag, cred, cflag) (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); biosize = nmp->nm_rsize; + /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is @@ -126,6 +129,7 @@ nfs_bioread(vp, uio, ioflag, cred, cflag) * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ + if ((nmp->nm_flag & NFSMNT_NQNFS) == 0 && vp->v_type != VLNK) { if (np->n_flag & NMODIFIED) { if (vp->v_type != VREG) { @@ -158,8 +162,18 @@ nfs_bioread(vp, uio, ioflag, cred, cflag) } } } - do { + /* + * update the cached read creds for this node. + */ + + if (np->n_rcred) { + crfree(np->n_rcred); + } + np->n_rcred = cred; + crhold(cred); + + do { #ifndef NFS_V2_ONLY /* * Get a valid lease. If cached data is stale, flush it. @@ -199,7 +213,7 @@ nfs_bioread(vp, uio, ioflag, cred, cflag) || ((vp->v_flag & VROOT) && vp->v_type == VLNK)) { switch (vp->v_type) { case VREG: - return (nfs_readrpc(vp, uio, cred)); + return (nfs_readrpc(vp, uio)); case VLNK: return (nfs_readlinkrpc(vp, uio, cred)); case VDIR: @@ -213,89 +227,26 @@ nfs_bioread(vp, uio, ioflag, cred, cflag) switch (vp->v_type) { case VREG: nfsstats.biocache_reads++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize - 1); - bn = lbn * (biosize / DEV_BSIZE); - not_readin = 1; - /* - * Start the read ahead(s), as required. 
- */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0 && - lbn - 1 == vp->v_lastr) { - for (nra = 0; nra < nmp->nm_readahead && - (lbn + 1 + nra) * biosize < np->n_size; nra++) { - rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE); - if (!incore(vp, rabn)) { - rabp = nfs_getcacheblk(vp, rabn, biosize, p); - if (!rabp) - return (EINTR); - if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) { - rabp->b_flags |= (B_READ | B_ASYNC); - if (nfs_asyncio(rabp, cred)) { - rabp->b_flags |= B_INVAL; - brelse(rabp); - } - } else - brelse(rabp); - } - } - } + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = min(np->n_size - uio->uio_offset, + uio->uio_resid); - /* - * If the block is in the cache and has the required data - * in a valid region, just copy it out. - * Otherwise, get the block and write back/read in, - * as required. - */ - if ((bp = incore(vp, bn)) && - (bp->b_flags & (B_BUSY | B_WRITEINPROG)) == - (B_BUSY | B_WRITEINPROG)) - got_buf = 0; - else { -again: - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); - got_buf = 1; - if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { - bp->b_flags |= B_READ; - not_readin = 0; - error = nfs_doio(bp, cred, p); - if (error) { - brelse(bp); - return (error); - } + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; } } - n = min((unsigned)(biosize - on), uio->uio_resid); - offdiff = np->n_size - uio->uio_offset; - if (offdiff < (off_t)n) - n = (int)offdiff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - if (!got_buf) { - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); - got_buf = 1; - } - bp->b_flags |= B_INVAFTERWRITE; - if (bp->b_dirtyend > 0) { - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; - } - } - vp->v_lastr = lbn; - diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on); - if (diff < n) - n = diff; + n = 0; break; + case VLNK: nfsstats.biocache_readlinks++; bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); @@ -303,7 +254,7 @@ again: return (EINTR); if ((bp->b_flags & B_DONE) == 0) { bp->b_flags |= B_READ; - error = nfs_doio(bp, cred, p); + error = nfs_doio(bp, p); if (error) { brelse(bp); return (error); @@ -347,7 +298,7 @@ diragain: if ((bp->b_flags & B_DONE) == 0) { bp->b_flags |= B_READ; bp->b_dcookie = ndp->dc_blkcookie; - error = nfs_doio(bp, cred, p); + error = nfs_doio(bp, p); if (error) { /* * Yuck! The directory has been modified on the @@ -383,7 +334,7 @@ diragain: en = ndp->dc_entry; pdp = dp = (struct dirent *)bp->b_data; - edp = bp->b_data + bp->b_validend; + edp = bp->b_data + bp->b_bcount; enn = 0; while (enn < en && (caddr_t)dp < edp) { pdp = dp; @@ -427,11 +378,11 @@ diragain: enn++; } - if (uio->uio_resid < (bp->b_validend - on)) { + if (uio->uio_resid < (bp->b_bcount - on)) { n = uio->uio_resid; enough = 1; } else - n = bp->b_validend - on; + n = bp->b_bcount - on; ep = bp->b_data + on + n; @@ -461,7 +412,7 @@ diragain: * set of the offset to it. 
*/ - if ((on + n) < bp->b_validend) { + if ((on + n) < bp->b_bcount) { curoff = NFS_GETCOOKIE(pdp); nndp = nfs_enterdircache(vp, curoff, ndp->dc_blkcookie, enn, bp->b_lblkno); @@ -499,7 +450,7 @@ diragain: if ((rabp->b_flags & (B_DONE | B_DELWRI)) == 0) { rabp->b_dcookie = nndp->dc_cookie; rabp->b_flags |= (B_READ | B_ASYNC); - if (nfs_asyncio(rabp, cred)) { + if (nfs_asyncio(rabp)) { rabp->b_flags |= B_INVAL; brelse(rabp); } @@ -512,7 +463,7 @@ diragain: default: printf(" nfsbioread: type %x unexpected\n",vp->v_type); break; - }; + } if (n > 0) { if (!baddr) @@ -554,18 +505,16 @@ nfs_write(v) int a_ioflag; struct ucred *a_cred; } */ *ap = v; - int biosize; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; - struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, bn; - int n, on, error = 0, iomode, must_commit; + int error = 0, iomode, must_commit; + int rv; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) @@ -614,18 +563,29 @@ nfs_write(v) psignal(p, SIGXFSZ); return (EFBIG); } - /* - * I use nm_rsize, not nm_wsize so that all buffer cache blocks - * will be the same size within a filesystem. nfs_writerpc will - * still use nm_wsize when sizing the rpc's. - */ - biosize = nmp->nm_rsize; - do { - /* - * XXX make sure we aren't cached in the VM page cache - */ - (void)uvm_vnp_uncache(vp); + /* + * update the cached write creds for this node. + */ + + if (np->n_wcred) { + crfree(np->n_wcred); + } + np->n_wcred = cred; + crhold(cred); + + if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { + iomode = NFSV3WRITE_FILESYNC; + error = nfs_writerpc(vp, uio, &iomode, &must_commit); + if (must_commit) + nfs_clearcommit(vp->v_mount); + return (error); + } + + do { + void *win; + voff_t oldoff = uio->uio_offset; + vsize_t bytelen = uio->uio_resid; #ifndef NFS_V2_ONLY /* @@ -647,128 +607,47 @@ nfs_write(v) } } #endif - if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) { - iomode = NFSV3WRITE_FILESYNC; - error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit); - if (must_commit) - nfs_clearcommit(vp->v_mount); - return (error); - } nfsstats.biocache_writes++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); - n = min((unsigned)(biosize - on), uio->uio_resid); - bn = lbn * (biosize / DEV_BSIZE); -again: - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); - if (bp->b_wcred == NOCRED) { - crhold(cred); - bp->b_wcred = cred; - } + np->n_flag |= NMODIFIED; - if (uio->uio_offset + n > np->n_size) { - np->n_size = uio->uio_offset + n; + if (np->n_size < uio->uio_offset + bytelen) { + np->n_size = uio->uio_offset + bytelen; uvm_vnp_setsize(vp, np->n_size); } - - /* - * If the new write will leave a contiguous dirty - * area, just update the b_dirtyoff and b_dirtyend, - * otherwise force a write rpc of the old dirty area. - */ - if (bp->b_dirtyend > 0 && - (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { - bp->b_proc = p; - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - goto again; - } - -#ifndef NFS_V2_ONLY - /* - * Check for valid write lease and get one as required. - * In case getblk() and/or bwrite() delayed us. 
- */ - if ((nmp->nm_flag & NFSMNT_NQNFS) && - NQNFS_CKINVALID(vp, np, ND_WRITE)) { - do { - error = nqnfs_getlease(vp, ND_WRITE, cred, p); - } while (error == NQNFS_EXPIRED); - if (error) { - brelse(bp); - return (error); - } - if (np->n_lrev != np->n_brev || - (np->n_flag & NQNFSNONCACHE)) { - brelse(bp); - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - np->n_brev = np->n_lrev; - goto again; - } - } -#endif - error = uiomove((char *)bp->b_data + on, n, uio); + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen, + UBC_WRITE); + error = uiomove(win, bytelen, uio); if (error) { - bp->b_flags |= B_ERROR; - brelse(bp); - return (error); + memset((void *)trunc_page((vaddr_t)win), 0, + round_page((vaddr_t)win + bytelen) - + trunc_page((vaddr_t)win)); } - if (bp->b_dirtyend > 0) { - bp->b_dirtyoff = min(on, bp->b_dirtyoff); - bp->b_dirtyend = max((on + n), bp->b_dirtyend); - } else { - bp->b_dirtyoff = on; - bp->b_dirtyend = on + n; - } - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; - } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); - } - - /* - * Since this block is being modified, it must be written - * again and not just committed. - */ - if (NFS_ISV3(vp)) { - lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); - if (bp->b_flags & B_NEEDCOMMIT) { - bp->b_flags &= ~B_NEEDCOMMIT; - nfs_del_tobecommitted_range(vp, bp); - } - nfs_del_committed_range(vp, bp); - lockmgr(&np->n_commitlock, LK_RELEASE, NULL); - } - - /* - * If the lease is non-cachable or IO_SYNC do bwrite(). - */ + ubc_release(win, 0); + rv = 1; if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) { - bp->b_proc = p; - error = VOP_BWRITE(bp); - if (error) - return (error); - if (np->n_flag & NQNFSNONCACHE) { - error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1); - if (error) - return (error); - } - } else if ((n + on) == biosize && - (nmp->nm_flag & NFSMNT_NQNFS) == 0) { - bp->b_proc = (struct proc *)0; - bawrite(bp); - } else { - bdwrite(bp); + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, + oldoff & ~(nmp->nm_wsize - 1), + uio->uio_offset & ~(nmp->nm_wsize - 1), + PGO_CLEANIT|PGO_SYNCIO); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } else if ((oldoff & ~(nmp->nm_wsize - 1)) != + (uio->uio_offset & ~(nmp->nm_wsize - 1))) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, + oldoff & ~(nmp->nm_wsize - 1), + uio->uio_offset & ~(nmp->nm_wsize - 1), + PGO_CLEANIT|PGO_WEAK); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); } - } while (uio->uio_resid > 0 && n > 0); - return (0); + if (!rv) { + error = EIO; + break; + } + } while (uio->uio_resid > 0); + return error; } /* @@ -790,9 +669,9 @@ nfs_getcacheblk(vp, bn, size, p) if (nmp->nm_flag & NFSMNT_INT) { bp = getblk(vp, bn, size, PCATCH, 0); - while (bp == (struct buf *)0) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return ((struct buf *)0); + while (bp == NULL) { + if (nfs_sigintr(nmp, NULL, p)) + return (NULL); bp = getblk(vp, bn, size, 0, 2 * hz); } } else @@ -832,7 +711,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) np->n_flag |= NFLUSHWANT; error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); - if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) + if (error && intrflg && nfs_sigintr(nmp, NULL, p)) return 
(EINTR); } @@ -842,7 +721,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) np->n_flag |= NFLUSHINPROG; error = vinvalbuf(vp, flags, cred, p, slpflag, 0); while (error) { - if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { + if (intrflg && nfs_sigintr(nmp, NULL, p)) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; @@ -866,9 +745,8 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) * are all hung on a dead server. */ int -nfs_asyncio(bp, cred) +nfs_asyncio(bp) struct buf *bp; - struct ucred *cred; { int i; struct nfsmount *nmp; @@ -894,7 +772,7 @@ again: * Found one, so wake it up and tell it which * mount to process. */ - nfs_iodwant[i] = (struct proc *)0; + nfs_iodwant[i] = NULL; nfs_iodmount[i] = nmp; nmp->nm_bufqiods++; wakeup((caddr_t)&nfs_iodwant[i]); @@ -935,20 +813,6 @@ again: if (nmp->nm_bufqiods == 0) goto again; } - - if (bp->b_flags & B_READ) { - if (bp->b_rcred == NOCRED && cred != NOCRED) { - crhold(cred); - bp->b_rcred = cred; - } - } else { - bp->b_flags |= B_WRITEINPROG; - if (bp->b_wcred == NOCRED && cred != NOCRED) { - crhold(cred); - bp->b_wcred = cred; - } - } - TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); nmp->nm_bufqlen++; return (0); @@ -966,21 +830,17 @@ again: * synchronously or from an nfsiod. */ int -nfs_doio(bp, cr, p) +nfs_doio(bp, p) struct buf *bp; - struct ucred *cr; struct proc *p; { struct uio *uiop; struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; - int error = 0, diff, len, iomode, must_commit = 0, s, retv = 0; - int pushedrange; - unsigned cnt; + int error = 0, diff, len, iomode, must_commit = 0; struct uio uio; struct iovec io; - off_t off; vp = bp->b_vp; np = VTONFS(vp); @@ -1001,16 +861,16 @@ nfs_doio(bp, cr, p) io.iov_len = uiop->uio_resid = bp->b_bcount; /* mapping was done by vmapbuf() */ io.iov_base = bp->b_data; - uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; + uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; if (bp->b_flags & B_READ) { uiop->uio_rw = UIO_READ; nfsstats.read_physios++; - error = nfs_readrpc(vp, uiop, cr); + error = nfs_readrpc(vp, uiop); } else { iomode = NFSV3WRITE_DATASYNC; uiop->uio_rw = UIO_WRITE; nfsstats.write_physios++; - error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); + error = nfs_writerpc(vp, uiop, &iomode, &must_commit); } if (error) { bp->b_flags |= B_ERROR; @@ -1022,29 +882,25 @@ nfs_doio(bp, cr, p) uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: - uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; + uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; nfsstats.read_bios++; - error = nfs_readrpc(vp, uiop, cr); - if (!error) { - bp->b_validoff = 0; - if (uiop->uio_resid) { + error = nfs_readrpc(vp, uiop); + if (!error && uiop->uio_resid) { + /* * If len > 0, there is a hole in the file and * no writes after the hole have been pushed to * the server yet. * Just zero fill the rest of the valid area. 
*/ + diff = bp->b_bcount - uiop->uio_resid; - len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE + len = np->n_size - ((((off_t)bp->b_blkno) << DEV_BSHIFT) + diff); if (len > 0) { - len = min(len, uiop->uio_resid); - memset((char *)bp->b_data + diff, 0, len); - bp->b_validend = diff + len; - } else - bp->b_validend = diff; - } else - bp->b_validend = bp->b_bcount; + len = min(len, uiop->uio_resid); + memset((char *)bp->b_data + diff, 0, len); + } } if (p && (vp->v_flag & VTEXT) && (((nmp->nm_flag & NFSMNT_NQNFS) && @@ -1052,7 +908,8 @@ nfs_doio(bp, cr, p) np->n_lrev != np->n_brev) || (!(nmp->nm_flag & NFSMNT_NQNFS) && np->n_mtime != np->n_vattr->va_mtime.tv_sec))) { - uprintf("Process killed due to text file modification\n"); + uprintf("Process killed due to " + "text file modification\n"); psignal(p, SIGKILL); p->p_holdcnt++; } @@ -1060,28 +917,26 @@ nfs_doio(bp, cr, p) case VLNK: uiop->uio_offset = (off_t)0; nfsstats.readlink_bios++; - error = nfs_readlinkrpc(vp, uiop, cr); + error = nfs_readlinkrpc(vp, uiop, curproc->p_ucred); break; case VDIR: nfsstats.readdir_bios++; uiop->uio_offset = bp->b_dcookie; if (nmp->nm_flag & NFSMNT_RDIRPLUS) { - error = nfs_readdirplusrpc(vp, uiop, cr); + error = nfs_readdirplusrpc(vp, uiop, curproc->p_ucred); if (error == NFSERR_NOTSUPP) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) - error = nfs_readdirrpc(vp, uiop, cr); + error = nfs_readdirrpc(vp, uiop, curproc->p_ucred); if (!error) { bp->b_dcookie = uiop->uio_offset; - bp->b_validoff = 0; - bp->b_validend = bp->b_bcount - uiop->uio_resid; } break; default: printf("nfs_doio: type %x unexpected\n",vp->v_type); break; - }; + } if (error) { bp->b_flags |= B_ERROR; bp->b_error = error; @@ -1091,117 +946,14 @@ nfs_doio(bp, cr, p) * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not * an actual write will have to be scheduled. */ - if (bp->b_flags & B_NEEDCOMMIT) { - /* - * If the buffer is in the range that we already committed, - * there's nothing to do. - * - * If it's in the range that we need to commit, push the - * whole range at once. Else only push the buffer. In - * both these cases, acquire the commit lock to avoid - * other processes modifying the range. Normally the - * vnode lock should have handled this, but there are - * no proper vnode locks for NFS yet (XXX). 
- */ - lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); - if (!(bp->b_flags & B_NEEDCOMMIT)) { - lockmgr(&np->n_commitlock, LK_RELEASE, NULL); - goto dowrite; - } - if (!nfs_in_committed_range(vp, bp)) { - if (nfs_in_tobecommitted_range(vp, bp)) { - pushedrange = 1; - off = np->n_pushlo; - /* XXX will be too big if > 2G buffer cache */ - cnt = np->n_pushhi - np->n_pushlo; - } else { - pushedrange = 0; - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE; - cnt = bp->b_dirtyend; - } - bp->b_flags |= B_WRITEINPROG; - retv = nfs_commit(bp->b_vp, off, cnt, - bp->b_wcred, bp->b_proc); - bp->b_flags &= ~B_WRITEINPROG; - if (retv == 0) { - if (pushedrange) { - nfs_merge_commit_ranges(vp); - } - else - nfs_add_committed_range(vp, bp); - } - } - lockmgr(&np->n_commitlock, LK_RELEASE, NULL); - if (!retv) { - bp->b_resid = bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_flags &= ~B_NEEDCOMMIT; - biodone(bp); - return (0); - } else if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(bp->b_vp->v_mount); - } -dowrite: - io.iov_len = uiop->uio_resid = bp->b_dirtyend - - bp->b_dirtyoff; - uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE - + bp->b_dirtyoff; - io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; + io.iov_base = bp->b_data; + io.iov_len = uiop->uio_resid = bp->b_bcount; + uiop->uio_offset = (((off_t)bp->b_blkno) << DEV_BSHIFT); uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; - if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC) - iomode = NFSV3WRITE_UNSTABLE; - else - iomode = NFSV3WRITE_FILESYNC; - - bp->b_flags |= B_WRITEINPROG; - error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit); - s = splbio(); - if (!error && iomode == NFSV3WRITE_UNSTABLE) { - bp->b_flags |= B_NEEDCOMMIT; - lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); - nfs_add_tobecommitted_range(vp, bp); - lockmgr(&np->n_commitlock, LK_RELEASE, NULL); - } else if (!error && bp->b_flags & B_NEEDCOMMIT) { - bp->b_flags &= ~B_NEEDCOMMIT; - lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); - nfs_del_committed_range(vp, bp); - lockmgr(&np->n_commitlock, LK_RELEASE, NULL); - } - /* XXX the use of NOCACHE is a hack */ - bp->b_flags &= ~(B_WRITEINPROG|B_NOCACHE); - - /* - * For an interrupted write, the buffer is still valid and the - * write hasn't been pushed to the server yet, so we can't set - * B_ERROR and report the interruption by setting B_EINTR. For - * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt - * is essentially a noop. - * For the case of a V3 write rpc not being committed to stable - * storage, the block is still dirty and requires either a commit - * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC - * before the block is reused. This is indicated by setting the - * B_DELWRI and B_NEEDCOMMIT flags. - */ - if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { - bp->b_flags |= B_DELWRI; - /* - * A B_ASYNC block still needs to be committed, so put - * it back on the dirty list. - */ - if (bp->b_flags & B_ASYNC) - reassignbuf(bp, vp); - else if (error) - bp->b_flags |= B_EINTR; - } else { - if (error) { - bp->b_flags |= B_ERROR; - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; - } - splx(s); + iomode = NFSV3WRITE_UNSTABLE; + error = nfs_writerpc(vp, uiop, &iomode, &must_commit); } bp->b_resid = uiop->uio_resid; if (must_commit) @@ -1209,3 +961,547 @@ dowrite: biodone(bp); return (error); } + +/* + * Vnode op for VM getpages. 
+ */ +int +nfs_getpages(v) + void *v; +{ + struct vop_getpages_args /* { + struct vnode *a_vp; + voff_t a_offset; + vm_page_t *a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; + } */ *ap = v; + + off_t eof, offset, origoffset, startoffset, endoffset; + int s, i, error, npages, orignpages, npgs, ridx, pidx, pcount; + vaddr_t kva; + struct buf *bp, *mbp; + struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + size_t bytes, iobytes, tailbytes, totalbytes, skipbytes; + int flags = ap->a_flags; + int bsize; + struct vm_page *pgs[16]; /* XXXUBC 16 */ + boolean_t v3 = NFS_ISV3(vp); + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0; + + UVMHIST_FUNC("nfs_getpages"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x count %d", vp, (int)ap->a_offset, + *ap->a_count,0); + +#ifdef DIAGNOSTIC + if (ap->a_centeridx < 0 || ap->a_centeridx >= *ap->a_count) { + panic("nfs_getpages: centeridx %d out of range", + ap->a_centeridx); + } +#endif + + error = 0; + origoffset = ap->a_offset; + eof = vp->v_uvm.u_size; + if (origoffset >= eof) { + if ((flags & PGO_LOCKED) == 0) { + simple_unlock(&uobj->vmobjlock); + } + UVMHIST_LOG(ubchist, "off 0x%x past EOF 0x%x", + (int)origoffset, (int)eof,0,0); + return EINVAL; + } + + if (flags & PGO_LOCKED) { + uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, + UFP_NOWAIT|UFP_NOALLOC); + return 0; + } + + /* vnode is VOP_LOCKed, uobj is locked */ + + bsize = nmp->nm_rsize; + orignpages = min(*ap->a_count, + round_page(eof - origoffset) >> PAGE_SHIFT); + npages = orignpages; + startoffset = origoffset & ~(bsize - 1); + endoffset = round_page((origoffset + (npages << PAGE_SHIFT) + + bsize - 1) & ~(bsize - 1)); + endoffset = min(endoffset, round_page(eof)); + ridx = (origoffset - startoffset) >> PAGE_SHIFT; + + if (!async && !write) { + int rapages = max(PAGE_SIZE, nmp->nm_rsize) >> PAGE_SHIFT; + + (void) VOP_GETPAGES(vp, endoffset, NULL, &rapages, 0, + VM_PROT_READ, 0, 0); + simple_lock(&uobj->vmobjlock); + } + + UVMHIST_LOG(ubchist, "npages %d offset 0x%x", npages, + (int)origoffset, 0,0); + memset(pgs, 0, sizeof(pgs)); + uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL); + + if (flags & PGO_OVERWRITE) { + UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0); + + /* XXXUBC for now, zero the page if we allocated it */ + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + uvm_pagezero(pg); + pg->flags &= ~(PG_FAKE); + } + } + goto out; + } + + /* + * if the pages are already resident, just return them. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if ((pg->flags & PG_FAKE) != 0 || + ((ap->a_access_type & VM_PROT_WRITE) && + (pg->flags & PG_RDONLY))) { + break; + } + } + if (i == npages) { + UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0); + goto out; + } + + /* + * the page wasn't resident and we're not overwriting, + * so we're going to have to do some i/o. + * find any additional pages needed to cover the expanded range. 
+ */ + + if (startoffset != origoffset || + startoffset + (npages << PAGE_SHIFT) != endoffset) { + UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x", + (int)startoffset, (int)endoffset, 0,0); + npages = (endoffset - startoffset) >> PAGE_SHIFT; + KASSERT(npages != 0); + npgs = npages; + uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL); + } + simple_unlock(&uobj->vmobjlock); + + /* + * update the cached read creds for this node. + */ + + if (np->n_rcred) { + crfree(np->n_rcred); + } + np->n_rcred = curproc->p_ucred; + crhold(np->n_rcred); + + /* + * read the desired page(s). + */ + + totalbytes = npages << PAGE_SHIFT; + bytes = min(totalbytes, vp->v_uvm.u_size - startoffset); + tailbytes = totalbytes - bytes; + skipbytes = 0; + + kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK | + UVMPAGER_MAPIN_READ); + + s = splbio(); + mbp = pool_get(&bufpool, PR_WAITOK); + splx(s); + mbp->b_bufsize = totalbytes; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL|B_ASYNC : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + LIST_INIT(&mbp->b_dep); + + /* + * if EOF is in the middle of the last page, zero the part past EOF. + */ + + if (tailbytes > 0) { + memset((char *)kva + bytes, 0, tailbytes); + } + + /* + * now loop over the pages, reading as needed. + */ + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + + /* + * skip pages which don't need to be read. + */ + + pidx = (offset - startoffset) >> PAGE_SHIFT; + UVMHIST_LOG(ubchist, "pidx %d offset 0x%x startoffset 0x%x", + pidx, (int)offset, (int)startoffset,0); + while ((pgs[pidx]->flags & PG_FAKE) == 0) { + size_t b; + +#ifdef DEBUG + if (offset & (PAGE_SIZE - 1)) { + panic("nfs_getpages: skipping from middle " + "of page"); + } +#endif + + b = min(PAGE_SIZE, bytes); + offset += b; + bytes -= b; + skipbytes += b; + pidx++; + UVMHIST_LOG(ubchist, "skipping, new offset 0x%x", + (int)offset, 0,0,0); + if (bytes == 0) { + goto loopdone; + } + } + + /* + * see how many pages can be read with this i/o. + * reduce the i/o size if necessary. + */ + + iobytes = bytes; + if (offset + iobytes > round_page(offset)) { + pcount = 1; + while (pidx + pcount < npages && + pgs[pidx + pcount]->flags & PG_FAKE) { + pcount++; + } + iobytes = min(iobytes, (pcount << PAGE_SHIFT) - + (offset - trunc_page(offset))); + } + iobytes = min(iobytes, nmp->nm_rsize); + + /* + * allocate a sub-buf for this piece of the i/o + * (or just use mbp if there's only 1 piece), + * and start it going. 
+ */ + + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + bp = pool_get(&bufpool, PR_WAITOK); + splx(s); + bp->b_data = (char *)kva + offset - startoffset; + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_READ|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + LIST_INIT(&bp->b_dep); + } + bp->b_private = mbp; + bp->b_lblkno = bp->b_blkno = offset >> DEV_BSHIFT; + + UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x", + bp, offset, iobytes, bp->b_blkno); + + VOP_STRATEGY(bp); + } + +loopdone: + if (skipbytes) { + s = splbio(); + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + UVMHIST_LOG(ubchist, "returning PEND",0,0,0,0); + return EINPROGRESS; + } + if (bp != NULL) { + error = biowait(mbp); + } + s = splbio(); + pool_put(&bufpool, mbp); + splx(s); + uvm_pagermapout(kva, npages); + + if (write && v3) { + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); + nfs_del_committed_range(vp, origoffset, npages); + nfs_del_tobecommitted_range(vp, origoffset, npages); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY); + } + lockmgr(&np->n_commitlock, LK_RELEASE, NULL); + } + + simple_lock(&uobj->vmobjlock); + +out: + uvm_lock_pageq(); + if (error) { + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if ((pgs[i]->flags & PG_FAKE) == 0) { + continue; + } + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + uvm_pagefree(pgs[i]); + } + goto done; + } + + UVMHIST_LOG(ubchist, "ridx %d count %d", ridx, npages, 0,0); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_FAKE) { + UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x", + pgs[i], (int)pgs[i]->offset,0,0); + pgs[i]->flags &= ~(PG_FAKE); + pmap_clear_modify(pgs[i]); + pmap_clear_reference(pgs[i]); + } + if (i < ridx || i >= ridx + orignpages || async) { + UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x", + pgs[i], (int)pgs[i]->offset,0,0); + KASSERT((pgs[i]->flags & PG_RELEASED) == 0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->wire_count == 0) { + uvm_pageactivate(pgs[i]); + } + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + } + +done: + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + if (ap->a_m != NULL) { + memcpy(ap->a_m, &pgs[ridx], + *ap->a_count * sizeof(struct vm_page *)); + } + + UVMHIST_LOG(ubchist, "done -> %d", error, 0,0,0); + return error; +} + +/* + * Vnode op for VM putpages. 
+ */ +int +nfs_putpages(v) + void *v; +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + struct vm_page **a_m; + int a_count; + int a_flags; + int *a_rtvals; + } */ *ap = v; + + struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct buf *bp, *mbp; + struct vm_page **pgs = ap->a_m; + int flags = ap->a_flags; + int npages = ap->a_count; + int s, error, i; + size_t bytes, iobytes, skipbytes; + vaddr_t kva; + off_t offset, origoffset, commitoff; + uint32_t commitbytes; + boolean_t v3 = NFS_ISV3(vp); + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t weak = (flags & PGO_WEAK) && v3; + UVMHIST_FUNC("nfs_putpages"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "vp %p pgp %p count %d", + vp, ap->a_m, ap->a_count,0); + + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + + origoffset = pgs[0]->offset; + bytes = min(ap->a_count << PAGE_SHIFT, vp->v_uvm.u_size - origoffset); + skipbytes = 0; + + /* + * if the range has been committed already, mark the pages thus. + * if the range just needs to be committed, we're done + * if it's a weak putpage, otherwise commit the range. + */ + + if (v3) { + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); + if (nfs_in_committed_range(vp, origoffset, bytes)) { + goto committed; + } + if (nfs_in_tobecommitted_range(vp, origoffset, bytes)) { + if (weak) { + lockmgr(&np->n_commitlock, LK_RELEASE, NULL); + return 0; + } else { + commitoff = np->n_pushlo; + commitbytes = (uint32_t)(np->n_pushhi - + np->n_pushlo); + goto commit; + } + } + lockmgr(&np->n_commitlock, LK_RELEASE, NULL); + } + + /* + * otherwise write or commit all the pages. + */ + + kva = uvm_pagermapin(pgs, ap->a_count, UVMPAGER_MAPIN_WAITOK| + UVMPAGER_MAPIN_WRITE); + + s = splbio(); + vp->v_numoutput += 2; + mbp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + splx(s); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_WRITE|B_AGE | + (async ? B_CALL|B_ASYNC : 0) | + (curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0); + mbp->b_iodone = uvm_aio_aiodone; + mbp->b_vp = vp; + LIST_INIT(&mbp->b_dep); + + for (offset = origoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + iobytes = min(nmp->nm_wsize, bytes); + + /* + * skip writing any pages which only need a commit. 
+ */ + + if ((pgs[(offset - origoffset) >> PAGE_SHIFT]->flags & + PG_NEEDCOMMIT) != 0) { + iobytes = PAGE_SIZE; + skipbytes += min(iobytes, vp->v_uvm.u_size - offset); + continue; + } + + /* if it's really one i/o, don't make a second buf */ + if (offset == origoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + vp->v_numoutput++; + bp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + splx(s); + bp->b_data = (char *)kva + (offset - origoffset); + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + LIST_INIT(&bp->b_dep); + } + bp->b_private = mbp; + bp->b_lblkno = bp->b_blkno = (daddr_t)(offset >> DEV_BSHIFT); + UVMHIST_LOG(ubchist, "bp %p numout %d", + bp, vp->v_numoutput,0,0); + VOP_STRATEGY(bp); + } + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", bytes, 0,0,0); + s = splbio(); + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + return EINPROGRESS; + } + error = biowait(mbp); + + s = splbio(); + vwakeup(mbp); + pool_put(&bufpool, mbp); + splx(s); + + uvm_pagermapout(kva, ap->a_count); + if (error || !v3) { + UVMHIST_LOG(ubchist, "returning error %d", error, 0,0,0); + return error; + } + + /* + * for a weak put, mark the range as "to be committed" + * and mark the pages read-only so that we will be notified + * to remove the pages from the "to be committed" range + * if they are made dirty again. + * for a strong put, commit the pages and remove them from the + * "to be committed" range. also, mark them as writable + * and not cleanable with just a commit. + */ + + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL); + if (weak) { + nfs_add_tobecommitted_range(vp, origoffset, + npages << PAGE_SHIFT); + for (i = 0; i < npages; i++) { + pgs[i]->flags |= PG_NEEDCOMMIT|PG_RDONLY; + } + } else { + commitoff = origoffset; + commitbytes = npages << PAGE_SHIFT; +commit: + error = nfs_commit(vp, commitoff, commitbytes, curproc); + nfs_del_tobecommitted_range(vp, commitoff, commitbytes); +committed: + for (i = 0; i < npages; i++) { + pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY); + } + } + lockmgr(&np->n_commitlock, LK_RELEASE, NULL); + return error; +} diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c index b3feb0eb92a4..c67f44a80a0e 100644 --- a/sys/nfs/nfs_node.c +++ b/sys/nfs/nfs_node.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_node.c,v 1.37 2000/11/08 14:28:15 ad Exp $ */ +/* $NetBSD: nfs_node.c,v 1.38 2000/11/27 08:39:48 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -140,7 +140,7 @@ nfs_nget(mntp, fhp, fhsize, npp) loop: for (np = nhpp->lh_first; np != 0; np = np->n_hash.le_next) { if (mntp != NFSTOV(np)->v_mount || np->n_fhsize != fhsize || - memcmp((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize)) + memcmp(fhp, np->n_fhp, fhsize)) continue; vp = NFSTOV(np); if (vget(vp, LK_EXCLUSIVE)) @@ -159,10 +159,11 @@ loop: nvp->v_vnlock = 0; /* XXX At least untill we do locking */ vp = nvp; np = pool_get(&nfs_node_pool, PR_WAITOK); - memset((caddr_t)np, 0, sizeof *np); + memset(np, 0, sizeof *np); lockinit(&np->n_commitlock, PINOD, "nfsclock", 0, 0); vp->v_data = np; np->n_vnode = vp; + /* * Insert the nfsnode in the hash queue for its new file handle */ @@ -171,11 +172,21 @@ loop: np->n_fhp = malloc(fhsize, M_NFSBIGFH, M_WAITOK); } else np->n_fhp = &np->n_fh; - memcpy((caddr_t)np->n_fhp, (caddr_t)fhp, fhsize); + memcpy(np->n_fhp, fhp, fhsize); np->n_fhsize = fhsize; np->n_accstamp = -1; 
np->n_vattr = pool_get(&nfs_vattr_pool, PR_WAITOK); - memset(np->n_vattr, 0, sizeof (struct vattr)); + + /* + * XXXUBC doing this while holding the nfs_hashlock is bad, + * but there's no alternative at the moment. + */ + error = VOP_GETATTR(vp, np->n_vattr, curproc->p_ucred, curproc); + if (error) { + return error; + } + uvm_vnp_setsize(vp, np->n_vattr->va_size); + lockmgr(&nfs_hashlock, LK_RELEASE, 0); *npp = np; return (0); @@ -227,7 +238,7 @@ nfs_inactive(v) nfs_removeit(sp); crfree(sp->s_cred); vrele(sp->s_dvp); - FREE((caddr_t)sp, M_NFSREQ); + FREE(sp, M_NFSREQ); } np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NQNFSEVICTED | NQNFSNONCACHE | NQNFSWRITE); @@ -272,12 +283,18 @@ nfs_reclaim(v) FREE(np->n_dircache, M_NFSDIROFF); } if (np->n_fhsize > NFS_SMALLFH) { - free((caddr_t)np->n_fhp, M_NFSBIGFH); + free(np->n_fhp, M_NFSBIGFH); } pool_put(&nfs_vattr_pool, np->n_vattr); + if (np->n_rcred) { + crfree(np->n_rcred); + } + if (np->n_wcred) { + crfree(np->n_wcred); + } cache_purge(vp); pool_put(&nfs_node_pool, vp->v_data); - vp->v_data = (void *)0; + vp->v_data = NULL; return (0); } diff --git a/sys/nfs/nfs_serv.c b/sys/nfs/nfs_serv.c index cf38aa33b342..291f7fb0345b 100644 --- a/sys/nfs/nfs_serv.c +++ b/sys/nfs/nfs_serv.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_serv.c,v 1.58 2000/09/19 22:05:29 fvdl Exp $ */ +/* $NetBSD: nfs_serv.c,v 1.59 2000/11/27 08:39:49 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -1729,7 +1729,6 @@ nfsrv_remove(nfsd, slp, procp, mrq) } out: if (!error) { - (void)uvm_vnp_uncache(vp); nqsrv_getl(nd.ni_dvp, ND_WRITE); nqsrv_getl(vp, ND_WRITE); error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); @@ -1904,7 +1903,6 @@ out: nqsrv_getl(fromnd.ni_dvp, ND_WRITE); nqsrv_getl(tdvp, ND_WRITE); if (tvp) { - (void)uvm_vnp_uncache(tvp); nqsrv_getl(tvp, ND_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, @@ -3389,12 +3387,12 @@ nfsrv_access(vp, flags, cred, rdonly, p, override) break; } } + /* - * If there's shared text associated with - * the inode, try to free it up once. If - * we fail, we can't allow writing. + * If the vnode is in use as a process's text, + * we can't allow writing. */ - if ((vp->v_flag & VTEXT) && !uvm_vnp_uncache(vp)) + if (vp->v_flag & VTEXT) return (ETXTBSY); } error = VOP_GETATTR(vp, &vattr, cred, p); diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c index 8ed14a97a01a..3842d9f7ca4d 100644 --- a/sys/nfs/nfs_subs.c +++ b/sys/nfs/nfs_subs.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_subs.c,v 1.88 2000/11/08 14:28:15 ad Exp $ */ +/* $NetBSD: nfs_subs.c,v 1.89 2000/11/27 08:39:49 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -1665,17 +1665,14 @@ nfs_loadattrcache(vpp, fp, vaper) vap->va_filerev = 0; } if (vap->va_size != np->n_size) { - if (vap->va_type == VREG) { - if (np->n_flag & NMODIFIED) { - if (vap->va_size < np->n_size) - vap->va_size = np->n_size; - else - np->n_size = vap->va_size; - } else - np->n_size = vap->va_size; - uvm_vnp_setsize(vp, np->n_size); - } else + if ((np->n_flag & NMODIFIED) && vap->va_size < np->n_size) { + vap->va_size = np->n_size; + } else { np->n_size = vap->va_size; + if (vap->va_type == VREG) { + uvm_vnp_setsize(vp, np->n_size); + } + } } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { @@ -2366,7 +2363,6 @@ netaddr_match(family, haddr, nam) return (0); } - /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. 
Since they are on the @@ -2377,17 +2373,14 @@ void nfs_clearcommit(mp) struct mount *mp; { - struct vnode *vp, *nvp; - struct buf *bp, *nbp; + struct vnode *vp; struct nfsnode *np; + struct vm_page *pg; int s; s = splbio(); -loop: - for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { - if (vp->v_mount != mp) /* Paranoia */ - goto loop; - nvp = vp->v_mntvnodes.le_next; + LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { + KASSERT(vp->v_mount == mp); if (vp->v_type == VNON) continue; np = VTONFS(vp); @@ -2395,12 +2388,11 @@ loop: np->n_pushedhi = 0; np->n_commitflags &= ~(NFS_COMMIT_PUSH_VALID | NFS_COMMIT_PUSHED_VALID); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - == (B_DELWRI | B_NEEDCOMMIT)) - bp->b_flags &= ~B_NEEDCOMMIT; + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + TAILQ_FOREACH(pg, &vp->v_uvm.u_obj.memq, listq) { + pg->flags &= ~PG_NEEDCOMMIT; } + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); } splx(s); } @@ -2432,47 +2424,47 @@ nfs_merge_commit_ranges(vp) } int -nfs_in_committed_range(vp, bp) +nfs_in_committed_range(vp, off, len) struct vnode *vp; - struct buf *bp; + off_t off, len; { struct nfsnode *np = VTONFS(vp); off_t lo, hi; if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) return 0; - lo = (off_t)bp->b_blkno * DEV_BSIZE; - hi = lo + bp->b_dirtyend; + lo = off; + hi = lo + len; return (lo >= np->n_pushedlo && hi <= np->n_pushedhi); } int -nfs_in_tobecommitted_range(vp, bp) +nfs_in_tobecommitted_range(vp, off, len) struct vnode *vp; - struct buf *bp; + off_t off, len; { struct nfsnode *np = VTONFS(vp); off_t lo, hi; if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID)) return 0; - lo = (off_t)bp->b_blkno * DEV_BSIZE; - hi = lo + bp->b_dirtyend; + lo = off; + hi = lo + len; return (lo >= np->n_pushlo && hi <= np->n_pushhi); } void -nfs_add_committed_range(vp, bp) +nfs_add_committed_range(vp, off, len) struct vnode *vp; - struct buf *bp; + off_t off, len; { struct nfsnode *np = VTONFS(vp); off_t lo, hi; - lo = (off_t)bp->b_blkno * DEV_BSIZE; - hi = lo + bp->b_dirtyend; + lo = off; + hi = lo + len; if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) { np->n_pushedlo = lo; @@ -2491,9 +2483,9 @@ nfs_add_committed_range(vp, bp) } void -nfs_del_committed_range(vp, bp) +nfs_del_committed_range(vp, off, len) struct vnode *vp; - struct buf *bp; + off_t off, len; { struct nfsnode *np = VTONFS(vp); off_t lo, hi; @@ -2501,8 +2493,8 @@ nfs_del_committed_range(vp, bp) if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) return; - lo = (off_t)bp->b_blkno * DEV_BSIZE; - hi = lo + bp->b_dirtyend; + lo = off; + hi = lo + len; if (lo > np->n_pushedhi || hi < np->n_pushedlo) return; @@ -2528,15 +2520,15 @@ nfs_del_committed_range(vp, bp) } void -nfs_add_tobecommitted_range(vp, bp) +nfs_add_tobecommitted_range(vp, off, len) struct vnode *vp; - struct buf *bp; + off_t off, len; { struct nfsnode *np = VTONFS(vp); off_t lo, hi; - lo = (off_t)bp->b_blkno * DEV_BSIZE; - hi = lo + bp->b_dirtyend; + lo = off; + hi = lo + len; if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID)) { np->n_pushlo = lo; @@ -2555,9 +2547,9 @@ nfs_add_tobecommitted_range(vp, bp) } void -nfs_del_tobecommitted_range(vp, bp) +nfs_del_tobecommitted_range(vp, off, len) struct vnode *vp; - struct buf *bp; + off_t off, len; { struct nfsnode *np = VTONFS(vp); off_t lo, hi; @@ -2565,8 +2557,8 @@ nfs_del_tobecommitted_range(vp, bp) if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID)) return; - lo = (off_t)bp->b_blkno * DEV_BSIZE; - hi = lo + 
bp->b_dirtyend; + lo = off; + hi = lo + len; if (lo > np->n_pushhi || hi < np->n_pushlo) return; diff --git a/sys/nfs/nfs_syscalls.c b/sys/nfs/nfs_syscalls.c index ec0b55593075..85090656c94a 100644 --- a/sys/nfs/nfs_syscalls.c +++ b/sys/nfs/nfs_syscalls.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_syscalls.c,v 1.47 2000/11/24 23:30:03 chs Exp $ */ +/* $NetBSD: nfs_syscalls.c,v 1.48 2000/11/27 08:39:50 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -970,10 +970,7 @@ nfssvc_iod(p) nmp->nm_bufqwant = FALSE; wakeup(&nmp->nm_bufq); } - if (bp->b_flags & B_READ) - (void) nfs_doio(bp, bp->b_rcred, (struct proc *)0); - else - (void) nfs_doio(bp, bp->b_wcred, (struct proc *)0); + (void) nfs_doio(bp, NULL); /* * If there are more than one iod on this mount, then defect * so that the iods can be shared out fairly between the mounts diff --git a/sys/nfs/nfs_var.h b/sys/nfs/nfs_var.h index 6bb4e6122cdc..8948798737e0 100644 --- a/sys/nfs/nfs_var.h +++ b/sys/nfs/nfs_var.h @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_var.h,v 1.18 2000/09/19 22:14:59 fvdl Exp $ */ +/* $NetBSD: nfs_var.h,v 1.19 2000/11/27 08:39:50 chs Exp $ */ /*- * Copyright (c) 1996 The NetBSD Foundation, Inc. @@ -74,8 +74,8 @@ int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, int)); struct buf *nfs_getcacheblk __P((struct vnode *, daddr_t, int, struct proc *)); int nfs_vinvalbuf __P((struct vnode *, int, struct ucred *, struct proc *, int)); -int nfs_asyncio __P((struct buf *, struct ucred *)); -int nfs_doio __P((struct buf *, struct ucred *, struct proc *)); +int nfs_asyncio __P((struct buf *)); +int nfs_doio __P((struct buf *, struct proc *)); /* nfs_boot.c */ /* see nfsdiskless.h */ @@ -91,9 +91,8 @@ int nfs_null __P((struct vnode *, struct ucred *, struct proc *)); int nfs_setattrrpc __P((struct vnode *, struct vattr *, struct ucred *, struct proc *)); int nfs_readlinkrpc __P((struct vnode *, struct uio *, struct ucred *)); -int nfs_readrpc __P((struct vnode *, struct uio *, struct ucred *)); -int nfs_writerpc __P((struct vnode *, struct uio *, struct ucred *, int *, - int *)); +int nfs_readrpc __P((struct vnode *, struct uio *)); +int nfs_writerpc __P((struct vnode *, struct uio *, int *, int *)); int nfs_mknodrpc __P((struct vnode *, struct vnode **, struct componentname *, struct vattr *)); int nfs_removeit __P((struct sillyrename *)); @@ -109,8 +108,7 @@ int nfs_sillyrename __P((struct vnode *, struct vnode *, struct componentname *)); int nfs_lookitup __P((struct vnode *, const char *, int, struct ucred *, struct proc *, struct nfsnode **)); -int nfs_commit __P((struct vnode *, u_quad_t, unsigned, struct ucred *, - struct proc *)); +int nfs_commit __P((struct vnode *, off_t, uint32_t, struct proc *)); int nfs_flush __P((struct vnode *, struct ucred *, int, struct proc *, int)); /* nfs_nqlease.c */ @@ -267,12 +265,12 @@ int netaddr_match __P((int, union nethostaddr *, struct mbuf *)); void nfs_clearcommit __P((struct mount *)); void nfs_merge_commit_ranges __P((struct vnode *)); -int nfs_in_committed_range __P((struct vnode *, struct buf *)); -int nfs_in_tobecommitted_range __P((struct vnode *, struct buf *)); -void nfs_add_committed_range __P((struct vnode *, struct buf *)); -void nfs_del_committed_range __P((struct vnode *, struct buf *)); -void nfs_add_tobecommitted_range __P((struct vnode *, struct buf *)); -void nfs_del_tobecommitted_range __P((struct vnode *, struct buf *)); +int nfs_in_committed_range __P((struct vnode *, off_t, off_t)); +int nfs_in_tobecommitted_range __P((struct vnode *, off_t, off_t)); +void 
nfs_add_committed_range __P((struct vnode *, off_t, off_t)); +void nfs_del_committed_range __P((struct vnode *, off_t, off_t)); +void nfs_add_tobecommitted_range __P((struct vnode *, off_t, off_t)); +void nfs_del_tobecommitted_range __P((struct vnode *, off_t, off_t)); int nfsrv_errmap __P((struct nfsrv_descript *, int)); void nfsrvw_sort __P((gid_t *, int)); diff --git a/sys/nfs/nfs_vfsops.c b/sys/nfs/nfs_vfsops.c index f308cd9a6633..43d83839ab16 100644 --- a/sys/nfs/nfs_vfsops.c +++ b/sys/nfs/nfs_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_vfsops.c,v 1.96 2000/09/19 22:15:41 fvdl Exp $ */ +/* $NetBSD: nfs_vfsops.c,v 1.97 2000/11/27 08:39:50 chs Exp $ */ /* * Copyright (c) 1989, 1993, 1995 @@ -689,6 +689,8 @@ mountnfs(argp, mp, nam, pth, hst, vpp, p) #else mp->mnt_stat.f_type = 0; #endif + mp->mnt_fs_bshift = DEV_BSHIFT; + mp->mnt_dev_bshift = -1; strncpy(&mp->mnt_stat.f_fstypename[0], mp->mnt_op->vfs_name, MFSNAMELEN); memcpy(mp->mnt_stat.f_mntfromname, hst, MNAMELEN); diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index 1162c10b8ab5..0a65c3a62b2d 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_vnops.c,v 1.123 2000/11/08 05:20:32 chs Exp $ */ +/* $NetBSD: nfs_vnops.c,v 1.124 2000/11/27 08:39:51 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -43,6 +43,7 @@ */ #include "opt_nfs.h" +#include "opt_uvmhist.h" #include #include @@ -63,6 +64,7 @@ #include #include +#include #include #include @@ -136,7 +138,9 @@ struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_truncate_desc, nfs_truncate }, /* truncate */ { &vop_update_desc, nfs_update }, /* update */ { &vop_bwrite_desc, nfs_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { &vop_getpages_desc, nfs_getpages }, /* getpages */ + { &vop_putpages_desc, nfs_putpages }, /* putpages */ + { NULL, NULL } }; struct vnodeopv_desc nfsv2_vnodeop_opv_desc = { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; @@ -163,7 +167,7 @@ struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { { &vop_poll_desc, spec_poll }, /* poll */ { &vop_revoke_desc, spec_revoke }, /* revoke */ { &vop_mmap_desc, spec_mmap }, /* mmap */ - { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ { &vop_seek_desc, spec_seek }, /* seek */ { &vop_remove_desc, spec_remove }, /* remove */ { &vop_link_desc, spec_link }, /* link */ @@ -191,7 +195,7 @@ struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { { &vop_truncate_desc, spec_truncate }, /* truncate */ { &vop_update_desc, nfs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc spec_nfsv2nodeop_opv_desc = { &spec_nfsv2nodeop_p, spec_nfsv2nodeop_entries }; @@ -243,7 +247,7 @@ struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = { { &vop_truncate_desc, fifo_truncate }, /* truncate */ { &vop_update_desc, nfs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc = { &fifo_nfsv2nodeop_p, fifo_nfsv2nodeop_entries }; @@ -432,11 +436,9 @@ nfs_open(v) int error; if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) { -#ifdef DIAGNOSTIC - printf("open eacces vtyp=%d\n",vp->v_type); -#endif return (EACCES); } + #ifndef NFS_V2_ONLY /* * Get a valid lease. If cached data is stale, flush it. 
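The nfs_subs.c helpers above (nfs_add_committed_range(), nfs_in_committed_range(), and friends) now take an offset/length pair instead of a buffer, but they still track growing byte intervals per nfsnode (n_pushedlo/n_pushedhi and n_pushlo/n_pushhi). The sketch below shows the basic grow-and-test interval bookkeeping in isolation; it is simplified from the kernel code, and the type and function names are invented for the example.

#include <stdio.h>
#include <stdint.h>

struct range {
	int valid;
	int64_t lo, hi;		/* half-open byte interval [lo, hi) */
};

/* Grow the tracked interval to cover [off, off + len). */
static void
range_add(struct range *r, int64_t off, int64_t len)
{
	if (!r->valid) {
		r->lo = off;
		r->hi = off + len;
		r->valid = 1;
		return;
	}
	if (off < r->lo)
		r->lo = off;
	if (off + len > r->hi)
		r->hi = off + len;
}

/* Is [off, off + len) entirely inside the tracked interval? */
static int
range_contains(const struct range *r, int64_t off, int64_t len)
{
	return r->valid && off >= r->lo && off + len <= r->hi;
}

int
main(void)
{
	struct range committed = { 0, 0, 0 };

	range_add(&committed, 8192, 4096);
	range_add(&committed, 16384, 8192);
	printf("[%lld, %lld) contains 12288+512? %d\n",
	    (long long)committed.lo, (long long)committed.hi,
	    range_contains(&committed, 12288, 512));
	return 0;
}

The real helpers additionally trim intervals (nfs_del_committed_range()) and merge the committed and to-be-committed ranges (nfs_merge_commit_ranges()); only the core grow and containment logic is shown here.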
@@ -454,7 +456,6 @@ nfs_open(v) if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); - (void) uvm_vnp_uncache(vp); np->n_brev = np->n_lrev; } } @@ -465,7 +466,6 @@ nfs_open(v) if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); - (void) uvm_vnp_uncache(vp); np->n_attrstamp = 0; if (vp->v_type == VDIR) { nfs_invaldircache(vp, 0); @@ -487,7 +487,6 @@ nfs_open(v) if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); - (void) uvm_vnp_uncache(vp); np->n_mtime = vattr.va_mtime.tv_sec; } } @@ -542,6 +541,7 @@ nfs_close(v) struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); int error = 0; + UVMHIST_FUNC("nfs_close"); UVMHIST_CALLED(ubchist); if (vp->v_type == VREG) { if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 && @@ -558,6 +558,7 @@ nfs_close(v) error = np->n_error; } } + UVMHIST_LOG(ubchist, "returning %d", error,0,0,0); return (error); } @@ -1020,10 +1021,9 @@ nfs_readlinkrpc(vp, uiop, cred) * Ditto above */ int -nfs_readrpc(vp, uiop, cred) +nfs_readrpc(vp, uiop) struct vnode *vp; struct uio *uiop; - struct ucred *cred; { u_int32_t *tl; caddr_t cp; @@ -1055,7 +1055,8 @@ nfs_readrpc(vp, uiop, cred) *tl++ = txdr_unsigned(len); *tl = 0; } - nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred); + nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, + VTONFS(vp)->n_rcred); if (v3) { nfsm_postop_attr(vp, attrflag); if (error) { @@ -1084,10 +1085,9 @@ nfsmout: * nfs write call */ int -nfs_writerpc(vp, uiop, cred, iomode, must_commit) +nfs_writerpc(vp, uiop, iomode, must_commit) struct vnode *vp; struct uio *uiop; - struct ucred *cred; int *iomode, *must_commit; { u_int32_t *tl; @@ -1110,7 +1110,7 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) return (EFBIG); while (tsiz > 0) { nfsstats.rpccnt[NFSPROC_WRITE]++; - len = (tsiz > nmp->nm_wsize) ? 
nmp->nm_wsize : tsiz; + len = min(tsiz, nmp->nm_wsize); nfsm_reqhead(vp, NFSPROC_WRITE, NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); nfsm_fhtom(vp, v3); @@ -1135,7 +1135,8 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit) } nfsm_uiotom(uiop, len); - nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred); + nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, + VTONFS(vp)->n_wcred); if (v3) { wccflag = NFSV3_WCCCHK; nfsm_wcc_data(vp, wccflag); @@ -2595,11 +2596,10 @@ nfs_lookitup(dvp, name, len, cred, procp, npp) * Nfs Version 3 commit rpc */ int -nfs_commit(vp, offset, cnt, cred, procp) +nfs_commit(vp, offset, cnt, procp) struct vnode *vp; - u_quad_t offset; - unsigned cnt; - struct ucred *cred; + off_t offset; + uint32_t cnt; struct proc *procp; { caddr_t cp; @@ -2624,7 +2624,7 @@ nfs_commit(vp, offset, cnt, cred, procp) txdr_hyper(offset, tl); tl += 2; *tl = txdr_unsigned(cnt); - nfsm_request(vp, NFSPROC_COMMIT, procp, cred); + nfsm_request(vp, NFSPROC_COMMIT, procp, VTONFS(vp)->n_wcred); nfsm_wcc_data(vp, wccflag); if (!error) { nfsm_dissect(tl, u_int32_t *, NFSX_V3WRITEVERF); @@ -2680,28 +2680,25 @@ nfs_strategy(v) { struct vop_strategy_args *ap = v; struct buf *bp = ap->a_bp; - struct ucred *cr; struct proc *p; int error = 0; if ((bp->b_flags & (B_PHYS|B_ASYNC)) == (B_PHYS|B_ASYNC)) panic("nfs physio/async"); if (bp->b_flags & B_ASYNC) - p = (struct proc *)0; + p = NULL; else p = curproc; /* XXX */ - if (bp->b_flags & B_READ) - cr = bp->b_rcred; - else - cr = bp->b_wcred; + /* * If the op is asynchronous and an i/o daemon is waiting * queue the request, wake it up and wait for completion * otherwise just do it ourselves. */ + if ((bp->b_flags & B_ASYNC) == 0 || - nfs_asyncio(bp, NOCRED)) - error = nfs_doio(bp, cr, p); + nfs_asyncio(bp)) + error = nfs_doio(bp, p); return (error); } @@ -2750,16 +2747,7 @@ nfs_fsync(v) } /* - * Flush all the blocks associated with a vnode. - * Walk through the buffer pool and push any dirty pages - * associated with the vnode. - * - * Don't bother to cluster commits; the commitrange code will - * do that. In the first pass, push all dirty buffers to the - * server, using stable writes if commit is set to 1. - * In the 2nd pass, push anything that might be left, - * i.e. the buffer was busy in the first pass, or it wasn't - * committed in the first pass. + * Flush all the data associated with a vnode. */ int nfs_flush(vp, cred, waitfor, p, commit) @@ -2769,104 +2757,25 @@ nfs_flush(vp, cred, waitfor, p, commit) struct proc *p; int commit; { + struct uvm_object *uobj = &vp->v_uvm.u_obj; struct nfsnode *np = VTONFS(vp); - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - struct buf *bp; - struct buf *nbp; - int pass, s, error, slpflag, slptimeo; + int error; + int flushflags = PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO; + int rv; + UVMHIST_FUNC("nfs_flush"); UVMHIST_CALLED(ubchist); - pass = 1; error = 0; - slptimeo = 0; - slpflag = nmp->nm_flag & NFSMNT_INT ? 
PCATCH : 0; -loop: - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (bp->b_flags & B_BUSY) { - if (pass == 2 && waitfor == MNT_WAIT) { - bp->b_flags |= B_WANTED; - error = tsleep((caddr_t)bp, - slpflag | (PRIBIO + 1), - "nfsfsync", slptimeo); - splx(s); - if (error) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; - } - } - goto loop; - } else - continue; - } -#ifdef DIAGNOSTIC - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfs_fsync: not dirty"); -#endif - if (!commit && (bp->b_flags & B_NEEDCOMMIT)) - continue; - /* - * Note: can't use B_VFLUSH here, since there is no - * real vnode lock, so we can't leave the buffer on - * the freelist. - */ - bremfree(bp); - if (commit && vp->v_type == VREG) - /* - * Setting B_NOCACHE has the effect - * effect of nfs_doio using a stable write - * RPC. XXX this abuses the B_NOCACHE flag, - * but it is needed to tell nfs_strategy - * that this buffer is async, but needs to - * be written with a stable RPC. nfs_doio - * will remove B_NOCACHE again. - */ - bp->b_flags |= B_NOCACHE; - - bp->b_flags |= B_BUSY | B_ASYNC; - splx(s); - VOP_BWRITE(bp); - goto loop; - } - splx(s); - - if (commit && pass == 1) { - pass = 2; - goto loop; - } - - if (waitfor == MNT_WAIT) { - s = splbio(); - while (vp->v_numoutput) { - vp->v_flag |= VBWAIT; - error = tsleep((caddr_t)&vp->v_numoutput, - slpflag | (PRIBIO + 1), "nfsfsync", slptimeo); - if (error) { - splx(s); - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; - } - s = splbio(); - } - } - splx(s); - if (vp->v_dirtyblkhd.lh_first && commit) { -#if 0 - vprint("nfs_fsync: dirty", vp); -#endif - goto loop; - } + simple_lock(&uobj->vmobjlock); + rv = (uobj->pgops->pgo_flush)(uobj, 0, 0, flushflags); + simple_unlock(&uobj->vmobjlock); + if (!rv) { + error = EIO; } if (np->n_flag & NWRITEERR) { error = np->n_error; np->n_flag &= ~NWRITEERR; } + UVMHIST_LOG(ubchist, "returning %d", error,0,0,0); return (error); } diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h index 2b5f0c9d5ab3..e924c07b8a5e 100644 --- a/sys/nfs/nfsnode.h +++ b/sys/nfs/nfsnode.h @@ -1,4 +1,4 @@ -/* $NetBSD: nfsnode.h,v 1.30 2000/09/19 22:18:40 fvdl Exp $ */ +/* $NetBSD: nfsnode.h,v 1.31 2000/11/27 08:39:51 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -140,6 +140,8 @@ struct nfsnode { off_t n_pushhi; /* Last block in range */ struct lock n_commitlock; /* Serialize commits XXX */ int n_commitflags; + struct ucred *n_rcred; + struct ucred *n_wcred; }; /* @@ -173,7 +175,7 @@ struct nfsnode { * Convert between nfsnode pointers and vnode pointers */ #define VTONFS(vp) ((struct nfsnode *)(vp)->v_data) -#define NFSTOV(np) ((struct vnode *)(np)->n_vnode) +#define NFSTOV(np) ((np)->n_vnode) /* * Queue head for nfsiod's @@ -235,6 +237,8 @@ int nfs_bwrite __P((void *)); #define nfs_vfree genfs_nullop int nfs_truncate __P((void *)); int nfs_update __P((void *)); +int nfs_getpages __P((void *)); +int nfs_putpages __P((void *)); extern int (**nfsv2_vnodeop_p) __P((void *)); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index e68bd0f4a469..6fc1007c12b5 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,4 +1,4 @@ -/* $NetBSD: buf.h,v 1.43 2000/04/10 02:22:15 chs Exp $ */ +/* $NetBSD: buf.h,v 1.44 2000/11/27 08:39:51 chs Exp $ */ /*- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. 
@@ -190,13 +190,8 @@ struct buf { number (not partition relative) */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); - struct vnode *b_vp; /* Device vnode. */ - int b_dirtyoff; /* Offset in buffer of dirty region. */ - int b_dirtyend; /* Offset of end of dirty region. */ - struct ucred *b_rcred; /* Read credentials reference. */ - struct ucred *b_wcred; /* Write credentials reference. */ - int b_validoff; /* Offset in buffer of valid region. */ - int b_validend; /* Offset of end of valid region. */ + struct vnode *b_vp; /* File vnode. */ + void *b_private; /* Private data for owner */ off_t b_dcookie; /* Offset cookie if dir block */ struct workhead b_dep; /* List of filesystem dependencies. */ }; @@ -230,15 +225,16 @@ struct buf { #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_ORDERED 0x00010000 /* ordered I/O request */ +#define B_CACHE 0x00020000 /* Bread found us in the cache. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_TAPE 0x00200000 /* Magnetic tape I/O. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ -#define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_VFLUSH 0x04000000 /* Buffer is being synced. */ +#define B_PDAEMON 0x10000000 /* I/O initiated by pagedaemon. */ /* * This structure describes a clustered I/O. It is stored in the b_saveaddr @@ -268,6 +264,7 @@ do { \ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef _KERNEL + extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. 
*/ diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 23c315cb09e1..cf10615bc1ba 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -1,4 +1,4 @@ -/* $NetBSD: mount.h,v 1.86 2000/06/26 21:10:34 christos Exp $ */ +/* $NetBSD: mount.h,v 1.87 2000/11/27 08:39:52 chs Exp $ */ /* * Copyright (c) 1989, 1991, 1993 @@ -131,6 +131,8 @@ struct mount { struct lock mnt_lock; /* mount structure lock */ int mnt_flag; /* flags */ int mnt_maxsymlinklen; /* max size of short symlink */ + int mnt_fs_bshift; /* offset shift for lblkno */ + int mnt_dev_bshift; /* shift for device sectors */ struct statfs mnt_stat; /* cache of filesystem stats */ qaddr_t mnt_data; /* private data */ int mnt_wcnt; /* count of vfs_busy waiters */ diff --git a/sys/sys/param.h b/sys/sys/param.h index f8a0b94b3fb1..c9df690d44bb 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -1,4 +1,4 @@ -/* $NetBSD: param.h,v 1.111 2000/11/11 00:53:24 thorpej Exp $ */ +/* $NetBSD: param.h,v 1.112 2000/11/27 08:39:52 chs Exp $ */ /*- * Copyright (c) 1982, 1986, 1989, 1993 @@ -252,4 +252,15 @@ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1< #include #include #include #include #include +#include + +#include #include #include @@ -75,8 +82,13 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags) u_int deallocated; ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; int unwindidx = -1; + UVMHIST_FUNC("ext2fs_balloc"); UVMHIST_CALLED(ubchist); - *bpp = NULL; + UVMHIST_LOG(ubchist, "bn 0x%x", bn,0,0,0); + + if (bpp != NULL) { + *bpp = NULL; + } if (bn < 0) return (EFBIG); fs = ip->i_e2fs; @@ -88,29 +100,43 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags) if (bn < NDADDR) { nb = fs2h32(ip->i_e2fs_blocks[bn]); if (nb != 0) { - error = bread(vp, bn, fs->e2fs_bsize, NOCRED, &bp); - if (error) { - brelse(bp); - return (error); + + /* + * the block is already allocated, just read it. + */ + + if (bpp != NULL) { + error = bread(vp, bn, fs->e2fs_bsize, NOCRED, + &bp); + if (error) { + brelse(bp); + return (error); + } + *bpp = bp; } - *bpp = bp; return (0); - } else { - error = ext2fs_alloc(ip, bn, - ext2fs_blkpref(ip, bn, (int)bn, &ip->i_e2fs_blocks[0]), - cred, &newb); - if (error) - return (error); - ip->i_e2fs_last_lblk = lbn; - ip->i_e2fs_last_blk = newb; + } + + /* + * allocate a new direct block. 
+ */ + + error = ext2fs_alloc(ip, bn, + ext2fs_blkpref(ip, bn, bn, &ip->i_e2fs_blocks[0]), + cred, &newb); + if (error) + return (error); + ip->i_e2fs_last_lblk = lbn; + ip->i_e2fs_last_blk = newb; + ip->i_e2fs_blocks[bn] = h2fs32(newb); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (bpp != NULL) { bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0); bp->b_blkno = fsbtodb(fs, newb); if (flags & B_CLRBUF) clrbuf(bp); + *bpp = bp; } - ip->i_e2fs_blocks[bn] = h2fs32(dbtofsb(fs, bp->b_blkno)); - ip->i_flag |= IN_CHANGE | IN_UPDATE; - *bpp = bp; return (0); } /* @@ -218,10 +244,6 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags) *allocblk++ = nb; ip->i_e2fs_last_lblk = lbn; ip->i_e2fs_last_blk = newb; - nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); - nbp->b_blkno = fsbtodb(fs, nb); - if (flags & B_CLRBUF) - clrbuf(nbp); bap[indirs[num].in_off] = h2fs32(nb); /* * If required, write synchronously, otherwise use @@ -232,21 +254,30 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags) } else { bdwrite(bp); } - *bpp = nbp; + if (bpp != NULL) { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & B_CLRBUF) + clrbuf(nbp); + *bpp = nbp; + } return (0); } brelse(bp); - if (flags & B_CLRBUF) { - error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp); - if (error) { - brelse(nbp); - goto fail; + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, + &nbp); + if (error) { + brelse(nbp); + goto fail; + } + } else { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); } - } else { - nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); - nbp->b_blkno = fsbtodb(fs, nb); + *bpp = nbp; } - *bpp = nbp; return (0); fail: /* @@ -288,5 +319,178 @@ fail: ip->i_e2fs_nblock -= btodb(deallocated); ip->i_e2fs_flags |= IN_CHANGE | IN_UPDATE; } + return error; +} + +int +ext2fs_ballocn(v) + void *v; +{ + struct vop_ballocn_args /* { + struct vnode *a_vp; + off_t a_offset; + off_t a_length; + struct ucred *a_cred; + int a_flags; + } */ *ap = v; + off_t off, len; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct m_ext2fs *fs = ip->i_e2fs; + int error, delta, bshift, bsize; + UVMHIST_FUNC("ext2fs_ballocn"); UVMHIST_CALLED(ubchist); + + bshift = fs->e2fs_bshift; + bsize = 1 << bshift; + + off = ap->a_offset; + len = ap->a_length; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = min(bsize, len); + UVMHIST_LOG(ubchist, "off 0x%x len 0x%x bsize 0x%x", + off, len, bsize, 0); + + error = ext2fs_balloc(ip, lblkno(fs, off), bsize, ap->a_cred, + NULL, ap->a_flags); + if (error) { + UVMHIST_LOG(ubchist, "error %d", error, 0,0,0); + return error; + } + + /* + * increase file size now, VOP_BALLOC() requires that + * EOF be up-to-date before each call. + */ + + if (ip->i_e2fs_size < off + bsize) { + UVMHIST_LOG(ubchist, "old 0x%x new 0x%x", + ip->i_e2fs_size, off + bsize,0,0); + ip->i_e2fs_size = off + bsize; + if (vp->v_uvm.u_size < ip->i_e2fs_size) { + uvm_vnp_setsize(vp, ip->i_e2fs_size); + } + } + + off += bsize; + len -= bsize; + } + return 0; +} + +/* + * allocate a range of blocks in a file. + * after this function returns, any page entirely contained within the range + * will map to invalid data and thus must be overwritten before it is made + * accessible to others. 
+ */ + +int +ext2fs_balloc_range(vp, off, len, cred, flags) + struct vnode *vp; + off_t off, len; + struct ucred *cred; + int flags; +{ + off_t eof, pagestart, pageend; + struct uvm_object *uobj; + struct inode *ip = VTOI(vp); + int i, delta, error, npages1, npages2; + int bshift = vp->v_mount->mnt_fs_bshift; + int bsize = 1 << bshift; + int ppb = max(bsize >> PAGE_SHIFT, 1); + struct vm_page *pgs1[ppb], *pgs2[ppb]; + UVMHIST_FUNC("ext2fs_balloc_range"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x", + vp, off, len, vp->v_uvm.u_size); + + error = 0; + uobj = &vp->v_uvm.u_obj; + eof = max(vp->v_uvm.u_size, off + len); + vp->v_uvm.u_size = eof; + UVMHIST_LOG(ubchist, "new eof 0x%x", eof,0,0,0); + pgs1[0] = pgs2[0] = NULL; + + /* + * if the range does not start on a page and block boundary, + * cache the first block if the file so the page(s) will contain + * the correct data. hold the page(s) busy while we allocate + * the backing store for the range. + */ + + pagestart = trunc_page(off) & ~(bsize - 1); + if (off != pagestart) { + npages1 = min(ppb, (round_page(eof) - pagestart) >> + PAGE_SHIFT); + memset(pgs1, 0, npages1); + simple_lock(&uobj->vmobjlock); + error = VOP_GETPAGES(vp, pagestart, pgs1, &npages1, 0, + VM_PROT_READ, 0, PGO_SYNCIO); + if (error) { + UVMHIST_LOG(ubchist, "gp1 %d", error,0,0,0); + goto errout; + } + for (i = 0; i < npages1; i++) { + UVMHIST_LOG(ubchist, "got pgs1[%d] %p", i, pgs1[i],0,0); + } + } + + /* + * similarly if the range does not end on a page and block boundary. + */ + + pageend = trunc_page(off + len) & ~(bsize - 1); + if (off + len < ip->i_e2fs_size && + off + len != pageend && + pagestart != pageend) { + npages2 = min(ppb, (round_page(eof) - pageend) >> + PAGE_SHIFT); + memset(pgs2, 0, npages2); + simple_lock(&uobj->vmobjlock); + error = VOP_GETPAGES(vp, pageend, pgs2, &npages2, 0, + VM_PROT_READ, 0, PGO_SYNCIO); + if (error) { + UVMHIST_LOG(ubchist, "gp2 %d", error,0,0,0); + goto errout; + } + for (i = 0; i < npages2; i++) { + UVMHIST_LOG(ubchist, "got pgs2[%d] %p", i, pgs2[i],0,0); + } + } + + /* + * adjust off to be block-aligned. + */ + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + /* + * now allocate the range. + */ + + lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL); + error = VOP_BALLOCN(vp, off, len, cred, flags); + UVMHIST_LOG(ubchist, "ballocn %d", error,0,0,0); + lockmgr(&vp->v_glock, LK_RELEASE, NULL); + + /* + * unbusy any pages we are holding. + */ + +errout: + simple_lock(&uobj->vmobjlock); + if (pgs1[0] != NULL) { + uvm_page_unbusy(pgs1, npages1); + } + if (pgs2[0] != NULL) { + uvm_page_unbusy(pgs2, npages2); + } + simple_unlock(&uobj->vmobjlock); return (error); } diff --git a/sys/ufs/ext2fs/ext2fs_extern.h b/sys/ufs/ext2fs/ext2fs_extern.h index 3367a00d0b5a..581455bd9fa5 100644 --- a/sys/ufs/ext2fs/ext2fs_extern.h +++ b/sys/ufs/ext2fs/ext2fs_extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: ext2fs_extern.h,v 1.8 2000/03/16 18:08:32 jdolecek Exp $ */ +/* $NetBSD: ext2fs_extern.h,v 1.9 2000/11/27 08:39:53 chs Exp $ */ /*- * Copyright (c) 1997 Manuel Bouyer. 
@@ -71,6 +71,9 @@ int ext2fs_vfree __P((void *)); /* ext2fs_balloc.c */ int ext2fs_balloc __P((struct inode *, ufs_daddr_t, int, struct ucred *, struct buf **, int)); +int ext2fs_ballocn __P((void *)); +int ext2fs_balloc_range __P((struct vnode *, off_t, off_t, struct ucred *, + int)); /* ext2fs_bmap.c */ int ext2fs_bmap __P((void *)); diff --git a/sys/ufs/ext2fs/ext2fs_inode.c b/sys/ufs/ext2fs/ext2fs_inode.c index bed43177ccf7..eb7af536ce43 100644 --- a/sys/ufs/ext2fs/ext2fs_inode.c +++ b/sys/ufs/ext2fs/ext2fs_inode.c @@ -1,4 +1,4 @@ -/* $NetBSD: ext2fs_inode.c,v 1.20 2000/06/28 14:16:37 mrg Exp $ */ +/* $NetBSD: ext2fs_inode.c,v 1.21 2000/11/27 08:39:53 chs Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. @@ -101,7 +101,7 @@ out: * so that it can be reused immediately. */ if (ip->i_e2fs_dtime != 0) - vrecycle(vp, (struct simplelock *)0, p); + vrecycle(vp, NULL, p); return (error); } @@ -187,15 +187,14 @@ ext2fs_truncate(v) struct vnode *ovp = ap->a_vp; ufs_daddr_t lastblock; struct inode *oip; - ufs_daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + ufs_daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; off_t length = ap->a_length; struct m_ext2fs *fs; - struct buf *bp; int offset, size, level; long count, nblocks, blocksreleased = 0; int i; - int aflags, error, allerror = 0; + int error, allerror = 0; off_t osize; if (length < 0) @@ -232,24 +231,10 @@ ext2fs_truncate(v) if (length > fs->fs_maxfilesize) return (EFBIG); #endif - offset = blkoff(fs, length - 1); - lbn = lblkno(fs, length - 1); - aflags = B_CLRBUF; - if (ap->a_flags & IO_SYNC) - aflags |= B_SYNC; - error = ext2fs_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, - aflags); - if (error) - return (error); - oip->i_e2fs_size = length; - uvm_vnp_setsize(ovp, length); - (void) uvm_vnp_uncache(ovp); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); + ext2fs_balloc_range(ovp, length - 1, 1, ap->a_cred, + ap->a_flags & IO_SYNC ? B_SYNC : 0); oip->i_flag |= IN_CHANGE | IN_UPDATE; - return (VOP_UPDATE(ovp, NULL, NULL, UPDATE_WAIT)); + return (VOP_UPDATE(ovp, NULL, NULL, 1)); } /* * Shorten the size of the file. If the file is not being @@ -259,26 +244,13 @@ ext2fs_truncate(v) * of subsequent file growth. */ offset = blkoff(fs, length); - if (offset == 0) { - oip->i_e2fs_size = length; - } else { - lbn = lblkno(fs, length); - aflags = B_CLRBUF; - if (ap->a_flags & IO_SYNC) - aflags |= B_SYNC; - error = ext2fs_balloc(oip, lbn, offset, ap->a_cred, &bp, aflags); - if (error) - return (error); - oip->i_e2fs_size = length; + if (offset != 0) { size = fs->e2fs_bsize; - (void) uvm_vnp_uncache(ovp); - memset((char *)bp->b_data + offset, 0, (u_int)(size - offset)); - allocbuf(bp, size); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); + + /* XXXUBC we should handle more than just VREG */ + uvm_vnp_zerorange(ovp, length, size - offset); } + oip->i_e2fs_size = length; uvm_vnp_setsize(ovp, length); /* @@ -317,6 +289,7 @@ ext2fs_truncate(v) * Note that we save the new block configuration so we can check it * when we are done. 
*/ + memcpy((caddr_t)newblks, (caddr_t)&oip->i_e2fs_blocks[0], sizeof newblks); memcpy((caddr_t)&oip->i_e2fs_blocks[0], (caddr_t)oldblks, sizeof oldblks); oip->i_e2fs_size = osize; @@ -359,20 +332,20 @@ ext2fs_truncate(v) ext2fs_blkfree(oip, bn); blocksreleased += btodb(fs->e2fs_bsize); } - if (lastblock < 0) - goto done; done: #ifdef DIAGNOSTIC for (level = SINGLE; level <= TRIPLE; level++) - if (newblks[NDADDR + level] != oip->i_e2fs_blocks[NDADDR + level]) - panic("itrunc1"); + if (newblks[NDADDR + level] != + oip->i_e2fs_blocks[NDADDR + level]) + panic("ext2fs_truncate1"); for (i = 0; i < NDADDR; i++) if (newblks[i] != oip->i_e2fs_blocks[i]) - panic("itrunc2"); + panic("ext2fs_truncate2"); if (length == 0 && - (!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd))) - panic("itrunc3"); + (!LIST_EMPTY(&ovp->v_cleanblkhd) || + !LIST_EMPTY(&ovp->v_dirtyblkhd))) + panic("ext2fs_truncate3"); #endif /* DIAGNOSTIC */ /* * Put back the real size. diff --git a/sys/ufs/ext2fs/ext2fs_readwrite.c b/sys/ufs/ext2fs/ext2fs_readwrite.c index 8b366486935e..12349860073f 100644 --- a/sys/ufs/ext2fs/ext2fs_readwrite.c +++ b/sys/ufs/ext2fs/ext2fs_readwrite.c @@ -1,4 +1,4 @@ -/* $NetBSD: ext2fs_readwrite.c,v 1.13 2000/06/28 14:16:38 mrg Exp $ */ +/* $NetBSD: ext2fs_readwrite.c,v 1.14 2000/11/27 08:39:53 chs Exp $ */ /*- * Copyright (c) 1997 Manuel Bouyer. @@ -79,6 +79,8 @@ ext2fs_read(v) struct uio *uio; struct m_ext2fs *fs; struct buf *bp; + void *win; + vsize_t bytelen; ufs_daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; @@ -107,6 +109,27 @@ ext2fs_read(v) if (uio->uio_resid == 0) return (0); + if (vp->v_type == VREG) { + error = 0; + while (uio->uio_resid > 0) { + + bytelen = min(ip->i_e2fs_size - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) { + break; + } + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } + } + goto out; + } + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_e2fs_size - uio->uio_offset) <= 0) break; @@ -148,14 +171,15 @@ ext2fs_read(v) break; xfersize = size; } - error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, - uio); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (error) break; brelse(bp); } if (bp != NULL) brelse(bp); + +out: if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { ip->i_flag |= IN_ACCESS; if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) @@ -185,12 +209,17 @@ ext2fs_write(v) struct proc *p; ufs_daddr_t lbn; off_t osize; - int blkoffset, error, flags, ioflag, resid, size, xfersize; + int blkoffset, error, flags, ioflag, resid, xfersize; + vsize_t bytelen; + void *win; + off_t oldoff; + boolean_t rv; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); + error = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) @@ -234,35 +263,66 @@ ext2fs_write(v) resid = uio->uio_resid; osize = ip->i_e2fs_size; - flags = ioflag & IO_SYNC ? B_SYNC : 0; + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = min(fs->e2fs_bsize - blkoffset, + uio->uio_resid); + + /* + * XXXUBC if file is mapped and this is the last block, + * process one page at a time. 
+ */ + + error = ext2fs_balloc_range(vp, uio->uio_offset, + bytelen, ap->a_cred, 0); + if (error) { + break; + } + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_WRITE); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (oldoff >> 16 != uio->uio_offset >> 16) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } + } + goto out; + } + + flags = ioflag & IO_SYNC ? B_SYNC : 0; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); - xfersize = fs->e2fs_bsize - blkoffset; - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; - if (fs->e2fs_bsize > xfersize) + xfersize = min(fs->e2fs_bsize - blkoffset, uio->uio_resid); + if (xfersize < fs->e2fs_bsize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; - - error = ext2fs_balloc(ip, - lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); + error = VOP_BALLOC(vp, lblktosize(fs, lbn), + blkoffset + xfersize, ap->a_cred, flags, + &bp); if (error) break; - if (uio->uio_offset + xfersize > ip->i_e2fs_size) { + if (ip->i_e2fs_size < uio->uio_offset + xfersize) { ip->i_e2fs_size = uio->uio_offset + xfersize; - uvm_vnp_setsize(vp, ip->i_e2fs_size); } - (void)uvm_vnp_uncache(vp); - - size = fs->e2fs_bsize - bp->b_resid; - if (size < xfersize) - xfersize = size; - - error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (ioflag & IO_SYNC) (void)bwrite(bp); else if (xfersize + blkoffset == fs->e2fs_bsize) @@ -274,13 +334,14 @@ ext2fs_write(v) bdwrite(bp); if (error || xfersize == 0) break; - ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ +out: + ip->i_flag |= IN_CHANGE | IN_UPDATE; if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_e2fs_mode &= ~(ISUID | ISGID); if (error) { diff --git a/sys/ufs/ext2fs/ext2fs_vfsops.c b/sys/ufs/ext2fs/ext2fs_vfsops.c index 3f8efbdb9f82..c3fd765c00f3 100644 --- a/sys/ufs/ext2fs/ext2fs_vfsops.c +++ b/sys/ufs/ext2fs/ext2fs_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ext2fs_vfsops.c,v 1.39 2000/09/19 22:03:05 fvdl Exp $ */ +/* $NetBSD: ext2fs_vfsops.c,v 1.40 2000/11/27 08:39:53 chs Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. 
@@ -592,15 +592,19 @@ ext2fs_mountfs(devvp, mp, p) mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_EXT2FS); mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ + mp->mnt_fs_bshift = m_fs->e2fs_bshift; ump->um_flags = 0; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = NINDIR(m_fs); + ump->um_lognindir = ffs(NINDIR(m_fs)) - 1; ump->um_bptrtodb = m_fs->e2fs_fsbtodb; ump->um_seqinc = 1; /* no frags */ devvp->v_specmountpoint = mp; return (0); + out: if (bp) brelse(bp); @@ -931,6 +935,7 @@ ext2fs_vget(mp, ino, vpp) ip->i_flag |= IN_MODIFIED; } + vp->v_uvm.u_size = ip->i_e2fs_size; *vpp = vp; return (0); } diff --git a/sys/ufs/ext2fs/ext2fs_vnops.c b/sys/ufs/ext2fs/ext2fs_vnops.c index c9d263a164ab..0d14eb83b659 100644 --- a/sys/ufs/ext2fs/ext2fs_vnops.c +++ b/sys/ufs/ext2fs/ext2fs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ext2fs_vnops.c,v 1.29 2000/08/03 20:41:36 thorpej Exp $ */ +/* $NetBSD: ext2fs_vnops.c,v 1.30 2000/11/27 08:39:53 chs Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. @@ -196,7 +196,6 @@ ext2fs_access(v) struct inode *ip = VTOI(vp); mode_t mode = ap->a_mode; - /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or @@ -421,8 +420,6 @@ ext2fs_chmod(vp, mode, cred, p) ip->i_e2fs_mode &= ~ALLPERMS; ip->i_e2fs_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; - if ((vp->v_flag & VTEXT) && (ip->i_e2fs_mode & S_ISTXT) == 0) - (void) uvm_vnp_uncache(vp); return (0); } @@ -1465,7 +1462,11 @@ struct vnodeopv_entry_desc ext2fs_vnodeop_entries[] = { { &vop_truncate_desc, ext2fs_truncate }, /* truncate */ { &vop_update_desc, ext2fs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void*)))NULL } + { &vop_ballocn_desc, ext2fs_ballocn }, /* ballocn */ + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_putpages_desc, genfs_putpages }, /* putpages */ + { &vop_size_desc, genfs_size }, /* size */ + { NULL, NULL } }; struct vnodeopv_desc ext2fs_vnodeop_opv_desc = { &ext2fs_vnodeop_p, ext2fs_vnodeop_entries }; @@ -1516,7 +1517,7 @@ struct vnodeopv_entry_desc ext2fs_specop_entries[] = { { &vop_truncate_desc, spec_truncate }, /* truncate */ { &vop_update_desc, ext2fs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc ext2fs_specop_opv_desc = { &ext2fs_specop_p, ext2fs_specop_entries }; @@ -1567,7 +1568,7 @@ struct vnodeopv_entry_desc ext2fs_fifoop_entries[] = { { &vop_truncate_desc, fifo_truncate }, /* truncate */ { &vop_update_desc, ext2fs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc ext2fs_fifoop_opv_desc = { &ext2fs_fifoop_p, ext2fs_fifoop_entries }; diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 11ab50b276c3..d2fb7c669e60 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_alloc.c,v 1.36 2000/06/28 14:16:39 mrg Exp $ */ +/* $NetBSD: ffs_alloc.c,v 1.37 2000/11/27 08:39:54 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -109,15 +109,33 @@ ffs_alloc(ip, lbn, bpref, size, cred, bnp) struct ucred *cred; ufs_daddr_t *bnp; { - struct fs *fs; + struct fs *fs = ip->i_fs; ufs_daddr_t bno; int cg; #ifdef QUOTA int error; #endif +#ifdef UVM_PAGE_TRKOWN + if 
(ITOV(ip)->v_type == VREG && lbn > 0) { + struct vm_page *pg; + struct uvm_object *uobj = &ITOV(ip)->v_uvm.u_obj; + voff_t off = trunc_page(lblktosize(fs, lbn)); + voff_t endoff = round_page(lblktosize(fs, lbn) + size); + + simple_lock(&uobj->vmobjlock); + while (off < endoff) { + pg = uvm_pagelookup(uobj, off); + KASSERT(pg != NULL); + KASSERT(pg->owner == curproc->p_pid); + KASSERT((pg->flags & PG_CLEAN) == 0); + off += PAGE_SIZE; + } + simple_unlock(&uobj->vmobjlock); + } +#endif + *bnp = 0; - fs = ip->i_fs; #ifdef DIAGNOSTIC if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n", @@ -170,21 +188,39 @@ nospace: * invoked to get an appropriate block. */ int -ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) +ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp, blknop) struct inode *ip; ufs_daddr_t lbprev; ufs_daddr_t bpref; int osize, nsize; struct ucred *cred; struct buf **bpp; + ufs_daddr_t *blknop; { - struct fs *fs; + struct fs *fs = ip->i_fs; struct buf *bp; int cg, request, error; ufs_daddr_t bprev, bno; - *bpp = 0; - fs = ip->i_fs; +#ifdef UVM_PAGE_TRKOWN + if (ITOV(ip)->v_type == VREG) { + struct vm_page *pg; + struct uvm_object *uobj = &ITOV(ip)->v_uvm.u_obj; + voff_t off = trunc_page(lblktosize(fs, lbprev)); + voff_t endoff = round_page(lblktosize(fs, lbprev) + osize); + + simple_lock(&uobj->vmobjlock); + while (off < endoff) { + pg = uvm_pagelookup(uobj, off); + KASSERT(pg != NULL); + KASSERT(pg->owner == curproc->p_pid); + KASSERT((pg->flags & PG_CLEAN) == 0); + off += PAGE_SIZE; + } + simple_unlock(&uobj->vmobjlock); + } +#endif + #ifdef DIAGNOSTIC if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { @@ -206,7 +242,8 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) /* * Allocate the extra space in the buffer. 
*/ - if ((error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) != 0) { + if (bpp != NULL && + (error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) != 0) { brelse(bp); return (error); } @@ -221,14 +258,20 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) */ cg = dtog(fs, bprev); if ((bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) != 0) { - if (bp->b_blkno != fsbtodb(fs, bno)) - panic("bad blockno"); ip->i_ffs_blocks += btodb(nsize - osize); ip->i_flag |= IN_CHANGE | IN_UPDATE; - allocbuf(bp, nsize); - bp->b_flags |= B_DONE; - memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); - *bpp = bp; + + if (bpp != NULL) { + if (bp->b_blkno != fsbtodb(fs, bno)) + panic("bad blockno"); + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + memset(bp->b_data + osize, 0, nsize - osize); + *bpp = bp; + } + if (blknop != NULL) { + *blknop = bno; + } return (0); } /* @@ -292,8 +335,6 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request, ffs_alloccg); if (bno > 0) { - bp->b_blkno = fsbtodb(fs, bno); - (void) uvm_vnp_uncache(ITOV(ip)); if (!DOINGSOFTDEP(ITOV(ip))) ffs_blkfree(ip, bprev, (long)osize); if (nsize < request) @@ -301,10 +342,16 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) (long)(request - nsize)); ip->i_ffs_blocks += btodb(nsize - osize); ip->i_flag |= IN_CHANGE | IN_UPDATE; - allocbuf(bp, nsize); - bp->b_flags |= B_DONE; - memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize); - *bpp = bp; + if (bpp != NULL) { + bp->b_blkno = fsbtodb(fs, bno); + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + memset(bp->b_data + osize, 0, (u_int)nsize - osize); + *bpp = bp; + } + if (blknop != NULL) { + *blknop = bno; + } return (0); } #ifdef QUOTA @@ -313,7 +360,10 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) */ (void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE); #endif - brelse(bp); + if (bpp != NULL) { + brelse(bp); + } + nospace: /* * no space available @@ -344,7 +394,7 @@ struct ctldebug debug15 = { "prtrealloc", &prtrealloc }; #endif int doasyncfree = 1; -extern int doreallocblks; +int doreallocblks; int ffs_reallocblks(v) @@ -364,6 +414,9 @@ ffs_reallocblks(v) struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; int i, len, start_lvl, end_lvl, pref, ssize; + /* XXXUBC don't reallocblks for now */ + return ENOSPC; + vp = ap->a_vp; ip = VTOI(vp); fs = ip->i_fs; @@ -1725,5 +1778,6 @@ ffs_fserr(fs, uid, cp) char *cp; { - log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp); + log(LOG_ERR, "uid %d comm %s on %s: %s\n", + uid, curproc->p_comm, fs->fs_fsmnt, cp); } diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 7ec9a98a9fad..7d90bcc4ff85 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_balloc.c,v 1.22 2000/09/19 22:04:08 fvdl Exp $ */ +/* $NetBSD: ffs_balloc.c,v 1.23 2000/11/27 08:39:54 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -57,6 +57,8 @@ #include #include +#include + /* * Balloc defines the structure of file system storage * by allocating the physical blocks on a device given @@ -72,7 +74,7 @@ ffs_balloc(v) int a_size; struct ucred *a_cred; int a_flags; - struct buf *a_bpp; + struct buf **a_bpp; } */ *ap = v; ufs_daddr_t lbn; int size; @@ -88,15 +90,22 @@ ffs_balloc(v) int deallocated, osize, nsize, num, i, error; ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; int unwindidx = -1; + struct buf **bpp = ap->a_bpp; #ifdef FFS_EI const int needswap = 
UFS_FSNEEDSWAP(fs); #endif + UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist); lbn = lblkno(fs, ap->a_startoffset); size = blkoff(fs, ap->a_startoffset) + ap->a_size; if (size > fs->fs_bsize) panic("ffs_balloc: blk too big"); - *ap->a_bpp = NULL; + if (bpp != NULL) { + *bpp = NULL; + } + UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0); + + KASSERT(size <= fs->fs_bsize); if (lbn < 0) return (EFBIG); cred = ap->a_cred; @@ -107,71 +116,109 @@ ffs_balloc(v) * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. */ + nb = lblkno(fs, ip->i_ffs_size); if (nb < NDADDR && nb < lbn) { osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { error = ffs_realloccg(ip, nb, ffs_blkpref(ip, nb, (int)nb, &ip->i_ffs_db[0]), - osize, (int)fs->fs_bsize, cred, &bp); + osize, (int)fs->fs_bsize, cred, bpp, &newb); if (error) return (error); if (DOINGSOFTDEP(vp)) - softdep_setup_allocdirect(ip, nb, - dbtofsb(fs, bp->b_blkno), + softdep_setup_allocdirect(ip, nb, newb, ufs_rw32(ip->i_ffs_db[nb], needswap), - fs->fs_bsize, osize, bp); - ip->i_ffs_size = (nb + 1) * fs->fs_bsize; + fs->fs_bsize, osize, bpp ? *bpp : NULL); + ip->i_ffs_size = lblktosize(fs, nb + 1); uvm_vnp_setsize(vp, ip->i_ffs_size); - ip->i_ffs_db[nb] = ufs_rw32(dbtofsb(fs, bp->b_blkno), - needswap); + ip->i_ffs_db[nb] = ufs_rw32(newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; - if (flags & B_SYNC) - bwrite(bp); - else - bawrite(bp); + if (bpp) { + if (flags & B_SYNC) + bwrite(*bpp); + else + bawrite(*bpp); + } } } + /* * The first NDADDR blocks are direct blocks */ + if (lbn < NDADDR) { nb = ufs_rw32(ip->i_ffs_db[lbn], needswap); - if (nb != 0 && ip->i_ffs_size >= (lbn + 1) * fs->fs_bsize) { - error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); - if (error) { - brelse(bp); - return (error); + if (nb != 0 && ip->i_ffs_size >= lblktosize(fs, lbn + 1)) { + + /* + * The block is an already-allocated direct block + * and the file already extends past this block, + * thus this must be a whole block. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, fs->fs_bsize, NOCRED, + bpp); + if (error) { + brelse(*bpp); + return (error); + } } - *ap->a_bpp = bp; return (0); } if (nb != 0) { + /* * Consider need to reallocate a fragment. */ + osize = fragroundup(fs, blkoff(fs, ip->i_ffs_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { - error = bread(vp, lbn, osize, NOCRED, &bp); - if (error) { - brelse(bp); - return (error); + + /* + * The existing block is already + * at least as big as we want. + * Just read the block (if requested). + */ + + if (bpp != NULL) { + error = bread(vp, lbn, osize, NOCRED, + bpp); + if (error) { + brelse(*bpp); + return (error); + } } + return 0; } else { + + /* + * The existing block is smaller than we want, + * grow it. + */ + error = ffs_realloccg(ip, lbn, ffs_blkpref(ip, lbn, (int)lbn, &ip->i_ffs_db[0]), osize, nsize, cred, - &bp); + bpp, &newb); if (error) return (error); if (DOINGSOFTDEP(vp)) softdep_setup_allocdirect(ip, lbn, - dbtofsb(fs, bp->b_blkno), nb, - nsize, osize, bp); + newb, nb, nsize, osize, + bpp ? *bpp : NULL); } } else { - if (ip->i_ffs_size < (lbn + 1) * fs->fs_bsize) + + /* + * the block was not previously allocated, + * allocate a new block or fragment. 
+ */ + + if (ip->i_ffs_size < lblktosize(fs, lbn + 1)) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; @@ -180,18 +227,20 @@ ffs_balloc(v) nsize, cred, &newb); if (error) return (error); - bp = getblk(vp, lbn, nsize, 0, 0); - bp->b_blkno = fsbtodb(fs, newb); - if (flags & B_CLRBUF) - clrbuf(bp); - if (DOINGSOFTDEP(vp)) + if (bpp != NULL) { + bp = getblk(vp, lbn, nsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & B_CLRBUF) + clrbuf(bp); + *bpp = bp; + } + if (DOINGSOFTDEP(vp)) { softdep_setup_allocdirect(ip, lbn, newb, 0, - nsize, 0, bp); + nsize, 0, bpp ? *bpp : NULL); + } } - ip->i_ffs_db[lbn] = ufs_rw32(dbtofsb(fs, bp->b_blkno), - needswap); + ip->i_ffs_db[lbn] = ufs_rw32(newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; - *ap->a_bpp = bp; return (0); } /* @@ -200,6 +249,7 @@ ffs_balloc(v) pref = 0; if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) return(error); + #ifdef DIAGNOSTIC if (num < 1) panic ("ffs_balloc: ufs_bmaparray returned indirect block\n"); @@ -311,14 +361,20 @@ ffs_balloc(v) } nb = newb; *allocblk++ = nb; - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); - nbp->b_blkno = fsbtodb(fs, nb); - if (flags & B_CLRBUF) - clrbuf(nbp); + if (bpp != NULL) { + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & B_CLRBUF) + clrbuf(nbp); + *bpp = nbp; + } if (DOINGSOFTDEP(vp)) softdep_setup_allocindir_page(ip, lbn, bp, - indirs[num].in_off, nb, 0, nbp); + indirs[num].in_off, nb, 0, bpp ? *bpp : NULL); bap[indirs[num].in_off] = ufs_rw32(nb, needswap); + if (allocib == NULL && unwindidx < 0) { + unwindidx = i - 1; + } /* * If required, write synchronously, otherwise use * delayed write. @@ -328,21 +384,23 @@ ffs_balloc(v) } else { bdwrite(bp); } - *ap->a_bpp = nbp; return (0); } brelse(bp); - if (flags & B_CLRBUF) { - error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); - if (error) { - brelse(nbp); - goto fail; + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); + if (error) { + brelse(nbp); + goto fail; + } + } else { + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + clrbuf(nbp); } - } else { - nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); - nbp->b_blkno = fsbtodb(fs, nb); + *bpp = nbp; } - *ap->a_bpp = nbp; return (0); fail: /* @@ -401,3 +459,62 @@ fail: (void) VOP_FSYNC(vp, cred, FSYNC_WAIT, 0, 0, curproc); return (error); } + + +int +ffs_ballocn(v) + void *v; +{ + struct vop_ballocn_args /* { + struct vnode *a_vp; + off_t a_offset; + off_t a_length; + struct ucred *a_cred; + int a_flags; + } */ *ap = v; + + off_t off, len; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + int error, delta, bshift, bsize; + + error = 0; + bshift = fs->fs_bshift; + bsize = 1 << bshift; + + off = ap->a_offset; + len = ap->a_length; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = min(bsize, len); + + error = VOP_BALLOC(vp, off, bsize, ap->a_cred, ap->a_flags, + NULL); + if (error) { + goto out; + } + + /* + * increase file size now, VOP_BALLOC() requires that + * EOF be up-to-date before each call. 
+ */ + + if (ip->i_ffs_size < off + bsize) { + ip->i_ffs_size = off + bsize; + if (vp->v_uvm.u_size < ip->i_ffs_size) { + uvm_vnp_setsize(vp, ip->i_ffs_size); + } + } + + off += bsize; + len -= bsize; + } + +out: + return error; +} diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index 9d196fcf8f52..bdc9bbd2445d 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_extern.h,v 1.16 2000/04/04 09:23:20 jdolecek Exp $ */ +/* $NetBSD: ffs_extern.h,v 1.17 2000/11/27 08:39:54 chs Exp $ */ /*- * Copyright (c) 1991, 1993, 1994 @@ -79,7 +79,7 @@ __BEGIN_DECLS int ffs_alloc __P((struct inode *, ufs_daddr_t, ufs_daddr_t , int, struct ucred *, ufs_daddr_t *)); int ffs_realloccg __P((struct inode *, ufs_daddr_t, ufs_daddr_t, int, int , - struct ucred *, struct buf **)); + struct ucred *, struct buf **, ufs_daddr_t *)); int ffs_reallocblks __P((void *)); int ffs_valloc __P((void *)); ufs_daddr_t ffs_blkpref __P((struct inode *, ufs_daddr_t, int, ufs_daddr_t *)); @@ -89,6 +89,7 @@ void ffs_clusteracct __P((struct fs *, struct cg *, ufs_daddr_t, int)); /* ffs_balloc.c */ int ffs_balloc __P((void *)); +int ffs_ballocn __P((void *)); /* ffs_bswap.c */ void ffs_sb_swap __P((struct fs*, struct fs *, int)); @@ -137,6 +138,7 @@ int ffs_read __P((void *)); int ffs_write __P((void *)); int ffs_fsync __P((void *)); int ffs_reclaim __P((void *)); +int ffs_size __P((void *)); __END_DECLS diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index 06f8e788e6dc..22a68820591d 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_inode.c,v 1.37 2000/09/19 22:04:09 fvdl Exp $ */ +/* $NetBSD: ffs_inode.c,v 1.38 2000/11/27 08:39:54 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -170,37 +170,25 @@ ffs_truncate(v) struct vnode *ovp = ap->a_vp; ufs_daddr_t lastblock; struct inode *oip; - ufs_daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + ufs_daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; off_t length = ap->a_length; struct fs *fs; - struct buf *bp; int offset, size, level; long count, nblocks, blocksreleased = 0; int i; - int aflags, error, allerror = 0; + int error, allerror = 0; off_t osize; if (length < 0) return (EINVAL); oip = VTOI(ovp); -#if 1 - /* - * XXX. Was in Kirk's patches. Is it good behavior to just - * return and not update modification times? - */ - if (oip->i_ffs_size == length) - return (0); -#endif if (ovp->v_type == VLNK && (oip->i_ffs_size < ovp->v_mount->mnt_maxsymlinklen || (ovp->v_mount->mnt_maxsymlinklen == 0 && oip->i_din.ffs_din.di_blocks == 0))) { -#ifdef DIAGNOSTIC - if (length != 0) - panic("ffs_truncate: partial truncate of symlink"); -#endif - memset((char *)&oip->i_ffs_shortlink, 0, (u_int)oip->i_ffs_size); + KDASSERT(length == 0); + memset(&oip->i_ffs_shortlink, 0, (size_t)oip->i_ffs_size); oip->i_ffs_size = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, NULL, NULL, UPDATE_WAIT)); @@ -214,12 +202,56 @@ ffs_truncate(v) return (error); #endif fs = oip->i_fs; + if (length > fs->fs_maxfilesize) + return (EFBIG); + osize = oip->i_ffs_size; ovp->v_lasta = ovp->v_clen = ovp->v_cstart = ovp->v_lastw = 0; + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. 
+ */ + + if (osize < length) { + ufs_balloc_range(ovp, length - 1, 1, ap->a_cred, + ap->a_flags & IO_SYNC ? B_SYNC : 0); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (VOP_UPDATE(ovp, NULL, NULL, 1)); + } + + /* + * When truncating a regular file down to a non-block-aligned size, + * we must zero the part of last block which is past the new EOF. + * We must synchronously flush the zeroed pages to disk + * since the new pages will be invalidated as soon as we + * inform the VM system of the new, smaller size. + * We must to this before acquiring the GLOCK, since fetching + * the pages will acquire the GLOCK internally. + * So there is a window where another thread could see a whole + * zeroed page past EOF, but that's life. + */ + + offset = blkoff(fs, length); + if (ovp->v_type == VREG && length < osize && offset != 0) { + struct uvm_object *uobj; + voff_t eoz; + + size = blksize(fs, oip, lblkno(fs, length)); + eoz = min(lblktosize(fs, lblkno(fs, length)) + size, osize); + uvm_vnp_zerorange(ovp, length, eoz - length); + uobj = &ovp->v_uvm.u_obj; + simple_lock(&uobj->vmobjlock); + uobj->pgops->pgo_flush(uobj, length, eoz, + PGO_CLEANIT|PGO_DEACTIVATE|PGO_SYNCIO); + simple_unlock(&ovp->v_uvm.u_obj.vmobjlock); + } + + lockmgr(&ovp->v_glock, LK_EXCLUSIVE, NULL); + if (DOINGSOFTDEP(ovp)) { uvm_vnp_setsize(ovp, length); - (void) uvm_vnp_uncache(ovp); if (length > 0) { /* * If a file is only partially truncated, then @@ -231,73 +263,26 @@ ffs_truncate(v) * so that it will have no data structures left. */ if ((error = VOP_FSYNC(ovp, ap->a_cred, FSYNC_WAIT, - 0, 0, ap->a_p)) != 0) + 0, 0, ap->a_p)) != 0) { + lockmgr(&ovp->v_glock, LK_RELEASE, NULL); return (error); + } } else { #ifdef QUOTA (void) chkdq(oip, -oip->i_ffs_blocks, NOCRED, 0); #endif softdep_setup_freeblocks(oip, length); (void) vinvalbuf(ovp, 0, ap->a_cred, ap->a_p, 0, 0); + lockmgr(&ovp->v_glock, LK_RELEASE, NULL); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, NULL, NULL, 0)); } } + /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of osize is 0, length will be at least 1. + * Reduce the size of the file. */ - if (osize < length) { - if (length > fs->fs_maxfilesize) - return (EFBIG); - aflags = B_CLRBUF; - if (ap->a_flags & IO_SYNC) - aflags |= B_SYNC; - error = VOP_BALLOC(ovp, length - 1, 1, ap->a_cred, aflags, &bp); - if (error) - return (error); - oip->i_ffs_size = length; - uvm_vnp_setsize(ovp, length); - (void) uvm_vnp_uncache(ovp); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); - oip->i_flag |= IN_CHANGE | IN_UPDATE; - return (VOP_UPDATE(ovp, NULL, NULL, UPDATE_WAIT)); - } - /* - * Shorten the size of the file. If the file is not being - * truncated to a block boundary, the contents of the - * partial block following the end of the file must be - * zero'ed in case it ever becomes accessible again because - * of subsequent file growth. Directories however are not - * zero'ed as they should grow back initialized to empty. 
- */ - offset = blkoff(fs, length); - if (offset == 0) { - oip->i_ffs_size = length; - } else { - lbn = lblkno(fs, length); - aflags = B_CLRBUF; - if (ap->a_flags & IO_SYNC) - aflags |= B_SYNC; - error = VOP_BALLOC(ovp, length - 1, 1, ap->a_cred, aflags, &bp); - if (error) - return (error); - oip->i_ffs_size = length; - size = blksize(fs, oip, lbn); - (void) uvm_vnp_uncache(ovp); - if (ovp->v_type != VDIR) - memset((char *)bp->b_data + offset, 0, - (u_int)(size - offset)); - allocbuf(bp, size); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); - } + oip->i_ffs_size = length; uvm_vnp_setsize(ovp, length); /* * Calculate index into inode's block list of @@ -431,6 +416,7 @@ done: oip->i_ffs_blocks -= blocksreleased; if (oip->i_ffs_blocks < 0) /* sanity */ oip->i_ffs_blocks = 0; + lockmgr(&ovp->v_glock, LK_RELEASE, NULL); oip->i_flag |= IN_CHANGE; #ifdef QUOTA (void) chkdq(oip, -blocksreleased, NOCRED, 0); diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 6c4291dffac5..eeda78051bfb 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_softdep.c,v 1.7 2000/11/08 14:28:16 ad Exp $ */ +/* $NetBSD: ffs_softdep.c,v 1.8 2000/11/27 08:39:54 chs Exp $ */ /* * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved. @@ -53,6 +53,10 @@ #include #include +#include +struct pool sdpcpool; +int softdep_lockedbufs; + /* * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide. */ @@ -98,6 +102,13 @@ extern char *memname[]; * End system adaptaion definitions. */ +/* + * Definitions for page cache info hashtable. + */ +#define PCBPHASHSIZE 1024 +LIST_HEAD(, buf) pcbphashhead[PCBPHASHSIZE]; +#define PCBPHASH(vp, lbn) ((((vaddr_t)(vp) >> 8) ^ (lbn)) & (PCBPHASHSIZE - 1)) + /* * Internal function prototypes. */ @@ -149,6 +160,16 @@ static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, static void pause_timer __P((void *)); static int request_cleanup __P((int, int)); static void add_to_worklist __P((struct worklist *)); +static struct buf *softdep_setup_pagecache __P((struct inode *, ufs_lbn_t, + long)); +static void softdep_collect_pagecache __P((struct vnode *, + struct bufq_head *)); +static void softdep_free_pagecache __P((struct bufq_head *)); +static struct vnode *softdep_lookupvp(struct fs *, ino_t); +static struct buf *softdep_lookup_pcbp __P((struct vnode *, ufs_lbn_t)); +void softdep_pageiodone __P((struct buf *)); +void softdep_flush_vnode __P((struct vnode *, ufs_lbn_t)); +static void softdep_flush_indir __P((struct vnode *)); /* * Exported softdep operations. @@ -889,6 +910,7 @@ top: void softdep_initialize() { + int i; LIST_INIT(&mkdirlisthd); LIST_INIT(&softdep_workitem_pending); @@ -902,6 +924,11 @@ softdep_initialize() newblk_hashtbl = hashinit(64, HASH_LIST, M_NEWBLK, M_WAITOK, &newblk_hash); sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); + pool_init(&sdpcpool, sizeof(struct buf), 0, 0, 0, "sdpcpool", + 0, pool_page_alloc_nointr, pool_page_free_nointr, M_TEMP); + for (i = 0; i < PCBPHASHSIZE; i++) { + LIST_INIT(&pcbphashhead[i]); + } } /* @@ -1161,6 +1188,18 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); + /* + * If we were not passed a bp to attach the dep to, + * then this must be for a regular file. + * Allocate a buffer to represent the page cache pages + * that are the real dependency. 
The pages themselves + * cannot refer to the dependency since we don't want to + * add a field to struct vm_page for this. + */ + + if (bp == NULL) { + bp = softdep_setup_pagecache(ip, lbn, newsize); + } WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); if (lbn >= NDADDR) { /* allocating an indirect block */ @@ -1310,7 +1349,10 @@ handle_workitem_freefrag(freefrag) vp.v_data = &tip; vp.v_mount = freefrag->ff_devvp->v_specmountpoint; tip.i_vnode = &vp; + lockinit(&vp.v_glock, PVFS, "fglock", 0, 0); + lockmgr(&vp.v_glock, LK_EXCLUSIVE, NULL); ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize); + lockmgr(&vp.v_glock, LK_RELEASE, NULL); FREE(freefrag, M_FREEFRAG); } @@ -1380,6 +1422,18 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct allocindir *aip; struct pagedep *pagedep; + /* + * If we are already holding "many" buffers busy (as the safe copies + * of indirect blocks) flush the dependency for one of those before + * potentially tying up more. otherwise we could fill the + * buffer cache with busy buffers and deadlock. + * XXXUBC I'm sure there's a better way to deal with this. + */ + + while (softdep_lockedbufs > nbuf >> 2) { + softdep_flush_indir(ITOV(ip)); + } + aip = newallocindir(ip, ptrno, newblkno, oldblkno); ACQUIRE_LOCK(&lk); /* @@ -1390,6 +1444,9 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) if ((ip->i_ffs_mode & IFMT) == IFDIR && pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); + if (nbp == NULL) { + nbp = softdep_setup_pagecache(ip, lbn, ip->i_fs->fs_bsize); + } WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); FREE_LOCK(&lk); setup_allocindir_phase2(bp, ip, aip); @@ -1495,8 +1552,10 @@ setup_allocindir_phase2(bp, ip, aip) FREE_LOCK(&lk); } if (newindirdep) { - if (indirdep->ir_savebp != NULL) + if (indirdep->ir_savebp != NULL) { brelse(newindirdep->ir_savebp); + softdep_lockedbufs--; + } WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP); } if (indirdep) @@ -1513,6 +1572,7 @@ setup_allocindir_phase2(bp, ip, aip) } newindirdep->ir_savebp = getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); + softdep_lockedbufs++; newindirdep->ir_savebp->b_flags |= B_ASYNC; bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); } @@ -1555,8 +1615,9 @@ softdep_setup_freeblocks(ip, length) struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; - struct vnode *vp; + struct vnode *vp = ITOV(ip); struct buf *bp; + struct bufq_head fbqh; struct fs *fs = ip->i_fs; int i, error; #ifdef FFS_EI @@ -1616,7 +1677,13 @@ softdep_setup_freeblocks(ip, length) * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. + * We must remove any pagecache markers from the pagecache + * hashtable first because any I/Os in flight will want to see + * dependencies attached to their pagecache markers. We cannot + * free the pagecache markers until after we've freed all the + * dependencies that reference them later. */ + softdep_collect_pagecache(vp, &fbqh); merge_inode_lists(inodedep); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) free_allocdirect(&inodedep->id_inoupdt, adp, 1); @@ -1628,7 +1695,6 @@ softdep_setup_freeblocks(ip, length) * Once they are all there, walk the list and get rid of * any dependencies. 
*/ - vp = ITOV(ip); ACQUIRE_LOCK(&lk); drain_output(vp, 1); while (getdirtybuf(&vp->v_dirtyblkhd.lh_first, MNT_WAIT)) { @@ -1640,6 +1706,7 @@ softdep_setup_freeblocks(ip, length) brelse(bp); ACQUIRE_LOCK(&lk); } + softdep_free_pagecache(&fbqh); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we @@ -1730,8 +1797,8 @@ deallocate_dependencies(bp, inodedep) * If the inode has already been written, then they * can be dumped directly onto the work list. */ - for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; - dirrem = LIST_NEXT(dirrem, dm_next)) { + while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) + != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; if (inodedep == NULL || @@ -1944,6 +2011,10 @@ handle_workitem_freeblocks(freeblks) } nblocks = btodb(fs->fs_bsize); blocksreleased = 0; + + lockinit(&vp.v_glock, PVFS, "fglock", 0, 0); + lockmgr(&vp.v_glock, LK_EXCLUSIVE, NULL); + /* * Indirect blocks first. */ @@ -1966,6 +2037,7 @@ handle_workitem_freeblocks(freeblks) ffs_blkfree(&tip, bn, bsize); blocksreleased += btodb(bsize); } + lockmgr(&vp.v_glock, LK_RELEASE, NULL); #ifdef DIAGNOSTIC if (freeblks->fb_chkcnt != blocksreleased) @@ -2034,6 +2106,7 @@ indir_trunc(ip, dbn, level, lbn, countp) error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) return (error); + softdep_lockedbufs++; } /* * Recursively free indirect blocks. @@ -2053,6 +2126,7 @@ indir_trunc(ip, dbn, level, lbn, countp) } bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); + softdep_lockedbufs--; return (allerror); } @@ -2793,6 +2867,8 @@ softdep_disk_io_initiation(bp) if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; brelse(indirdep->ir_savebp); + softdep_lockedbufs--; + /* inline expand WORKLIST_REMOVE(wk); */ wk->wk_state &= ~ONWORKLIST; LIST_REMOVE(wk, wk_list); @@ -3681,8 +3757,9 @@ merge_inode_lists(inodedep) { struct allocdirect *listadp, *newadp; + listadp = TAILQ_FIRST(&inodedep->id_inoupdt); newadp = TAILQ_FIRST(&inodedep->id_newinoupdt); - for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) { + while (listadp && newadp) { if (listadp->ad_lbn < newadp->ad_lbn) { listadp = TAILQ_NEXT(listadp, ad_next); continue; @@ -3935,6 +4012,7 @@ loop: switch (wk->wk_type) { case D_ALLOCDIRECT: + KASSERT(vp->v_type != VREG); adp = WK_ALLOCDIRECT(wk); if (adp->ad_state & DEPCOMPLETE) break; @@ -4141,6 +4219,7 @@ flush_inodedep_deps(fs, ino) struct allocdirect *adp; int error, waitfor; struct buf *bp; + struct vnode *vp; /* * This work is done in two passes. The first pass grabs most @@ -4160,6 +4239,27 @@ flush_inodedep_deps(fs, ino) ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) return (0); + + /* + * When file data was in the buffer cache, + * softdep_sync_metadata() would start i/o on + * file data buffers itself. But now that + * we're using the page cache to hold file data, + * we need something else to trigger those flushes. + * let's just do it here. + */ + + vp = softdep_lookupvp(fs, ino); + if (vp) { + struct uvm_object *uobj = &vp->v_uvm.u_obj; + + simple_lock(&uobj->vmobjlock); + (uobj->pgops->pgo_flush)(uobj, 0, 0, + PGO_ALLPAGES|PGO_CLEANIT| + (waitfor == MNT_NOWAIT ? 
0: PGO_SYNCIO)); + simple_unlock(&uobj->vmobjlock); + } + for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_state & DEPCOMPLETE) @@ -4727,3 +4827,236 @@ softdep_error(func, error) /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } + +/* + * Allocate a buffer on which to attach a dependency. + */ +static struct buf * +softdep_setup_pagecache(ip, lbn, size) + struct inode *ip; + ufs_lbn_t lbn; + long size; +{ + struct vnode *vp = ITOV(ip); + struct buf *bp; + int s; + + /* + * Enter pagecache dependency buf in hash. + */ + + bp = softdep_lookup_pcbp(vp, lbn); + if (bp == NULL) { + s = splbio(); + bp = pool_get(&sdpcpool, PR_WAITOK); + splx(s); + memset(bp, 0, sizeof(*bp)); + + bp->b_vp = vp; + bp->b_lblkno = lbn; + bp->b_bcount = bp->b_resid = size; + LIST_INIT(&bp->b_dep); + LIST_INSERT_HEAD(&pcbphashhead[PCBPHASH(vp, lbn)], bp, b_hash); + } else { + KASSERT(size >= bp->b_bcount); + bp->b_resid += size - bp->b_bcount; + bp->b_bcount = size; + } + return bp; +} + +/* + * softdep_collect_pagecache() and softdep_free_pagecache() + * are used to remove page cache dependency buffers when + * a file is being truncated to 0. + */ + +static void +softdep_collect_pagecache(vp, bqhp) + struct vnode *vp; + struct bufq_head *bqhp; +{ + struct buf *bp, *nextbp; + int i; + + TAILQ_INIT(bqhp); + for (i = 0; i < PCBPHASHSIZE; i++) { + for (bp = LIST_FIRST(&pcbphashhead[i]); + bp != NULL; + bp = nextbp) { + nextbp = LIST_NEXT(bp, b_hash); + if (bp->b_vp == vp) { + LIST_REMOVE(bp, b_hash); + TAILQ_INSERT_HEAD(bqhp, bp, b_freelist); + } + } + } +} + +static void +softdep_free_pagecache(bqhp) + struct bufq_head *bqhp; +{ + struct buf *bp, *nextbp; + + for (bp = TAILQ_FIRST(bqhp); bp != NULL; bp = nextbp) { + nextbp = TAILQ_NEXT(bp, b_freelist); + TAILQ_REMOVE(bqhp, bp, b_freelist); + KASSERT(LIST_FIRST(&bp->b_dep) == NULL); + pool_put(&sdpcpool, bp); + } +} + +static struct vnode * +softdep_lookupvp(fs, ino) + struct fs *fs; + ino_t ino; +{ + struct mount *mp; + extern struct vfsops ffs_vfsops; + + CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_op == &ffs_vfsops && + VFSTOUFS(mp)->um_fs == fs) { + break; + } + } + if (mp == NULL) { + return NULL; + } + return ufs_ihashlookup(VFSTOUFS(mp)->um_dev, ino); +} + +/* + * Flush some dependent page cache data for any vnode *except* + * the one specified. + * XXXUBC this is a horrible hack and it's probably not too hard to deadlock + * even with this, but it's better than nothing. + */ + +static void +softdep_flush_indir(vp) + struct vnode *vp; +{ + struct buf *bp; + int i; + + for (i = 0; i < PCBPHASHSIZE; i++) { + LIST_FOREACH(bp, &pcbphashhead[i], b_hash) { + if (bp->b_vp == vp || + LIST_FIRST(&bp->b_dep)->wk_type != D_ALLOCINDIR) { + continue; + } + + VOP_FSYNC(bp->b_vp, curproc->p_ucred, FSYNC_WAIT, 0, 0, + curproc); + return; + } + } + printf("softdep_flush_indir: nothing to flush?\n"); +} + + +static struct buf * +softdep_lookup_pcbp(vp, lbn) + struct vnode *vp; + ufs_lbn_t lbn; +{ + struct buf *bp; + + LIST_FOREACH(bp, &pcbphashhead[PCBPHASH(vp, lbn)], b_hash) { + if (bp->b_vp == vp && bp->b_lblkno == lbn) { + break; + } + } + return bp; +} + +/* + * Do softdep i/o completion processing for page cache writes. 
+ */ + +void +softdep_pageiodone(bp) + struct buf *bp; +{ + int npages = bp->b_bufsize >> PAGE_SHIFT; + struct vnode *vp = bp->b_vp; + struct vm_page *pg; + struct buf *pcbp = NULL; + struct allocdirect *adp; + struct allocindir *aip; + struct worklist *wk; + ufs_lbn_t lbn; + voff_t off; + long iosize = bp->b_bcount; + int size, asize, bshift, bsize; + int i; + + KASSERT(!(bp->b_flags & B_READ)); + bshift = vp->v_mount->mnt_fs_bshift; + bsize = 1 << bshift; + asize = min(PAGE_SIZE, bsize); + ACQUIRE_LOCK(&lk); + for (i = 0; i < npages; i++) { + pg = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT)); + if (pg == NULL) { + continue; + } + + for (off = pg->offset; + off < pg->offset + PAGE_SIZE; + off += bsize) { + size = min(asize, iosize); + iosize -= size; + lbn = off >> bshift; + if (pcbp == NULL || pcbp->b_lblkno != lbn) { + pcbp = softdep_lookup_pcbp(vp, lbn); + } + if (pcbp == NULL) { + continue; + } + pcbp->b_resid -= size; + if (pcbp->b_resid < 0) { + panic("softdep_pageiodone: " + "resid < 0, vp %p lbn 0x%lx pcbp %p", + vp, lbn, pcbp); + } + if (pcbp->b_resid > 0) { + continue; + } + + /* + * We've completed all the i/o for this block. + * mark the dep complete. + */ + + KASSERT(LIST_FIRST(&pcbp->b_dep) != NULL); + while ((wk = LIST_FIRST(&pcbp->b_dep))) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_ALLOCDIRECT: + adp = WK_ALLOCDIRECT(wk); + adp->ad_state |= COMPLETE; + handle_allocdirect_partdone(adp); + break; + + case D_ALLOCINDIR: + aip = WK_ALLOCINDIR(wk); + aip->ai_state |= COMPLETE; + handle_allocindir_partdone(aip); + break; + + default: + panic("softdep_pageiodone: " + "bad type %d, pcbp %p wk %p", + wk->wk_type, pcbp, wk); + } + } + LIST_REMOVE(pcbp, b_hash); + pool_put(&sdpcpool, pcbp); + pcbp = NULL; + } + } + FREE_LOCK(&lk); +} diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 89e2fddef534..2f150e94159e 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_vfsops.c,v 1.72 2000/10/13 16:40:26 simonb Exp $ */ +/* $NetBSD: ffs_vfsops.c,v 1.73 2000/11/27 08:39:55 chs Exp $ */ /* * Copyright (c) 1989, 1991, 1993, 1994 @@ -690,6 +690,8 @@ ffs_mountfs(devvp, mp, p) mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_FFS); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + mp->mnt_fs_bshift = fs->fs_bshift; + mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ mp->mnt_flag |= MNT_LOCAL; #ifdef FFS_EI if (needswap) @@ -699,6 +701,7 @@ ffs_mountfs(devvp, mp, p) ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; + ump->um_lognindir = ffs(fs->fs_nindir) - 1; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) @@ -797,6 +800,9 @@ ffs_unmount(mp, mntflags, p) if (ump->um_devvp->v_type != VBAD) ump->um_devvp->v_specmountpoint = NULL; vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + if (LIST_FIRST(&ump->um_devvp->v_dirtyblkhd)) { + panic("ffs_unmount: flush left dirty bufs %p", ump->um_devvp); + } error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p); vput(ump->um_devvp); @@ -1107,6 +1113,7 @@ ffs_vget(mp, ino, vpp) ip->i_ffs_uid = ip->i_din.ffs_din.di_ouid; /* XXX */ ip->i_ffs_gid = ip->i_din.ffs_din.di_ogid; /* XXX */ } /* XXX */ + uvm_vnp_setsize(vp, ip->i_ffs_size); *vpp = vp; return (0); diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 5122173f630d..c2fb4524c2a7 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_vnops.c,v 1.34 2000/10/24 14:43:32 fvdl Exp $ */ +/* $NetBSD: ffs_vnops.c,v 1.35 2000/11/27 08:39:55 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -108,12 +108,16 @@ struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { { &vop_blkatoff_desc, ffs_blkatoff }, /* blkatoff */ { &vop_valloc_desc, ffs_valloc }, /* valloc */ { &vop_balloc_desc, ffs_balloc }, /* balloc */ + { &vop_ballocn_desc, ffs_ballocn }, /* balloc */ { &vop_reallocblks_desc, ffs_reallocblks }, /* reallocblks */ { &vop_vfree_desc, ffs_vfree }, /* vfree */ { &vop_truncate_desc, ffs_truncate }, /* truncate */ { &vop_update_desc, ffs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void*)))NULL } + { &vop_getpages_desc, genfs_getpages }, /* getpages */ + { &vop_putpages_desc, genfs_putpages }, /* putpages */ + { &vop_size_desc, ffs_size }, /* size */ + { NULL, NULL } }; struct vnodeopv_desc ffs_vnodeop_opv_desc = { &ffs_vnodeop_p, ffs_vnodeop_entries }; @@ -165,7 +169,7 @@ struct vnodeopv_entry_desc ffs_specop_entries[] = { { &vop_truncate_desc, spec_truncate }, /* truncate */ { &vop_update_desc, ffs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc ffs_specop_opv_desc = { &ffs_specop_p, ffs_specop_entries }; @@ -217,7 +221,7 @@ struct vnodeopv_entry_desc ffs_fifoop_entries[] = { { &vop_truncate_desc, fifo_truncate }, /* truncate */ { &vop_update_desc, ffs_update }, /* update */ { &vop_bwrite_desc, vn_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { NULL, NULL } }; struct vnodeopv_desc ffs_fifoop_opv_desc = { &ffs_fifoop_p, ffs_fifoop_entries }; @@ -239,7 +243,7 @@ ffs_fsync(v) off_t offhi; struct proc *a_p; } */ *ap = v; - struct buf *bp, *nbp, *ibp; + struct buf *bp; int s, num, error, i; struct indir ia[NIADDR + 1]; int bsize; @@ -260,38 +264,32 @@ ffs_fsync(v) if (ap->a_offhi % bsize != 0) blk_high++; - /* - * First, flush all data blocks in range. - */ -loop: s = splbio(); - for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { - nbp = LIST_NEXT(bp, b_vnbufs); - if ((bp->b_flags & B_BUSY)) - continue; - if (bp->b_lblkno < blk_low || bp->b_lblkno > blk_high) - continue; - bp->b_flags |= B_BUSY | B_VFLUSH; - splx(s); - bawrite(bp); - goto loop; - } /* - * Then, flush possibly unwritten indirect blocks. Without softdeps, - * these should be the only ones left. + * First, flush all pages in range. */ + + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + (vp->v_uvm.u_obj.pgops->pgo_flush)(&vp->v_uvm.u_obj, + ap->a_offlo, ap->a_offhi - ap->a_offlo, PGO_CLEANIT|PGO_SYNCIO); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + + /* + * Then, flush indirect blocks. 
+ */ + if (!(ap->a_flags & FSYNC_DATAONLY) && blk_high >= NDADDR) { error = ufs_getlbns(vp, blk_high, ia, &num); - if (error != 0) + if (error) return error; for (i = 0; i < num; i++) { - ibp = incore(vp, ia[i].in_lbn); - if (ibp != NULL && !(ibp->b_flags & B_BUSY) && - (ibp->b_flags & B_DELWRI)) { - ibp->b_flags |= B_BUSY | B_VFLUSH; + bp = incore(vp, ia[i].in_lbn); + if (bp != NULL && !(bp->b_flags & B_BUSY) && + (bp->b_flags & B_DELWRI)) { + bp->b_flags |= B_BUSY | B_VFLUSH; splx(s); - bawrite(ibp); + bawrite(bp); s = splbio(); } } @@ -300,11 +298,9 @@ loop: if (ap->a_flags & FSYNC_WAIT) { while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; - tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1, - "fsync_range", 0); + tsleep(&vp->v_numoutput, PRIBIO + 1, "fsync_range", 0); } } - splx(s); return (VOP_UPDATE(vp, NULL, NULL, @@ -330,23 +326,33 @@ ffs_full_fsync(v) struct vnode *vp = ap->a_vp; struct buf *bp, *nbp; int s, error, passes, skipmeta; + struct uvm_object *uobj; if (vp->v_type == VBLK && vp->v_specmountpoint != NULL && (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) softdep_fsync_mountdev(vp); - /* - * Flush all dirty buffers associated with a vnode + /* + * Flush all dirty data associated with a vnode. */ + + if (vp->v_type == VREG) { + uobj = &vp->v_uvm.u_obj; + simple_lock(&uobj->vmobjlock); + (uobj->pgops->pgo_flush)(uobj, 0, 0, PGO_ALLPAGES|PGO_CLEANIT| + ((ap->a_flags & FSYNC_WAIT) ? PGO_SYNCIO : 0)); + simple_unlock(&uobj->vmobjlock); + } + passes = NIADDR + 1; skipmeta = 0; if (ap->a_flags & (FSYNC_DATAONLY|FSYNC_WAIT)) skipmeta = 1; s = splbio(); + loop: - for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; - bp = LIST_NEXT(bp, b_vnbufs)) + LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) bp->b_flags &= ~B_SCANNED; for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = LIST_NEXT(bp, b_vnbufs); @@ -445,3 +451,31 @@ ffs_reclaim(v) vp->v_data = NULL; return (0); } + +/* + * Return the last logical file offset that should be written for this file + * if we're doing a write that ends at "size". + */ +int +ffs_size(v) + void *v; +{ + struct vop_size_args /* { + struct vnode *a_vp; + off_t a_size; + off_t *a_eobp; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + ufs_lbn_t olbn, nlbn; + + olbn = lblkno(fs, ip->i_ffs_size); + nlbn = lblkno(fs, ap->a_size); + + if (nlbn < NDADDR && olbn <= nlbn) { + *ap->a_eobp = fragroundup(fs, ap->a_size); + } else { + *ap->a_eobp = blkroundup(fs, ap->a_size); + } + return 0; +} diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c index 6d250ff53d51..da8a85776306 100644 --- a/sys/ufs/lfs/lfs_alloc.c +++ b/sys/ufs/lfs/lfs_alloc.c @@ -1,4 +1,4 @@ -/* $NetBSD: lfs_alloc.c,v 1.44 2000/11/27 03:33:57 perseant Exp $ */ +/* $NetBSD: lfs_alloc.c,v 1.45 2000/11/27 08:39:55 chs Exp $ */ /*- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. @@ -229,7 +229,6 @@ extend_ifile(struct lfs *fs, struct ucred *cred) } ip->i_ffs_size += fs->lfs_bsize; uvm_vnp_setsize(vp, ip->i_ffs_size); - (void)uvm_vnp_uncache(vp); VOP_UNLOCK(vp, 0); i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) * diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c index e90ff6c25dd3..0fdf49dca0c9 100644 --- a/sys/ufs/lfs/lfs_inode.c +++ b/sys/ufs/lfs/lfs_inode.c @@ -1,4 +1,4 @@ -/* $NetBSD: lfs_inode.c,v 1.48 2000/11/27 03:33:57 perseant Exp $ */ +/* $NetBSD: lfs_inode.c,v 1.49 2000/11/27 08:39:56 chs Exp $ */ /*- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. 
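Returning to the ffs_size() routine added to ffs_vnops.c above: it rounds the end-of-block differently depending on whether the write still ends among the direct blocks and does not land in an earlier block than the current EOF (fragment rounding) or not (full-block rounding). The arithmetic sketch below is illustrative only; the 8K/1K block/fragment sizes are made up, roundup_to() stands in for the real fragroundup/blkroundup macros, and NDADDR is the usual 12 direct blocks.

#include <stdio.h>

#define BSIZE	8192	/* hypothetical fs_bsize */
#define FSIZE	1024	/* hypothetical fs_fsize */
#define NDADDR	12

static long roundup_to(long x, long r) { return (x + r - 1) / r * r; }

int
main(void)
{
	/*
	 * case 1: the new size ends in a direct block and the block holding
	 * the current EOF is not beyond it, so only a fragment needs to
	 * exist: round to a fragment boundary.
	 */
	long size1 = 5000;
	printf("eob for %ld = %ld (fragment rounding)\n",
	    size1, roundup_to(size1, FSIZE));

	/*
	 * case 2: the new size is past the direct blocks (or the current EOF
	 * block is beyond the new one), so the last block must be full:
	 * round to a block boundary.
	 */
	long size2 = (long)NDADDR * BSIZE + 5000;
	printf("eob for %ld = %ld (block rounding)\n",
	    size2, roundup_to(size2, BSIZE));
	return 0;
}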
@@ -288,7 +288,6 @@ lfs_truncate(v) return (error); oip->i_ffs_size = length; uvm_vnp_setsize(ovp, length); - (void) uvm_vnp_uncache(ovp); (void) VOP_BWRITE(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, NULL, NULL, 0)); @@ -338,7 +337,6 @@ lfs_truncate(v) odb = btodb(bp->b_bcount); oip->i_ffs_size = length; size = blksize(fs, oip, lbn); - (void) uvm_vnp_uncache(ovp); if (ovp->v_type != VDIR) memset((char *)bp->b_data + offset, 0, (u_int)(size - offset)); diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c index b02549f1650a..6304a7b53a83 100644 --- a/sys/ufs/lfs/lfs_segment.c +++ b/sys/ufs/lfs/lfs_segment.c @@ -1,4 +1,4 @@ -/* $NetBSD: lfs_segment.c,v 1.63 2000/11/27 03:33:57 perseant Exp $ */ +/* $NetBSD: lfs_segment.c,v 1.64 2000/11/27 08:39:56 chs Exp $ */ /*- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. @@ -309,7 +309,7 @@ lfs_vflush(vp) /* panic("VDIROP being flushed...this can\'t happen"); */ } if(vp->v_usecount<0) { - printf("usecount=%ld\n",vp->v_usecount); + printf("usecount=%d\n",vp->v_usecount); panic("lfs_vflush: usecount<0"); } #endif @@ -1864,8 +1864,8 @@ lfs_vunref(vp) #ifdef DIAGNOSTIC if(vp->v_usecount<=0) { printf("lfs_vunref: inum is %d\n", VTOI(vp)->i_number); - printf("lfs_vunref: flags are 0x%lx\n", vp->v_flag); - printf("lfs_vunref: usecount = %ld\n", vp->v_usecount); + printf("lfs_vunref: flags are 0x%x\n", vp->v_flag); + printf("lfs_vunref: usecount = %d\n", vp->v_usecount); panic("lfs_vunref: v_usecount<0"); } #endif diff --git a/sys/ufs/mfs/mfs_vnops.c b/sys/ufs/mfs/mfs_vnops.c index 3921ae920d42..c89f88681911 100644 --- a/sys/ufs/mfs/mfs_vnops.c +++ b/sys/ufs/mfs/mfs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: mfs_vnops.c,v 1.25 2000/10/09 18:07:06 thorpej Exp $ */ +/* $NetBSD: mfs_vnops.c,v 1.26 2000/11/27 08:39:57 chs Exp $ */ /* * Copyright (c) 1989, 1993 @@ -263,7 +263,7 @@ mfs_close(v) * vnode, so if we find any other uses, it is a panic. */ if (vp->v_usecount > 1) - printf("mfs_close: ref count %ld > 1\n", vp->v_usecount); + printf("mfs_close: ref count %d > 1\n", vp->v_usecount); if (vp->v_usecount > 1 || BUFQ_FIRST(&mfsp->mfs_buflist) != NULL) panic("mfs_close"); /* diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c index 15ecb0c5bd89..bfc4f4bd88e0 100644 --- a/sys/ufs/ufs/ufs_bmap.c +++ b/sys/ufs/ufs/ufs_bmap.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_bmap.c,v 1.9 2000/03/30 12:41:14 augustss Exp $ */ +/* $NetBSD: ufs_bmap.c,v 1.10 2000/11/27 08:39:57 chs Exp $ */ /* * Copyright (c) 1989, 1991, 1993 @@ -186,6 +186,9 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp) xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); + if (bp == NULL) { + return ENOMEM; + } if (bp->b_flags & (B_DONE | B_DELWRI)) { trace(TR_BREADHIT, pack(vp, size), metalbn); } @@ -243,6 +246,7 @@ ufs_getlbns(vp, bn, ap, nump) long metalbn, realbn; struct ufsmount *ump; int64_t blockcnt; + int lbc; int i, numlevels, off; ump = VFSTOUFS(vp->v_mount); @@ -263,10 +267,15 @@ ufs_getlbns(vp, bn, ap, nump) * at the given level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. 
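The rewritten loop that follows replaces the per-level multiply/divide/modulo with shifts and masks, which is safe because fs_nindir is always a power of two; um_lognindir caches log2 of it. A small self-contained demonstration that the two formulations agree (the 2048-pointers-per-indirect-block figure is hypothetical, chosen only because it is a plausible power of two):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	const int64_t nindir = 2048;		/* indirect pointers per block (a power of two) */
	const int lognindir = 11;		/* log2(nindir), what um_lognindir caches */
	int64_t bn = 123456789;			/* block number past the direct blocks */
	int64_t blockcnt_mul = 1, blockcnt_shift;
	int lbc = 0;
	int level;

	for (level = 1; level <= 3; level++) {
		blockcnt_mul *= nindir;			/* old code: 64-bit multiply */
		lbc += lognindir;
		blockcnt_shift = (int64_t)1 << lbc;	/* new code: shift */
		printf("level %d: count %lld == %lld, offset %lld == %lld\n",
		    level,
		    (long long)blockcnt_mul, (long long)blockcnt_shift,
		    (long long)((bn / (blockcnt_mul / nindir)) % nindir),
		    (long long)((bn >> (lbc - lognindir)) & (nindir - 1)));
	}
	return 0;
}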
*/ - for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + + bn -= NDADDR; + for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); - blockcnt *= MNINDIR(ump); + + lbc += ump->um_lognindir; + blockcnt = (int64_t)1 << lbc; + if (bn < blockcnt) break; } @@ -292,8 +301,9 @@ ufs_getlbns(vp, bn, ap, nump) if (metalbn == realbn) break; - blockcnt /= MNINDIR(ump); - off = (bn / blockcnt) % MNINDIR(ump); + lbc -= ump->um_lognindir; + blockcnt = (int64_t)1 << lbc; + off = (bn >> lbc) & (MNINDIR(ump) - 1); ++numlevels; ap->in_lbn = metalbn; diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index 5d4de35e152e..fb71c1d24bdb 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_extern.h,v 1.23 2000/03/16 18:26:49 jdolecek Exp $ */ +/* $NetBSD: ufs_extern.h,v 1.24 2000/11/27 08:39:57 chs Exp $ */ /*- * Copyright (c) 1991, 1993, 1994 @@ -113,6 +113,7 @@ void ufs_ihashrem __P((struct inode *)); /* ufs_inode.c */ int ufs_reclaim __P((struct vnode *, struct proc *)); +int ufs_balloc_range __P((struct vnode *, off_t, off_t, struct ucred *, int)); /* ufs_lookup.c */ void ufs_dirbad __P((struct inode *, doff_t, char *)); diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c index 6c803f5f4ac1..79d496fb114e 100644 --- a/sys/ufs/ufs/ufs_inode.c +++ b/sys/ufs/ufs/ufs_inode.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_inode.c,v 1.15 2000/05/29 18:04:31 mycroft Exp $ */ +/* $NetBSD: ufs_inode.c,v 1.16 2000/11/27 08:39:57 chs Exp $ */ /* * Copyright (c) 1991, 1993 @@ -55,6 +55,8 @@ #include #include +#include + /* * Last reference to an inode. If necessary, write or delete it. */ @@ -73,7 +75,7 @@ ufs_inactive(v) extern int prtactive; if (prtactive && vp->v_usecount != 0) - vprint("ffs_inactive: pushing active", vp); + vprint("ufs_inactive: pushing active", vp); /* * Ignore inodes related to stale file handles. @@ -102,8 +104,9 @@ out: * If we are done with the inode, reclaim it * so that it can be reused immediately. */ + if (ip->i_ffs_mode == 0) - vrecycle(vp, (struct simplelock *)0, p); + vrecycle(vp, NULL, p); return (error); } @@ -146,3 +149,140 @@ ufs_reclaim(vp, p) #endif return (0); } + +/* + * allocate a range of blocks in a file. + * after this function returns, any page entirely contained within the range + * will map to invalid data and thus must be overwritten before it is made + * accessible to others. + */ + +int +ufs_balloc_range(vp, off, len, cred, flags) + struct vnode *vp; + off_t off, len; + struct ucred *cred; + int flags; +{ + off_t oldeof, neweof, oldeob, neweob, oldpagestart, pagestart; + struct uvm_object *uobj; + int i, delta, error, npages1, npages2; + int bshift = vp->v_mount->mnt_fs_bshift; + int bsize = 1 << bshift; + int ppb = max(bsize >> PAGE_SHIFT, 1); + struct vm_page *pgs1[ppb], *pgs2[ppb]; + UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x", + vp, off, len, vp->v_uvm.u_size); + + oldeof = vp->v_uvm.u_size; + error = VOP_SIZE(vp, oldeof, &oldeob); + if (error) { + return error; + } + + neweof = max(vp->v_uvm.u_size, off + len); + error = VOP_SIZE(vp, neweof, &neweob); + if (error) { + return error; + } + + error = 0; + uobj = &vp->v_uvm.u_obj; + pgs1[0] = pgs2[0] = NULL; + + /* + * if the last block in the file is not a full block (ie. 
it is a + * fragment), and this allocation is causing the fragment to change + * size (either to expand the fragment or promote it to a full block), + * cache the old last block (at its new size). + */ + + oldpagestart = trunc_page(oldeof) & ~(bsize - 1); + if ((oldeob & (bsize - 1)) != 0 && oldeob != neweob) { + npages1 = min(ppb, (round_page(neweob) - oldpagestart) >> + PAGE_SHIFT); + memset(pgs1, 0, npages1 * sizeof(struct vm_page *)); + simple_lock(&uobj->vmobjlock); + error = VOP_GETPAGES(vp, oldpagestart, pgs1, &npages1, + 0, VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF); + if (error) { + goto out; + } + for (i = 0; i < npages1; i++) { + UVMHIST_LOG(ubchist, "got pgs1[%d] %p", i, pgs1[i],0,0); + KASSERT((pgs1[i]->flags & PG_RELEASED) == 0); + pgs1[i]->flags &= ~PG_CLEAN; + } + } + + /* + * cache the new range as well. this will create zeroed pages + * where the new block will be and keep them locked until the + * new block is allocated, so there will be no window where + * the old contents of the new block is visible to racing threads. + */ + + pagestart = trunc_page(off) & ~(bsize - 1); + if (pagestart != oldpagestart || pgs1[0] == NULL) { + npages2 = min(ppb, (round_page(neweob) - pagestart) >> + PAGE_SHIFT); + memset(pgs2, 0, npages2 * sizeof(struct vm_page *)); + simple_lock(&uobj->vmobjlock); + error = VOP_GETPAGES(vp, pagestart, pgs2, &npages2, 0, + VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF); + if (error) { + goto out; + } + for (i = 0; i < npages2; i++) { + UVMHIST_LOG(ubchist, "got pgs2[%d] %p", i, pgs2[i],0,0); + KASSERT((pgs2[i]->flags & PG_RELEASED) == 0); + pgs2[i]->flags &= ~PG_CLEAN; + } + } + + /* + * adjust off to be block-aligned. + */ + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + /* + * now allocate the range. + */ + + lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL); + error = VOP_BALLOCN(vp, off, len, cred, flags); + lockmgr(&vp->v_glock, LK_RELEASE, NULL); + + /* + * unbusy any pages we are holding. + * if we got an error, set the vnode size back to what it was before. + * this will free any pages we created past the old eof. + */ + +out: + if (error) { + uvm_vnp_setsize(vp, oldeof); + } + simple_lock(&uobj->vmobjlock); + if (pgs1[0] != NULL) { + uvm_page_unbusy(pgs1, npages1); + + /* + * The data in the frag might be moving to a new disk location. + * We need to flush pages to the new disk locations. + */ + + (uobj->pgops->pgo_flush)(uobj, oldeof & ~(bsize - 1), + min((oldeof + bsize) & ~(bsize - 1), neweof), + PGO_CLEANIT | ((flags & B_SYNC) ? 
PGO_SYNCIO : 0)); + } + if (pgs2[0] != NULL) { + uvm_page_unbusy(pgs2, npages2); + } + simple_unlock(&uobj->vmobjlock); + return error; +} diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index 33679ee8b42e..8c046918b5d4 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_readwrite.c,v 1.27 2000/09/09 04:49:55 perseant Exp $ */ +/* $NetBSD: ufs_readwrite.c,v 1.28 2000/11/27 08:39:57 chs Exp $ */ /*- * Copyright (c) 1993 @@ -73,17 +73,20 @@ READ(v) struct inode *ip; struct uio *uio; FS *fs; +#ifndef LFS_READWRITE + void *win; + vsize_t bytelen; +#endif struct buf *bp; ufs_daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error; - u_short mode; vp = ap->a_vp; ip = VTOI(vp); - mode = ip->i_ffs_mode; uio = ap->a_uio; + error = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -102,19 +105,39 @@ READ(v) return (EFBIG); if (uio->uio_resid == 0) return (0); + if (uio->uio_offset >= ip->i_ffs_size) { + goto out; + } + +#ifndef LFS_READWRITE + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + bytelen = min(ip->i_ffs_size - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) + break; + } + goto out; + } +#endif for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { - if ((bytesinfile = ip->i_ffs_size - uio->uio_offset) <= 0) + bytesinfile = ip->i_ffs_size - uio->uio_offset; + if (bytesinfile <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = BLKSIZE(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); - xfersize = fs->fs_bsize - blkoffset; - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; - if (bytesinfile < xfersize) - xfersize = bytesinfile; + xfersize = min(min(fs->fs_bsize - blkoffset, uio->uio_resid), + bytesinfile); #ifdef LFS_READWRITE (void)lfs_check(vp, lbn, 0); @@ -122,9 +145,6 @@ READ(v) #else if (lblktosize(fs, nextlbn) >= ip->i_ffs_size) error = bread(vp, lbn, size, NOCRED, &bp); - else if (doclusterread) - error = cluster_read(vp, - ip->i_ffs_size, lbn, size, NOCRED, &bp); else if (lbn - 1 == vp->v_lastr) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lbn, @@ -149,14 +169,15 @@ READ(v) break; xfersize = size; } - error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, - uio); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (error) break; brelse(bp); } if (bp != NULL) brelse(bp); + +out: if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { ip->i_flag |= IN_ACCESS; if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) @@ -187,6 +208,12 @@ WRITE(v) ufs_daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, xfersize; +#ifndef LFS_READWRITE + void *win; + vsize_t bytelen; + off_t oldoff; + boolean_t rv; +#endif ioflag = ap->a_ioflag; uio = ap->a_uio; @@ -240,14 +267,65 @@ WRITE(v) resid = uio->uio_resid; osize = ip->i_ffs_size; - flags = ioflag & IO_SYNC ? B_SYNC : 0; + error = 0; - for (error = 0; uio->uio_resid > 0;) { +#ifndef LFS_READWRITE + if (vp->v_type != VREG) { + goto bcache; + } + + while (uio->uio_resid > 0) { + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = min(fs->fs_bsize - blkoffset, uio->uio_resid); + + /* + * XXXUBC if file is mapped and this is the last block, + * process one page at a time. 
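Both the read loop above and the write loop that follows move file data through ubc_alloc()/ubc_release() windows instead of struct buf. A minimal sketch of just the offset arithmetic behind that loop: each pass is clamped to the remainder of the current file system block, and ubc_alloc() further clamps the length to the remainder of its window. The block and window sizes here are made up, and the printf stands in for the uiomove() into or out of the mapped window; this models the clamping only, not the mapping, allocation, or locking.

#include <stdio.h>

#define UBC_WINSIZE	8192		/* assumed window size for the example */
#define BSIZE		16384		/* hypothetical file system block size */

static long min_l(long a, long b) { return a < b ? a : b; }

int
main(void)
{
	long off = 4000;		/* starting file offset of the transfer */
	long resid = 30000;		/* bytes left to move, like uio_resid */

	while (resid > 0) {
		long blkoff = off % BSIZE;
		long bytelen = min_l(BSIZE - blkoff, resid);
		long slot = off % UBC_WINSIZE;

		bytelen = min_l(bytelen, UBC_WINSIZE - slot);	/* what ubc_alloc does to *lenp */
		printf("map window at %ld, copy %ld bytes\n", off - slot, bytelen);
		off += bytelen;
		resid -= bytelen;
	}
	return 0;
}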
+ */ + + error = ufs_balloc_range(vp, uio->uio_offset, bytelen, + ap->a_cred, ioflag & IO_SYNC ? B_SYNC : 0); + if (error) { + return error; + } + + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen, + UBC_WRITE); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (ioflag & IO_SYNC) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, oldoff, oldoff + bytelen, + PGO_CLEANIT|PGO_SYNCIO); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } else if (oldoff >> 16 != uio->uio_offset >> 16) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } + if (error) { + break; + } + } + goto out; + +bcache: +#endif + flags = ioflag & IO_SYNC ? B_SYNC : 0; + while (uio->uio_resid > 0) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); - xfersize = fs->fs_bsize - blkoffset; - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; + xfersize = min(fs->fs_bsize - blkoffset, uio->uio_resid); if (fs->fs_bsize > xfersize) flags |= B_CLRBUF; else @@ -262,14 +340,22 @@ WRITE(v) ip->i_ffs_size = uio->uio_offset + xfersize; uvm_vnp_setsize(vp, ip->i_ffs_size); } - (void)uvm_vnp_uncache(vp); - size = BLKSIZE(fs, ip, lbn) - bp->b_resid; - if (size < xfersize) + if (xfersize > size) xfersize = size; - error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + + /* + * if we didn't clear the block and the uiomove failed, + * the buf will now contain part of some other file, + * so we need to invalidate it. + */ + if (error && (flags & B_CLRBUF) == 0) { + bp->b_flags |= B_INVAL; + brelse(bp); + break; + } #ifdef LFS_READWRITE if (!error) error = lfs_reserve(fs, vp, fsbtodb(fs, NIADDR + 1)); @@ -289,13 +375,16 @@ WRITE(v) #endif if (error || xfersize == 0) break; - ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
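The asynchronous branch of the write path above only pushes pages out when the file offset crosses a 64KB boundary (the `>> 16` comparison). A tiny demonstration of when that trigger fires and which range gets cleaned; the 20000-byte write size is arbitrary, and the 64KB granularity is simply what the patch hard-codes.

#include <stdio.h>

int
main(void)
{
	long off = 0, oldoff;
	int i;

	for (i = 0; i < 6; i++) {
		oldoff = off;
		off += 20000;		/* pretend uiomove() advanced this far */
		if (oldoff >> 16 != off >> 16)
			printf("write %ld..%ld: flush [%ld, %ld)\n", oldoff, off,
			    (oldoff >> 16) << 16, (off >> 16) << 16);
		else
			printf("write %ld..%ld: no flush yet\n", oldoff, off);
	}
	return 0;
}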
*/ +#ifndef LFS_READWRITE +out: +#endif + ip->i_flag |= IN_CHANGE | IN_UPDATE; if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_ffs_mode &= ~(ISUID | ISGID); if (error) { diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 67d8c208be99..0a274f4b177e 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_vnops.c,v 1.74 2000/10/19 10:55:35 pk Exp $ */ +/* $NetBSD: ufs_vnops.c,v 1.75 2000/11/27 08:40:02 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 @@ -461,8 +461,6 @@ ufs_chmod(vp, mode, cred, p) ip->i_ffs_mode &= ~ALLPERMS; ip->i_ffs_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; - if ((vp->v_flag & VTEXT) && (ip->i_ffs_mode & S_ISTXT) == 0) - (void) uvm_vnp_uncache(vp); return (0); } @@ -1632,6 +1630,7 @@ ufs_strategy(v) ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); + KASSERT(bp->b_bcount != 0); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL); diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h index 857566f79f24..8fe2bede7647 100644 --- a/sys/ufs/ufs/ufsmount.h +++ b/sys/ufs/ufs/ufsmount.h @@ -1,4 +1,4 @@ -/* $NetBSD: ufsmount.h,v 1.7 1998/03/18 15:57:29 bouyer Exp $ */ +/* $NetBSD: ufsmount.h,v 1.8 2000/11/27 08:40:02 chs Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -82,6 +82,7 @@ struct ufsmount { struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ u_long um_nindir; /* indirect ptrs per block */ + u_long um_lognindir; /* log2 of um_nindir */ u_long um_bptrtodb; /* indir ptr to disk block */ u_long um_seqinc; /* inc between seq blocks */ time_t um_btime[MAXQUOTAS]; /* block quota time limit */ diff --git a/sys/uvm/uvm.h b/sys/uvm/uvm.h index b9e630214973..8641167d1bce 100644 --- a/sys/uvm/uvm.h +++ b/sys/uvm/uvm.h @@ -1,4 +1,4 @@ -/* $NetBSD: uvm.h,v 1.23 2000/06/26 14:21:16 mrg Exp $ */ +/* $NetBSD: uvm.h,v 1.24 2000/11/27 08:40:02 chs Exp $ */ /* * @@ -76,6 +76,7 @@ struct uvm { /* vm_page related parameters */ + /* vm_page queues */ struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */ struct pglist page_active; /* allocated pages, in use */ @@ -86,10 +87,17 @@ struct uvm { boolean_t page_init_done; /* TRUE if uvm_page_init() finished */ boolean_t page_idle_zero; /* TRUE if we should try to zero pages in the idle loop */ + /* page daemon trigger */ int pagedaemon; /* daemon sleeps on this */ struct proc *pagedaemon_proc; /* daemon's pid */ simple_lock_data_t pagedaemon_lock; + + /* aiodone daemon trigger */ + int aiodoned; /* daemon sleeps on this */ + struct proc *aiodoned_proc; /* daemon's pid */ + simple_lock_data_t aiodoned_lock; + /* page hash */ struct pglist *page_hash; /* page hash table (vp/off->page) */ int page_nhash; /* number of buckets */ @@ -105,7 +113,7 @@ struct uvm { simple_lock_data_t kentry_lock; /* aio_done is locked by uvm.pagedaemon_lock and splbio! */ - struct uvm_aiohead aio_done; /* done async i/o reqs */ + TAILQ_HEAD(, buf) aio_done; /* done async i/o reqs */ /* pager VM area bounds */ vaddr_t pager_sva; /* start of pager VA area */ @@ -145,6 +153,7 @@ extern struct uvm uvm; UVMHIST_DECL(maphist); UVMHIST_DECL(pdhist); +UVMHIST_DECL(ubchist); /* * UVM_UNLOCK_AND_WAIT: atomic unlock+wait... 
wrapper around the diff --git a/sys/uvm/uvm_bio.c b/sys/uvm/uvm_bio.c new file mode 100644 index 000000000000..86e7efd53330 --- /dev/null +++ b/sys/uvm/uvm_bio.c @@ -0,0 +1,549 @@ +/* $NetBSD: uvm_bio.c,v 1.2 2000/11/27 08:43:40 chs Exp $ */ + +/* + * Copyright (c) 1998 Chuck Silvers. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include "opt_uvmhist.h" + +/* + * uvm_bio.c: buffered i/o vnode mapping cache + */ + + +#include +#include +#include +#include +#include + +#include +#include + +/* + * global data structures + */ + +/* + * local functions + */ + +static int ubc_fault __P((struct uvm_faultinfo *, vaddr_t, + vm_page_t *, int, int, vm_fault_t, vm_prot_t, + int)); +static struct ubc_map *ubc_find_mapping __P((struct uvm_object *, voff_t)); + +/* + * local data structues + */ + +#define UBC_HASH(uobj, offset) (((((u_long)(uobj)) >> 8) + \ + (((u_long)(offset)) >> PAGE_SHIFT)) & \ + ubc_object.hashmask) + +#define UBC_QUEUE(offset) (&ubc_object.inactive[((offset) / UBC_WINSIZE) & \ + (UBC_NQUEUES - 1)]) + +struct ubc_map +{ + struct uvm_object * uobj; /* mapped object */ + voff_t offset; /* offset into uobj */ + int refcount; /* refcount on mapping */ + voff_t writeoff; /* overwrite offset */ + vsize_t writelen; /* overwrite len */ + + LIST_ENTRY(ubc_map) hash; /* hash table */ + TAILQ_ENTRY(ubc_map) inactive; /* inactive queue */ +}; + +static struct ubc_object +{ + struct uvm_object uobj; /* glue for uvm_map() */ + char *kva; /* where ubc_object is mapped */ + struct ubc_map *umap; /* array of ubc_map's */ + + LIST_HEAD(, ubc_map) *hash; /* hashtable for cached ubc_map's */ + u_long hashmask; /* mask for hashtable */ + + TAILQ_HEAD(ubc_inactive_head, ubc_map) *inactive; + /* inactive queues for ubc_map's */ + +} ubc_object; + +struct uvm_pagerops ubc_pager = +{ + NULL, /* init */ + NULL, /* reference */ + NULL, /* detach */ + ubc_fault, /* fault */ + /* ... 
rest are NULL */ +}; + +int ubc_nwins = UBC_NWINS; +int ubc_winsize = UBC_WINSIZE; +#ifdef PMAP_PREFER +int ubc_nqueues; +boolean_t ubc_release_unmap = FALSE; +#define UBC_NQUEUES ubc_nqueues +#define UBC_RELEASE_UNMAP ubc_release_unmap +#else +#define UBC_NQUEUES 1 +#define UBC_RELEASE_UNMAP FALSE +#endif + +/* + * ubc_init + * + * init pager private data structures. + */ + +void +ubc_init(void) +{ + struct ubc_map *umap; + vaddr_t va; + int i; + + /* + * init ubc_object. + * alloc and init ubc_map's. + * init inactive queues. + * alloc and init hashtable. + * map in ubc_object. + */ + + simple_lock_init(&ubc_object.uobj.vmobjlock); + ubc_object.uobj.pgops = &ubc_pager; + TAILQ_INIT(&ubc_object.uobj.memq); + ubc_object.uobj.uo_npages = 0; + ubc_object.uobj.uo_refs = UVM_OBJ_KERN; + + ubc_object.umap = malloc(ubc_nwins * sizeof(struct ubc_map), + M_TEMP, M_NOWAIT); + bzero(ubc_object.umap, ubc_nwins * sizeof(struct ubc_map)); + + va = (vaddr_t)1L; +#ifdef PMAP_PREFER + PMAP_PREFER(0, &va); + if (va < UBC_WINSIZE) { + va = UBC_WINSIZE; + } + ubc_nqueues = va / UBC_WINSIZE; + if (ubc_nqueues != 1) { + ubc_release_unmap = TRUE; + } +#endif + ubc_object.inactive = malloc(UBC_NQUEUES * + sizeof(struct ubc_inactive_head), + M_TEMP, M_NOWAIT); + for (i = 0; i < UBC_NQUEUES; i++) { + TAILQ_INIT(&ubc_object.inactive[i]); + } + for (i = 0; i < ubc_nwins; i++) { + umap = &ubc_object.umap[i]; + TAILQ_INSERT_TAIL(&ubc_object.inactive[i & (UBC_NQUEUES - 1)], + umap, inactive); + } + + ubc_object.hash = hashinit(ubc_nwins, HASH_LIST, M_TEMP, M_NOWAIT, + &ubc_object.hashmask); + for (i = 0; i <= ubc_object.hashmask; i++) { + LIST_INIT(&ubc_object.hash[i]); + } + + if (uvm_map(kernel_map, (vaddr_t *)&ubc_object.kva, + ubc_nwins * UBC_WINSIZE, &ubc_object.uobj, 0, (vsize_t)va, + UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, + UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) + != KERN_SUCCESS) { + panic("ubc_init: failed to map ubc_object\n"); + } + UVMHIST_INIT(ubchist, 300); +} + + +/* + * ubc_fault: fault routine for ubc mapping + */ +static int +ubc_fault(ufi, ign1, ign2, ign3, ign4, fault_type, access_type, flags) + struct uvm_faultinfo *ufi; + vaddr_t ign1; + vm_page_t *ign2; + int ign3, ign4; + vm_fault_t fault_type; + vm_prot_t access_type; + int flags; +{ + struct uvm_object *uobj; + struct vnode *vp; + struct ubc_map *umap; + vaddr_t va, eva, ubc_offset, slot_offset; + int i, rv, npages; + struct vm_page *pgs[UBC_WINSIZE >> PAGE_SHIFT], *pg; + UVMHIST_FUNC("ubc_fault"); UVMHIST_CALLED(ubchist); + + /* + * no need to try with PGO_LOCKED... + * we don't need to have the map locked since we know that + * no one will mess with it until our reference is released. + */ + if (flags & PGO_LOCKED) { +#if 0 + return VM_PAGER_UNLOCK; +#else + uvmfault_unlockall(ufi, NULL, &ubc_object.uobj, NULL); + flags &= ~PGO_LOCKED; +#endif + } + + va = ufi->orig_rvaddr; + ubc_offset = va - (vaddr_t)ubc_object.kva; + + UVMHIST_LOG(ubchist, "va 0x%lx ubc_offset 0x%lx at %d", + va, ubc_offset, access_type,0); + + umap = &ubc_object.umap[ubc_offset / UBC_WINSIZE]; + KASSERT(umap->refcount != 0); + slot_offset = trunc_page(ubc_offset & (UBC_WINSIZE - 1)); + + /* no umap locking needed since we have a ref on the umap */ + uobj = umap->uobj; + vp = (struct vnode *)uobj; + KASSERT(uobj != NULL); + + npages = (UBC_WINSIZE - slot_offset) >> PAGE_SHIFT; + + /* + * XXXUBC + * if npages is more than 1 we have to be sure that + * we set PGO_OVERWRITE correctly. 
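Before the fault handler below, a quick illustration of its address arithmetic: the faulting kernel VA minus the base of the mapped ubc_object gives an offset whose quotient by the window size selects the ubc_map, and whose page-truncated remainder is the slot offset passed (added to umap->offset) to VOP_GETPAGES. The addresses, the 8KB window size, and the 4KB page size below are all made up for the example.

#include <stdio.h>
#include <stdint.h>

#define UBC_WINSIZE	8192			/* assumed window size */
#define PAGE_MASK	((uintptr_t)0xfff)	/* assumes 4KB pages */

int
main(void)
{
	uintptr_t kva = 0xc0800000;		/* made-up base of ubc_object.kva */
	uintptr_t fault_va = 0xc0805234;	/* made-up faulting address */
	uintptr_t ubc_offset = fault_va - kva;

	printf("window %lu, slot offset 0x%lx\n",
	    (unsigned long)(ubc_offset / UBC_WINSIZE),
	    (unsigned long)((ubc_offset & (UBC_WINSIZE - 1)) & ~PAGE_MASK));
	return 0;
}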
+ */ + if (access_type == VM_PROT_WRITE) { + npages = 1; + } + +again: + memset(pgs, 0, sizeof (pgs)); + simple_lock(&uobj->vmobjlock); + + UVMHIST_LOG(ubchist, "slot_offset 0x%x writeoff 0x%x writelen 0x%x " + "u_size 0x%x", slot_offset, umap->writeoff, umap->writelen, + vp->v_uvm.u_size); + + if (access_type & VM_PROT_WRITE && + slot_offset >= umap->writeoff && + (slot_offset + PAGE_SIZE <= umap->writeoff + umap->writelen || + slot_offset + PAGE_SIZE >= vp->v_uvm.u_size - umap->offset)) { + UVMHIST_LOG(ubchist, "setting PGO_OVERWRITE", 0,0,0,0); + flags |= PGO_OVERWRITE; + } + else { UVMHIST_LOG(ubchist, "NOT setting PGO_OVERWRITE", 0,0,0,0); } + /* XXX be sure to zero any part of the page past EOF */ + + /* + * XXX + * ideally we'd like to pre-fault all of the pages we're overwriting. + * so for PGO_OVERWRITE, we should call VOP_GETPAGES() with all of the + * pages in [writeoff, writeoff+writesize] instead of just the one. + */ + + UVMHIST_LOG(ubchist, "getpages vp %p offset 0x%x npages %d", + uobj, umap->offset + slot_offset, npages, 0); + + rv = VOP_GETPAGES(vp, umap->offset + slot_offset, pgs, &npages, 0, + access_type, 0, flags); + UVMHIST_LOG(ubchist, "getpages rv %d npages %d", rv, npages,0,0); + + switch (rv) { + case VM_PAGER_OK: + break; + + case VM_PAGER_AGAIN: + tsleep(&lbolt, PVM, "ubc_fault", 0); + goto again; + + default: + return rv; + } + + if (npages == 0) { + return VM_PAGER_OK; + } + + va = ufi->orig_rvaddr; + eva = ufi->orig_rvaddr + (npages << PAGE_SHIFT); + + UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0,0); + simple_lock(&uobj->vmobjlock); + for (i = 0; va < eva; i++, va += PAGE_SIZE) { + UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0); + pg = pgs[i]; + + if (pg == NULL || pg == PGO_DONTCARE) { + continue; + } + if (pg->flags & PG_WANTED) { + wakeup(pg); + } + KASSERT((pg->flags & PG_FAKE) == 0); + if (pg->flags & PG_RELEASED) { + rv = uobj->pgops->pgo_releasepg(pg, NULL); + KASSERT(rv); + continue; + } + KASSERT(access_type == VM_PROT_READ || + (pg->flags & PG_RDONLY) == 0); + + uvm_lock_pageq(); + uvm_pageactivate(pg); + uvm_unlock_pageq(); + + pmap_enter(ufi->orig_map->pmap, va, VM_PAGE_TO_PHYS(pg), + VM_PROT_ALL, access_type); + + pg->flags &= ~(PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + } + simple_unlock(&uobj->vmobjlock); + return VM_PAGER_OK; +} + +/* + * local functions + */ + +static struct ubc_map * +ubc_find_mapping(uobj, offset) + struct uvm_object *uobj; + voff_t offset; +{ + struct ubc_map *umap; + + LIST_FOREACH(umap, &ubc_object.hash[UBC_HASH(uobj, offset)], hash) { + if (umap->uobj == uobj && umap->offset == offset) { + return umap; + } + } + return NULL; +} + + +/* + * ubc interface functions + */ + +/* + * ubc_alloc: allocate a buffer mapping + */ +void * +ubc_alloc(uobj, offset, lenp, flags) + struct uvm_object *uobj; + voff_t offset; + vsize_t *lenp; + int flags; +{ + int s; + vaddr_t umap_offset, slot_offset, va; + struct ubc_map *umap; + UVMHIST_FUNC("ubc_alloc"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "uobj %p offset 0x%lx len 0x%lx filesize 0x%x", + uobj, offset, *lenp, ((struct uvm_vnode *)uobj)->u_size); + + umap_offset = (vaddr_t)(offset & ~((voff_t)UBC_WINSIZE - 1)); + slot_offset = (vaddr_t)(offset & ((voff_t)UBC_WINSIZE - 1)); + *lenp = min(*lenp, UBC_WINSIZE - slot_offset); + + /* + * the vnode is always locked here, so we don't need to add a ref. 
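The PGO_OVERWRITE decision a few lines above can be read as: the faulting page lies inside the pending overwrite region, and either the overwrite covers it completely or the page extends past EOF, so there is no old data worth reading in. A self-contained sketch of that predicate alone (it leaves out the VM_PROT_WRITE precondition; overwrite() and eof_in_window are names invented here, the latter standing in for v_uvm.u_size - umap->offset):

#include <stdio.h>

#define PAGE_SIZE 4096

static int
overwrite(long slot, long writeoff, long writelen, long eof_in_window)
{
	return slot >= writeoff &&
	    (slot + PAGE_SIZE <= writeoff + writelen ||
	     slot + PAGE_SIZE >= eof_in_window);
}

int
main(void)
{
	/* a 6000-byte overwrite starting at slot 0 of the window, EOF far away */
	printf("page 0: %d\n", overwrite(0, 0, 6000, 1 << 20));	/* fully covered */
	printf("page 1: %d\n", overwrite(4096, 0, 6000, 1 << 20));	/* only partly covered */
	return 0;
}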
+ */ + + s = splbio(); + +again: + simple_lock(&ubc_object.uobj.vmobjlock); + umap = ubc_find_mapping(uobj, umap_offset); + if (umap == NULL) { + umap = TAILQ_FIRST(UBC_QUEUE(offset)); + if (umap == NULL) { + simple_unlock(&ubc_object.uobj.vmobjlock); + tsleep(&lbolt, PVM, "ubc_alloc", 0); + goto again; + } + + /* + * remove from old hash (if any), + * add to new hash. + */ + + if (umap->uobj != NULL) { + LIST_REMOVE(umap, hash); + } + + umap->uobj = uobj; + umap->offset = umap_offset; + + LIST_INSERT_HEAD(&ubc_object.hash[UBC_HASH(uobj, umap_offset)], + umap, hash); + + va = (vaddr_t)(ubc_object.kva + + (umap - ubc_object.umap) * UBC_WINSIZE); + pmap_remove(pmap_kernel(), va, va + UBC_WINSIZE); + } + + if (umap->refcount == 0) { + TAILQ_REMOVE(UBC_QUEUE(offset), umap, inactive); + } + +#ifdef DIAGNOSTIC + if ((flags & UBC_WRITE) && + (umap->writeoff || umap->writelen)) { + panic("ubc_fault: concurrent writes vp %p", uobj); + } +#endif + if (flags & UBC_WRITE) { + umap->writeoff = slot_offset; + umap->writelen = *lenp; + } + + umap->refcount++; + simple_unlock(&ubc_object.uobj.vmobjlock); + splx(s); + UVMHIST_LOG(ubchist, "umap %p refs %d va %p", + umap, umap->refcount, + ubc_object.kva + (umap - ubc_object.umap) * UBC_WINSIZE,0); + + return ubc_object.kva + + (umap - ubc_object.umap) * UBC_WINSIZE + slot_offset; +} + + +void +ubc_release(va, wlen) + void *va; + vsize_t wlen; +{ + struct ubc_map *umap; + struct uvm_object *uobj; + int s; + UVMHIST_FUNC("ubc_release"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "va %p", va,0,0,0); + + s = splbio(); + simple_lock(&ubc_object.uobj.vmobjlock); + + umap = &ubc_object.umap[((char *)va - ubc_object.kva) / UBC_WINSIZE]; + uobj = umap->uobj; + KASSERT(uobj != NULL); + + umap->writeoff = 0; + umap->writelen = 0; + umap->refcount--; + if (umap->refcount == 0) { + if (UBC_RELEASE_UNMAP && + (((struct vnode *)uobj)->v_flag & VTEXT)) { + vaddr_t va; + + /* + * if this file is the executable image of + * some process, that process will likely have + * the file mapped at an alignment other than + * what PMAP_PREFER() would like. we'd like + * to have process text be able to use the + * cache even if someone is also reading the + * file, so invalidate mappings of such files + * as soon as possible. + */ + + va = (vaddr_t)(ubc_object.kva + + (umap - ubc_object.umap) * UBC_WINSIZE); + pmap_remove(pmap_kernel(), va, va + UBC_WINSIZE); + LIST_REMOVE(umap, hash); + umap->uobj = NULL; + TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap, + inactive); + } else { + TAILQ_INSERT_TAIL(UBC_QUEUE(umap->offset), umap, + inactive); + } + } + UVMHIST_LOG(ubchist, "umap %p refs %d", umap, umap->refcount,0,0); + simple_unlock(&ubc_object.uobj.vmobjlock); + splx(s); +} + + +/* + * removing a range of mappings from the ubc mapping cache. + */ + +void +ubc_flush(uobj, start, end) + struct uvm_object *uobj; + voff_t start, end; +{ + struct ubc_map *umap; + vaddr_t va; + int s; + UVMHIST_FUNC("ubc_flush"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "uobj %p start 0x%lx end 0x%lx", + uobj, start, end,0); + + s = splbio(); + simple_lock(&ubc_object.uobj.vmobjlock); + for (umap = ubc_object.umap; + umap < &ubc_object.umap[ubc_nwins]; + umap++) { + + if (umap->uobj != uobj || + umap->offset < start || + (umap->offset >= end && end != 0) || + umap->refcount > 0) { + continue; + } + + /* + * remove from hash, + * move to head of inactive queue. 
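To make the window recycling in ubc_alloc()/ubc_release() above easier to follow, here is a toy user-space model of just the caching policy: a window already bound to the wanted offset is reused, otherwise the head of the idle list is recycled, and a window whose last reference goes away is put back at the tail. It deliberately ignores the hash table, pmap_remove() of stale translations, splbio/locking, and the multiple inactive queues used with PMAP_PREFER; win_alloc() and win_release() are invented names.

#include <stdio.h>

#define NWINS 3

struct win { long off; int refs; int valid; };

static struct win win[NWINS];
static int idle[NWINS] = { 0, 1, 2 };	/* window indexes, front recycled first */
static int nidle = NWINS;

/* toy ubc_alloc(): reuse a window bound to this offset, else recycle one */
static int
win_alloc(long off)
{
	int i, j, w = -1;

	for (i = 0; i < NWINS; i++)
		if (win[i].valid && win[i].off == off)
			w = i;
	if (w < 0) {
		w = idle[0];				/* head of the idle list */
		for (i = 1; i < nidle; i++)
			idle[i - 1] = idle[i];
		nidle--;
		win[w].valid = 1;
		win[w].off = off;
		printf("off %ld: recycled window %d\n", off, w);
	} else {
		if (win[w].refs == 0) {			/* was idle: unlink it */
			for (i = j = 0; i < nidle; i++)
				if (idle[i] != w)
					idle[j++] = idle[i];
			nidle = j;
		}
		printf("off %ld: reused window %d\n", off, w);
	}
	win[w].refs++;
	return w;
}

/* toy ubc_release(): an unreferenced window goes to the tail of the idle list */
static void
win_release(int w)
{
	if (--win[w].refs == 0)
		idle[nidle++] = w;
}

int
main(void)
{
	int a = win_alloc(0);
	int b = win_alloc(8192);

	win_release(a);
	win_release(b);
	(void)win_alloc(0);		/* still cached: reused without i/o */
	(void)win_alloc(16384);		/* takes the head of the idle list (window 2) */
	return 0;
}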
+ */ + + va = (vaddr_t)(ubc_object.kva + + (umap - ubc_object.umap) * UBC_WINSIZE); + pmap_remove(pmap_kernel(), va, va + UBC_WINSIZE); + + LIST_REMOVE(umap, hash); + umap->uobj = NULL; + TAILQ_REMOVE(UBC_QUEUE(umap->offset), umap, inactive); + TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap, inactive); + } + simple_unlock(&ubc_object.uobj.vmobjlock); + splx(s); +} diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h index 09225931be0a..2753263a509c 100644 --- a/sys/uvm/uvm_extern.h +++ b/sys/uvm/uvm_extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_extern.h,v 1.52 2000/11/27 04:36:40 nisimura Exp $ */ +/* $NetBSD: uvm_extern.h,v 1.53 2000/11/27 08:40:03 chs Exp $ */ /* * @@ -192,6 +192,21 @@ typedef struct vm_page *vm_page_t; #define UVM_PGA_USERESERVE 0x0001 /* ok to use reserve pages */ #define UVM_PGA_ZERO 0x0002 /* returned page must be zero'd */ +/* + * the following defines are for ubc_alloc's flags + */ +#define UBC_READ 0 +#define UBC_WRITE 1 + +/* + * flags for uvn_findpages(). + */ +#define UFP_ALL 0x0 +#define UFP_NOWAIT 0x1 +#define UFP_NOALLOC 0x2 +#define UFP_NOCACHE 0x4 +#define UFP_NORDONLY 0x8 + /* * lockflags that control the locking behavior of various functions. */ @@ -213,8 +228,11 @@ struct vm_anon; struct vmspace; struct pmap; struct vnode; +struct pool; struct simplelock; +extern struct pool *uvm_aiobuf_pool; + /* * uvmexp: global data structures that are exported to parts of the kernel * other than the vm system. @@ -414,9 +432,16 @@ void uao_detach_locked __P((struct uvm_object *)); void uao_reference __P((struct uvm_object *)); void uao_reference_locked __P((struct uvm_object *)); +/* uvm_bio.c */ +void ubc_init __P((void)); +void * ubc_alloc __P((struct uvm_object *, voff_t, vsize_t *, + int)); +void ubc_release __P((void *, vsize_t)); +void ubc_flush __P((struct uvm_object *, voff_t, voff_t)); + /* uvm_fault.c */ -int uvm_fault __P((vm_map_t, vaddr_t, - vm_fault_t, vm_prot_t)); +int uvm_fault __P((vm_map_t, vaddr_t, vm_fault_t, + vm_prot_t)); /* handle a page fault */ /* uvm_glue.c */ @@ -511,8 +536,14 @@ void uvm_page_physload __P((paddr_t, paddr_t, paddr_t, paddr_t, int)); void uvm_setpagesize __P((void)); +/* uvm_pager.c */ +void uvm_aio_biodone1 __P((struct buf *)); +void uvm_aio_biodone __P((struct buf *)); +void uvm_aio_aiodone __P((struct buf *)); + /* uvm_pdaemon.c */ void uvm_pageout __P((void *)); +void uvm_aiodone_daemon __P((void *)); /* uvm_pglist.c */ int uvm_pglistalloc __P((psize_t, paddr_t, @@ -538,10 +569,11 @@ int uvm_deallocate __P((vm_map_t, vaddr_t, vsize_t)); /* uvm_vnode.c */ void uvm_vnp_setsize __P((struct vnode *, voff_t)); void uvm_vnp_sync __P((struct mount *)); -void uvm_vnp_terminate __P((struct vnode *)); - /* terminate a uvm/uvn object */ -boolean_t uvm_vnp_uncache __P((struct vnode *)); struct uvm_object *uvn_attach __P((void *, vm_prot_t)); +void uvn_findpages __P((struct uvm_object *, voff_t, + int *, struct vm_page **, int)); +void uvm_vnp_zerorange __P((struct vnode *, off_t, size_t)); +void uvm_vnp_asyncget __P((struct vnode *, off_t, size_t)); /* kern_malloc.c */ void kmeminit_nkmempages __P((void)); diff --git a/sys/uvm/uvm_fault.c b/sys/uvm/uvm_fault.c index 46b1fd56e670..5c91aeafcf65 100644 --- a/sys/uvm/uvm_fault.c +++ b/sys/uvm/uvm_fault.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_fault.c,v 1.51 2000/08/06 00:22:53 thorpej Exp $ */ +/* $NetBSD: uvm_fault.c,v 1.52 2000/11/27 08:40:03 chs Exp $ */ /* * @@ -458,12 +458,8 @@ uvmfault_anonget(ufi, amap, anon) } if (result != VM_PAGER_OK) { -#ifdef DIAGNOSTIC - if (result == 
VM_PAGER_PEND) { - panic("uvmfault_anonget: " - "got PENDING for non-async I/O"); - } -#endif + KASSERT(result != VM_PAGER_PEND); + /* remove page from anon */ anon->u.an_page = NULL; @@ -569,7 +565,7 @@ uvm_fault(orig_map, vaddr, fault_type, access_type) vm_prot_t enter_prot; boolean_t wired, narrow, promote, locked, shadowed; int npages, nback, nforw, centeridx, result, lcv, gotpages; - vaddr_t startva, objaddr, currva, offset; + vaddr_t startva, objaddr, currva, offset, uoff; paddr_t pa; struct vm_amap *amap; struct uvm_object *uobj; @@ -580,7 +576,8 @@ uvm_fault(orig_map, vaddr, fault_type, access_type) UVMHIST_LOG(maphist, "(map=0x%x, vaddr=0x%x, ft=%d, at=%d)", orig_map, vaddr, fault_type, access_type); - anon = NULL; /* XXX: shut up gcc */ + anon = NULL; + pg = NULL; uvmexp.faults++; /* XXX: locking? */ @@ -717,10 +714,8 @@ ReFault: if (narrow == FALSE) { /* wide fault (!narrow) */ -#ifdef DIAGNOSTIC - if (uvmadvice[ufi.entry->advice].advice != ufi.entry->advice) - panic("fault: advice mismatch!"); -#endif + KASSERT(uvmadvice[ufi.entry->advice].advice == + ufi.entry->advice); nback = min(uvmadvice[ufi.entry->advice].nback, (ufi.orig_rvaddr - ufi.entry->start) >> PAGE_SHIFT); startva = ufi.orig_rvaddr - (nback << PAGE_SHIFT); @@ -793,7 +788,7 @@ ReFault: /* now forget about the backpages */ if (amap) anons += nback; - startva = startva + (nback << PAGE_SHIFT); + startva += (nback << PAGE_SHIFT); npages -= nback; nback = centeridx = 0; } @@ -814,12 +809,10 @@ ReFault: * dont play with VAs that are already mapped * except for center) */ - if (lcv != centeridx) { - if (pmap_extract(ufi.orig_map->pmap, currva, &pa) == - TRUE) { - pages[lcv] = PGO_DONTCARE; - continue; - } + if (lcv != centeridx && + pmap_extract(ufi.orig_map->pmap, currva, &pa)) { + pages[lcv] = PGO_DONTCARE; + continue; } /* @@ -851,11 +844,13 @@ ReFault: " MAPPING: n anon: pm=0x%x, va=0x%x, pg=0x%x", ufi.orig_map->pmap, currva, anon->u.an_page, 0); uvmexp.fltnamap++; + /* * Since this isn't the page that's actually faulting, * ignore pmap_enter() failures; it's not critical * that we enter these right now. */ + (void) pmap_enter(ufi.orig_map->pmap, currva, VM_PAGE_TO_PHYS(anon->u.an_page), (anon->an_ref > 1) ? (enter_prot & ~VM_PROT_WRITE) : @@ -888,13 +883,13 @@ ReFault: */ if (uobj && shadowed == FALSE && uobj->pgops->pgo_fault != NULL) { - simple_lock(&uobj->vmobjlock); /* locked: maps(read), amap (if there), uobj */ result = uobj->pgops->pgo_fault(&ufi, startva, pages, npages, centeridx, fault_type, access_type, - PGO_LOCKED); + PGO_LOCKED|PGO_SYNCIO); + /* locked: nothing, pgo_fault has unlocked everything */ if (result == VM_PAGER_OK) @@ -925,7 +920,7 @@ ReFault: uvmexp.fltlget++; gotpages = npages; - result = uobj->pgops->pgo_get(uobj, ufi.entry->offset + + (void) uobj->pgops->pgo_get(uobj, ufi.entry->offset + (startva - ufi.entry->start), pages, &gotpages, centeridx, access_type & MASK(ufi.entry), @@ -946,29 +941,22 @@ ReFault: pages[lcv] == PGO_DONTCARE) continue; -#ifdef DIAGNOSTIC - /* - * pager sanity check: pgo_get with - * PGO_LOCKED should never return a - * released page to us. - */ - if (pages[lcv]->flags & PG_RELEASED) - panic("uvm_fault: pgo_get PGO_LOCKED gave us a RELEASED page"); -#endif + KASSERT((pages[lcv]->flags & PG_RELEASED) == 0); - /* - * if center page is resident and not - * PG_BUSY|PG_RELEASED then pgo_get - * made it PG_BUSY for us and gave - * us a handle to it. remember this - * page as "uobjpage." (for later use). 
- */ - - if (lcv == centeridx) { - uobjpage = pages[lcv]; - UVMHIST_LOG(maphist, " got uobjpage (0x%x) with locked get", + /* + * if center page is resident and not + * PG_BUSY|PG_RELEASED then pgo_get + * made it PG_BUSY for us and gave + * us a handle to it. remember this + * page as "uobjpage." (for later use). + */ + + if (lcv == centeridx) { + uobjpage = pages[lcv]; + UVMHIST_LOG(maphist, " got uobjpage " + "(0x%x) with locked get", uobjpage, 0,0,0); - continue; + continue; } /* @@ -987,15 +975,18 @@ ReFault: " MAPPING: n obj: pm=0x%x, va=0x%x, pg=0x%x", ufi.orig_map->pmap, currva, pages[lcv], 0); uvmexp.fltnomap++; + /* * Since this page isn't the page that's * actually fauling, ignore pmap_enter() * failures; it's not critical that we * enter these right now. */ + (void) pmap_enter(ufi.orig_map->pmap, currva, VM_PAGE_TO_PHYS(pages[lcv]), - enter_prot & MASK(ufi.entry), + pages[lcv]->flags & PG_RDONLY ? + VM_PROT_READ : enter_prot & MASK(ufi.entry), PMAP_CANFAIL | (wired ? PMAP_WIRED : 0)); @@ -1004,18 +995,14 @@ ReFault: * because we've held the lock the whole time * we've had the handle. */ + pages[lcv]->flags &= ~(PG_BUSY); /* un-busy! */ UVM_PAGE_OWN(pages[lcv], NULL); - - /* done! */ } /* for "lcv" loop */ } /* "gotpages" != 0 */ - /* note: object still _locked_ */ } else { - uobjpage = NULL; - } /* locked (shadowed): maps(read), amap */ @@ -1078,13 +1065,9 @@ ReFault: case VM_PAGER_REFAULT: goto ReFault; - case VM_PAGER_ERROR: - /* - * An error occurred while trying to bring in the - * page -- this is the only error we return right - * now. - */ - return (KERN_PROTECTION_FAILURE); /* XXX */ + case VM_PAGER_AGAIN: + tsleep(&lbolt, PVM, "fltagain1", 0); + goto ReFault; default: #ifdef DIAGNOSTIC @@ -1105,6 +1088,7 @@ ReFault: /* * special handling for loaned pages */ + if (anon->u.an_page->loan_count) { if ((access_type & VM_PROT_WRITE) == 0) { @@ -1198,21 +1182,13 @@ ReFault: anon = uvm_analloc(); if (anon) pg = uvm_pagealloc(NULL, 0, anon, 0); -#ifdef __GNUC__ - else - pg = NULL; /* XXX: gcc */ -#endif /* check for out of RAM */ if (anon == NULL || pg == NULL) { if (anon) uvm_anfree(anon); uvmfault_unlockall(&ufi, amap, uobj, oanon); -#ifdef DIAGNOSTIC - if (uvmexp.swpgonly > uvmexp.swpages) { - panic("uvmexp.swpgonly botch"); - } -#endif + KASSERT(uvmexp.swpgonly <= uvmexp.swpages); if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) { UVMHIST_LOG(maphist, "<- failed. out of VM",0,0,0,0); @@ -1243,7 +1219,7 @@ ReFault: */ } else { - + uvmexp.flt_anon++; oanon = anon; /* old, locked anon is same as anon */ pg = anon->u.an_page; @@ -1252,7 +1228,7 @@ ReFault: } - /* locked: maps(read), amap, anon */ + /* locked: maps(read), amap, oanon */ /* * now map the page in ... @@ -1274,10 +1250,7 @@ ReFault: * as the map may change while we're asleep. */ uvmfault_unlockall(&ufi, amap, uobj, oanon); -#ifdef DIAGNOSTIC - if (uvmexp.swpgonly > uvmexp.swpages) - panic("uvmexp.swpgonly botch"); -#endif + KASSERT(uvmexp.swpgonly <= uvmexp.swpages); if (uvmexp.swpgonly == uvmexp.swpages) { UVMHIST_LOG(maphist, "<- failed. 
out of VM",0,0,0,0); @@ -1343,7 +1316,7 @@ Case2: uobjpage = PGO_DONTCARE; promote = TRUE; /* always need anon here */ } else { - /* assert(uobjpage != PGO_DONTCARE) */ + KASSERT(uobjpage != PGO_DONTCARE); promote = (access_type & VM_PROT_WRITE) && UVM_ET_ISCOPYONWRITE(ufi.entry); } @@ -1372,24 +1345,19 @@ Case2: uvmexp.fltget++; gotpages = 1; - result = uobj->pgops->pgo_get(uobj, - (ufi.orig_rvaddr - ufi.entry->start) + ufi.entry->offset, - &uobjpage, &gotpages, 0, - access_type & MASK(ufi.entry), - ufi.entry->advice, 0); + uoff = (ufi.orig_rvaddr - ufi.entry->start) + ufi.entry->offset; + result = uobj->pgops->pgo_get(uobj, uoff, &uobjpage, &gotpages, + 0, access_type & MASK(ufi.entry), ufi.entry->advice, + PGO_SYNCIO); /* locked: uobjpage(if result OK) */ - + /* * recover from I/O */ if (result != VM_PAGER_OK) { -#ifdef DIAGNOSTIC - if (result == VM_PAGER_PEND) - panic("uvm_fault: pgo_get got PENDing " - "on non-async I/O"); -#endif + KASSERT(result != VM_PAGER_PEND); if (result == VM_PAGER_AGAIN) { UVMHIST_LOG(maphist, @@ -1448,11 +1416,8 @@ Case2: if (uobjpage->flags & PG_RELEASED) { uvmexp.fltpgrele++; -#ifdef DIAGNOSTIC - if (uobj->pgops->pgo_releasepg == NULL) - panic("uvm_fault: object has no " - "releasepg function"); -#endif + KASSERT(uobj->pgops->pgo_releasepg != NULL); + /* frees page */ if (uobj->pgops->pgo_releasepg(uobjpage,NULL)) /* unlock if still alive */ @@ -1479,7 +1444,6 @@ Case2: */ /* locked: maps(read), amap(if !null), uobj, uobjpage */ - } /* @@ -1616,10 +1580,6 @@ Case2: pg = uvm_pagealloc(NULL, 0, anon, (uobjpage == PGO_DONTCARE) ? UVM_PGA_ZERO : 0); } -#ifdef __GNUC__ - else - pg = NULL; /* XXX: gcc */ -#endif /* * out of memory resources? @@ -1635,21 +1595,15 @@ Case2: wakeup(uobjpage); uvm_lock_pageq(); - /* make sure it is in queues */ uvm_pageactivate(uobjpage); uvm_unlock_pageq(); - /* un-busy! (still locked) */ uobjpage->flags &= ~(PG_BUSY|PG_WANTED); UVM_PAGE_OWN(uobjpage, NULL); } /* unlock and fail ... */ uvmfault_unlockall(&ufi, amap, uobj, NULL); -#ifdef DIAGNOSTIC - if (uvmexp.swpgonly > uvmexp.swpages) { - panic("uvmexp.swpgonly botch"); - } -#endif + KASSERT(uvmexp.swpgonly <= uvmexp.swpages); if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) { UVMHIST_LOG(maphist, " promote: out of VM", 0,0,0,0); @@ -1659,6 +1613,7 @@ Case2: UVMHIST_LOG(maphist, " out of RAM, waiting for more", 0,0,0,0); + anon->an_ref--; uvm_anfree(anon); uvmexp.fltnoram++; uvm_wait("flt_noram5"); @@ -1684,8 +1639,8 @@ Case2: /* * dispose of uobjpage. it can't be PG_RELEASED - * since we still hold the object lock. drop - * handle to uobj as well. + * since we still hold the object lock. + * drop handle to uobj as well. */ if (uobjpage->flags & PG_WANTED) @@ -1694,10 +1649,11 @@ Case2: uobjpage->flags &= ~(PG_BUSY|PG_WANTED); UVM_PAGE_OWN(uobjpage, NULL); uvm_lock_pageq(); - uvm_pageactivate(uobjpage); /* put it back */ + uvm_pageactivate(uobjpage); uvm_unlock_pageq(); simple_unlock(&uobj->vmobjlock); uobj = NULL; + UVMHIST_LOG(maphist, " promote uobjpage 0x%x to anon/page 0x%x/0x%x", uobjpage, anon, pg, 0); @@ -1732,9 +1688,12 @@ Case2: UVMHIST_LOG(maphist, " MAPPING: case2: pm=0x%x, va=0x%x, pg=0x%x, promote=%d", ufi.orig_map->pmap, ufi.orig_rvaddr, pg, promote); + KASSERT(access_type == VM_PROT_READ || (pg->flags & PG_RDONLY) == 0); if (pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr, VM_PAGE_TO_PHYS(pg), - enter_prot, access_type | PMAP_CANFAIL | (wired ? PMAP_WIRED : 0)) + pg->flags & PG_RDONLY ? VM_PROT_READ : enter_prot, + access_type | PMAP_CANFAIL | (wired ? 
PMAP_WIRED : 0)) != KERN_SUCCESS) { + /* * No need to undo what we did; we can simply think of * this as the pmap throwing away the mapping information. @@ -1742,6 +1701,7 @@ Case2: * We do, however, have to go through the ReFault path, * as the map may change while we're asleep. */ + if (pg->flags & PG_WANTED) wakeup(pg); /* lock still held */ @@ -1753,10 +1713,7 @@ Case2: pg->flags &= ~(PG_BUSY|PG_FAKE|PG_WANTED); UVM_PAGE_OWN(pg, NULL); uvmfault_unlockall(&ufi, amap, uobj, NULL); -#ifdef DIAGNOSTIC - if (uvmexp.swpgonly > uvmexp.swpages) - panic("uvmexp.swpgonly botch"); -#endif + KASSERT(uvmexp.swpgonly <= uvmexp.swpages); if (uvmexp.swpgonly == uvmexp.swpages) { UVMHIST_LOG(maphist, "<- failed. out of VM",0,0,0,0); @@ -1788,7 +1745,6 @@ Case2: /* activate it */ uvm_pageactivate(pg); } - uvm_unlock_pageq(); if (pg->flags & PG_WANTED) @@ -1880,10 +1836,7 @@ uvm_fault_unwire_locked(map, start, end) paddr_t pa; struct vm_page *pg; -#ifdef DIAGNOSTIC - if (map->flags & VM_MAP_INTRSAFE) - panic("uvm_fault_unwire_locked: intrsafe map"); -#endif + KASSERT((map->flags & VM_MAP_INTRSAFE) == 0); /* * we assume that the area we are unwiring has actually been wired diff --git a/sys/uvm/uvm_map.c b/sys/uvm/uvm_map.c index d3bcb3561b17..3510b68581f0 100644 --- a/sys/uvm/uvm_map.c +++ b/sys/uvm/uvm_map.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_map.c,v 1.85 2000/11/25 06:27:59 chs Exp $ */ +/* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -3267,16 +3267,16 @@ uvm_object_printit(uobj, full, pr) } } -const char page_flagbits[] = - "\20\4CLEAN\5BUSY\6WANTED\7TABLED\12FAKE\13FILLED\14DIRTY\15RELEASED" - "\16FAULTING\17CLEANCHK"; -const char page_pqflagbits[] = - "\20\1FREE\2INACTIVE\3ACTIVE\4LAUNDRY\5ANON\6AOBJ"; - /* * uvm_page_printit: actually print the page */ +static const char page_flagbits[] = + "\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY" + "\11ZERO\15PAGER1"; +static const char page_pqflagbits[] = + "\20\1FREE\2INACTIVE\3ACTIVE\4LAUNDRY\5ANON\6AOBJ"; + void uvm_page_printit(pg, full, pr) struct vm_page *pg; @@ -3294,8 +3294,8 @@ uvm_page_printit(pg, full, pr) bitmask_snprintf(pg->pqflags, page_pqflagbits, pqbuf, sizeof(pqbuf)); (*pr)(" flags=%s, pqflags=%s, vers=%d, wire_count=%d, pa=0x%lx\n", pgbuf, pqbuf, pg->version, pg->wire_count, (long)pg->phys_addr); - (*pr)(" uobject=%p, uanon=%p, offset=0x%lx loan_count=%d\n", - pg->uobject, pg->uanon, pg->offset, pg->loan_count); + (*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n", + pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count); #if defined(UVM_PAGE_TRKOWN) if (pg->flags & PG_BUSY) (*pr)(" owning process = %d, tag=%s\n", diff --git a/sys/uvm/uvm_map_i.h b/sys/uvm/uvm_map_i.h index 80cc2edb648c..7c8671715c31 100644 --- a/sys/uvm/uvm_map_i.h +++ b/sys/uvm/uvm_map_i.h @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_map_i.h,v 1.17 2000/05/08 22:59:35 thorpej Exp $ */ +/* $NetBSD: uvm_map_i.h,v 1.18 2000/11/27 08:40:04 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. 
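
The fault-handler hunks above repeat one mechanical cleanup: a block of the form "#ifdef DIAGNOSTIC / if (bad) panic(...) / #endif" collapses into a single KASSERT(), which likewise only generates code in kernels built with 'options DIAGNOSTIC'. A minimal sketch of the idiom with a made-up function and condition, assuming KASSERT() and panic() are visible the way they are throughout sys/uvm:

static void
example_sanity_check(count)
	int count;
{
	/* old form: explicit conditional panic, compiled only under DIAGNOSTIC */
#ifdef DIAGNOSTIC
	if (count < 0)
		panic("example_sanity_check: negative count %d", count);
#endif

	/* form preferred by this patch: same effect, no #ifdef clutter */
	KASSERT(count >= 0);
}
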
@@ -69,8 +69,6 @@ #ifndef _UVM_UVM_MAP_I_H_ #define _UVM_UVM_MAP_I_H_ -#include "opt_uvmhist.h" - /* * uvm_map_i.h */ @@ -197,16 +195,6 @@ MAP_INLINE void uvm_map_reference(map) vm_map_t map; { - if (__predict_false(map == NULL)) { -#ifdef DIAGNOSTIC - printf("uvm_map_reference: reference to NULL map\n"); -#ifdef DDB - Debugger(); -#endif -#endif - return; - } - simple_lock(&map->ref_lock); map->ref_count++; simple_unlock(&map->ref_lock); @@ -225,20 +213,9 @@ uvm_map_deallocate(map) { int c; - if (__predict_false(map == NULL)) { -#ifdef DIAGNOSTIC - printf("uvm_map_deallocate: reference to NULL map\n"); -#ifdef DDB - Debugger(); -#endif -#endif - return; - } - simple_lock(&map->ref_lock); c = --map->ref_count; simple_unlock(&map->ref_lock); - if (c > 0) { return; } @@ -249,7 +226,6 @@ uvm_map_deallocate(map) uvm_unmap(map, map->min_offset, map->max_offset); pmap_destroy(map->pmap); - FREE(map, M_VMMAP); } diff --git a/sys/uvm/uvm_mmap.c b/sys/uvm/uvm_mmap.c index 9f378bb022c5..992c1cab6c72 100644 --- a/sys/uvm/uvm_mmap.c +++ b/sys/uvm/uvm_mmap.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_mmap.c,v 1.45 2000/11/24 23:30:01 soren Exp $ */ +/* $NetBSD: uvm_mmap.c,v 1.46 2000/11/27 08:40:04 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -262,26 +262,6 @@ sys_mincore(p, v, retval) return (error); } -#if 0 -/* - * munmapfd: unmap file descriptor - * - * XXX: is this acutally a useful function? could it be useful? - */ - -void -munmapfd(p, fd) - struct proc *p; - int fd; -{ - - /* - * XXX should vm_deallocate any regions mapped to this file - */ - p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; -} -#endif - /* * sys_mmap: mmap system call. * @@ -375,7 +355,9 @@ sys_mmap(p, v, retval) * not fixed: make sure we skip over the largest possible heap. * we will refine our guess later (e.g. to account for VAC, etc) */ - if (addr < round_page((vaddr_t)p->p_vmspace->vm_daddr+MAXDSIZ)) + + if (addr < round_page((vaddr_t)p->p_vmspace->vm_daddr + + MAXDSIZ)) addr = round_page((vaddr_t)p->p_vmspace->vm_daddr + MAXDSIZ); } @@ -1157,36 +1139,8 @@ uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit) uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ? maxprot : (maxprot & ~VM_PROT_WRITE)); - /* - * XXXCDC: hack from old code - * don't allow vnodes which have been mapped - * shared-writeable to persist [forces them to be - * flushed out when last reference goes]. - * XXXCDC: interesting side effect: avoids a bug. - * note that in WRITE [ufs_readwrite.c] that we - * allocate buffer, uncache, and then do the write. - * the problem with this is that if the uncache causes - * VM data to be flushed to the same area of the file - * we are writing to... in that case we've got the - * buffer locked and our process goes to sleep forever. - * - * XXXCDC: checking maxprot protects us from the - * "persistbug" program but this is not a long term - * solution. - * - * XXXCDC: we don't bother calling uncache with the vp - * VOP_LOCKed since we know that we are already - * holding a valid reference to the uvn (from the - * uvn_attach above), and thus it is impossible for - * the uncache to kill the uvn and trigger I/O. - */ - if (flags & MAP_SHARED) { - if ((prot & VM_PROT_WRITE) || - (maxprot & VM_PROT_WRITE)) { - uvm_vnp_uncache(vp); - } - } - + /* XXX for now, attach doesn't gain a ref */ + VREF(vp); } else { uobj = udv_attach((void *) &vp->v_rdev, (flags & MAP_SHARED) ? 
diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c index 374315066212..44a8329231d0 100644 --- a/sys/uvm/uvm_page.c +++ b/sys/uvm/uvm_page.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_page.c,v 1.43 2000/11/09 19:15:28 christos Exp $ */ +/* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -70,10 +70,13 @@ * uvm_page.c: page ops. */ +#include "opt_uvmhist.h" + #include #include #include #include +#include #define UVM_PAGE /* pull in uvm_page.h functions */ #include @@ -94,8 +97,15 @@ int vm_nphysseg = 0; /* XXXCDC: uvm.nphysseg */ * of the things necessary to do idle page zero'ing efficiently. * We therefore provide a way to disable it from machdep code here. */ +/* + * XXX disabled until we can find a way to do this without causing + * problems for either cpu caches or DMA latency. + */ +boolean_t vm_page_zero_enable = FALSE; -boolean_t vm_page_zero_enable = TRUE; +u_long uvm_pgcnt_anon; +u_long uvm_pgcnt_vnode; +extern struct uvm_pagerops uvm_vnodeops; /* * local variables @@ -123,7 +133,7 @@ static struct pglist uvm_bootbucket; */ static void uvm_pageinsert __P((struct vm_page *)); - +static void uvm_pageremove __P((struct vm_page *)); /* * inline functions @@ -160,7 +170,6 @@ uvm_pageinsert(pg) TAILQ_INSERT_TAIL(&pg->uobject->memq, pg, listq); /* put in object */ pg->flags |= PG_TABLED; pg->uobject->uo_npages++; - } /* @@ -170,21 +179,14 @@ uvm_pageinsert(pg) * => caller must lock page queues */ -void __inline +static __inline void uvm_pageremove(pg) struct vm_page *pg; { struct pglist *buck; int s; -#ifdef DIAGNOSTIC - if ((pg->flags & (PG_FAULTING)) != 0) - panic("uvm_pageremove: page is faulting"); -#endif - - if ((pg->flags & PG_TABLED) == 0) - return; /* XXX: log */ - + KASSERT(pg->flags & PG_TABLED); buck = &uvm.page_hash[uvm_pagehash(pg->uobject,pg->offset)]; s = splimp(); simple_lock(&uvm.hashlock); @@ -192,6 +194,10 @@ uvm_pageremove(pg) simple_unlock(&uvm.hashlock); splx(s); + if (pg->uobject->pgops == &uvm_vnodeops) { + uvm_pgcnt_vnode--; + } + /* object should be locked */ TAILQ_REMOVE(&pg->uobject->memq, pg, listq); @@ -199,7 +205,6 @@ uvm_pageremove(pg) pg->uobject->uo_npages--; pg->uobject = NULL; pg->version++; - } /* @@ -217,7 +222,6 @@ uvm_page_init(kvm_startp, kvm_endp) int lcv, i; paddr_t paddr; - /* * step 1: init the page queues and page queue locks */ @@ -238,7 +242,7 @@ uvm_page_init(kvm_startp, kvm_endp) */ uvm.page_nhash = 1; /* 1 bucket */ - uvm.page_hashmask = 0; /* mask for hash function */ + uvm.page_hashmask = 0; /* mask for hash function */ uvm.page_hash = &uvm_bootbucket; /* install bootstrap bucket */ TAILQ_INIT(uvm.page_hash); /* init hash table */ simple_lock_init(&uvm.hashlock); /* init hash table lock */ @@ -291,7 +295,6 @@ uvm_page_init(kvm_startp, kvm_endp) */ for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) { - n = vm_physmem[lcv].end - vm_physmem[lcv].start; if (n > pagecount) { printf("uvm_page_init: lost %ld page(s) in init\n", @@ -317,6 +320,7 @@ uvm_page_init(kvm_startp, kvm_endp) } } } + /* * step 5: pass up the values of virtual_space_start and * virtual_space_end (obtained by uvm_pageboot_alloc) to the upper @@ -327,10 +331,11 @@ uvm_page_init(kvm_startp, kvm_endp) *kvm_endp = trunc_page(virtual_space_end); /* - * step 6: init pagedaemon lock + * step 6: init locks for kernel threads */ simple_lock_init(&uvm.pagedaemon_lock); + simple_lock_init(&uvm.aiodoned_lock); /* * step 7: init reserve thresholds @@ -342,10 +347,6 @@ uvm_page_init(kvm_startp, kvm_endp) /* * step 8: 
determine if we should zero pages in the idle * loop. - * - * XXXJRT - might consider zero'ing up to the target *now*, - * but that could take an awfully long time if you - * have a lot of memory. */ uvm.page_idle_zero = vm_page_zero_enable; @@ -360,7 +361,6 @@ uvm_page_init(kvm_startp, kvm_endp) * uvm_setpagesize: set the page size * * => sets page_shift and page_mask from uvmexp.pagesize. - * => XXXCDC: move global vars. */ void @@ -889,22 +889,20 @@ uvm_pagealloc_strat(obj, off, anon, flags, strat, free_list) struct pgfreelist *pgfl; boolean_t use_reserve; -#ifdef DIAGNOSTIC - /* sanity check */ - if (obj && anon) - panic("uvm_pagealloc: obj and anon != NULL"); -#endif - - s = uvm_lock_fpageq(); /* lock free page queue */ + KASSERT(obj == NULL || anon == NULL); + KASSERT(off == trunc_page(off)); + s = uvm_lock_fpageq(); /* * check to see if we need to generate some free pages waking * the pagedaemon. */ - if (uvmexp.free < uvmexp.freemin || (uvmexp.free < uvmexp.freetarg && - uvmexp.inactive < uvmexp.inactarg)) + if (uvmexp.free + uvmexp.paging < uvmexp.freemin || + (uvmexp.free + uvmexp.paging < uvmexp.freetarg && + uvmexp.inactive < uvmexp.inactarg)) { wakeup(&uvm.pagedaemon); + } /* * fail if any of these conditions is true: @@ -957,11 +955,7 @@ uvm_pagealloc_strat(obj, off, anon, flags, strat, free_list) case UVM_PGA_STRAT_ONLY: case UVM_PGA_STRAT_FALLBACK: /* Attempt to allocate from the specified free list. */ -#ifdef DIAGNOSTIC - if (free_list >= VM_NFREELIST || free_list < 0) - panic("uvm_pagealloc_strat: bad free list %d", - free_list); -#endif + KASSERT(free_list >= 0 && free_list < VM_NFREELIST); pgfl = &uvm.page_free[free_list]; if ((pg = TAILQ_FIRST((freeq = &pgfl->pgfl_queues[try1]))) != NULL || @@ -1012,11 +1006,10 @@ uvm_pagealloc_strat(obj, off, anon, flags, strat, free_list) pg->uanon = anon; pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE; pg->version++; - pg->wire_count = 0; - pg->loan_count = 0; if (anon) { anon->u.an_page = pg; pg->pqflags = PQ_ANON; + uvm_pgcnt_anon++; } else { if (obj) uvm_pageinsert(pg); @@ -1074,8 +1067,6 @@ uvm_pagerealloc(pg, newobj, newoff) pg->version++; uvm_pageinsert(pg); } - - return; } @@ -1089,14 +1080,20 @@ uvm_pagerealloc(pg, newobj, newoff) * => assumes all valid mappings of pg are gone */ -void uvm_pagefree(pg) - -struct vm_page *pg; - +void +uvm_pagefree(pg) + struct vm_page *pg; { int s; int saved_loan_count = pg->loan_count; +#ifdef DEBUG + if (pg->uobject == (void *)0xdeadbeef && + pg->uanon == (void *)0xdeadbeef) { + panic("uvm_pagefree: freeing free page %p\n", pg); + } +#endif + /* * if the page was an object page (and thus "TABLED"), remove it * from the object. @@ -1105,7 +1102,7 @@ struct vm_page *pg; if (pg->flags & PG_TABLED) { /* - * if the object page is on loan we are going to drop ownership. + * if the object page is on loan we are going to drop ownership. * it is possible that an anon will take over as owner for this * page later on. the anon will want a !PG_CLEAN page so that * it knows it needs to allocate swap if it wants to page the @@ -1114,7 +1111,6 @@ struct vm_page *pg; if (saved_loan_count) pg->flags &= ~PG_CLEAN; /* in case an anon takes over */ - uvm_pageremove(pg); /* @@ -1125,9 +1121,9 @@ struct vm_page *pg; * return (when the last loan is dropped, then the page can be * freed by whatever was holding the last loan). 
*/ + if (saved_loan_count) return; - } else if (saved_loan_count && (pg->pqflags & PQ_ANON)) { /* @@ -1137,19 +1133,12 @@ struct vm_page *pg; * note that the kernel can't change the loan status of our * page as long as we are holding PQ lock. */ + pg->pqflags &= ~PQ_ANON; pg->uanon = NULL; return; } - -#ifdef DIAGNOSTIC - if (saved_loan_count) { - printf("uvm_pagefree: warning: freeing page with a loan " - "count of %d\n", saved_loan_count); - panic("uvm_pagefree: loan count"); - } -#endif - + KASSERT(saved_loan_count == 0); /* * now remove the page from the queues @@ -1172,13 +1161,17 @@ struct vm_page *pg; /* * if the page was wired, unwire it now. */ + if (pg->wire_count) { pg->wire_count = 0; uvmexp.wired--; } + if (pg->uanon) { + uvm_pgcnt_anon--; + } /* - * and put on free queue + * and put on free queue */ pg->flags &= ~PG_ZERO; @@ -1200,6 +1193,51 @@ struct vm_page *pg; uvm_unlock_fpageq(s); } +/* + * uvm_page_unbusy: unbusy an array of pages. + * + * => pages must either all belong to the same object, or all belong to anons. + * => if pages are object-owned, object must be locked. + * => if pages are anon-owned, anons must be unlockd and have 0 refcount. + */ + +void +uvm_page_unbusy(pgs, npgs) + struct vm_page **pgs; + int npgs; +{ + struct vm_page *pg; + struct uvm_object *uobj; + int i; + UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist); + + for (i = 0; i < npgs; i++) { + pg = pgs[i]; + + if (pg == NULL) { + continue; + } + if (pg->flags & PG_WANTED) { + wakeup(pg); + } + if (pg->flags & PG_RELEASED) { + UVMHIST_LOG(ubchist, "releasing pg %p", pg,0,0,0); + uobj = pg->uobject; + if (uobj != NULL) { + uobj->pgops->pgo_releasepg(pg, NULL); + } else { + pg->flags &= ~(PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + uvm_anfree(pg->uanon); + } + } else { + UVMHIST_LOG(ubchist, "unbusying pg %p", pg,0,0,0); + pg->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + } + } +} + #if defined(UVM_PAGE_TRKOWN) /* * uvm_page_own: set or release page ownership diff --git a/sys/uvm/uvm_page.h b/sys/uvm/uvm_page.h index 338496b2a48f..0b365c7b8700 100644 --- a/sys/uvm/uvm_page.h +++ b/sys/uvm/uvm_page.h @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_page.h,v 1.17 2000/10/03 20:50:49 mrg Exp $ */ +/* $NetBSD: uvm_page.h,v 1.18 2000/11/27 08:40:05 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. 
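
The new uvm_page_unbusy() collects the wakeup/release/unbusy steps that callers used to open-code after an I/O completes. A hedged sketch of a caller for the object-owned case, following the contract stated in the comment above (all pages belong to one object and that object is locked); the function and variable names here are illustrative only:

static void
example_io_done(uobj, pgs, npages)
	struct uvm_object *uobj;
	struct vm_page **pgs;
	int npages;
{
	simple_lock(&uobj->vmobjlock);	/* object-owned pages: hold the object lock */

	/*
	 * wakes any PG_WANTED sleepers, disposes of PG_RELEASED pages via
	 * pgo_releasepg(), and clears PG_BUSY on everything else.
	 */
	uvm_page_unbusy(pgs, npages);

	simple_unlock(&uobj->vmobjlock);
}
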
@@ -118,27 +118,27 @@ #include struct vm_page { - TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO - * queue or free list (P) */ - TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ - TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ + TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO + * queue or free list (P) */ + TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ + TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ - struct vm_anon *uanon; /* anon (O,P) */ - struct uvm_object *uobject; /* object (O,P) */ - voff_t offset; /* offset into object (O,P) */ + struct vm_anon *uanon; /* anon (O,P) */ + struct uvm_object *uobject; /* object (O,P) */ + voff_t offset; /* offset into object (O,P) */ - u_short flags; /* object flags [O] */ - u_short version; /* version count [O] */ - u_short wire_count; /* wired down map refs [P] */ - u_short pqflags; /* page queue flags [P] */ - u_int loan_count; /* number of active loans - * to read: [O or P] - * to modify: [O _and_ P] */ - paddr_t phys_addr; /* physical address of page */ + u_short flags; /* object flags [O] */ + u_short version; /* version count [O] */ + u_short wire_count; /* wired down map refs [P] */ + u_short pqflags; /* page queue flags [P] */ + u_int loan_count; /* number of active loans + * to read: [O or P] + * to modify: [O _and_ P] */ + paddr_t phys_addr; /* physical address of page */ #if defined(UVM_PAGE_TRKOWN) - /* debugging fields to track page ownership */ - pid_t owner; /* proc that set PG_BUSY */ - char *owner_tag; /* why it was set busy */ + /* debugging fields to track page ownership */ + pid_t owner; /* proc that set PG_BUSY */ + char *owner_tag; /* why it was set busy */ #endif }; @@ -157,25 +157,23 @@ struct vm_page { * PG_ZERO is used to indicate that a page has been pre-zero'd. This flag * is only set when the page is on no queues, and is cleared when the page * is placed on the free list. - * - * possible deadwood: PG_FAULTING, PQ_LAUNDRY */ + +#define PG_BUSY 0x0001 /* page is locked */ +#define PG_WANTED 0x0002 /* someone is waiting for page */ +#define PG_TABLED 0x0004 /* page is in VP table */ #define PG_CLEAN 0x0008 /* page has not been modified */ -#define PG_BUSY 0x0010 /* page is in transit */ -#define PG_WANTED 0x0020 /* someone is waiting for page */ -#define PG_TABLED 0x0040 /* page is in VP table */ -#define PG_ZERO 0x0100 /* page is pre-zero'd */ -#define PG_FAKE 0x0200 /* page is placeholder for pagein */ -#define PG_FILLED 0x0400 /* client flag to set when filled */ -#define PG_DIRTY 0x0800 /* client flag to set when dirty */ -#define PG_RELEASED 0x1000 /* page released while paging */ -#define PG_FAULTING 0x2000 /* page is being faulted in */ -#define PG_CLEANCHK 0x4000 /* clean bit has been checked */ +#define PG_CLEANCHK 0x0010 /* clean bit has been checked */ +#define PG_RELEASED 0x0020 /* page released while paging */ +#define PG_FAKE 0x0040 /* page is not yet initialized */ +#define PG_RDONLY 0x0080 /* page must be mapped read-only */ +#define PG_ZERO 0x0100 /* page is pre-zero'd */ + +#define PG_PAGER1 0x1000 /* pager-specific flag */ #define PQ_FREE 0x0001 /* page is on free list */ #define PQ_INACTIVE 0x0002 /* page is in inactive list */ #define PQ_ACTIVE 0x0004 /* page is in active list */ -#define PQ_LAUNDRY 0x0008 /* page is being cleaned now */ #define PQ_ANON 0x0010 /* page is part of an anon, rather than an uvm_object */ #define PQ_AOBJ 0x0020 /* page is part of an anonymous @@ -237,12 +235,9 @@ extern boolean_t vm_page_zero_enable; * ordered, in LRU-like fashion. 
*/ -extern -struct pglist vm_page_queue_free; /* memory free queue */ -extern -struct pglist vm_page_queue_active; /* active memory queue */ -extern -struct pglist vm_page_queue_inactive; /* inactive memory queue */ +extern struct pglist vm_page_queue_free; /* memory free queue */ +extern struct pglist vm_page_queue_active; /* active memory queue */ +extern struct pglist vm_page_queue_inactive; /* inactive memory queue */ /* * physical memory config is stored in vm_physmem. @@ -283,9 +278,8 @@ vaddr_t uvm_pageboot_alloc __P((vsize_t)); PAGE_INLINE void uvm_pagecopy __P((struct vm_page *, struct vm_page *)); PAGE_INLINE void uvm_pagedeactivate __P((struct vm_page *)); void uvm_pagefree __P((struct vm_page *)); +void uvm_page_unbusy __P((struct vm_page **, int)); PAGE_INLINE struct vm_page *uvm_pagelookup __P((struct uvm_object *, voff_t)); -void uvm_pageremove __P((struct vm_page *)); -/* uvm_pagerename: not needed */ PAGE_INLINE void uvm_pageunwire __P((struct vm_page *)); PAGE_INLINE void uvm_pagewait __P((struct vm_page *, int)); PAGE_INLINE void uvm_pagewake __P((struct vm_page *)); diff --git a/sys/uvm/uvm_pager.c b/sys/uvm/uvm_pager.c index 3c8ca703ad89..0d7a466da2ed 100644 --- a/sys/uvm/uvm_pager.c +++ b/sys/uvm/uvm_pager.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_pager.c,v 1.34 2000/11/24 22:41:39 chs Exp $ */ +/* $NetBSD: uvm_pager.c,v 1.35 2000/11/27 08:40:05 chs Exp $ */ /* * @@ -44,21 +44,27 @@ #include #include #include +#include +#include #define UVM_PAGER #include +struct pool *uvm_aiobuf_pool; + /* * list of uvm pagers in the system */ extern struct uvm_pagerops uvm_deviceops; extern struct uvm_pagerops uvm_vnodeops; +extern struct uvm_pagerops ubc_pager; struct uvm_pagerops *uvmpagerops[] = { &aobj_pager, &uvm_deviceops, &uvm_vnodeops, + &ubc_pager, }; /* @@ -68,7 +74,8 @@ struct uvm_pagerops *uvmpagerops[] = { vm_map_t pager_map; /* XXX */ simple_lock_data_t pager_map_wanted_lock; boolean_t pager_map_wanted; /* locked by pager map */ - +static vaddr_t emergva; +static boolean_t emerginuse; /* * uvm_pager_init: init pagers (at boot time) @@ -83,10 +90,12 @@ uvm_pager_init() * init pager map */ - pager_map = uvm_km_suballoc(kernel_map, &uvm.pager_sva, &uvm.pager_eva, - PAGER_MAP_SIZE, 0, FALSE, NULL); - simple_lock_init(&pager_map_wanted_lock); - pager_map_wanted = FALSE; + pager_map = uvm_km_suballoc(kernel_map, &uvm.pager_sva, &uvm.pager_eva, + PAGER_MAP_SIZE, 0, FALSE, NULL); + simple_lock_init(&pager_map_wanted_lock); + pager_map_wanted = FALSE; + emergva = uvm_km_valloc(kernel_map, MAXBSIZE); + emerginuse = FALSE; /* * init ASYNC I/O queue @@ -112,22 +121,19 @@ uvm_pager_init() */ vaddr_t -uvm_pagermapin(pps, npages, aiop, flags) +uvm_pagermapin(pps, npages, flags) struct vm_page **pps; int npages; - struct uvm_aiodesc **aiop; /* OUT */ int flags; { vsize_t size; vaddr_t kva; - struct uvm_aiodesc *aio; vaddr_t cva; struct vm_page *pp; vm_prot_t prot; UVMHIST_FUNC("uvm_pagermapin"); UVMHIST_CALLED(maphist); - UVMHIST_LOG(maphist,"(pps=0x%x, npages=%d, aiop=0x%x, flags=0x%x)", - pps, npages, aiop, flags); + UVMHIST_LOG(maphist,"(pps=0x%x, npages=%d)", pps, npages,0,0); /* * compute protection. 
outgoing I/O only needs read @@ -139,24 +145,26 @@ uvm_pagermapin(pps, npages, aiop, flags) prot |= VM_PROT_WRITE; ReStart: - if (aiop) { - MALLOC(aio, struct uvm_aiodesc *, sizeof(*aio), M_TEMP, - (flags & UVMPAGER_MAPIN_WAITOK)); - if (aio == NULL) - return(0); - *aiop = aio; - } else { - aio = NULL; - } - size = npages << PAGE_SHIFT; kva = 0; /* let system choose VA */ if (uvm_map(pager_map, &kva, size, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != KERN_SUCCESS) { + if (curproc == uvm.pagedaemon_proc) { + simple_lock(&pager_map_wanted_lock); + if (emerginuse) { + UVM_UNLOCK_AND_WAIT(&emergva, + &pager_map_wanted_lock, FALSE, + "emergva", 0); + goto ReStart; + } + emerginuse = TRUE; + simple_unlock(&pager_map_wanted_lock); + kva = emergva; + KASSERT(npages <= MAXBSIZE >> PAGE_SHIFT); + goto enter; + } if ((flags & UVMPAGER_MAPIN_WAITOK) == 0) { - if (aio) - FREE(aio, M_TEMP); UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0); return(0); } @@ -164,16 +172,17 @@ ReStart: pager_map_wanted = TRUE; UVMHIST_LOG(maphist, " SLEEPING on pager_map",0,0,0,0); UVM_UNLOCK_AND_WAIT(pager_map, &pager_map_wanted_lock, FALSE, - "pager_map",0); + "pager_map", 0); goto ReStart; } +enter: /* got it */ for (cva = kva ; size != 0 ; size -= PAGE_SIZE, cva += PAGE_SIZE) { pp = *pps++; #ifdef DEBUG if ((pp->flags & PG_BUSY) == 0) - panic("uvm_pagermapin: page not busy"); + panic("uvm_pagermapin: pg %p not busy", pp); #endif pmap_enter(vm_map_pmap(pager_map), cva, VM_PAGE_TO_PHYS(pp), prot, PMAP_WIRED | prot); @@ -198,13 +207,22 @@ uvm_pagermapout(kva, npages) vsize_t size = npages << PAGE_SHIFT; vm_map_entry_t entries; UVMHIST_FUNC("uvm_pagermapout"); UVMHIST_CALLED(maphist); - + UVMHIST_LOG(maphist, " (kva=0x%x, npages=%d)", kva, npages,0,0); /* * duplicate uvm_unmap, but add in pager_map_wanted handling. */ + if (kva == emergva) { + simple_lock(&pager_map_wanted_lock); + emerginuse = FALSE; + wakeup(&emergva); + simple_unlock(&pager_map_wanted_lock); + entries = NULL; + goto remove; + } + vm_map_lock(pager_map); (void) uvm_unmap_remove(pager_map, kva, kva + size, &entries); simple_lock(&pager_map_wanted_lock); @@ -214,6 +232,8 @@ uvm_pagermapout(kva, npages) } simple_unlock(&pager_map_wanted_lock); vm_map_unlock(pager_map); +remove: + pmap_remove(pmap_kernel(), kva, kva + (npages << PAGE_SHIFT)); if (entries) uvm_unmap_detach(entries, 0); @@ -251,7 +271,7 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi) { struct vm_page **ppsp, *pclust; voff_t lo, hi, curoff; - int center_idx, forward; + int center_idx, forward, incr; UVMHIST_FUNC("uvm_mk_pcluster"); UVMHIST_CALLED(maphist); /* @@ -273,9 +293,11 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi) if (hi > mhi) hi = mhi; } - if ((hi - lo) >> PAGE_SHIFT > *npages) { /* pps too small, bail out! */ + if ((hi - lo) >> PAGE_SHIFT > *npages) { /* pps too small, bail out! */ #ifdef DIAGNOSTIC - printf("uvm_mk_pcluster: provided page array too small (fixed)\n"); + printf("uvm_mk_pcluster uobj %p npages %d lo 0x%llx hi 0x%llx " + "flags 0x%x\n", uobj, *npages, (long long)lo, + (long long)hi, flags); #endif pps[0] = center; *npages = 1; @@ -291,7 +313,7 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi) pps[center_idx] = center; /* plug in the center page */ ppsp = &pps[center_idx]; *npages = 1; - + /* * attempt to cluster around the left [backward], and then * the right side [forward]. 
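
With the uvm_aiodesc argument gone, uvm_pagermapin() now takes just the page array, the page count and the mapin flags, and uvm_pagermapout() undoes the mapping. A rough sketch of the pairing as a pager might use it (error paths and the actual I/O omitted); UVMPAGER_MAPIN_WAITOK is the flag shown in uvm_pager.h below, everything else here is illustrative:

	vaddr_t kva;

	/* map the busy pages contiguously into pager_map, sleeping if needed */
	kva = uvm_pagermapin(pps, npages, UVMPAGER_MAPIN_WAITOK);

	/* ... issue the read or write against [kva, kva + (npages << PAGE_SHIFT)) ... */

	/* tear the temporary mapping down again */
	uvm_pagermapout(kva, npages);
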
@@ -303,21 +325,23 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi) */ for (forward = 0 ; forward <= 1 ; forward++) { - - curoff = center->offset + (forward ? PAGE_SIZE : -PAGE_SIZE); + incr = forward ? PAGE_SIZE : -PAGE_SIZE; + curoff = center->offset + incr; for ( ;(forward == 0 && curoff >= lo) || (forward && curoff < hi); - curoff += (forward ? 1 : -1) << PAGE_SHIFT) { + curoff += incr) { pclust = uvm_pagelookup(uobj, curoff); /* lookup page */ - if (pclust == NULL) + if (pclust == NULL) { break; /* no page */ + } /* handle active pages */ /* NOTE: inactive pages don't have pmap mappings */ if ((pclust->pqflags & PQ_INACTIVE) == 0) { - if ((flags & PGO_DOACTCLUST) == 0) + if ((flags & PGO_DOACTCLUST) == 0) { /* dont want mapped pages at all */ break; + } /* make sure "clean" bit is sync'd */ if ((pclust->flags & PG_CLEANCHK) == 0) { @@ -330,13 +354,16 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi) pclust->flags |= PG_CLEANCHK; } } + /* is page available for cleaning and does it need it */ - if ((pclust->flags & (PG_CLEAN|PG_BUSY)) != 0) + if ((pclust->flags & (PG_CLEAN|PG_BUSY)) != 0) { break; /* page is already clean or is busy */ + } /* yes! enroll the page in our array */ pclust->flags |= PG_BUSY; /* busy! */ UVM_PAGE_OWN(pclust, "uvm_mk_pcluster"); + /* XXX: protect wired page? see above comment. */ pmap_page_protect(pclust, VM_PROT_READ); if (!forward) { @@ -346,7 +373,7 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi) /* move forward one page */ ppsp[*npages] = pclust; } - *npages = *npages + 1; + (*npages)++; } } @@ -409,6 +436,7 @@ uvm_pager_put(uobj, pg, ppsp_ptr, npages, flags, start, stop) int result; daddr_t swblk; struct vm_page **ppsp = *ppsp_ptr; + UVMHIST_FUNC("uvm_pager_put"); UVMHIST_CALLED(ubchist); /* * note that uobj is null if we are doing a swap-backed pageout. @@ -459,12 +487,12 @@ uvm_pager_put(uobj, pg, ppsp_ptr, npages, flags, start, stop) ReTry: if (uobj) { /* object is locked */ - result = uobj->pgops->pgo_put(uobj, ppsp, *npages, - flags & PGO_SYNCIO); + result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags); + UVMHIST_LOG(ubchist, "put -> %d", result, 0,0,0); /* object is now unlocked */ } else { /* nothing locked */ - result = uvm_swap_put(swblk, ppsp, *npages, flags & PGO_SYNCIO); + result = uvm_swap_put(swblk, ppsp, *npages, flags); /* nothing locked */ } @@ -565,7 +593,7 @@ ReTry: /* * a pager error occured (even after dropping the cluster, if there - * was one). give up! the caller only has one page ("pg") + * was one). give up! the caller only has one page ("pg") * to worry about. */ @@ -610,7 +638,8 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags) for (lcv = 0 ; lcv < *npages ; lcv++) { - if (ppsp[lcv] == pg) /* skip "pg" */ + /* skip "pg" or empty slot */ + if (ppsp[lcv] == pg || ppsp[lcv] == NULL) continue; /* @@ -637,9 +666,10 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags) } /* did someone want the page while we had it busy-locked? */ - if (ppsp[lcv]->flags & PG_WANTED) + if (ppsp[lcv]->flags & PG_WANTED) { /* still holding obj lock */ wakeup(ppsp[lcv]); + } /* if page was released, release it. 
otherwise un-busy it */ if (ppsp[lcv]->flags & PG_RELEASED) { @@ -690,7 +720,7 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags) continue; /* next page */ } else { - ppsp[lcv]->flags &= ~(PG_BUSY|PG_WANTED); + ppsp[lcv]->flags &= ~(PG_BUSY|PG_WANTED|PG_FAKE); UVM_PAGE_OWN(ppsp[lcv], NULL); } @@ -713,3 +743,167 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags) } } } + +/* + * interrupt-context iodone handler for nested i/o bufs. + * + * => must be at splbio(). + */ + +void +uvm_aio_biodone1(bp) + struct buf *bp; +{ + struct buf *mbp = bp->b_private; + + KASSERT(mbp != bp); + if (bp->b_flags & B_ERROR) { + mbp->b_flags |= B_ERROR; + mbp->b_error = bp->b_error; + } + mbp->b_resid -= bp->b_bcount; + pool_put(&bufpool, bp); + if (mbp->b_resid == 0) { + biodone(mbp); + } +} + +/* + * interrupt-context iodone handler for single-buf i/os + * or the top-level buf of a nested-buf i/o. + * + * => must be at splbio(). + */ + +void +uvm_aio_biodone(bp) + struct buf *bp; +{ + /* reset b_iodone for when this is a single-buf i/o. */ + bp->b_iodone = uvm_aio_aiodone; + + simple_lock(&uvm.aiodoned_lock); /* locks uvm.aio_done */ + TAILQ_INSERT_TAIL(&uvm.aio_done, bp, b_freelist); + wakeup(&uvm.aiodoned); + simple_unlock(&uvm.aiodoned_lock); +} + +/* + * uvm_aio_aiodone: do iodone processing for async i/os. + * this should be called in thread context, not interrupt context. + */ + +void +uvm_aio_aiodone(bp) + struct buf *bp; +{ + int npages = bp->b_bufsize >> PAGE_SHIFT; + struct vm_page *pg, *pgs[npages]; + struct uvm_object *uobj; + int s, i; + boolean_t release, write, swap; + UVMHIST_FUNC("uvm_aio_aiodone"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "bp %p", bp, 0,0,0); + + release = (bp->b_flags & (B_ERROR|B_READ)) == (B_ERROR|B_READ); + write = (bp->b_flags & B_READ) == 0; + /* XXXUBC B_NOCACHE is for swap pager, should be done differently */ + if (write && !(bp->b_flags & B_NOCACHE)) { + /* XXXUBC */ + void softdep_pageiodone(struct buf *); + softdep_pageiodone(bp); + } + + uobj = NULL; + for (i = 0; i < npages; i++) { + pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT)); + UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0); + } + uvm_pagermapout((vaddr_t)bp->b_data, npages); + for (i = 0; i < npages; i++) { + pg = pgs[i]; + + if (i == 0) { + swap = (pg->pqflags & PQ_SWAPBACKED) != 0; + if (!swap) { + uobj = pg->uobject; + simple_lock(&uobj->vmobjlock); + } + } + KASSERT(swap || pg->uobject == uobj); + if (swap) { + if (pg->pqflags & PQ_ANON) { + simple_lock(&pg->uanon->an_lock); + } else { + simple_lock(&pg->uobject->vmobjlock); + } + } + + /* + * if this is a read and we got an error, mark the pages + * PG_RELEASED so that uvm_page_unbusy() will free them. + */ + + if (release) { + pg->flags |= PG_RELEASED; + continue; + } + KASSERT(!write || (pgs[i]->flags & PG_FAKE) == 0); + + /* + * if this is a read and the page is PG_FAKE + * or this was a write, mark the page PG_CLEAN and not PG_FAKE. + */ + + if (pgs[i]->flags & PG_FAKE || write) { + pmap_clear_reference(pgs[i]); + pmap_clear_modify(pgs[i]); + pgs[i]->flags |= PG_CLEAN; + pgs[i]->flags &= ~PG_FAKE; + } + if (swap) { + if (pg->pqflags & PQ_ANON) { + simple_unlock(&pg->uanon->an_lock); + } else { + simple_unlock(&pg->uobject->vmobjlock); + } + } + } + uvm_page_unbusy(pgs, npages); + if (!swap) { + simple_unlock(&uobj->vmobjlock); + } + + s = splbio(); + if (write && (bp->b_flags & B_AGE) != 0) { + vwakeup(bp); + } + pool_put(&bufpool, bp); + splx(s); +} + +/* + * translate unix errno values to VM_PAGER_*. 
+ */ + +int +uvm_errno2vmerror(errno) + int errno; +{ + switch (errno) { + case 0: + return VM_PAGER_OK; + case EINVAL: + return VM_PAGER_BAD; + case EINPROGRESS: + return VM_PAGER_PEND; + case EIO: + return VM_PAGER_ERROR; + case EAGAIN: + return VM_PAGER_AGAIN; + case EBUSY: + return VM_PAGER_UNLOCK; + default: + return VM_PAGER_ERROR; + } +} diff --git a/sys/uvm/uvm_pager.h b/sys/uvm/uvm_pager.h index 65427b255e7a..c56ef008aa1f 100644 --- a/sys/uvm/uvm_pager.h +++ b/sys/uvm/uvm_pager.h @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_pager.h,v 1.19 2000/11/27 08:19:51 chs Exp $ */ +/* $NetBSD: uvm_pager.h,v 1.20 2000/11/27 08:40:05 chs Exp $ */ /* * @@ -81,21 +81,6 @@ * uvm_pager.h */ -/* - * async pager i/o descriptor structure - */ - -TAILQ_HEAD(uvm_aiohead, uvm_aiodesc); - -struct uvm_aiodesc { - void (*aiodone) __P((struct uvm_aiodesc *)); - /* aio done function */ - vaddr_t kva; /* KVA of mapped page(s) */ - int npages; /* # of pages in I/O req */ - void *pd_ptr; /* pager-dependent pointer */ - TAILQ_ENTRY(uvm_aiodesc) aioq; /* linked list of aio's */ -}; - /* * pager ops */ @@ -132,22 +117,22 @@ struct uvm_pagerops { /* pager flags [mostly for flush] */ #define PGO_CLEANIT 0x001 /* write dirty pages to backing store */ -#define PGO_SYNCIO 0x002 /* if PGO_CLEAN: use sync I/O? */ -/* - * obviously if neither PGO_INVALIDATE or PGO_FREE are set then the pages - * stay where they are. - */ +#define PGO_SYNCIO 0x002 /* if PGO_CLEANIT: use sync I/O? */ #define PGO_DEACTIVATE 0x004 /* deactivate flushed pages */ #define PGO_FREE 0x008 /* free flushed pages */ +/* if PGO_FREE is not set then the pages stay where they are. */ #define PGO_ALLPAGES 0x010 /* flush whole object/get all pages */ #define PGO_DOACTCLUST 0x020 /* flag to mk_pcluster to include active */ #define PGO_LOCKED 0x040 /* fault data structures are locked [get] */ #define PGO_PDFREECLUST 0x080 /* daemon's free cluster flag [uvm_pager_put] */ #define PGO_REALLOCSWAP 0x100 /* reallocate swap area [pager_dropcluster] */ +#define PGO_OVERWRITE 0x200 /* pages will be overwritten before unlocked */ +#define PGO_WEAK 0x400 /* "weak" put, for nfs */ +#define PGO_PASTEOF 0x800 /* allow allocation of pages past EOF */ /* page we are not interested in getting */ -#define PGO_DONTCARE ((struct vm_page *) -1) /* [get only] */ +#define PGO_DONTCARE ((struct vm_page *) -1L) /* [get only] */ #ifdef _KERNEL @@ -175,12 +160,12 @@ int uvm_pager_put __P((struct uvm_object *, struct vm_page *, PAGER_INLINE struct vm_page *uvm_pageratop __P((vaddr_t)); -vaddr_t uvm_pagermapin __P((struct vm_page **, int, - struct uvm_aiodesc **, int)); +vaddr_t uvm_pagermapin __P((struct vm_page **, int, int)); void uvm_pagermapout __P((vaddr_t, int)); struct vm_page **uvm_mk_pcluster __P((struct uvm_object *, struct vm_page **, int *, struct vm_page *, int, voff_t, voff_t)); +int uvm_errno2vmerror __P((int)); /* Flags to uvm_pagermapin() */ #define UVMPAGER_MAPIN_WAITOK 0x01 /* it's okay to wait */ diff --git a/sys/uvm/uvm_pdaemon.c b/sys/uvm/uvm_pdaemon.c index 82d888ea0ae9..dd2deb8ce895 100644 --- a/sys/uvm/uvm_pdaemon.c +++ b/sys/uvm/uvm_pdaemon.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_pdaemon.c,v 1.23 2000/08/20 10:24:14 bjh21 Exp $ */ +/* $NetBSD: uvm_pdaemon.c,v 1.24 2000/11/27 08:40:05 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. 
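
The new uvm_errno2vmerror() gives pagers one place to translate an errno from the I/O path into a VM_PAGER_* return code. A hedged sketch of the intended use; the VOP_READ() call and the names here are illustrative, not taken from this patch:

static int
example_pager_io(vp, uio, cred)
	struct vnode *vp;
	struct uio *uio;
	struct ucred *cred;
{
	int error;

	error = VOP_READ(vp, uio, 0, cred);	/* any errno-returning I/O path */

	/* 0 -> VM_PAGER_OK, EIO -> VM_PAGER_ERROR, EAGAIN -> VM_PAGER_AGAIN, ... */
	return (uvm_errno2vmerror(error));
}
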
@@ -77,9 +77,13 @@ #include #include #include +#include #include +extern u_long uvm_pgcnt_vnode; +extern struct uvm_pagerops uvm_vnodeops; + /* * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedeamon will reactivate * in a pass thru the inactive list when swap is full. the value should be @@ -194,10 +198,8 @@ void uvm_pageout(void *arg) { int npages = 0; - int s; - struct uvm_aiodesc *aio, *nextaio; UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist); - + UVMHIST_LOG(pdhist,"", 0, 0, 0, 0); /* @@ -214,7 +216,82 @@ uvm_pageout(void *arg) /* * main loop */ - while (TRUE) { + + for (;;) { + simple_lock(&uvm.pagedaemon_lock); + + UVMHIST_LOG(pdhist," <>",0,0,0,0); + UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon, + &uvm.pagedaemon_lock, FALSE, "pgdaemon", 0); + uvmexp.pdwoke++; + UVMHIST_LOG(pdhist," <>",0,0,0,0); + + /* drain pool resources */ + pool_drain(0); + + /* + * now lock page queues and recompute inactive count + */ + + uvm_lock_pageq(); + if (npages != uvmexp.npages) { /* check for new pages? */ + npages = uvmexp.npages; + uvmpd_tune(); + } + + uvmexp.inactarg = (uvmexp.active + uvmexp.inactive) / 3; + if (uvmexp.inactarg <= uvmexp.freetarg) { + uvmexp.inactarg = uvmexp.freetarg + 1; + } + + UVMHIST_LOG(pdhist," free/ftarg=%d/%d, inact/itarg=%d/%d", + uvmexp.free, uvmexp.freetarg, uvmexp.inactive, + uvmexp.inactarg); + + /* + * scan if needed + */ + + if (uvmexp.free + uvmexp.paging < uvmexp.freetarg || + uvmexp.inactive < uvmexp.inactarg || + uvm_pgcnt_vnode > + (uvmexp.active + uvmexp.inactive + uvmexp.wired + + uvmexp.free) * 13 / 16) { + uvmpd_scan(); + } + + /* + * if there's any free memory to be had, + * wake up any waiters. + */ + + if (uvmexp.free > uvmexp.reserve_kernel || + uvmexp.paging == 0) { + wakeup(&uvmexp.free); + } + + /* + * scan done. unlock page queues (the only lock we are holding) + */ + + uvm_unlock_pageq(); + } + /*NOTREACHED*/ +} + + +/* + * uvm_aiodone_daemon: main loop for the aiodone daemon. + */ + +void +uvm_aiodone_daemon(void *arg) +{ + int s, free; + struct buf *bp, *nbp; + UVMHIST_FUNC("uvm_aiodoned"); UVMHIST_CALLED(pdhist); + + for (;;) { /* * carefully attempt to go to sleep (without losing "wakeups"!). @@ -223,95 +300,58 @@ uvm_pageout(void *arg) */ s = splbio(); - simple_lock(&uvm.pagedaemon_lock); - - /* - * if we've got done aio's, then bypass the sleep - */ - - if (uvm.aio_done.tqh_first == NULL) { - UVMHIST_LOG(maphist," <>",0,0,0,0); - UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon, - &uvm.pagedaemon_lock, FALSE, "daemon_slp", 0); - uvmexp.pdwoke++; + simple_lock(&uvm.aiodoned_lock); + if (TAILQ_FIRST(&uvm.aio_done) == NULL) { + UVMHIST_LOG(pdhist," <>",0,0,0,0); + UVM_UNLOCK_AND_WAIT(&uvm.aiodoned, + &uvm.aiodoned_lock, FALSE, "aiodoned", 0); UVMHIST_LOG(pdhist," <>",0,0,0,0); - /* relock pagedaemon_lock, still at splbio */ - simple_lock(&uvm.pagedaemon_lock); + /* relock aiodoned_lock, still at splbio */ + simple_lock(&uvm.aiodoned_lock); } /* * check for done aio structures */ - aio = uvm.aio_done.tqh_first; /* save current list (if any)*/ - if (aio) { - TAILQ_INIT(&uvm.aio_done); /* zero global list */ + bp = TAILQ_FIRST(&uvm.aio_done); + if (bp) { + TAILQ_INIT(&uvm.aio_done); } - simple_unlock(&uvm.pagedaemon_lock); /* unlock */ - splx(s); /* drop splbio */ - + simple_unlock(&uvm.aiodoned_lock); + splx(s); + /* - * first clear out any pending aios (to free space in case we - * want to pageout more stuff). + * process each i/o that's done. 
*/ - for (/*null*/; aio != NULL ; aio = nextaio) { - - uvmexp.paging -= aio->npages; - nextaio = aio->aioq.tqe_next; - aio->aiodone(aio); - + free = uvmexp.free; + while (bp != NULL) { + if (bp->b_flags & B_PDAEMON) { + uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT; + } + nbp = TAILQ_NEXT(bp, b_freelist); + (*bp->b_iodone)(bp); + bp = nbp; } - - /* Next, drain pool resources */ - pool_drain(0); - - /* - * now lock page queues and recompute inactive count - */ - uvm_lock_pageq(); - - if (npages != uvmexp.npages) { /* check for new pages? */ - npages = uvmexp.npages; - uvmpd_tune(); - } - - uvmexp.inactarg = (uvmexp.active + uvmexp.inactive) / 3; - if (uvmexp.inactarg <= uvmexp.freetarg) - uvmexp.inactarg = uvmexp.freetarg + 1; - - UVMHIST_LOG(pdhist," free/ftarg=%d/%d, inact/itarg=%d/%d", - uvmexp.free, uvmexp.freetarg, uvmexp.inactive, - uvmexp.inactarg); - - /* - * scan if needed - * [XXX: note we are reading uvm.free without locking] - */ - if (uvmexp.free < uvmexp.freetarg || - uvmexp.inactive < uvmexp.inactarg) - uvmpd_scan(); - - /* - * done scan. unlock page queues (the only lock we are holding) - */ - uvm_unlock_pageq(); - - /* - * done! restart loop. - */ - if (uvmexp.free > uvmexp.reserve_kernel || - uvmexp.paging == 0) + if (free <= uvmexp.reserve_kernel) { + s = uvm_lock_fpageq(); + wakeup(&uvm.pagedaemon); + uvm_unlock_fpageq(s); + } else { + simple_lock(&uvm.pagedaemon_lock); wakeup(&uvmexp.free); + simple_unlock(&uvm.pagedaemon_lock); + } } - /*NOTREACHED*/ } + + /* - * uvmpd_scan_inactive: the first loop of uvmpd_scan broken out into - * its own function for ease of reading. + * uvmpd_scan_inactive: scan an inactive list for pages to clean or free. * * => called with page queues locked * => we work on meeting our free target by converting inactive pages @@ -334,9 +374,9 @@ uvmpd_scan_inactive(pglst) int swnpages, swcpages; /* XXX: see below */ int swslot; struct vm_anon *anon; - boolean_t swap_backed; + boolean_t swap_backed, vnode_only; vaddr_t start; - int dirtyreacts; + int dirtyreacts, vpgs; UVMHIST_FUNC("uvmpd_scan_inactive"); UVMHIST_CALLED(pdhist); /* @@ -349,75 +389,81 @@ uvmpd_scan_inactive(pglst) /* * swslot is non-zero if we are building a swap cluster. we want - * to stay in the loop while we have a page to scan or we have + * to stay in the loop while we have a page to scan or we have * a swap-cluster to build. */ + swslot = 0; swnpages = swcpages = 0; free = 0; dirtyreacts = 0; + vnode_only = FALSE; - for (p = pglst->tqh_first ; p != NULL || swslot != 0 ; p = nextpg) { + for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) { /* * note that p can be NULL iff we have traversed the whole * list and need to do one final swap-backed clustered pageout. */ + + uobj = NULL; + anon = NULL; + if (p) { + /* * update our copy of "free" and see if we've met * our target */ + s = uvm_lock_fpageq(); free = uvmexp.free; uvm_unlock_fpageq(s); + /* XXXUBC */ + vpgs = uvm_pgcnt_vnode - + (uvmexp.active + uvmexp.inactive + + uvmexp.wired + uvmexp.free) * 13 / 16; + if (free + uvmexp.paging >= uvmexp.freetarg << 2 || dirtyreacts == UVMPD_NUMDIRTYREACTS) { - UVMHIST_LOG(pdhist," met free target: " - "exit loop", 0, 0, 0, 0); - retval = TRUE; /* hit the target! 
*/ + if (vpgs <= 0) { + UVMHIST_LOG(pdhist," met free target: " + "exit loop", 0, 0, 0, 0); + retval = TRUE; - if (swslot == 0) - /* exit now if no swap-i/o pending */ - break; + if (swslot == 0) + /* exit now if no + swap-i/o pending */ + break; - /* set p to null to signal final swap i/o */ - p = NULL; + /* set p to null to signal final + swap i/o */ + p = NULL; + } else { + vnode_only = TRUE; + } } } - uobj = NULL; /* be safe and shut gcc up */ - anon = NULL; /* be safe and shut gcc up */ - if (p) { /* if (we have a new page to consider) */ + /* * we are below target and have a new page to consider. */ - uvmexp.pdscans++; - nextpg = p->pageq.tqe_next; - /* - * move referenced pages back to active queue and - * skip to next page (unlikely to happen since - * inactive pages shouldn't have any valid mappings - * and we cleared reference before deactivating). - */ - if (pmap_is_referenced(p)) { - uvm_pageactivate(p); - uvmexp.pdreact++; - continue; - } - + uvmexp.pdscans++; + nextpg = TAILQ_NEXT(p, pageq); + /* * first we attempt to lock the object that this page * belongs to. if our attempt fails we skip on to * the next page (no harm done). it is important to * "try" locking the object as we are locking in the * wrong order (pageq -> object) and we don't want to - * get deadlocked. + * deadlock. * - * the only time we exepct to see an ownerless page + * the only time we expect to see an ownerless page * (i.e. a page with no uobject and !PQ_ANON) is if an * anon has loaned a page from a uvm_object and the * uvm_object has dropped the ownership. in that @@ -427,17 +473,12 @@ uvmpd_scan_inactive(pglst) /* is page part of an anon or ownerless ? */ if ((p->pqflags & PQ_ANON) || p->uobject == NULL) { - + if (vnode_only) { + uvm_pageactivate(p); + continue; + } anon = p->uanon; - -#ifdef DIAGNOSTIC - /* to be on inactive q, page must be part - * of _something_ */ - if (anon == NULL) - panic("pagedaemon: page with no anon " - "or object detected - loop 1"); -#endif - + KASSERT(anon != NULL); if (!simple_lock_try(&anon->an_lock)) /* lock failed, skip this page */ continue; @@ -446,41 +487,38 @@ uvmpd_scan_inactive(pglst) * if the page is ownerless, claim it in the * name of "anon"! */ - if ((p->pqflags & PQ_ANON) == 0) { -#ifdef DIAGNOSTIC - if (p->loan_count < 1) - panic("pagedaemon: non-loaned " - "ownerless page detected -" - " loop 1"); -#endif - p->loan_count--; - p->pqflags |= PQ_ANON; /* anon now owns it */ - } + if ((p->pqflags & PQ_ANON) == 0) { + KASSERT(p->loan_count > 0); + p->loan_count--; + p->pqflags |= PQ_ANON; + /* anon now owns it */ + } if (p->flags & PG_BUSY) { simple_unlock(&anon->an_lock); uvmexp.pdbusy++; /* someone else owns page, skip it */ continue; } - uvmexp.pdanscan++; - } else { - uobj = p->uobject; - + KASSERT(uobj != NULL); + if (vnode_only && + uobj->pgops != &uvm_vnodeops) { + uvm_pageactivate(p); + continue; + } if (!simple_lock_try(&uobj->vmobjlock)) /* lock failed, skip this page */ - continue; + continue; if (p->flags & PG_BUSY) { simple_unlock(&uobj->vmobjlock); uvmexp.pdbusy++; /* someone else owns page, skip it */ - continue; + continue; } - uvmexp.pdobscan++; } @@ -498,21 +536,18 @@ uvmpd_scan_inactive(pglst) simple_unlock(&uvm.swap_data_lock); } - /* zap all mappings with pmap_page_protect... */ - pmap_page_protect(p, VM_PROT_NONE); uvm_pagefree(p); uvmexp.pdfreed++; - + if (anon) { -#ifdef DIAGNOSTIC + /* * an anonymous page can only be clean - * if it has valid backing store. + * if it has backing store assigned. 
*/ - if (anon->an_swslot == 0) - panic("pagedaemon: clean anon " - "page without backing store?"); -#endif + + KASSERT(anon->an_swslot != 0); + /* remove from object */ anon->u.an_page = NULL; simple_unlock(&anon->an_lock); @@ -528,6 +563,7 @@ uvmpd_scan_inactive(pglst) * this page is dirty, skip it if we'll have met our * free target when all the current pageouts complete. */ + if (free + uvmexp.paging > uvmexp.freetarg << 2) { if (anon) { simple_unlock(&anon->an_lock); @@ -543,11 +579,8 @@ uvmpd_scan_inactive(pglst) * reactivate it so that we eventually cycle * all pages thru the inactive queue. */ -#ifdef DIAGNOSTIC - if (uvmexp.swpgonly > uvmexp.swpages) { - panic("uvmexp.swpgonly botch"); - } -#endif + + KASSERT(uvmexp.swpgonly <= uvmexp.swpages); if ((p->pqflags & PQ_SWAPBACKED) && uvmexp.swpgonly == uvmexp.swpages) { dirtyreacts++; @@ -565,11 +598,8 @@ uvmpd_scan_inactive(pglst) * is full, free any swap allocated to the page * so that other pages can be paged out. */ -#ifdef DIAGNOSTIC - if (uvmexp.swpginuse > uvmexp.swpages) { - panic("uvmexp.swpginuse botch"); - } -#endif + + KASSERT(uvmexp.swpginuse <= uvmexp.swpages); if ((p->pqflags & PQ_SWAPBACKED) && uvmexp.swpginuse == uvmexp.swpages) { @@ -588,26 +618,25 @@ uvmpd_scan_inactive(pglst) * the page we are looking at is dirty. we must * clean it before it can be freed. to do this we * first mark the page busy so that no one else will - * touch the page. we write protect all the mappings - * of the page so that no one touches it while it is - * in I/O. + * touch the page. */ swap_backed = ((p->pqflags & PQ_SWAPBACKED) != 0); p->flags |= PG_BUSY; /* now we own it */ UVM_PAGE_OWN(p, "scan_inactive"); - pmap_page_protect(p, VM_PROT_READ); uvmexp.pgswapout++; /* * for swap-backed pages we need to (re)allocate * swap space. */ + if (swap_backed) { /* * free old swap slot (if any) */ + if (anon) { if (anon->an_swslot) { uvm_swap_free(anon->an_swslot, @@ -622,13 +651,11 @@ uvmpd_scan_inactive(pglst) /* * start new cluster (if necessary) */ - if (swslot == 0) { - /* want this much */ - swnpages = MAXBSIZE >> PAGE_SHIFT; + if (swslot == 0) { + swnpages = MAXBSIZE >> PAGE_SHIFT; swslot = uvm_swap_alloc(&swnpages, TRUE); - if (swslot == 0) { /* no swap? give up! */ p->flags &= ~PG_BUSY; @@ -647,6 +674,7 @@ uvmpd_scan_inactive(pglst) /* * add block to cluster */ + swpps[swcpages] = p; if (anon) anon->an_swslot = swslot + swcpages; @@ -655,11 +683,7 @@ uvmpd_scan_inactive(pglst) p->offset >> PAGE_SHIFT, swslot + swcpages); swcpages++; - - /* done (swap-backed) */ } - - /* end: if (p) ["if we have new page to consider"] */ } else { /* if p == NULL we must be doing a last swap i/o */ @@ -667,16 +691,16 @@ uvmpd_scan_inactive(pglst) } /* - * now consider doing the pageout. + * now consider doing the pageout. * - * for swap-backed pages, we do the pageout if we have either - * filled the cluster (in which case (swnpages == swcpages) or + * for swap-backed pages, we do the pageout if we have either + * filled the cluster (in which case (swnpages == swcpages) or * run out of pages (p == NULL). * * for object pages, we always do the pageout. 
*/ - if (swap_backed) { + if (swap_backed) { if (p) { /* if we just added a page to cluster */ if (anon) simple_unlock(&anon->an_lock); @@ -699,21 +723,18 @@ uvmpd_scan_inactive(pglst) if (swcpages < swnpages) { uvm_swap_free(swslot + swcpages, (swnpages - swcpages)); - } - + } } else { - /* normal object pageout */ ppsp = pps; npages = sizeof(pps) / sizeof(struct vm_page *); /* not looked at because PGO_ALLPAGES is set */ start = 0; - } /* * now do the pageout. - * + * * for swap_backed pages we have already built the cluster. * for !swap_backed pages, uvm_pager_put will call the object's * "make put cluster" function to build a cluster on our behalf. @@ -734,7 +755,7 @@ uvmpd_scan_inactive(pglst) /* locked: uobj (if !swap_backed), page queues */ uvmexp.pdpageouts++; - result = uvm_pager_put((swap_backed) ? NULL : uobj, p, + result = uvm_pager_put(swap_backed ? NULL : uobj, p, &ppsp, &npages, PGO_ALLPAGES|PGO_PDFREECLUST, start, 0); /* locked: uobj (if !swap_backed && result != PEND) */ /* unlocked: pageqs, object (if swap_backed ||result == PEND) */ @@ -762,21 +783,27 @@ uvmpd_scan_inactive(pglst) if (result == VM_PAGER_PEND) { uvmexp.paging += npages; - uvm_lock_pageq(); /* relock page queues */ + uvm_lock_pageq(); uvmexp.pdpending++; if (p) { if (p->pqflags & PQ_INACTIVE) - /* reload! */ - nextpg = p->pageq.tqe_next; + nextpg = TAILQ_NEXT(p, pageq); else - /* reload! */ - nextpg = pglst->tqh_first; - } else { - nextpg = NULL; /* done list */ + nextpg = TAILQ_FIRST(pglst); + } else { + nextpg = NULL; } continue; } + if (result == VM_PAGER_ERROR && + curproc == uvm.pagedaemon_proc) { + uvm_lock_pageq(); + nextpg = TAILQ_NEXT(p, pageq); + uvm_pageactivate(p); + continue; + } + /* * clean up "p" if we have one */ @@ -812,12 +839,6 @@ uvmpd_scan_inactive(pglst) simple_lock(&uobj->vmobjlock); } -#ifdef DIAGNOSTIC - if (result == VM_PAGER_UNLOCK) - panic("pagedaemon: pageout returned " - "invalid 'unlock' code"); -#endif - /* handle PG_WANTED now */ if (p->flags & PG_WANTED) /* still holding object lock */ @@ -837,24 +858,19 @@ uvmpd_scan_inactive(pglst) pmap_page_protect(p, VM_PROT_NONE); anon = NULL; uvm_lock_pageq(); - nextpg = p->pageq.tqe_next; + nextpg = TAILQ_NEXT(p, pageq); /* free released page */ uvm_pagefree(p); } else { -#ifdef DIAGNOSTIC - if (uobj->pgops->pgo_releasepg == NULL) - panic("pagedaemon: no " - "pgo_releasepg function"); -#endif - - /* + /* * pgo_releasepg nukes the page and * gets "nextpg" for us. it returns * with the page queues locked (when * given nextpg ptr). */ + if (!uobj->pgops->pgo_releasepg(p, &nextpg)) /* uobj died after release */ @@ -864,35 +880,27 @@ uvmpd_scan_inactive(pglst) * lock page queues here so that they're * always locked at the end of the loop. */ + uvm_lock_pageq(); } - } else { /* page was not released during I/O */ - uvm_lock_pageq(); - nextpg = p->pageq.tqe_next; - + nextpg = TAILQ_NEXT(p, pageq); if (result != VM_PAGER_OK) { - /* pageout was a failure... */ if (result != VM_PAGER_AGAIN) uvm_pageactivate(p); pmap_clear_reference(p); /* XXXCDC: if (swap_backed) FREE p's * swap block? */ - } else { - /* pageout was a success... */ pmap_clear_reference(p); pmap_clear_modify(p); p->flags |= PG_CLEAN; - /* XXX: could free page here, but old - * pagedaemon does not */ - } } - + /* * drop object lock (if there is an object left). 
do * a safety check of nextpg to make sure it is on the @@ -906,26 +914,27 @@ uvmpd_scan_inactive(pglst) else if (uobj) simple_unlock(&uobj->vmobjlock); - } /* if (p) */ else { + } else { + + /* + * if p is null in this loop, make sure it stays null + * in the next loop. + */ - /* if p is null in this loop, make sure it stays null - * in next loop */ nextpg = NULL; /* * lock page queues here just so they're always locked * at the end of the loop. */ + uvm_lock_pageq(); } if (nextpg && (nextpg->pqflags & PQ_INACTIVE) == 0) { - printf("pagedaemon: invalid nextpg! reverting to " - "queue head\n"); - nextpg = pglst->tqh_first; /* reload! */ + nextpg = TAILQ_FIRST(pglst); /* reload! */ } - - } /* end of "inactive" 'for' loop */ + } return (retval); } @@ -945,10 +954,8 @@ uvmpd_scan() UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist); uvmexp.pdrevs++; /* counter */ + uobj = NULL; -#ifdef __GNUC__ - uobj = NULL; /* XXX gcc */ -#endif /* * get current "free" page count */ @@ -962,13 +969,11 @@ uvmpd_scan() * we need to unlock the page queues for this. */ if (free < uvmexp.freetarg) { - uvmexp.pdswout++; UVMHIST_LOG(pdhist," free %d < target %d: swapout", free, uvmexp.freetarg, 0, 0); uvm_unlock_pageq(); uvm_swapout_threads(); - pmap_update(); /* update so we can scan inactive q */ uvm_lock_pageq(); } @@ -984,8 +989,8 @@ uvmpd_scan() UVMHIST_LOG(pdhist, " starting 'free' loop",0,0,0,0); /* - * do loop #1! alternate starting queue between swap and object based - * on the low bit of uvmexp.pdrevs (which we bump by one each call). + * alternate starting queue between swap and object based on the + * low bit of uvmexp.pdrevs (which we bump by one each call). */ got_it = FALSE; @@ -1009,6 +1014,7 @@ uvmpd_scan() * detect if we're not going to be able to page anything out * until we free some swap resources from active pages. */ + swap_shortage = 0; if (uvmexp.free < uvmexp.freetarg && uvmexp.swpginuse == uvmexp.swpages && @@ -1016,13 +1022,13 @@ uvmpd_scan() pages_freed == 0) { swap_shortage = uvmexp.freetarg - uvmexp.free; } - + UVMHIST_LOG(pdhist, " loop 2: inactive_shortage=%d swap_shortage=%d", inactive_shortage, swap_shortage,0,0); - for (p = TAILQ_FIRST(&uvm.page_active); + for (p = TAILQ_FIRST(&uvm.page_active); p != NULL && (inactive_shortage > 0 || swap_shortage > 0); p = nextpg) { - nextpg = p->pageq.tqe_next; + nextpg = TAILQ_NEXT(p, pageq); if (p->flags & PG_BUSY) continue; /* quick check before trying to lock */ @@ -1031,22 +1037,13 @@ uvmpd_scan() */ /* is page anon owned or ownerless? */ if ((p->pqflags & PQ_ANON) || p->uobject == NULL) { - -#ifdef DIAGNOSTIC - if (p->uanon == NULL) - panic("pagedaemon: page with no anon or " - "object detected - loop 2"); -#endif + KASSERT(p->uanon != NULL); if (!simple_lock_try(&p->uanon->an_lock)) continue; /* take over the page? */ if ((p->pqflags & PQ_ANON) == 0) { -#ifdef DIAGNOSTIC - if (p->loan_count < 1) - panic("pagedaemon: non-loaned " - "ownerless page detected - loop 2"); -#endif + KASSERT(p->loan_count > 0); p->loan_count--; p->pqflags |= PQ_ANON; } @@ -1054,9 +1051,11 @@ uvmpd_scan() if (!simple_lock_try(&p->uobject->vmobjlock)) continue; } + /* * skip this page if it's busy. */ + if ((p->flags & PG_BUSY) != 0) { if (p->pqflags & PQ_ANON) simple_unlock(&p->uanon->an_lock); @@ -1064,11 +1063,12 @@ uvmpd_scan() simple_unlock(&p->uobject->vmobjlock); continue; } - + /* * if there's a shortage of swap, free any swap allocated * to this page so that other pages can be paged out. 
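
Besides the free and inactive targets, the pagedaemon above now also triggers a scan when vnode pages dominate memory, comparing uvm_pgcnt_vnode against 13/16 of the pages it manages. A small sketch of that test using the same counters the patch uses (the helper name is made up):

static boolean_t
example_vnode_pages_excessive()
{
	u_long managed;

	/* pages the pagedaemon accounts for, as in uvm_pageout() above */
	managed = uvmexp.active + uvmexp.inactive + uvmexp.wired + uvmexp.free;

	/* XXXUBC threshold used by this patch: more than 13/16 held by vnodes */
	return (uvm_pgcnt_vnode > managed * 13 / 16);
}
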
*/ + if (swap_shortage > 0) { if ((p->pqflags & PQ_ANON) && p->uanon->an_swslot) { uvm_swap_free(p->uanon->an_swslot, 1); @@ -1086,11 +1086,12 @@ uvmpd_scan() } } } - + /* * deactivate this page if there's a shortage of * inactive pages. */ + if (inactive_shortage > 0) { pmap_page_protect(p, VM_PROT_NONE); /* no need to check wire_count as pg is "active" */ @@ -1098,7 +1099,6 @@ uvmpd_scan() uvmexp.pddeact++; inactive_shortage--; } - if (p->pqflags & PQ_ANON) simple_unlock(&p->uanon->an_lock); else diff --git a/sys/uvm/uvm_swap.c b/sys/uvm/uvm_swap.c index cfdf597a210c..1399073d06d6 100644 --- a/sys/uvm/uvm_swap.c +++ b/sys/uvm/uvm_swap.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $ */ +/* $NetBSD: uvm_swap.c,v 1.41 2000/11/27 08:40:05 chs Exp $ */ /* * Copyright (c) 1995, 1996, 1997 Matthew R. Green @@ -34,6 +34,7 @@ #include "fs_nfs.h" #include "opt_uvmhist.h" #include "opt_compat_netbsd.h" +#include "opt_ddb.h" #include #include @@ -77,11 +78,6 @@ * by the "swap_priority" global var. each "swappri" contains a * CIRCLEQ of "swapdev" structures at that priority. * - * the system maintains a fixed pool of "swapbuf" structures for use - * at swap i/o time. a swapbuf includes a "buf" structure and an - * "aiodone" [we want to avoid malloc()'ing anything at swapout time - * since memory may be low]. - * * locking: * - swap_syscall_lock (sleep lock): this lock serializes the swapctl * system call and prevents the swap priority list from changing @@ -89,8 +85,6 @@ * - uvm.swap_data_lock (simple_lock): this lock protects all swap data * structures including the priority list, the swapdev structures, * and the swapmap extent. - * - swap_buf_lock (simple_lock): this lock protects the free swapbuf - * pool. * * each swap device has the following info: * - swap device in use (could be disabled, preventing future use) @@ -157,15 +151,6 @@ struct swappri { LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */ }; -/* - * swapbuf, swapbuffer plus async i/o info - */ -struct swapbuf { - struct buf sw_buf; /* a buffer structure */ - struct uvm_aiodesc sw_aio; /* aiodesc structure, used if ASYNC */ - SIMPLEQ_ENTRY(swapbuf) sw_sq; /* free list pointer */ -}; - /* * The following two structures are used to keep track of data transfers * on swap devices associated with regular files. @@ -222,8 +207,6 @@ cdev_decl(sw); * local variables */ static struct extent *swapmap; /* controls the mapping of /dev/drum */ -SIMPLEQ_HEAD(swapbufhead, swapbuf); -struct pool *swapbuf_pool; /* list of all active swap devices [by priority] */ LIST_HEAD(swap_priority, swappri); @@ -250,8 +233,6 @@ static void sw_reg_strategy __P((struct swapdev *, struct buf *, int)); static void sw_reg_iodone __P((struct buf *)); static void sw_reg_start __P((struct swapdev *)); -static void uvm_swap_aiodone __P((struct uvm_aiodesc *)); -static void uvm_swap_bufdone __P((struct buf *)); static int uvm_swap_io __P((struct vm_page **, int, int, int)); /* @@ -292,18 +273,9 @@ uvm_swap_init() panic("uvm_swap_init: extent_create failed"); /* - * allocate our private pool of "swapbuf" structures (includes - * a "buf" structure). ["nswbuf" comes from param.c and can - * be adjusted by MD code before we get here]. + * allocate pools for structures used for swapping to files. */ - swapbuf_pool = - pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0, - NULL, NULL, 0); - if (swapbuf_pool == NULL) - panic("swapinit: pool_create failed"); - /* XXX - set a maximum on swapbuf_pool? 
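Loop 2 above walks the active queue while either of two deficits remains: inactive_shortage (pages we would still like to deactivate) and swap_shortage (swap slots we would like back because swap is full and nothing could be paged out). The toy model below only illustrates that accounting and why the loop terminates; the real loop of course operates on struct vm_page and the page queues, and the structure and numbers here are made up.

#include <stdio.h>
#include <stdbool.h>

struct fakepage { bool has_swapslot; };

int
main(void)
{
	struct fakepage pages[6] = {
		{ true }, { false }, { true }, { false }, { true }, { false }
	};
	int inactive_shortage = 3;	/* want this many more inactive pages */
	int swap_shortage = 2;		/* want this many swap slots back */
	int i;

	for (i = 0; i < 6 &&
	    (inactive_shortage > 0 || swap_shortage > 0); i++) {
		if (swap_shortage > 0 && pages[i].has_swapslot) {
			pages[i].has_swapslot = false;	/* ~ uvm_swap_free() */
			swap_shortage--;
		}
		if (inactive_shortage > 0) {
			/* ~ pmap_page_protect() + uvm_pagedeactivate() */
			inactive_shortage--;
		}
	}
	printf("stopped after %d pages, shortages %d/%d\n",
	    i, inactive_shortage, swap_shortage);
	return 0;
}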
*/ - vndxfer_pool = pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0, NULL, NULL, 0); @@ -1120,7 +1092,7 @@ swstrategy(bp) * be yanked out from under us because we are holding resources * in it (i.e. the blocks we are doing I/O on). */ - pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT; + pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT; simple_lock(&uvm.swap_data_lock); sdp = swapdrum_getsdp(pageno); simple_unlock(&uvm.swap_data_lock); @@ -1139,7 +1111,7 @@ swstrategy(bp) pageno -= sdp->swd_drumoffset; /* page # on swapdev */ bn = btodb(pageno << PAGE_SHIFT); /* convert to diskblock */ - UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld\n", + UVMHIST_LOG(pdhist, " %s: mapoff=%x bn=%x bcount=%ld", ((bp->b_flags & B_READ) == 0) ? "write" : "read", sdp->swd_drumoffset, bn, bp->b_bcount); @@ -1174,14 +1146,14 @@ swstrategy(bp) vp->v_numoutput++; /* put it on swapdev */ } - /* + /* * dissassocate buffer with /dev/drum vnode * [could be null if buf was from physio] */ - if (bp->b_vp != NULLVP) + if (bp->b_vp != NULL) brelvp(bp); - /* + /* * finally plug in swapdev vnode and start I/O */ bp->b_vp = vp; @@ -1279,18 +1251,15 @@ sw_reg_strategy(sdp, bp, bn) /* * compute the size ("sz") of this transfer (in bytes). - * XXXCDC: ignores read-ahead for non-zero offset */ - if ((off = (byteoff % sdp->swd_bsize)) != 0) - sz = sdp->swd_bsize - off; - else - sz = (1 + nra) * sdp->swd_bsize; - - if (resid < sz) + off = byteoff % sdp->swd_bsize; + sz = (1 + nra) * sdp->swd_bsize - off; + if (sz > resid) sz = resid; - UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x", - sdp->swd_vp, vp, byteoff, nbn); + UVMHIST_LOG(pdhist, "sw_reg_strategy: " + "vp %p/%p offset 0x%x/0x%x", + sdp->swd_vp, vp, byteoff, nbn); /* * now get a buf structure. note that the vb_buf is @@ -1303,42 +1272,13 @@ sw_reg_strategy(sdp, bp, bn) nbp->vb_buf.b_bufsize = sz; nbp->vb_buf.b_error = 0; nbp->vb_buf.b_data = addr; + nbp->vb_buf.b_lblkno = 0; nbp->vb_buf.b_blkno = nbn + btodb(off); nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno; - nbp->vb_buf.b_proc = bp->b_proc; nbp->vb_buf.b_iodone = sw_reg_iodone; - nbp->vb_buf.b_vp = NULLVP; - nbp->vb_buf.b_vnbufs.le_next = NOLIST; - nbp->vb_buf.b_rcred = sdp->swd_cred; - nbp->vb_buf.b_wcred = sdp->swd_cred; + nbp->vb_buf.b_vp = NULL; LIST_INIT(&nbp->vb_buf.b_dep); - /* - * set b_dirtyoff/end and b_validoff/end. this is - * required by the NFS client code (otherwise it will - * just discard our I/O request). 
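The swstrategy() change above casts b_blkno to int64_t before dbtob(). With a 32-bit block number, converting blocks to bytes multiplies by the disk block size and wraps once the swap area extends past 4 GB, so the derived page number is wrong. The runnable demonstration below uses assumed values for DEV_BSHIFT and PAGE_SHIFT (512-byte blocks, 4 KB pages); it is an illustration of the overflow, not the kernel's dbtob() macro.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define DEV_BSHIFT	9	/* 512-byte disk blocks (assumed) */
#define PAGE_SHIFT	12	/* 4 KB pages (assumed) */

int
main(void)
{
	int32_t b_blkno = 9000000;	/* a block ~4.3 GB into the area */
	uint32_t wrapped;
	int32_t bad;
	int64_t good;

	/* old form: the byte count is formed in 32 bits and wraps mod 2^32 */
	wrapped = (uint32_t)b_blkno << DEV_BSHIFT;
	bad = (int32_t)(wrapped >> PAGE_SHIFT);

	/* new form: widen to 64 bits first, then derive the page number */
	good = ((int64_t)b_blkno << DEV_BSHIFT) >> PAGE_SHIFT;

	printf("32-bit page number: %" PRId32 "\n", bad);
	printf("64-bit page number: %" PRId64 "\n", good);
	return 0;
}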
- */ - if (bp->b_dirtyend == 0) { - nbp->vb_buf.b_dirtyoff = 0; - nbp->vb_buf.b_dirtyend = sz; - } else { - nbp->vb_buf.b_dirtyoff = - max(0, bp->b_dirtyoff - (bp->b_bcount-resid)); - nbp->vb_buf.b_dirtyend = - min(sz, - max(0, bp->b_dirtyend - (bp->b_bcount-resid))); - } - if (bp->b_validend == 0) { - nbp->vb_buf.b_validoff = 0; - nbp->vb_buf.b_validend = sz; - } else { - nbp->vb_buf.b_validoff = - max(0, bp->b_validoff - (bp->b_bcount-resid)); - nbp->vb_buf.b_validend = - min(sz, - max(0, bp->b_validend - (bp->b_bcount-resid))); - } - nbp->vb_xfer = vnx; /* patch it back in to vnx */ /* @@ -1352,7 +1292,7 @@ sw_reg_strategy(sdp, bp, bn) vnx->vx_pending++; /* assoc new buffer with underlying vnode */ - bgetvp(vp, &nbp->vb_buf); + bgetvp(vp, &nbp->vb_buf); /* sort it in and start I/O if we are not over our limit */ disksort_blkno(&sdp->swd_tab, &nbp->vb_buf); @@ -1411,6 +1351,7 @@ sw_reg_start(sdp) bp, bp->b_vp, bp->b_blkno, bp->b_bcount); if ((bp->b_flags & B_READ) == 0) bp->b_vp->v_numoutput++; + VOP_STRATEGY(bp); } sdp->swd_flags &= ~SWF_BUSY; @@ -1455,11 +1396,9 @@ sw_reg_iodone(bp) } /* - * disassociate this buffer from the vnode (if any). + * disassociate this buffer from the vnode. */ - if (vbp->vb_buf.b_vp != NULLVP) { - brelvp(&vbp->vb_buf); - } + brelvp(&vbp->vb_buf); /* * kill vbp structure @@ -1598,8 +1537,9 @@ uvm_swap_markbad(startslot, nslots) * we assume here that the range of slots will all be within * one swap device. */ - sdp->swd_npgbad += nslots; + sdp->swd_npgbad += nslots; + UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0); simple_unlock(&uvm.swap_data_lock); } @@ -1735,15 +1675,18 @@ uvm_swap_io(pps, startslot, npages, flags) int startslot, npages, flags; { daddr_t startblk; - struct swapbuf *sbp; struct buf *bp; vaddr_t kva; int result, s, mapinflags, pflag; + boolean_t write, async; UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist); UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d", startslot, npages, flags, 0); + write = (flags & B_READ) == 0; + async = (flags & B_ASYNC) != 0; + /* * convert starting drum slot to block number */ @@ -1751,43 +1694,37 @@ uvm_swap_io(pps, startslot, npages, flags) /* * first, map the pages into the kernel (XXX: currently required - * by buffer system). note that we don't let pagermapin alloc - * an aiodesc structure because we don't want to chance a malloc. - * we've got our own pool of aiodesc structures (in swapbuf). + * by buffer system). */ - mapinflags = (flags & B_READ) ? UVMPAGER_MAPIN_READ : - UVMPAGER_MAPIN_WRITE; - if ((flags & B_ASYNC) == 0) + + mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE; + if (!async) mapinflags |= UVMPAGER_MAPIN_WAITOK; - kva = uvm_pagermapin(pps, npages, NULL, mapinflags); + kva = uvm_pagermapin(pps, npages, mapinflags); if (kva == 0) return (VM_PAGER_AGAIN); /* - * now allocate a swap buffer off of freesbufs + * now allocate a buf for the i/o. * [make sure we don't put the pagedaemon to sleep...] */ s = splbio(); - pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc) - ? 0 - : PR_WAITOK; - sbp = pool_get(swapbuf_pool, pflag); - splx(s); /* drop splbio */ + pflag = (async || curproc == uvm.pagedaemon_proc) ? 0 : PR_WAITOK; + bp = pool_get(&bufpool, pflag); + splx(s); /* - * if we failed to get a swapbuf, return "try again" + * if we failed to get a buf, return "try again" */ - if (sbp == NULL) + if (bp == NULL) return (VM_PAGER_AGAIN); /* * fill in the bp/sbp. we currently route our i/o through * /dev/drum's vnode [swapdev_vp]. 
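The transfer-size computation in sw_reg_strategy() above is simplified: each chunk runs to the end of the readahead window of (1 + nra) file-system blocks, minus the offset into the first block, and is then clamped to what remains of the request. A runnable arithmetic check follows; the block size and the sample values are made up for illustration.

#include <stdio.h>

int
main(void)
{
	long swd_bsize = 8192;	/* file-system block size (assumed) */
	long nra = 1;		/* blocks of readahead granted by VOP_BMAP */
	long byteoff = 12288;	/* current byte offset of the transfer */
	long resid = 20000;	/* bytes still left in the request */

	long off = byteoff % swd_bsize;		/* 4096 into the block */
	long sz = (1 + nra) * swd_bsize - off;	/* 12288-byte window */
	if (sz > resid)
		sz = resid;			/* never past end of request */

	printf("chunk of %ld bytes starting %ld into the block\n", sz, off);
	return 0;
}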
*/ - bp = &sbp->sw_buf; bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC)); bp->b_proc = &proc0; /* XXX */ - bp->b_rcred = bp->b_wcred = proc0.p_ucred; bp->b_vnbufs.le_next = NOLIST; bp->b_data = (caddr_t)kva; bp->b_blkno = startblk; @@ -1799,49 +1736,43 @@ uvm_swap_io(pps, startslot, npages, flags) /* XXXMRG: probably -- this is obviously something inherited... */ if (swapdev_vp->v_type == VBLK) bp->b_dev = swapdev_vp->v_rdev; - bp->b_bcount = npages << PAGE_SHIFT; + bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT; LIST_INIT(&bp->b_dep); /* - * for pageouts we must set "dirtyoff" [NFS client code needs it]. - * and we bump v_numoutput (counter of number of active outputs). + * bump v_numoutput (counter of number of active outputs). */ - if ((bp->b_flags & B_READ) == 0) { - bp->b_dirtyoff = 0; - bp->b_dirtyend = npages << PAGE_SHIFT; + if (write) { s = splbio(); swapdev_vp->v_numoutput++; splx(s); } /* - * for async ops we must set up the aiodesc and setup the callback - * XXX: we expect no async-reads, but we don't prevent it here. + * for async ops we must set up the iodone handler. */ - if (flags & B_ASYNC) { - sbp->sw_aio.aiodone = uvm_swap_aiodone; - sbp->sw_aio.kva = kva; - sbp->sw_aio.npages = npages; - sbp->sw_aio.pd_ptr = sbp; /* backpointer */ - bp->b_flags |= B_CALL; /* set callback */ - bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */ + if (async) { + /* XXXUBC pagedaemon */ + bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ? + B_PDAEMON : 0); + bp->b_iodone = uvm_aio_biodone; UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0); } UVMHIST_LOG(pdhist, - "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld", + "about to start io: data = %p blkno = 0x%x, bcount = %ld", bp->b_data, bp->b_blkno, bp->b_bcount, 0); /* * now we start the I/O, and if async, return. */ VOP_STRATEGY(bp); - if (flags & B_ASYNC) + if (async) return (VM_PAGER_PEND); /* * must be sync i/o. wait for it to finish */ - bp->b_error = biowait(bp); + (void) biowait(bp); result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK; /* @@ -1850,13 +1781,14 @@ uvm_swap_io(pps, startslot, npages, flags) uvm_pagermapout(kva, npages); /* - * now dispose of the swap buffer + * now dispose of the buf */ s = splbio(); if (bp->b_vp) brelvp(bp); - - pool_put(swapbuf_pool, sbp); + if (write) + vwakeup(bp); + pool_put(&bufpool, bp); splx(s); /* @@ -1865,96 +1797,3 @@ uvm_swap_io(pps, startslot, npages, flags) UVMHIST_LOG(pdhist, "<- done (sync) result=%d", result, 0, 0, 0); return (result); } - -/* - * uvm_swap_bufdone: called from the buffer system when the i/o is done - */ -static void -uvm_swap_bufdone(bp) - struct buf *bp; -{ - struct swapbuf *sbp = (struct swapbuf *) bp; - int s = splbio(); - UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist); - - UVMHIST_LOG(pdhist, "cleaning buf %p", buf, 0, 0, 0); -#ifdef DIAGNOSTIC - /* - * sanity check: swapbufs are private, so they shouldn't be wanted - */ - if (bp->b_flags & B_WANTED) - panic("uvm_swap_bufdone: private buf wanted"); -#endif - - /* - * drop the buffer's reference to the vnode. - */ - if (bp->b_vp) - brelvp(bp); - - /* - * now put the aio on the uvm.aio_done list and wake the - * pagedaemon (which will finish up our job in its context). 
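With the private swapbuf/aiodesc machinery removed, uvm_swap_io() above takes an ordinary buf from bufpool (without sleeping when called async or from the pagedaemon) and splits completion two ways: async requests set B_CALL with uvm_aio_biodone as the iodone hook, while sync requests simply biowait(). The fragment below is a condensed, non-compilable sketch of that tail using only calls that appear in the hunk; error handling and UVMHIST logging are elided.

	VOP_STRATEGY(bp);
	if (async)
		return (VM_PAGER_PEND);	/* uvm_aio_biodone() finishes up */

	/* synchronous: wait, unmap the pages, recycle the buf */
	(void) biowait(bp);
	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
	uvm_pagermapout(kva, npages);

	s = splbio();
	if (bp->b_vp)
		brelvp(bp);
	if (write)
		vwakeup(bp);		/* credit v_numoutput for the write */
	pool_put(&bufpool, bp);
	splx(s);
	return (result);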
- */ - simple_lock(&uvm.pagedaemon_lock); /* locks uvm.aio_done */ - TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq); - simple_unlock(&uvm.pagedaemon_lock); - - wakeup(&uvm.pagedaemon); - splx(s); -} - -/* - * uvm_swap_aiodone: aiodone function for anonymous memory - * - * => this is called in the context of the pagedaemon (but with the - * page queues unlocked!) - * => our "aio" structure must be part of a "swapbuf" - */ -static void -uvm_swap_aiodone(aio) - struct uvm_aiodesc *aio; -{ - struct swapbuf *sbp = aio->pd_ptr; - struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT]; - int lcv, s; - vaddr_t addr; - UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist); - - UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0); -#ifdef DIAGNOSTIC - /* - * sanity check - */ - if (aio->npages > (MAXBSIZE >> PAGE_SHIFT)) - panic("uvm_swap_aiodone: aio too big!"); -#endif - - /* - * first, we have to recover the page pointers (pps) by poking in the - * kernel pmap (XXX: should be saved in the buf structure). - */ - for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ; - addr += PAGE_SIZE, lcv++) { - pps[lcv] = uvm_pageratop(addr); - } - - /* - * now we can dispose of the kernel mappings of the buffer - */ - uvm_pagermapout(aio->kva, aio->npages); - - /* - * now we can dispose of the pages by using the dropcluster function - * [note that we have no "page of interest" so we pass in null] - */ - uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages, - PGO_PDFREECLUST); - - /* - * finally, we can dispose of the swapbuf - */ - s = splbio(); - pool_put(swapbuf_pool, sbp); - splx(s); -} diff --git a/sys/uvm/uvm_vnode.c b/sys/uvm/uvm_vnode.c index 52b691bf3aa0..65e3ecf37596 100644 --- a/sys/uvm/uvm_vnode.c +++ b/sys/uvm/uvm_vnode.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $ */ +/* $NetBSD: uvm_vnode.c,v 1.37 2000/11/27 08:40:06 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -47,6 +47,7 @@ #include "fs_nfs.h" #include "opt_uvmhist.h" +#include "opt_ddb.h" /* * uvm_vnode.c: the vnode pager. @@ -54,6 +55,7 @@ #include #include +#include #include #include #include @@ -61,62 +63,49 @@ #include #include #include +#include +#include #include #include #include -/* - * private global data structure - * - * we keep a list of writeable active vnode-backed VM objects for sync op. - * we keep a simpleq of vnodes that are currently being sync'd. 
- */ - -LIST_HEAD(uvn_list_struct, uvm_vnode); -static struct uvn_list_struct uvn_wlist; /* writeable uvns */ -static simple_lock_data_t uvn_wl_lock; /* locks uvn_wlist */ - -SIMPLEQ_HEAD(uvn_sq_struct, uvm_vnode); -static struct uvn_sq_struct uvn_sync_q; /* sync'ing uvns */ -lock_data_t uvn_sync_lock; /* locks sync operation */ +extern u_long uvm_pgcnt_vnode; /* * functions */ -static void uvn_cluster __P((struct uvm_object *, voff_t, - voff_t *, voff_t *)); -static void uvn_detach __P((struct uvm_object *)); -static boolean_t uvn_flush __P((struct uvm_object *, voff_t, - voff_t, int)); -static int uvn_get __P((struct uvm_object *, voff_t, - vm_page_t *, int *, int, - vm_prot_t, int, int)); -static void uvn_init __P((void)); -static int uvn_io __P((struct uvm_vnode *, vm_page_t *, - int, int, int)); -static int uvn_put __P((struct uvm_object *, vm_page_t *, - int, boolean_t)); -static void uvn_reference __P((struct uvm_object *)); -static boolean_t uvn_releasepg __P((struct vm_page *, - struct vm_page **)); +static void uvn_cluster __P((struct uvm_object *, voff_t, voff_t *, + voff_t *)); +static void uvn_detach __P((struct uvm_object *)); +static int uvn_findpage __P((struct uvm_object *, voff_t, + struct vm_page **, int)); +static boolean_t uvn_flush __P((struct uvm_object *, voff_t, voff_t, + int)); +static int uvn_get __P((struct uvm_object *, voff_t, vm_page_t *, + int *, int, vm_prot_t, int, int)); +static int uvn_put __P((struct uvm_object *, vm_page_t *, int, + boolean_t)); +static void uvn_reference __P((struct uvm_object *)); +static boolean_t uvn_releasepg __P((struct vm_page *, + struct vm_page **)); /* * master pager structure */ struct uvm_pagerops uvm_vnodeops = { - uvn_init, + NULL, uvn_reference, uvn_detach, - NULL, /* no specialized fault routine required */ + NULL, uvn_flush, uvn_get, uvn_put, uvn_cluster, - uvm_mk_pcluster, /* use generic version of this: see uvm_pager.c */ + uvm_mk_pcluster, uvn_releasepg, }; @@ -124,22 +113,6 @@ struct uvm_pagerops uvm_vnodeops = { * the ops! */ -/* - * uvn_init - * - * init pager private data structures. - */ - -static void -uvn_init() -{ - - LIST_INIT(&uvn_wlist); - simple_lock_init(&uvn_wl_lock); - /* note: uvn_sync_q init'd in uvm_vnp_sync() */ - lockinit(&uvn_sync_lock, PVM, "uvnsync", 0, 0); -} - /* * uvn_attach * @@ -163,23 +136,20 @@ uvn_attach(arg, accessprot) struct vnode *vp = arg; struct uvm_vnode *uvn = &vp->v_uvm; struct vattr vattr; - int oldflags, result; + int result; struct partinfo pi; - u_quad_t used_vnode_size; + voff_t used_vnode_size; UVMHIST_FUNC("uvn_attach"); UVMHIST_CALLED(maphist); UVMHIST_LOG(maphist, "(vn=0x%x)", arg,0,0,0); - - used_vnode_size = (u_quad_t)0; /* XXX gcc -Wuninitialized */ + used_vnode_size = (voff_t)0; /* * first get a lock on the uvn. */ simple_lock(&uvn->u_obj.vmobjlock); - while (uvn->u_flags & UVM_VNODE_BLOCKED) { - printf("uvn_attach: blocked at 0x%p flags 0x%x\n", - uvn, uvn->u_flags); - uvn->u_flags |= UVM_VNODE_WANTED; + while (uvn->u_flags & VXLOCK) { + uvn->u_flags |= VXWANT; UVMHIST_LOG(maphist, " SLEEPING on blocked vn",0,0,0,0); UVM_UNLOCK_AND_WAIT(uvn, &uvn->u_obj.vmobjlock, FALSE, "uvn_attach", 0); @@ -191,56 +161,26 @@ uvn_attach(arg, accessprot) * if we're mapping a BLK device, make sure it is a disk. 
*/ if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) { - simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ + simple_unlock(&uvn->u_obj.vmobjlock); UVMHIST_LOG(maphist,"<- done (VBLK not D_DISK!)", 0,0,0,0); return(NULL); } - /* - * now we have lock and uvn must not be in a blocked state. - * first check to see if it is already active, in which case - * we can bump the reference count, check to see if we need to - * add it to the writeable list, and then return. - */ - if (uvn->u_flags & UVM_VNODE_VALID) { /* already active? */ - - /* regain VREF if we were persisting */ - if (uvn->u_obj.uo_refs == 0) { - VREF(vp); - UVMHIST_LOG(maphist," VREF (reclaim persisting vnode)", - 0,0,0,0); - } - uvn->u_obj.uo_refs++; /* bump uvn ref! */ - - /* check for new writeable uvn */ - if ((accessprot & VM_PROT_WRITE) != 0 && - (uvn->u_flags & UVM_VNODE_WRITEABLE) == 0) { - simple_lock(&uvn_wl_lock); - LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - /* we are now on wlist! */ - uvn->u_flags |= UVM_VNODE_WRITEABLE; - } - - /* unlock and return */ - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- done, refcnt=%d", uvn->u_obj.uo_refs, - 0, 0, 0); - return (&uvn->u_obj); - } +#ifdef DIAGNOSTIC + if (vp->v_type != VREG) { + panic("uvn_attach: vp %p not VREG", vp); + } +#endif /* - * need to call VOP_GETATTR() to get the attributes, but that could - * block (due to I/O), so we want to unlock the object before calling. - * however, we want to keep anyone else from playing with the object - * while it is unlocked. to do this we set UVM_VNODE_ALOCK which - * prevents anyone from attaching to the vnode until we are done with - * it. + * set up our idea of the size + * if this hasn't been done already. */ - uvn->u_flags = UVM_VNODE_ALOCK; + if (uvn->u_size == VSIZENOTSET) { + + uvn->u_flags |= VXLOCK; simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock in case we sleep */ /* XXX: curproc? */ - if (vp->v_type == VBLK) { /* * We could implement this as a specfs getattr call, but: @@ -254,8 +194,8 @@ uvn_attach(arg, accessprot) DIOCGPART, (caddr_t)&pi, FREAD, curproc); if (result == 0) { /* XXX should remember blocksize */ - used_vnode_size = (u_quad_t)pi.disklab->d_secsize * - (u_quad_t)pi.part->p_size; + used_vnode_size = (voff_t)pi.disklab->d_secsize * + (voff_t)pi.part->p_size; } } else { result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc); @@ -264,58 +204,26 @@ uvn_attach(arg, accessprot) } /* relock object */ - simple_lock(&uvn->u_obj.vmobjlock); + simple_lock(&uvn->u_obj.vmobjlock); + + if (uvn->u_flags & VXWANT) + wakeup(uvn); + uvn->u_flags &= ~(VXLOCK|VXWANT); if (result != 0) { - if (uvn->u_flags & UVM_VNODE_WANTED) - wakeup(uvn); - uvn->u_flags = 0; simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ UVMHIST_LOG(maphist,"<- done (VOP_GETATTR FAILED!)", 0,0,0,0); return(NULL); } - - /* - * make sure that the newsize fits within a vaddr_t - * XXX: need to revise addressing data types - */ -#ifdef DEBUG - if (vp->v_type == VBLK) - printf("used_vnode_size = %qu\n", (long long)used_vnode_size); -#endif - - /* - * now set up the uvn. - */ - uvn->u_obj.pgops = &uvm_vnodeops; - TAILQ_INIT(&uvn->u_obj.memq); - uvn->u_obj.uo_npages = 0; - uvn->u_obj.uo_refs = 1; /* just us... 
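uvn_attach() above now sizes the object lazily: only when u_size is still VSIZENOTSET does it take the vnode's VXLOCK/VXWANT handshake, drop the object lock, ask the device (DIOCGPART) or VOP_GETATTR() for the size, and then wake any waiters. The skeleton below is condensed from the hunk for readability and is not a standalone function.

	if (uvn->u_size == VSIZENOTSET) {
		uvn->u_flags |= VXLOCK;			/* block other attachers */
		simple_unlock(&uvn->u_obj.vmobjlock);	/* getattr may sleep */

		/* ... DIOCGPART for block devices, VOP_GETATTR() otherwise,
		 *     leaving the result in used_vnode_size ... */

		simple_lock(&uvn->u_obj.vmobjlock);
		if (uvn->u_flags & VXWANT)
			wakeup(uvn);			/* release sleepers */
		uvn->u_flags &= ~(VXLOCK | VXWANT);
		uvn->u_size = used_vnode_size;
	}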
*/ - oldflags = uvn->u_flags; - uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST; - uvn->u_nio = 0; uvn->u_size = used_vnode_size; - /* if write access, we need to add it to the wlist */ - if (accessprot & VM_PROT_WRITE) { - simple_lock(&uvn_wl_lock); - LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - uvn->u_flags |= UVM_VNODE_WRITEABLE; /* we are on wlist! */ } - /* - * add a reference to the vnode. this reference will stay as long - * as there is a valid mapping of the vnode. dropped when the - * reference count goes to zero [and we either free or persist]. - */ - VREF(vp); + /* unlock and return */ simple_unlock(&uvn->u_obj.vmobjlock); - if (oldflags & UVM_VNODE_WANTED) - wakeup(uvn); - - UVMHIST_LOG(maphist,"<- done/VREF, ret 0x%x", &uvn->u_obj,0,0,0); - return(&uvn->u_obj); + UVMHIST_LOG(maphist,"<- done, refcnt=%d", uvn->u_obj.uo_refs, + 0, 0, 0); + return (&uvn->u_obj); } @@ -335,23 +243,7 @@ static void uvn_reference(uobj) struct uvm_object *uobj; { -#ifdef DEBUG - struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; -#endif - UVMHIST_FUNC("uvn_reference"); UVMHIST_CALLED(maphist); - - simple_lock(&uobj->vmobjlock); -#ifdef DEBUG - if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { - printf("uvn_reference: ref=%d, flags=0x%x\n", uvn->u_flags, - uobj->uo_refs); - panic("uvn_reference: invalid state"); - } -#endif - uobj->uo_refs++; - UVMHIST_LOG(maphist, "<- done (uobj=0x%x, ref = %d)", - uobj, uobj->uo_refs,0,0); - simple_unlock(&uobj->vmobjlock); + VREF((struct vnode *)uobj); } /* @@ -367,291 +259,7 @@ static void uvn_detach(uobj) struct uvm_object *uobj; { - struct uvm_vnode *uvn; - struct vnode *vp; - int oldflags; - UVMHIST_FUNC("uvn_detach"); UVMHIST_CALLED(maphist); - - simple_lock(&uobj->vmobjlock); - - UVMHIST_LOG(maphist," (uobj=0x%x) ref=%d", uobj,uobj->uo_refs,0,0); - uobj->uo_refs--; /* drop ref! */ - if (uobj->uo_refs) { /* still more refs */ - simple_unlock(&uobj->vmobjlock); - UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0); - return; - } - - /* - * get other pointers ... - */ - - uvn = (struct uvm_vnode *) uobj; - vp = (struct vnode *) uobj; - - /* - * clear VTEXT flag now that there are no mappings left (VTEXT is used - * to keep an active text file from being overwritten). - */ - vp->v_flag &= ~VTEXT; - - /* - * we just dropped the last reference to the uvn. see if we can - * let it "stick around". - */ - - if (uvn->u_flags & UVM_VNODE_CANPERSIST) { - /* won't block */ - uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES); - simple_unlock(&uobj->vmobjlock); - vrele(vp); /* drop vnode reference */ - UVMHIST_LOG(maphist,"<- done/vrele! (persist)", 0,0,0,0); - return; - } - - /* - * its a goner! - */ - - UVMHIST_LOG(maphist," its a goner (flushing)!", 0,0,0,0); - - uvn->u_flags |= UVM_VNODE_DYING; - - /* - * even though we may unlock in flush, no one can gain a reference - * to us until we clear the "dying" flag [because it blocks - * attaches]. we will not do that until after we've disposed of all - * the pages with uvn_flush(). note that before the flush the only - * pages that could be marked PG_BUSY are ones that are in async - * pageout by the daemon. (there can't be any pending "get"'s - * because there are no references to the object). 
- */ - - (void) uvn_flush(uobj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); - - UVMHIST_LOG(maphist," its a goner (done flush)!", 0,0,0,0); - - /* - * given the structure of this pager, the above flush request will - * create the following state: all the pages that were in the object - * have either been free'd or they are marked PG_BUSY|PG_RELEASED. - * the PG_BUSY bit was set either by us or the daemon for async I/O. - * in either case, if we have pages left we can't kill the object - * yet because i/o is pending. in this case we set the "relkill" - * flag which will cause pgo_releasepg to kill the object once all - * the I/O's are done [pgo_releasepg will be called from the aiodone - * routine or from the page daemon]. - */ - - if (uobj->uo_npages) { /* I/O pending. iodone will free */ -#ifdef DEBUG - /* - * XXXCDC: very unlikely to happen until we have async i/o - * so print a little info message in case it does. - */ - printf("uvn_detach: vn %p has pages left after flush - " - "relkill mode\n", uobj); -#endif - uvn->u_flags |= UVM_VNODE_RELKILL; - simple_unlock(&uobj->vmobjlock); - UVMHIST_LOG(maphist,"<- done! (releasepg will kill obj)", 0, 0, - 0, 0); - return; - } - - /* - * kill object now. note that we can't be on the sync q because - * all references are gone. - */ - if (uvn->u_flags & UVM_VNODE_WRITEABLE) { - simple_lock(&uvn_wl_lock); /* protect uvn_wlist */ - LIST_REMOVE(uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - } -#ifdef DIAGNOSTIC - if (uobj->memq.tqh_first != NULL) - panic("uvn_deref: vnode VM object still has pages afer " - "syncio/free flush"); -#endif - oldflags = uvn->u_flags; - uvn->u_flags = 0; - simple_unlock(&uobj->vmobjlock); - - /* wake up any sleepers */ - if (oldflags & UVM_VNODE_WANTED) - wakeup(uvn); - - /* - * drop our reference to the vnode. - */ - vrele(vp); - UVMHIST_LOG(maphist,"<- done (vrele) final", 0,0,0,0); - - return; -} - -/* - * uvm_vnp_terminate: external hook to clear out a vnode's VM - * - * called in two cases: - * [1] when a persisting vnode vm object (i.e. one with a zero reference - * count) needs to be freed so that a vnode can be reused. this - * happens under "getnewvnode" in vfs_subr.c. if the vnode from - * the free list is still attached (i.e. not VBAD) then vgone is - * called. as part of the vgone trace this should get called to - * free the vm object. this is the common case. - * [2] when a filesystem is being unmounted by force (MNT_FORCE, - * "umount -f") the vgone() function is called on active vnodes - * on the mounted file systems to kill their data (the vnodes become - * "dead" ones [see src/sys/miscfs/deadfs/...]). that results in a - * call here (even if the uvn is still in use -- i.e. has a non-zero - * reference count). this case happens at "umount -f" and during a - * "reboot/halt" operation. - * - * => the caller must XLOCK and VOP_LOCK the vnode before calling us - * [protects us from getting a vnode that is already in the DYING - * state...] - * => unlike uvn_detach, this function must not return until all the - * uvn's pages are disposed of. - * => in case [2] the uvn is still alive after this call, but all I/O - * ops will fail (due to the backing vnode now being "dead"). this - * will prob. kill any process using the uvn due to pgo_get failing. 
- */ - -void -uvm_vnp_terminate(vp) - struct vnode *vp; -{ - struct uvm_vnode *uvn = &vp->v_uvm; - int oldflags; - UVMHIST_FUNC("uvm_vnp_terminate"); UVMHIST_CALLED(maphist); - - /* - * lock object and check if it is valid - */ - simple_lock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist, " vp=0x%x, ref=%d, flag=0x%x", vp, - uvn->u_obj.uo_refs, uvn->u_flags, 0); - if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist, "<- done (not active)", 0, 0, 0, 0); - return; - } - - /* - * must be a valid uvn that is not already dying (because XLOCK - * protects us from that). the uvn can't in the ALOCK state - * because it is valid, and uvn's that are in the ALOCK state haven't - * been marked valid yet. - */ - -#ifdef DEBUG - /* - * debug check: are we yanking the vnode out from under our uvn? - */ - if (uvn->u_obj.uo_refs) { - printf("uvm_vnp_terminate(%p): terminating active vnode " - "(refs=%d)\n", uvn, uvn->u_obj.uo_refs); - } -#endif - - /* - * it is possible that the uvn was detached and is in the relkill - * state [i.e. waiting for async i/o to finish so that releasepg can - * kill object]. we take over the vnode now and cancel the relkill. - * we want to know when the i/o is done so we can recycle right - * away. note that a uvn can only be in the RELKILL state if it - * has a zero reference count. - */ - - if (uvn->u_flags & UVM_VNODE_RELKILL) - uvn->u_flags &= ~UVM_VNODE_RELKILL; /* cancel RELKILL */ - - /* - * block the uvn by setting the dying flag, and then flush the - * pages. (note that flush may unlock object while doing I/O, but - * it will re-lock it before it returns control here). - * - * also, note that we tell I/O that we are already VOP_LOCK'd so - * that uvn_io doesn't attempt to VOP_LOCK again. - * - * XXXCDC: setting VNISLOCKED on an active uvn which is being terminated - * due to a forceful unmount might not be a good idea. maybe we - * need a way to pass in this info to uvn_flush through a - * pager-defined PGO_ constant [currently there are none]. - */ - uvn->u_flags |= UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED; - - (void) uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); - - /* - * as we just did a flush we expect all the pages to be gone or in - * the process of going. sleep to wait for the rest to go [via iosync]. - */ - - while (uvn->u_obj.uo_npages) { -#ifdef DEBUG - struct vm_page *pp; - for (pp = uvn->u_obj.memq.tqh_first ; pp != NULL ; - pp = pp->listq.tqe_next) { - if ((pp->flags & PG_BUSY) == 0) - panic("uvm_vnp_terminate: detected unbusy pg"); - } - if (uvn->u_nio == 0) - panic("uvm_vnp_terminate: no I/O to wait for?"); - printf("uvm_vnp_terminate: waiting for I/O to fin.\n"); - /* - * XXXCDC: this is unlikely to happen without async i/o so we - * put a printf in just to keep an eye on it. - */ -#endif - uvn->u_flags |= UVM_VNODE_IOSYNC; - UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, FALSE, - "uvn_term",0); - simple_lock(&uvn->u_obj.vmobjlock); - } - - /* - * done. now we free the uvn if its reference count is zero - * (true if we are zapping a persisting uvn). however, if we are - * terminating a uvn with active mappings we let it live ... future - * calls down to the vnode layer will fail. - */ - - oldflags = uvn->u_flags; - if (uvn->u_obj.uo_refs) { - - /* - * uvn must live on it is dead-vnode state until all references - * are gone. restore flags. clear CANPERSIST state. 
- */ - - uvn->u_flags &= ~(UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED| - UVM_VNODE_WANTED|UVM_VNODE_CANPERSIST); - - } else { - - /* - * free the uvn now. note that the VREF reference is already - * gone [it is dropped when we enter the persist state]. - */ - if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) - panic("uvm_vnp_terminate: io sync wanted bit set"); - - if (uvn->u_flags & UVM_VNODE_WRITEABLE) { - simple_lock(&uvn_wl_lock); - LIST_REMOVE(uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - } - uvn->u_flags = 0; /* uvn is history, clear all bits */ - } - - if (oldflags & UVM_VNODE_WANTED) - wakeup(uvn); /* object lock still held */ - - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0); - + vrele((struct vnode *)uobj); } /* @@ -664,7 +272,7 @@ uvm_vnp_terminate(vp) * => returns TRUE if page's object is still alive, FALSE if we * killed the page's object. if we return TRUE, then we * return with the object locked. - * => if (nextpgp != NULL) => we return pageq.tqe_next here, and return + * => if (nextpgp != NULL) => we return the next page on the queue, and return * with the page queues locked [for pagedaemon] * => if (nextpgp == NULL) => we return with page queues unlocked [normal case] * => we kill the uvn if it is not referenced and we are suppose to @@ -676,11 +284,7 @@ uvn_releasepg(pg, nextpgp) struct vm_page *pg; struct vm_page **nextpgp; /* OUT */ { - struct uvm_vnode *uvn = (struct uvm_vnode *) pg->uobject; -#ifdef DIAGNOSTIC - if ((pg->flags & PG_RELEASED) == 0) - panic("uvn_releasepg: page not released!"); -#endif + KASSERT(pg->flags & PG_RELEASED); /* * dispose of the page [caller handles PG_WANTED] @@ -688,37 +292,11 @@ uvn_releasepg(pg, nextpgp) pmap_page_protect(pg, VM_PROT_NONE); uvm_lock_pageq(); if (nextpgp) - *nextpgp = pg->pageq.tqe_next; /* next page for daemon */ + *nextpgp = TAILQ_NEXT(pg, pageq); uvm_pagefree(pg); if (!nextpgp) uvm_unlock_pageq(); - /* - * now see if we need to kill the object - */ - if (uvn->u_flags & UVM_VNODE_RELKILL) { - if (uvn->u_obj.uo_refs) - panic("uvn_releasepg: kill flag set on referenced " - "object!"); - if (uvn->u_obj.uo_npages == 0) { - if (uvn->u_flags & UVM_VNODE_WRITEABLE) { - simple_lock(&uvn_wl_lock); - LIST_REMOVE(uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - } -#ifdef DIAGNOSTIC - if (uvn->u_obj.memq.tqh_first) - panic("uvn_releasepg: pages in object with npages == 0"); -#endif - if (uvn->u_flags & UVM_VNODE_WANTED) - /* still holding object lock */ - wakeup(uvn); - - uvn->u_flags = 0; /* DEAD! 
*/ - simple_unlock(&uvn->u_obj.vmobjlock); - return (FALSE); - } - } return (TRUE); } @@ -822,32 +400,48 @@ uvn_flush(uobj, start, stop, flags) voff_t start, stop; int flags; { - struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; + struct uvm_vnode *uvn = (struct uvm_vnode *)uobj; + struct vnode *vp = (struct vnode *)uobj; struct vm_page *pp, *ppnext, *ptmp; - struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp; + struct vm_page *pps[256], **ppsp; + int s; int npages, result, lcv; - boolean_t retval, need_iosync, by_list, needs_clean, all; + boolean_t retval, need_iosync, by_list, needs_clean, all, wasclean; voff_t curoff; u_short pp_version; UVMHIST_FUNC("uvn_flush"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist, "uobj %p start 0x%x stop 0x%x flags 0x%x", + uobj, start, stop, flags); + KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)); + +#ifdef DEBUG + if (uvn->u_size == VSIZENOTSET) { + printf("uvn_flush: size not set vp %p\n", uvn); + vprint("uvn_flush VSIZENOTSET", vp); + flags |= PGO_ALLPAGES; + } +#endif - curoff = 0; /* XXX: shut up gcc */ /* * get init vals and determine how we are going to traverse object */ + curoff = 0; need_iosync = FALSE; - retval = TRUE; /* return value */ + retval = TRUE; + wasclean = TRUE; if (flags & PGO_ALLPAGES) { all = TRUE; - by_list = TRUE; /* always go by the list */ + by_list = TRUE; } else { start = trunc_page(start); stop = round_page(stop); #ifdef DEBUG - if (stop > round_page(uvn->u_size)) - printf("uvn_flush: strange, got an out of range " - "flush (fixed)\n"); + if (stop > round_page(uvn->u_size)) { + printf("uvn_flush: oor vp %p start 0x%x stop 0x%x " + "size 0x%x\n", uvn, (int)start, (int)stop, + (int)round_page(uvn->u_size)); + } #endif all = FALSE; by_list = (uobj->uo_npages <= @@ -872,8 +466,7 @@ uvn_flush(uobj, start, stop, flags) if ((flags & PGO_CLEANIT) != 0 && uobj->pgops->pgo_mk_pcluster != NULL) { if (by_list) { - for (pp = uobj->memq.tqh_first ; pp != NULL ; - pp = pp->listq.tqe_next) { + TAILQ_FOREACH(pp, &uobj->memq, listq) { if (!all && (pp->offset < start || pp->offset >= stop)) continue; @@ -897,45 +490,32 @@ uvn_flush(uobj, start, stop, flags) */ if (by_list) { - pp = uobj->memq.tqh_first; + pp = TAILQ_FIRST(&uobj->memq); } else { curoff = start; pp = uvm_pagelookup(uobj, curoff); } - ppnext = NULL; /* XXX: shut up gcc */ - ppsp = NULL; /* XXX: shut up gcc */ - uvm_lock_pageq(); /* page queues locked */ + ppnext = NULL; + ppsp = NULL; + uvm_lock_pageq(); /* locked: both page queues and uobj */ for ( ; (by_list && pp != NULL) || - (!by_list && curoff < stop) ; pp = ppnext) { - + (!by_list && curoff < stop) ; pp = ppnext) { if (by_list) { - - /* - * range check - */ - if (!all && (pp->offset < start || pp->offset >= stop)) { - ppnext = pp->listq.tqe_next; + ppnext = TAILQ_NEXT(pp, listq); continue; } - } else { - - /* - * null check - */ - curoff += PAGE_SIZE; if (pp == NULL) { if (curoff < stop) ppnext = uvm_pagelookup(uobj, curoff); continue; } - } /* @@ -951,11 +531,11 @@ uvn_flush(uobj, start, stop, flags) if ((flags & PGO_CLEANIT) == 0 || (pp->flags & PG_BUSY) != 0) { needs_clean = FALSE; - if ((pp->flags & PG_BUSY) != 0 && - (flags & (PGO_CLEANIT|PGO_SYNCIO)) == + if ((flags & (PGO_CLEANIT|PGO_SYNCIO)) == (PGO_CLEANIT|PGO_SYNCIO)) need_iosync = TRUE; } else { + /* * freeing: nuke all mappings so we can sync * PG_CLEAN bit with no race @@ -967,8 +547,7 @@ uvn_flush(uobj, start, stop, flags) if ((pp->flags & PG_CLEAN) != 0 && pmap_is_modified(pp)) pp->flags &= ~(PG_CLEAN); - pp->flags |= PG_CLEANCHK; /* update 
"hint" */ - + pp->flags |= PG_CLEANCHK; needs_clean = ((pp->flags & PG_CLEAN) == 0); } @@ -976,15 +555,13 @@ uvn_flush(uobj, start, stop, flags) * if we don't need a clean... load ppnext and dispose of pp */ if (!needs_clean) { - /* load ppnext */ if (by_list) - ppnext = pp->listq.tqe_next; + ppnext = TAILQ_NEXT(pp, listq); else { if (curoff < stop) ppnext = uvm_pagelookup(uobj, curoff); } - /* now dispose of pp */ if (flags & PGO_DEACTIVATE) { if ((pp->pqflags & PQ_INACTIVE) == 0 && pp->wire_count == 0) { @@ -994,11 +571,9 @@ uvn_flush(uobj, start, stop, flags) } else if (flags & PGO_FREE) { if (pp->flags & PG_BUSY) { - /* release busy pages */ pp->flags |= PG_RELEASED; } else { pmap_page_protect(pp, VM_PROT_NONE); - /* removed page from object */ uvm_pagefree(pp); } } @@ -1015,6 +590,7 @@ uvn_flush(uobj, start, stop, flags) * note: locked: uobj and page queues. */ + wasclean = FALSE; pp->flags |= PG_BUSY; /* we 'own' page now */ UVM_PAGE_OWN(pp, "uvn_flush"); pmap_page_protect(pp, VM_PROT_READ); @@ -1025,7 +601,7 @@ ReTry: /* locked: page queues, uobj */ result = uvm_pager_put(uobj, pp, &ppsp, &npages, - flags | PGO_DOACTCLUST, start, stop); + flags | PGO_DOACTCLUST, start, stop); /* unlocked: page queues, uobj */ /* @@ -1048,7 +624,8 @@ ReTry: */ if (result == VM_PAGER_AGAIN) { - /* + + /* * it is unlikely, but page could have been released * while we had the object lock dropped. we ignore * this now and retry the I/O. we will detect and @@ -1075,27 +652,22 @@ ReTry: * we can move on to the next page. */ - if (result == VM_PAGER_PEND) { + if (result == VM_PAGER_PEND && + (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { - if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { - /* - * no per-page ops: refresh ppnext and continue - */ - if (by_list) { - if (pp->version == pp_version) - ppnext = pp->listq.tqe_next; - else - /* reset */ - ppnext = uobj->memq.tqh_first; - } else { - if (curoff < stop) - ppnext = uvm_pagelookup(uobj, - curoff); - } - continue; + /* + * no per-page ops: refresh ppnext and continue + */ + if (by_list) { + if (pp->version == pp_version) + ppnext = TAILQ_NEXT(pp, listq); + else + ppnext = TAILQ_FIRST(&uobj->memq); + } else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); } - - /* need to do anything here? 
*/ + continue; } /* @@ -1122,18 +694,19 @@ ReTry: /* set up next page for outer loop */ if (by_list) { if (pp->version == pp_version) - ppnext = pp->listq.tqe_next; + ppnext = TAILQ_NEXT(pp, listq); else - /* reset */ - ppnext = uobj->memq.tqh_first; + ppnext = TAILQ_FIRST( + &uobj->memq); } else { if (curoff < stop) - ppnext = uvm_pagelookup(uobj, curoff); + ppnext = uvm_pagelookup(uobj, + curoff); } } /* - * verify the page didn't get moved while obj was + * verify the page wasn't moved while obj was * unlocked */ if (result == VM_PAGER_PEND && ptmp->uobject != uobj) @@ -1147,26 +720,32 @@ ReTry: */ if (result != VM_PAGER_PEND) { - if (ptmp->flags & PG_WANTED) + if (ptmp->flags & PG_WANTED) { /* still holding object lock */ wakeup(ptmp); - + } ptmp->flags &= ~(PG_WANTED|PG_BUSY); UVM_PAGE_OWN(ptmp, NULL); if (ptmp->flags & PG_RELEASED) { - - /* pgo_releasepg wants this */ uvm_unlock_pageq(); - if (!uvn_releasepg(ptmp, NULL)) + if (!uvn_releasepg(ptmp, NULL)) { + UVMHIST_LOG(maphist, + "released %p", + ptmp, 0,0,0); return (TRUE); - - uvm_lock_pageq(); /* relock */ - continue; /* next page */ - + } + uvm_lock_pageq(); + continue; } else { - ptmp->flags |= (PG_CLEAN|PG_CLEANCHK); - if ((flags & PGO_FREE) == 0) - pmap_clear_modify(ptmp); + if ((flags & PGO_WEAK) == 0 && + !(result == VM_PAGER_ERROR && + curproc == uvm.pagedaemon_proc)) { + ptmp->flags |= + (PG_CLEAN|PG_CLEANCHK); + if ((flags & PGO_FREE) == 0) { + pmap_clear_modify(ptmp); + } + } } } @@ -1180,7 +759,6 @@ ReTry: pmap_page_protect(ptmp, VM_PROT_NONE); uvm_pagedeactivate(ptmp); } - } else if (flags & PGO_FREE) { if (result == VM_PAGER_PEND) { if ((ptmp->flags & PG_BUSY) != 0) @@ -1189,10 +767,10 @@ ReTry: } else { if (result != VM_PAGER_OK) { printf("uvn_flush: obj=%p, " - "offset=0x%llx. error " - "during pageout.\n", + "offset=0x%llx. error %d\n", pp->uobject, - (long long)pp->offset); + (long long)pp->offset, + result); printf("uvn_flush: WARNING: " "changes to page may be " "lost!\n"); @@ -1202,31 +780,36 @@ ReTry: uvm_pagefree(ptmp); } } - } /* end of "lcv" for loop */ - } /* end of "pp" for loop */ - /* - * done with pagequeues: unlock - */ uvm_unlock_pageq(); - - /* - * now wait for all I/O if required. - */ + if ((flags & PGO_CLEANIT) && all && wasclean && + LIST_FIRST(&vp->v_dirtyblkhd) == NULL && + (vp->v_flag & VONWORKLST)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } if (need_iosync) { - UVMHIST_LOG(maphist," <>",0,0,0,0); - while (uvn->u_nio != 0) { - uvn->u_flags |= UVM_VNODE_IOSYNC; - UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, - FALSE, "uvn_flush",0); + + /* + * XXX this doesn't use the new two-flag scheme, + * but to use that, all i/o initiators will have to change. + */ + + s = splbio(); + while (vp->v_numoutput != 0) { + UVMHIST_LOG(ubchist, "waiting for vp %p num %d", + vp, vp->v_numoutput,0,0); + + vp->v_flag |= VBWAIT; + UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, + &uvn->u_obj.vmobjlock, + FALSE, "uvn_flush",0); simple_lock(&uvn->u_obj.vmobjlock); } - if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) - wakeup(&uvn->u_flags); - uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED); + splx(s); } /* return, with object locked! 
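The synchronous-flush wait above no longer counts pager I/Os in the uvn's private u_nio field; it drains the vnode's own v_numoutput counter, sleeping with VBWAIT set, and a fully clean flush now also drops the vnode from the syncer worklist (VONWORKLST). The wait loop, condensed from the hunk:

	s = splbio();
	while (vp->v_numoutput != 0) {
		vp->v_flag |= VBWAIT;
		UVM_UNLOCK_AND_WAIT(&vp->v_numoutput,
		    &uvn->u_obj.vmobjlock, FALSE, "uvn_flush", 0);
		simple_lock(&uvn->u_obj.vmobjlock);
	}
	splx(s);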
*/ @@ -1250,31 +833,18 @@ uvn_cluster(uobj, offset, loffset, hoffset) voff_t offset; voff_t *loffset, *hoffset; /* OUT */ { - struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; + struct uvm_vnode *uvn = (struct uvm_vnode *)uobj; + *loffset = offset; - - if (*loffset >= uvn->u_size) - panic("uvn_cluster: offset out of range"); - - /* - * XXX: old pager claims we could use VOP_BMAP to get maxcontig value. - */ - *hoffset = *loffset + MAXBSIZE; - if (*hoffset > round_page(uvn->u_size)) /* past end? */ - *hoffset = round_page(uvn->u_size); - - return; + *hoffset = min(offset + MAXBSIZE, round_page(uvn->u_size)); } /* * uvn_put: flush page data to backing store. * - * => prefer map unlocked (not required) * => object must be locked! we will _unlock_ it before starting I/O. * => flags: PGO_SYNCIO -- use sync. I/O * => note: caller must set PG_CLEAN and pmap_clear_modify (if needed) - * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. - * [thus we never do async i/o! see iodone comment] */ static int @@ -1283,13 +853,11 @@ uvn_put(uobj, pps, npages, flags) struct vm_page **pps; int npages, flags; { - int retval; + struct vnode *vp = (struct vnode *)uobj; + int error; - /* note: object locked */ - retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE); - /* note: object unlocked */ - - return(retval); + error = VOP_PUTPAGES(vp, pps, npages, flags, NULL); + return uvm_errno2vmerror(error); } @@ -1310,551 +878,123 @@ uvn_get(uobj, offset, pps, npagesp, centeridx, access_type, advice, flags) voff_t offset; struct vm_page **pps; /* IN/OUT */ int *npagesp; /* IN (OUT if PGO_LOCKED) */ - int centeridx, advice, flags; + int centeridx; vm_prot_t access_type; + int advice, flags; { - voff_t current_offset; - struct vm_page *ptmp; - int lcv, result, gotpages; - boolean_t done; - UVMHIST_FUNC("uvn_get"); UVMHIST_CALLED(maphist); - UVMHIST_LOG(maphist, "flags=%d", flags,0,0,0); + struct vnode *vp = (struct vnode *)uobj; + int error; + UVMHIST_FUNC("uvn_get"); UVMHIST_CALLED(ubchist); - /* - * step 1: handled the case where fault data structures are locked. - */ - - if (flags & PGO_LOCKED) { - - /* - * gotpages is the current number of pages we've gotten (which - * we pass back up to caller via *npagesp. - */ - - gotpages = 0; - - /* - * step 1a: get pages that are already resident. only do this - * if the data structures are locked (i.e. the first time - * through). - */ - - done = TRUE; /* be optimistic */ - - for (lcv = 0, current_offset = offset ; lcv < *npagesp ; - lcv++, current_offset += PAGE_SIZE) { - - /* do we care about this page? if not, skip it */ - if (pps[lcv] == PGO_DONTCARE) - continue; - - /* lookup page */ - ptmp = uvm_pagelookup(uobj, current_offset); - - /* to be useful must get a non-busy, non-released pg */ - if (ptmp == NULL || - (ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { - if (lcv == centeridx || (flags & PGO_ALLPAGES) - != 0) - done = FALSE; /* need to do a wait or I/O! */ - continue; - } - - /* - * useful page: busy/lock it and plug it in our - * result array - */ - ptmp->flags |= PG_BUSY; /* loan up to caller */ - UVM_PAGE_OWN(ptmp, "uvn_get1"); - pps[lcv] = ptmp; - gotpages++; - - } /* "for" lcv loop */ - - /* - * XXX: given the "advice", should we consider async read-ahead? - * XXX: fault current does deactive of pages behind us. is - * this good (other callers might now). - */ - /* - * XXX: read-ahead currently handled by buffer cache (bread) - * level. - * XXX: no async i/o available. - * XXX: so we don't do anything now. 
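The new uvn_cluster() above is a one-liner: the cluster runs from the requested offset to MAXBSIZE further on, clamped to the page-rounded end of the file. The runnable check below mirrors that arithmetic; the MAXBSIZE and PAGE_SIZE values are assumptions for the demo, not the kernel's definitions.

#include <stdio.h>

#define MAXBSIZE	65536LL		/* assumed, illustrative only */
#define PAGE_SIZE	4096LL

static long long
round_page(long long x)
{
	return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
}

int
main(void)
{
	long long u_size = 100000;	/* file length in bytes */
	long long offset = 81920;	/* cluster anchor, page aligned */
	long long loffset, hoffset;

	loffset = offset;
	hoffset = offset + MAXBSIZE;
	if (hoffset > round_page(u_size))
		hoffset = round_page(u_size);	/* clamp at end of file */

	printf("cluster covers [%lld, %lld)\n", loffset, hoffset);
	return 0;
}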
- */ - - /* - * step 1c: now we've either done everything needed or we to - * unlock and do some waiting or I/O. - */ - - *npagesp = gotpages; /* let caller know */ - if (done) - return(VM_PAGER_OK); /* bingo! */ - else - /* EEK! Need to unlock and I/O */ - return(VM_PAGER_UNLOCK); - } - - /* - * step 2: get non-resident or busy pages. - * object is locked. data structures are unlocked. - * - * XXX: because we can't do async I/O at this level we get things - * page at a time (otherwise we'd chunk). the VOP_READ() will do - * async-read-ahead for us at a lower level. - */ - - for (lcv = 0, current_offset = offset ; - lcv < *npagesp ; lcv++, current_offset += PAGE_SIZE) { - - /* skip over pages we've already gotten or don't want */ - /* skip over pages we don't _have_ to get */ - if (pps[lcv] != NULL || (lcv != centeridx && - (flags & PGO_ALLPAGES) == 0)) - continue; - - /* - * we have yet to locate the current page (pps[lcv]). we first - * look for a page that is already at the current offset. if - * we fine a page, we check to see if it is busy or released. - * if that is the case, then we sleep on the page until it is - * no longer busy or released and repeat the lookup. if the - * page we found is neither busy nor released, then we busy it - * (so we own it) and plug it into pps[lcv]. this breaks the - * following while loop and indicates we are ready to move on - * to the next page in the "lcv" loop above. - * - * if we exit the while loop with pps[lcv] still set to NULL, - * then it means that we allocated a new busy/fake/clean page - * ptmp in the object and we need to do I/O to fill in the data. - */ - - while (pps[lcv] == NULL) { /* top of "pps" while loop */ - - /* look for a current page */ - ptmp = uvm_pagelookup(uobj, current_offset); - - /* nope? allocate one now (if we can) */ - if (ptmp == NULL) { - - ptmp = uvm_pagealloc(uobj, current_offset, - NULL, 0); - - /* out of RAM? */ - if (ptmp == NULL) { - simple_unlock(&uobj->vmobjlock); - uvm_wait("uvn_getpage"); - simple_lock(&uobj->vmobjlock); - - /* goto top of pps while loop */ - continue; - } - - /* - * got new page ready for I/O. break pps - * while loop. pps[lcv] is still NULL. - */ - break; - } - - /* page is there, see if we need to wait on it */ - if ((ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { - ptmp->flags |= PG_WANTED; - UVM_UNLOCK_AND_WAIT(ptmp, - &uobj->vmobjlock, FALSE, "uvn_get",0); - simple_lock(&uobj->vmobjlock); - continue; /* goto top of pps while loop */ - } - - /* - * if we get here then the page has become resident - * and unbusy between steps 1 and 2. we busy it - * now (so we own it) and set pps[lcv] (so that we - * exit the while loop). - */ - ptmp->flags |= PG_BUSY; - UVM_PAGE_OWN(ptmp, "uvn_get2"); - pps[lcv] = ptmp; - } - - /* - * if we own the a valid page at the correct offset, pps[lcv] - * will point to it. nothing more to do except go to the - * next page. - */ - - if (pps[lcv]) - continue; /* next lcv */ - - /* - * we have a "fake/busy/clean" page that we just allocated. do - * I/O to fill it with valid data. note that object must be - * locked going into uvn_io, but will be unlocked afterwards. - */ - - result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1, - PGO_SYNCIO, UIO_READ); - - /* - * I/O done. object is unlocked (by uvn_io). because we used - * syncio the result can not be PEND or AGAIN. we must relock - * and check for errors. - */ - - /* lock object. check for errors. 
*/ - simple_lock(&uobj->vmobjlock); - if (result != VM_PAGER_OK) { - if (ptmp->flags & PG_WANTED) - /* object lock still held */ - wakeup(ptmp); - - ptmp->flags &= ~(PG_WANTED|PG_BUSY); - UVM_PAGE_OWN(ptmp, NULL); - uvm_lock_pageq(); - uvm_pagefree(ptmp); - uvm_unlock_pageq(); - simple_unlock(&uobj->vmobjlock); - return(result); - } - - /* - * we got the page! clear the fake flag (indicates valid - * data now in page) and plug into our result array. note - * that page is still busy. - * - * it is the callers job to: - * => check if the page is released - * => unbusy the page - * => activate the page - */ - - ptmp->flags &= ~PG_FAKE; /* data is valid ... */ - pmap_clear_modify(ptmp); /* ... and clean */ - pps[lcv] = ptmp; - - } /* lcv loop */ - - /* - * finally, unlock object and return. - */ - - simple_unlock(&uobj->vmobjlock); - return (VM_PAGER_OK); + UVMHIST_LOG(ubchist, "vp %p off 0x%x", vp, (int)offset, 0,0); + error = VOP_GETPAGES(vp, offset, pps, npagesp, centeridx, + access_type, advice, flags); + return uvm_errno2vmerror(error); } + /* - * uvn_io: do I/O to a vnode - * - * => prefer map unlocked (not required) - * => object must be locked! we will _unlock_ it before starting I/O. - * => flags: PGO_SYNCIO -- use sync. I/O - * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. - * [thus we never do async i/o! see iodone comment] + * uvn_findpages: + * return the page for the uobj and offset requested, allocating if needed. + * => uobj must be locked. + * => returned page will be BUSY. */ +void +uvn_findpages(uobj, offset, npagesp, pps, flags) + struct uvm_object *uobj; + voff_t offset; + int *npagesp; + struct vm_page **pps; + int flags; +{ + int i, rv, npages; + + rv = 0; + npages = *npagesp; + for (i = 0; i < npages; i++, offset += PAGE_SIZE) { + rv += uvn_findpage(uobj, offset, &pps[i], flags); + } + *npagesp = rv; +} + static int -uvn_io(uvn, pps, npages, flags, rw) - struct uvm_vnode *uvn; - vm_page_t *pps; - int npages, flags, rw; +uvn_findpage(uobj, offset, pgp, flags) + struct uvm_object *uobj; + voff_t offset; + struct vm_page **pgp; + int flags; { - struct vnode *vn; - struct uio uio; - struct iovec iov; - vaddr_t kva; - off_t file_offset; - int waitf, result, mapinflags; - size_t got, wanted; - UVMHIST_FUNC("uvn_io"); UVMHIST_CALLED(maphist); + struct vm_page *pg; + UVMHIST_FUNC("uvn_findpage"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%lx", uobj, offset,0,0); - UVMHIST_LOG(maphist, "rw=%d", rw,0,0,0); - - /* - * init values - */ + if (*pgp != NULL) { + UVMHIST_LOG(ubchist, "dontcare", 0,0,0,0); + return 0; + } + for (;;) { + /* look for an existing page */ + pg = uvm_pagelookup(uobj, offset); - waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT; - vn = (struct vnode *) uvn; - file_offset = pps[0]->offset; - - /* - * check for sync'ing I/O. - */ - - while (uvn->u_flags & UVM_VNODE_IOSYNC) { - if (waitf == M_NOWAIT) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- try again (iosync)",0,0,0,0); - return(VM_PAGER_AGAIN); + /* nope? 
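uvn_findpages() above is the building block the new getpages/putpages code uses to look up or allocate the pages covering a file range, returning each one PG_BUSY. The fragment below is a hypothetical caller, not code from this change: it sizes the page array for the range of interest and passes a UFP_* flag from the hunk to control waiting; the caller owns unbusying the returned pages.

	struct vm_page *pgs[8];
	int npages = 8;		/* pages covering the range of interest */

	memset(pgs, 0, sizeof(pgs));
	simple_lock(&uobj->vmobjlock);
	uvn_findpages(uobj, trunc_page(offset), &npages, pgs, UFP_NOWAIT);
	simple_unlock(&uobj->vmobjlock);

	/* npages now holds how many slots were filled in; each returned
	 * page is PG_BUSY and must be unbusied by this caller. */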
allocate one now */ + if (pg == NULL) { + if (flags & UFP_NOALLOC) { + UVMHIST_LOG(ubchist, "noalloc", 0,0,0,0); + return 0; + } + if (uvm_pgcnt_vnode > + (uvmexp.active + uvmexp.inactive + uvmexp.wired + + uvmexp.free) * 7 / 8) { + pg = NULL; + } else { + pg = uvm_pagealloc(uobj, offset, NULL, 0); + uvm_pgcnt_vnode++; + } + if (pg == NULL) { + if (flags & UFP_NOWAIT) { + UVMHIST_LOG(ubchist, "nowait",0,0,0,0); + return 0; + } + simple_unlock(&uobj->vmobjlock); + uvm_wait("uvn_fp1"); + simple_lock(&uobj->vmobjlock); + continue; + } + UVMHIST_LOG(ubchist, "alloced",0,0,0,0); + break; + } else if (flags & UFP_NOCACHE) { + UVMHIST_LOG(ubchist, "nocache",0,0,0,0); + return 0; } - uvn->u_flags |= UVM_VNODE_IOSYNCWANTED; - UVM_UNLOCK_AND_WAIT(&uvn->u_flags, &uvn->u_obj.vmobjlock, - FALSE, "uvn_iosync",0); - simple_lock(&uvn->u_obj.vmobjlock); - } - /* - * check size - */ - - if (file_offset >= uvn->u_size) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- BAD (size check)",0,0,0,0); - return(VM_PAGER_BAD); - } - - /* - * first try and map the pages in (without waiting) - */ - - mapinflags = (rw == UIO_READ) ? - UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE; - - kva = uvm_pagermapin(pps, npages, NULL, mapinflags); - if (kva == 0 && waitf == M_NOWAIT) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- mapin failed (try again)",0,0,0,0); - return(VM_PAGER_AGAIN); - } - - /* - * ok, now bump u_nio up. at this point we are done with uvn - * and can unlock it. if we still don't have a kva, try again - * (this time with sleep ok). - */ - - uvn->u_nio++; /* we have an I/O in progress! */ - simple_unlock(&uvn->u_obj.vmobjlock); - /* NOTE: object now unlocked */ - if (kva == 0) - kva = uvm_pagermapin(pps, npages, NULL, - mapinflags | UVMPAGER_MAPIN_WAITOK); - - /* - * ok, mapped in. our pages are PG_BUSY so they are not going to - * get touched (so we can look at "offset" without having to lock - * the object). set up for I/O. - */ - - /* - * fill out uio/iov - */ - - iov.iov_base = (caddr_t) kva; - wanted = npages << PAGE_SHIFT; - if (file_offset + wanted > uvn->u_size) - wanted = uvn->u_size - file_offset; /* XXX: needed? */ - iov.iov_len = wanted; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = file_offset; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = rw; - uio.uio_resid = wanted; - uio.uio_procp = NULL; - - /* - * do the I/O! (XXX: curproc?) - */ - - UVMHIST_LOG(maphist, "calling VOP",0,0,0,0); - - /* - * This process may already have this vnode locked, if we faulted in - * copyin() or copyout() on a region backed by this vnode - * while doing I/O to the vnode. If this is the case, don't - * panic.. instead, return the error to the user. - * - * XXX this is a stopgap to prevent a panic. - * Ideally, this kind of operation *should* work. - */ - result = 0; - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) - result = vn_lock(vn, LK_EXCLUSIVE | LK_RETRY | LK_RECURSEFAIL); - - if (result == 0) { - /* NOTE: vnode now locked! */ - - if (rw == UIO_READ) - result = VOP_READ(vn, &uio, 0, curproc->p_ucred); - else - result = VOP_WRITE(vn, &uio, 0, curproc->p_ucred); - - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) - VOP_UNLOCK(vn, 0); - } - - /* NOTE: vnode now unlocked (unless vnislocked) */ - - UVMHIST_LOG(maphist, "done calling VOP",0,0,0,0); - - /* - * result == unix style errno (0 == OK!) - * - * zero out rest of buffer (if needed) - */ - - if (result == 0) { - got = wanted - uio.uio_resid; - - if (wanted && got == 0) { - result = EIO; /* XXX: error? 
*/ - } else if (got < PAGE_SIZE * npages && rw == UIO_READ) { - memset((void *) (kva + got), 0, - (npages << PAGE_SHIFT) - got); + /* page is there, see if we need to wait on it */ + if ((pg->flags & (PG_BUSY|PG_RELEASED)) != 0) { + if (flags & UFP_NOWAIT) { + UVMHIST_LOG(ubchist, "nowait",0,0,0,0); + return 0; + } + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0, + "uvn_fp2", 0); + simple_lock(&uobj->vmobjlock); + continue; } - } - - /* - * now remove pager mapping - */ - uvm_pagermapout(kva, npages); - - /* - * now clean up the object (i.e. drop I/O count) - */ - - simple_lock(&uvn->u_obj.vmobjlock); - /* NOTE: object now locked! */ - - uvn->u_nio--; /* I/O DONE! */ - if ((uvn->u_flags & UVM_VNODE_IOSYNC) != 0 && uvn->u_nio == 0) { - wakeup(&uvn->u_nio); - } - simple_unlock(&uvn->u_obj.vmobjlock); - /* NOTE: object now unlocked! */ - - /* - * done! - */ - - UVMHIST_LOG(maphist, "<- done (result %d)", result,0,0,0); - if (result == 0) - return(VM_PAGER_OK); - else - return(VM_PAGER_ERROR); -} - -/* - * uvm_vnp_uncache: disable "persisting" in a vnode... when last reference - * is gone we will kill the object (flushing dirty pages back to the vnode - * if needed). - * - * => returns TRUE if there was no uvm_object attached or if there was - * one and we killed it [i.e. if there is no active uvn] - * => called with the vnode VOP_LOCK'd [we will unlock it for I/O, if - * needed] - * - * => XXX: given that we now kill uvn's when a vnode is recycled (without - * having to hold a reference on the vnode) and given a working - * uvm_vnp_sync(), how does that effect the need for this function? - * [XXXCDC: seems like it can die?] - * - * => XXX: this function should DIE once we merge the VM and buffer - * cache. - * - * research shows that this is called in the following places: - * ext2fs_truncate, ffs_truncate, detrunc[msdosfs]: called when vnode - * changes sizes - * ext2fs_write, WRITE [ufs_readwrite], msdosfs_write: called when we - * are written to - * ex2fs_chmod, ufs_chmod: called if VTEXT vnode and the sticky bit - * is off - * ffs_realloccg: when we can't extend the current block and have - * to allocate a new one we call this [XXX: why?] - * nfsrv_rename, rename_files: called when the target filename is there - * and we want to remove it - * nfsrv_remove, sys_unlink: called on file we are removing - * nfsrv_access: if VTEXT and we want WRITE access and we don't uncache - * then return "text busy" - * nfs_open: seems to uncache any file opened with nfs - * vn_writechk: if VTEXT vnode and can't uncache return "text busy" - */ - -boolean_t -uvm_vnp_uncache(vp) - struct vnode *vp; -{ - struct uvm_vnode *uvn = &vp->v_uvm; - - /* - * lock uvn part of the vnode and check to see if we need to do anything - */ - - simple_lock(&uvn->u_obj.vmobjlock); - if ((uvn->u_flags & UVM_VNODE_VALID) == 0 || - (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { - simple_unlock(&uvn->u_obj.vmobjlock); - return(TRUE); - } - - /* - * we have a valid, non-blocked uvn. clear persist flag. - * if uvn is currently active we can return now. - */ - - uvn->u_flags &= ~UVM_VNODE_CANPERSIST; - if (uvn->u_obj.uo_refs) { - simple_unlock(&uvn->u_obj.vmobjlock); - return(FALSE); - } - - /* - * uvn is currently persisting! we have to gain a reference to - * it so that we can call uvn_detach to kill the uvn. 
- */ - - VREF(vp); /* seems ok, even with VOP_LOCK */ - uvn->u_obj.uo_refs++; /* value is now 1 */ - simple_unlock(&uvn->u_obj.vmobjlock); - - -#ifdef DEBUG - /* - * carry over sanity check from old vnode pager: the vnode should - * be VOP_LOCK'd, and we confirm it here. - */ - if (!VOP_ISLOCKED(vp)) { - boolean_t is_ok_anyway = FALSE; -#ifdef NFS - extern int (**nfsv2_vnodeop_p) __P((void *)); - extern int (**spec_nfsv2nodeop_p) __P((void *)); - extern int (**fifo_nfsv2nodeop_p) __P((void *)); - - /* vnode is NOT VOP_LOCKed: some vnode types _never_ lock */ - if (vp->v_op == nfsv2_vnodeop_p || - vp->v_op == spec_nfsv2nodeop_p) { - is_ok_anyway = TRUE; + + /* skip PG_RDONLY pages if requested */ + if ((flags & UFP_NORDONLY) && (pg->flags & PG_RDONLY)) { + UVMHIST_LOG(ubchist, "nordonly",0,0,0,0); + return 0; } - if (vp->v_op == fifo_nfsv2nodeop_p) { - is_ok_anyway = TRUE; - } -#endif /* NFS */ - if (!is_ok_anyway) - panic("uvm_vnp_uncache: vnode not locked!"); - } -#endif /* DEBUG */ - /* - * now drop our reference to the vnode. if we have the sole - * reference to the vnode then this will cause it to die [as we - * just cleared the persist flag]. we have to unlock the vnode - * while we are doing this as it may trigger I/O. - * - * XXX: it might be possible for uvn to get reclaimed while we are - * unlocked causing us to return TRUE when we should not. we ignore - * this as a false-positive return value doesn't hurt us. - */ - VOP_UNLOCK(vp, 0); - uvn_detach(&uvn->u_obj); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - - /* - * and return... - */ - - return(TRUE); + /* mark the page BUSY and we're done. */ + pg->flags |= PG_BUSY; + UVM_PAGE_OWN(pg, "uvn_findpage"); + UVMHIST_LOG(ubchist, "found",0,0,0,0); + break; + } + *pgp = pg; + return 1; } /* @@ -1881,150 +1021,48 @@ uvm_vnp_setsize(vp, newsize) voff_t newsize; { struct uvm_vnode *uvn = &vp->v_uvm; + UVMHIST_FUNC("uvm_vnp_setsize"); UVMHIST_CALLED(ubchist); - /* - * lock uvn and check for valid object, and if valid: do it! - */ simple_lock(&uvn->u_obj.vmobjlock); - if (uvn->u_flags & UVM_VNODE_VALID) { - /* - * now check if the size has changed: if we shrink we had better - * toss some pages... - */ - - if (uvn->u_size > newsize) { - (void)uvn_flush(&uvn->u_obj, newsize, - uvn->u_size, PGO_FREE); - } - uvn->u_size = newsize; - } - simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(ubchist, "old 0x%x new 0x%x", uvn->u_size, newsize, 0,0); /* - * done + * now check if the size has changed: if we shrink we had better + * toss some pages... */ - return; + + if (uvn->u_size > newsize && uvn->u_size != VSIZENOTSET) { + (void) uvn_flush(&uvn->u_obj, newsize, uvn->u_size, PGO_FREE); + } + uvn->u_size = newsize; + simple_unlock(&uvn->u_obj.vmobjlock); } /* - * uvm_vnp_sync: flush all dirty VM pages back to their backing vnodes. - * - * => called from sys_sync with no VM structures locked - * => only one process can do a sync at a time (because the uvn - * structure only has one queue for sync'ing). we ensure this - * by holding the uvn_sync_lock while the sync is in progress. - * other processes attempting a sync will sleep on this lock - * until we are done. + * uvm_vnp_zerorange: set a range of bytes in a file to zero. */ void -uvm_vnp_sync(mp) - struct mount *mp; -{ - struct uvm_vnode *uvn; +uvm_vnp_zerorange(vp, off, len) struct vnode *vp; - boolean_t got_lock; + off_t off; + size_t len; +{ + void *win; - /* - * step 1: ensure we are only ones using the uvn_sync_q by locking - * our lock... 
- */ - lockmgr(&uvn_sync_lock, LK_EXCLUSIVE, (void *)0); + /* + * XXXUBC invent kzero() and use it + */ - /* - * step 2: build up a simpleq of uvns of interest based on the - * write list. we gain a reference to uvns of interest. must - * be careful about locking uvn's since we will be holding uvn_wl_lock - * in the body of the loop. - */ - SIMPLEQ_INIT(&uvn_sync_q); - simple_lock(&uvn_wl_lock); - for (uvn = uvn_wlist.lh_first ; uvn != NULL ; - uvn = uvn->u_wlist.le_next) { + while (len) { + vsize_t bytelen = len; - vp = (struct vnode *) uvn; - if (mp && vp->v_mount != mp) - continue; + win = ubc_alloc(&vp->v_uvm.u_obj, off, &bytelen, UBC_WRITE); + memset(win, 0, bytelen); + ubc_release(win, 0); - /* attempt to gain reference */ - while ((got_lock = simple_lock_try(&uvn->u_obj.vmobjlock)) == - FALSE && - (uvn->u_flags & UVM_VNODE_BLOCKED) == 0) - /* spin */ ; - - /* - * we will exit the loop if either if the following are true: - * - we got the lock [always true if NCPU == 1] - * - we failed to get the lock but noticed the vnode was - * "blocked" -- in this case the vnode must be a dying - * vnode, and since dying vnodes are in the process of - * being flushed out, we can safely skip this one - * - * we want to skip over the vnode if we did not get the lock, - * or if the vnode is already dying (due to the above logic). - * - * note that uvn must already be valid because we found it on - * the wlist (this also means it can't be ALOCK'd). - */ - if (!got_lock || (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { - if (got_lock) - simple_unlock(&uvn->u_obj.vmobjlock); - continue; /* skip it */ - } - - /* - * gain reference. watch out for persisting uvns (need to - * regain vnode REF). - */ - if (uvn->u_obj.uo_refs == 0) - VREF(vp); - uvn->u_obj.uo_refs++; - simple_unlock(&uvn->u_obj.vmobjlock); - - /* - * got it! - */ - SIMPLEQ_INSERT_HEAD(&uvn_sync_q, uvn, u_syncq); - } - simple_unlock(&uvn_wl_lock); - - /* - * step 3: we now have a list of uvn's that may need cleaning. - * we are holding the uvn_sync_lock, but have dropped the uvn_wl_lock - * (so we can now safely lock uvn's again). - */ - - for (uvn = uvn_sync_q.sqh_first ; uvn ; uvn = uvn->u_syncq.sqe_next) { - simple_lock(&uvn->u_obj.vmobjlock); -#ifdef DEBUG - if (uvn->u_flags & UVM_VNODE_DYING) { - printf("uvm_vnp_sync: dying vnode on sync list\n"); - } -#endif - uvn_flush(&uvn->u_obj, 0, 0, - PGO_CLEANIT|PGO_ALLPAGES|PGO_DOACTCLUST); - - /* - * if we have the only reference and we just cleaned the uvn, - * then we can pull it out of the UVM_VNODE_WRITEABLE state - * thus allowing us to avoid thinking about flushing it again - * on later sync ops. - */ - if (uvn->u_obj.uo_refs == 1 && - (uvn->u_flags & UVM_VNODE_WRITEABLE)) { - LIST_REMOVE(uvn, u_wlist); - uvn->u_flags &= ~UVM_VNODE_WRITEABLE; - } - - simple_unlock(&uvn->u_obj.vmobjlock); - - /* now drop our reference to the uvn */ - uvn_detach(&uvn->u_obj); - } - - /* - * done! 
release sync lock - */ - lockmgr(&uvn_sync_lock, LK_RELEASE, (void *)0); + off += bytelen; + len -= bytelen; + } } diff --git a/sys/uvm/uvm_vnode.h b/sys/uvm/uvm_vnode.h index cdeec76e3ced..f0249e8dd140 100644 --- a/sys/uvm/uvm_vnode.h +++ b/sys/uvm/uvm_vnode.h @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_vnode.h,v 1.9 2000/03/26 20:54:48 kleink Exp $ */ +/* $NetBSD: uvm_vnode.h,v 1.10 2000/11/27 08:40:06 chs Exp $ */ /* * @@ -54,56 +54,6 @@ struct uvm_vnode { int u_flags; /* flags */ int u_nio; /* number of running I/O requests */ voff_t u_size; /* size of object */ - - /* the following entry is locked by uvn_wl_lock */ - LIST_ENTRY(uvm_vnode) u_wlist; /* list of writeable vnode objects */ - - /* the following entry is locked by uvn_sync_lock */ - SIMPLEQ_ENTRY(uvm_vnode) u_syncq; /* vnode objects due for a "sync" */ }; -/* - * u_flags values - */ -#define UVM_VNODE_VALID 0x001 /* we are attached to the vnode */ -#define UVM_VNODE_CANPERSIST 0x002 /* we can persist after ref == 0 */ -#define UVM_VNODE_ALOCK 0x004 /* uvn_attach is locked out */ -#define UVM_VNODE_DYING 0x008 /* final detach/terminate in - progress */ -#define UVM_VNODE_RELKILL 0x010 /* uvn should be killed by releasepg - when final i/o is done */ -#define UVM_VNODE_WANTED 0x020 /* someone is waiting for alock, - dying, or relkill to clear */ -#define UVM_VNODE_VNISLOCKED 0x040 /* underlying vnode struct is locked - (valid when DYING is true) */ -#define UVM_VNODE_IOSYNC 0x080 /* I/O sync in progress ... setter - sleeps on &uvn->u_nio */ -#define UVM_VNODE_IOSYNCWANTED 0x100 /* a process is waiting for the - i/o sync to clear so it can do - i/o */ -#define UVM_VNODE_WRITEABLE 0x200 /* uvn has pages that are writeable */ - -/* - * UVM_VNODE_BLOCKED: any condition that should new processes from - * touching the vnode [set WANTED and sleep to wait for it to clear] - */ -#define UVM_VNODE_BLOCKED (UVM_VNODE_ALOCK|UVM_VNODE_DYING|UVM_VNODE_RELKILL) - -#ifdef _KERNEL - -/* - * prototypes - */ - -#if 0 -/* - * moved uvn_attach to uvm_extern.h because uvm_vnode.h is needed to - * include sys/vnode.h, and files that include sys/vnode.h don't know - * what a vm_prot_t is. - */ -struct uvm_object *uvn_attach __P((void *, vm_prot_t)); -#endif - -#endif /* _KERNEL */ - #endif /* _UVM_UVM_VNODE_H_ */
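
For illustration, the ubc_alloc()/ubc_release() windowing pattern used by uvm_vnp_zerorange() above is the same one a filesystem's converted write path uses once the old uvn_io()-style pager mapping is gone. The fragment below is only a minimal sketch and is not part of this commit: somefs_write_window() is a hypothetical helper name, the include list is what such kernel code would normally need in this tree, "filesize" is assumed to have been extended already, and a real write path would also call uvm_vnp_setsize() and mark the vnode dirty.

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/uio.h>
	#include <sys/vnode.h>
	#include <uvm/uvm_extern.h>

	/*
	 * Sketch only: copy data into a vnode's pages through UBC windows,
	 * in the style of uvm_vnp_zerorange() above.  Error handling and
	 * size extension are deliberately simplified.
	 */
	int
	somefs_write_window(vp, uio, filesize)
		struct vnode *vp;
		struct uio *uio;
		off_t filesize;
	{
		void *win;
		vsize_t bytelen;
		int error = 0;

		while (uio->uio_resid > 0) {
			/* clip the window to the end of the file */
			bytelen = MIN(filesize - uio->uio_offset,
			    uio->uio_resid);
			if (bytelen == 0)
				break;

			/* map a window over the vnode's object, copy into it */
			win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
			    &bytelen, UBC_WRITE);
			error = uiomove(win, bytelen, uio);
			ubc_release(win, 0);
			if (error)
				break;
		}
		return (error);
	}

The point of the pattern is that data moves through the page cache directly via uiomove() into a temporarily mapped window, instead of going through the uvm_pagermapin()/VOP_READ/VOP_WRITE round trip that the deleted uvn_io() path used.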