Initial integration of the Unified Buffer Cache project.

parent c29a1b4461
commit aeda8d3b77
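The change that repeats through the filesystem hunks below (adosfs, filecorefs,
cd9660) replaces the buffer-cache read loop for regular files with reads through
page-cache windows. Condensed from those hunks, the new read path looks like
this (error handling trimmed; i_size stands in for each filesystem's own
file-size field):

	while (uio->uio_resid > 0) {
		void *win;
		vsize_t bytelen = min(i_size - uio->uio_offset,
		    uio->uio_resid);

		if (bytelen == 0)
			break;
		/*
		 * map a window over the vnode's pages at uio_offset;
		 * ubc_alloc() may trim bytelen to the window size.
		 */
		win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
		    &bytelen, UBC_READ);
		error = uiomove(win, bytelen, uio);	/* copy through it */
		ubc_release(win, 0);			/* unmap the window */
		if (error)
			break;
	}

Data is copied to and from the page cache directly, so the same pages back
both read()/write() and mmap().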
Index: advnops.c
@@ -1,4 +1,4 @@
-/* $NetBSD: advnops.c,v 1.54 2000/08/03 00:54:23 thorpej Exp $ */
+/* $NetBSD: advnops.c,v 1.55 2000/11/27 08:39:39 chs Exp $ */
 
 /*
  * Copyright (c) 1994 Christian E. Hopps
@@ -143,7 +143,9 @@ struct vnodeopv_entry_desc adosfs_vnodeop_entries[] = {
 	{ &vop_truncate_desc, adosfs_truncate },	/* truncate */
 	{ &vop_update_desc, adosfs_update },		/* update */
 	{ &vop_bwrite_desc, adosfs_bwrite },		/* bwrite */
-	{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
+	{ &vop_getpages_desc, genfs_getpages },		/* getpages */
+	{ &vop_size_desc, genfs_size },			/* size */
+	{ NULL, NULL }
 };
 
 struct vnodeopv_desc adosfs_vnodeop_opv_desc =
@@ -226,6 +228,7 @@ adosfs_read(v)
 		int a_ioflag;
 		struct ucred *a_cred;
 	} */ *sp = v;
 	struct vnode *vp = sp->a_vp;
+	struct adosfsmount *amp;
 	struct anode *ap;
 	struct uio *uio;
@@ -265,6 +268,28 @@ adosfs_read(v)
 	/*
 	 * taken from ufs_read()
 	 */
+
+	if (vp->v_type == VREG) {
+		error = 0;
+		while (uio->uio_resid > 0) {
+			void *win;
+			vsize_t bytelen = min(ap->fsize - uio->uio_offset,
+			    uio->uio_resid);
+
+			if (bytelen == 0) {
+				break;
+			}
+			win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
+			    &bytelen, UBC_READ);
+			error = uiomove(win, bytelen, uio);
+			ubc_release(win, 0);
+			if (error) {
+				break;
+			}
+		}
+		goto out;
+	}
+
 	do {
 		/*
 		 * we are only supporting ADosFFS currently
@@ -326,6 +351,8 @@ adosfs_read(v)
 		    amp->bsize - amp->dbsize, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
+
+out:
 reterr:
 #ifdef ADOSFS_DIAGNOSTIC
 	printf(" %d)", error);
Index: md_root.c
@@ -1,4 +1,4 @@
-/* $NetBSD: md_root.c,v 1.14 2000/01/21 23:29:02 thorpej Exp $ */
+/* $NetBSD: md_root.c,v 1.15 2000/11/27 08:39:40 chs Exp $ */
 
 /*
  * Copyright (c) 1996 Leo Weppelman.
@@ -159,7 +159,6 @@ struct proc *proc;
 	 * Initialize our buffer header:
 	 */
 	memset(&buf, 0, sizeof(buf));
-	buf.b_rcred = buf.b_wcred = proc->p_ucred;
 	buf.b_vnbufs.le_next = NOLIST;
 	buf.b_flags = B_BUSY;
 	buf.b_dev = ld_dev;
Index: coda_subr.c
@@ -1,4 +1,4 @@
-/* $NetBSD: coda_subr.c,v 1.9 2000/03/30 11:24:16 augustss Exp $ */
+/* $NetBSD: coda_subr.c,v 1.10 2000/11/27 08:39:40 chs Exp $ */
 
 /*
  *
@@ -227,7 +227,7 @@ coda_kill(whoIam, dcstat)
 #endif
 		count++;
 		CODADEBUG(CODA_FLUSH,
-			  myprintf(("Live cnode fid %lx.%lx.%lx flags %d count %ld\n",
+			  myprintf(("Live cnode fid %lx.%lx.%lx flags %d count %d\n",
 				    (cp->c_fid).Volume,
 				    (cp->c_fid).Vnode,
 				    (cp->c_fid).Unique,
@@ -277,7 +277,7 @@ coda_testflush(void)
 	for (cp = coda_cache[hash];
 	     cp != NULL;
 	     cp = CNODE_NEXT(cp)) {
-		myprintf(("Live cnode fid %lx.%lx.%lx count %ld\n",
+		myprintf(("Live cnode fid %lx.%lx.%lx count %d\n",
			  (cp->c_fid).Volume,(cp->c_fid).Vnode,
			  (cp->c_fid).Unique, CTOV(cp)->v_usecount));
 	}
@@ -424,7 +424,7 @@ int handleDownCall(opcode, out)
 	    if (CTOV(cp)->v_flag & VTEXT)
 		error = coda_vmflush(cp);
-	    CODADEBUG(CODA_ZAPFILE, myprintf(("zapfile: fid = (%lx.%lx.%lx), refcnt = %ld, error = %d\n",
+	    CODADEBUG(CODA_ZAPFILE, myprintf(("zapfile: fid = (%lx.%lx.%lx), refcnt = %d, error = %d\n",
				cp->c_fid.Volume,
				cp->c_fid.Vnode,
				cp->c_fid.Unique,
@@ -452,7 +452,7 @@ int handleDownCall(opcode, out)
 	    coda_nc_zapParentfid(&out->coda_zapdir.CodaFid, IS_DOWNCALL);
 
-	    CODADEBUG(CODA_ZAPDIR, myprintf(("zapdir: fid = (%lx.%lx.%lx), refcnt = %ld\n",cp->c_fid.Volume,
+	    CODADEBUG(CODA_ZAPDIR, myprintf(("zapdir: fid = (%lx.%lx.%lx), refcnt = %d\n",cp->c_fid.Volume,
			     cp->c_fid.Vnode,
			     cp->c_fid.Unique,
			     CTOV(cp)->v_usecount - 1)););
@@ -486,7 +486,7 @@ int handleDownCall(opcode, out)
 
 		error = coda_vmflush(cp);
 	    }
-	    CODADEBUG(CODA_PURGEFID, myprintf(("purgefid: fid = (%lx.%lx.%lx), refcnt = %ld, error = %d\n",
+	    CODADEBUG(CODA_PURGEFID, myprintf(("purgefid: fid = (%lx.%lx.%lx), refcnt = %d, error = %d\n",
				cp->c_fid.Volume, cp->c_fid.Vnode,
				cp->c_fid.Unique,
				CTOV(cp)->v_usecount - 1, error)););
Index: coda_vnops.c
@@ -6,7 +6,7 @@ mkdir
 rmdir
 symlink
 */
-/* $NetBSD: coda_vnops.c,v 1.21 2000/09/19 22:00:01 fvdl Exp $ */
+/* $NetBSD: coda_vnops.c,v 1.22 2000/11/27 08:39:40 chs Exp $ */
 
 /*
  *
@@ -453,7 +453,7 @@ printf("coda_rdwr: Internally Opening %p\n", vp);
 	}
 
 	/* Have UFS handle the call. */
-	CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = (%lx.%lx.%lx), refcnt = %ld\n",
+	CODADEBUG(CODA_RDWR, myprintf(("indirect rdwr: fid = (%lx.%lx.%lx), refcnt = %d\n",
				cp->c_fid.Volume, cp->c_fid.Vnode,
				cp->c_fid.Unique, CTOV(cp)->v_usecount)); )
 
@@ -873,9 +873,9 @@ coda_inactive(v)
 
 	if (IS_UNMOUNTING(cp)) {
 #ifdef DEBUG
-		printf("coda_inactive: IS_UNMOUNTING use %ld: vp %p, cp %p\n", vp->v_usecount, vp, cp);
+		printf("coda_inactive: IS_UNMOUNTING use %d: vp %p, cp %p\n", vp->v_usecount, vp, cp);
 		if (cp->c_ovp != NULL)
-			printf("coda_inactive: cp->ovp != NULL use %ld: vp %p, cp %p\n",
+			printf("coda_inactive: cp->ovp != NULL use %d: vp %p, cp %p\n",
			       vp->v_usecount, vp, cp);
 #endif
 		lockmgr(&vp->v_lock, LK_RELEASE, &vp->v_interlock);
@@ -1727,7 +1727,7 @@ printf("coda_readdir: Internally Opening %p\n", vp);
 	}
 
 	/* Have UFS handle the call. */
-	CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = (%lx.%lx.%lx), refcnt = %ld\n",cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_usecount)); )
+	CODADEBUG(CODA_READDIR, myprintf(("indirect readdir: fid = (%lx.%lx.%lx), refcnt = %d\n",cp->c_fid.Volume, cp->c_fid.Vnode, cp->c_fid.Unique, vp->v_usecount)); )
 	error = VOP_READDIR(cp->c_ovp, uiop, cred, eofflag, cookies,
			    ncookies);
 	if (error)
Index: files
@@ -1,4 +1,4 @@
-#	$NetBSD: files,v 1.404 2000/11/26 17:44:02 ad Exp $
+#	$NetBSD: files,v 1.405 2000/11/27 08:39:41 chs Exp $
 
 #	@(#)files.newconf	7.5 (Berkeley) 5/10/93
 
@@ -1060,6 +1060,7 @@ file ufs/ufs/ufs_vnops.c ffs | lfs | mfs | ext2fs
 file	uvm/uvm_amap.c
 file	uvm/uvm_anon.c
 file	uvm/uvm_aobj.c
+file	uvm/uvm_bio.c
 file	uvm/uvm_device.c
 file	uvm/uvm_fault.c
 file	uvm/uvm_glue.c
Index: vnd.c
@@ -1,4 +1,4 @@
-/* $NetBSD: vnd.c,v 1.68 2000/09/12 08:03:24 enami Exp $ */
+/* $NetBSD: vnd.c,v 1.69 2000/11/27 08:39:41 chs Exp $ */
 
 /*-
  * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc.
@@ -445,29 +445,7 @@ vndstrategy(bp)
 		nbp->vb_buf.b_proc = bp->b_proc;
 		nbp->vb_buf.b_iodone = vndiodone;
 		nbp->vb_buf.b_vp = NULLVP;
-		nbp->vb_buf.b_rcred = vnd->sc_cred;	/* XXX crdup? */
-		nbp->vb_buf.b_wcred = vnd->sc_cred;	/* XXX crdup? */
 		LIST_INIT(&nbp->vb_buf.b_dep);
-		if (bp->b_dirtyend == 0) {
-			nbp->vb_buf.b_dirtyoff = 0;
-			nbp->vb_buf.b_dirtyend = sz;
-		} else {
-			nbp->vb_buf.b_dirtyoff =
-			    max(0, bp->b_dirtyoff - (bp->b_bcount - resid));
-			nbp->vb_buf.b_dirtyend =
-			    min(sz,
-				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
-		}
-		if (bp->b_validend == 0) {
-			nbp->vb_buf.b_validoff = 0;
-			nbp->vb_buf.b_validend = sz;
-		} else {
-			nbp->vb_buf.b_validoff =
-			    max(0, bp->b_validoff - (bp->b_bcount - resid));
-			nbp->vb_buf.b_validend =
-			    min(sz,
-				max(0, bp->b_validend - (bp->b_bcount-resid)));
-		}
 
 		nbp->vb_xfer = vnx;
 
Index: filecore_vfsops.c
@@ -1,4 +1,4 @@
-/* $NetBSD: filecore_vfsops.c,v 1.11 2000/03/16 18:08:22 jdolecek Exp $ */
+/* $NetBSD: filecore_vfsops.c,v 1.12 2000/11/27 08:39:41 chs Exp $ */
 
 /*-
  * Copyright (c) 1998 Andrew McMurry
@@ -324,6 +324,9 @@ filecore_mountfs(devvp, mp, p, argp)
 	mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_FILECORE);
 	mp->mnt_maxsymlinklen = 0;
 	mp->mnt_flag |= MNT_LOCAL;
+	mp->mnt_dev_bshift = fcdr->log2secsize;
+	mp->mnt_fs_bshift = fcmp->log2bsize;
+
 	fcmp->fc_mountp = mp;
 	fcmp->fc_dev = dev;
 	fcmp->fc_devvp = devvp;
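The two mount fields added here (and in the cd9660 hunk below) record the
filesystem and device block sizes as shifts; genfs_getpages()/genfs_putpages()
later in this commit use them to turn byte offsets into block numbers without
per-filesystem macros. A sketch of that arithmetic, using the names from the
genfs code:

	int fs_bshift = vp->v_mount->mnt_fs_bshift;	/* log2(fs block size) */
	int dev_bshift = vp->v_mount->mnt_dev_bshift;	/* log2(dev block size) */
	daddr_t lbn = offset >> fs_bshift;		/* logical fs block */
	/* device block to transfer, adjusted for a partial fs block: */
	daddr_t dblkno = blkno + ((offset - (lbn << fs_bshift)) >> dev_bshift);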
Index: filecore_vnops.c
@@ -1,4 +1,4 @@
-/* $NetBSD: filecore_vnops.c,v 1.9 2000/08/03 03:38:39 thorpej Exp $ */
+/* $NetBSD: filecore_vnops.c,v 1.10 2000/11/27 08:39:42 chs Exp $ */
 
 /*-
  * Copyright (c) 1998 Andrew McMurry
@@ -162,6 +162,28 @@ filecore_read(v)
 		return (EINVAL);
 	ip->i_flag |= IN_ACCESS;
 	fcmp = ip->i_mnt;
+
+	if (vp->v_type == VREG) {
+		error = 0;
+		while (uio->uio_resid > 0) {
+			void *win;
+			vsize_t bytelen = min(ip->i_size - uio->uio_offset,
+			    uio->uio_resid);
+
+			if (bytelen == 0) {
+				break;
+			}
+			win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
+			    &bytelen, UBC_READ);
+			error = uiomove(win, bytelen, uio);
+			ubc_release(win, 0);
+			if (error) {
+				break;
+			}
+		}
+		goto out;
+	}
+
 	do {
 		lbn = lblkno(fcmp, uio->uio_offset);
 		on = blkoff(fcmp, uio->uio_offset);
@@ -213,6 +235,8 @@ filecore_read(v)
 #endif
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
+
+out:
 	return (error);
 }
 
@@ -571,7 +595,9 @@ struct vnodeopv_entry_desc filecore_vnodeop_entries[] = {
 	{ &vop_truncate_desc, filecore_truncate },	/* truncate */
 	{ &vop_update_desc, filecore_update },		/* update */
 	{ &vop_bwrite_desc, vn_bwrite },		/* bwrite */
-	{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
+	{ &vop_getpages_desc, genfs_getpages },		/* getpages */
+	{ &vop_size_desc, genfs_size },			/* size */
+	{ NULL, NULL }
 };
 struct vnodeopv_desc filecore_vnodeop_opv_desc =
 	{ &filecore_vnodeop_p, filecore_vnodeop_entries };
Index: cd9660_vfsops.c
@@ -1,4 +1,4 @@
-/* $NetBSD: cd9660_vfsops.c,v 1.49 2000/07/15 21:40:44 jdolecek Exp $ */
+/* $NetBSD: cd9660_vfsops.c,v 1.50 2000/11/27 08:39:42 chs Exp $ */
 
 /*-
  * Copyright (c) 1994
@@ -399,6 +399,8 @@ iso_mountfs(devvp, mp, p, argp)
 	mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_CD9660);
 	mp->mnt_maxsymlinklen = 0;
 	mp->mnt_flag |= MNT_LOCAL;
+	mp->mnt_dev_bshift = iso_bsize;
+	mp->mnt_fs_bshift = isomp->im_bshift;
 	isomp->im_mountp = mp;
 	isomp->im_dev = dev;
 	isomp->im_devvp = devvp;
Index: cd9660_vnops.c
@@ -1,4 +1,4 @@
-/* $NetBSD: cd9660_vnops.c,v 1.60 2000/11/14 22:26:32 thorpej Exp $ */
+/* $NetBSD: cd9660_vnops.c,v 1.61 2000/11/27 08:39:42 chs Exp $ */
 
 /*-
  * Copyright (c) 1994
@@ -278,6 +278,26 @@ cd9660_read(v)
 		return (EINVAL);
 	ip->i_flag |= IN_ACCESS;
 	imp = ip->i_mnt;
+
+	if (vp->v_type == VREG) {
+		error = 0;
+		while (uio->uio_resid > 0) {
+			void *win;
+			vsize_t bytelen = min(ip->i_size - uio->uio_offset,
+			    uio->uio_resid);
+
+			if (bytelen == 0)
+				break;
+			win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
+			    &bytelen, UBC_READ);
+			error = uiomove(win, bytelen, uio);
+			ubc_release(win, 0);
+			if (error)
+				break;
+		}
+		goto out;
+	}
+
 	do {
 		lbn = lblkno(imp, uio->uio_offset);
 		on = blkoff(imp, uio->uio_offset);
@@ -315,6 +335,8 @@ cd9660_read(v)
 		error = uiomove(bp->b_data + on, (int)n, uio);
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
+
+out:
 	return (error);
 }
 
@@ -955,7 +977,9 @@ struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = {
 	{ &vop_truncate_desc, cd9660_truncate },	/* truncate */
 	{ &vop_update_desc, cd9660_update },		/* update */
 	{ &vop_bwrite_desc, vn_bwrite },		/* bwrite */
-	{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
+	{ &vop_getpages_desc, genfs_getpages },		/* getpages */
+	{ &vop_size_desc, genfs_size },			/* size */
+	{ NULL, NULL }
 };
 struct vnodeopv_desc cd9660_vnodeop_opv_desc =
 	{ &cd9660_vnodeop_p, cd9660_vnodeop_entries };
@@ -1009,7 +1033,7 @@ struct vnodeopv_entry_desc cd9660_specop_entries[] = {
 	{ &vop_truncate_desc, spec_truncate },		/* truncate */
 	{ &vop_update_desc, cd9660_update },		/* update */
 	{ &vop_bwrite_desc, vn_bwrite },		/* bwrite */
-	{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
+	{ NULL, NULL }
 };
 struct vnodeopv_desc cd9660_specop_opv_desc =
 	{ &cd9660_specop_p, cd9660_specop_entries };
@@ -1060,7 +1084,7 @@ struct vnodeopv_entry_desc cd9660_fifoop_entries[] = {
 	{ &vop_truncate_desc, fifo_truncate },		/* truncate */
 	{ &vop_update_desc, cd9660_update },		/* update */
 	{ &vop_bwrite_desc, vn_bwrite },		/* bwrite */
-	{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
+	{ NULL, NULL }
 };
 struct vnodeopv_desc cd9660_fifoop_opv_desc =
 	{ &cd9660_fifoop_p, cd9660_fifoop_entries };
Index: exec_subr.c
@@ -1,4 +1,4 @@
-/* $NetBSD: exec_subr.c,v 1.25 2000/11/05 22:41:35 tv Exp $ */
+/* $NetBSD: exec_subr.c,v 1.26 2000/11/27 08:39:42 chs Exp $ */
 
 /*
  * Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
@@ -163,6 +163,7 @@ vmcmd_map_pagedvn(struct proc *p, struct exec_vmcmd *cmd)
 	uobj = uvn_attach((void *) cmd->ev_vp, VM_PROT_READ|VM_PROT_EXECUTE);
 	if (uobj == NULL)
 		return(ENOMEM);
+	VREF(cmd->ev_vp);
 
 	/*
 	 * do the map
Index: init_main.c
@@ -1,4 +1,4 @@
-/* $NetBSD: init_main.c,v 1.184 2000/11/21 00:37:56 jdolecek Exp $ */
+/* $NetBSD: init_main.c,v 1.185 2000/11/27 08:39:43 chs Exp $ */
 
 /*
  * Copyright (c) 1995 Christopher G. Demetriou.  All rights reserved.
@@ -323,6 +323,8 @@ main(void)
 	/* Configure the system hardware.  This will enable interrupts. */
 	configure();
 
+	ubc_init();		/* must be after autoconfig */
+
 	/* Lock the kernel on behalf of proc0. */
 	KERNEL_PROC_LOCK(p);
 
@@ -472,6 +474,10 @@ main(void)
 	if (kthread_create1(sched_sync, NULL, NULL, "ioflush"))
 		panic("fork syncer");
 
+	/* Create the aiodone daemon kernel thread. */
+	if (kthread_create1(uvm_aiodone_daemon, NULL, NULL, "aiodoned"))
+		panic("fork aiodoned");
+
 #if defined(MULTIPROCESSOR)
 	/* Boot the secondary processors. */
 	cpu_boot_secondary_processors();
Index: kern_exec.c
@@ -1,4 +1,4 @@
-/* $NetBSD: kern_exec.c,v 1.124 2000/11/21 00:37:56 jdolecek Exp $ */
+/* $NetBSD: kern_exec.c,v 1.125 2000/11/27 08:39:43 chs Exp $ */
 
 /*-
  * Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
@@ -152,6 +152,7 @@ check_exec(struct proc *p, struct exec_package *epp)
 	VOP_UNLOCK(vp, 0);
 
 	/* now we have the file, get the exec header */
+	uvn_attach(vp, VM_PROT_READ);
 	error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
			UIO_SYSSPACE, 0, p->p_ucred, &resid, p);
 	if (error)
Index: kern_physio.c
@@ -1,4 +1,4 @@
-/* $NetBSD: kern_physio.c,v 1.44 2000/09/29 13:27:12 ad Exp $ */
+/* $NetBSD: kern_physio.c,v 1.45 2000/11/27 08:39:43 chs Exp $ */
 
 /*-
  * Copyright (c) 1994 Christopher G. Demetriou
@@ -290,8 +290,7 @@ getphysbuf()
 	splx(s);
 	memset(bp, 0, sizeof(*bp));
 
-	/* XXXCDC: are the following two lines necessary? */
-	bp->b_rcred = bp->b_wcred = NOCRED;
+	/* XXXCDC: is the following line necessary? */
 	bp->b_vnbufs.le_next = NOLIST;
 
 	return(bp);
Index: vfs_bio.c
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_bio.c,v 1.72 2000/11/18 15:58:13 simonb Exp $ */
+/* $NetBSD: vfs_bio.c,v 1.73 2000/11/27 08:39:43 chs Exp $ */
 
 /*-
  * Copyright (c) 1994 Christopher G. Demetriou
@@ -59,7 +59,7 @@
 #include <sys/resourcevar.h>
 #include <sys/conf.h>
 
-#include <uvm/uvm_extern.h>
+#include <uvm/uvm.h>
 
 #include <miscfs/specfs/specdev.h>
 
@@ -72,7 +72,7 @@
  * Definitions for the buffer hash lists.
  */
 #define	BUFHASH(dvp, lbn)	\
-	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
+	(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
 u_long	bufhash;
 struct bio_ops bioops;	/* I/O operation notification */
@@ -134,7 +134,6 @@ bremfree(bp)
			panic("bremfree: lost tail");
 	}
 	TAILQ_REMOVE(dp, bp, b_freelist);
-
 	splx(s);
 }
 
@@ -166,8 +165,6 @@ bufinit()
		bp = &buf[i];
		memset((char *)bp, 0, sizeof(*bp));
		bp->b_dev = NODEV;
-		bp->b_rcred = NOCRED;
-		bp->b_wcred = NOCRED;
		bp->b_vnbufs.le_next = NOLIST;
		LIST_INIT(&bp->b_dep);
		bp->b_data = buffers + i * MAXBSIZE;
@@ -201,12 +198,8 @@ bio_doread(vp, blkno, size, cred, async)
	 * Therefore, it's valid if it's I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
-		/* Start I/O for the buffer (keeping credentials). */
+		/* Start I/O for the buffer. */
		SET(bp->b_flags, B_READ | async);
-		if (cred != NOCRED && bp->b_rcred == NOCRED) {
-			crhold(cred);
-			bp->b_rcred = cred;
-		}
		VOP_STRATEGY(bp);
 
		/* Pay for the read. */
@@ -375,7 +368,6 @@ bwrite(bp)
	bp->b_vp->v_numoutput++;
	splx(s);
 
-	SET(bp->b_flags, B_WRITEINPROG);
	VOP_STRATEGY(bp);
 
	if (sync) {
@@ -509,6 +501,8 @@ brelse(bp)
	struct bqueues *bufq;
	int s;
 
+	KASSERT(ISSET(bp->b_flags, B_BUSY));
+
	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer) {
		needbuffer = 0;
@@ -602,6 +596,7 @@ brelse(bp)
already_queued:
	/* Unlock the buffer. */
	CLR(bp->b_flags, B_AGE|B_ASYNC|B_BUSY|B_NOCACHE|B_ORDERED);
+	SET(bp->b_flags, B_CACHE);
 
	/* Allow disk interrupts. */
	splx(s);
@@ -630,7 +625,7 @@ incore(vp, blkno)
			return (bp);
	}
 
-	return (0);
+	return (NULL);
 }
 
 /*
@@ -647,56 +642,38 @@ getblk(vp, blkno, size, slpflag, slptimeo)
	daddr_t blkno;
	int size, slpflag, slptimeo;
 {
-	struct bufhashhdr *bh;
	struct buf *bp;
	int s, err;
 
-	/*
-	 * XXX
-	 * The following is an inlined version of 'incore()', but with
-	 * the 'invalid' test moved to after the 'busy' test.  It's
-	 * necessary because there are some cases in which the NFS
-	 * code sets B_INVAL prior to writing data to the server, but
-	 * in which the buffers actually contain valid data.  In this
-	 * case, we can't allow the system to allocate a new buffer for
-	 * the block until the write is finished.
-	 */
-	bh = BUFHASH(vp, blkno);
start:
-	bp = bh->lh_first;
-	for (; bp != NULL; bp = bp->b_hash.le_next) {
-		if (bp->b_lblkno != blkno || bp->b_vp != vp)
-			continue;
-
+	bp = incore(vp, blkno);
+	if (bp != NULL) {
		s = splbio();
		if (ISSET(bp->b_flags, B_BUSY)) {
+			if (curproc == uvm.pagedaemon_proc) {
+				splx(s);
+				return NULL;
+			}
			SET(bp->b_flags, B_WANTED);
			err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
-			    slptimeo);
+				     slptimeo);
			splx(s);
			if (err)
				return (NULL);
			goto start;
		}
 
-		if (!ISSET(bp->b_flags, B_INVAL)) {
#ifdef DIAGNOSTIC
-			if (ISSET(bp->b_flags, B_DONE|B_DELWRI) &&
-			    bp->b_bcount < size)
-				panic("getblk: block size invariant failed");
+		if (ISSET(bp->b_flags, B_DONE|B_DELWRI) && bp->b_bcount < size)
+			panic("getblk: block size invariant failed");
#endif
		SET(bp->b_flags, B_BUSY);
		bremfree(bp);
		splx(s);
-			break;
-		}
-		SET(bp->b_flags, B_BUSY);
-		bremfree(bp);
-		splx(s);
-	}
-
-	if (bp == NULL) {
+	} else {
		if ((bp = getnewbuf(slpflag, slptimeo)) == NULL)
			goto start;
-		binshash(bp, bh);
+
+		binshash(bp, BUFHASH(vp, blkno));
		bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
		s = splbio();
		bgetvp(vp, bp);
@@ -720,7 +697,6 @@ geteblk(size)
	SET(bp->b_flags, B_INVAL);
	binshash(bp, &invalhash);
	allocbuf(bp, size);
-
	return (bp);
 }
 
@@ -737,9 +713,9 @@ allocbuf(bp, size)
	struct buf *bp;
	int size;
 {
-	struct buf *nbp;
-	vsize_t desired_size;
-	int s;
+	struct buf *nbp;
+	vsize_t desired_size;
+	int s;
 
	desired_size = round_page((vsize_t)size);
	if (desired_size > MAXBSIZE)
@@ -759,6 +735,7 @@ allocbuf(bp, size)
	/* find a buffer */
	while ((nbp = getnewbuf(0, 0)) == NULL)
		;
+
	SET(nbp->b_flags, B_INVAL);
	binshash(nbp, &invalhash);
 
@@ -836,7 +813,7 @@ start:
		needbuffer = 1;
		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
		splx(s);
-		return (0);
+		return (NULL);
	}
 
	if (ISSET(bp->b_flags, B_VFLUSH)) {
@@ -882,18 +859,6 @@ start:
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
-	bp->b_dirtyoff = bp->b_dirtyend = 0;
-	bp->b_validoff = bp->b_validend = 0;
-
-	/* nuke any credentials we were holding */
-	if (bp->b_rcred != NOCRED) {
-		crfree(bp->b_rcred);
-		bp->b_rcred = NOCRED;
-	}
-	if (bp->b_wcred != NOCRED) {
-		crfree(bp->b_wcred);
-		bp->b_wcred = NOCRED;
-	}
 
	bremhash(bp);
	return (bp);
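Two things worth noting in the vfs_bio.c hunks above: the buffer cache no
longer tracks per-buffer credentials or dirty/valid byte ranges (the page
cache owns that information now), and getblk() gains a special case so the
pagedaemon never sleeps waiting for a busy buffer — it returns NULL and the
caller must cope, avoiding a deadlock when the pagedaemon itself is what has
to make progress to free memory. The minimal shape of that check, as added:

	if (ISSET(bp->b_flags, B_BUSY)) {
		if (curproc == uvm.pagedaemon_proc) {
			splx(s);
			return NULL;	/* never sleep in the pagedaemon */
		}
		SET(bp->b_flags, B_WANTED);
		/* ordinary callers sleep here and retry the lookup */
	}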
Index: vfs_subr.c
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_subr.c,v 1.141 2000/11/24 03:59:09 chs Exp $ */
+/* $NetBSD: vfs_subr.c,v 1.142 2000/11/27 08:39:44 chs Exp $ */
 
 /*-
  * Copyright (c) 1997, 1998 The NetBSD Foundation, Inc.
@@ -413,6 +413,8 @@ getnewvnode(tag, mp, vops, vpp)
	int (**vops) __P((void *));
	struct vnode **vpp;
 {
+	extern struct uvm_pagerops uvm_vnodeops;
+	struct uvm_object *uobj;
	struct proc *p = curproc;	/* XXX */
	struct freelst *listhd;
	static int toggle;
@@ -451,6 +453,7 @@ getnewvnode(tag, mp, vops, vpp)
	 * vnode_hold_list because we will lose the identity of all its
	 * referencing buffers.
	 */
+
	toggle ^= 1;
	if (numvnodes > 2 * desiredvnodes)
		toggle = 0;
@@ -461,7 +464,7 @@ getnewvnode(tag, mp, vops, vpp)
	    (TAILQ_FIRST(listhd = &vnode_hold_list) == NULL || toggle))) {
		simple_unlock(&vnode_free_list_slock);
		vp = pool_get(&vnode_pool, PR_WAITOK);
-		memset((char *)vp, 0, sizeof(*vp));
+		memset(vp, 0, sizeof(*vp));
		simple_lock_init(&vp->v_interlock);
		numvnodes++;
	} else {
@@ -522,6 +525,7 @@ getnewvnode(tag, mp, vops, vpp)
	vp->v_type = VNON;
	vp->v_vnlock = &vp->v_lock;
	lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
+	lockinit(&vp->v_glock, PVFS, "glock", 0, 0);
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
@@ -530,6 +534,16 @@ getnewvnode(tag, mp, vops, vpp)
	vp->v_usecount = 1;
	vp->v_data = 0;
	simple_lock_init(&vp->v_uvm.u_obj.vmobjlock);
+
+	/*
+	 * initialize uvm_object within vnode.
+	 */
+
+	uobj = &vp->v_uvm.u_obj;
+	uobj->pgops = &uvm_vnodeops;
+	TAILQ_INIT(&uobj->memq);
+	vp->v_uvm.u_size = VSIZENOTSET;
+
	if (mp && error != EDEADLK)
		vfs_unbusy(mp);
	return (0);
@@ -606,7 +620,6 @@ vwakeup(bp)
 {
	struct vnode *vp;
 
-	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp) != NULL) {
		if (--vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput, vp %p", vp);
@@ -630,9 +643,21 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	struct proc *p;
	int slpflag, slptimeo;
 {
+	struct uvm_object *uobj = &vp->v_uvm.u_obj;
	struct buf *bp, *nbp;
-	int s, error;
+	int s, error, rv;
+	int flushflags = PGO_ALLPAGES|PGO_FREE|PGO_SYNCIO|
+	    (flags & V_SAVE ? PGO_CLEANIT : 0);
 
+	/* XXXUBC this doesn't look at flags or slp* */
+	if (vp->v_type == VREG) {
+		simple_lock(&uobj->vmobjlock);
+		rv = (uobj->pgops->pgo_flush)(uobj, 0, 0, flushflags);
+		simple_unlock(&uobj->vmobjlock);
+		if (!rv) {
+			return EIO;
+		}
+	}
	if (flags & V_SAVE) {
		error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0, p);
		if (error)
@@ -714,10 +739,22 @@ vtruncbuf(vp, lbn, slpflag, slptimeo)
	daddr_t lbn;
	int slpflag, slptimeo;
 {
+	struct uvm_object *uobj = &vp->v_uvm.u_obj;
	struct buf *bp, *nbp;
-	int s, error;
+	int s, error, rv;
 
	s = splbio();
+	if (vp->v_type == VREG) {
+		simple_lock(&uobj->vmobjlock);
+		rv = (uobj->pgops->pgo_flush)(uobj,
+		    round_page(lbn << vp->v_mount->mnt_fs_bshift),
+		    vp->v_uvm.u_size, PGO_FREE);
+		simple_unlock(&uobj->vmobjlock);
+		if (!rv) {
+			splx(s);
+			return EIO;
+		}
+	}
 
restart:
	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
@@ -726,7 +763,7 @@ restart:
			continue;
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
-			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
+			error = tsleep(bp, slpflag | (PRIBIO + 1),
			    "vtruncbuf", slptimeo);
			if (error) {
				splx(s);
@@ -744,7 +781,7 @@ restart:
			continue;
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
-			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
+			error = tsleep(bp, slpflag | (PRIBIO + 1),
			    "vtruncbuf", slptimeo);
			if (error) {
				splx(s);
@@ -766,9 +803,18 @@ vflushbuf(vp, sync)
	struct vnode *vp;
	int sync;
 {
+	struct uvm_object *uobj = &vp->v_uvm.u_obj;
	struct buf *bp, *nbp;
	int s;
 
+	if (vp->v_type == VREG) {
+		int flags = PGO_CLEANIT|PGO_ALLPAGES| (sync ? PGO_SYNCIO : 0);
+
+		simple_lock(&uobj->vmobjlock);
+		(uobj->pgops->pgo_flush)(uobj, 0, 0, flags);
+		simple_unlock(&uobj->vmobjlock);
+	}
+
loop:
	s = splbio();
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
@@ -850,11 +896,14 @@ brelvp(bp)
	 */
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
-	if ((vp->v_flag & VONWORKLST) && LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
+
+	if (vp->v_type != VREG && (vp->v_flag & VONWORKLST) &&
+	    LIST_FIRST(&vp->v_dirtyblkhd) == NULL) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
-	bp->b_vp = (struct vnode *) 0;
+
+	bp->b_vp = NULL;
	HOLDRELE(vp);
	splx(s);
 }
@@ -874,11 +923,6 @@ reassignbuf(bp, newvp)
	struct buflists *listheadp;
	int delay;
 
-	if (newvp == NULL) {
-		printf("reassignbuf: NULL");
-		return;
-	}
-
	/*
	 * Delete from old vnode list, if on one.
	 */
@@ -890,7 +934,8 @@ reassignbuf(bp, newvp)
	 */
	if ((bp->b_flags & B_DELWRI) == 0) {
		listheadp = &newvp->v_cleanblkhd;
-		if ((newvp->v_flag & VONWORKLST) &&
+		if (newvp->v_type != VREG &&
+		    (newvp->v_flag & VONWORKLST) &&
		    LIST_FIRST(&newvp->v_dirtyblkhd) == NULL) {
			newvp->v_flag &= ~VONWORKLST;
			LIST_REMOVE(newvp, v_synclist);
@@ -1074,9 +1119,13 @@ vget(vp, flags)
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
+
	if ((flags & LK_INTERLOCK) == 0)
		simple_lock(&vp->v_interlock);
	if (vp->v_flag & VXLOCK) {
+		if (flags & LK_NOWAIT) {
+			return EBUSY;
+		}
		vp->v_flag |= VXWANT;
		ltsleep((caddr_t)vp, PINOD|PNORELOCK,
		    "vget", 0, &vp->v_interlock);
@@ -1167,6 +1216,7 @@ vput(vp)
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
+	vp->v_flag &= ~VTEXT;
	simple_unlock(&vp->v_interlock);
	VOP_INACTIVE(vp, p);
 }
@@ -1194,7 +1244,7 @@ vrele(vp)
#ifdef DIAGNOSTIC
	if (vp->v_usecount < 0 || vp->v_writecount != 0) {
		vprint("vrele: bad ref count", vp);
-		panic("vrele: ref cnt");
+		panic("vrele: ref cnt vp %p", vp);
	}
#endif
	/*
@@ -1206,6 +1256,7 @@ vrele(vp)
	else
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	simple_unlock(&vnode_free_list_slock);
+	vp->v_flag &= ~VTEXT;
	if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0)
		VOP_INACTIVE(vp, p);
 }
@@ -1256,6 +1307,7 @@ holdrele(vp)
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt vp %p", vp);
	vp->v_holdcnt--;
+
	/*
	 * If it is on the holdlist and the hold count drops to
	 * zero, move it to the free list. The test of the back
@@ -1269,6 +1321,7 @@ holdrele(vp)
	 * getnewvnode after removing it from a freelist to ensure
	 * that we do not try to move it here.
	 */
+
	if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
	    vp->v_holdcnt == 0 && vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
@@ -1427,6 +1480,8 @@ vclean(vp, flags, p)
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock, vp %p", vp);
	vp->v_flag |= VXLOCK;
+	vp->v_flag &= ~VTEXT;
+
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
@@ -1437,11 +1492,7 @@ vclean(vp, flags, p)
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK);
 
	/*
-	 * clean out any VM data associated with the vnode.
-	 */
-	uvm_vnp_terminate(vp);
-	/*
-	 * Clean out any buffers associated with the vnode.
+	 * Clean out any cached data associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
@@ -1467,7 +1518,6 @@ vclean(vp, flags, p)
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim, vp %p", vp);
-
	if (active) {
		/*
		 * Inline copy of vrele() since VOP_INACTIVE
@@ -1484,6 +1534,7 @@ vclean(vp, flags, p)
		/*
		 * Insert at tail of LRU list.
		 */
+
		simple_unlock(&vp->v_interlock);
		simple_lock(&vnode_free_list_slock);
#ifdef DIAGNOSTIC
@@ -1740,7 +1791,7 @@ vprint(label, vp)
 
	if (label != NULL)
		printf("%s: ", label);
-	printf("tag %d type %s, usecount %ld, writecount %ld, refcount %ld,",
+	printf("tag %d type %s, usecount %d, writecount %ld, refcount %ld,",
	    vp->v_tag, typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
@@ -2365,7 +2416,7 @@ vfs_shutdown()
	/* avoid coming back this way again if we panic. */
	doing_shutdown = 1;
 
-	sys_sync(p, (void *)0, (register_t *)0);
+	sys_sync(p, NULL, NULL);
 
	/* Wait for sync to finish. */
	dcount = 10000;
@@ -2608,10 +2659,10 @@ vfs_detach(vfs)
 
#ifdef DDB
 const char buf_flagbits[] =
-	"\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6CACHE\7CALL\10DELWRI"
+	"\20\1AGE\2NEEDCOMMIT\3ASYNC\4BAD\5BUSY\6SCANNED\7CALL\10DELWRI"
	"\11DIRTY\12DONE\13EINTR\14ERROR\15GATHERED\16INVAL\17LOCKED\20NOCACHE"
-	"\21PAGET\22PGIN\23PHYS\24RAW\25READ\26TAPE\27UAREA\30WANTED"
-	"\31WRITEINPROG\32XXX\33VFLUSH";
+	"\21ORDERED\22CACHE\23PHYS\24RAW\25READ\26TAPE\30WANTED"
+	"\32XXX\33VFLUSH";
 
 void
 vfs_buf_print(bp, full, pr)
@@ -2629,15 +2680,9 @@ vfs_buf_print(bp, full, pr)
 
	(*pr)(" bufsize 0x%x bcount 0x%x resid 0x%x\n",
		  bp->b_bufsize, bp->b_bcount, bp->b_resid);
-	(*pr)(" data %p saveaddr %p\n",
-		  bp->b_data, bp->b_saveaddr);
+	(*pr)(" data %p saveaddr %p dep %p\n",
+		  bp->b_data, bp->b_saveaddr, LIST_FIRST(&bp->b_dep));
	(*pr)(" iodone %p\n", bp->b_iodone);
-
-	(*pr)(" dirtyoff 0x%x dirtyend 0x%x validoff 0x%x validend 0x%x\n",
-		  bp->b_dirtyoff, bp->b_dirtyend,
-		  bp->b_validoff, bp->b_validend);
-
-	(*pr)(" rcred %p wcred %p\n", bp->b_rcred, bp->b_wcred);
 }
 
@@ -2689,16 +2734,17 @@ vfs_vnode_print(vp, full, pr)
	int full;
	void (*pr) __P((const char *, ...));
 {
-	char buf[1024];
+	char buf[256];
 
	const char *vtype, *vtag;
 
	uvm_object_printit(&vp->v_uvm.u_obj, full, pr);
	bitmask_snprintf(vp->v_flag, vnode_flagbits, buf, sizeof(buf));
	(*pr)("\nVNODE flags %s\n", buf);
-	(*pr)("nio %d size 0x%x wlist %s\n",
-	      vp->v_uvm.u_nio, vp->v_uvm.u_size,
-	      vp->v_uvm.u_wlist.le_next ? "YES" : "NO");
+	(*pr)("mp %p nio %d size 0x%x rwlock 0x%x glock 0x%x\n",
+	      vp->v_mount, vp->v_uvm.u_nio, (int)vp->v_uvm.u_size,
+	      vp->v_vnlock ? lockstatus(vp->v_vnlock) : 0x999,
+	      lockstatus(&vp->v_glock));
 
	(*pr)("data %p usecount %d writecount %d holdcnt %d numoutput %d\n",
	      vp->v_data, vp->v_usecount, vp->v_writecount,
@@ -2723,16 +2769,14 @@ vfs_vnode_print(vp, full, pr)
		struct buf *bp;
 
		(*pr)("clean bufs:\n");
-		for (bp = LIST_FIRST(&vp->v_cleanblkhd);
-		     bp != NULL;
-		     bp = LIST_NEXT(bp, b_vnbufs)) {
+		LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
 
		(*pr)("dirty bufs:\n");
-		for (bp = LIST_FIRST(&vp->v_dirtyblkhd);
-		     bp != NULL;
-		     bp = LIST_NEXT(bp, b_vnbufs)) {
+		LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
			(*pr)(" bp %p\n", bp);
			vfs_buf_print(bp, full, pr);
		}
	}
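vinvalbuf(), vtruncbuf() and vflushbuf() above now flush the vnode's page
cache through the pager's pgo_flush entry point before walking the buffer
lists. The calling convention, as used in those hunks (the object lock must
be held around the call; the return value is a success boolean, not an errno):

	struct uvm_object *uobj = &vp->v_uvm.u_obj;
	int rv;

	simple_lock(&uobj->vmobjlock);
	/* clean and/or free the pages covering the given byte range */
	rv = (uobj->pgops->pgo_flush)(uobj, start, end,
	    PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO | PGO_CLEANIT);
	simple_unlock(&uobj->vmobjlock);
	if (!rv)
		return EIO;	/* some page could not be flushed */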
Index: vfs_syscalls.c
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_syscalls.c,v 1.163 2000/09/28 06:43:20 enami Exp $ */
+/* $NetBSD: vfs_syscalls.c,v 1.164 2000/11/27 08:39:44 chs Exp $ */
 
 /*
  * Copyright (c) 1989, 1993
@@ -571,7 +571,6 @@ sys_sync(p, v, retval)
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			asyncflag = mp->mnt_flag & MNT_ASYNC;
			mp->mnt_flag &= ~MNT_ASYNC;
-			uvm_vnp_sync(mp);
			VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p);
			if (asyncflag)
				 mp->mnt_flag |= MNT_ASYNC;
@@ -1181,6 +1180,11 @@ sys_fhopen(p, v, retval)
	}
	if ((error = VOP_OPEN(vp, flags, cred, p)) != 0)
		goto bad;
+	if (vp->v_type == VREG &&
+	    uvn_attach(vp, flags & FWRITE ? VM_PROT_WRITE : 0) == NULL) {
+		error = EIO;
+		goto bad;
+	}
	if (flags & FWRITE)
		vp->v_writecount++;
 
@@ -1583,8 +1587,6 @@ sys_unlink(p, v, retval)
		goto out;
	}
 
-	(void)uvm_vnp_uncache(vp);
-
	VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
	VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
	error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
@@ -2852,7 +2854,6 @@ out:
	if (fromnd.ni_dvp != tdvp)
		VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
	if (tvp) {
-		(void)uvm_vnp_uncache(tvp);
		VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE);
	}
	error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
Index: vfs_vnops.c
@@ -1,4 +1,4 @@
-/* $NetBSD: vfs_vnops.c,v 1.44 2000/08/12 16:43:00 sommerfeld Exp $ */
+/* $NetBSD: vfs_vnops.c,v 1.45 2000/11/27 08:39:44 chs Exp $ */
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -156,8 +156,14 @@ vn_open(ndp, fmode, cmode)
	}
	if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0)
		goto bad;
+	if (vp->v_type == VREG &&
+	    uvn_attach(vp, fmode & FWRITE ? VM_PROT_WRITE : 0) == NULL) {
+		error = EIO;
+		goto bad;
+	}
	if (fmode & FWRITE)
		vp->v_writecount++;
+
	return (0);
bad:
	vput(vp);
@@ -174,11 +180,10 @@ vn_writechk(vp)
 {
 
	/*
-	 * If there's shared text associated with
-	 * the vnode, try to free it up once.  If
-	 * we fail, we can't allow writing.
+	 * If the vnode is in use as a process's text,
+	 * we can't allow writing.
	 */
-	if ((vp->v_flag & VTEXT) && !uvm_vnp_uncache(vp))
+	if (vp->v_flag & VTEXT)
		return (ETXTBSY);
	return (0);
 }
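With uvm_vnp_uncache() gone, every open of a regular file instead attaches the
vnode's uvm_object up front — vn_open() above and sys_fhopen() in the
vfs_syscalls.c hunk do it identically:

	if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0)
		goto bad;
	/* make the page cache the cache for this file's data */
	if (vp->v_type == VREG &&
	    uvn_attach(vp, fmode & FWRITE ? VM_PROT_WRITE : 0) == NULL) {
		error = EIO;
		goto bad;
	}

A NULL return from uvn_attach() fails the open with EIO.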
Index: vnode_if.src
@@ -1,4 +1,4 @@
-#	$NetBSD: vnode_if.src,v 1.25 2000/09/19 21:57:14 fvdl Exp $
+#	$NetBSD: vnode_if.src,v 1.26 2000/11/27 08:39:45 chs Exp $
 #
 # Copyright (c) 1992, 1993
 #	The Regents of the University of California.  All rights reserved.
@@ -502,6 +502,17 @@ vop_balloc {
	OUT struct buf **bpp;
 };
 
+#
+#% ballocn	vp	L L L
+#
+vop_ballocn {
+	IN struct vnode *vp;
+	IN off_t offset;
+	IN off_t length;
+	IN struct ucred *cred;
+	IN int flags;
+};
+
 #
 #% reallocblks	vp	L L L
 #
@@ -569,3 +580,37 @@ vop_whiteout {
 #vop_bwrite {
 #	IN struct buf *bp;
 #};
+
+#
+#% getpages	vp	L L L
+#
+vop_getpages {
+	IN struct vnode *vp;
+	IN voff_t offset;
+	IN vm_page_t *m;
+	IN int *count;
+	IN int centeridx;
+	IN vm_prot_t access_type;
+	IN int advice;
+	IN int flags;
+};
+
+#
+#% putpages	vp	L L L
+#
+vop_putpages {
+	IN struct vnode *vp;
+	IN vm_page_t *m;
+	IN int count;
+	IN int flags;
+	IN int *rtvals;
+};
+
+#
+#% size		vp	= = =
+#
+vop_size {
+	IN struct vnode *vp;
+	IN off_t size;
+	OUT off_t *eobp;
+};
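Entries in vnode_if.src are not C; a generator script turns each one into an
args structure and a VOP_*() wrapper that callers invoke. For the vop_getpages
entry just added, the generated structure should come out like the one
genfs_getpages() unpacks later in this commit (the a_desc member is the
standard generated header and is assumed here, not shown in the diff):

	struct vop_getpages_args {
		struct vnodeop_desc *a_desc;	/* assumed: standard first member */
		struct vnode *a_vp;
		voff_t a_offset;
		vm_page_t *a_m;
		int *a_count;
		int a_centeridx;
		vm_prot_t a_access_type;
		int a_advice;
		int a_flags;
	};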
Index: genfs.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: genfs.h,v 1.10 1999/08/03 20:19:19 wrstuden Exp $	*/
+/*	$NetBSD: genfs.h,v 1.11 2000/11/27 08:39:45 chs Exp $	*/
 
 int	genfs_badop	__P((void *));
 int	genfs_nullop	__P((void *));
@@ -22,3 +22,6 @@ int genfs_lease_check __P((void *));
 int	genfs_lock	__P((void *));
 int	genfs_islocked	__P((void *));
 int	genfs_unlock	__P((void *));
+int	genfs_getpages	__P((void *));
+int	genfs_putpages	__P((void *));
+int	genfs_size	__P((void *));
@ -1,4 +1,4 @@
|
||||
/* $NetBSD: genfs_vnops.c,v 1.20 2000/09/19 22:01:59 fvdl Exp $ */
|
||||
/* $NetBSD: genfs_vnops.c,v 1.21 2000/11/27 08:39:45 chs Exp $ */
|
||||
|
||||
/*
|
||||
* Copyright (c) 1982, 1986, 1989, 1993
|
||||
@ -50,6 +50,9 @@
|
||||
#include <miscfs/genfs/genfs.h>
|
||||
#include <miscfs/specfs/specdev.h>
|
||||
|
||||
#include <uvm/uvm.h>
|
||||
#include <uvm/uvm_pager.h>
|
||||
|
||||
#ifdef NFSSERVER
|
||||
#include <nfs/rpcv2.h>
|
||||
#include <nfs/nfsproto.h>
|
||||
@ -414,3 +417,659 @@ genfs_lease_check(v)
|
||||
return (0);
|
||||
#endif /* NFSSERVER */
|
||||
}
|
||||
|
||||
/*
|
||||
* generic VM getpages routine.
|
||||
* Return PG_BUSY pages for the given range,
|
||||
* reading from backing store if necessary.
|
||||
*/
|
||||
|
||||
int
|
||||
genfs_getpages(v)
|
||||
void *v;
|
||||
{
|
||||
struct vop_getpages_args /* {
|
||||
struct vnode *a_vp;
|
||||
voff_t a_offset;
|
||||
vm_page_t *a_m;
|
||||
int *a_count;
|
||||
int a_centeridx;
|
||||
vm_prot_t a_access_type;
|
||||
int a_advice;
|
||||
int a_flags;
|
||||
} */ *ap = v;
|
||||
|
||||
off_t eof, offset, origoffset, startoffset, endoffset, raoffset;
|
||||
daddr_t lbn, blkno;
|
||||
int s, i, error, npages, orignpages, npgs, run, ridx, pidx, pcount;
|
||||
int fs_bshift, fs_bsize, dev_bshift, dev_bsize;
|
||||
int flags = ap->a_flags;
|
||||
size_t bytes, iobytes, tailbytes, totalbytes, skipbytes;
|
||||
vaddr_t kva;
|
||||
struct buf *bp, *mbp;
|
||||
struct vnode *vp = ap->a_vp;
|
||||
struct uvm_object *uobj = &vp->v_uvm.u_obj;
|
||||
struct vm_page *pgs[16]; /* XXXUBC 16 */
|
||||
struct ucred *cred = curproc->p_ucred; /* XXXUBC curproc */
|
||||
boolean_t async = (flags & PGO_SYNCIO) == 0;
|
||||
boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0;
|
||||
boolean_t sawhole = FALSE;
|
||||
UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
|
||||
|
||||
/* XXXUBC temp limit */
|
||||
if (*ap->a_count > 16) {
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
error = VOP_SIZE(vp, vp->v_uvm.u_size, &eof);
|
||||
if (error) {
|
||||
return error;
|
||||
}
|
||||
|
||||
#ifdef DIAGNOSTIC
|
||||
if (ap->a_centeridx < 0 || ap->a_centeridx > *ap->a_count) {
|
||||
panic("genfs_getpages: centeridx %d out of range",
|
||||
ap->a_centeridx);
|
||||
}
|
||||
if (ap->a_offset & (PAGE_SIZE - 1) || ap->a_offset < 0) {
|
||||
panic("genfs_getpages: offset 0x%x", (int)ap->a_offset);
|
||||
}
|
||||
if (*ap->a_count < 0) {
|
||||
panic("genfs_getpages: count %d < 0", *ap->a_count);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Bounds-check the request.
|
||||
*/
|
||||
|
||||
error = 0;
|
||||
origoffset = ap->a_offset;
|
||||
|
||||
if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= eof &&
|
||||
(flags & PGO_PASTEOF) == 0) {
|
||||
if ((flags & PGO_LOCKED) == 0) {
|
||||
simple_unlock(&uobj->vmobjlock);
|
||||
}
|
||||
UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x",
|
||||
origoffset, *ap->a_count, eof,0);
|
||||
return EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* For PGO_LOCKED requests, just return whatever's in memory.
|
||||
*/
|
||||
|
||||
if (flags & PGO_LOCKED) {
|
||||
uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m,
|
||||
UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY);
|
||||
|
||||
return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
|
||||
}
|
||||
|
||||
/* vnode is VOP_LOCKed, uobj is locked */
|
||||
|
||||
if (write && (vp->v_flag & VONWORKLST) == 0) {
|
||||
vn_syncer_add_to_worklist(vp, filedelay);
|
||||
}
|
||||
|
||||
/*
|
||||
* find the requested pages and make some simple checks.
|
||||
* leave space in the page array for a whole block.
|
||||
*/
|
||||
|
||||
fs_bshift = vp->v_mount->mnt_fs_bshift;
|
||||
fs_bsize = 1 << fs_bshift;
|
||||
dev_bshift = vp->v_mount->mnt_dev_bshift;
|
||||
dev_bsize = 1 << dev_bshift;
|
||||
KASSERT((eof & (dev_bsize - 1)) == 0);
|
||||
|
||||
orignpages = min(*ap->a_count,
|
||||
round_page(eof - origoffset) >> PAGE_SHIFT);
|
||||
if (flags & PGO_PASTEOF) {
|
||||
orignpages = *ap->a_count;
|
||||
}
|
||||
npages = orignpages;
|
||||
startoffset = origoffset & ~(fs_bsize - 1);
|
||||
endoffset = round_page((origoffset + (npages << PAGE_SHIFT)
|
||||
+ fs_bsize - 1) & ~(fs_bsize - 1));
|
||||
endoffset = min(endoffset, round_page(eof));
|
||||
ridx = (origoffset - startoffset) >> PAGE_SHIFT;
|
||||
|
||||
memset(pgs, 0, sizeof(pgs));
|
||||
uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL);
|
||||
|
||||
/*
|
||||
* if PGO_OVERWRITE is set, don't bother reading the pages.
|
||||
* PGO_OVERWRITE also means that the caller guarantees
|
||||
* that the pages already have backing store allocated.
|
||||
*/
|
||||
|
||||
if (flags & PGO_OVERWRITE) {
|
||||
UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
|
||||
|
||||
for (i = 0; i < npages; i++) {
|
||||
struct vm_page *pg = pgs[ridx + i];
|
||||
|
||||
if (pg->flags & PG_FAKE) {
|
||||
uvm_pagezero(pg);
|
||||
pg->flags &= ~(PG_FAKE);
|
||||
}
|
||||
pg->flags &= ~(PG_RDONLY);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* if the pages are already resident, just return them.
|
||||
*/
|
||||
|
||||
for (i = 0; i < npages; i++) {
|
||||
struct vm_page *pg = pgs[ridx + i];
|
||||
|
||||
if ((pg->flags & PG_FAKE) ||
|
||||
(write && (pg->flags & PG_RDONLY))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i == npages) {
|
||||
UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
|
||||
raoffset = origoffset + (orignpages << PAGE_SHIFT);
|
||||
goto raout;
|
||||
}
|
||||
|
||||
/*
|
||||
* the page wasn't resident and we're not overwriting,
|
||||
* so we're going to have to do some i/o.
|
||||
* find any additional pages needed to cover the expanded range.
|
||||
*/
|
||||
|
||||
if (startoffset != origoffset) {
|
||||
|
||||
/*
|
||||
* XXXUBC we need to avoid deadlocks caused by locking
|
||||
* additional pages at lower offsets than pages we
|
||||
* already have locked. for now, unlock them all and
|
||||
* start over.
|
||||
*/
|
||||
|
||||
for (i = 0; i < npages; i++) {
|
||||
struct vm_page *pg = pgs[ridx + i];
|
||||
|
||||
if (pg->flags & PG_FAKE) {
|
||||
pg->flags |= PG_RELEASED;
|
||||
}
|
||||
}
|
||||
uvm_page_unbusy(&pgs[ridx], npages);
|
||||
memset(pgs, 0, sizeof(pgs));
|
||||
|
||||
UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x",
|
||||
startoffset, endoffset, 0,0);
|
||||
npages = (endoffset - startoffset) >> PAGE_SHIFT;
|
||||
npgs = npages;
|
||||
uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL);
|
||||
}
|
||||
simple_unlock(&uobj->vmobjlock);
|
||||
|
||||
/*
|
||||
* read the desired page(s).
|
||||
*/
|
||||
|
||||
totalbytes = npages << PAGE_SHIFT;
|
||||
bytes = min(totalbytes, eof - startoffset);
|
||||
tailbytes = totalbytes - bytes;
|
||||
skipbytes = 0;
|
||||
|
||||
kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK |
|
||||
UVMPAGER_MAPIN_READ);
|
||||
|
||||
s = splbio();
|
||||
mbp = pool_get(&bufpool, PR_WAITOK);
|
||||
splx(s);
|
||||
mbp->b_bufsize = totalbytes;
|
||||
mbp->b_data = (void *)kva;
|
||||
mbp->b_resid = mbp->b_bcount = bytes;
|
||||
mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL : 0);
|
||||
mbp->b_iodone = uvm_aio_biodone;
|
||||
mbp->b_vp = vp;
|
||||
LIST_INIT(&mbp->b_dep);
|
||||
|
||||
/*
|
||||
* if EOF is in the middle of the last page, zero the part past EOF.
|
||||
*/
|
||||
|
||||
if (tailbytes > 0) {
|
||||
memset((void *)(kva + bytes), 0, tailbytes);
|
||||
}
|
||||
|
||||
/*
|
||||
* now loop over the pages, reading as needed.
|
||||
*/
|
||||
|
||||
if (write) {
|
||||
lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL);
|
||||
} else {
|
||||
lockmgr(&vp->v_glock, LK_SHARED, NULL);
|
||||
}
|
||||
|
||||
bp = NULL;
|
||||
for (offset = startoffset;
|
||||
bytes > 0;
|
||||
offset += iobytes, bytes -= iobytes) {
|
||||
|
||||
/*
|
||||
* skip pages which don't need to be read.
|
||||
*/
|
||||
|
||||
pidx = (offset - startoffset) >> PAGE_SHIFT;
|
||||
while ((pgs[pidx]->flags & PG_FAKE) == 0) {
|
||||
size_t b;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (offset & (PAGE_SIZE - 1)) {
|
||||
panic("genfs_getpages: skipping from middle "
|
||||
"of page");
|
||||
}
|
||||
#endif
|
||||
|
||||
b = min(PAGE_SIZE, bytes);
|
||||
offset += b;
|
||||
bytes -= b;
|
||||
skipbytes += b;
|
||||
pidx++;
|
||||
UVMHIST_LOG(ubchist, "skipping, new offset 0x%x",
|
||||
offset, 0,0,0);
|
||||
if (bytes == 0) {
|
||||
goto loopdone;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* bmap the file to find out the blkno to read from and
|
||||
* how much we can read in one i/o. if bmap returns an error,
|
||||
* skip the rest of the top-level i/o.
|
||||
*/
|
||||
|
||||
lbn = offset >> fs_bshift;
|
||||
error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
|
||||
if (error) {
|
||||
UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n",
|
||||
lbn, error,0,0);
|
||||
skipbytes += bytes;
|
||||
goto loopdone;
|
||||
}
|
||||
|
||||
/*
|
||||
* see how many pages can be read with this i/o.
|
||||
* reduce the i/o size if necessary to avoid
|
||||
* overwriting pages with valid data.
|
||||
*/
|
||||
|
||||
iobytes = min(((lbn + 1 + run) << fs_bshift) - offset, bytes);
|
||||
if (offset + iobytes > round_page(offset)) {
|
||||
pcount = 1;
|
||||
while (pidx + pcount < npages &&
|
||||
pgs[pidx + pcount]->flags & PG_FAKE) {
|
||||
pcount++;
|
||||
}
|
||||
iobytes = min(iobytes, (pcount << PAGE_SHIFT) -
|
||||
(offset - trunc_page(offset)));
|
||||
}
|
||||
|
||||
/*
|
||||
* if this block isn't allocated, zero it instead of reading it.
|
||||
* if this is a read access, mark the pages we zeroed PG_RDONLY.
|
||||
*/
|
||||
|
||||
if (blkno < 0) {
|
||||
UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE", lbn,0,0,0);
|
||||
|
||||
sawhole = TRUE;
|
||||
memset((char *)kva + (offset - startoffset), 0,
|
||||
iobytes);
|
||||
skipbytes += iobytes;
|
||||
|
||||
if (!write) {
|
||||
int holepages =
|
||||
(round_page(offset + iobytes) -
|
||||
trunc_page(offset)) >> PAGE_SHIFT;
|
||||
for (i = 0; i < holepages; i++) {
|
||||
pgs[pidx + i]->flags |= PG_RDONLY;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* allocate a sub-buf for this piece of the i/o
|
||||
* (or just use mbp if there's only 1 piece),
|
||||
* and start it going.
|
||||
*/
|
||||
|
||||
if (offset == startoffset && iobytes == bytes) {
|
||||
bp = mbp;
|
||||
} else {
|
||||
s = splbio();
|
||||
bp = pool_get(&bufpool, PR_WAITOK);
|
||||
splx(s);
|
||||
bp->b_data = (char *)kva + offset - startoffset;
|
||||
bp->b_resid = bp->b_bcount = iobytes;
|
||||
bp->b_flags = B_BUSY|B_READ|B_CALL;
|
||||
bp->b_iodone = uvm_aio_biodone1;
|
||||
bp->b_vp = vp;
|
||||
LIST_INIT(&bp->b_dep);
|
||||
}
|
||||
bp->b_lblkno = 0;
|
||||
bp->b_private = mbp;
|
||||
|
||||
/* adjust physical blkno for partial blocks */
|
||||
bp->b_blkno = blkno + ((offset - (lbn << fs_bshift)) >>
|
||||
dev_bshift);
|
||||
|
||||
UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x",
|
||||
bp, offset, iobytes, bp->b_blkno);
|
||||
|
||||
VOP_STRATEGY(bp);
|
||||
}
|
||||
|
||||
loopdone:
|
||||
if (skipbytes) {
|
||||
s = splbio();
|
||||
if (error) {
|
||||
mbp->b_flags |= B_ERROR;
|
||||
mbp->b_error = error;
|
||||
}
|
||||
mbp->b_resid -= skipbytes;
|
||||
if (mbp->b_resid == 0) {
|
||||
biodone(mbp);
|
||||
}
|
||||
splx(s);
|
||||
}
|
||||
|
||||
if (async) {
|
||||
UVMHIST_LOG(ubchist, "returning PEND",0,0,0,0);
|
||||
lockmgr(&vp->v_glock, LK_RELEASE, NULL);
|
||||
return EINPROGRESS;
|
||||
}
|
||||
if (bp != NULL) {
|
||||
error = biowait(mbp);
|
||||
}
|
||||
s = splbio();
|
||||
pool_put(&bufpool, mbp);
|
||||
splx(s);
|
||||
uvm_pagermapout(kva, npages);
|
||||
raoffset = offset;
|
||||
|
||||
/*
|
||||
* if this we encountered a hole then we have to do a little more work.
|
||||
* for read faults, we marked the page PG_RDONLY so that future
|
||||
* write accesses to the page will fault again.
|
||||
* for write faults, we must make sure that the backing store for
|
||||
* the page is completely allocated while the pages are locked.
|
||||
*/
|
||||
|
||||
if (error == 0 && sawhole && write) {
|
||||
error = VOP_BALLOCN(vp, startoffset, npages << PAGE_SHIFT,
|
||||
cred, 0);
|
||||
if (error) {
|
||||
UVMHIST_LOG(ubchist, "balloc lbn 0x%x -> %d",
|
||||
lbn, error,0,0);
|
||||
lockmgr(&vp->v_glock, LK_RELEASE, NULL);
|
||||
simple_lock(&uobj->vmobjlock);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
lockmgr(&vp->v_glock, LK_RELEASE, NULL);
|
||||
simple_lock(&uobj->vmobjlock);
|
||||
|
||||
/*
|
||||
* see if we want to start any readahead.
|
||||
* XXXUBC for now, just read the next 128k on 64k boundaries.
|
||||
* this is pretty nonsensical, but it is 50% faster than reading
|
||||
* just the next 64k.
|
||||
*/
|
||||
|
||||
raout:
|
||||
if (!async && !write && ((int)raoffset & 0xffff) == 0 &&
|
||||
PAGE_SHIFT <= 16) {
|
||||
int racount;
|
||||
|
||||
racount = 1 << (16 - PAGE_SHIFT);
|
||||
(void) VOP_GETPAGES(vp, raoffset, NULL, &racount, 0,
|
||||
VM_PROT_READ, 0, 0);
|
||||
simple_lock(&uobj->vmobjlock);
|
||||
|
||||
racount = 1 << (16 - PAGE_SHIFT);
|
||||
(void) VOP_GETPAGES(vp, raoffset + 0x10000, NULL, &racount, 0,
|
||||
VM_PROT_READ, 0, 0);
|
||||
simple_lock(&uobj->vmobjlock);
|
||||
}
|
||||
|
||||
/*
|
||||
* we're almost done! release the pages...
|
||||
* for errors, we free the pages.
|
||||
* otherwise we activate them and mark them as valid and clean.
|
||||
* also, unbusy pages that were not actually requested.
|
||||
*/
|
||||
|
||||
out:
|
||||
if (error) {
|
||||
uvm_lock_pageq();
|
||||
for (i = 0; i < npages; i++) {
|
||||
if (pgs[i] == NULL) {
|
||||
continue;
|
||||
}
|
||||
UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
|
||||
pgs[i], pgs[i]->flags, 0,0);
|
||||
if ((pgs[i]->flags & PG_FAKE) == 0) {
|
||||
continue;
|
||||
}
|
||||
if (pgs[i]->flags & PG_WANTED) {
|
||||
wakeup(pgs[i]);
|
||||
}
|
||||
uvm_pagefree(pgs[i]);
|
||||
}
|
||||
uvm_unlock_pageq();
simple_unlock(&uobj->vmobjlock);
UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0);
return error;
}

UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0);
for (i = 0; i < npages; i++) {
if (pgs[i] == NULL) {
continue;
}
UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x",
pgs[i], pgs[i]->flags, 0,0);
if (pgs[i]->flags & PG_FAKE) {
UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x",
pgs[i], pgs[i]->offset,0,0);
pgs[i]->flags &= ~(PG_FAKE);
pmap_clear_modify(pgs[i]);
pmap_clear_reference(pgs[i]);
}
if (write) {
pgs[i]->flags &= ~(PG_RDONLY);
}
if (i < ridx || i >= ridx + orignpages || async) {
UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x",
pgs[i], pgs[i]->offset,0,0);
if (pgs[i]->flags & PG_WANTED) {
wakeup(pgs[i]);
}
if (pgs[i]->wire_count == 0) {
uvm_pageactivate(pgs[i]);
}
pgs[i]->flags &= ~(PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(pgs[i], NULL);
}
}
simple_unlock(&uobj->vmobjlock);
if (ap->a_m != NULL) {
memcpy(ap->a_m, &pgs[ridx],
orignpages * sizeof(struct vm_page *));
}
return 0;
}

/*
* generic VM putpages routine.
* Write the given range of pages to backing store.
*/

int
genfs_putpages(v)
void *v;
{
struct vop_putpages_args /* {
struct vnode *a_vp;
struct vm_page **a_m;
int a_count;
int a_flags;
int *a_rtvals;
} */ *ap = v;

int s, error, error2, npages, run;
int fs_bshift, dev_bshift, dev_bsize;
vaddr_t kva;
off_t eof, offset, startoffset;
size_t bytes, iobytes, skipbytes;
daddr_t lbn, blkno;
struct vm_page *pg;
struct buf *mbp, *bp;
struct vnode *vp = ap->a_vp;
boolean_t async = (ap->a_flags & PGO_SYNCIO) == 0;
UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);

simple_unlock(&vp->v_uvm.u_obj.vmobjlock);

error = VOP_SIZE(vp, vp->v_uvm.u_size, &eof);
if (error) {
return error;
}

error = error2 = 0;
npages = ap->a_count;
fs_bshift = vp->v_mount->mnt_fs_bshift;
dev_bshift = vp->v_mount->mnt_dev_bshift;
dev_bsize = 1 << dev_bshift;
KASSERT((eof & (dev_bsize - 1)) == 0);

pg = ap->a_m[0];
startoffset = pg->offset;
bytes = min(npages << PAGE_SHIFT, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);

kva = uvm_pagermapin(ap->a_m, npages, UVMPAGER_MAPIN_WAITOK);

s = splbio();
vp->v_numoutput += 2;
mbp = pool_get(&bufpool, PR_WAITOK);
UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
vp, mbp, vp->v_numoutput, bytes);
splx(s);
mbp->b_bufsize = npages << PAGE_SHIFT;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_flags = B_BUSY|B_WRITE|B_AGE |
(async ? B_CALL : 0) |
(curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0);
mbp->b_iodone = uvm_aio_biodone;
mbp->b_vp = vp;
LIST_INIT(&mbp->b_dep);

bp = NULL;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
lbn = offset >> fs_bshift;
error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
if (error) {
UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0);
skipbytes += bytes;
bytes = 0;
break;
}

iobytes = min(((lbn + 1 + run) << fs_bshift) - offset, bytes);
if (blkno == (daddr_t)-1) {
skipbytes += iobytes;
continue;
}

/* if it's really one i/o, don't make a second buf */
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
} else {
s = splbio();
vp->v_numoutput++;
bp = pool_get(&bufpool, PR_WAITOK);
UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
vp, bp, vp->v_numoutput, 0);
splx(s);
bp->b_data = (char *)kva +
(vaddr_t)(offset - pg->offset);
bp->b_resid = bp->b_bcount = iobytes;
bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC;
bp->b_iodone = uvm_aio_biodone1;
bp->b_vp = vp;
LIST_INIT(&bp->b_dep);
}
bp->b_lblkno = 0;
bp->b_private = mbp;

/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - (lbn << fs_bshift)) >>
dev_bshift);
UVMHIST_LOG(ubchist, "vp %p offset 0x%x bcount 0x%x blkno 0x%x",
vp, offset, bp->b_bcount, bp->b_blkno);
VOP_STRATEGY(bp);
}
if (skipbytes) {
UVMHIST_LOG(ubchist, "skipbytes %d", bytes, 0,0,0);
|
||||
s = splbio();
mbp->b_resid -= skipbytes;
if (mbp->b_resid == 0) {
biodone(mbp);
}
splx(s);
}
if (async) {
UVMHIST_LOG(ubchist, "returning PEND", 0,0,0,0);
return EINPROGRESS;
}
if (bp != NULL) {
UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0);
error2 = biowait(mbp);
}
{
/* XXXUBC */
void softdep_pageiodone(struct buf *);
softdep_pageiodone(mbp);
}
s = splbio();
vwakeup(mbp);
pool_put(&bufpool, mbp);
splx(s);
uvm_pagermapout(kva, npages);
UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0);
return error ? error : error2;
}

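The loop above carves one large contiguous write into per-extent child buffers: VOP_BMAP returns the device block for a file block plus the number of following file blocks that are contiguous on disk (run), each transfer is clipped to the end of that run, and a transfer that starts mid-block gets its device sector adjusted with dev_bshift. A minimal standalone sketch of that arithmetic, with hypothetical values for the shifts and the VOP_BMAP results:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
        int fs_bshift = 13;             /* 8 KB filesystem blocks */
        int dev_bshift = 9;             /* 512-byte device sectors */
        long long offset = 0x5000;      /* current byte offset in the file */
        long long bytes = 0x6000;       /* bytes left in this putpages call */
        long long lbn = offset >> fs_bshift;    /* file block holding offset */
        long long blkno = 1000;         /* from VOP_BMAP, in device sectors */
        int run = 0;                    /* contiguous fs blocks after lbn */
        long long iobytes, sector;

        /* clip the transfer to the end of the contiguous run */
        iobytes = MIN((((lbn + 1 + run) << fs_bshift) - offset), bytes);

        /* adjust the sector number when the transfer starts mid-block */
        sector = blkno + ((offset - (lbn << fs_bshift)) >> dev_bshift);

        printf("iobytes 0x%llx sector %lld\n", iobytes, sector);
        return 0;
}

With these example values the transfer is clipped to 0x1000 bytes (the tail of file block 2) and starts 8 sectors into the extent at sector 1008.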
int
genfs_size(v)
void *v;
{
struct vop_size_args /* {
struct vnode *a_vp;
off_t a_size;
off_t *a_eobp;
} */ *ap = v;
int bsize;

bsize = 1 << ap->a_vp->v_mount->mnt_fs_bshift;
*ap->a_eobp = (ap->a_size + bsize - 1) & ~(bsize - 1);
return 0;
}

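genfs_size() rounds the requested size up to the next filesystem-block boundary. The mask trick relies on the block size being a power of two, and the "- 1" keeps already-aligned sizes unchanged. A small self-contained check of the expression (hypothetical sizes, not from this commit):

#include <assert.h>

static long long
roundup_blk(long long size, int bsize)
{
        /* assumes bsize is a power of two */
        return (size + bsize - 1) & ~(long long)(bsize - 1);
}

int
main(void)
{
        assert(roundup_blk(1, 8192) == 8192);
        assert(roundup_blk(8192, 8192) == 8192);        /* aligned stays put */
        assert(roundup_blk(8193, 8192) == 16384);
        return 0;
}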
@ -1,4 +1,4 @@
/* $NetBSD: layer_subr.c,v 1.6 2000/03/16 18:08:24 jdolecek Exp $ */
/* $NetBSD: layer_subr.c,v 1.7 2000/11/27 08:39:45 chs Exp $ */

/*
* Copyright (c) 1999 National Aeronautics & Space Administration
@ -272,6 +272,7 @@ layer_node_alloc(mp, lowervp, vpp)
VREF(lowervp); /* Take into account reference held in layer_node */
hd = LAYER_NHASH(lmp, lowervp);
LIST_INSERT_HEAD(hd, xp, layer_hash);
uvm_vnp_setsize(vp, 0);
simple_unlock(&lmp->layerm_hashlock);
return (0);
}

@ -1,4 +1,4 @@
/* $NetBSD: procfs_subr.c,v 1.33 2000/11/24 18:58:37 chs Exp $ */
/* $NetBSD: procfs_subr.c,v 1.34 2000/11/27 08:39:46 chs Exp $ */

/*
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
@ -167,6 +167,7 @@ procfs_allocvp(mp, vpp, pid, pfs_type)
}

procfs_hashins(pfs);
uvm_vnp_setsize(vp, 0);
lockmgr(&pfs_hashlock, LK_RELEASE, NULL);

return (error);

@ -1,4 +1,4 @@
/* $NetBSD: sync_subr.c,v 1.7 2000/10/06 19:08:00 jdolecek Exp $ */
/* $NetBSD: sync_subr.c,v 1.8 2000/11/27 08:39:46 chs Exp $ */

/*
* Copyright 1997 Marshall Kirk McKusick. All Rights Reserved.
@ -188,16 +188,14 @@ sched_sync(v)
}
s = splbio();
if (LIST_FIRST(slp) == vp) {
if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
vp->v_type != VBLK)
panic("sched_sync: fsync failed vp %p tag %d",
vp, vp->v_tag);

/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*/

vn_syncer_add_to_worklist(vp, syncdelay);
}
splx(s);
@ -235,7 +233,7 @@ sched_sync(v)
* filesystem activity.
*/
if (time.tv_sec == starttime)
tsleep(&lbolt, PPAUSE, "syncer", 0);
tsleep(&rushjob, PPAUSE, "syncer", hz);
}
}

@ -247,21 +245,12 @@ sched_sync(v)
int
speedup_syncer()
{
int s;

/*
* XXX Should not be doing this, should be using ltsleep()
* XXX with a timeout, rather than sleeping on lbolt.
*/
SCHED_LOCK(s);
if (updateproc && updateproc->p_wchan == &lbolt)
setrunnable(updateproc);
SCHED_UNLOCK(s);

if (rushjob < syncdelay / 2) {
rushjob += 1;
stat_rush_requests += 1;
return (1);
if (rushjob >= syncdelay / 2) {
return (0);
}
return(0);

rushjob++;
wakeup(&rushjob);
stat_rush_requests += 1;
return (1);
}

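The rewritten speedup_syncer() inverts the old test: it bails out once rushjob has already reached syncdelay / 2, and otherwise bumps the counter and wakes the syncer, which now sleeps on &rushjob with a one-second timeout instead of on lbolt. A standalone sketch of that control flow (the variable names mirror the kernel's, but the harness is hypothetical):

#include <stdio.h>

static int rushjob;
static const int syncdelay = 30;

/* returns 1 if a rush was granted, 0 if the syncer is already maxed out */
static int
request_rush(void)
{
        if (rushjob >= syncdelay / 2)
                return 0;
        rushjob++;
        /* the kernel version also does wakeup(&rushjob) here */
        return 1;
}

int
main(void)
{
        int granted = 0;

        while (request_rush())
                granted++;
        printf("granted %d rush requests\n", granted); /* syncdelay / 2 = 15 */
        return 0;
}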
@ -1,4 +1,4 @@
/* $NetBSD: msdosfs_denode.c,v 1.46 2000/11/08 14:28:14 ad Exp $ */
/* $NetBSD: msdosfs_denode.c,v 1.47 2000/11/27 08:39:46 chs Exp $ */

/*-
* Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
@ -323,6 +323,7 @@ deget(pmp, dirclust, diroffset, depp)
nvp->v_type = VREG;
VREF(ldep->de_devvp);
*depp = ldep;
nvp->v_uvm.u_size = ldep->de_FileSize;
return (0);
}

@ -427,7 +428,7 @@ detrunc(dep, length, flags, cred, p)
#endif
return (error);
}
uvm_vnp_uncache(DETOV(dep)); /* what's this for? */

/*
* is this the right place for it?
*/

@ -1,4 +1,4 @@
/* $NetBSD: msdosfs_fat.c,v 1.33 2000/05/13 06:04:42 cgd Exp $ */
/* $NetBSD: msdosfs_fat.c,v 1.34 2000/11/27 08:39:46 chs Exp $ */

/*-
* Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
@ -965,6 +965,7 @@ fillinusemap(pmp)
* the de_flag field of the denode and it does not change the de_FileSize
* field. This is left for the caller to do.
*/

int
extendfile(dep, count, bpp, ncp, flags)
struct denode *dep;
@ -974,8 +975,7 @@ extendfile(dep, count, bpp, ncp, flags)
int flags;
{
int error;
u_long frcn;
u_long cn, got;
u_long frcn, cn, got, origcount;
struct msdosfsmount *pmp = dep->de_pmp;
struct buf *bp;

@ -1002,16 +1002,19 @@ extendfile(dep, count, bpp, ncp, flags)
return (error);
}

origcount = count;
while (count > 0) {

/*
* Allocate a new cluster chain and cat onto the end of the
* file. * If the file is empty we make de_StartCluster point
* file. If the file is empty we make de_StartCluster point
* to the new block. Note that de_StartCluster being 0 is
* sufficient to be sure the file is empty since we exclude
* attempts to extend the root directory above, and the root
* dir is the only file with a startcluster of 0 that has
* blocks allocated (sort of).
*/

if (dep->de_StartCluster == 0)
cn = 0;
else
@ -1046,41 +1049,33 @@ extendfile(dep, count, bpp, ncp, flags)
}

/*
* Update the "last cluster of the file" entry in the denode's fat
* cache.
* Update the "last cluster of the file" entry in the
* denode's fat cache.
*/
fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1);

if (flags & DE_CLEAR) {
fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1);
if ((flags & DE_CLEAR) &&
(dep->de_Attributes & ATTR_DIRECTORY)) {
while (got-- > 0) {
/*
* Get the buf header for the new block of the file.
*/
if (dep->de_Attributes & ATTR_DIRECTORY)
bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++),
pmp->pm_bpcluster, 0, 0);
else {
bp = getblk(DETOV(dep), de_cn2bn(pmp, frcn++),
bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++),
pmp->pm_bpcluster, 0, 0);
/*
* Do the bmap now, as in msdosfs_write
*/
if (pcbmap(dep,
de_bn2cn(pmp, bp->b_lblkno),
&bp->b_blkno, 0, 0))
bp->b_blkno = -1;
if (bp->b_blkno == -1)
panic("extendfile: pcbmap");
}
clrbuf(bp);
if (bpp) {
*bpp = bp;
bpp = NULL;
} else
bpp = NULL;
} else {
bdwrite(bp);
}
}
}
}

if ((flags & DE_CLEAR) && !(dep->de_Attributes & ATTR_DIRECTORY)) {
int cnshift = pmp->pm_cnshift;

uvm_vnp_zerorange(DETOV(dep), frcn << cnshift,
origcount << cnshift);
}

return (0);
}

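For regular files the new-cluster zeroing now goes through the page cache via uvm_vnp_zerorange() rather than through getblk()/clrbuf() on buffer-cache blocks, and the shifts convert cluster counts into byte offsets. As a worked example with hypothetical values: on a mount with pm_cnshift = 15 (32 KB clusters), extending by origcount = 2 clusters starting at file cluster frcn = 4 zeroes the byte range [4 << 15, 4 << 15 + 2 << 15) = [131072, 196608).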
@ -1,4 +1,4 @@
/* $NetBSD: msdosfs_vfsops.c,v 1.70 2000/09/19 22:02:10 fvdl Exp $ */
/* $NetBSD: msdosfs_vfsops.c,v 1.71 2000/11/27 08:39:47 chs Exp $ */

/*-
* Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
@ -699,6 +699,9 @@ msdosfs_mountfs(devvp, mp, p, argp)
mp->mnt_stat.f_fsid.val[0] = (long)dev;
mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_MSDOS);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_dev_bshift = pmp->pm_bnshift;
mp->mnt_fs_bshift = pmp->pm_cnshift;

#ifdef QUOTA
/*
* If we ever do quotas for DOS filesystems this would be a place

@ -1,4 +1,4 @@
/* $NetBSD: msdosfs_vnops.c,v 1.104 2000/08/03 20:41:29 thorpej Exp $ */
/* $NetBSD: msdosfs_vnops.c,v 1.105 2000/11/27 08:39:47 chs Exp $ */

/*-
* Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
@ -464,11 +464,11 @@ msdosfs_read(v)
int error = 0;
int64_t diff;
int blsize;
int isadir;
long n;
long on;
daddr_t lbn;
daddr_t rablock;
void *win;
vsize_t bytelen;
struct buf *bp;
struct vnode *vp = ap->a_vp;
struct denode *dep = VTODE(vp);
@ -478,12 +478,31 @@ msdosfs_read(v)
/*
* If they didn't ask for any data, then we are done.
*/

if (uio->uio_resid == 0)
return (0);
if (uio->uio_offset < 0)
return (EINVAL);

isadir = dep->de_Attributes & ATTR_DIRECTORY;
if (vp->v_type == VREG) {
while (uio->uio_resid > 0) {
bytelen = min(dep->de_FileSize - uio->uio_offset,
uio->uio_resid);

if (bytelen == 0)
break;
win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
&bytelen, UBC_READ);
error = uiomove(win, bytelen, uio);
ubc_release(win, 0);
if (error)
break;
}
dep->de_flag |= DE_ACCESS;
goto out;
}

/* this loop is only for directories now */
do {
lbn = de_cluster(pmp, uio->uio_offset);
on = uio->uio_offset & pmp->pm_crbomask;
@ -494,41 +513,28 @@ msdosfs_read(v)
diff = dep->de_FileSize - uio->uio_offset;
if (diff < n)
n = (long) diff;
/* convert cluster # to block # if a directory */
if (isadir) {
error = pcbmap(dep, lbn, &lbn, 0, &blsize);
if (error)
return (error);
}

/* convert cluster # to block # */
error = pcbmap(dep, lbn, &lbn, 0, &blsize);
if (error)
return (error);

/*
* If we are operating on a directory file then be sure to
* do i/o with the vnode for the filesystem instead of the
* vnode for the directory.
*/
if (isadir) {
error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp);
} else {
rablock = lbn + 1;
if (vp->v_lastr + 1 == lbn &&
de_cn2off(pmp, rablock) < dep->de_FileSize)
error = breada(vp, de_cn2bn(pmp, lbn),
pmp->pm_bpcluster, de_cn2bn(pmp, rablock),
pmp->pm_bpcluster, NOCRED, &bp);
else
error = bread(vp, de_cn2bn(pmp, lbn),
pmp->pm_bpcluster, NOCRED, &bp);
vp->v_lastr = lbn;
}
error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp);
n = min(n, pmp->pm_bpcluster - bp->b_resid);
if (error) {
brelse(bp);
return (error);
}
error = uiomove(bp->b_data + on, (int) n, uio);
if (!isadir)
dep->de_flag |= DE_ACCESS;
brelse(bp);
} while (error == 0 && uio->uio_resid > 0 && n != 0);

out:
if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
error = deupdat(dep, 1);
return (error);
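Note that ubc_alloc() treats its length argument as in/out: the caller asks for the whole remaining transfer and may get back a mapping that is shorter, which is why the read and write loops re-derive bytelen on every iteration. A user-space sketch of that windowed-copy pattern (simulate_ubc_alloc() is a stand-in for ubc_alloc(), and all values here are hypothetical):

#include <stdio.h>
#include <string.h>

#define WINSIZE 8192                    /* cf. UBC_WINSIZE */

static char file_image[3 * WINSIZE];    /* pretend backing object */

static void *
simulate_ubc_alloc(long long off, size_t *lenp)
{
        size_t inwin = off % WINSIZE;   /* offset within the window */

        if (*lenp > WINSIZE - inwin)
                *lenp = WINSIZE - inwin;        /* trim to window end */
        return &file_image[off];
}

int
main(void)
{
        char buf[20000];
        long long off = 100;
        size_t resid = sizeof buf;

        while (resid > 0) {
                size_t bytelen = resid;         /* ask for everything left */
                void *win = simulate_ubc_alloc(off, &bytelen);

                memcpy(buf + (sizeof buf - resid), win, bytelen);
                off += bytelen;
                resid -= bytelen;
        }
        printf("copied %zu bytes in windows\n", sizeof buf);
        return 0;
}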
@ -547,19 +553,19 @@ msdosfs_write(v)
int a_ioflag;
struct ucred *a_cred;
} */ *ap = v;
int n;
int croffset;
int resid;
u_long osize;
int error = 0;
u_long count;
daddr_t bn, lastcn;
struct buf *bp;
daddr_t lastcn;
int ioflag = ap->a_ioflag;
void *win;
vsize_t bytelen;
off_t oldoff;
boolean_t rv;
struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
struct vnode *vp = ap->a_vp;
struct vnode *thisvp;
struct denode *dep = VTODE(vp);
struct msdosfsmount *pmp = dep->de_pmp;
struct ucred *cred = ap->a_cred;
@ -575,7 +581,6 @@ msdosfs_write(v)
case VREG:
if (ioflag & IO_APPEND)
uio->uio_offset = dep->de_FileSize;
thisvp = vp;
break;
case VDIR:
return EISDIR;
@ -630,84 +635,53 @@ msdosfs_write(v)
} else
lastcn = de_clcount(pmp, osize) - 1;

if (dep->de_FileSize < uio->uio_offset + resid) {
dep->de_FileSize = uio->uio_offset + resid;
uvm_vnp_setsize(vp, dep->de_FileSize);
}

do {
if (de_cluster(pmp, uio->uio_offset) > lastcn) {
oldoff = uio->uio_offset;
if (de_cluster(pmp, oldoff) > lastcn) {
error = ENOSPC;
break;
}
bytelen = min(dep->de_FileSize - oldoff, uio->uio_resid);

bn = de_blk(pmp, uio->uio_offset);
if ((uio->uio_offset & pmp->pm_crbomask) == 0
&& (de_blk(pmp, uio->uio_offset + uio->uio_resid) > de_blk(pmp, uio->uio_offset)
|| uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) {
/*
* If either the whole cluster gets written,
* or we write the cluster from its start beyond EOF,
* then no need to read data from disk.
*/
bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0);
clrbuf(bp);
/*
* Do the bmap now, since pcbmap needs buffers
* for the fat table. (see msdosfs_strategy)
*/
if (bp->b_blkno == bp->b_lblkno) {
error = pcbmap(dep,
de_bn2cn(pmp, bp->b_lblkno),
&bp->b_blkno, 0, 0);
if (error)
bp->b_blkno = -1;
}
if (bp->b_blkno == -1) {
brelse(bp);
if (!error)
error = EIO; /* XXX */
break;
}
} else {
/*
* The block we need to write into exists, so read it in.
*/
error = bread(thisvp, bn, pmp->pm_bpcluster,
NOCRED, &bp);
if (error) {
brelse(bp);
break;
}
/*
* XXXUBC if file is mapped and this is the last block,
* process one page at a time.
*/

if (bytelen == 0)
break;
win = ubc_alloc(&vp->v_uvm.u_obj, oldoff, &bytelen, UBC_WRITE);
error = uiomove(win, bytelen, uio);
ubc_release(win, 0);
if (error) {
break;
}

croffset = uio->uio_offset & pmp->pm_crbomask;
n = min(uio->uio_resid, pmp->pm_bpcluster - croffset);
if (uio->uio_offset + n > dep->de_FileSize) {
dep->de_FileSize = uio->uio_offset + n;
uvm_vnp_setsize(vp, dep->de_FileSize);/* why? */
/*
* flush what we just wrote if necessary.
* XXXUBC simplistic async flushing.
*/

if (ioflag & IO_SYNC) {
simple_lock(&vp->v_uvm.u_obj.vmobjlock);
rv = vp->v_uvm.u_obj.pgops->pgo_flush(
&vp->v_uvm.u_obj, oldoff,
oldoff + bytelen, PGO_CLEANIT|PGO_SYNCIO);
simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
} else if (oldoff >> 16 != uio->uio_offset >> 16) {
simple_lock(&vp->v_uvm.u_obj.vmobjlock);
rv = vp->v_uvm.u_obj.pgops->pgo_flush(
&vp->v_uvm.u_obj, (oldoff >> 16) << 16,
(uio->uio_offset >> 16) << 16, PGO_CLEANIT);
simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
}
(void) uvm_vnp_uncache(vp); /* why not? */
/*
* Should these vnode_pager_* functions be done on dir
* files?
*/

/*
* Copy the data from user space into the buf header.
*/
error = uiomove(bp->b_data + croffset, n, uio);

/*
* If they want this synchronous then write it and wait for
* it. Otherwise, if on a cluster boundary write it
* asynchronously so we can move on to the next block
* without delay. Otherwise do a delayed write because we
* may want to write somemore into the block later.
*/
if (ioflag & IO_SYNC)
(void) bwrite(bp);
else if (n + croffset == pmp->pm_bpcluster)
bawrite(bp);
else
bdwrite(bp);
dep->de_flag |= DE_UPDATE;
} while (error == 0 && uio->uio_resid > 0);
dep->de_flag |= DE_UPDATE;

/*
* If the write failed and they want us to, truncate the file back
@ -720,7 +694,8 @@ errexit:
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
} else {
detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL);
detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED,
NULL);
if (uio->uio_resid != resid)
error = 0;
}
@ -1805,12 +1780,12 @@ msdosfs_strategy(v)
biodone(bp);
return (error);
}
#ifdef DIAGNOSTIC
#endif

/*
* Read/write the block from/to the disk that contains the desired
* file block.
*/

vp = dep->de_devvp;
bp->b_dev = vp->v_rdev;
VOCALL(vp->v_op, VOFFSET(vop_strategy), ap);
@ -1934,7 +1909,10 @@ struct vnodeopv_entry_desc msdosfs_vnodeop_entries[] = {
{ &vop_reallocblks_desc, msdosfs_reallocblks }, /* reallocblks */
{ &vop_update_desc, msdosfs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc *)NULL, (int (*) __P((void *)))NULL }
{ &vop_getpages_desc, genfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_putpages }, /* putpages */
{ &vop_size_desc, genfs_size }, /* size */
{ NULL, NULL }
};
struct vnodeopv_desc msdosfs_vnodeop_opv_desc =
{ &msdosfs_vnodeop_p, msdosfs_vnodeop_entries };

@ -1,4 +1,4 @@
/* $NetBSD: nfs.h,v 1.24 2000/09/19 22:05:55 fvdl Exp $ */
/* $NetBSD: nfs.h,v 1.25 2000/11/27 08:39:48 chs Exp $ */
/*
* Copyright (c) 1989, 1993, 1995
* The Regents of the University of California. All rights reserved.
@ -84,8 +84,18 @@ extern int nfs_niothreads; /* Number of async_daemons desired */
* DIRBLKSIZ.
*/

#if 1
/*
* XXXUBC temp hack because of the removal of b_validend.
* eventually we'll store NFS VDIR data in the page cache as well,
* we'll fix this at that point.
*/
#define NFS_DIRBLKSIZ PAGE_SIZE
#define NFS_DIRFRAGSIZ PAGE_SIZE
#else
#define NFS_DIRBLKSIZ 8192 /* Must be a multiple of DIRBLKSIZ */
#define NFS_DIRFRAGSIZ 512 /* Same as DIRBLKSIZ, generally */
#endif

/*
* Maximum number of directory entries cached per NFS node, to avoid
@ -120,10 +130,10 @@ extern int nfs_niothreads; /* Number of async_daemons desired */
#endif

/*
* The B_INVAFTERWRITE flag should be set to whatever is required by the
* buffer cache code to say "Invalidate the block after it is written back".
* Use the vm_page flag reserved for pager use to indicate pages
* which have been written to the server but not yet committed.
*/
#define B_INVAFTERWRITE B_INVAL
#define PG_NEEDCOMMIT PG_PAGER1

/*
* The IO_METASYNC flag should be implemented for local file systems.

sys/nfs/nfs_bio.c: 1058 lines changed; file diff suppressed because it is too large.
@ -1,4 +1,4 @@
/* $NetBSD: nfs_node.c,v 1.37 2000/11/08 14:28:15 ad Exp $ */
/* $NetBSD: nfs_node.c,v 1.38 2000/11/27 08:39:48 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -140,7 +140,7 @@ nfs_nget(mntp, fhp, fhsize, npp)
loop:
for (np = nhpp->lh_first; np != 0; np = np->n_hash.le_next) {
if (mntp != NFSTOV(np)->v_mount || np->n_fhsize != fhsize ||
memcmp((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize))
memcmp(fhp, np->n_fhp, fhsize))
continue;
vp = NFSTOV(np);
if (vget(vp, LK_EXCLUSIVE))
@ -159,10 +159,11 @@ loop:
nvp->v_vnlock = 0; /* XXX At least untill we do locking */
vp = nvp;
np = pool_get(&nfs_node_pool, PR_WAITOK);
memset((caddr_t)np, 0, sizeof *np);
memset(np, 0, sizeof *np);
lockinit(&np->n_commitlock, PINOD, "nfsclock", 0, 0);
vp->v_data = np;
np->n_vnode = vp;

/*
* Insert the nfsnode in the hash queue for its new file handle
*/
@ -171,11 +172,21 @@ loop:
np->n_fhp = malloc(fhsize, M_NFSBIGFH, M_WAITOK);
} else
np->n_fhp = &np->n_fh;
memcpy((caddr_t)np->n_fhp, (caddr_t)fhp, fhsize);
memcpy(np->n_fhp, fhp, fhsize);
np->n_fhsize = fhsize;
np->n_accstamp = -1;
np->n_vattr = pool_get(&nfs_vattr_pool, PR_WAITOK);
memset(np->n_vattr, 0, sizeof (struct vattr));

/*
* XXXUBC doing this while holding the nfs_hashlock is bad,
* but there's no alternative at the moment.
*/
error = VOP_GETATTR(vp, np->n_vattr, curproc->p_ucred, curproc);
if (error) {
return error;
}
uvm_vnp_setsize(vp, np->n_vattr->va_size);

lockmgr(&nfs_hashlock, LK_RELEASE, 0);
*npp = np;
return (0);
@ -227,7 +238,7 @@ nfs_inactive(v)
nfs_removeit(sp);
crfree(sp->s_cred);
vrele(sp->s_dvp);
FREE((caddr_t)sp, M_NFSREQ);
FREE(sp, M_NFSREQ);
}
np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT | NQNFSEVICTED |
NQNFSNONCACHE | NQNFSWRITE);
@ -272,12 +283,18 @@ nfs_reclaim(v)
FREE(np->n_dircache, M_NFSDIROFF);
}
if (np->n_fhsize > NFS_SMALLFH) {
free((caddr_t)np->n_fhp, M_NFSBIGFH);
free(np->n_fhp, M_NFSBIGFH);
}

pool_put(&nfs_vattr_pool, np->n_vattr);
if (np->n_rcred) {
crfree(np->n_rcred);
}
if (np->n_wcred) {
crfree(np->n_wcred);
}
cache_purge(vp);
pool_put(&nfs_node_pool, vp->v_data);
vp->v_data = (void *)0;
vp->v_data = NULL;
return (0);
}

@ -1,4 +1,4 @@
/* $NetBSD: nfs_serv.c,v 1.58 2000/09/19 22:05:29 fvdl Exp $ */
/* $NetBSD: nfs_serv.c,v 1.59 2000/11/27 08:39:49 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -1729,7 +1729,6 @@ nfsrv_remove(nfsd, slp, procp, mrq)
}
out:
if (!error) {
(void)uvm_vnp_uncache(vp);
nqsrv_getl(nd.ni_dvp, ND_WRITE);
nqsrv_getl(vp, ND_WRITE);
error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
@ -1904,7 +1903,6 @@ out:
nqsrv_getl(fromnd.ni_dvp, ND_WRITE);
nqsrv_getl(tdvp, ND_WRITE);
if (tvp) {
(void)uvm_vnp_uncache(tvp);
nqsrv_getl(tvp, ND_WRITE);
}
error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
@ -3389,12 +3387,12 @@ nfsrv_access(vp, flags, cred, rdonly, p, override)
break;
}
}

/*
* If there's shared text associated with
* the inode, try to free it up once. If
* we fail, we can't allow writing.
* If the vnode is in use as a process's text,
* we can't allow writing.
*/
if ((vp->v_flag & VTEXT) && !uvm_vnp_uncache(vp))
if (vp->v_flag & VTEXT)
return (ETXTBSY);
}
error = VOP_GETATTR(vp, &vattr, cred, p);

@ -1,4 +1,4 @@
/* $NetBSD: nfs_subs.c,v 1.88 2000/11/08 14:28:15 ad Exp $ */
/* $NetBSD: nfs_subs.c,v 1.89 2000/11/27 08:39:49 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -1665,17 +1665,14 @@ nfs_loadattrcache(vpp, fp, vaper)
vap->va_filerev = 0;
}
if (vap->va_size != np->n_size) {
if (vap->va_type == VREG) {
if (np->n_flag & NMODIFIED) {
if (vap->va_size < np->n_size)
vap->va_size = np->n_size;
else
np->n_size = vap->va_size;
} else
np->n_size = vap->va_size;
uvm_vnp_setsize(vp, np->n_size);
} else
if ((np->n_flag & NMODIFIED) && vap->va_size < np->n_size) {
vap->va_size = np->n_size;
} else {
np->n_size = vap->va_size;
if (vap->va_type == VREG) {
uvm_vnp_setsize(vp, np->n_size);
}
}
}
np->n_attrstamp = time.tv_sec;
if (vaper != NULL) {
@ -2366,7 +2363,6 @@ netaddr_match(family, haddr, nam)
return (0);
}


/*
* The write verifier has changed (probably due to a server reboot), so all
* B_NEEDCOMMIT blocks will have to be written again. Since they are on the
@ -2377,17 +2373,14 @@ void
nfs_clearcommit(mp)
struct mount *mp;
{
struct vnode *vp, *nvp;
struct buf *bp, *nbp;
struct vnode *vp;
struct nfsnode *np;
struct vm_page *pg;
int s;

s = splbio();
loop:
for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
if (vp->v_mount != mp) /* Paranoia */
goto loop;
nvp = vp->v_mntvnodes.le_next;
LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
KASSERT(vp->v_mount == mp);
if (vp->v_type == VNON)
continue;
np = VTONFS(vp);
@ -2395,12 +2388,11 @@ loop:
np->n_pushedhi = 0;
np->n_commitflags &=
~(NFS_COMMIT_PUSH_VALID | NFS_COMMIT_PUSHED_VALID);
for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
nbp = bp->b_vnbufs.le_next;
if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bp->b_flags &= ~B_NEEDCOMMIT;
simple_lock(&vp->v_uvm.u_obj.vmobjlock);
TAILQ_FOREACH(pg, &vp->v_uvm.u_obj.memq, listq) {
pg->flags &= ~PG_NEEDCOMMIT;
}
simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
}
splx(s);
}
@ -2432,47 +2424,47 @@ nfs_merge_commit_ranges(vp)
}

int
nfs_in_committed_range(vp, bp)
nfs_in_committed_range(vp, off, len)
struct vnode *vp;
struct buf *bp;
off_t off, len;
{
struct nfsnode *np = VTONFS(vp);
off_t lo, hi;

if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID))
return 0;
lo = (off_t)bp->b_blkno * DEV_BSIZE;
hi = lo + bp->b_dirtyend;
lo = off;
hi = lo + len;

return (lo >= np->n_pushedlo && hi <= np->n_pushedhi);
}

int
nfs_in_tobecommitted_range(vp, bp)
nfs_in_tobecommitted_range(vp, off, len)
struct vnode *vp;
struct buf *bp;
off_t off, len;
{
struct nfsnode *np = VTONFS(vp);
off_t lo, hi;

if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID))
return 0;
lo = (off_t)bp->b_blkno * DEV_BSIZE;
hi = lo + bp->b_dirtyend;
lo = off;
hi = lo + len;

return (lo >= np->n_pushlo && hi <= np->n_pushhi);
}

void
nfs_add_committed_range(vp, bp)
nfs_add_committed_range(vp, off, len)
struct vnode *vp;
struct buf *bp;
off_t off, len;
{
struct nfsnode *np = VTONFS(vp);
off_t lo, hi;

lo = (off_t)bp->b_blkno * DEV_BSIZE;
hi = lo + bp->b_dirtyend;
lo = off;
hi = lo + len;

if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) {
np->n_pushedlo = lo;
@ -2491,9 +2483,9 @@ nfs_add_committed_range(vp, bp)
}

void
nfs_del_committed_range(vp, bp)
nfs_del_committed_range(vp, off, len)
struct vnode *vp;
struct buf *bp;
off_t off, len;
{
struct nfsnode *np = VTONFS(vp);
off_t lo, hi;
@ -2501,8 +2493,8 @@ nfs_del_committed_range(vp, bp)
if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID))
return;

lo = (off_t)bp->b_blkno * DEV_BSIZE;
hi = lo + bp->b_dirtyend;
lo = off;
hi = lo + len;

if (lo > np->n_pushedhi || hi < np->n_pushedlo)
return;
@ -2528,15 +2520,15 @@ nfs_del_committed_range(vp, bp)
}

void
nfs_add_tobecommitted_range(vp, bp)
nfs_add_tobecommitted_range(vp, off, len)
struct vnode *vp;
struct buf *bp;
off_t off, len;
{
struct nfsnode *np = VTONFS(vp);
off_t lo, hi;

lo = (off_t)bp->b_blkno * DEV_BSIZE;
hi = lo + bp->b_dirtyend;
lo = off;
hi = lo + len;

if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID)) {
np->n_pushlo = lo;
@ -2555,9 +2547,9 @@ nfs_add_tobecommitted_range(vp, bp)
}

void
nfs_del_tobecommitted_range(vp, bp)
nfs_del_tobecommitted_range(vp, off, len)
struct vnode *vp;
struct buf *bp;
off_t off, len;
{
struct nfsnode *np = VTONFS(vp);
off_t lo, hi;
@ -2565,8 +2557,8 @@ nfs_del_tobecommitted_range(vp, bp)
if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID))
return;

lo = (off_t)bp->b_blkno * DEV_BSIZE;
hi = lo + bp->b_dirtyend;
lo = off;
hi = lo + len;

if (lo > np->n_pushhi || hi < np->n_pushlo)
return;

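The commit-range helpers above now take plain (offset, length) pairs instead of deriving the range from a buffer's b_blkno and b_dirtyend, and they maintain a single per-node interval: grown on add, tested for containment on query (the kernel versions also trim the interval on delete, which this sketch omits). A minimal user-space sketch of that bookkeeping, with illustrative names that are not the kernel's:

#include <assert.h>

struct range {
        long long lo, hi;       /* [lo, hi) */
        int valid;
};

static void
range_add(struct range *r, long long off, long long len)
{
        if (!r->valid) {
                r->lo = off;
                r->hi = off + len;
                r->valid = 1;
                return;
        }
        if (off < r->lo)
                r->lo = off;
        if (off + len > r->hi)
                r->hi = off + len;
}

static int
range_contains(const struct range *r, long long off, long long len)
{
        return r->valid && off >= r->lo && off + len <= r->hi;
}

int
main(void)
{
        struct range r = { 0, 0, 0 };

        range_add(&r, 4096, 4096);      /* [4096, 8192) */
        range_add(&r, 8192, 8192);      /* grows to [4096, 16384) */
        assert(range_contains(&r, 6000, 1000));
        assert(!range_contains(&r, 0, 100));
        return 0;
}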
@ -1,4 +1,4 @@
/* $NetBSD: nfs_syscalls.c,v 1.47 2000/11/24 23:30:03 chs Exp $ */
/* $NetBSD: nfs_syscalls.c,v 1.48 2000/11/27 08:39:50 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -970,10 +970,7 @@ nfssvc_iod(p)
nmp->nm_bufqwant = FALSE;
wakeup(&nmp->nm_bufq);
}
if (bp->b_flags & B_READ)
(void) nfs_doio(bp, bp->b_rcred, (struct proc *)0);
else
(void) nfs_doio(bp, bp->b_wcred, (struct proc *)0);
(void) nfs_doio(bp, NULL);
/*
* If there are more than one iod on this mount, then defect
* so that the iods can be shared out fairly between the mounts

@ -1,4 +1,4 @@
/* $NetBSD: nfs_var.h,v 1.18 2000/09/19 22:14:59 fvdl Exp $ */
/* $NetBSD: nfs_var.h,v 1.19 2000/11/27 08:39:50 chs Exp $ */

/*-
* Copyright (c) 1996 The NetBSD Foundation, Inc.
@ -74,8 +74,8 @@ int nfs_bioread __P((struct vnode *, struct uio *, int, struct ucred *, int));
struct buf *nfs_getcacheblk __P((struct vnode *, daddr_t, int, struct proc *));
int nfs_vinvalbuf __P((struct vnode *, int, struct ucred *, struct proc *,
int));
int nfs_asyncio __P((struct buf *, struct ucred *));
int nfs_doio __P((struct buf *, struct ucred *, struct proc *));
int nfs_asyncio __P((struct buf *));
int nfs_doio __P((struct buf *, struct proc *));

/* nfs_boot.c */
/* see nfsdiskless.h */
@ -91,9 +91,8 @@ int nfs_null __P((struct vnode *, struct ucred *, struct proc *));
int nfs_setattrrpc __P((struct vnode *, struct vattr *, struct ucred *,
struct proc *));
int nfs_readlinkrpc __P((struct vnode *, struct uio *, struct ucred *));
int nfs_readrpc __P((struct vnode *, struct uio *, struct ucred *));
int nfs_writerpc __P((struct vnode *, struct uio *, struct ucred *, int *,
int *));
int nfs_readrpc __P((struct vnode *, struct uio *));
int nfs_writerpc __P((struct vnode *, struct uio *, int *, int *));
int nfs_mknodrpc __P((struct vnode *, struct vnode **, struct componentname *,
struct vattr *));
int nfs_removeit __P((struct sillyrename *));
@ -109,8 +108,7 @@ int nfs_sillyrename __P((struct vnode *, struct vnode *,
struct componentname *));
int nfs_lookitup __P((struct vnode *, const char *, int, struct ucred *,
struct proc *, struct nfsnode **));
int nfs_commit __P((struct vnode *, u_quad_t, unsigned, struct ucred *,
struct proc *));
int nfs_commit __P((struct vnode *, off_t, uint32_t, struct proc *));
int nfs_flush __P((struct vnode *, struct ucred *, int, struct proc *, int));

/* nfs_nqlease.c */
@ -267,12 +265,12 @@ int netaddr_match __P((int, union nethostaddr *, struct mbuf *));

void nfs_clearcommit __P((struct mount *));
void nfs_merge_commit_ranges __P((struct vnode *));
int nfs_in_committed_range __P((struct vnode *, struct buf *));
int nfs_in_tobecommitted_range __P((struct vnode *, struct buf *));
void nfs_add_committed_range __P((struct vnode *, struct buf *));
void nfs_del_committed_range __P((struct vnode *, struct buf *));
void nfs_add_tobecommitted_range __P((struct vnode *, struct buf *));
void nfs_del_tobecommitted_range __P((struct vnode *, struct buf *));
int nfs_in_committed_range __P((struct vnode *, off_t, off_t));
int nfs_in_tobecommitted_range __P((struct vnode *, off_t, off_t));
void nfs_add_committed_range __P((struct vnode *, off_t, off_t));
void nfs_del_committed_range __P((struct vnode *, off_t, off_t));
void nfs_add_tobecommitted_range __P((struct vnode *, off_t, off_t));
void nfs_del_tobecommitted_range __P((struct vnode *, off_t, off_t));

int nfsrv_errmap __P((struct nfsrv_descript *, int));
void nfsrvw_sort __P((gid_t *, int));

@ -1,4 +1,4 @@
/* $NetBSD: nfs_vfsops.c,v 1.96 2000/09/19 22:15:41 fvdl Exp $ */
/* $NetBSD: nfs_vfsops.c,v 1.97 2000/11/27 08:39:50 chs Exp $ */

/*
* Copyright (c) 1989, 1993, 1995
@ -689,6 +689,8 @@ mountnfs(argp, mp, nam, pth, hst, vpp, p)
#else
mp->mnt_stat.f_type = 0;
#endif
mp->mnt_fs_bshift = DEV_BSHIFT;
mp->mnt_dev_bshift = -1;
strncpy(&mp->mnt_stat.f_fstypename[0], mp->mnt_op->vfs_name,
MFSNAMELEN);
memcpy(mp->mnt_stat.f_mntfromname, hst, MNAMELEN);

@ -1,4 +1,4 @@
/* $NetBSD: nfs_vnops.c,v 1.123 2000/11/08 05:20:32 chs Exp $ */
/* $NetBSD: nfs_vnops.c,v 1.124 2000/11/27 08:39:51 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -43,6 +43,7 @@
*/

#include "opt_nfs.h"
#include "opt_uvmhist.h"

#include <sys/param.h>
#include <sys/proc.h>
@ -63,6 +64,7 @@
#include <sys/unistd.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm.h>

#include <miscfs/fifofs/fifo.h>
#include <miscfs/genfs/genfs.h>
@ -136,7 +138,9 @@ struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = {
{ &vop_truncate_desc, nfs_truncate }, /* truncate */
{ &vop_update_desc, nfs_update }, /* update */
{ &vop_bwrite_desc, nfs_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
{ &vop_getpages_desc, nfs_getpages }, /* getpages */
{ &vop_putpages_desc, nfs_putpages }, /* putpages */
{ NULL, NULL }
};
struct vnodeopv_desc nfsv2_vnodeop_opv_desc =
{ &nfsv2_vnodeop_p, nfsv2_vnodeop_entries };
@ -163,7 +167,7 @@ struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = {
{ &vop_poll_desc, spec_poll }, /* poll */
{ &vop_revoke_desc, spec_revoke }, /* revoke */
{ &vop_mmap_desc, spec_mmap }, /* mmap */
{ &vop_fsync_desc, nfs_fsync }, /* fsync */
{ &vop_fsync_desc, spec_fsync }, /* fsync */
{ &vop_seek_desc, spec_seek }, /* seek */
{ &vop_remove_desc, spec_remove }, /* remove */
{ &vop_link_desc, spec_link }, /* link */
@ -191,7 +195,7 @@ struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = {
{ &vop_truncate_desc, spec_truncate }, /* truncate */
{ &vop_update_desc, nfs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
{ NULL, NULL }
};
struct vnodeopv_desc spec_nfsv2nodeop_opv_desc =
{ &spec_nfsv2nodeop_p, spec_nfsv2nodeop_entries };
@ -243,7 +247,7 @@ struct vnodeopv_entry_desc fifo_nfsv2nodeop_entries[] = {
{ &vop_truncate_desc, fifo_truncate }, /* truncate */
{ &vop_update_desc, nfs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
{ NULL, NULL }
};
struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc =
{ &fifo_nfsv2nodeop_p, fifo_nfsv2nodeop_entries };
@ -432,11 +436,9 @@ nfs_open(v)
int error;

if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
#ifdef DIAGNOSTIC
printf("open eacces vtyp=%d\n",vp->v_type);
#endif
return (EACCES);
}

#ifndef NFS_V2_ONLY
/*
* Get a valid lease. If cached data is stale, flush it.
@ -454,7 +456,6 @@ nfs_open(v)
if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred,
ap->a_p, 1)) == EINTR)
return (error);
(void) uvm_vnp_uncache(vp);
np->n_brev = np->n_lrev;
}
}
@ -465,7 +466,6 @@ nfs_open(v)
if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred,
ap->a_p, 1)) == EINTR)
return (error);
(void) uvm_vnp_uncache(vp);
np->n_attrstamp = 0;
if (vp->v_type == VDIR) {
nfs_invaldircache(vp, 0);
@ -487,7 +487,6 @@ nfs_open(v)
if ((error = nfs_vinvalbuf(vp, V_SAVE,
ap->a_cred, ap->a_p, 1)) == EINTR)
return (error);
(void) uvm_vnp_uncache(vp);
np->n_mtime = vattr.va_mtime.tv_sec;
}
}
@ -542,6 +541,7 @@ nfs_close(v)
struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
int error = 0;
UVMHIST_FUNC("nfs_close"); UVMHIST_CALLED(ubchist);

if (vp->v_type == VREG) {
if ((VFSTONFS(vp->v_mount)->nm_flag & NFSMNT_NQNFS) == 0 &&
@ -558,6 +558,7 @@ nfs_close(v)
error = np->n_error;
}
}
UVMHIST_LOG(ubchist, "returning %d", error,0,0,0);
return (error);
}

@ -1020,10 +1021,9 @@ nfs_readlinkrpc(vp, uiop, cred)
* Ditto above
*/
int
nfs_readrpc(vp, uiop, cred)
nfs_readrpc(vp, uiop)
struct vnode *vp;
struct uio *uiop;
struct ucred *cred;
{
u_int32_t *tl;
caddr_t cp;
@ -1055,7 +1055,8 @@ nfs_readrpc(vp, uiop, cred)
*tl++ = txdr_unsigned(len);
*tl = 0;
}
nfsm_request(vp, NFSPROC_READ, uiop->uio_procp, cred);
nfsm_request(vp, NFSPROC_READ, uiop->uio_procp,
VTONFS(vp)->n_rcred);
if (v3) {
nfsm_postop_attr(vp, attrflag);
if (error) {
@ -1084,10 +1085,9 @@ nfsmout:
* nfs write call
*/
int
nfs_writerpc(vp, uiop, cred, iomode, must_commit)
nfs_writerpc(vp, uiop, iomode, must_commit)
struct vnode *vp;
struct uio *uiop;
struct ucred *cred;
int *iomode, *must_commit;
{
u_int32_t *tl;
@ -1110,7 +1110,7 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit)
return (EFBIG);
while (tsiz > 0) {
nfsstats.rpccnt[NFSPROC_WRITE]++;
len = (tsiz > nmp->nm_wsize) ? nmp->nm_wsize : tsiz;
len = min(tsiz, nmp->nm_wsize);
nfsm_reqhead(vp, NFSPROC_WRITE,
NFSX_FH(v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len));
nfsm_fhtom(vp, v3);
@ -1135,7 +1135,8 @@ nfs_writerpc(vp, uiop, cred, iomode, must_commit)

}
nfsm_uiotom(uiop, len);
nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp, cred);
nfsm_request(vp, NFSPROC_WRITE, uiop->uio_procp,
VTONFS(vp)->n_wcred);
if (v3) {
wccflag = NFSV3_WCCCHK;
nfsm_wcc_data(vp, wccflag);
@ -2595,11 +2596,10 @@ nfs_lookitup(dvp, name, len, cred, procp, npp)
* Nfs Version 3 commit rpc
*/
int
nfs_commit(vp, offset, cnt, cred, procp)
nfs_commit(vp, offset, cnt, procp)
struct vnode *vp;
u_quad_t offset;
unsigned cnt;
struct ucred *cred;
off_t offset;
uint32_t cnt;
struct proc *procp;
{
caddr_t cp;
@ -2624,7 +2624,7 @@ nfs_commit(vp, offset, cnt, cred, procp)
txdr_hyper(offset, tl);
tl += 2;
*tl = txdr_unsigned(cnt);
nfsm_request(vp, NFSPROC_COMMIT, procp, cred);
nfsm_request(vp, NFSPROC_COMMIT, procp, VTONFS(vp)->n_wcred);
nfsm_wcc_data(vp, wccflag);
if (!error) {
nfsm_dissect(tl, u_int32_t *, NFSX_V3WRITEVERF);
@ -2680,28 +2680,25 @@ nfs_strategy(v)
{
struct vop_strategy_args *ap = v;
struct buf *bp = ap->a_bp;
struct ucred *cr;
struct proc *p;
int error = 0;

if ((bp->b_flags & (B_PHYS|B_ASYNC)) == (B_PHYS|B_ASYNC))
panic("nfs physio/async");
if (bp->b_flags & B_ASYNC)
p = (struct proc *)0;
p = NULL;
else
p = curproc; /* XXX */
if (bp->b_flags & B_READ)
cr = bp->b_rcred;
else
cr = bp->b_wcred;

/*
* If the op is asynchronous and an i/o daemon is waiting
* queue the request, wake it up and wait for completion
* otherwise just do it ourselves.
*/

if ((bp->b_flags & B_ASYNC) == 0 ||
nfs_asyncio(bp, NOCRED))
error = nfs_doio(bp, cr, p);
nfs_asyncio(bp))
error = nfs_doio(bp, p);
return (error);
}

@ -2750,16 +2747,7 @@ nfs_fsync(v)
}

/*
* Flush all the blocks associated with a vnode.
* Walk through the buffer pool and push any dirty pages
* associated with the vnode.
*
* Don't bother to cluster commits; the commitrange code will
* do that. In the first pass, push all dirty buffers to the
* server, using stable writes if commit is set to 1.
* In the 2nd pass, push anything that might be left,
* i.e. the buffer was busy in the first pass, or it wasn't
* committed in the first pass.
* Flush all the data associated with a vnode.
*/
int
nfs_flush(vp, cred, waitfor, p, commit)
@ -2769,104 +2757,25 @@ nfs_flush(vp, cred, waitfor, p, commit)
struct proc *p;
int commit;
{
struct uvm_object *uobj = &vp->v_uvm.u_obj;
struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
struct buf *bp;
struct buf *nbp;
int pass, s, error, slpflag, slptimeo;
int error;
int flushflags = PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO;
int rv;
UVMHIST_FUNC("nfs_flush"); UVMHIST_CALLED(ubchist);

pass = 1;
error = 0;
slptimeo = 0;
slpflag = nmp->nm_flag & NFSMNT_INT ? PCATCH : 0;
loop:
s = splbio();
for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) {
nbp = bp->b_vnbufs.le_next;
if (bp->b_flags & B_BUSY) {
if (pass == 2 && waitfor == MNT_WAIT) {
bp->b_flags |= B_WANTED;
error = tsleep((caddr_t)bp,
slpflag | (PRIBIO + 1),
"nfsfsync", slptimeo);
splx(s);
if (error) {
if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
return (EINTR);
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
}
goto loop;
} else
continue;
}
#ifdef DIAGNOSTIC
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfs_fsync: not dirty");
#endif
if (!commit && (bp->b_flags & B_NEEDCOMMIT))
continue;
/*
* Note: can't use B_VFLUSH here, since there is no
* real vnode lock, so we can't leave the buffer on
* the freelist.
*/
bremfree(bp);
if (commit && vp->v_type == VREG)
/*
* Setting B_NOCACHE has the effect
* effect of nfs_doio using a stable write
* RPC. XXX this abuses the B_NOCACHE flag,
* but it is needed to tell nfs_strategy
* that this buffer is async, but needs to
* be written with a stable RPC. nfs_doio
* will remove B_NOCACHE again.
*/
bp->b_flags |= B_NOCACHE;

bp->b_flags |= B_BUSY | B_ASYNC;
splx(s);
VOP_BWRITE(bp);
goto loop;
}
splx(s);

if (commit && pass == 1) {
pass = 2;
goto loop;
}

if (waitfor == MNT_WAIT) {
s = splbio();
while (vp->v_numoutput) {
vp->v_flag |= VBWAIT;
error = tsleep((caddr_t)&vp->v_numoutput,
slpflag | (PRIBIO + 1), "nfsfsync", slptimeo);
if (error) {
splx(s);
if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
return (EINTR);
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
s = splbio();
}
}
splx(s);
if (vp->v_dirtyblkhd.lh_first && commit) {
#if 0
vprint("nfs_fsync: dirty", vp);
#endif
goto loop;
}
simple_lock(&uobj->vmobjlock);
rv = (uobj->pgops->pgo_flush)(uobj, 0, 0, flushflags);
simple_unlock(&uobj->vmobjlock);
if (!rv) {
error = EIO;
}
if (np->n_flag & NWRITEERR) {
error = np->n_error;
np->n_flag &= ~NWRITEERR;
}
UVMHIST_LOG(ubchist, "returning %d", error,0,0,0);
return (error);
}


@ -1,4 +1,4 @@
/* $NetBSD: nfsnode.h,v 1.30 2000/09/19 22:18:40 fvdl Exp $ */
/* $NetBSD: nfsnode.h,v 1.31 2000/11/27 08:39:51 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -140,6 +140,8 @@ struct nfsnode {
off_t n_pushhi; /* Last block in range */
struct lock n_commitlock; /* Serialize commits XXX */
int n_commitflags;
struct ucred *n_rcred;
struct ucred *n_wcred;
};

/*
@ -173,7 +175,7 @@ struct nfsnode {
* Convert between nfsnode pointers and vnode pointers
*/
#define VTONFS(vp) ((struct nfsnode *)(vp)->v_data)
#define NFSTOV(np) ((struct vnode *)(np)->n_vnode)
#define NFSTOV(np) ((np)->n_vnode)

/*
* Queue head for nfsiod's
@ -235,6 +237,8 @@ int nfs_bwrite __P((void *));
#define nfs_vfree genfs_nullop
int nfs_truncate __P((void *));
int nfs_update __P((void *));
int nfs_getpages __P((void *));
int nfs_putpages __P((void *));

extern int (**nfsv2_vnodeop_p) __P((void *));


@ -1,4 +1,4 @@
/* $NetBSD: buf.h,v 1.43 2000/04/10 02:22:15 chs Exp $ */
/* $NetBSD: buf.h,v 1.44 2000/11/27 08:39:51 chs Exp $ */

/*-
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -190,13 +190,8 @@ struct buf {
number (not partition relative) */
/* Function to call upon completion. */
void (*b_iodone) __P((struct buf *));
struct vnode *b_vp; /* Device vnode. */
int b_dirtyoff; /* Offset in buffer of dirty region. */
int b_dirtyend; /* Offset of end of dirty region. */
struct ucred *b_rcred; /* Read credentials reference. */
struct ucred *b_wcred; /* Write credentials reference. */
int b_validoff; /* Offset in buffer of valid region. */
int b_validend; /* Offset of end of valid region. */
struct vnode *b_vp; /* File vnode. */
void *b_private; /* Private data for owner */
off_t b_dcookie; /* Offset cookie if dir block */
struct workhead b_dep; /* List of filesystem dependencies. */
};
@ -230,15 +225,16 @@ struct buf {
#define B_LOCKED 0x00004000 /* Locked in core (not reusable). */
#define B_NOCACHE 0x00008000 /* Do not cache block after use. */
#define B_ORDERED 0x00010000 /* ordered I/O request */
#define B_CACHE 0x00020000 /* Bread found us in the cache. */
#define B_PHYS 0x00040000 /* I/O to user memory. */
#define B_RAW 0x00080000 /* Set by physio for raw transfers. */
#define B_READ 0x00100000 /* Read buffer. */
#define B_TAPE 0x00200000 /* Magnetic tape I/O. */
#define B_WANTED 0x00800000 /* Process wants this buffer. */
#define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */
#define B_WRITEINPROG 0x01000000 /* Write in progress. */
#define B_XXX 0x02000000 /* Debugging flag. */
#define B_VFLUSH 0x04000000 /* Buffer is being synced. */
#define B_PDAEMON 0x10000000 /* I/O initiated by pagedaemon. */

/*
* This structure describes a clustered I/O. It is stored in the b_saveaddr
@ -268,6 +264,7 @@ do { \
#define B_SYNC 0x02 /* Do all allocations synchronously. */

#ifdef _KERNEL

extern int nbuf; /* The number of buffer headers */
extern struct buf *buf; /* The buffer headers. */
extern char *buffers; /* The buffer contents. */

@ -1,4 +1,4 @@
/* $NetBSD: mount.h,v 1.86 2000/06/26 21:10:34 christos Exp $ */
/* $NetBSD: mount.h,v 1.87 2000/11/27 08:39:52 chs Exp $ */

/*
* Copyright (c) 1989, 1991, 1993
@ -131,6 +131,8 @@ struct mount {
struct lock mnt_lock; /* mount structure lock */
int mnt_flag; /* flags */
int mnt_maxsymlinklen; /* max size of short symlink */
int mnt_fs_bshift; /* offset shift for lblkno */
int mnt_dev_bshift; /* shift for device sectors */
struct statfs mnt_stat; /* cache of filesystem stats */
qaddr_t mnt_data; /* private data */
int mnt_wcnt; /* count of vfs_busy waiters */

@ -1,4 +1,4 @@
/* $NetBSD: param.h,v 1.111 2000/11/11 00:53:24 thorpej Exp $ */
/* $NetBSD: param.h,v 1.112 2000/11/27 08:39:52 chs Exp $ */

/*-
* Copyright (c) 1982, 1986, 1989, 1993
@ -252,4 +252,15 @@
#define FSHIFT 11 /* bits to right of fixed binary point */
#define FSCALE (1<<FSHIFT)

/*
* Defaults for Unified Buffer Cache parameters.
*/

#ifndef UBC_WINSIZE
#define UBC_WINSIZE 8192
#endif
#ifndef UBC_NWINS
#define UBC_NWINS 1024
#endif

#endif /* !_SYS_PARAM_H_ */

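Under these defaults the UBC window map covers UBC_NWINS * UBC_WINSIZE = 1024 * 8192 bytes = 8 MB of kernel virtual address space for file-data windows; the #ifndef guards let individual ports override either value.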
@ -1,4 +1,4 @@
/* $NetBSD: vnode.h,v 1.83 2000/07/09 00:59:05 mycroft Exp $ */
/* $NetBSD: vnode.h,v 1.84 2000/11/27 08:39:52 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -85,8 +85,10 @@ LIST_HEAD(buflists, buf);
*/
struct vnode {
struct uvm_vnode v_uvm; /* uvm data */
u_long v_flag; /* vnode flags (see below) */
long v_usecount; /* reference count of users */
#define v_flag v_uvm.u_flags
#define v_usecount v_uvm.u_obj.uo_refs
#define v_interlock v_uvm.u_obj.vmobjlock
#define v_numoutput v_uvm.u_nio
long v_writecount; /* reference count of writers */
long v_holdcnt; /* page & buffer references */
daddr_t v_lastr; /* last read (read-ahead) */
@ -97,13 +99,11 @@ struct vnode {
LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */
struct buflists v_cleanblkhd; /* clean blocklist head */
struct buflists v_dirtyblkhd; /* dirty blocklist head */
long v_numoutput; /* num of writes in progress */
LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */
enum vtype v_type; /* vnode type */
union {
struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */
struct socket *vu_socket; /* unix ipc (VSOCK) */
caddr_t vu_vmdata; /* private data for vm (VREG) */
struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */
struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */
} v_un;
@ -114,15 +114,14 @@ struct vnode {
int v_clen; /* length of current cluster */
int v_ralen; /* Read-ahead length */
daddr_t v_maxra; /* last readahead block */
struct simplelock v_interlock; /* lock on usecount and flag */
struct lock v_lock; /* lock for this vnode */
struct lock v_glock; /* getpage lock */
struct lock *v_vnlock; /* pointer to vnode lock */
enum vtagtype v_tag; /* type of underlying data */
void *v_data; /* private data for fs */
};
#define v_mountedhere v_un.vu_mountedhere
#define v_socket v_un.vu_socket
#define v_vmdata v_un.vu_vmdata
#define v_specinfo v_un.vu_specinfo
#define v_fifoinfo v_un.vu_fifoinfo
/*
@ -143,7 +142,9 @@ struct vnode {
*/
#define VROOT 0x0001 /* root of its file system */
#define VTEXT 0x0002 /* vnode is a pure text prototype */
/* VSYSTEM only used to skip vflush()ing quota files */
#define VSYSTEM 0x0004 /* vnode being used by kernel */
/* VISTTY used when reading dead vnodes */
#define VISTTY 0x0008 /* vnode represents a tty */
#define VXLOCK 0x0100 /* vnode is locked to change underlying type */
#define VXWANT 0x0200 /* process is waiting for vnode */
@ -152,6 +153,9 @@ struct vnode {
#define VDIROP 0x1000 /* LFS: vnode is involved in a directory op */
#define VLAYER 0x2000 /* vnode is on a layer filesystem */
#define VONWORKLST 0x4000 /* On syncer work-list */
#define VDIRTY 0x8000 /* vnode possibly has dirty pages */

#define VSIZENOTSET ((voff_t)-1)

/*
* Vnode attributes. A field value of VNOVAL represents a field whose value
@ -453,6 +457,10 @@ struct vop_generic_args {
* vclean changes the ops vector and then wants to call ops with the old
* vector.
*/
/*
* actually, vclean doesn't use it anymore, but nfs does,
* for device specials and fifos.
*/
#define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP))

/*

@ -1,4 +1,4 @@
/* $NetBSD: ext2fs_balloc.c,v 1.6 2000/06/28 14:16:37 mrg Exp $ */
/* $NetBSD: ext2fs_balloc.c,v 1.7 2000/11/27 08:39:53 chs Exp $ */

/*
 * Copyright (c) 1997 Manuel Bouyer.
@ -37,12 +37,19 @@
 * Modified for ext2fs by Manuel Bouyer.
 */

#if defined(_KERNEL) && !defined(_LKM)
#include "opt_uvmhist.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/vnode.h>
#include <sys/mount.h>

#include <uvm/uvm.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
@ -75,8 +82,13 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags)
u_int deallocated;
ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
int unwindidx = -1;
UVMHIST_FUNC("ext2fs_balloc"); UVMHIST_CALLED(ubchist);

*bpp = NULL;
UVMHIST_LOG(ubchist, "bn 0x%x", bn,0,0,0);

if (bpp != NULL) {
*bpp = NULL;
}
if (bn < 0)
return (EFBIG);
fs = ip->i_e2fs;
@ -88,29 +100,43 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags)
if (bn < NDADDR) {
nb = fs2h32(ip->i_e2fs_blocks[bn]);
if (nb != 0) {
error = bread(vp, bn, fs->e2fs_bsize, NOCRED, &bp);
if (error) {
brelse(bp);
return (error);

/*
 * the block is already allocated, just read it.
 */

if (bpp != NULL) {
error = bread(vp, bn, fs->e2fs_bsize, NOCRED,
&bp);
if (error) {
brelse(bp);
return (error);
}
*bpp = bp;
}
*bpp = bp;
return (0);
} else {
error = ext2fs_alloc(ip, bn,
ext2fs_blkpref(ip, bn, (int)bn, &ip->i_e2fs_blocks[0]),
cred, &newb);
if (error)
return (error);
ip->i_e2fs_last_lblk = lbn;
ip->i_e2fs_last_blk = newb;
}

/*
 * allocate a new direct block.
 */

error = ext2fs_alloc(ip, bn,
ext2fs_blkpref(ip, bn, bn, &ip->i_e2fs_blocks[0]),
cred, &newb);
if (error)
return (error);
ip->i_e2fs_last_lblk = lbn;
ip->i_e2fs_last_blk = newb;
ip->i_e2fs_blocks[bn] = h2fs32(newb);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0);
bp->b_blkno = fsbtodb(fs, newb);
if (flags & B_CLRBUF)
clrbuf(bp);
*bpp = bp;
}
ip->i_e2fs_blocks[bn] = h2fs32(dbtofsb(fs, bp->b_blkno));
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bpp = bp;
return (0);
}
/*
@ -218,10 +244,6 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags)
*allocblk++ = nb;
ip->i_e2fs_last_lblk = lbn;
ip->i_e2fs_last_blk = newb;
nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & B_CLRBUF)
clrbuf(nbp);
bap[indirs[num].in_off] = h2fs32(nb);
/*
 * If required, write synchronously, otherwise use
@ -232,21 +254,30 @@ ext2fs_balloc(ip, bn, size, cred, bpp, flags)
} else {
bdwrite(bp);
}
*bpp = nbp;
if (bpp != NULL) {
nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & B_CLRBUF)
clrbuf(nbp);
*bpp = nbp;
}
return (0);
}
brelse(bp);
if (flags & B_CLRBUF) {
error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp);
if (error) {
brelse(nbp);
goto fail;
if (bpp != NULL) {
if (flags & B_CLRBUF) {
error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED,
&nbp);
if (error) {
brelse(nbp);
goto fail;
}
} else {
nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
}
} else {
nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
*bpp = nbp;
}
*bpp = nbp;
return (0);
fail:
/*
@ -288,5 +319,178 @@ fail:
ip->i_e2fs_nblock -= btodb(deallocated);
ip->i_e2fs_flags |= IN_CHANGE | IN_UPDATE;
}
return error;
}

int
ext2fs_ballocn(v)
void *v;
{
struct vop_ballocn_args /* {
struct vnode *a_vp;
off_t a_offset;
off_t a_length;
struct ucred *a_cred;
int a_flags;
} */ *ap = v;
off_t off, len;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
struct m_ext2fs *fs = ip->i_e2fs;
int error, delta, bshift, bsize;
UVMHIST_FUNC("ext2fs_ballocn"); UVMHIST_CALLED(ubchist);

bshift = fs->e2fs_bshift;
bsize = 1 << bshift;

off = ap->a_offset;
len = ap->a_length;

delta = off & (bsize - 1);
off -= delta;
len += delta;

while (len > 0) {
bsize = min(bsize, len);
UVMHIST_LOG(ubchist, "off 0x%x len 0x%x bsize 0x%x",
off, len, bsize, 0);

error = ext2fs_balloc(ip, lblkno(fs, off), bsize, ap->a_cred,
NULL, ap->a_flags);
if (error) {
UVMHIST_LOG(ubchist, "error %d", error, 0,0,0);
return error;
}

/*
 * increase file size now, VOP_BALLOC() requires that
 * EOF be up-to-date before each call.
 */

if (ip->i_e2fs_size < off + bsize) {
UVMHIST_LOG(ubchist, "old 0x%x new 0x%x",
ip->i_e2fs_size, off + bsize,0,0);
ip->i_e2fs_size = off + bsize;
if (vp->v_uvm.u_size < ip->i_e2fs_size) {
uvm_vnp_setsize(vp, ip->i_e2fs_size);
}
}

off += bsize;
len -= bsize;
}
return 0;
}

/*
 * allocate a range of blocks in a file.
 * after this function returns, any page entirely contained within the range
 * will map to invalid data and thus must be overwritten before it is made
 * accessible to others.
 */

int
ext2fs_balloc_range(vp, off, len, cred, flags)
struct vnode *vp;
off_t off, len;
struct ucred *cred;
int flags;
{
off_t eof, pagestart, pageend;
struct uvm_object *uobj;
struct inode *ip = VTOI(vp);
int i, delta, error, npages1, npages2;
int bshift = vp->v_mount->mnt_fs_bshift;
int bsize = 1 << bshift;
int ppb = max(bsize >> PAGE_SHIFT, 1);
struct vm_page *pgs1[ppb], *pgs2[ppb];
UVMHIST_FUNC("ext2fs_balloc_range"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x",
vp, off, len, vp->v_uvm.u_size);

error = 0;
uobj = &vp->v_uvm.u_obj;
eof = max(vp->v_uvm.u_size, off + len);
vp->v_uvm.u_size = eof;
UVMHIST_LOG(ubchist, "new eof 0x%x", eof,0,0,0);
pgs1[0] = pgs2[0] = NULL;

/*
 * if the range does not start on a page and block boundary,
 * cache the first block of the file so the page(s) will contain
 * the correct data. hold the page(s) busy while we allocate
 * the backing store for the range.
 */

pagestart = trunc_page(off) & ~(bsize - 1);
if (off != pagestart) {
npages1 = min(ppb, (round_page(eof) - pagestart) >>
PAGE_SHIFT);
memset(pgs1, 0, npages1);
simple_lock(&uobj->vmobjlock);
error = VOP_GETPAGES(vp, pagestart, pgs1, &npages1, 0,
VM_PROT_READ, 0, PGO_SYNCIO);
if (error) {
UVMHIST_LOG(ubchist, "gp1 %d", error,0,0,0);
goto errout;
}
for (i = 0; i < npages1; i++) {
UVMHIST_LOG(ubchist, "got pgs1[%d] %p", i, pgs1[i],0,0);
}
}

/*
 * similarly if the range does not end on a page and block boundary.
 */

pageend = trunc_page(off + len) & ~(bsize - 1);
if (off + len < ip->i_e2fs_size &&
off + len != pageend &&
pagestart != pageend) {
npages2 = min(ppb, (round_page(eof) - pageend) >>
PAGE_SHIFT);
memset(pgs2, 0, npages2);
simple_lock(&uobj->vmobjlock);
error = VOP_GETPAGES(vp, pageend, pgs2, &npages2, 0,
VM_PROT_READ, 0, PGO_SYNCIO);
if (error) {
UVMHIST_LOG(ubchist, "gp2 %d", error,0,0,0);
goto errout;
}
for (i = 0; i < npages2; i++) {
UVMHIST_LOG(ubchist, "got pgs2[%d] %p", i, pgs2[i],0,0);
}
}

/*
 * adjust off to be block-aligned.
 */

delta = off & (bsize - 1);
off -= delta;
len += delta;

/*
 * now allocate the range.
 */

lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL);
error = VOP_BALLOCN(vp, off, len, cred, flags);
UVMHIST_LOG(ubchist, "ballocn %d", error,0,0,0);
lockmgr(&vp->v_glock, LK_RELEASE, NULL);

/*
 * unbusy any pages we are holding.
 */

errout:
simple_lock(&uobj->vmobjlock);
if (pgs1[0] != NULL) {
uvm_page_unbusy(pgs1, npages1);
}
if (pgs2[0] != NULL) {
uvm_page_unbusy(pgs2, npages2);
}
simple_unlock(&uobj->vmobjlock);
return (error);
}

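The alignment step at the top of ext2fs_ballocn() above relies on the block size being a power of two: off & (bsize - 1) is the distance past the previous block boundary, so subtracting it from off while adding it to len rounds the range down to a block boundary without moving its end. A minimal stand-alone sketch of that arithmetic, with hypothetical values not taken from the commit:

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	long long off = 10000, len = 300;	/* hypothetical request */
	int bsize = 4096;			/* must be a power of two */
	int delta;

	assert((bsize & (bsize - 1)) == 0);
	delta = off & (bsize - 1);	/* distance past the block boundary */
	off -= delta;			/* round the start down */
	len += delta;			/* widen so the end is unchanged */
	printf("aligned: off %lld len %lld\n", off, len);
	return 0;
}
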
@ -1,4 +1,4 @@
/* $NetBSD: ext2fs_extern.h,v 1.8 2000/03/16 18:08:32 jdolecek Exp $ */
/* $NetBSD: ext2fs_extern.h,v 1.9 2000/11/27 08:39:53 chs Exp $ */

/*-
 * Copyright (c) 1997 Manuel Bouyer.
@ -71,6 +71,9 @@ int ext2fs_vfree __P((void *));
/* ext2fs_balloc.c */
int ext2fs_balloc __P((struct inode *, ufs_daddr_t, int, struct ucred *,
struct buf **, int));
int ext2fs_ballocn __P((void *));
int ext2fs_balloc_range __P((struct vnode *, off_t, off_t, struct ucred *,
int));

/* ext2fs_bmap.c */
int ext2fs_bmap __P((void *));

@ -1,4 +1,4 @@
/* $NetBSD: ext2fs_inode.c,v 1.20 2000/06/28 14:16:37 mrg Exp $ */
/* $NetBSD: ext2fs_inode.c,v 1.21 2000/11/27 08:39:53 chs Exp $ */

/*
 * Copyright (c) 1997 Manuel Bouyer.
@ -101,7 +101,7 @@ out:
 * so that it can be reused immediately.
 */
if (ip->i_e2fs_dtime != 0)
vrecycle(vp, (struct simplelock *)0, p);
vrecycle(vp, NULL, p);
return (error);
}

@ -187,15 +187,14 @@ ext2fs_truncate(v)
struct vnode *ovp = ap->a_vp;
ufs_daddr_t lastblock;
struct inode *oip;
ufs_daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs_daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
off_t length = ap->a_length;
struct m_ext2fs *fs;
struct buf *bp;
int offset, size, level;
long count, nblocks, blocksreleased = 0;
int i;
int aflags, error, allerror = 0;
int error, allerror = 0;
off_t osize;

if (length < 0)
@ -232,24 +231,10 @@ ext2fs_truncate(v)
if (length > fs->fs_maxfilesize)
return (EFBIG);
#endif
offset = blkoff(fs, length - 1);
lbn = lblkno(fs, length - 1);
aflags = B_CLRBUF;
if (ap->a_flags & IO_SYNC)
aflags |= B_SYNC;
error = ext2fs_balloc(oip, lbn, offset + 1, ap->a_cred, &bp,
aflags);
if (error)
return (error);
oip->i_e2fs_size = length;
uvm_vnp_setsize(ovp, length);
(void) uvm_vnp_uncache(ovp);
if (aflags & B_SYNC)
bwrite(bp);
else
bawrite(bp);
ext2fs_balloc_range(ovp, length - 1, 1, ap->a_cred,
ap->a_flags & IO_SYNC ? B_SYNC : 0);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (VOP_UPDATE(ovp, NULL, NULL, UPDATE_WAIT));
return (VOP_UPDATE(ovp, NULL, NULL, 1));
}
/*
 * Shorten the size of the file. If the file is not being
@ -259,26 +244,13 @@ ext2fs_truncate(v)
 * of subsequent file growth.
 */
offset = blkoff(fs, length);
if (offset == 0) {
oip->i_e2fs_size = length;
} else {
lbn = lblkno(fs, length);
aflags = B_CLRBUF;
if (ap->a_flags & IO_SYNC)
aflags |= B_SYNC;
error = ext2fs_balloc(oip, lbn, offset, ap->a_cred, &bp, aflags);
if (error)
return (error);
oip->i_e2fs_size = length;
if (offset != 0) {
size = fs->e2fs_bsize;
(void) uvm_vnp_uncache(ovp);
memset((char *)bp->b_data + offset, 0, (u_int)(size - offset));
allocbuf(bp, size);
if (aflags & B_SYNC)
bwrite(bp);
else
bawrite(bp);

/* XXXUBC we should handle more than just VREG */
uvm_vnp_zerorange(ovp, length, size - offset);
}
oip->i_e2fs_size = length;
uvm_vnp_setsize(ovp, length);

/*
@ -317,6 +289,7 @@ ext2fs_truncate(v)
 * Note that we save the new block configuration so we can check it
 * when we are done.
 */

memcpy((caddr_t)newblks, (caddr_t)&oip->i_e2fs_blocks[0], sizeof newblks);
memcpy((caddr_t)&oip->i_e2fs_blocks[0], (caddr_t)oldblks, sizeof oldblks);
oip->i_e2fs_size = osize;
@ -359,20 +332,20 @@ ext2fs_truncate(v)
ext2fs_blkfree(oip, bn);
blocksreleased += btodb(fs->e2fs_bsize);
}
if (lastblock < 0)
goto done;

done:
#ifdef DIAGNOSTIC
for (level = SINGLE; level <= TRIPLE; level++)
if (newblks[NDADDR + level] != oip->i_e2fs_blocks[NDADDR + level])
panic("itrunc1");
if (newblks[NDADDR + level] !=
oip->i_e2fs_blocks[NDADDR + level])
panic("ext2fs_truncate1");
for (i = 0; i < NDADDR; i++)
if (newblks[i] != oip->i_e2fs_blocks[i])
panic("itrunc2");
panic("ext2fs_truncate2");
if (length == 0 &&
(!LIST_EMPTY(&ovp->v_cleanblkhd) || !LIST_EMPTY(&ovp->v_dirtyblkhd)))
panic("itrunc3");
(!LIST_EMPTY(&ovp->v_cleanblkhd) ||
!LIST_EMPTY(&ovp->v_dirtyblkhd)))
panic("ext2fs_truncate3");
#endif /* DIAGNOSTIC */
/*
 * Put back the real size.

@ -1,4 +1,4 @@
/* $NetBSD: ext2fs_readwrite.c,v 1.13 2000/06/28 14:16:38 mrg Exp $ */
/* $NetBSD: ext2fs_readwrite.c,v 1.14 2000/11/27 08:39:53 chs Exp $ */

/*-
 * Copyright (c) 1997 Manuel Bouyer.
@ -79,6 +79,8 @@ ext2fs_read(v)
struct uio *uio;
struct m_ext2fs *fs;
struct buf *bp;
void *win;
vsize_t bytelen;
ufs_daddr_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
@ -107,6 +109,27 @@ ext2fs_read(v)
if (uio->uio_resid == 0)
return (0);

if (vp->v_type == VREG) {
error = 0;
while (uio->uio_resid > 0) {

bytelen = min(ip->i_e2fs_size - uio->uio_offset,
uio->uio_resid);

if (bytelen == 0) {
break;
}
win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
&bytelen, UBC_READ);
error = uiomove(win, bytelen, uio);
ubc_release(win, 0);
if (error) {
break;
}
}
goto out;
}

for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_e2fs_size - uio->uio_offset) <= 0)
break;
@ -148,14 +171,15 @@ ext2fs_read(v)
break;
xfersize = size;
}
error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize,
uio);
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp);
}
if (bp != NULL)
brelse(bp);

out:
if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
ip->i_flag |= IN_ACCESS;
if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
@ -185,12 +209,17 @@ ext2fs_write(v)
struct proc *p;
ufs_daddr_t lbn;
off_t osize;
int blkoffset, error, flags, ioflag, resid, size, xfersize;
int blkoffset, error, flags, ioflag, resid, xfersize;
vsize_t bytelen;
void *win;
off_t oldoff;
boolean_t rv;

ioflag = ap->a_ioflag;
uio = ap->a_uio;
vp = ap->a_vp;
ip = VTOI(vp);
error = 0;

#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE)
@ -234,35 +263,66 @@ ext2fs_write(v)

resid = uio->uio_resid;
osize = ip->i_e2fs_size;
flags = ioflag & IO_SYNC ? B_SYNC : 0;

if (vp->v_type == VREG) {
while (uio->uio_resid > 0) {
oldoff = uio->uio_offset;
blkoffset = blkoff(fs, uio->uio_offset);
bytelen = min(fs->e2fs_bsize - blkoffset,
uio->uio_resid);

/*
 * XXXUBC if file is mapped and this is the last block,
 * process one page at a time.
 */

error = ext2fs_balloc_range(vp, uio->uio_offset,
bytelen, ap->a_cred, 0);
if (error) {
break;
}
win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
&bytelen, UBC_WRITE);
error = uiomove(win, bytelen, uio);
ubc_release(win, 0);
if (error) {
break;
}

/*
 * flush what we just wrote if necessary.
 * XXXUBC simplistic async flushing.
 */

if (oldoff >> 16 != uio->uio_offset >> 16) {
simple_lock(&vp->v_uvm.u_obj.vmobjlock);
rv = vp->v_uvm.u_obj.pgops->pgo_flush(
&vp->v_uvm.u_obj, (oldoff >> 16) << 16,
(uio->uio_offset >> 16) << 16, PGO_CLEANIT);
simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
}
}
goto out;
}

flags = ioflag & IO_SYNC ? B_SYNC : 0;
for (error = 0; uio->uio_resid > 0;) {
lbn = lblkno(fs, uio->uio_offset);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->e2fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (fs->e2fs_bsize > xfersize)
xfersize = min(fs->e2fs_bsize - blkoffset, uio->uio_resid);
if (xfersize < fs->e2fs_bsize)
flags |= B_CLRBUF;
else
flags &= ~B_CLRBUF;

error = ext2fs_balloc(ip,
lbn, blkoffset + xfersize, ap->a_cred, &bp, flags);
error = VOP_BALLOC(vp, lblktosize(fs, lbn),
blkoffset + xfersize, ap->a_cred, flags,
&bp);
if (error)
break;
if (uio->uio_offset + xfersize > ip->i_e2fs_size) {
if (ip->i_e2fs_size < uio->uio_offset + xfersize) {
ip->i_e2fs_size = uio->uio_offset + xfersize;
uvm_vnp_setsize(vp, ip->i_e2fs_size);
}
(void)uvm_vnp_uncache(vp);

size = fs->e2fs_bsize - bp->b_resid;
if (size < xfersize)
xfersize = size;

error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (ioflag & IO_SYNC)
(void)bwrite(bp);
else if (xfersize + blkoffset == fs->e2fs_bsize)
@ -274,13 +334,14 @@ ext2fs_write(v)
bdwrite(bp);
if (error || xfersize == 0)
break;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
 * If we successfully wrote any data, and we are not the superuser
 * we clear the setuid and setgid bits as a precaution against
 * tampering.
 */
out:
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
ip->i_e2fs_mode &= ~(ISUID | ISGID);
if (error) {

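The new VREG paths in ext2fs_read() and ext2fs_write() loop over the request, mapping a window of the file with ubc_alloc(), copying through it with uiomove(), and dropping it with ubc_release(). The user-space sketch below illustrates the same windowed-copy idea with mmap(); it is an analogy for the data flow, not the kernel API, and the window size is an arbitrary assumption:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define WINSIZE (64 * 1024)	/* assumed window size, a page multiple */

int
main(int argc, char **argv)
{
	struct stat st;
	off_t off = 0;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0 ||
	    fstat(fd, &st) != 0)
		return EXIT_FAILURE;
	while (off < st.st_size) {
		size_t len = st.st_size - off;

		if (len > WINSIZE)
			len = WINSIZE;
		/* "ubc_alloc": map a window of the file */
		void *win = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
		if (win == MAP_FAILED)
			return EXIT_FAILURE;
		/* "uiomove": copy through the window */
		fwrite(win, 1, len, stdout);
		/* "ubc_release": drop the window */
		munmap(win, len);
		off += len;	/* stays page-aligned: WINSIZE steps */
	}
	close(fd);
	return EXIT_SUCCESS;
}
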
@ -1,4 +1,4 @@
/* $NetBSD: ext2fs_vfsops.c,v 1.39 2000/09/19 22:03:05 fvdl Exp $ */
/* $NetBSD: ext2fs_vfsops.c,v 1.40 2000/11/27 08:39:53 chs Exp $ */

/*
 * Copyright (c) 1997 Manuel Bouyer.
@ -592,15 +592,19 @@ ext2fs_mountfs(devvp, mp, p)
mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_EXT2FS);
mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */
mp->mnt_fs_bshift = m_fs->e2fs_bshift;
ump->um_flags = 0;
ump->um_mountp = mp;
ump->um_dev = dev;
ump->um_devvp = devvp;
ump->um_nindir = NINDIR(m_fs);
ump->um_lognindir = ffs(NINDIR(m_fs)) - 1;
ump->um_bptrtodb = m_fs->e2fs_fsbtodb;
ump->um_seqinc = 1; /* no frags */
devvp->v_specmountpoint = mp;
return (0);

out:
if (bp)
brelse(bp);
@ -931,6 +935,7 @@ ext2fs_vget(mp, ino, vpp)
ip->i_flag |= IN_MODIFIED;
}

vp->v_uvm.u_size = ip->i_e2fs_size;
*vpp = vp;
return (0);
}

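ext2fs_mountfs() now records mnt_dev_bshift and mnt_fs_bshift so generic code can convert between byte offsets, device blocks, and filesystem blocks with shifts rather than divisions. A small sketch of those conversions; DEV_BSHIFT is traditionally 9 (512-byte device blocks) and the filesystem shift here is an assumed value:

#include <stdio.h>

int
main(void)
{
	int dev_bshift = 9;	/* 512-byte device blocks (DEV_BSHIFT) */
	int fs_bshift = 12;	/* assumed 4096-byte filesystem blocks */
	long long off = 123456;	/* a byte offset */

	/* shift right to get a block number, left to get back to bytes */
	printf("off %lld -> dev block %lld, fs block %lld, "
	    "fs block start %lld\n",
	    off, off >> dev_bshift, off >> fs_bshift,
	    (off >> fs_bshift) << fs_bshift);
	return 0;
}
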
@ -1,4 +1,4 @@
/* $NetBSD: ext2fs_vnops.c,v 1.29 2000/08/03 20:41:36 thorpej Exp $ */
/* $NetBSD: ext2fs_vnops.c,v 1.30 2000/11/27 08:39:53 chs Exp $ */

/*
 * Copyright (c) 1997 Manuel Bouyer.
@ -196,7 +196,6 @@ ext2fs_access(v)
struct inode *ip = VTOI(vp);
mode_t mode = ap->a_mode;


/*
 * Disallow write attempts on read-only file systems;
 * unless the file is a socket, fifo, or a block or
@ -421,8 +420,6 @@ ext2fs_chmod(vp, mode, cred, p)
ip->i_e2fs_mode &= ~ALLPERMS;
ip->i_e2fs_mode |= (mode & ALLPERMS);
ip->i_flag |= IN_CHANGE;
if ((vp->v_flag & VTEXT) && (ip->i_e2fs_mode & S_ISTXT) == 0)
(void) uvm_vnp_uncache(vp);
return (0);
}

@ -1465,7 +1462,11 @@ struct vnodeopv_entry_desc ext2fs_vnodeop_entries[] = {
{ &vop_truncate_desc, ext2fs_truncate }, /* truncate */
{ &vop_update_desc, ext2fs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void*)))NULL }
{ &vop_ballocn_desc, ext2fs_ballocn }, /* ballocn */
{ &vop_getpages_desc, genfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_putpages }, /* putpages */
{ &vop_size_desc, genfs_size }, /* size */
{ NULL, NULL }
};
struct vnodeopv_desc ext2fs_vnodeop_opv_desc =
{ &ext2fs_vnodeop_p, ext2fs_vnodeop_entries };
@ -1516,7 +1517,7 @@ struct vnodeopv_entry_desc ext2fs_specop_entries[] = {
{ &vop_truncate_desc, spec_truncate }, /* truncate */
{ &vop_update_desc, ext2fs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
{ NULL, NULL }
};
struct vnodeopv_desc ext2fs_specop_opv_desc =
{ &ext2fs_specop_p, ext2fs_specop_entries };
@ -1567,7 +1568,7 @@ struct vnodeopv_entry_desc ext2fs_fifoop_entries[] = {
{ &vop_truncate_desc, fifo_truncate }, /* truncate */
{ &vop_update_desc, ext2fs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
{ NULL, NULL }
};
struct vnodeopv_desc ext2fs_fifoop_opv_desc =
{ &ext2fs_fifoop_p, ext2fs_fifoop_entries };

@ -1,4 +1,4 @@
/* $NetBSD: ffs_alloc.c,v 1.36 2000/06/28 14:16:39 mrg Exp $ */
/* $NetBSD: ffs_alloc.c,v 1.37 2000/11/27 08:39:54 chs Exp $ */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
@ -109,15 +109,33 @@ ffs_alloc(ip, lbn, bpref, size, cred, bnp)
struct ucred *cred;
ufs_daddr_t *bnp;
{
struct fs *fs;
struct fs *fs = ip->i_fs;
ufs_daddr_t bno;
int cg;
#ifdef QUOTA
int error;
#endif

#ifdef UVM_PAGE_TRKOWN
if (ITOV(ip)->v_type == VREG && lbn > 0) {
struct vm_page *pg;
struct uvm_object *uobj = &ITOV(ip)->v_uvm.u_obj;
voff_t off = trunc_page(lblktosize(fs, lbn));
voff_t endoff = round_page(lblktosize(fs, lbn) + size);

simple_lock(&uobj->vmobjlock);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT(pg != NULL);
KASSERT(pg->owner == curproc->p_pid);
KASSERT((pg->flags & PG_CLEAN) == 0);
off += PAGE_SIZE;
}
simple_unlock(&uobj->vmobjlock);
}
#endif

*bnp = 0;
fs = ip->i_fs;
#ifdef DIAGNOSTIC
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
printf("dev = 0x%x, bsize = %d, size = %d, fs = %s\n",
@ -170,21 +188,39 @@ nospace:
 * invoked to get an appropriate block.
 */
int
ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp, blknop)
struct inode *ip;
ufs_daddr_t lbprev;
ufs_daddr_t bpref;
int osize, nsize;
struct ucred *cred;
struct buf **bpp;
ufs_daddr_t *blknop;
{
struct fs *fs;
struct fs *fs = ip->i_fs;
struct buf *bp;
int cg, request, error;
ufs_daddr_t bprev, bno;

*bpp = 0;
fs = ip->i_fs;
#ifdef UVM_PAGE_TRKOWN
if (ITOV(ip)->v_type == VREG) {
struct vm_page *pg;
struct uvm_object *uobj = &ITOV(ip)->v_uvm.u_obj;
voff_t off = trunc_page(lblktosize(fs, lbprev));
voff_t endoff = round_page(lblktosize(fs, lbprev) + osize);

simple_lock(&uobj->vmobjlock);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT(pg != NULL);
KASSERT(pg->owner == curproc->p_pid);
KASSERT((pg->flags & PG_CLEAN) == 0);
off += PAGE_SIZE;
}
simple_unlock(&uobj->vmobjlock);
}
#endif

#ifdef DIAGNOSTIC
if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
(u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
@ -206,7 +242,8 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
/*
 * Allocate the extra space in the buffer.
 */
if ((error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) != 0) {
if (bpp != NULL &&
(error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp)) != 0) {
brelse(bp);
return (error);
}
@ -221,14 +258,20 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
 */
cg = dtog(fs, bprev);
if ((bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize)) != 0) {
if (bp->b_blkno != fsbtodb(fs, bno))
panic("bad blockno");
ip->i_ffs_blocks += btodb(nsize - osize);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
*bpp = bp;

if (bpp != NULL) {
if (bp->b_blkno != fsbtodb(fs, bno))
panic("bad blockno");
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
memset(bp->b_data + osize, 0, nsize - osize);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
}
/*
@ -292,8 +335,6 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request,
ffs_alloccg);
if (bno > 0) {
bp->b_blkno = fsbtodb(fs, bno);
(void) uvm_vnp_uncache(ITOV(ip));
if (!DOINGSOFTDEP(ITOV(ip)))
ffs_blkfree(ip, bprev, (long)osize);
if (nsize < request)
@ -301,10 +342,16 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
(long)(request - nsize));
ip->i_ffs_blocks += btodb(nsize - osize);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
*bpp = bp;
if (bpp != NULL) {
bp->b_blkno = fsbtodb(fs, bno);
allocbuf(bp, nsize);
bp->b_flags |= B_DONE;
memset(bp->b_data + osize, 0, (u_int)nsize - osize);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
}
#ifdef QUOTA
@ -313,7 +360,10 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp)
 */
(void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE);
#endif
brelse(bp);
if (bpp != NULL) {
brelse(bp);
}

nospace:
/*
 * no space available
@ -344,7 +394,7 @@ struct ctldebug debug15 = { "prtrealloc", &prtrealloc };
#endif

int doasyncfree = 1;
extern int doreallocblks;
int doreallocblks;

int
ffs_reallocblks(v)
@ -364,6 +414,9 @@ ffs_reallocblks(v)
struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
int i, len, start_lvl, end_lvl, pref, ssize;

/* XXXUBC don't reallocblks for now */
return ENOSPC;

vp = ap->a_vp;
ip = VTOI(vp);
fs = ip->i_fs;
@ -1725,5 +1778,6 @@ ffs_fserr(fs, uid, cp)
char *cp;
{

log(LOG_ERR, "uid %d on %s: %s\n", uid, fs->fs_fsmnt, cp);
log(LOG_ERR, "uid %d comm %s on %s: %s\n",
uid, curproc->p_comm, fs->fs_fsmnt, cp);
}

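The UVM_PAGE_TRKOWN blocks added to ffs_alloc() and ffs_realloccg() walk every page overlapping the block being allocated, from trunc_page() of the start offset up to round_page() of the end, stepping by the page size. The same walk in isolation, with an assumed page size and hypothetical offsets:

#include <stdio.h>

#define PAGE_SIZE 4096LL	/* assumed page size */
#define trunc_page(x) ((x) & ~(PAGE_SIZE - 1))
#define round_page(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int
main(void)
{
	long long start = 5000, end = 13000;	/* hypothetical extent */
	long long off;

	/* visits every page overlapping [start, end); a real caller
	   would look each one up in the vnode's uvm_object here */
	for (off = trunc_page(start); off < round_page(end); off += PAGE_SIZE)
		printf("page at offset %lld\n", off);
	return 0;
}
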
@ -1,4 +1,4 @@
/* $NetBSD: ffs_balloc.c,v 1.22 2000/09/19 22:04:08 fvdl Exp $ */
/* $NetBSD: ffs_balloc.c,v 1.23 2000/11/27 08:39:54 chs Exp $ */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
@ -57,6 +57,8 @@
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#include <uvm/uvm.h>

/*
 * Balloc defines the structure of file system storage
 * by allocating the physical blocks on a device given
@ -72,7 +74,7 @@ ffs_balloc(v)
int a_size;
struct ucred *a_cred;
int a_flags;
struct buf *a_bpp;
struct buf **a_bpp;
} */ *ap = v;
ufs_daddr_t lbn;
int size;
@ -88,15 +90,22 @@ ffs_balloc(v)
int deallocated, osize, nsize, num, i, error;
ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
int unwindidx = -1;
struct buf **bpp = ap->a_bpp;
#ifdef FFS_EI
const int needswap = UFS_FSNEEDSWAP(fs);
#endif
UVMHIST_FUNC("ffs_balloc"); UVMHIST_CALLED(ubchist);

lbn = lblkno(fs, ap->a_startoffset);
size = blkoff(fs, ap->a_startoffset) + ap->a_size;
if (size > fs->fs_bsize)
panic("ffs_balloc: blk too big");
*ap->a_bpp = NULL;
if (bpp != NULL) {
*bpp = NULL;
}
UVMHIST_LOG(ubchist, "vp %p lbn 0x%x size 0x%x", vp, lbn, size,0);

KASSERT(size <= fs->fs_bsize);
if (lbn < 0)
return (EFBIG);
cred = ap->a_cred;
@ -107,71 +116,109 @@ ffs_balloc(v)
 * and the file is currently composed of a fragment
 * this fragment has to be extended to be a full block.
 */

nb = lblkno(fs, ip->i_ffs_size);
if (nb < NDADDR && nb < lbn) {
osize = blksize(fs, ip, nb);
if (osize < fs->fs_bsize && osize > 0) {
error = ffs_realloccg(ip, nb,
ffs_blkpref(ip, nb, (int)nb, &ip->i_ffs_db[0]),
osize, (int)fs->fs_bsize, cred, &bp);
osize, (int)fs->fs_bsize, cred, bpp, &newb);
if (error)
return (error);
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, nb,
dbtofsb(fs, bp->b_blkno),
softdep_setup_allocdirect(ip, nb, newb,
ufs_rw32(ip->i_ffs_db[nb], needswap),
fs->fs_bsize, osize, bp);
ip->i_ffs_size = (nb + 1) * fs->fs_bsize;
fs->fs_bsize, osize, bpp ? *bpp : NULL);
ip->i_ffs_size = lblktosize(fs, nb + 1);
uvm_vnp_setsize(vp, ip->i_ffs_size);
ip->i_ffs_db[nb] = ufs_rw32(dbtofsb(fs, bp->b_blkno),
needswap);
ip->i_ffs_db[nb] = ufs_rw32(newb, needswap);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (flags & B_SYNC)
bwrite(bp);
else
bawrite(bp);
if (bpp) {
if (flags & B_SYNC)
bwrite(*bpp);
else
bawrite(*bpp);
}
}
}

/*
 * The first NDADDR blocks are direct blocks
 */

if (lbn < NDADDR) {
nb = ufs_rw32(ip->i_ffs_db[lbn], needswap);
if (nb != 0 && ip->i_ffs_size >= (lbn + 1) * fs->fs_bsize) {
error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
if (error) {
brelse(bp);
return (error);
if (nb != 0 && ip->i_ffs_size >= lblktosize(fs, lbn + 1)) {

/*
 * The block is an already-allocated direct block
 * and the file already extends past this block,
 * thus this must be a whole block.
 * Just read the block (if requested).
 */

if (bpp != NULL) {
error = bread(vp, lbn, fs->fs_bsize, NOCRED,
bpp);
if (error) {
brelse(*bpp);
return (error);
}
}
*ap->a_bpp = bp;
return (0);
}
if (nb != 0) {

/*
 * Consider need to reallocate a fragment.
 */

osize = fragroundup(fs, blkoff(fs, ip->i_ffs_size));
nsize = fragroundup(fs, size);
if (nsize <= osize) {
error = bread(vp, lbn, osize, NOCRED, &bp);
if (error) {
brelse(bp);
return (error);

/*
 * The existing block is already
 * at least as big as we want.
 * Just read the block (if requested).
 */

if (bpp != NULL) {
error = bread(vp, lbn, osize, NOCRED,
bpp);
if (error) {
brelse(*bpp);
return (error);
}
}
return 0;
} else {

/*
 * The existing block is smaller than we want,
 * grow it.
 */

error = ffs_realloccg(ip, lbn,
ffs_blkpref(ip, lbn, (int)lbn,
&ip->i_ffs_db[0]), osize, nsize, cred,
&bp);
bpp, &newb);
if (error)
return (error);
if (DOINGSOFTDEP(vp))
softdep_setup_allocdirect(ip, lbn,
dbtofsb(fs, bp->b_blkno), nb,
nsize, osize, bp);
newb, nb, nsize, osize,
bpp ? *bpp : NULL);
}
} else {
if (ip->i_ffs_size < (lbn + 1) * fs->fs_bsize)

/*
 * the block was not previously allocated,
 * allocate a new block or fragment.
 */

if (ip->i_ffs_size < lblktosize(fs, lbn + 1))
nsize = fragroundup(fs, size);
else
nsize = fs->fs_bsize;
@ -180,18 +227,20 @@ ffs_balloc(v)
nsize, cred, &newb);
if (error)
return (error);
bp = getblk(vp, lbn, nsize, 0, 0);
bp->b_blkno = fsbtodb(fs, newb);
if (flags & B_CLRBUF)
clrbuf(bp);
if (DOINGSOFTDEP(vp))
if (bpp != NULL) {
bp = getblk(vp, lbn, nsize, 0, 0);
bp->b_blkno = fsbtodb(fs, newb);
if (flags & B_CLRBUF)
clrbuf(bp);
*bpp = bp;
}
if (DOINGSOFTDEP(vp)) {
softdep_setup_allocdirect(ip, lbn, newb, 0,
nsize, 0, bp);
nsize, 0, bpp ? *bpp : NULL);
}
}
ip->i_ffs_db[lbn] = ufs_rw32(dbtofsb(fs, bp->b_blkno),
needswap);
ip->i_ffs_db[lbn] = ufs_rw32(newb, needswap);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*ap->a_bpp = bp;
return (0);
}
/*
@ -200,6 +249,7 @@ ffs_balloc(v)
pref = 0;
if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
return(error);

#ifdef DIAGNOSTIC
if (num < 1)
panic ("ffs_balloc: ufs_bmaparray returned indirect block\n");
@ -311,14 +361,20 @@ ffs_balloc(v)
}
nb = newb;
*allocblk++ = nb;
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & B_CLRBUF)
clrbuf(nbp);
if (bpp != NULL) {
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
if (flags & B_CLRBUF)
clrbuf(nbp);
*bpp = nbp;
}
if (DOINGSOFTDEP(vp))
softdep_setup_allocindir_page(ip, lbn, bp,
indirs[num].in_off, nb, 0, nbp);
indirs[num].in_off, nb, 0, bpp ? *bpp : NULL);
bap[indirs[num].in_off] = ufs_rw32(nb, needswap);
if (allocib == NULL && unwindidx < 0) {
unwindidx = i - 1;
}
/*
 * If required, write synchronously, otherwise use
 * delayed write.
@ -328,21 +384,23 @@ ffs_balloc(v)
} else {
bdwrite(bp);
}
*ap->a_bpp = nbp;
return (0);
}
brelse(bp);
if (flags & B_CLRBUF) {
error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
if (error) {
brelse(nbp);
goto fail;
if (bpp != NULL) {
if (flags & B_CLRBUF) {
error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp);
if (error) {
brelse(nbp);
goto fail;
}
} else {
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
clrbuf(nbp);
}
} else {
nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0);
nbp->b_blkno = fsbtodb(fs, nb);
*bpp = nbp;
}
*ap->a_bpp = nbp;
return (0);
fail:
/*
@ -401,3 +459,62 @@ fail:
(void) VOP_FSYNC(vp, cred, FSYNC_WAIT, 0, 0, curproc);
return (error);
}


int
ffs_ballocn(v)
void *v;
{
struct vop_ballocn_args /* {
struct vnode *a_vp;
off_t a_offset;
off_t a_length;
struct ucred *a_cred;
int a_flags;
} */ *ap = v;

off_t off, len;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
int error, delta, bshift, bsize;

error = 0;
bshift = fs->fs_bshift;
bsize = 1 << bshift;

off = ap->a_offset;
len = ap->a_length;

delta = off & (bsize - 1);
off -= delta;
len += delta;

while (len > 0) {
bsize = min(bsize, len);

error = VOP_BALLOC(vp, off, bsize, ap->a_cred, ap->a_flags,
NULL);
if (error) {
goto out;
}

/*
 * increase file size now, VOP_BALLOC() requires that
 * EOF be up-to-date before each call.
 */

if (ip->i_ffs_size < off + bsize) {
ip->i_ffs_size = off + bsize;
if (vp->v_uvm.u_size < ip->i_ffs_size) {
uvm_vnp_setsize(vp, ip->i_ffs_size);
}
}

off += bsize;
len -= bsize;
}

out:
return error;
}

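A pattern running through the ffs_balloc() and ffs_realloccg() changes above is the now-optional buffer out-parameter: page-cache callers pass bpp == NULL and take only the new block number through blknop, while buffer-cache callers still receive a struct buf. A toy version of that calling convention, with illustrative names only:

#include <stdio.h>

/*
 * Fills *bufp only if the caller asked for a buffer and *blknop
 * only if the caller asked for the block number, mirroring the
 * bpp/blknop convention in the diff.
 */
static int
toy_alloc(char **bufp, long *blknop)
{
	static char buf[16];

	if (bufp != NULL)
		*bufp = buf;
	if (blknop != NULL)
		*blknop = 42;	/* hypothetical new block number */
	return 0;
}

int
main(void)
{
	long blkno;

	if (toy_alloc(NULL, &blkno) == 0)	/* page-cache style caller */
		printf("allocated block %ld, no buffer\n", blkno);
	return 0;
}
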
@ -1,4 +1,4 @@
/* $NetBSD: ffs_extern.h,v 1.16 2000/04/04 09:23:20 jdolecek Exp $ */
/* $NetBSD: ffs_extern.h,v 1.17 2000/11/27 08:39:54 chs Exp $ */

/*-
 * Copyright (c) 1991, 1993, 1994
@ -79,7 +79,7 @@ __BEGIN_DECLS
int ffs_alloc __P((struct inode *, ufs_daddr_t, ufs_daddr_t , int, struct ucred *,
ufs_daddr_t *));
int ffs_realloccg __P((struct inode *, ufs_daddr_t, ufs_daddr_t, int, int ,
struct ucred *, struct buf **));
struct ucred *, struct buf **, ufs_daddr_t *));
int ffs_reallocblks __P((void *));
int ffs_valloc __P((void *));
ufs_daddr_t ffs_blkpref __P((struct inode *, ufs_daddr_t, int, ufs_daddr_t *));
@ -89,6 +89,7 @@ void ffs_clusteracct __P((struct fs *, struct cg *, ufs_daddr_t, int));

/* ffs_balloc.c */
int ffs_balloc __P((void *));
int ffs_ballocn __P((void *));

/* ffs_bswap.c */
void ffs_sb_swap __P((struct fs*, struct fs *, int));
@ -137,6 +138,7 @@ int ffs_read __P((void *));
int ffs_write __P((void *));
int ffs_fsync __P((void *));
int ffs_reclaim __P((void *));
int ffs_size __P((void *));
__END_DECLS


@ -1,4 +1,4 @@
/* $NetBSD: ffs_inode.c,v 1.37 2000/09/19 22:04:09 fvdl Exp $ */
/* $NetBSD: ffs_inode.c,v 1.38 2000/11/27 08:39:54 chs Exp $ */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
@ -170,37 +170,25 @@ ffs_truncate(v)
struct vnode *ovp = ap->a_vp;
ufs_daddr_t lastblock;
struct inode *oip;
ufs_daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs_daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR];
ufs_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR];
off_t length = ap->a_length;
struct fs *fs;
struct buf *bp;
int offset, size, level;
long count, nblocks, blocksreleased = 0;
int i;
int aflags, error, allerror = 0;
int error, allerror = 0;
off_t osize;

if (length < 0)
return (EINVAL);
oip = VTOI(ovp);
#if 1
/*
 * XXX. Was in Kirk's patches. Is it good behavior to just
 * return and not update modification times?
 */
if (oip->i_ffs_size == length)
return (0);
#endif
if (ovp->v_type == VLNK &&
(oip->i_ffs_size < ovp->v_mount->mnt_maxsymlinklen ||
(ovp->v_mount->mnt_maxsymlinklen == 0 &&
oip->i_din.ffs_din.di_blocks == 0))) {
#ifdef DIAGNOSTIC
if (length != 0)
panic("ffs_truncate: partial truncate of symlink");
#endif
memset((char *)&oip->i_ffs_shortlink, 0, (u_int)oip->i_ffs_size);
KDASSERT(length == 0);
memset(&oip->i_ffs_shortlink, 0, (size_t)oip->i_ffs_size);
oip->i_ffs_size = 0;
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (VOP_UPDATE(ovp, NULL, NULL, UPDATE_WAIT));
@ -214,12 +202,56 @@ ffs_truncate(v)
return (error);
#endif
fs = oip->i_fs;
if (length > fs->fs_maxfilesize)
return (EFBIG);

osize = oip->i_ffs_size;
ovp->v_lasta = ovp->v_clen = ovp->v_cstart = ovp->v_lastw = 0;

/*
 * Lengthen the size of the file. We must ensure that the
 * last byte of the file is allocated. Since the smallest
 * value of osize is 0, length will be at least 1.
 */

if (osize < length) {
ufs_balloc_range(ovp, length - 1, 1, ap->a_cred,
ap->a_flags & IO_SYNC ? B_SYNC : 0);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (VOP_UPDATE(ovp, NULL, NULL, 1));
}

/*
 * When truncating a regular file down to a non-block-aligned size,
 * we must zero the part of last block which is past the new EOF.
 * We must synchronously flush the zeroed pages to disk
 * since the new pages will be invalidated as soon as we
 * inform the VM system of the new, smaller size.
 * We must do this before acquiring the GLOCK, since fetching
 * the pages will acquire the GLOCK internally.
 * So there is a window where another thread could see a whole
 * zeroed page past EOF, but that's life.
 */

offset = blkoff(fs, length);
if (ovp->v_type == VREG && length < osize && offset != 0) {
struct uvm_object *uobj;
voff_t eoz;

size = blksize(fs, oip, lblkno(fs, length));
eoz = min(lblktosize(fs, lblkno(fs, length)) + size, osize);
uvm_vnp_zerorange(ovp, length, eoz - length);
uobj = &ovp->v_uvm.u_obj;
simple_lock(&uobj->vmobjlock);
uobj->pgops->pgo_flush(uobj, length, eoz,
PGO_CLEANIT|PGO_DEACTIVATE|PGO_SYNCIO);
simple_unlock(&ovp->v_uvm.u_obj.vmobjlock);
}

lockmgr(&ovp->v_glock, LK_EXCLUSIVE, NULL);

if (DOINGSOFTDEP(ovp)) {
uvm_vnp_setsize(ovp, length);
(void) uvm_vnp_uncache(ovp);
if (length > 0) {
/*
 * If a file is only partially truncated, then
@ -231,73 +263,26 @@ ffs_truncate(v)
 * so that it will have no data structures left.
 */
if ((error = VOP_FSYNC(ovp, ap->a_cred, FSYNC_WAIT,
0, 0, ap->a_p)) != 0)
0, 0, ap->a_p)) != 0) {
lockmgr(&ovp->v_glock, LK_RELEASE, NULL);
return (error);
}
} else {
#ifdef QUOTA
(void) chkdq(oip, -oip->i_ffs_blocks, NOCRED, 0);
#endif
softdep_setup_freeblocks(oip, length);
(void) vinvalbuf(ovp, 0, ap->a_cred, ap->a_p, 0, 0);
lockmgr(&ovp->v_glock, LK_RELEASE, NULL);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (VOP_UPDATE(ovp, NULL, NULL, 0));
}
}

/*
 * Lengthen the size of the file. We must ensure that the
 * last byte of the file is allocated. Since the smallest
 * value of osize is 0, length will be at least 1.
 * Reduce the size of the file.
 */
if (osize < length) {
if (length > fs->fs_maxfilesize)
return (EFBIG);
aflags = B_CLRBUF;
if (ap->a_flags & IO_SYNC)
aflags |= B_SYNC;
error = VOP_BALLOC(ovp, length - 1, 1, ap->a_cred, aflags, &bp);
if (error)
return (error);
oip->i_ffs_size = length;
uvm_vnp_setsize(ovp, length);
(void) uvm_vnp_uncache(ovp);
if (aflags & B_SYNC)
bwrite(bp);
else
bawrite(bp);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (VOP_UPDATE(ovp, NULL, NULL, UPDATE_WAIT));
}
/*
 * Shorten the size of the file. If the file is not being
 * truncated to a block boundary, the contents of the
 * partial block following the end of the file must be
 * zero'ed in case it ever becomes accessible again because
 * of subsequent file growth. Directories however are not
 * zero'ed as they should grow back initialized to empty.
 */
offset = blkoff(fs, length);
if (offset == 0) {
oip->i_ffs_size = length;
} else {
lbn = lblkno(fs, length);
aflags = B_CLRBUF;
if (ap->a_flags & IO_SYNC)
aflags |= B_SYNC;
error = VOP_BALLOC(ovp, length - 1, 1, ap->a_cred, aflags, &bp);
if (error)
return (error);
oip->i_ffs_size = length;
size = blksize(fs, oip, lbn);
(void) uvm_vnp_uncache(ovp);
if (ovp->v_type != VDIR)
memset((char *)bp->b_data + offset, 0,
(u_int)(size - offset));
allocbuf(bp, size);
if (aflags & B_SYNC)
bwrite(bp);
else
bawrite(bp);
}
oip->i_ffs_size = length;
uvm_vnp_setsize(ovp, length);
/*
 * Calculate index into inode's block list of
@ -431,6 +416,7 @@ done:
oip->i_ffs_blocks -= blocksreleased;
if (oip->i_ffs_blocks < 0) /* sanity */
oip->i_ffs_blocks = 0;
lockmgr(&ovp->v_glock, LK_RELEASE, NULL);
oip->i_flag |= IN_CHANGE;
#ifdef QUOTA
(void) chkdq(oip, -blocksreleased, NOCRED, 0);

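When shortening a file, the new ffs_truncate() zeroes the span [length, eoz) before telling UVM about the smaller size, where eoz is the end of the last partial block capped at the old size. The computation in isolation, assuming whole blocks of a fixed size (the real code uses blksize() so fragments are handled too):

#include <stdio.h>

#define BSIZE 8192LL	/* assumed filesystem block size */

int
main(void)
{
	long long osize = 20000, length = 10000;	/* hypothetical sizes */
	long long blkend = (length / BSIZE + 1) * BSIZE; /* end of last block */
	long long eoz = blkend < osize ? blkend : osize;

	/* only a non-block-aligned shrink of a shorter file needs zeroing */
	if (length < osize && length % BSIZE != 0)
		printf("zero [%lld, %lld), %lld bytes\n",
		    length, eoz, eoz - length);
	return 0;
}
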
@ -1,4 +1,4 @@
/* $NetBSD: ffs_softdep.c,v 1.7 2000/11/08 14:28:16 ad Exp $ */
/* $NetBSD: ffs_softdep.c,v 1.8 2000/11/27 08:39:54 chs Exp $ */

/*
 * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved.
@ -53,6 +53,10 @@
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>

#include <uvm/uvm.h>
struct pool sdpcpool;
int softdep_lockedbufs;

/*
 * For now we want the safety net that the DIAGNOSTIC and DEBUG flags provide.
 */
@ -98,6 +102,13 @@ extern char *memname[];
 * End system adaptation definitions.
 */

/*
 * Definitions for page cache info hashtable.
 */
#define PCBPHASHSIZE 1024
LIST_HEAD(, buf) pcbphashhead[PCBPHASHSIZE];
#define PCBPHASH(vp, lbn) ((((vaddr_t)(vp) >> 8) ^ (lbn)) & (PCBPHASHSIZE - 1))

/*
 * Internal function prototypes.
 */
@ -149,6 +160,16 @@ static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
static void pause_timer __P((void *));
static int request_cleanup __P((int, int));
static void add_to_worklist __P((struct worklist *));
static struct buf *softdep_setup_pagecache __P((struct inode *, ufs_lbn_t,
long));
static void softdep_collect_pagecache __P((struct vnode *,
struct bufq_head *));
static void softdep_free_pagecache __P((struct bufq_head *));
static struct vnode *softdep_lookupvp(struct fs *, ino_t);
static struct buf *softdep_lookup_pcbp __P((struct vnode *, ufs_lbn_t));
void softdep_pageiodone __P((struct buf *));
void softdep_flush_vnode __P((struct vnode *, ufs_lbn_t));
static void softdep_flush_indir __P((struct vnode *));

/*
 * Exported softdep operations.
@ -889,6 +910,7 @@ top:
void
softdep_initialize()
{
int i;

LIST_INIT(&mkdirlisthd);
LIST_INIT(&softdep_workitem_pending);
@ -902,6 +924,11 @@ softdep_initialize()
newblk_hashtbl = hashinit(64, HASH_LIST, M_NEWBLK, M_WAITOK,
&newblk_hash);
sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
pool_init(&sdpcpool, sizeof(struct buf), 0, 0, 0, "sdpcpool",
0, pool_page_alloc_nointr, pool_page_free_nointr, M_TEMP);
for (i = 0; i < PCBPHASHSIZE; i++) {
LIST_INIT(&pcbphashhead[i]);
}
}

/*
@ -1161,6 +1188,18 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
LIST_REMOVE(newblk, nb_hash);
FREE(newblk, M_NEWBLK);

/*
 * If we were not passed a bp to attach the dep to,
 * then this must be for a regular file.
 * Allocate a buffer to represent the page cache pages
 * that are the real dependency. The pages themselves
 * cannot refer to the dependency since we don't want to
 * add a field to struct vm_page for this.
 */

if (bp == NULL) {
bp = softdep_setup_pagecache(ip, lbn, newsize);
}
WORKLIST_INSERT(&bp->b_dep, &adp->ad_list);
if (lbn >= NDADDR) {
/* allocating an indirect block */
@ -1310,7 +1349,10 @@ handle_workitem_freefrag(freefrag)
vp.v_data = &tip;
vp.v_mount = freefrag->ff_devvp->v_specmountpoint;
tip.i_vnode = &vp;
lockinit(&vp.v_glock, PVFS, "fglock", 0, 0);
lockmgr(&vp.v_glock, LK_EXCLUSIVE, NULL);
ffs_blkfree(&tip, freefrag->ff_blkno, freefrag->ff_fragsize);
lockmgr(&vp.v_glock, LK_RELEASE, NULL);
FREE(freefrag, M_FREEFRAG);
}

@ -1380,6 +1422,18 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
struct allocindir *aip;
struct pagedep *pagedep;

/*
 * If we are already holding "many" buffers busy (as the safe copies
 * of indirect blocks) flush the dependency for one of those before
 * potentially tying up more. otherwise we could fill the
 * buffer cache with busy buffers and deadlock.
 * XXXUBC I'm sure there's a better way to deal with this.
 */

while (softdep_lockedbufs > nbuf >> 2) {
softdep_flush_indir(ITOV(ip));
}

aip = newallocindir(ip, ptrno, newblkno, oldblkno);
ACQUIRE_LOCK(&lk);
/*
@ -1390,6 +1444,9 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
if ((ip->i_ffs_mode & IFMT) == IFDIR &&
pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list);
if (nbp == NULL) {
nbp = softdep_setup_pagecache(ip, lbn, ip->i_fs->fs_bsize);
}
WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list);
FREE_LOCK(&lk);
setup_allocindir_phase2(bp, ip, aip);
@ -1495,8 +1552,10 @@ setup_allocindir_phase2(bp, ip, aip)
FREE_LOCK(&lk);
}
if (newindirdep) {
if (indirdep->ir_savebp != NULL)
if (indirdep->ir_savebp != NULL) {
brelse(newindirdep->ir_savebp);
softdep_lockedbufs--;
}
WORKITEM_FREE((caddr_t)newindirdep, D_INDIRDEP);
}
if (indirdep)
@ -1513,6 +1572,7 @@ setup_allocindir_phase2(bp, ip, aip)
}
newindirdep->ir_savebp =
getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0);
softdep_lockedbufs++;
newindirdep->ir_savebp->b_flags |= B_ASYNC;
bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
}
@ -1555,8 +1615,9 @@ softdep_setup_freeblocks(ip, length)
struct freeblks *freeblks;
struct inodedep *inodedep;
struct allocdirect *adp;
struct vnode *vp;
struct vnode *vp = ITOV(ip);
struct buf *bp;
struct bufq_head fbqh;
struct fs *fs = ip->i_fs;
int i, error;
#ifdef FFS_EI
@ -1616,7 +1677,13 @@ softdep_setup_freeblocks(ip, length)
 * with this inode are obsolete and can simply be de-allocated.
 * We must first merge the two dependency lists to get rid of
 * any duplicate freefrag structures, then purge the merged list.
 * We must remove any pagecache markers from the pagecache
 * hashtable first because any I/Os in flight will want to see
 * dependencies attached to their pagecache markers. We cannot
 * free the pagecache markers until after we've freed all the
 * dependencies that reference them later.
 */
softdep_collect_pagecache(vp, &fbqh);
merge_inode_lists(inodedep);
while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
free_allocdirect(&inodedep->id_inoupdt, adp, 1);
@ -1628,7 +1695,6 @@ softdep_setup_freeblocks(ip, length)
 * Once they are all there, walk the list and get rid of
 * any dependencies.
 */
vp = ITOV(ip);
ACQUIRE_LOCK(&lk);
drain_output(vp, 1);
while (getdirtybuf(&vp->v_dirtyblkhd.lh_first, MNT_WAIT)) {
@ -1640,6 +1706,7 @@ softdep_setup_freeblocks(ip, length)
brelse(bp);
ACQUIRE_LOCK(&lk);
}
softdep_free_pagecache(&fbqh);
/*
 * Add the freeblks structure to the list of operations that
 * must await the zero'ed inode being written to disk. If we
@ -1730,8 +1797,8 @@ deallocate_dependencies(bp, inodedep)
 * If the inode has already been written, then they
 * can be dumped directly onto the work list.
 */
for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem;
dirrem = LIST_NEXT(dirrem, dm_next)) {
while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd))
!= NULL) {
LIST_REMOVE(dirrem, dm_next);
dirrem->dm_dirinum = pagedep->pd_ino;
if (inodedep == NULL ||
@ -1944,6 +2011,10 @@ handle_workitem_freeblocks(freeblks)
}
nblocks = btodb(fs->fs_bsize);
blocksreleased = 0;

lockinit(&vp.v_glock, PVFS, "fglock", 0, 0);
lockmgr(&vp.v_glock, LK_EXCLUSIVE, NULL);

/*
 * Indirect blocks first.
 */
@ -1966,6 +2037,7 @@ handle_workitem_freeblocks(freeblks)
ffs_blkfree(&tip, bn, bsize);
blocksreleased += btodb(bsize);
}
lockmgr(&vp.v_glock, LK_RELEASE, NULL);

#ifdef DIAGNOSTIC
if (freeblks->fb_chkcnt != blocksreleased)
@ -2034,6 +2106,7 @@ indir_trunc(ip, dbn, level, lbn, countp)
error = bread(ip->i_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp);
if (error)
return (error);
softdep_lockedbufs++;
}
/*
 * Recursively free indirect blocks.
@ -2053,6 +2126,7 @@ indir_trunc(ip, dbn, level, lbn, countp)
}
bp->b_flags |= B_INVAL | B_NOCACHE;
brelse(bp);
softdep_lockedbufs--;
return (allerror);
}

@ -2793,6 +2867,8 @@ softdep_disk_io_initiation(bp)
if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) {
indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE;
brelse(indirdep->ir_savebp);
softdep_lockedbufs--;

/* inline expand WORKLIST_REMOVE(wk); */
wk->wk_state &= ~ONWORKLIST;
LIST_REMOVE(wk, wk_list);
@ -3681,8 +3757,9 @@ merge_inode_lists(inodedep)
{
struct allocdirect *listadp, *newadp;

listadp = TAILQ_FIRST(&inodedep->id_inoupdt);
newadp = TAILQ_FIRST(&inodedep->id_newinoupdt);
for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp && newadp;) {
while (listadp && newadp) {
if (listadp->ad_lbn < newadp->ad_lbn) {
listadp = TAILQ_NEXT(listadp, ad_next);
continue;
@ -3935,6 +4012,7 @@ loop:
switch (wk->wk_type) {

case D_ALLOCDIRECT:
KASSERT(vp->v_type != VREG);
adp = WK_ALLOCDIRECT(wk);
if (adp->ad_state & DEPCOMPLETE)
break;
@ -4141,6 +4219,7 @@ flush_inodedep_deps(fs, ino)
struct allocdirect *adp;
int error, waitfor;
struct buf *bp;
struct vnode *vp;

/*
 * This work is done in two passes. The first pass grabs most
@ -4160,6 +4239,27 @@ flush_inodedep_deps(fs, ino)
ACQUIRE_LOCK(&lk);
if (inodedep_lookup(fs, ino, 0, &inodedep) == 0)
return (0);

/*
 * When file data was in the buffer cache,
 * softdep_sync_metadata() would start i/o on
 * file data buffers itself. But now that
 * we're using the page cache to hold file data,
 * we need something else to trigger those flushes.
 * let's just do it here.
 */

vp = softdep_lookupvp(fs, ino);
if (vp) {
struct uvm_object *uobj = &vp->v_uvm.u_obj;

simple_lock(&uobj->vmobjlock);
(uobj->pgops->pgo_flush)(uobj, 0, 0,
PGO_ALLPAGES|PGO_CLEANIT|
(waitfor == MNT_NOWAIT ? 0: PGO_SYNCIO));
simple_unlock(&uobj->vmobjlock);
}

for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
adp = TAILQ_NEXT(adp, ad_next)) {
if (adp->ad_state & DEPCOMPLETE)
@ -4727,3 +4827,236 @@ softdep_error(func, error)
/* XXX should do something better! */
printf("%s: got error %d while accessing filesystem\n", func, error);
}

/*
 * Allocate a buffer on which to attach a dependency.
 */
static struct buf *
softdep_setup_pagecache(ip, lbn, size)
struct inode *ip;
ufs_lbn_t lbn;
long size;
{
struct vnode *vp = ITOV(ip);
struct buf *bp;
int s;

/*
 * Enter pagecache dependency buf in hash.
 */

bp = softdep_lookup_pcbp(vp, lbn);
if (bp == NULL) {
s = splbio();
bp = pool_get(&sdpcpool, PR_WAITOK);
splx(s);
memset(bp, 0, sizeof(*bp));

bp->b_vp = vp;
bp->b_lblkno = lbn;
bp->b_bcount = bp->b_resid = size;
LIST_INIT(&bp->b_dep);
LIST_INSERT_HEAD(&pcbphashhead[PCBPHASH(vp, lbn)], bp, b_hash);
} else {
KASSERT(size >= bp->b_bcount);
bp->b_resid += size - bp->b_bcount;
bp->b_bcount = size;
}
return bp;
}

/*
 * softdep_collect_pagecache() and softdep_free_pagecache()
 * are used to remove page cache dependency buffers when
 * a file is being truncated to 0.
 */

static void
softdep_collect_pagecache(vp, bqhp)
struct vnode *vp;
struct bufq_head *bqhp;
{
struct buf *bp, *nextbp;
int i;

TAILQ_INIT(bqhp);
for (i = 0; i < PCBPHASHSIZE; i++) {
for (bp = LIST_FIRST(&pcbphashhead[i]);
bp != NULL;
bp = nextbp) {
nextbp = LIST_NEXT(bp, b_hash);
if (bp->b_vp == vp) {
LIST_REMOVE(bp, b_hash);
TAILQ_INSERT_HEAD(bqhp, bp, b_freelist);
}
}
}
}

static void
softdep_free_pagecache(bqhp)
struct bufq_head *bqhp;
{
struct buf *bp, *nextbp;

for (bp = TAILQ_FIRST(bqhp); bp != NULL; bp = nextbp) {
nextbp = TAILQ_NEXT(bp, b_freelist);
TAILQ_REMOVE(bqhp, bp, b_freelist);
KASSERT(LIST_FIRST(&bp->b_dep) == NULL);
pool_put(&sdpcpool, bp);
}
}

static struct vnode *
softdep_lookupvp(fs, ino)
struct fs *fs;
ino_t ino;
{
struct mount *mp;
extern struct vfsops ffs_vfsops;

CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_op == &ffs_vfsops &&
VFSTOUFS(mp)->um_fs == fs) {
break;
}
}
if (mp == NULL) {
return NULL;
}
return ufs_ihashlookup(VFSTOUFS(mp)->um_dev, ino);
}

/*
 * Flush some dependent page cache data for any vnode *except*
 * the one specified.
 * XXXUBC this is a horrible hack and it's probably not too hard to deadlock
 * even with this, but it's better than nothing.
 */

static void
softdep_flush_indir(vp)
struct vnode *vp;
{
struct buf *bp;
int i;

for (i = 0; i < PCBPHASHSIZE; i++) {
LIST_FOREACH(bp, &pcbphashhead[i], b_hash) {
if (bp->b_vp == vp ||
LIST_FIRST(&bp->b_dep)->wk_type != D_ALLOCINDIR) {
continue;
}

VOP_FSYNC(bp->b_vp, curproc->p_ucred, FSYNC_WAIT, 0, 0,
curproc);
||||
return;
|
||||
}
|
||||
}
|
||||
printf("softdep_flush_indir: nothing to flush?\n");
|
||||
}
|
||||
|
||||
|
||||
static struct buf *
|
||||
softdep_lookup_pcbp(vp, lbn)
|
||||
struct vnode *vp;
|
||||
ufs_lbn_t lbn;
|
||||
{
|
||||
struct buf *bp;
|
||||
|
||||
LIST_FOREACH(bp, &pcbphashhead[PCBPHASH(vp, lbn)], b_hash) {
|
||||
if (bp->b_vp == vp && bp->b_lblkno == lbn) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return bp;
|
||||
}
|
||||
|
||||
/*
|
||||
* Do softdep i/o completion processing for page cache writes.
|
||||
*/
|
||||
|
||||
void
|
||||
softdep_pageiodone(bp)
|
||||
struct buf *bp;
|
||||
{
|
||||
int npages = bp->b_bufsize >> PAGE_SHIFT;
|
||||
struct vnode *vp = bp->b_vp;
|
||||
struct vm_page *pg;
|
||||
struct buf *pcbp = NULL;
|
||||
struct allocdirect *adp;
|
||||
struct allocindir *aip;
|
||||
struct worklist *wk;
|
||||
ufs_lbn_t lbn;
|
||||
voff_t off;
|
||||
long iosize = bp->b_bcount;
|
||||
int size, asize, bshift, bsize;
|
||||
int i;
|
||||
|
||||
KASSERT(!(bp->b_flags & B_READ));
|
||||
bshift = vp->v_mount->mnt_fs_bshift;
|
||||
bsize = 1 << bshift;
|
||||
asize = min(PAGE_SIZE, bsize);
|
||||
ACQUIRE_LOCK(&lk);
|
||||
for (i = 0; i < npages; i++) {
|
||||
pg = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
|
||||
if (pg == NULL) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (off = pg->offset;
|
||||
off < pg->offset + PAGE_SIZE;
|
||||
off += bsize) {
|
||||
size = min(asize, iosize);
|
||||
iosize -= size;
|
||||
lbn = off >> bshift;
|
||||
if (pcbp == NULL || pcbp->b_lblkno != lbn) {
|
||||
pcbp = softdep_lookup_pcbp(vp, lbn);
|
||||
}
|
||||
if (pcbp == NULL) {
|
||||
continue;
|
||||
}
|
||||
pcbp->b_resid -= size;
|
||||
if (pcbp->b_resid < 0) {
|
||||
panic("softdep_pageiodone: "
|
||||
"resid < 0, vp %p lbn 0x%lx pcbp %p",
|
||||
vp, lbn, pcbp);
|
||||
}
|
||||
if (pcbp->b_resid > 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We've completed all the i/o for this block.
|
||||
* mark the dep complete.
|
||||
*/
|
||||
|
||||
KASSERT(LIST_FIRST(&pcbp->b_dep) != NULL);
|
||||
while ((wk = LIST_FIRST(&pcbp->b_dep))) {
|
||||
WORKLIST_REMOVE(wk);
|
||||
switch (wk->wk_type) {
|
||||
case D_ALLOCDIRECT:
|
||||
adp = WK_ALLOCDIRECT(wk);
|
||||
adp->ad_state |= COMPLETE;
|
||||
handle_allocdirect_partdone(adp);
|
||||
break;
|
||||
|
||||
case D_ALLOCINDIR:
|
||||
aip = WK_ALLOCINDIR(wk);
|
||||
aip->ai_state |= COMPLETE;
|
||||
handle_allocindir_partdone(aip);
|
||||
break;
|
||||
|
||||
default:
|
||||
panic("softdep_pageiodone: "
|
||||
"bad type %d, pcbp %p wk %p",
|
||||
wk->wk_type, pcbp, wk);
|
||||
}
|
||||
}
|
||||
LIST_REMOVE(pcbp, b_hash);
|
||||
pool_put(&sdpcpool, pcbp);
|
||||
pcbp = NULL;
|
||||
}
|
||||
}
|
||||
FREE_LOCK(&lk);
|
||||
}
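
/*
* Worked example for the accounting above (illustrative sizes, not
* from the original change): with 8kB filesystem blocks and 4kB pages,
* bshift = 13, bsize = 8192 and asize = min(PAGE_SIZE, bsize) = 4096.
* A 16kB page-cache write covers npages = 4 pages and 2 blocks; each
* finished page decrements the matching pcbp->b_resid by 4096, so the
* dependencies on a block are only completed once both of its pages
* have finished i/o.
*/
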
@ -1,4 +1,4 @@
/* $NetBSD: ffs_vfsops.c,v 1.72 2000/10/13 16:40:26 simonb Exp $ */
/* $NetBSD: ffs_vfsops.c,v 1.73 2000/11/27 08:39:55 chs Exp $ */

/*
* Copyright (c) 1989, 1991, 1993, 1994
@ -690,6 +690,8 @@ ffs_mountfs(devvp, mp, p)
mp->mnt_stat.f_fsid.val[0] = (long)dev;
mp->mnt_stat.f_fsid.val[1] = makefstype(MOUNT_FFS);
mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen;
mp->mnt_fs_bshift = fs->fs_bshift;
mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */
mp->mnt_flag |= MNT_LOCAL;
#ifdef FFS_EI
if (needswap)
@ -699,6 +701,7 @@ ffs_mountfs(devvp, mp, p)
ump->um_dev = dev;
ump->um_devvp = devvp;
ump->um_nindir = fs->fs_nindir;
ump->um_lognindir = ffs(fs->fs_nindir) - 1;
ump->um_bptrtodb = fs->fs_fsbtodb;
ump->um_seqinc = fs->fs_frag;
for (i = 0; i < MAXQUOTAS; i++)
@ -797,6 +800,9 @@ ffs_unmount(mp, mntflags, p)
if (ump->um_devvp->v_type != VBAD)
ump->um_devvp->v_specmountpoint = NULL;
vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
if (LIST_FIRST(&ump->um_devvp->v_dirtyblkhd)) {
panic("ffs_unmount: flush left dirty bufs %p", ump->um_devvp);
}
error = VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE,
NOCRED, p);
vput(ump->um_devvp);
@ -1107,6 +1113,7 @@ ffs_vget(mp, ino, vpp)
ip->i_ffs_uid = ip->i_din.ffs_din.di_ouid; /* XXX */
ip->i_ffs_gid = ip->i_din.ffs_din.di_ogid; /* XXX */
} /* XXX */
uvm_vnp_setsize(vp, ip->i_ffs_size);

*vpp = vp;
return (0);

@ -1,4 +1,4 @@
/* $NetBSD: ffs_vnops.c,v 1.34 2000/10/24 14:43:32 fvdl Exp $ */
/* $NetBSD: ffs_vnops.c,v 1.35 2000/11/27 08:39:55 chs Exp $ */

/*
* Copyright (c) 1982, 1986, 1989, 1993
@ -108,12 +108,16 @@ struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
{ &vop_blkatoff_desc, ffs_blkatoff }, /* blkatoff */
{ &vop_valloc_desc, ffs_valloc }, /* valloc */
{ &vop_balloc_desc, ffs_balloc }, /* balloc */
{ &vop_ballocn_desc, ffs_ballocn }, /* ballocn */
{ &vop_reallocblks_desc, ffs_reallocblks }, /* reallocblks */
{ &vop_vfree_desc, ffs_vfree }, /* vfree */
{ &vop_truncate_desc, ffs_truncate }, /* truncate */
{ &vop_update_desc, ffs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void*)))NULL }
{ &vop_getpages_desc, genfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_putpages }, /* putpages */
{ &vop_size_desc, ffs_size }, /* size */
{ NULL, NULL }
};
struct vnodeopv_desc ffs_vnodeop_opv_desc =
{ &ffs_vnodeop_p, ffs_vnodeop_entries };
@ -165,7 +169,7 @@ struct vnodeopv_entry_desc ffs_specop_entries[] = {
{ &vop_truncate_desc, spec_truncate }, /* truncate */
{ &vop_update_desc, ffs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
{ NULL, NULL }
};
struct vnodeopv_desc ffs_specop_opv_desc =
{ &ffs_specop_p, ffs_specop_entries };
@ -217,7 +221,7 @@ struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
{ &vop_truncate_desc, fifo_truncate }, /* truncate */
{ &vop_update_desc, ffs_update }, /* update */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL }
{ NULL, NULL }
};
struct vnodeopv_desc ffs_fifoop_opv_desc =
{ &ffs_fifoop_p, ffs_fifoop_entries };
@ -239,7 +243,7 @@ ffs_fsync(v)
off_t offhi;
struct proc *a_p;
} */ *ap = v;
struct buf *bp, *nbp, *ibp;
struct buf *bp;
int s, num, error, i;
struct indir ia[NIADDR + 1];
int bsize;
@ -260,38 +264,32 @@ ffs_fsync(v)
if (ap->a_offhi % bsize != 0)
blk_high++;

/*
* First, flush all data blocks in range.
*/
loop:
s = splbio();
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
if ((bp->b_flags & B_BUSY))
continue;
if (bp->b_lblkno < blk_low || bp->b_lblkno > blk_high)
continue;
bp->b_flags |= B_BUSY | B_VFLUSH;
splx(s);
bawrite(bp);
goto loop;
}

/*
* Then, flush possibly unwritten indirect blocks. Without softdeps,
* these should be the only ones left.
* First, flush all pages in range.
*/

simple_lock(&vp->v_uvm.u_obj.vmobjlock);
(vp->v_uvm.u_obj.pgops->pgo_flush)(&vp->v_uvm.u_obj,
ap->a_offlo, ap->a_offhi - ap->a_offlo, PGO_CLEANIT|PGO_SYNCIO);
simple_unlock(&vp->v_uvm.u_obj.vmobjlock);

/*
* Then, flush indirect blocks.
*/

if (!(ap->a_flags & FSYNC_DATAONLY) && blk_high >= NDADDR) {
error = ufs_getlbns(vp, blk_high, ia, &num);
if (error != 0)
if (error)
return error;
for (i = 0; i < num; i++) {
ibp = incore(vp, ia[i].in_lbn);
if (ibp != NULL && !(ibp->b_flags & B_BUSY) &&
(ibp->b_flags & B_DELWRI)) {
ibp->b_flags |= B_BUSY | B_VFLUSH;
bp = incore(vp, ia[i].in_lbn);
if (bp != NULL && !(bp->b_flags & B_BUSY) &&
(bp->b_flags & B_DELWRI)) {
bp->b_flags |= B_BUSY | B_VFLUSH;
splx(s);
bawrite(ibp);
bawrite(bp);
s = splbio();
}
}
@ -300,11 +298,9 @@ loop:
if (ap->a_flags & FSYNC_WAIT) {
while (vp->v_numoutput > 0) {
vp->v_flag |= VBWAIT;
tsleep((caddr_t)&vp->v_numoutput, PRIBIO + 1,
"fsync_range", 0);
tsleep(&vp->v_numoutput, PRIBIO + 1, "fsync_range", 0);
}
}

splx(s);

return (VOP_UPDATE(vp, NULL, NULL,
@ -330,23 +326,33 @@ ffs_full_fsync(v)
struct vnode *vp = ap->a_vp;
struct buf *bp, *nbp;
int s, error, passes, skipmeta;
struct uvm_object *uobj;

if (vp->v_type == VBLK &&
vp->v_specmountpoint != NULL &&
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP))
softdep_fsync_mountdev(vp);

/*
* Flush all dirty buffers associated with a vnode
/*
* Flush all dirty data associated with a vnode.
*/

if (vp->v_type == VREG) {
uobj = &vp->v_uvm.u_obj;
simple_lock(&uobj->vmobjlock);
(uobj->pgops->pgo_flush)(uobj, 0, 0, PGO_ALLPAGES|PGO_CLEANIT|
((ap->a_flags & FSYNC_WAIT) ? PGO_SYNCIO : 0));
simple_unlock(&uobj->vmobjlock);
}

passes = NIADDR + 1;
skipmeta = 0;
if (ap->a_flags & (FSYNC_DATAONLY|FSYNC_WAIT))
skipmeta = 1;
s = splbio();

loop:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp;
bp = LIST_NEXT(bp, b_vnbufs))
LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
bp->b_flags &= ~B_SCANNED;
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
@ -445,3 +451,31 @@ ffs_reclaim(v)
vp->v_data = NULL;
return (0);
}

/*
* Return the last logical file offset that should be written for this file
* if we're doing a write that ends at "size".
*/
int
ffs_size(v)
void *v;
{
struct vop_size_args /* {
struct vnode *a_vp;
off_t a_size;
off_t *a_eobp;
} */ *ap = v;
struct inode *ip = VTOI(ap->a_vp);
struct fs *fs = ip->i_fs;
ufs_lbn_t olbn, nlbn;

olbn = lblkno(fs, ip->i_ffs_size);
nlbn = lblkno(fs, ap->a_size);

if (nlbn < NDADDR && olbn <= nlbn) {
*ap->a_eobp = fragroundup(fs, ap->a_size);
} else {
*ap->a_eobp = blkroundup(fs, ap->a_size);
}
return 0;
}
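
/*
* Worked example (illustrative sizes, not part of this change): with
* 8kB blocks and 1kB fragments, a write that ends at offset 5000 in
* the file's last direct block may leave a fragment, so the end of
* block is fragroundup(fs, 5000) = 5120.  A write that ends in an
* interior block (olbn > nlbn) or in the indirect range
* (nlbn >= NDADDR) must cover a full block: blkroundup(fs, 5000) = 8192.
*/
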
@ -1,4 +1,4 @@
/* $NetBSD: lfs_alloc.c,v 1.44 2000/11/27 03:33:57 perseant Exp $ */
/* $NetBSD: lfs_alloc.c,v 1.45 2000/11/27 08:39:55 chs Exp $ */

/*-
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -229,7 +229,6 @@ extend_ifile(struct lfs *fs, struct ucred *cred)
}
ip->i_ffs_size += fs->lfs_bsize;
uvm_vnp_setsize(vp, ip->i_ffs_size);
(void)uvm_vnp_uncache(vp);
VOP_UNLOCK(vp, 0);

i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) *

@ -1,4 +1,4 @@
/* $NetBSD: lfs_inode.c,v 1.48 2000/11/27 03:33:57 perseant Exp $ */
/* $NetBSD: lfs_inode.c,v 1.49 2000/11/27 08:39:56 chs Exp $ */

/*-
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -288,7 +288,6 @@ lfs_truncate(v)
return (error);
oip->i_ffs_size = length;
uvm_vnp_setsize(ovp, length);
(void) uvm_vnp_uncache(ovp);
(void) VOP_BWRITE(bp);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (VOP_UPDATE(ovp, NULL, NULL, 0));
@ -338,7 +337,6 @@ lfs_truncate(v)
odb = btodb(bp->b_bcount);
oip->i_ffs_size = length;
size = blksize(fs, oip, lbn);
(void) uvm_vnp_uncache(ovp);
if (ovp->v_type != VDIR)
memset((char *)bp->b_data + offset, 0,
(u_int)(size - offset));

@ -1,4 +1,4 @@
/* $NetBSD: lfs_segment.c,v 1.63 2000/11/27 03:33:57 perseant Exp $ */
/* $NetBSD: lfs_segment.c,v 1.64 2000/11/27 08:39:56 chs Exp $ */

/*-
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -309,7 +309,7 @@ lfs_vflush(vp)
/* panic("VDIROP being flushed...this can\'t happen"); */
}
if(vp->v_usecount<0) {
printf("usecount=%ld\n",vp->v_usecount);
printf("usecount=%d\n",vp->v_usecount);
panic("lfs_vflush: usecount<0");
}
#endif
@ -1864,8 +1864,8 @@ lfs_vunref(vp)
#ifdef DIAGNOSTIC
if(vp->v_usecount<=0) {
printf("lfs_vunref: inum is %d\n", VTOI(vp)->i_number);
printf("lfs_vunref: flags are 0x%lx\n", vp->v_flag);
printf("lfs_vunref: usecount = %ld\n", vp->v_usecount);
printf("lfs_vunref: flags are 0x%x\n", vp->v_flag);
printf("lfs_vunref: usecount = %d\n", vp->v_usecount);
panic("lfs_vunref: v_usecount<0");
}
#endif

@ -1,4 +1,4 @@
/* $NetBSD: mfs_vnops.c,v 1.25 2000/10/09 18:07:06 thorpej Exp $ */
/* $NetBSD: mfs_vnops.c,v 1.26 2000/11/27 08:39:57 chs Exp $ */

/*
* Copyright (c) 1989, 1993
@ -263,7 +263,7 @@ mfs_close(v)
* vnode, so if we find any other uses, it is a panic.
*/
if (vp->v_usecount > 1)
printf("mfs_close: ref count %ld > 1\n", vp->v_usecount);
printf("mfs_close: ref count %d > 1\n", vp->v_usecount);
if (vp->v_usecount > 1 || BUFQ_FIRST(&mfsp->mfs_buflist) != NULL)
panic("mfs_close");
/*

@ -1,4 +1,4 @@
/* $NetBSD: ufs_bmap.c,v 1.9 2000/03/30 12:41:14 augustss Exp $ */
/* $NetBSD: ufs_bmap.c,v 1.10 2000/11/27 08:39:57 chs Exp $ */

/*
* Copyright (c) 1989, 1991, 1993
@ -186,6 +186,9 @@ ufs_bmaparray(vp, bn, bnp, ap, nump, runp)

xap->in_exists = 1;
bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
if (bp == NULL) {
return ENOMEM;
}
if (bp->b_flags & (B_DONE | B_DELWRI)) {
trace(TR_BREADHIT, pack(vp, size), metalbn);
}
@ -243,6 +246,7 @@ ufs_getlbns(vp, bn, ap, nump)
long metalbn, realbn;
struct ufsmount *ump;
int64_t blockcnt;
int lbc;
int i, numlevels, off;

ump = VFSTOUFS(vp->v_mount);
@ -263,10 +267,15 @@ ufs_getlbns(vp, bn, ap, nump)
* at the given level of indirection, and NIADDR - i is the number
* of levels of indirection needed to locate the requested block.
*/
for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {

bn -= NDADDR;
for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) {
if (i == 0)
return (EFBIG);
blockcnt *= MNINDIR(ump);

lbc += ump->um_lognindir;
blockcnt = (int64_t)1 << lbc;

if (bn < blockcnt)
break;
}
@ -292,8 +301,9 @@ ufs_getlbns(vp, bn, ap, nump)
if (metalbn == realbn)
break;

blockcnt /= MNINDIR(ump);
off = (bn / blockcnt) % MNINDIR(ump);
lbc -= ump->um_lognindir;
blockcnt = (int64_t)1 << lbc;
off = (bn >> lbc) & (MNINDIR(ump) - 1);

++numlevels;
ap->in_lbn = metalbn;
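
/*
* Illustrative arithmetic for the um_lognindir change (example values,
* not from the original source): with 8kB blocks, MNINDIR(ump) is 2048
* and um_lognindir is 11, so after two levels lbc = 22 and
* blockcnt = (int64_t)1 << 22 == 2048 * 2048; the old divide and
* modulo become "bn >> lbc" and "bn & (MNINDIR(ump) - 1)".
*/
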
@ -1,4 +1,4 @@
/* $NetBSD: ufs_extern.h,v 1.23 2000/03/16 18:26:49 jdolecek Exp $ */
/* $NetBSD: ufs_extern.h,v 1.24 2000/11/27 08:39:57 chs Exp $ */

/*-
* Copyright (c) 1991, 1993, 1994
@ -113,6 +113,7 @@ void ufs_ihashrem __P((struct inode *));

/* ufs_inode.c */
int ufs_reclaim __P((struct vnode *, struct proc *));
int ufs_balloc_range __P((struct vnode *, off_t, off_t, struct ucred *, int));

/* ufs_lookup.c */
void ufs_dirbad __P((struct inode *, doff_t, char *));

@ -1,4 +1,4 @@
/* $NetBSD: ufs_inode.c,v 1.15 2000/05/29 18:04:31 mycroft Exp $ */
/* $NetBSD: ufs_inode.c,v 1.16 2000/11/27 08:39:57 chs Exp $ */

/*
* Copyright (c) 1991, 1993
@ -55,6 +55,8 @@
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <uvm/uvm.h>

/*
* Last reference to an inode. If necessary, write or delete it.
*/
@ -73,7 +75,7 @@ ufs_inactive(v)
extern int prtactive;

if (prtactive && vp->v_usecount != 0)
vprint("ffs_inactive: pushing active", vp);
vprint("ufs_inactive: pushing active", vp);

/*
* Ignore inodes related to stale file handles.
@ -102,8 +104,9 @@ out:
* If we are done with the inode, reclaim it
* so that it can be reused immediately.
*/

if (ip->i_ffs_mode == 0)
vrecycle(vp, (struct simplelock *)0, p);
vrecycle(vp, NULL, p);
return (error);
}

@ -146,3 +149,140 @@ ufs_reclaim(vp, p)
#endif
return (0);
}

/*
* allocate a range of blocks in a file.
* after this function returns, any page entirely contained within the range
* will map to invalid data and thus must be overwritten before it is made
* accessible to others.
*/

int
ufs_balloc_range(vp, off, len, cred, flags)
struct vnode *vp;
off_t off, len;
struct ucred *cred;
int flags;
{
off_t oldeof, neweof, oldeob, neweob, oldpagestart, pagestart;
struct uvm_object *uobj;
int i, delta, error, npages1, npages2;
int bshift = vp->v_mount->mnt_fs_bshift;
int bsize = 1 << bshift;
int ppb = max(bsize >> PAGE_SHIFT, 1);
struct vm_page *pgs1[ppb], *pgs2[ppb];
UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x",
vp, off, len, vp->v_uvm.u_size);

oldeof = vp->v_uvm.u_size;
error = VOP_SIZE(vp, oldeof, &oldeob);
if (error) {
return error;
}

neweof = max(vp->v_uvm.u_size, off + len);
error = VOP_SIZE(vp, neweof, &neweob);
if (error) {
return error;
}

error = 0;
uobj = &vp->v_uvm.u_obj;
pgs1[0] = pgs2[0] = NULL;

/*
* if the last block in the file is not a full block (ie. it is a
* fragment), and this allocation is causing the fragment to change
* size (either to expand the fragment or promote it to a full block),
* cache the old last block (at its new size).
*/

oldpagestart = trunc_page(oldeof) & ~(bsize - 1);
if ((oldeob & (bsize - 1)) != 0 && oldeob != neweob) {
npages1 = min(ppb, (round_page(neweob) - oldpagestart) >>
PAGE_SHIFT);
memset(pgs1, 0, npages1 * sizeof(struct vm_page *));
simple_lock(&uobj->vmobjlock);
error = VOP_GETPAGES(vp, oldpagestart, pgs1, &npages1,
0, VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF);
if (error) {
goto out;
}
for (i = 0; i < npages1; i++) {
UVMHIST_LOG(ubchist, "got pgs1[%d] %p", i, pgs1[i],0,0);
KASSERT((pgs1[i]->flags & PG_RELEASED) == 0);
pgs1[i]->flags &= ~PG_CLEAN;
}
}

/*
* cache the new range as well. this will create zeroed pages
* where the new block will be and keep them locked until the
* new block is allocated, so there will be no window where
* the old contents of the new block is visible to racing threads.
*/

pagestart = trunc_page(off) & ~(bsize - 1);
if (pagestart != oldpagestart || pgs1[0] == NULL) {
npages2 = min(ppb, (round_page(neweob) - pagestart) >>
PAGE_SHIFT);
memset(pgs2, 0, npages2 * sizeof(struct vm_page *));
simple_lock(&uobj->vmobjlock);
error = VOP_GETPAGES(vp, pagestart, pgs2, &npages2, 0,
VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF);
if (error) {
goto out;
}
for (i = 0; i < npages2; i++) {
UVMHIST_LOG(ubchist, "got pgs2[%d] %p", i, pgs2[i],0,0);
KASSERT((pgs2[i]->flags & PG_RELEASED) == 0);
pgs2[i]->flags &= ~PG_CLEAN;
}
}

/*
* adjust off to be block-aligned.
*/

delta = off & (bsize - 1);
off -= delta;
len += delta;

/*
* now allocate the range.
*/

lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL);
error = VOP_BALLOCN(vp, off, len, cred, flags);
lockmgr(&vp->v_glock, LK_RELEASE, NULL);

/*
* unbusy any pages we are holding.
* if we got an error, set the vnode size back to what it was before.
* this will free any pages we created past the old eof.
*/

out:
if (error) {
uvm_vnp_setsize(vp, oldeof);
}
simple_lock(&uobj->vmobjlock);
if (pgs1[0] != NULL) {
uvm_page_unbusy(pgs1, npages1);

/*
* The data in the frag might be moving to a new disk location.
* We need to flush pages to the new disk locations.
*/

(uobj->pgops->pgo_flush)(uobj, oldeof & ~(bsize - 1),
min((oldeof + bsize) & ~(bsize - 1), neweof),
PGO_CLEANIT | ((flags & B_SYNC) ? PGO_SYNCIO : 0));
}
if (pgs2[0] != NULL) {
uvm_page_unbusy(pgs2, npages2);
}
simple_unlock(&uobj->vmobjlock);
return error;
}
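
/*
* Worked example (illustrative sizes, not part of this change): with
* an 8kB block size, a 5000-byte file ends in a fragment, so
* oldeob = VOP_SIZE(vp, 5000, ...) is not block-aligned.  Extending
* the file to 20000 bytes promotes that fragment to a full block
* (oldeob != neweob), so the pages over the old last block are cached
* and dirtied via VOP_GETPAGES() above before VOP_BALLOCN() possibly
* moves the data to a new disk address.
*/
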
@ -1,4 +1,4 @@
/* $NetBSD: ufs_readwrite.c,v 1.27 2000/09/09 04:49:55 perseant Exp $ */
/* $NetBSD: ufs_readwrite.c,v 1.28 2000/11/27 08:39:57 chs Exp $ */

/*-
* Copyright (c) 1993
@ -73,17 +73,20 @@ READ(v)
struct inode *ip;
struct uio *uio;
FS *fs;
#ifndef LFS_READWRITE
void *win;
vsize_t bytelen;
#endif
struct buf *bp;
ufs_daddr_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
int error;
u_short mode;

vp = ap->a_vp;
ip = VTOI(vp);
mode = ip->i_ffs_mode;
uio = ap->a_uio;
error = 0;

#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@ -102,19 +105,39 @@ READ(v)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
if (uio->uio_offset >= ip->i_ffs_size) {
goto out;
}

#ifndef LFS_READWRITE
if (vp->v_type == VREG) {
while (uio->uio_resid > 0) {
bytelen = min(ip->i_ffs_size - uio->uio_offset,
uio->uio_resid);
if (bytelen == 0)
break;

win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
&bytelen, UBC_READ);
error = uiomove(win, bytelen, uio);
ubc_release(win, 0);
if (error)
break;
}
goto out;
}
#endif

for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
if ((bytesinfile = ip->i_ffs_size - uio->uio_offset) <= 0)
bytesinfile = ip->i_ffs_size - uio->uio_offset;
if (bytesinfile <= 0)
break;
lbn = lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = BLKSIZE(fs, ip, lbn);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
if (bytesinfile < xfersize)
xfersize = bytesinfile;
xfersize = min(min(fs->fs_bsize - blkoffset, uio->uio_resid),
bytesinfile);

#ifdef LFS_READWRITE
(void)lfs_check(vp, lbn, 0);
@ -122,9 +145,6 @@ READ(v)
#else
if (lblktosize(fs, nextlbn) >= ip->i_ffs_size)
error = bread(vp, lbn, size, NOCRED, &bp);
else if (doclusterread)
error = cluster_read(vp,
ip->i_ffs_size, lbn, size, NOCRED, &bp);
else if (lbn - 1 == vp->v_lastr) {
int nextsize = BLKSIZE(fs, ip, nextlbn);
error = breadn(vp, lbn,
@ -149,14 +169,15 @@ READ(v)
break;
xfersize = size;
}
error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize,
uio);
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp);
}
if (bp != NULL)
brelse(bp);

out:
if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
ip->i_flag |= IN_ACCESS;
if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
@ -187,6 +208,12 @@ WRITE(v)
ufs_daddr_t lbn;
off_t osize;
int blkoffset, error, flags, ioflag, resid, size, xfersize;
#ifndef LFS_READWRITE
void *win;
vsize_t bytelen;
off_t oldoff;
boolean_t rv;
#endif

ioflag = ap->a_ioflag;
uio = ap->a_uio;
@ -240,14 +267,65 @@ WRITE(v)

resid = uio->uio_resid;
osize = ip->i_ffs_size;
flags = ioflag & IO_SYNC ? B_SYNC : 0;
error = 0;

for (error = 0; uio->uio_resid > 0;) {
#ifndef LFS_READWRITE
if (vp->v_type != VREG) {
goto bcache;
}

while (uio->uio_resid > 0) {
oldoff = uio->uio_offset;
blkoffset = blkoff(fs, uio->uio_offset);
bytelen = min(fs->fs_bsize - blkoffset, uio->uio_resid);

/*
* XXXUBC if file is mapped and this is the last block,
* process one page at a time.
*/

error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
ap->a_cred, ioflag & IO_SYNC ? B_SYNC : 0);
if (error) {
return error;
}

win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen,
UBC_WRITE);
error = uiomove(win, bytelen, uio);
ubc_release(win, 0);

/*
* flush what we just wrote if necessary.
* XXXUBC simplistic async flushing.
*/

if (ioflag & IO_SYNC) {
simple_lock(&vp->v_uvm.u_obj.vmobjlock);
rv = vp->v_uvm.u_obj.pgops->pgo_flush(
&vp->v_uvm.u_obj, oldoff, oldoff + bytelen,
PGO_CLEANIT|PGO_SYNCIO);
simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
} else if (oldoff >> 16 != uio->uio_offset >> 16) {
simple_lock(&vp->v_uvm.u_obj.vmobjlock);
rv = vp->v_uvm.u_obj.pgops->pgo_flush(
&vp->v_uvm.u_obj, (oldoff >> 16) << 16,
(uio->uio_offset >> 16) << 16, PGO_CLEANIT);
simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
}
if (error) {
break;
}
}
goto out;

bcache:
#endif
flags = ioflag & IO_SYNC ? B_SYNC : 0;
while (uio->uio_resid > 0) {
lbn = lblkno(fs, uio->uio_offset);
blkoffset = blkoff(fs, uio->uio_offset);
xfersize = fs->fs_bsize - blkoffset;
if (uio->uio_resid < xfersize)
xfersize = uio->uio_resid;
xfersize = min(fs->fs_bsize - blkoffset, uio->uio_resid);
if (fs->fs_bsize > xfersize)
flags |= B_CLRBUF;
else
@ -262,14 +340,22 @@ WRITE(v)
ip->i_ffs_size = uio->uio_offset + xfersize;
uvm_vnp_setsize(vp, ip->i_ffs_size);
}
(void)uvm_vnp_uncache(vp);

size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
if (size < xfersize)
if (xfersize > size)
xfersize = size;

error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);

/*
* if we didn't clear the block and the uiomove failed,
* the buf will now contain part of some other file,
* so we need to invalidate it.
*/
if (error && (flags & B_CLRBUF) == 0) {
bp->b_flags |= B_INVAL;
brelse(bp);
break;
}
#ifdef LFS_READWRITE
if (!error)
error = lfs_reserve(fs, vp, fsbtodb(fs, NIADDR + 1));
@ -289,13 +375,16 @@ WRITE(v)
#endif
if (error || xfersize == 0)
break;
ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
* If we successfully wrote any data, and we are not the superuser
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
#ifndef LFS_READWRITE
out:
#endif
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
ip->i_ffs_mode &= ~(ISUID | ISGID);
if (error) {

@ -1,4 +1,4 @@
/* $NetBSD: ufs_vnops.c,v 1.74 2000/10/19 10:55:35 pk Exp $ */
/* $NetBSD: ufs_vnops.c,v 1.75 2000/11/27 08:40:02 chs Exp $ */

/*
* Copyright (c) 1982, 1986, 1989, 1993, 1995
@ -461,8 +461,6 @@ ufs_chmod(vp, mode, cred, p)
ip->i_ffs_mode &= ~ALLPERMS;
ip->i_ffs_mode |= (mode & ALLPERMS);
ip->i_flag |= IN_CHANGE;
if ((vp->v_flag & VTEXT) && (ip->i_ffs_mode & S_ISTXT) == 0)
(void) uvm_vnp_uncache(vp);
return (0);
}

@ -1632,6 +1630,7 @@ ufs_strategy(v)
ip = VTOI(vp);
if (vp->v_type == VBLK || vp->v_type == VCHR)
panic("ufs_strategy: spec");
KASSERT(bp->b_bcount != 0);
if (bp->b_blkno == bp->b_lblkno) {
error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
NULL);

@ -1,4 +1,4 @@
/* $NetBSD: ufsmount.h,v 1.7 1998/03/18 15:57:29 bouyer Exp $ */
/* $NetBSD: ufsmount.h,v 1.8 2000/11/27 08:40:02 chs Exp $ */

/*
* Copyright (c) 1982, 1986, 1989, 1993
@ -82,6 +82,7 @@ struct ufsmount {
struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */
struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */
u_long um_nindir; /* indirect ptrs per block */
u_long um_lognindir; /* log2 of um_nindir */
u_long um_bptrtodb; /* indir ptr to disk block */
u_long um_seqinc; /* inc between seq blocks */
time_t um_btime[MAXQUOTAS]; /* block quota time limit */

@ -1,4 +1,4 @@
/* $NetBSD: uvm.h,v 1.23 2000/06/26 14:21:16 mrg Exp $ */
/* $NetBSD: uvm.h,v 1.24 2000/11/27 08:40:02 chs Exp $ */

/*
*
@ -76,6 +76,7 @@

struct uvm {
/* vm_page related parameters */

/* vm_page queues */
struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
struct pglist page_active; /* allocated pages, in use */
@ -86,10 +87,17 @@ struct uvm {
boolean_t page_init_done; /* TRUE if uvm_page_init() finished */
boolean_t page_idle_zero; /* TRUE if we should try to zero
pages in the idle loop */

/* page daemon trigger */
int pagedaemon; /* daemon sleeps on this */
struct proc *pagedaemon_proc; /* daemon's pid */
simple_lock_data_t pagedaemon_lock;

/* aiodone daemon trigger */
int aiodoned; /* daemon sleeps on this */
struct proc *aiodoned_proc; /* daemon's pid */
simple_lock_data_t aiodoned_lock;

/* page hash */
struct pglist *page_hash; /* page hash table (vp/off->page) */
int page_nhash; /* number of buckets */
@ -105,7 +113,7 @@ struct uvm {
simple_lock_data_t kentry_lock;

/* aio_done is locked by uvm.pagedaemon_lock and splbio! */
struct uvm_aiohead aio_done; /* done async i/o reqs */
TAILQ_HEAD(, buf) aio_done; /* done async i/o reqs */

/* pager VM area bounds */
vaddr_t pager_sva; /* start of pager VA area */
@ -145,6 +153,7 @@ extern struct uvm uvm;

UVMHIST_DECL(maphist);
UVMHIST_DECL(pdhist);
UVMHIST_DECL(ubchist);

/*
* UVM_UNLOCK_AND_WAIT: atomic unlock+wait... wrapper around the

549
sys/uvm/uvm_bio.c
Normal file
@ -0,0 +1,549 @@
/* $NetBSD: uvm_bio.c,v 1.2 2000/11/27 08:43:40 chs Exp $ */

/*
* Copyright (c) 1998 Chuck Silvers.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/

#include "opt_uvmhist.h"

/*
* uvm_bio.c: buffered i/o vnode mapping cache
*/


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>

#include <uvm/uvm.h>
#include <uvm/uvm_page.h>

/*
* global data structures
*/

/*
* local functions
*/

static int ubc_fault __P((struct uvm_faultinfo *, vaddr_t,
vm_page_t *, int, int, vm_fault_t, vm_prot_t,
int));
static struct ubc_map *ubc_find_mapping __P((struct uvm_object *, voff_t));

/*
* local data structures
*/

#define UBC_HASH(uobj, offset) (((((u_long)(uobj)) >> 8) + \
(((u_long)(offset)) >> PAGE_SHIFT)) & \
ubc_object.hashmask)

#define UBC_QUEUE(offset) (&ubc_object.inactive[((offset) / UBC_WINSIZE) & \
(UBC_NQUEUES - 1)])
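
/*
* Illustrative mapping arithmetic (example values, not from this
* file): assuming UBC_WINSIZE is 8192, file offset 0x12345 falls in
* the window whose umap offset is 0x12000, at slot offset 0x345, and
* that window is recycled through
* ubc_object.inactive[(0x12345 / 8192) & (UBC_NQUEUES - 1)].
*/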

struct ubc_map
{
struct uvm_object * uobj; /* mapped object */
voff_t offset; /* offset into uobj */
int refcount; /* refcount on mapping */
voff_t writeoff; /* overwrite offset */
vsize_t writelen; /* overwrite len */

LIST_ENTRY(ubc_map) hash; /* hash table */
TAILQ_ENTRY(ubc_map) inactive; /* inactive queue */
};

static struct ubc_object
{
struct uvm_object uobj; /* glue for uvm_map() */
char *kva; /* where ubc_object is mapped */
struct ubc_map *umap; /* array of ubc_map's */

LIST_HEAD(, ubc_map) *hash; /* hashtable for cached ubc_map's */
u_long hashmask; /* mask for hashtable */

TAILQ_HEAD(ubc_inactive_head, ubc_map) *inactive;
/* inactive queues for ubc_map's */

} ubc_object;

struct uvm_pagerops ubc_pager =
{
NULL, /* init */
NULL, /* reference */
NULL, /* detach */
ubc_fault, /* fault */
/* ... rest are NULL */
};

int ubc_nwins = UBC_NWINS;
int ubc_winsize = UBC_WINSIZE;
#ifdef PMAP_PREFER
int ubc_nqueues;
boolean_t ubc_release_unmap = FALSE;
#define UBC_NQUEUES ubc_nqueues
#define UBC_RELEASE_UNMAP ubc_release_unmap
#else
#define UBC_NQUEUES 1
#define UBC_RELEASE_UNMAP FALSE
#endif

/*
* ubc_init
*
* init pager private data structures.
*/

void
ubc_init(void)
{
struct ubc_map *umap;
vaddr_t va;
int i;

/*
* init ubc_object.
* alloc and init ubc_map's.
* init inactive queues.
* alloc and init hashtable.
* map in ubc_object.
*/

simple_lock_init(&ubc_object.uobj.vmobjlock);
ubc_object.uobj.pgops = &ubc_pager;
TAILQ_INIT(&ubc_object.uobj.memq);
ubc_object.uobj.uo_npages = 0;
ubc_object.uobj.uo_refs = UVM_OBJ_KERN;

ubc_object.umap = malloc(ubc_nwins * sizeof(struct ubc_map),
M_TEMP, M_NOWAIT);
bzero(ubc_object.umap, ubc_nwins * sizeof(struct ubc_map));

va = (vaddr_t)1L;
#ifdef PMAP_PREFER
PMAP_PREFER(0, &va);
if (va < UBC_WINSIZE) {
va = UBC_WINSIZE;
}
ubc_nqueues = va / UBC_WINSIZE;
if (ubc_nqueues != 1) {
ubc_release_unmap = TRUE;
}
#endif
ubc_object.inactive = malloc(UBC_NQUEUES *
sizeof(struct ubc_inactive_head),
M_TEMP, M_NOWAIT);
for (i = 0; i < UBC_NQUEUES; i++) {
TAILQ_INIT(&ubc_object.inactive[i]);
}
for (i = 0; i < ubc_nwins; i++) {
umap = &ubc_object.umap[i];
TAILQ_INSERT_TAIL(&ubc_object.inactive[i & (UBC_NQUEUES - 1)],
umap, inactive);
}

ubc_object.hash = hashinit(ubc_nwins, HASH_LIST, M_TEMP, M_NOWAIT,
&ubc_object.hashmask);
for (i = 0; i <= ubc_object.hashmask; i++) {
LIST_INIT(&ubc_object.hash[i]);
}

if (uvm_map(kernel_map, (vaddr_t *)&ubc_object.kva,
ubc_nwins * UBC_WINSIZE, &ubc_object.uobj, 0, (vsize_t)va,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_NOMERGE))
!= KERN_SUCCESS) {
panic("ubc_init: failed to map ubc_object\n");
}
UVMHIST_INIT(ubchist, 300);
}


/*
* ubc_fault: fault routine for ubc mapping
*/
static int
ubc_fault(ufi, ign1, ign2, ign3, ign4, fault_type, access_type, flags)
struct uvm_faultinfo *ufi;
vaddr_t ign1;
vm_page_t *ign2;
int ign3, ign4;
vm_fault_t fault_type;
vm_prot_t access_type;
int flags;
{
struct uvm_object *uobj;
struct vnode *vp;
struct ubc_map *umap;
vaddr_t va, eva, ubc_offset, slot_offset;
int i, rv, npages;
struct vm_page *pgs[UBC_WINSIZE >> PAGE_SHIFT], *pg;
UVMHIST_FUNC("ubc_fault"); UVMHIST_CALLED(ubchist);

/*
* no need to try with PGO_LOCKED...
* we don't need to have the map locked since we know that
* no one will mess with it until our reference is released.
*/
if (flags & PGO_LOCKED) {
#if 0
return VM_PAGER_UNLOCK;
#else
uvmfault_unlockall(ufi, NULL, &ubc_object.uobj, NULL);
flags &= ~PGO_LOCKED;
#endif
}

va = ufi->orig_rvaddr;
ubc_offset = va - (vaddr_t)ubc_object.kva;

UVMHIST_LOG(ubchist, "va 0x%lx ubc_offset 0x%lx at %d",
va, ubc_offset, access_type,0);

umap = &ubc_object.umap[ubc_offset / UBC_WINSIZE];
KASSERT(umap->refcount != 0);
slot_offset = trunc_page(ubc_offset & (UBC_WINSIZE - 1));

/* no umap locking needed since we have a ref on the umap */
uobj = umap->uobj;
vp = (struct vnode *)uobj;
KASSERT(uobj != NULL);

npages = (UBC_WINSIZE - slot_offset) >> PAGE_SHIFT;

/*
* XXXUBC
* if npages is more than 1 we have to be sure that
* we set PGO_OVERWRITE correctly.
*/
if (access_type == VM_PROT_WRITE) {
npages = 1;
}

again:
memset(pgs, 0, sizeof (pgs));
simple_lock(&uobj->vmobjlock);

UVMHIST_LOG(ubchist, "slot_offset 0x%x writeoff 0x%x writelen 0x%x "
"u_size 0x%x", slot_offset, umap->writeoff, umap->writelen,
vp->v_uvm.u_size);

if (access_type & VM_PROT_WRITE &&
slot_offset >= umap->writeoff &&
(slot_offset + PAGE_SIZE <= umap->writeoff + umap->writelen ||
slot_offset + PAGE_SIZE >= vp->v_uvm.u_size - umap->offset)) {
UVMHIST_LOG(ubchist, "setting PGO_OVERWRITE", 0,0,0,0);
flags |= PGO_OVERWRITE;
}
else { UVMHIST_LOG(ubchist, "NOT setting PGO_OVERWRITE", 0,0,0,0); }
/* XXX be sure to zero any part of the page past EOF */

/*
* XXX
* ideally we'd like to pre-fault all of the pages we're overwriting.
* so for PGO_OVERWRITE, we should call VOP_GETPAGES() with all of the
* pages in [writeoff, writeoff+writesize] instead of just the one.
*/

UVMHIST_LOG(ubchist, "getpages vp %p offset 0x%x npages %d",
uobj, umap->offset + slot_offset, npages, 0);

rv = VOP_GETPAGES(vp, umap->offset + slot_offset, pgs, &npages, 0,
access_type, 0, flags);
UVMHIST_LOG(ubchist, "getpages rv %d npages %d", rv, npages,0,0);

switch (rv) {
case VM_PAGER_OK:
break;

case VM_PAGER_AGAIN:
tsleep(&lbolt, PVM, "ubc_fault", 0);
goto again;

default:
return rv;
}

if (npages == 0) {
return VM_PAGER_OK;
}

va = ufi->orig_rvaddr;
eva = ufi->orig_rvaddr + (npages << PAGE_SHIFT);

UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0,0);
simple_lock(&uobj->vmobjlock);
for (i = 0; va < eva; i++, va += PAGE_SIZE) {
UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0);
pg = pgs[i];

if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
if (pg->flags & PG_WANTED) {
wakeup(pg);
}
KASSERT((pg->flags & PG_FAKE) == 0);
if (pg->flags & PG_RELEASED) {
rv = uobj->pgops->pgo_releasepg(pg, NULL);
KASSERT(rv);
continue;
}
KASSERT(access_type == VM_PROT_READ ||
(pg->flags & PG_RDONLY) == 0);

uvm_lock_pageq();
uvm_pageactivate(pg);
uvm_unlock_pageq();

pmap_enter(ufi->orig_map->pmap, va, VM_PAGE_TO_PHYS(pg),
VM_PROT_ALL, access_type);

pg->flags &= ~(PG_BUSY);
UVM_PAGE_OWN(pg, NULL);
}
simple_unlock(&uobj->vmobjlock);
return VM_PAGER_OK;
}

/*
* local functions
*/

static struct ubc_map *
ubc_find_mapping(uobj, offset)
struct uvm_object *uobj;
voff_t offset;
{
struct ubc_map *umap;

LIST_FOREACH(umap, &ubc_object.hash[UBC_HASH(uobj, offset)], hash) {
if (umap->uobj == uobj && umap->offset == offset) {
return umap;
}
}
return NULL;
}


/*
* ubc interface functions
*/

/*
* ubc_alloc: allocate a buffer mapping
*/
void *
ubc_alloc(uobj, offset, lenp, flags)
struct uvm_object *uobj;
voff_t offset;
vsize_t *lenp;
int flags;
{
int s;
vaddr_t umap_offset, slot_offset, va;
struct ubc_map *umap;
UVMHIST_FUNC("ubc_alloc"); UVMHIST_CALLED(ubchist);

UVMHIST_LOG(ubchist, "uobj %p offset 0x%lx len 0x%lx filesize 0x%x",
uobj, offset, *lenp, ((struct uvm_vnode *)uobj)->u_size);

umap_offset = (vaddr_t)(offset & ~((voff_t)UBC_WINSIZE - 1));
slot_offset = (vaddr_t)(offset & ((voff_t)UBC_WINSIZE - 1));
*lenp = min(*lenp, UBC_WINSIZE - slot_offset);

/*
* the vnode is always locked here, so we don't need to add a ref.
*/

s = splbio();

again:
simple_lock(&ubc_object.uobj.vmobjlock);
umap = ubc_find_mapping(uobj, umap_offset);
if (umap == NULL) {
umap = TAILQ_FIRST(UBC_QUEUE(offset));
if (umap == NULL) {
simple_unlock(&ubc_object.uobj.vmobjlock);
tsleep(&lbolt, PVM, "ubc_alloc", 0);
goto again;
}

/*
* remove from old hash (if any),
* add to new hash.
*/

if (umap->uobj != NULL) {
LIST_REMOVE(umap, hash);
}

umap->uobj = uobj;
umap->offset = umap_offset;

LIST_INSERT_HEAD(&ubc_object.hash[UBC_HASH(uobj, umap_offset)],
umap, hash);

va = (vaddr_t)(ubc_object.kva +
(umap - ubc_object.umap) * UBC_WINSIZE);
pmap_remove(pmap_kernel(), va, va + UBC_WINSIZE);
}

if (umap->refcount == 0) {
TAILQ_REMOVE(UBC_QUEUE(offset), umap, inactive);
}

#ifdef DIAGNOSTIC
if ((flags & UBC_WRITE) &&
(umap->writeoff || umap->writelen)) {
panic("ubc_fault: concurrent writes vp %p", uobj);
}
#endif
if (flags & UBC_WRITE) {
umap->writeoff = slot_offset;
umap->writelen = *lenp;
}

umap->refcount++;
simple_unlock(&ubc_object.uobj.vmobjlock);
splx(s);
UVMHIST_LOG(ubchist, "umap %p refs %d va %p",
umap, umap->refcount,
ubc_object.kva + (umap - ubc_object.umap) * UBC_WINSIZE,0);

return ubc_object.kva +
(umap - ubc_object.umap) * UBC_WINSIZE + slot_offset;
}
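
/*
* Illustrative caller sketch (assumed usage, mirroring the
* ufs_readwrite.c READ loop earlier in this change): ubc_alloc()
* trims "bytelen" to what is left of the window, so callers loop
* until the uio is drained.  "uobj", "uio" and "eof" stand in for
* the caller's state.
*/
#if 0 /* example only */
	while (uio->uio_resid > 0) {
		vsize_t bytelen = min(eof - uio->uio_offset, uio->uio_resid);
		void *win;

		if (bytelen == 0)
			break;
		win = ubc_alloc(uobj, uio->uio_offset, &bytelen, UBC_READ);
		error = uiomove(win, bytelen, uio);
		ubc_release(win, 0);
		if (error)
			break;
	}
#endif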

void
ubc_release(va, wlen)
void *va;
vsize_t wlen;
{
struct ubc_map *umap;
struct uvm_object *uobj;
int s;
UVMHIST_FUNC("ubc_release"); UVMHIST_CALLED(ubchist);

UVMHIST_LOG(ubchist, "va %p", va,0,0,0);

s = splbio();
simple_lock(&ubc_object.uobj.vmobjlock);

umap = &ubc_object.umap[((char *)va - ubc_object.kva) / UBC_WINSIZE];
uobj = umap->uobj;
KASSERT(uobj != NULL);

umap->writeoff = 0;
umap->writelen = 0;
umap->refcount--;
if (umap->refcount == 0) {
if (UBC_RELEASE_UNMAP &&
(((struct vnode *)uobj)->v_flag & VTEXT)) {
vaddr_t va;

/*
* if this file is the executable image of
* some process, that process will likely have
* the file mapped at an alignment other than
* what PMAP_PREFER() would like. we'd like
* to have process text be able to use the
* cache even if someone is also reading the
* file, so invalidate mappings of such files
* as soon as possible.
*/

va = (vaddr_t)(ubc_object.kva +
(umap - ubc_object.umap) * UBC_WINSIZE);
pmap_remove(pmap_kernel(), va, va + UBC_WINSIZE);
LIST_REMOVE(umap, hash);
umap->uobj = NULL;
TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap,
inactive);
} else {
TAILQ_INSERT_TAIL(UBC_QUEUE(umap->offset), umap,
inactive);
}
}
UVMHIST_LOG(ubchist, "umap %p refs %d", umap, umap->refcount,0,0);
simple_unlock(&ubc_object.uobj.vmobjlock);
splx(s);
}


/*
* removing a range of mappings from the ubc mapping cache.
*/

void
ubc_flush(uobj, start, end)
struct uvm_object *uobj;
voff_t start, end;
{
struct ubc_map *umap;
vaddr_t va;
int s;
UVMHIST_FUNC("ubc_flush"); UVMHIST_CALLED(ubchist);

UVMHIST_LOG(ubchist, "uobj %p start 0x%lx end 0x%lx",
uobj, start, end,0);

s = splbio();
simple_lock(&ubc_object.uobj.vmobjlock);
for (umap = ubc_object.umap;
umap < &ubc_object.umap[ubc_nwins];
umap++) {

if (umap->uobj != uobj ||
umap->offset < start ||
(umap->offset >= end && end != 0) ||
umap->refcount > 0) {
continue;
}

/*
* remove from hash,
* move to head of inactive queue.
*/

va = (vaddr_t)(ubc_object.kva +
(umap - ubc_object.umap) * UBC_WINSIZE);
pmap_remove(pmap_kernel(), va, va + UBC_WINSIZE);

LIST_REMOVE(umap, hash);
umap->uobj = NULL;
TAILQ_REMOVE(UBC_QUEUE(umap->offset), umap, inactive);
TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap, inactive);
}
simple_unlock(&ubc_object.uobj.vmobjlock);
splx(s);
}
@ -1,4 +1,4 @@
/* $NetBSD: uvm_extern.h,v 1.52 2000/11/27 04:36:40 nisimura Exp $ */
/* $NetBSD: uvm_extern.h,v 1.53 2000/11/27 08:40:03 chs Exp $ */

/*
*
@ -192,6 +192,21 @@ typedef struct vm_page *vm_page_t;
#define UVM_PGA_USERESERVE 0x0001 /* ok to use reserve pages */
#define UVM_PGA_ZERO 0x0002 /* returned page must be zero'd */

/*
* the following defines are for ubc_alloc's flags
*/
#define UBC_READ 0
#define UBC_WRITE 1

/*
* flags for uvn_findpages().
*/
#define UFP_ALL 0x0
#define UFP_NOWAIT 0x1
#define UFP_NOALLOC 0x2
#define UFP_NOCACHE 0x4
#define UFP_NORDONLY 0x8

/*
* lockflags that control the locking behavior of various functions.
*/
@ -213,8 +228,11 @@ struct vm_anon;
struct vmspace;
struct pmap;
struct vnode;
struct pool;
struct simplelock;

extern struct pool *uvm_aiobuf_pool;

/*
* uvmexp: global data structures that are exported to parts of the kernel
* other than the vm system.
@ -414,9 +432,16 @@ void uao_detach_locked __P((struct uvm_object *));
void uao_reference __P((struct uvm_object *));
void uao_reference_locked __P((struct uvm_object *));

/* uvm_bio.c */
void ubc_init __P((void));
void * ubc_alloc __P((struct uvm_object *, voff_t, vsize_t *,
int));
void ubc_release __P((void *, vsize_t));
void ubc_flush __P((struct uvm_object *, voff_t, voff_t));

/* uvm_fault.c */
int uvm_fault __P((vm_map_t, vaddr_t,
vm_fault_t, vm_prot_t));
int uvm_fault __P((vm_map_t, vaddr_t, vm_fault_t,
vm_prot_t));
/* handle a page fault */

/* uvm_glue.c */
@ -511,8 +536,14 @@ void uvm_page_physload __P((paddr_t, paddr_t,
paddr_t, paddr_t, int));
void uvm_setpagesize __P((void));

/* uvm_pager.c */
void uvm_aio_biodone1 __P((struct buf *));
void uvm_aio_biodone __P((struct buf *));
void uvm_aio_aiodone __P((struct buf *));

/* uvm_pdaemon.c */
void uvm_pageout __P((void *));
void uvm_aiodone_daemon __P((void *));

/* uvm_pglist.c */
int uvm_pglistalloc __P((psize_t, paddr_t,
@ -538,10 +569,11 @@ int uvm_deallocate __P((vm_map_t, vaddr_t, vsize_t));
/* uvm_vnode.c */
void uvm_vnp_setsize __P((struct vnode *, voff_t));
void uvm_vnp_sync __P((struct mount *));
void uvm_vnp_terminate __P((struct vnode *));
/* terminate a uvm/uvn object */
boolean_t uvm_vnp_uncache __P((struct vnode *));
struct uvm_object *uvn_attach __P((void *, vm_prot_t));
void uvn_findpages __P((struct uvm_object *, voff_t,
int *, struct vm_page **, int));
void uvm_vnp_zerorange __P((struct vnode *, off_t, size_t));
void uvm_vnp_asyncget __P((struct vnode *, off_t, size_t));

/* kern_malloc.c */
void kmeminit_nkmempages __P((void));

@ -1,4 +1,4 @@
/* $NetBSD: uvm_fault.c,v 1.51 2000/08/06 00:22:53 thorpej Exp $ */
/* $NetBSD: uvm_fault.c,v 1.52 2000/11/27 08:40:03 chs Exp $ */

/*
*
@ -458,12 +458,8 @@ uvmfault_anonget(ufi, amap, anon)
}

if (result != VM_PAGER_OK) {
#ifdef DIAGNOSTIC
if (result == VM_PAGER_PEND) {
panic("uvmfault_anonget: "
"got PENDING for non-async I/O");
}
#endif
KASSERT(result != VM_PAGER_PEND);

/* remove page from anon */
anon->u.an_page = NULL;

@ -569,7 +565,7 @@ uvm_fault(orig_map, vaddr, fault_type, access_type)
vm_prot_t enter_prot;
boolean_t wired, narrow, promote, locked, shadowed;
int npages, nback, nforw, centeridx, result, lcv, gotpages;
vaddr_t startva, objaddr, currva, offset;
vaddr_t startva, objaddr, currva, offset, uoff;
paddr_t pa;
struct vm_amap *amap;
struct uvm_object *uobj;
@ -580,7 +576,8 @@ uvm_fault(orig_map, vaddr, fault_type, access_type)
UVMHIST_LOG(maphist, "(map=0x%x, vaddr=0x%x, ft=%d, at=%d)",
orig_map, vaddr, fault_type, access_type);

anon = NULL; /* XXX: shut up gcc */
anon = NULL;
pg = NULL;

uvmexp.faults++; /* XXX: locking? */

@ -717,10 +714,8 @@ ReFault:
if (narrow == FALSE) {

/* wide fault (!narrow) */
#ifdef DIAGNOSTIC
if (uvmadvice[ufi.entry->advice].advice != ufi.entry->advice)
panic("fault: advice mismatch!");
#endif
KASSERT(uvmadvice[ufi.entry->advice].advice ==
ufi.entry->advice);
nback = min(uvmadvice[ufi.entry->advice].nback,
(ufi.orig_rvaddr - ufi.entry->start) >> PAGE_SHIFT);
startva = ufi.orig_rvaddr - (nback << PAGE_SHIFT);
@ -793,7 +788,7 @@ ReFault:
/* now forget about the backpages */
if (amap)
anons += nback;
startva = startva + (nback << PAGE_SHIFT);
startva += (nback << PAGE_SHIFT);
npages -= nback;
nback = centeridx = 0;
}
@ -814,12 +809,10 @@ ReFault:
* don't play with VAs that are already mapped
* except for center)
*/
if (lcv != centeridx) {
if (pmap_extract(ufi.orig_map->pmap, currva, &pa) ==
TRUE) {
pages[lcv] = PGO_DONTCARE;
continue;
}
if (lcv != centeridx &&
pmap_extract(ufi.orig_map->pmap, currva, &pa)) {
pages[lcv] = PGO_DONTCARE;
continue;
}

/*
@ -851,11 +844,13 @@ ReFault:
" MAPPING: n anon: pm=0x%x, va=0x%x, pg=0x%x",
ufi.orig_map->pmap, currva, anon->u.an_page, 0);
uvmexp.fltnamap++;

/*
* Since this isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical
* that we enter these right now.
*/

(void) pmap_enter(ufi.orig_map->pmap, currva,
VM_PAGE_TO_PHYS(anon->u.an_page),
(anon->an_ref > 1) ? (enter_prot & ~VM_PROT_WRITE) :
@ -888,13 +883,13 @@ ReFault:
*/

if (uobj && shadowed == FALSE && uobj->pgops->pgo_fault != NULL) {

simple_lock(&uobj->vmobjlock);

/* locked: maps(read), amap (if there), uobj */
result = uobj->pgops->pgo_fault(&ufi, startva, pages, npages,
centeridx, fault_type, access_type,
PGO_LOCKED);
PGO_LOCKED|PGO_SYNCIO);

/* locked: nothing, pgo_fault has unlocked everything */

if (result == VM_PAGER_OK)
@ -925,7 +920,7 @@ ReFault:

uvmexp.fltlget++;
gotpages = npages;
result = uobj->pgops->pgo_get(uobj, ufi.entry->offset +
(void) uobj->pgops->pgo_get(uobj, ufi.entry->offset +
(startva - ufi.entry->start),
pages, &gotpages, centeridx,
access_type & MASK(ufi.entry),
@ -946,29 +941,22 @@ ReFault:
pages[lcv] == PGO_DONTCARE)
continue;

#ifdef DIAGNOSTIC
/*
* pager sanity check: pgo_get with
* PGO_LOCKED should never return a
* released page to us.
*/
if (pages[lcv]->flags & PG_RELEASED)
panic("uvm_fault: pgo_get PGO_LOCKED gave us a RELEASED page");
#endif
KASSERT((pages[lcv]->flags & PG_RELEASED) == 0);

/*
* if center page is resident and not
* PG_BUSY|PG_RELEASED then pgo_get
* made it PG_BUSY for us and gave
* us a handle to it. remember this
* page as "uobjpage." (for later use).
*/

if (lcv == centeridx) {
uobjpage = pages[lcv];
UVMHIST_LOG(maphist, " got uobjpage (0x%x) with locked get",
/*
* if center page is resident and not
* PG_BUSY|PG_RELEASED then pgo_get
* made it PG_BUSY for us and gave
* us a handle to it. remember this
* page as "uobjpage." (for later use).
*/

if (lcv == centeridx) {
uobjpage = pages[lcv];
UVMHIST_LOG(maphist, " got uobjpage "
"(0x%x) with locked get",
uobjpage, 0,0,0);
continue;
continue;
}

/*
@ -987,15 +975,18 @@ ReFault:
" MAPPING: n obj: pm=0x%x, va=0x%x, pg=0x%x",
ufi.orig_map->pmap, currva, pages[lcv], 0);
uvmexp.fltnomap++;

/*
* Since this page isn't the page that's
* actually faulting, ignore pmap_enter()
* failures; it's not critical that we
* enter these right now.
*/

(void) pmap_enter(ufi.orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pages[lcv]),
enter_prot & MASK(ufi.entry),
pages[lcv]->flags & PG_RDONLY ?
VM_PROT_READ : enter_prot & MASK(ufi.entry),
PMAP_CANFAIL |
(wired ? PMAP_WIRED : 0));

@ -1004,18 +995,14 @@ ReFault:
* because we've held the lock the whole time
* we've had the handle.
*/

pages[lcv]->flags &= ~(PG_BUSY); /* un-busy! */
UVM_PAGE_OWN(pages[lcv], NULL);

/* done! */
} /* for "lcv" loop */
} /* "gotpages" != 0 */

/* note: object still _locked_ */
} else {

uobjpage = NULL;

}

/* locked (shadowed): maps(read), amap */
@ -1078,13 +1065,9 @@ ReFault:
case VM_PAGER_REFAULT:
goto ReFault;

case VM_PAGER_ERROR:
/*
* An error occurred while trying to bring in the
* page -- this is the only error we return right
* now.
*/
return (KERN_PROTECTION_FAILURE); /* XXX */
case VM_PAGER_AGAIN:
tsleep(&lbolt, PVM, "fltagain1", 0);
goto ReFault;

default:
#ifdef DIAGNOSTIC
@ -1105,6 +1088,7 @@ ReFault:
/*
* special handling for loaned pages
*/

if (anon->u.an_page->loan_count) {

if ((access_type & VM_PROT_WRITE) == 0) {
@ -1198,21 +1182,13 @@ ReFault:
anon = uvm_analloc();
if (anon)
pg = uvm_pagealloc(NULL, 0, anon, 0);
#ifdef __GNUC__
else
pg = NULL; /* XXX: gcc */
#endif

/* check for out of RAM */
if (anon == NULL || pg == NULL) {
if (anon)
uvm_anfree(anon);
uvmfault_unlockall(&ufi, amap, uobj, oanon);
#ifdef DIAGNOSTIC
if (uvmexp.swpgonly > uvmexp.swpages) {
panic("uvmexp.swpgonly botch");
}
#endif
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
@ -1243,7 +1219,7 @@ ReFault:
*/

} else {

uvmexp.flt_anon++;
oanon = anon; /* old, locked anon is same as anon */
pg = anon->u.an_page;
@ -1252,7 +1228,7 @@ ReFault:

}

/* locked: maps(read), amap, anon */
/* locked: maps(read), amap, oanon */

/*
* now map the page in ...
@ -1274,10 +1250,7 @@ ReFault:
* as the map may change while we're asleep.
*/
uvmfault_unlockall(&ufi, amap, uobj, oanon);
#ifdef DIAGNOSTIC
if (uvmexp.swpgonly > uvmexp.swpages)
panic("uvmexp.swpgonly botch");
#endif
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
if (uvmexp.swpgonly == uvmexp.swpages) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
@ -1343,7 +1316,7 @@ Case2:
uobjpage = PGO_DONTCARE;
promote = TRUE; /* always need anon here */
} else {
/* assert(uobjpage != PGO_DONTCARE) */
KASSERT(uobjpage != PGO_DONTCARE);
promote = (access_type & VM_PROT_WRITE) &&
UVM_ET_ISCOPYONWRITE(ufi.entry);
}
@ -1372,24 +1345,19 @@ Case2:

uvmexp.fltget++;
gotpages = 1;
result = uobj->pgops->pgo_get(uobj,
(ufi.orig_rvaddr - ufi.entry->start) + ufi.entry->offset,
&uobjpage, &gotpages, 0,
access_type & MASK(ufi.entry),
ufi.entry->advice, 0);
uoff = (ufi.orig_rvaddr - ufi.entry->start) + ufi.entry->offset;
result = uobj->pgops->pgo_get(uobj, uoff, &uobjpage, &gotpages,
0, access_type & MASK(ufi.entry), ufi.entry->advice,
PGO_SYNCIO);

/* locked: uobjpage(if result OK) */

/*
* recover from I/O
*/

if (result != VM_PAGER_OK) {
#ifdef DIAGNOSTIC
if (result == VM_PAGER_PEND)
panic("uvm_fault: pgo_get got PENDing "
"on non-async I/O");
#endif
KASSERT(result != VM_PAGER_PEND);

if (result == VM_PAGER_AGAIN) {
UVMHIST_LOG(maphist,
@ -1448,11 +1416,8 @@ Case2:

if (uobjpage->flags & PG_RELEASED) {
uvmexp.fltpgrele++;
#ifdef DIAGNOSTIC
if (uobj->pgops->pgo_releasepg == NULL)
panic("uvm_fault: object has no "
"releasepg function");
#endif
KASSERT(uobj->pgops->pgo_releasepg != NULL);

/* frees page */
if (uobj->pgops->pgo_releasepg(uobjpage,NULL))
/* unlock if still alive */
@ -1479,7 +1444,6 @@ Case2:
*/

/* locked: maps(read), amap(if !null), uobj, uobjpage */

}

/*
@ -1616,10 +1580,6 @@ Case2:
pg = uvm_pagealloc(NULL, 0, anon,
(uobjpage == PGO_DONTCARE) ? UVM_PGA_ZERO : 0);
}
#ifdef __GNUC__
else
pg = NULL; /* XXX: gcc */
#endif

/*
* out of memory resources?
@ -1635,21 +1595,15 @@ Case2:
wakeup(uobjpage);

uvm_lock_pageq();
/* make sure it is in queues */
uvm_pageactivate(uobjpage);
uvm_unlock_pageq();
/* un-busy! (still locked) */
uobjpage->flags &= ~(PG_BUSY|PG_WANTED);
UVM_PAGE_OWN(uobjpage, NULL);
}

/* unlock and fail ... */
uvmfault_unlockall(&ufi, amap, uobj, NULL);
#ifdef DIAGNOSTIC
if (uvmexp.swpgonly > uvmexp.swpages) {
panic("uvmexp.swpgonly botch");
}
#endif
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
if (anon == NULL || uvmexp.swpgonly == uvmexp.swpages) {
UVMHIST_LOG(maphist, " promote: out of VM",
0,0,0,0);
@ -1659,6 +1613,7 @@ Case2:

UVMHIST_LOG(maphist, " out of RAM, waiting for more",
0,0,0,0);
anon->an_ref--;
uvm_anfree(anon);
uvmexp.fltnoram++;
uvm_wait("flt_noram5");
@ -1684,8 +1639,8 @@ Case2:

/*
* dispose of uobjpage. it can't be PG_RELEASED
* since we still hold the object lock. drop
* handle to uobj as well.
* since we still hold the object lock.
* drop handle to uobj as well.
*/

if (uobjpage->flags & PG_WANTED)
@ -1694,10 +1649,11 @@ Case2:
uobjpage->flags &= ~(PG_BUSY|PG_WANTED);
UVM_PAGE_OWN(uobjpage, NULL);
uvm_lock_pageq();
uvm_pageactivate(uobjpage); /* put it back */
uvm_pageactivate(uobjpage);
uvm_unlock_pageq();
simple_unlock(&uobj->vmobjlock);
uobj = NULL;

UVMHIST_LOG(maphist,
" promote uobjpage 0x%x to anon/page 0x%x/0x%x",
uobjpage, anon, pg, 0);
@ -1732,9 +1688,12 @@ Case2:
UVMHIST_LOG(maphist,
" MAPPING: case2: pm=0x%x, va=0x%x, pg=0x%x, promote=%d",
ufi.orig_map->pmap, ufi.orig_rvaddr, pg, promote);
KASSERT(access_type == VM_PROT_READ || (pg->flags & PG_RDONLY) == 0);
if (pmap_enter(ufi.orig_map->pmap, ufi.orig_rvaddr, VM_PAGE_TO_PHYS(pg),
enter_prot, access_type | PMAP_CANFAIL | (wired ? PMAP_WIRED : 0))
pg->flags & PG_RDONLY ? VM_PROT_READ : enter_prot,
access_type | PMAP_CANFAIL | (wired ? PMAP_WIRED : 0))
!= KERN_SUCCESS) {

/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
@ -1742,6 +1701,7 @@ Case2:
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/

if (pg->flags & PG_WANTED)
wakeup(pg); /* lock still held */

@ -1753,10 +1713,7 @@ Case2:
pg->flags &= ~(PG_BUSY|PG_FAKE|PG_WANTED);
UVM_PAGE_OWN(pg, NULL);
uvmfault_unlockall(&ufi, amap, uobj, NULL);
#ifdef DIAGNOSTIC
if (uvmexp.swpgonly > uvmexp.swpages)
panic("uvmexp.swpgonly botch");
#endif
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
if (uvmexp.swpgonly == uvmexp.swpages) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
@ -1788,7 +1745,6 @@ Case2:
/* activate it */
uvm_pageactivate(pg);
}

uvm_unlock_pageq();

if (pg->flags & PG_WANTED)
@ -1880,10 +1836,7 @@ uvm_fault_unwire_locked(map, start, end)
paddr_t pa;
struct vm_page *pg;

#ifdef DIAGNOSTIC
if (map->flags & VM_MAP_INTRSAFE)
panic("uvm_fault_unwire_locked: intrsafe map");
#endif
KASSERT((map->flags & VM_MAP_INTRSAFE) == 0);

/*
* we assume that the area we are unwiring has actually been wired
|
@ -1,4 +1,4 @@
/* $NetBSD: uvm_map.c,v 1.85 2000/11/25 06:27:59 chs Exp $ */
/* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */

/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -3267,16 +3267,16 @@ uvm_object_printit(uobj, full, pr)
}
}

const char page_flagbits[] =
"\20\4CLEAN\5BUSY\6WANTED\7TABLED\12FAKE\13FILLED\14DIRTY\15RELEASED"
"\16FAULTING\17CLEANCHK";
const char page_pqflagbits[] =
"\20\1FREE\2INACTIVE\3ACTIVE\4LAUNDRY\5ANON\6AOBJ";

/*
* uvm_page_printit: actually print the page
*/

static const char page_flagbits[] =
"\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY"
"\11ZERO\15PAGER1";
static const char page_pqflagbits[] =
"\20\1FREE\2INACTIVE\3ACTIVE\4LAUNDRY\5ANON\6AOBJ";

void
uvm_page_printit(pg, full, pr)
struct vm_page *pg;
@ -3294,8 +3294,8 @@ uvm_page_printit(pg, full, pr)
bitmask_snprintf(pg->pqflags, page_pqflagbits, pqbuf, sizeof(pqbuf));
(*pr)(" flags=%s, pqflags=%s, vers=%d, wire_count=%d, pa=0x%lx\n",
pgbuf, pqbuf, pg->version, pg->wire_count, (long)pg->phys_addr);
(*pr)(" uobject=%p, uanon=%p, offset=0x%lx loan_count=%d\n",
pg->uobject, pg->uanon, pg->offset, pg->loan_count);
(*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n",
pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count);
#if defined(UVM_PAGE_TRKOWN)
if (pg->flags & PG_BUSY)
(*pr)(" owning process = %d, tag=%s\n",
|
@ -1,4 +1,4 @@
/* $NetBSD: uvm_map_i.h,v 1.17 2000/05/08 22:59:35 thorpej Exp $ */
/* $NetBSD: uvm_map_i.h,v 1.18 2000/11/27 08:40:04 chs Exp $ */

/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -69,8 +69,6 @@
#ifndef _UVM_UVM_MAP_I_H_
#define _UVM_UVM_MAP_I_H_

#include "opt_uvmhist.h"

/*
* uvm_map_i.h
*/
@ -197,16 +195,6 @@ MAP_INLINE void
uvm_map_reference(map)
vm_map_t map;
{
if (__predict_false(map == NULL)) {
#ifdef DIAGNOSTIC
printf("uvm_map_reference: reference to NULL map\n");
#ifdef DDB
Debugger();
#endif
#endif
return;
}

simple_lock(&map->ref_lock);
map->ref_count++;
simple_unlock(&map->ref_lock);
@ -225,20 +213,9 @@ uvm_map_deallocate(map)
{
int c;

if (__predict_false(map == NULL)) {
#ifdef DIAGNOSTIC
printf("uvm_map_deallocate: reference to NULL map\n");
#ifdef DDB
Debugger();
#endif
#endif
return;
}

simple_lock(&map->ref_lock);
c = --map->ref_count;
simple_unlock(&map->ref_lock);

if (c > 0) {
return;
}
@ -249,7 +226,6 @@ uvm_map_deallocate(map)

uvm_unmap(map, map->min_offset, map->max_offset);
pmap_destroy(map->pmap);

FREE(map, M_VMMAP);
}

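For reference, the reference-counting discipline these two functions implement; this pairing is a sketch of the intended caller pattern, not code from the commit:

	/* take a reference for the new consumer of the map */
	uvm_map_reference(map);
	/* ... use the map ... */
	/* drop the reference; the map is torn down on the last release */
	uvm_map_deallocate(map);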
|
@ -1,4 +1,4 @@
/* $NetBSD: uvm_mmap.c,v 1.45 2000/11/24 23:30:01 soren Exp $ */
/* $NetBSD: uvm_mmap.c,v 1.46 2000/11/27 08:40:04 chs Exp $ */

/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -262,26 +262,6 @@ sys_mincore(p, v, retval)
return (error);
}

#if 0
/*
* munmapfd: unmap file descriptor
*
* XXX: is this actually a useful function? could it be useful?
*/

void
munmapfd(p, fd)
struct proc *p;
int fd;
{

/*
* XXX should vm_deallocate any regions mapped to this file
*/
p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
}
#endif

/*
* sys_mmap: mmap system call.
*
@ -375,7 +355,9 @@ sys_mmap(p, v, retval)
* not fixed: make sure we skip over the largest possible heap.
* we will refine our guess later (e.g. to account for VAC, etc)
*/
if (addr < round_page((vaddr_t)p->p_vmspace->vm_daddr+MAXDSIZ))

if (addr < round_page((vaddr_t)p->p_vmspace->vm_daddr +
MAXDSIZ))
addr = round_page((vaddr_t)p->p_vmspace->vm_daddr +
MAXDSIZ);
}
@ -1157,36 +1139,8 @@ uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ?
maxprot : (maxprot & ~VM_PROT_WRITE));

/*
* XXXCDC: hack from old code
* don't allow vnodes which have been mapped
* shared-writeable to persist [forces them to be
* flushed out when last reference goes].
* XXXCDC: interesting side effect: avoids a bug.
* note that in WRITE [ufs_readwrite.c] that we
* allocate buffer, uncache, and then do the write.
* the problem with this is that if the uncache causes
* VM data to be flushed to the same area of the file
* we are writing to... in that case we've got the
* buffer locked and our process goes to sleep forever.
*
* XXXCDC: checking maxprot protects us from the
* "persistbug" program but this is not a long term
* solution.
*
* XXXCDC: we don't bother calling uncache with the vp
* VOP_LOCKed since we know that we are already
* holding a valid reference to the uvn (from the
* uvn_attach above), and thus it is impossible for
* the uncache to kill the uvn and trigger I/O.
*/
if (flags & MAP_SHARED) {
if ((prot & VM_PROT_WRITE) ||
(maxprot & VM_PROT_WRITE)) {
uvm_vnp_uncache(vp);
}
}

/* XXX for now, attach doesn't gain a ref */
VREF(vp);
} else {
uobj = udv_attach((void *) &vp->v_rdev,
(flags & MAP_SHARED) ?
|
@ -1,4 +1,4 @@
/* $NetBSD: uvm_page.c,v 1.43 2000/11/09 19:15:28 christos Exp $ */
/* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */

/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -70,10 +70,13 @@
* uvm_page.c: page ops.
*/

#include "opt_uvmhist.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/sched.h>
#include <sys/kernel.h>

#define UVM_PAGE /* pull in uvm_page.h functions */
#include <uvm/uvm.h>
@ -94,8 +97,15 @@ int vm_nphysseg = 0; /* XXXCDC: uvm.nphysseg */
* of the things necessary to do idle page zero'ing efficiently.
* We therefore provide a way to disable it from machdep code here.
*/
/*
* XXX disabled until we can find a way to do this without causing
* problems for either cpu caches or DMA latency.
*/
boolean_t vm_page_zero_enable = FALSE;

boolean_t vm_page_zero_enable = TRUE;
u_long uvm_pgcnt_anon;
u_long uvm_pgcnt_vnode;
extern struct uvm_pagerops uvm_vnodeops;

/*
* local variables
@ -123,7 +133,7 @@ static struct pglist uvm_bootbucket;
*/

static void uvm_pageinsert __P((struct vm_page *));

static void uvm_pageremove __P((struct vm_page *));

/*
* inline functions
@ -160,7 +170,6 @@ uvm_pageinsert(pg)
TAILQ_INSERT_TAIL(&pg->uobject->memq, pg, listq); /* put in object */
pg->flags |= PG_TABLED;
pg->uobject->uo_npages++;

}

/*
@ -170,21 +179,14 @@ uvm_pageinsert(pg)
* => caller must lock page queues
*/

void __inline
static __inline void
uvm_pageremove(pg)
struct vm_page *pg;
{
struct pglist *buck;
int s;

#ifdef DIAGNOSTIC
if ((pg->flags & (PG_FAULTING)) != 0)
panic("uvm_pageremove: page is faulting");
#endif

if ((pg->flags & PG_TABLED) == 0)
return; /* XXX: log */

KASSERT(pg->flags & PG_TABLED);
buck = &uvm.page_hash[uvm_pagehash(pg->uobject,pg->offset)];
s = splimp();
simple_lock(&uvm.hashlock);
@ -192,6 +194,10 @@ uvm_pageremove(pg)
simple_unlock(&uvm.hashlock);
splx(s);

if (pg->uobject->pgops == &uvm_vnodeops) {
uvm_pgcnt_vnode--;
}

/* object should be locked */
TAILQ_REMOVE(&pg->uobject->memq, pg, listq);

@ -199,7 +205,6 @@ uvm_pageremove(pg)
pg->uobject->uo_npages--;
pg->uobject = NULL;
pg->version++;

}

/*
@ -217,7 +222,6 @@ uvm_page_init(kvm_startp, kvm_endp)
int lcv, i;
paddr_t paddr;

/*
* step 1: init the page queues and page queue locks
*/
@ -238,7 +242,7 @@ uvm_page_init(kvm_startp, kvm_endp)
*/

uvm.page_nhash = 1; /* 1 bucket */
uvm.page_hashmask = 0; /* mask for hash function */
uvm.page_hashmask = 0; /* mask for hash function */
uvm.page_hash = &uvm_bootbucket; /* install bootstrap bucket */
TAILQ_INIT(uvm.page_hash); /* init hash table */
simple_lock_init(&uvm.hashlock); /* init hash table lock */
@ -291,7 +295,6 @@ uvm_page_init(kvm_startp, kvm_endp)
*/

for (lcv = 0 ; lcv < vm_nphysseg ; lcv++) {

n = vm_physmem[lcv].end - vm_physmem[lcv].start;
if (n > pagecount) {
printf("uvm_page_init: lost %ld page(s) in init\n",
@ -317,6 +320,7 @@ uvm_page_init(kvm_startp, kvm_endp)
}
}
}

/*
* step 5: pass up the values of virtual_space_start and
* virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
@ -327,10 +331,11 @@ uvm_page_init(kvm_startp, kvm_endp)
*kvm_endp = trunc_page(virtual_space_end);

/*
* step 6: init pagedaemon lock
* step 6: init locks for kernel threads
*/

simple_lock_init(&uvm.pagedaemon_lock);
simple_lock_init(&uvm.aiodoned_lock);

/*
* step 7: init reserve thresholds
@ -342,10 +347,6 @@ uvm_page_init(kvm_startp, kvm_endp)
/*
* step 8: determine if we should zero pages in the idle
* loop.
*
* XXXJRT - might consider zero'ing up to the target *now*,
* but that could take an awfully long time if you
* have a lot of memory.
*/
uvm.page_idle_zero = vm_page_zero_enable;

@ -360,7 +361,6 @@ uvm_page_init(kvm_startp, kvm_endp)
* uvm_setpagesize: set the page size
*
* => sets page_shift and page_mask from uvmexp.pagesize.
* => XXXCDC: move global vars.
*/

void
@ -889,22 +889,20 @@ uvm_pagealloc_strat(obj, off, anon, flags, strat, free_list)
struct pgfreelist *pgfl;
boolean_t use_reserve;

#ifdef DIAGNOSTIC
/* sanity check */
if (obj && anon)
panic("uvm_pagealloc: obj and anon != NULL");
#endif

s = uvm_lock_fpageq(); /* lock free page queue */
KASSERT(obj == NULL || anon == NULL);
KASSERT(off == trunc_page(off));
s = uvm_lock_fpageq();

/*
* check to see if we need to generate some free pages waking
* the pagedaemon.
*/

if (uvmexp.free < uvmexp.freemin || (uvmexp.free < uvmexp.freetarg &&
uvmexp.inactive < uvmexp.inactarg))
if (uvmexp.free + uvmexp.paging < uvmexp.freemin ||
(uvmexp.free + uvmexp.paging < uvmexp.freetarg &&
uvmexp.inactive < uvmexp.inactarg)) {
wakeup(&uvm.pagedaemon);
}

/*
* fail if any of these conditions is true:
@ -957,11 +955,7 @@ uvm_pagealloc_strat(obj, off, anon, flags, strat, free_list)
case UVM_PGA_STRAT_ONLY:
case UVM_PGA_STRAT_FALLBACK:
/* Attempt to allocate from the specified free list. */
#ifdef DIAGNOSTIC
if (free_list >= VM_NFREELIST || free_list < 0)
panic("uvm_pagealloc_strat: bad free list %d",
free_list);
#endif
KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
pgfl = &uvm.page_free[free_list];
if ((pg = TAILQ_FIRST((freeq =
&pgfl->pgfl_queues[try1]))) != NULL ||
@ -1012,11 +1006,10 @@ uvm_pagealloc_strat(obj, off, anon, flags, strat, free_list)
pg->uanon = anon;
pg->flags = PG_BUSY|PG_CLEAN|PG_FAKE;
pg->version++;
pg->wire_count = 0;
pg->loan_count = 0;
if (anon) {
anon->u.an_page = pg;
pg->pqflags = PQ_ANON;
uvm_pgcnt_anon++;
} else {
if (obj)
uvm_pageinsert(pg);
@ -1074,8 +1067,6 @@ uvm_pagerealloc(pg, newobj, newoff)
pg->version++;
uvm_pageinsert(pg);
}

return;
}

@ -1089,14 +1080,20 @@ uvm_pagerealloc(pg, newobj, newoff)
* => assumes all valid mappings of pg are gone
*/

void uvm_pagefree(pg)

struct vm_page *pg;

void
uvm_pagefree(pg)
struct vm_page *pg;
{
int s;
int saved_loan_count = pg->loan_count;

#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
pg->uanon == (void *)0xdeadbeef) {
panic("uvm_pagefree: freeing free page %p\n", pg);
}
#endif

/*
* if the page was an object page (and thus "TABLED"), remove it
* from the object.
@ -1105,7 +1102,7 @@ struct vm_page *pg;
if (pg->flags & PG_TABLED) {

/*
* if the object page is on loan we are going to drop ownership.
* if the object page is on loan we are going to drop ownership.
* it is possible that an anon will take over as owner for this
* page later on. the anon will want a !PG_CLEAN page so that
* it knows it needs to allocate swap if it wants to page the
@ -1114,7 +1111,6 @@ struct vm_page *pg;

if (saved_loan_count)
pg->flags &= ~PG_CLEAN; /* in case an anon takes over */

uvm_pageremove(pg);

/*
@ -1125,9 +1121,9 @@ struct vm_page *pg;
* return (when the last loan is dropped, then the page can be
* freed by whatever was holding the last loan).
*/

if (saved_loan_count)
return;

} else if (saved_loan_count && (pg->pqflags & PQ_ANON)) {

/*
@ -1137,19 +1133,12 @@ struct vm_page *pg;
* note that the kernel can't change the loan status of our
* page as long as we are holding PQ lock.
*/

pg->pqflags &= ~PQ_ANON;
pg->uanon = NULL;
return;
}

#ifdef DIAGNOSTIC
if (saved_loan_count) {
printf("uvm_pagefree: warning: freeing page with a loan "
"count of %d\n", saved_loan_count);
panic("uvm_pagefree: loan count");
}
#endif

KASSERT(saved_loan_count == 0);

/*
* now remove the page from the queues
@ -1172,13 +1161,17 @@ struct vm_page *pg;
/*
* if the page was wired, unwire it now.
*/

if (pg->wire_count) {
pg->wire_count = 0;
uvmexp.wired--;
}
if (pg->uanon) {
uvm_pgcnt_anon--;
}

/*
* and put on free queue
* and put on free queue
*/

pg->flags &= ~PG_ZERO;
@ -1200,6 +1193,51 @@ struct vm_page *pg;
uvm_unlock_fpageq(s);
}

/*
* uvm_page_unbusy: unbusy an array of pages.
*
* => pages must either all belong to the same object, or all belong to anons.
* => if pages are object-owned, object must be locked.
* => if pages are anon-owned, anons must be unlocked and have 0 refcount.
*/

void
uvm_page_unbusy(pgs, npgs)
struct vm_page **pgs;
int npgs;
{
struct vm_page *pg;
struct uvm_object *uobj;
int i;
UVMHIST_FUNC("uvm_page_unbusy"); UVMHIST_CALLED(ubchist);

for (i = 0; i < npgs; i++) {
pg = pgs[i];

if (pg == NULL) {
continue;
}
if (pg->flags & PG_WANTED) {
wakeup(pg);
}
if (pg->flags & PG_RELEASED) {
UVMHIST_LOG(ubchist, "releasing pg %p", pg,0,0,0);
uobj = pg->uobject;
if (uobj != NULL) {
uobj->pgops->pgo_releasepg(pg, NULL);
} else {
pg->flags &= ~(PG_BUSY);
UVM_PAGE_OWN(pg, NULL);
uvm_anfree(pg->uanon);
}
} else {
UVMHIST_LOG(ubchist, "unbusying pg %p", pg,0,0,0);
pg->flags &= ~(PG_WANTED|PG_BUSY);
UVM_PAGE_OWN(pg, NULL);
}
}
}

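A hedged sketch of the caller-side convention uvm_page_unbusy() is written for, mirroring the read-error path in uvm_aio_aiodone() later in this diff; the pgs/npages/error names are illustrative assumptions:

	/* on a failed read, flag the not-yet-valid pages for freeing */
	for (i = 0; i < npages; i++) {
		if (error && (pgs[i]->flags & PG_FAKE) != 0)
			pgs[i]->flags |= PG_RELEASED;
	}
	/* wakes PG_WANTED sleepers and frees the PG_RELEASED pages */
	uvm_page_unbusy(pgs, npages);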
#if defined(UVM_PAGE_TRKOWN)
/*
* uvm_page_own: set or release page ownership
|
@ -1,4 +1,4 @@
/* $NetBSD: uvm_page.h,v 1.17 2000/10/03 20:50:49 mrg Exp $ */
/* $NetBSD: uvm_page.h,v 1.18 2000/11/27 08:40:05 chs Exp $ */

/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -118,27 +118,27 @@
#include <uvm/uvm_pglist.h>

struct vm_page {
TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO
* queue or free list (P) */
TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/
TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO
* queue or free list (P) */
TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/
TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/

struct vm_anon *uanon; /* anon (O,P) */
struct uvm_object *uobject; /* object (O,P) */
voff_t offset; /* offset into object (O,P) */
struct vm_anon *uanon; /* anon (O,P) */
struct uvm_object *uobject; /* object (O,P) */
voff_t offset; /* offset into object (O,P) */

u_short flags; /* object flags [O] */
u_short version; /* version count [O] */
u_short wire_count; /* wired down map refs [P] */
u_short pqflags; /* page queue flags [P] */
u_int loan_count; /* number of active loans
* to read: [O or P]
* to modify: [O _and_ P] */
paddr_t phys_addr; /* physical address of page */
u_short flags; /* object flags [O] */
u_short version; /* version count [O] */
u_short wire_count; /* wired down map refs [P] */
u_short pqflags; /* page queue flags [P] */
u_int loan_count; /* number of active loans
* to read: [O or P]
* to modify: [O _and_ P] */
paddr_t phys_addr; /* physical address of page */
#if defined(UVM_PAGE_TRKOWN)
/* debugging fields to track page ownership */
pid_t owner; /* proc that set PG_BUSY */
char *owner_tag; /* why it was set busy */
/* debugging fields to track page ownership */
pid_t owner; /* proc that set PG_BUSY */
char *owner_tag; /* why it was set busy */
#endif
};

@ -157,25 +157,23 @@ struct vm_page {
* PG_ZERO is used to indicate that a page has been pre-zero'd. This flag
* is only set when the page is on no queues, and is cleared when the page
* is placed on the free list.
*
* possible deadwood: PG_FAULTING, PQ_LAUNDRY
*/

#define PG_BUSY 0x0001 /* page is locked */
#define PG_WANTED 0x0002 /* someone is waiting for page */
#define PG_TABLED 0x0004 /* page is in VP table */
#define PG_CLEAN 0x0008 /* page has not been modified */
#define PG_BUSY 0x0010 /* page is in transit */
#define PG_WANTED 0x0020 /* someone is waiting for page */
#define PG_TABLED 0x0040 /* page is in VP table */
#define PG_ZERO 0x0100 /* page is pre-zero'd */
#define PG_FAKE 0x0200 /* page is placeholder for pagein */
#define PG_FILLED 0x0400 /* client flag to set when filled */
#define PG_DIRTY 0x0800 /* client flag to set when dirty */
#define PG_RELEASED 0x1000 /* page released while paging */
#define PG_FAULTING 0x2000 /* page is being faulted in */
#define PG_CLEANCHK 0x4000 /* clean bit has been checked */
#define PG_CLEANCHK 0x0010 /* clean bit has been checked */
#define PG_RELEASED 0x0020 /* page released while paging */
#define PG_FAKE 0x0040 /* page is not yet initialized */
#define PG_RDONLY 0x0080 /* page must be mapped read-only */
#define PG_ZERO 0x0100 /* page is pre-zero'd */

#define PG_PAGER1 0x1000 /* pager-specific flag */

#define PQ_FREE 0x0001 /* page is on free list */
#define PQ_INACTIVE 0x0002 /* page is in inactive list */
#define PQ_ACTIVE 0x0004 /* page is in active list */
#define PQ_LAUNDRY 0x0008 /* page is being cleaned now */
#define PQ_ANON 0x0010 /* page is part of an anon, rather
than an uvm_object */
#define PQ_AOBJ 0x0020 /* page is part of an anonymous
@ -237,12 +235,9 @@ extern boolean_t vm_page_zero_enable;
* ordered, in LRU-like fashion.
*/

extern
struct pglist vm_page_queue_free; /* memory free queue */
extern
struct pglist vm_page_queue_active; /* active memory queue */
extern
struct pglist vm_page_queue_inactive; /* inactive memory queue */
extern struct pglist vm_page_queue_free; /* memory free queue */
extern struct pglist vm_page_queue_active; /* active memory queue */
extern struct pglist vm_page_queue_inactive; /* inactive memory queue */

/*
* physical memory config is stored in vm_physmem.
@ -283,9 +278,8 @@ vaddr_t uvm_pageboot_alloc __P((vsize_t));
PAGE_INLINE void uvm_pagecopy __P((struct vm_page *, struct vm_page *));
PAGE_INLINE void uvm_pagedeactivate __P((struct vm_page *));
void uvm_pagefree __P((struct vm_page *));
void uvm_page_unbusy __P((struct vm_page **, int));
PAGE_INLINE struct vm_page *uvm_pagelookup __P((struct uvm_object *, voff_t));
void uvm_pageremove __P((struct vm_page *));
/* uvm_pagerename: not needed */
PAGE_INLINE void uvm_pageunwire __P((struct vm_page *));
PAGE_INLINE void uvm_pagewait __P((struct vm_page *, int));
PAGE_INLINE void uvm_pagewake __P((struct vm_page *));
|
@ -1,4 +1,4 @@
/* $NetBSD: uvm_pager.c,v 1.34 2000/11/24 22:41:39 chs Exp $ */
/* $NetBSD: uvm_pager.c,v 1.35 2000/11/27 08:40:05 chs Exp $ */

/*
*
@ -44,21 +44,27 @@
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/vnode.h>

#define UVM_PAGER
#include <uvm/uvm.h>

struct pool *uvm_aiobuf_pool;

/*
* list of uvm pagers in the system
*/

extern struct uvm_pagerops uvm_deviceops;
extern struct uvm_pagerops uvm_vnodeops;
extern struct uvm_pagerops ubc_pager;

struct uvm_pagerops *uvmpagerops[] = {
&aobj_pager,
&uvm_deviceops,
&uvm_vnodeops,
&ubc_pager,
};

/*
@ -68,7 +74,8 @@ struct uvm_pagerops *uvmpagerops[] = {
vm_map_t pager_map; /* XXX */
simple_lock_data_t pager_map_wanted_lock;
boolean_t pager_map_wanted; /* locked by pager map */

static vaddr_t emergva;
static boolean_t emerginuse;

/*
* uvm_pager_init: init pagers (at boot time)
@ -83,10 +90,12 @@ uvm_pager_init()
* init pager map
*/

pager_map = uvm_km_suballoc(kernel_map, &uvm.pager_sva, &uvm.pager_eva,
PAGER_MAP_SIZE, 0, FALSE, NULL);
simple_lock_init(&pager_map_wanted_lock);
pager_map_wanted = FALSE;
pager_map = uvm_km_suballoc(kernel_map, &uvm.pager_sva, &uvm.pager_eva,
PAGER_MAP_SIZE, 0, FALSE, NULL);
simple_lock_init(&pager_map_wanted_lock);
pager_map_wanted = FALSE;
emergva = uvm_km_valloc(kernel_map, MAXBSIZE);
emerginuse = FALSE;

/*
* init ASYNC I/O queue
@ -112,22 +121,19 @@ uvm_pager_init()
*/

vaddr_t
uvm_pagermapin(pps, npages, aiop, flags)
uvm_pagermapin(pps, npages, flags)
struct vm_page **pps;
int npages;
struct uvm_aiodesc **aiop; /* OUT */
int flags;
{
vsize_t size;
vaddr_t kva;
struct uvm_aiodesc *aio;
vaddr_t cva;
struct vm_page *pp;
vm_prot_t prot;
UVMHIST_FUNC("uvm_pagermapin"); UVMHIST_CALLED(maphist);

UVMHIST_LOG(maphist,"(pps=0x%x, npages=%d, aiop=0x%x, flags=0x%x)",
pps, npages, aiop, flags);
UVMHIST_LOG(maphist,"(pps=0x%x, npages=%d)", pps, npages,0,0);

/*
* compute protection. outgoing I/O only needs read
@ -139,24 +145,26 @@ uvm_pagermapin(pps, npages, aiop, flags)
prot |= VM_PROT_WRITE;

ReStart:
if (aiop) {
MALLOC(aio, struct uvm_aiodesc *, sizeof(*aio), M_TEMP,
(flags & UVMPAGER_MAPIN_WAITOK));
if (aio == NULL)
return(0);
*aiop = aio;
} else {
aio = NULL;
}

size = npages << PAGE_SHIFT;
kva = 0; /* let system choose VA */

if (uvm_map(pager_map, &kva, size, NULL,
UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != KERN_SUCCESS) {
if (curproc == uvm.pagedaemon_proc) {
simple_lock(&pager_map_wanted_lock);
if (emerginuse) {
UVM_UNLOCK_AND_WAIT(&emergva,
&pager_map_wanted_lock, FALSE,
"emergva", 0);
goto ReStart;
}
emerginuse = TRUE;
simple_unlock(&pager_map_wanted_lock);
kva = emergva;
KASSERT(npages <= MAXBSIZE >> PAGE_SHIFT);
goto enter;
}
if ((flags & UVMPAGER_MAPIN_WAITOK) == 0) {
if (aio)
FREE(aio, M_TEMP);
UVMHIST_LOG(maphist,"<- NOWAIT failed", 0,0,0,0);
return(0);
}
@ -164,16 +172,17 @@ ReStart:
pager_map_wanted = TRUE;
UVMHIST_LOG(maphist, " SLEEPING on pager_map",0,0,0,0);
UVM_UNLOCK_AND_WAIT(pager_map, &pager_map_wanted_lock, FALSE,
"pager_map",0);
"pager_map", 0);
goto ReStart;
}

enter:
/* got it */
for (cva = kva ; size != 0 ; size -= PAGE_SIZE, cva += PAGE_SIZE) {
pp = *pps++;
#ifdef DEBUG
if ((pp->flags & PG_BUSY) == 0)
panic("uvm_pagermapin: page not busy");
panic("uvm_pagermapin: pg %p not busy", pp);
#endif
pmap_enter(vm_map_pmap(pager_map), cva, VM_PAGE_TO_PHYS(pp),
prot, PMAP_WIRED | prot);
@ -198,13 +207,22 @@ uvm_pagermapout(kva, npages)
vsize_t size = npages << PAGE_SHIFT;
vm_map_entry_t entries;
UVMHIST_FUNC("uvm_pagermapout"); UVMHIST_CALLED(maphist);

UVMHIST_LOG(maphist, " (kva=0x%x, npages=%d)", kva, npages,0,0);

/*
* duplicate uvm_unmap, but add in pager_map_wanted handling.
*/

if (kva == emergva) {
simple_lock(&pager_map_wanted_lock);
emerginuse = FALSE;
wakeup(&emergva);
simple_unlock(&pager_map_wanted_lock);
entries = NULL;
goto remove;
}

vm_map_lock(pager_map);
(void) uvm_unmap_remove(pager_map, kva, kva + size, &entries);
simple_lock(&pager_map_wanted_lock);
@ -214,6 +232,8 @@ uvm_pagermapout(kva, npages)
}
simple_unlock(&pager_map_wanted_lock);
vm_map_unlock(pager_map);
remove:
pmap_remove(pmap_kernel(), kva, kva + (npages << PAGE_SHIFT));
if (entries)
uvm_unmap_detach(entries, 0);

@ -251,7 +271,7 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi)
{
struct vm_page **ppsp, *pclust;
voff_t lo, hi, curoff;
int center_idx, forward;
int center_idx, forward, incr;
UVMHIST_FUNC("uvm_mk_pcluster"); UVMHIST_CALLED(maphist);

/*
@ -273,9 +293,11 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi)
if (hi > mhi)
hi = mhi;
}
if ((hi - lo) >> PAGE_SHIFT > *npages) { /* pps too small, bail out! */
if ((hi - lo) >> PAGE_SHIFT > *npages) { /* pps too small, bail out! */
#ifdef DIAGNOSTIC
printf("uvm_mk_pcluster: provided page array too small (fixed)\n");
printf("uvm_mk_pcluster uobj %p npages %d lo 0x%llx hi 0x%llx "
"flags 0x%x\n", uobj, *npages, (long long)lo,
(long long)hi, flags);
#endif
pps[0] = center;
*npages = 1;
@ -291,7 +313,7 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi)
pps[center_idx] = center; /* plug in the center page */
ppsp = &pps[center_idx];
*npages = 1;

/*
* attempt to cluster around the left [backward], and then
* the right side [forward].
@ -303,21 +325,23 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi)
*/

for (forward = 0 ; forward <= 1 ; forward++) {

curoff = center->offset + (forward ? PAGE_SIZE : -PAGE_SIZE);
incr = forward ? PAGE_SIZE : -PAGE_SIZE;
curoff = center->offset + incr;
for ( ;(forward == 0 && curoff >= lo) ||
(forward && curoff < hi);
curoff += (forward ? 1 : -1) << PAGE_SHIFT) {
curoff += incr) {

pclust = uvm_pagelookup(uobj, curoff); /* lookup page */
if (pclust == NULL)
if (pclust == NULL) {
break; /* no page */
}
/* handle active pages */
/* NOTE: inactive pages don't have pmap mappings */
if ((pclust->pqflags & PQ_INACTIVE) == 0) {
if ((flags & PGO_DOACTCLUST) == 0)
if ((flags & PGO_DOACTCLUST) == 0) {
/* don't want mapped pages at all */
break;
}

/* make sure "clean" bit is sync'd */
if ((pclust->flags & PG_CLEANCHK) == 0) {
@ -330,13 +354,16 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi)
pclust->flags |= PG_CLEANCHK;
}
}

/* is page available for cleaning and does it need it */
if ((pclust->flags & (PG_CLEAN|PG_BUSY)) != 0)
if ((pclust->flags & (PG_CLEAN|PG_BUSY)) != 0) {
break; /* page is already clean or is busy */
}

/* yes! enroll the page in our array */
pclust->flags |= PG_BUSY; /* busy! */
UVM_PAGE_OWN(pclust, "uvm_mk_pcluster");

/* XXX: protect wired page? see above comment. */
pmap_page_protect(pclust, VM_PROT_READ);
if (!forward) {
@ -346,7 +373,7 @@ uvm_mk_pcluster(uobj, pps, npages, center, flags, mlo, mhi)
/* move forward one page */
ppsp[*npages] = pclust;
}
*npages = *npages + 1;
(*npages)++;
}
}

@ -409,6 +436,7 @@ uvm_pager_put(uobj, pg, ppsp_ptr, npages, flags, start, stop)
int result;
daddr_t swblk;
struct vm_page **ppsp = *ppsp_ptr;
UVMHIST_FUNC("uvm_pager_put"); UVMHIST_CALLED(ubchist);

/*
* note that uobj is null if we are doing a swap-backed pageout.
@ -459,12 +487,12 @@ uvm_pager_put(uobj, pg, ppsp_ptr, npages, flags, start, stop)
ReTry:
if (uobj) {
/* object is locked */
result = uobj->pgops->pgo_put(uobj, ppsp, *npages,
flags & PGO_SYNCIO);
result = uobj->pgops->pgo_put(uobj, ppsp, *npages, flags);
UVMHIST_LOG(ubchist, "put -> %d", result, 0,0,0);
/* object is now unlocked */
} else {
/* nothing locked */
result = uvm_swap_put(swblk, ppsp, *npages, flags & PGO_SYNCIO);
result = uvm_swap_put(swblk, ppsp, *npages, flags);
/* nothing locked */
}

@ -565,7 +593,7 @@ ReTry:

/*
* a pager error occurred (even after dropping the cluster, if there
* was one). give up! the caller only has one page ("pg")
* was one). give up! the caller only has one page ("pg")
* to worry about.
*/

@ -610,7 +638,8 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags)

for (lcv = 0 ; lcv < *npages ; lcv++) {

if (ppsp[lcv] == pg) /* skip "pg" */
/* skip "pg" or empty slot */
if (ppsp[lcv] == pg || ppsp[lcv] == NULL)
continue;

/*
@ -637,9 +666,10 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags)
}

/* did someone want the page while we had it busy-locked? */
if (ppsp[lcv]->flags & PG_WANTED)
if (ppsp[lcv]->flags & PG_WANTED) {
/* still holding obj lock */
wakeup(ppsp[lcv]);
}

/* if page was released, release it. otherwise un-busy it */
if (ppsp[lcv]->flags & PG_RELEASED) {
@ -690,7 +720,7 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags)
continue; /* next page */

} else {
ppsp[lcv]->flags &= ~(PG_BUSY|PG_WANTED);
ppsp[lcv]->flags &= ~(PG_BUSY|PG_WANTED|PG_FAKE);
UVM_PAGE_OWN(ppsp[lcv], NULL);
}

@ -713,3 +743,167 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags)
}
}
}

/*
* interrupt-context iodone handler for nested i/o bufs.
*
* => must be at splbio().
*/

void
uvm_aio_biodone1(bp)
struct buf *bp;
{
struct buf *mbp = bp->b_private;

KASSERT(mbp != bp);
if (bp->b_flags & B_ERROR) {
mbp->b_flags |= B_ERROR;
mbp->b_error = bp->b_error;
}
mbp->b_resid -= bp->b_bcount;
pool_put(&bufpool, bp);
if (mbp->b_resid == 0) {
biodone(mbp);
}
}

/*
* interrupt-context iodone handler for single-buf i/os
* or the top-level buf of a nested-buf i/o.
*
* => must be at splbio().
*/

void
uvm_aio_biodone(bp)
struct buf *bp;
{
/* reset b_iodone for when this is a single-buf i/o. */
bp->b_iodone = uvm_aio_aiodone;

simple_lock(&uvm.aiodoned_lock); /* locks uvm.aio_done */
TAILQ_INSERT_TAIL(&uvm.aio_done, bp, b_freelist);
wakeup(&uvm.aiodoned);
simple_unlock(&uvm.aiodoned_lock);
}

/*
* uvm_aio_aiodone: do iodone processing for async i/os.
* this should be called in thread context, not interrupt context.
*/

void
uvm_aio_aiodone(bp)
struct buf *bp;
{
int npages = bp->b_bufsize >> PAGE_SHIFT;
struct vm_page *pg, *pgs[npages];
struct uvm_object *uobj;
int s, i;
boolean_t release, write, swap;
UVMHIST_FUNC("uvm_aio_aiodone"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "bp %p", bp, 0,0,0);

release = (bp->b_flags & (B_ERROR|B_READ)) == (B_ERROR|B_READ);
write = (bp->b_flags & B_READ) == 0;
/* XXXUBC B_NOCACHE is for swap pager, should be done differently */
if (write && !(bp->b_flags & B_NOCACHE)) {
/* XXXUBC */
void softdep_pageiodone(struct buf *);
softdep_pageiodone(bp);
}

uobj = NULL;
for (i = 0; i < npages; i++) {
pgs[i] = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT));
UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0);
}
uvm_pagermapout((vaddr_t)bp->b_data, npages);
for (i = 0; i < npages; i++) {
pg = pgs[i];

if (i == 0) {
swap = (pg->pqflags & PQ_SWAPBACKED) != 0;
if (!swap) {
uobj = pg->uobject;
simple_lock(&uobj->vmobjlock);
}
}
KASSERT(swap || pg->uobject == uobj);
if (swap) {
if (pg->pqflags & PQ_ANON) {
simple_lock(&pg->uanon->an_lock);
} else {
simple_lock(&pg->uobject->vmobjlock);
}
}

/*
* if this is a read and we got an error, mark the pages
* PG_RELEASED so that uvm_page_unbusy() will free them.
*/

if (release) {
pg->flags |= PG_RELEASED;
continue;
}
KASSERT(!write || (pgs[i]->flags & PG_FAKE) == 0);

/*
* if this is a read and the page is PG_FAKE
* or this was a write, mark the page PG_CLEAN and not PG_FAKE.
*/

if (pgs[i]->flags & PG_FAKE || write) {
pmap_clear_reference(pgs[i]);
pmap_clear_modify(pgs[i]);
pgs[i]->flags |= PG_CLEAN;
pgs[i]->flags &= ~PG_FAKE;
}
if (swap) {
if (pg->pqflags & PQ_ANON) {
simple_unlock(&pg->uanon->an_lock);
} else {
simple_unlock(&pg->uobject->vmobjlock);
}
}
}
uvm_page_unbusy(pgs, npages);
if (!swap) {
simple_unlock(&uobj->vmobjlock);
}

s = splbio();
if (write && (bp->b_flags & B_AGE) != 0) {
vwakeup(bp);
}
pool_put(&bufpool, bp);
splx(s);
}

/*
* translate unix errno values to VM_PAGER_*.
*/

int
uvm_errno2vmerror(errno)
int errno;
{
switch (errno) {
case 0:
return VM_PAGER_OK;
case EINVAL:
return VM_PAGER_BAD;
case EINPROGRESS:
return VM_PAGER_PEND;
case EIO:
return VM_PAGER_ERROR;
case EAGAIN:
return VM_PAGER_AGAIN;
case EBUSY:
return VM_PAGER_UNLOCK;
default:
return VM_PAGER_ERROR;
}
}
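A minimal sketch of the intended call pattern for the new translation helper; the biowait() call site is an illustrative assumption, not code from this commit:

	/* wait for the i/o, then map the errno onto a pager result code */
	error = biowait(bp);
	result = uvm_errno2vmerror(error);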
|
@ -1,4 +1,4 @@
/* $NetBSD: uvm_pager.h,v 1.19 2000/11/27 08:19:51 chs Exp $ */
/* $NetBSD: uvm_pager.h,v 1.20 2000/11/27 08:40:05 chs Exp $ */

/*
*
@ -81,21 +81,6 @@
* uvm_pager.h
*/

/*
* async pager i/o descriptor structure
*/

TAILQ_HEAD(uvm_aiohead, uvm_aiodesc);

struct uvm_aiodesc {
void (*aiodone) __P((struct uvm_aiodesc *));
/* aio done function */
vaddr_t kva; /* KVA of mapped page(s) */
int npages; /* # of pages in I/O req */
void *pd_ptr; /* pager-dependent pointer */
TAILQ_ENTRY(uvm_aiodesc) aioq; /* linked list of aio's */
};

/*
* pager ops
*/
@ -132,22 +117,22 @@ struct uvm_pagerops {
/* pager flags [mostly for flush] */

#define PGO_CLEANIT 0x001 /* write dirty pages to backing store */
#define PGO_SYNCIO 0x002 /* if PGO_CLEAN: use sync I/O? */
/*
* obviously if neither PGO_INVALIDATE or PGO_FREE are set then the pages
* stay where they are.
*/
#define PGO_SYNCIO 0x002 /* if PGO_CLEANIT: use sync I/O? */
#define PGO_DEACTIVATE 0x004 /* deactivate flushed pages */
#define PGO_FREE 0x008 /* free flushed pages */
/* if PGO_FREE is not set then the pages stay where they are. */

#define PGO_ALLPAGES 0x010 /* flush whole object/get all pages */
#define PGO_DOACTCLUST 0x020 /* flag to mk_pcluster to include active */
#define PGO_LOCKED 0x040 /* fault data structures are locked [get] */
#define PGO_PDFREECLUST 0x080 /* daemon's free cluster flag [uvm_pager_put] */
#define PGO_REALLOCSWAP 0x100 /* reallocate swap area [pager_dropcluster] */
#define PGO_OVERWRITE 0x200 /* pages will be overwritten before unlocked */
#define PGO_WEAK 0x400 /* "weak" put, for nfs */
#define PGO_PASTEOF 0x800 /* allow allocation of pages past EOF */

/* page we are not interested in getting */
#define PGO_DONTCARE ((struct vm_page *) -1) /* [get only] */
#define PGO_DONTCARE ((struct vm_page *) -1L) /* [get only] */

#ifdef _KERNEL

@ -175,12 +160,12 @@ int uvm_pager_put __P((struct uvm_object *, struct vm_page *,

PAGER_INLINE struct vm_page *uvm_pageratop __P((vaddr_t));

vaddr_t uvm_pagermapin __P((struct vm_page **, int,
struct uvm_aiodesc **, int));
vaddr_t uvm_pagermapin __P((struct vm_page **, int, int));
void uvm_pagermapout __P((vaddr_t, int));
struct vm_page **uvm_mk_pcluster __P((struct uvm_object *, struct vm_page **,
int *, struct vm_page *, int,
voff_t, voff_t));
int uvm_errno2vmerror __P((int));

/* Flags to uvm_pagermapin() */
#define UVMPAGER_MAPIN_WAITOK 0x01 /* it's okay to wait */
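A hedged illustration of how the cleaned-up flag set composes on a put path; the object-lock requirement is taken from uvm_pager_put() earlier in this diff, the rest is an assumption:

	/* synchronously clean and then free the flushed pages */
	simple_lock(&uobj->vmobjlock);
	result = uobj->pgops->pgo_put(uobj, pps, npages,
	    PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
	/* pgo_put unlocks the object before returning */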
|
@ -1,4 +1,4 @@
|
||||
/* $NetBSD: uvm_pdaemon.c,v 1.23 2000/08/20 10:24:14 bjh21 Exp $ */
|
||||
/* $NetBSD: uvm_pdaemon.c,v 1.24 2000/11/27 08:40:05 chs Exp $ */
|
||||
|
||||
/*
|
||||
* Copyright (c) 1997 Charles D. Cranor and Washington University.
|
||||
@ -77,9 +77,13 @@
|
||||
#include <sys/systm.h>
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/pool.h>
|
||||
#include <sys/buf.h>
|
||||
|
||||
#include <uvm/uvm.h>
|
||||
|
||||
extern u_long uvm_pgcnt_vnode;
|
||||
extern struct uvm_pagerops uvm_vnodeops;
|
||||
|
||||
/*
|
||||
* UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedeamon will reactivate
|
||||
* in a pass thru the inactive list when swap is full. the value should be
|
||||
@ -194,10 +198,8 @@ void
|
||||
uvm_pageout(void *arg)
|
||||
{
|
||||
int npages = 0;
|
||||
int s;
|
||||
struct uvm_aiodesc *aio, *nextaio;
|
||||
UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist);
|
||||
|
||||
|
||||
UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);
|
||||
|
||||
/*
|
||||
@ -214,7 +216,82 @@ uvm_pageout(void *arg)
|
||||
/*
|
||||
* main loop
|
||||
*/
|
||||
while (TRUE) {
|
||||
|
||||
for (;;) {
|
||||
simple_lock(&uvm.pagedaemon_lock);
|
||||
|
||||
UVMHIST_LOG(pdhist," <<SLEEPING>>",0,0,0,0);
|
||||
UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
|
||||
&uvm.pagedaemon_lock, FALSE, "pgdaemon", 0);
|
||||
uvmexp.pdwoke++;
|
||||
UVMHIST_LOG(pdhist," <<WOKE UP>>",0,0,0,0);
|
||||
|
||||
/* drain pool resources */
|
||||
pool_drain(0);
|
||||
|
||||
/*
|
||||
* now lock page queues and recompute inactive count
|
||||
*/
|
||||
|
||||
uvm_lock_pageq();
|
||||
if (npages != uvmexp.npages) { /* check for new pages? */
|
||||
npages = uvmexp.npages;
|
||||
uvmpd_tune();
|
||||
}
|
||||
|
||||
uvmexp.inactarg = (uvmexp.active + uvmexp.inactive) / 3;
|
||||
if (uvmexp.inactarg <= uvmexp.freetarg) {
|
||||
uvmexp.inactarg = uvmexp.freetarg + 1;
|
||||
}
|
||||
|
||||
UVMHIST_LOG(pdhist," free/ftarg=%d/%d, inact/itarg=%d/%d",
|
||||
uvmexp.free, uvmexp.freetarg, uvmexp.inactive,
|
||||
uvmexp.inactarg);
|
||||
|
||||
/*
|
||||
* scan if needed
|
||||
*/
|
||||
|
||||
if (uvmexp.free + uvmexp.paging < uvmexp.freetarg ||
|
||||
uvmexp.inactive < uvmexp.inactarg ||
|
||||
uvm_pgcnt_vnode >
|
||||
(uvmexp.active + uvmexp.inactive + uvmexp.wired +
|
||||
uvmexp.free) * 13 / 16) {
|
||||
uvmpd_scan();
|
||||
}

/*
* if there's any free memory to be had,
* wake up any waiters.
*/

if (uvmexp.free > uvmexp.reserve_kernel ||
uvmexp.paging == 0) {
wakeup(&uvmexp.free);
}

/*
* scan done. unlock page queues (the only lock we are holding)
*/

uvm_unlock_pageq();
}
/*NOTREACHED*/
}
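
The inactive-target arithmetic in the loop above aims to keep a third of the active plus inactive pages on the inactive list, but never less than one page above the free target. A minimal standalone sketch of that computation, with invented inputs:

#include <stdio.h>

static int
inactive_target(int active, int inactive, int freetarg)
{
	int inactarg = (active + inactive) / 3;

	if (inactarg <= freetarg)
		inactarg = freetarg + 1;	/* clamp just above the free target */
	return inactarg;
}

int
main(void)
{
	printf("%d\n", inactive_target(3000, 600, 256));	/* 1200 */
	printf("%d\n", inactive_target(300, 60, 256));		/* 257 (clamped) */
	return 0;
}
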


/*
* uvm_aiodone_daemon: main loop for the aiodone daemon.
*/

void
uvm_aiodone_daemon(void *arg)
{
int s, free;
struct buf *bp, *nbp;
UVMHIST_FUNC("uvm_aiodoned"); UVMHIST_CALLED(pdhist);

for (;;) {

/*
* carefully attempt to go to sleep (without losing "wakeups"!).
@ -223,95 +300,58 @@ uvm_pageout(void *arg)
*/

s = splbio();
simple_lock(&uvm.pagedaemon_lock);

/*
* if we've got done aio's, then bypass the sleep
*/

if (uvm.aio_done.tqh_first == NULL) {
UVMHIST_LOG(maphist,"  <<SLEEPING>>",0,0,0,0);
UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
&uvm.pagedaemon_lock, FALSE, "daemon_slp", 0);
uvmexp.pdwoke++;
simple_lock(&uvm.aiodoned_lock);
if (TAILQ_FIRST(&uvm.aio_done) == NULL) {
UVMHIST_LOG(pdhist,"  <<SLEEPING>>",0,0,0,0);
UVM_UNLOCK_AND_WAIT(&uvm.aiodoned,
&uvm.aiodoned_lock, FALSE, "aiodoned", 0);
UVMHIST_LOG(pdhist,"  <<WOKE UP>>",0,0,0,0);

/* relock pagedaemon_lock, still at splbio */
simple_lock(&uvm.pagedaemon_lock);
/* relock aiodoned_lock, still at splbio */
simple_lock(&uvm.aiodoned_lock);
}

/*
* check for done aio structures
*/

aio = uvm.aio_done.tqh_first; /* save current list (if any)*/
if (aio) {
TAILQ_INIT(&uvm.aio_done); /* zero global list */
bp = TAILQ_FIRST(&uvm.aio_done);
if (bp) {
TAILQ_INIT(&uvm.aio_done);
}

simple_unlock(&uvm.pagedaemon_lock); /* unlock */
splx(s); /* drop splbio */

simple_unlock(&uvm.aiodoned_lock);
splx(s);

/*
* first clear out any pending aios (to free space in case we
* want to pageout more stuff).
* process each i/o that's done.
*/

for (/*null*/; aio != NULL ; aio = nextaio) {

uvmexp.paging -= aio->npages;
nextaio = aio->aioq.tqe_next;
aio->aiodone(aio);

free = uvmexp.free;
while (bp != NULL) {
if (bp->b_flags & B_PDAEMON) {
uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT;
}
nbp = TAILQ_NEXT(bp, b_freelist);
(*bp->b_iodone)(bp);
bp = nbp;
}

/* Next, drain pool resources */
pool_drain(0);

/*
* now lock page queues and recompute inactive count
*/
uvm_lock_pageq();

if (npages != uvmexp.npages) { /* check for new pages? */
npages = uvmexp.npages;
uvmpd_tune();
}

uvmexp.inactarg = (uvmexp.active + uvmexp.inactive) / 3;
if (uvmexp.inactarg <= uvmexp.freetarg)
uvmexp.inactarg = uvmexp.freetarg + 1;

UVMHIST_LOG(pdhist,"  free/ftarg=%d/%d, inact/itarg=%d/%d",
uvmexp.free, uvmexp.freetarg, uvmexp.inactive,
uvmexp.inactarg);

/*
* scan if needed
* [XXX: note we are reading uvm.free without locking]
*/
if (uvmexp.free < uvmexp.freetarg ||
uvmexp.inactive < uvmexp.inactarg)
uvmpd_scan();

/*
* done scan. unlock page queues (the only lock we are holding)
*/
uvm_unlock_pageq();

/*
* done! restart loop.
*/
if (uvmexp.free > uvmexp.reserve_kernel ||
uvmexp.paging == 0)
if (free <= uvmexp.reserve_kernel) {
s = uvm_lock_fpageq();
wakeup(&uvm.pagedaemon);
uvm_unlock_fpageq(s);
} else {
simple_lock(&uvm.pagedaemon_lock);
wakeup(&uvmexp.free);
simple_unlock(&uvm.pagedaemon_lock);
}
}
/*NOTREACHED*/
}
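
Throughout these hunks the patch replaces direct pokes at tqh_first/tqe_next with the TAILQ_FIRST/TAILQ_NEXT macros from <sys/queue.h>. A small self-contained userland illustration of the same idiom, including the save-the-next-pointer walk both daemons use (struct and field names are invented):

#include <stdio.h>
#include <sys/queue.h>

struct node {
	int val;
	TAILQ_ENTRY(node) link;		/* forward/back pointers */
};

TAILQ_HEAD(nodelist, node);

int
main(void)
{
	struct nodelist head = TAILQ_HEAD_INITIALIZER(head);
	struct node a = { 1 }, b = { 2 }, *n, *next;

	TAILQ_INSERT_TAIL(&head, &a, link);
	TAILQ_INSERT_TAIL(&head, &b, link);

	/* fetch the successor before handling n, as the daemons do */
	for (n = TAILQ_FIRST(&head); n != NULL; n = next) {
		next = TAILQ_NEXT(n, link);
		printf("%d\n", n->val);
	}
	return 0;
}
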


/*
* uvmpd_scan_inactive: the first loop of uvmpd_scan broken out into
* its own function for ease of reading.
* uvmpd_scan_inactive: scan an inactive list for pages to clean or free.
*
* => called with page queues locked
* => we work on meeting our free target by converting inactive pages
@ -334,9 +374,9 @@ uvmpd_scan_inactive(pglst)
int swnpages, swcpages; /* XXX: see below */
int swslot;
struct vm_anon *anon;
boolean_t swap_backed;
boolean_t swap_backed, vnode_only;
vaddr_t start;
int dirtyreacts;
int dirtyreacts, vpgs;
UVMHIST_FUNC("uvmpd_scan_inactive"); UVMHIST_CALLED(pdhist);

/*
@ -349,75 +389,81 @@ uvmpd_scan_inactive(pglst)

/*
* swslot is non-zero if we are building a swap cluster. we want
* to stay in the loop while we have a page to scan or we have
* to stay in the loop while we have a page to scan or we have
* a swap-cluster to build.
*/

swslot = 0;
swnpages = swcpages = 0;
free = 0;
dirtyreacts = 0;
vnode_only = FALSE;

for (p = pglst->tqh_first ; p != NULL || swslot != 0 ; p = nextpg) {
for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {

/*
* note that p can be NULL iff we have traversed the whole
* list and need to do one final swap-backed clustered pageout.
*/

uobj = NULL;
anon = NULL;

if (p) {

/*
* update our copy of "free" and see if we've met
* our target
*/

s = uvm_lock_fpageq();
free = uvmexp.free;
uvm_unlock_fpageq(s);

/* XXXUBC */
vpgs = uvm_pgcnt_vnode -
(uvmexp.active + uvmexp.inactive +
uvmexp.wired + uvmexp.free) * 13 / 16;

if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
dirtyreacts == UVMPD_NUMDIRTYREACTS) {
UVMHIST_LOG(pdhist,"  met free target: "
"exit loop", 0, 0, 0, 0);
retval = TRUE; /* hit the target! */
if (vpgs <= 0) {
UVMHIST_LOG(pdhist,"  met free target: "
"exit loop", 0, 0, 0, 0);
retval = TRUE;

if (swslot == 0)
/* exit now if no swap-i/o pending */
break;
if (swslot == 0)
/* exit now if no swap-i/o pending */
break;

/* set p to null to signal final swap i/o */
p = NULL;
/* set p to null to signal final swap i/o */
p = NULL;
} else {
vnode_only = TRUE;
}
}
}

uobj = NULL; /* be safe and shut gcc up */
anon = NULL; /* be safe and shut gcc up */

if (p) { /* if (we have a new page to consider) */

/*
* we are below target and have a new page to consider.
*/
uvmexp.pdscans++;
nextpg = p->pageq.tqe_next;

/*
* move referenced pages back to active queue and
* skip to next page (unlikely to happen since
* inactive pages shouldn't have any valid mappings
* and we cleared reference before deactivating).
*/
if (pmap_is_referenced(p)) {
uvm_pageactivate(p);
uvmexp.pdreact++;
continue;
}

uvmexp.pdscans++;
nextpg = TAILQ_NEXT(p, pageq);

/*
* first we attempt to lock the object that this page
* belongs to. if our attempt fails we skip on to
* the next page (no harm done). it is important to
* "try" locking the object as we are locking in the
* wrong order (pageq -> object) and we don't want to
* get deadlocked.
* deadlock.
*
* the only time we exepct to see an ownerless page
* the only time we expect to see an ownerless page
* (i.e. a page with no uobject and !PQ_ANON) is if an
* anon has loaned a page from a uvm_object and the
* uvm_object has dropped the ownership. in that
@ -427,17 +473,12 @@ uvmpd_scan_inactive(pglst)

/* is page part of an anon or ownerless ? */
if ((p->pqflags & PQ_ANON) || p->uobject == NULL) {

if (vnode_only) {
uvm_pageactivate(p);
continue;
}
anon = p->uanon;

#ifdef DIAGNOSTIC
/* to be on inactive q, page must be part
* of _something_ */
if (anon == NULL)
panic("pagedaemon: page with no anon "
"or object detected - loop 1");
#endif

KASSERT(anon != NULL);
if (!simple_lock_try(&anon->an_lock))
/* lock failed, skip this page */
continue;
@ -446,41 +487,38 @@ uvmpd_scan_inactive(pglst)
* if the page is ownerless, claim it in the
* name of "anon"!
*/
if ((p->pqflags & PQ_ANON) == 0) {
#ifdef DIAGNOSTIC
if (p->loan_count < 1)
panic("pagedaemon: non-loaned "
"ownerless page detected -"
" loop 1");
#endif
p->loan_count--;
p->pqflags |= PQ_ANON; /* anon now owns it */
}

if ((p->pqflags & PQ_ANON) == 0) {
KASSERT(p->loan_count > 0);
p->loan_count--;
p->pqflags |= PQ_ANON;
/* anon now owns it */
}
if (p->flags & PG_BUSY) {
simple_unlock(&anon->an_lock);
uvmexp.pdbusy++;
/* someone else owns page, skip it */
continue;
}

uvmexp.pdanscan++;

} else {

uobj = p->uobject;

KASSERT(uobj != NULL);
if (vnode_only &&
uobj->pgops != &uvm_vnodeops) {
uvm_pageactivate(p);
continue;
}
if (!simple_lock_try(&uobj->vmobjlock))
/* lock failed, skip this page */
continue;
continue;

if (p->flags & PG_BUSY) {
simple_unlock(&uobj->vmobjlock);
uvmexp.pdbusy++;
/* someone else owns page, skip it */
continue;
continue;
}

uvmexp.pdobscan++;
}

@ -498,21 +536,18 @@ uvmpd_scan_inactive(pglst)
simple_unlock(&uvm.swap_data_lock);
}

/* zap all mappings with pmap_page_protect... */
pmap_page_protect(p, VM_PROT_NONE);
uvm_pagefree(p);
uvmexp.pdfreed++;


if (anon) {
#ifdef DIAGNOSTIC

/*
* an anonymous page can only be clean
* if it has valid backing store.
* if it has backing store assigned.
*/
if (anon->an_swslot == 0)
panic("pagedaemon: clean anon "
"page without backing store?");
#endif

KASSERT(anon->an_swslot != 0);

/* remove from object */
anon->u.an_page = NULL;
simple_unlock(&anon->an_lock);
@ -528,6 +563,7 @@ uvmpd_scan_inactive(pglst)
* this page is dirty, skip it if we'll have met our
* free target when all the current pageouts complete.
*/

if (free + uvmexp.paging > uvmexp.freetarg << 2) {
if (anon) {
simple_unlock(&anon->an_lock);
@ -543,11 +579,8 @@ uvmpd_scan_inactive(pglst)
* reactivate it so that we eventually cycle
* all pages thru the inactive queue.
*/
#ifdef DIAGNOSTIC
if (uvmexp.swpgonly > uvmexp.swpages) {
panic("uvmexp.swpgonly botch");
}
#endif

KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
if ((p->pqflags & PQ_SWAPBACKED) &&
uvmexp.swpgonly == uvmexp.swpages) {
dirtyreacts++;
@ -565,11 +598,8 @@ uvmpd_scan_inactive(pglst)
* is full, free any swap allocated to the page
* so that other pages can be paged out.
*/
#ifdef DIAGNOSTIC
if (uvmexp.swpginuse > uvmexp.swpages) {
panic("uvmexp.swpginuse botch");
}
#endif

KASSERT(uvmexp.swpginuse <= uvmexp.swpages);
if ((p->pqflags & PQ_SWAPBACKED) &&
uvmexp.swpginuse == uvmexp.swpages) {

@ -588,26 +618,25 @@ uvmpd_scan_inactive(pglst)
* the page we are looking at is dirty. we must
* clean it before it can be freed. to do this we
* first mark the page busy so that no one else will
* touch the page. we write protect all the mappings
* of the page so that no one touches it while it is
* in I/O.
* touch the page.
*/

swap_backed = ((p->pqflags & PQ_SWAPBACKED) != 0);
p->flags |= PG_BUSY; /* now we own it */
UVM_PAGE_OWN(p, "scan_inactive");
pmap_page_protect(p, VM_PROT_READ);
uvmexp.pgswapout++;

/*
* for swap-backed pages we need to (re)allocate
* swap space.
*/

if (swap_backed) {

/*
* free old swap slot (if any)
*/

if (anon) {
if (anon->an_swslot) {
uvm_swap_free(anon->an_swslot,
@ -622,13 +651,11 @@ uvmpd_scan_inactive(pglst)
/*
* start new cluster (if necessary)
*/
if (swslot == 0) {
/* want this much */
swnpages = MAXBSIZE >> PAGE_SHIFT;

if (swslot == 0) {
swnpages = MAXBSIZE >> PAGE_SHIFT;
swslot = uvm_swap_alloc(&swnpages,
TRUE);

if (swslot == 0) {
/* no swap? give up! */
p->flags &= ~PG_BUSY;
@ -647,6 +674,7 @@ uvmpd_scan_inactive(pglst)
/*
* add block to cluster
*/

swpps[swcpages] = p;
if (anon)
anon->an_swslot = swslot + swcpages;
@ -655,11 +683,7 @@ uvmpd_scan_inactive(pglst)
p->offset >> PAGE_SHIFT,
swslot + swcpages);
swcpages++;

/* done (swap-backed) */
}

/* end: if (p) ["if we have new page to consider"] */
} else {

/* if p == NULL we must be doing a last swap i/o */
@ -667,16 +691,16 @@ uvmpd_scan_inactive(pglst)
}

/*
* now consider doing the pageout.
* now consider doing the pageout.
*
* for swap-backed pages, we do the pageout if we have either
* filled the cluster (in which case (swnpages == swcpages) or
* for swap-backed pages, we do the pageout if we have either
* filled the cluster (in which case (swnpages == swcpages) or
* run out of pages (p == NULL).
*
* for object pages, we always do the pageout.
*/
if (swap_backed) {

if (swap_backed) {
if (p) { /* if we just added a page to cluster */
if (anon)
simple_unlock(&anon->an_lock);
@ -699,21 +723,18 @@ uvmpd_scan_inactive(pglst)
if (swcpages < swnpages) {
uvm_swap_free(swslot + swcpages,
(swnpages - swcpages));
}

}
} else {

/* normal object pageout */
ppsp = pps;
npages = sizeof(pps) / sizeof(struct vm_page *);
/* not looked at because PGO_ALLPAGES is set */
start = 0;

}

/*
* now do the pageout.
*
*
* for swap_backed pages we have already built the cluster.
* for !swap_backed pages, uvm_pager_put will call the object's
* "make put cluster" function to build a cluster on our behalf.
@ -734,7 +755,7 @@ uvmpd_scan_inactive(pglst)

/* locked: uobj (if !swap_backed), page queues */
uvmexp.pdpageouts++;
result = uvm_pager_put((swap_backed) ? NULL : uobj, p,
result = uvm_pager_put(swap_backed ? NULL : uobj, p,
&ppsp, &npages, PGO_ALLPAGES|PGO_PDFREECLUST, start, 0);
/* locked: uobj (if !swap_backed && result != PEND) */
/* unlocked: pageqs, object (if swap_backed ||result == PEND) */
@ -762,21 +783,27 @@ uvmpd_scan_inactive(pglst)

if (result == VM_PAGER_PEND) {
uvmexp.paging += npages;
uvm_lock_pageq(); /* relock page queues */
uvm_lock_pageq();
uvmexp.pdpending++;
if (p) {
if (p->pqflags & PQ_INACTIVE)
/* reload! */
nextpg = p->pageq.tqe_next;
nextpg = TAILQ_NEXT(p, pageq);
else
/* reload! */
nextpg = pglst->tqh_first;
} else {
nextpg = NULL; /* done list */
nextpg = TAILQ_FIRST(pglst);
} else {
nextpg = NULL;
}
continue;
}

if (result == VM_PAGER_ERROR &&
curproc == uvm.pagedaemon_proc) {
uvm_lock_pageq();
nextpg = TAILQ_NEXT(p, pageq);
uvm_pageactivate(p);
continue;
}

/*
* clean up "p" if we have one
*/
@ -812,12 +839,6 @@ uvmpd_scan_inactive(pglst)
simple_lock(&uobj->vmobjlock);
}

#ifdef DIAGNOSTIC
if (result == VM_PAGER_UNLOCK)
panic("pagedaemon: pageout returned "
"invalid 'unlock' code");
#endif

/* handle PG_WANTED now */
if (p->flags & PG_WANTED)
/* still holding object lock */
@ -837,24 +858,19 @@ uvmpd_scan_inactive(pglst)
pmap_page_protect(p, VM_PROT_NONE);
anon = NULL;
uvm_lock_pageq();
nextpg = p->pageq.tqe_next;
nextpg = TAILQ_NEXT(p, pageq);
/* free released page */
uvm_pagefree(p);

} else {

#ifdef DIAGNOSTIC
if (uobj->pgops->pgo_releasepg == NULL)
panic("pagedaemon: no "
"pgo_releasepg function");
#endif

/*
/*
* pgo_releasepg nukes the page and
* gets "nextpg" for us. it returns
* with the page queues locked (when
* given nextpg ptr).
*/

if (!uobj->pgops->pgo_releasepg(p,
&nextpg))
/* uobj died after release */
@ -864,35 +880,27 @@ uvmpd_scan_inactive(pglst)
* lock page queues here so that they're
* always locked at the end of the loop.
*/

uvm_lock_pageq();
}

} else { /* page was not released during I/O */

uvm_lock_pageq();
nextpg = p->pageq.tqe_next;

nextpg = TAILQ_NEXT(p, pageq);
if (result != VM_PAGER_OK) {

/* pageout was a failure... */
if (result != VM_PAGER_AGAIN)
uvm_pageactivate(p);
pmap_clear_reference(p);
/* XXXCDC: if (swap_backed) FREE p's
* swap block? */

} else {

/* pageout was a success... */
pmap_clear_reference(p);
pmap_clear_modify(p);
p->flags |= PG_CLEAN;
/* XXX: could free page here, but old
* pagedaemon does not */

}
}


/*
* drop object lock (if there is an object left). do
* a safety check of nextpg to make sure it is on the
@ -906,26 +914,27 @@ uvmpd_scan_inactive(pglst)
else if (uobj)
simple_unlock(&uobj->vmobjlock);

} /* if (p) */ else {
} else {

/*
* if p is null in this loop, make sure it stays null
* in the next loop.
*/

/* if p is null in this loop, make sure it stays null
* in next loop */
nextpg = NULL;

/*
* lock page queues here just so they're always locked
* at the end of the loop.
*/

uvm_lock_pageq();
}

if (nextpg && (nextpg->pqflags & PQ_INACTIVE) == 0) {
printf("pagedaemon: invalid nextpg! reverting to "
"queue head\n");
nextpg = pglst->tqh_first; /* reload! */
nextpg = TAILQ_FIRST(pglst); /* reload! */
}

} /* end of "inactive" 'for' loop */
}
return (retval);
}

@ -945,10 +954,8 @@ uvmpd_scan()
UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist);

uvmexp.pdrevs++; /* counter */
uobj = NULL;

#ifdef __GNUC__
uobj = NULL; /* XXX gcc */
#endif
/*
* get current "free" page count
*/
@ -962,13 +969,11 @@ uvmpd_scan()
* we need to unlock the page queues for this.
*/
if (free < uvmexp.freetarg) {

uvmexp.pdswout++;
UVMHIST_LOG(pdhist,"  free %d < target %d: swapout", free,
uvmexp.freetarg, 0, 0);
uvm_unlock_pageq();
uvm_swapout_threads();
pmap_update(); /* update so we can scan inactive q */
uvm_lock_pageq();

}
@ -984,8 +989,8 @@ uvmpd_scan()
UVMHIST_LOG(pdhist, "  starting 'free' loop",0,0,0,0);

/*
* do loop #1! alternate starting queue between swap and object based
* on the low bit of uvmexp.pdrevs (which we bump by one each call).
* alternate starting queue between swap and object based on the
* low bit of uvmexp.pdrevs (which we bump by one each call).
*/

got_it = FALSE;
@ -1009,6 +1014,7 @@ uvmpd_scan()
* detect if we're not going to be able to page anything out
* until we free some swap resources from active pages.
*/

swap_shortage = 0;
if (uvmexp.free < uvmexp.freetarg &&
uvmexp.swpginuse == uvmexp.swpages &&
@ -1016,13 +1022,13 @@ uvmpd_scan()
pages_freed == 0) {
swap_shortage = uvmexp.freetarg - uvmexp.free;
}


UVMHIST_LOG(pdhist, "  loop 2: inactive_shortage=%d swap_shortage=%d",
inactive_shortage, swap_shortage,0,0);
for (p = TAILQ_FIRST(&uvm.page_active);
for (p = TAILQ_FIRST(&uvm.page_active);
p != NULL && (inactive_shortage > 0 || swap_shortage > 0);
p = nextpg) {
nextpg = p->pageq.tqe_next;
nextpg = TAILQ_NEXT(p, pageq);
if (p->flags & PG_BUSY)
continue; /* quick check before trying to lock */

@ -1031,22 +1037,13 @@ uvmpd_scan()
*/
/* is page anon owned or ownerless? */
if ((p->pqflags & PQ_ANON) || p->uobject == NULL) {

#ifdef DIAGNOSTIC
if (p->uanon == NULL)
panic("pagedaemon: page with no anon or "
"object detected - loop 2");
#endif
KASSERT(p->uanon != NULL);
if (!simple_lock_try(&p->uanon->an_lock))
continue;

/* take over the page? */
if ((p->pqflags & PQ_ANON) == 0) {
#ifdef DIAGNOSTIC
if (p->loan_count < 1)
panic("pagedaemon: non-loaned "
"ownerless page detected - loop 2");
#endif
KASSERT(p->loan_count > 0);
p->loan_count--;
p->pqflags |= PQ_ANON;
}
@ -1054,9 +1051,11 @@ uvmpd_scan()
if (!simple_lock_try(&p->uobject->vmobjlock))
continue;
}

/*
* skip this page if it's busy.
*/

if ((p->flags & PG_BUSY) != 0) {
if (p->pqflags & PQ_ANON)
simple_unlock(&p->uanon->an_lock);
@ -1064,11 +1063,12 @@ uvmpd_scan()
simple_unlock(&p->uobject->vmobjlock);
continue;
}


/*
* if there's a shortage of swap, free any swap allocated
* to this page so that other pages can be paged out.
*/

if (swap_shortage > 0) {
if ((p->pqflags & PQ_ANON) && p->uanon->an_swslot) {
uvm_swap_free(p->uanon->an_swslot, 1);
@ -1086,11 +1086,12 @@ uvmpd_scan()
}
}
}


/*
* deactivate this page if there's a shortage of
* inactive pages.
*/

if (inactive_shortage > 0) {
pmap_page_protect(p, VM_PROT_NONE);
/* no need to check wire_count as pg is "active" */
@ -1098,7 +1099,6 @@ uvmpd_scan()
uvmexp.pddeact++;
inactive_shortage--;
}

if (p->pqflags & PQ_ANON)
simple_unlock(&p->uanon->an_lock);
else

@ -1,4 +1,4 @@
/* $NetBSD: uvm_swap.c,v 1.40 2000/11/17 11:39:39 mrg Exp $ */
/* $NetBSD: uvm_swap.c,v 1.41 2000/11/27 08:40:05 chs Exp $ */

/*
* Copyright (c) 1995, 1996, 1997 Matthew R. Green
@ -34,6 +34,7 @@
#include "fs_nfs.h"
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>
@ -77,11 +78,6 @@
* by the "swap_priority" global var. each "swappri" contains a
* CIRCLEQ of "swapdev" structures at that priority.
*
* the system maintains a fixed pool of "swapbuf" structures for use
* at swap i/o time. a swapbuf includes a "buf" structure and an
* "aiodone" [we want to avoid malloc()'ing anything at swapout time
* since memory may be low].
*
* locking:
* - swap_syscall_lock (sleep lock): this lock serializes the swapctl
* system call and prevents the swap priority list from changing
@ -89,8 +85,6 @@
* - uvm.swap_data_lock (simple_lock): this lock protects all swap data
* structures including the priority list, the swapdev structures,
* and the swapmap extent.
* - swap_buf_lock (simple_lock): this lock protects the free swapbuf
* pool.
*
* each swap device has the following info:
* - swap device in use (could be disabled, preventing future use)
@ -157,15 +151,6 @@ struct swappri {
LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
};

/*
* swapbuf, swapbuffer plus async i/o info
*/
struct swapbuf {
struct buf sw_buf; /* a buffer structure */
struct uvm_aiodesc sw_aio; /* aiodesc structure, used if ASYNC */
SIMPLEQ_ENTRY(swapbuf) sw_sq; /* free list pointer */
};

/*
* The following two structures are used to keep track of data transfers
* on swap devices associated with regular files.
@ -222,8 +207,6 @@ cdev_decl(sw);
* local variables
*/
static struct extent *swapmap; /* controls the mapping of /dev/drum */
SIMPLEQ_HEAD(swapbufhead, swapbuf);
struct pool *swapbuf_pool;

/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
@ -250,8 +233,6 @@ static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
static void sw_reg_iodone __P((struct buf *));
static void sw_reg_start __P((struct swapdev *));

static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
static void uvm_swap_bufdone __P((struct buf *));
static int uvm_swap_io __P((struct vm_page **, int, int, int));

/*
@ -292,18 +273,9 @@ uvm_swap_init()
panic("uvm_swap_init: extent_create failed");

/*
* allocate our private pool of "swapbuf" structures (includes
* a "buf" structure). ["nswbuf" comes from param.c and can
* be adjusted by MD code before we get here].
* allocate pools for structures used for swapping to files.
*/

swapbuf_pool =
pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
NULL, NULL, 0);
if (swapbuf_pool == NULL)
panic("swapinit: pool_create failed");
/* XXX - set a maximum on swapbuf_pool? */

vndxfer_pool =
pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0,
NULL, NULL, 0);
@ -1120,7 +1092,7 @@ swstrategy(bp)
* be yanked out from under us because we are holding resources
* in it (i.e. the blocks we are doing I/O on).
*/
pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
simple_lock(&uvm.swap_data_lock);
sdp = swapdrum_getsdp(pageno);
simple_unlock(&uvm.swap_data_lock);
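
The small-looking change above, dbtob((int64_t)bp->b_blkno), matters because dbtob() expands to a left shift: done in 32-bit arithmetic the byte offset wraps once it passes 4GB. A standalone sketch of the failure mode, with dbtob() modeled as a shift by an assumed 512-byte block size:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define DEV_BSHIFT	9		/* assumed 512-byte disk blocks */

int
main(void)
{
	uint32_t blkno = 9000000;	/* about 4.3GB worth of 512-byte blocks */

	uint32_t bad = blkno << DEV_BSHIFT;		/* wraps modulo 2^32 */
	uint64_t good = (uint64_t)blkno << DEV_BSHIFT;	/* widened first, correct */

	printf("32-bit: %" PRIu32 "\n64-bit: %" PRIu64 "\n", bad, good);
	return 0;
}
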
@ -1139,7 +1111,7 @@ swstrategy(bp)
pageno -= sdp->swd_drumoffset; /* page # on swapdev */
bn = btodb(pageno << PAGE_SHIFT); /* convert to diskblock */

UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld",
((bp->b_flags & B_READ) == 0) ? "write" : "read",
sdp->swd_drumoffset, bn, bp->b_bcount);

@ -1174,14 +1146,14 @@ swstrategy(bp)
vp->v_numoutput++; /* put it on swapdev */
}

/*
/*
* dissassocate buffer with /dev/drum vnode
* [could be null if buf was from physio]
*/
if (bp->b_vp != NULLVP)
if (bp->b_vp != NULL)
brelvp(bp);

/*
/*
* finally plug in swapdev vnode and start I/O
*/
bp->b_vp = vp;
@ -1279,18 +1251,15 @@ sw_reg_strategy(sdp, bp, bn)

/*
* compute the size ("sz") of this transfer (in bytes).
* XXXCDC: ignores read-ahead for non-zero offset
*/
if ((off = (byteoff % sdp->swd_bsize)) != 0)
sz = sdp->swd_bsize - off;
else
sz = (1 + nra) * sdp->swd_bsize;

if (resid < sz)
off = byteoff % sdp->swd_bsize;
sz = (1 + nra) * sdp->swd_bsize - off;
if (sz > resid)
sz = resid;

UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
sdp->swd_vp, vp, byteoff, nbn);
UVMHIST_LOG(pdhist, "sw_reg_strategy: "
"vp %p/%p offset 0x%x/0x%x",
sdp->swd_vp, vp, byteoff, nbn);
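
The rewritten size computation above folds the old two-branch logic into one formula and retires the old XXXCDC caveat: for block-aligned requests the two agree, while for an unaligned offset the new form keeps the read-ahead instead of stopping at the block boundary. A standalone comparison under assumed values (64KB file-system blocks, one block of read-ahead):

#include <stdio.h>

static long
old_sz(long byteoff, long bsize, long nra, long resid)
{
	long off = byteoff % bsize, sz;

	if (off != 0)
		sz = bsize - off;		/* stop at the block boundary */
	else
		sz = (1 + nra) * bsize;		/* read-ahead only when aligned */
	if (resid < sz)
		sz = resid;
	return sz;
}

static long
new_sz(long byteoff, long bsize, long nra, long resid)
{
	long off = byteoff % bsize;
	long sz = (1 + nra) * bsize - off;	/* read-ahead in every case */

	if (sz > resid)
		sz = resid;
	return sz;
}

int
main(void)
{
	/* aligned request: both yield 131072 */
	printf("%ld %ld\n", old_sz(0, 65536, 1, 1L << 20),
	    new_sz(0, 65536, 1, 1L << 20));
	/* unaligned request: 61440 vs 126976 */
	printf("%ld %ld\n", old_sz(4096, 65536, 1, 1L << 20),
	    new_sz(4096, 65536, 1, 1L << 20));
	return 0;
}
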

/*
* now get a buf structure. note that the vb_buf is
@ -1303,42 +1272,13 @@ sw_reg_strategy(sdp, bp, bn)
nbp->vb_buf.b_bufsize = sz;
nbp->vb_buf.b_error = 0;
nbp->vb_buf.b_data = addr;
nbp->vb_buf.b_lblkno = 0;
nbp->vb_buf.b_blkno = nbn + btodb(off);
nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
nbp->vb_buf.b_proc = bp->b_proc;
nbp->vb_buf.b_iodone = sw_reg_iodone;
nbp->vb_buf.b_vp = NULLVP;
nbp->vb_buf.b_vnbufs.le_next = NOLIST;
nbp->vb_buf.b_rcred = sdp->swd_cred;
nbp->vb_buf.b_wcred = sdp->swd_cred;
nbp->vb_buf.b_vp = NULL;
LIST_INIT(&nbp->vb_buf.b_dep);

/*
* set b_dirtyoff/end and b_validoff/end. this is
* required by the NFS client code (otherwise it will
* just discard our I/O request).
*/
if (bp->b_dirtyend == 0) {
nbp->vb_buf.b_dirtyoff = 0;
nbp->vb_buf.b_dirtyend = sz;
} else {
nbp->vb_buf.b_dirtyoff =
max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
nbp->vb_buf.b_dirtyend =
min(sz,
max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
}
if (bp->b_validend == 0) {
nbp->vb_buf.b_validoff = 0;
nbp->vb_buf.b_validend = sz;
} else {
nbp->vb_buf.b_validoff =
max(0, bp->b_validoff - (bp->b_bcount-resid));
nbp->vb_buf.b_validend =
min(sz,
max(0, bp->b_validend - (bp->b_bcount-resid)));
}

nbp->vb_xfer = vnx; /* patch it back in to vnx */

/*
@ -1352,7 +1292,7 @@ sw_reg_strategy(sdp, bp, bn)
vnx->vx_pending++;

/* assoc new buffer with underlying vnode */
bgetvp(vp, &nbp->vb_buf);
bgetvp(vp, &nbp->vb_buf);

/* sort it in and start I/O if we are not over our limit */
disksort_blkno(&sdp->swd_tab, &nbp->vb_buf);
@ -1411,6 +1351,7 @@ sw_reg_start(sdp)
bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
if ((bp->b_flags & B_READ) == 0)
bp->b_vp->v_numoutput++;

VOP_STRATEGY(bp);
}
sdp->swd_flags &= ~SWF_BUSY;
@ -1455,11 +1396,9 @@ sw_reg_iodone(bp)
}

/*
* disassociate this buffer from the vnode (if any).
* disassociate this buffer from the vnode.
*/
if (vbp->vb_buf.b_vp != NULLVP) {
brelvp(&vbp->vb_buf);
}
brelvp(&vbp->vb_buf);

/*
* kill vbp structure
@ -1598,8 +1537,9 @@ uvm_swap_markbad(startslot, nslots)
* we assume here that the range of slots will all be within
* one swap device.
*/
sdp->swd_npgbad += nslots;

sdp->swd_npgbad += nslots;
UVMHIST_LOG(pdhist, "now %d bad", sdp->swd_npgbad, 0,0,0);
simple_unlock(&uvm.swap_data_lock);
}

@ -1735,15 +1675,18 @@ uvm_swap_io(pps, startslot, npages, flags)
int startslot, npages, flags;
{
daddr_t startblk;
struct swapbuf *sbp;
struct buf *bp;
vaddr_t kva;
int result, s, mapinflags, pflag;
boolean_t write, async;
UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);

UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
startslot, npages, flags, 0);

write = (flags & B_READ) == 0;
async = (flags & B_ASYNC) != 0;

/*
* convert starting drum slot to block number
*/
@ -1751,43 +1694,37 @@ uvm_swap_io(pps, startslot, npages, flags)

/*
* first, map the pages into the kernel (XXX: currently required
* by buffer system). note that we don't let pagermapin alloc
* an aiodesc structure because we don't want to chance a malloc.
* we've got our own pool of aiodesc structures (in swapbuf).
* by buffer system).
*/
mapinflags = (flags & B_READ) ? UVMPAGER_MAPIN_READ :
UVMPAGER_MAPIN_WRITE;
if ((flags & B_ASYNC) == 0)

mapinflags = !write ? UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE;
if (!async)
mapinflags |= UVMPAGER_MAPIN_WAITOK;
kva = uvm_pagermapin(pps, npages, NULL, mapinflags);
kva = uvm_pagermapin(pps, npages, mapinflags);
if (kva == 0)
return (VM_PAGER_AGAIN);

/*
* now allocate a swap buffer off of freesbufs
* now allocate a buf for the i/o.
* [make sure we don't put the pagedaemon to sleep...]
*/
s = splbio();
pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
? 0
: PR_WAITOK;
sbp = pool_get(swapbuf_pool, pflag);
splx(s); /* drop splbio */
pflag = (async || curproc == uvm.pagedaemon_proc) ? 0 : PR_WAITOK;
bp = pool_get(&bufpool, pflag);
splx(s);

/*
* if we failed to get a swapbuf, return "try again"
* if we failed to get a buf, return "try again"
*/
if (sbp == NULL)
if (bp == NULL)
return (VM_PAGER_AGAIN);

/*
* fill in the bp/sbp. we currently route our i/o through
* /dev/drum's vnode [swapdev_vp].
*/
bp = &sbp->sw_buf;
bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
bp->b_proc = &proc0; /* XXX */
bp->b_rcred = bp->b_wcred = proc0.p_ucred;
bp->b_vnbufs.le_next = NOLIST;
bp->b_data = (caddr_t)kva;
bp->b_blkno = startblk;
@ -1799,49 +1736,43 @@ uvm_swap_io(pps, startslot, npages, flags)
/* XXXMRG: probably -- this is obviously something inherited... */
if (swapdev_vp->v_type == VBLK)
bp->b_dev = swapdev_vp->v_rdev;
bp->b_bcount = npages << PAGE_SHIFT;
bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
LIST_INIT(&bp->b_dep);

/*
* for pageouts we must set "dirtyoff" [NFS client code needs it].
* and we bump v_numoutput (counter of number of active outputs).
* bump v_numoutput (counter of number of active outputs).
*/
if ((bp->b_flags & B_READ) == 0) {
bp->b_dirtyoff = 0;
bp->b_dirtyend = npages << PAGE_SHIFT;
if (write) {
s = splbio();
swapdev_vp->v_numoutput++;
splx(s);
}

/*
* for async ops we must set up the aiodesc and setup the callback
* XXX: we expect no async-reads, but we don't prevent it here.
* for async ops we must set up the iodone handler.
*/
if (flags & B_ASYNC) {
sbp->sw_aio.aiodone = uvm_swap_aiodone;
sbp->sw_aio.kva = kva;
sbp->sw_aio.npages = npages;
sbp->sw_aio.pd_ptr = sbp; /* backpointer */
bp->b_flags |= B_CALL; /* set callback */
bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
if (async) {
/* XXXUBC pagedaemon */
bp->b_flags |= B_CALL | (curproc == uvm.pagedaemon_proc ?
B_PDAEMON : 0);
bp->b_iodone = uvm_aio_biodone;
UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
}
UVMHIST_LOG(pdhist,
"about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
"about to start io: data = %p blkno = 0x%x, bcount = %ld",
bp->b_data, bp->b_blkno, bp->b_bcount, 0);

/*
* now we start the I/O, and if async, return.
*/
VOP_STRATEGY(bp);
if (flags & B_ASYNC)
if (async)
return (VM_PAGER_PEND);

/*
* must be sync i/o. wait for it to finish
*/
bp->b_error = biowait(bp);
(void) biowait(bp);
result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;

/*
@ -1850,13 +1781,14 @@ uvm_swap_io(pps, startslot, npages, flags)
uvm_pagermapout(kva, npages);

/*
* now dispose of the swap buffer
* now dispose of the buf
*/
s = splbio();
if (bp->b_vp)
brelvp(bp);

pool_put(swapbuf_pool, sbp);
if (write)
vwakeup(bp);
pool_put(&bufpool, bp);
splx(s);

/*
@ -1865,96 +1797,3 @@ uvm_swap_io(pps, startslot, npages, flags)
UVMHIST_LOG(pdhist, "<- done (sync) result=%d", result, 0, 0, 0);
return (result);
}

/*
* uvm_swap_bufdone: called from the buffer system when the i/o is done
*/
static void
uvm_swap_bufdone(bp)
struct buf *bp;
{
struct swapbuf *sbp = (struct swapbuf *) bp;
int s = splbio();
UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);

UVMHIST_LOG(pdhist, "cleaning buf %p", buf, 0, 0, 0);
#ifdef DIAGNOSTIC
/*
* sanity check: swapbufs are private, so they shouldn't be wanted
*/
if (bp->b_flags & B_WANTED)
panic("uvm_swap_bufdone: private buf wanted");
#endif

/*
* drop the buffer's reference to the vnode.
*/
if (bp->b_vp)
brelvp(bp);

/*
* now put the aio on the uvm.aio_done list and wake the
* pagedaemon (which will finish up our job in its context).
*/
simple_lock(&uvm.pagedaemon_lock); /* locks uvm.aio_done */
TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
simple_unlock(&uvm.pagedaemon_lock);

wakeup(&uvm.pagedaemon);
splx(s);
}

/*
* uvm_swap_aiodone: aiodone function for anonymous memory
*
* => this is called in the context of the pagedaemon (but with the
* page queues unlocked!)
* => our "aio" structure must be part of a "swapbuf"
*/
static void
uvm_swap_aiodone(aio)
struct uvm_aiodesc *aio;
{
struct swapbuf *sbp = aio->pd_ptr;
struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
int lcv, s;
vaddr_t addr;
UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);

UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
#ifdef DIAGNOSTIC
/*
* sanity check
*/
if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
panic("uvm_swap_aiodone: aio too big!");
#endif

/*
* first, we have to recover the page pointers (pps) by poking in the
* kernel pmap (XXX: should be saved in the buf structure).
*/
for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
addr += PAGE_SIZE, lcv++) {
pps[lcv] = uvm_pageratop(addr);
}

/*
* now we can dispose of the kernel mappings of the buffer
*/
uvm_pagermapout(aio->kva, aio->npages);

/*
* now we can dispose of the pages by using the dropcluster function
* [note that we have no "page of interest" so we pass in null]
*/
uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
PGO_PDFREECLUST);

/*
* finally, we can dispose of the swapbuf
*/
s = splbio();
pool_put(swapbuf_pool, sbp);
splx(s);
}
1546	sys/uvm/uvm_vnode.c
File diff suppressed because it is too large
@ -1,4 +1,4 @@
/* $NetBSD: uvm_vnode.h,v 1.9 2000/03/26 20:54:48 kleink Exp $ */
/* $NetBSD: uvm_vnode.h,v 1.10 2000/11/27 08:40:06 chs Exp $ */

/*
*
@ -54,56 +54,6 @@ struct uvm_vnode {
int u_flags; /* flags */
int u_nio; /* number of running I/O requests */
voff_t u_size; /* size of object */

/* the following entry is locked by uvn_wl_lock */
LIST_ENTRY(uvm_vnode) u_wlist; /* list of writeable vnode objects */

/* the following entry is locked by uvn_sync_lock */
SIMPLEQ_ENTRY(uvm_vnode) u_syncq; /* vnode objects due for a "sync" */
};

/*
* u_flags values
*/
#define UVM_VNODE_VALID 0x001 /* we are attached to the vnode */
#define UVM_VNODE_CANPERSIST 0x002 /* we can persist after ref == 0 */
#define UVM_VNODE_ALOCK 0x004 /* uvn_attach is locked out */
#define UVM_VNODE_DYING 0x008 /* final detach/terminate in
progress */
#define UVM_VNODE_RELKILL 0x010 /* uvn should be killed by releasepg
when final i/o is done */
#define UVM_VNODE_WANTED 0x020 /* someone is waiting for alock,
dying, or relkill to clear */
#define UVM_VNODE_VNISLOCKED 0x040 /* underlying vnode struct is locked
(valid when DYING is true) */
#define UVM_VNODE_IOSYNC 0x080 /* I/O sync in progress ... setter
sleeps on &uvn->u_nio */
#define UVM_VNODE_IOSYNCWANTED 0x100 /* a process is waiting for the
i/o sync to clear so it can do
i/o */
#define UVM_VNODE_WRITEABLE 0x200 /* uvn has pages that are writeable */

/*
* UVM_VNODE_BLOCKED: any condition that should new processes from
* touching the vnode [set WANTED and sleep to wait for it to clear]
*/
#define UVM_VNODE_BLOCKED (UVM_VNODE_ALOCK|UVM_VNODE_DYING|UVM_VNODE_RELKILL)

#ifdef _KERNEL

/*
* prototypes
*/

#if 0
/*
* moved uvn_attach to uvm_extern.h because uvm_vnode.h is needed to
* include sys/vnode.h, and files that include sys/vnode.h don't know
* what a vm_prot_t is.
*/
struct uvm_object *uvn_attach __P((void *, vm_prot_t));
#endif

#endif /* _KERNEL */

#endif /* _UVM_UVM_VNODE_H_ */