From b397c875ae91c0eea34205c51de1153a7d8510b6 Mon Sep 17 00:00:00 2001 From: perseant Date: Mon, 17 Feb 2003 23:48:08 +0000 Subject: [PATCH] Add code to UBCify LFS. This is still behind "#ifdef LFS_UBC" for now (there are still some details to work out) but expect that to go away soon. To support these basic changes (creation of lfs_putpages, lfs_gop_write, mods to lfs_balloc) several other changes were made, to wit: * Create a writer daemon kernel thread whose purpose is to handle page writes for the pagedaemon, but which also takes over some of the functions of lfs_check(). This thread is started the first time an LFS is mounted. * Add a "flags" parameter to GOP_SIZE. Current values are GOP_SIZE_READ, meaning that the call should return the size of the in-core version of the file, and GOP_SIZE_WRITE, meaning that it should return the on-disk size. One of GOP_SIZE_READ or GOP_SIZE_WRITE must be specified. * Instead of using malloc(...M_WAITOK) for everything, reserve enough resources to get by and use malloc(...M_NOWAIT), using the reserves if necessary. Use the pool subsystem for structures small enough that this is feasible. This also obsoletes LFS_THROTTLE. And a few that are not strictly necessary: * Moves the LFS inode extensions off onto a separately allocated structure; getting closer to LFS as an LKM. "Welcome to 1.6O." * Unified GOP_ALLOC between FFS and LFS. * Update LFS copyright headers to correct values. * Actually cast to unsigned in lfs_shellsort, like the comment says. * Keep track of which segments were empty before the previous checkpoint; any segments that pass two checkpoints both dirty and empty can be summarily cleaned. Do this. Right now lfs_segclean still works, but this should be turned into an effectless compatibility syscall. 
--- libexec/lfs_cleanerd/print.c | 6 +- sbin/fsck_lfs/inode.c | 10 +- sbin/fsck_lfs/pass0.c | 10 +- sys/conf/osrelease.sh | 4 +- sys/miscfs/genfs/genfs_node.h | 14 +- sys/miscfs/genfs/genfs_vnops.c | 19 +- sys/nfs/nfs_node.c | 11 +- sys/sys/param.h | 4 +- sys/ufs/ffs/ffs_extern.h | 4 +- sys/ufs/ffs/ffs_vfsops.c | 6 +- sys/ufs/ffs/ffs_vnops.c | 10 +- sys/ufs/lfs/TODO | 23 +- sys/ufs/lfs/lfs.h | 122 +++++- sys/ufs/lfs/lfs_alloc.c | 75 +--- sys/ufs/lfs/lfs_balloc.c | 120 ++--- sys/ufs/lfs/lfs_bio.c | 128 +++--- sys/ufs/lfs/lfs_cksum.c | 6 +- sys/ufs/lfs/lfs_debug.c | 8 +- sys/ufs/lfs/lfs_extern.h | 37 +- sys/ufs/lfs/lfs_inode.c | 108 +++-- sys/ufs/lfs/lfs_segment.c | 726 +++++++++++++++++++------------ sys/ufs/lfs/lfs_subr.c | 331 ++++++++++---- sys/ufs/lfs/lfs_syscalls.c | 74 ++-- sys/ufs/lfs/lfs_vfsops.c | 434 +++++++++++++++++- sys/ufs/lfs/lfs_vnops.c | 774 +++++++++++++++++++++++++++++++-- sys/ufs/ufs/inode.h | 14 +- sys/ufs/ufs/ufs_extern.h | 3 +- sys/ufs/ufs/ufs_inode.c | 8 +- sys/ufs/ufs/ufs_readwrite.c | 21 +- sys/ufs/ufs/ufs_vnops.c | 52 ++- sys/uvm/uvm_page.c | 6 +- sys/uvm/uvm_pager.h | 3 +- usr.sbin/dumplfs/dumplfs.c | 6 +- usr.sbin/quotaon/quotaon.c | 7 +- 34 files changed, 2401 insertions(+), 783 deletions(-) diff --git a/libexec/lfs_cleanerd/print.c b/libexec/lfs_cleanerd/print.c index 60f691707abd..9097e83975c4 100644 --- a/libexec/lfs_cleanerd/print.c +++ b/libexec/lfs_cleanerd/print.c @@ -1,4 +1,4 @@ -/* $NetBSD: print.c,v 1.13 2003/01/24 21:55:05 fvdl Exp $ */ +/* $NetBSD: print.c,v 1.14 2003/02/17 23:48:08 perseant Exp $ */ /*- * Copyright (c) 1992, 1993 @@ -38,7 +38,7 @@ #if 0 static char sccsid[] = "from: @(#)print.c 8.1 (Berkeley) 6/4/93"; #else -__RCSID("$NetBSD: print.c,v 1.13 2003/01/24 21:55:05 fvdl Exp $"); +__RCSID("$NetBSD: print.c,v 1.14 2003/02/17 23:48:08 perseant Exp $"); #endif #endif /* not lint */ @@ -261,7 +261,7 @@ dump_super(struct lfs *lfsp) syslog(LOG_DEBUG, "Checkpoint Info\n"); syslog(LOG_DEBUG, "%s%d\t%s0x%X\t%s%d\n", - 
"free ", lfsp->lfs_free, + "freehd ", lfsp->lfs_freehd, "idaddr ", lfsp->lfs_idaddr, "ifile ", lfsp->lfs_ifile); syslog(LOG_DEBUG, "%s%d\t%s%d\t%s%d\n", diff --git a/sbin/fsck_lfs/inode.c b/sbin/fsck_lfs/inode.c index 6bdd1253fe9f..32572a5bc51f 100644 --- a/sbin/fsck_lfs/inode.c +++ b/sbin/fsck_lfs/inode.c @@ -1,4 +1,4 @@ -/* $NetBSD: inode.c,v 1.14 2003/01/24 21:55:10 fvdl Exp $ */ +/* $NetBSD: inode.c,v 1.15 2003/02/17 23:48:09 perseant Exp $ */ /* * Copyright (c) 1997, 1998 @@ -348,8 +348,8 @@ lfs_ginode(ino_t inumber) if (reply("free")) { ifp = lfs_ientry(inumber, &bp); ifp->if_daddr = LFS_UNUSED_DADDR; - ifp->if_nextfree = sblock.lfs_free; - sblock.lfs_free = inumber; + ifp->if_nextfree = sblock.lfs_freehd; + sblock.lfs_freehd = inumber; sbdirty(); dirty(bp); bp->b_flags &= ~B_INUSE; @@ -700,8 +700,8 @@ clri(struct inodesc *idesc, char *type, int flag) ifp = lfs_ientry(idesc->id_number, &bp); ifp->if_daddr = LFS_UNUSED_DADDR; - ifp->if_nextfree = sblock.lfs_free; - sblock.lfs_free = idesc->id_number; + ifp->if_nextfree = sblock.lfs_freehd; + sblock.lfs_freehd = idesc->id_number; sbdirty(); dirty(bp); bp->b_flags &= ~B_INUSE; diff --git a/sbin/fsck_lfs/pass0.c b/sbin/fsck_lfs/pass0.c index 30913dfedd0f..d4d7d1661853 100644 --- a/sbin/fsck_lfs/pass0.c +++ b/sbin/fsck_lfs/pass0.c @@ -1,4 +1,4 @@ -/* $NetBSD: pass0.c,v 1.12 2003/01/24 21:55:10 fvdl Exp $ */ +/* $NetBSD: pass0.c,v 1.13 2003/02/17 23:48:09 perseant Exp $ */ /* * Copyright (c) 1998 Konrad E. Schroder. @@ -86,7 +86,7 @@ pass0() memset(visited, 0, maxino * sizeof(ino_t)); plastino = 0; - ino = sblock.lfs_free; + ino = sblock.lfs_freehd; while (ino) { if (ino >= maxino) { printf("! Ino %d out of range (last was %d)\n", ino, @@ -115,7 +115,7 @@ pass0() ino, (long long)daddr); if (preen || reply("FIX") == 1) { if (plastino == 0) { - sblock.lfs_free = nextino; + sblock.lfs_freehd = nextino; sbdirty(); } else { ifp = lfs_ientry(plastino, &bp); @@ -145,8 +145,8 @@ pass0() pwarn("! 
Ino %d free, but not on the free list\n", ino); if (preen || reply("FIX") == 1) { - ifp->if_nextfree = sblock.lfs_free; - sblock.lfs_free = ino; + ifp->if_nextfree = sblock.lfs_freehd; + sblock.lfs_freehd = ino; sbdirty(); dirty(bp); } diff --git a/sys/conf/osrelease.sh b/sys/conf/osrelease.sh index 31bfb77fc870..fc247da50614 100644 --- a/sys/conf/osrelease.sh +++ b/sys/conf/osrelease.sh @@ -1,6 +1,6 @@ #!/bin/sh # -# $NetBSD: osrelease.sh,v 1.90 2003/02/01 06:26:30 thorpej Exp $ +# $NetBSD: osrelease.sh,v 1.91 2003/02/17 23:48:09 perseant Exp $ # # Copyright (c) 1997 The NetBSD Foundation, Inc. # All rights reserved. @@ -42,7 +42,7 @@ # sys/sys/param.h: __NetBSD_Version__ # share/tmac/doc-common: ds oS # -release=1.6N +release=1.6O case $1 in -s) diff --git a/sys/miscfs/genfs/genfs_node.h b/sys/miscfs/genfs/genfs_node.h index da1f1afca5e6..19d05c17fa16 100644 --- a/sys/miscfs/genfs/genfs_node.h +++ b/sys/miscfs/genfs/genfs_node.h @@ -1,4 +1,4 @@ -/* $NetBSD: genfs_node.h,v 1.3 2001/12/18 07:49:36 chs Exp $ */ +/* $NetBSD: genfs_node.h,v 1.4 2003/02/17 23:48:10 perseant Exp $ */ /* * Copyright (c) 2001 Chuck Silvers. 
@@ -36,18 +36,22 @@ struct vm_page; struct genfs_ops { - void (*gop_size)(struct vnode *, off_t, off_t *); + void (*gop_size)(struct vnode *, off_t, off_t *, int); int (*gop_alloc)(struct vnode *, off_t, off_t, int, struct ucred *); int (*gop_write)(struct vnode *, struct vm_page **, int, int); }; -#define GOP_SIZE(vp, size, eobp) \ - (*VTOG(vp)->g_op->gop_size)((vp), (size), (eobp)) +#define GOP_SIZE(vp, size, eobp, flags) \ + (*VTOG(vp)->g_op->gop_size)((vp), (size), (eobp), (flags)) #define GOP_ALLOC(vp, off, len, flags, cred) \ (*VTOG(vp)->g_op->gop_alloc)((vp), (off), (len), (flags), (cred)) #define GOP_WRITE(vp, pgs, npages, flags) \ (*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags)) +/* Flags to GOP_SIZE */ +#define GOP_SIZE_READ 0x1 /* Advise how many pages to read/create */ +#define GOP_SIZE_WRITE 0x2 /* Tell how many pages to write */ + struct genfs_node { struct genfs_ops *g_op; /* ops vector */ struct lock g_glock; /* getpages lock */ @@ -55,7 +59,7 @@ struct genfs_node { #define VTOG(vp) ((struct genfs_node *)(vp)->v_data) -void genfs_size(struct vnode *, off_t, off_t *); +void genfs_size(struct vnode *, off_t, off_t *, int); void genfs_node_init(struct vnode *, struct genfs_ops *); int genfs_gop_write(struct vnode *, struct vm_page **, int, int); int genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int); diff --git a/sys/miscfs/genfs/genfs_vnops.c b/sys/miscfs/genfs/genfs_vnops.c index 3b62dbe01169..baf3220869c3 100644 --- a/sys/miscfs/genfs/genfs_vnops.c +++ b/sys/miscfs/genfs/genfs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: genfs_vnops.c,v 1.71 2003/02/05 21:38:42 pk Exp $ */ +/* $NetBSD: genfs_vnops.c,v 1.72 2003/02/17 23:48:11 perseant Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -35,7 +35,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.71 2003/02/05 21:38:42 pk Exp $"); +__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.72 2003/02/17 23:48:11 perseant Exp $"); #include "opt_nfsserver.h" @@ -495,11 +495,11 @@ 
genfs_getpages(void *v) error = 0; origoffset = ap->a_offset; orignpages = *ap->a_count; - GOP_SIZE(vp, vp->v_size, &diskeof); + GOP_SIZE(vp, vp->v_size, &diskeof, GOP_SIZE_READ); if (flags & PGO_PASTEOF) { newsize = MAX(vp->v_size, origoffset + (orignpages << PAGE_SHIFT)); - GOP_SIZE(vp, newsize, &memeof); + GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_READ); } else { memeof = diskeof; } @@ -1139,8 +1139,13 @@ genfs_putpages(void *v) yield = (l->l_cpu->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) && !pagedaemon; if (pg->flags & PG_BUSY || yield) { - KASSERT(!pagedaemon); UVMHIST_LOG(ubchist, "busy %p", pg,0,0,0); + if (flags & PGO_BUSYFAIL && pg->flags & PG_BUSY) { + UVMHIST_LOG(ubchist, "busyfail %p", pg, 0,0,0); + error = EDEADLK; + break; + } + KASSERT(!pagedaemon); if (by_list) { TAILQ_INSERT_BEFORE(pg, &curmp, listq); UVMHIST_LOG(ubchist, "curmp next %p", @@ -1381,7 +1386,7 @@ genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x", vp, pgs, npages, flags); - GOP_SIZE(vp, vp->v_size, &eof); + GOP_SIZE(vp, vp->v_size, &eof, GOP_SIZE_WRITE); if (vp->v_type == VREG) { fs_bshift = vp->v_mount->mnt_fs_bshift; dev_bshift = vp->v_mount->mnt_dev_bshift; @@ -1523,7 +1528,7 @@ genfs_node_init(struct vnode *vp, struct genfs_ops *ops) } void -genfs_size(struct vnode *vp, off_t size, off_t *eobp) +genfs_size(struct vnode *vp, off_t size, off_t *eobp, int flags) { int bsize; diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c index a78de26dfb4c..1a93a45602ef 100644 --- a/sys/nfs/nfs_node.c +++ b/sys/nfs/nfs_node.c @@ -1,4 +1,4 @@ -/* $NetBSD: nfs_node.c,v 1.60 2003/02/15 18:00:25 drochner Exp $ */ +/* $NetBSD: nfs_node.c,v 1.61 2003/02/17 23:48:12 perseant Exp $ */ /* * Copyright (c) 1989, 1993 @@ -39,7 +39,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: nfs_node.c,v 1.60 2003/02/15 18:00:25 drochner Exp $"); +__KERNEL_RCSID(0, "$NetBSD: nfs_node.c,v 1.61 2003/02/17 23:48:12 perseant Exp $"); #include 
"opt_nfs.h" @@ -80,7 +80,7 @@ extern int prtactive; #define nfs_hash(x,y) hash32_buf((x), (y), HASH32_BUF_INIT) -void nfs_gop_size(struct vnode *, off_t, off_t *); +void nfs_gop_size(struct vnode *, off_t, off_t *, int); int nfs_gop_alloc(struct vnode *, off_t, off_t, int, struct ucred *); int nfs_gop_write(struct vnode *, struct vm_page **, int, int); @@ -315,8 +315,11 @@ nfs_reclaim(v) } void -nfs_gop_size(struct vnode *vp, off_t size, off_t *eobp) +nfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) { + KASSERT(flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)); + KASSERT((flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)) + != (GOP_SIZE_READ | GOP_SIZE_WRITE)); *eobp = MAX(size, vp->v_size); } diff --git a/sys/sys/param.h b/sys/sys/param.h index a659e5cc7948..808916eb4df4 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -1,4 +1,4 @@ -/* $NetBSD: param.h,v 1.159 2003/02/01 06:26:30 thorpej Exp $ */ +/* $NetBSD: param.h,v 1.160 2003/02/17 23:48:13 perseant Exp $ */ /*- * Copyright (c) 1982, 1986, 1989, 1993 @@ -67,7 +67,7 @@ * Don't forget to change conf/osrelease.sh too. 
*/ -#define __NetBSD_Version__ 106140000 /* NetBSD 1.6N */ +#define __NetBSD_Version__ 106150000 /* NetBSD 1.6O */ /* * Historical NetBSD #define diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index d3f0722fb020..4cf6f99967ec 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_extern.h,v 1.25 2003/01/24 21:55:22 fvdl Exp $ */ +/* $NetBSD: ffs_extern.h,v 1.26 2003/02/17 23:48:14 perseant Exp $ */ /*- * Copyright (c) 1991, 1993, 1994 @@ -151,7 +151,7 @@ int ffs_fsync __P((void *)); int ffs_reclaim __P((void *)); int ffs_getpages __P((void *)); int ffs_putpages __P((void *)); -void ffs_gop_size __P((struct vnode *, off_t, off_t *)); +void ffs_gop_size __P((struct vnode *, off_t, off_t *, int)); __END_DECLS diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index c29089ef5428..e0a368eca950 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_vfsops.c,v 1.106 2003/01/24 21:55:23 fvdl Exp $ */ +/* $NetBSD: ffs_vfsops.c,v 1.107 2003/02/17 23:48:14 perseant Exp $ */ /* * Copyright (c) 1989, 1991, 1993, 1994 @@ -36,7 +36,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.106 2003/01/24 21:55:23 fvdl Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.107 2003/02/17 23:48:14 perseant Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" @@ -117,7 +117,7 @@ struct vfsops ffs_vfsops = { struct genfs_ops ffs_genfsops = { ffs_gop_size, - ffs_gop_alloc, + ufs_gop_alloc, genfs_gop_write, }; diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 74b4a5d7995a..e845ed874cdd 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_vnops.c,v 1.54 2003/02/05 21:38:44 pk Exp $ */ +/* $NetBSD: ffs_vnops.c,v 1.55 2003/02/17 23:48:15 perseant Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -36,7 +36,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.54 2003/02/05 21:38:44 
pk Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.55 2003/02/17 23:48:15 perseant Exp $"); #include #include @@ -567,12 +567,16 @@ ffs_putpages(void *v) */ void -ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp) +ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) { struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; daddr_t olbn, nlbn; + KASSERT(flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)); + KASSERT((flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)) + != (GOP_SIZE_READ | GOP_SIZE_WRITE)); + olbn = lblkno(fs, ip->i_ffs_size); nlbn = lblkno(fs, size); if (nlbn < NDADDR && olbn <= nlbn) { diff --git a/sys/ufs/lfs/TODO b/sys/ufs/lfs/TODO index c60e2b69c4a7..b968cd5834f9 100644 --- a/sys/ufs/lfs/TODO +++ b/sys/ufs/lfs/TODO @@ -1,4 +1,19 @@ -# $NetBSD: TODO,v 1.5 2001/07/13 20:30:22 perseant Exp $ +# $NetBSD: TODO,v 1.6 2003/02/17 23:48:16 perseant Exp $ + +- Lock audit. Need to check locking for multiprocessor case in particular. + +- Get rid of the syscalls: make them into ioctl calls instead. This would + allow LFS to be loaded as a module. We would then ideally have an + in-kernel cleaner that runs if no userland cleaner has asserted itself. + +- Get rid of lfs_segclean(); the kernel should clean a dirty segment IFF it + has passed two checkpoints containing zero live bytes. + +- Now that our cache is basically all of physical memory, we need to make + sure that segwrite is not starving other important things. Need a way + to prioritize which blocks are most important to write, and write only + those before giving up the seglock to do the rest. How does this change + our notion of what a checkpoint is? - Investigate alternate inode locking strategy: Inode locks are useful for locking against simultaneous changes to inode size (balloc, @@ -11,12 +26,6 @@ - Fully working fsck_lfs. (Really, need a general-purpose external partial-segment writer.) 
-- Inode blocks are currently the same size as the fs block size; but all - the ones I've seen are mostly empty, and this will be especially true - if atime information is kept in the ifile instead of the inode. Could - we shrink the inode block size to DEV_BSIZE? Or parametrize it at fs - creation time? - - Get rid of DEV_BSIZE, pay attention to the media block size at mount time. - More fs ops need to call lfs_imtime. Which ones? (Blackwell et al., 1995) diff --git a/sys/ufs/lfs/lfs.h b/sys/ufs/lfs/lfs.h index 989773499039..94fe31443506 100644 --- a/sys/ufs/lfs/lfs.h +++ b/sys/ufs/lfs/lfs.h @@ -1,7 +1,7 @@ -/* $NetBSD: lfs.h,v 1.45 2003/01/29 13:14:33 yamt Exp $ */ +/* $NetBSD: lfs.h,v 1.46 2003/02/17 23:48:16 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -95,11 +95,44 @@ #define BW_CLEAN 1 #define MIN_FREE_SEGS 2 #define LFS_MAX_ACTIVE 10 -#define LFS_MAXDIROP (desiredvnodes >> 2) #ifndef LFS_ATIME_IFILE # define LFS_ATIME_IFILE 0 #endif +/* Local definition for LFS's usage of PG_PAGER1 */ +#define PG_DELWRI PG_PAGER1 + +/* Types for lfs_newbuf and lfs_malloc */ +#define LFS_NB_UNKNOWN -1 +#define LFS_NB_SUMMARY 0 +#define LFS_NB_SBLOCK 1 +#define LFS_NB_IBLOCK 2 +#define LFS_NB_CLUSTER 3 +#define LFS_NB_CLEAN 4 +#define LFS_NB_COUNT 5 /* always last */ + +/* Number of reserved memory blocks of each type */ +#define LFS_N_SUMMARIES 2 +#define LFS_N_SBLOCKS 1 /* Always 1, to throttle superblock writes */ +#define LFS_N_IBLOCKS 16 /* In theory ssize/bsize; in practice around 2 */ +#define LFS_N_CLUSTERS 16 /* In theory ssize/MAXPHYS */ +#define LFS_N_CLEAN 0 + +/* Total count of "large" (non-pool) types */ +#define LFS_N_TOTAL (LFS_N_SUMMARIES + LFS_N_SBLOCKS + LFS_N_IBLOCKS + LFS_N_CLUSTERS + LFS_N_CLEAN) + +/* Counts for pool types */ +#define LFS_N_CL 
LFS_N_CLUSTERS +#define LFS_N_BPP 2 +#define LFS_N_SEG 2 + +/* Structure to keep reserved blocks */ +typedef struct lfs_res_blk { + void *p; + LIST_ENTRY(lfs_res_blk) res; + char inuse; +} res_t; + /* * #define WRITE_THRESHHOLD ((nbuf >> 1) - 10) * #define WAIT_THRESHHOLD (nbuf - (nbuf >> 2) - 10) @@ -109,8 +142,17 @@ /* These are new ... is LFS taking up too much memory in its buffers? */ #define LFS_MAX_BYTES (((bufpages >> 2) - 10) * NBPG) #define LFS_WAIT_BYTES (((bufpages >> 1) - (bufpages >> 3) - 10) * NBPG) +#define LFS_MAX_DIROP ((desiredvnodes >> 2) + (desiredvnodes >> 3)) #define LFS_BUFWAIT 2 +#define LFS_MAX_PAGES \ + (((uvmexp.active + uvmexp.inactive + uvmexp.free) * uvmexp.filemin) >> 8) +#define LFS_WAIT_PAGES \ + (((uvmexp.active + uvmexp.inactive + uvmexp.free) * uvmexp.filemax) >> 8) + +#define LFS_IS_MALLOC_BUF(bp) (((bp)->b_flags & B_CALL) && \ + ((bp)->b_iodone == lfs_callback || (bp)->b_iodone == lfs_fakebuf_iodone)) + #define LFS_LOCK_BUF(bp) do { \ if (((bp)->b_flags & (B_LOCKED | B_CALL)) == 0) { \ ++locked_queue_count; \ @@ -237,7 +279,21 @@ extern struct lfs_log_entry lfs_log[LFS_LOGLENGTH]; (ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); \ } while (0) -#define WRITEINPROG(vp) (vp->v_dirtyblkhd.lh_first && !(VTOI(vp)->i_flag & \ +/* + * How to find out whether a vnode had dirty buffers or pages, + * to know whether it needs to retain IN_MODIFIED after a write. + */ +#ifdef LFS_UBC +int lfs_checkifempty(struct vnode *); +# define VPISEMPTY(vp) lfs_checkifempty(vp) +#else +# define VPISEMPTY(vp) ((vp)->v_dirtyblkhd.lh_first == NULL) +#endif +/* + * WRITEINPROG does not use VPISEMPTY because any dirty pages will + * have been given buffer headers, if they are "in progress". 
+ */ +#define WRITEINPROG(vp) ((vp)->v_dirtyblkhd.lh_first && !(VTOI(vp)->i_flag & \ (IN_MODIFIED | IN_ACCESSED | IN_CLEANING))) /* Here begins the berkeley code */ @@ -257,6 +313,7 @@ struct segusage { #define SEGUSE_DIRTY 0x02 /* segment has data in it */ #define SEGUSE_SUPERBLOCK 0x04 /* segment contains a superblock */ #define SEGUSE_ERROR 0x08 /* cleaner: do not clean segment */ +#define SEGUSE_EMPTY 0x10 /* segment is empty */ u_int32_t su_flags; /* 12: segment flags */ u_int64_t su_lastmod; /* 16: last modified timestamp */ }; @@ -304,7 +361,7 @@ struct dlfs { u_int32_t dlfs_frag; /* 28: number of frags in a block in fs */ /* Checkpoint region. */ - u_int32_t dlfs_free; /* 32: start of the free list */ + u_int32_t dlfs_freehd; /* 32: start of the free list */ u_int32_t dlfs_bfree; /* 36: number of free disk blocks */ u_int32_t dlfs_nfiles; /* 40: number of allocated inodes */ int32_t dlfs_avail; /* 44: blocks available for writing */ @@ -371,9 +428,6 @@ struct dlfs { u_int32_t dlfs_cksum; /* 508: checksum for superblock checking */ }; -/* Maximum number of io's we can have pending at once */ -#define LFS_THROTTLE 32 /* XXX should be better paramtrized - ? */ - /* In-memory super block. 
*/ struct lfs { struct dlfs lfs_dlfs; /* on-disk parameters */ @@ -385,7 +439,7 @@ struct lfs { #define lfs_bsize lfs_dlfs.dlfs_bsize #define lfs_fsize lfs_dlfs.dlfs_fsize #define lfs_frag lfs_dlfs.dlfs_frag -#define lfs_free lfs_dlfs.dlfs_free +#define lfs_freehd lfs_dlfs.dlfs_freehd #define lfs_bfree lfs_dlfs.dlfs_bfree #define lfs_nfiles lfs_dlfs.dlfs_nfiles #define lfs_avail lfs_dlfs.dlfs_avail @@ -455,20 +509,26 @@ struct lfs { #define LFS_WARNED 0x04 int8_t lfs_flags; /* currently unused flag */ u_int16_t lfs_activesb; /* toggle between superblocks */ -#ifdef LFS_TRACK_IOS - daddr_t lfs_pending[LFS_THROTTLE]; /* daddrs of pending writes */ -#endif /* LFS_TRACK_IOS */ daddr_t lfs_sbactive; /* disk address of in-progress sb write */ struct vnode *lfs_flushvp; /* vnode being flushed */ struct vnode *lfs_unlockvp; /* being inactivated in lfs_segunlock */ u_int32_t lfs_diropwait; /* # procs waiting on dirop flush */ size_t lfs_devbsize; /* Device block size */ size_t lfs_devbshift; /* Device block shift */ - struct lock lfs_freelock; struct lock lfs_fraglock; pid_t lfs_rfpid; /* Process ID of roll-forward agent */ int lfs_nadirop; /* number of active dirop nodes */ long lfs_ravail; /* blocks pre-reserved for writing */ + res_t *lfs_resblk; /* Reserved memory for pageout */ + TAILQ_HEAD(, inode) lfs_dchainhd; /* dirop vnodes */ + TAILQ_HEAD(, inode) lfs_pchainhd; /* paging vnodes */ +#define LFS_RESHASH_WIDTH 17 + LIST_HEAD(, lfs_res_blk) lfs_reshash[LFS_RESHASH_WIDTH]; + int lfs_pdflush; /* pagedaemon wants us to flush */ + u_int32_t **lfs_suflags; /* Segment use flags */ + struct pool lfs_clpool; /* Pool for struct lfs_cluster */ + struct pool lfs_bpppool; /* Pool for bpp */ + struct pool lfs_segpool; /* Pool for struct segment */ }; /* @@ -659,14 +719,14 @@ struct segsum { #define LFS_GET_HEADFREE(FS, CIP, BP, FREEP) do { \ if ((FS)->lfs_version > 1) { \ LFS_CLEANERINFO((CIP), (FS), (BP)); \ - (FS)->lfs_free = (CIP)->free_head; \ + (FS)->lfs_freehd = 
(CIP)->free_head; \ brelse(BP); \ } \ - *(FREEP) = (FS)->lfs_free; \ + *(FREEP) = (FS)->lfs_freehd; \ } while (0) #define LFS_PUT_HEADFREE(FS, CIP, BP, VAL) do { \ - (FS)->lfs_free = (VAL); \ + (FS)->lfs_freehd = (VAL); \ if ((FS)->lfs_version > 1) { \ LFS_CLEANERINFO((CIP), (FS), (BP)); \ (CIP)->free_head = (VAL); \ @@ -721,6 +781,15 @@ struct segsum { (SP) = (SEGUSE *)(BP)->b_data + ((IN) % (F)->lfs_sepb); \ } while(0) +#define LFS_WRITESEGENTRY(SP, F, IN, BP) do { \ + if ((SP)->su_nbytes == 0) \ + (SP)->su_flags |= SEGUSE_EMPTY; \ + else \ + (SP)->su_flags &= ~SEGUSE_EMPTY; \ + (F)->lfs_suflags[(F)->lfs_activesb][(IN)] = (SP)->su_flags; \ + LFS_BWRITE_LOG(BP); \ +} while(0) + /* Determine if a buffer belongs to the ifile */ #define IS_IFILE(bp) (VTOI(bp->b_vp)->i_number == LFS_IFILE_INUM) @@ -773,15 +842,16 @@ struct segment { #define SEGM_CLEAN 0x02 /* cleaner call; don't sort */ #define SEGM_SYNC 0x04 /* wait for segment */ #define SEGM_PROT 0x08 /* don't inactivate at segunlock */ +#define SEGM_PAGEDAEMON 0x10 /* pagedaemon called us */ u_int16_t seg_flags; /* run-time flags for this segment */ u_int32_t seg_iocount; /* number of ios pending */ int ndupino; /* number of duplicate inodes */ }; struct lfs_cluster { + size_t bufsize; /* Size of kept data */ struct buf **bpp; /* Array of kept buffers */ int bufcount; /* Number of kept buffers */ - size_t bufsize; /* Size of kept data */ #define LFS_CL_MALLOC 0x00000001 #define LFS_CL_SHIFT 0x00000002 #define LFS_CL_SYNC 0x00000004 @@ -789,9 +859,25 @@ struct lfs_cluster { struct lfs *fs; /* LFS that this belongs to */ struct segment *seg; /* Segment structure, for LFS_CL_SYNC */ void *saveaddr; /* Original contents of saveaddr */ - char *olddata; /* Original b_data, if LFS_CL_MALLOC */ + char *olddata; /* Original b_data, if LFS_CL_MALLOC */ }; +/* + * LFS inode extensions; moved from so that file didn't + * have to change every time LFS changed. 
+ */ +struct lfs_inode_ext { + off_t lfs_osize; /* size of file on disk */ + u_int32_t lfs_effnblocks; /* number of blocks when i/o completes */ + size_t lfs_fragsize[NDADDR]; /* size of on-disk direct blocks */ + TAILQ_ENTRY(inode) lfs_dchain; /* Dirop chain. */ + TAILQ_ENTRY(inode) lfs_pchain; /* Paging chain. */ +}; +#define i_lfs_osize inode_ext.lfs->lfs_osize +#define i_lfs_effnblks inode_ext.lfs->lfs_effnblocks +#define i_lfs_fragsize inode_ext.lfs->lfs_fragsize +#define i_lfs_dchain inode_ext.lfs->lfs_dchain + /* * Macros for determining free space on the disk, with the variable metadata * of segment summaries and inode blocks taken into account. diff --git a/sys/ufs/lfs/lfs_alloc.c b/sys/ufs/lfs/lfs_alloc.c index 31a4a538c6d4..bfc30918afb2 100644 --- a/sys/ufs/lfs/lfs_alloc.c +++ b/sys/ufs/lfs/lfs_alloc.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $ */ +/* $NetBSD: lfs_alloc.c,v 1.63 2003/02/17 23:48:16 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.63 2003/02/17 23:48:16 perseant Exp $"); #if defined(_KERNEL_OPT) #include "opt_quota.h" @@ -85,7 +85,6 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $"); #include #include #include -#include #include #include @@ -99,6 +98,8 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $"); extern int lfs_dirvcount; extern struct lock ufs_hashlock; +extern struct simplelock lfs_subsys_lock; +extern int lfs_subsys_pages; static int extend_ifile(struct lfs *, struct ucred *); static int lfs_ialloc(struct lfs *, struct vnode *, ino_t, int, struct vnode **); @@ -207,6 +208,7 @@ lfs_rf_valloc(struct lfs *fs, ino_t ino, int version, struct proc *p, (void)lfs_vunref(vp); --lfs_dirvcount; vp->v_flag &= ~VDIROP; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); --fs->lfs_nadirop; ip->i_flag &= ~IN_ADIROP; } @@ -245,7 +247,7 @@ extend_ifile(struct lfs *fs, struct ucred *cred) LFS_GET_HEADFREE(fs, cip, cbp, &oldlast); LFS_PUT_HEADFREE(fs, cip, cbp, i); #ifdef DIAGNOSTIC - if (fs->lfs_free == LFS_UNUSED_INUM) + if (fs->lfs_freehd == LFS_UNUSED_INUM) panic("inode 0 allocated [2]"); #endif /* DIAGNOSTIC */ max = i + fs->lfs_ifpb; @@ -300,21 +302,7 @@ lfs_valloc(void *v) return EROFS; *ap->a_vpp = NULL; -#ifdef LFS_AGGRESSIVE_SEGLOCK lfs_seglock(fs, SEGM_PROT); -#else - if (fs->lfs_version == 1) { - /* - * Use lfs_seglock here, instead of fs->lfs_freelock, to - * ensure that the free list is not changed in between - * the time that the ifile blocks are written to disk - * and the time that the superblock is written to disk. - */ - lfs_seglock(fs, SEGM_PROT); - } else { - lockmgr(&fs->lfs_freelock, LK_EXCLUSIVE, 0); - } -#endif /* Get the head of the freelist. 
*/ LFS_GET_HEADFREE(fs, cip, cbp, &new_ino); @@ -345,33 +333,20 @@ lfs_valloc(void *v) brelse(bp); /* Extend IFILE so that the next lfs_valloc will succeed. */ - if (fs->lfs_free == LFS_UNUSED_INUM) { + if (fs->lfs_freehd == LFS_UNUSED_INUM) { if ((error = extend_ifile(fs, ap->a_cred)) != 0) { LFS_PUT_HEADFREE(fs, cip, cbp, new_ino); -#ifdef LFS_AGGRESSIVE_SEGLOCK lfs_segunlock(fs); -#else - if (fs->lfs_version == 1) - lfs_segunlock(fs); - else - lockmgr(&fs->lfs_freelock, LK_RELEASE, 0); -#endif return error; } } #ifdef DIAGNOSTIC - if (fs->lfs_free == LFS_UNUSED_INUM) + if (fs->lfs_freehd == LFS_UNUSED_INUM) panic("inode 0 allocated [3]"); #endif /* DIAGNOSTIC */ -#ifdef LFS_AGGRESSIVE_SEGLOCK lfs_segunlock(fs); -#else - if (fs->lfs_version == 1) - lfs_segunlock(fs); - else - lockmgr(&fs->lfs_freelock, LK_RELEASE, 0); -#endif + return lfs_ialloc(fs, ap->a_pvp, new_ino, new_gen, ap->a_vpp); } @@ -417,17 +392,16 @@ lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen, uvm_vnp_setsize(vp, 0); *vpp = vp; -#if 1 if (!(vp->v_flag & VDIROP)) { (void)lfs_vref(vp); ++lfs_dirvcount; + TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain); } vp->v_flag |= VDIROP; if (!(ip->i_flag & IN_ADIROP)) ++fs->lfs_nadirop; ip->i_flag |= IN_ADIROP; -#endif genfs_node_init(vp, &lfs_genfsops); VREF(ip->i_devvp); /* Set superblock modified bit and increment file count. */ @@ -439,17 +413,13 @@ lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen, /* * Put the new inum back on the free list. */ -#ifdef LFS_AGGRESSIVE_SEGLOCK lfs_seglock(fs, SEGM_PROT); -#endif LFS_IENTRY(ifp, fs, new_ino, bp); ifp->if_daddr = LFS_UNUSED_DADDR; LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree)); LFS_PUT_HEADFREE(fs, cip, cbp, new_ino); (void) LFS_BWRITE_LOG(bp); /* Ifile */ -#ifdef LFS_AGGRESSIVE_SEGLOCK lfs_segunlock(fs); -#endif *vpp = NULLVP; return (error); @@ -470,6 +440,7 @@ lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp) /* Initialize the inode. 
*/ ip = pool_get(&lfs_inode_pool, PR_WAITOK); + ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK); vp->v_data = ip; ip->i_vnode = vp; ip->i_devvp = ump->um_devvp; @@ -487,8 +458,6 @@ lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp) ip->i_ffs_blocks = 0; ip->i_lfs_effnblks = 0; ip->i_flag = 0; - /* Why was IN_MODIFIED ever set here? */ - /* LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED); */ #ifdef DEBUG_LFS_VNLOCK if (ino == LFS_IFILE_INUM) @@ -531,18 +500,12 @@ lfs_vfree(void *v) tsleep(vp, (PRIBIO+1), "lfs_vfree", 0); splx(s); -#ifdef LFS_AGGRESSIVE_SEGLOCK - lfs_seglock(fs, SEGM_PROT); /* XXX */; -#else - if (fs->lfs_version == 1) - lfs_seglock(fs, SEGM_PROT); - else - lockmgr(&fs->lfs_freelock, LK_EXCLUSIVE, 0); -#endif + lfs_seglock(fs, SEGM_PROT); if (vp->v_flag & VDIROP) { --lfs_dirvcount; vp->v_flag &= ~VDIROP; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); wakeup(&lfs_dirvcount); lfs_vunref(vp); } @@ -597,20 +560,14 @@ lfs_vfree(void *v) } #endif sup->su_nbytes -= DINODE_SIZE; - (void) LFS_BWRITE_LOG(bp); /* Ifile */ + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); /* Ifile */ } /* Set superblock modified bit and decrement file count. */ fs->lfs_fmod = 1; --fs->lfs_nfiles; -#ifdef LFS_AGGRESSIVE_SEGLOCK lfs_segunlock(fs); -#else - if (fs->lfs_version == 1) - lfs_segunlock(fs); - else - lockmgr(&fs->lfs_freelock, LK_RELEASE, 0); -#endif + return (0); } diff --git a/sys/ufs/lfs/lfs_balloc.c b/sys/ufs/lfs/lfs_balloc.c index b0054b2d5f6c..ed6d79343c96 100644 --- a/sys/ufs/lfs/lfs_balloc.c +++ b/sys/ufs/lfs/lfs_balloc.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_balloc.c,v 1.35 2003/01/24 21:55:26 fvdl Exp $ */ +/* $NetBSD: lfs_balloc.c,v 1.36 2003/02/17 23:48:16 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.35 2003/01/24 21:55:26 fvdl Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.36 2003/02/17 23:48:16 perseant Exp $"); #if defined(_KERNEL_OPT) #include "opt_quota.h" @@ -96,6 +96,10 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.35 2003/01/24 21:55:26 fvdl Exp $") #include #include +#include + +extern int lfs_subsys_pages; + int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, struct ucred *); /* @@ -127,7 +131,7 @@ lfs_balloc(void *v) int offset; u_long iosize; daddr_t daddr, idaddr; - struct buf *ibp, *bp; + struct buf *ibp, *bp, **bpp; struct inode *ip; struct lfs *fs; struct indir indirs[NIADDR+2], *idp; @@ -141,8 +145,9 @@ lfs_balloc(void *v) offset = blkoff(fs, ap->a_startoffset); iosize = ap->a_size; lbn = lblkno(fs, ap->a_startoffset); - (void)lfs_check(vp, lbn, 0); - + /* (void)lfs_check(vp, lbn, 0); */ + bpp = ap->a_bpp; + /* * Three cases: it's a block beyond the end of file, it's a block in * the file that may or may not have been assigned a disk address or @@ -159,7 +164,8 @@ lfs_balloc(void *v) * to rewrite it. */ - *ap->a_bpp = NULL; + if (bpp) + *bpp = NULL; /* Check for block beyond end of file and fragment extension needed. */ lastblock = lblkno(fs, ip->i_ffs_size); @@ -167,13 +173,15 @@ lfs_balloc(void *v) osize = blksize(fs, ip, lastblock); if (osize < fs->lfs_bsize && osize > 0) { if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize, - lastblock, &bp, + lastblock, + (bpp ? 
&bp : NULL), ap->a_cred))) return (error); ip->i_ffs_size = (lastblock + 1) * fs->lfs_bsize; uvm_vnp_setsize(vp, ip->i_ffs_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; - (void) VOP_BWRITE(bp); + if (bpp) + (void) VOP_BWRITE(bp); } } @@ -192,25 +200,30 @@ lfs_balloc(void *v) /* Brand new block or fragment */ frags = numfrags(fs, nsize); bb = fragstofsb(fs, frags); - *ap->a_bpp = bp = getblk(vp, lbn, nsize, 0, 0); + if (bpp) { + *ap->a_bpp = bp = getblk(vp, lbn, nsize, 0, 0); + bp->b_blkno = UNWRITTEN; + } if (ap->a_flags & B_CLRBUF) clrbuf(bp); ip->i_lfs_effnblks += bb; ip->i_lfs->lfs_bfree -= bb; - ip->i_ffs_db[lbn] = bp->b_blkno = UNWRITTEN; + ip->i_ffs_db[lbn] = UNWRITTEN; } else { if (nsize <= osize) { /* No need to extend */ - if ((error = bread(vp, lbn, osize, NOCRED, &bp))) + if (bpp && (error = bread(vp, lbn, osize, NOCRED, &bp))) return error; } else { /* Extend existing block */ if ((error = - lfs_fragextend(vp, osize, nsize, lbn, &bp, + lfs_fragextend(vp, osize, nsize, lbn, + (bpp ? &bp : NULL), ap->a_cred))) return error; } - *ap->a_bpp = bp; + if (bpp) + *bpp = bp; } return 0; } @@ -279,10 +292,11 @@ lfs_balloc(void *v) /* - * Get the existing block from the cache. + * Get the existing block from the cache, if requested. */ frags = fsbtofrags(fs, bb); - *ap->a_bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0); + if (bpp) + *bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0); /* * The block we are writing may be a brand new block @@ -293,11 +307,13 @@ lfs_balloc(void *v) * disk address UNWRITTEN. 
*/ if (daddr == UNASSIGNED) { - if (ap->a_flags & B_CLRBUF) - clrbuf(bp); + if (bpp) { + if (ap->a_flags & B_CLRBUF) + clrbuf(bp); - /* Note the new address */ - bp->b_blkno = UNWRITTEN; + /* Note the new address */ + bp->b_blkno = UNWRITTEN; + } switch (num) { case 0: @@ -316,7 +332,7 @@ lfs_balloc(void *v) ((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN; VOP_BWRITE(ibp); } - } else if (!(bp->b_flags & (B_DONE|B_DELWRI))) { + } else if (bpp && !(bp->b_flags & (B_DONE|B_DELWRI))) { /* * Not a brand new block, also not in the cache; * read it in from disk. @@ -356,26 +372,35 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf * error = 0; /* - * Get the seglock so we don't enlarge blocks or change the segment - * accounting information while a segment is being written. + * Get the seglock so we don't enlarge blocks while a segment + * is being written. If we're called with bpp==NULL, though, + * we are only pretending to change a buffer, so we don't have to + * lock. */ top: -#ifdef LFS_MALLOC_SEGLOCK - lfs_seglock(fs, SEGM_PROT); -#else - lockmgr(&fs->lfs_fraglock, LK_SHARED, 0); -#endif + if (bpp) { + lockmgr(&fs->lfs_fraglock, LK_SHARED, 0); + } + if (!ISSPACE(fs, bb, cred)) { error = ENOSPC; goto out; } - if ((error = bread(vp, lbn, osize, NOCRED, bpp))) { + + /* + * If we are not asked to actually return the block, all we need + * to do is allocate space for it. UBC will handle dirtying the + * appropriate things and making sure it all goes to disk. + * Don't bother to read in that case. + */ + if (bpp && (error = bread(vp, lbn, osize, NOCRED, bpp))) { brelse(*bpp); goto out; } #ifdef QUOTA if ((error = chkdq(ip, bb, cred, 0))) { - brelse(*bpp); + if (bpp) + brelse(*bpp); goto out; } #endif @@ -386,17 +411,14 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf * * release both and start over after waiting. 
*/ - if ((*bpp)->b_flags & B_DELWRI) { + if (bpp && ((*bpp)->b_flags & B_DELWRI)) { if (!lfs_fits(fs, bb)) { - brelse(*bpp); + if (bpp) + brelse(*bpp); #ifdef QUOTA chkdq(ip, -bb, cred, 0); #endif -#ifdef LFS_FRAGSIZE_SEGLOCK - lfs_segunlock(fs); -#else lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); -#endif lfs_availwait(fs, bb); goto top; } @@ -407,24 +429,24 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf * ip->i_lfs_effnblks += bb; ip->i_flag |= IN_CHANGE | IN_UPDATE; - LFS_DEBUG_COUNTLOCKED("frag1"); + if (bpp) { + LFS_DEBUG_COUNTLOCKED("frag1"); - obufsize = (*bpp)->b_bufsize; - allocbuf(*bpp, nsize); + obufsize = (*bpp)->b_bufsize; + allocbuf(*bpp, nsize); - /* Adjust locked-list accounting */ - if (((*bpp)->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED) - locked_queue_bytes += (*bpp)->b_bufsize - obufsize; + /* Adjust locked-list accounting */ + if (((*bpp)->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED) + locked_queue_bytes += (*bpp)->b_bufsize - obufsize; - LFS_DEBUG_COUNTLOCKED("frag2"); + LFS_DEBUG_COUNTLOCKED("frag2"); - bzero((char *)((*bpp)->b_data) + osize, (u_int)(nsize - osize)); + bzero((char *)((*bpp)->b_data) + osize, (u_int)(nsize - osize)); + } out: -#ifdef LFS_FRAGSIZE_SEGLOCK - lfs_segunlock(fs); -#else - lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); -#endif + if (bpp) { + lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); + } return (error); } diff --git a/sys/ufs/lfs/lfs_bio.c b/sys/ufs/lfs/lfs_bio.c index de3599f8f145..dd05a33d8f45 100644 --- a/sys/ufs/lfs/lfs_bio.c +++ b/sys/ufs/lfs/lfs_bio.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $ */ +/* $NetBSD: lfs_bio.c,v 1.58 2003/02/17 23:48:17 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.58 2003/02/17 23:48:17 perseant Exp $"); #include #include @@ -86,10 +86,11 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $"); #include #include -#include #include #include +#include + /* Macros to clear/set/test flags. */ # define SET(t, f) (t) |= (f) # define CLR(t, f) (t) &= ~(f) @@ -102,11 +103,14 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $"); * No write cost accounting is done. * This is almost certainly wrong for synchronous operations and NFS. */ -int locked_queue_count = 0; /* XXX Count of locked-down buffers. */ -long locked_queue_bytes = 0L; /* XXX Total size of locked buffers. */ +int locked_queue_count = 0; /* Count of locked-down buffers. */ +long locked_queue_bytes = 0L; /* Total size of locked buffers. */ +int lfs_subsys_pages = 0L; /* Total number LFS-written pages */ int lfs_writing = 0; /* Set if already kicked off a writer because of buffer space */ +struct simplelock lfs_subsys_lock; /* Lock on subsys_pages */ extern int lfs_dostats; +extern int lfs_do_flush; /* * reserved number/bytes of locked buffers @@ -402,7 +406,7 @@ lfs_bwrite_ext(struct buf *bp, int flags) int fsb, s; KASSERT(bp->b_flags & B_BUSY); - KASSERT(flags & BW_CLEAN || !(bp->b_flags & B_CALL)); + KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp)); /* * Don't write *any* blocks if we're mounted read-only. 
@@ -411,7 +415,7 @@ lfs_bwrite_ext(struct buf *bp, int flags) if (VTOI(bp->b_vp)->i_lfs->lfs_ronly) { bp->b_flags &= ~(B_DELWRI | B_READ | B_ERROR); LFS_UNLOCK_BUF(bp); - if (bp->b_flags & B_CALL) + if (LFS_IS_MALLOC_BUF(bp)) bp->b_flags &= ~B_BUSY; else brelse(bp); @@ -465,28 +469,26 @@ lfs_bwrite_ext(struct buf *bp, int flags) void lfs_flush_fs(struct lfs *fs, int flags) { - if (fs->lfs_ronly == 0 && fs->lfs_dirops == 0) - { - /* disallow dirops during flush */ - fs->lfs_writer++; + if (fs->lfs_ronly) + return; - /* - * We set the queue to 0 here because we - * are about to write all the dirty - * buffers we have. If more come in - * while we're writing the segment, they - * may not get written, so we want the - * count to reflect these new writes - * after the segwrite completes. - */ - if (lfs_dostats) - ++lfs_stats.flush_invoked; - lfs_segwrite(fs->lfs_ivnode->v_mount, flags); + /* disallow dirops during flush */ + fs->lfs_writer++; - /* XXX KS - allow dirops again */ - if (--fs->lfs_writer == 0) - wakeup(&fs->lfs_dirops); + /* drain dirops */ + while (fs->lfs_dirops > 0) { + ++fs->lfs_diropwait; + tsleep(&fs->lfs_writer, PRIBIO+1, "fldirop", 0); + --fs->lfs_diropwait; } + + if (lfs_dostats) + ++lfs_stats.flush_invoked; + lfs_segwrite(fs->lfs_ivnode->v_mount, flags); + + /* allow dirops again */ + if (--fs->lfs_writer == 0) + wakeup(&fs->lfs_dirops); } /* @@ -512,6 +514,9 @@ lfs_flush(struct lfs *fs, int flags) } lfs_writing = 1; + lfs_subsys_pages = 0; /* XXXUBC need a better way to count this */ + wakeup(&lfs_subsys_pages); + simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { @@ -525,7 +530,6 @@ lfs_flush(struct lfs *fs, int flags) vfs_unbusy(mp); } simple_unlock(&mountlist_slock); - LFS_DEBUG_COUNTLOCKED("flush"); lfs_writing = 0; @@ -562,25 +566,40 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags) while (fs->lfs_dirops > 0 && (locked_queue_count + 
INOCOUNT(fs) > LFS_MAX_BUFS || locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || - lfs_dirvcount > LFS_MAXDIROP || fs->lfs_diropwait > 0)) + lfs_subsys_pages > LFS_MAX_PAGES || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0)) { ++fs->lfs_diropwait; tsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0); --fs->lfs_diropwait; } +#ifdef DEBUG_LFS_FLUSH + if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS) + printf("lqc = %d, max %d\n", locked_queue_count + INOCOUNT(fs), + LFS_MAX_BUFS); + if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES) + printf("lqb = %ld, max %d\n", locked_queue_bytes + INOBYTES(fs), + LFS_MAX_BYTES); + if (lfs_subsys_pages > LFS_MAX_PAGES) + printf("lssp = %d, max %d\n", lfs_subsys_pages, LFS_MAX_PAGES); + if (lfs_dirvcount > LFS_MAX_DIROP) + printf("ldvc = %d, max %d\n", lfs_dirvcount, LFS_MAX_DIROP); + if (fs->lfs_diropwait > 0) + printf("ldvw = %d\n", fs->lfs_diropwait); +#endif if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES || - lfs_dirvcount > LFS_MAXDIROP || fs->lfs_diropwait > 0) + lfs_subsys_pages > LFS_MAX_PAGES || + lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) { - ++fs->lfs_writer; lfs_flush(fs, flags); - if (--fs->lfs_writer == 0) - wakeup(&fs->lfs_dirops); } - while (locked_queue_count + INOCOUNT(fs) > LFS_WAIT_BUFS - || locked_queue_bytes + INOBYTES(fs) > LFS_WAIT_BYTES) + while (locked_queue_count + INOCOUNT(fs) > LFS_WAIT_BUFS || + locked_queue_bytes + INOBYTES(fs) > LFS_WAIT_BYTES || + lfs_subsys_pages > LFS_WAIT_PAGES || + lfs_dirvcount > LFS_MAX_DIROP) { if (lfs_dostats) ++lfs_stats.wait_exceeded; @@ -601,10 +620,7 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags) if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS || locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES) { - ++fs->lfs_writer; lfs_flush(fs, flags | SEGM_CKP); - if (--fs->lfs_writer == 0) - wakeup(&fs->lfs_dirops); } } return (error); @@ -613,15 +629,8 @@ lfs_check(struct vnode 
*vp, daddr_t blkno, int flags) /* * Allocate a new buffer header. */ -#ifdef MALLOCLOG -# define DOMALLOC(S, T, F) _malloc((S), (T), (F), file, line) struct buf * -lfs_newbuf_malloclog(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, char *file, int line) -#else -# define DOMALLOC(S, T, F) malloc((S), (T), (F)) -struct buf * -lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size) -#endif +lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type) { struct buf *bp; size_t nbytes; @@ -629,11 +638,13 @@ lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size) nbytes = roundup(size, fsbtob(fs, 1)); - bp = DOMALLOC(sizeof(struct buf), M_SEGMENT, M_WAITOK); - bzero(bp, sizeof(struct buf)); + s = splbio(); + bp = pool_get(&bufpool, PR_WAITOK); + splx(s); + memset(bp, 0, sizeof(struct buf)); if (nbytes) { - bp->b_data = DOMALLOC(nbytes, M_SEGMENT, M_WAITOK); - bzero(bp->b_data, nbytes); + bp->b_data = lfs_malloc(fs, nbytes, type); + /* memset(bp->b_data, 0, nbytes); */ } #ifdef DIAGNOSTIC if (vp == NULL) @@ -659,27 +670,20 @@ lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size) return (bp); } -#ifdef MALLOCLOG -# define DOFREE(A, T) _free((A), (T), file, line) void -lfs_freebuf_malloclog(struct buf *bp, char *file, int line) -#else -# define DOFREE(A, T) free((A), (T)) -void -lfs_freebuf(struct buf *bp) -#endif +lfs_freebuf(struct lfs *fs, struct buf *bp) { int s; s = splbio(); if (bp->b_vp) brelvp(bp); - splx(s); if (!(bp->b_flags & B_INVAL)) { /* B_INVAL indicates a "fake" buffer */ - DOFREE(bp->b_data, M_SEGMENT); + lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN); bp->b_data = NULL; } - DOFREE(bp, M_SEGMENT); + pool_put(&bufpool, bp); + splx(s); } /* @@ -707,7 +711,7 @@ lfs_countlocked(int *count, long *bytes, char *msg) for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; bp = bp->b_freelist.tqe_next) { - if (bp->b_flags & B_CALL) /* Malloced buffer */ + if (bp->b_flags & B_CALL) continue; 
n++; size += bp->b_bufsize; diff --git a/sys/ufs/lfs/lfs_cksum.c b/sys/ufs/lfs/lfs_cksum.c index 35ba63d8fce7..a8f652ea30a8 100644 --- a/sys/ufs/lfs/lfs_cksum.c +++ b/sys/ufs/lfs/lfs_cksum.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_cksum.c,v 1.20 2002/06/16 00:13:15 perseant Exp $ */ +/* $NetBSD: lfs_cksum.c,v 1.21 2003/02/17 23:48:18 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.20 2002/06/16 00:13:15 perseant Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.21 2003/02/17 23:48:18 perseant Exp $"); #include #ifdef _KERNEL diff --git a/sys/ufs/lfs/lfs_debug.c b/sys/ufs/lfs/lfs_debug.c index 978ab4fc2393..069dc804fa61 100644 --- a/sys/ufs/lfs/lfs_debug.c +++ b/sys/ufs/lfs/lfs_debug.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_debug.c,v 1.19 2003/01/29 13:14:34 yamt Exp $ */ +/* $NetBSD: lfs_debug.c,v 1.20 2003/02/17 23:48:18 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation @@ -73,7 +73,7 @@ #ifdef DEBUG #include -__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.19 2003/01/29 13:14:34 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.20 2003/02/17 23:48:18 perseant Exp $"); #include #include #include @@ -167,7 +167,7 @@ lfs_dump_super(struct lfs *lfsp) printf("Checkpoint Info\n"); printf("%s%d\t%s%x\t%s%d\n", - "free ", lfsp->lfs_free, + "freehd ", lfsp->lfs_freehd, "idaddr ", lfsp->lfs_idaddr, "ifile ", lfsp->lfs_ifile); printf("%s%x\t%s%d\t%s%x\t%s%x\t%s%x\t%s%x\n", diff --git a/sys/ufs/lfs/lfs_extern.h b/sys/ufs/lfs/lfs_extern.h index 66e42e3afda5..31696f9a10cc 100644 --- a/sys/ufs/lfs/lfs_extern.h +++ b/sys/ufs/lfs/lfs_extern.h @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_extern.h,v 1.38 2003/02/01 18:34:14 tron Exp $ */ +/* $NetBSD: lfs_extern.h,v 1.39 2003/02/17 23:48:18 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation @@ -88,7 +88,7 @@ MALLOC_DECLARE(M_SEGMENT); #define LFS_WRITEINDIR 1 /* flush indirect blocks on non-checkpoint writes */ #define LFS_CLEAN_VNHEAD 2 /* put prev unrefed cleaned vnodes on head of free list */ #define LFS_DOSTATS 3 -#define LFS_STATS 4 +#define LFS_MAXPAGES 4 #define LFS_MAXID 5 #define LFS_NAMES { \ @@ -96,7 +96,7 @@ MALLOC_DECLARE(M_SEGMENT); { "flushindir", CTLTYPE_INT }, \ { "clean_vnhead", CTLTYPE_INT }, \ { "dostats", CTLTYPE_INT }, \ - { "stats", CTLTYPE_STRUCT }, \ + { "maxpages", CTLTYPE_INT }, \ } struct fid; @@ -117,7 +117,8 @@ struct segment; struct ucred; extern int lfs_allclean_wakeup; -extern struct pool lfs_inode_pool; /* memory pool for inodes */ +extern struct pool lfs_inode_pool; /* memory pool for inodes */ +extern struct pool lfs_inoext_pool; /* memory pool for inode extension */ __BEGIN_DECLS /* lfs_alloc.c */ @@ -130,16 +131,8 @@ int lfs_fits(struct lfs *, int); void lfs_flush_fs(struct lfs *, int); void lfs_flush(struct lfs *, int); int lfs_check(struct vnode *, daddr_t, int); -#ifdef MALLOCLOG -void lfs_freebuf_malloclog(struct buf *, char *, int); -struct buf *lfs_newbuf_malloclog(struct lfs *, struct vnode *, - daddr_t, size_t, char *, int); -#define lfs_freebuf(BP) lfs_freebuf_malloclog((BP), __FILE__, __LINE__) -#define lfs_newbuf(F, V, A, S) lfs_newbuf_malloclog((F),(V),(A),(S),__FILE__,__LINE__) -#else -void lfs_freebuf(struct buf *); -struct buf *lfs_newbuf(struct lfs *, struct vnode *, daddr_t, size_t); -#endif +void lfs_freebuf(struct lfs *, struct buf *); +struct buf *lfs_newbuf(struct lfs *, struct vnode *, daddr_t, size_t, int); void lfs_countlocked(int *, long *, char *); int lfs_reserve(struct lfs *, struct vnode *, struct vnode *, int); @@ -169,6 +162,7 @@ void lfs_writefile(struct lfs *, struct segment *, struct vnode *); int lfs_writeinode(struct lfs *, struct segment *, struct inode *); int lfs_gatherblock(struct segment *, struct 
buf *, int *); int lfs_gather(struct lfs *, struct segment *, struct vnode *, int (*match )(struct lfs *, struct buf *)); +void lfs_update_single(struct lfs *, struct segment *, daddr_t, int32_t, int, int); void lfs_updatemeta(struct segment *); int lfs_initseg(struct lfs *); void lfs_newseg(struct lfs *); @@ -187,12 +181,17 @@ void lfs_vunref(struct vnode *); void lfs_vunref_head(struct vnode *); /* lfs_subr.c */ -void lfs_seglock(struct lfs *, unsigned long); +void lfs_setup_resblks(struct lfs *); +void lfs_free_resblks(struct lfs *); +void *lfs_malloc(struct lfs *, size_t, int); +void lfs_free(struct lfs *, void *, int); +int lfs_seglock(struct lfs *, unsigned long); void lfs_segunlock(struct lfs *); /* lfs_syscalls.c */ int lfs_fastvget(struct mount *, ino_t, daddr_t, struct vnode **, struct dinode *); struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t); +int lfs_do_segclean(struct lfs *, unsigned long); /* lfs_vfsops.c */ void lfs_init(void); @@ -200,7 +199,6 @@ void lfs_reinit(void); void lfs_done(void); int lfs_mountroot(void); int lfs_mount(struct mount *, const char *, void *, struct nameidata *, struct proc *); -int lfs_mountfs(struct vnode *, struct mount *, struct proc *); int lfs_unmount(struct mount *, int, struct proc *); int lfs_statfs(struct mount *, struct statfs *, struct proc *); int lfs_sync(struct mount *, int, struct ucred *, struct proc *); @@ -213,6 +211,10 @@ int lfs_sysctl(int *, u_int, void *, size_t *, void *, size_t, struct proc *); void lfs_unmark_vnode(struct vnode *); void lfs_itimes(struct inode *, struct timespec *, struct timespec *, struct timespec *); +int lfs_gop_alloc(struct vnode *, off_t, off_t, int, struct ucred *); +void lfs_gop_size(struct vnode *, off_t, off_t *, int); +int lfs_putpages_ext(void *, int); +int lfs_gatherpages(struct vnode *); int lfs_balloc (void *); int lfs_valloc (void *); @@ -230,6 +232,7 @@ int lfs_read (void *); int lfs_remove (void *); int lfs_rmdir (void *); int lfs_link 
(void *); +int lfs_mmap (void *); int lfs_rename (void *); int lfs_getattr (void *); int lfs_setattr (void *); diff --git a/sys/ufs/lfs/lfs_inode.c b/sys/ufs/lfs/lfs_inode.c index 315eaca097bf..e0f8618aa318 100644 --- a/sys/ufs/lfs/lfs_inode.c +++ b/sys/ufs/lfs/lfs_inode.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_inode.c,v 1.63 2003/01/25 16:40:29 fvdl Exp $ */ +/* $NetBSD: lfs_inode.c,v 1.64 2003/02/17 23:48:18 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.63 2003/01/25 16:40:29 fvdl Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.64 2003/02/17 23:48:18 perseant Exp $"); #if defined(_KERNEL_OPT) #include "opt_quota.h" @@ -231,6 +231,9 @@ lfs_truncate(void *v) struct proc *a_p; } */ *ap = v; struct vnode *ovp = ap->a_vp; +#ifdef LFS_UBC + struct genfs_node *gp = VTOG(ovp); +#endif daddr_t lastblock; struct inode *oip; daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; @@ -247,6 +250,7 @@ lfs_truncate(void *v) long lastseg; size_t bc; int obufsize, odb; + int usepc, needunlock; if (length < 0) return (EINVAL); @@ -282,6 +286,10 @@ lfs_truncate(void *v) fs = oip->i_lfs; lfs_imtime(fs); osize = oip->i_ffs_size; + needunlock = usepc = 0; +#ifdef LFS_UBC + usepc = (ovp->v_type == VREG && osize > length && ovp != fs->lfs_ivnode); +#endif /* * Lengthen the size of the file. We must ensure that the @@ -313,18 +321,7 @@ lfs_truncate(void *v) if ((error = lfs_reserve(fs, ovp, NULL, btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift))) != 0) return (error); - /* - * Make sure no writes to this inode can happen while we're - * truncating. Otherwise, blocks which are accounted for on the - * inode *and* which have been created for cleaning can coexist, - * and cause an overcounting. 
- */ -#ifdef LFS_FRAGSIZE_SEGLOCK - lfs_seglock(fs, SEGM_PROT); -#else - lockmgr(&fs->lfs_fraglock, LK_SHARED, 0); -#endif - + /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the @@ -338,7 +335,12 @@ lfs_truncate(void *v) bc = 0; if (offset == 0) { oip->i_ffs_size = length; - } else { + } else +#ifdef LFS_UBC + if (!usepc) +#endif + { + lockmgr(&fs->lfs_fraglock, LK_SHARED, 0); lbn = lblkno(fs, length); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) @@ -347,11 +349,7 @@ lfs_truncate(void *v) if (error) { lfs_reserve(fs, ovp, NULL, -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); -#ifdef LFS_FRAGSIZE_SEGLOCK - lfs_segunlock(fs); -#else lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); -#endif return (error); } obufsize = bp->b_bufsize; @@ -367,7 +365,45 @@ lfs_truncate(void *v) if (bp->b_flags & B_DELWRI) fs->lfs_avail += odb - btofsb(fs, size); (void) VOP_BWRITE(bp); + lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); } +#ifdef LFS_UBC + /* + * When truncating a regular file down to a non-block-aligned size, + * we must zero the part of last block which is past the new EOF. + * We must synchronously flush the zeroed pages to disk + * since the new pages will be invalidated as soon as we + * inform the VM system of the new, smaller size. + * We must do this before acquiring the GLOCK, since fetching + * the pages will acquire the GLOCK internally. + * So there is a window where another thread could see a whole + * zeroed page past EOF, but that's life. + */ + + else { /* vp->v_type == VREG && length < osize && offset != 0 */ + voff_t eoz; + + aflags = ap->a_flags & IO_SYNC ? 
B_SYNC : 0; + error = ufs_balloc_range(ovp, length - 1, 1, ap->a_cred, + aflags); + if (error) { + return error; + } + size = blksize(fs, oip, lblkno(fs, length)); + eoz = MIN(lblktosize(fs, lblkno(fs, length)) + size, osize); + uvm_vnp_zerorange(ovp, length, eoz - length); + simple_lock(&ovp->v_interlock); + error = VOP_PUTPAGES(ovp, trunc_page(length), round_page(eoz), + PGO_CLEANIT | PGO_DEACTIVATE | PGO_SYNCIO); + if (error) { + return error; + } + } + + lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL); +#endif + + oip->i_ffs_size = length; uvm_vnp_setsize(ovp, length); /* * Calculate index into inode's block list of @@ -428,6 +464,10 @@ lfs_truncate(void *v) goto done; } + if (!usepc) { + lockmgr(&fs->lfs_fraglock, LK_SHARED, 0); + needunlock = 1; + } /* * All whole direct blocks or frags. */ @@ -516,10 +556,10 @@ done: #endif lfs_reserve(fs, ovp, NULL, -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift)); -#ifdef LFS_FRAGSIZE_SEGLOCK - lfs_segunlock(fs); -#else - lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); + if (needunlock) + lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); +#ifdef LFS_UBC + lockmgr(&gp->g_glock, LK_RELEASE, NULL); #endif return (allerror); } @@ -550,7 +590,6 @@ lfs_update_seguse(struct lfs *fs, long lastseg, size_t num) { SEGUSE *sup; struct buf *bp; - int error; if (lastseg < 0 || num == 0) return 0; @@ -563,8 +602,9 @@ lfs_update_seguse(struct lfs *fs, long lastseg, size_t num) sup->su_nbytes = num; } sup->su_nbytes -= num; - error = LFS_BWRITE_LOG(bp); /* Ifile */ - return error; + LFS_WRITESEGENTRY(sup, fs, lastseg, bp); + + return 0; } /* @@ -707,6 +747,8 @@ lfs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, /* * Destroy any in core blocks past the truncation length. * Inlined from vtruncbuf, so that lfs_avail could be updated. + * We take the fraglock to prevent cleaning from occurring while we are + * invalidating blocks. 
*/ static int lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, int slpflag, int slptimeo) @@ -714,10 +756,19 @@ lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, int slpflag, int slptimeo) struct buf *bp, *nbp; int s, error; struct lfs *fs; + voff_t off; + + off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift); + simple_lock(&vp->v_interlock); + error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO); + if (error) { + return error; + } fs = VTOI(vp)->i_lfs; s = splbio(); + lockmgr(&fs->lfs_fraglock, LK_SHARED, 0); restart: for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { nbp = LIST_NEXT(bp, b_vnbufs); @@ -729,6 +780,7 @@ restart: "lfs_vtruncbuf", slptimeo); if (error) { splx(s); + lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); return (error); } goto restart; @@ -753,6 +805,7 @@ restart: "lfs_vtruncbuf", slptimeo); if (error) { splx(s); + lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); return (error); } goto restart; @@ -768,6 +821,7 @@ restart: } splx(s); + lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0); return (0); } diff --git a/sys/ufs/lfs/lfs_segment.c b/sys/ufs/lfs/lfs_segment.c index 6290d7484ba4..b797f9ba5eb6 100644 --- a/sys/ufs/lfs/lfs_segment.c +++ b/sys/ufs/lfs/lfs_segment.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_segment.c,v 1.100 2003/02/05 21:38:45 pk Exp $ */ +/* $NetBSD: lfs_segment.c,v 1.101 2003/02/17 23:48:19 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. 
* * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.100 2003/02/05 21:38:45 pk Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.101 2003/02/17 23:48:19 perseant Exp $"); #define ivndebug(vp,str) printf("ino %d: %s\n",VTOI(vp)->i_number,(str)) @@ -89,7 +89,6 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.100 2003/02/05 21:38:45 pk Exp $") #include #include #include -#include #include #include @@ -110,6 +109,7 @@ MALLOC_DEFINE(M_SEGMENT, "LFS segment", "Segment for LFS"); extern int count_lock_queue(void); extern struct simplelock vnode_free_list_slock; /* XXX */ +extern int lfs_subsys_pages; static void lfs_generic_callback(struct buf *, void (*)(struct buf *)); static void lfs_super_aiodone(struct buf *); @@ -206,6 +206,10 @@ lfs_vflush(struct vnode *vp) struct segment *sp; struct buf *bp, *nbp, *tbp, *tnbp; int error, s; + int flushed; +#if 0 + int redo; +#endif ip = VTOI(vp); fs = VFSTOUFS(vp->v_mount)->um_lfs; @@ -219,28 +223,57 @@ lfs_vflush(struct vnode *vp) /* * Toss any cleaning buffers that have real counterparts - * to avoid losing new data + * to avoid losing new data. */ s = splbio(); for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = LIST_NEXT(bp, b_vnbufs); - if (bp->b_flags & B_CALL) { - for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp; - tbp = tnbp) - { - tnbp = LIST_NEXT(tbp, b_vnbufs); - if (tbp->b_vp == bp->b_vp - && tbp->b_lblkno == bp->b_lblkno - && tbp != bp) - { - fs->lfs_avail += btofsb(fs, bp->b_bcount); + if (!LFS_IS_MALLOC_BUF(bp)) + continue; +#ifdef LFS_UBC + /* + * In the UBC case, look for *pages* matching + * the range covered by cleaning blocks. 
+ */ + if (bp->b_lblkno > 0 && vp->v_type == VREG && + vp != fs->lfs_ivnode) { + struct vm_page *pg; + voff_t off; + + for (off = lblktosize(fs, bp->b_lblkno); + off < lblktosize(fs, bp->b_lblkno + 1); + off += PAGE_SIZE) { + pg = uvm_pagelookup(&vp->v_uobj, off); + if (pg && pmap_is_modified(pg)) { + fs->lfs_avail += btofsb(fs, + bp->b_bcount); wakeup(&fs->lfs_avail); - lfs_freebuf(bp); + lfs_freebuf(fs, bp); bp = NULL; - break; + goto nextbp; } } } +#endif + for (tbp = LIST_FIRST(&vp->v_dirtyblkhd); tbp; + tbp = tnbp) + { + tnbp = LIST_NEXT(tbp, b_vnbufs); + if (tbp->b_vp == bp->b_vp + && tbp->b_lblkno == bp->b_lblkno + && tbp != bp) + { + fs->lfs_avail += btofsb(fs, + bp->b_bcount); + wakeup(&fs->lfs_avail); + lfs_freebuf(fs, bp); + bp = NULL; + break; + } + } +#ifdef LFS_UBC + nextbp: +#endif } splx(s); } @@ -272,9 +305,7 @@ lfs_vflush(struct vnode *vp) } /* Copied from lfs_writeseg */ if (bp->b_flags & B_CALL) { - /* if B_CALL, it was created with newbuf */ - lfs_freebuf(bp); - bp = NULL; + biodone(bp); } else { bremfree(bp); LFS_UNLOCK_BUF(bp); @@ -305,16 +336,19 @@ lfs_vflush(struct vnode *vp) } sp = fs->lfs_sp; - if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { + flushed = 0; + if (VPISEMPTY(vp)) { lfs_writevnodes(fs, vp->v_mount, sp, VN_EMPTY); + ++flushed; } else if ((ip->i_flag & IN_CLEANING) && (fs->lfs_sp->seg_flags & SEGM_CLEAN)) { #ifdef DEBUG_LFS ivndebug(vp,"vflush/clean"); #endif lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN); + ++flushed; } else if (lfs_dostats) { - if (LIST_FIRST(&vp->v_dirtyblkhd) || (VTOI(vp)->i_flag & IN_ALLMOD)) + if (!VPISEMPTY(vp) || (VTOI(vp)->i_flag & IN_ALLMOD)) ++lfs_stats.vflush_invoked; #ifdef DEBUG_LFS ivndebug(vp,"vflush"); @@ -333,13 +367,24 @@ lfs_vflush(struct vnode *vp) } #endif +#if 1 /* XXX */ do { do { if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) lfs_writefile(fs, sp, vp); } while (lfs_writeinode(fs, sp, ip)); } while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM); - +#else + if (flushed && vp != 
fs->lfs_ivnode) + lfs_writeseg(fs, sp); + else do { + fs->lfs_flags &= ~LFS_IFDIRTY; + lfs_writefile(fs, sp, vp); + redo = lfs_writeinode(fs, sp, ip); + redo += lfs_writeseg(fs, sp); + redo += (fs->lfs_flags & LFS_IFDIRTY); + } while (redo && vp == fs->lfs_ivnode); +#endif if (lfs_dostats) { ++lfs_stats.nwrites; if (sp->seg_flags & SEGM_SYNC) @@ -418,7 +463,7 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op) continue; } - if (op == VN_EMPTY && LIST_FIRST(&vp->v_dirtyblkhd)) { + if (op == VN_EMPTY && !VPISEMPTY(vp)) { vndebug(vp,"empty"); continue; } @@ -439,17 +484,12 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op) /* * Write the inode/file if dirty and it's not the IFILE. */ - if ((ip->i_flag & IN_ALLMOD) || - (LIST_FIRST(&vp->v_dirtyblkhd) != NULL)) - { + if ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp)) { only_cleaning = ((ip->i_flag & IN_ALLMOD) == IN_CLEANING); - if (ip->i_number != LFS_IFILE_INUM - && LIST_FIRST(&vp->v_dirtyblkhd) != NULL) - { + if (ip->i_number != LFS_IFILE_INUM) lfs_writefile(fs, sp, vp); - } - if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { + if (!VPISEMPTY(vp)) { if (WRITEINPROG(vp)) { #ifdef DEBUG_LFS ivndebug(vp,"writevnodes/write2"); @@ -490,6 +530,7 @@ lfs_segwrite(struct mount *mp, int flags) int writer_set = 0; int dirty; int redo; + int loopcount; fs = VFSTOUFS(mp)->um_lfs; @@ -550,11 +591,12 @@ lfs_segwrite(struct mount *mp, int flags) if ((error = tsleep(&fs->lfs_writer, PRIBIO + 1, "lfs writer", 0))) { + printf("segwrite mysterious error\n"); /* XXX why not segunlock? 
*/ - free(sp->bpp, M_SEGMENT); + pool_put(&fs->lfs_bpppool, sp->bpp); sp->bpp = NULL; - free(sp, M_SEGMENT); - fs->lfs_sp = NULL; + pool_put(&fs->lfs_segpool, sp); + sp = fs->lfs_sp = NULL; return (error); } fs->lfs_writer++; @@ -613,31 +655,28 @@ lfs_segwrite(struct mount *mp, int flags) did_ckp = 0; if (do_ckp || fs->lfs_doifile) { + loopcount = 10; do { vp = fs->lfs_ivnode; - vget(vp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY); #ifdef DEBUG LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0); #endif fs->lfs_flags &= ~LFS_IFDIRTY; ip = VTOI(vp); + if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) lfs_writefile(fs, sp, vp); + if (ip->i_flag & IN_ALLMOD) ++did_ckp; redo = lfs_writeinode(fs, sp, ip); - - vput(vp); - /* - * if we know we'll redo, no need to writeseg here. - */ - if (!(redo && do_ckp)) { - redo += lfs_writeseg(fs, sp); - } + redo += lfs_writeseg(fs, sp); redo += (fs->lfs_flags & LFS_IFDIRTY); - } while (redo && do_ckp); + } while (redo && do_ckp && --loopcount > 0); + if (loopcount <= 0) + printf("lfs_segwrite: possibly invalid checkpoint!\n"); /* The ifile should now be all clear */ if (do_ckp && LIST_FIRST(&vp->v_dirtyblkhd)) { @@ -670,7 +709,10 @@ lfs_segwrite(struct mount *mp, int flags) * At the moment, the user's process hangs around so we can * sleep. */ - fs->lfs_doifile = 0; + if (loopcount <= 0) + fs->lfs_doifile = 1; + else + fs->lfs_doifile = 0; if (writer_set && --fs->lfs_writer == 0) wakeup(&fs->lfs_dirops); @@ -738,10 +780,29 @@ lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp) * The same is true of the Ifile since checkpoints assume * that all valid Ifile blocks are written. */ - if (IS_FLUSHING(fs,vp) || vp == fs->lfs_ivnode) + if (IS_FLUSHING(fs,vp) || vp == fs->lfs_ivnode) { lfs_gather(fs, sp, vp, lfs_match_data); - } else + /* + * Don't call VOP_PUTPAGES: if we're flushing, + * we've already done it, and the Ifile doesn't + * use the page cache. 
+ */ + } + } else { lfs_gather(fs, sp, vp, lfs_match_data); +#ifdef LFS_UBC + /* + * If we're flushing, we've already called VOP_PUTPAGES + * so don't do it again. Otherwise, we want to write + * everything we've got. + */ + if (!IS_FLUSHING(fs, vp)) { + VOP_PUTPAGES(vp, 0, 0, + PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED | + PGO_BUSYFAIL); + } +#endif + } /* * It may not be necessary to write the meta-data blocks at this point, @@ -865,6 +926,10 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip) /* * If we are cleaning, ensure that we don't write UNWRITTEN disk * addresses to disk; possibly revert the inode size. + * XXX By not writing these blocks, we are making the lfs_avail + * XXX count on disk wrong by the same amount. We should be + * XXX able to "borrow" from lfs_avail and return it after the + * XXX Ifile is written. See also in lfs_writeseg. */ if (ip->i_lfs_effnblks != ip->i_ffs_blocks) { cdp->di_size = ip->i_lfs_osize; @@ -992,7 +1057,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip) (ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED)); if (redo_ifile) fs->lfs_flags |= LFS_IFDIRTY; - error = LFS_BWRITE_LOG(bp); /* Ifile */ + LFS_WRITESEGENTRY(sup, fs, oldsn, bp); /* Ifile */ } return (redo_ifile); } @@ -1002,7 +1067,8 @@ lfs_gatherblock(struct segment *sp, struct buf *bp, int *sptr) { struct lfs *fs; int version; - + int j, blksinblk; + /* * If full, finish this segment. We may be doing I/O, so * release and reacquire the splbio(). 
@@ -1012,7 +1078,8 @@ lfs_gatherblock(struct segment *sp, struct buf *bp, int *sptr) panic ("lfs_gatherblock: Null vp in segment"); #endif fs = sp->fs; - if (sp->sum_bytes_left < sizeof(int32_t) || + blksinblk = howmany(bp->b_bcount, fs->lfs_bsize); + if (sp->sum_bytes_left < sizeof(int32_t) * blksinblk || sp->seg_bytes_left < bp->b_bcount) { if (sptr) splx(*sptr); @@ -1045,7 +1112,9 @@ lfs_gatherblock(struct segment *sp, struct buf *bp, int *sptr) bp->b_flags &= ~B_DONE; *sp->cbpp++ = bp; - sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno; + for (j = 0; j < blksinblk; j++) + sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno + + (j << fs->lfs_fbshift); sp->sum_bytes_left -= sizeof(int32_t); sp->seg_bytes_left -= bp->b_bcount; @@ -1128,6 +1197,135 @@ loop: for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { return count; } +#if DEBUG +# define DEBUG_OOFF(n) do { \ + if (ooff == 0) { \ + printf("lfs_updatemeta[%d]: warning: writing " \ + "ino %d lbn %" PRId64 " at 0x%" PRIx64 \ + ", was 0x0\n", (n), ip->i_number, lbn, daddr); \ + } \ +} while(0) +#else +# define DEBUG_OOFF(n) +#endif + +/* + * Change the given block's address to ndaddr, finding its previous + * location using ufs_bmaparray(). + * + * Account for this change in the segment table. 
+ */ +void +lfs_update_single(struct lfs *fs, struct segment *sp, daddr_t lbn, + int32_t ndaddr, int size, int num) +{ + SEGUSE *sup; + struct buf *bp; + struct indir a[NIADDR + 2], *ap; + struct inode *ip; + struct vnode *vp; + daddr_t daddr, ooff; + int error; + int bb, osize, obb; + + vp = sp->vp; + ip = VTOI(vp); + + error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL); + if (error) + panic("lfs_updatemeta: ufs_bmaparray returned %d", error); + if (daddr > 0) + daddr = dbtofsb(fs, daddr); + + bb = fragstofsb(fs, numfrags(fs, size)); + switch (num) { + case 0: + ooff = ip->i_ffs_db[lbn]; + DEBUG_OOFF(0); + if (ooff == UNWRITTEN) + ip->i_ffs_blocks += bb; + else { + /* possible fragment truncation or extension */ + obb = btofsb(fs, ip->i_lfs_fragsize[lbn]); + ip->i_ffs_blocks += (bb - obb); + } + ip->i_ffs_db[lbn] = ndaddr; + break; + case 1: + ooff = ip->i_ffs_ib[a[0].in_off]; + DEBUG_OOFF(1); + if (ooff == UNWRITTEN) + ip->i_ffs_blocks += bb; + ip->i_ffs_ib[a[0].in_off] = ndaddr; + break; + default: + ap = &a[num - 1]; + if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp)) + panic("lfs_updatemeta: bread bno %" PRId64, + ap->in_lbn); + + /* XXX ondisk32 */ + ooff = ((int32_t *)bp->b_data)[ap->in_off]; + DEBUG_OOFF(num); + if (ooff == UNWRITTEN) + ip->i_ffs_blocks += bb; + /* XXX ondisk32 */ + ((int32_t *)bp->b_data)[ap->in_off] = ndaddr; + (void) VOP_BWRITE(bp); + } + KASSERT(daddr < fs->lfs_lastpseg || daddr > ndaddr); + + /* + * Update segment usage information, based on old size + * and location. + */ + if (daddr > 0) { + u_int32_t oldsn = dtosn(fs, daddr); +#ifdef DIAGNOSTIC + int ndupino = (sp->seg_number == oldsn) ? 
+ sp->ndupino : 0; +#endif + if (lbn >= 0 && lbn < NDADDR) + osize = ip->i_lfs_fragsize[lbn]; + else + osize = fs->lfs_bsize; + LFS_SEGENTRY(sup, fs, oldsn, bp); +#ifdef DIAGNOSTIC + if (sup->su_nbytes + DINODE_SIZE * ndupino < osize) { + printf("lfs_updatemeta: negative bytes " + "(segment %" PRIu32 " short by %" PRId64 + ")\n", dtosn(fs, daddr), + (int64_t)osize - + (DINODE_SIZE * sp->ndupino + + sup->su_nbytes)); + printf("lfs_updatemeta: ino %d, lbn %" PRId64 + ", addr = 0x%" PRIx64 "\n", + VTOI(sp->vp)->i_number, lbn, daddr); + printf("lfs_updatemeta: ndupino=%d\n", ndupino); + panic("lfs_updatemeta: negative bytes"); + sup->su_nbytes = osize - DINODE_SIZE * sp->ndupino; + } +#endif +#ifdef DEBUG_SU_NBYTES + printf("seg %" PRIu32 " -= %d for ino %d lbn %" PRId64 + " db 0x%" PRIx64 "\n", + dtosn(fs, daddr), osize, + VTOI(sp->vp)->i_number, lbn, daddr); +#endif + sup->su_nbytes -= osize; + if (!(bp->b_flags & B_GATHERED)) + fs->lfs_flags |= LFS_IFDIRTY; + LFS_WRITESEGENTRY(sup, fs, oldsn, bp); + } + /* + * Now that this block has a new address, and its old + * segment no longer owns it, we can forget about its + * old size. + */ + if (lbn >= 0 && lbn < NDADDR) + ip->i_lfs_fragsize[lbn] = size; +} + /* * Update the metadata that points to the blocks listed in the FINFO * array. @@ -1135,32 +1333,28 @@ loop: for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { void lfs_updatemeta(struct segment *sp) { - SEGUSE *sup; - struct buf *bp, *sbp; + struct buf *sbp; struct lfs *fs; struct vnode *vp; - struct indir a[NIADDR + 2], *ap; - struct inode *ip; - daddr_t daddr, lbn, off; - daddr_t ooff; - int error, i, nblocks, num; - int bb, osize, obb; + daddr_t lbn; + int i, nblocks, num; + int bb; + int bytesleft, size; vp = sp->vp; nblocks = &sp->fip->fi_blocks[sp->fip->fi_nblocks] - sp->start_lbp; - if (nblocks < 0) - panic("This is a bad thing"); - if (vp == NULL || nblocks == 0) + KASSERT(nblocks >= 0); + if (vp == NULL || nblocks == 0) return; - /* Sort the blocks. 
*/ /* - * XXX KS - We have to sort even if the blocks come from the + * Sort the blocks. + * + * We have to sort even if the blocks come from the * cleaner, because there might be other pending blocks on the * same inode...and if we don't sort, and there are fragments * present, blocks may be written in the wrong place. */ - /* if (!(sp->seg_flags & SEGM_CLEAN)) */ lfs_shellsort(sp->start_bpp, sp->start_lbp, nblocks); /* @@ -1174,24 +1368,18 @@ lfs_updatemeta(struct segment *sp) * XXX true until lfs_markv is fixed to do everything with * XXX fake blocks (including fake inodes and fake indirect blocks). */ - sp->fip->fi_lastlength = sp->start_bpp[nblocks - 1]->b_bcount; + fs = sp->fs; + sp->fip->fi_lastlength = ((sp->start_bpp[nblocks - 1]->b_bcount - 1) & + fs->lfs_bmask) + 1; /* * Assign disk addresses, and update references to the logical * block and the segment usage information. */ - fs = sp->fs; for (i = nblocks; i--; ++sp->start_bpp) { - lbn = *sp->start_lbp++; sbp = *sp->start_bpp; - + lbn = *sp->start_lbp++; sbp->b_blkno = fsbtodb(fs, fs->lfs_offset); - off = fs->lfs_offset; - if (sbp->b_blkno == sbp->b_lblkno) { - printf("lfs_updatemeta: ino %d blk %" PRId64 - " has same lbn and daddr\n", - VTOI(vp)->i_number, off); - } /* * If we write a frag in the wrong place, the cleaner won't @@ -1200,124 +1388,24 @@ lfs_updatemeta(struct segment *sp) * that the indirect block that actually ends the list * is of a smaller size!) 
*/ - if (sbp->b_bcount < fs->lfs_bsize && i != 0) + if ((sbp->b_bcount & fs->lfs_bmask) && i != 0) panic("lfs_updatemeta: fragment is not last block"); - - bb = fragstofsb(fs, numfrags(fs, sbp->b_bcount)); - fs->lfs_offset += bb; - error = ufs_bmaparray(vp, lbn, &daddr, a, &num, NULL); - if (daddr > 0) - daddr = dbtofsb(fs, daddr); - if (error) - panic("lfs_updatemeta: ufs_bmaparray %d", error); - ip = VTOI(vp); - switch (num) { - case 0: - ooff = ip->i_ffs_db[lbn]; -#ifdef DEBUG - if (ooff == 0) { - printf("lfs_updatemeta[1]: warning: writing " - "ino %d lbn %" PRId64 " at 0x%" PRIx64 - ", was 0x0\n", ip->i_number, lbn, off); - } -#endif - if (ooff == UNWRITTEN) - ip->i_ffs_blocks += bb; - else { - /* possible fragment truncation or extension */ - obb = btofsb(fs, ip->i_lfs_fragsize[lbn]); - ip->i_ffs_blocks += (bb - obb); - } - ip->i_ffs_db[lbn] = off; - break; - case 1: - ooff = ip->i_ffs_ib[a[0].in_off]; -#ifdef DEBUG - if (ooff == 0) { - printf("lfs_updatemeta[2]: warning: writing " - "ino %d lbn %" PRId64 " at 0x%" PRIx64 - ", was 0x0\n", ip->i_number, lbn, off); - } -#endif - if (ooff == UNWRITTEN) - ip->i_ffs_blocks += bb; - ip->i_ffs_ib[a[0].in_off] = off; - break; - default: - ap = &a[num - 1]; - if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp)) - panic("lfs_updatemeta: bread bno %" PRId64, - ap->in_lbn); - - /* XXX ondisk32 */ - ooff = ((int32_t *)bp->b_data)[ap->in_off]; -#if DEBUG - if (ooff == 0) { - printf("lfs_updatemeta[3]: warning: writing " - "ino %d lbn %" PRId64 " at 0x%" PRIx64 - ", was 0x0\n", ip->i_number, lbn, off); - } -#endif - if (ooff == UNWRITTEN) - ip->i_ffs_blocks += bb; - /* XXX ondisk32 */ - ((int32_t *)bp->b_data)[ap->in_off] = off; - (void) VOP_BWRITE(bp); - } -#ifdef DEBUG - if (daddr >= fs->lfs_lastpseg && daddr <= off) { - printf("lfs_updatemeta: ino %d, lbn %" PRId64 ", " - "addr = %" PRIx64 " in same pseg\n", - VTOI(sp->vp)->i_number, sbp->b_lblkno, daddr); - } -#endif + /* - * Update segment usage information, based on 
old size - * and location. + * For each subblock in this possibly oversized block, + * update its address on disk. */ - if (daddr > 0) { - u_int32_t oldsn = dtosn(fs, daddr); -#ifdef DIAGNOSTIC - int ndupino = (sp->seg_number == oldsn) ? - sp->ndupino : 0; -#endif - if (lbn >= 0 && lbn < NDADDR) - osize = ip->i_lfs_fragsize[lbn]; - else - osize = fs->lfs_bsize; - LFS_SEGENTRY(sup, fs, oldsn, bp); -#ifdef DIAGNOSTIC - if (sup->su_nbytes + DINODE_SIZE * ndupino < osize) { - printf("lfs_updatemeta: negative bytes " - "(segment %" PRIu32 " short by %d)\n", - dtosn(fs, daddr), - osize - sup->su_nbytes); - printf("lfs_updatemeta: ino %d, lbn %" PRId64 - ", addr = 0x%" PRIx64 "\n", - VTOI(sp->vp)->i_number, lbn, daddr); - printf("lfs_updatemeta: ndupino=%d\n", ndupino); - panic("lfs_updatemeta: negative bytes"); - sup->su_nbytes = osize; - } -#endif -#ifdef DEBUG_SU_NBYTES - printf("seg %" PRIu32 " -= %d for ino %d lbn %" PRId64 - " db 0x%" PRIx64 "\n", - dtosn(fs, daddr), osize, - VTOI(sp->vp)->i_number, lbn, daddr); -#endif - sup->su_nbytes -= osize; - if (!(bp->b_flags & B_GATHERED)) - fs->lfs_flags |= LFS_IFDIRTY; - error = LFS_BWRITE_LOG(bp); /* Ifile */ + KASSERT(lbn >= 0 || sbp->b_bcount == fs->lfs_bsize); + for (bytesleft = sbp->b_bcount; bytesleft > 0; + bytesleft -= fs->lfs_bsize) { + size = MIN(bytesleft, fs->lfs_bsize); + bb = fragstofsb(fs, numfrags(fs, size)); + lfs_update_single(fs, sp, lbn, fs->lfs_offset, + size, num); + fs->lfs_offset += bb; + ++lbn; } - /* - * Now that this block has a new address, and its old - * segment no longer owns it, we can forget about its - * old size. 
- */ - if (lbn >= 0 && lbn < NDADDR) - ip->i_lfs_fragsize[lbn] = sbp->b_bcount; + } } @@ -1347,8 +1435,10 @@ lfs_initseg(struct lfs *fs) lfs_newseg(fs); repeat = 1; fs->lfs_offset = fs->lfs_curseg; + sp->seg_number = dtosn(fs, fs->lfs_curseg); sp->seg_bytes_left = fsbtob(fs, fs->lfs_fsbpseg); + /* * If the segment contains a superblock, update the offset * and summary address to skip over it. @@ -1382,15 +1472,15 @@ lfs_initseg(struct lfs *fs) sp->cbpp = sp->bpp; #ifdef LFS_MALLOC_SUMMARY sbp = *sp->cbpp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, - fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize); + fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize, LFS_NB_SUMMARY); sp->segsum = (*sp->cbpp)->b_data; #else sbp = *sp->cbpp = getblk(VTOI(fs->lfs_ivnode)->i_devvp, fsbtodb(fs, fs->lfs_offset), NBPG, 0, 0); - memset(sbp->b_data, 0x5a, NBPG); + /* memset(sbp->b_data, 0x5a, NBPG); */ sp->segsum = (*sp->cbpp)->b_data + NBPG - fs->lfs_sumsize; #endif - bzero(sp->segsum, fs->lfs_sumsize); + memset(sp->segsum, 0, fs->lfs_sumsize); sp->start_bpp = ++sp->cbpp; fs->lfs_offset += btofsb(fs, fs->lfs_sumsize); @@ -1436,14 +1526,14 @@ lfs_newseg(struct lfs *fs) sup->su_nbytes = 0; sup->su_nsums = 0; sup->su_ninos = 0; - (void) LFS_BWRITE_LOG(bp); /* Ifile */ + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_nextseg), bp); LFS_CLEANERINFO(cip, fs, bp); --cip->clean; ++cip->dirty; fs->lfs_nclean = cip->clean; LFS_SYNC_CLEANERINFO(cip, fs, bp, 1); - + fs->lfs_lastseg = fs->lfs_curseg; fs->lfs_curseg = fs->lfs_nextseg; for (sn = curseg = dtosn(fs, fs->lfs_curseg) + fs->lfs_interleave;;) { @@ -1452,7 +1542,12 @@ lfs_newseg(struct lfs *fs) panic("lfs_nextseg: no clean segments"); LFS_SEGENTRY(sup, fs, sn, bp); isdirty = sup->su_flags & SEGUSE_DIRTY; - brelse(bp); + /* Check SEGUSE_EMPTY as we go along */ + if (isdirty && sup->su_nbytes == 0 && !(sup->su_flags & SEGUSE_EMPTY)) + LFS_WRITESEGENTRY(sup, fs, sn, bp); + else + brelse(bp); + if (!isdirty) break; } @@ -1478,7 +1573,7 @@ 
lookahead_pagemove(struct buf **bpp, int nblocks, size_t *size) return bpp; #else while((bp = *bpp) != NULL && *size < maxsize && nblocks--) { - if(bp->b_flags & B_CALL) + if(LFS_IS_MALLOC_BUF(bp)) return bpp; if(bp->b_bcount % NBPG) return bpp; @@ -1503,6 +1598,8 @@ extern LIST_HEAD(bufhashhdr, buf) invalhash; #define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash) #define bremhash(bp) LIST_REMOVE(bp, b_hash) +extern int maxbpp; + static struct buf * lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr, int n) { @@ -1510,8 +1607,8 @@ lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr, int n) struct buf **bpp, *bp; int s; - cl = (struct lfs_cluster *)malloc(sizeof(*cl), M_SEGMENT, M_WAITOK); - bpp = (struct buf **)malloc(n*sizeof(*bpp), M_SEGMENT, M_WAITOK); + cl = (struct lfs_cluster *)pool_get(&fs->lfs_clpool, PR_WAITOK); + bpp = (struct buf **)pool_get(&fs->lfs_bpppool, PR_WAITOK); memset(cl, 0, sizeof(*cl)); cl->fs = fs; cl->bpp = bpp; @@ -1575,7 +1672,8 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) SEGSUM *ssp; dev_t i_dev; char *datap, *dp; - int do_again, i, nblocks, s; + int i, s; + int do_again, nblocks, byteoffset; size_t el_size; struct lfs_cluster *cl; int (*strategy)(void *); @@ -1606,6 +1704,11 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) if ((nblocks = sp->cbpp - sp->bpp) == 1) return (0); +#if 0 + printf("lfs_writeseg: %d blocks at 0x%x\n", nblocks, + dbtofsb(fs, sp->bpp[0]->b_blkno)); +#endif + i_dev = VTOI(fs->lfs_ivnode)->i_dev; devvp = VTOI(fs->lfs_ivnode)->i_devvp; @@ -1646,7 +1749,8 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize); do_again = !(bp->b_flags & B_GATHERED); - (void)LFS_BWRITE_LOG(bp); /* Ifile */ + LFS_WRITESEGENTRY(sup, fs, sp->seg_number, bp); /* Ifile */ + /* * Mark blocks B_BUSY, to prevent then from being changed between * the checksum computation and the actual write. 
@@ -1657,9 +1761,11 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) */ for (bpp = sp->bpp, i = nblocks - 1; i--;) { ++bpp; - if ((*bpp)->b_flags & B_CALL) - continue; bp = *bpp; + if (bp->b_flags & B_CALL) { /* UBC or malloced buffer */ + bp->b_flags |= B_BUSY; + continue; + } again: s = splbio(); if (bp->b_flags & B_BUSY) { @@ -1675,7 +1781,10 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) } bp->b_flags |= B_BUSY; splx(s); - /* Check and replace indirect block UNWRITTEN bogosity */ + /* + * Check and replace indirect block UNWRITTEN bogosity. + * XXX See comment in lfs_writefile. + */ if (bp->b_lblkno < 0 && bp->b_vp != devvp && bp->b_vp && VTOI(bp->b_vp)->i_ffs_blocks != VTOI(bp->b_vp)->i_lfs_effnblks) { @@ -1687,11 +1796,10 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) #endif /* Make a copy we'll make changes to */ newbp = lfs_newbuf(fs, bp->b_vp, bp->b_lblkno, - bp->b_bcount); + bp->b_bcount, LFS_NB_IBLOCK); newbp->b_blkno = bp->b_blkno; memcpy(newbp->b_data, bp->b_data, newbp->b_bcount); - *bpp = newbp; changed = 0; /* XXX ondisk32 */ @@ -1699,10 +1807,32 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) daddrp < (int32_t *)(newbp->b_data + newbp->b_bcount); daddrp++) { if (*daddrp == UNWRITTEN) { - ++changed; #ifdef DEBUG_LFS - printf("lfs_writeseg: replacing UNWRITTEN\n"); -#endif + off_t doff; + int32_t ioff; + + ioff = daddrp - (int32_t *)(newbp->b_data); + doff = (-bp->b_lblkno + ioff) * fs->lfs_bsize; + printf("ino %d lbn %" PRId64 " entry %d off %" PRIx64 "\n", + VTOI(bp->b_vp)->i_number, + bp->b_lblkno, ioff, doff); +# ifdef LFS_UBC + if (bp->b_vp->v_type == VREG) { + /* + * What is up with this page? 
+ */ + struct vm_page *pg; + for (; doff / fs->lfs_bsize == (-bp->b_lblkno + ioff); doff += PAGE_SIZE) { + pg = uvm_pagelookup(&bp->b_vp->v_uobj, doff); + if (pg == NULL) + printf(" page at %" PRIx64 " is NULL\n", doff); + else + printf(" page at %" PRIx64 " flags 0x%x pqflags 0x%x\n", doff, pg->flags, pg->pqflags); + } + } +# endif /* LFS_UBC */ +#endif /* DEBUG_LFS */ + ++changed; *daddrp = 0; } } @@ -1711,9 +1841,18 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) * though, if it still has dirty data on it. */ if (changed) { - bp->b_flags &= ~(B_ERROR | B_GATHERED); +#ifdef DEBUG_LFS + printf("lfs_writeseg: replacing UNWRITTEN(%d):" + " bp = %p newbp = %p\n", changed, bp, + newbp); +#endif + *bpp = newbp; + bp->b_flags &= ~(B_ERROR | B_GATHERED | B_DONE); if (bp->b_flags & B_CALL) { - lfs_freebuf(bp); + printf("lfs_writeseg: indir bp should not be B_CALL\n"); + s = splbio(); + biodone(bp); + splx(s); bp = NULL; } else { /* Still on free list, leave it there */ @@ -1731,22 +1870,8 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) fs->lfs_avail -= btofsb(fs, bp->b_bcount); } } else { - bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI | - B_GATHERED); - if (bp->b_flags & B_CALL) { - lfs_freebuf(bp); - bp = NULL; - } else { - bremfree(bp); - bp->b_flags |= B_DONE; - s = splbio(); - reassignbuf(bp, bp->b_vp); - splx(s); - LFS_UNLOCK_BUF(bp); - brelse(bp); - } + lfs_freebuf(fs, newbp); } - } } /* @@ -1757,21 +1882,31 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) * XXX * Fix this to do it inline, instead of malloc/copy. 
*/ + datap = dp = pool_get(&fs->lfs_bpppool, PR_WAITOK); if (fs->lfs_version == 1) el_size = sizeof(u_long); else el_size = sizeof(u_int32_t); - datap = dp = malloc(nblocks * el_size, M_SEGMENT, M_WAITOK); - for (bpp = sp->bpp, i = nblocks - 1; i--;) { - if (((*++bpp)->b_flags & (B_CALL|B_INVAL)) == (B_CALL|B_INVAL)) { - if (copyin((*bpp)->b_saveaddr, dp, el_size)) - panic("lfs_writeseg: copyin failed [1]: " - "ino %d blk %" PRId64, - VTOI((*bpp)->b_vp)->i_number, - (*bpp)->b_lblkno); - } else - memcpy(dp, (*bpp)->b_data, el_size); - dp += el_size; + for (bpp = sp->bpp, i = nblocks - 1; i--; ) { + ++bpp; + /* Loop through gop_write cluster blocks */ + for (byteoffset = 0; byteoffset < (*bpp)->b_bcount; + byteoffset += fs->lfs_bsize) { + if (((*bpp)->b_flags & (B_CALL | B_INVAL)) == + (B_CALL | B_INVAL)) { + if (copyin((caddr_t)(*bpp)->b_saveaddr + + byteoffset, dp, el_size)) { + panic("lfs_writeseg: copyin failed [1]: " + "ino %d blk %" PRId64, + VTOI((*bpp)->b_vp)->i_number, + (*bpp)->b_lblkno); + } + } else { + memcpy(dp, (*bpp)->b_data + byteoffset, + el_size); + } + dp += el_size; + } } if (fs->lfs_version == 1) ssp->ss_ocreate = time.tv_sec; @@ -1787,7 +1922,7 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) ssp->ss_datasum = cksum(datap, (nblocks - 1) * el_size); ssp->ss_sumsum = cksum(&ssp->ss_datasum, fs->lfs_sumsize - sizeof(ssp->ss_sumsum)); - free(datap, M_SEGMENT); + pool_put(&fs->lfs_bpppool, datap); datap = dp = NULL; #ifdef DIAGNOSTIC if (fs->lfs_bfree < btofsb(fs, ninos * fs->lfs_ibsize) + btofsb(fs, fs->lfs_sumsize)) @@ -1854,7 +1989,7 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) if(use_pagemove == 0) { cl->flags |= LFS_CL_MALLOC; cl->olddata = cbp->b_data; - cbp->b_data = malloc(CHUNKSIZE, M_SEGMENT, M_WAITOK); + cbp->b_data = lfs_malloc(fs, CHUNKSIZE, LFS_NB_CLUSTER); } #if defined(DEBUG) && defined(DIAGNOSTIC) if(dtosn(fs, dbtofsb(fs, (*bpp)->b_blkno + btodb((*bpp)->b_bcount - 1))) != @@ -1870,12 +2005,6 @@ lfs_writeseg(struct lfs *fs, 
struct segment *sp) /* * Construct the cluster. */ - while (fs->lfs_iocount >= LFS_THROTTLE) { -#ifdef DEBUG_LFS - printf("[%d]", fs->lfs_iocount); -#endif - tsleep(&fs->lfs_iocount, PRIBIO+1, "lfs_throttle", 0); - } ++fs->lfs_iocount; for (p = cbp->b_data; i && cbp->b_bcount < CHUNKSIZE; i--) { @@ -1884,6 +2013,17 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) if (bp->b_bcount > (CHUNKSIZE - cbp->b_bcount)) break; +#ifdef DIAGNOSTIC + if (dtosn(fs, dbtofsb(fs, bp->b_blkno + + btodb(bp->b_bcount - 1))) != + sp->seg_number) { + printf("blk size %ld daddr %" PRIx64 " not in seg %d\n", + bp->b_bcount, bp->b_blkno, + sp->seg_number); + panic("segment overwrite"); + } +#endif + /* * Fake buffers from the cleaner are marked as B_INVAL. * We need to copy the data from user space rather than @@ -1939,7 +2079,7 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) bp->b_flags &= ~(B_DELWRI | B_READ | B_ERROR); #ifdef LFS_MNOBUSY if (cl->flags & LFS_CL_MALLOC) { - if (!(bp->b_flags & B_CALL)) + if (!LFS_IS_MALLOC_BUF(bp))) brelse(bp); /* Still B_LOCKED */ } #endif @@ -1966,7 +2106,7 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) printf("lfs_writeseg: marking ino %d\n", ip->i_number); #endif - if (bp->b_flags & B_CALL) + if (LFS_IS_MALLOC_BUF(bp)) LFS_SET_UINO(ip, IN_CLEANING); else LFS_SET_UINO(ip, IN_MODIFIED); @@ -1980,7 +2120,7 @@ lfs_writeseg(struct lfs *fs, struct segment *sp) /* * In order to include the summary in a clustered block, * it may be necessary to shift the block forward (since - * summary blocks are in generay smaller than can be + * summary blocks are in general smaller than can be * addressed by pagemove(). After the write, the block * will be corrected before disassembly. */ @@ -2036,7 +2176,8 @@ lfs_writesuper(struct lfs *fs, daddr_t daddr) /* Checksum the superblock and copy it into a buffer. 
*/ fs->lfs_cksum = lfs_sb_cksum(&(fs->lfs_dlfs)); - bp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, fsbtodb(fs, daddr), LFS_SBPAD); + bp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp, fsbtodb(fs, daddr), LFS_SBPAD, LFS_NB_SBLOCK); + memset(bp->b_data + sizeof(struct dlfs), 0, LFS_SBPAD - sizeof(struct dlfs)); *(struct dlfs *)bp->b_data = fs->lfs_dlfs; bp->b_dev = i_dev; @@ -2062,9 +2203,17 @@ lfs_writesuper(struct lfs *fs, daddr_t daddr) int lfs_match_fake(struct lfs *fs, struct buf *bp) { - return (bp->b_flags & B_CALL); + return LFS_IS_MALLOC_BUF(bp); } +#if 0 +int +lfs_match_real(struct lfs *fs, struct buf *bp) +{ + return (lfs_match_data(fs, bp) && !lfs_match_fake(fs, bp)); +} +#endif + int lfs_match_data(struct lfs *fs, struct buf *bp) { @@ -2108,9 +2257,10 @@ lfs_match_tindir(struct lfs *fs, struct buf *bp) void lfs_callback(struct buf *bp) { - /* struct lfs *fs; */ - /* fs = (struct lfs *)bp->b_saveaddr; */ - lfs_freebuf(bp); + struct lfs *fs; + + fs = (struct lfs *)bp->b_saveaddr; + lfs_freebuf(fs, bp); } static void @@ -2121,9 +2271,9 @@ lfs_super_aiodone(struct buf *bp) fs = (struct lfs *)bp->b_saveaddr; fs->lfs_sbactive = 0; wakeup(&fs->lfs_sbactive); - if (--fs->lfs_iocount < LFS_THROTTLE) + if (--fs->lfs_iocount == 0) wakeup(&fs->lfs_iocount); - lfs_freebuf(bp); + lfs_freebuf(fs, bp); } static void @@ -2132,7 +2282,7 @@ lfs_cluster_aiodone(struct buf *bp) struct lfs_cluster *cl; struct lfs *fs; struct buf *tbp; - struct vnode *vp; + struct vnode *vp, *devvp; int s, error=0; char *cp; extern int locked_queue_count; @@ -2143,6 +2293,7 @@ lfs_cluster_aiodone(struct buf *bp) cl = (struct lfs_cluster *)bp->b_saveaddr; fs = cl->fs; + devvp = VTOI(fs->lfs_ivnode)->i_devvp; bp->b_saveaddr = cl->saveaddr; /* If shifted, shift back now */ @@ -2172,13 +2323,19 @@ lfs_cluster_aiodone(struct buf *bp) * the cluster was written, free it. Otherwise, keep it on * the locked list to be written again. 
*/ + vp = tbp->b_vp; if ((tbp->b_flags & (B_LOCKED | B_DELWRI)) == B_LOCKED) LFS_UNLOCK_BUF(tbp); +#if 0 + else if (vp != devvp) + printf("dirtied while busy?! bp %p, ino %d, lbn %d\n", + tbp, vp ? VTOI(vp)->i_number : -1, + tbp->b_lblkno); +#endif tbp->b_flags &= ~B_GATHERED; LFS_BCLEAN_LOG(fs, tbp); - vp = tbp->b_vp; /* Segment summary for a shifted cluster */ if(!cl->bufcount && (cl->flags & LFS_CL_SHIFT)) tbp->b_flags |= B_INVAL; @@ -2197,7 +2354,30 @@ lfs_cluster_aiodone(struct buf *bp) } #endif if (tbp->b_flags & (B_BUSY | B_CALL)) { + if ((tbp->b_flags & B_CALL) && !LFS_IS_MALLOC_BUF(tbp)) { + /* printf("flags 0x%lx\n", tbp->b_flags); */ + /* + * A buffer from the page daemon. + * We use the same iodone as it does, + * so we must manually disassociate its + * buffers from the vp. + */ + if (tbp->b_vp) { + /* This is just silly */ + s = splbio(); + brelvp(tbp); + tbp->b_vp = vp; + splx(s); + } + /* Put it back the way it was */ + tbp->b_flags |= B_ASYNC; + /* Master buffers have B_AGE */ + if (tbp->b_private == tbp) + tbp->b_flags |= B_AGE; + } + s = splbio(); biodone(tbp); + splx(s); } } @@ -2209,7 +2389,7 @@ lfs_cluster_aiodone(struct buf *bp) (char *)bp->b_data, bp->b_bufsize); } if(cl->flags & LFS_CL_MALLOC) { - free(bp->b_data, M_SEGMENT); + lfs_free(fs, bp->b_data, LFS_NB_CLUSTER); bp->b_data = cl->olddata; } bp->b_bcount = 0; @@ -2231,23 +2411,12 @@ lfs_cluster_aiodone(struct buf *bp) if (fs->lfs_iocount == 0) panic("lfs_cluster_aiodone: zero iocount"); #endif - if (--fs->lfs_iocount < LFS_THROTTLE) + if (--fs->lfs_iocount == 0) wakeup(&fs->lfs_iocount); -#if 0 - if (fs->lfs_iocount == 0) { - /* - * Vinvalbuf can move locked buffers off the locked queue - * and we have no way of knowing about this. So, after - * doing a big write, we recalculate how many buffers are - * really still left on the locked queue. 
- */ - lfs_countlocked(&locked_queue_count, &locked_queue_bytes, "lfs_cluster_callback"); - wakeup(&locked_queue_count); - } -#endif - free(cl->bpp, M_SEGMENT); - free(cl, M_SEGMENT); + pool_put(&fs->lfs_bpppool, cl->bpp); + cl->bpp = NULL; + pool_put(&fs->lfs_clpool, cl); } static void @@ -2294,15 +2463,16 @@ lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb) static int __rsshell_increments[] = { 4, 1, 0 }; int incr, *incrp, t1, t2; struct buf *bp_temp; - u_long lb_temp; + u_int32_t lbt, *lba; + lba = (u_int32_t *)lb_array; for (incrp = __rsshell_increments; (incr = *incrp++) != 0;) for (t1 = incr; t1 < nmemb; ++t1) for (t2 = t1 - incr; t2 >= 0;) - if (lb_array[t2] > lb_array[t2 + incr]) { - lb_temp = lb_array[t2]; - lb_array[t2] = lb_array[t2 + incr]; - lb_array[t2 + incr] = lb_temp; + if (lba[t2] > lba[t2 + incr]) { + lbt = lba[t2]; + lba[t2] = lba[t2 + incr]; + lba[t2 + incr] = lbt; bp_temp = bp_array[t2]; bp_array[t2] = bp_array[t2 + incr]; bp_array[t2 + incr] = bp_temp; diff --git a/sys/ufs/lfs/lfs_subr.c b/sys/ufs/lfs/lfs_subr.c index 8c7d0d0070db..bb146df9eb7b 100644 --- a/sys/ufs/lfs/lfs_subr.c +++ b/sys/ufs/lfs/lfs_subr.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_subr.c,v 1.30 2003/01/29 13:14:35 yamt Exp $ */ +/* $NetBSD: lfs_subr.c,v 1.31 2003/02/17 23:48:20 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.30 2003/01/29 13:14:35 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.31 2003/02/17 23:48:20 perseant Exp $"); #include #include @@ -86,6 +86,8 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.30 2003/01/29 13:14:35 yamt Exp $"); #include #include +#include + /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". 
If "res" is non-zero, fill it in with a pointer to the @@ -122,12 +124,177 @@ lfs_blkatoff(void *v) return (0); } +#ifdef LFS_DEBUG_MALLOC +char *lfs_res_names[LFS_NB_COUNT] = { + "summary", + "superblock", + "ifile block", + "cluster", + "clean", +}; +#endif + +int lfs_res_qty[LFS_NB_COUNT] = { + LFS_N_SUMMARIES, + LFS_N_SBLOCKS, + LFS_N_IBLOCKS, + LFS_N_CLUSTERS, + LFS_N_CLEAN, +}; + +void +lfs_setup_resblks(struct lfs *fs) +{ + int i, j; + int maxbpp; + + fs->lfs_resblk = (res_t *)malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT, + M_WAITOK); + for (i = 0; i < LFS_N_TOTAL; i++) { + fs->lfs_resblk[i].inuse = 0; + fs->lfs_resblk[i].p = NULL; + } + for (i = 0; i < LFS_RESHASH_WIDTH; i++) + LIST_INIT(fs->lfs_reshash + i); + + /* + * These types of allocations can be larger than a page, + * so we can't use the pool subsystem for them. + */ + for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++) + fs->lfs_resblk[i].p = malloc(fs->lfs_sumsize, M_SEGMENT, + M_WAITOK); + for (j = 0; j < LFS_N_SBLOCKS; j++, i++) + fs->lfs_resblk[i].p = malloc(LFS_SBPAD, M_SEGMENT, M_WAITOK); + for (j = 0; j < LFS_N_IBLOCKS; j++, i++) + fs->lfs_resblk[i].p = malloc(fs->lfs_bsize, M_SEGMENT, M_WAITOK); + for (j = 0; j < LFS_N_CLUSTERS; j++, i++) + fs->lfs_resblk[i].p = malloc(MAXPHYS, M_SEGMENT, M_WAITOK); + for (j = 0; j < LFS_N_CLEAN; j++, i++) + fs->lfs_resblk[i].p = malloc(MAXPHYS, M_SEGMENT, M_WAITOK); + + /* + * Initialize pools for small types (XXX is BPP small?) 
+ */ + maxbpp = ((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2); + maxbpp = MIN(maxbpp, fs->lfs_ssize / fs->lfs_fsize + 2); + pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0, + LFS_N_BPP, "lfsbpppl", &pool_allocator_nointr); + pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0, + LFS_N_CL, "lfsclpl", &pool_allocator_nointr); + pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0, + LFS_N_SEG, "lfssegpool", &pool_allocator_nointr); +} + +void +lfs_free_resblks(struct lfs *fs) +{ + int i; + + pool_destroy(&fs->lfs_bpppool); + pool_destroy(&fs->lfs_segpool); + pool_destroy(&fs->lfs_clpool); + + for (i = 0; i < LFS_N_TOTAL; i++) { + while(fs->lfs_resblk[i].inuse) + tsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0); + if (fs->lfs_resblk[i].p != NULL) + free(fs->lfs_resblk[i].p, M_SEGMENT); + } + free(fs->lfs_resblk, M_SEGMENT); +} + +static unsigned int +lfs_mhash(void *vp) +{ + return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH; +} + +/* + * Return memory of the given size for the given purpose, or use one of a + * number of spare last-resort buffers, if malloc returns NULL. + */ +void * +lfs_malloc(struct lfs *fs, size_t size, int type) +{ + struct lfs_res_blk *re; + void *r; + int i, s, start; + unsigned int h; + + /* If no mem allocated for this type, it just waits */ + if (lfs_res_qty[type] == 0) + return malloc(size, M_SEGMENT, M_WAITOK); + + /* Otherwise try a quick malloc, and if it works, great */ + if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL) + return r; + + /* + * If malloc returned NULL, we are forced to use one of our + * reserve blocks. We have on hand at least one summary block, + * at least one cluster block, at least one superblock, + * and several indirect blocks. 
+ */ + /* skip over blocks of other types */ + for (i = 0, start = 0; i < type; i++) + start += lfs_res_qty[i]; + while (r == NULL) { + for (i = 0; i < lfs_res_qty[type]; i++) { + if (fs->lfs_resblk[start + i].inuse == 0) { + re = fs->lfs_resblk + start + i; + re->inuse = 1; + r = re->p; + h = lfs_mhash(r); + s = splbio(); + LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res); + splx(s); + return r; + } + } +#ifdef LFS_DEBUG_MALLOC + printf("sleeping on %s (%d)\n", lfs_res_names[type], lfs_res_qty[type]); +#endif + tsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0); +#ifdef LFS_DEBUG_MALLOC + printf("done sleeping on %s\n", lfs_res_names[type]); +#endif + } + /* NOTREACHED */ + return r; +} + +void +lfs_free(struct lfs *fs, void *p, int type) +{ + int s; + unsigned int h; + res_t *re; + + h = lfs_mhash(p); + s = splbio(); + LIST_FOREACH(re, &fs->lfs_reshash[h], res) { + if (re->p == p) { + LIST_REMOVE(re, res); + re->inuse = 0; + wakeup(&fs->lfs_resblk); + splx(s); + return; + } + } + splx(s); + + /* + * If we didn't find it, free it. + */ + free(p, M_SEGMENT); +} /* * lfs_seglock -- * Single thread the segment writer. 
*/ -void +int lfs_seglock(struct lfs *fs, unsigned long flags) { struct segment *sp; @@ -136,8 +303,10 @@ lfs_seglock(struct lfs *fs, unsigned long flags) if (fs->lfs_lockpid == curproc->p_pid) { ++fs->lfs_seglock; fs->lfs_sp->seg_flags |= flags; - return; - } else while (fs->lfs_seglock) + return 0; + } else if (flags & SEGM_PAGEDAEMON) + return EWOULDBLOCK; + else while (fs->lfs_seglock) (void)tsleep(&fs->lfs_seglock, PRIBIO + 1, "lfs seglock", 0); } @@ -148,10 +317,8 @@ lfs_seglock(struct lfs *fs, unsigned long flags) /* Drain fragment size changes out */ lockmgr(&fs->lfs_fraglock, LK_EXCLUSIVE, 0); - sp = fs->lfs_sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK); - sp->bpp = malloc(((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / - sizeof(int32_t) + 1) * sizeof(struct buf *), - M_SEGMENT, M_WAITOK); + sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK); + sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK); sp->seg_flags = flags; sp->vp = NULL; sp->seg_iocount = 0; @@ -164,8 +331,70 @@ lfs_seglock(struct lfs *fs, unsigned long flags) * the writes we intend to do. 
*/ ++fs->lfs_iocount; + return 0; } +static void lfs_unmark_dirop(struct lfs *); + +static void +lfs_unmark_dirop(struct lfs *fs) +{ + struct inode *ip, *nip; + struct vnode *vp; + extern int lfs_dirvcount; + + for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) { + nip = TAILQ_NEXT(ip, i_lfs_dchain); + vp = ITOV(ip); + + if (VOP_ISLOCKED(vp) && + vp->v_lock.lk_lockholder != curproc->p_pid) { + continue; + } + if ((VTOI(vp)->i_flag & IN_ADIROP) == 0) { + --lfs_dirvcount; + vp->v_flag &= ~VDIROP; + TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain); + wakeup(&lfs_dirvcount); + fs->lfs_unlockvp = vp; + vrele(vp); + fs->lfs_unlockvp = NULL; + } + } +} + +#ifndef LFS_NO_AUTO_SEGCLEAN +static void +lfs_auto_segclean(struct lfs *fs) +{ + int i, error; + + /* + * Now that we've swapped lfs_activesb, but while we still + * hold the segment lock, run through the segment list marking + * the empty ones clean. + * XXX - do we really need to do them all at once? + */ + for (i = 0; i < fs->lfs_nseg; i++) { + if ((fs->lfs_suflags[0][i] & + (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) == + (SEGUSE_DIRTY | SEGUSE_EMPTY) && + (fs->lfs_suflags[1][i] & + (SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY)) == + (SEGUSE_DIRTY | SEGUSE_EMPTY)) { + + if ((error = lfs_do_segclean(fs, i)) != 0) { +#ifdef DEBUG + printf("lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error, i); +#endif /* DEBUG */ + } + } + fs->lfs_suflags[1 - fs->lfs_activesb][i] = + fs->lfs_suflags[fs->lfs_activesb][i]; + } +} +#endif /* LFS_AUTO_SEGCLEAN */ + /* * lfs_segunlock -- * Single thread the segment writer. 
@@ -176,9 +405,6 @@ lfs_segunlock(struct lfs *fs) struct segment *sp; unsigned long sync, ckp; struct buf *bp; - struct vnode *vp, *nvp; - struct mount *mp; - extern int lfs_dirvcount; #ifdef LFS_MALLOC_SUMMARY extern int locked_queue_count; extern long locked_queue_bytes; @@ -186,63 +412,9 @@ lfs_segunlock(struct lfs *fs) sp = fs->lfs_sp; - if (fs->lfs_seglock == 1 && !(sp->seg_flags & SEGM_PROT)) { - - mp = fs->lfs_ivnode->v_mount; - /* - * Go through and unmark all DIROP vnodes, possibly - * calling VOP_INACTIVE (through vrele). This is - * delayed until now in order not to accidentally - * write a DIROP node through lfs_flush. - */ -#ifndef LFS_NO_BACKVP_HACK - /* BEGIN HACK */ -#define VN_OFFSET (((caddr_t)&LIST_NEXT(vp, v_mntvnodes)) - (caddr_t)vp) -#define BACK_VP(VP) ((struct vnode *)(((caddr_t)(VP)->v_mntvnodes.le_prev) - VN_OFFSET)) -#define BEG_OF_VLIST ((struct vnode *)(((caddr_t)&LIST_FIRST(&mp->mnt_vnodelist)) - VN_OFFSET)) - - /* Find last vnode. */ - loop: for (vp = LIST_FIRST(&mp->mnt_vnodelist); - vp && LIST_NEXT(vp, v_mntvnodes) != NULL; - vp = LIST_NEXT(vp, v_mntvnodes)); - for (; vp && vp != BEG_OF_VLIST; vp = nvp) { - nvp = BACK_VP(vp); -#else - loop: - for (vp = LIST_FIRST(&mp->mnt_vnodelist); - vp != NULL; - vp = nvp) { - nvp = LIST_NEXT(vp, v_mntvnodes); -#endif - if (vp->v_mount != mp) { - printf("lfs_segunlock: starting over\n"); - goto loop; - } - if (vp->v_type == VNON) - continue; - if (lfs_vref(vp)) - continue; - if (VOP_ISLOCKED(vp) && - vp->v_lock.lk_lockholder != curproc->p_pid) { - lfs_vunref(vp); - continue; - } - if ((vp->v_flag & VDIROP) && - !(VTOI(vp)->i_flag & IN_ADIROP)) { - --lfs_dirvcount; - vp->v_flag &= ~VDIROP; - wakeup(&lfs_dirvcount); - fs->lfs_unlockvp = vp; - lfs_vunref(vp); - vrele(vp); - fs->lfs_unlockvp = NULL; - } else { - lfs_vunref(vp); - } - } - } - if (fs->lfs_seglock == 1) { + if ((sp->seg_flags & SEGM_PROT) == 0) + lfs_unmark_dirop(fs); sync = sp->seg_flags & SEGM_SYNC; ckp = sp->seg_flags & SEGM_CKP; if 
(sp->bpp != sp->cbpp) { @@ -250,7 +422,7 @@ lfs_segunlock(struct lfs *fs) fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize); bp = *sp->bpp; #ifdef LFS_MALLOC_SUMMARY - lfs_freebuf(bp); + lfs_freebuf(fs, bp); #else s = splbio(); bremfree(bp); @@ -263,11 +435,11 @@ lfs_segunlock(struct lfs *fs) } else printf ("unlock to 0 with no summary"); - free(sp->bpp, M_SEGMENT); + pool_put(&fs->lfs_bpppool, sp->bpp); sp->bpp = NULL; /* The sync case holds a reference in `sp' to be freed below */ if (!sync) - free(sp, M_SEGMENT); + pool_put(&fs->lfs_segpool, sp); fs->lfs_sp = NULL; /* @@ -275,9 +447,7 @@ lfs_segunlock(struct lfs *fs) * At the moment, the user's process hangs around so we can * sleep. */ - if (--fs->lfs_iocount < LFS_THROTTLE) - wakeup(&fs->lfs_iocount); - if(fs->lfs_iocount == 0) { + if (--fs->lfs_iocount == 0) { lfs_countlocked(&locked_queue_count, &locked_queue_bytes, "lfs_segunlock"); wakeup(&locked_queue_count); @@ -309,15 +479,18 @@ lfs_segunlock(struct lfs *fs) /* printf("sleeping on iocount %x == %d\n", sp, sp->seg_iocount); */ } if (sync) - free(sp, M_SEGMENT); + pool_put(&fs->lfs_segpool, sp); if (ckp) { fs->lfs_nactive = 0; /* If we *know* everything's on disk, write both sbs */ + /* XXX should wait for this one */ if (sync) - lfs_writesuper(fs,fs->lfs_sboffs[fs->lfs_activesb]); + lfs_writesuper(fs, fs->lfs_sboffs[fs->lfs_activesb]); + lfs_writesuper(fs, fs->lfs_sboffs[1 - fs->lfs_activesb]); +#ifndef LFS_NO_AUTO_SEGCLEAN + lfs_auto_segclean(fs); +#endif fs->lfs_activesb = 1 - fs->lfs_activesb; - lfs_writesuper(fs,fs->lfs_sboffs[fs->lfs_activesb]); - --fs->lfs_seglock; fs->lfs_lockpid = 0; wakeup(&fs->lfs_seglock); diff --git a/sys/ufs/lfs/lfs_syscalls.c b/sys/ufs/lfs/lfs_syscalls.c index 77e157bf1c1c..37b8e5b52c8d 100644 --- a/sys/ufs/lfs/lfs_syscalls.c +++ b/sys/ufs/lfs/lfs_syscalls.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_syscalls.c,v 1.79 2003/01/24 21:55:28 fvdl Exp $ */ +/* $NetBSD: lfs_syscalls.c,v 1.80 2003/02/17 23:48:20 perseant Exp $ */ /*- - * 
Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.79 2003/01/24 21:55:28 fvdl Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.80 2003/02/17 23:48:20 perseant Exp $"); #define LFS /* for prototypes in syscallargs.h */ @@ -107,6 +107,9 @@ int verbose_debug = 0; pid_t lfs_cleaner_pid = 0; +extern int lfs_subsys_pages; +extern struct simplelock lfs_subsys_lock; + /* * Definitions for the buffer free lists. */ @@ -578,7 +581,7 @@ lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt) s = splbio(); for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; bp = nbp) { nbp = bp->b_freelist.tqe_next; - if (bp->b_flags & B_CALL) { + if (LFS_IS_MALLOC_BUF(bp)) { if (bp->b_flags & B_BUSY) { /* not bloody likely */ bp->b_flags |= B_WANTED; tsleep(bp, PRIBIO+1, "markv", 0); @@ -878,15 +881,12 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval) syscallarg(fsid_t *) fsidp; syscallarg(u_long) segment; } */ *uap = v; - struct proc *p = l->l_proc; - CLEANERINFO *cip; - SEGUSE *sup; - struct buf *bp; - struct mount *mntp; struct lfs *fs; + struct mount *mntp; fsid_t fsid; int error; unsigned long segnum; + struct proc *p = l->l_proc; if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) return (error); @@ -899,39 +899,44 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval) fs = VFSTOUFS(mntp)->um_lfs; segnum = SCARG(uap, segment); - if (dtosn(fs, fs->lfs_curseg) == segnum) - return (EBUSY); - if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0) return (error); -#ifdef LFS_AGGRESSIVE_SEGLOCK + lfs_seglock(fs, SEGM_PROT); -#endif + error = lfs_do_segclean(fs, segnum); + lfs_segunlock(fs); + vfs_unbusy(mntp); + return error; +} + +/* + * Actually mark the segment clean. 
+ * Must be called with the segment lock held. + */ +int +lfs_do_segclean(struct lfs *fs, unsigned long segnum) +{ + struct buf *bp; + CLEANERINFO *cip; + SEGUSE *sup; + + if (dtosn(fs, fs->lfs_curseg) == segnum) { + return (EBUSY); + } + LFS_SEGENTRY(sup, fs, segnum, bp); if (sup->su_nbytes) { printf("lfs_segclean: not cleaning segment %lu: %d live bytes\n", segnum, sup->su_nbytes); brelse(bp); -#ifdef LFS_AGGRESSIVE_SEGLOCK - lfs_segunlock(fs); -#endif - vfs_unbusy(mntp); return (EBUSY); } if (sup->su_flags & SEGUSE_ACTIVE) { brelse(bp); -#ifdef LFS_AGGRESSIVE_SEGLOCK - lfs_segunlock(fs); -#endif - vfs_unbusy(mntp); return (EBUSY); } if (!(sup->su_flags & SEGUSE_DIRTY)) { brelse(bp); -#ifdef LFS_AGGRESSIVE_SEGLOCK - lfs_segunlock(fs); -#endif - vfs_unbusy(mntp); return (EALREADY); } @@ -948,7 +953,7 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval) if (fs->lfs_dmeta < 0) fs->lfs_dmeta = 0; sup->su_flags &= ~SEGUSE_DIRTY; - (void) LFS_BWRITE_LOG(bp); + LFS_WRITESEGENTRY(sup, fs, segnum, bp); LFS_CLEANERINFO(cip, fs, bp); ++cip->clean; @@ -958,10 +963,6 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval) cip->avail = fs->lfs_avail - fs->lfs_ravail; (void) LFS_BWRITE_LOG(bp); wakeup(&fs->lfs_avail); -#ifdef LFS_AGGRESSIVE_SEGLOCK - lfs_segunlock(fs); -#endif - vfs_unbusy(mntp); return (0); } @@ -1228,6 +1229,7 @@ lfs_fakebuf_iodone(struct buf *bp) if (!(obp->b_flags & (B_DELWRI | B_DONE))) obp->b_flags |= B_INVAL; + bp->b_saveaddr = (caddr_t)(VTOI(obp->b_vp)->i_lfs); brelse(obp); lfs_callback(bp); } @@ -1256,11 +1258,10 @@ lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uadd if (obp == NULL) panic("lfs_fakebuf: getblk failed"); -#ifndef ALLOW_VFLUSH_CORRUPTION - bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size); + bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN); error = copyin(uaddr, bp->b_data, size); if (error) { - lfs_freebuf(bp); + lfs_freebuf(fs, bp); return NULL; } bp->b_saveaddr = obp; @@ 
-1272,11 +1273,6 @@ lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uadd panic("lfs_fakebuf: gathered bp: %p, ino=%u, lbn=%d", bp, VTOI(vp)->i_number, lbn); #endif -#else - bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, 0); - bp->b_flags |= B_INVAL; - bp->b_saveaddr = uaddr; -#endif #if 0 bp->b_saveaddr = (caddr_t)fs; ++fs->lfs_iocount; diff --git a/sys/ufs/lfs/lfs_vfsops.c b/sys/ufs/lfs/lfs_vfsops.c index 67103ae54917..4f4ed179bb83 100644 --- a/sys/ufs/lfs/lfs_vfsops.c +++ b/sys/ufs/lfs/lfs_vfsops.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $ */ +/* $NetBSD: lfs_vfsops.c,v 1.91 2003/02/17 23:48:21 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.91 2003/02/17 23:48:21 perseant Exp $"); #if defined(_KERNEL_OPT) #include "opt_quota.h" @@ -84,6 +84,7 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $") #include #include #include +#include #include #include #include @@ -105,14 +106,32 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $") #include #include +#include +#include +#include +#include + #include #include -int lfs_mountfs(struct vnode *, struct mount *, struct proc *); +#ifdef LFS_UBC +#include +#include +static int lfs_gop_write(struct vnode *, struct vm_page **, int, int); +#endif + +static int lfs_mountfs(struct vnode *, struct mount *, struct proc *); extern const struct vnodeopv_desc lfs_vnodeop_opv_desc; extern const struct vnodeopv_desc lfs_specop_opv_desc; extern const struct vnodeopv_desc lfs_fifoop_opv_desc; +extern int lfs_subsys_pages; +extern int locked_queue_count; 
+extern long locked_queue_bytes; +extern struct simplelock lfs_subsys_lock; + +int lfs_writer_daemon = 0; +int lfs_do_flush = 0; const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = { &lfs_vnodeop_opv_desc, @@ -143,15 +162,95 @@ struct vfsops lfs_vfsops = { }; struct genfs_ops lfs_genfsops = { +#ifdef LFS_UBC + lfs_gop_size, + ufs_gop_alloc, + lfs_gop_write, +#else NULL, NULL, genfs_compat_gop_write, +#endif }; -struct pool lfs_inode_pool; +struct pool lfs_inode_pool, lfs_inoext_pool; -extern int locked_queue_count; -extern long locked_queue_bytes; +/* + * The writer daemon. UVM keeps track of how many dirty pages we are holding + * in lfs_subsys_pages; the daemon flushes the filesystem when this value + * crosses the (user-defined) threshold LFS_MAX_PAGES. + */ +static void +lfs_writerd(void *arg) +{ +#ifdef LFS_PD + struct mount *mp, *nmp; + struct lfs *fs; +#endif + + lfs_writer_daemon = curproc->p_pid; + + for (;;) { + tsleep(&lfs_writer_daemon, PVM, "lfswriter", 0); + +#ifdef LFS_PD + /* + * Look through the list of LFSs to see if any of them + * have requested pageouts. + */ + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; + mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS, + MFSNAMELEN) == 0) { + fs = ((struct ufsmount *)mp->mnt_data)->ufsmount_u.lfs; + if (fs->lfs_pdflush || + !TAILQ_EMPTY(&fs->lfs_pchainhd)) { + fs->lfs_pdflush = 0; + simple_unlock(&mountlist_slock); + lfs_flush_fs(fs, 0); + simple_lock(&mountlist_slock); + } + } + + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp); + } + simple_unlock(&mountlist_slock); +#endif /* LFS_PD */ + + /* + * If global state wants a flush, flush everything. 
+ */ + while (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS || + locked_queue_bytes > LFS_MAX_BYTES || + lfs_subsys_pages > LFS_MAX_PAGES) { + +#ifdef DEBUG_LFS_FLUSH + if (lfs_do_flush) + printf("daemon: lfs_do_flush\n"); + if (locked_queue_count > LFS_MAX_BUFS) + printf("daemon: lqc = %d, max %d\n", + locked_queue_count, LFS_MAX_BUFS); + if (locked_queue_bytes > LFS_MAX_BYTES) + printf("daemon: lqb = %ld, max %d\n", + locked_queue_bytes, LFS_MAX_BYTES); + if (lfs_subsys_pages > LFS_MAX_PAGES) + printf("daemon: lssp = %d, max %d\n", + lfs_subsys_pages, LFS_MAX_PAGES); +#endif /* DEBUG_LFS_FLUSH */ + lfs_flush(NULL, 0); + lfs_do_flush = 0; + } + wakeup(&lfs_subsys_pages); + } + /* NOTREACHED */ +} /* * Initialize the filesystem, most work done by ufs_init. @@ -166,9 +265,12 @@ lfs_init() */ pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0, "lfsinopl", &pool_allocator_nointr); + pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0, + "lfsinoextpl", &pool_allocator_nointr); #ifdef DEBUG memset(lfs_log, 0, sizeof(lfs_log)); #endif + simple_lock_init(&lfs_subsys_lock); } void @@ -452,11 +554,11 @@ update_meta(struct lfs *fs, ino_t ino, int version, daddr_t lbn, } #endif sup->su_nbytes -= size; - LFS_BWRITE_LOG(bp); + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, odaddr)), bp); } LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); sup->su_nbytes += size; - LFS_BWRITE_LOG(bp); + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ndaddr), bp); /* Fix this so it can be released */ /* ip->i_lfs_effnblks = ip->i_ffs_blocks; */ @@ -544,12 +646,16 @@ update_inoblk(struct lfs *fs, daddr_t offset, struct ucred *cred, LFS_SEGENTRY(sup, fs, dtosn(fs, daddr), ibp); sup->su_nbytes -= DINODE_SIZE; - LFS_BWRITE_LOG(ibp); + LFS_WRITESEGENTRY(sup, fs, + dtosn(fs, daddr), + ibp); } LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)), ibp); sup->su_nbytes += DINODE_SIZE; - LFS_BWRITE_LOG(ibp); + LFS_WRITESEGENTRY(sup, fs, + dtosn(fs, dbtofsb(fs, dbp->b_blkno)), + ibp); } } 
} @@ -969,7 +1075,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) fs->lfs_dirops = 0; fs->lfs_nadirop = 0; fs->lfs_seglock = 0; - lockinit(&fs->lfs_freelock, PINOD, "lfs_freelock", 0, 0); + fs->lfs_pdflush = 0; lockinit(&fs->lfs_fraglock, PINOD, "lfs_fraglock", 0, 0); /* Set the file system readonly/modify bits. */ @@ -985,6 +1091,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) mp->mnt_stat.f_iosize = fs->lfs_bsize; mp->mnt_maxsymlinklen = fs->lfs_maxsymlinklen; mp->mnt_flag |= MNT_LOCAL; + mp->mnt_fs_bshift = fs->lfs_bshift; ump->um_flags = 0; ump->um_mountp = mp; ump->um_dev = dev; @@ -997,6 +1104,16 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) ump->um_quotas[i] = NULLVP; devvp->v_specmountpoint = mp; + /* Set up reserved memory for pageout */ + lfs_setup_resblks(fs); + /* Set up vdirop tailq */ + TAILQ_INIT(&fs->lfs_dchainhd); + /* and paging tailq */ + TAILQ_INIT(&fs->lfs_pchainhd); +#if 0 /* XXXDEBUG */ + fs->lfs_lastwrit = dbtofsb(fs, fs->lfs_offset - 1); +#endif + /* * We use the ifile vnode for almost every operation. Instead of * retrieving it from the hash table each time we retrieve it here, @@ -1012,6 +1129,32 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) fs->lfs_ivnode = vp; VREF(vp); + /* Set up segment usage flags for the autocleaner. 
*/ + fs->lfs_suflags = (u_int32_t **)malloc(2 * sizeof(u_int32_t *), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[0] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + fs->lfs_suflags[1] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t), + M_SEGMENT, M_WAITOK); + memset(fs->lfs_suflags[1], 0, fs->lfs_nseg * sizeof(u_int32_t)); + for (i = 0; i < fs->lfs_nseg; i++) { + LFS_SEGENTRY(sup, fs, i, bp); + if (!ronly && sup->su_nbytes == 0 && + !(sup->su_flags & SEGUSE_EMPTY)) { + sup->su_flags |= SEGUSE_EMPTY; + fs->lfs_suflags[0][i] = sup->su_flags; + LFS_WRITESEGENTRY(sup, fs, i, bp); + } else if (!ronly && !(sup->su_nbytes == 0) && + (sup->su_flags & SEGUSE_EMPTY)) { + sup->su_flags &= ~SEGUSE_EMPTY; + fs->lfs_suflags[0][i] = sup->su_flags; + LFS_WRITESEGENTRY(sup, fs, i, bp); + } else { + fs->lfs_suflags[0][i] = sup->su_flags; + brelse(bp); + } + } + /* * Roll forward. * @@ -1045,7 +1188,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) if (!(sup->su_flags & SEGUSE_DIRTY)) --fs->lfs_nclean; sup->su_flags |= SEGUSE_DIRTY; - (void) LFS_BWRITE_LOG(bp); + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, offset), bp); while ((offset = check_segsum(fs, offset, cred, CHECK_CKSUM, &flags, p)) > 0) { @@ -1055,7 +1198,8 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) if (!(sup->su_flags & SEGUSE_DIRTY)) --fs->lfs_nclean; sup->su_flags |= SEGUSE_DIRTY; - (void) LFS_BWRITE_LOG(bp); + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, oldoffset), + bp); } #ifdef DEBUG_LFS_RFW @@ -1149,7 +1293,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p) */ LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE; - (void) LFS_BWRITE_LOG(bp); /* Ifile */ + LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); /* Ifile */ /* Now that roll-forward is done, unlock the Ifile */ vput(vp); @@ -1180,6 +1324,12 @@ out: free(ump, M_UFSMNT); mp->mnt_data = NULL; } + + /* Start the 
pagedaemon-anticipating daemon */ + if (lfs_writer_daemon == 0 && + kthread_create1(lfs_writerd, NULL, NULL, "lfs_writer") != 0) + panic("fork lfs_writer"); + return (error); } @@ -1259,12 +1409,18 @@ lfs_unmount(struct mount *mp, int mntflags, struct proc *p) ronly ? FREAD : FREAD|FWRITE, NOCRED, p); vput(ump->um_devvp); - /* XXX KS - wake up the cleaner so it can die */ + /* wake up the cleaner so it can die */ wakeup(&fs->lfs_nextseg); wakeup(&lfs_allclean_wakeup); + /* Free per-mount data structures */ + free(fs->lfs_suflags[0], M_SEGMENT); + free(fs->lfs_suflags[1], M_SEGMENT); + free(fs->lfs_suflags, M_SEGMENT); + lfs_free_resblks(fs); free(fs, M_UFSMNT); free(ump, M_UFSMNT); + mp->mnt_data = NULL; mp->mnt_flag &= ~MNT_LOCAL; return (error); @@ -1586,11 +1742,251 @@ lfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, si if (lfs_dostats == 0) memset(&lfs_stats,0,sizeof(lfs_stats)); return 0; - case LFS_STATS: - return (sysctl_rdstruct(oldp, oldlenp, newp, - &lfs_stats, sizeof(lfs_stats))); default: return (EOPNOTSUPP); } /* NOTREACHED */ } + +#ifdef LFS_UBC +/* + * lfs_gop_write functions exactly like genfs_gop_write, except that + * (1) it requires the seglock to be held by its caller, and sp->fip + * to be properly initialized (it will return without re-initializing + * sp->fip, and without calling lfs_writeseg). + * (2) it uses the remaining space in the segment, rather than VOP_BMAP, + * to determine how large a block it can write at once (though it does + * still use VOP_BMAP to find holes in the file); + * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks + * (leaving lfs_writeseg to deal with the cluster blocks, so we might + * now have clusters of clusters, ick.) 
+ */ +static int +lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags) +{ + int i, s, error, run; + int fs_bshift, dev_bshift; + vaddr_t kva; + off_t eof, offset, startoffset; + size_t bytes, iobytes, skipbytes; + daddr_t lbn, blkno; + struct vm_page *pg; + struct buf *mbp, *bp; + struct vnode *devvp; + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + struct segment *sp = fs->lfs_sp; + UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist); + + /* The Ifile lives in the buffer cache */ + if (vp == fs->lfs_ivnode) + return genfs_compat_gop_write(vp, pgs, npages, flags); + + /* + * Sometimes things slip past the filters in lfs_putpages, + * and the pagedaemon tries to write pages---problem is + * that the pagedaemon never acquires the segment lock. + * + * Unbusy and unclean the pages, and put them on the ACTIVE + * queue under the hypothesis that they couldn't have got here + * unless they were modified *quite* recently. + * + * XXXUBC that last statement is an oversimplification of course. 
+ */ + if (!(fs->lfs_seglock) || fs->lfs_lockpid != curproc->p_pid) { + simple_lock(&vp->v_interlock); +#ifdef DEBUG + printf("lfs_gop_write: seglock not held\n"); +#endif + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i]->flags & PG_WANTED) + wakeup(pgs[i]); + if (pgs[i]->flags & PG_PAGEOUT) + uvmexp.paging--; + pgs[i]->flags &= ~(PG_BUSY|PG_CLEAN|PG_WANTED|PG_DELWRI|PG_PAGEOUT|PG_RELEASED); + UVM_PAGE_OWN(pg, NULL); + uvm_pageactivate(pgs[i]); + } + uvm_page_unbusy(pgs, npages); + uvm_unlock_pageq(); + simple_unlock(&vp->v_interlock); + return EAGAIN; + } + + UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x", + vp, pgs, npages, flags); + + GOP_SIZE(vp, vp->v_size, &eof, GOP_SIZE_WRITE); + + if (vp->v_type == VREG) { + fs_bshift = vp->v_mount->mnt_fs_bshift; + dev_bshift = vp->v_mount->mnt_dev_bshift; + } else { + fs_bshift = DEV_BSHIFT; + dev_bshift = DEV_BSHIFT; + } + error = 0; + pg = pgs[0]; + startoffset = pg->offset; + bytes = MIN(npages << PAGE_SHIFT, eof - startoffset); + skipbytes = 0; + + KASSERT(bytes != 0); + + /* Swap PG_DELWRI for PG_PAGEOUT */ + for (i = 0; i < npages; i++) + if (pgs[i]->flags & PG_DELWRI) { + KASSERT(!(pgs[i]->flags & PG_PAGEOUT)); + pgs[i]->flags &= ~PG_DELWRI; + pgs[i]->flags |= PG_PAGEOUT; + uvmexp.paging++; + } + + /* + * Check to make sure we're starting on a block boundary. + * We'll check later to make sure we always write entire + * blocks (or fragments). 
+ */ + if (startoffset & fs->lfs_bmask) + printf("%" PRId64 " & %" PRId64 " = %" PRId64 "\n", + startoffset, fs->lfs_bmask, + startoffset & fs->lfs_bmask); + KASSERT((startoffset & fs->lfs_bmask) == 0); + if (bytes & fs->lfs_ffmask) { + printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes); + panic("lfs_gop_write: non-integer blocks"); + } + + kva = uvm_pagermapin(pgs, npages, + UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK); + + s = splbio(); + simple_lock(&global_v_numoutput_slock); + vp->v_numoutput += 2; /* one for biodone, one for aiodone */ + simple_unlock(&global_v_numoutput_slock); + mbp = pool_get(&bufpool, PR_WAITOK); + splx(s); + + memset(mbp, 0, sizeof(*bp)); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_WRITE|B_AGE|B_CALL; + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + LIST_INIT(&mbp->b_dep); + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + lbn = offset >> fs_bshift; + error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run); + if (error) { + UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0); + skipbytes += bytes; + bytes = 0; + break; + } + + iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, + bytes); + if (blkno == (daddr_t)-1) { + skipbytes += iobytes; + continue; + } + + /* + * Discover how much we can really pack into this buffer. + */ +#ifdef LFS_UBC_BIGBUFS + /* If no room in the current segment, finish it up */ + if (sp->sum_bytes_left < sizeof(int32_t) || + sp->seg_bytes_left < MIN(iobytes, (1 << fs->lfs_bshift))) { + int version; + + lfs_updatemeta(sp); + + version = sp->fip->fi_version; + (void) lfs_writeseg(fs, sp); + + sp->fip->fi_version = version; + sp->fip->fi_ino = ip->i_number; + /* Add the current file to the segment summary. 
*/ + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->sum_bytes_left -= FINFOSIZE; + } + iobytes = MIN(iobytes, ((sp->seg_bytes_left >> fs_bshift) << fs_bshift)); +#else + iobytes = MIN(iobytes, (1 << fs_bshift)); + if (iobytes != blksize(fs, ip, lblkno(fs, offset))) { + printf("iobytes = %" PRId64 ", blk = %" PRId64 "\n", + (int64_t)iobytes, + (int64_t)blksize(fs, ip, lblkno(fs, offset))); + } + KASSERT(iobytes == blksize(fs, ip, lblkno(fs, offset))); +#endif + KASSERT(iobytes > 0); + + /* if it's really one i/o, don't make a second buf */ + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + /* printf("bp is mbp\n"); */ + /* correct overcount if there is no second buffer */ + s = splbio(); + simple_lock(&global_v_numoutput_slock); + --vp->v_numoutput; + simple_unlock(&global_v_numoutput_slock); + splx(s); + } else { + /* printf("bp is not mbp\n"); */ + s = splbio(); + bp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + memset(bp, 0, sizeof(*bp)); + splx(s); + bp->b_data = (char *)kva + + (vaddr_t)(offset - pg->offset); + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_WRITE|B_CALL; + bp->b_iodone = uvm_aio_biodone1; + LIST_INIT(&bp->b_dep); + } + + /* XXX This is silly ... is this necessary? 
*/ + bp->b_vp = NULL; + s = splbio(); + bgetvp(vp, bp); + splx(s); + + bp->b_lblkno = lblkno(fs, offset); + bp->b_private = mbp; + if (devvp->v_type == VBLK) { + bp->b_dev = devvp->v_rdev; + } + VOP_BWRITE(bp); + while(lfs_gatherblock(sp, bp, NULL)) + ; + } + + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0); + s = splbio(); + if (error) { + mbp->b_flags |= B_ERROR; + mbp->b_error = error; + } + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0); + return (0); +} +#endif /* LFS_UBC */ diff --git a/sys/ufs/lfs/lfs_vnops.c b/sys/ufs/lfs/lfs_vnops.c index 7493d8606eb4..6eb70d2a9edf 100644 --- a/sys/ufs/lfs/lfs_vnops.c +++ b/sys/ufs/lfs/lfs_vnops.c @@ -1,7 +1,7 @@ -/* $NetBSD: lfs_vnops.c,v 1.83 2003/02/03 00:32:35 perseant Exp $ */ +/* $NetBSD: lfs_vnops.c,v 1.84 2003/02/17 23:48:22 perseant Exp $ */ /*- - * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc. + * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.83 2003/02/03 00:32:35 perseant Exp $"); +__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.84 2003/02/17 23:48:22 perseant Exp $"); #include #include @@ -97,9 +97,19 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.83 2003/02/03 00:32:35 perseant Exp #include #include +#include +#ifdef LFS_UBC +# include +# include +# include +#endif + #include #include +extern int lfs_writer_daemon; +extern int lfs_subsys_pages; + /* Global vfs data structures for lfs. 
*/ int (**lfs_vnodeop_p)(void *); const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = { @@ -121,7 +131,11 @@ const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = { { &vop_poll_desc, ufs_poll }, /* poll */ { &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */ { &vop_revoke_desc, ufs_revoke }, /* revoke */ +#ifdef LFS_UBC + { &vop_mmap_desc, lfs_mmap }, /* mmap */ +#else { &vop_mmap_desc, ufs_mmap }, /* mmap */ +#endif { &vop_fsync_desc, lfs_fsync }, /* fsync */ { &vop_seek_desc, ufs_seek }, /* seek */ { &vop_remove_desc, lfs_remove }, /* remove */ @@ -150,7 +164,11 @@ const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = { { &vop_truncate_desc, lfs_truncate }, /* truncate */ { &vop_update_desc, lfs_update }, /* update */ { &vop_bwrite_desc, lfs_bwrite }, /* bwrite */ +#ifdef LFS_UBC + { &vop_getpages_desc, genfs_getpages }, /* getpages */ +#else { &vop_getpages_desc, lfs_getpages }, /* getpages */ +#endif { &vop_putpages_desc, lfs_putpages }, /* putpages */ { NULL, NULL } }; @@ -293,37 +311,46 @@ lfs_fsync(void *v) struct proc *a_p; } */ *ap = v; struct vnode *vp = ap->a_vp; - int error; - - /* Ignore the trickle syncer */ - if (ap->a_flags & FSYNC_LAZY) + int error, wait; + + /* + * Trickle sync checks for need to do a checkpoint after possible + * activity from the pagedaemon. + */ + if (ap->a_flags & FSYNC_LAZY) { + wakeup(&lfs_writer_daemon); return 0; - - simple_lock(&vp->v_interlock); - error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), - round_page(ap->a_offhi), PGO_CLEANIT | PGO_SYNCIO); - if (error) - return error; - error = VOP_UPDATE(vp, NULL, NULL, - (ap->a_flags & FSYNC_WAIT) != 0 ? UPDATE_WAIT : 0); -#ifdef DEBUG - /* - * If we were called from vinvalbuf and lfs_update - * didn't flush all our buffers, we're in trouble. 
- */ - if ((ap->a_flags & FSYNC_WAIT) && LIST_FIRST(&vp->v_dirtyblkhd) != NULL) { - struct buf *bp; - - bp = LIST_FIRST(&vp->v_dirtyblkhd); - printf("lfs_fsync: ino %d failed to sync", VTOI(vp)->i_number); - printf("lfs_fsync: iocount = %d\n", VTOI(vp)->i_lfs->lfs_iocount); - printf("lfs_fsync: flags are 0x%x, numoutput=%d\n", - VTOI(vp)->i_flag, vp->v_numoutput); - printf("lfs_fsync: writecount=%ld\n", vp->v_writecount); - printf("lfs_fsync: first bp: %p, flags=0x%lx, lbn=%" PRId64 "\n", - bp, bp->b_flags, bp->b_lblkno); } + + wait = (ap->a_flags & FSYNC_WAIT); + do { +#ifdef DEBUG + struct buf *bp; #endif + + simple_lock(&vp->v_interlock); + error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo), + round_page(ap->a_offhi), + PGO_CLEANIT | (wait ? PGO_SYNCIO : 0)); + if (error) + return error; + error = VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0); + if (wait && error == 0 && !VPISEMPTY(vp)) { +#ifdef DEBUG + printf("lfs_fsync: reflushing ino %d\n", + VTOI(vp)->i_number); + printf("vflags %x iflags %x npages %d\n", + vp->v_flag, VTOI(vp)->i_flag, + vp->v_uobj.uo_npages); + LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) + printf("%" PRId64 " (%lx)", bp->b_lblkno, + bp->b_flags); + printf("\n"); +#endif + VTOI(vp)->i_flag |= IN_MODIFIED; + } + } while (wait && error == 0 && !VPISEMPTY(vp)); + return error; } @@ -361,6 +388,7 @@ lfs_inactive(void *v) #define SET_DIROP2(vp, vp2) lfs_set_dirop((vp), (vp2)) static int lfs_set_dirop(struct vnode *, struct vnode *); extern int lfs_dirvcount; +extern int lfs_do_flush; #define NRESERVE(fs) (btofsb(fs, (NIADDR + 3 + (2 * NIADDR + 3)) << fs->lfs_bshift)) @@ -383,17 +411,15 @@ lfs_set_dirop(struct vnode *vp, struct vnode *vp2) if (fs->lfs_dirops == 0) lfs_check(vp, LFS_UNUSED_LBN, 0); - while (fs->lfs_writer || lfs_dirvcount > LFS_MAXDIROP) { + while (fs->lfs_writer || lfs_dirvcount > LFS_MAX_DIROP) { if (fs->lfs_writer) tsleep(&fs->lfs_dirops, PRIBIO + 1, "lfs_sdirop", 0); - if (lfs_dirvcount > LFS_MAXDIROP && fs->lfs_dirops 
== 0) { - ++fs->lfs_writer; - lfs_flush(fs, 0); - if (--fs->lfs_writer == 0) - wakeup(&fs->lfs_dirops); + if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) { + wakeup(&lfs_writer_daemon); + preempt(NULL); } - if (lfs_dirvcount > LFS_MAXDIROP) { + if (lfs_dirvcount > LFS_MAX_DIROP) { #ifdef DEBUG_LFS printf("lfs_set_dirop: sleeping with dirops=%d, " "dirvcount=%d\n", fs->lfs_dirops, @@ -438,15 +464,19 @@ unreserve: } #define MARK_VNODE(dvp) do { \ + struct inode *_ip = VTOI(dvp); \ + struct lfs *_fs = _ip->i_lfs; \ + \ if (!((dvp)->v_flag & VDIROP)) { \ (void)lfs_vref(dvp); \ ++lfs_dirvcount; \ + TAILQ_INSERT_TAIL(&_fs->lfs_dchainhd, _ip, i_lfs_dchain); \ } \ (dvp)->v_flag |= VDIROP; \ - if (!(VTOI(dvp)->i_flag & IN_ADIROP)) { \ - ++VTOI(dvp)->i_lfs->lfs_nadirop; \ + if (!(_ip->i_flag & IN_ADIROP)) { \ + ++_fs->lfs_nadirop; \ } \ - VTOI(dvp)->i_flag |= IN_ADIROP; \ + _ip->i_flag |= IN_ADIROP; \ } while (0) #define UNMARK_VNODE(vp) lfs_unmark_vnode(vp) @@ -656,22 +686,24 @@ lfs_rmdir(void *v) struct vnode *a_vp; struct componentname *a_cnp; } */ *ap = v; + struct vnode *vp; int error; + vp = ap->a_vp; if ((error = SET_DIROP2(ap->a_dvp, ap->a_vp)) != 0) { vrele(ap->a_dvp); if (ap->a_vp != ap->a_dvp) VOP_UNLOCK(ap->a_dvp, 0); - vput(ap->a_vp); + vput(vp); return error; } MARK_VNODE(ap->a_dvp); - MARK_VNODE(ap->a_vp); + MARK_VNODE(vp); error = ufs_rmdir(ap); UNMARK_VNODE(ap->a_dvp); - UNMARK_VNODE(ap->a_vp); + UNMARK_VNODE(vp); - SET_ENDOP2(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, ap->a_vp, "rmdir"); + SET_ENDOP2(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vp, "rmdir"); return (error); } @@ -844,7 +876,7 @@ lfs_getattr(void *v) vap->va_blocksize = MAXBSIZE; else vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; - vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_ffs_blocks); + vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks); vap->va_type = vp->v_type; vap->va_filerev = ip->i_modrev; return (0); @@ -964,18 +996,22 @@ lfs_reclaim(void *v) struct proc *a_p; } */ *ap = v; 
struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); int error; - KASSERT(VTOI(vp)->i_ffs_nlink == VTOI(vp)->i_ffs_effnlink); + KASSERT(ip->i_ffs_nlink == ip->i_ffs_effnlink); - LFS_CLR_UINO(VTOI(vp), IN_ALLMOD); + LFS_CLR_UINO(ip, IN_ALLMOD); if ((error = ufs_reclaim(vp, ap->a_p))) return (error); + pool_put(&lfs_inoext_pool, ip->inode_ext.lfs); + ip->inode_ext.lfs = NULL; pool_put(&lfs_inode_pool, vp->v_data); vp->v_data = NULL; return (0); } +#ifndef LFS_UBC int lfs_getpages(void *v) { @@ -1004,3 +1040,645 @@ lfs_putpages(void *v) error = genfs_putpages(v); return error; } + +#else /* LFS_UBC */ + +/* + * Make sure that for all pages in every block in the given range, + * either all are dirty or all are clean. If any of the pages + * we've seen so far are dirty, put the vnode on the paging chain, + * and mark it IN_PAGING. + */ +static int +check_dirty(struct lfs *fs, struct vnode *vp, + off_t startoffset, off_t endoffset, off_t blkeof, + int flags) +{ + int by_list; + struct vm_page *curpg, *pgs[MAXBSIZE / PAGE_SIZE], *pg; + struct lwp *l = curlwp ? curlwp : &lwp0; + off_t soff; + voff_t off; + int i, dirty, tdirty, nonexistent, any_dirty; + int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT; + + top: + by_list = (vp->v_uobj.uo_npages <= + ((endoffset - startoffset) >> PAGE_SHIFT) * + UVM_PAGE_HASH_PENALTY); + any_dirty = 0; + + if (by_list) { + curpg = TAILQ_FIRST(&vp->v_uobj.memq); + PHOLD(l); + } else { + soff = startoffset; + } + while (by_list || soff < MIN(blkeof, endoffset)) { + if (by_list) { + if (pages_per_block > 1) { + while (curpg && (curpg->offset & fs->lfs_bmask)) + curpg = TAILQ_NEXT(curpg, listq); + } + if (curpg == NULL) + break; + soff = curpg->offset; + } + + /* + * Mark all pages in extended range busy; find out if any + * of them are dirty. 
+ */ + nonexistent = dirty = 0; + for (i = 0; i == 0 || i < pages_per_block; i++) { + if (by_list && pages_per_block <= 1) { + pgs[i] = pg = curpg; + } else { + off = soff + (i << PAGE_SHIFT); + pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off); + if (pg == NULL) { + ++nonexistent; + continue; + } + } + KASSERT(pg != NULL); + while (pg->flags & PG_BUSY) { + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0, + "lfsput", 0); + simple_lock(&vp->v_interlock); + if (by_list) + goto top; + } + pg->flags |= PG_BUSY; + UVM_PAGE_OWN(pg, "lfs_putpages"); + + pmap_page_protect(pg, VM_PROT_NONE); + tdirty = (pmap_clear_modify(pg) || + (pg->flags & PG_CLEAN) == 0); + dirty += tdirty; + } + if (pages_per_block > 0 && nonexistent >= pages_per_block) { + if (by_list) { + curpg = TAILQ_NEXT(curpg, listq); + } else { + soff += fs->lfs_bsize; + } + continue; + } + + any_dirty += dirty; + KASSERT(nonexistent == 0); + + /* + * If any are dirty make all dirty; unbusy them, + * but if we were asked to clean, take them off + * of their queue so the pagedaemon doesn't bother + * us about them while they're on their way to disk. + * + * (XXXUBC the page is now on *no* page queue.) 
+ */ + for (i = 0; i == 0 || i < pages_per_block; i++) { + pg = pgs[i]; + KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI))); + if (dirty) { + pg->flags &= ~PG_CLEAN; + if (flags & PGO_FREE) { + /* XXXUBC need better way to update */ + lfs_subsys_pages += MIN(1, pages_per_block); + uvm_lock_pageq(); + UVM_PAGE_OWN(pg, NULL); + uvm_pagedequeue(pg); + /* Suspended write flag */ + pg->flags |= PG_DELWRI; + uvm_unlock_pageq(); + } + } else { + UVM_PAGE_OWN(pg, NULL); + } + if (pg->flags & PG_WANTED) + wakeup(pg); + pg->flags &= ~(PG_WANTED|PG_BUSY); + /* UVM_PAGE_OWN(pg, NULL); */ + } + + if (by_list) { + curpg = TAILQ_NEXT(curpg, listq); + } else { + soff += MAX(PAGE_SIZE, fs->lfs_bsize); + } + } + if (by_list) { + PRELE(l); + } + + /* + * If any pages were dirty, mark this inode as "pageout requested", + * and put it on the paging queue. + * XXXUBC locking (check locking on dchainhd too) + */ +#ifdef notyet + if (any_dirty) { + if (!(ip->i_flags & IN_PAGING)) { + ip->i_flags |= IN_PAGING; + TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain); + } + } +#endif + return any_dirty; +} + +/* + * lfs_putpages functions like genfs_putpages except that + * + * (1) It needs to bounds-check the incoming requests to ensure that + * they are block-aligned; if they are not, expand the range and + * do the right thing in case, e.g., the requested range is clean + * but the expanded range is dirty. + * (2) It needs to explicitly send blocks to be written when it is done. + * VOP_PUTPAGES is not ever called with the seglock held, so + * we simply take the seglock and let lfs_segunlock wait for us. + * XXX Actually we can be called with the seglock held, if we have + * XXX to flush a vnode while lfs_markv is in operation. As of this + * XXX writing we panic in this case. + * + * Assumptions: + * + * (1) The caller does not hold any pages in this vnode busy. If it does, + * there is a danger that when we expand the page range and busy the + * pages we will deadlock. 
+ * (2) We are called with vp->v_interlock held; we must return with it + * released. + * (3) We don't absolutely have to free pages right away, provided that + * the request does not have PGO_SYNCIO. When the pagedaemon gives + * us a request with PGO_FREE, we take the pages out of the paging + * queue and wake up the writer, which will handle freeing them for us. + * + * We ensure that for any filesystem block, all pages for that + * block are either resident or not, even if those pages are higher + * than EOF; that means that we will be getting requests to free + * "unused" pages above EOF all the time, and should ignore them. + */ + +int +lfs_putpages(void *v) +{ + int error; + struct vop_putpages_args /* { + struct vnode *a_vp; + voff_t a_offlo; + voff_t a_offhi; + int a_flags; + } */ *ap = v; + struct vnode *vp; + struct inode *ip; + struct lfs *fs; + struct segment *sp; + off_t origoffset, startoffset, endoffset, origendoffset, blkeof; + off_t max_endoffset; + int pages_per_block; + int s, sync, dirty, pagedaemon; + UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist); + + vp = ap->a_vp; + ip = VTOI(vp); + fs = ip->i_lfs; + sync = (ap->a_flags & PGO_SYNCIO); + pagedaemon = (curproc == uvm.pagedaemon_proc); + + /* Putpages does nothing for metadata. */ + if (vp == fs->lfs_ivnode || vp->v_type != VREG) { + simple_unlock(&vp->v_interlock); + return 0; + } + + /* + * If there are no pages, don't do anything. + */ + if (vp->v_uobj.uo_npages == 0) { + s = splbio(); + if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL && + (vp->v_flag & VONWORKLST)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + simple_unlock(&vp->v_interlock); + return 0; + } + + blkeof = blkroundup(fs, ip->i_ffs_size); + + /* + * Ignore requests to free pages past EOF but in the same block + * as EOF, unless the request is synchronous. (XXX why sync?) + * XXXUBC Make these pages look "active" so the pagedaemon won't + * XXXUBC bother us with them again. 
+ */ + if (!sync && ap->a_offlo >= ip->i_ffs_size && ap->a_offlo < blkeof) { + origoffset = ap->a_offlo; + ap->a_offlo = blkeof; + if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) { + simple_unlock(&vp->v_interlock); + return 0; + } + } + + /* + * Extend page range to start and end at block boundaries. + * (For the purposes of VOP_PUTPAGES, fragments don't exist.) + */ + pages_per_block = fs->lfs_bsize >> PAGE_SHIFT; + origoffset = ap->a_offlo; + origendoffset = ap->a_offhi; + startoffset = origoffset & ~(fs->lfs_bmask); + max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift) + << fs->lfs_bshift; + + if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) { + endoffset = max_endoffset; + origendoffset = endoffset; + } else { + origendoffset = round_page(ap->a_offhi); + endoffset = round_page(blkroundup(fs, origendoffset)); + } + + KASSERT(startoffset > 0 || endoffset >= startoffset); + if (startoffset == endoffset) { + /* Nothing to do, why were we called? */ + simple_unlock(&vp->v_interlock); +#ifdef DEBUG + printf("lfs_putpages: startoffset = endoffset = %" PRId64 "\n", + startoffset); +#endif + return 0; + } + + ap->a_offlo = startoffset; + ap->a_offhi = endoffset; + + if (!(ap->a_flags & PGO_CLEANIT)) + return genfs_putpages(v); + + /* + * Make sure that all pages in any given block are dirty, or + * none of them are. Find out if any of the pages we've been + * asked about are dirty. If none are dirty, send them on + * through genfs_putpages(), albeit with adjusted offsets. + * XXXUBC I am assuming here that they can't be dirtied in + * XXXUBC the meantime, but I bet that's wrong. + */ + dirty = check_dirty(fs, vp, startoffset, endoffset, blkeof, ap->a_flags); + if (!dirty) + return genfs_putpages(v); + + /* + * Dirty and asked to clean. + * + * Pagedaemon can't actually write LFS pages; wake up + * the writer to take care of that. The writer will + * notice the pager inode queue and act on that. 
+ */ + if (pagedaemon) { + ++fs->lfs_pdflush; + wakeup(&lfs_writer_daemon); + return EWOULDBLOCK; + } + + /* + * If this is a file created in a recent dirop, we can't flush its + * inode until the dirop is complete. Drain dirops, then flush the + * filesystem (taking care of any other pending dirops while we're + * at it). + */ + if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT && + (vp->v_flag & VDIROP)) { + int locked; + + /* printf("putpages to clean VDIROP, flushing\n"); */ + while (fs->lfs_dirops > 0) { + ++fs->lfs_diropwait; + tsleep(&fs->lfs_writer, PRIBIO+1, "ppdirop", 0); + --fs->lfs_diropwait; + } + ++fs->lfs_writer; + locked = VOP_ISLOCKED(vp) && /* XXX */ + vp->v_lock.lk_lockholder == curproc->p_pid; + if (locked) + VOP_UNLOCK(vp, 0); + simple_unlock(&vp->v_interlock); + + lfs_flush_fs(fs, sync ? SEGM_SYNC : 0); + + simple_lock(&vp->v_interlock); + if (locked) + VOP_LOCK(vp, LK_EXCLUSIVE); + if (--fs->lfs_writer == 0) + wakeup(&fs->lfs_dirops); + + /* XXX the flush should have taken care of this one too! */ + } + + + /* + * This is it. We are going to write some pages. From here on + * down it's all just mechanics. + * + * If there are more than one page per block, we don't want to get + * caught locking them backwards; so set PGO_BUSYFAIL to avoid + * deadlocks. Also, don't let genfs_putpages wait; + * lfs_segunlock will wait for us, if need be. + */ + ap->a_flags &= ~PGO_SYNCIO; + if (pages_per_block > 1) + ap->a_flags |= PGO_BUSYFAIL; + + /* + * If we've already got the seglock, flush the node and return. + * The FIP has already been set up for us by lfs_writefile, + * and FIP cleanup and lfs_updatemeta will also be done there, + * unless genfs_putpages returns EDEADLK; then we must flush + * what we have, and correct FIP and segment header accounting. + */ + if (ap->a_flags & PGO_LOCKED) { + sp = fs->lfs_sp; + sp->vp = vp; + + /* + * XXXUBC + * There is some danger here that we might run out of + * buffers if we flush too much at once. 
If the number + * of dirty buffers is too great, we should cut the range + * down and write in chunks. + */ + while ((error = genfs_putpages(v)) == EDEADLK) { +#ifdef DEBUG_LFS + printf("lfs_putpages: genfs_putpages returned EDEADLK" + " ino %d off %x (seg %d)\n", + ip->i_number, fs->lfs_offset, + dtosn(fs, fs->lfs_offset)); +#endif + /* Write gathered pages */ + lfs_updatemeta(sp); + (void) lfs_writeseg(fs, sp); + + /* Reinitialize brand new FIP and add us to it */ + sp->vp = vp; + sp->fip->fi_version = ip->i_ffs_gen; + sp->fip->fi_ino = ip->i_number; + /* Add us to the new segment summary. */ + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->sum_bytes_left -= + sizeof(struct finfo) - sizeof(int32_t); + + /* Give the write a chance to complete */ + simple_unlock(&vp->v_interlock); + preempt(NULL); + simple_lock(&vp->v_interlock); + } + return error; + } + + /* + * Take the seglock, because we are going to be writing pages. + */ + if ((error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0))) != 0) + return error; + + /* + * VOP_PUTPAGES should not be called while holding the seglock. + * XXX fix lfs_markv, or do this properly. + */ + KASSERT(fs->lfs_seglock == 1); + + /* + * We assume we're being called with sp->fip pointing at blank space. + * Account for a new FIP in the segment header, and set sp->vp. + * (This should duplicate the setup at the top of lfs_writefile().) + */ + sp = fs->lfs_sp; + if (sp->seg_bytes_left < fs->lfs_bsize || + sp->sum_bytes_left < sizeof(struct finfo)) + (void) lfs_writeseg(fs, fs->lfs_sp); + + sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(int32_t); + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->vp = vp; + + if (vp->v_flag & VDIROP) + ((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT); + + sp->fip->fi_nblocks = 0; + sp->fip->fi_ino = ip->i_number; + sp->fip->fi_version = ip->i_ffs_gen; + + /* + * Loop through genfs_putpages until all pages are gathered. 
+ */ + /* + * There is some danger here that we might run out of + * buffers if we flush too much at once. If the number + * of dirty buffers is too great, then, cut the range down + * and write in chunks. + * + * XXXUBC this assumes a uniform dirtying of the pages + * XXXUBC across the address space + * XXXXXX do this + */ + while ((error = genfs_putpages(v)) == EDEADLK) { +#ifdef DEBUG_LFS + printf("lfs_putpages: genfs_putpages returned EDEADLK [2]" + " ino %d off %x (seg %d)\n", + ip->i_number, fs->lfs_offset, + dtosn(fs, fs->lfs_offset)); +#endif + /* Write gathered pages */ + lfs_updatemeta(sp); + (void) lfs_writeseg(fs, sp); + + /* + * Reinitialize brand new FIP and add us to it. + * (This should duplicate the fixup in lfs_gatherpages().) + */ + sp->vp = vp; + sp->fip->fi_version = ip->i_ffs_gen; + sp->fip->fi_ino = ip->i_number; + /* Add us to the new segment summary. */ + ++((SEGSUM *)(sp->segsum))->ss_nfinfo; + sp->sum_bytes_left -= + sizeof(struct finfo) - sizeof(int32_t); + + /* Give the write a chance to complete */ + simple_unlock(&vp->v_interlock); + preempt(NULL); + simple_lock(&vp->v_interlock); + } + + /* + * Blocks are now gathered into a segment waiting to be written. + * All that's left to do is update metadata, and write them. + */ + lfs_updatemeta(fs->lfs_sp); + fs->lfs_sp->vp = NULL; + lfs_writeseg(fs, fs->lfs_sp); + + /* + * Clean up FIP. + * (This should duplicate cleanup at the end of lfs_writefile().) + */ + if (sp->fip->fi_nblocks != 0) { + sp->fip = (FINFO*)((caddr_t)sp->fip + sizeof(struct finfo) + + sizeof(int32_t) * (sp->fip->fi_nblocks - 1)); + sp->start_lbp = &sp->fip->fi_blocks[0]; + } else { + sp->sum_bytes_left += sizeof(FINFO) - sizeof(int32_t); + --((SEGSUM *)(sp->segsum))->ss_nfinfo; + } + /* + * XXX - with the malloc/copy writeseg, the pages are freed by now + * even if we don't wait (e.g. if we hold a nested lock). This + * will not be true if we stop using malloc/copy. 
+ */ + KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT); + lfs_segunlock(fs); + + /* + * Wait for v_numoutput to drop to zero. The seglock should + * take care of this, but there is a slight possibility that + * aiodoned might not have got around to our buffers yet. + */ + if (sync) { + int s; + + s = splbio(); + simple_lock(&global_v_numoutput_slock); + while(vp->v_numoutput > 0) { +#ifdef DEBUG + printf("ino %d sleeping on num %d\n", + ip->i_number, vp->v_numoutput); +#endif + vp->v_flag |= VBWAIT; + simple_unlock(&global_v_numoutput_slock); + tsleep(&vp->v_numoutput, PRIBIO + 1, "lfs_vn", 0); + simple_lock(&global_v_numoutput_slock); + } + simple_unlock(&global_v_numoutput_slock); + splx(s); + } + return error; +} + +/* + * Find out whether the vnode has any blocks or pages waiting to be written. + * We used to just check LIST_EMPTY(&vp->v_dirtyblkhd), but there is not + * presently as simple a mechanism for the page cache. + */ +int +lfs_checkifempty(struct vnode *vp) +{ + struct vm_page *pg; + struct buf *bp; + int r, s; + + if (vp->v_type != VREG || VTOI(vp)->i_number == LFS_IFILE_INUM) + return LIST_EMPTY(&vp->v_dirtyblkhd); + + /* + * For vnodes with pages it is a little more complex. + * Pages that have been written (i.e. are "clean" for our purposes) + * might be in seemingly dirty buffers, so we have to troll + * looking for indirect block buffers as well as pages. + */ + simple_lock(&vp->v_interlock); + s = splbio(); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; + bp = LIST_NEXT(bp, b_vnbufs)) { + if (bp->b_lblkno < 0) { + splx(s); + return 0; + } + } + splx(s); + + /* + * Run through the page list to find dirty pages. + * Right now I just walk the memq. 
+ */ + pg = TAILQ_FIRST(&vp->v_uobj.memq); + r = 1; + while(pg) { + if ((pg->flags & PG_CLEAN) == 0 || pmap_is_modified(pg)) { + r = 0; + break; + } + pg = TAILQ_NEXT(pg, listq); + } +#if 0 + if (r != !(vp->v_flag & VONWORKLST)) { + printf("nope, VONWORKLST isn't good enough!\n"); + } +#endif + simple_unlock(&vp->v_interlock); + return r; +} + +/* + * Return the last logical file offset that should be written for this file + * if we're doing a write that ends at "size". If writing, we need to know + * about sizes on disk, i.e. fragments if there are any; if reading, we need + * to know about entire blocks. + */ +void +lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags) +{ + struct inode *ip = VTOI(vp); + struct lfs *fs = ip->i_lfs; + daddr_t olbn, nlbn; + + KASSERT(flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)); + KASSERT((flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)) + != (GOP_SIZE_READ | GOP_SIZE_WRITE)); + + olbn = lblkno(fs, ip->i_ffs_size); + nlbn = lblkno(fs, size); + if ((flags & GOP_SIZE_WRITE) && nlbn < NDADDR && olbn <= nlbn) { + *eobp = fragroundup(fs, size); + } else { + *eobp = blkroundup(fs, size); + } +} + +#ifdef DEBUG +void lfs_dump_vop(void *); + +void +lfs_dump_vop(void *v) +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + voff_t a_offlo; + voff_t a_offhi; + int a_flags; + } */ *ap = v; + + vfs_vnode_print(ap->a_vp, 0, printf); + lfs_dump_dinode(&VTOI(ap->a_vp)->i_din.ffs_din); +} +#endif + +int +lfs_mmap(void *v) +{ + struct vop_mmap_args /* { + const struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_fflags; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap = v; + + if (VTOI(ap->a_vp)->i_number == LFS_IFILE_INUM) + return EOPNOTSUPP; + return ufs_mmap(v); +} +#endif /* LFS_UBC */ diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index b7b8e4beba53..73c76798bec9 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -1,4 +1,4 @@ -/* $NetBSD: inode.h,v 1.32 2003/01/24 21:55:29 fvdl Exp $ */ +/* $NetBSD: 
inode.h,v 1.33 2003/02/17 23:48:23 perseant Exp $ */ /* * Copyright (c) 1982, 1989, 1993 @@ -58,11 +58,7 @@ struct ext2fs_inode_ext { daddr_t ext2fs_last_blk; /* last block allocated on disk */ }; -struct lfs_inode_ext { - off_t lfs_osize; /* size of file on disk */ - u_int32_t lfs_effnblocks; /* number of blocks when i/o completes */ - size_t lfs_fragsize[NDADDR]; /* size of on-disk direct blocks */ -}; +struct lfs_inode_ext; /* * The inode is used to describe each active (or recently active) file in the @@ -111,13 +107,10 @@ struct inode { union { /* Other extensions could go here... */ struct ext2fs_inode_ext e2fs; - struct lfs_inode_ext lfs; + struct lfs_inode_ext *lfs; } inode_ext; #define i_e2fs_last_lblk inode_ext.e2fs.ext2fs_last_lblk #define i_e2fs_last_blk inode_ext.e2fs.ext2fs_last_blk -#define i_lfs_effnblks inode_ext.lfs.lfs_effnblocks -#define i_lfs_fragsize inode_ext.lfs.lfs_fragsize -#define i_lfs_osize inode_ext.lfs.lfs_osize /* * The on-disk dinode itself. */ @@ -179,6 +172,7 @@ struct inode { #define IN_CLEANING 0x0100 /* LFS: file is being cleaned */ #define IN_ADIROP 0x0200 /* LFS: dirop in progress */ #define IN_SPACECOUNTED 0x0400 /* Blocks to be freed in free count. */ +#define IN_PAGING 0x1000 /* LFS: file is on paging queue */ #if defined(_KERNEL) /* diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index b284a1bb4229..8a09bdc95e84 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_extern.h,v 1.30 2003/01/24 21:55:30 fvdl Exp $ */ +/* $NetBSD: ufs_extern.h,v 1.31 2003/02/17 23:48:23 perseant Exp $ */ /*- * Copyright (c) 1991, 1993, 1994 @@ -167,6 +167,7 @@ void ufs_vinit __P((struct mount *, int (**) __P((void *)), int (**) __P((void *)), struct vnode **)); int ufs_makeinode __P((int, struct vnode *, struct vnode **, struct componentname *)); +int ufs_gop_alloc __P((struct vnode *, off_t, off_t, int, struct ucred *)); /* * Soft dependency function prototypes. 
diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c index d5c8fc3b2613..d2b00389b975 100644 --- a/sys/ufs/ufs/ufs_inode.c +++ b/sys/ufs/ufs/ufs_inode.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_inode.c,v 1.33 2002/01/26 08:32:05 chs Exp $ */ +/* $NetBSD: ufs_inode.c,v 1.34 2003/02/17 23:48:23 perseant Exp $ */ /* * Copyright (c) 1991, 1993 @@ -41,7 +41,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.33 2002/01/26 08:32:05 chs Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.34 2003/02/17 23:48:23 perseant Exp $"); #include "opt_quota.h" @@ -192,10 +192,10 @@ ufs_balloc_range(vp, off, len, cred, flags) vp, off, len, vp->v_size); oldeof = vp->v_size; - GOP_SIZE(vp, oldeof, &oldeob); + GOP_SIZE(vp, oldeof, &oldeob, GOP_SIZE_WRITE); neweof = MAX(vp->v_size, off + len); - GOP_SIZE(vp, neweof, &neweob); + GOP_SIZE(vp, neweof, &neweob, GOP_SIZE_WRITE); error = 0; uobj = &vp->v_uobj; diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index 6a2b4b8add74..caf1dc273e79 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_readwrite.c,v 1.47 2003/01/24 21:55:30 fvdl Exp $ */ +/* $NetBSD: ufs_readwrite.c,v 1.48 2003/02/17 23:48:23 perseant Exp $ */ /*- * Copyright (c) 1993 @@ -36,7 +36,7 @@ */ #include -__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.47 2003/01/24 21:55:30 fvdl Exp $"); +__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.48 2003/02/17 23:48:23 perseant Exp $"); #ifdef LFS_READWRITE #define BLKSIZE(a, b, c) blksize(a, b, c) @@ -110,9 +110,13 @@ READ(void *v) goto out; } -#ifndef LFS_READWRITE +#ifdef LFS_READWRITE +# ifdef LFS_UBC + usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM); +# endif +#else /* !LFS_READWRITE */ usepc = vp->v_type == VREG; -#endif +#endif /* !LFS_READWRITE */ if (usepc) { while (uio->uio_resid > 0) { bytelen = MIN(ip->i_ffs_size - uio->uio_offset, @@ -278,9 +282,14 @@ WRITE(void *v) bsize = fs->fs_bsize; error = 0; -#ifndef 
LFS_READWRITE +#ifdef LFS_READWRITE +# ifdef LFS_UBC + async = TRUE; usepc = vp->v_type == VREG; -#endif +# endif +#else /* !LFS_READWRITE */ + usepc = vp->v_type == VREG; +#endif /* !LFS_READWRITE */ if (!usepc) { goto bcache; } diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 61db4a63ec4d..cee935954a01 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_vnops.c,v 1.89 2002/12/31 15:00:18 yamt Exp $ */ +/* $NetBSD: ufs_vnops.c,v 1.90 2003/02/17 23:48:23 perseant Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 @@ -41,7 +41,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.89 2002/12/31 15:00:18 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.90 2003/02/17 23:48:23 perseant Exp $"); #include "opt_quota.h" #include "fs_lfs.h" @@ -73,6 +73,8 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.89 2002/12/31 15:00:18 yamt Exp $"); #include #include +#include + static int ufs_chmod(struct vnode *, int, struct ucred *, struct proc *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct proc *); @@ -2071,3 +2073,49 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, vput(tvp); return (error); } + +/* + * Allocate len bytes at offset off. + */ +int +ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, + struct ucred *cred) +{ + struct inode *ip = VTOI(vp); + int error, delta, bshift, bsize; + UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist); + + error = 0; + bshift = vp->v_mount->mnt_fs_bshift; + bsize = 1 << bshift; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = MIN(bsize, len); + + error = VOP_BALLOC(vp, off, bsize, cred, flags, NULL); + if (error) { + goto out; + } + + /* + * increase file size now, VOP_BALLOC() requires that + * EOF be up-to-date before each call. 
+ */ + + if (ip->i_ffs_size < off + bsize) { + UVMHIST_LOG(ubchist, "vp %p old 0x%x new 0x%x", + vp, ip->i_ffs_size, off + bsize, 0); + ip->i_ffs_size = off + bsize; + } + + off += bsize; + len -= bsize; + } + +out: + return error; +} diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c index 37813da079ab..6afeee50fc6c 100644 --- a/sys/uvm/uvm_page.c +++ b/sys/uvm/uvm_page.c @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_page.c,v 1.83 2003/02/01 06:23:55 thorpej Exp $ */ +/* $NetBSD: uvm_page.c,v 1.84 2003/02/17 23:48:24 perseant Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -71,7 +71,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.83 2003/02/01 06:23:55 thorpej Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.84 2003/02/17 23:48:24 perseant Exp $"); #include "opt_uvmhist.h" @@ -1254,7 +1254,7 @@ uvm_pagefree(pg) if (pg->flags & PG_WANTED) { wakeup(pg); } - pg->flags &= ~(PG_WANTED|PG_BUSY|PG_RELEASED); + pg->flags &= ~(PG_WANTED|PG_BUSY|PG_RELEASED|PG_PAGER1); #ifdef UVM_PAGE_TRKOWN pg->owner_tag = NULL; #endif diff --git a/sys/uvm/uvm_pager.h b/sys/uvm/uvm_pager.h index b5e81a0e1fec..ea9352c5a472 100644 --- a/sys/uvm/uvm_pager.h +++ b/sys/uvm/uvm_pager.h @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_pager.h,v 1.25 2002/03/25 02:08:10 chs Exp $ */ +/* $NetBSD: uvm_pager.h,v 1.26 2003/02/17 23:48:24 perseant Exp $ */ /* * @@ -124,6 +124,7 @@ struct uvm_pagerops { #define PGO_ALLPAGES 0x010 /* flush whole object/get all pages */ #define PGO_LOCKED 0x040 /* fault data structures are locked [get] */ +#define PGO_BUSYFAIL 0x080 /* fail if a page is busy [put] */ #define PGO_OVERWRITE 0x200 /* pages will be overwritten before unlocked */ #define PGO_PASTEOF 0x400 /* allow allocation of pages past EOF */ diff --git a/usr.sbin/dumplfs/dumplfs.c b/usr.sbin/dumplfs/dumplfs.c index c79e17622ef9..d071776f08b5 100644 --- a/usr.sbin/dumplfs/dumplfs.c +++ b/usr.sbin/dumplfs/dumplfs.c @@ -1,4 +1,4 @@ -/* $NetBSD: dumplfs.c,v 1.21 2003/01/28 07:44:54 mrg 
Exp $ */ +/* $NetBSD: dumplfs.c,v 1.22 2003/02/17 23:48:25 perseant Exp $ */ /*- * Copyright (c) 1991, 1993 @@ -45,7 +45,7 @@ __COPYRIGHT( #if 0 static char sccsid[] = "@(#)dumplfs.c 8.5 (Berkeley) 5/24/95"; #else -__RCSID("$NetBSD: dumplfs.c,v 1.21 2003/01/28 07:44:54 mrg Exp $"); +__RCSID("$NetBSD: dumplfs.c,v 1.22 2003/02/17 23:48:25 perseant Exp $"); #endif #endif /* not lint */ @@ -678,7 +678,7 @@ dump_super(struct lfs *lfsp) (void)printf(" Checkpoint Info\n"); (void)printf(" %s%-10d %s0x%-8x %s%-10d\n", - "free ", lfsp->lfs_free, + "freehd ", lfsp->lfs_freehd, "idaddr ", lfsp->lfs_idaddr, "ifile ", lfsp->lfs_ifile); (void)printf(" %s%-10d %s%-10d %s%-10d\n", diff --git a/usr.sbin/quotaon/quotaon.c b/usr.sbin/quotaon/quotaon.c index 576c7fece0e2..0626fb2dbfe6 100644 --- a/usr.sbin/quotaon/quotaon.c +++ b/usr.sbin/quotaon/quotaon.c @@ -1,4 +1,4 @@ -/* $NetBSD: quotaon.c,v 1.17 2002/07/20 08:40:20 grant Exp $ */ +/* $NetBSD: quotaon.c,v 1.18 2003/02/17 23:48:25 perseant Exp $ */ /* * Copyright (c) 1980, 1990, 1993 @@ -46,7 +46,7 @@ __COPYRIGHT("@(#) Copyright (c) 1980, 1990, 1993\n\ #if 0 static char sccsid[] = "@(#)quotaon.c 8.1 (Berkeley) 6/6/93"; #else -__RCSID("$NetBSD: quotaon.c,v 1.17 2002/07/20 08:40:20 grant Exp $"); +__RCSID("$NetBSD: quotaon.c,v 1.18 2003/02/17 23:48:25 perseant Exp $"); #endif #endif /* not lint */ @@ -128,7 +128,8 @@ main(argc, argv) } setfsent(); while ((fs = getfsent()) != NULL) { - if (strcmp(fs->fs_vfstype, "ffs") || + if ((strcmp(fs->fs_vfstype, "ffs") && + strcmp(fs->fs_vfstype, "lfs")) || strcmp(fs->fs_type, FSTAB_RW)) continue; if (aflag) {