Add code to UBCify LFS. This is still behind "#ifdef LFS_UBC" for now (there are still some details to work out), but expect that to go away soon.

To support these basic changes (creation of lfs_putpages and lfs_gop_write, and modifications to lfs_balloc), several other changes were made, to wit:

* Create a writer daemon kernel thread whose purpose is to handle page writes for the pagedaemon, but which also takes over some of the functions of lfs_check(). This thread is started the first time an LFS is mounted.
* Add a "flags" parameter to GOP_SIZE. Current values are GOP_SIZE_READ, meaning that the call should return the size of the in-core version of the file, and GOP_SIZE_WRITE, meaning that it should return the on-disk size. Exactly one of GOP_SIZE_READ or GOP_SIZE_WRITE must be specified.
* Instead of using malloc(...M_WAITOK) for everything, reserve enough resources to get by and use malloc(...M_NOWAIT), falling back on the reserves if necessary. Use the pool subsystem for structures small enough that this is feasible. This also obsoletes LFS_THROTTLE.

And a few that are not strictly necessary:

* Move the LFS inode extensions off onto a separately allocated structure; this gets us closer to LFS as an LKM. "Welcome to 1.6O."
* Unify GOP_ALLOC between FFS and LFS.
* Update LFS copyright headers to correct values.
* Actually cast to unsigned in lfs_shellsort, like the comment says.
* Keep track of which segments were empty before the previous checkpoint; any segments that pass two checkpoints both dirty and empty can be summarily cleaned. Do this. Right now lfs_segclean still works, but it should be turned into an effectless compatibility syscall.
2003-02-17 23:48:08 +00:00 · 2003-02-17 23:48:08 +00:00 · b397c875ae
commit b397c875ae
parent 0cfe2d1c69
34 changed files with 2401 additions and 783 deletions
--- a/libexec/lfs_cleanerd/print.c
+++ b/libexec/lfs_cleanerd/print.c
@ -1,4 +1,4 @@
-/*	$NetBSD: print.c,v 1.13 2003/01/24 21:55:05 fvdl Exp $	*/
+/*	$NetBSD: print.c,v 1.14 2003/02/17 23:48:08 perseant Exp $	*/

 /*-
 * Copyright (c) 1992, 1993
@ -38,7 +38,7 @@
 #if 0
 static char sccsid[] = "from: @(#)print.c	8.1 (Berkeley) 6/4/93";
 #else
-__RCSID("$NetBSD: print.c,v 1.13 2003/01/24 21:55:05 fvdl Exp $");
+__RCSID("$NetBSD: print.c,v 1.14 2003/02/17 23:48:08 perseant Exp $");
 #endif
 #endif /* not lint */

@ -261,7 +261,7 @@ dump_super(struct lfs *lfsp)

 	syslog(LOG_DEBUG, "Checkpoint Info\n");
 	syslog(LOG_DEBUG, "%s%d\t%s0x%X\t%s%d\n",
-		"free     ", lfsp->lfs_free,
+		"freehd   ", lfsp->lfs_freehd,
 		"idaddr   ", lfsp->lfs_idaddr,
 		"ifile    ", lfsp->lfs_ifile);
 	syslog(LOG_DEBUG, "%s%d\t%s%d\t%s%d\n",
--- a/sbin/fsck_lfs/inode.c
+++ b/sbin/fsck_lfs/inode.c
@ -1,4 +1,4 @@
-/* $NetBSD: inode.c,v 1.14 2003/01/24 21:55:10 fvdl Exp $	 */
+/* $NetBSD: inode.c,v 1.15 2003/02/17 23:48:09 perseant Exp $	 */

 /*
 * Copyright (c) 1997, 1998
@ -348,8 +348,8 @@ lfs_ginode(ino_t inumber)
 		if (reply("free")) {
 			ifp = lfs_ientry(inumber, &bp);
 			ifp->if_daddr = LFS_UNUSED_DADDR;
-			ifp->if_nextfree = sblock.lfs_free;
-			sblock.lfs_free = inumber;
+			ifp->if_nextfree = sblock.lfs_freehd;
+			sblock.lfs_freehd = inumber;
 			sbdirty();
 			dirty(bp);
 			bp->b_flags &= ~B_INUSE;
@ -700,8 +700,8 @@ clri(struct inodesc *idesc, char *type, int flag)

 		ifp = lfs_ientry(idesc->id_number, &bp);
 		ifp->if_daddr = LFS_UNUSED_DADDR;
-		ifp->if_nextfree = sblock.lfs_free;
-		sblock.lfs_free = idesc->id_number;
+		ifp->if_nextfree = sblock.lfs_freehd;
+		sblock.lfs_freehd = idesc->id_number;
 		sbdirty();
 		dirty(bp);
 		bp->b_flags &= ~B_INUSE;
--- a/sbin/fsck_lfs/pass0.c
+++ b/sbin/fsck_lfs/pass0.c
@ -1,4 +1,4 @@
-/* $NetBSD: pass0.c,v 1.12 2003/01/24 21:55:10 fvdl Exp $	 */
+/* $NetBSD: pass0.c,v 1.13 2003/02/17 23:48:09 perseant Exp $	 */

 /*
 * Copyright (c) 1998 Konrad E. Schroder.
@ -86,7 +86,7 @@ pass0()
 	memset(visited, 0, maxino * sizeof(ino_t));

 	plastino = 0;
-	ino = sblock.lfs_free;
+	ino = sblock.lfs_freehd;
 	while (ino) {
 		if (ino >= maxino) {
 			printf("! Ino %d out of range (last was %d)\n", ino,
@ -115,7 +115,7 @@ pass0()
 			       ino, (long long)daddr);
 			if (preen || reply("FIX") == 1) {
 				if (plastino == 0) {
-					sblock.lfs_free = nextino;
+					sblock.lfs_freehd = nextino;
 					sbdirty();
 				} else {
 					ifp = lfs_ientry(plastino, &bp);
@ -145,8 +145,8 @@ pass0()

 		pwarn("! Ino %d free, but not on the free list\n", ino);
 		if (preen || reply("FIX") == 1) {
-			ifp->if_nextfree = sblock.lfs_free;
-			sblock.lfs_free = ino;
+			ifp->if_nextfree = sblock.lfs_freehd;
+			sblock.lfs_freehd = ino;
 			sbdirty();
 			dirty(bp);
 		}
--- a/sys/conf/osrelease.sh
+++ b/sys/conf/osrelease.sh
@ -1,6 +1,6 @@
 #!/bin/sh
 #
-#	$NetBSD: osrelease.sh,v 1.90 2003/02/01 06:26:30 thorpej Exp $
+#	$NetBSD: osrelease.sh,v 1.91 2003/02/17 23:48:09 perseant Exp $
 #
 # Copyright (c) 1997 The NetBSD Foundation, Inc.
 # All rights reserved.
@ -42,7 +42,7 @@
 #	sys/sys/param.h:	__NetBSD_Version__
 #	share/tmac/doc-common:	ds oS
 #
-release=1.6N
+release=1.6O

 case $1 in
 -s)
--- a/sys/miscfs/genfs/genfs_node.h
+++ b/sys/miscfs/genfs/genfs_node.h
@ -1,4 +1,4 @@
-/* $NetBSD: genfs_node.h,v 1.3 2001/12/18 07:49:36 chs Exp $ */
+/* $NetBSD: genfs_node.h,v 1.4 2003/02/17 23:48:10 perseant Exp $ */

 /*
 * Copyright (c) 2001 Chuck Silvers.
@ -36,18 +36,22 @@
 struct vm_page;

 struct genfs_ops {
-	void	(*gop_size)(struct vnode *, off_t, off_t *);
+	void	(*gop_size)(struct vnode *, off_t, off_t *, int);
 	int	(*gop_alloc)(struct vnode *, off_t, off_t, int, struct ucred *);
 	int	(*gop_write)(struct vnode *, struct vm_page **, int, int);
 };

-#define GOP_SIZE(vp, size, eobp) \
-	(*VTOG(vp)->g_op->gop_size)((vp), (size), (eobp))
+#define GOP_SIZE(vp, size, eobp, flags) \
+	(*VTOG(vp)->g_op->gop_size)((vp), (size), (eobp), (flags))
 #define GOP_ALLOC(vp, off, len, flags, cred) \
 	(*VTOG(vp)->g_op->gop_alloc)((vp), (off), (len), (flags), (cred))
 #define GOP_WRITE(vp, pgs, npages, flags) \
 	(*VTOG(vp)->g_op->gop_write)((vp), (pgs), (npages), (flags))

+/* Flags to GOP_SIZE */
+#define GOP_SIZE_READ  0x1	/* Advise how many pages to read/create */
+#define GOP_SIZE_WRITE 0x2	/* Tell how many pages to write */
+
 struct genfs_node {
 	struct genfs_ops	*g_op;		/* ops vector */
 	struct lock		g_glock;	/* getpages lock */
@ -55,7 +59,7 @@ struct genfs_node {

 #define VTOG(vp) ((struct genfs_node *)(vp)->v_data)

-void	genfs_size(struct vnode *, off_t, off_t *);
+void	genfs_size(struct vnode *, off_t, off_t *, int);
 void	genfs_node_init(struct vnode *, struct genfs_ops *);
 int	genfs_gop_write(struct vnode *, struct vm_page **, int, int);
 int	genfs_compat_gop_write(struct vnode *, struct vm_page **, int, int);
--- a/sys/miscfs/genfs/genfs_vnops.c
+++ b/sys/miscfs/genfs/genfs_vnops.c
@ -1,4 +1,4 @@
-/*	$NetBSD: genfs_vnops.c,v 1.71 2003/02/05 21:38:42 pk Exp $	*/
+/*	$NetBSD: genfs_vnops.c,v 1.72 2003/02/17 23:48:11 perseant Exp $	*/

 /*
 * Copyright (c) 1982, 1986, 1989, 1993
@ -35,7 +35,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.71 2003/02/05 21:38:42 pk Exp $");
+__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.72 2003/02/17 23:48:11 perseant Exp $");

 #include "opt_nfsserver.h"

@ -495,11 +495,11 @@ genfs_getpages(void *v)
 	error = 0;
 	origoffset = ap->a_offset;
 	orignpages = *ap->a_count;
-	GOP_SIZE(vp, vp->v_size, &diskeof);
+	GOP_SIZE(vp, vp->v_size, &diskeof, GOP_SIZE_READ);
 	if (flags & PGO_PASTEOF) {
 		newsize = MAX(vp->v_size,
 		    origoffset + (orignpages << PAGE_SHIFT));
-		GOP_SIZE(vp, newsize, &memeof);
+		GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_READ);
 	} else {
 		memeof = diskeof;
 	}
@ -1139,8 +1139,13 @@ genfs_putpages(void *v)
 		yield = (l->l_cpu->ci_schedstate.spc_flags &
 		    SPCF_SHOULDYIELD) && !pagedaemon;
 		if (pg->flags & PG_BUSY || yield) {
-			KASSERT(!pagedaemon);
 			UVMHIST_LOG(ubchist, "busy %p", pg,0,0,0);
+			if (flags & PGO_BUSYFAIL && pg->flags & PG_BUSY) {
+				UVMHIST_LOG(ubchist, "busyfail %p", pg, 0,0,0);
+				error = EDEADLK;
+				break;
+			}
+			KASSERT(!pagedaemon);
 			if (by_list) {
 				TAILQ_INSERT_BEFORE(pg, &curmp, listq);
 				UVMHIST_LOG(ubchist, "curmp next %p",
@ -1381,7 +1386,7 @@ genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
 	UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
 	    vp, pgs, npages, flags);

-	GOP_SIZE(vp, vp->v_size, &eof);
+	GOP_SIZE(vp, vp->v_size, &eof, GOP_SIZE_WRITE);
 	if (vp->v_type == VREG) {
 		fs_bshift = vp->v_mount->mnt_fs_bshift;
 		dev_bshift = vp->v_mount->mnt_dev_bshift;
@ -1523,7 +1528,7 @@ genfs_node_init(struct vnode *vp, struct genfs_ops *ops)
 }

 void
-genfs_size(struct vnode *vp, off_t size, off_t *eobp)
+genfs_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
 {
 	int bsize;

--- a/sys/nfs/nfs_node.c
+++ b/sys/nfs/nfs_node.c
@ -1,4 +1,4 @@
-/*	$NetBSD: nfs_node.c,v 1.60 2003/02/15 18:00:25 drochner Exp $	*/
+/*	$NetBSD: nfs_node.c,v 1.61 2003/02/17 23:48:12 perseant Exp $	*/

 /*
 * Copyright (c) 1989, 1993
@ -39,7 +39,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: nfs_node.c,v 1.60 2003/02/15 18:00:25 drochner Exp $");
+__KERNEL_RCSID(0, "$NetBSD: nfs_node.c,v 1.61 2003/02/17 23:48:12 perseant Exp $");

 #include "opt_nfs.h"

@ -80,7 +80,7 @@ extern int prtactive;

 #define	nfs_hash(x,y)	hash32_buf((x), (y), HASH32_BUF_INIT)

-void nfs_gop_size(struct vnode *, off_t, off_t *);
+void nfs_gop_size(struct vnode *, off_t, off_t *, int);
 int nfs_gop_alloc(struct vnode *, off_t, off_t, int, struct ucred *);
 int nfs_gop_write(struct vnode *, struct vm_page **, int, int);

@ -315,8 +315,11 @@ nfs_reclaim(v)
 }

 void
-nfs_gop_size(struct vnode *vp, off_t size, off_t *eobp)
+nfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
 {
+	KASSERT(flags & (GOP_SIZE_READ | GOP_SIZE_WRITE));
+	KASSERT((flags & (GOP_SIZE_READ | GOP_SIZE_WRITE))
+		!= (GOP_SIZE_READ | GOP_SIZE_WRITE));
 	*eobp = MAX(size, vp->v_size);
 }

--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@ -1,4 +1,4 @@
-/*	$NetBSD: param.h,v 1.159 2003/02/01 06:26:30 thorpej Exp $	*/
+/*	$NetBSD: param.h,v 1.160 2003/02/17 23:48:13 perseant Exp $	*/

 /*-
 * Copyright (c) 1982, 1986, 1989, 1993
@ -67,7 +67,7 @@
 * Don't forget to change conf/osrelease.sh too.
 */

-#define	__NetBSD_Version__	106140000	/* NetBSD 1.6N */
+#define	__NetBSD_Version__	106150000	/* NetBSD 1.6O */

 /*
 * Historical NetBSD #define
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@ -1,4 +1,4 @@
-/*	$NetBSD: ffs_extern.h,v 1.25 2003/01/24 21:55:22 fvdl Exp $	*/
+/*	$NetBSD: ffs_extern.h,v 1.26 2003/02/17 23:48:14 perseant Exp $	*/

 /*-
 * Copyright (c) 1991, 1993, 1994
@ -151,7 +151,7 @@ int ffs_fsync __P((void *));
 int ffs_reclaim __P((void *));
 int ffs_getpages __P((void *));
 int ffs_putpages __P((void *));
-void ffs_gop_size __P((struct vnode *, off_t, off_t *));
+void ffs_gop_size __P((struct vnode *, off_t, off_t *, int));
 __END_DECLS

 
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@ -1,4 +1,4 @@
-/*	$NetBSD: ffs_vfsops.c,v 1.106 2003/01/24 21:55:23 fvdl Exp $	*/
+/*	$NetBSD: ffs_vfsops.c,v 1.107 2003/02/17 23:48:14 perseant Exp $	*/

 /*
 * Copyright (c) 1989, 1991, 1993, 1994
@ -36,7 +36,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.106 2003/01/24 21:55:23 fvdl Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.107 2003/02/17 23:48:14 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
@ -117,7 +117,7 @@ struct vfsops ffs_vfsops = {

 struct genfs_ops ffs_genfsops = {
 	ffs_gop_size,
-	ffs_gop_alloc,
+	ufs_gop_alloc,
 	genfs_gop_write,
 };

--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@ -1,4 +1,4 @@
-/*	$NetBSD: ffs_vnops.c,v 1.54 2003/02/05 21:38:44 pk Exp $	*/
+/*	$NetBSD: ffs_vnops.c,v 1.55 2003/02/17 23:48:15 perseant Exp $	*/

 /*
 * Copyright (c) 1982, 1986, 1989, 1993
@ -36,7 +36,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.54 2003/02/05 21:38:44 pk Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.55 2003/02/17 23:48:15 perseant Exp $");

 #include <sys/param.h>
 #include <sys/systm.h>
@ -567,12 +567,16 @@ ffs_putpages(void *v)
 */

 void
-ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp)
+ffs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
 {
 	struct inode *ip = VTOI(vp);
 	struct fs *fs = ip->i_fs;
 	daddr_t olbn, nlbn;

+	KASSERT(flags & (GOP_SIZE_READ | GOP_SIZE_WRITE));
+	KASSERT((flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)) 
+		!= (GOP_SIZE_READ | GOP_SIZE_WRITE));
+
 	olbn = lblkno(fs, ip->i_ffs_size);
 	nlbn = lblkno(fs, size);
 	if (nlbn < NDADDR && olbn <= nlbn) {
--- a/sys/ufs/lfs/TODO
+++ b/sys/ufs/lfs/TODO
@ -1,4 +1,19 @@
-#   $NetBSD: TODO,v 1.5 2001/07/13 20:30:22 perseant Exp $
+#   $NetBSD: TODO,v 1.6 2003/02/17 23:48:16 perseant Exp $
+
+- Lock audit.  Need to check locking for multiprocessor case in particular.
+
+- Get rid of the syscalls: make them into ioctl calls instead.  This would
+  allow LFS to be loaded as a module.  We would then ideally have an
+  in-kernel cleaner that runs if no userland cleaner has asserted itself.
+
+- Get rid of lfs_segclean(); the kernel should clean a dirty segment IFF it
+  has passed two checkpoints containing zero live bytes.
+
+- Now that our cache is basically all of physical memory, we need to make
+  sure that segwrite is not starving other important things.  Need a way
+  to prioritize which blocks are most important to write, and write only
+  those before giving up the seglock to do the rest.  How does this change
+  our notion of what a checkpoint is?

 - Investigate alternate inode locking strategy: Inode locks are useful
  for locking against simultaneous changes to inode size (balloc,
@ -11,12 +26,6 @@
 - Fully working fsck_lfs.  (Really, need a general-purpose external
  partial-segment writer.)

- Inode blocks are currently the same size as the fs block size; but all
-  the ones I've seen are mostly empty, and this will be especially true
-  if atime information is kept in the ifile instead of the inode.  Could
-  we shrink the inode block size to DEV_BSIZE?  Or parametrize it at fs
-  creation time?
-
 - Get rid of DEV_BSIZE, pay attention to the media block size at mount time.

 - More fs ops need to call lfs_imtime.  Which ones?  (Blackwell et al., 1995)
--- a/sys/ufs/lfs/lfs.h
+++ b/sys/ufs/lfs/lfs.h
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs.h,v 1.45 2003/01/29 13:14:33 yamt Exp $	*/
+/*	$NetBSD: lfs.h,v 1.46 2003/02/17 23:48:16 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -95,11 +95,44 @@
 #define BW_CLEAN	1
 #define MIN_FREE_SEGS	2
 #define LFS_MAX_ACTIVE	10
-#define LFS_MAXDIROP	(desiredvnodes >> 2)
 #ifndef LFS_ATIME_IFILE
 # define LFS_ATIME_IFILE 0
 #endif

+/* Local definition for LFS's usage of PG_PAGER1 */
+#define PG_DELWRI	PG_PAGER1
+
+/* Types for lfs_newbuf and lfs_malloc */
+#define LFS_NB_UNKNOWN -1
+#define LFS_NB_SUMMARY	0
+#define LFS_NB_SBLOCK	1
+#define LFS_NB_IBLOCK	2
+#define LFS_NB_CLUSTER	3
+#define LFS_NB_CLEAN	4
+#define LFS_NB_COUNT	5 /* always last */
+
+/* Number of reserved memory blocks of each type */
+#define LFS_N_SUMMARIES 2
+#define LFS_N_SBLOCKS   1   /* Always 1, to throttle superblock writes */
+#define LFS_N_IBLOCKS   16  /* In theory ssize/bsize; in practice around 2 */
+#define LFS_N_CLUSTERS  16  /* In theory ssize/MAXPHYS */
+#define LFS_N_CLEAN     0
+
+/* Total count of "large" (non-pool) types */
+#define LFS_N_TOTAL (LFS_N_SUMMARIES + LFS_N_SBLOCKS + LFS_N_IBLOCKS + LFS_N_CLUSTERS + LFS_N_CLEAN)
+
+/* Counts for pool types */
+#define LFS_N_CL        LFS_N_CLUSTERS
+#define LFS_N_BPP       2
+#define LFS_N_SEG	2
+
+/* Structure to keep reserved blocks */
+typedef struct lfs_res_blk {
+	void *p;
+	LIST_ENTRY(lfs_res_blk) res;
+	char inuse;
+} res_t;
+
 /*
 * #define WRITE_THRESHHOLD    ((nbuf >> 1) - 10)
 * #define WAIT_THRESHHOLD     (nbuf - (nbuf >> 2) - 10)
@ -109,8 +142,17 @@
 /* These are new ... is LFS taking up too much memory in its buffers? */
 #define LFS_MAX_BYTES       (((bufpages >> 2) - 10) * NBPG)
 #define LFS_WAIT_BYTES      (((bufpages >> 1) - (bufpages >> 3) - 10) * NBPG)
+#define LFS_MAX_DIROP	    ((desiredvnodes >> 2) + (desiredvnodes >> 3))
 #define LFS_BUFWAIT         2

+#define LFS_MAX_PAGES \
+     (((uvmexp.active + uvmexp.inactive + uvmexp.free) * uvmexp.filemin) >> 8)
+#define LFS_WAIT_PAGES \
+     (((uvmexp.active + uvmexp.inactive + uvmexp.free) * uvmexp.filemax) >> 8)
+
+#define LFS_IS_MALLOC_BUF(bp) (((bp)->b_flags & B_CALL) && 		\
+     ((bp)->b_iodone == lfs_callback || (bp)->b_iodone == lfs_fakebuf_iodone))
+
 #define LFS_LOCK_BUF(bp) do {						\
 	if (((bp)->b_flags & (B_LOCKED | B_CALL)) == 0) {		\
 		++locked_queue_count;       				\
@ -237,7 +279,21 @@ extern struct lfs_log_entry lfs_log[LFS_LOGLENGTH];
 	(ip)->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);		\
 } while (0)

-#define WRITEINPROG(vp) (vp->v_dirtyblkhd.lh_first && !(VTOI(vp)->i_flag & \
+/*
+ * How to find out whether a vnode had dirty buffers or pages,
+ * to know whether it needs to retain IN_MODIFIED after a write.
+ */
+#ifdef LFS_UBC
+int lfs_checkifempty(struct vnode *);
+#  define VPISEMPTY(vp)  lfs_checkifempty(vp)
+#else
+# define VPISEMPTY(vp)  ((vp)->v_dirtyblkhd.lh_first == NULL)
+#endif
+/*
+ * WRITEINPROG does not use VPISEMPTY because any dirty pages will
+ * have been given buffer headers, if they are "in progress".
+ */
+#define WRITEINPROG(vp) ((vp)->v_dirtyblkhd.lh_first && !(VTOI(vp)->i_flag & \
 				(IN_MODIFIED | IN_ACCESSED | IN_CLEANING)))

 /* Here begins the berkeley code */
@ -257,6 +313,7 @@ struct segusage {
 #define	SEGUSE_DIRTY		0x02	/*  segment has data in it */
 #define	SEGUSE_SUPERBLOCK	0x04	/*  segment contains a superblock */
 #define SEGUSE_ERROR            0x08    /*  cleaner: do not clean segment */
+#define SEGUSE_EMPTY            0x10    /*  segment is empty */
 	u_int32_t su_flags;		/* 12: segment flags */
 	u_int64_t su_lastmod;		/* 16: last modified timestamp */
 };
@ -304,7 +361,7 @@ struct dlfs {
        u_int32_t dlfs_frag;      /* 28: number of frags in a block in fs */

 /* Checkpoint region. */
-        u_int32_t dlfs_free;      /* 32: start of the free list */
+        u_int32_t dlfs_freehd;      /* 32: start of the free list */
        u_int32_t dlfs_bfree;     /* 36: number of free disk blocks */
        u_int32_t dlfs_nfiles;    /* 40: number of allocated inodes */
        int32_t   dlfs_avail;     /* 44: blocks available for writing */
@ -371,9 +428,6 @@ struct dlfs {
 	u_int32_t dlfs_cksum;     /* 508: checksum for superblock checking */
 };

-/* Maximum number of io's we can have pending at once */
-#define LFS_THROTTLE  32 /* XXX should be better paramtrized - ? */
-
 /* In-memory super block. */
 struct lfs {
        struct dlfs lfs_dlfs;           /* on-disk parameters */
@ -385,7 +439,7 @@ struct lfs {
 #define lfs_bsize lfs_dlfs.dlfs_bsize
 #define lfs_fsize lfs_dlfs.dlfs_fsize
 #define lfs_frag lfs_dlfs.dlfs_frag
-#define lfs_free lfs_dlfs.dlfs_free
+#define lfs_freehd lfs_dlfs.dlfs_freehd
 #define lfs_bfree lfs_dlfs.dlfs_bfree
 #define lfs_nfiles lfs_dlfs.dlfs_nfiles
 #define lfs_avail lfs_dlfs.dlfs_avail
@ -455,20 +509,26 @@ struct lfs {
 #define LFS_WARNED  0x04
 	int8_t	  lfs_flags;		/* currently unused flag */
 	u_int16_t lfs_activesb;         /* toggle between superblocks */
-#ifdef LFS_TRACK_IOS
-	daddr_t   lfs_pending[LFS_THROTTLE]; /* daddrs of pending writes */
-#endif /* LFS_TRACK_IOS */
 	daddr_t   lfs_sbactive;         /* disk address of in-progress sb write */
 	struct vnode *lfs_flushvp;      /* vnode being flushed */
 	struct vnode *lfs_unlockvp;     /* being inactivated in lfs_segunlock */
 	u_int32_t lfs_diropwait;	/* # procs waiting on dirop flush */
 	size_t lfs_devbsize;		/* Device block size */
 	size_t lfs_devbshift;		/* Device block shift */
-	struct lock lfs_freelock;
 	struct lock lfs_fraglock;
 	pid_t lfs_rfpid;		/* Process ID of roll-forward agent */
 	int       lfs_nadirop;		/* number of active dirop nodes */
 	long      lfs_ravail;           /* blocks pre-reserved for writing */
+	res_t *lfs_resblk;		/* Reserved memory for pageout */
+	TAILQ_HEAD(, inode) lfs_dchainhd; /* dirop vnodes */
+	TAILQ_HEAD(, inode) lfs_pchainhd; /* paging vnodes */
+#define LFS_RESHASH_WIDTH 17
+	LIST_HEAD(, lfs_res_blk) lfs_reshash[LFS_RESHASH_WIDTH]; 
+	int       lfs_pdflush;           /* pagedaemon wants us to flush */
+	u_int32_t **lfs_suflags;	/* Segment use flags */
+	struct pool lfs_clpool;		/* Pool for struct lfs_cluster */
+	struct pool lfs_bpppool;	/* Pool for bpp */
+	struct pool lfs_segpool;	/* Pool for struct segment */
 };

 /*
@ -659,14 +719,14 @@ struct segsum {
 #define LFS_GET_HEADFREE(FS, CIP, BP, FREEP) do {                       \
 	if ((FS)->lfs_version > 1) {                                    \
 		LFS_CLEANERINFO((CIP), (FS), (BP));                     \
-		(FS)->lfs_free = (CIP)->free_head;			\
+		(FS)->lfs_freehd = (CIP)->free_head;			\
 		brelse(BP);                                             \
 	}								\
-	*(FREEP) = (FS)->lfs_free;					\
+	*(FREEP) = (FS)->lfs_freehd;					\
 } while (0)

 #define LFS_PUT_HEADFREE(FS, CIP, BP, VAL) do {                         \
-	(FS)->lfs_free = (VAL);						\
+	(FS)->lfs_freehd = (VAL);						\
 	if ((FS)->lfs_version > 1) {                                    \
 		LFS_CLEANERINFO((CIP), (FS), (BP));                     \
 		(CIP)->free_head = (VAL);                 		\
@ -721,6 +781,15 @@ struct segsum {
 		(SP) = (SEGUSE *)(BP)->b_data + ((IN) % (F)->lfs_sepb);	\
 } while(0)

+#define LFS_WRITESEGENTRY(SP, F, IN, BP) do {				\
+	if ((SP)->su_nbytes == 0)					\
+		(SP)->su_flags |= SEGUSE_EMPTY;				\
+	else								\
+		(SP)->su_flags &= ~SEGUSE_EMPTY;			\
+	(F)->lfs_suflags[(F)->lfs_activesb][(IN)] = (SP)->su_flags;	\
+	LFS_BWRITE_LOG(BP);						\
+} while(0)
+
 /* Determine if a buffer belongs to the ifile */
 #define IS_IFILE(bp)	(VTOI(bp->b_vp)->i_number == LFS_IFILE_INUM)

@ -773,15 +842,16 @@ struct segment {
 #define	SEGM_CLEAN	0x02		/* cleaner call; don't sort */
 #define	SEGM_SYNC	0x04		/* wait for segment */
 #define	SEGM_PROT	0x08		/* don't inactivate at segunlock */
+#define SEGM_PAGEDAEMON 0x10		/* pagedaemon called us */
 	u_int16_t seg_flags;		/* run-time flags for this segment */
 	u_int32_t seg_iocount;		/* number of ios pending */
 	int	  ndupino;              /* number of duplicate inodes */
 };

 struct lfs_cluster {
+	size_t bufsize;        /* Size of kept data */
 	struct buf **bpp;      /* Array of kept buffers */
 	int bufcount;          /* Number of kept buffers */
-	size_t bufsize;        /* Size of kept data */
 #define LFS_CL_MALLOC	0x00000001
 #define LFS_CL_SHIFT	0x00000002
 #define LFS_CL_SYNC	0x00000004
@ -789,9 +859,25 @@ struct lfs_cluster {
 	struct lfs *fs;        /* LFS that this belongs to */
 	struct segment *seg;   /* Segment structure, for LFS_CL_SYNC */
 	void *saveaddr;        /* Original contents of saveaddr */
-	char *olddata;		/* Original b_data, if LFS_CL_MALLOC */
+	char *olddata;	       /* Original b_data, if LFS_CL_MALLOC */
 };

+/*
+ * LFS inode extensions; moved from <ufs/ufs/inode.h> so that file didn't
+ * have to change every time LFS changed.
+ */
+struct lfs_inode_ext {
+	off_t	  lfs_osize;		/* size of file on disk */
+	u_int32_t lfs_effnblocks;  /* number of blocks when i/o completes */
+	size_t    lfs_fragsize[NDADDR]; /* size of on-disk direct blocks */
+	TAILQ_ENTRY(inode) lfs_dchain; /* Dirop chain. */
+	TAILQ_ENTRY(inode) lfs_pchain; /* Paging chain. */
+};
+#define i_lfs_osize		inode_ext.lfs->lfs_osize
+#define i_lfs_effnblks		inode_ext.lfs->lfs_effnblocks
+#define i_lfs_fragsize		inode_ext.lfs->lfs_fragsize
+#define i_lfs_dchain		inode_ext.lfs->lfs_dchain
+
 /*
 * Macros for determining free space on the disk, with the variable metadata
 * of segment summaries and inode blocks taken into account.
--- a/sys/ufs/lfs/lfs_alloc.c
+++ b/sys/ufs/lfs/lfs_alloc.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $	*/
+/*	$NetBSD: lfs_alloc.c,v 1.63 2003/02/17 23:48:16 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.63 2003/02/17 23:48:16 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -85,7 +85,6 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $");
 #include <sys/vnode.h>
 #include <sys/syslog.h>
 #include <sys/mount.h>
-#include <sys/malloc.h>
 #include <sys/pool.h>
 #include <sys/proc.h>

@ -99,6 +98,8 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.62 2003/01/27 23:17:56 yamt Exp $");

 extern int lfs_dirvcount;
 extern struct lock ufs_hashlock;
+extern struct simplelock lfs_subsys_lock;
+extern int lfs_subsys_pages;     

 static int extend_ifile(struct lfs *, struct ucred *);
 static int lfs_ialloc(struct lfs *, struct vnode *, ino_t, int, struct vnode **);
@ -207,6 +208,7 @@ lfs_rf_valloc(struct lfs *fs, ino_t ino, int version, struct proc *p,
 		(void)lfs_vunref(vp);
 		--lfs_dirvcount;
 		vp->v_flag &= ~VDIROP;
+		TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
 		--fs->lfs_nadirop;
 		ip->i_flag &= ~IN_ADIROP;
 	}
@ -245,7 +247,7 @@ extend_ifile(struct lfs *fs, struct ucred *cred)
 	LFS_GET_HEADFREE(fs, cip, cbp, &oldlast);
 	LFS_PUT_HEADFREE(fs, cip, cbp, i);
 #ifdef DIAGNOSTIC
-	if (fs->lfs_free == LFS_UNUSED_INUM)
+	if (fs->lfs_freehd == LFS_UNUSED_INUM)
 		panic("inode 0 allocated [2]");
 #endif /* DIAGNOSTIC */
 	max = i + fs->lfs_ifpb;
@ -300,21 +302,7 @@ lfs_valloc(void *v)
 		return EROFS;
 	*ap->a_vpp = NULL;
 	
-#ifdef LFS_AGGRESSIVE_SEGLOCK
 	lfs_seglock(fs, SEGM_PROT);
-#else
-	if (fs->lfs_version == 1) {
-		/*
-		 * Use lfs_seglock here, instead of fs->lfs_freelock, to
-		 * ensure that the free list is not changed in between
-		 * the time that the ifile blocks are written to disk
-		 * and the time that the superblock is written to disk.
-		 */
-		lfs_seglock(fs, SEGM_PROT);
-	} else {
-		lockmgr(&fs->lfs_freelock, LK_EXCLUSIVE, 0);
-	}
-#endif

 	/* Get the head of the freelist. */
 	LFS_GET_HEADFREE(fs, cip, cbp, &new_ino);
@ -345,33 +333,20 @@ lfs_valloc(void *v)
 	brelse(bp);

 	/* Extend IFILE so that the next lfs_valloc will succeed. */
-	if (fs->lfs_free == LFS_UNUSED_INUM) {
+	if (fs->lfs_freehd == LFS_UNUSED_INUM) {
 		if ((error = extend_ifile(fs, ap->a_cred)) != 0) {
 			LFS_PUT_HEADFREE(fs, cip, cbp, new_ino);
-#ifdef LFS_AGGRESSIVE_SEGLOCK
 			lfs_segunlock(fs);
-#else
-			if (fs->lfs_version == 1)
-				lfs_segunlock(fs);
-			else
-				lockmgr(&fs->lfs_freelock, LK_RELEASE, 0);
-#endif
 			return error;
 		}
 	}
 #ifdef DIAGNOSTIC
-	if (fs->lfs_free == LFS_UNUSED_INUM)
+	if (fs->lfs_freehd == LFS_UNUSED_INUM)
 		panic("inode 0 allocated [3]");
 #endif /* DIAGNOSTIC */

-#ifdef LFS_AGGRESSIVE_SEGLOCK
 	lfs_segunlock(fs);
-#else
-	if (fs->lfs_version == 1)
-		lfs_segunlock(fs);
-	else
-		lockmgr(&fs->lfs_freelock, LK_RELEASE, 0);
-#endif
+
 	return lfs_ialloc(fs, ap->a_pvp, new_ino, new_gen, ap->a_vpp);
 }

@ -417,17 +392,16 @@ lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen,

 	uvm_vnp_setsize(vp, 0);
 	*vpp = vp;
-#if 1
 	if (!(vp->v_flag & VDIROP)) {
 		(void)lfs_vref(vp);
 		++lfs_dirvcount;
+		TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
 	}
 	vp->v_flag |= VDIROP;

 	if (!(ip->i_flag & IN_ADIROP))
 		++fs->lfs_nadirop;
 	ip->i_flag |= IN_ADIROP;
-#endif
 	genfs_node_init(vp, &lfs_genfsops);
 	VREF(ip->i_devvp);
 	/* Set superblock modified bit and increment file count. */
@ -439,17 +413,13 @@ lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen,
 	/*
 	 * Put the new inum back on the free list.
 	 */
-#ifdef LFS_AGGRESSIVE_SEGLOCK
 	lfs_seglock(fs, SEGM_PROT);
-#endif
 	LFS_IENTRY(ifp, fs, new_ino, bp);
 	ifp->if_daddr = LFS_UNUSED_DADDR;
 	LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree));
 	LFS_PUT_HEADFREE(fs, cip, cbp, new_ino);
 	(void) LFS_BWRITE_LOG(bp); /* Ifile */
-#ifdef LFS_AGGRESSIVE_SEGLOCK
 	lfs_segunlock(fs);
-#endif

 	*vpp = NULLVP;
 	return (error);
@ -470,6 +440,7 @@ lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp)
 	
 	/* Initialize the inode. */
 	ip = pool_get(&lfs_inode_pool, PR_WAITOK);
+	ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK);
 	vp->v_data = ip;
 	ip->i_vnode = vp;
 	ip->i_devvp = ump->um_devvp;
@ -487,8 +458,6 @@ lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp)
 	ip->i_ffs_blocks = 0;
 	ip->i_lfs_effnblks = 0;
 	ip->i_flag = 0;
-	/* Why was IN_MODIFIED ever set here? */
-	/* LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED); */

 #ifdef DEBUG_LFS_VNLOCK
 	if (ino == LFS_IFILE_INUM)
@ -531,18 +500,12 @@ lfs_vfree(void *v)
 		tsleep(vp, (PRIBIO+1), "lfs_vfree", 0);
 	splx(s);

-#ifdef LFS_AGGRESSIVE_SEGLOCK
-	lfs_seglock(fs, SEGM_PROT); /* XXX */;
-#else
-	if (fs->lfs_version == 1)
-		lfs_seglock(fs, SEGM_PROT);
-	else
-		lockmgr(&fs->lfs_freelock, LK_EXCLUSIVE, 0);
-#endif
+	lfs_seglock(fs, SEGM_PROT);
 	
 	if (vp->v_flag & VDIROP) {
 		--lfs_dirvcount;
 		vp->v_flag &= ~VDIROP;
+		TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
 		wakeup(&lfs_dirvcount);
 		lfs_vunref(vp);
 	}
@ -597,20 +560,14 @@ lfs_vfree(void *v)
 		}
 #endif
 		sup->su_nbytes -= DINODE_SIZE;
-		(void) LFS_BWRITE_LOG(bp); /* Ifile */
+		LFS_WRITESEGENTRY(sup, fs, dtosn(fs, old_iaddr), bp); /* Ifile */
 	}
 	
 	/* Set superblock modified bit and decrement file count. */
 	fs->lfs_fmod = 1;
 	--fs->lfs_nfiles;
 	
-#ifdef LFS_AGGRESSIVE_SEGLOCK
 	lfs_segunlock(fs);
-#else
-	if (fs->lfs_version == 1)
-		lfs_segunlock(fs);
-	else
-		lockmgr(&fs->lfs_freelock, LK_RELEASE, 0);
-#endif
+
 	return (0);
 }
--- a/sys/ufs/lfs/lfs_balloc.c
+++ b/sys/ufs/lfs/lfs_balloc.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_balloc.c,v 1.35 2003/01/24 21:55:26 fvdl Exp $	*/
+/*	$NetBSD: lfs_balloc.c,v 1.36 2003/02/17 23:48:16 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.35 2003/01/24 21:55:26 fvdl Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.36 2003/02/17 23:48:16 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -96,6 +96,10 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.35 2003/01/24 21:55:26 fvdl Exp $")
 #include <ufs/lfs/lfs.h>
 #include <ufs/lfs/lfs_extern.h>

+#include <uvm/uvm.h>
+
+extern int lfs_subsys_pages;
+
 int lfs_fragextend(struct vnode *, int, int, daddr_t, struct buf **, struct ucred *);

 /*
@ -127,7 +131,7 @@ lfs_balloc(void *v)
 	int offset;
 	u_long iosize;
 	daddr_t daddr, idaddr;
-	struct buf *ibp, *bp;
+	struct buf *ibp, *bp, **bpp;
 	struct inode *ip;
 	struct lfs *fs;
 	struct indir indirs[NIADDR+2], *idp;
@ -141,8 +145,9 @@ lfs_balloc(void *v)
 	offset = blkoff(fs, ap->a_startoffset);
 	iosize = ap->a_size;
 	lbn = lblkno(fs, ap->a_startoffset);
-	(void)lfs_check(vp, lbn, 0);
-	
+	/* (void)lfs_check(vp, lbn, 0); */
+	bpp = ap->a_bpp;
+
 	/* 
 	 * Three cases: it's a block beyond the end of file, it's a block in
 	 * the file that may or may not have been assigned a disk address or
@ -159,7 +164,8 @@ lfs_balloc(void *v)
 	 * to rewrite it.
 	 */
 	
-	*ap->a_bpp = NULL;
+	if (bpp)
+		*bpp = NULL;
 	
 	/* Check for block beyond end of file and fragment extension needed. */
 	lastblock = lblkno(fs, ip->i_ffs_size);
@ -167,13 +173,15 @@ lfs_balloc(void *v)
 		osize = blksize(fs, ip, lastblock);
 		if (osize < fs->lfs_bsize && osize > 0) {
 			if ((error = lfs_fragextend(vp, osize, fs->lfs_bsize,
-						    lastblock, &bp,
+						    lastblock,
+						    (bpp ? &bp : NULL),
 						    ap->a_cred)))
 				return (error);
 			ip->i_ffs_size = (lastblock + 1) * fs->lfs_bsize;
 			uvm_vnp_setsize(vp, ip->i_ffs_size);
 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
-			(void) VOP_BWRITE(bp);
+			if (bpp)
+				(void) VOP_BWRITE(bp);
 		}
 	}

@ -192,25 +200,30 @@ lfs_balloc(void *v)
 			/* Brand new block or fragment */
 			frags = numfrags(fs, nsize);
 			bb = fragstofsb(fs, frags);
-			*ap->a_bpp = bp = getblk(vp, lbn, nsize, 0, 0);
-			if (ap->a_flags & B_CLRBUF)
-				clrbuf(bp);
+			/*
+			 * A buffer is only obtained (and cleared) when the
+			 * caller asked for one.  With bpp == NULL there is
+			 * no bp at all, so B_CLRBUF must not touch it; only
+			 * the accounting below is done.
+			 */
+			if (bpp) {
+				*bpp = bp = getblk(vp, lbn, nsize, 0, 0);
+				bp->b_blkno = UNWRITTEN;
+				if (ap->a_flags & B_CLRBUF)
+					clrbuf(bp);
+			}
 			ip->i_lfs_effnblks += bb;
 			ip->i_lfs->lfs_bfree -= bb;
-			ip->i_ffs_db[lbn] = bp->b_blkno = UNWRITTEN;
+			ip->i_ffs_db[lbn] = UNWRITTEN;
 		} else {
 			if (nsize <= osize) {
 				/* No need to extend */
-				if ((error = bread(vp, lbn, osize, NOCRED, &bp)))
+				if (bpp && (error = bread(vp, lbn, osize, NOCRED, &bp)))
 					return error;
 			} else {
 				/* Extend existing block */
 				if ((error =
-				     lfs_fragextend(vp, osize, nsize, lbn, &bp,
+				     lfs_fragextend(vp, osize, nsize, lbn,
+						    (bpp ? &bp : NULL),
 						    ap->a_cred)))
 					return error;
 			}
-			*ap->a_bpp = bp;
+			if (bpp)
+				*bpp = bp;
 		}
 		return 0;
 	}
@ -279,10 +292,11 @@ lfs_balloc(void *v)


 	/*
-	 * Get the existing block from the cache.
+	 * Get the existing block from the cache, if requested.
 	 */
 	frags = fsbtofrags(fs, bb);
-	*ap->a_bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0);
+	if (bpp)
+		*bpp = bp = getblk(vp, lbn, blksize(fs, ip, lbn), 0, 0);
 	
 	/* 
 	 * The block we are writing may be a brand new block
@ -293,11 +307,13 @@ lfs_balloc(void *v)
 	 * disk address UNWRITTEN.
 	 */
 	if (daddr == UNASSIGNED) {
-		if (ap->a_flags & B_CLRBUF)
-			clrbuf(bp);
+		if (bpp) {
+			if (ap->a_flags & B_CLRBUF)
+				clrbuf(bp);
 		
-		/* Note the new address */
-		bp->b_blkno = UNWRITTEN;
+			/* Note the new address */
+			bp->b_blkno = UNWRITTEN;
+		}
 		
 		switch (num) {
 		    case 0:
@ -316,7 +332,7 @@ lfs_balloc(void *v)
 			((int32_t *)ibp->b_data)[idp->in_off] = UNWRITTEN;
 			VOP_BWRITE(ibp);
 		}
-	} else if (!(bp->b_flags & (B_DONE|B_DELWRI))) {
+	} else if (bpp && !(bp->b_flags & (B_DONE|B_DELWRI))) {
 		/*
 		 * Not a brand new block, also not in the cache;
 		 * read it in from disk.
@ -356,26 +372,35 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf *
 	error = 0;

 	/*
-	 * Get the seglock so we don't enlarge blocks or change the segment
-	 * accounting information while a segment is being written.
+	 * Get the seglock so we don't enlarge blocks while a segment
+	 * is being written.  If we're called with bpp==NULL, though,
+	 * we are only pretending to change a buffer, so we don't have to
+	 * lock.
 	 */
    top:
-#ifdef LFS_MALLOC_SEGLOCK
-	lfs_seglock(fs, SEGM_PROT);
-#else
-	lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
-#endif
+	if (bpp) {
+		lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
+	}
+
 	if (!ISSPACE(fs, bb, cred)) {
 		error = ENOSPC;
 		goto out;
 	}
-	if ((error = bread(vp, lbn, osize, NOCRED, bpp))) {
+
+	/*
+	 * If we are not asked to actually return the block, all we need
+	 * to do is allocate space for it.  UBC will handle dirtying the
+	 * appropriate things and making sure it all goes to disk.
+	 * Don't bother to read in that case.
+	 */
+	if (bpp && (error = bread(vp, lbn, osize, NOCRED, bpp))) {
 		brelse(*bpp);
 		goto out;
 	}
 #ifdef QUOTA
 	if ((error = chkdq(ip, bb, cred, 0))) {
-		brelse(*bpp);
+		if (bpp)
+			brelse(*bpp);
 		goto out;
 	}
 #endif
@ -386,17 +411,14 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf *
 	 * release both and start over after waiting.
 	 */

-	if ((*bpp)->b_flags & B_DELWRI) {
+	if (bpp && ((*bpp)->b_flags & B_DELWRI)) {
 		if (!lfs_fits(fs, bb)) {
-			brelse(*bpp);
+			if (bpp)
+				brelse(*bpp);
 #ifdef QUOTA
 			chkdq(ip, -bb, cred, 0);
 #endif
-#ifdef LFS_FRAGSIZE_SEGLOCK
-			lfs_segunlock(fs);
-#else
 			lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
-#endif
 			lfs_availwait(fs, bb);
 			goto top;
 		}
@ -407,24 +429,24 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, daddr_t lbn, struct buf *
 	ip->i_lfs_effnblks += bb;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;

-	LFS_DEBUG_COUNTLOCKED("frag1");
+	if (bpp) {
+		LFS_DEBUG_COUNTLOCKED("frag1");

-	obufsize = (*bpp)->b_bufsize;
-	allocbuf(*bpp, nsize);
+		obufsize = (*bpp)->b_bufsize;
+		allocbuf(*bpp, nsize);

-	/* Adjust locked-list accounting */
-	if (((*bpp)->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED)
-		locked_queue_bytes += (*bpp)->b_bufsize - obufsize;
+		/* Adjust locked-list accounting */
+		if (((*bpp)->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED)
+			locked_queue_bytes += (*bpp)->b_bufsize - obufsize;

-	LFS_DEBUG_COUNTLOCKED("frag2");
+		LFS_DEBUG_COUNTLOCKED("frag2");

-	bzero((char *)((*bpp)->b_data) + osize, (u_int)(nsize - osize));
+		bzero((char *)((*bpp)->b_data) + osize, (u_int)(nsize - osize));
+	}

    out:
-#ifdef LFS_FRAGSIZE_SEGLOCK
-	lfs_segunlock(fs);
-#else
-	lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
-#endif
+	if (bpp) {
+		lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
+	}
 	return (error);
 }
--- a/sys/ufs/lfs/lfs_bio.c
+++ b/sys/ufs/lfs/lfs_bio.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $	*/
+/*	$NetBSD: lfs_bio.c,v 1.58 2003/02/17 23:48:17 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.58 2003/02/17 23:48:17 perseant Exp $");

 #include <sys/param.h>
 #include <sys/systm.h>
@ -86,10 +86,11 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $");
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>

-#include <sys/malloc.h>
 #include <ufs/lfs/lfs.h>
 #include <ufs/lfs/lfs_extern.h>

+#include <uvm/uvm.h>
+
 /* Macros to clear/set/test flags. */
 # define	SET(t, f)	(t) |= (f)
 # define	CLR(t, f)	(t) &= ~(f)
@ -102,11 +103,14 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.57 2003/02/05 21:38:45 pk Exp $");
 * No write cost accounting is done.
 * This is almost certainly wrong for synchronous operations and NFS.
 */
-int	locked_queue_count   = 0;	/* XXX Count of locked-down buffers. */
-long	locked_queue_bytes   = 0L;	/* XXX Total size of locked buffers. */
+int	locked_queue_count   = 0;	/* Count of locked-down buffers. */
+long	locked_queue_bytes   = 0L;	/* Total size of locked buffers. */
+int	lfs_subsys_pages     = 0L;      /* Total number of LFS-written pages */
 int	lfs_writing          = 0;	/* Set if already kicked off a writer
 					   because of buffer space */
+struct simplelock lfs_subsys_lock;	/* Lock on subsys_pages */
 extern int lfs_dostats;
+extern int lfs_do_flush;

 /*
 * reserved number/bytes of locked buffers
@ -402,7 +406,7 @@ lfs_bwrite_ext(struct buf *bp, int flags)
 	int fsb, s;

 	KASSERT(bp->b_flags & B_BUSY);
-	KASSERT(flags & BW_CLEAN || !(bp->b_flags & B_CALL));
+	KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp));

 	/*
 	 * Don't write *any* blocks if we're mounted read-only.
@ -411,7 +415,7 @@ lfs_bwrite_ext(struct buf *bp, int flags)
        if (VTOI(bp->b_vp)->i_lfs->lfs_ronly) {
 		bp->b_flags &= ~(B_DELWRI | B_READ | B_ERROR);
 		LFS_UNLOCK_BUF(bp);
-		if (bp->b_flags & B_CALL)
+		if (LFS_IS_MALLOC_BUF(bp))
 			bp->b_flags &= ~B_BUSY;
 		else
 			brelse(bp);
@ -465,28 +469,26 @@ lfs_bwrite_ext(struct buf *bp, int flags)
 void
 lfs_flush_fs(struct lfs *fs, int flags)
 {
-	if (fs->lfs_ronly == 0 && fs->lfs_dirops == 0)
-	{
-		/* disallow dirops during flush */
-		fs->lfs_writer++;
+	if (fs->lfs_ronly)	/* read-only mount: skip the flush entirely */
+		return;

-		/*
-		 * We set the queue to 0 here because we
-		 * are about to write all the dirty
-		 * buffers we have.  If more come in
-		 * while we're writing the segment, they
-		 * may not get written, so we want the
-		 * count to reflect these new writes
-		 * after the segwrite completes.
-		 */
-		if (lfs_dostats)
-			++lfs_stats.flush_invoked;
-		lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
+	/* disallow dirops during flush (lfs_writer stays raised until the end) */
+	fs->lfs_writer++;

-		/* XXX KS - allow dirops again */
-		if (--fs->lfs_writer == 0)
-			wakeup(&fs->lfs_dirops);
+	/*
+	 * Drain dirops: instead of skipping the flush when dirops are
+	 * active (the old behaviour), sleep on &fs->lfs_writer until
+	 * lfs_dirops has dropped to zero, counting ourselves in
+	 * lfs_diropwait while asleep.
+	 */
+	while (fs->lfs_dirops > 0) {
+		++fs->lfs_diropwait;
+		tsleep(&fs->lfs_writer, PRIBIO+1, "fldirop", 0);
+		--fs->lfs_diropwait; 
 	}
+
+	if (lfs_dostats)
+		++lfs_stats.flush_invoked;
+	lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
+
+	/* allow dirops again; the last writer out wakes sleepers on &fs->lfs_dirops */
+	if (--fs->lfs_writer == 0)
+		wakeup(&fs->lfs_dirops);
 }

 /*
@ -512,6 +514,9 @@ lfs_flush(struct lfs *fs, int flags)
 	}
 	lfs_writing = 1;
 	
+	lfs_subsys_pages = 0; /* XXXUBC need a better way to count this */
+	wakeup(&lfs_subsys_pages);
+
 	simple_lock(&mountlist_slock);
 	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
@ -525,7 +530,6 @@ lfs_flush(struct lfs *fs, int flags)
 		vfs_unbusy(mp);
 	}
 	simple_unlock(&mountlist_slock);
-
 	LFS_DEBUG_COUNTLOCKED("flush");

 	lfs_writing = 0;
@ -562,25 +566,40 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags)
 	while (fs->lfs_dirops > 0 &&
 	       (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
                locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
-                lfs_dirvcount > LFS_MAXDIROP || fs->lfs_diropwait > 0))
+		lfs_subsys_pages > LFS_MAX_PAGES ||
+                lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0))
 	{
 		++fs->lfs_diropwait;
 		tsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0);
 		--fs->lfs_diropwait;
 	}

+#ifdef DEBUG_LFS_FLUSH
+	if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS)
+		printf("lqc = %d, max %d\n", locked_queue_count + INOCOUNT(fs),
+			LFS_MAX_BUFS);
+	if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES)
+		printf("lqb = %ld, max %d\n", locked_queue_bytes + INOBYTES(fs),
+			LFS_MAX_BYTES);
+	if (lfs_subsys_pages > LFS_MAX_PAGES)
+		printf("lssp = %d, max %d\n", lfs_subsys_pages, LFS_MAX_PAGES);
+	if (lfs_dirvcount > LFS_MAX_DIROP)
+		printf("ldvc = %d, max %d\n", lfs_dirvcount, LFS_MAX_DIROP);
+	if (fs->lfs_diropwait > 0)
+		printf("ldvw = %d\n", fs->lfs_diropwait);
+#endif
 	if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
 	    locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
-	    lfs_dirvcount > LFS_MAXDIROP || fs->lfs_diropwait > 0)
+	    lfs_subsys_pages > LFS_MAX_PAGES ||
+	    lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0)
 	{
-		++fs->lfs_writer;
 		lfs_flush(fs, flags);
-		if (--fs->lfs_writer == 0)
-			wakeup(&fs->lfs_dirops);
 	}

-	while (locked_queue_count + INOCOUNT(fs) > LFS_WAIT_BUFS
-	       || locked_queue_bytes + INOBYTES(fs) > LFS_WAIT_BYTES)
+	while (locked_queue_count + INOCOUNT(fs) > LFS_WAIT_BUFS ||
+		locked_queue_bytes + INOBYTES(fs) > LFS_WAIT_BYTES ||
+		lfs_subsys_pages > LFS_WAIT_PAGES ||
+		lfs_dirvcount > LFS_MAX_DIROP)
 	{
 		if (lfs_dostats)
 			++lfs_stats.wait_exceeded;
@ -601,10 +620,7 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags)
 		if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
 		    locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES)
 		{
-			++fs->lfs_writer;
 			lfs_flush(fs, flags | SEGM_CKP);
-			if (--fs->lfs_writer == 0)
-				wakeup(&fs->lfs_dirops);
 		}
 	}
 	return (error);
@ -613,15 +629,8 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags)
 /*
 * Allocate a new buffer header.
 */
-#ifdef MALLOCLOG
-# define DOMALLOC(S, T, F) _malloc((S), (T), (F), file, line)
 struct buf *
-lfs_newbuf_malloclog(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, char *file, int line)
-#else
-# define DOMALLOC(S, T, F) malloc((S), (T), (F))
-struct buf *
-lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size)
-#endif
+lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type)
 {
 	struct buf *bp;
 	size_t nbytes;
@ -629,11 +638,13 @@ lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size)
 	
 	nbytes = roundup(size, fsbtob(fs, 1));
 	
-	bp = DOMALLOC(sizeof(struct buf), M_SEGMENT, M_WAITOK);
-	bzero(bp, sizeof(struct buf));
+	s = splbio();
+	bp = pool_get(&bufpool, PR_WAITOK);
+	splx(s);
+	memset(bp, 0, sizeof(struct buf));
 	if (nbytes) {
-		bp->b_data = DOMALLOC(nbytes, M_SEGMENT, M_WAITOK);
-		bzero(bp->b_data, nbytes);
+		bp->b_data = lfs_malloc(fs, nbytes, type);
+		/* memset(bp->b_data, 0, nbytes); */
 	}
 #ifdef DIAGNOSTIC	
 	if (vp == NULL)
@ -659,27 +670,20 @@ lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size)
 	return (bp);
 }

-#ifdef MALLOCLOG
-# define DOFREE(A, T) _free((A), (T), file, line)
 void
-lfs_freebuf_malloclog(struct buf *bp, char *file, int line)
-#else
-# define DOFREE(A, T) free((A), (T))
-void
-lfs_freebuf(struct buf *bp)
-#endif
+lfs_freebuf(struct lfs *fs, struct buf *bp)
 {
 	int s;
 	
 	s = splbio();
 	if (bp->b_vp)
 		brelvp(bp);
-	splx(s);
 	if (!(bp->b_flags & B_INVAL)) { /* B_INVAL indicates a "fake" buffer */
-		DOFREE(bp->b_data, M_SEGMENT);
+		lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN); /* reserve block or malloc'd data; lfs_free() tells them apart */
 		bp->b_data = NULL;
 	}
-	DOFREE(bp, M_SEGMENT);
+	pool_put(&bufpool, bp);	/* the header itself came from bufpool in lfs_newbuf() */
+	splx(s);		/* splbio now spans the data and header release as well */
 }

 /*
@ -707,7 +711,7 @@ lfs_countlocked(int *count, long *bytes, char *msg)

 	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
 	    bp = bp->b_freelist.tqe_next) {
-		if (bp->b_flags & B_CALL) /* Malloced buffer */
+		if (bp->b_flags & B_CALL)
 			continue;
 		n++;
 		size += bp->b_bufsize;
--- a/sys/ufs/lfs/lfs_cksum.c
+++ b/sys/ufs/lfs/lfs_cksum.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_cksum.c,v 1.20 2002/06/16 00:13:15 perseant Exp $	*/
+/*	$NetBSD: lfs_cksum.c,v 1.21 2003/02/17 23:48:18 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.20 2002/06/16 00:13:15 perseant Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_cksum.c,v 1.21 2003/02/17 23:48:18 perseant Exp $");

 #include <sys/param.h>
 #ifdef _KERNEL
--- a/sys/ufs/lfs/lfs_debug.c
+++ b/sys/ufs/lfs/lfs_debug.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_debug.c,v 1.19 2003/01/29 13:14:34 yamt Exp $	*/
+/*	$NetBSD: lfs_debug.c,v 1.20 2003/02/17 23:48:18 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -73,7 +73,7 @@
 #ifdef DEBUG

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.19 2003/01/29 13:14:34 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.20 2003/02/17 23:48:18 perseant Exp $");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
@ -167,7 +167,7 @@ lfs_dump_super(struct lfs *lfsp)
 	
 	printf("Checkpoint Info\n");
 	printf("%s%d\t%s%x\t%s%d\n",
-	       "free	 ", lfsp->lfs_free,
+	       "freehd	 ", lfsp->lfs_freehd,
 	       "idaddr	 ", lfsp->lfs_idaddr,
 	       "ifile	 ", lfsp->lfs_ifile);
 	printf("%s%x\t%s%d\t%s%x\t%s%x\t%s%x\t%s%x\n",
--- a/sys/ufs/lfs/lfs_extern.h
+++ b/sys/ufs/lfs/lfs_extern.h
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_extern.h,v 1.38 2003/02/01 18:34:14 tron Exp $	*/
+/*	$NetBSD: lfs_extern.h,v 1.39 2003/02/17 23:48:18 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -88,7 +88,7 @@ MALLOC_DECLARE(M_SEGMENT);
 #define LFS_WRITEINDIR	 1 /* flush indirect blocks on non-checkpoint writes */
 #define LFS_CLEAN_VNHEAD 2 /* put prev unrefed cleaned vnodes on head of free list */
 #define LFS_DOSTATS      3
-#define LFS_STATS        4
+#define LFS_MAXPAGES	 4
 #define LFS_MAXID	 5

 #define LFS_NAMES { \
@ -96,7 +96,7 @@ MALLOC_DECLARE(M_SEGMENT);
 	{ "flushindir", CTLTYPE_INT }, \
 	{ "clean_vnhead", CTLTYPE_INT }, \
 	{ "dostats", CTLTYPE_INT }, \
-	{ "stats", CTLTYPE_STRUCT }, \
+	{ "maxpages", CTLTYPE_INT }, \
 }

 struct fid;
@ -117,7 +117,8 @@ struct segment;
 struct ucred;

 extern int lfs_allclean_wakeup;
-extern struct pool lfs_inode_pool;		/* memory pool for inodes */
+extern struct pool lfs_inode_pool;	/* memory pool for inodes */
+extern struct pool lfs_inoext_pool;	/* memory pool for inode extension */

 __BEGIN_DECLS
 /* lfs_alloc.c */
@ -130,16 +131,8 @@ int lfs_fits(struct lfs *, int);
 void lfs_flush_fs(struct lfs *, int);
 void lfs_flush(struct lfs *, int);
 int lfs_check(struct vnode *, daddr_t, int);
-#ifdef MALLOCLOG
-void lfs_freebuf_malloclog(struct buf *, char *, int);
-struct buf *lfs_newbuf_malloclog(struct lfs *, struct vnode *,
-				 daddr_t, size_t, char *, int);
-#define lfs_freebuf(BP) lfs_freebuf_malloclog((BP), __FILE__, __LINE__)
-#define lfs_newbuf(F, V, A, S) lfs_newbuf_malloclog((F),(V),(A),(S),__FILE__,__LINE__)
-#else
-void lfs_freebuf(struct buf *);
-struct buf *lfs_newbuf(struct lfs *, struct vnode *, daddr_t, size_t);
-#endif
+void lfs_freebuf(struct lfs *, struct buf *);
+struct buf *lfs_newbuf(struct lfs *, struct vnode *, daddr_t, size_t, int);
 void lfs_countlocked(int *, long *, char *);
 int lfs_reserve(struct lfs *, struct vnode *, struct vnode *, int);

@ -169,6 +162,7 @@ void lfs_writefile(struct lfs *, struct segment *, struct vnode *);
 int lfs_writeinode(struct lfs *, struct segment *, struct inode *);
 int lfs_gatherblock(struct segment *, struct buf *, int *);
 int lfs_gather(struct lfs *, struct segment *, struct vnode *, int (*match )(struct lfs *, struct buf *));
+void lfs_update_single(struct lfs *, struct segment *, daddr_t, int32_t, int, int);
 void lfs_updatemeta(struct segment *);
 int lfs_initseg(struct lfs *);
 void lfs_newseg(struct lfs *);
@ -187,12 +181,17 @@ void lfs_vunref(struct vnode *);
 void lfs_vunref_head(struct vnode *);

 /* lfs_subr.c */
-void lfs_seglock(struct lfs *, unsigned long);
+void lfs_setup_resblks(struct lfs *);
+void lfs_free_resblks(struct lfs *);
+void *lfs_malloc(struct lfs *, size_t, int);
+void lfs_free(struct lfs *, void *, int);
+int lfs_seglock(struct lfs *, unsigned long);
 void lfs_segunlock(struct lfs *);

 /* lfs_syscalls.c */
 int lfs_fastvget(struct mount *, ino_t, daddr_t, struct vnode **, struct dinode *);
 struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
+int lfs_do_segclean(struct lfs *, unsigned long);

 /* lfs_vfsops.c */
 void lfs_init(void);
@ -200,7 +199,6 @@ void lfs_reinit(void);
 void lfs_done(void);
 int lfs_mountroot(void);
 int lfs_mount(struct mount *, const char *, void *, struct nameidata *, struct proc *);
-int lfs_mountfs(struct vnode *, struct mount *, struct proc *);
 int lfs_unmount(struct mount *, int, struct proc *);
 int lfs_statfs(struct mount *, struct statfs *, struct proc *);
 int lfs_sync(struct mount *, int, struct ucred *, struct proc *);
@ -213,6 +211,10 @@ int lfs_sysctl(int *, u_int, void *, size_t *, void *, size_t, struct proc *);
 void lfs_unmark_vnode(struct vnode *);
 void lfs_itimes(struct inode *, struct timespec *, struct timespec *,
 		struct timespec *);
+int lfs_gop_alloc(struct vnode *, off_t, off_t, int, struct ucred *);
+void lfs_gop_size(struct vnode *, off_t, off_t *, int);
+int lfs_putpages_ext(void *, int);
+int lfs_gatherpages(struct vnode *);

 int lfs_balloc	 (void *);
 int lfs_valloc	 (void *);
@ -230,6 +232,7 @@ int lfs_read	 (void *);
 int lfs_remove	 (void *);
 int lfs_rmdir	 (void *);
 int lfs_link	 (void *);
+int lfs_mmap	 (void *);
 int lfs_rename	 (void *);
 int lfs_getattr	 (void *);
 int lfs_setattr	 (void *);
--- a/sys/ufs/lfs/lfs_inode.c
+++ b/sys/ufs/lfs/lfs_inode.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_inode.c,v 1.63 2003/01/25 16:40:29 fvdl Exp $	*/
+/*	$NetBSD: lfs_inode.c,v 1.64 2003/02/17 23:48:18 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.63 2003/01/25 16:40:29 fvdl Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.64 2003/02/17 23:48:18 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -231,6 +231,9 @@ lfs_truncate(void *v)
 		struct proc *a_p;
 	} */ *ap = v;
 	struct vnode *ovp = ap->a_vp;
+#ifdef LFS_UBC
+	struct genfs_node *gp = VTOG(ovp);
+#endif
 	daddr_t lastblock;
 	struct inode *oip;
 	daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR];
@ -247,6 +250,7 @@ lfs_truncate(void *v)
 	long lastseg;
 	size_t bc;
 	int obufsize, odb;
+	int usepc, needunlock;

 	if (length < 0)
 		return (EINVAL);
@ -282,6 +286,10 @@ lfs_truncate(void *v)
 	fs = oip->i_lfs;
 	lfs_imtime(fs);
 	osize = oip->i_ffs_size;
+	needunlock = usepc = 0;
+#ifdef LFS_UBC
+	usepc = (ovp->v_type == VREG && osize > length && ovp != fs->lfs_ivnode);
+#endif

 	/*
 	 * Lengthen the size of the file. We must ensure that the
@ -313,18 +321,7 @@ lfs_truncate(void *v)
 	if ((error = lfs_reserve(fs, ovp, NULL,
 	    btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift))) != 0)
 		return (error);
-	/*
-	 * Make sure no writes to this inode can happen while we're
-	 * truncating.  Otherwise, blocks which are accounted for on the
-	 * inode *and* which have been created for cleaning can coexist,
-	 * and cause an overcounting.
-	 */
-#ifdef LFS_FRAGSIZE_SEGLOCK
-	lfs_seglock(fs, SEGM_PROT);
-#else
-	lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
-#endif
-	
+
 	/*
 	 * Shorten the size of the file. If the file is not being
 	 * truncated to a block boundary, the contents of the
@ -338,7 +335,12 @@ lfs_truncate(void *v)
 	bc = 0;
 	if (offset == 0) {
 		oip->i_ffs_size = length;
-	} else {
+	} else
+#ifdef LFS_UBC
+	if (!usepc)
+#endif
+	{
+		lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
 		lbn = lblkno(fs, length);
 		aflags = B_CLRBUF;
 		if (ap->a_flags & IO_SYNC)
@ -347,11 +349,7 @@ lfs_truncate(void *v)
 		if (error) {
 			lfs_reserve(fs, ovp, NULL,
 			    -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
-#ifdef LFS_FRAGSIZE_SEGLOCK
-			lfs_segunlock(fs);
-#else
 			lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
-#endif
 			return (error);
 		}
 		obufsize = bp->b_bufsize;
@ -367,7 +365,45 @@ lfs_truncate(void *v)
 		if (bp->b_flags & B_DELWRI)
 			fs->lfs_avail += odb - btofsb(fs, size);
 		(void) VOP_BWRITE(bp);
+		lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
 	}
+#ifdef LFS_UBC
+        /*
+         * When truncating a regular file down to a non-block-aligned size,
+         * we must zero the part of last block which is past the new EOF.
+         * We must synchronously flush the zeroed pages to disk
+         * since the new pages will be invalidated as soon as we
+         * inform the VM system of the new, smaller size.
+         * We must do this before acquiring the GLOCK, since fetching
+         * the pages will acquire the GLOCK internally.
+         * So there is a window where another thread could see a whole
+         * zeroed page past EOF, but that's life.
+         */
+
+        else { /* usepc: VREG, osize > length, not the ifile, offset != 0 */
+                voff_t eoz;
+
+		aflags = ap->a_flags & IO_SYNC ? B_SYNC : 0;
+                error = ufs_balloc_range(ovp, length - 1, 1, ap->a_cred,
+			aflags);
+                if (error) {
+			/*
+			 * Give back the space set aside by lfs_reserve()
+			 * above, as every other exit from here on does.
+			 */
+			lfs_reserve(fs, ovp, NULL,
+			    -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+                        return error;
+                }
+                size = blksize(fs, oip, lblkno(fs, length));
+                eoz = MIN(lblktosize(fs, lblkno(fs, length)) + size, osize);
+                uvm_vnp_zerorange(ovp, length, eoz - length);
+                simple_lock(&ovp->v_interlock);
+                error = VOP_PUTPAGES(ovp, trunc_page(length), round_page(eoz),
+                    PGO_CLEANIT | PGO_DEACTIVATE | PGO_SYNCIO);
+                if (error) {
+			/* Same as above: do not leak the reservation. */
+			lfs_reserve(fs, ovp, NULL,
+			    -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+                        return error;
+                }
+        }
+
+        lockmgr(&gp->g_glock, LK_EXCLUSIVE, NULL);
+#endif
+
+	oip->i_ffs_size = length;
 	uvm_vnp_setsize(ovp, length);
 	/*
 	 * Calculate index into inode's block list of
@ -428,6 +464,10 @@ lfs_truncate(void *v)
 			goto done;
 	}

+	if (!usepc) {
+		lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
+		needunlock = 1;
+	}
 	/*
 	 * All whole direct blocks or frags.
 	 */
@ -516,10 +556,10 @@ done:
 #endif
 	lfs_reserve(fs, ovp, NULL,
 	    -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
-#ifdef LFS_FRAGSIZE_SEGLOCK
-	lfs_segunlock(fs);
-#else
-	lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
+	if (needunlock)
+		lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
+#ifdef LFS_UBC
+	lockmgr(&gp->g_glock, LK_RELEASE, NULL);
 #endif
 	return (allerror);
 }
@ -550,7 +590,6 @@ lfs_update_seguse(struct lfs *fs, long lastseg, size_t num)
 {
 	SEGUSE *sup;
 	struct buf *bp;
-	int error;

 	if (lastseg < 0 || num == 0)
 		return 0;
@ -563,8 +602,9 @@ lfs_update_seguse(struct lfs *fs, long lastseg, size_t num)
 		sup->su_nbytes = num;
 	}
 	sup->su_nbytes -= num;
-	error = LFS_BWRITE_LOG(bp); /* Ifile */
-	return error;
+	LFS_WRITESEGENTRY(sup, fs, lastseg, bp);
+
+	return 0;
 }

 /*
@ -707,6 +747,8 @@ lfs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn,
 /*
 * Destroy any in core blocks past the truncation length.
 * Inlined from vtruncbuf, so that lfs_avail could be updated.
+ * We take the fraglock to prevent cleaning from occurring while we are
+ * invalidating blocks.
 */
 static int
 lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, int slpflag, int slptimeo)
@ -714,10 +756,19 @@ lfs_vtruncbuf(struct vnode *vp, daddr_t lbn, int slpflag, int slptimeo)
 	struct buf *bp, *nbp;
 	int s, error;
 	struct lfs *fs;
+	voff_t off;
+
+	off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
+	simple_lock(&vp->v_interlock);
+	error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
+	if (error) {
+		return error;
+	} 

 	fs = VTOI(vp)->i_lfs;
 	s = splbio();

+	lockmgr(&fs->lfs_fraglock, LK_SHARED, 0);
 restart:
 	for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
 		nbp = LIST_NEXT(bp, b_vnbufs);
@ -729,6 +780,7 @@ restart:
 			    "lfs_vtruncbuf", slptimeo);
 			if (error) {
 				splx(s);
+				lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
 				return (error);
 			}
 			goto restart;
@ -753,6 +805,7 @@ restart:
 			    "lfs_vtruncbuf", slptimeo);
 			if (error) {
 				splx(s);
+				lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);
 				return (error);
 			}
 			goto restart;
@ -768,6 +821,7 @@ restart:
 	}

 	splx(s);
+	lockmgr(&fs->lfs_fraglock, LK_RELEASE, 0);

 	return (0);
 }
--- a/sys/ufs/lfs/lfs_segment.c
+++ b/sys/ufs/lfs/lfs_segment.c
--- a/sys/ufs/lfs/lfs_subr.c
+++ b/sys/ufs/lfs/lfs_subr.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_subr.c,v 1.30 2003/01/29 13:14:35 yamt Exp $	*/
+/*	$NetBSD: lfs_subr.c,v 1.31 2003/02/17 23:48:20 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.30 2003/01/29 13:14:35 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.31 2003/02/17 23:48:20 perseant Exp $");

 #include <sys/param.h>
 #include <sys/systm.h>
@ -86,6 +86,8 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.30 2003/01/29 13:14:35 yamt Exp $");
 #include <ufs/lfs/lfs.h>
 #include <ufs/lfs/lfs_extern.h>

+#include <uvm/uvm.h>
+
 /*
 * Return buffer with the contents of block "offset" from the beginning of
 * directory "ip".  If "res" is non-zero, fill it in with a pointer to the
@ -122,12 +124,177 @@ lfs_blkatoff(void *v)
 	return (0);
 }

+#ifdef LFS_DEBUG_MALLOC
+char *lfs_res_names[LFS_NB_COUNT] = {
+	"summary",
+	"superblock",
+	"ifile block",
+	"cluster",
+	"clean",
+};
+#endif
+
+int lfs_res_qty[LFS_NB_COUNT] = {
+	LFS_N_SUMMARIES,
+	LFS_N_SBLOCKS,
+	LFS_N_IBLOCKS,
+	LFS_N_CLUSTERS,
+	LFS_N_CLEAN,
+};
+/*
+ * Set up the per-filesystem emergency reserves used by lfs_malloc():
+ * an array of LFS_N_TOTAL reserve entries with preallocated buffers,
+ * the hash lists used to find an entry again from its buffer address,
+ * and three pools for small fixed-size objects.  All allocations here
+ * use M_WAITOK.
+ */
+void
+lfs_setup_resblks(struct lfs *fs)
+{
+	int i, j;
+	int maxbpp;
+
+	fs->lfs_resblk = (res_t *)malloc(LFS_N_TOTAL * sizeof(res_t), M_SEGMENT,
+				          M_WAITOK);
+	for (i = 0; i < LFS_N_TOTAL; i++) {
+		fs->lfs_resblk[i].inuse = 0;
+		fs->lfs_resblk[i].p = NULL;
+	}
+	for (i = 0; i < LFS_RESHASH_WIDTH; i++)
+		LIST_INIT(fs->lfs_reshash + i);
+
+	/*
+	 * These types of allocations can be larger than a page,
+	 * so we can't use the pool subsystem for them.
+	 *
+	 * The entries are filled consecutively (i keeps running across
+	 * the loops) in the order summaries, superblocks, ifile blocks,
+	 * clusters, clean.  This is the same order as lfs_res_qty[],
+	 * which lfs_malloc() sums to find where a given type starts.
+	 */
+	for (i = 0, j = 0; j < LFS_N_SUMMARIES; j++, i++)
+		fs->lfs_resblk[i].p = malloc(fs->lfs_sumsize, M_SEGMENT,
+					    M_WAITOK);
+	for (j = 0; j < LFS_N_SBLOCKS; j++, i++)
+		fs->lfs_resblk[i].p = malloc(LFS_SBPAD, M_SEGMENT, M_WAITOK);
+	for (j = 0; j < LFS_N_IBLOCKS; j++, i++)
+		fs->lfs_resblk[i].p = malloc(fs->lfs_bsize, M_SEGMENT, M_WAITOK);
+	for (j = 0; j < LFS_N_CLUSTERS; j++, i++)
+		fs->lfs_resblk[i].p = malloc(MAXPHYS, M_SEGMENT, M_WAITOK);
+	for (j = 0; j < LFS_N_CLEAN; j++, i++)
+		fs->lfs_resblk[i].p = malloc(MAXPHYS, M_SEGMENT, M_WAITOK);
+
+	/*
+	 * Initialize pools for small types (XXX is BPP small?)
+	 *
+	 * maxbpp is the number of struct buf pointers in one bpp array:
+	 * the smaller of what the summary can describe (one int32_t per
+	 * block after the SEGSUM_SIZE header, plus 2) and the number of
+	 * fragments in a segment (plus 2).
+	 */
+	maxbpp = ((fs->lfs_sumsize - SEGSUM_SIZE(fs)) / sizeof(int32_t) + 2);
+	maxbpp = MIN(maxbpp, fs->lfs_ssize / fs->lfs_fsize + 2);
+        pool_init(&fs->lfs_bpppool, maxbpp * sizeof(struct buf *), 0, 0,
+		LFS_N_BPP, "lfsbpppl", &pool_allocator_nointr);
+        pool_init(&fs->lfs_clpool, sizeof(struct lfs_cluster), 0, 0,
+		LFS_N_CL, "lfsclpl", &pool_allocator_nointr);
+	pool_init(&fs->lfs_segpool, sizeof(struct segment), 0, 0,
+		LFS_N_SEG, "lfssegpool", &pool_allocator_nointr);
+}
+/*
+ * Undo lfs_setup_resblks(): destroy the three pools, then release every
+ * reserve buffer and finally the reserve array itself.  A reserve entry
+ * that is still marked in use is waited for before its buffer is freed.
+ */
+void
+lfs_free_resblks(struct lfs *fs)
+{
+	int i;
+
+	pool_destroy(&fs->lfs_bpppool);
+	pool_destroy(&fs->lfs_segpool);
+	pool_destroy(&fs->lfs_clpool);
+
+	for (i = 0; i < LFS_N_TOTAL; i++) {
+		while(fs->lfs_resblk[i].inuse)	/* lfs_free() clears inuse and does the wakeup */
+			tsleep(&fs->lfs_resblk, PRIBIO + 1, "lfs_free", 0);
+		if (fs->lfs_resblk[i].p != NULL)
+			free(fs->lfs_resblk[i].p, M_SEGMENT);
+	}
+	free(fs->lfs_resblk, M_SEGMENT);
+}
+/*
+ * Map a buffer address to a bucket index in fs->lfs_reshash[]: drop the
+ * two low-order address bits, truncate to unsigned int, and reduce
+ * modulo LFS_RESHASH_WIDTH.
+ */
+static unsigned int
+lfs_mhash(void *vp)
+{
+	return (unsigned int)(((unsigned long)vp) >> 2) % LFS_RESHASH_WIDTH;
+}
+
+/*
+ * Return memory of the given size for the given purpose, or use one of a
+ * number of spare last-resort buffers, if malloc returns NULL.
+ *
+ * "type" indexes lfs_res_qty[] and selects which group of reserve
+ * buffers may be used.  The call never returns NULL: it either gets
+ * memory from malloc, hands out a free reserve buffer of that type, or
+ * sleeps until lfs_free() releases one.  Whatever is returned must be
+ * given back with lfs_free().
+ *
+ * NOTE(review): type is used as an array index without a range check,
+ * and size is not compared with the size of the reserve buffer handed
+ * out -- confirm that callers guarantee both.
+ */ 
+void *
+lfs_malloc(struct lfs *fs, size_t size, int type)
+{
+	struct lfs_res_blk *re;
+	void *r;
+	int i, s, start;
+	unsigned int h;
+
+	/* No reserve buffers exist for this type: do a plain sleeping malloc */
+	if (lfs_res_qty[type] == 0)
+		return malloc(size, M_SEGMENT, M_WAITOK);
+
+	/* Otherwise try a quick malloc, and if it works, great */
+	if ((r = malloc(size, M_SEGMENT, M_NOWAIT)) != NULL)
+		return r;
+
+	/*
+	 * If malloc returned NULL, we are forced to use one of our
+	 * reserve blocks.  We have on hand at least one summary block,
+	 * at least one cluster block, at least one superblock,
+	 * and several indirect blocks.
+	 */
+	/* skip over blocks of other types: start = first slot of this type */
+	for (i = 0, start = 0; i < type; i++)
+		start += lfs_res_qty[i];
+	while (r == NULL) {
+		for (i = 0; i < lfs_res_qty[type]; i++) {
+			if (fs->lfs_resblk[start + i].inuse == 0) {
+				re = fs->lfs_resblk + start + i;
+				re->inuse = 1;
+				r = re->p;
+				h = lfs_mhash(r);	/* hashed by buffer address so lfs_free() can find it */
+				s = splbio();
+				LIST_INSERT_HEAD(&fs->lfs_reshash[h], re, res);
+				splx(s);
+				return r;
+			}
+		}
+#ifdef LFS_DEBUG_MALLOC
+		printf("sleeping on %s (%d)\n", lfs_res_names[type], lfs_res_qty[type]);
+#endif
+		tsleep(&fs->lfs_resblk, PVM, "lfs_malloc", 0);	/* woken by lfs_free() */
+#ifdef LFS_DEBUG_MALLOC
+		printf("done sleeping on %s\n", lfs_res_names[type]);
+#endif
+	}
+	/* NOTREACHED */
+	return r;
+}
+/*
+ * Release memory obtained from lfs_malloc().  If p is one of this
+ * filesystem's reserve buffers (found through the address hash), the
+ * entry is marked free again and sleepers on &fs->lfs_resblk are woken;
+ * otherwise p is handed to free().  The "type" argument is not used
+ * here.
+ */
+void
+lfs_free(struct lfs *fs, void *p, int type)
+{
+	int s;
+	unsigned int h;
+	res_t *re;
+
+	h = lfs_mhash(p);
+	s = splbio();
+	LIST_FOREACH(re, &fs->lfs_reshash[h], res) {
+		if (re->p == p) {
+			LIST_REMOVE(re, res);
+			re->inuse = 0;
+			wakeup(&fs->lfs_resblk);
+			splx(s);
+			return;
+		}
+	}
+	splx(s);
+
+	/*
+	 * Not in the hash, so it is not a reserve buffer: it came
+	 * from malloc, and we free it.
+	 */
+	free(p, M_SEGMENT);
+}

 /*
 * lfs_seglock --
 *	Single thread the segment writer.
 */
-void
+int
 lfs_seglock(struct lfs *fs, unsigned long flags)
 {
 	struct segment *sp;
@ -136,8 +303,10 @@ lfs_seglock(struct lfs *fs, unsigned long flags)
 		if (fs->lfs_lockpid == curproc->p_pid) {
 			++fs->lfs_seglock;
 			fs->lfs_sp->seg_flags |= flags;
-			return;			
-		} else while (fs->lfs_seglock)
+			return 0;
+		} else if (flags & SEGM_PAGEDAEMON)
+			return EWOULDBLOCK;
+		else while (fs->lfs_seglock)
 			(void)tsleep(&fs->lfs_seglock, PRIBIO + 1,
 				     "lfs seglock", 0);
 	}
@ -148,10 +317,8 @@ lfs_seglock(struct lfs *fs, unsigned long flags)
 	/* Drain fragment size changes out */
 	lockmgr(&fs->lfs_fraglock, LK_EXCLUSIVE, 0);

-	sp = fs->lfs_sp = malloc(sizeof(struct segment), M_SEGMENT, M_WAITOK);
-	sp->bpp = malloc(((fs->lfs_sumsize - SEGSUM_SIZE(fs)) /
-			  sizeof(int32_t) + 1) * sizeof(struct buf *),
-			 M_SEGMENT, M_WAITOK);
+	sp = fs->lfs_sp = pool_get(&fs->lfs_segpool, PR_WAITOK);
+	sp->bpp = pool_get(&fs->lfs_bpppool, PR_WAITOK);
 	sp->seg_flags = flags;
 	sp->vp = NULL;
 	sp->seg_iocount = 0;
@ -164,8 +331,70 @@ lfs_seglock(struct lfs *fs, unsigned long flags)
 	 * the writes we intend to do.
 	 */
 	++fs->lfs_iocount;
+	return 0;
 }

+static void lfs_unmark_dirop(struct lfs *);
+
+/*
+ * Walk the filesystem's dirop chain (fs->lfs_dchainhd).  Entries whose
+ * vnode is locked with a lock holder other than the current process are
+ * skipped, as are inodes with IN_ADIROP set.  For every other entry,
+ * drop the global dirop vnode count, clear VDIROP, unlink the inode
+ * from the chain, wake sleepers on lfs_dirvcount and release the vnode.
+ */
+static void
+lfs_unmark_dirop(struct lfs *fs)
+{
+	extern int lfs_dirvcount;
+	struct inode *ip, *next;
+	struct vnode *vp;
+
+	next = TAILQ_FIRST(&fs->lfs_dchainhd);
+	while ((ip = next) != NULL) {
+		/* Fetch the successor first; ip may be unlinked below. */
+		next = TAILQ_NEXT(ip, i_lfs_dchain);
+		vp = ITOV(ip);
+
+		if (VOP_ISLOCKED(vp) &&
+		    vp->v_lock.lk_lockholder != curproc->p_pid)
+			continue;
+		if ((VTOI(vp)->i_flag & IN_ADIROP) != 0)
+			continue;
+
+		--lfs_dirvcount;
+		vp->v_flag &= ~VDIROP;
+		TAILQ_REMOVE(&fs->lfs_dchainhd, ip, i_lfs_dchain);
+		wakeup(&lfs_dirvcount);
+
+		/* lfs_unlockvp names this vnode for the duration of vrele() */
+		fs->lfs_unlockvp = vp;
+		vrele(vp);
+		fs->lfs_unlockvp = NULL;
+	}
+}
+
+#ifndef LFS_NO_AUTO_SEGCLEAN
+/*
+ * Clean every segment whose recorded flags in both fs->lfs_suflags[0]
+ * and fs->lfs_suflags[1] show it dirty and empty but not active, then
+ * copy each segment's flags from the active-superblock slot into the
+ * other slot.  Failures from lfs_do_segclean() are only reported
+ * (under DEBUG); the scan always continues.
+ */
+static void
+lfs_auto_segclean(struct lfs *fs)
+{
+	const int mask = SEGUSE_ACTIVE | SEGUSE_DIRTY | SEGUSE_EMPTY;
+	const int want = SEGUSE_DIRTY | SEGUSE_EMPTY;
+	int segno, error;
+
+	/*
+	 * Now that we've swapped lfs_activesb, but while we still
+	 * hold the segment lock, run through the segment list marking
+	 * the empty ones clean.
+	 * XXX - do we really need to do them all at once?
+	 */
+	for (segno = 0; segno < fs->lfs_nseg; segno++) {
+		if ((fs->lfs_suflags[0][segno] & mask) == want &&
+		    (fs->lfs_suflags[1][segno] & mask) == want) {
+			error = lfs_do_segclean(fs, segno);
+			if (error != 0) {
+#ifdef DEBUG
+				printf("lfs_auto_segclean: lfs_do_segclean returned %d for seg %d\n", error, segno);
+#endif /* DEBUG */
+			}
+		}
+
+		/* Remember this checkpoint's flags for the next pass. */
+		fs->lfs_suflags[1 - fs->lfs_activesb][segno] =
+			fs->lfs_suflags[fs->lfs_activesb][segno];
+	}
+}
+#endif /* !LFS_NO_AUTO_SEGCLEAN */
+
 /*
 * lfs_segunlock --
 *	Single thread the segment writer.
@ -176,9 +405,6 @@ lfs_segunlock(struct lfs *fs)
 	struct segment *sp;
 	unsigned long sync, ckp;
 	struct buf *bp;
-	struct vnode *vp, *nvp;
-	struct mount *mp;
-	extern int lfs_dirvcount;
 #ifdef LFS_MALLOC_SUMMARY
 	extern int locked_queue_count;
 	extern long locked_queue_bytes;
@ -186,63 +412,9 @@ lfs_segunlock(struct lfs *fs)
 	
 	sp = fs->lfs_sp;

-	if (fs->lfs_seglock == 1 && !(sp->seg_flags & SEGM_PROT)) {
-
-		mp = fs->lfs_ivnode->v_mount;
-		/*
-		 * Go through and unmark all DIROP vnodes, possibly
-		 * calling VOP_INACTIVE (through vrele).  This is
-		 * delayed until now in order not to accidentally
-		 * write a DIROP node through lfs_flush.
-		 */
-#ifndef LFS_NO_BACKVP_HACK
-	/* BEGIN HACK */
-#define	VN_OFFSET	(((caddr_t)&LIST_NEXT(vp, v_mntvnodes)) - (caddr_t)vp)
-#define	BACK_VP(VP)	((struct vnode *)(((caddr_t)(VP)->v_mntvnodes.le_prev) - VN_OFFSET))
-#define	BEG_OF_VLIST	((struct vnode *)(((caddr_t)&LIST_FIRST(&mp->mnt_vnodelist)) - VN_OFFSET))
-	
-		/* Find last vnode. */
-	loop:	for (vp = LIST_FIRST(&mp->mnt_vnodelist);
-		     vp && LIST_NEXT(vp, v_mntvnodes) != NULL;
-		     vp = LIST_NEXT(vp, v_mntvnodes));
-		for (; vp && vp != BEG_OF_VLIST; vp = nvp) {
-			nvp = BACK_VP(vp);
-#else
-	loop:
-		 for (vp = LIST_FIRST(&mp->mnt_vnodelist);
-		     vp != NULL;
-		     vp = nvp) {
-			nvp = LIST_NEXT(vp, v_mntvnodes);
-#endif
-			if (vp->v_mount != mp) {
-				printf("lfs_segunlock: starting over\n");
-				goto loop;
-			}
-			if (vp->v_type == VNON)
-				continue;
-			if (lfs_vref(vp))
-				continue;
-			if (VOP_ISLOCKED(vp) &&
-                            vp->v_lock.lk_lockholder != curproc->p_pid) {
-				lfs_vunref(vp);
-				continue;
-			}
-			if ((vp->v_flag & VDIROP) &&
-			    !(VTOI(vp)->i_flag & IN_ADIROP)) {
-				--lfs_dirvcount;
-				vp->v_flag &= ~VDIROP;
-				wakeup(&lfs_dirvcount);
-				fs->lfs_unlockvp = vp;
-				lfs_vunref(vp);
-				vrele(vp);
-				fs->lfs_unlockvp = NULL;
-			} else {
-				lfs_vunref(vp);
-			}
-		}
-	}
-
 	if (fs->lfs_seglock == 1) {
+		if ((sp->seg_flags & SEGM_PROT) == 0)
+			lfs_unmark_dirop(fs);
 		sync = sp->seg_flags & SEGM_SYNC;
 		ckp = sp->seg_flags & SEGM_CKP;
 		if (sp->bpp != sp->cbpp) {
@ -250,7 +422,7 @@ lfs_segunlock(struct lfs *fs)
 			fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize);
 			bp = *sp->bpp;
 #ifdef LFS_MALLOC_SUMMARY
-			lfs_freebuf(bp);
+			lfs_freebuf(fs, bp);
 #else
 			s = splbio();
 			bremfree(bp);
@ -263,11 +435,11 @@ lfs_segunlock(struct lfs *fs)
 		} else
 			printf ("unlock to 0 with no summary");

-		free(sp->bpp, M_SEGMENT);
+		pool_put(&fs->lfs_bpppool, sp->bpp);
 		sp->bpp = NULL;
 		/* The sync case holds a reference in `sp' to be freed below */
 		if (!sync)
-			free(sp, M_SEGMENT);
+			pool_put(&fs->lfs_segpool, sp);
 		fs->lfs_sp = NULL;

 		/*
@ -275,9 +447,7 @@ lfs_segunlock(struct lfs *fs)
 		 * At the moment, the user's process hangs around so we can
 		 * sleep.
 		 */
-		if (--fs->lfs_iocount < LFS_THROTTLE)
-			wakeup(&fs->lfs_iocount);
-		if(fs->lfs_iocount == 0) {
+		if (--fs->lfs_iocount == 0) {
 			lfs_countlocked(&locked_queue_count,
 					&locked_queue_bytes, "lfs_segunlock");
 			wakeup(&locked_queue_count);
@ -309,15 +479,18 @@ lfs_segunlock(struct lfs *fs)
 			/* printf("sleeping on iocount %x == %d\n", sp, sp->seg_iocount); */
 		}
 		if (sync)
-			free(sp, M_SEGMENT);
+			pool_put(&fs->lfs_segpool, sp);
 		if (ckp) {
 			fs->lfs_nactive = 0;
 			/* If we *know* everything's on disk, write both sbs */
+			/* XXX should wait for this one  */
 			if (sync)
-				lfs_writesuper(fs,fs->lfs_sboffs[fs->lfs_activesb]);
+				lfs_writesuper(fs, fs->lfs_sboffs[fs->lfs_activesb]);
+			lfs_writesuper(fs, fs->lfs_sboffs[1 - fs->lfs_activesb]);
+#ifndef LFS_NO_AUTO_SEGCLEAN
+			lfs_auto_segclean(fs);
+#endif
 			fs->lfs_activesb = 1 - fs->lfs_activesb;
-			lfs_writesuper(fs,fs->lfs_sboffs[fs->lfs_activesb]);
-
 			--fs->lfs_seglock;
 			fs->lfs_lockpid = 0;
 			wakeup(&fs->lfs_seglock);
--- a/sys/ufs/lfs/lfs_syscalls.c
+++ b/sys/ufs/lfs/lfs_syscalls.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_syscalls.c,v 1.79 2003/01/24 21:55:28 fvdl Exp $	*/
+/*	$NetBSD: lfs_syscalls.c,v 1.80 2003/02/17 23:48:20 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.79 2003/01/24 21:55:28 fvdl Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.80 2003/02/17 23:48:20 perseant Exp $");

 #define LFS		/* for prototypes in syscallargs.h */

@ -107,6 +107,9 @@ int verbose_debug = 0;
    
 pid_t lfs_cleaner_pid = 0;

+extern int lfs_subsys_pages;
+extern struct simplelock lfs_subsys_lock;
+
 /*
 * Definitions for the buffer free lists.
 */
@ -578,7 +581,7 @@ lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
 	s = splbio();
 	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp; bp = nbp) {
 		nbp = bp->b_freelist.tqe_next;
-		if (bp->b_flags & B_CALL) {
+		if (LFS_IS_MALLOC_BUF(bp)) {
 			if (bp->b_flags & B_BUSY) { /* not bloody likely */
 				bp->b_flags |= B_WANTED;
 				tsleep(bp, PRIBIO+1, "markv", 0);
@ -878,15 +881,12 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
 		syscallarg(fsid_t *) fsidp;
 		syscallarg(u_long) segment;
 	} */ *uap = v;
-	struct proc *p = l->l_proc;
-	CLEANERINFO *cip;
-	SEGUSE *sup;
-	struct buf *bp;
-	struct mount *mntp;
 	struct lfs *fs;
+	struct mount *mntp;
 	fsid_t fsid;
 	int error;
 	unsigned long segnum;
+	struct proc *p = l->l_proc;
 	
 	if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
 		return (error);
@ -899,39 +899,44 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
 	fs = VFSTOUFS(mntp)->um_lfs;
 	segnum = SCARG(uap, segment);
 	
-	if (dtosn(fs, fs->lfs_curseg) == segnum)
-		return (EBUSY);
-	
 	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0) 
 		return (error);
-#ifdef LFS_AGGRESSIVE_SEGLOCK
+
 	lfs_seglock(fs, SEGM_PROT);
-#endif
+	error = lfs_do_segclean(fs, segnum);
+	lfs_segunlock(fs);
+	vfs_unbusy(mntp);
+	return error;
+}
+
+/*
+ * Actually mark the segment clean.
+ * Must be called with the segment lock held.
+ */
+int
+lfs_do_segclean(struct lfs *fs, unsigned long segnum)
+{
+	struct buf *bp;
+	CLEANERINFO *cip;
+	SEGUSE *sup;
+	
+	if (dtosn(fs, fs->lfs_curseg) == segnum) {
+		return (EBUSY);
+	}
+	
 	LFS_SEGENTRY(sup, fs, segnum, bp);
 	if (sup->su_nbytes) {
 		printf("lfs_segclean: not cleaning segment %lu: %d live bytes\n",
 			segnum, sup->su_nbytes);
 		brelse(bp);
-#ifdef LFS_AGGRESSIVE_SEGLOCK
-		lfs_segunlock(fs);
-#endif
-		vfs_unbusy(mntp);
 		return (EBUSY);
 	}
 	if (sup->su_flags & SEGUSE_ACTIVE) {
 		brelse(bp);
-#ifdef LFS_AGGRESSIVE_SEGLOCK
-		lfs_segunlock(fs);
-#endif
-		vfs_unbusy(mntp);
 		return (EBUSY);
 	}
 	if (!(sup->su_flags & SEGUSE_DIRTY)) {
 		brelse(bp);
-#ifdef LFS_AGGRESSIVE_SEGLOCK
-		lfs_segunlock(fs);
-#endif
-		vfs_unbusy(mntp);
 		return (EALREADY);
 	}
 	
@ -948,7 +953,7 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
 	if (fs->lfs_dmeta < 0)
 		fs->lfs_dmeta = 0;
 	sup->su_flags &= ~SEGUSE_DIRTY;
-	(void) LFS_BWRITE_LOG(bp);
+	LFS_WRITESEGENTRY(sup, fs, segnum, bp);
 	
 	LFS_CLEANERINFO(cip, fs, bp);
 	++cip->clean;
@ -958,10 +963,6 @@ sys_lfs_segclean(struct lwp *l, void *v, register_t *retval)
 	cip->avail = fs->lfs_avail - fs->lfs_ravail;
 	(void) LFS_BWRITE_LOG(bp);
 	wakeup(&fs->lfs_avail);
-#ifdef LFS_AGGRESSIVE_SEGLOCK
-	lfs_segunlock(fs);
-#endif
-	vfs_unbusy(mntp);

 	return (0);
 }
@ -1228,6 +1229,7 @@ lfs_fakebuf_iodone(struct buf *bp)

 	if (!(obp->b_flags & (B_DELWRI | B_DONE)))
 		obp->b_flags |= B_INVAL;
+	bp->b_saveaddr = (caddr_t)(VTOI(obp->b_vp)->i_lfs);
 	brelse(obp);
 	lfs_callback(bp);
 }
@ -1256,11 +1258,10 @@ lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uadd
 	if (obp == NULL)
 		panic("lfs_fakebuf: getblk failed");

-#ifndef ALLOW_VFLUSH_CORRUPTION
-	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size);
+	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, size, LFS_NB_CLEAN);
 	error = copyin(uaddr, bp->b_data, size);
 	if (error) {
-		lfs_freebuf(bp);
+		lfs_freebuf(fs, bp);
 		return NULL;
 	}
 	bp->b_saveaddr = obp;
@ -1272,11 +1273,6 @@ lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uadd
 		panic("lfs_fakebuf: gathered bp: %p, ino=%u, lbn=%d",
 		    bp, VTOI(vp)->i_number, lbn);
 #endif
-#else
-	bp = lfs_newbuf(VTOI(vp)->i_lfs, vp, lbn, 0);
-	bp->b_flags |= B_INVAL;
-	bp->b_saveaddr = uaddr;
-#endif
 #if 0
 	bp->b_saveaddr = (caddr_t)fs;
 	++fs->lfs_iocount;
--- a/sys/ufs/lfs/lfs_vfsops.c
+++ b/sys/ufs/lfs/lfs_vfsops.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $	*/
+/*	$NetBSD: lfs_vfsops.c,v 1.91 2003/02/17 23:48:21 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.91 2003/02/17 23:48:21 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -84,6 +84,7 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $")
 #include <sys/kernel.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/kthread.h>
 #include <sys/buf.h>
 #include <sys/device.h>
 #include <sys/mbuf.h>
@ -105,14 +106,32 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.90 2003/01/29 13:14:36 yamt Exp $")
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>

+#include <uvm/uvm.h>
+#include <uvm/uvm_stat.h>
+#include <uvm/uvm_pager.h>
+#include <uvm/uvm_pdaemon.h>
+
 #include <ufs/lfs/lfs.h>
 #include <ufs/lfs/lfs_extern.h>

-int lfs_mountfs(struct vnode *, struct mount *, struct proc *);
+#ifdef LFS_UBC
+#include <miscfs/genfs/genfs.h>
+#include <miscfs/genfs/genfs_node.h>
+static int lfs_gop_write(struct vnode *, struct vm_page **, int, int);
+#endif
+
+static int lfs_mountfs(struct vnode *, struct mount *, struct proc *);

 extern const struct vnodeopv_desc lfs_vnodeop_opv_desc;
 extern const struct vnodeopv_desc lfs_specop_opv_desc;
 extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
+extern int lfs_subsys_pages;    
+extern int  locked_queue_count;
+extern long locked_queue_bytes;
+extern struct simplelock lfs_subsys_lock;
+
+int lfs_writer_daemon = 0;
+int lfs_do_flush = 0;

 const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = {
 	&lfs_vnodeop_opv_desc,
@ -143,15 +162,95 @@ struct vfsops lfs_vfsops = {
 };

 struct genfs_ops lfs_genfsops = {
+#ifdef LFS_UBC
+	lfs_gop_size,
+	ufs_gop_alloc,
+	lfs_gop_write,
+#else
 	NULL,
 	NULL,
 	genfs_compat_gop_write,
+#endif
 };

-struct pool lfs_inode_pool;
+struct pool lfs_inode_pool, lfs_inoext_pool;

-extern int locked_queue_count;
-extern long locked_queue_bytes;
+/*
+ * The writer daemon.  UVM keeps track of how many dirty pages we are holding
+ * in lfs_subsys_pages; the daemon flushes the filesystem when this value
+ * crosses the (user-defined) threshold LFS_MAX_PAGES.
+ *
+ * The thread records its pid in lfs_writer_daemon and then loops forever:
+ * it sleeps on &lfs_writer_daemon, and each time it is woken it (under
+ * LFS_PD) flushes any mounted LFS that has requested pageouts, then calls
+ * lfs_flush() for as long as lfs_do_flush is set or one of the
+ * locked-queue / page limits is exceeded, and finally wakes anybody
+ * sleeping on &lfs_subsys_pages.  The argument is unused.
+ */
+static void
+lfs_writerd(void *arg)
+{
+#ifdef LFS_PD
+	struct mount *mp, *nmp;
+	struct lfs *fs;
+#endif
+
+	lfs_writer_daemon = curproc->p_pid;
+
+	for (;;) {
+		tsleep(&lfs_writer_daemon, PVM, "lfswriter", 0);
+
+#ifdef LFS_PD
+		/*
+		 * Look through the list of LFSs to see if any of them
+		 * have requested pageouts.
+		 */
+		simple_lock(&mountlist_slock);
+		for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
+		     mp = nmp) {
+			/*
+			 * mountlist_slock is passed as the interlock: if
+			 * vfs_busy() fails we still hold it and can step to
+			 * the next mount; if it succeeds the interlock has
+			 * been released for us.
+			 */
+			if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock)) {
+				nmp = mp->mnt_list.cqe_next;
+				continue;
+			}
+			if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS,
+				    MFSNAMELEN) == 0) {
+				fs = ((struct ufsmount *)mp->mnt_data)->ufsmount_u.lfs;
+				if (fs->lfs_pdflush ||
+				    !TAILQ_EMPTY(&fs->lfs_pchainhd)) {
+					fs->lfs_pdflush = 0;
+					/*
+					 * mountlist_slock is not held here,
+					 * so flush without touching it; it is
+					 * retaken exactly once below.
+					 */
+					lfs_flush_fs(fs, 0);
+				}
+			}
+
+			/* Retake the list lock before reading the next link. */
+			simple_lock(&mountlist_slock);
+			nmp = mp->mnt_list.cqe_next;
+			vfs_unbusy(mp);
+		}
+		simple_unlock(&mountlist_slock);
+#endif /* LFS_PD */
+
+		/*
+		 * If global state wants a flush, flush everything.
+		 */
+		while (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
+			locked_queue_bytes > LFS_MAX_BYTES ||
+			lfs_subsys_pages > LFS_MAX_PAGES) {
+
+#ifdef DEBUG_LFS_FLUSH
+			if (lfs_do_flush)
+				printf("daemon: lfs_do_flush\n");
+			if (locked_queue_count > LFS_MAX_BUFS)
+				printf("daemon: lqc = %d, max %d\n",
+					locked_queue_count, LFS_MAX_BUFS);
+			if (locked_queue_bytes > LFS_MAX_BYTES)
+				printf("daemon: lqb = %ld, max %d\n",
+					locked_queue_bytes, LFS_MAX_BYTES);
+			if (lfs_subsys_pages > LFS_MAX_PAGES)
+				printf("daemon: lssp = %d, max %d\n",
+					lfs_subsys_pages, LFS_MAX_PAGES);
+#endif /* DEBUG_LFS_FLUSH */
+			lfs_flush(NULL, 0);
+			lfs_do_flush = 0;
+		}
+		wakeup(&lfs_subsys_pages);
+	}
+	/* NOTREACHED */
+}

 /*
 * Initialize the filesystem, most work done by ufs_init.
@ -166,9 +265,12 @@ lfs_init()
 	 */
 	pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
 		  "lfsinopl", &pool_allocator_nointr);
+	pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0,
+		  "lfsinoextpl", &pool_allocator_nointr);
 #ifdef DEBUG
 	memset(lfs_log, 0, sizeof(lfs_log));
 #endif
+	simple_lock_init(&lfs_subsys_lock);
 }

 void
@ -452,11 +554,11 @@ update_meta(struct lfs *fs, ino_t ino, int version, daddr_t lbn,
 		}
 #endif
 		sup->su_nbytes -= size;
-		LFS_BWRITE_LOG(bp);
+		LFS_WRITESEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, odaddr)), bp);
 	}
 	LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp);
 	sup->su_nbytes += size;
-	LFS_BWRITE_LOG(bp);
+	LFS_WRITESEGENTRY(sup, fs, dtosn(fs, ndaddr), bp);

 	/* Fix this so it can be released */
 	/* ip->i_lfs_effnblks = ip->i_ffs_blocks; */
@ -544,12 +646,16 @@ update_inoblk(struct lfs *fs, daddr_t offset, struct ucred *cred,
 					LFS_SEGENTRY(sup, fs, dtosn(fs, daddr),
 						     ibp);
 					sup->su_nbytes -= DINODE_SIZE;
-					LFS_BWRITE_LOG(ibp);
+					LFS_WRITESEGENTRY(sup, fs,
+							  dtosn(fs, daddr),
+							  ibp);
 				}
 				LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)),
 					     ibp);
 				sup->su_nbytes += DINODE_SIZE;
-				LFS_BWRITE_LOG(ibp);
+				LFS_WRITESEGENTRY(sup, fs,
+					          dtosn(fs, dbtofsb(fs, dbp->b_blkno)),
+						  ibp);
 			}
 		}
 	}
@ -969,7 +1075,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	fs->lfs_dirops = 0;
 	fs->lfs_nadirop = 0;
 	fs->lfs_seglock = 0;
-	lockinit(&fs->lfs_freelock, PINOD, "lfs_freelock", 0, 0);
+	fs->lfs_pdflush = 0;
 	lockinit(&fs->lfs_fraglock, PINOD, "lfs_fraglock", 0, 0);

 	/* Set the file system readonly/modify bits. */
@ -985,6 +1091,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	mp->mnt_stat.f_iosize = fs->lfs_bsize;
 	mp->mnt_maxsymlinklen = fs->lfs_maxsymlinklen;
 	mp->mnt_flag |= MNT_LOCAL;
+	mp->mnt_fs_bshift = fs->lfs_bshift;
 	ump->um_flags = 0;
 	ump->um_mountp = mp;
 	ump->um_dev = dev;
@ -997,6 +1104,16 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 		ump->um_quotas[i] = NULLVP;
 	devvp->v_specmountpoint = mp;

+	/* Set up reserved memory for pageout */
+	lfs_setup_resblks(fs);
+	/* Set up vdirop tailq */
+	TAILQ_INIT(&fs->lfs_dchainhd);
+	/* and paging tailq */
+	TAILQ_INIT(&fs->lfs_pchainhd);
+#if 0 /* XXXDEBUG */
+	fs->lfs_lastwrit = dbtofsb(fs, fs->lfs_offset - 1);
+#endif
+
 	/*
 	 * We use the ifile vnode for almost every operation.  Instead of
 	 * retrieving it from the hash table each time we retrieve it here,
@ -1012,6 +1129,32 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	fs->lfs_ivnode = vp;
 	VREF(vp);

+	/* Set up segment usage flags for the autocleaner. */
+	fs->lfs_suflags = (u_int32_t **)malloc(2 * sizeof(u_int32_t *),
+						M_SEGMENT, M_WAITOK);
+	fs->lfs_suflags[0] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t),
+						 M_SEGMENT, M_WAITOK);
+	fs->lfs_suflags[1] = (u_int32_t *)malloc(fs->lfs_nseg * sizeof(u_int32_t),
+						 M_SEGMENT, M_WAITOK);
+	memset(fs->lfs_suflags[1], 0, fs->lfs_nseg * sizeof(u_int32_t));
+	for (i = 0; i < fs->lfs_nseg; i++) {
+		LFS_SEGENTRY(sup, fs, i, bp);
+		if (!ronly && sup->su_nbytes == 0 &&
+		    !(sup->su_flags & SEGUSE_EMPTY)) {
+			sup->su_flags |= SEGUSE_EMPTY;
+			fs->lfs_suflags[0][i] = sup->su_flags;
+			LFS_WRITESEGENTRY(sup, fs, i, bp);
+		} else if (!ronly && !(sup->su_nbytes == 0) &&
+			 (sup->su_flags & SEGUSE_EMPTY)) {
+			sup->su_flags &= ~SEGUSE_EMPTY;
+			fs->lfs_suflags[0][i] = sup->su_flags;
+			LFS_WRITESEGENTRY(sup, fs, i, bp);
+		} else {
+			fs->lfs_suflags[0][i] = sup->su_flags;
+			brelse(bp);
+		}
+	}
+
 	/*
 	 * Roll forward.
 	 *
@ -1045,7 +1188,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 		if (!(sup->su_flags & SEGUSE_DIRTY))
 			--fs->lfs_nclean;
 		sup->su_flags |= SEGUSE_DIRTY;
-		(void) LFS_BWRITE_LOG(bp);
+		LFS_WRITESEGENTRY(sup, fs, dtosn(fs, offset), bp);
 		while ((offset = check_segsum(fs, offset, cred, CHECK_CKSUM,
 					      &flags, p)) > 0)
 		{
@ -1055,7 +1198,8 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 				if (!(sup->su_flags & SEGUSE_DIRTY))
 					--fs->lfs_nclean;
 				sup->su_flags |= SEGUSE_DIRTY;
-				(void) LFS_BWRITE_LOG(bp);
+				LFS_WRITESEGENTRY(sup, fs, dtosn(fs, oldoffset),
+					     bp); 
 			}

 #ifdef DEBUG_LFS_RFW
@ -1149,7 +1293,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	 */
        LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); 
        sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
-        (void) LFS_BWRITE_LOG(bp); /* Ifile */
+        LFS_WRITESEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp);  /* Ifile */

 	/* Now that roll-forward is done, unlock the Ifile */
 	vput(vp);
@ -1180,6 +1324,12 @@ out:
 		free(ump, M_UFSMNT);
 		mp->mnt_data = NULL;
 	}
+
+	/* Start the pagedaemon-anticipating daemon */
+        if (lfs_writer_daemon == 0 &&
+	    kthread_create1(lfs_writerd, NULL, NULL, "lfs_writer") != 0)
+                panic("fork lfs_writer");
+
 	return (error);
 }

@ -1259,12 +1409,18 @@ lfs_unmount(struct mount *mp, int mntflags, struct proc *p)
 	    ronly ? FREAD : FREAD|FWRITE, NOCRED, p);
 	vput(ump->um_devvp);

-	/* XXX KS - wake up the cleaner so it can die */
+	/* wake up the cleaner so it can die */
 	wakeup(&fs->lfs_nextseg);
 	wakeup(&lfs_allclean_wakeup);

+	/* Free per-mount data structures */
+	free(fs->lfs_suflags[0], M_SEGMENT);
+	free(fs->lfs_suflags[1], M_SEGMENT);
+	free(fs->lfs_suflags, M_SEGMENT);
+	lfs_free_resblks(fs);
 	free(fs, M_UFSMNT);
 	free(ump, M_UFSMNT);
+
 	mp->mnt_data = NULL;
 	mp->mnt_flag &= ~MNT_LOCAL;
 	return (error);
@ -1586,11 +1742,251 @@ lfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, si
 		if (lfs_dostats == 0)
 			memset(&lfs_stats,0,sizeof(lfs_stats));
 		return 0;
-	case LFS_STATS:
-		return (sysctl_rdstruct(oldp, oldlenp, newp,
-					&lfs_stats, sizeof(lfs_stats)));
 	default:
 		return (EOPNOTSUPP);
 	}
 	/* NOTREACHED */
 }
+
+#ifdef LFS_UBC
+/*
+ * lfs_gop_write functions exactly like genfs_gop_write, except that
+ * (1) it requires the seglock to be held by its caller, and sp->fip
+ *     to be properly initialized (it will return without re-initializing
+ *     sp->fip, and without calling lfs_writeseg).
+ * (2) it uses the remaining space in the segment, rather than VOP_BMAP,
+ *     to determine how large a block it can write at once (though it does
+ *     still use VOP_BMAP to find holes in the file);
+ * (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
+ *     (leaving lfs_writeseg to deal with the cluster blocks, so we might
+ *     now have clusters of clusters, ick.)
+ *
+ * Writes for the Ifile vnode are passed to genfs_compat_gop_write().
+ * Returns EAGAIN (after unbusying and reactivating the pages) if the
+ * current process does not hold the segment lock; otherwise returns 0,
+ * with any VOP_BMAP error recorded in the master buffer rather than in
+ * the return value.
+ */
+static int
+lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
+{
+	int i, s, error, run;
+	int fs_bshift, dev_bshift;
+	vaddr_t kva;
+	off_t eof, offset, startoffset;
+	size_t bytes, iobytes, skipbytes;
+	daddr_t lbn, blkno;
+	struct vm_page *pg;
+	struct buf *mbp, *bp;
+	struct vnode *devvp;
+	struct inode *ip = VTOI(vp);
+	struct lfs *fs = ip->i_lfs;
+	struct segment *sp = fs->lfs_sp;
+	UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
+
+	/* The Ifile lives in the buffer cache */
+	if (vp == fs->lfs_ivnode)
+		return genfs_compat_gop_write(vp, pgs, npages, flags);
+
+	/*
+	 * Sometimes things slip past the filters in lfs_putpages,
+	 * and the pagedaemon tries to write pages---problem is
+	 * that the pagedaemon never acquires the segment lock.
+	 *
+	 * Unbusy and unclean the pages, and put them on the ACTIVE
+	 * queue under the hypothesis that they couldn't have got here
+	 * unless they were modified *quite* recently.
+	 *
+	 * XXXUBC that last statement is an oversimplification of course.
+	 */
+	if (!(fs->lfs_seglock) || fs->lfs_lockpid != curproc->p_pid) {
+		simple_lock(&vp->v_interlock);
+#ifdef DEBUG
+		printf("lfs_gop_write: seglock not held\n");
+#endif
+		uvm_lock_pageq();
+		for (i = 0; i < npages; i++) {
+			if (pgs[i]->flags & PG_WANTED)
+				wakeup(pgs[i]);
+			if (pgs[i]->flags & PG_PAGEOUT)
+				uvmexp.paging--;
+			pgs[i]->flags &= ~(PG_BUSY|PG_CLEAN|PG_WANTED|PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
+			/* `pg' is not assigned yet here; name the page itself */
+			UVM_PAGE_OWN(pgs[i], NULL);
+			uvm_pageactivate(pgs[i]);
+		}
+		uvm_page_unbusy(pgs, npages);
+		uvm_unlock_pageq();
+		simple_unlock(&vp->v_interlock);
+		return EAGAIN;
+	}
+
+	UVMHIST_LOG(ubchist, "vp %p pgs %p npages %d flags 0x%x",
+	    vp, pgs, npages, flags);
+
+	GOP_SIZE(vp, vp->v_size, &eof, GOP_SIZE_WRITE);
+
+	if (vp->v_type == VREG) {
+		fs_bshift = vp->v_mount->mnt_fs_bshift;
+		dev_bshift = vp->v_mount->mnt_dev_bshift;
+	} else {
+		fs_bshift = DEV_BSHIFT;
+		dev_bshift = DEV_BSHIFT;
+	}
+	error = 0;
+	pg = pgs[0];
+	startoffset = pg->offset;
+	bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
+	skipbytes = 0;
+
+	KASSERT(bytes != 0);
+
+	/* Swap PG_DELWRI for PG_PAGEOUT */
+	for (i = 0; i < npages; i++)
+		if (pgs[i]->flags & PG_DELWRI) {
+			KASSERT(!(pgs[i]->flags & PG_PAGEOUT));
+			pgs[i]->flags &= ~PG_DELWRI;
+			pgs[i]->flags |= PG_PAGEOUT;
+			uvmexp.paging++;
+		}
+
+	/*
+	 * Check to make sure we're starting on a block boundary.
+	 * We'll check later to make sure we always write entire
+	 * blocks (or fragments).
+	 */
+	if (startoffset & fs->lfs_bmask)
+		printf("%" PRId64 " & %" PRId64 " = %" PRId64 "\n",
+			startoffset, fs->lfs_bmask,
+			startoffset & fs->lfs_bmask);
+	KASSERT((startoffset & fs->lfs_bmask) == 0);
+	if (bytes & fs->lfs_ffmask) {
+		printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes);
+		panic("lfs_gop_write: non-integer blocks");
+	}
+
+	kva = uvm_pagermapin(pgs, npages,
+	    UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
+
+	s = splbio();
+	simple_lock(&global_v_numoutput_slock);
+	vp->v_numoutput += 2; /* one for biodone, one for aiodone */
+	simple_unlock(&global_v_numoutput_slock);
+	mbp = pool_get(&bufpool, PR_WAITOK);
+	splx(s);
+
+	/* mbp is the master buffer covering the whole mapped range */
+	memset(mbp, 0, sizeof(*mbp));
+	UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x",
+	    vp, mbp, vp->v_numoutput, bytes);
+	mbp->b_bufsize = npages << PAGE_SHIFT;
+	mbp->b_data = (void *)kva;
+	mbp->b_resid = mbp->b_bcount = bytes;
+	mbp->b_flags = B_BUSY|B_WRITE|B_AGE|B_CALL;
+	mbp->b_iodone = uvm_aio_biodone;
+	mbp->b_vp = vp;
+	LIST_INIT(&mbp->b_dep);
+
+	bp = NULL;
+	for (offset = startoffset;
+	    bytes > 0;
+	    offset += iobytes, bytes -= iobytes) {
+		lbn = offset >> fs_bshift;
+		error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
+		if (error) {
+			UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0);
+			/* give up on everything that is left */
+			skipbytes += bytes;
+			bytes = 0;
+			break;
+		}
+
+		iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
+		    bytes);
+		/* a block number of -1 is a hole: nothing to write */
+		if (blkno == (daddr_t)-1) {
+			skipbytes += iobytes;
+			continue;
+		}
+
+		/*
+		 * Discover how much we can really pack into this buffer.
+		 */
+#ifdef LFS_UBC_BIGBUFS
+		/* If no room in the current segment, finish it up */
+		if (sp->sum_bytes_left < sizeof(int32_t) ||
+		    sp->seg_bytes_left < MIN(iobytes, (1 << fs->lfs_bshift))) {
+			int version;
+
+			lfs_updatemeta(sp);
+
+			version = sp->fip->fi_version;
+			(void) lfs_writeseg(fs, sp);
+
+			sp->fip->fi_version = version;
+			sp->fip->fi_ino = ip->i_number;
+			/* Add the current file to the segment summary. */
+			++((SEGSUM *)(sp->segsum))->ss_nfinfo;
+			sp->sum_bytes_left -= FINFOSIZE;
+		}
+		iobytes = MIN(iobytes, ((sp->seg_bytes_left >> fs_bshift) << fs_bshift));
+#else
+		iobytes = MIN(iobytes, (1 << fs_bshift));
+		if (iobytes != blksize(fs, ip, lblkno(fs, offset))) {
+			printf("iobytes = %" PRId64 ", blk = %" PRId64 "\n",
+				(int64_t)iobytes,
+				(int64_t)blksize(fs, ip, lblkno(fs, offset)));
+		}
+		KASSERT(iobytes == blksize(fs, ip, lblkno(fs, offset)));
+#endif
+		KASSERT(iobytes > 0);
+
+		/* if it's really one i/o, don't make a second buf */
+		if (offset == startoffset && iobytes == bytes) {
+			bp = mbp;
+			/* printf("bp is mbp\n"); */
+			/* correct overcount if there is no second buffer */
+			s = splbio();
+			simple_lock(&global_v_numoutput_slock);
+			--vp->v_numoutput;
+			simple_unlock(&global_v_numoutput_slock);
+			splx(s);
+		} else {
+			/* printf("bp is not mbp\n"); */
+			s = splbio();
+			bp = pool_get(&bufpool, PR_WAITOK);
+			UVMHIST_LOG(ubchist, "vp %p bp %p num now %d",
+			    vp, bp, vp->v_numoutput, 0);
+			memset(bp, 0, sizeof(*bp));
+			splx(s);
+			bp->b_data = (char *)kva +
+			    (vaddr_t)(offset - pg->offset);
+			bp->b_resid = bp->b_bcount = iobytes;
+			bp->b_flags = B_BUSY|B_WRITE|B_CALL;
+			bp->b_iodone = uvm_aio_biodone1;
+			LIST_INIT(&bp->b_dep);
+		}
+
+		/* XXX This is silly ... is this necessary? */
+		bp->b_vp = NULL;
+		s = splbio();
+		bgetvp(vp, bp);
+		splx(s);
+
+		bp->b_lblkno = lblkno(fs, offset);
+		bp->b_private = mbp;
+		if (devvp->v_type == VBLK) {
+			bp->b_dev = devvp->v_rdev;
+		}
+		VOP_BWRITE(bp);
+		while(lfs_gatherblock(sp, bp, NULL))
+			;
+	}
+
+	if (skipbytes) {
+		UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0);
+		s = splbio();
+		if (error) {
+			mbp->b_flags |= B_ERROR;
+			mbp->b_error = error;
+		}
+		/* skipped bytes count as done; finish mbp if nothing is left */
+		mbp->b_resid -= skipbytes;
+		if (mbp->b_resid == 0) {
+			biodone(mbp);
+		}
+		splx(s);
+	}
+	UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0);
+	return (0);
+}
+#endif /* LFS_UBC */
--- a/sys/ufs/lfs/lfs_vnops.c
+++ b/sys/ufs/lfs/lfs_vnops.c
@ -1,7 +1,7 @@
-/*	$NetBSD: lfs_vnops.c,v 1.83 2003/02/03 00:32:35 perseant Exp $	*/
+/*	$NetBSD: lfs_vnops.c,v 1.84 2003/02/17 23:48:22 perseant Exp $	*/

 /*-
- * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
+ * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.83 2003/02/03 00:32:35 perseant Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.84 2003/02/17 23:48:22 perseant Exp $");

 #include <sys/param.h>
 #include <sys/systm.h>
@ -97,9 +97,19 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.83 2003/02/03 00:32:35 perseant Exp
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>

+#include <uvm/uvm.h>
+#ifdef LFS_UBC
+# include <uvm/uvm_pmap.h>
+# include <uvm/uvm_stat.h>
+# include <uvm/uvm_pager.h>
+#endif
+
 #include <ufs/lfs/lfs.h>
 #include <ufs/lfs/lfs_extern.h>

+extern int lfs_writer_daemon;
+extern int lfs_subsys_pages;
+
 /* Global vfs data structures for lfs. */
 int (**lfs_vnodeop_p)(void *);
 const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
@ -121,7 +131,11 @@ const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
 	{ &vop_poll_desc, ufs_poll },			/* poll */
 	{ &vop_kqfilter_desc, genfs_kqfilter },		/* kqfilter */
 	{ &vop_revoke_desc, ufs_revoke },		/* revoke */
+#ifdef LFS_UBC
+	{ &vop_mmap_desc, lfs_mmap },			/* mmap */
+#else
 	{ &vop_mmap_desc, ufs_mmap },			/* mmap */
+#endif
 	{ &vop_fsync_desc, lfs_fsync },			/* fsync */
 	{ &vop_seek_desc, ufs_seek },			/* seek */
 	{ &vop_remove_desc, lfs_remove },		/* remove */
@ -150,7 +164,11 @@ const struct vnodeopv_entry_desc lfs_vnodeop_entries[] = {
 	{ &vop_truncate_desc, lfs_truncate },		/* truncate */
 	{ &vop_update_desc, lfs_update },		/* update */
 	{ &vop_bwrite_desc, lfs_bwrite },		/* bwrite */
+#ifdef LFS_UBC
+	{ &vop_getpages_desc, genfs_getpages },		/* getpages */
+#else
 	{ &vop_getpages_desc, lfs_getpages },		/* getpages */
+#endif
 	{ &vop_putpages_desc, lfs_putpages },		/* putpages */
 	{ NULL, NULL }
 };
@ -293,37 +311,46 @@ lfs_fsync(void *v)
 		struct proc *a_p;
 	} */ *ap = v;
 	struct vnode *vp = ap->a_vp;
-	int error;
-	
-	/* Ignore the trickle syncer */
-	if (ap->a_flags & FSYNC_LAZY)
+	int error, wait;
+
+  	/*
+	 * Trickle sync checks for need to do a checkpoint after possible
+	 * activity from the pagedaemon.
+  	 */
+	if (ap->a_flags & FSYNC_LAZY) {
+		wakeup(&lfs_writer_daemon);
 		return 0;
-
-	simple_lock(&vp->v_interlock);
-	error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
-                    round_page(ap->a_offhi), PGO_CLEANIT | PGO_SYNCIO);
-	if (error)
-		return error;
-	error = VOP_UPDATE(vp, NULL, NULL,
-			   (ap->a_flags & FSYNC_WAIT) != 0 ? UPDATE_WAIT : 0);
-#ifdef DEBUG
-	/*
-	 * If we were called from vinvalbuf and lfs_update
-	 * didn't flush all our buffers, we're in trouble.
-	 */
-	if ((ap->a_flags & FSYNC_WAIT) && LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
-		struct buf *bp;
-
-		bp = LIST_FIRST(&vp->v_dirtyblkhd);
-		printf("lfs_fsync: ino %d failed to sync", VTOI(vp)->i_number);
-		printf("lfs_fsync: iocount = %d\n", VTOI(vp)->i_lfs->lfs_iocount);
-		printf("lfs_fsync: flags are 0x%x, numoutput=%d\n",
-			VTOI(vp)->i_flag, vp->v_numoutput);
-		printf("lfs_fsync: writecount=%ld\n", vp->v_writecount);
-		printf("lfs_fsync: first bp: %p, flags=0x%lx, lbn=%" PRId64 "\n",
-			bp, bp->b_flags, bp->b_lblkno);
 	}
+
+	wait = (ap->a_flags & FSYNC_WAIT);
+	do {
+#ifdef DEBUG
+  		struct buf *bp;
 #endif
+
+		simple_lock(&vp->v_interlock);
+		error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
+			     	round_page(ap->a_offhi),
+			     	PGO_CLEANIT | (wait ? PGO_SYNCIO : 0));
+		if (error)
+			return error;
+		error = VOP_UPDATE(vp, NULL, NULL, wait ? UPDATE_WAIT : 0);
+		if (wait && error == 0 && !VPISEMPTY(vp)) {
+#ifdef DEBUG
+			printf("lfs_fsync: reflushing ino %d\n",
+				VTOI(vp)->i_number);
+			printf("vflags %x iflags %x npages %d\n",
+				vp->v_flag, VTOI(vp)->i_flag,
+				vp->v_uobj.uo_npages);
+			LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
+				printf("%" PRId64 " (%lx)", bp->b_lblkno,
+					bp->b_flags);
+			printf("\n");
+#endif
+			VTOI(vp)->i_flag |= IN_MODIFIED;
+		}
+	} while (wait && error == 0 && !VPISEMPTY(vp));
+
 	return error;
 }

@ -361,6 +388,7 @@ lfs_inactive(void *v)
 #define	SET_DIROP2(vp, vp2)	lfs_set_dirop((vp), (vp2))
 static int lfs_set_dirop(struct vnode *, struct vnode *);
 extern int lfs_dirvcount;
+extern int lfs_do_flush;

 #define	NRESERVE(fs)	(btofsb(fs, (NIADDR + 3 + (2 * NIADDR + 3)) << fs->lfs_bshift))

@ -383,17 +411,15 @@ lfs_set_dirop(struct vnode *vp, struct vnode *vp2)

 	if (fs->lfs_dirops == 0)
 		lfs_check(vp, LFS_UNUSED_LBN, 0);
-	while (fs->lfs_writer || lfs_dirvcount > LFS_MAXDIROP) {
+	while (fs->lfs_writer || lfs_dirvcount > LFS_MAX_DIROP) {
 		if (fs->lfs_writer)
 			tsleep(&fs->lfs_dirops, PRIBIO + 1, "lfs_sdirop", 0);
-		if (lfs_dirvcount > LFS_MAXDIROP && fs->lfs_dirops == 0) {
-                	++fs->lfs_writer;
-                	lfs_flush(fs, 0);
-                	if (--fs->lfs_writer == 0)
-                        	wakeup(&fs->lfs_dirops);
+		if (lfs_dirvcount > LFS_MAX_DIROP && fs->lfs_dirops == 0) {
+			wakeup(&lfs_writer_daemon);
+			preempt(NULL);
 		}

-		if (lfs_dirvcount > LFS_MAXDIROP) {		
+		if (lfs_dirvcount > LFS_MAX_DIROP) {
 #ifdef DEBUG_LFS
 			printf("lfs_set_dirop: sleeping with dirops=%d, "
 			       "dirvcount=%d\n", fs->lfs_dirops,
@ -438,15 +464,19 @@ unreserve:
 }

 #define	MARK_VNODE(dvp)  do {                                           \
+	struct inode *_ip = VTOI(dvp);					\
+	struct lfs *_fs = _ip->i_lfs;					\
+									\
        if (!((dvp)->v_flag & VDIROP)) {				\
                (void)lfs_vref(dvp);					\
 		++lfs_dirvcount;					\
+		TAILQ_INSERT_TAIL(&_fs->lfs_dchainhd, _ip, i_lfs_dchain); \
 	}								\
        (dvp)->v_flag |= VDIROP;					\
-	if (!(VTOI(dvp)->i_flag & IN_ADIROP)) {				\
-		++VTOI(dvp)->i_lfs->lfs_nadirop;			\
+	if (!(_ip->i_flag & IN_ADIROP)) {				\
+		++_fs->lfs_nadirop;					\
 	}								\
-	VTOI(dvp)->i_flag |= IN_ADIROP;					\
+	_ip->i_flag |= IN_ADIROP;					\
 } while (0)

 #define UNMARK_VNODE(vp) lfs_unmark_vnode(vp)
@ -656,22 +686,24 @@ lfs_rmdir(void *v)
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap = v;
+	struct vnode *vp;
 	int error;

+	vp = ap->a_vp;
 	if ((error = SET_DIROP2(ap->a_dvp, ap->a_vp)) != 0) {
 		vrele(ap->a_dvp);
 		if (ap->a_vp != ap->a_dvp)
 			VOP_UNLOCK(ap->a_dvp, 0);
-		vput(ap->a_vp);
+		vput(vp);
 		return error;
 	}
 	MARK_VNODE(ap->a_dvp);
-	MARK_VNODE(ap->a_vp);
+	MARK_VNODE(vp);
 	error = ufs_rmdir(ap);
 	UNMARK_VNODE(ap->a_dvp);
-	UNMARK_VNODE(ap->a_vp);
+	UNMARK_VNODE(vp);

-	SET_ENDOP2(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, ap->a_vp, "rmdir");
+	SET_ENDOP2(VTOI(ap->a_dvp)->i_lfs, ap->a_dvp, vp, "rmdir");
 	return (error);
 }

@ -844,7 +876,7 @@ lfs_getattr(void *v)
 		vap->va_blocksize = MAXBSIZE;
 	else
 		vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
-	vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_ffs_blocks);
+	vap->va_bytes = fsbtob(fs, (u_quad_t)ip->i_lfs_effnblks);
 	vap->va_type = vp->v_type;
 	vap->va_filerev = ip->i_modrev;
 	return (0);
@ -964,18 +996,22 @@ lfs_reclaim(void *v)
 		struct proc *a_p;
 	} */ *ap = v;
 	struct vnode *vp = ap->a_vp;
+	struct inode *ip = VTOI(vp);
 	int error;

-	KASSERT(VTOI(vp)->i_ffs_nlink == VTOI(vp)->i_ffs_effnlink);
+	KASSERT(ip->i_ffs_nlink == ip->i_ffs_effnlink);

-	LFS_CLR_UINO(VTOI(vp), IN_ALLMOD);
+	LFS_CLR_UINO(ip, IN_ALLMOD);
 	if ((error = ufs_reclaim(vp, ap->a_p)))
 		return (error);
+	pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
+	ip->inode_ext.lfs = NULL;
 	pool_put(&lfs_inode_pool, vp->v_data);
 	vp->v_data = NULL;
 	return (0);
 }

+#ifndef LFS_UBC
 int
 lfs_getpages(void *v)
 {
@ -1004,3 +1040,645 @@ lfs_putpages(void *v)
 	error = genfs_putpages(v);
 	return error;
 }
+
+#else /* LFS_UBC */
+
+/*
+ * Make sure that for all pages in every block in the given range,
+ * either all are dirty or all are clean.
+ *
+ *	fs		filesystem; supplies block size (lfs_bsize) and
+ *			block mask (lfs_bmask).
+ *	vp		vnode whose page object (vp->v_uobj) is examined.
+ *	startoffset	start of the (block-aligned) range to examine.
+ *	endoffset	end of the range.
+ *	blkeof		the offset-driven walk stops at
+ *			MIN(blkeof, endoffset).
+ *	flags		PGO_* flags from the putpages request; with
+ *			PGO_FREE, dirty pages are taken off their page
+ *			queue and marked PG_DELWRI.
+ *
+ * Returns nonzero if any page examined was dirty, zero otherwise.
+ *
+ * vp->v_interlock is passed to UVM_UNLOCK_AND_WAIT and re-taken
+ * afterwards, so it must be held on entry and is held again on return.
+ *
+ * Putting the vnode on the paging chain and marking it IN_PAGING is
+ * not done yet; that code is still under "#ifdef notyet" below.
+ */
+static int
+check_dirty(struct lfs *fs, struct vnode *vp,
+	    off_t startoffset, off_t endoffset, off_t blkeof,
+	    int flags)
+{
+	int by_list;
+	struct vm_page *curpg, *pgs[MAXBSIZE / PAGE_SIZE], *pg;
+	struct lwp *l = curlwp ? curlwp : &lwp0;
+	off_t soff;
+	voff_t off;
+	int i, dirty, tdirty, nonexistent, any_dirty;
+	int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
+
+  top:
+	/*
+	 * Walk the object's page list when the object holds few pages
+	 * relative to the size of the range; otherwise look pages up
+	 * by offset.
+	 */
+	by_list = (vp->v_uobj.uo_npages <=
+		   ((endoffset - startoffset) >> PAGE_SHIFT) *
+		   UVM_PAGE_HASH_PENALTY);
+	any_dirty = 0;
+
+	if (by_list) {
+		curpg = TAILQ_FIRST(&vp->v_uobj.memq);
+		PHOLD(l);
+	} else {
+		soff = startoffset;
+	}
+	while (by_list || soff < MIN(blkeof, endoffset)) {
+		if (by_list) {
+			/* Skip ahead to the first page of a block. */
+			if (pages_per_block > 1) {
+				while (curpg && (curpg->offset & fs->lfs_bmask))
+					curpg = TAILQ_NEXT(curpg, listq);
+			}
+			if (curpg == NULL)
+				break;
+			soff = curpg->offset;
+		}
+
+		/*
+		 * Mark all pages in extended range busy; find out if any
+		 * of them are dirty.
+		 */
+		nonexistent = dirty = 0;
+		for (i = 0; i == 0 || i < pages_per_block; i++) {
+			if (by_list && pages_per_block <= 1) {
+				pgs[i] = pg = curpg;
+			} else {
+				off = soff + (i << PAGE_SHIFT);
+				pgs[i] = pg = uvm_pagelookup(&vp->v_uobj, off);
+				if (pg == NULL) {
+					++nonexistent;
+					continue;
+				}
+			}
+			KASSERT(pg != NULL);
+			while (pg->flags & PG_BUSY) {
+				pg->flags |= PG_WANTED;
+				UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0,
+						    "lfsput", 0);
+				simple_lock(&vp->v_interlock);
+				if (by_list) {
+					/*
+					 * The page list may have changed
+					 * while we slept, so start over.
+					 * First undo what we did to the
+					 * earlier pages of this block:
+					 * their pmap modified bits have
+					 * already been cleared, so keep
+					 * any dirtiness in the page flags,
+					 * then unbusy them.  Also drop the
+					 * hold taken above; "top" takes it
+					 * again if it is still needed.
+					 */
+					while (--i >= 0) {
+						if (pgs[i] == NULL)
+							continue;
+						if (dirty)
+							pgs[i]->flags &= ~PG_CLEAN;
+						if (pgs[i]->flags & PG_WANTED)
+							wakeup(pgs[i]);
+						pgs[i]->flags &=
+						    ~(PG_WANTED|PG_BUSY);
+						UVM_PAGE_OWN(pgs[i], NULL);
+					}
+					PRELE(l);
+					goto top;
+				}
+			}
+			pg->flags |= PG_BUSY;
+			UVM_PAGE_OWN(pg, "lfs_putpages");
+
+			pmap_page_protect(pg, VM_PROT_NONE);
+			tdirty = (pmap_clear_modify(pg) ||
+				  (pg->flags & PG_CLEAN) == 0);
+			dirty += tdirty;
+		}
+		/* No page of this block is resident; move to the next. */
+		if (pages_per_block > 0 && nonexistent >= pages_per_block) {
+			if (by_list) {
+				curpg = TAILQ_NEXT(curpg, listq);
+			} else {
+				soff += fs->lfs_bsize;
+			}
+			continue;
+		}
+
+		any_dirty += dirty;
+		KASSERT(nonexistent == 0);
+
+		/*
+		 * If any are dirty make all dirty; unbusy them,
+		 * but if we were asked to clean, take them off
+		 * of their queue so the pagedaemon doesn't bother
+		 * us about them while they're on their way to disk.
+		 *
+		 * (XXXUBC the page is now on *no* page queue.)
+		 *
+		 * NOTE(review): on the dirty-without-PGO_FREE path the
+		 * owner tag set by UVM_PAGE_OWN(pg, "lfs_putpages") above
+		 * is never cleared here -- confirm this is intended.
+		 */
+		for (i = 0; i == 0 || i < pages_per_block; i++) {
+			pg = pgs[i];
+			KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
+			if (dirty) {
+				pg->flags &= ~PG_CLEAN;
+				if (flags & PGO_FREE) {
+					/* XXXUBC need better way to update */
+					lfs_subsys_pages += MIN(1, pages_per_block);
+					uvm_lock_pageq();
+					UVM_PAGE_OWN(pg, NULL);
+					uvm_pagedequeue(pg);
+					/* Suspended write flag */
+					pg->flags |= PG_DELWRI;
+					uvm_unlock_pageq();
+				}
+			} else {
+				UVM_PAGE_OWN(pg, NULL);
+			}
+			if (pg->flags & PG_WANTED)
+				wakeup(pg);
+			pg->flags &= ~(PG_WANTED|PG_BUSY);
+			/* UVM_PAGE_OWN(pg, NULL); */
+		}
+
+		if (by_list) {
+			curpg = TAILQ_NEXT(curpg, listq);
+		} else {
+			soff += MAX(PAGE_SIZE, fs->lfs_bsize);
+		}
+	}
+	if (by_list) {
+		PRELE(l);
+	}
+
+	/*
+	 * If any pages were dirty, mark this inode as "pageout requested",
+	 * and put it on the paging queue.
+	 * XXXUBC locking (check locking on dchainhd too)
+	 */
+#ifdef notyet
+	if (any_dirty) {
+		if (!(ip->i_flags & IN_PAGING)) {
+			ip->i_flags |= IN_PAGING;
+			TAILQ_INSERT_TAIL(&fs->lfs_pchainhd, ip, i_lfs_pchain);
+		}
+	}
+#endif
+	return any_dirty;
+}
+
+/*
+ * lfs_putpages functions like genfs_putpages except that
+ *
+ * (1) It needs to bounds-check the incoming requests to ensure that
+ *     they are block-aligned; if they are not, expand the range and
+ *     do the right thing in case, e.g., the requested range is clean
+ *     but the expanded range is dirty.
+ * (2) It needs to explicitly send blocks to be written when it is done.
+ *     VOP_PUTPAGES is not ever called with the seglock held, so
+ *     we simply take the seglock and let lfs_segunlock wait for us.
+ *     XXX Actually we can be called with the seglock held, if we have
+ *     XXX to flush a vnode while lfs_markv is in operation.  As of this
+ *     XXX writing we panic in this case.
+ *
+ * Assumptions:
+ *
+ * (1) The caller does not hold any pages in this vnode busy.  If it does,
+ *     there is a danger that when we expand the page range and busy the
+ *     pages we will deadlock.
+ * (2) We are called with vp->v_interlock held; we must return with it
+ *     released.
+ * (3) We don't absolutely have to free pages right away, provided that
+ *     the request does not have PGO_SYNCIO.  When the pagedaemon gives
+ *     us a request with PGO_FREE, we take the pages out of the paging
+ *     queue and wake up the writer, which will handle freeing them for us.
+ *
+ *     We ensure that for any filesystem block, all pages for that
+ *     block are either resident or not, even if those pages are higher
+ *     than EOF; that means that we will be getting requests to free
+ *     "unused" pages above EOF all the time, and should ignore them.
+ *
+ * Returns 0 when there is nothing to do, EWOULDBLOCK when the pagedaemon
+ * asks us to clean dirty pages (the writer daemon is woken instead), and
+ * otherwise the error from lfs_seglock() or genfs_putpages().
+ */
+
+int
+lfs_putpages(void *v)
+{
+	int error;
+	struct vop_putpages_args /* {
+		struct vnode *a_vp;
+		voff_t a_offlo;
+		voff_t a_offhi;
+		int a_flags;
+	} */ *ap = v;
+	struct vnode *vp;
+	struct inode *ip;
+	struct lfs *fs;
+	struct segment *sp;
+	off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
+	off_t max_endoffset;
+	int pages_per_block;
+	int s, sync, dirty, pagedaemon;
+	UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
+
+	vp = ap->a_vp;
+	ip = VTOI(vp);
+	fs = ip->i_lfs;
+	sync = (ap->a_flags & PGO_SYNCIO);
+	pagedaemon = (curproc == uvm.pagedaemon_proc);
+
+	/* Putpages does nothing for metadata. */
+	if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
+		simple_unlock(&vp->v_interlock);
+		return 0;
+	}
+
+	/*
+	 * If there are no pages, don't do anything.
+	 */
+	if (vp->v_uobj.uo_npages == 0) {
+		s = splbio();
+		if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL &&
+		    (vp->v_flag & VONWORKLST)) {
+			vp->v_flag &= ~VONWORKLST;
+			LIST_REMOVE(vp, v_synclist);
+		}
+		splx(s);
+		simple_unlock(&vp->v_interlock);
+		return 0;
+	}
+
+	blkeof = blkroundup(fs, ip->i_ffs_size);
+
+	/*
+	 * Ignore requests to free pages past EOF but in the same block
+	 * as EOF, unless the request is synchronous. (XXX why sync?)
+	 * XXXUBC Make these pages look "active" so the pagedaemon won't
+	 * XXXUBC bother us with them again.
+	 */
+	if (!sync && ap->a_offlo >= ip->i_ffs_size && ap->a_offlo < blkeof) {
+		ap->a_offlo = blkeof;
+		if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {
+			simple_unlock(&vp->v_interlock);
+			return 0;
+		}
+	}
+
+	/*
+	 * Extend page range to start and end at block boundaries.
+	 * (For the purposes of VOP_PUTPAGES, fragments don't exist.)
+	 */
+	pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
+	origoffset = ap->a_offlo;
+	origendoffset = ap->a_offhi;
+	startoffset = origoffset & ~(fs->lfs_bmask);
+	max_endoffset = (trunc_page(LLONG_MAX) >> fs->lfs_bshift)
+					       << fs->lfs_bshift;
+
+	if (origendoffset == 0 || ap->a_flags & PGO_ALLPAGES) {
+		endoffset = max_endoffset;
+		origendoffset = endoffset;
+	} else {
+		origendoffset = round_page(ap->a_offhi);
+		endoffset = round_page(blkroundup(fs, origendoffset));
+	}
+
+	KASSERT(startoffset > 0 || endoffset >= startoffset);
+	if (startoffset == endoffset) {
+		/* Nothing to do, why were we called? */
+		simple_unlock(&vp->v_interlock);
+#ifdef DEBUG
+		printf("lfs_putpages: startoffset = endoffset = %" PRId64 "\n",
+			startoffset);
+#endif
+		return 0;
+	}
+
+	ap->a_offlo = startoffset;
+	ap->a_offhi = endoffset;
+
+	/* Not asked to clean: genfs_putpages does the rest. */
+	if (!(ap->a_flags & PGO_CLEANIT))
+		return genfs_putpages(v);
+
+	/*
+	 * Make sure that all pages in any given block are dirty, or
+	 * none of them are.  Find out if any of the pages we've been
+	 * asked about are dirty.  If none are dirty, send them on
+	 * through genfs_putpages(), albeit with adjusted offsets.
+	 * XXXUBC I am assuming here that they can't be dirtied in
+	 * XXXUBC the meantime, but I bet that's wrong.
+	 */
+	dirty = check_dirty(fs, vp, startoffset, endoffset, blkeof, ap->a_flags);
+	if (!dirty)
+		return genfs_putpages(v);
+
+	/*
+	 * Dirty and asked to clean.
+	 *
+	 * Pagedaemon can't actually write LFS pages; wake up
+	 * the writer to take care of that.  The writer will
+	 * notice the pager inode queue and act on that.
+	 */
+	if (pagedaemon) {
+		++fs->lfs_pdflush;
+		wakeup(&lfs_writer_daemon);
+		/* We must not return with the interlock held. */
+		simple_unlock(&vp->v_interlock);
+		return EWOULDBLOCK;
+	}
+
+	/*
+	 * If this is a file created in a recent dirop, we can't flush its
+	 * inode until the dirop is complete.  Drain dirops, then flush the
+	 * filesystem (taking care of any other pending dirops while we're
+	 * at it).
+	 */
+	if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
+	    (vp->v_flag & VDIROP)) {
+		int locked;
+
+		/* printf("putpages to clean VDIROP, flushing\n"); */
+		while (fs->lfs_dirops > 0) {
+			++fs->lfs_diropwait;
+			tsleep(&fs->lfs_writer, PRIBIO+1, "ppdirop", 0);
+			--fs->lfs_diropwait;
+		}
+		++fs->lfs_writer;
+		/* Drop the vnode lock across the flush if we hold it. */
+		locked = VOP_ISLOCKED(vp) && /* XXX */
+			vp->v_lock.lk_lockholder == curproc->p_pid;
+		if (locked)
+			VOP_UNLOCK(vp, 0);
+		simple_unlock(&vp->v_interlock);
+
+		lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
+
+		simple_lock(&vp->v_interlock);
+		if (locked)
+			VOP_LOCK(vp, LK_EXCLUSIVE);
+		if (--fs->lfs_writer == 0)
+			wakeup(&fs->lfs_dirops);
+
+		/* XXX the flush should have taken care of this one too! */
+	}
+
+
+	/*
+	 * This is it.  We are going to write some pages.  From here on
+	 * down it's all just mechanics.
+	 *
+	 * If there are more than one page per block, we don't want to get
+	 * caught locking them backwards; so set PGO_BUSYFAIL to avoid
+	 * deadlocks.  Also, don't let genfs_putpages wait;
+	 * lfs_segunlock will wait for us, if need be.
+	 */
+	ap->a_flags &= ~PGO_SYNCIO;
+	if (pages_per_block > 1)
+		ap->a_flags |= PGO_BUSYFAIL;
+
+	/*
+	 * If we've already got the seglock, flush the node and return.
+	 * The FIP has already been set up for us by lfs_writefile,
+	 * and FIP cleanup and lfs_updatemeta will also be done there,
+	 * unless genfs_putpages returns EDEADLK; then we must flush
+	 * what we have, and correct FIP and segment header accounting.
+	 */
+	if (ap->a_flags & PGO_LOCKED) {
+		sp = fs->lfs_sp;
+		sp->vp = vp;
+
+		/*
+		 * XXXUBC
+		 * There is some danger here that we might run out of
+		 * buffers if we flush too much at once.  If the number
+		 * of dirty buffers is too great, we should cut the range
+		 * down and write in chunks.
+		 */
+		while ((error = genfs_putpages(v)) == EDEADLK) {
+#ifdef DEBUG_LFS
+			printf("lfs_putpages: genfs_putpages returned EDEADLK"
+			       " ino %d off %x (seg %d)\n",
+			       ip->i_number, fs->lfs_offset,
+			       dtosn(fs, fs->lfs_offset));
+#endif
+			/* Write gathered pages */
+			lfs_updatemeta(sp);
+			(void) lfs_writeseg(fs, sp);
+
+			/* Reinitialize brand new FIP and add us to it */
+			sp->vp = vp;
+			sp->fip->fi_version = ip->i_ffs_gen;
+			sp->fip->fi_ino = ip->i_number;
+			/* Add us to the new segment summary. */
+			++((SEGSUM *)(sp->segsum))->ss_nfinfo;
+			sp->sum_bytes_left -=
+				sizeof(struct finfo) - sizeof(int32_t);
+
+			/*
+			 * Give the write a chance to complete.
+			 * NOTE(review): the early returns above rely on
+			 * genfs_putpages() releasing v_interlock; if it
+			 * also does so when returning EDEADLK, this
+			 * unlock is unbalanced -- confirm.
+			 */
+			simple_unlock(&vp->v_interlock);
+			preempt(NULL);
+			simple_lock(&vp->v_interlock);
+		}
+		return error;
+	}
+
+	/*
+	 * Take the seglock, because we are going to be writing pages.
+	 * On failure we still owe the caller a released interlock.
+	 */
+	if ((error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0))) != 0) {
+		simple_unlock(&vp->v_interlock);
+		return error;
+	}
+
+	/*
+	 * VOP_PUTPAGES should not be called while holding the seglock.
+	 * XXX fix lfs_markv, or do this properly.
+	 */
+	KASSERT(fs->lfs_seglock == 1);
+
+	/*
+	 * We assume we're being called with sp->fip pointing at blank space.
+	 * Account for a new FIP in the segment header, and set sp->vp.
+	 * (This should duplicate the setup at the top of lfs_writefile().)
+	 */
+	sp = fs->lfs_sp;
+	if (sp->seg_bytes_left < fs->lfs_bsize ||
+	    sp->sum_bytes_left < sizeof(struct finfo))
+		(void) lfs_writeseg(fs, fs->lfs_sp);
+
+	sp->sum_bytes_left -= sizeof(struct finfo) - sizeof(int32_t);
+	++((SEGSUM *)(sp->segsum))->ss_nfinfo;
+	sp->vp = vp;
+
+	if (vp->v_flag & VDIROP)
+		((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
+
+	sp->fip->fi_nblocks = 0;
+	sp->fip->fi_ino = ip->i_number;
+	sp->fip->fi_version = ip->i_ffs_gen;
+
+	/*
+	 * Loop through genfs_putpages until all pages are gathered.
+	 */
+		/*
+		 * There is some danger here that we might run out of
+		 * buffers if we flush too much at once.  If the number
+		 * of dirty buffers is too great, then, cut the range down
+		 * and write in chunks.
+		 *
+		 * XXXUBC this assumes a uniform dirtying of the pages
+		 * XXXUBC across the address space
+		 * XXXXXX do this
+		 */
+	while ((error = genfs_putpages(v)) == EDEADLK) {
+#ifdef DEBUG_LFS
+		printf("lfs_putpages: genfs_putpages returned EDEADLK [2]"
+		       " ino %d off %x (seg %d)\n",
+		       ip->i_number, fs->lfs_offset,
+		       dtosn(fs, fs->lfs_offset));
+#endif
+		/* Write gathered pages */
+		lfs_updatemeta(sp);
+		(void) lfs_writeseg(fs, sp);
+
+		/*
+		 * Reinitialize brand new FIP and add us to it.
+		 * (This should duplicate the fixup in lfs_gatherpages().)
+		 */
+		sp->vp = vp;
+		sp->fip->fi_version = ip->i_ffs_gen;
+		sp->fip->fi_ino = ip->i_number;
+		/* Add us to the new segment summary. */
+		++((SEGSUM *)(sp->segsum))->ss_nfinfo;
+		sp->sum_bytes_left -=
+			sizeof(struct finfo) - sizeof(int32_t);
+
+		/*
+		 * Give the write a chance to complete.
+		 * NOTE(review): see the matching note in the PGO_LOCKED
+		 * loop about whether the interlock is held here.
+		 */
+		simple_unlock(&vp->v_interlock);
+		preempt(NULL);
+		simple_lock(&vp->v_interlock);
+	}
+
+	/*
+	 * Blocks are now gathered into a segment waiting to be written.
+	 * All that's left to do is update metadata, and write them.
+	 */
+	lfs_updatemeta(fs->lfs_sp);
+	fs->lfs_sp->vp = NULL;
+	lfs_writeseg(fs, fs->lfs_sp);
+
+	/*
+	 * Clean up FIP.
+	 * (This should duplicate cleanup at the end of lfs_writefile().)
+	 */
+	if (sp->fip->fi_nblocks != 0) {
+		sp->fip = (FINFO*)((caddr_t)sp->fip + sizeof(struct finfo) +
+			sizeof(int32_t) * (sp->fip->fi_nblocks - 1));
+		sp->start_lbp = &sp->fip->fi_blocks[0];
+	} else {
+		sp->sum_bytes_left += sizeof(FINFO) - sizeof(int32_t);
+		--((SEGSUM *)(sp->segsum))->ss_nfinfo;
+	}
+	/*
+	 * XXX - with the malloc/copy writeseg, the pages are freed by now
+	 * even if we don't wait (e.g. if we hold a nested lock).  This
+	 * will not be true if we stop using malloc/copy.
+	 */
+	KASSERT(fs->lfs_sp->seg_flags & SEGM_PROT);
+	lfs_segunlock(fs);
+
+	/*
+	 * Wait for v_numoutput to drop to zero.  The seglock should
+	 * take care of this, but there is a slight possibility that
+	 * aiodoned might not have got around to our buffers yet.
+	 */
+	if (sync) {
+		s = splbio();
+		simple_lock(&global_v_numoutput_slock);
+		while(vp->v_numoutput > 0) {
+#ifdef DEBUG
+			printf("ino %d sleeping on num %d\n",
+				ip->i_number, vp->v_numoutput);
+#endif
+			vp->v_flag |= VBWAIT;
+			simple_unlock(&global_v_numoutput_slock);
+			tsleep(&vp->v_numoutput, PRIBIO + 1, "lfs_vn", 0);
+			simple_lock(&global_v_numoutput_slock);
+		}
+		simple_unlock(&global_v_numoutput_slock);
+		splx(s);
+	}
+	return error;
+}
+
+/*
+ * Find out whether the vnode has any blocks or pages waiting to be written.
+ * We used to just check LIST_EMPTY(&vp->v_dirtyblkhd), but there is not
+ * presently as simple a mechanism for the page cache.
+ *
+ * Returns nonzero if nothing is waiting to be written, zero otherwise.
+ * Takes and releases vp->v_interlock itself for regular files other
+ * than the ifile, so it must be called without that lock held.
+ */
+int
+lfs_checkifempty(struct vnode *vp)
+{
+	struct vm_page *pg;
+	struct buf *bp;
+	int r, s;
+
+	/* Non-regular files and the ifile: the dirty list is enough. */
+	if (vp->v_type != VREG || VTOI(vp)->i_number == LFS_IFILE_INUM)
+		return LIST_EMPTY(&vp->v_dirtyblkhd);
+
+	/*
+	 * For vnodes with pages it is a little more complex.
+	 * Pages that have been written (i.e. are "clean" for our purposes)
+	 * might be in seemingly dirty buffers, so we have to troll
+	 * looking for indirect block buffers as well as pages.
+	 */
+	simple_lock(&vp->v_interlock);
+	s = splbio();
+	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp;
+	     bp = LIST_NEXT(bp, b_vnbufs)) {
+		if (bp->b_lblkno < 0) {
+			splx(s);
+			/* Don't leave the interlock held on this path. */
+			simple_unlock(&vp->v_interlock);
+			return 0;
+		}
+	}
+	splx(s);
+
+	/*
+	 * Run through the page list to find dirty pages.
+	 * Right now I just walk the memq.
+	 */
+	pg = TAILQ_FIRST(&vp->v_uobj.memq);
+	r = 1;
+	while(pg) {
+		if ((pg->flags & PG_CLEAN) == 0 || pmap_is_modified(pg)) {
+			r = 0;
+			break;
+		}
+		pg = TAILQ_NEXT(pg, listq);
+	}
+#if 0
+	if (r != !(vp->v_flag & VONWORKLST)) {
+		printf("nope, VONWORKLST isn't good enough!\n");
+	}
+#endif
+	simple_unlock(&vp->v_interlock);
+	return r;
+}
+
+/*
+ * Compute, in *eobp, the last logical file offset that should be
+ * transferred for a request ending at "size".
+ *
+ * Exactly one of GOP_SIZE_READ or GOP_SIZE_WRITE must be set in flags.
+ * A write that ends in a direct block at or beyond the block holding
+ * the current EOF is rounded up to a fragment boundary; every other
+ * request is rounded up to a whole block.
+ */
+void
+lfs_gop_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
+{
+	struct inode *ip = VTOI(vp);
+	struct lfs *fs = ip->i_lfs;
+	daddr_t eof_lbn, req_lbn;
+
+	KASSERT(flags & (GOP_SIZE_READ | GOP_SIZE_WRITE));
+	KASSERT((flags & (GOP_SIZE_READ | GOP_SIZE_WRITE)) 
+		!= (GOP_SIZE_READ | GOP_SIZE_WRITE));
+
+	eof_lbn = lblkno(fs, ip->i_ffs_size);
+	req_lbn = lblkno(fs, size);
+
+	/* Anything that is not a write into the tail direct block. */
+	if (!(flags & GOP_SIZE_WRITE) || req_lbn >= NDADDR ||
+	    eof_lbn > req_lbn) {
+		*eobp = blkroundup(fs, size);
+		return;
+	}
+	*eobp = fragroundup(fs, size);
+}
+
+#ifdef DEBUG
+void lfs_dump_vop(void *);
+
+/*
+ * Debugging aid: print the vnode named by a putpages-style argument
+ * block, followed by its on-disk inode.
+ */
+void
+lfs_dump_vop(void *v)
+{
+	struct vop_putpages_args /* {
+		struct vnode *a_vp;
+		voff_t a_offlo;
+		voff_t a_offhi;
+		int a_flags;
+	} */ *ap = v;
+	struct vnode *vp = ap->a_vp;
+
+	vfs_vnode_print(vp, 0, printf);
+	lfs_dump_dinode(&VTOI(vp)->i_din.ffs_din);
+}
+#endif
+
+int
+lfs_mmap(void *v)
+{
+	struct vop_mmap_args /* {
+		const struct vnodeop_desc *a_desc;
+		struct vnode *a_vp;
+		int a_fflags;
+		struct ucred *a_cred;
+		struct proc *a_p;
+	} */ *ap = v;
+	struct vnode *vp = ap->a_vp;
+
+	/* Every file except the ifile is handled by ufs_mmap(). */
+	if (VTOI(vp)->i_number != LFS_IFILE_INUM)
+		return ufs_mmap(v);
+	return EOPNOTSUPP;
+}
+#endif /* LFS_UBC */
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@ -1,4 +1,4 @@
-/*	$NetBSD: inode.h,v 1.32 2003/01/24 21:55:29 fvdl Exp $	*/
+/*	$NetBSD: inode.h,v 1.33 2003/02/17 23:48:23 perseant Exp $	*/

 /*
 * Copyright (c) 1982, 1989, 1993
@ -58,11 +58,7 @@ struct ext2fs_inode_ext {
 	daddr_t ext2fs_last_blk;	/* last block allocated on disk */
 };

-struct lfs_inode_ext {
-	off_t	  lfs_osize;		/* size of file on disk */
-	u_int32_t lfs_effnblocks;  /* number of blocks when i/o completes */
-	size_t    lfs_fragsize[NDADDR]; /* size of on-disk direct blocks */
-};
+struct lfs_inode_ext;

 /*
 * The inode is used to describe each active (or recently active) file in the
@ -111,13 +107,10 @@ struct inode {
 	union {
 		/* Other extensions could go here... */
 		struct	ext2fs_inode_ext e2fs;
-		struct  lfs_inode_ext lfs;
+		struct  lfs_inode_ext *lfs;
 	} inode_ext;
 #define	i_e2fs_last_lblk	inode_ext.e2fs.ext2fs_last_lblk
 #define	i_e2fs_last_blk		inode_ext.e2fs.ext2fs_last_blk
-#define i_lfs_effnblks		inode_ext.lfs.lfs_effnblocks
-#define i_lfs_fragsize		inode_ext.lfs.lfs_fragsize
-#define i_lfs_osize		inode_ext.lfs.lfs_osize
 	/*
 	 * The on-disk dinode itself.
 	 */
@ -179,6 +172,7 @@ struct inode {
 #define	IN_CLEANING	0x0100		/* LFS: file is being cleaned */
 #define	IN_ADIROP	0x0200		/* LFS: dirop in progress */
 #define IN_SPACECOUNTED	0x0400		/* Blocks to be freed in free count. */
+#define IN_PAGING       0x1000          /* LFS: file is on paging queue */

 #if defined(_KERNEL)
 /*
--- a/sys/ufs/ufs/ufs_extern.h
+++ b/sys/ufs/ufs/ufs_extern.h
@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_extern.h,v 1.30 2003/01/24 21:55:30 fvdl Exp $	*/
+/*	$NetBSD: ufs_extern.h,v 1.31 2003/02/17 23:48:23 perseant Exp $	*/

 /*-
 * Copyright (c) 1991, 1993, 1994
@ -167,6 +167,7 @@ void ufs_vinit __P((struct mount *, int (**) __P((void *)),
    int (**) __P((void *)), struct vnode **));
 int ufs_makeinode __P((int, struct vnode *, struct vnode **,
 		       struct componentname *));
+int ufs_gop_alloc __P((struct vnode *, off_t, off_t, int, struct ucred *));

 /*
 * Soft dependency function prototypes.
--- a/sys/ufs/ufs/ufs_inode.c
+++ b/sys/ufs/ufs/ufs_inode.c
@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_inode.c,v 1.33 2002/01/26 08:32:05 chs Exp $	*/
+/*	$NetBSD: ufs_inode.c,v 1.34 2003/02/17 23:48:23 perseant Exp $	*/

 /*
 * Copyright (c) 1991, 1993
@ -41,7 +41,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.33 2002/01/26 08:32:05 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.34 2003/02/17 23:48:23 perseant Exp $");

 #include "opt_quota.h"

@ -192,10 +192,10 @@ ufs_balloc_range(vp, off, len, cred, flags)
 		    vp, off, len, vp->v_size);

 	oldeof = vp->v_size;
-	GOP_SIZE(vp, oldeof, &oldeob);
+	GOP_SIZE(vp, oldeof, &oldeob, GOP_SIZE_WRITE);

 	neweof = MAX(vp->v_size, off + len);
-	GOP_SIZE(vp, neweof, &neweob);
+	GOP_SIZE(vp, neweof, &neweob, GOP_SIZE_WRITE);

 	error = 0;
 	uobj = &vp->v_uobj;
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_readwrite.c,v 1.47 2003/01/24 21:55:30 fvdl Exp $	*/
+/*	$NetBSD: ufs_readwrite.c,v 1.48 2003/02/17 23:48:23 perseant Exp $	*/

 /*-
 * Copyright (c) 1993
@ -36,7 +36,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.47 2003/01/24 21:55:30 fvdl Exp $");
+__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.48 2003/02/17 23:48:23 perseant Exp $");

 #ifdef LFS_READWRITE
 #define	BLKSIZE(a, b, c)	blksize(a, b, c)
@ -110,9 +110,13 @@ READ(void *v)
 		goto out;
 	}

-#ifndef LFS_READWRITE
+#ifdef LFS_READWRITE
+# ifdef LFS_UBC
+	usepc = (vp->v_type == VREG && ip->i_number != LFS_IFILE_INUM);
+# endif
+#else /* !LFS_READWRITE */
 	usepc = vp->v_type == VREG;
-#endif
+#endif /* !LFS_READWRITE */
 	if (usepc) {
 		while (uio->uio_resid > 0) {
 			bytelen = MIN(ip->i_ffs_size - uio->uio_offset,
@ -278,9 +282,14 @@ WRITE(void *v)
 	bsize = fs->fs_bsize;
 	error = 0;

-#ifndef LFS_READWRITE
+#ifdef LFS_READWRITE
+# ifdef LFS_UBC
+	async = TRUE;
 	usepc = vp->v_type == VREG;
-#endif
+# endif
+#else /* !LFS_READWRITE */
+	usepc = vp->v_type == VREG;
+#endif /* !LFS_READWRITE */
 	if (!usepc) {
 		goto bcache;
 	}
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_vnops.c,v 1.89 2002/12/31 15:00:18 yamt Exp $	*/
+/*	$NetBSD: ufs_vnops.c,v 1.90 2003/02/17 23:48:23 perseant Exp $	*/

 /*
 * Copyright (c) 1982, 1986, 1989, 1993, 1995
@ -41,7 +41,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.89 2002/12/31 15:00:18 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.90 2003/02/17 23:48:23 perseant Exp $");

 #include "opt_quota.h"
 #include "fs_lfs.h"
@ -73,6 +73,8 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.89 2002/12/31 15:00:18 yamt Exp $");
 #include <ufs/ext2fs/ext2fs_extern.h>
 #include <ufs/lfs/lfs_extern.h>

+#include <uvm/uvm.h>
+
 static int ufs_chmod(struct vnode *, int, struct ucred *, struct proc *);
 static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *,
 		    struct proc *);
@ -2071,3 +2073,49 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
 	vput(tvp);
 	return (error);
 }
+
+/*
+ * Allocate len bytes at offset off.
+ *
+ * The range is first extended downward to a filesystem block boundary,
+ * then handed to VOP_BALLOC() one block at a time (the final piece may
+ * be shorter).  Returns 0, or the first error from VOP_BALLOC().
+ */
+int
+ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags,
+    struct ucred *cred)
+{
+	struct inode *ip = VTOI(vp);
+	int error = 0;
+	int chunk, slack;
+	UVMHIST_FUNC("ufs_gop_alloc"); UVMHIST_CALLED(ubchist);
+
+	chunk = 1 << vp->v_mount->mnt_fs_bshift;
+
+	/* Pull the start back to the beginning of its block. */
+	slack = off & (chunk - 1);
+	off -= slack;
+	len += slack;
+
+	for (; len > 0; off += chunk, len -= chunk) {
+		chunk = MIN(chunk, len);
+
+		error = VOP_BALLOC(vp, off, chunk, cred, flags, NULL);
+		if (error)
+			break;
+
+		/*
+		 * increase file size now, VOP_BALLOC() requires that
+		 * EOF be up-to-date before each call.
+		 */
+		if (ip->i_ffs_size < off + chunk) {
+			UVMHIST_LOG(ubchist, "vp %p old 0x%x new 0x%x",
+			    vp, ip->i_ffs_size, off + chunk, 0);
+			ip->i_ffs_size = off + chunk;
+		}
+	}
+
+	return error;
+}
--- a/sys/uvm/uvm_page.c
+++ b/sys/uvm/uvm_page.c
@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_page.c,v 1.83 2003/02/01 06:23:55 thorpej Exp $	*/
+/*	$NetBSD: uvm_page.c,v 1.84 2003/02/17 23:48:24 perseant Exp $	*/

 /*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.83 2003/02/01 06:23:55 thorpej Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.84 2003/02/17 23:48:24 perseant Exp $");

 #include "opt_uvmhist.h"

@ -1254,7 +1254,7 @@ uvm_pagefree(pg)
 		if (pg->flags & PG_WANTED) {
 			wakeup(pg);
 		}
-		pg->flags &= ~(PG_WANTED|PG_BUSY|PG_RELEASED);
+		pg->flags &= ~(PG_WANTED|PG_BUSY|PG_RELEASED|PG_PAGER1);
 #ifdef UVM_PAGE_TRKOWN
 		pg->owner_tag = NULL;
 #endif
--- a/sys/uvm/uvm_pager.h
+++ b/sys/uvm/uvm_pager.h
@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_pager.h,v 1.25 2002/03/25 02:08:10 chs Exp $	*/
+/*	$NetBSD: uvm_pager.h,v 1.26 2003/02/17 23:48:24 perseant Exp $	*/

 /*
 *
@ -124,6 +124,7 @@ struct uvm_pagerops {

 #define PGO_ALLPAGES	0x010	/* flush whole object/get all pages */
 #define PGO_LOCKED	0x040	/* fault data structures are locked [get] */
+#define PGO_BUSYFAIL	0x080	/* fail if a page is busy [put] */
 #define PGO_OVERWRITE	0x200	/* pages will be overwritten before unlocked */
 #define PGO_PASTEOF	0x400	/* allow allocation of pages past EOF */

--- a/usr.sbin/dumplfs/dumplfs.c
+++ b/usr.sbin/dumplfs/dumplfs.c
@ -1,4 +1,4 @@
-/*	$NetBSD: dumplfs.c,v 1.21 2003/01/28 07:44:54 mrg Exp $	*/
+/*	$NetBSD: dumplfs.c,v 1.22 2003/02/17 23:48:25 perseant Exp $	*/

 /*-
 * Copyright (c) 1991, 1993
@ -45,7 +45,7 @@ __COPYRIGHT(
 #if 0
 static char sccsid[] = "@(#)dumplfs.c	8.5 (Berkeley) 5/24/95";
 #else
-__RCSID("$NetBSD: dumplfs.c,v 1.21 2003/01/28 07:44:54 mrg Exp $");
+__RCSID("$NetBSD: dumplfs.c,v 1.22 2003/02/17 23:48:25 perseant Exp $");
 #endif
 #endif /* not lint */

@ -678,7 +678,7 @@ dump_super(struct lfs *lfsp)
 	
 	(void)printf("  Checkpoint Info\n");
 	(void)printf("    %s%-10d  %s0x%-8x  %s%-10d\n",
- 		     "free     ", lfsp->lfs_free,
+ 		     "freehd   ", lfsp->lfs_freehd,
 		     "idaddr   ", lfsp->lfs_idaddr,
 		     "ifile    ", lfsp->lfs_ifile);
 	(void)printf("    %s%-10d  %s%-10d  %s%-10d\n",
--- a/usr.sbin/quotaon/quotaon.c
+++ b/usr.sbin/quotaon/quotaon.c
@ -1,4 +1,4 @@
-/*	$NetBSD: quotaon.c,v 1.17 2002/07/20 08:40:20 grant Exp $	*/
+/*	$NetBSD: quotaon.c,v 1.18 2003/02/17 23:48:25 perseant Exp $	*/

 /*
 * Copyright (c) 1980, 1990, 1993
@ -46,7 +46,7 @@ __COPYRIGHT("@(#) Copyright (c) 1980, 1990, 1993\n\
 #if 0
 static char sccsid[] = "@(#)quotaon.c	8.1 (Berkeley) 6/6/93";
 #else
-__RCSID("$NetBSD: quotaon.c,v 1.17 2002/07/20 08:40:20 grant Exp $");
+__RCSID("$NetBSD: quotaon.c,v 1.18 2003/02/17 23:48:25 perseant Exp $");
 #endif
 #endif /* not lint */

@ -128,7 +128,8 @@ main(argc, argv)
 	}
 	setfsent();
 	while ((fs = getfsent()) != NULL) {
-		if (strcmp(fs->fs_vfstype, "ffs") ||
+		if ((strcmp(fs->fs_vfstype, "ffs") &&
+		     strcmp(fs->fs_vfstype, "lfs")) ||
 		    strcmp(fs->fs_type, FSTAB_RW))
 			continue;
 		if (aflag) {