Phase one of my three-phase plan to make LFS play nice with UBC, and bug-fixes

I found while making sure there weren't any new ones.

* Make the write clusters keep track of the buffers whose blocks they contain. This should make it possible to (1) write clusters using a page mapping instead of malloc, if desired, and (2) schedule blocks for rewriting (somewhere else) if a write error occurs. Code is present to use pagemove() to construct the clusters but that is untested and will go away anyway in favor of page mapping.
* DEBUG now keeps a log of Ifile writes, so that any lingering instances of the "dirty bufs" problem can be properly debugged.
* Keep track of whether the Ifile has been dirtied by various routines that can be called by lfs_segwrite, and loop on that until it is clean, for a checkpoint. Checkpoints need to be squeaky clean.
* Warn the user (once) if the Ifile grows larger than is reasonable for their buffer cache. Both lfs_mountfs and lfs_unmount check since the Ifile can grow.
* If an inode is not found in a disk block, try rereading the block, under the assumption that the block was copied to a cluster and then freed.
* Protect WRITEINPROG() with splbio() to fix a hang in lfs_update.
2002-05-14 20:03:53 +00:00 · 2002-05-14 20:03:53 +00:00 · 8886b0f4b2
commit 8886b0f4b2
parent 56deade0b7
12 changed files with 832 additions and 263 deletions
--- a/sys/ufs/lfs/lfs.h
+++ b/sys/ufs/lfs/lfs.h
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs.h,v 1.38 2001/11/23 21:44:25 chs Exp $	*/
+/*	$NetBSD: lfs.h,v 1.39 2002/05/14 20:03:53 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -73,12 +73,19 @@
 /*
 * Compile-time options for LFS.
 */
+#define LFS_IFIND_RETRIES	16
 #define LFS_EAGAIN_FAIL          /* markv fail with EAGAIN if ino is locked */
-#define LFS_TRACK_IOS            /* attempt to avoid cleaning segments not yet fully written to disk */
 #define LFS_DEBUG_RFW            /* print roll-forward debugging info */
+#define LFS_NO_PAGEMOVE          /* Use malloc/copy to write clusters */
+#define LFS_AGGRESSIVE_SEGLOCK
+#define LFS_LOGLENGTH 1024

 /* #define DEBUG_LFS */              /* Intensive debugging of LFS subsystem */

+#ifdef LFS_NO_PAGEMOVE
+# define LFS_MALLOC_SUMMARY
+#endif
+
 /*
 * Parameters and generic definitions
 */
@ -120,6 +127,20 @@
 	(bp)->b_flags &= ~B_LOCKED;					\
 } while (0)

+#ifdef DEBUG_LOCKED_LIST
+# define LFS_DEBUG_COUNTLOCKED(m) do {                                  \
+	int _s;                                                         \
+	extern int locked_queue_count;					\
+	extern long locked_queue_bytes;					\
+        _s = splbio();							\
+        lfs_countlocked(&locked_queue_count, &locked_queue_bytes, (m));	\
+        splx(_s);							\
+        wakeup(&locked_queue_count);					\
+} while (0)
+#else
+# define LFS_DEBUG_COUNTLOCKED(m)
+#endif
+
 /* For convenience */
 #define IN_ALLMOD (IN_MODIFIED|IN_ACCESS|IN_CHANGE|IN_UPDATE|IN_ACCESSED|IN_CLEANING)

@ -146,7 +167,42 @@
 	}                                                               \
 } while (0)

+#ifdef DEBUG
+struct lfs_log_entry {
+	char *op;
+	char *file;
+	int line;
+	ufs_daddr_t block;
+	unsigned long flags;
+};
+extern int lfs_lognum;
+extern struct lfs_log_entry lfs_log[LFS_LOGLENGTH];
+# define LFS_BWRITE_LOG(bp) lfs_bwrite_log((bp), __FILE__, __LINE__)
+# define LFS_ENTER_LOG(theop, thefile, theline, lbn, theflags) do { \
+	int _s;							\
+								\
+	_s = splbio();						\
+	lfs_log[lfs_lognum].op = theop;				\
+	lfs_log[lfs_lognum].file = thefile;			\
+	lfs_log[lfs_lognum].line = (theline);			\
+	lfs_log[lfs_lognum].block = (lbn);			\
+	lfs_log[lfs_lognum].flags = (theflags);			\
+	lfs_lognum = (lfs_lognum + 1) % LFS_LOGLENGTH;		\
+	splx(_s);						\
+} while (0)
+
+# define LFS_BCLEAN_LOG(fs, bp) do {	/* add a "clear" log entry, but only for buffers of the Ifile vnode */ \
+	if ((bp)->b_vp == (fs)->lfs_ivnode)				\
+		LFS_ENTER_LOG("clear", __FILE__, __LINE__, (bp)->b_lblkno, (bp)->b_flags); /* (bp) parenthesized so any pointer expression works */ \
+} while (0)
+#else
+# define LFS_BCLEAN_LOG(fs, bp)
+# define LFS_BWRITE_LOG(bp)		VOP_BWRITE((bp))
+#endif
+	
 #define LFS_ITIMES(ip, acc, mod, cre)  do {				\
+	struct lfs *_fs = (ip)->i_lfs;					\
+									\
       	if ((ip)->i_flag & IN_ACCESS) {                        		\
 		(ip)->i_ffs_atime = (acc)->tv_sec;			\
 		(ip)->i_ffs_atimensec = (acc)->tv_nsec;			\
@ -157,7 +213,8 @@
 			LFS_IENTRY(ifp, ip->i_lfs, ip->i_number, ibp);	\
 			ifp->if_atime_sec = (acc)->tv_sec;		\
 			ifp->if_atime_nsec = (acc)->tv_nsec;		\
-			VOP_BWRITE(ibp);				\
+			LFS_BWRITE_LOG(ibp);				\
+			_fs->lfs_flags |= LFS_IFDIRTY;			\
 		} else {						\
 			LFS_SET_UINO(ip, IN_ACCESSED);			\
 		}                                              		\
@ -310,7 +367,7 @@ struct dlfs {
 };

 /* Maximum number of io's we can have pending at once */
-#define LFS_THROTTLE  16 /* XXX should be better paramtrized - ? */
+#define LFS_THROTTLE  32 /* XXX should be better paramtrized - ? */

 /* In-memory super block. */
 struct lfs {
@ -388,7 +445,9 @@ struct lfs {
 	u_int32_t lfs_nactive;		/* Number of segments since last ckp */
 	int8_t	  lfs_fmod;		/* super block modified flag */
 	int8_t	  lfs_ronly;		/* mounted read-only flag */
-#define LFS_NOTYET 0x01
+#define LFS_NOTYET  0x01
+#define LFS_IFDIRTY 0x02
+#define LFS_WARNED  0x04
 	int8_t	  lfs_flags;		/* currently unused flag */
 	u_int16_t lfs_activesb;         /* toggle between superblocks */
 #ifdef LFS_TRACK_IOS
@ -570,13 +629,13 @@ struct segsum {
 	((ufs_daddr_t)(segtod((fs), (sn)) + (fs)->lfs_start))

 /* Read in the block with the cleaner info from the ifile. */
-#define LFS_CLEANERINFO(CP, F, BP) {					\
+#define LFS_CLEANERINFO(CP, F, BP) do {					\
 	VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS;			\
 	if (bread((F)->lfs_ivnode,					\
 	    (ufs_daddr_t)0, (F)->lfs_bsize, NOCRED, &(BP)))		\
 		panic("lfs: ifile read");				\
 	(CP) = (CLEANERINFO *)(BP)->b_data;				\
-}
+} while(0)

 /* Synchronize the Ifile cleaner info with current avail and bfree */
 #define LFS_SYNC_CLEANERINFO(cip, fs, bp, w) do {                \
@ -584,7 +643,9 @@ struct segsum {
        (cip)->avail != (fs)->lfs_avail - (fs)->lfs_ravail) {    \
 	(cip)->bfree = (fs)->lfs_bfree;                          \
        (cip)->avail = (fs)->lfs_avail - (fs)->lfs_ravail;       \
-	(void) VOP_BWRITE(bp); /* Ifile */                       \
+        if (((bp)->b_flags & B_GATHERED) == 0)			 \
+		(fs)->lfs_flags |= LFS_IFDIRTY;                  \
+	(void) LFS_BWRITE_LOG(bp); /* Ifile */                       \
    } else                                                       \
 	brelse(bp);                                              \
 } while (0)
@ -603,7 +664,8 @@ struct segsum {
 	if ((FS)->lfs_version > 1) {                                    \
 		LFS_CLEANERINFO((CIP), (FS), (BP));                     \
 		(CIP)->free_head = (VAL);                 		\
-		VOP_BWRITE(BP);                                         \
+		LFS_BWRITE_LOG(BP);                                         \
+		(FS)->lfs_flags |= LFS_IFDIRTY;                          \
 	}                                                               \
 } while (0)

@ -616,7 +678,8 @@ struct segsum {
 #define LFS_PUT_TAILFREE(FS, CIP, BP, VAL) do {                         \
 	LFS_CLEANERINFO((CIP), (FS), (BP));                     	\
 	(CIP)->free_tail = (VAL);                 			\
-	VOP_BWRITE(BP);                                         	\
+	LFS_BWRITE_LOG(BP);                                         	\
+	(FS)->lfs_flags |= LFS_IFDIRTY;                          \
 } while (0)

 /*
@ -624,7 +687,7 @@ struct segsum {
 * may not be mapped!
 */
 /* Read in the block with a specific inode from the ifile. */
-#define	LFS_IENTRY(IP, F, IN, BP) {					\
+#define	LFS_IENTRY(IP, F, IN, BP) do {					\
 	int _e;								\
 	VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS;			\
 	if ((_e = bread((F)->lfs_ivnode,				\
@ -635,10 +698,10 @@ struct segsum {
 		(IP) = (IFILE *)((IFILE_V1 *)(BP)->b_data + (IN) % (F)->lfs_ifpb); \
 	else								\
 		(IP) = (IFILE *)(BP)->b_data + (IN) % (F)->lfs_ifpb;	\
-}
+} while(0)

 /* Read in the block with a specific segment usage entry from the ifile. */
-#define	LFS_SEGENTRY(SP, F, IN, BP) {					\
+#define	LFS_SEGENTRY(SP, F, IN, BP) do {				\
 	int _e;								\
 	VTOI((F)->lfs_ivnode)->i_flag |= IN_ACCESS;			\
 	if ((_e = bread((F)->lfs_ivnode,				\
@ -650,7 +713,7 @@ struct segsum {
 			((IN) & ((F)->lfs_sepb - 1)));			\
 	else								\
 		(SP) = (SEGUSE *)(BP)->b_data + ((IN) % (F)->lfs_sepb);	\
-}
+} while(0)

 /* Determine if a buffer belongs to the ifile */
 #define IS_IFILE(bp)	(VTOI(bp->b_vp)->i_number == LFS_IFILE_INUM)
@ -704,6 +767,18 @@ struct segment {
 	u_int16_t seg_flags;		/* run-time flags for this segment */
 };

+struct lfs_cluster {
+	struct buf **bpp;      /* Array of kept buffers */
+	int bufcount;          /* Number of kept buffers */
+	size_t bufsize;        /* Size of kept data */
+#define LFS_CL_MALLOC	0x00000001
+#define LFS_CL_SHIFT	0x00000002
+	u_int32_t flags;       /* Flags */
+	struct lfs *fs;        /* LFS that this belongs to */
+	void *saveaddr;        /* Original contents of saveaddr */
+	char *olddata;		/* Original b_data, if LFS_CL_MALLOC */
+};
+
 /*
 * Macros for determining free space on the disk, with the variable metadata
 * of segment summaries and inode blocks taken into account.
--- a/sys/ufs/lfs/lfs_alloc.c
+++ b/sys/ufs/lfs/lfs_alloc.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_alloc.c,v 1.55 2002/02/04 03:32:16 perseant Exp $	*/
+/*	$NetBSD: lfs_alloc.c,v 1.56 2002/05/14 20:03:53 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.55 2002/02/04 03:32:16 perseant Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.56 2002/05/14 20:03:53 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -81,6 +81,7 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_alloc.c,v 1.55 2002/02/04 03:32:16 perseant Exp
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/buf.h>
+#include <sys/lock.h>
 #include <sys/vnode.h>
 #include <sys/syslog.h>
 #include <sys/mount.h>
@ -109,6 +110,8 @@ static int lfs_ialloc(struct lfs *, struct vnode *, ino_t, int, struct vnode **)
 *
 * XXX this function does not have appropriate locking to be used on a live fs;
 * XXX but something similar could probably be used for an "undelete" call.
+ *
+ * Called with the Ifile inode locked.
 */
 int
 lfs_rf_valloc(struct lfs *fs, ino_t ino, int version, struct proc *p,
@ -182,7 +185,7 @@ lfs_rf_valloc(struct lfs *fs, ino_t ino, int version, struct proc *p,
 			return ENOENT;
 		}
 		ifp->if_nextfree = oldnext;
-		VOP_BWRITE(bp);
+		LFS_BWRITE_LOG(bp);
 	}

 	error = lfs_ialloc(fs, fs->lfs_ivnode, ino, version, &vp);
@ -211,6 +214,9 @@ lfs_rf_valloc(struct lfs *fs, ino_t ino, int version, struct proc *p,
 	return error;
 }

+/*
+ * Called with the Ifile inode locked. 
+ */
 static int
 extend_ifile(struct lfs *fs, struct ucred *cred)
 {
@ -225,19 +231,14 @@ extend_ifile(struct lfs *fs, struct ucred *cred)
 	CLEANERINFO *cip;

 	vp = fs->lfs_ivnode;
-	(void)lfs_vref(vp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	ip = VTOI(vp);
 	blkno = lblkno(fs, ip->i_ffs_size);
 	if ((error = VOP_BALLOC(vp, ip->i_ffs_size, fs->lfs_bsize, cred, 0,
 				&bp)) != 0) {
-		VOP_UNLOCK(vp, 0);
-		lfs_vunref(vp);
 		return (error);
 	}
 	ip->i_ffs_size += fs->lfs_bsize;
 	uvm_vnp_setsize(vp, ip->i_ffs_size);
-	VOP_UNLOCK(vp, 0);
 	
 	i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) *
 		fs->lfs_ifpb;
@ -269,8 +270,7 @@ extend_ifile(struct lfs *fs, struct ucred *cred)
 	}
 	LFS_PUT_TAILFREE(fs, cip, cbp, max - 1);

-	(void) VOP_BWRITE(bp); /* Ifile */
-	lfs_vunref(vp);
+	(void) LFS_BWRITE_LOG(bp); /* Ifile */

 	return 0;
 }
@ -300,6 +300,9 @@ lfs_valloc(void *v)
 		return EROFS;
 	*ap->a_vpp = NULL;
 	
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_seglock(fs, SEGM_PROT);
+#else
 	if (fs->lfs_version == 1) {
 		/*
 		 * Use lfs_seglock here, instead of fs->lfs_freelock, to
@ -311,6 +314,7 @@ lfs_valloc(void *v)
 	} else {
 		lockmgr(&fs->lfs_freelock, LK_EXCLUSIVE, 0);
 	}
+#endif

 	/* Get the head of the freelist. */
 	LFS_GET_HEADFREE(fs, cip, cbp, &new_ino);
@ -344,10 +348,14 @@ lfs_valloc(void *v)
 	if (fs->lfs_free == LFS_UNUSED_INUM) {
 		if ((error = extend_ifile(fs, ap->a_cred)) != 0) {
 			LFS_PUT_HEADFREE(fs, cip, cbp, new_ino);
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+			lfs_segunlock(fs);
+#else
 			if (fs->lfs_version == 1)
 				lfs_segunlock(fs);
 			else
 				lockmgr(&fs->lfs_freelock, LK_RELEASE, 0);
+#endif
 			return error;
 		}
 	}
@ -356,11 +364,14 @@ lfs_valloc(void *v)
 		panic("inode 0 allocated [3]");
 #endif /* DIAGNOSTIC */

+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_segunlock(fs);
+#else
 	if (fs->lfs_version == 1)
 		lfs_segunlock(fs);
 	else
 		lockmgr(&fs->lfs_freelock, LK_RELEASE, 0);
-
+#endif
 	return lfs_ialloc(fs, ap->a_pvp, new_ino, new_gen, ap->a_vpp);
 }

@ -426,11 +437,17 @@ lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen,
 	/*
 	 * Put the new inum back on the free list.
 	 */
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_seglock(fs, SEGM_PROT);
+#endif
 	LFS_IENTRY(ifp, fs, new_ino, bp);
 	ifp->if_daddr = LFS_UNUSED_DADDR;
 	LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree));
 	LFS_PUT_HEADFREE(fs, cip, cbp, new_ino);
-	(void) VOP_BWRITE(bp); /* Ifile */
+	(void) LFS_BWRITE_LOG(bp); /* Ifile */
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_segunlock(fs);
+#endif

 	*vpp = NULLVP;
 	return (error);
@ -470,6 +487,11 @@ lfs_vcreate(struct mount *mp, ino_t ino, struct vnode *vp)
 	ip->i_flag = 0;
 	/* Why was IN_MODIFIED ever set here? */
 	/* LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED); */
+
+#ifdef DEBUG_LFS_VNLOCK
+	if (ino == LFS_IFILE_INUM)
+		vp->v_vnlock->lk_wmesg = "inlock";
+#endif
 }

 /* Free an inode. */
@ -493,6 +515,7 @@ lfs_vfree(void *v)
 	ufs_daddr_t old_iaddr;
 	ino_t ino, otail;
 	extern int lfs_dirvcount;
+	int s;
 	
 	/* Get the inode number and file system. */
 	vp = ap->a_pvp;
@ -501,13 +524,19 @@ lfs_vfree(void *v)
 	ino = ip->i_number;

 	/* Drain of pending writes */
+	s = splbio();
 	if (fs->lfs_version > 1 && WRITEINPROG(vp))
 		tsleep(vp, (PRIBIO+1), "lfs_vfree", 0);
+	splx(s);

+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_seglock(fs, SEGM_PROT); /* XXX */;
+#else
 	if (fs->lfs_version == 1)
 		lfs_seglock(fs, SEGM_PROT);
 	else
 		lockmgr(&fs->lfs_freelock, LK_EXCLUSIVE, 0);
+#endif
 	
 	if (vp->v_flag & VDIROP) {
 		--lfs_dirvcount;
@ -534,7 +563,7 @@ lfs_vfree(void *v)
 	if (fs->lfs_version == 1) {
 		LFS_GET_HEADFREE(fs, cip, cbp, &(ifp->if_nextfree));
 		LFS_PUT_HEADFREE(fs, cip, cbp, ino);
-		(void) VOP_BWRITE(bp); /* Ifile */
+		(void) LFS_BWRITE_LOG(bp); /* Ifile */
 	} else {
 		ifp->if_nextfree = LFS_UNUSED_INUM;
 		/*
@ -543,11 +572,11 @@ lfs_vfree(void *v)
 		 * XXX (the ifile could be written before the rest of this
 		 * XXX completes).
 		 */
-		(void) VOP_BWRITE(bp); /* Ifile */
+		(void) LFS_BWRITE_LOG(bp); /* Ifile */
 		LFS_GET_TAILFREE(fs, cip, cbp, &otail);
 		LFS_IENTRY(ifp, fs, otail, bp);
 		ifp->if_nextfree = ino;
-		VOP_BWRITE(bp);
+		LFS_BWRITE_LOG(bp);
 		LFS_PUT_TAILFREE(fs, cip, cbp, ino);
 		/* printf("lfs_vfree: tailfree %d -> %d\n", otail, ino); */
 	}
@ -569,16 +598,20 @@ lfs_vfree(void *v)
 		}
 #endif
 		sup->su_nbytes -= DINODE_SIZE;
-		(void) VOP_BWRITE(bp); /* Ifile */
+		(void) LFS_BWRITE_LOG(bp); /* Ifile */
 	}
 	
 	/* Set superblock modified bit and decrement file count. */
 	fs->lfs_fmod = 1;
 	--fs->lfs_nfiles;
 	
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_segunlock(fs);
+#else
 	if (fs->lfs_version == 1)
 		lfs_segunlock(fs);
 	else
 		lockmgr(&fs->lfs_freelock, LK_RELEASE, 0);
+#endif
 	return (0);
 }
--- a/sys/ufs/lfs/lfs_balloc.c
+++ b/sys/ufs/lfs/lfs_balloc.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_balloc.c,v 1.31 2001/11/23 21:44:26 chs Exp $	*/
+/*	$NetBSD: lfs_balloc.c,v 1.32 2002/05/14 20:03:53 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.31 2001/11/23 21:44:26 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_balloc.c,v 1.32 2002/05/14 20:03:53 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -400,13 +400,15 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, ufs_daddr_t lbn, struct b
 	if ((*bpp)->b_blkno > 0) {
 		LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, (*bpp)->b_blkno)), ibp);
 		sup->su_nbytes += (nsize - osize);
-		VOP_BWRITE(ibp);
+		LFS_BWRITE_LOG(ibp);
 		ip->i_ffs_blocks += bb;
 	}
 	fs->lfs_bfree -= bb;
 	ip->i_lfs_effnblks += bb;
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;

+	LFS_DEBUG_COUNTLOCKED("frag1");
+
 	obufsize = (*bpp)->b_bufsize;
 	allocbuf(*bpp, nsize);

@ -414,6 +416,8 @@ lfs_fragextend(struct vnode *vp, int osize, int nsize, ufs_daddr_t lbn, struct b
 	if (((*bpp)->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED)
 		locked_queue_bytes += (*bpp)->b_bufsize - obufsize;

+	LFS_DEBUG_COUNTLOCKED("frag2");
+
 	bzero((char *)((*bpp)->b_data) + osize, (u_int)(nsize - osize));

    out:
--- a/sys/ufs/lfs/lfs_bio.c
+++ b/sys/ufs/lfs/lfs_bio.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_bio.c,v 1.42 2002/05/12 23:06:29 matt Exp $	*/
+/*	$NetBSD: lfs_bio.c,v 1.43 2002/05/14 20:03:53 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.42 2002/05/12 23:06:29 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.43 2002/05/14 20:03:53 perseant Exp $");

 #include <sys/param.h>
 #include <sys/systm.h>
@ -298,11 +298,10 @@ lfs_bwrite_ext(struct buf *bp, int flags)
 		bp->b_flags |= B_DELWRI;

 		LFS_LOCK_BUF(bp);
-		bp->b_flags &= ~(B_READ | B_ERROR);
+		bp->b_flags &= ~(B_READ | B_DONE | B_ERROR);
 		s = splbio();
 		reassignbuf(bp, bp->b_vp);
 		splx(s);
-
 	}
 	
 	if (bp->b_flags & B_CALL)
@ -351,7 +350,6 @@ lfs_flush_fs(struct lfs *fs, int flags)
 void
 lfs_flush(struct lfs *fs, int flags)
 {
-	int s;
 	struct mount *mp, *nmp;
 	
 	if (lfs_dostats) 
@ -378,12 +376,7 @@ lfs_flush(struct lfs *fs, int flags)
 	}
 	simple_unlock(&mountlist_slock);

-#if 1 || defined(DEBUG)
-	s = splbio();
-	lfs_countlocked(&locked_queue_count, &locked_queue_bytes);
-	splx(s);
-	wakeup(&locked_queue_count);
-#endif /* 1 || DEBUG */
+	LFS_DEBUG_COUNTLOCKED("flush");

 	lfs_writing = 0;
 }
@ -488,9 +481,8 @@ lfs_newbuf(struct lfs *fs, struct vnode *vp, ufs_daddr_t daddr, size_t size)
 	
 	bp = DOMALLOC(sizeof(struct buf), M_SEGMENT, M_WAITOK);
 	bzero(bp, sizeof(struct buf));
-	if (nbytes)
-		bp->b_data = DOMALLOC(nbytes, M_SEGMENT, M_WAITOK);
 	if (nbytes) {
+		bp->b_data = DOMALLOC(nbytes, M_SEGMENT, M_WAITOK);
 		bzero(bp->b_data, nbytes);
 	}
 #ifdef DIAGNOSTIC	
@ -503,6 +495,7 @@ lfs_newbuf(struct lfs *fs, struct vnode *vp, ufs_daddr_t daddr, size_t size)
 	bgetvp(vp, bp);
 	splx(s);
 	
+	bp->b_saveaddr = (caddr_t)fs;
 	bp->b_bufsize = size;
 	bp->b_bcount = size;
 	bp->b_lblkno = daddr;
@ -555,7 +548,7 @@ extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
 * Don't count malloced buffers, since they don't detract from the total.
 */
 void
-lfs_countlocked(int *count, long *bytes)
+lfs_countlocked(int *count, long *bytes, char *msg)
 {
 	struct buf *bp;
 	int n = 0;
@ -573,14 +566,14 @@ lfs_countlocked(int *count, long *bytes)
 			      " buffers locked than exist");
 #endif
 	}
-#ifdef DEBUG
+#ifdef DEBUG_LOCKED_LIST
 	/* Theoretically this function never really does anything */
 	if (n != *count)
-		printf("lfs_countlocked: adjusted buf count from %d to %d\n",
-		       *count, n);
+		printf("lfs_countlocked: %s: adjusted buf count from %d to %d\n",
+		       msg, *count, n);
 	if (size != *bytes)
-		printf("lfs_countlocked: adjusted byte count from %ld to %ld\n",
-		       *bytes, size);
+		printf("lfs_countlocked: %s: adjusted byte count from %ld to %ld\n",
+		       msg, *bytes, size);
 #endif
 	*count = n;
 	*bytes = size;
--- a/sys/ufs/lfs/lfs_debug.c
+++ b/sys/ufs/lfs/lfs_debug.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_debug.c,v 1.15 2001/11/23 21:44:27 chs Exp $	*/
+/*	$NetBSD: lfs_debug.c,v 1.16 2002/05/14 20:03:53 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -73,18 +73,47 @@
 #ifdef DEBUG

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.15 2001/11/23 21:44:27 chs Exp $");
-
+__KERNEL_RCSID(0, "$NetBSD: lfs_debug.c,v 1.16 2002/05/14 20:03:53 perseant Exp $");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
+#include <sys/buf.h>

 #include <ufs/ufs/inode.h>
 #include <ufs/lfs/lfs.h>
 #include <ufs/lfs/lfs_extern.h>

+int lfs_lognum;
+struct lfs_log_entry lfs_log[LFS_LOGLENGTH];
+
+int lfs_bwrite_log(struct buf *bp, char *file, int line)
+{	/* Conditionally record an lfs_log entry tagged with the caller-supplied file/line, then issue vop_bwrite on bp's vnode and return its result. */
+        struct vop_bwrite_args a;
+        a.a_desc = VDESC(vop_bwrite);
+        a.a_bp = bp;
+	/* Only buffers with neither B_DELWRI nor B_GATHERED set get a "write" log entry. */
+	if (!(bp->b_flags & (B_DELWRI | B_GATHERED)))
+		LFS_ENTER_LOG("write", file, line, bp->b_lblkno, bp->b_flags);
+        return (VCALL(bp->b_vp, VOFFSET(vop_bwrite), &a));
+}
+
+void lfs_dumplog(void)
+{
+	int i, n;	/* i: ring slot being printed; n: slots visited so far */
+	/* Print every filled lfs_log slot once, starting at lfs_lognum (the next slot to be overwritten). Counting visits, instead of stopping at (lfs_lognum - 1) % LFS_LOGLENGTH (which is -1 when lfs_lognum is 0), guarantees termination and includes the newest entry. */
+	for (i = lfs_lognum, n = 0; n < LFS_LOGLENGTH; n++, i = (i + 1) % LFS_LOGLENGTH)
+		if (lfs_log[i].file) {	/* NULL .file: slot not filled (lfs_log has zero-initialized static storage) */
+			printf("lbn %d %s %lx %d %s\n",
+				lfs_log[i].block,
+				lfs_log[i].op,
+				lfs_log[i].flags,
+				lfs_log[i].line,
+				lfs_log[i].file + 56);	/* NOTE(review): skips a hard-coded 56-character path prefix; confirm __FILE__ is always at least that long */
+		}
+}
+
 void 
 lfs_dump_super(struct lfs *lfsp)
 {
--- a/sys/ufs/lfs/lfs_extern.h
+++ b/sys/ufs/lfs/lfs_extern.h
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_extern.h,v 1.29 2002/05/12 23:06:29 matt Exp $	*/
+/*	$NetBSD: lfs_extern.h,v 1.30 2002/05/14 20:03:53 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -131,7 +131,7 @@ struct buf *lfs_newbuf_malloclog(struct lfs *, struct vnode *,
 void lfs_freebuf(struct buf *);
 struct buf *lfs_newbuf(struct lfs *, struct vnode *, ufs_daddr_t, size_t);
 #endif
-void lfs_countlocked(int *, long *);
+void lfs_countlocked(int *, long *, char *);
 int lfs_reserve(struct lfs *, struct vnode *, int);

 /* lfs_cksum.c */
@ -140,6 +140,8 @@ u_int32_t lfs_sb_cksum(struct dlfs *);

 /* lfs_debug.c */
 #ifdef DEBUG
+int lfs_bwrite_log(struct buf *, char *, int);
+void lfs_dumplog(void);
 void lfs_dump_super(struct lfs *);
 void lfs_dump_dinode(struct dinode *);
 void lfs_check_bpp(struct lfs *, struct segment *, char *, int);
@ -180,7 +182,7 @@ void lfs_segunlock(struct lfs *);

 /* lfs_syscalls.c */
 int lfs_fastvget(struct mount *, ino_t, ufs_daddr_t, struct vnode **, struct dinode *, int *);
-struct buf *lfs_fakebuf(struct vnode *, int, size_t, caddr_t);
+struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);

 /* lfs_vfsops.c */
 void lfs_init(void);
--- a/sys/ufs/lfs/lfs_inode.c
+++ b/sys/ufs/lfs/lfs_inode.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_inode.c,v 1.56 2001/11/23 21:44:27 chs Exp $	*/
+/*	$NetBSD: lfs_inode.c,v 1.57 2002/05/14 20:03:54 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.56 2001/11/23 21:44:27 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_inode.c,v 1.57 2002/05/14 20:03:54 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -137,8 +137,8 @@ lfs_ifind(struct lfs *fs, ino_t ino, struct buf *bp)
 	       dtosn(fs, fs->lfs_offset));
 	printf("block is 0x%x (seg %d)\n", dbtofsb(fs, bp->b_blkno),
 	       dtosn(fs, dbtofsb(fs, bp->b_blkno)));
-	panic("lfs_ifind: dinode %u not found", ino);
-	/* NOTREACHED */
+
+	return NULL;
 }

 int
@ -154,6 +154,7 @@ lfs_update(void *v)
 	struct vnode *vp = ap->a_vp;
 	struct timespec ts;
 	struct lfs *fs = VFSTOUFS(vp->v_mount)->um_lfs;
+	int s;
 	
 	if (vp->v_mount->mnt_flag & MNT_RDONLY)
 		return (0);
@ -166,6 +167,7 @@ lfs_update(void *v)
 	 * will cause a panic.  So, we must wait until any pending write
 	 * for our inode completes, if we are called with UPDATE_WAIT set.
 	 */
+	s = splbio();
 	while ((ap->a_flags & (UPDATE_WAIT|UPDATE_DIROP)) == UPDATE_WAIT &&
 	    WRITEINPROG(vp)) {
 #ifdef DEBUG_LFS
@ -174,6 +176,7 @@ lfs_update(void *v)
 #endif
 		tsleep(vp, (PRIBIO+1), "lfs_update", 0);
 	}
+	splx(s);
 	TIMEVAL_TO_TIMESPEC(&time, &ts);
 	LFS_ITIMES(ip,
 		   ap->a_access ? ap->a_access : &ts,
@ -313,11 +316,15 @@ lfs_truncate(void *v)
 	 * (We don't need to *hold* the seglock, though, because we already
 	 * hold the inode lock; draining the seglock is sufficient.)
 	 */
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_seglock(fs, SEGM_PROT);
+#else
 	if (ovp != fs->lfs_unlockvp) {
 		while (fs->lfs_seglock) {
 			tsleep(&fs->lfs_seglock, PRIBIO+1, "lfs_truncate", 0);
 		}
 	}
+#endif
 	
 	/*
 	 * Shorten the size of the file. If the file is not being
@ -340,6 +347,9 @@ lfs_truncate(void *v)
 		error = VOP_BALLOC(ovp, length - 1, 1, ap->a_cred, aflags, &bp);
 		if (error) {
 			lfs_reserve(fs, ovp, -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+			lfs_segunlock(fs);
+#endif
 			return (error);
 		}
 		obufsize = bp->b_bufsize;
@ -350,11 +360,10 @@ lfs_truncate(void *v)
 			memset((char *)bp->b_data + offset, 0,
 			       (u_int)(size - offset));
 		allocbuf(bp, size);
-		if (bp->b_flags & B_DELWRI) {
-			if ((bp->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED)
-				locked_queue_bytes -= obufsize - bp->b_bufsize;
+		if ((bp->b_flags & (B_LOCKED | B_CALL)) == B_LOCKED)
+			locked_queue_bytes -= obufsize - bp->b_bufsize;
+		if (bp->b_flags & B_DELWRI)
 			fs->lfs_avail += odb - btofsb(fs, size);
-		}
 		(void) VOP_BWRITE(bp);
 	}
 	uvm_vnp_setsize(ovp, length);
@ -494,6 +503,9 @@ done:
 	(void) chkdq(oip, -blocksreleased, NOCRED, 0);
 #endif
 	lfs_reserve(fs, ovp, -btofsb(fs, (2 * NIADDR + 3) << fs->lfs_bshift));
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_segunlock(fs);
+#endif
 	return (allerror);
 }

@ -523,10 +535,10 @@ lfs_update_seguse(struct lfs *fs, long lastseg, size_t num)
 {
 	SEGUSE *sup;
 	struct buf *bp;
+	int error;

 	if (lastseg < 0 || num == 0)
 		return 0;
-
 	
 	LFS_SEGENTRY(sup, fs, lastseg, bp);
 	if (num > sup->su_nbytes) {
@ -536,7 +548,8 @@ lfs_update_seguse(struct lfs *fs, long lastseg, size_t num)
 		sup->su_nbytes = num;
 	}
 	sup->su_nbytes -= num;
-	return (VOP_BWRITE(bp)); /* Ifile */
+	error = LFS_BWRITE_LOG(bp); /* Ifile */
+	return error;
 }

 /*
--- a/sys/ufs/lfs/lfs_segment.c
+++ b/sys/ufs/lfs/lfs_segment.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_segment.c,v 1.73 2001/11/23 21:44:27 chs Exp $	*/
+/*	$NetBSD: lfs_segment.c,v 1.74 2002/05/14 20:03:54 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.73 2001/11/23 21:44:27 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.74 2002/05/14 20:03:54 perseant Exp $");

 #define ivndebug(vp,str) printf("ino %d: %s\n",VTOI(vp)->i_number,(str))

@ -104,9 +104,14 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.73 2001/11/23 21:44:27 chs Exp $")
 #include <ufs/lfs/lfs.h>
 #include <ufs/lfs/lfs_extern.h>

+#include <uvm/uvm_extern.h>
+
 extern int count_lock_queue(void);
 extern struct simplelock vnode_free_list_slock;		/* XXX */

+static void lfs_cluster_callback(struct buf *);
+static struct buf **lookahead_pagemove(struct buf **, int, size_t *);
+
 /*
 * Determine if it's OK to start a partial in this segment, or if we need
 * to go on to a new segment.
@ -235,12 +240,14 @@ lfs_vflush(struct vnode *vp)
 	}

 	/* If the node is being written, wait until that is done */
+	s = splbio();
 	if (WRITEINPROG(vp)) {
 #ifdef DEBUG_LFS
 		ivndebug(vp,"vflush/writeinprog");
 #endif
 		tsleep(vp, PRIBIO+1, "lfs_vw", 0);
 	}
+	splx(s);

 	/* Protect against VXLOCK deadlock in vinvalbuf() */
 	lfs_seglock(fs, SEGM_SYNC);
@ -299,8 +306,7 @@ lfs_vflush(struct vnode *vp)
 		ivndebug(vp,"vflush/clean");
 #endif
 		lfs_writevnodes(fs, vp->v_mount, sp, VN_CLEAN);
-	}
-	else if (lfs_dostats) {
+	} else if (lfs_dostats) {
 		if (vp->v_dirtyblkhd.lh_first || (VTOI(vp)->i_flag & IN_ALLMOD))
 			++lfs_stats.vflush_invoked;
 #ifdef DEBUG_LFS
@ -334,6 +340,23 @@ lfs_vflush(struct vnode *vp)
 		if (sp->seg_flags & SEGM_CKP)
 			++lfs_stats.ncheckpoints;
 	}
+	/*
+	 * If we were called from somewhere that has already held the seglock
+	 * (e.g., lfs_markv()), the lfs_segunlock will not wait for
+	 * the write to complete because we are still locked.
+	 * Since lfs_vflush() must return the vnode with no dirty buffers,
+	 * we must explicitly wait, if that is the case.
+	 *
+	 * We compare the iocount against 1, not 0, because it is
+	 * artificially incremented by lfs_seglock().
+	 */
+	if (fs->lfs_seglock > 1) {
+		s = splbio();
+		while (fs->lfs_iocount > 1)
+			(void)tsleep(&fs->lfs_iocount, PRIBIO + 1,
+				     "lfs_vflush", 0);
+		splx(s);
+	}
 	lfs_segunlock(fs);

 	CLR_FLUSHING(fs,vp);
@ -483,6 +506,7 @@ lfs_segwrite(struct mount *mp, int flags)
 	int do_ckp, did_ckp, error, i;
 	int writer_set = 0;
 	int dirty;
+	int redo;
 	
 	fs = VFSTOUFS(mp)->um_lfs;

@ -598,7 +622,7 @@ lfs_segwrite(struct mount *mp, int flags)
 				--dirty;
 			}
 			if (dirty)
-				error = VOP_BWRITE(bp); /* Ifile */
+				error = LFS_BWRITE_LOG(bp); /* Ifile */
 			else
 				brelse(bp);
 		}
@ -610,18 +634,42 @@ lfs_segwrite(struct mount *mp, int flags)
 			vp = fs->lfs_ivnode;

 			vget(vp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
+#ifdef DEBUG
+			LFS_ENTER_LOG("pretend", __FILE__, __LINE__, 0, 0);
+#endif
+			fs->lfs_flags &= ~LFS_IFDIRTY;

 			ip = VTOI(vp);
-			if (vp->v_dirtyblkhd.lh_first != NULL)
+			/* if (vp->v_dirtyblkhd.lh_first != NULL) */
 				lfs_writefile(fs, sp, vp);
 			if (ip->i_flag & IN_ALLMOD)
 				++did_ckp;
-			(void) lfs_writeinode(fs, sp, ip);
+			redo = lfs_writeinode(fs, sp, ip);
 			
 			vput(vp);
-		} while (lfs_writeseg(fs, sp) && do_ckp);
+			redo += lfs_writeseg(fs, sp);
+			redo += (fs->lfs_flags & LFS_IFDIRTY);
+		} while (redo && do_ckp);

 		/* The ifile should now be all clear */
+		if (do_ckp && vp->v_dirtyblkhd.lh_first) {
+			struct buf *bp;
+			int s, warned = 0, dopanic = 0;
+			s = splbio();
+			for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = bp->b_vnbufs.le_next) {
+				if (!(bp->b_flags & B_GATHERED)) {
+					if (!warned)
+						printf("lfs_segwrite: ifile still has dirty blocks?!\n");
+					++dopanic;
+					++warned;
+					printf("bp=%p, lbn %d, flags 0x%lx\n",
+						bp, bp->b_lblkno, bp->b_flags);
+				}
+			}
+			if (dopanic)
+				panic("dirty blocks");
+			splx(s);
+		}
 		LFS_CLR_UINO(ip, IN_ALLMOD);
 	} else {
 		(void) lfs_writeseg(fs, sp);
@ -688,8 +736,7 @@ lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
 	fip->fi_version = ifp->if_version;
 	brelse(bp);
 	
-	if (sp->seg_flags & SEGM_CLEAN)
-	{
+	if (sp->seg_flags & SEGM_CLEAN) {
 		lfs_gather(fs, sp, vp, lfs_match_fake);
 		/*
 		 * For a file being flushed, we need to write *all* blocks.
@ -780,7 +827,9 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)

 	/* Update the inode times and copy the inode onto the inode page. */
 	TIMEVAL_TO_TIMESPEC(&time, &ts);
-	LFS_ITIMES(ip, &ts, &ts, &ts);
+	/* XXX kludge --- don't redirty the ifile just to put times on it */
+	if (ip->i_number != LFS_IFILE_INUM)
+		LFS_ITIMES(ip, &ts, &ts, &ts);

 	/*
 	 * If this is the Ifile, and we've already written the Ifile in this
@ -873,7 +922,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
 				ip->i_number);
 		}
 #endif
-		error = VOP_BWRITE(ibp); /* Ifile */
+		error = LFS_BWRITE_LOG(ibp); /* Ifile */
 	}
 	
 	/*
@ -913,7 +962,9 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
 		sup->su_nbytes -= DINODE_SIZE;
 		redo_ifile =
 			(ino == LFS_IFILE_INUM && !(bp->b_flags & B_GATHERED));
-		error = VOP_BWRITE(bp); /* Ifile */
+		if (redo_ifile)
+			fs->lfs_flags |= LFS_IFDIRTY;
+		error = LFS_BWRITE_LOG(bp); /* Ifile */
 	}
 	return (redo_ifile);
 }
@ -963,6 +1014,8 @@ lfs_gatherblock(struct segment *sp, struct buf *bp, int *sptr)
 #endif
 	/* Insert into the buffer list, update the FINFO block. */
 	bp->b_flags |= B_GATHERED;
+	bp->b_flags &= ~B_DONE;
+
 	*sp->cbpp++ = bp;
 	sp->fip->fi_blocks[sp->fip->fi_nblocks++] = bp->b_lblkno;
 	
@ -992,8 +1045,13 @@ loop:	for (bp = vp->v_dirtyblkhd.lh_first; bp && bp->b_vnbufs.le_next != NULL;
 	    bp = bp->b_vnbufs.le_next);
 	for (; bp && bp != BEG_OF_LIST; bp = BACK_BUF(bp)) {
 #endif /* LFS_NO_BACKBUF_HACK */
-		if ((bp->b_flags & (B_BUSY|B_GATHERED)) || !match(fs, bp))
+		if ((bp->b_flags & (B_BUSY|B_GATHERED)) || !match(fs, bp)) {
+#ifdef DEBUG_LFS
+			if (vp == fs->lfs_ivnode && (bp->b_flags & (B_BUSY|B_GATHERED)) == B_BUSY)
+				printf("(%d:%lx)", bp->b_lblkno, bp->b_flags);
+#endif
 			continue;
+		}
 		if (vp->v_type == VBLK) {
 			/* For block devices, just write the blocks. */
 			/* XXX Do we really need to even do this? */
@ -1187,7 +1245,9 @@ lfs_updatemeta(struct segment *sp)
 			       (*sp->start_bpp)->b_lblkno, daddr);
 #endif
 			sup->su_nbytes -= (*sp->start_bpp)->b_bcount;
-			error = VOP_BWRITE(bp); /* Ifile */
+			if (!(bp->b_flags & B_GATHERED))
+				fs->lfs_flags |= LFS_IFDIRTY;
+			error = LFS_BWRITE_LOG(bp); /* Ifile */
 		}
 	}
 }
@ -1201,7 +1261,7 @@ lfs_initseg(struct lfs *fs)
 	struct segment *sp;
 	SEGUSE *sup;
 	SEGSUM *ssp;
-	struct buf *bp;
+	struct buf *bp, *sbp;
 	int repeat;
 	
 	sp = fs->lfs_sp;
@ -1250,9 +1310,16 @@ lfs_initseg(struct lfs *fs)

 	/* Get a new buffer for SEGSUM and enter it into the buffer list. */
 	sp->cbpp = sp->bpp;
-	*sp->cbpp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
-			       fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize);
-	sp->segsum = (*sp->cbpp)->b_data;
+#ifdef LFS_MALLOC_SUMMARY
+	sbp = *sp->cbpp = lfs_newbuf(fs, VTOI(fs->lfs_ivnode)->i_devvp,
+				     fsbtodb(fs, fs->lfs_offset), fs->lfs_sumsize);
+  	sp->segsum = (*sp->cbpp)->b_data;
+#else
+	sbp = *sp->cbpp = getblk(VTOI(fs->lfs_ivnode)->i_devvp,
+				 fsbtodb(fs, fs->lfs_offset), NBPG, 0, 0);
+	memset(sbp->b_data, 0x5a, NBPG);
+	sp->segsum = (*sp->cbpp)->b_data + NBPG - fs->lfs_sumsize;
+#endif
 	bzero(sp->segsum, fs->lfs_sumsize);
 	sp->start_bpp = ++sp->cbpp;
 	fs->lfs_offset += btofsb(fs, fs->lfs_sumsize);
@ -1272,6 +1339,10 @@ lfs_initseg(struct lfs *fs)
 	sp->seg_bytes_left -= fs->lfs_sumsize;
 	sp->sum_bytes_left = fs->lfs_sumsize - SEGSUM_SIZE(fs);
 	
+#ifndef LFS_MALLOC_SUMMARY
+	LFS_LOCK_BUF(sbp);
+	brelse(sbp);
+#endif
 	return (repeat);
 }

@ -1295,7 +1366,7 @@ lfs_newseg(struct lfs *fs)
 	sup->su_nbytes = 0;
 	sup->su_nsums = 0;
 	sup->su_ninos = 0;
-	(void) VOP_BWRITE(bp); /* Ifile */
+	(void) LFS_BWRITE_LOG(bp); /* Ifile */

 	LFS_CLEANERINFO(cip, fs, bp);
 	--cip->clean;
@ -1323,19 +1394,109 @@ lfs_newseg(struct lfs *fs)
 	}
 }

+/*
+ * Look ahead through the buffer pointers starting at bpp to see how far
+ * a cluster could be assembled with pagemove().
+ *
+ * On entry *size is the byte limit for the scan.  On return *size holds
+ * the total b_bcount of the buffers that were accepted (always 0 when
+ * LFS_NO_PAGEMOVE is defined).
+ *
+ * Returns a pointer to the first entry that stops the scan, i.e. a
+ * buffer with B_CALL set or whose b_bcount is not a multiple of NBPG,
+ * or NULL if the scan ended (NULL entry, size limit reached, or nblocks
+ * buffers examined) without meeting such a buffer.  When
+ * LFS_NO_PAGEMOVE is defined, bpp is returned without examining any
+ * buffer.
+ */
+static struct buf **
+lookahead_pagemove(struct buf **bpp, int nblocks, size_t *size)
+{
+	size_t maxsize;
+#ifndef LFS_NO_PAGEMOVE
+	struct buf *bp;
+#endif
+
+	/* *size is both the input limit and the output byte count */
+	maxsize = *size;
+	*size = 0;
+#ifdef LFS_NO_PAGEMOVE
+	return bpp;
+#else
+	while((bp = *bpp) != NULL && *size < maxsize && nblocks--) {
+		/* Stop at a buffer that has B_CALL set */
+		if(bp->b_flags & B_CALL)
+			return bpp;
+		/* Stop at a buffer that is not a whole number of pages */
+		if(bp->b_bcount % NBPG)
+			return bpp;
+		*size += bp->b_bcount;
+		++bpp;
+	}
+	/* Nothing in the scanned range stopped the scan */
+	return NULL;
+#endif
+}
+
+/*
+ * NOTE(review): the definitions below look like local copies of the
+ * buffer cache's private free-queue and hash declarations (hence the
+ * XXX markers) -- confirm they match the buffer cache implementation.
+ * lfs_newclusterbuf() relies on bufqueues[BQ_EMPTY], bremhash(),
+ * binshash() and invalhash.
+ */
+#define BQUEUES 4 /* XXX */
+#define BQ_EMPTY 3 /* XXX */
+extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
+
+#define	BUFHASH(dvp, lbn)	\
+	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
+extern LIST_HEAD(bufhashhdr, buf) invalhash;
+/*
+ * Insq/Remq for the buffer hash lists.
+ */
+#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
+#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)
+
+/*
+ * Allocate and set up a buffer header that will carry one write cluster.
+ *
+ * fs   - file system the cluster belongs to; recorded in the cluster
+ *        record so the completion callback can find it.
+ * vp   - vnode the new buffer is associated with (via bgetvp()).
+ * addr - stored in both b_blkno and b_lblkno of the new buffer.
+ * n    - number of buffer pointers to allocate room for in cl->bpp.
+ *
+ * A struct lfs_cluster and its buffer pointer array are allocated from
+ * M_SEGMENT with M_WAITOK.  The returned buffer has B_CALL set, b_iodone
+ * pointing at lfs_cluster_callback, b_bcount == 0, and the cluster
+ * record stored in b_saveaddr (the previous b_saveaddr is kept in
+ * cl->saveaddr).
+ */
+static struct buf *
+lfs_newclusterbuf(struct lfs *fs, struct vnode *vp, daddr_t addr, int n)
+{
+	struct lfs_cluster *cl;
+	struct buf **bpp, *bp;
+	int s;
+
+	/* Cluster record plus room for n component buffer pointers */
+	cl = (struct lfs_cluster *)malloc(sizeof(*cl), M_SEGMENT, M_WAITOK);
+	bpp = (struct buf **)malloc(n*sizeof(*bpp), M_SEGMENT, M_WAITOK);
+	memset(cl,0,sizeof(*cl));
+	cl->fs = fs;
+	cl->bpp = bpp;
+	cl->bufcount = 0;
+	cl->bufsize = 0;
+
+	/* Get an empty buffer header, or maybe one with something on it */
+	s = splbio();
+	if((bp = bufqueues[BQ_EMPTY].tqh_first) != NULL) {
+		bremfree(bp);
+		/* clear out various other fields */
+		bp->b_flags = B_BUSY;
+		bp->b_dev = NODEV;
+		bp->b_blkno = bp->b_lblkno = 0;
+		bp->b_error = 0;
+		bp->b_resid = 0;
+		bp->b_bcount = 0;
+		
+		/* nuke any credentials we were holding */
+		/* XXXXXX */
+	
+		bremhash(bp);
+
+		/* disassociate us from our vnode, if we had one... */
+		if (bp->b_vp)
+			brelvp(bp);
+	}
+	splx(s);
+	/* The empty queue had nothing: keep asking getnewbuf() for one */
+	while (!bp)
+		bp = getnewbuf(0, 0);
+	/* Attach the header to vp and put it on the invalhash list */
+	s = splbio();
+	bgetvp(vp, bp);
+	binshash(bp,&invalhash);
+	splx(s);
+	bp->b_bcount = 0;
+	bp->b_blkno = bp->b_lblkno = addr;
+
+	/* Route I/O completion to the cluster callback */
+	bp->b_flags |= B_CALL;
+	bp->b_iodone = lfs_cluster_callback;
+	cl->saveaddr = bp->b_saveaddr; /* XXX is this ever used? */
+	/* The callback retrieves the cluster record from b_saveaddr */
+	bp->b_saveaddr = (caddr_t)cl;
+
+	return bp;
+}
+
 int
 lfs_writeseg(struct lfs *fs, struct segment *sp)
 {
-	struct buf **bpp, *bp, *cbp, *newbp;
+	struct buf **bpp, *bp, *cbp, *newbp, **pmlastbpp;
 	SEGUSE *sup;
 	SEGSUM *ssp;
 	dev_t i_dev;
 	char *datap, *dp;
 	int do_again, i, nblocks, s;
 	size_t el_size;
-#ifdef LFS_TRACK_IOS
-	int j;
-#endif
+ 	struct lfs_cluster *cl;
 	int (*strategy)(void *);
 	struct vop_strategy_args vop_strategy_a;
 	u_short ninos;
@ -1343,6 +1504,9 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 	char *p;
 	struct vnode *vp;
 	struct inode *ip;
+	size_t pmsize;
+	int use_pagemove;
+	daddr_t pseg_daddr;
 	daddr_t *daddrp;
 	int changed;
 #if defined(DEBUG) && defined(LFS_PROPELLER)
@ -1353,7 +1517,8 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 	if (propeller == 4)
 		propeller = 0;
 #endif
-	
+	pseg_daddr = (*(sp->bpp))->b_blkno;
+
 	/*
 	 * If there are no buffers other than the segment summary to write
 	 * and it is not a checkpoint, don't do anything.  On a checkpoint,
@ -1402,7 +1567,7 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 	fs->lfs_avail -= btofsb(fs, fs->lfs_sumsize);

 	do_again = !(bp->b_flags & B_GATHERED);
-	(void)VOP_BWRITE(bp); /* Ifile */
+	(void)LFS_BWRITE_LOG(bp); /* Ifile */
 	/*
 	 * Mark blocks B_BUSY, to prevent them from being changed between
 	 * the checksum computation and the actual write.
@ -1488,7 +1653,6 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 			} else {
 				bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
 						 B_GATHERED);
-				LFS_UNLOCK_BUF(bp);
 				if (bp->b_flags & B_CALL) {
 					lfs_freebuf(bp);
 					bp = NULL;
@ -1496,6 +1660,7 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 					bremfree(bp);
 					bp->b_flags |= B_DONE;
 					reassignbuf(bp, bp->b_vp);
+					LFS_UNLOCK_BUF(bp);
 					brelse(bp);
 				}
 			}
@ -1533,6 +1698,10 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 		ssp->ss_serial = ++fs->lfs_serial;
 		ssp->ss_ident  = fs->lfs_ident;
 	}
+#ifndef LFS_MALLOC_SUMMARY
+	/* Set the summary block busy too */
+	(*(sp->bpp))->b_flags |= B_BUSY;
+#endif
 	ssp->ss_datasum = cksum(datap, (nblocks - 1) * el_size);
 	ssp->ss_sumsum =
 	    cksum(&ssp->ss_datasum, fs->lfs_sumsize - sizeof(ssp->ss_sumsum));
@ -1548,51 +1717,85 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 	strategy = devvp->v_op[VOFFSET(vop_strategy)];

 	/*
-	 * When we simply write the blocks we lose a rotation for every block
-	 * written.  To avoid this problem, we allocate memory in chunks, copy
-	 * the buffers into the chunk and write the chunk.  CHUNKSIZE is the
-	 * largest size I/O devices can handle.
-	 * When the data is copied to the chunk, turn off the B_LOCKED bit
-	 * and brelse the buffer (which will move them to the LRU list).  Add
-	 * the B_CALL flag to the buffer header so we can count I/O's for the
-	 * checkpoints and so we can release the allocated memory.
-	 *
-	 * XXX
-	 * This should be removed if the new virtual memory system allows us to
-	 * easily make the buffers contiguous in kernel memory and if that's
-	 * fast enough.
+  	 * When we simply write the blocks we lose a rotation for every block
+	 * written.  To avoid this problem, we use pagemove to cluster
+	 * the buffers into a chunk and write the chunk.  CHUNKSIZE is the
+  	 * largest size I/O devices can handle.
+  	 *
+	 * XXX - right now MAXPHYS is only 64k; could it be larger?
 	 */

 #define CHUNKSIZE MAXPHYS

 	if (devvp == NULL)
 		panic("devvp is NULL");
-	for (bpp = sp->bpp,i = nblocks; i;) {
-		cbp = lfs_newbuf(fs, devvp, (*bpp)->b_blkno, CHUNKSIZE);
+	for (bpp = sp->bpp, i = nblocks; i;) {
+		cbp = lfs_newclusterbuf(fs, devvp, (*bpp)->b_blkno, i);
+		cl = (struct lfs_cluster *)cbp->b_saveaddr;
+
 		cbp->b_dev = i_dev;
 		cbp->b_flags |= B_ASYNC | B_BUSY;
 		cbp->b_bcount = 0;

-#ifdef DIAGNOSTIC
-		if (dtosn(fs, dbtofsb(fs, (*bpp)->b_blkno) + btofsb(fs, (*bpp)->b_bcount) - 1) !=
+		/*
+		 * Find out if we can use pagemove to build the cluster,
+		 * or if we are stuck using malloc/copy.  If this is the
+		 * first cluster, set the shift flag (see below).
+		 */
+		pmsize = CHUNKSIZE;
+		use_pagemove = 0;
+		if(bpp == sp->bpp) {
+			/* Summary blocks have to get special treatment */
+			pmlastbpp = lookahead_pagemove(bpp + 1, i - 1, &pmsize);
+			if(pmsize >= CHUNKSIZE - fs->lfs_sumsize ||
+			   pmlastbpp == NULL) {
+				use_pagemove = 1;
+				cl->flags |= LFS_CL_SHIFT;
+			} else {
+				/*
+				 * If we're not using pagemove, we have
+				 * to copy the summary down to the bottom
+				 * end of the block.
+				 */
+#ifndef LFS_MALLOC_SUMMARY
+				memcpy((*bpp)->b_data, (*bpp)->b_data +
+				       NBPG - fs->lfs_sumsize,
+				       fs->lfs_sumsize);
+#endif /* LFS_MALLOC_SUMMARY */
+			}
+		} else {
+			pmlastbpp = lookahead_pagemove(bpp, i, &pmsize);
+			if(pmsize >= CHUNKSIZE || pmlastbpp == NULL) {
+				use_pagemove = 1;
+			}
+		}
+		if(use_pagemove == 0) {
+			cl->flags |= LFS_CL_MALLOC;
+			cl->olddata = cbp->b_data;
+			cbp->b_data = malloc(CHUNKSIZE, M_SEGMENT, M_WAITOK);
+		}
+#if defined(DEBUG) && defined(DIAGNOSTIC)
+		if(dtosn(fs, dbtofsb(fs, (*bpp)->b_blkno + btodb((*bpp)->b_bcount - 1))) !=
 		   dtosn(fs, dbtofsb(fs, cbp->b_blkno))) {
+			printf("block at %x (%d), cbp at %x (%d)\n",
+				(*bpp)->b_blkno, dtosn(fs, dbtofsb(fs, (*bpp)->b_blkno)),
+			       cbp->b_blkno, dtosn(fs, dbtofsb(fs, cbp->b_blkno)));
 			panic("lfs_writeseg: Segment overwrite");
 		}
 #endif

+		/*
+		 * Construct the cluster.
+		 */
 		s = splbio();
-		if (fs->lfs_iocount >= LFS_THROTTLE) {
-			tsleep(&fs->lfs_iocount, PRIBIO+1, "lfs throttle", 0);
+		while (fs->lfs_iocount >= LFS_THROTTLE) {
+#ifdef DEBUG_LFS
+			printf("[%d]", fs->lfs_iocount);
+#endif
+			tsleep(&fs->lfs_iocount, PRIBIO+1, "lfs_throttle", 0);
 		}
 		++fs->lfs_iocount;
-#ifdef LFS_TRACK_IOS
-		for (j = 0; j < LFS_THROTTLE; j++) {
-			if (fs->lfs_pending[j] == LFS_UNUSED_DADDR) {
-				fs->lfs_pending[j] = dbtofsb(fs, cbp->b_blkno);
-				break;
-			}
-		}
-#endif /* LFS_TRACK_IOS */
+
 		for (p = cbp->b_data; i && cbp->b_bcount < CHUNKSIZE; i--) {
 			bp = *bpp;

@ -1608,26 +1811,54 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 			if ((bp->b_flags & (B_CALL|B_INVAL)) == (B_CALL|B_INVAL)) {
 				if (copyin(bp->b_saveaddr, p, bp->b_bcount))
 					panic("lfs_writeseg: copyin failed [2]");
-			} else
+			} else if (use_pagemove) {
+				pagemove(bp->b_data, p, bp->b_bcount);
+				cbp->b_bufsize += bp->b_bcount;
+				bp->b_bufsize -= bp->b_bcount;
+  			} else {
 				bcopy(bp->b_data, p, bp->b_bcount);
-			p += bp->b_bcount;
-			cbp->b_bcount += bp->b_bcount;
-			LFS_UNLOCK_BUF(bp);
-			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI |
-					 B_GATHERED);
-			vp = bp->b_vp;
-			if (bp->b_flags & B_CALL) {
-				/* if B_CALL, it was created with newbuf */
-				lfs_freebuf(bp);
-				bp = NULL;
+				/* printf("copy in %p\n", bp->b_data); */
+  			}
+  
+			/*
+			 * XXX If we are *not* shifting, the summary
+			 * block is only fs->lfs_sumsize.  Otherwise,
+			 * it is NBPG but shifted.
+			 */
+			if(bpp == sp->bpp && !(cl->flags & LFS_CL_SHIFT)) {
+				p += fs->lfs_sumsize;
+				cbp->b_bcount += fs->lfs_sumsize;
+				cl->bufsize += fs->lfs_sumsize;
 			} else {
-				bremfree(bp);
-				bp->b_flags |= B_DONE;
-				if (vp)
-					reassignbuf(bp, vp);
-				brelse(bp);
+				p += bp->b_bcount;
+				cbp->b_bcount += bp->b_bcount;
+				cl->bufsize += bp->b_bcount;
 			}
+			bp->b_flags &= ~(B_ERROR | B_READ | B_DELWRI | B_DONE);
+			cl->bpp[cl->bufcount++] = bp;
+			vp = bp->b_vp;
+			++vp->v_numoutput;

+			/*
+			 * Although it cannot be freed for reuse before the
+			 * cluster is written to disk, this buffer does not
+			 * need to be held busy.  Therefore we unbusy it,
+			 * while leaving it on the locked list.  It will
+			 * be freed or requeued by the callback depending
+			 * on whether it has had B_DELWRI set again in the
+			 * meantime.
+			 *
+			 * If we are using pagemove, we have to hold the block
+			 * busy to prevent its contents from changing before
+			 * it hits the disk, and invalidating the checksum.
+			 */
+			bp->b_flags &= ~(B_DELWRI | B_READ | B_ERROR);
+#ifdef LFS_MNOBUSY
+			if (cl->flags & LFS_CL_MALLOC) {
+				if (!(bp->b_flags & B_CALL))
+					brelse(bp); /* Still B_LOCKED */
+			}
+#endif
 			bpp++;

 			/*
@ -1641,10 +1872,10 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 			 * of blocks are present (traverse the dirty list?)
 			 */
 			if ((i == 1 ||
-			    (i > 1 && vp && *bpp && (*bpp)->b_vp != vp)) &&
-			   (bp = vp->v_dirtyblkhd.lh_first) != NULL &&
-			   vp->v_mount == fs->lfs_ivnode->v_mount)
-			{
+			     (i > 1 && vp && *bpp && (*bpp)->b_vp != vp)) &&
+			    (bp = vp->v_dirtyblkhd.lh_first) != NULL &&
+			    vp->v_mount == fs->lfs_ivnode->v_mount)
+  			{
 				ip = VTOI(vp);
 #ifdef DEBUG_LFS
 				printf("lfs_writeseg: marking ino %d\n",
@ -1660,29 +1891,21 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
 		++cbp->b_vp->v_numoutput;
 		splx(s);
 		/*
-		 * XXXX This is a gross and disgusting hack.  Since these
-		 * buffers are physically addressed, they hang off the
-		 * device vnode (devvp).  As a result, they have no way
-		 * of getting to the LFS superblock or lfs structure to
-		 * keep track of the number of I/O's pending.  So, I am
-		 * going to stuff the fs into the saveaddr field of
-		 * the buffer (yuk).
+		 * In order to include the summary in a clustered block,
+		 * it may be necessary to shift the block forward (since
+		 * summary blocks are in general smaller than can be
+		 * addressed by pagemove()).  After the write, the block
+		 * will be corrected before disassembly.
 		 */
-		cbp->b_saveaddr = (caddr_t)fs;
+		if(cl->flags & LFS_CL_SHIFT) {
+			cbp->b_data += (NBPG - fs->lfs_sumsize);
+			cbp->b_bcount -= (NBPG - fs->lfs_sumsize);
+		}
 		vop_strategy_a.a_desc = VDESC(vop_strategy);
 		vop_strategy_a.a_bp = cbp;
 		(strategy)(&vop_strategy_a);
 	}
-#if 1 || defined(DEBUG)
-	/*
-	 * After doing a big write, we recalculate how many buffers are
-	 * really still left on the locked queue.
-	 */
-	s = splbio();
-	lfs_countlocked(&locked_queue_count, &locked_queue_bytes);
-	splx(s);
-	wakeup(&locked_queue_count);
-#endif /* 1 || DEBUG */
+
 	if (lfs_dostats) {
 		++lfs_stats.psegwrites;
 		lfs_stats.blocktot += nblocks - 1;
@ -1798,28 +2021,8 @@ lfs_match_tindir(struct lfs *fs, struct buf *bp)
 void
 lfs_callback(struct buf *bp)
 {
-	struct lfs *fs;
-#ifdef LFS_TRACK_IOS
-	int j;
-#endif
-
-	fs = (struct lfs *)bp->b_saveaddr;
-#ifdef DIAGNOSTIC
-	if (fs->lfs_iocount == 0)
-		panic("lfs_callback: zero iocount\n");
-#endif
-	if (--fs->lfs_iocount < LFS_THROTTLE)
-		wakeup(&fs->lfs_iocount);
-#ifdef LFS_TRACK_IOS
-	for (j = 0; j < LFS_THROTTLE; j++) {
-		if (fs->lfs_pending[j] == dbtofsb(fs, bp->b_blkno)) {
-			fs->lfs_pending[j] = LFS_UNUSED_DADDR;
-			wakeup(&(fs->lfs_pending[j]));
-			break;
-		}
-	}
-#endif /* LFS_TRACK_IOS */
-
+	/* struct lfs *fs; */
+	/* fs = (struct lfs *)bp->b_saveaddr; */
+	/*
+	 * All that remains here is freeing the buffer; this function no
+	 * longer touches fs->lfs_iocount (lfs_cluster_callback() does the
+	 * decrement and wakeup for cluster buffers).
+	 */
 	lfs_freebuf(bp);
 }

@ -1836,6 +2039,122 @@ lfs_supercallback(struct buf *bp)
 	lfs_freebuf(bp);
 }

+/*
+ * I/O completion callback for a cluster buffer set up by
+ * lfs_newclusterbuf(), which installs this function as b_iodone and
+ * stores the struct lfs_cluster in b_saveaddr.
+ *
+ * The callback restores b_saveaddr, undoes the summary shift if
+ * LFS_CL_SHIFT is set, and then walks the cluster's component buffers
+ * from last to first.  For each one it moves the pages back with
+ * pagemove() unless the cluster was built with malloc/copy
+ * (LFS_CL_MALLOC), propagates any write error, unlocks the buffer if it
+ * was not re-dirtied, clears B_GATHERED and completes it with biodone().
+ * Finally the cluster buffer is released, the cluster record and its
+ * pointer array are freed, and fs->lfs_iocount is decremented, waking
+ * anyone sleeping on it once it drops below LFS_THROTTLE.
+ */
+static void
+lfs_cluster_callback(struct buf *bp)
+{
+	struct lfs_cluster *cl;
+	struct lfs *fs;
+	struct buf *tbp;
+	struct vnode *vp;
+	int error=0;
+	char *cp;
+	extern int locked_queue_count;
+	extern long locked_queue_bytes;
+
+	/* Remember a write error so it can be copied to every component */
+	if(bp->b_flags & B_ERROR)
+		error = bp->b_error;
+
+	/* Recover the cluster record and put back the saved b_saveaddr */
+	cl = (struct lfs_cluster *)bp->b_saveaddr;
+	fs = cl->fs;
+	bp->b_saveaddr = cl->saveaddr;
+
+	/* If shifted, shift back now */
+	if(cl->flags & LFS_CL_SHIFT) {
+		bp->b_data -= (NBPG - fs->lfs_sumsize);
+		bp->b_bcount += (NBPG - fs->lfs_sumsize);
+	}
+
+	/* cp starts at the end of the cluster data and walks backwards */
+	cp = (char *)bp->b_data + cl->bufsize;
+	/* Put the pages back, and release the buffer */
+	while(cl->bufcount--) {
+		tbp = cl->bpp[cl->bufcount];
+		if(!(cl->flags & LFS_CL_MALLOC)) {
+			cp -= tbp->b_bcount;
+			printf("pm(%p,%p,%lx)",cp,tbp->b_data,tbp->b_bcount);
+			pagemove(cp, tbp->b_data, tbp->b_bcount);
+			bp->b_bufsize -= tbp->b_bcount;
+			tbp->b_bufsize += tbp->b_bcount;
+		}
+		if(error) {
+			tbp->b_flags |= B_ERROR;
+			tbp->b_error = error;
+		}
+
+		/*
+		 * We're done with tbp.  If it has not been re-dirtied since
+		 * the cluster was written, free it.  Otherwise, keep it on
+		 * the locked list to be written again.
+		 */
+		if ((tbp->b_flags & (B_LOCKED | B_DELWRI)) == B_LOCKED)
+			LFS_UNLOCK_BUF(tbp);
+		tbp->b_flags &= ~B_GATHERED;
+
+		LFS_BCLEAN_LOG(fs, tbp);
+
+		vp = tbp->b_vp;
+		/* Segment summary for a shifted cluster */
+		if(!cl->bufcount && (cl->flags & LFS_CL_SHIFT))
+			tbp->b_flags |= B_INVAL;
+		if(!(tbp->b_flags & B_CALL)) {
+			bremfree(tbp);
+			if(vp)
+				reassignbuf(tbp, vp);
+			tbp->b_flags |= B_ASYNC; /* for biodone */
+		}
+#ifdef DIAGNOSTIC
+		if (tbp->b_flags & B_DONE) {
+			printf("blk %d biodone already (flags %lx)\n",
+				cl->bufcount, (long)tbp->b_flags);
+		}
+#endif
+		/* Only complete buffers that are busy or have a callback */
+		if (tbp->b_flags & (B_BUSY | B_CALL)) {
+			biodone(tbp);
+		}
+	}
+
+	/* Fix up the cluster buffer, and release it */
+	if(!(cl->flags & LFS_CL_MALLOC) && bp->b_bufsize) {
+		printf("PM(%p,%p,%lx)", (char *)bp->b_data + bp->b_bcount,
+			 (char *)bp->b_data, bp->b_bufsize);
+		pagemove((char *)bp->b_data + bp->b_bcount,
+			 (char *)bp->b_data, bp->b_bufsize);
+	}
+	if(cl->flags & LFS_CL_MALLOC) {
+		/* Drop the malloc'ed chunk and restore the original data */
+		free(bp->b_data, M_SEGMENT);
+		bp->b_data = cl->olddata;
+	}
+	bp->b_bcount = 0;
+	bp->b_iodone = NULL;
+	bp->b_flags &= ~B_DELWRI;
+	bp->b_flags |= B_DONE;
+	reassignbuf(bp, bp->b_vp);
+	brelse(bp);
+
+	/* fs was copied out of cl above, so cl can be freed before use */
+	free(cl->bpp, M_SEGMENT);
+	free(cl, M_SEGMENT);
+
+#ifdef DIAGNOSTIC
+	if (fs->lfs_iocount == 0)
+		panic("lfs_cluster_callback: zero iocount\n");
+#endif
+	/* One fewer cluster in flight; let throttled writers continue */
+	if (--fs->lfs_iocount < LFS_THROTTLE)
+		wakeup(&fs->lfs_iocount);
+#if 0
+	if (fs->lfs_iocount == 0) {
+		/*
+		 * XXX - do we really want to do this in a callback?
+		 *
+		 * Vinvalbuf can move locked buffers off the locked queue
+		 * and we have no way of knowing about this.  So, after
+		 * doing a big write, we recalculate how many buffers are
+		 * really still left on the locked queue.
+		 */
+		lfs_countlocked(&locked_queue_count, &locked_queue_bytes, "lfs_cluster_callback");
+		wakeup(&locked_queue_count);
+	}
+#endif
+}
+
 /*
 * Shellsort (diminishing increment sort) from Data Structures and
 * Algorithms, Aho, Hopcroft and Ullman, 1983 Edition, page 290;
--- a/sys/ufs/lfs/lfs_subr.c
+++ b/sys/ufs/lfs/lfs_subr.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_subr.c,v 1.21 2001/11/23 21:44:28 chs Exp $	*/
+/*	$NetBSD: lfs_subr.c,v 1.22 2002/05/14 20:03:54 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.21 2001/11/23 21:44:28 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.22 2002/05/14 20:03:54 perseant Exp $");

 #include <sys/param.h>
 #include <sys/systm.h>
@ -175,9 +175,14 @@ lfs_segunlock(struct lfs *fs)
 	struct segment *sp;
 	unsigned long sync, ckp;
 	int s;
+	struct buf *bp;
 	struct vnode *vp;
 	struct mount *mp;
 	extern int lfs_dirvcount;
+#ifdef LFS_MALLOC_SUMMARY
+	extern int locked_queue_count;
+	extern long locked_queue_bytes;
+#endif
 	
 	sp = fs->lfs_sp;

@ -207,8 +212,10 @@ lfs_segunlock(struct lfs *fs)
 		     vp != NULL;
 		     vp = vp->v_mntvnodes.le_next) {
 #endif
-			if (vp->v_mount != mp)
+			if (vp->v_mount != mp) {
+				printf("lfs_segunlock: starting over\n");
 				goto loop;
+			}
 			if (vp->v_type == VNON)
 				continue;
 			if (lfs_vref(vp))
@ -239,7 +246,18 @@ lfs_segunlock(struct lfs *fs)
 		if (sp->bpp != sp->cbpp) {
 			/* Free allocated segment summary */
 			fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize);
-                        lfs_freebuf(*sp->bpp);
+			bp = *sp->bpp;
+#ifdef LFS_MALLOC_SUMMARY
+			lfs_freebuf(bp);
+#else
+			s = splbio();
+			bremfree(bp);
+			splx(s);
+			bp->b_flags |= B_DONE|B_INVAL;
+			bp->b_flags &= ~B_DELWRI;
+			reassignbuf(bp,bp->b_vp);
+			brelse(bp);
+#endif
 		} else
 			printf ("unlock to 0 with no summary");

@ -254,7 +272,14 @@ lfs_segunlock(struct lfs *fs)
 		 * sleep.
 		 */
 		s = splbio();
-		--fs->lfs_iocount;
+		if (--fs->lfs_iocount < LFS_THROTTLE)
+			wakeup(&fs->lfs_iocount);
+		if(fs->lfs_iocount == 0) {
+			lfs_countlocked(&locked_queue_count,
+					&locked_queue_bytes, "lfs_segunlock");
+			wakeup(&locked_queue_count);
+			wakeup(&fs->lfs_iocount);
+		}
 		/*
 		 * We let checkpoints happen asynchronously.  That means
 		 * that during recovery, we have to roll forward between
--- a/sys/ufs/lfs/lfs_syscalls.c
+++ b/sys/ufs/lfs/lfs_syscalls.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_syscalls.c,v 1.64 2002/05/12 23:06:29 matt Exp $	*/
+/*	$NetBSD: lfs_syscalls.c,v 1.65 2002/05/14 20:03:54 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.64 2002/05/12 23:06:29 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.65 2002/05/14 20:03:54 perseant Exp $");

 #define LFS		/* for prototypes in syscallargs.h */

@ -100,7 +100,7 @@ __KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.64 2002/05/12 23:06:29 matt Exp $
 /* Max block count for lfs_markv() */
 #define MARKV_MAXBLKCNT		65536

-struct buf *lfs_fakebuf(struct vnode *, int, size_t, caddr_t);
+struct buf *lfs_fakebuf(struct lfs *, struct vnode *, int, size_t, caddr_t);
 int lfs_fasthashget(dev_t, ino_t, int *, struct vnode **);

 int debug_cleaner = 0; 
@ -258,9 +258,6 @@ lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
 #ifdef CHECK_COPYIN
 	int i;
 #endif /* CHECK_COPYIN */
-#ifdef LFS_TRACK_IOS
-	int j;
-#endif
 	int numlocked = 0, numrefed = 0;
 	ino_t maxino;

@ -311,23 +308,6 @@ lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
 		if (blkp->bi_daddr == LFS_FORCE_WRITE)
 			printf("lfs_markv: warning: force-writing ino %d lbn %d\n",
 			       blkp->bi_inode, blkp->bi_lbn);
-#ifdef LFS_TRACK_IOS
-		/*
-		 * If there is I/O on this segment that is not yet complete,
-		 * the cleaner probably does not have the right information.
-		 * Send it packing.
-		 */
-		for (j = 0; j < LFS_THROTTLE; j++) {
-			if (fs->lfs_pending[j] != LFS_UNUSED_DADDR
-			   && dtosn(fs,fs->lfs_pending[j]) == dtosn(fs,blkp->bi_daddr)
-			   && blkp->bi_daddr != LFS_FORCE_WRITE)
-			{
-				printf("lfs_markv: attempt to clean pending segment? (#%d)\n",
-				       dtosn(fs, fs->lfs_pending[j]));
-				/* return (EBUSY); */
-			}
-		}
-#endif /* LFS_TRACK_IOS */
 		/* Bounds-check incoming data, avoid panic for failed VGET */
 		if (blkp->bi_inode <= 0 || blkp->bi_inode >= maxino) {
 			error = EINVAL;
@ -493,7 +473,7 @@ lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
 		}
 		if (ip->i_number != LFS_IFILE_INUM && blkp->bi_lbn >= 0) {
 			/* Data Block */
-			bp = lfs_fakebuf(vp, blkp->bi_lbn,
+			bp = lfs_fakebuf(fs, vp, blkp->bi_lbn,
 					 blkp->bi_size, blkp->bi_bp);
 			/* Pretend we used bread() to get it */
 			bp->b_blkno = fsbtodb(fs, blkp->bi_daddr);
@ -716,9 +696,6 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
 	ufs_daddr_t v_daddr;
 	int cnt, error, need_unlock = 0;
 	int numlocked = 0, numrefed = 0;
-#ifdef LFS_TRACK_IOS
-	int j;
-#endif

 	lfs_cleaner_pid = p->p_pid;
 	
@ -748,24 +725,6 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
 			return (EBUSY);
 		}
 #endif /* DEBUG */
-#ifdef LFS_TRACK_IOS
-		/*
-		 * If there is I/O on this segment that is not yet complete,
-		 * the cleaner probably does not have the right information.
-		 * Send it packing.
-		 */
-		for (j = 0; j < LFS_THROTTLE; j++) {
-			if (fs->lfs_pending[j] != LFS_UNUSED_DADDR
-			   && dtosn(fs,fs->lfs_pending[j]) == dtosn(fs,blkp->bi_daddr))
-			{
-				printf("lfs_bmapv: attempt to clean pending segment? (#%d)\n",
-				       dtosn(fs, fs->lfs_pending[j]));
-				vfs_unbusy(mntp);
-				return (EBUSY);
-			}
-		}
-
-#endif /* LFS_TRACK_IOS */
 		/*
 		 * Get the IFILE entry (only once) and see if the file still
 		 * exists.
@ -939,14 +898,23 @@ sys_lfs_segclean(struct proc *p, void *v, register_t *retval)
 	
 	if ((error = vfs_busy(mntp, LK_NOWAIT, NULL)) != 0) 
 		return (error);
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_seglock(fs, SEGM_PROT);
+#endif
 	LFS_SEGENTRY(sup, fs, SCARG(uap, segment), bp);
 	if (sup->su_flags & SEGUSE_ACTIVE) {
 		brelse(bp);
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+		lfs_segunlock(fs);
+#endif
 		vfs_unbusy(mntp);
 		return (EBUSY);
 	}
 	if (!(sup->su_flags & SEGUSE_DIRTY)) {
 		brelse(bp);
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+		lfs_segunlock(fs);
+#endif
 		vfs_unbusy(mntp);
 		return (EALREADY);
 	}
@ -964,7 +932,7 @@ sys_lfs_segclean(struct proc *p, void *v, register_t *retval)
 	if (fs->lfs_dmeta < 0)
 		fs->lfs_dmeta = 0;
 	sup->su_flags &= ~SEGUSE_DIRTY;
-	(void) VOP_BWRITE(bp);
+	(void) LFS_BWRITE_LOG(bp);
 	
 	LFS_CLEANERINFO(cip, fs, bp);
 	++cip->clean;
@ -972,8 +940,11 @@ sys_lfs_segclean(struct proc *p, void *v, register_t *retval)
 	fs->lfs_nclean = cip->clean;
 	cip->bfree = fs->lfs_bfree;
 	cip->avail = fs->lfs_avail - fs->lfs_ravail;
-	(void) VOP_BWRITE(bp);
+	(void) LFS_BWRITE_LOG(bp);
 	wakeup(&fs->lfs_avail);
+#ifdef LFS_AGGRESSIVE_SEGLOCK
+	lfs_segunlock(fs);
+#endif
 	vfs_unbusy(mntp);

 	return (0);
@ -1100,10 +1071,11 @@ int
 lfs_fastvget(struct mount *mp, ino_t ino, ufs_daddr_t daddr, struct vnode **vpp, struct dinode *dinp, int *need_unlock)
 {
 	struct inode *ip;
+	struct dinode *dip;
 	struct vnode *vp;
 	struct ufsmount *ump;
 	dev_t dev;
-	int error;
+	int error, retries;
 	struct buf *bp;
 	struct lfs *fs;
 	
@ -1179,6 +1151,8 @@ lfs_fastvget(struct mount *mp, ino_t ino, ufs_daddr_t daddr, struct vnode **vpp,
 		if (ip->i_number != ino)
 			panic("lfs_fastvget: I was fed the wrong inode!");
 	} else {
+		retries = 0;
+	    again:
 		error = bread(ump->um_devvp, fsbtodb(fs, daddr), fs->lfs_ibsize,
 			      NOCRED, &bp);
 		if (error) {
@ -1197,7 +1171,18 @@ lfs_fastvget(struct mount *mp, ino_t ino, ufs_daddr_t daddr, struct vnode **vpp,
 			*vpp = NULL;
 			return (error);
 		}
-		ip->i_din.ffs_din = *lfs_ifind(fs, ino, bp);
+		dip = lfs_ifind(ump->um_lfs, ino, bp);
+		if (dip == NULL) {
+			/* Assume write has not completed yet; try again */
+			bp->b_flags |= B_INVAL;
+			brelse(bp);
+			++retries;
+			if (retries > LFS_IFIND_RETRIES)
+				panic("lfs_fastvget: dinode not found");
+			printf("lfs_fastvget: dinode not found, retrying...\n");
+			goto again;
+		}
+		ip->i_din.ffs_din = *dip;
 		brelse(bp);
 	}
 	ip->i_ffs_effnlink = ip->i_ffs_nlink;
@ -1234,7 +1219,7 @@ lfs_fastvget(struct mount *mp, ino_t ino, ufs_daddr_t daddr, struct vnode **vpp,
 }

 struct buf *
-lfs_fakebuf(struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
+lfs_fakebuf(struct lfs *fs, struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
 {
 	struct buf *bp;
 	int error;
@ -1251,7 +1236,12 @@ lfs_fakebuf(struct vnode *vp, int lbn, size_t size, caddr_t uaddr)
 	bp->b_flags |= B_INVAL;
 	bp->b_saveaddr = uaddr;
 #endif
-
+#if 0
+	bp->b_saveaddr = (caddr_t)fs;
+	s = splbio();
+	++fs->lfs_iocount;
+	splx(s);
+#endif
 	bp->b_bufsize = size;
 	bp->b_bcount = size;
 	return (bp);
--- a/sys/ufs/lfs/lfs_vfsops.c
+++ b/sys/ufs/lfs/lfs_vfsops.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_vfsops.c,v 1.73 2002/05/12 23:06:29 matt Exp $	*/
+/*	$NetBSD: lfs_vfsops.c,v 1.74 2002/05/14 20:03:54 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.73 2002/05/12 23:06:29 matt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.74 2002/05/14 20:03:54 perseant Exp $");

 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@ -165,6 +165,9 @@ lfs_init()
 	 */
 	pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
 		  "lfsinopl", &pool_allocator_nointr);
+#ifdef DEBUG
+	memset(lfs_log, 0, sizeof(lfs_log));
+#endif
 }

 void
@ -436,11 +439,11 @@ update_meta(struct lfs *fs, ino_t ino, int version, ufs_daddr_t lbn,
 		}
 #endif
 		sup->su_nbytes -= size;
-		VOP_BWRITE(bp);
+		LFS_BWRITE_LOG(bp);
 	}
 	LFS_SEGENTRY(sup, fs, dtosn(fs, ndaddr), bp);
 	sup->su_nbytes += size;
-	VOP_BWRITE(bp);
+	LFS_BWRITE_LOG(bp);

 	/* Fix this so it can be released */
 	/* ip->i_lfs_effnblks = ip->i_ffs_blocks; */
@ -521,19 +524,19 @@ update_inoblk(struct lfs *fs, daddr_t offset, struct ucred *cred,
 			LFS_IENTRY(ifp, fs, dip->di_inumber, ibp);
 			daddr = ifp->if_daddr;
 			ifp->if_daddr = dbtofsb(fs, dbp->b_blkno);
-			error = VOP_BWRITE(ibp); /* Ifile */
+			error = LFS_BWRITE_LOG(ibp); /* Ifile */
 			/* And do segment accounting */
 			if (dtosn(fs, daddr) != dtosn(fs, dbtofsb(fs, dbp->b_blkno))) {
 				if (daddr > 0) {
 					LFS_SEGENTRY(sup, fs, dtosn(fs, daddr),
 						     ibp);
 					sup->su_nbytes -= DINODE_SIZE;
-					VOP_BWRITE(ibp);
+					LFS_BWRITE_LOG(ibp);
 				}
 				LFS_SEGENTRY(sup, fs, dtosn(fs, dbtofsb(fs, dbp->b_blkno)),
 					     ibp);
 				sup->su_nbytes += DINODE_SIZE;
-				VOP_BWRITE(ibp);
+				LFS_BWRITE_LOG(ibp);
 			}
 		}
 	}
@ -943,10 +946,6 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	fs->lfs_uinodes = 0;
 	fs->lfs_ravail = 0;
 	fs->lfs_sbactive = 0;
-#ifdef LFS_TRACK_IOS
-	for (i = 0; i < LFS_THROTTLE; i++)
-		fs->lfs_pending[i] = LFS_UNUSED_DADDR;
-#endif

 	/* Set up the ifile and lock aflags */
 	fs->lfs_doifile = 0;
@ -995,7 +994,6 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	}
 	fs->lfs_ivnode = vp;
 	VREF(vp);
-	vput(vp);

 	/*
 	 * Roll forward.
@ -1030,7 +1028,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 		if (!(sup->su_flags & SEGUSE_DIRTY))
 			--fs->lfs_nclean;
 		sup->su_flags |= SEGUSE_DIRTY;
-		(void) VOP_BWRITE(bp);
+		(void) LFS_BWRITE_LOG(bp);
 		while ((offset = check_segsum(fs, offset, cred, CHECK_CKSUM,
 					      &flags, p)) > 0)
 		{
@ -1040,7 +1038,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 				if (!(sup->su_flags & SEGUSE_DIRTY))
 					--fs->lfs_nclean;
 				sup->su_flags |= SEGUSE_DIRTY;
-				(void) VOP_BWRITE(bp);
+				(void) LFS_BWRITE_LOG(bp);
 			}

 #ifdef DEBUG_LFS_RFW
@ -1126,7 +1124,7 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	cip->dirty = fs->lfs_nseg - fs->lfs_nclean;
 	cip->avail = fs->lfs_avail;
 	cip->bfree = fs->lfs_bfree;
-	(void) VOP_BWRITE(bp); /* Ifile */
+	(void) LFS_BWRITE_LOG(bp); /* Ifile */

 	/*
 	 * Mark the current segment as ACTIVE, since we're going to 
@ -1134,7 +1132,22 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct proc *p)
 	 */
        LFS_SEGENTRY(sup, fs, dtosn(fs, fs->lfs_offset), bp); 
        sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
-        (void) VOP_BWRITE(bp); /* Ifile */
+        (void) LFS_BWRITE_LOG(bp); /* Ifile */
+
+	/* Now that roll-forward is done, unlock the Ifile */
+	vput(vp);
+
+	/* Comment on ifile size if it is too large */
+	if (fs->lfs_ivnode->v_size / fs->lfs_bsize > LFS_MAX_BUFS) {
+		fs->lfs_flags |= LFS_WARNED;
+		printf("lfs_mountfs: please consider increasing NBUF to at least %lld\n",
+			(long long)(fs->lfs_ivnode->v_size / fs->lfs_bsize) * (nbuf / LFS_MAX_BUFS));
+	}
+	if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES) {
+		fs->lfs_flags |= LFS_WARNED;
+		printf("lfs_mountfs: please consider increasing BUFPAGES to at least %lld\n",
+			(long long)fs->lfs_ivnode->v_size * bufpages / LFS_MAX_BYTES);
+	}

 	return (0);
 out:
@ -1198,6 +1211,20 @@ lfs_unmount(struct mount *mp, int mntflags, struct proc *p)
 	lfs_writesuper(fs, fs->lfs_sboffs[0]);
 	lfs_writesuper(fs, fs->lfs_sboffs[1]);

+	/* Warn (unless already warned) if the Ifile has outgrown the buffer cache limits */
+	if (!(fs->lfs_flags & LFS_WARNED)) {
+		if (fs->lfs_ivnode->v_size / fs->lfs_bsize > LFS_MAX_BUFS)
+			printf("lfs_unmount: please consider increasing"
+				" NBUF to at least %lld\n",
+				(long long)(fs->lfs_ivnode->v_size /
+				fs->lfs_bsize) * nbuf / LFS_MAX_BUFS);
+		if (fs->lfs_ivnode->v_size > LFS_MAX_BYTES)
+			printf("lfs_unmount: please consider increasing"
+				" BUFPAGES to at least %lld\n",
+				(long long)fs->lfs_ivnode->v_size *
+				bufpages / LFS_MAX_BYTES);
+	}
+
 	/* Finish with the Ifile, now that we're done with it */
 	vrele(fs->lfs_ivnode);
 	vgone(fs->lfs_ivnode);
@ -1300,6 +1327,7 @@ int
 lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 {
 	struct lfs *fs;
+	struct dinode *dip;
 	struct inode *ip;
 	struct buf *bp;
 	struct ifile *ifp;
@ -1307,7 +1335,7 @@ lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 	struct ufsmount *ump;
 	ufs_daddr_t daddr;
 	dev_t dev;
-	int error;
+	int error, retries;
 	struct timespec ts;

 	ump = VFSTOUFS(mp);
@ -1379,8 +1407,10 @@ lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 	ip->i_lfs = ump->um_lfs;

 	/* Read in the disk contents for the inode, copy into the inode. */
+	retries = 0;
+    again:
 	error = bread(ump->um_devvp, fsbtodb(fs, daddr), 
-		(fs->lfs_version == 1 ? fs->lfs_bsize : fs->lfs_fsize),
+		(fs->lfs_version == 1 ? fs->lfs_bsize : fs->lfs_ibsize),
 		NOCRED, &bp);
 	if (error) {
 		/*
@ -1394,7 +1424,45 @@ lfs_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 		*vpp = NULL;
 		return (error);
 	}
-	ip->i_din.ffs_din = *lfs_ifind(fs, ino, bp);
+
+	dip = lfs_ifind(fs, ino, bp);
+	if (dip == NULL) {
+		/* Assume write has not completed yet; try again */
+		bp->b_flags |= B_INVAL;
+		brelse(bp);
+		++retries;
+		if (retries > LFS_IFIND_RETRIES) {
+#ifdef DEBUG
+			/* If the seglock is held look at the bpp to see
+			   what is there anyway */
+			if (fs->lfs_seglock > 0) {
+				struct buf **bpp;
+				struct dinode *dp;
+				int i;
+
+				for (bpp = fs->lfs_sp->bpp;
+				     bpp != fs->lfs_sp->cbpp; ++bpp) {
+					if ((*bpp)->b_vp == fs->lfs_ivnode &&
+					    bpp != fs->lfs_sp->bpp) {
+						/* Inode block */
+						printf("block 0x%x: ", (*bpp)->b_blkno);
+						dp = (struct dinode *)(*bpp)->b_data;
+						for (i = 0; i < INOPB(fs); i++)
+							if (dp[i].di_u.inumber)
+								printf("%d ", dp[i].di_u.inumber);
+						printf("\n");
+					}
+				}
+			}
+#endif
+			panic("lfs_vget: dinode not found");
+		}
+		printf("lfs_vget: dinode %d not found, retrying...\n", ino);
+		(void)tsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs ifind", 1);
+		goto again;
+	}
+	ip->i_din.ffs_din = *dip;
+
 	ip->i_ffs_effnlink = ip->i_ffs_nlink;
 	ip->i_lfs_effnblks = ip->i_ffs_blocks;
 	if (fs->lfs_version > 1) {
--- a/sys/ufs/lfs/lfs_vnops.c
+++ b/sys/ufs/lfs/lfs_vnops.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_vnops.c,v 1.62 2002/04/27 01:00:46 perseant Exp $	*/
+/*	$NetBSD: lfs_vnops.c,v 1.63 2002/05/14 20:03:55 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -71,7 +71,7 @@
 */

 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.62 2002/04/27 01:00:46 perseant Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.63 2002/05/14 20:03:55 perseant Exp $");

 #include <sys/param.h>
 #include <sys/systm.h>
@ -300,11 +300,29 @@ lfs_fsync(void *v)
 	simple_lock(&vp->v_interlock);
 	error = VOP_PUTPAGES(vp, trunc_page(ap->a_offlo),
                    round_page(ap->a_offhi), PGO_CLEANIT | PGO_SYNCIO);
-	if (error) {
+	if (error)
 		return error;
+	error = VOP_UPDATE(vp, NULL, NULL,
+			   (ap->a_flags & FSYNC_WAIT) != 0 ? UPDATE_WAIT : 0);
+#ifdef DEBUG
+	/*
+	 * Diagnostic only: if a waiting fsync (e.g. from vinvalbuf) still
+	 * leaves dirty buffers after VOP_UPDATE, dump the vnode's state.
+	 */
+	if ((ap->a_flags & FSYNC_WAIT) && vp->v_dirtyblkhd.lh_first != NULL) {
+		struct buf *bp;
+
+		bp = vp->v_dirtyblkhd.lh_first;
+		printf("lfs_fsync: ino %d failed to sync\n", VTOI(vp)->i_number);
+		printf("lfs_fsync: iocount = %d\n", VTOI(vp)->i_lfs->lfs_iocount);
+		printf("lfs_fsync: flags are 0x%x, numoutput=%d\n",
+			VTOI(vp)->i_flag, vp->v_numoutput);
+		printf("lfs_fsync: writecount=%ld\n", vp->v_writecount);
+		printf("lfs_fsync: first bp: %p, flags=0x%lx, lbn=%d\n",
+			bp, bp->b_flags, bp->b_lblkno);
 	}
-	return (VOP_UPDATE(vp, NULL, NULL,
-			   (ap->a_flags & FSYNC_WAIT) != 0 ? UPDATE_WAIT : 0));
+#endif
+	return error;
 }

 /*
@ -358,7 +376,7 @@ lfs_set_dirop(struct vnode *vp)
 		lfs_check(vp, LFS_UNUSED_LBN, 0);
 	while (fs->lfs_writer || lfs_dirvcount > LFS_MAXDIROP) {
 		if (fs->lfs_writer)
-			tsleep(&fs->lfs_dirops, PRIBIO + 1, "lfs_dirop", 0);
+			tsleep(&fs->lfs_dirops, PRIBIO + 1, "lfs_sdirop", 0);
 		if (lfs_dirvcount > LFS_MAXDIROP && fs->lfs_dirops == 0) {
                	++fs->lfs_writer;
                	lfs_flush(fs, 0);