* Remove PGO_RECLAIM during lfs_putpages()'s call to genfs_putpages(),
  to avoid a livelock in the latter when reclaiming a vnode with
  dirty pages.

* Add a new segment flag, SEGM_RECLAIM, to note when a segment is
  being written for vnode reclamation, and record which inode is being
  reclaimed, to aid in forensic debugging.
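
The new bookkeeping is small; the following standalone model (not the
kernel code -- the structs are pared down to the fields involved, with
names taken from the diff below) shows how the flag and the recorded
inode number reach the on-disk segment summary:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SEGM_RECLAIM 0x0080 /* segment written to reclaim a vnode */
    #define SS_RECLAIM   0x10   /* on-disk mark of the same fact */

    struct lfs     { uint32_t lfs_reclino; };  /* inode being reclaimed */
    struct segment { uint16_t seg_flags; };    /* in-core segment state */
    struct segsum  { uint16_t ss_flags; uint32_t ss_reclino; };

    /* As in the lfs_writeseg() hunk below: copy the reclaim mark into
     * the summary so the segment can be identified later during
     * forensic debugging. */
    static void
    note_reclaim(const struct lfs *fs, const struct segment *sp,
        struct segsum *ssp)
    {
        if (sp->seg_flags & SEGM_RECLAIM) {
            ssp->ss_flags |= SS_RECLAIM;
            ssp->ss_reclino = fs->lfs_reclino;
        }
    }

    int
    main(void)
    {
        struct lfs fs = { .lfs_reclino = 1234 };
        struct segment sp = { .seg_flags = SEGM_RECLAIM };
        struct segsum ss = { 0, 0 };

        note_reclaim(&fs, &sp, &ss);
        printf("ss_flags=%#x ss_reclino=%" PRIu32 "\n",
            ss.ss_flags, ss.ss_reclino);
        return 0;
    }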

* Add a new segment flag, SEGM_SINGLE, so that opportunistic writes
  can write a single segment's worth of blocks and then stop, rather
  than writing all the way up to the cleaner's reserved number of
  segments.
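
The stop condition is simply "the write has advanced past the segment
it started in"; lfs_seglock() now records the starting segment for
this purpose.  A standalone sketch of the loop shape (the writeseg()
stand-in here is a toy, not lfs_writeseg()):

    #include <stdio.h>

    #define SEGM_SINGLE 0x0100 /* opportunistic: stop after one segment */

    struct lfs { int lfs_curseg, lfs_startseg; };

    /* Toy stand-in for lfs_writeseg(): pretend the third call fills
     * the current partial segment and advances to the next one. */
    static void
    writeseg(struct lfs *fs, int i)
    {
        if (i == 2)
            fs->lfs_curseg++;
    }

    int
    main(void)
    {
        struct lfs fs = { .lfs_curseg = 7 };
        int seg_flags = SEGM_SINGLE;

        fs.lfs_startseg = fs.lfs_curseg; /* as lfs_seglock() now does */
        for (int i = 0; i < 10; i++) {
            writeseg(&fs, i);
            /* Opportunistic writers quit once one segment's worth of
             * blocks is written, instead of writing all the way up to
             * the cleaner's reserve. */
            if ((seg_flags & SEGM_SINGLE) &&
                fs.lfs_curseg != fs.lfs_startseg) {
                printf("one segment written; stopping at i=%d\n", i);
                break;
            }
        }
        return 0;
    }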

* Add assert statements to check that mutex ownership is what it
  ought to be, mostly in lfs_putpages; fix the problems these
  uncovered.

* Don't clear VU_DIROP until the inode actually makes its way to disk,
  avoiding a problem where dirop inodes could become separated
  (uncovered by a modified version of the "ckckp" forensic regression
  test).
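
The patch implements this with a new inode flag, IN_CDIROP:
lfs_writeinode() sets it once the inode has been copied into a
segment, and lfs_unmark_dirop() clears VU_DIROP only for inodes that
carry it.  A standalone model of that handshake (flag values from the
diff; the vnode and inode are merged into one struct for brevity):

    #include <stdio.h>

    #define IN_ADIROP 0x0200 /* dirop in progress */
    #define IN_CDIROP 0x4000 /* dirop completed, pending i/o */
    #define VU_DIROP  0x01   /* vnode participates in a dirop */

    struct inode { int i_flag; int v_uflag; };

    /* lfs_writeinode(): the inode is on its way to disk, so its
     * dirop may be retired once the write completes. */
    static void
    write_inode(struct inode *ip)
    {
        ip->i_flag |= IN_CDIROP;
    }

    /* lfs_unmark_dirop(): only inodes that actually reached disk
     * (IN_CDIROP set, no new dirop started) lose VU_DIROP, so a
     * group of dirop inodes can no longer become separated. */
    static void
    unmark_dirop(struct inode *ip)
    {
        if ((ip->i_flag & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) {
            ip->v_uflag &= ~VU_DIROP;
            ip->i_flag &= ~IN_CDIROP;
        }
    }

    int
    main(void)
    {
        struct inode ip = { .i_flag = 0, .v_uflag = VU_DIROP };

        unmark_dirop(&ip); /* too early: inode never written */
        printf("before write: VU_DIROP=%d\n",
            (ip.v_uflag & VU_DIROP) != 0);

        write_inode(&ip);
        unmark_dirop(&ip); /* now the flag may be cleared */
        printf("after write:  VU_DIROP=%d\n",
            (ip.v_uflag & VU_DIROP) != 0);
        return 0;
    }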

* Move the vfs_getopsbyname() call into lfs_writerd.  Prepare code to
  make lfs_writerd notice when there are no more LFSs and exit,
  dropping the reference, so that, in theory, the module can be
  unloaded.  This code is not enabled, since it causes a crash on
  exit.

* Set IN_MODIFIED on inodes flushed by lfs_flush_dirops.  Really we
  only need to set IN_MODIFIED if we are going to write them again
  (e.g., to write pages); need to think about this more.

Finally, several changes to help avoid "no clean segments" panics:

* In lfs_bmapv, note when a vnode is loaded only to discover whether
  its blocks are live, so it can immediately be recycled.  Since the
  cleaner will try to choose ~empty segments over full ones, this
  prevents the cleaner from (1) filling the vnode cache with junk, and
  (2) squeezing any unwritten writes to disk and running the fs out of
  segments.
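
A standalone model of the marking scheme (the LFSI_BMAP value comes
from the diff; vrecycle() is stubbed, and the vnode struct is
invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    #define LFSI_BMAP 0x10 /* loaded only for a bmapv liveness check */

    struct vnode { int lfs_iflags; bool cached; };

    /* Stub for the kernel's vrecycle(): throw the vnode back so it
     * does not crowd useful entries out of the vnode cache. */
    static void
    vrecycle(struct vnode *vp)
    {
        vp->cached = false;
    }

    /* lfs_bmapv(): if the vnode had to be loaded just to answer the
     * cleaner's "are these blocks live?" query, mark it... */
    static void
    bmapv_load(struct vnode *vp, bool was_in_cache)
    {
        if (!was_in_cache)
            vp->lfs_iflags |= LFSI_BMAP;
    }

    /* ...and on the final unref, recycle marked vnodes at once. */
    static void
    bmapv_unref(struct vnode *vp)
    {
        if (vp->lfs_iflags & LFSI_BMAP)
            vrecycle(vp);
    }

    int
    main(void)
    {
        struct vnode vp = { .lfs_iflags = 0, .cached = true };

        bmapv_load(&vp, false);
        bmapv_unref(&vp);
        printf("still cached: %d\n", vp.cached); /* 0: recycled */
        return 0;
    }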

* Overestimate by half (a factor of 3/2) the amount of metadata that
  will be required to fill the clean segments.  This will make the
  disk appear smaller, but should help avoid a "no clean segments"
  panic.
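
Concretely, the new LFS_EST_CMETA scales the old estimate -- dmeta
per dirty segment, times the number of clean segments -- by
CM_MAG_NUM/CM_MAG_DEN = 3/2.  A worked example with illustrative
numbers (not taken from a real filesystem):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CM_MAG_NUM 3 /* overestimate metadata by 3/2, i.e. by half */
    #define CM_MAG_DEN 2

    /* Clean blocks expected to be consumed by metadata:
     * (3/2) * (dmeta / #dirty segments) * #clean segments. */
    static int32_t
    est_cmeta(int64_t dmeta, int32_t nclean, int32_t nseg)
    {
        return (int32_t)((CM_MAG_NUM * (dmeta * nclean)) /
            (CM_MAG_DEN * (nseg - nclean)));
    }

    int
    main(void)
    {
        /* 1000 metadata blocks over 50 dirty segments, 30 clean:
         * old estimate 1000*30/50 = 600, new 3*30000/100 = 900. */
        printf("est_cmeta = %" PRId32 " blocks\n",
            est_cmeta(1000, 30, 80));
        return 0;
    }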

* Rearrange lfs_writerd.  In particular, lfs_writerd now pays
  attention to the number of clean segments available, and holds off
  writing until there is room.
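
A pared-down sketch of the new per-filesystem decision in the daemon
loop (see the lfs_vfsops.c hunks below; LFS_STARVED_FOR_SEGS and the
cleaner wakeup are stubbed, and the struct fields are invented):

    #include <stdbool.h>
    #include <stdio.h>

    struct lfs { int nclean, reserved; bool pdflush; };

    /* Stub for the kernel's LFS_STARVED_FOR_SEGS() macro. */
    static bool
    starved_for_segs(const struct lfs *fs)
    {
        return fs->nclean <= fs->reserved;
    }

    static void wakeup_cleaner(struct lfs *fs) { (void)fs; puts("wake cleaner"); }
    static void flush_fs(struct lfs *fs)       { (void)fs; puts("flush (SEGM_SINGLE)"); }

    /* One pass of the rearranged writer daemon: hold off writing
     * while there is no room, waking the cleaner instead; otherwise
     * perform the requested opportunistic flush. */
    static void
    writerd_pass(struct lfs *fs)
    {
        if (starved_for_segs(fs)) {
            wakeup_cleaner(fs);
            return; /* try again on the next pass */
        }
        if (fs->pdflush) {
            fs->pdflush = false;
            flush_fs(fs);
        }
    }

    int
    main(void)
    {
        struct lfs fs = { .nclean = 2, .reserved = 4, .pdflush = true };

        writerd_pass(&fs); /* starved: only wakes the cleaner */
        fs.nclean = 10;
        writerd_pass(&fs); /* room now: the flush proceeds */
        return 0;
    }
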
perseant 2012-01-02 22:10:44 +00:00
parent b9d601ff9f
commit f9b3466d45
10 changed files with 483 additions and 207 deletions

lfs.h

@ -1,4 +1,4 @@
/* $NetBSD: lfs.h,v 1.134 2011/07/11 08:27:40 hannken Exp $ */
/* $NetBSD: lfs.h,v 1.135 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -592,6 +592,7 @@ struct segsum_v1 {
#define SS_CONT 0x02 /* more partials to finish this write*/
#define SS_CLEAN 0x04 /* written by the cleaner */
#define SS_RFW 0x08 /* written by the roll-forward agent */
#define SS_RECLAIM 0x10 /* written to reclaim a vnode */
u_int16_t ss_flags; /* 24: used for directory operations */
u_int16_t ss_pad; /* 26: extra space */
/* FINFO's and inode daddr's... */
@ -608,7 +609,8 @@ struct segsum {
u_int16_t ss_nfinfo; /* 20: number of file info structures */
u_int16_t ss_ninos; /* 22: number of inodes in summary */
u_int16_t ss_flags; /* 24: used for directory operations */
u_int8_t ss_pad[6]; /* 26: extra space */
u_int8_t ss_pad[2]; /* 26: extra space */
u_int32_t ss_reclino; /* 28: inode being reclaimed */
u_int64_t ss_serial; /* 32: serial number */
u_int64_t ss_create; /* 40: time stamp */
/* FINFO's and inode daddr's... */
@ -840,6 +842,8 @@ struct lfs {
int lfs_nowrap; /* Suspend log wrap */
int lfs_wrappass; /* Allow first log wrap requester to pass */
int lfs_wrapstatus; /* Wrap status */
int lfs_reclino; /* Inode being reclaimed */
int lfs_startseg; /* Segment we started writing at */
LIST_HEAD(, segdelta) lfs_segdhd; /* List of pending trunc accounting events */
};
@ -945,13 +949,15 @@ struct segment {
u_int32_t seg_number; /* number of this segment */
int32_t *start_lbp; /* beginning lbn for this set */
#define SEGM_CKP 0x01 /* doing a checkpoint */
#define SEGM_CLEAN 0x02 /* cleaner call; don't sort */
#define SEGM_SYNC 0x04 /* wait for segment */
#define SEGM_PROT 0x08 /* don't inactivate at segunlock */
#define SEGM_PAGEDAEMON 0x10 /* pagedaemon called us */
#define SEGM_WRITERD 0x20 /* LFS writed called us */
#define SEGM_FORCE_CKP 0x40 /* Force checkpoint right away */
#define SEGM_CKP 0x0001 /* doing a checkpoint */
#define SEGM_CLEAN 0x0002 /* cleaner call; don't sort */
#define SEGM_SYNC 0x0004 /* wait for segment */
#define SEGM_PROT 0x0008 /* don't inactivate at segunlock */
#define SEGM_PAGEDAEMON 0x0010 /* pagedaemon called us */
#define SEGM_WRITERD 0x0020 /* LFS writed called us */
#define SEGM_FORCE_CKP 0x0040 /* Force checkpoint right away */
#define SEGM_RECLAIM 0x0080 /* Writing to reclaim vnode */
#define SEGM_SINGLE 0x0100 /* Opportunistic writevnodes */
u_int16_t seg_flags; /* run-time flags for this segment */
u_int32_t seg_iocount; /* number of ios pending */
int ndupino; /* number of duplicate inodes */
@ -992,6 +998,7 @@ struct lfs_inode_ext {
#define LFSI_DELETED 0x02
#define LFSI_WRAPBLOCK 0x04
#define LFSI_WRAPWAIT 0x08
#define LFSI_BMAP 0x10
u_int32_t lfs_iflags; /* Inode flags */
daddr_t lfs_hiblk; /* Highest lbn held by inode */
#ifdef _KERNEL
@ -1017,10 +1024,16 @@ struct lfs_inode_ext {
* Macros for determining free space on the disk, with the variable metadata
* of segment summaries and inode blocks taken into account.
*/
/* Estimate number of clean blocks not available for writing */
#define LFS_EST_CMETA(F) (int32_t)((((F)->lfs_dmeta * \
(int64_t)(F)->lfs_nclean) / \
((F)->lfs_nseg - (F)->lfs_nclean)))
/*
* Estimate number of clean blocks not available for writing because
* they will contain metadata or overhead. This is calculated as
* (dmeta / # dirty segments) * (# clean segments).
*/
#define CM_MAG_NUM 3
#define CM_MAG_DEN 2
#define LFS_EST_CMETA(F) (int32_t)(( \
(CM_MAG_NUM * ((F)->lfs_dmeta * (int64_t)(F)->lfs_nclean)) / \
(CM_MAG_DEN * ((F)->lfs_nseg - (F)->lfs_nclean))))
/* Estimate total size of the disk not including metadata */
#define LFS_EST_NONMETA(F) ((F)->lfs_dsize - (F)->lfs_dmeta - LFS_EST_CMETA(F))

lfs_bio.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $ */
/* $NetBSD: lfs_bio.c,v 1.121 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.121 2012/01/02 22:10:44 perseant Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -96,6 +96,7 @@ int lfs_subsys_pages = 0L; /* Total number LFS-written pages */
int lfs_fs_pagetrip = 0; /* # of pages to trip per-fs write */
int lfs_writing = 0; /* Set if already kicked off a writer
because of buffer space */
int locked_queue_waiters = 0; /* Number of processes waiting on lq */
/* Lock and condition variables for above. */
kcondvar_t locked_queue_cv;
@ -160,8 +161,12 @@ lfs_reservebuf(struct lfs *fs, struct vnode *vp,
lfs_flush(fs, 0, 0);
DLOG((DLOG_AVAIL, "lfs_reservebuf: waiting: count=%d, bytes=%ld\n",
locked_queue_count, locked_queue_bytes));
++locked_queue_waiters;
error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
hz * LFS_BUFWAIT);
--locked_queue_waiters;
if (error && error != EWOULDBLOCK) {
mutex_exit(&lfs_lock);
return error;
@ -171,8 +176,11 @@ lfs_reservebuf(struct lfs *fs, struct vnode *vp,
locked_queue_rcount += n;
locked_queue_rbytes += bytes;
if (n < 0)
if (n < 0 && locked_queue_waiters > 0) {
DLOG((DLOG_AVAIL, "lfs_reservebuf: broadcast: count=%d, bytes=%ld\n",
locked_queue_count, locked_queue_bytes));
cv_broadcast(&locked_queue_cv);
}
mutex_exit(&lfs_lock);
@ -461,7 +469,7 @@ lfs_bwrite_ext(struct buf *bp, int flags)
*/
if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) {
bp->b_oflags &= ~BO_DELWRI;
bp->b_flags |= B_READ;
bp->b_flags |= B_READ; /* XXX is this right? --ks */
bp->b_error = 0;
mutex_enter(&bufcache_lock);
LFS_UNLOCK_BUF(bp);
@ -535,6 +543,7 @@ lfs_flush_fs(struct lfs *fs, int flags)
if (lfs_dostats)
++lfs_stats.flush_invoked;
fs->lfs_pdflush = 0;
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "fldirop");
lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
@ -689,10 +698,10 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags)
/* If there are too many pending dirops, we have to flush them. */
if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
flags |= SEGM_CKP;
}
if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
mutex_exit(&lfs_lock);
lfs_flush_dirops(fs);
mutex_enter(&lfs_lock);
} else if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
lfs_subsys_pages > LFS_MAX_PAGES ||
fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
@ -717,8 +726,10 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags)
++lfs_stats.wait_exceeded;
DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n",
locked_queue_count, locked_queue_bytes));
++locked_queue_waiters;
error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
hz * LFS_BUFWAIT);
--locked_queue_waiters;
if (error != EWOULDBLOCK)
break;

lfs_extern.h

@ -1,4 +1,4 @@
/* $NetBSD: lfs_extern.h,v 1.96 2008/06/28 01:34:05 rumble Exp $ */
/* $NetBSD: lfs_extern.h,v 1.97 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -240,8 +240,8 @@ int lfs_gop_alloc(struct vnode *, off_t, off_t, int, kauth_cred_t);
void lfs_gop_size(struct vnode *, off_t, off_t *, int);
int lfs_putpages_ext(void *, int);
int lfs_gatherpages(struct vnode *);
void lfs_flush_dirops(struct lfs *);
void lfs_flush_pchain(struct lfs *);
int lfs_flush_dirops(struct lfs *);
int lfs_flush_pchain(struct lfs *);
int lfs_bwrite (void *);
int lfs_fsync (void *);

lfs_segment.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $ */
/* $NetBSD: lfs_segment.c,v 1.223 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.223 2012/01/02 22:10:44 perseant Exp $");
#ifdef DEBUG
# define vndebug(vp, str) do { \
@ -202,6 +202,9 @@ lfs_vflush(struct vnode *vp)
relock = 0;
top:
KASSERT(mutex_owned(vp->v_interlock) == false);
KASSERT(mutex_owned(&lfs_lock) == false);
KASSERT(mutex_owned(&bufcache_lock) == false);
ASSERT_NO_SEGLOCK(fs);
if (ip->i_flag & IN_CLEANING) {
ivndebug(vp,"vflush/in_cleaning");
@ -280,7 +283,10 @@ lfs_vflush(struct vnode *vp)
mutex_exit(vp->v_interlock);
/* Protect against VI_XLOCK deadlock in vinvalbuf() */
lfs_seglock(fs, SEGM_SYNC);
lfs_seglock(fs, SEGM_SYNC | ((vp->v_iflag & VI_XLOCK) ? SEGM_RECLAIM : 0));
if (vp->v_iflag & VI_XLOCK) {
fs->lfs_reclino = ip->i_number;
}
/* If we're supposed to flush a freed inode, just toss it */
if (ip->i_lfs_iflags & LFSI_DELETED) {
@ -380,11 +386,12 @@ lfs_vflush(struct vnode *vp)
do {
if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
relock = lfs_writefile(fs, sp, vp);
if (relock) {
if (relock && vp != fs->lfs_ivnode) {
/*
* Might have to wait for the
* cleaner to run; but we're
* still not done with this vnode.
* XXX we can do better than this.
*/
KDASSERT(ip->i_number != LFS_IFILE_INUM);
lfs_writeinode(fs, sp, ip);
@ -486,9 +493,16 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
* After this, pages might be busy
* due to our own previous putpages.
* Start actual segment write here to avoid deadlock.
* If we were just writing one segment and we've done
* that, break out.
*/
mutex_exit(&mntvnode_lock);
(void)lfs_writeseg(fs, sp);
if (lfs_writeseg(fs, sp) &&
(sp->seg_flags & SEGM_SINGLE) &&
fs->lfs_curseg != fs->lfs_startseg) {
DLOG((DLOG_VNODE, "lfs_writevnodes: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
break;
}
goto loop;
}
@ -626,6 +640,10 @@ lfs_segwrite(struct mount *mp, int flags)
*/
do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
/* We can't do a partial write and checkpoint at the same time. */
if (do_ckp)
flags &= ~SEGM_SINGLE;
lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
sp = fs->lfs_sp;
if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
@ -645,6 +663,11 @@ lfs_segwrite(struct mount *mp, int flags)
else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
do {
um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
if ((sp->seg_flags & SEGM_SINGLE) &&
fs->lfs_curseg != fs->lfs_startseg) {
DLOG((DLOG_SEG, "lfs_segwrite: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
break;
}
if (do_ckp || fs->lfs_dirops == 0) {
if (!writer_set) {
@ -1025,6 +1048,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
{
struct buf *bp;
struct ufs1_dinode *cdp;
struct vnode *vp = ITOV(ip);
daddr_t daddr;
int32_t *daddrp; /* XXX ondisk32 */
int i, ndx;
@ -1033,7 +1057,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
int count;
ASSERT_SEGLOCK(fs);
if (!(ip->i_flag & IN_ALLMOD))
if (!(ip->i_flag & IN_ALLMOD) && !(vp->v_uflag & VU_DIROP))
return (0);
/* Can't write ifile when writer is not set */
@ -1047,7 +1071,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
* solid.
*/
count = 0;
while (ip->i_number == LFS_IFILE_INUM) {
while (vp == fs->lfs_ivnode) {
int redo = 0;
if (sp->idp == NULL && sp->ibp == NULL &&
@ -1112,7 +1136,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
}
/* Check VU_DIROP in case there is a new file with no data blocks */
if (ITOV(ip)->v_uflag & VU_DIROP)
if (vp->v_uflag & VU_DIROP)
((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
/* Update the inode times and copy the inode onto the inode page. */
@ -1138,6 +1162,18 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs));
*cdp = *ip->i_din.ffs1_din;
/*
* This inode is on its way to disk; clear its VU_DIROP status when
* the write is complete.
*/
if (vp->v_uflag & VU_DIROP) {
if (!(sp->seg_flags & SEGM_CLEAN))
ip->i_flag |= IN_CDIROP;
else {
DLOG((DLOG_DIROP, "lfs_writeinode: not clearing dirop for cleaned ino %d\n", (int)ip->i_number));
}
}
/*
* If cleaning, link counts and directory file sizes cannot change,
* since those would be directory operations---even if the file
@ -1146,9 +1182,9 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
* current values the next time we clean.
*/
if (sp->seg_flags & SEGM_CLEAN) {
if (ITOV(ip)->v_uflag & VU_DIROP) {
if (vp->v_uflag & VU_DIROP) {
cdp->di_nlink = ip->i_lfs_odnlink;
/* if (ITOV(ip)->v_type == VDIR) */
/* if (vp->v_type == VDIR) */
cdp->di_size = ip->i_lfs_osize;
}
} else {
@ -1988,6 +2024,12 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
if (sp->seg_flags & SEGM_CLEAN)
ssp->ss_flags |= SS_CLEAN;
/* Note if we are writing to reclaim */
if (sp->seg_flags & SEGM_RECLAIM) {
ssp->ss_flags |= SS_RECLAIM;
ssp->ss_reclino = fs->lfs_reclino;
}
devvp = VTOI(fs->lfs_ivnode)->i_devvp;
/* Update the segment usage information. */
@ -2720,7 +2762,6 @@ lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size)
int
lfs_vref(struct vnode *vp)
{
int error;
struct lfs *fs;
KASSERT(mutex_owned(vp->v_interlock));
@ -2734,12 +2775,13 @@ lfs_vref(struct vnode *vp)
* being able to flush all of the pages from this vnode, which
* will cause it to panic. So, return 0 if a flush is in progress.
*/
error = vget(vp, LK_NOWAIT);
if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
++fs->lfs_flushvp_fakevref;
return 0;
}
return error;
if (IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
++fs->lfs_flushvp_fakevref;
mutex_exit(vp->v_interlock);
return 0;
}
return vget(vp, LK_NOWAIT);
}
/*

lfs_subr.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $ */
/* $NetBSD: lfs_subr.c,v 1.77 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.77 2012/01/02 22:10:44 perseant Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -335,6 +335,7 @@ lfs_seglock(struct lfs *fs, unsigned long flags)
*/
mutex_enter(&lfs_lock);
++fs->lfs_iocount;
fs->lfs_startseg = fs->lfs_curseg;
mutex_exit(&lfs_lock);
return 0;
}
@ -361,7 +362,7 @@ lfs_unmark_dirop(struct lfs *fs)
for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
nip = TAILQ_NEXT(ip, i_lfs_dchain);
vp = ITOV(ip);
if ((VTOI(vp)->i_flag & (IN_ADIROP | IN_ALLMOD)) == 0) {
if ((ip->i_flag & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) {
--lfs_dirvcount;
--fs->lfs_dirvcount;
vp->v_uflag &= ~VU_DIROP;
@ -372,6 +373,7 @@ lfs_unmark_dirop(struct lfs *fs)
vrele(vp);
mutex_enter(&lfs_lock);
fs->lfs_unlockvp = NULL;
ip->i_flag &= ~IN_CDIROP;
}
}
@ -437,8 +439,7 @@ lfs_segunlock(struct lfs *fs)
mutex_enter(&lfs_lock);
KASSERT(LFS_SEGLOCK_HELD(fs));
if (fs->lfs_seglock == 1) {
if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 &&
LFS_STARVED_FOR_SEGS(fs) == 0)
if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0)
do_unmark_dirop = 1;
mutex_exit(&lfs_lock);
sync = sp->seg_flags & SEGM_SYNC;

lfs_syscalls.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $ */
/* $NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008
@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $");
#ifndef LFS
# define LFS /* for prototypes in syscallargs.h */
@ -291,6 +291,17 @@ lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov,
*/
if (v_daddr != LFS_UNUSED_DADDR) {
lfs_vunref(vp);
/*
* If the vnode has LFSI_BMAP, it was
* not found in the cache. Dump it so
* we can reuse the vnode.
* XXX If we knew what segment we were
* XXX supposed to be looking for, we
* XXX would be able to be more selective
* XXX here.
*/
if (ip->i_lfs_iflags & LFSI_BMAP)
vrecycle(vp, NULL, NULL);
numrefed--;
}
@ -760,6 +771,7 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
continue;
} else {
KASSERT(VOP_ISLOCKED(vp));
VTOI(vp)->i_lfs_iflags |= LFSI_BMAP;
VOP_UNLOCK(vp);
numrefed++;
}
@ -814,6 +826,9 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
*/
if (v_daddr != LFS_UNUSED_DADDR) {
lfs_vunref(vp);
/* Recycle as above. */
if (ip->i_lfs_iflags & LFSI_BMAP)
vrecycle(vp, NULL, NULL);
numrefed--;
}

lfs_vfsops.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $ */
/* $NetBSD: lfs_vfsops.c,v 1.292 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.292 2012/01/02 22:10:45 perseant Exp $");
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
@ -129,6 +129,7 @@ extern const struct vnodeopv_desc lfs_specop_opv_desc;
extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
pid_t lfs_writer_daemon = 0;
lwpid_t lfs_writer_lid = 0;
int lfs_do_flush = 0;
#ifdef LFS_KERNEL_RFW
int lfs_do_rfw = 0;
@ -399,85 +400,151 @@ struct pool lfs_lbnentry_pool;
static void
lfs_writerd(void *arg)
{
struct mount *mp, *nmp;
struct lfs *fs;
int fsflags;
int loopcount;
lfs_writer_daemon = curproc->p_pid;
struct mount *mp, *nmp;
struct lfs *fs;
struct vfsops *vfs = NULL;
int fsflags;
int loopcount;
int skipc;
int lfsc;
int wrote_something = 0;
mutex_enter(&lfs_lock);
for (;;) {
mtsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", hz/10,
&lfs_lock);
lfs_writer_daemon = curproc->p_pid;
lfs_writer_lid = curlwp->l_lid;
mutex_exit(&lfs_lock);
/*
* Look through the list of LFSs to see if any of them
* have requested pageouts.
*/
mutex_enter(&mountlist_lock);
for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
mp = nmp) {
if (vfs_busy(mp, &nmp)) {
continue;
}
if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
sizeof(mp->mnt_stat.f_fstypename)) == 0) {
fs = VFSTOUFS(mp)->um_lfs;
mutex_enter(&lfs_lock);
fsflags = 0;
if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP) &&
fs->lfs_dirops == 0)
fsflags |= SEGM_CKP;
if (fs->lfs_pdflush) {
DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
fs->lfs_pdflush = 0;
lfs_flush_fs(fs, fsflags);
mutex_exit(&lfs_lock);
} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "wrdirop");
lfs_flush_pchain(fs);
lfs_writer_leave(fs);
} else
mutex_exit(&lfs_lock);
}
vfs_unbusy(mp, false, &nmp);
}
mutex_exit(&mountlist_lock);
/* Take an extra reference to the LFS vfsops. */
vfs = vfs_getopsbyname(MOUNT_LFS);
mutex_enter(&lfs_lock);
for (;;) {
KASSERT(mutex_owned(&lfs_lock));
if (wrote_something == 0)
mtsleep(&lfs_writer_daemon, PVM, "lfswriter", hz/10 + 1,
&lfs_lock);
KASSERT(mutex_owned(&lfs_lock));
loopcount = 0;
wrote_something = 0;
/*
* If global state wants a flush, flush everything.
*/
mutex_enter(&lfs_lock);
loopcount = 0;
if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
locked_queue_bytes > LFS_MAX_BYTES ||
lfs_subsys_pages > LFS_MAX_PAGES) {
if (lfs_do_flush) {
DLOG((DLOG_FLUSH, "daemon: lfs_do_flush\n"));
DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n"));
}
if (locked_queue_count > LFS_MAX_BUFS) {
DLOG((DLOG_FLUSH, "daemon: lqc = %d, max %d\n",
DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n",
locked_queue_count, LFS_MAX_BUFS));
}
if (locked_queue_bytes > LFS_MAX_BYTES) {
DLOG((DLOG_FLUSH, "daemon: lqb = %ld, max %ld\n",
DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n",
locked_queue_bytes, LFS_MAX_BYTES));
}
if (lfs_subsys_pages > LFS_MAX_PAGES) {
DLOG((DLOG_FLUSH, "daemon: lssp = %d, max %d\n",
DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n",
lfs_subsys_pages, LFS_MAX_PAGES));
}
lfs_flush(NULL, SEGM_WRITERD, 0);
lfs_do_flush = 0;
KASSERT(mutex_owned(&lfs_lock));
continue;
}
}
/* NOTREACHED */
KASSERT(mutex_owned(&lfs_lock));
mutex_exit(&lfs_lock);
/*
* Look through the list of LFSs to see if any of them
* have requested pageouts.
*/
mutex_enter(&mountlist_lock);
lfsc = 0;
skipc = 0;
for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
mp = nmp) {
if (vfs_busy(mp, &nmp)) {
++skipc;
continue;
}
KASSERT(!mutex_owned(&lfs_lock));
if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
sizeof(mp->mnt_stat.f_fstypename)) == 0) {
++lfsc;
fs = VFSTOUFS(mp)->um_lfs;
int32_t ooffset = 0;
fsflags = SEGM_SINGLE;
mutex_enter(&lfs_lock);
ooffset = fs->lfs_offset;
if (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) {
/* Don't try to write if we're suspended */
mutex_exit(&lfs_lock);
vfs_unbusy(mp, false, &nmp);
continue;
}
if (LFS_STARVED_FOR_SEGS(fs)) {
mutex_exit(&lfs_lock);
DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n"));
lfs_wakeup_cleaner(fs);
vfs_unbusy(mp, false, &nmp);
continue;
}
if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP) &&
fs->lfs_dirops == 0) {
fsflags &= ~SEGM_SINGLE;
fsflags |= SEGM_CKP;
DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n"));
lfs_flush_fs(fs, fsflags);
} else if (fs->lfs_pdflush) {
DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
lfs_flush_fs(fs, fsflags);
} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "wrdirop");
lfs_flush_pchain(fs);
lfs_writer_leave(fs);
mutex_enter(&lfs_lock);
}
if (fs->lfs_offset != ooffset)
++wrote_something;
mutex_exit(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
vfs_unbusy(mp, false, &nmp);
}
if (lfsc + skipc == 0) {
#ifdef notyet
mutex_enter(&lfs_lock);
lfs_writer_daemon = 0;
lfs_writer_lid = 0;
mutex_exit(&lfs_lock);
mutex_exit(&mountlist_lock);
break;
#endif
}
mutex_exit(&mountlist_lock);
mutex_enter(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
KASSERT(!mutex_owned(&mountlist_lock));
/* Give up our extra reference so the module can be unloaded. */
mutex_enter(&vfs_list_lock);
if (vfs != NULL)
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
}
/*
@ -1063,16 +1130,12 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
vput(vp);
/* Start the pagedaemon-anticipating daemon */
if (lfs_writer_daemon == 0 && kthread_create(PRI_BIO, 0, NULL,
mutex_enter(&lfs_lock);
if (lfs_writer_daemon == 0 && lfs_writer_lid == 0 &&
kthread_create(PRI_BIO, 0, NULL,
lfs_writerd, NULL, NULL, "lfs_writer") != 0)
panic("fork lfs_writer");
/*
* XXX: Get extra reference to LFS vfsops. This prevents unload,
* but also prevents kernel panic due to text being unloaded
* from below lfs_writerd. When lfs_writerd can exit, remove
* this!!!
*/
vfs_getopsbyname(MOUNT_LFS);
mutex_exit(&lfs_lock);
printf("WARNING: the log-structured file system is experimental\n"
"WARNING: it may cause system crashes and/or corrupt data\n");
@ -1576,6 +1639,7 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
struct lfs *fs = ip->i_lfs;
struct segment *sp = fs->lfs_sp;
UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
const char * failreason = NULL;
ASSERT_SEGLOCK(fs);
@ -1591,8 +1655,10 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
* We must write everything, however, if our vnode is being
* reclaimed.
*/
if (LFS_STARVED_FOR_SEGS(fs) && vp != fs->lfs_flushvp)
goto tryagain;
if (LFS_STARVED_FOR_SEGS(fs) && !(vp->v_iflag & VI_XLOCK)) {
failreason = "Starved for segs and not flushing vp";
goto tryagain;
}
/*
* Sometimes things slip past the filters in lfs_putpages,
@ -1610,9 +1676,16 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
*
* XXXUBC that last statement is an oversimplification of course.
*/
if (!LFS_SEGLOCK_HELD(fs) ||
(ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) ||
(pgs[0]->offset & fs->lfs_bmask) != 0) {
if (!LFS_SEGLOCK_HELD(fs)) {
failreason = "Seglock not held";
goto tryagain;
}
if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
failreason = "Inode with no_gop_write";
goto tryagain;
}
if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
failreason = "Bad page offset";
goto tryagain;
}
@ -1632,6 +1705,7 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
KASSERT(eof >= 0);
if (startoffset >= eof) {
failreason = "Offset beyond EOF";
goto tryagain;
} else
bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
@ -1646,9 +1720,11 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
pgs[i]->flags &= ~PG_DELWRI;
pgs[i]->flags |= PG_PAGEOUT;
uvm_pageout_start(1);
mutex_enter(vp->v_interlock);
mutex_enter(&uvm_pageqlock);
uvm_pageunwire(pgs[i]);
mutex_exit(&uvm_pageqlock);
mutex_exit(vp->v_interlock);
}
}
@ -1768,7 +1844,7 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
/*
* LFS doesn't like async I/O here, dies with
* and assert in lfs_bwrite(). Is that assert
* an assert in lfs_bwrite(). Is that assert
* valid? I retained non-async behaviour when
* converted this to use nestiobuf --pooka
*/
@ -1805,6 +1881,10 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
lfs_flush(fs, 0, 1);
mutex_exit(&lfs_lock);
}
if ((sp->seg_flags & SEGM_SINGLE) && fs->lfs_curseg != fs->lfs_startseg)
return EAGAIN;
return (0);
tryagain:
@ -1815,18 +1895,13 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
mutex_enter(vp->v_interlock);
/* Tell why we're here, if we know */
if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
DLOG((DLOG_PAGE, "lfs_gop_write: clean pages dirtied\n"));
} else if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
DLOG((DLOG_PAGE, "lfs_gop_write: not on block boundary\n"));
} else if (haveeof && startoffset >= eof) {
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
} else if (LFS_STARVED_FOR_SEGS(fs)) {
DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n"));
} else {
DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n"));
if (failreason != NULL) {
DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason));
}
if (haveeof && startoffset >= eof) {
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
}
mutex_enter(&uvm_pageqlock);
@ -1898,14 +1973,14 @@ lfs_vinit(struct mount *mp, struct vnode **vpp)
i == 0)
continue;
if (ip->i_ffs1_db[i] != 0) {
inconsistent:
lfs_dump_dinode(ip->i_din.ffs1_din);
panic("inconsistent inode");
panic("inconsistent inode (direct)");
}
}
for ( ; i < NDADDR + NIADDR; i++) {
if (ip->i_ffs1_ib[i - NDADDR] != 0) {
goto inconsistent;
lfs_dump_dinode(ip->i_din.ffs1_din);
panic("inconsistent inode (indirect)");
}
}
#endif /* DEBUG */

lfs_vnops.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $ */
/* $NetBSD: lfs_vnops.c,v 1.239 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.239 2012/01/02 22:10:45 perseant Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
@ -363,6 +363,17 @@ lfs_inactive(void *v)
return 0;
}
#ifdef DEBUG
/*
* This might happen on unmount.
* XXX If it happens at any other time, it should be a panic.
*/
if (ap->a_vp->v_uflag & VU_DIROP) {
struct inode *ip = VTOI(ap->a_vp);
printf("lfs_inactive: inactivating VU_DIROP? ino = %d\n", (int)ip->i_number);
}
#endif /* DEBUG */
return ufs_inactive(v);
}
@ -438,7 +449,7 @@ lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
}
++fs->lfs_dirops;
fs->lfs_doifile = 1;
/* fs->lfs_doifile = 1; */ /* XXX why? --ks */
mutex_exit(&lfs_lock);
/* Hold a reference so SET_ENDOP will be happy */
@ -544,13 +555,15 @@ lfs_mark_vnode(struct vnode *vp)
if (!(ip->i_flag & IN_ADIROP)) {
if (!(vp->v_uflag & VU_DIROP)) {
mutex_enter(vp->v_interlock);
(void)lfs_vref(vp);
if (lfs_vref(vp) != 0)
panic("lfs_mark_vnode: could not vref");
++lfs_dirvcount;
++fs->lfs_dirvcount;
TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
vp->v_uflag |= VU_DIROP;
}
++fs->lfs_nadirop;
ip->i_flag &= ~IN_CDIROP;
ip->i_flag |= IN_ADIROP;
} else
KASSERT(vp->v_uflag & VU_DIROP);
@ -1153,7 +1166,8 @@ lfs_strategy(void *v)
struct vnode *vp;
struct inode *ip;
daddr_t tbn;
int i, sn, error, slept;
#define MAXLOOP 25
int i, sn, error, slept, loopcount;
bp = ap->a_bp;
vp = ap->a_vp;
@ -1185,6 +1199,7 @@ lfs_strategy(void *v)
}
slept = 1;
loopcount = 0;
mutex_enter(&lfs_lock);
while (slept && fs->lfs_seglock) {
mutex_exit(&lfs_lock);
@ -1213,12 +1228,19 @@ lfs_strategy(void *v)
PRId64 "\n", ip->i_number, bp->b_lblkno));
mutex_enter(&lfs_lock);
if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
/* Cleaner can't wait for itself */
mtsleep(&fs->lfs_iocount,
(PRIBIO + 1) | PNORELOCK,
"clean2", 0,
&lfs_lock);
/*
* Cleaner can't wait for itself.
* Instead, wait for the blocks
* to be written to disk.
* XXX we need pribio in the test
* XXX here.
*/
mtsleep(&fs->lfs_iocount,
(PRIBIO + 1) | PNORELOCK,
"clean2", hz/10 + 1,
&lfs_lock);
slept = 1;
++loopcount;
break;
} else if (fs->lfs_seglock) {
mtsleep(&fs->lfs_seglock,
@ -1232,6 +1254,10 @@ lfs_strategy(void *v)
}
}
mutex_enter(&lfs_lock);
if (loopcount > MAXLOOP) {
printf("lfs_strategy: breaking out of clean2 loop\n");
break;
}
}
mutex_exit(&lfs_lock);
@ -1240,37 +1266,39 @@ lfs_strategy(void *v)
return (0);
}
void
/*
* Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
* Technically this is a checkpoint (the on-disk state is valid)
* even though we are leaving out all the file data.
*/
int
lfs_flush_dirops(struct lfs *fs)
{
struct inode *ip, *nip;
struct vnode *vp;
extern int lfs_dostats;
struct segment *sp;
int flags = 0;
int error = 0;
ASSERT_MAYBE_SEGLOCK(fs);
KASSERT(fs->lfs_nadirop == 0);
if (fs->lfs_ronly)
return;
return EROFS;
mutex_enter(&lfs_lock);
if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
mutex_exit(&lfs_lock);
return;
return 0;
} else
mutex_exit(&lfs_lock);
if (lfs_dostats)
++lfs_stats.flush_invoked;
/*
* Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
* Technically this is a checkpoint (the on-disk state is valid)
* even though we are leaving out all the file data.
*/
lfs_imtime(fs);
lfs_seglock(fs, SEGM_CKP);
lfs_seglock(fs, flags);
sp = fs->lfs_sp;
/*
@ -1293,6 +1321,8 @@ lfs_flush_dirops(struct lfs *fs)
vp = ITOV(ip);
KASSERT((ip->i_flag & IN_ADIROP) == 0);
KASSERT(vp->v_uflag & VU_DIROP);
KASSERT(!(vp->v_iflag & VI_XLOCK));
/*
* All writes to directories come from dirops; all
@ -1300,9 +1330,7 @@ lfs_flush_dirops(struct lfs *fs)
* cache, which we're not touching. Reads to files
* and/or directories will not be affected by writing
* directory blocks inodes and file inodes. So we don't
* really need to lock. If we don't lock, though,
* make sure that we don't clear IN_MODIFIED
* unnecessarily.
* really need to lock.
*/
if (vp->v_iflag & VI_XLOCK) {
mutex_enter(&lfs_lock);
@ -1313,23 +1341,36 @@ lfs_flush_dirops(struct lfs *fs)
*/
if (vp->v_type != VREG &&
((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
lfs_writefile(fs, sp, vp);
error = lfs_writefile(fs, sp, vp);
if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
!(ip->i_flag & IN_ALLMOD)) {
mutex_enter(&lfs_lock);
LFS_SET_UINO(ip, IN_MODIFIED);
mutex_exit(&lfs_lock);
}
if (error && (sp->seg_flags & SEGM_SINGLE)) {
mutex_enter(&lfs_lock);
error = EAGAIN;
break;
}
}
KDASSERT(ip->i_number != LFS_IFILE_INUM);
(void) lfs_writeinode(fs, sp, ip);
error = lfs_writeinode(fs, sp, ip);
mutex_enter(&lfs_lock);
if (error && (sp->seg_flags & SEGM_SINGLE)) {
error = EAGAIN;
break;
}
/*
* XXX
* LK_EXCLOTHER is dead -- what is intended here?
* if (waslocked == LK_EXCLOTHER)
* LFS_SET_UINO(ip, IN_MODIFIED);
* We might need to update these inodes again,
* for example, if they have data blocks to write.
* Make sure that after this flush, they are still
* marked IN_MODIFIED so that we don't forget to
* write them.
*/
/* XXX only for non-directories? --KS */
LFS_SET_UINO(ip, IN_MODIFIED);
}
mutex_exit(&lfs_lock);
/* We've written all the dirops there are */
@ -1337,6 +1378,8 @@ lfs_flush_dirops(struct lfs *fs)
lfs_finalize_fs_seguse(fs);
(void) lfs_writeseg(fs, sp);
lfs_segunlock(fs);
return error;
}
/*
@ -1346,29 +1389,30 @@ lfs_flush_dirops(struct lfs *fs)
* for any reason, just skip it; if we have to wait for the cleaner,
* abort. The writer daemon will call us again later.
*/
void
int
lfs_flush_pchain(struct lfs *fs)
{
struct inode *ip, *nip;
struct vnode *vp;
extern int lfs_dostats;
struct segment *sp;
int error;
int error, error2;
ASSERT_NO_SEGLOCK(fs);
if (fs->lfs_ronly)
return;
return EROFS;
mutex_enter(&lfs_lock);
if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
mutex_exit(&lfs_lock);
return;
return 0;
} else
mutex_exit(&lfs_lock);
/* Get dirops out of the way */
lfs_flush_dirops(fs);
if ((error = lfs_flush_dirops(fs)) != 0)
return error;
if (lfs_dostats)
++lfs_stats.flush_invoked;
@ -1422,12 +1466,12 @@ lfs_flush_pchain(struct lfs *fs)
mutex_exit(&lfs_lock);
}
KDASSERT(ip->i_number != LFS_IFILE_INUM);
(void) lfs_writeinode(fs, sp, ip);
error2 = lfs_writeinode(fs, sp, ip);
VOP_UNLOCK(vp);
lfs_vunref(vp);
if (error == EAGAIN) {
if (error == EAGAIN || error2 == EAGAIN) {
lfs_writeseg(fs, sp);
mutex_enter(&lfs_lock);
break;
@ -1437,6 +1481,8 @@ lfs_flush_pchain(struct lfs *fs)
mutex_exit(&lfs_lock);
(void) lfs_writeseg(fs, sp);
lfs_segunlock(fs);
return 0;
}
/*
@ -1682,7 +1728,8 @@ segwait_common:
/* Wait for the log to wrap, if asked */
if (*(int *)ap->a_data) {
mutex_enter(ap->a_vp->v_interlock);
lfs_vref(ap->a_vp);
if (lfs_vref(ap->a_vp) != 0)
panic("LFCNWRAPPASS: lfs_vref failed");
VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n");
error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
@ -1746,6 +1793,7 @@ lfs_getpages(void *v)
static void
wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
{
KASSERT(mutex_owned(vp->v_interlock));
if ((pg->flags & PG_BUSY) == 0)
return; /* Nothing to wait for! */
@ -1786,6 +1834,7 @@ static void
write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
int seglocked, const char *label)
{
KASSERT(mutex_owned(vp->v_interlock));
#ifndef BUSYWAIT
struct inode *ip = VTOI(vp);
struct segment *sp = fs->lfs_sp;
@ -1814,12 +1863,15 @@ write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
mutex_enter(vp->v_interlock);
wait_for_page(vp, pg, label);
}
if (label != NULL && count > 1)
printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
label, (count > 0 ? "looping, " : ""), count);
if (label != NULL && count > 1) {
DLOG((DLOG_PAGE, "lfs_putpages[%d]: %s: %sn = %d\n",
curproc->p_pid, label, (count > 0 ? "looping, " : ""),
count));
}
#else
preempt(1);
#endif
KASSERT(mutex_owned(vp->v_interlock));
}
/*
@ -1849,6 +1901,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
int pagedaemon = (curlwp == uvm.pagedaemon_lwp);
KASSERT(mutex_owned(vp->v_interlock));
ASSERT_MAYBE_SEGLOCK(fs);
top:
by_list = (vp->v_uobj.uo_npages <=
@ -1891,6 +1944,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
*/
nonexistent = dirty = 0;
for (i = 0; i == 0 || i < pages_per_block; i++) {
KASSERT(mutex_owned(vp->v_interlock));
if (by_list && pages_per_block <= 1) {
pgs[i] = pg = curpg;
} else {
@ -1916,13 +1970,16 @@ check_dirty(struct lfs *fs, struct vnode *vp,
DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
if (pgp)
*pgp = pg;
KASSERT(mutex_owned(vp->v_interlock));
return -1;
}
while (pg->flags & PG_BUSY) {
wait_for_page(vp, pg, NULL);
KASSERT(mutex_owned(vp->v_interlock));
if (i > 0)
uvm_page_unbusy(pgs, i);
KASSERT(mutex_owned(vp->v_interlock));
goto top;
}
pg->flags |= PG_BUSY;
@ -1944,6 +2001,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
any_dirty += dirty;
KASSERT(nonexistent == 0);
KASSERT(mutex_owned(vp->v_interlock));
/*
* If any are dirty make all dirty; unbusy them,
@ -1952,8 +2010,10 @@ check_dirty(struct lfs *fs, struct vnode *vp,
* they're on their way to disk.
*/
for (i = 0; i == 0 || i < pages_per_block; i++) {
KASSERT(mutex_owned(vp->v_interlock));
pg = pgs[i];
KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
KASSERT(pg->flags & PG_BUSY);
if (dirty) {
pg->flags &= ~PG_CLEAN;
if (flags & PGO_FREE) {
@ -1985,6 +2045,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
}
}
KASSERT(mutex_owned(vp->v_interlock));
return any_dirty;
}
@ -2048,9 +2109,11 @@ lfs_putpages(void *v)
struct segment *sp;
off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
off_t off, max_endoffset;
bool seglocked, sync, pagedaemon;
bool seglocked, sync, pagedaemon, reclaim;
struct vm_page *pg, *busypg;
UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
int oreclaim = 0;
int donewriting = 0;
#ifdef DEBUG
int debug_n_again, debug_n_dirtyclean;
#endif
@ -2059,8 +2122,11 @@ lfs_putpages(void *v)
ip = VTOI(vp);
fs = ip->i_lfs;
sync = (ap->a_flags & PGO_SYNCIO) != 0;
reclaim = (ap->a_flags & PGO_RECLAIM) != 0;
pagedaemon = (curlwp == uvm.pagedaemon_lwp);
KASSERT(mutex_owned(vp->v_interlock));
/* Putpages does nothing for metadata. */
if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
mutex_exit(vp->v_interlock);
@ -2086,6 +2152,8 @@ lfs_putpages(void *v)
TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
}
mutex_exit(&lfs_lock);
KASSERT(!mutex_owned(vp->v_interlock));
return 0;
}
@ -2093,12 +2161,15 @@ lfs_putpages(void *v)
/*
* Ignore requests to free pages past EOF but in the same block
* as EOF, unless the request is synchronous. (If the request is
* sync, it comes from lfs_truncate.)
* XXXUBC Make these pages look "active" so the pagedaemon won't
* XXXUBC bother us with them again.
* as EOF, unless the vnode is being reclaimed or the request
* is synchronous. (If the request is sync, it comes from
* lfs_truncate.)
*
* To avoid being flooded with this request, make these pages
* look "active".
*/
if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
if (!sync && !reclaim &&
ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
origoffset = ap->a_offlo;
for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
pg = uvm_pagelookup(&vp->v_uobj, off);
@ -2154,8 +2225,13 @@ lfs_putpages(void *v)
* If not cleaning, just send the pages through genfs_putpages
* to be returned to the pool.
*/
if (!(ap->a_flags & PGO_CLEANIT))
return genfs_putpages(v);
if (!(ap->a_flags & PGO_CLEANIT)) {
DLOG((DLOG_PAGE, "lfs_putpages: no cleanit vn %p ino %d (flags %x)\n",
vp, (int)ip->i_number, ap->a_flags));
int r = genfs_putpages(v);
KASSERT(!mutex_owned(vp->v_interlock));
return r;
}
/* Set PGO_BUSYFAIL to avoid deadlocks */
ap->a_flags |= PGO_BUSYFAIL;
@ -2169,6 +2245,7 @@ lfs_putpages(void *v)
#endif
do {
int r;
KASSERT(mutex_owned(vp->v_interlock));
/* Count the number of dirty pages */
r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
@ -2191,8 +2268,10 @@ lfs_putpages(void *v)
r = genfs_do_putpages(vp, startoffset, endoffset,
ap->a_flags & ~PGO_SYNCIO, &busypg);
ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
if (r != EDEADLK)
return r;
if (r != EDEADLK) {
KASSERT(!mutex_owned(vp->v_interlock));
return r;
}
/* One of the pages was busy. Start over. */
mutex_enter(vp->v_interlock);
@ -2204,8 +2283,8 @@ lfs_putpages(void *v)
#ifdef DEBUG
if (debug_n_dirtyclean > TOOMANY)
printf("lfs_putpages: dirtyclean: looping, n = %d\n",
debug_n_dirtyclean);
DLOG((DLOG_PAGE, "lfs_putpages: dirtyclean: looping, n = %d\n",
debug_n_dirtyclean));
#endif
/*
@ -2228,6 +2307,7 @@ lfs_putpages(void *v)
wakeup(&lfs_writer_daemon);
mutex_exit(&lfs_lock);
preempt();
KASSERT(!mutex_owned(vp->v_interlock));
return EWOULDBLOCK;
}
@ -2239,26 +2319,28 @@ lfs_putpages(void *v)
*/
if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
(vp->v_uflag & VU_DIROP)) {
int locked;
DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
/* XXX VOP_ISLOCKED() may not be used for lock decisions. */
locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
lfs_writer_enter(fs, "ppdirop");
/* Note if we hold the vnode locked */
if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
{
DLOG((DLOG_PAGE, "lfs_putpages: dirop inode already locked\n"));
} else {
DLOG((DLOG_PAGE, "lfs_putpages: dirop inode not locked\n"));
}
mutex_exit(vp->v_interlock);
lfs_writer_enter(fs, "ppdirop");
if (locked)
VOP_UNLOCK(vp); /* XXX why? */
mutex_enter(&lfs_lock);
lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
mutex_exit(&lfs_lock);
if (locked)
VOP_LOCK(vp, LK_EXCLUSIVE);
mutex_enter(vp->v_interlock);
lfs_writer_leave(fs);
/* XXX the flush should have taken care of this one too! */
/* The flush will have cleaned out this vnode as well,
no need to do more to it. */
}
/*
@ -2286,8 +2368,10 @@ lfs_putpages(void *v)
if (!seglocked) {
mutex_exit(vp->v_interlock);
error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
if (error != 0)
return error;
if (error != 0) {
KASSERT(!mutex_owned(vp->v_interlock));
return error;
}
mutex_enter(vp->v_interlock);
lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
}
@ -2295,6 +2379,12 @@ lfs_putpages(void *v)
KASSERT(sp->vp == NULL);
sp->vp = vp;
/* Note segments written by reclaim; only for debugging */
if ((vp->v_iflag & VI_XLOCK) != 0) {
sp->seg_flags |= SEGM_RECLAIM;
fs->lfs_reclino = ip->i_number;
}
/*
* Ensure that the partial segment is marked SS_DIROP if this
* vnode is a DIROP.
@ -2313,10 +2403,11 @@ lfs_putpages(void *v)
#endif
do {
busypg = NULL;
KASSERT(mutex_owned(vp->v_interlock));
if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
ap->a_flags, 0, &busypg) < 0) {
mutex_exit(vp->v_interlock);
/* XXX why? --ks */
mutex_enter(vp->v_interlock);
write_and_wait(fs, vp, busypg, seglocked, NULL);
if (!seglocked) {
@ -2330,8 +2421,12 @@ lfs_putpages(void *v)
}
busypg = NULL;
KASSERT(!mutex_owned(&uvm_pageqlock));
oreclaim = (ap->a_flags & PGO_RECLAIM);
ap->a_flags &= ~PGO_RECLAIM;
error = genfs_do_putpages(vp, startoffset, endoffset,
ap->a_flags, &busypg);
ap->a_flags |= oreclaim;
if (error == EDEADLK || error == EAGAIN) {
DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
@ -2339,20 +2434,40 @@ lfs_putpages(void *v)
ip->i_number, fs->lfs_offset,
dtosn(fs, fs->lfs_offset)));
mutex_enter(vp->v_interlock);
write_and_wait(fs, vp, busypg, seglocked, "again");
if (oreclaim) {
mutex_enter(vp->v_interlock);
write_and_wait(fs, vp, busypg, seglocked, "again");
mutex_exit(vp->v_interlock);
} else {
if ((sp->seg_flags & SEGM_SINGLE) &&
fs->lfs_curseg != fs->lfs_startseg)
donewriting = 1;
}
} else if (error) {
DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
" %d ino %d off %x (seg %d)\n", error,
(int)ip->i_number, fs->lfs_offset,
dtosn(fs, fs->lfs_offset)));
}
/* genfs_do_putpages loses the interlock */
#ifdef DEBUG
++debug_n_again;
#endif
} while (error == EDEADLK);
if (oreclaim && error == EAGAIN) {
DLOG((DLOG_PAGE, "vp %p ino %d vi_flags %x a_flags %x avoiding vclean panic\n",
vp, (int)ip->i_number, vp->v_iflag, ap->a_flags));
mutex_enter(vp->v_interlock);
}
if (error == EDEADLK)
mutex_enter(vp->v_interlock);
} while (error == EDEADLK || (oreclaim && error == EAGAIN));
#ifdef DEBUG
if (debug_n_again > TOOMANY)
printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);
DLOG((DLOG_PAGE, "lfs_putpages: again: looping, n = %d\n", debug_n_again));
#endif
KASSERT(sp != NULL && sp->vp == vp);
if (!seglocked) {
if (!seglocked && !donewriting) {
sp->vp = NULL;
/* Write indirect blocks as well */
@ -2376,8 +2491,10 @@ lfs_putpages(void *v)
* If we were called from lfs_writefile, we don't need to clean up
* the FIP or unlock the segment lock. We're done.
*/
if (seglocked)
if (seglocked) {
KASSERT(!mutex_owned(vp->v_interlock));
return error;
}
/* Clean up FIP and send it to disk. */
lfs_release_finfo(fs);
@ -2417,6 +2534,7 @@ lfs_putpages(void *v)
}
mutex_exit(vp->v_interlock);
}
KASSERT(!mutex_owned(vp->v_interlock));
return error;
}

inode.h

@ -1,4 +1,4 @@
/* $NetBSD: inode.h,v 1.58 2011/07/12 02:22:13 dholland Exp $ */
/* $NetBSD: inode.h,v 1.59 2012/01/02 22:10:45 perseant Exp $ */
/*
* Copyright (c) 1982, 1989, 1993
@ -242,7 +242,7 @@ struct inode {
#define IN_ADIROP 0x0200 /* LFS: dirop in progress */
#define IN_SPACECOUNTED 0x0400 /* Blocks to be freed in free count. */
#define IN_PAGING 0x1000 /* LFS: file is on paging queue */
#define IN_CDIROP 0x4000 /* LFS: dirop completed pending i/o */
#if defined(_KERNEL)
/*

ufs_readwrite.c

@ -1,4 +1,4 @@
/* $NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $ */
/* $NetBSD: ufs_readwrite.c,v 1.101 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1993
@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $");
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.101 2012/01/02 22:10:45 perseant Exp $");
#ifdef LFS_READWRITE
#define FS struct lfs
@ -294,6 +294,7 @@ WRITE(void *v)
#ifdef LFS_READWRITE
async = true;
lfs_availwait(fs, btofsb(fs, uio->uio_resid));
lfs_check(vp, LFS_UNUSED_LBN, 0);
#endif /* !LFS_READWRITE */
if (!usepc)