PR kern/39564 wapbl performance issues with disk cache flushing

PR kern/40361 WAPBL locking panic in -current
PR kern/40470 WAPBL corrupts ext2fs
PR kern/40562 busy loop in ffs_sync when unmounting a file system
PR kern/40525 panic: ffs_valloc: dup alloc

- A fix for an issue that can lead to "ffs_valloc: dup" due to dirty cg
  buffers being invalidated. Problem discovered and patch by dholland@.

- If the syncer fails to lazily sync a vnode due to lock contention,
  retry 1 second later instead of 30 seconds later.

- Flush inode atime updates every ~10 seconds (this makes most sense with
  logging). Previously they did not hit the disk for read-only files or
  devices until the file system was unmounted. It would be better to trickle
  the updates out but that would require more extensive changes.

- Fix issues with file system corruption, busy looping and other nasty
  problems when logging and non-logging file systems are intermixed,
  with one being the root file system.

- For logging, do not flush metadata on an inode-at-a-time basis if the sync
  has been requested by ioflush. Previously, we could try hundreds of log
  sync operations a second due to inode update activity, causing the syncer
  to fall behind and metadata updates to be serialized across the entire
  file system. Instead, burst out metadata and log flushes at a minimum
  interval of every 10 seconds on an active file system (happens more often
  if the log becomes full). Note this does not change the operation of
  fsync() etc.

- With the flush issue fixed, re-enable concurrent metadata updates in
  vfs_wapbl.c.
This commit is contained in:
ad 2009-02-22 20:10:25 +00:00
parent 4534498c64
commit 430f67aa17
6 changed files with 328 additions and 90 deletions

View File

@ -1,7 +1,7 @@
/* $NetBSD: vfs_wapbl.c,v 1.22 2009/02/18 13:22:10 yamt Exp $ */
/* $NetBSD: vfs_wapbl.c,v 1.23 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
* Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@ -36,7 +36,7 @@
#define WAPBL_INTERNAL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.22 2009/02/18 13:22:10 yamt Exp $");
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.23 2009/02/22 20:10:25 ad Exp $");
#include <sys/param.h>
@ -770,27 +770,9 @@ wapbl_begin(struct wapbl *wl, const char *file, int line)
{
int doflush;
unsigned lockcount;
krw_t op;
KDASSERT(wl);
/*
* XXX: The original code calls for the use of a RW_READER lock
* here, but it turns out there are performance issues with high
* metadata-rate workloads (e.g. multiple simultaneous tar
* extractions). For now, we force the lock to be RW_WRITER,
* since that currently has the best performance characteristics
* (even for a single tar-file extraction).
*
*/
#define WAPBL_DEBUG_SERIALIZE 1
#ifdef WAPBL_DEBUG_SERIALIZE
op = RW_WRITER;
#else
op = RW_READER;
#endif
/*
* XXX this needs to be made much more sophisticated.
* perhaps each wapbl_begin could reserve a specified
@ -820,12 +802,12 @@ wapbl_begin(struct wapbl *wl, const char *file, int line)
return error;
}
rw_enter(&wl->wl_rwlock, op);
rw_enter(&wl->wl_rwlock, RW_READER);
mutex_enter(&wl->wl_mtx);
wl->wl_lock_count++;
mutex_exit(&wl->wl_mtx);
#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
#if defined(WAPBL_DEBUG_PRINT)
WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
("wapbl_begin thread %d.%d with bufcount=%zu "
"bufbytes=%zu bcount=%zu at %s:%d\n",
@ -840,7 +822,7 @@ void
wapbl_end(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
#if defined(WAPBL_DEBUG_PRINT)
WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
("wapbl_end thread %d.%d with bufcount=%zu "
"bufbytes=%zu bcount=%zu\n",
@ -1552,20 +1534,14 @@ void
wapbl_jlock_assert(struct wapbl *wl)
{
#ifdef WAPBL_DEBUG_SERIALIZE
KASSERT(rw_write_held(&wl->wl_rwlock));
#else
KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
#endif
KASSERT(rw_lock_held(&wl->wl_rwlock));
}
void
wapbl_junlock_assert(struct wapbl *wl)
{
#ifdef WAPBL_DEBUG_SERIALIZE
KASSERT(!rw_write_held(&wl->wl_rwlock));
#endif
}
/****************************************************************/

View File

@ -1,4 +1,33 @@
/* $NetBSD: sync_subr.c,v 1.35 2009/01/17 07:02:35 yamt Exp $ */
/* $NetBSD: sync_subr.c,v 1.36 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright 1997 Marshall Kirk McKusick. All Rights Reserved.
@ -32,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.35 2009/01/17 07:02:35 yamt Exp $");
__KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.36 2009/02/22 20:10:25 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -59,6 +88,7 @@ time_t syncdelay = 30; /* max time to delay syncing data */
time_t filedelay = 30; /* time to delay syncing files */
time_t dirdelay = 15; /* time to delay syncing directories */
time_t metadelay = 10; /* time to delay syncing metadata */
time_t lockdelay = 1; /* time to delay if locking fails */
kmutex_t syncer_mutex; /* used to freeze syncer, long term */
static kmutex_t syncer_data_lock; /* short term lock on data structures */
@ -196,6 +226,7 @@ sched_sync(void *v)
struct synclist *slp;
struct vnode *vp;
long starttime;
bool synced;
updateproc = curlwp;
@ -206,8 +237,7 @@ sched_sync(void *v)
starttime = time_second;
/*
* Push files whose dirty time has expired. Be careful
* of interrupt race on slp queue.
* Push files whose dirty time has expired.
*/
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
@ -216,10 +246,12 @@ sched_sync(void *v)
while ((vp = TAILQ_FIRST(slp)) != NULL) {
/* We are locking in the wrong direction. */
synced = false;
if (mutex_tryenter(&vp->v_interlock)) {
mutex_exit(&syncer_data_lock);
if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT |
LK_INTERLOCK) == 0) {
synced = true;
(void) VOP_FSYNC(vp, curlwp->l_cred,
FSYNC_LAZY, 0, 0);
vput(vp);
@ -227,15 +259,36 @@ sched_sync(void *v)
mutex_enter(&syncer_data_lock);
}
/* XXXAD The vnode may have been recycled. */
/*
* XXX The vnode may have been recycled, in which
* case it may have a new identity.
*/
if (TAILQ_FIRST(slp) == vp) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*
* Try again sooner rather than later if
* we were unable to lock the vnode. Lock
* failure should not prevent us from doing
* the sync "soon".
*
* If we locked it yet arrive here, it's
* likely that lazy sync is in progress and
* so the vnode still has dirty metadata.
* syncdelay is mainly to get this vnode out
* of the way so we do not consider it again
* "soon" in this loop, so the delay time is
* not critical as long as it is not "soon".
* While write-back strategy is the file
* system's domain, we expect write-back to
* occur no later than syncdelay seconds
* into the future.
*/
vn_syncer_add1(vp, syncdelay);
vn_syncer_add1(vp,
synced ? syncdelay : lockdelay);
}
}
@ -247,8 +300,10 @@ sched_sync(void *v)
if (bioopsp != NULL)
(*bioopsp->io_sync)(NULL);
/*
* Wait until there are more workitems to process.
*/
mutex_exit(&syncer_mutex);
mutex_enter(&syncer_data_lock);
if (rushjob > 0) {
/*

View File

@ -1,4 +1,33 @@
/* $NetBSD: sync_vnops.c,v 1.25 2008/05/06 18:43:44 ad Exp $ */
/* $NetBSD: sync_vnops.c,v 1.26 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright 1997 Marshall Kirk McKusick. All Rights Reserved.
@ -32,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sync_vnops.c,v 1.25 2008/05/06 18:43:44 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: sync_vnops.c,v 1.26 2009/02/22 20:10:25 ad Exp $");
#include <sys/param.h>
#include <sys/proc.h>
@ -62,6 +91,18 @@ const struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
const struct vnodeopv_desc sync_vnodeop_opv_desc =
{ &sync_vnodeop_p, sync_vnodeop_entries };
/*
 * Pick the syncer delay appropriate for the given mount point.  File
 * systems running with WAPBL use the sync vnode to burst out metadata
 * updates, so they are scheduled at the shorter metadata interval;
 * everything else gets the default syncer delay.
 */
static inline int
sync_delay(struct mount *mp)
{

	if (mp->mnt_wapbl != NULL)
		return metadelay;
	return syncdelay;
}
/*
* Create a new filesystem syncer vnode for the specified mount point.
*/
@ -70,8 +111,8 @@ vfs_allocate_syncvnode(mp)
struct mount *mp;
{
struct vnode *vp;
static long start, incr, next;
int error;
static int start, incr, next;
int error, vdelay;
/* Allocate a new vnode */
if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0)
@ -98,7 +139,8 @@ vfs_allocate_syncvnode(mp)
next = start;
}
mutex_enter(&vp->v_interlock);
vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
vdelay = sync_delay(mp);
vn_syncer_add_to_worklist(vp, vdelay > 0 ? next % vdelay : 0);
mutex_exit(&vp->v_interlock);
mp->mnt_syncer = vp;
return (0);
@ -149,7 +191,7 @@ sync_fsync(v)
* Move ourselves to the back of the sync list.
*/
mutex_enter(&syncvp->v_interlock);
vn_syncer_add_to_worklist(syncvp, syncdelay);
vn_syncer_add_to_worklist(syncvp, sync_delay(mp));
mutex_exit(&syncvp->v_interlock);
/*

View File

@ -1,4 +1,4 @@
/* $NetBSD: ffs_alloc.c,v 1.120 2009/01/11 02:45:56 christos Exp $ */
/* $NetBSD: ffs_alloc.c,v 1.121 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@ -70,7 +70,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.120 2009/01/11 02:45:56 christos Exp $");
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.121 2009/02/22 20:10:25 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
@ -1284,7 +1284,7 @@ retry:
if (ibp != NULL &&
initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
/* Another thread allocated more inodes so we retry the test. */
brelse(ibp, BC_INVAL);
brelse(ibp, 0);
ibp = NULL;
}
/*
@ -1396,7 +1396,7 @@ gotit:
if (bp != NULL)
brelse(bp, 0);
if (ibp != NULL)
brelse(ibp, BC_INVAL);
brelse(ibp, 0);
mutex_enter(&ump->um_lock);
return (0);
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: ffs_vfsops.c,v 1.241 2008/11/13 11:09:45 ad Exp $ */
/* $NetBSD: ffs_vfsops.c,v 1.242 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.241 2008/11/13 11:09:45 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.242 2009/02/22 20:10:25 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
@ -111,6 +111,8 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.241 2008/11/13 11:09:45 ad Exp $");
MODULE(MODULE_CLASS_VFS, ffs, NULL);
static int ffs_vfs_fsync(vnode_t *, int);
static struct sysctllog *ffs_sysctl_log;
/* how many times ffs_init() was called */
@ -151,7 +153,7 @@ struct vfsops ffs_vfsops = {
ffs_suspendctl,
genfs_renamelock_enter,
genfs_renamelock_exit,
ffs_full_fsync,
ffs_vfs_fsync,
ffs_vnodeopv_descs,
0,
{ NULL, NULL },
@ -1697,11 +1699,22 @@ loop:
continue;
mutex_enter(&vp->v_interlock);
ip = VTOI(vp);
/* XXXpooka: why wapbl check? */
/*
* We deliberately update inode times here. This will
* prevent a massive queue of updates accumulating, only
* to be handled by a call to unmount.
*
* XXX It would be better to have the syncer trickle these
* out. Adjustment needed to allow registering vnodes for
* sync when the vnode is clean, but the inode dirty. Or
* have ufs itself trickle out inode updates.
*/
if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 ||
vp->v_type == VNON || ((ip->i_flag &
(IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 &&
(LIST_EMPTY(&vp->v_dirtyblkhd) || (mp->mnt_wapbl)) &&
(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY |
IN_MODIFIED | IN_ACCESSED)) == 0 &&
LIST_EMPTY(&vp->v_dirtyblkhd) &&
UVM_OBJ_IS_CLEAN(&vp->v_uobj)))
{
mutex_exit(&vp->v_interlock);
@ -2138,3 +2151,152 @@ ffs_suspendctl(struct mount *mp, int cmd)
return EINVAL;
}
}
/*
 * Synch vnode for a mounted file system. This is called for foreign
 * vnodes, i.e. non-ffs.
 */
static int
ffs_vfs_fsync(vnode_t *vp, int flags)
{
int error, passes, skipmeta, i, pflags;
buf_t *bp, *nbp;
struct mount *mp;
/* Only device vnodes backing a mounted file system are expected here. */
KASSERT(vp->v_type == VBLK);
KASSERT(vp->v_specmountpoint != NULL);
mp = vp->v_specmountpoint;
/* Let softdep push its dependency-ordered device writes first, if enabled. */
if ((mp->mnt_flag & MNT_SOFTDEP) != 0)
softdep_fsync_mountdev(vp);
/*
 * Flush all dirty data associated with the vnode.
 */
pflags = PGO_ALLPAGES | PGO_CLEANIT;
if ((flags & FSYNC_WAIT) != 0)
pflags |= PGO_SYNCIO;
mutex_enter(&vp->v_interlock);
/* NOTE(review): assumes VOP_PUTPAGES consumes/releases v_interlock — confirm. */
error = VOP_PUTPAGES(vp, 0, 0, pflags);
if (error)
return error;
#ifdef WAPBL
if (mp && mp->mnt_wapbl) {
/*
 * Don't bother writing out metadata if the syncer is
 * making the request. We will let the sync vnode
 * write it out in a single burst through a call to
 * VFS_SYNC().
 */
if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0)
return 0;
/*
 * Don't flush the log if the vnode being flushed
 * contains no dirty buffers that could be in the log.
 */
if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
error = wapbl_flush(mp->mnt_wapbl, 0);
if (error)
return error;
}
/* Wait for any writes issued above to drain before reporting success. */
if ((flags & FSYNC_WAIT) != 0) {
mutex_enter(&vp->v_interlock);
while (vp->v_numoutput)
cv_wait(&vp->v_cv, &vp->v_interlock);
mutex_exit(&vp->v_interlock);
}
return 0;
}
#endif /* WAPBL */
/*
 * Write out metadata for non-logging file systems. This block can
 * be simplified once softdep goes.
 */
passes = NIADDR + 1;
skipmeta = 0;
if (flags & FSYNC_WAIT)
skipmeta = 1;
loop:
mutex_enter(&bufcache_lock);
/* Reset the per-pass scan marker so each buffer is considered once. */
LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
bp->b_cflags &= ~BC_SCANNED;
}
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_cflags & (BC_BUSY | BC_SCANNED))
continue;
if ((bp->b_oflags & BO_DELWRI) == 0)
panic("ffs_fsync: not dirty");
/* When skipmeta is set, defer metadata (negative lblkno) to a later pass. */
if (skipmeta && bp->b_lblkno < 0)
continue;
bp->b_cflags |= BC_BUSY | BC_VFLUSH | BC_SCANNED;
mutex_exit(&bufcache_lock);
/*
 * On our final pass through, do all I/O synchronously
 * so that we can find out if our flush is failing
 * because of write errors.
 */
if (passes > 0 || !(flags & FSYNC_WAIT))
(void) bawrite(bp);
else if ((error = bwrite(bp)) != 0)
return (error);
/*
 * Since we unlocked during the I/O, we need
 * to start from a known point.
 */
mutex_enter(&bufcache_lock);
nbp = LIST_FIRST(&vp->v_dirtyblkhd);
}
mutex_exit(&bufcache_lock);
if (skipmeta) {
skipmeta = 0;
goto loop;
}
if ((flags & FSYNC_WAIT) != 0) {
mutex_enter(&vp->v_interlock);
while (vp->v_numoutput) {
cv_wait(&vp->v_cv, &vp->v_interlock);
}
mutex_exit(&vp->v_interlock);
/*
 * Ensure that any filesystem metadata associated
 * with the vnode has been written.
 */
if ((error = softdep_sync_metadata(vp)) != 0)
return (error);
if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
/*
 * Block devices associated with filesystems may
 * have new I/O requests posted for them even if
 * the vnode is locked, so no amount of trying will
 * get them clean. Thus we give block devices a
 * good effort, then just give up. For all other file
 * types, go around and try again until it is clean.
 */
if (passes > 0) {
passes--;
goto loop;
}
#ifdef DIAGNOSTIC
if (vp->v_type != VBLK)
vprint("ffs_fsync: dirty", vp);
#endif
}
}
/* If requested, also flush the underlying disk's write cache. */
if (error == 0 && (flags & FSYNC_CACHE) != 0) {
(void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE,
kauth_cred_get());
}
return error;
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: ffs_vnops.c,v 1.109 2009/02/01 17:36:43 ad Exp $ */
/* $NetBSD: ffs_vnops.c,v 1.110 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.109 2009/02/01 17:36:43 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.110 2009/02/22 20:10:25 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
@ -319,7 +319,13 @@ ffs_fsync(void *v)
#ifdef WAPBL
mp = wapbl_vptomp(vp);
if (mp->mnt_wapbl) {
if (ap->a_flags & FSYNC_DATAONLY) {
/*
* Don't bother writing out metadata if the syncer is
* making the request. We will let the sync vnode
* write it out in a single burst through a call to
* VFS_SYNC().
*/
if ((ap->a_flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) {
fstrans_done(vp->v_mount);
return 0;
}
@ -336,7 +342,7 @@ ffs_fsync(void *v)
(ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
UFS_WAPBL_END(mp);
}
if (error || (ap->a_flags & FSYNC_NOLOG)) {
if (error || (ap->a_flags & FSYNC_NOLOG) != 0) {
fstrans_done(vp->v_mount);
return error;
}
@ -393,43 +399,38 @@ out:
}
/*
* Synch an open file. Called for VOP_FSYNC() and VFS_FSYNC().
*
* BEWARE: THIS ROUTINE ACCEPTS BOTH FFS AND NON-FFS VNODES.
* Synch an open file. Called for VOP_FSYNC().
*/
/* ARGSUSED */
int
ffs_full_fsync(struct vnode *vp, int flags)
{
struct buf *bp, *nbp;
int error, passes, skipmeta, inodedeps_only, waitfor;
int error, passes, skipmeta, inodedeps_only, waitfor, i;
struct mount *mp;
KASSERT(VTOI(vp) != NULL);
KASSERT(vp->v_tag == VT_UFS);
error = 0;
if ((flags & FSYNC_VFS) != 0) {
KASSERT(vp->v_specmountpoint != NULL);
mp = vp->v_mount;
if (vp->v_type == VBLK && vp->v_specmountpoint != NULL) {
mp = vp->v_specmountpoint;
KASSERT(vp->v_type == VBLK);
if ((mp->mnt_flag & MNT_SOFTDEP) != 0)
softdep_fsync_mountdev(vp);
} else {
mp = vp->v_mount;
KASSERT(vp->v_tag == VT_UFS);
}
if (vp->v_type == VBLK &&
vp->v_specmountpoint != NULL &&
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP))
softdep_fsync_mountdev(vp);
mutex_enter(&vp->v_interlock);
inodedeps_only = DOINGSOFTDEP(vp) && (flags & FSYNC_RECLAIM)
&& UVM_OBJ_IS_CLEAN(&vp->v_uobj) && LIST_EMPTY(&vp->v_dirtyblkhd);
/*
* Flush all dirty data associated with a vnode.
* Flush all dirty data associated with the vnode.
*/
if (vp->v_type == VREG || vp->v_type == VBLK) {
int pflags = PGO_ALLPAGES | PGO_CLEANIT;
@ -447,21 +448,25 @@ ffs_full_fsync(struct vnode *vp, int flags)
#ifdef WAPBL
if (mp && mp->mnt_wapbl) {
error = 0;
if (flags & FSYNC_DATAONLY)
return error;
/*
* Don't bother writing out metadata if the syncer is
* making the request. We will let the sync vnode
* write it out in a single burst through a call to
* VFS_SYNC().
*/
if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0)
return 0;
if ((flags & FSYNC_VFS) == 0 && VTOI(vp) != NULL &&
(VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
| IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) {
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
error = ffs_update(vp, NULL, NULL,
(flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
(flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
UFS_WAPBL_END(mp);
}
if (error || (flags & FSYNC_NOLOG))
if (error || (flags & FSYNC_NOLOG) != 0)
return error;
/*
@ -476,7 +481,7 @@ ffs_full_fsync(struct vnode *vp, int flags)
if ((flags & FSYNC_WAIT) != 0) {
mutex_enter(&vp->v_interlock);
while (vp->v_numoutput)
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, &vp->v_interlock);
mutex_exit(&vp->v_interlock);
}
@ -485,6 +490,10 @@ ffs_full_fsync(struct vnode *vp, int flags)
}
#endif /* WAPBL */
/*
* Write out metadata for non-logging file systems. This block can
* be simplified once softdep goes.
*/
passes = NIADDR + 1;
skipmeta = 0;
if (flags & FSYNC_WAIT)
@ -565,17 +574,11 @@ loop:
waitfor = 0;
else
waitfor = (flags & FSYNC_WAIT) != 0 ? UPDATE_WAIT : 0;
if ((flags & FSYNC_VFS) == 0)
error = ffs_update(vp, NULL, NULL, waitfor);
error = ffs_update(vp, NULL, NULL, waitfor);
if (error == 0 && (flags & FSYNC_CACHE) != 0) {
int i = 0;
if ((flags & FSYNC_VFS) == 0) {
KASSERT(VTOI(vp) != NULL);
vp = VTOI(vp)->i_devvp;
}
VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE, curlwp->l_cred);
(void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE,
kauth_cred_get());
}
return error;