diff --git a/distrib/sets/lists/comp/mi b/distrib/sets/lists/comp/mi
index 710a445dbc29..a1e9af15c227 100644
--- a/distrib/sets/lists/comp/mi
+++ b/distrib/sets/lists/comp/mi
@@ -1,4 +1,4 @@
-#	$NetBSD: mi,v 1.1170 2008/07/29 13:17:40 pooka Exp $
+#	$NetBSD: mi,v 1.1171 2008/07/31 05:38:04 simonb Exp $
 #
 # Note: don't delete entries from here - mark them as "obsolete" instead.
 #
@@ -2069,6 +2069,7 @@
 ./usr/include/sys/vnode_if.h			comp-c-include
 ./usr/include/sys/vsio.h			comp-obsolete		obsolete
 ./usr/include/sys/wait.h			comp-c-include
+./usr/include/sys/wapbl.h			comp-c-include
 ./usr/include/sys/wdog.h			comp-c-include
 ./usr/include/sysexits.h			comp-c-include
 ./usr/include/syslog.h				comp-c-include
@@ -2100,6 +2101,7 @@
 ./usr/include/ufs/ufs/quota.h			comp-c-include
 ./usr/include/ufs/ufs/ufs_bswap.h		comp-c-include
 ./usr/include/ufs/ufs/ufs_extern.h		comp-c-include
+./usr/include/ufs/ufs/ufs_wapbl.h		comp-c-include
 ./usr/include/ufs/ufs/ufsmount.h		comp-c-include
 ./usr/include/ulimit.h				comp-c-include
 ./usr/include/unctrl.h				comp-c-include
diff --git a/include/mntopts.h b/include/mntopts.h
index c30385fa2517..6dac10e8dea6 100644
--- a/include/mntopts.h
+++ b/include/mntopts.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: mntopts.h,v 1.10 2006/10/31 08:12:46 mjf Exp $	*/
+/*	$NetBSD: mntopts.h,v 1.11 2008/07/31 05:38:04 simonb Exp $	*/
 
 /*-
  * Copyright (c) 1994
@@ -55,6 +55,7 @@ struct mntopt {
 #define MOPT_NOATIME		{ "atime",	1, MNT_NOATIME, 0 }
 #define MOPT_SYMPERM		{ "symperm",	0, MNT_SYMPERM, 0 }
 #define MOPT_SOFTDEP		{ "softdep",	0, MNT_SOFTDEP, 0 }
+#define MOPT_LOG		{ "log",	0, MNT_LOG, 0 }
 #define MOPT_IGNORE		{ "hidden",	0, MNT_IGNORE, 0 }
 
 /* Control flags. */
diff --git a/sbin/fsck_ffs/Makefile b/sbin/fsck_ffs/Makefile
index 57a72d416d8b..2cb2cb2dd22b 100644
--- a/sbin/fsck_ffs/Makefile
+++ b/sbin/fsck_ffs/Makefile
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.35 2008/05/04 15:37:19 tsutsui Exp $
+#	$NetBSD: Makefile,v 1.36 2008/07/31 05:38:04 simonb Exp $
 #	@(#)Makefile	8.2 (Berkeley) 4/27/95
 
 .include <bsd.own.mk>
@@ -19,6 +19,10 @@ SRCS+=	progress.c
 
 .PATH:	${NETBSDSRCDIR}/sys/ufs/ffs ${FSCK}
 
+SRCS+= vfs_wapbl.c wapbl.c
+.PATH:	${NETBSDSRCDIR}/sys/kern
+CPPFLAGS+=-DWAPBL_DEBUG_PRINT=0
+
 LDADD+=-lutil
 DPADD+=${LIBUTIL}
 
diff --git a/sbin/fsck_ffs/extern.h b/sbin/fsck_ffs/extern.h
index 0ff3e46edf18..66a0f29cba3e 100644
--- a/sbin/fsck_ffs/extern.h
+++ b/sbin/fsck_ffs/extern.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: extern.h,v 1.22 2005/06/27 01:25:35 christos Exp $	*/
+/*	$NetBSD: extern.h,v 1.23 2008/07/31 05:38:04 simonb Exp $	*/
 
 /*
  * Copyright (c) 1994 James A. Jegers
@@ -27,7 +27,7 @@
 void		adjust(struct inodesc *, int);
 daddr_t		allocblk(long);
 ino_t		allocdir(ino_t, ino_t, int);
-ino_t		allocino(ino_t request, int type);
+ino_t		allocino(ino_t, int);
 void		blkerror(ino_t, const char *, daddr_t);
 int		bread(int, char *, daddr_t, long);
 void		bufinit(void);
@@ -82,7 +82,12 @@ void		setinodebuf(ino_t);
 int		setup(const char *);
 void		voidquit(int);
 
-void	swap_cg(struct cg *, struct cg *);
-void copyback_cg(struct bufarea *);
-void sb_oldfscompat_write(struct fs *, struct fs *);
-void sb_oldfscompat_read(struct fs *, struct fs **);
+void		replay_wapbl(void);
+void		cleanup_wapbl(void);
+int		read_wapbl(char *, long, daddr_t);
+int		is_journal_inode(ino_t);
+
+void		swap_cg(struct cg *, struct cg *);
+void		copyback_cg(struct bufarea *);
+void		sb_oldfscompat_write(struct fs *, struct fs *);
+void		sb_oldfscompat_read(struct fs *, struct fs **);
diff --git a/sbin/fsck_ffs/fsck_ffs.8 b/sbin/fsck_ffs/fsck_ffs.8
index c241648df419..22e341e5c0a2 100644
--- a/sbin/fsck_ffs/fsck_ffs.8
+++ b/sbin/fsck_ffs/fsck_ffs.8
@@ -1,4 +1,4 @@
-.\"	$NetBSD: fsck_ffs.8,v 1.40 2005/01/19 16:41:04 wiz Exp $
+.\"	$NetBSD: fsck_ffs.8,v 1.41 2008/07/31 05:38:04 simonb Exp $
 .\"
 .\" Copyright (c) 1980, 1989, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
@@ -198,7 +198,7 @@ possible without user interaction.
 Conversion in preen mode is best used when all the file systems
 are being converted at once.
 The format of a file system can be determined from the
-second line of output from
+third line of output from
 .Xr dumpfs 8 .
 .It Fl d
 Print debugging output.
diff --git a/sbin/fsck_ffs/pass4.c b/sbin/fsck_ffs/pass4.c
index 9105b7d704a6..fab824ade249 100644
--- a/sbin/fsck_ffs/pass4.c
+++ b/sbin/fsck_ffs/pass4.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: pass4.c,v 1.24 2008/02/23 21:41:48 christos Exp $	*/
+/*	$NetBSD: pass4.c,v 1.25 2008/07/31 05:38:04 simonb Exp $	*/
 
 /*
  * Copyright (c) 1980, 1986, 1993
@@ -34,7 +34,7 @@
 #if 0
 static char sccsid[] = "@(#)pass4.c	8.4 (Berkeley) 4/28/95";
 #else
-__RCSID("$NetBSD: pass4.c,v 1.24 2008/02/23 21:41:48 christos Exp $");
+__RCSID("$NetBSD: pass4.c,v 1.25 2008/07/31 05:38:04 simonb Exp $");
 #endif
 #endif /* not lint */
 
@@ -89,7 +89,14 @@ pass4(void)
 			case DFOUND:
 				n = info->ino_linkcnt;
 				if (n) {
-					adjust(&idesc, (short)n);
+					if (is_journal_inode(inumber)) {
+						if (debug)
+							printf(
+    "skipping unreferenced journal inode %" PRId64 "\n", inumber);
+						break;
+					} else {
+						adjust(&idesc, (short)n);
+					}
 					break;
 				}
 				for (zlnp = zlnhead; zlnp; zlnp = zlnp->next)
diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c
index fdf53f633741..6a49ecb36c74 100644
--- a/sbin/fsck_ffs/setup.c
+++ b/sbin/fsck_ffs/setup.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: setup.c,v 1.82 2008/02/23 21:41:48 christos Exp $	*/
+/*	$NetBSD: setup.c,v 1.83 2008/07/31 05:38:04 simonb Exp $	*/
 
 /*
  * Copyright (c) 1980, 1986, 1993
@@ -34,7 +34,7 @@
 #if 0
 static char sccsid[] = "@(#)setup.c	8.10 (Berkeley) 5/9/95";
 #else
-__RCSID("$NetBSD: setup.c,v 1.82 2008/02/23 21:41:48 christos Exp $");
+__RCSID("$NetBSD: setup.c,v 1.83 2008/07/31 05:38:04 simonb Exp $");
 #endif
 #endif /* not lint */
 
@@ -159,6 +159,25 @@ setup(const char *dev)
 		doskipclean = 0;
 		pwarn("USING ALTERNATE SUPERBLOCK AT %d\n", bflag);
 	}
+	if (sblock->fs_flags & FS_DOWAPBL) {
+		if (preen) {
+			if (!quiet)
+				pwarn("file system is journaled; not checking\n");
+			return (-1);
+		}
+		if (!quiet)
+			pwarn("** File system is journaled; replaying journal\n");
+		replay_wapbl();
+		doskipclean = 0;
+		sblock->fs_flags &= ~FS_DOWAPBL;
+		sbdirty();
+		/* Although we may have updated the superblock from the
+		 * journal, we are still going to do a full check, so we
+		 * don't bother to re-read the superblock from the journal.
+		 * XXX: instead we could re-read the superblock and then
+		 * not force doskipclean = 0.
+		 */
+	}
 	if (debug)
 		printf("clean = %d\n", sblock->fs_clean);
 	if (doswap)
@@ -218,6 +237,13 @@ setup(const char *dev)
 	/*
 	 * Check and potentially fix certain fields in the super block.
 	 */
+	if (sblock->fs_flags & ~(FS_KNOWN_FLAGS)) {
+		pfatal("UNKNOWN FLAGS=0x%08x IN SUPERBLOCK", sblock->fs_flags);
+		if (reply("CLEAR") == 1) {
+			sblock->fs_flags &= FS_KNOWN_FLAGS;
+			sbdirty();
+		}
+	}
 	if (sblock->fs_optim != FS_OPTTIME && sblock->fs_optim != FS_OPTSPACE) {
 		pfatal("UNDEFINED OPTIMIZATION IN SUPERBLOCK");
 		if (reply("SET TO DEFAULT") == 1) {
diff --git a/sbin/fsck_ffs/utilities.c b/sbin/fsck_ffs/utilities.c
index 7551c20f28ab..06429296344a 100644
--- a/sbin/fsck_ffs/utilities.c
+++ b/sbin/fsck_ffs/utilities.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: utilities.c,v 1.55 2008/02/23 21:41:48 christos Exp $	*/
+/*	$NetBSD: utilities.c,v 1.56 2008/07/31 05:38:04 simonb Exp $	*/
 
 /*
  * Copyright (c) 1980, 1986, 1993
@@ -34,7 +34,7 @@
 #if 0
 static char sccsid[] = "@(#)utilities.c	8.6 (Berkeley) 5/19/95";
 #else
-__RCSID("$NetBSD: utilities.c,v 1.55 2008/02/23 21:41:48 christos Exp $");
+__RCSID("$NetBSD: utilities.c,v 1.56 2008/07/31 05:38:04 simonb Exp $");
 #endif
 #endif /* not lint */
 
@@ -322,6 +322,7 @@ ckfini(void)
 	if (debug)
 		printf("cache missed %ld of %ld (%d%%)\n", diskreads,
 		    totalreads, (int)(diskreads * 100 / totalreads));
+	cleanup_wapbl();
 	(void)close(fsreadfd);
 	(void)close(fswritefd);
 }
@@ -335,7 +336,8 @@ bread(int fd, char *buf, daddr_t blk, long size)
 
 	offset = blk;
 	offset *= dev_bsize;
-	if (pread(fd, buf, (int)size, offset) == size)
+	if ((pread(fd, buf, (int)size, offset) == size) &&
+	    read_wapbl(buf, size, blk) == 0)
 		return (0);
 	rwerror("READ", blk);
 	errs = 0;
diff --git a/sbin/fsck_ffs/wapbl.c b/sbin/fsck_ffs/wapbl.c
new file mode 100644
index 000000000000..8e68b0750a87
--- /dev/null
+++ b/sbin/fsck_ffs/wapbl.c
@@ -0,0 +1,202 @@
+/*	$NetBSD: wapbl.c,v 1.2 2008/07/31 05:38:04 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2005,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* This file contains fsck support for wapbl
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: wapbl.c,v 1.2 2008/07/31 05:38:04 simonb Exp $");
+
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#include <sys/wapbl.h>
+
+#include "fsck.h"
+#include "fsutil.h"
+#include "extern.h"
+#include "exitvalues.h"
+
+int
+wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
+{
+	/* Write len bytes at block pbn via bwrite(); devvp is unused. */
+	WAPBL_PRINTF(WAPBL_PRINT_IO,
+		("wapbl_write: %zu bytes at block %"PRId64" on fd 0x%x\n",
+		len, pbn, fswritefd));
+	bwrite(fswritefd, data, pbn, len);
+	return 0;
+}
+
+int
+wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
+{
+	/* Read via bread(); NOTE(review): its result is dropped, returns 0. */
+	WAPBL_PRINTF(WAPBL_PRINT_IO,
+		("wapbl_read: %zu bytes at block %"PRId64" on fd 0x%x\n",
+		len, pbn, fsreadfd));
+	bread(fsreadfd, data, pbn, len);
+	return 0;
+}
+
+struct wapbl_replay *wapbl_replay;
+
+void
+replay_wapbl(void)
+{
+	uint64_t addr, count, blksize;
+	int error, i;
+
+	/* Scale the WAPBL diagnostics with fsck's own debug level. */
+	if (debug)
+		wapbl_debug_print = WAPBL_PRINT_ERROR | WAPBL_PRINT_REPLAY;
+	if (debug > 1)
+		wapbl_debug_print |= WAPBL_PRINT_IO;
+
+	/* Only a journal of the version we were built for is replayed. */
+	if (sblock->fs_journal_version != UFS_WAPBL_VERSION) {
+		pfatal("INVALID JOURNAL VERSION %d",
+		    sblock->fs_journal_version);
+		if (reply("CONTINUE") == 0)
+			exit(FSCK_EXIT_CHECK_FAILED);
+		return;
+	}
+
+	/* Pull the journal extent out of the superblock locators. */
+	switch (sblock->fs_journal_location) {
+	case UFS_WAPBL_JOURNALLOC_NONE:
+		pfatal("INVALID JOURNAL LOCATION 'NONE'");
+		if (reply("CONTINUE") == 0)
+			exit(FSCK_EXIT_CHECK_FAILED);
+		return;
+
+	case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+		addr = sblock->fs_journallocs[UFS_WAPBL_EPART_ADDR];
+		count = sblock->fs_journallocs[UFS_WAPBL_EPART_COUNT];
+		blksize = sblock->fs_journallocs[UFS_WAPBL_EPART_BLKSZ];
+		break;
+
+	case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+		addr = sblock->fs_journallocs[UFS_WAPBL_INFS_ADDR];
+		count = sblock->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+		blksize = sblock->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+		break;
+
+	default:
+		pfatal("INVALID JOURNAL LOCATION %d",
+		    sblock->fs_journal_location);
+		if (reply("CONTINUE") == 0)
+			exit(FSCK_EXIT_CHECK_FAILED);
+		return;
+	}
+
+	error = wapbl_replay_start(&wapbl_replay, 0, addr, count, blksize);
+	if (error) {
+		pfatal("UNABLE TO READ JOURNAL FOR REPLAY");
+		if (reply("CONTINUE") == 0)
+			exit(FSCK_EXIT_CHECK_FAILED);
+		return;
+	}
+
+	/* Write the journaled blocks back unless nflag is set; the
+	 * replay is only stopped once that write has succeeded. */
+	if (!nflag) {
+		error = wapbl_replay_write(wapbl_replay, 0);
+		if (error == 0) {
+			wapbl_replay_stop(wapbl_replay);
+		} else {
+			pfatal("UNABLE TO REPLAY JOURNAL BLOCKS");
+			if (reply("CONTINUE") == 0)
+				exit(FSCK_EXIT_CHECK_FAILED);
+		}
+	}
+
+	/* Note each inode listed by the replay via the debug print hook. */
+	for (i = 0; i < wapbl_replay->wr_inodescnt; i++) {
+		WAPBL_PRINTF(WAPBL_PRINT_REPLAY,("wapbl_replay: "
+		    "not cleaning inode %"PRIu32" mode %"PRIo32"\n",
+		    wapbl_replay->wr_inodes[i].wr_inumber,
+		    wapbl_replay->wr_inodes[i].wr_imode));
+	}
+}
+
+void
+cleanup_wapbl(void)
+{
+	/* Stop the replay if it is still open, then free and forget it. */
+	if (wapbl_replay == NULL)
+		return;
+	if (wapbl_replay_isopen(wapbl_replay))
+		wapbl_replay_stop(wapbl_replay);
+	wapbl_replay_free(wapbl_replay);
+	wapbl_replay = NULL;
+}
+
+int
+read_wapbl(char *buf, long size, daddr_t blk)
+{
+	/* Only consult the journal when a replay exists and is still open. */
+	if (wapbl_replay != NULL && wapbl_replay_isopen(wapbl_replay))
+		return wapbl_replay_read(wapbl_replay, buf, blk, size);
+	return 0;
+}
+
+int
+is_journal_inode(ino_t ino)
+{
+	union dinode *dp = ginode(ino);
+
+	/* Journal inode: SF_LOG set and named by the in-fs journal locator. */
+	if ((iswap32(DIP(dp, flags)) & SF_LOG) == 0)
+		return 0;
+	if (sblock->fs_journal_version != UFS_WAPBL_VERSION)
+		return 0;
+	if (sblock->fs_journal_location != UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM)
+		return 0;
+	return sblock->fs_journallocs[UFS_WAPBL_INFS_INO] == ino;
+}
diff --git a/sbin/fsdb/Makefile b/sbin/fsdb/Makefile
index 7a2615e1a8ed..1aa3bdd375b4 100644
--- a/sbin/fsdb/Makefile
+++ b/sbin/fsdb/Makefile
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.22 2008/05/04 15:37:19 tsutsui Exp $
+#	$NetBSD: Makefile,v 1.23 2008/07/31 05:38:04 simonb Exp $
 #	@(#)Makefile	8.1 (Berkeley) 6/5/93
 
 .include <bsd.own.mk>
@@ -16,6 +16,10 @@ FSCK_FFS=${NETBSDSRCDIR}/sbin/fsck_ffs
 CPPFLAGS+= -I${FSCK} -I${FSCK_FFS}
 .PATH:	${FSCK} ${FSCK_FFS} ${NETBSDSRCDIR}/sys/ufs/ffs
 
+SRCS+=	vfs_wapbl.c wapbl.c
+.PATH:	${NETBSDSRCDIR}/sys/kern
+CPPFLAGS+=-DWAPBL_DEBUG_PRINT=0
+
 LDADD+= -lutil -ledit -ltermcap
 .ifndef HOSTPROG
 DPADD+= ${LIBUTIL} ${LIBEDIT} ${LIBTERMCAP}
diff --git a/sbin/mount_ffs/mount_ffs.c b/sbin/mount_ffs/mount_ffs.c
index 9c46df364ba4..d6a23166b95d 100644
--- a/sbin/mount_ffs/mount_ffs.c
+++ b/sbin/mount_ffs/mount_ffs.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: mount_ffs.c,v 1.23 2008/07/20 01:20:22 lukem Exp $	*/
+/*	$NetBSD: mount_ffs.c,v 1.24 2008/07/31 05:38:04 simonb Exp $	*/
 
 /*-
  * Copyright (c) 1993, 1994
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1993, 1994\
 #if 0
 static char sccsid[] = "@(#)mount_ufs.c	8.4 (Berkeley) 4/26/95";
 #else
-__RCSID("$NetBSD: mount_ffs.c,v 1.23 2008/07/20 01:20:22 lukem Exp $");
+__RCSID("$NetBSD: mount_ffs.c,v 1.24 2008/07/31 05:38:04 simonb Exp $");
 #endif
 #endif /* not lint */
 
@@ -70,6 +70,7 @@ static const struct mntopt mopts[] = {
 	MOPT_NODEVMTIME,
 	MOPT_FORCE,
 	MOPT_SOFTDEP,
+	MOPT_LOG,
 	MOPT_GETARGS,
 	MOPT_NULL,
 };
diff --git a/sbin/tunefs/tunefs.8 b/sbin/tunefs/tunefs.8
index df5095e835dd..0aac0e340f5b 100644
--- a/sbin/tunefs/tunefs.8
+++ b/sbin/tunefs/tunefs.8
@@ -1,4 +1,4 @@
-.\"	$NetBSD: tunefs.8,v 1.36 2004/12/20 10:28:47 hubertf Exp $
+.\"	$NetBSD: tunefs.8,v 1.37 2008/07/31 05:38:04 simonb Exp $
 .\"
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
@@ -41,6 +41,7 @@
 .Op Fl e Ar maxbpg
 .Op Fl g Ar avgfilesize
 .Op Fl h Ar avgfpdir
+.Op Fl l Ar logsize
 .Op Fl m Ar minfree
 .Bk -words
 .\" .Op Fl n Ar soft_dependency_enabling
@@ -97,6 +98,13 @@ this parameter should be set higher.
 This specifies the expected average file size.
 .It Fl h Ar avgfpdir
 This specifies the expected number of files per directory.
+.It Fl l Ar logsize
+This value specifies the size of the in-filesystem journaling log file.
+The default journaling log file size is described in
+.Xr wapbl 4 .
+Specifying a size of zero will cause the in-filesystem journaling log file
+to be removed the next time the filesystem is mounted.
+The size of an existing in-filesystem journaling log file cannot be changed.
 .It Fl m Ar minfree
 This value specifies the percentage of space held back
 from normal users; the minimum free space threshold.
@@ -145,6 +153,7 @@ or
 .Li time .
 .El
 .Sh SEE ALSO
+.Xr wapbl 4 ,
 .Xr fs 5 ,
 .Xr dumpfs 8 ,
 .Xr fsck_ffs 8 ,
diff --git a/sbin/tunefs/tunefs.c b/sbin/tunefs/tunefs.c
index b4ac66d9cfcb..ca7c14daa42d 100644
--- a/sbin/tunefs/tunefs.c
+++ b/sbin/tunefs/tunefs.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: tunefs.c,v 1.34 2008/07/20 01:20:23 lukem Exp $	*/
+/*	$NetBSD: tunefs.c,v 1.35 2008/07/31 05:38:04 simonb Exp $	*/
 
 /*
  * Copyright (c) 1983, 1993
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1983, 1993\
 #if 0
 static char sccsid[] = "@(#)tunefs.c	8.3 (Berkeley) 5/3/95";
 #else
-__RCSID("$NetBSD: tunefs.c,v 1.34 2008/07/20 01:20:23 lukem Exp $");
+__RCSID("$NetBSD: tunefs.c,v 1.35 2008/07/31 05:38:04 simonb Exp $");
 #endif
 #endif /* not lint */
 
@@ -48,9 +48,9 @@ __RCSID("$NetBSD: tunefs.c,v 1.34 2008/07/20 01:20:23 lukem Exp $");
  */
 #include <sys/param.h>
 
-#include <ufs/ufs/dinode.h>
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
 
 #include <machine/bswap.h>
 
@@ -85,15 +85,16 @@ static off_t sblock_try[] = SBLOCKSEARCH;
 
 static	void	bwrite(daddr_t, char *, int, const char *);
 static	void	bread(daddr_t, char *, int, const char *);
-static	int	getnum(const char *, const char *, int, int);
+static	void	change_log_info(long long);
 static	void	getsb(struct fs *, const char *);
 static	int	openpartition(const char *, int, char *, size_t);
+static	void	show_log_info(void);
 static	void	usage(void);
 
 int
 main(int argc, char *argv[])
 {
-#define	OPTSTRINGBASE	"AFNe:g:h:m:o:"
+#define	OPTSTRINGBASE	"AFNe:g:h:l:m:o:"
 #ifdef TUNEFS_SOFTDEP
 	int		softdep;
 #define	OPTSTRING	OPTSTRINGBASE ## "n:"
@@ -105,10 +106,12 @@ main(int argc, char *argv[])
 	char		device[MAXPATHLEN];
 	int		maxbpg, minfree, optim;
 	int		avgfilesize, avgfpdir;
+	long long	logfilesize;
 
 	Aflag = Fflag = Nflag = 0;
 	maxbpg = minfree = optim = -1;
 	avgfilesize = avgfpdir = -1;
+	logfilesize = -1;
 #ifdef TUNEFS_SOFTDEP
 	softdep = -1;
 #endif
@@ -131,25 +134,30 @@ main(int argc, char *argv[])
 			break;
 
 		case 'e':
-			maxbpg = getnum(optarg,
+			maxbpg = strsuftoll(
 			    "maximum blocks per file in a cylinder group",
-			    1, INT_MAX);
+			    optarg, 1, INT_MAX);
 			break;
 
 		case 'g':
-			avgfilesize = getnum(optarg,
-			    "average file size", 1, INT_MAX);
-			break;
-
-		case 'h':
-			avgfpdir = getnum(optarg,
-			    "expected number of files per directory",
+			avgfilesize = strsuftoll("average file size", optarg,
 			    1, INT_MAX);
 			break;
 
+		case 'h':
+			avgfpdir = strsuftoll(
+			    "expected number of files per directory",
+			    optarg, 1, INT_MAX);
+			break;
+
+		case 'l':
+			logfilesize = strsuftoll("journal log file size",
+			    optarg, 0, INT_MAX);
+			break;
+
 		case 'm':
-			minfree = getnum(optarg,
-			    "minimum percentage of free space", 0, 99);
+			minfree = strsuftoll("minimum percentage of free space",
+			    optarg, 0, 99);
 			break;
 
 #ifdef TUNEFS_SOFTDEP
@@ -254,6 +262,9 @@ main(int argc, char *argv[])
 	CHANGEVAL(sblock.fs_avgfpdir, avgfpdir,
 	    "expected number of files per directory", "");
 
+	if (logfilesize >= 0)
+		change_log_info(logfilesize);
+
 	if (Nflag) {
 		fprintf(stdout, "tunefs: current settings of %s\n", special);
 		fprintf(stdout, "\tmaximum contiguous block count %d\n",
@@ -274,6 +285,7 @@ main(int argc, char *argv[])
 		fprintf(stdout,
 		    "\texpected number of files per directory: %d\n",
 		    sblock.fs_avgfpdir);
+		show_log_info();
 		fprintf(stdout, "tunefs: no changes made\n");
 		exit(0);
 	}
@@ -290,20 +302,123 @@ main(int argc, char *argv[])
 	exit(0);
 }
 
-static int
-getnum(const char *num, const char *desc, int min, int max)
+static void
+show_log_info(void)
 {
-	long	n;
-	char	*ep;
+	const char *loc;
+	uint64_t size, blksize;
+	int print;
 
-	n = strtol(num, &ep, 10);
-	if (ep[0] != '\0')
-		errx(1, "Invalid number `%s' for %s", num, desc);
-	if ((int) n < min)
-		errx(1, "%s `%s' too small (minimum is %d)", desc, num, min);
-	if ((int) n > max)
-		errx(1, "%s `%s' too large (maximum is %d)", desc, num, max);
-	return ((int)n);
+	/* Print the superblock's journal location, size and flags, if any. */
+	switch (sblock.fs_journal_location) {
+	case UFS_WAPBL_JOURNALLOC_NONE:
+		print = blksize = 0;	/* no journal: print nothing */
+		break;
+	case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+		loc = "end of partition";
+		size = sblock.fs_journallocs[UFS_WAPBL_EPART_COUNT];
+		blksize = sblock.fs_journallocs[UFS_WAPBL_EPART_BLKSZ];
+		print = 1;
+		break;
+	case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+		loc = "in filesystem";
+		size = sblock.fs_journallocs[UFS_WAPBL_INFS_COUNT];
+		blksize = sblock.fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+		print = 1;
+		break;
+	default:
+		loc = "unknown";
+		size = blksize = 0;
+		print = 1;
+		break;
+	}
+
+	if (print) {
+		fprintf(stdout, "\tjournal log file location: %s\n", loc);
+		fprintf(stdout, "\tjournal log file size: %" PRIu64 "\n",
+		    size * blksize);
+		fprintf(stdout, "\tjournal log flags:");
+		if (sblock.fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG)
+			fprintf(stdout, " create-log");
+		if (sblock.fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG)
+			fprintf(stdout, " clear-log");
+		fprintf(stdout, "\n");
+	}
+}
+
+static void
+change_log_info(long long logfilesize)
+{
+	/*
+	 * NOTES:
+	 *  - an existing log outside the filesystem can't be changed here
+	 *  - can't change size of existing log
+	 *  - if current is same, no action
+	 *  - if current is zero and new is non-zero, set flag to create log
+	 *    on next mount (this includes a filesystem with no log at all)
+	 *  - if current is non-zero and new is zero, set flag to clear log
+	 *    on next mount
+	 */
+	int in_fs_log;
+	uint64_t old_size;
+
+	old_size = 0;
+	switch (sblock.fs_journal_location) {
+	case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+		in_fs_log = 0;
+		old_size = sblock.fs_journallocs[UFS_WAPBL_EPART_COUNT] *
+		    sblock.fs_journallocs[UFS_WAPBL_EPART_BLKSZ];
+		break;
+
+	case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+		in_fs_log = 1;
+		old_size = sblock.fs_journallocs[UFS_WAPBL_INFS_COUNT] *
+		    sblock.fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+		break;
+
+	case UFS_WAPBL_JOURNALLOC_NONE:
+	default:
+		in_fs_log = 0;
+		old_size = 0;
+		break;
+	}
+
+	if (!in_fs_log && (old_size > 0 || logfilesize == 0))
+		errx(1, "Can't change size of non-in-filesystem log");
+
+	if (old_size == (uint64_t)logfilesize && logfilesize > 0) {
+		/* no action */
+		warnx("log file size remains unchanged at %lld", logfilesize);
+		return;
+	}
+
+	if (logfilesize == 0) {
+		/*
+		 * Don't clear out the locators - the kernel might need
+		 * these to find the log!  Just set the "clear the log"
+		 * flag and let the kernel do the rest.
+		 */
+		sblock.fs_journal_flags |= UFS_WAPBL_FLAGS_CLEAR_LOG;
+		sblock.fs_journal_flags &= ~UFS_WAPBL_FLAGS_CREATE_LOG;
+		warnx("log file size cleared from %" PRIu64 "", old_size);
+		return;
+	}
+
+	if (old_size == 0) {
+		/* create new log of desired size next mount */
+		sblock.fs_journal_location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM;
+		sblock.fs_journallocs[UFS_WAPBL_INFS_ADDR] = 0;
+		sblock.fs_journallocs[UFS_WAPBL_INFS_COUNT] = logfilesize;
+		sblock.fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = 0;
+		sblock.fs_journallocs[UFS_WAPBL_INFS_INO] = 0;
+		sblock.fs_journal_flags |= UFS_WAPBL_FLAGS_CREATE_LOG;
+		sblock.fs_journal_flags &= ~UFS_WAPBL_FLAGS_CLEAR_LOG;
+		warnx("log file size set to %lld", logfilesize);
+	} else {
+		errx(1,
+		    "Can't change existing log size from %" PRIu64 " to %lld",
+		     old_size, logfilesize);
+	}
+}
 
 static void
@@ -315,6 +430,7 @@ usage(void)
 	fprintf(stderr, "\t-e maximum blocks per file in a cylinder group\n");
 	fprintf(stderr, "\t-g average file size\n");
 	fprintf(stderr, "\t-h expected number of files per directory\n");
+	fprintf(stderr, "\t-l journal log file size (`0' to clear journal)\n");
 	fprintf(stderr, "\t-m minimum percentage of free space\n");
 #ifdef TUNEFS_SOFTDEP
 	fprintf(stderr, "\t-n soft dependencies (`enable' or `disable')\n");
diff --git a/sys/conf/files b/sys/conf/files
index 97a6135f38af..cad99941baf5 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1,4 +1,4 @@
-#	$NetBSD: files,v 1.910 2008/07/16 20:06:19 pooka Exp $
+#	$NetBSD: files,v 1.911 2008/07/31 05:38:04 simonb Exp $
 
 #	@(#)files.newconf	7.5 (Berkeley) 5/10/93
 
@@ -110,6 +110,10 @@ defflag	opt_fileassoc.h		FILEASSOC
 
 defflag	opt_gre.h		GRE_DEBUG
 
+# Write Ahead Physical Block Logging
+defflag opt_wapbl.h		WAPBL WAPBL_DEBUG
+defparam opt_wapbl.h		WAPBL_DEBUG_PRINT
+
 # compatibility options
 #
 defflag	opt_compat_netbsd.h	COMPAT_40
@@ -1475,6 +1479,7 @@ file	kern/vfs_subr.c
 file	kern/vfs_syscalls.c
 file	kern/vfs_trans.c
 file	kern/vfs_vnops.c
+file	kern/vfs_wapbl.c		wapbl
 file	kern/vfs_xattr.c
 file	kern/vnode_if.c
 file	miscfs/deadfs/dead_vnops.c
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 14881a98588a..bd35e0f2b180 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $	*/
+/*	$NetBSD: init_main.c,v 1.361 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -97,7 +97,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.361 2008/07/31 05:38:05 simonb Exp $");
 
 #include "opt_ipsec.h"
 #include "opt_ntp.h"
@@ -108,6 +108,7 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $")
 #include "opt_fileassoc.h"
 #include "opt_ktrace.h"
 #include "opt_pax.h"
+#include "opt_wapbl.h"
 
 #include "rnd.h"
 #include "sysmon_envsys.h"
@@ -192,6 +193,9 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $")
 #include <sys/ktrace.h>
 #endif
 #include <sys/kauth.h>
+#ifdef WAPBL
+#include <sys/wapbl.h>
+#endif
 #include <net80211/ieee80211_netbsd.h>
 
 #include <sys/syscall.h>
@@ -570,6 +574,11 @@ main(void)
 	/* Initialize the UUID system calls. */
 	uuid_init();
 
+#ifdef WAPBL
+	/* Initialize write-ahead physical block logging. */
+	wapbl_init();
+#endif
+
 	/*
 	 * Create process 1 (init(8)).  We do this now, as Unix has
 	 * historically had init be process 1, and changing this would
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index c428304cd8b8..d79f3029f276 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_bio.c,v 1.207 2008/07/14 16:22:42 hannken Exp $	*/
+/*	$NetBSD: vfs_bio.c,v 1.208 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*-
  * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
@@ -6,6 +6,8 @@
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Andrew Doran.
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -107,7 +109,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.207 2008/07/14 16:22:42 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.208 2008/07/31 05:38:05 simonb Exp $");
 
 #include "fs_ffs.h"
 #include "opt_bufcache.h"
@@ -126,6 +128,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.207 2008/07/14 16:22:42 hannken Exp $"
 #include <sys/fstrans.h>
 #include <sys/intr.h>
 #include <sys/cpu.h>
+#include <sys/wapbl.h>
 
 #include <uvm/uvm.h>
 
@@ -714,8 +717,23 @@ bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred,
 
 	/* Wait for the read to complete, and return result. */
 	error = biowait(bp);
-	if (error == 0 && (flags & B_MODIFY) != 0)
+	if (error == 0 && (flags & B_MODIFY) != 0)	/* XXXX before the next code block or after? */
 		error = fscow_run(bp, true);
+
+	if (!error) {
+		struct mount *mp = wapbl_vptomp(vp);
+
+		if (mp && mp->mnt_wapbl_replay &&
+		    WAPBL_REPLAY_ISOPEN(mp)) {
+			error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno,
+			    bp->b_bcount);
+			if (error) {
+				mutex_enter(&bufcache_lock);
+				SET(bp->b_cflags, BC_INVAL);
+				mutex_exit(&bufcache_lock);
+			}
+		}
+	}
 	return error;
 }
 
@@ -793,6 +811,13 @@ bwrite(buf_t *bp)
 		mp = NULL;
 	}
 
+	if (mp && mp->mnt_wapbl) {
+		if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
+			bdwrite(bp);
+			return 0;
+		}
+	}
+
 	/*
 	 * Remember buffer type, to switch on it later.  If the write was
 	 * synchronous, but the file system was mounted with MNT_ASYNC,
@@ -897,6 +922,14 @@ bdwrite(buf_t *bp)
 		return;
 	}
 
+	if (wapbl_vphaswapbl(bp->b_vp)) {
+		struct mount *mp = wapbl_vptomp(bp->b_vp);
+
+		if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
+			WAPBL_ADD_BUF(mp, bp);
+		}
+	}
+
 	/*
 	 * If the block hasn't been seen before:
 	 *	(1) Mark it as having been seen,
@@ -1028,6 +1061,16 @@ brelsel(buf_t *bp, int set)
 		if (bioopsp != NULL)
 			(*bioopsp->io_deallocate)(bp);
 
+		if (ISSET(bp->b_flags, B_LOCKED)) {
+			if (wapbl_vphaswapbl(vp = bp->b_vp)) {
+				struct mount *mp = wapbl_vptomp(vp);
+
+				KASSERT(bp->b_iodone
+				    != mp->mnt_wapbl_op->wo_wapbl_biodone);
+				WAPBL_REMOVE_BUF(mp, bp);
+			}
+		}
+
 		mutex_enter(bp->b_objlock);
 		CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
 		if ((vp = bp->b_vp) != NULL) {
@@ -1224,19 +1267,22 @@ geteblk(int size)
 int
 allocbuf(buf_t *bp, int size, int preserve)
 {
-	vsize_t oldsize, desired_size;
 	void *addr;
+	vsize_t oldsize, desired_size;
+	int oldcount;
 	int delta;
 
 	desired_size = buf_roundsize(size);
 	if (desired_size > MAXBSIZE)
 		printf("allocbuf: buffer larger than MAXBSIZE requested");
 
+	oldcount = bp->b_bcount;
+
 	bp->b_bcount = size;
 
 	oldsize = bp->b_bufsize;
 	if (oldsize == desired_size)
-		return 0;
+		goto out;
 
 	/*
 	 * If we want a buffer of a different size, re-allocate the
@@ -1274,6 +1320,11 @@ allocbuf(buf_t *bp, int size, int preserve)
 		}
 	}
 	mutex_exit(&bufcache_lock);
+
+ out:
+	if (wapbl_vphaswapbl(bp->b_vp))
+		WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
+
 	return 0;
 }
 
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index 3bfc11300dd2..055c11af7f8d 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_lookup.c,v 1.108 2008/05/06 18:43:44 ad Exp $	*/
+/*	$NetBSD: vfs_lookup.c,v 1.109 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.108 2008/05/06 18:43:44 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.109 2008/07/31 05:38:05 simonb Exp $");
 
 #include "opt_magiclinks.h"
 
@@ -956,8 +956,10 @@ relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 	if (cnp->cn_nameptr[0] == '\0')
 		panic("relookup: null name");
 
+#ifdef ohcrap
 	if (cnp->cn_flags & ISDOTDOT)
 		panic("relookup: lookup on dot-dot");
+#endif
 
 	/*
 	 * We now have a segment name to search for, and a directory to search.
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 17bb5ee4d51d..3ad7b9a616a3 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_subr.c,v 1.354 2008/07/27 15:08:37 pooka Exp $	*/
+/*	$NetBSD: vfs_subr.c,v 1.355 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*-
  * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
@@ -81,7 +81,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.354 2008/07/27 15:08:37 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.355 2008/07/31 05:38:05 simonb Exp $");
 
 #include "opt_ddb.h"
 #include "opt_compat_netbsd.h"
@@ -106,6 +106,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.354 2008/07/27 15:08:37 pooka Exp $")
 #include <sys/kauth.h>
 #include <sys/atomic.h>
 #include <sys/kthread.h>
+#include <sys/wapbl.h>
 
 #include <miscfs/specfs/specdev.h>
 #include <miscfs/syncfs/syncfs.h>
@@ -1804,8 +1805,13 @@ vclean(vnode_t *vp, int flags)
 	 */
 	if (flags & DOCLOSE) {
 		error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
-		if (error != 0)
+		if (error != 0) {
+			/* XXX, fix vn_start_write's grab of mp and use that. */
+
+			if (wapbl_vphaswapbl(vp))
+				WAPBL_DISCARD(wapbl_vptomp(vp));
 			error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
+		}
 		KASSERT(error == 0);
 		KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
 		if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index f6be934a7487..b48faf864f29 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_syscalls.c,v 1.369 2008/06/24 11:21:46 ad Exp $	*/
+/*	$NetBSD: vfs_syscalls.c,v 1.370 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -63,7 +63,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.369 2008/06/24 11:21:46 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.370 2008/07/31 05:38:05 simonb Exp $");
 
 #include "opt_compat_netbsd.h"
 #include "opt_compat_43.h"
@@ -208,12 +208,13 @@ mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags,
 	mp->mnt_flag &=
 	  ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
-	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP);
+	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
+	    MNT_LOG);
 	mp->mnt_flag |= flags &
 	   (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
-	    MNT_IGNORE);
+	    MNT_LOG | MNT_IGNORE);
 
 	error = VFS_MOUNT(mp, path, data, data_len);
 
@@ -367,7 +368,7 @@ mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops,
 	   (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
 	    MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP |
 	    MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP |
-	    MNT_IGNORE | MNT_RDONLY);
+	    MNT_LOG | MNT_IGNORE | MNT_RDONLY);
 
 	error = VFS_MOUNT(mp, path, data, data_len);
 	mp->mnt_flag &= ~MNT_OP_FLAGS;
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 900ddbbdf813..e168d2427c50 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: vfs_vnops.c,v 1.158 2008/06/02 16:08:41 ad Exp $	*/
+/*	$NetBSD: vfs_vnops.c,v 1.159 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.158 2008/06/02 16:08:41 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.159 2008/07/31 05:38:05 simonb Exp $");
 
 #include "fs_union.h"
 #include "veriexec.h"
@@ -61,6 +61,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.158 2008/06/02 16:08:41 ad Exp $");
 #include <sys/fstrans.h>
 #include <sys/atomic.h>
 #include <sys/filedesc.h>
+#include <sys/wapbl.h>
 
 #include <miscfs/specfs/specdev.h>
 
@@ -692,6 +693,11 @@ vn_lock(struct vnode *vp, int flags)
 	    LK_CANRECURSE))
 	    == 0);
 
+#ifdef DIAGNOSTIC
+	if (wapbl_vphaswapbl(vp))
+		WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
+#endif
+
 	do {
 		/*
 		 * XXX PR 37706 forced unmount of file systems is unsafe.
diff --git a/sys/kern/vfs_wapbl.c b/sys/kern/vfs_wapbl.c
new file mode 100644
index 000000000000..c9792cd17283
--- /dev/null
+++ b/sys/kern/vfs_wapbl.c
@@ -0,0 +1,2783 @@
+/*	$NetBSD: vfs_wapbl.c,v 1.2 2008/07/31 05:38:05 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This implements file system independent write ahead filesystem logging.
+ */
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.2 2008/07/31 05:38:05 simonb Exp $");
+
+#include <sys/param.h>
+
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/malloc.h>
+#include <sys/resourcevar.h>
+#include <sys/conf.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/kauth.h>
+#include <sys/mutex.h>
+#include <sys/atomic.h>
+#include <sys/wapbl.h>
+
+#if WAPBL_UVM_ALLOC
+#include <uvm/uvm.h>
+#endif
+
+#include <miscfs/specfs/specdev.h>
+
+MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
+#define	wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
+#define	wapbl_free(a) free((a), M_WAPBL)
+#define	wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
+
+#else /* !_KERNEL */
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/time.h>
+#include <sys/wapbl.h>
+
+#define	KDASSERT(x) assert(x)
+#define	KASSERT(x) assert(x)
+#define	wapbl_malloc(s) malloc(s)
+#define	wapbl_free(a) free(a)
+#define	wapbl_calloc(n, s) calloc((n), (s))
+
+#endif /* !_KERNEL */
+
+/*
+ * INTERNAL DATA STRUCTURES
+ */
+
+/*
+ * This structure holds per-mount log information.
+ *
+ * Legend:	a = atomic access only
+ *		r = read-only after init
+ *		l = rwlock held
+ *		m = mutex held
+ *		u = unlocked access ok
+ *		b = bufcache_lock held
+ */
+struct wapbl {
+	struct vnode *wl_logvp;	/* r:	log here */
+	struct vnode *wl_devvp;	/* r:	log on this device */
+	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
+	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
+	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
+					device */
+	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
+					filesystem device */
+
+	unsigned wl_lock_count;	/* a:	Count of transactions in progress */
+
+	size_t wl_circ_size; 	/* r:	Number of bytes in buffer of log */
+	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */
+
+	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
+	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */
+
+	off_t wl_head;		/* l:	Byte offset of log head */
+	off_t wl_tail;		/* l:	Byte offset of log tail */
+	/*
+	 * head == tail == 0 means log is empty.
+	 * head == tail != 0 means log is full.
+	 * See assertions in wapbl_advance() for other boundary conditions.
+	 * Only truncate moves the tail, except when flush sets it to
+	 * wl_header_size.  Only flush moves the head, except when truncate
+	 * sets it to 0.
+	 */
+
+	struct wapbl_wc_header *wl_wc_header;	/* l	*/
+	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */
+
+	kmutex_t wl_mtx;	/* u:	short-term lock */
+	krwlock_t wl_rwlock;	/* u:	File system transaction lock */
+
+	/*
+	 * Must be held while accessing
+	 * wl_count or wl_bufs or head or tail
+	 */
+
+	/*
+	 * Callback called from within the flush routine to flush any extra
+	 * bits.  Note that flush may be skipped without calling this if
+	 * there are no outstanding buffers in the transaction.
+	 */
+	wapbl_flush_fn_t wl_flush;	/* r	*/
+	wapbl_flush_fn_t wl_flush_abort;/* r	*/
+
+	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
+	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
+	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */
+
+	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */
+
+	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
+	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
+						reclamation by truncate */
+	int wl_error_count;	/* m:	# of wl_entries with errors */
+	size_t wl_reserved_bytes; /* never truncate log smaller than this */
+
+#ifdef WAPBL_DEBUG_BUFBYTES
+	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
+#endif
+
+	daddr_t *wl_deallocblks;/* l:	address of block */
+	int *wl_dealloclens;	/* l:	size of block (fragments, remember) */
+	int wl_dealloccnt;	/* l:	total count */
+	int wl_dealloclim;	/* l:	max count */
+
+	/* hashtable of inode numbers for allocated but unlinked inodes */
+	/* XXX: synchronization for this table is unclear */
+	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
+	u_long wl_inohashmask;
+	int wl_inohashcnt;
+
+	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
+						   accounting */
+};
+
+#ifdef WAPBL_DEBUG_PRINT
+int wapbl_debug_print = WAPBL_DEBUG_PRINT;
+#endif
+
+/****************************************************************/
+#ifdef _KERNEL
+
+#ifdef WAPBL_DEBUG
+struct wapbl *wapbl_debug_wl;
+#endif
+
+static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
+static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
+static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
+static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
+#endif /* _KERNEL */
+
+static int wapbl_replay_prescan(struct wapbl_replay *wr);
+static int wapbl_replay_get_inodes(struct wapbl_replay *wr);
+
+static __inline size_t wapbl_space_free(size_t avail, off_t head,
+	off_t tail);
+static __inline size_t wapbl_space_used(size_t avail, off_t head,
+	off_t tail);
+
+#ifdef _KERNEL
+
+#define	WAPBL_INODETRK_SIZE 83
+static int wapbl_ino_pool_refcount;
+static struct pool wapbl_ino_pool;
+struct wapbl_ino {
+	LIST_ENTRY(wapbl_ino) wi_hash;
+	ino_t wi_ino;
+	mode_t wi_mode;
+};
+
+static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
+static void wapbl_inodetrk_free(struct wapbl *wl);
+static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
+
+static size_t wapbl_transaction_len(struct wapbl *wl);
+static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
+
+/*
+ * This is useful for debugging.  If set, the log will
+ * only be truncated when necessary.
+ */
+int wapbl_lazy_truncate = 0;
+
+struct wapbl_ops wapbl_ops = {
+	.wo_wapbl_discard	= wapbl_discard,
+	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
+	.wo_wapbl_replay_read	= wapbl_replay_read,
+	.wo_wapbl_add_buf	= wapbl_add_buf,
+	.wo_wapbl_remove_buf	= wapbl_remove_buf,
+	.wo_wapbl_resize_buf	= wapbl_resize_buf,
+	.wo_wapbl_begin		= wapbl_begin,
+	.wo_wapbl_end		= wapbl_end,
+	.wo_wapbl_junlock_assert= wapbl_junlock_assert,
+
+	/* XXX: the following is only used to say "this is a wapbl buf" */
+	.wo_wapbl_biodone	= wapbl_biodone,
+};
+
+void
+wapbl_init(void)
+{
+	/* Attach the M_WAPBL malloc type used by wapbl_malloc()/wapbl_free(). */
+	malloc_type_attach(M_WAPBL);
+}
+
+int
+wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
+	daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
+	wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
+{
+	struct wapbl *wl;
+	struct vnode *devvp;
+	daddr_t logpbn;
+	int error;
+	int log_dev_bshift = DEV_BSHIFT;
+	int fs_dev_bshift = DEV_BSHIFT;
+	int run;
+	/* Validate the geometry, build *wlp and write its first commit. */
+	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
+	    " count=%zu blksize=%zu\n", vp, off, count, blksize));
+
+	if (log_dev_bshift > fs_dev_bshift) {
+		WAPBL_PRINTF(WAPBL_PRINT_OPEN,
+			("wapbl: log device's block size cannot be larger "
+			 "than filesystem's\n"));
+		/*
+		 * Not currently implemented, although it could be if
+		 * needed someday.
+		 */
+		return ENOSYS;
+	}
+
+	if (off < 0)
+		return EINVAL;
+
+	if (blksize < DEV_BSIZE)
+		return EINVAL;
+	if (blksize % DEV_BSIZE)
+		return EINVAL;
+
+	/* XXXTODO: verify that the full load is writable */
+
+	/*
+	 * XXX check for minimum log size
+	 * minimum is governed by minimum amount of space
+	 * to complete a transaction. (probably truncate)
+	 */
+	/* XXX for now pick something minimal */
+	if ((count * blksize) < MAXPHYS) {
+		return ENOSPC;
+	}
+
+	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
+		return error;
+	}
+
+	wl = wapbl_calloc(1, sizeof(*wl));
+	rw_init(&wl->wl_rwlock);
+	mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
+	cv_init(&wl->wl_reclaimable_cv, "wapblrec");
+	LIST_INIT(&wl->wl_bufs);
+	SIMPLEQ_INIT(&wl->wl_entries);
+
+	wl->wl_logvp = vp;
+	wl->wl_devvp = devvp;
+	wl->wl_mount = mp;
+	wl->wl_logpbn = logpbn;
+	wl->wl_log_dev_bshift = log_dev_bshift;
+	wl->wl_fs_dev_bshift = fs_dev_bshift;
+
+	wl->wl_flush = flushfn;
+	wl->wl_flush_abort = flushabortfn;
+
+	/* Reserve two log device blocks for the commit headers */
+	wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
+	wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
+	/* truncate the log usage to a multiple of log_dev_bshift */
+	wl->wl_circ_size >>= wl->wl_log_dev_bshift;
+	wl->wl_circ_size <<= wl->wl_log_dev_bshift;
+
+	/*
+	 * wl_bufbytes_max limits the size of the in memory transaction space.
+	 * - Since buffers are allocated and accounted for in units of
+	 *   PAGE_SIZE it is required to be a multiple of PAGE_SIZE
+	 *   (i.e. 1<<PAGE_SHIFT)
+	 * - Since the log device has to be written in units of
+	 *   1<<wl_log_dev_bshift it is required to be a multiple of
+	 *   1<<wl_log_dev_bshift.
+	 * - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
+	 *   it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
+	 * Therefore it must be multiple of the least common multiple of those
+	 * three quantities.  Fortunately, all of those quantities are
+	 * guaranteed to be a power of two, and the least common multiple of
+	 * a set of numbers which are all powers of two is simply the maximum
+	 * of those numbers.  Finally, the maximum logarithm of a power of two
+	 * is the same as the log of the maximum power of two.  So we can do
+	 * the following operations to size wl_bufbytes_max:
+	 */
+
+	/* XXX fix actual number of pages reserved per filesystem. */
+	wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
+
+	/* Round wl_bufbytes_max to the largest power of two constraint */
+	wl->wl_bufbytes_max >>= PAGE_SHIFT;
+	wl->wl_bufbytes_max <<= PAGE_SHIFT;
+	wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
+	wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
+	wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
+	wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
+
+	/* XXX maybe use filesystem fragment size instead of 1024 */
+	/* XXX fix actual number of buffers reserved per filesystem. */
+	wl->wl_bufcount_max = (nbuf / 2) * 1024;
+
+	/* XXX tie this into resource estimation */
+	wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);
+	
+#if WAPBL_UVM_ALLOC
+	wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map,
+	    round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim));
+	KASSERT(wl->wl_deallocblks != NULL);
+	wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map,
+	    round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim));
+	KASSERT(wl->wl_dealloclens != NULL);
+#else
+	wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
+	    wl->wl_dealloclim);
+	wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
+	    wl->wl_dealloclim);
+#endif
+
+	wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
+
+	/* Initialize the commit header */
+	{
+		struct wapbl_wc_header *wc;
+		size_t len = 1<<wl->wl_log_dev_bshift;
+		wc = wapbl_calloc(1, len);
+		wc->wc_type = WAPBL_WC_HEADER;
+		wc->wc_len = len;
+		wc->wc_circ_off = wl->wl_circ_off;
+		wc->wc_circ_size = wl->wl_circ_size;
+		/* XXX wc->wc_fsid */
+		wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
+		wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
+		wl->wl_wc_header = wc;
+		wl->wl_wc_scratch = wapbl_malloc(len);
+	}
+
+	/*
+	 * if there was an existing set of unlinked but
+	 * allocated inodes, preserve it in the new
+	 * log.
+	 */
+	if (wr && wr->wr_inodescnt) {
+		int i;
+
+		WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
+		    ("wapbl_start: reusing log with %d inodes\n",
+		    wr->wr_inodescnt));
+
+		/*
+		 * It's only valid to reuse the replay log if it's
+		 * the same as the new log we just opened.
+		 */
+		KDASSERT(!wapbl_replay_isopen(wr));
+		KASSERT(devvp->v_rdev == wr->wr_devvp->v_rdev);
+		KASSERT(logpbn == wr->wr_logpbn);
+		KASSERT(wl->wl_circ_size == wr->wr_wc_header.wc_circ_size);
+		KASSERT(wl->wl_circ_off == wr->wr_wc_header.wc_circ_off);
+		KASSERT(wl->wl_log_dev_bshift ==
+		    wr->wr_wc_header.wc_log_dev_bshift);
+		KASSERT(wl->wl_fs_dev_bshift ==
+		    wr->wr_wc_header.wc_fs_dev_bshift);
+
+		wl->wl_wc_header->wc_generation =
+		    wr->wr_wc_header.wc_generation + 1;
+
+		for (i = 0; i < wr->wr_inodescnt; i++)
+			wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
+			    wr->wr_inodes[i].wr_imode);
+
+		/* Make sure new transaction won't overwrite old inodes list */
+		KDASSERT(wapbl_transaction_len(wl) <= 
+		    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
+		      wr->wr_inodestail));
+
+		wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
+		wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
+			wapbl_transaction_len(wl);
+
+		error = wapbl_write_inodes(wl, &wl->wl_head);
+		if (error)
+			goto errout;
+
+		KASSERT(wl->wl_head != wl->wl_tail);
+		KASSERT(wl->wl_head != 0);
+	}
+
+	error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
+	if (error) {
+		goto errout;
+	}
+
+	*wlp = wl;
+#if defined(WAPBL_DEBUG)
+	wapbl_debug_wl = wl;
+#endif
+
+	return 0;
+ errout:	/* release everything allocated above and report the error */
+	wapbl_discard(wl);
+	wapbl_free(wl->wl_wc_scratch);
+	wapbl_free(wl->wl_wc_header);
+#if WAPBL_UVM_ALLOC
+	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
+			   round_page(sizeof(*wl->wl_deallocblks) *
+			   	      wl->wl_dealloclim));
+	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
+			   round_page(sizeof(*wl->wl_dealloclens) *
+				      wl->wl_dealloclim));
+#else
+	wapbl_free(wl->wl_deallocblks);
+	wapbl_free(wl->wl_dealloclens);
+#endif
+	wapbl_inodetrk_free(wl);
+	wapbl_free(wl);
+
+	return error;
+}
+
+/*
+ * Like wapbl_flush, only discards the transaction completely: with
+ * wl_rwlock held as writer it calls wl_flush, drops the tracked inodes,
+ * releases every buffer on wl_bufs, detaches wl_entries and resets deallocs.
+ */
+void
+wapbl_discard(struct wapbl *wl)
+{
+	struct wapbl_entry *we;
+	struct buf *bp;
+	int i;
+
+	/*
+	 * XXX we may consider using upgrade here
+	 * if we want to call flush from inside a transaction
+	 */
+	rw_enter(&wl->wl_rwlock, RW_WRITER);
+	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
+	    wl->wl_dealloccnt);
+
+#ifdef WAPBL_DEBUG_PRINT
+	{
+		struct wapbl_entry *we;
+		pid_t pid = -1;
+		lwpid_t lid = -1;
+		if (curproc)
+			pid = curproc->p_pid;
+		if (curlwp)
+			lid = curlwp->l_lid;
+#ifdef WAPBL_DEBUG_BUFBYTES
+		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
+		    ("wapbl_discard: thread %d.%d discarding "
+		    "transaction\n"
+		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
+		    "deallocs=%d inodes=%d\n"
+		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
+		    "unsynced=%zu\n",
+		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
+		    wl->wl_bcount, wl->wl_dealloccnt,
+		    wl->wl_inohashcnt, wl->wl_error_count,
+		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
+		    wl->wl_unsynced_bufbytes));
+		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
+			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
+			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
+			     "error = %d, unsynced = %zu\n",
+			     we->we_bufcount, we->we_reclaimable_bytes,
+			     we->we_error, we->we_unsynced_bufbytes));
+		}
+#else /* !WAPBL_DEBUG_BUFBYTES */
+		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
+		    ("wapbl_discard: thread %d.%d discarding transaction\n"
+		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
+		    "deallocs=%d inodes=%d\n"
+		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
+		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
+		    wl->wl_bcount, wl->wl_dealloccnt,
+		    wl->wl_inohashcnt, wl->wl_error_count,
+		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
+		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
+			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
+			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
+			     "error = %d\n",
+			     we->we_bufcount, we->we_reclaimable_bytes,
+			     we->we_error));
+		}
+#endif /* !WAPBL_DEBUG_BUFBYTES */
+	}
+#endif /* WAPBL_DEBUG_PRINT */
+
+	for (i = 0; i <= wl->wl_inohashmask; i++) {
+		struct wapbl_ino_head *wih;
+		struct wapbl_ino *wi;
+
+		wih = &wl->wl_inohash[i];
+		while ((wi = LIST_FIRST(wih)) != NULL) {
+			LIST_REMOVE(wi, wi_hash);
+			pool_put(&wapbl_ino_pool, wi);
+			KASSERT(wl->wl_inohashcnt > 0);
+			wl->wl_inohashcnt--;
+		}
+	}
+
+	/*
+	 * clean buffer list
+	 */
+	mutex_enter(&bufcache_lock);
+	mutex_enter(&wl->wl_mtx);
+	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
+		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
+			/*
+			 * The buffer will be unlocked and
+			 * removed from the transaction in brelse
+			 */
+			mutex_exit(&wl->wl_mtx);
+			brelsel(bp, 0);
+			mutex_enter(&wl->wl_mtx);
+		}
+	}
+	mutex_exit(&wl->wl_mtx);
+	mutex_exit(&bufcache_lock);
+
+	/*
+	 * Remove references to this wl from wl_entries, free any which
+	 * no longer have buffers, others will be freed in wapbl_biodone
+	 * when they no longer have any buffers.
+	 */
+	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
+		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
+		/* XXX should we be accumulating wl_error_count
+		 * and increasing reclaimable bytes ? */
+		we->we_wapbl = NULL;
+		if (we->we_bufcount == 0) {
+#ifdef WAPBL_DEBUG_BUFBYTES
+			KASSERT(we->we_unsynced_bufbytes == 0);
+#endif
+			wapbl_free(we);
+		}
+	}
+
+	/* Discard list of deallocs */
+	wl->wl_dealloccnt = 0;
+	/* XXX should we clear wl_reserved_bytes? */
+
+	KASSERT(wl->wl_bufbytes == 0);
+	KASSERT(wl->wl_bcount == 0);
+	KASSERT(wl->wl_bufcount == 0);
+	KASSERT(LIST_EMPTY(&wl->wl_bufs));
+	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
+	KASSERT(wl->wl_inohashcnt == 0);
+
+	rw_exit(&wl->wl_rwlock);
+}
+
+int
+wapbl_stop(struct wapbl *wl, int force)
+{
+	struct vnode *vp;
+	int error;
+	/* Flush the log (or discard it when forced), then tear down wl. */
+	WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
+	error = wapbl_flush(wl, 1);
+	if (error) {
+		if (force)
+			wapbl_discard(wl);
+		else
+			return error;
+	}
+
+	/* Unlinked inodes persist after a flush */
+	if (wl->wl_inohashcnt) {
+		if (force) {
+			wapbl_discard(wl);
+		} else {
+			return EBUSY;
+		}
+	}
+
+	KASSERT(wl->wl_bufbytes == 0);
+	KASSERT(wl->wl_bcount == 0);
+	KASSERT(wl->wl_bufcount == 0);
+	KASSERT(LIST_EMPTY(&wl->wl_bufs));
+	KASSERT(wl->wl_dealloccnt == 0);
+	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
+	KASSERT(wl->wl_inohashcnt == 0);
+
+	vp = wl->wl_logvp;
+
+	wapbl_free(wl->wl_wc_scratch);
+	wapbl_free(wl->wl_wc_header);
+#if WAPBL_UVM_ALLOC
+	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks,
+			   round_page(sizeof(*wl->wl_deallocblks) *
+			   	      wl->wl_dealloclim));
+	uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens,
+			   round_page(sizeof(*wl->wl_dealloclens) *
+				      wl->wl_dealloclim));
+#else
+	wapbl_free(wl->wl_deallocblks);
+	wapbl_free(wl->wl_dealloclens);
+#endif
+	wapbl_inodetrk_free(wl);
+
+	cv_destroy(&wl->wl_reclaimable_cv);
+	mutex_destroy(&wl->wl_mtx);
+	rw_destroy(&wl->wl_rwlock);
+	wapbl_free(wl);
+
+	return 0;
+}
+
+static int
+wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
+{
+	struct pstats *pstats = curlwp->l_proc->p_stats;
+	struct buf *bp;
+	int error;
+	/* One synchronous transfer of len bytes at block pbn of devvp. */
+	KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
+	KASSERT(devvp->v_type == VBLK);
+
+	if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
+		mutex_enter(&devvp->v_interlock);
+		devvp->v_numoutput++;	/* count the write about to be issued */
+		mutex_exit(&devvp->v_interlock);
+		pstats->p_ru.ru_oublock++;
+	} else {
+		pstats->p_ru.ru_inblock++;
+	}
+
+	bp = getiobuf(devvp, true);
+	bp->b_flags = flags;
+	bp->b_cflags = BC_BUSY; /* silly & dubious */
+	bp->b_dev = devvp->v_rdev;
+	bp->b_data = data;	/* I/O goes directly to/from the caller's memory */
+	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
+	bp->b_blkno = pbn;
+
+	WAPBL_PRINTF(WAPBL_PRINT_IO,
+	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
+	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
+	    bp->b_blkno, bp->b_dev));
+
+	VOP_STRATEGY(devvp, bp);
+
+	error = biowait(bp);	/* the result of the wait is what we return */
+	putiobuf(bp);
+
+	if (error) {
+		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
+		    ("wapbl_doio: %s %zu bytes at block %" PRId64
+		    " on dev 0x%x failed with error %d\n",
+		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
+		     "write" : "read"),
+		    len, pbn, devvp->v_rdev, error));
+	}
+
+	return error;
+}
+
+int
+wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
+{
+	const int dir = B_WRITE;	/* transfer direction for wapbl_doio() */
+	return wapbl_doio(data, len, devvp, pbn, dir);
+}
+
+int
+wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
+{
+	const int dir = B_READ;		/* transfer direction for wapbl_doio() */
+	return wapbl_doio(data, len, devvp, pbn, dir);
+}
+
+/*
+ * Write len bytes of data at byte offset *offp in the circular log,
+ * splitting the write in two when it would run past the end of the log.
+ * On success *offp is set to the offset for the next write.
+ */
+static int
+wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
+{
+	size_t slen;
+	off_t off = *offp;
+	int error;
+
+	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
+	    wl->wl_log_dev_bshift) == len);
+
+	if (off < wl->wl_circ_off)
+		off = wl->wl_circ_off;	/* skip the reserved header area */
+	slen = wl->wl_circ_off + wl->wl_circ_size - off; /* room before wrap */
+	if (slen < len) {
+		error = wapbl_write(data, slen, wl->wl_devvp,
+		    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
+		if (error)
+			return error;
+		data = (uint8_t *)data + slen;
+		len -= slen;
+		off = wl->wl_circ_off;	/* remainder goes at the log start */
+	}
+	error = wapbl_write(data, len, wl->wl_devvp,
+			    wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
+	if (error)
+		return error;		/* *offp is left untouched on error */
+	off += len;
+	if (off >= wl->wl_circ_off + wl->wl_circ_size)
+		off = wl->wl_circ_off;
+	*offp = off;
+	return 0;
+}
+
+/****************************************************************/
+
+int
+wapbl_begin(struct wapbl *wl, const char *file, int line)
+{
+	int doflush;
+	unsigned lockcount;
+	krw_t op;
+	/* Open a transaction on wl, flushing first if resources look tight. */
+	KDASSERT(wl);
+
+/*
+ *	XXX: The original code calls for the use of a RW_READER lock 
+ *	here, but it turns out there are performance issues with high 
+ *	metadata-rate workloads (e.g. multiple simultaneous tar
+ *	extractions).  For now, we force the lock to be RW_WRITER, 
+ *	since that currently has the best performance characteristics 
+ *	(even for a single tar-file extraction). 
+ *	
+ */
+#define WAPBL_DEBUG_SERIALIZE 1
+
+#ifdef WAPBL_DEBUG_SERIALIZE
+	op = RW_WRITER;
+#else
+	op = RW_READER;
+#endif
+
+	/*
+	 * XXX this needs to be made much more sophisticated.
+	 * perhaps each wapbl_begin could reserve a specified
+	 * number of buffers and bytes.
+	 */
+	mutex_enter(&wl->wl_mtx);
+	lockcount = wl->wl_lock_count;
+	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
+		   wl->wl_bufbytes_max / 2) ||
+		  ((wl->wl_bufcount + (lockcount * 10)) >
+		   wl->wl_bufcount_max / 2) ||
+		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
+	mutex_exit(&wl->wl_mtx);
+
+	if (doflush) {
+		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
+		    ("force flush lockcnt=%u bufbytes=%zu "
+		    "(max=%zu) bufcount=%zu (max=%zu)\n",
+		    lockcount, wl->wl_bufbytes,
+		    wl->wl_bufbytes_max, wl->wl_bufcount,
+		    wl->wl_bufcount_max));
+	}
+
+	if (doflush) {
+		int error = wapbl_flush(wl, 0);
+		if (error)
+			return error;	/* no lock is held on this path */
+	}
+
+	rw_enter(&wl->wl_rwlock, op);
+	mutex_enter(&wl->wl_mtx);
+	wl->wl_lock_count++;
+	mutex_exit(&wl->wl_mtx);
+
+#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
+	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
+	    ("wapbl_begin thread %d.%d with bufcount=%zu "
+	    "bufbytes=%zu bcount=%zu at %s:%d\n",
+	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
+	    wl->wl_bufbytes, wl->wl_bcount, file, line));
+#endif
+
+	return 0;
+}
+
+void
+wapbl_end(struct wapbl *wl)
+{
+	/* Close a transaction: drop wl_lock_count, then release wl_rwlock. */
+#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
+	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
+	     ("wapbl_end thread %d.%d with bufcount=%zu "
+	      "bufbytes=%zu bcount=%zu\n",
+	      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
+	      wl->wl_bufbytes, wl->wl_bcount));
+#endif
+
+	mutex_enter(&wl->wl_mtx);
+	KASSERT(wl->wl_lock_count > 0);
+	wl->wl_lock_count--;
+	mutex_exit(&wl->wl_mtx);
+
+	rw_exit(&wl->wl_rwlock);
+}
+
+void
+wapbl_add_buf(struct wapbl *wl, struct buf * bp)
+{
+	/* Put busy buffer bp on wl_bufs and mark it B_LOCKED. */
+	KASSERT(bp->b_cflags & BC_BUSY);
+	KASSERT(bp->b_vp);
+
+	wapbl_jlock_assert(wl);
+
+#if 0
+	/*
+	 * XXX this might be an issue for swapfiles.
+	 * see uvm_swap.c:1702
+	 *
+	 * XXX2 why require it then?  leap of semantics?
+	 */
+	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
+#endif
+
+	mutex_enter(&wl->wl_mtx);
+	if (bp->b_flags & B_LOCKED) {
+		/* Already tracked: unlink so it can be re-inserted at the head. */
+		LIST_REMOVE(bp, b_wapbllist);
+		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
+		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
+		    "with %d bytes %d bcount\n",
+		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
+		    bp->b_bcount));
+	} else {
+		/* unlocked but dirty buffers shouldn't exist */
+		KASSERT(!(bp->b_oflags & BO_DELWRI));
+		wl->wl_bufbytes += bp->b_bufsize;
+		wl->wl_bcount += bp->b_bcount;
+		wl->wl_bufcount++;
+		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
+		   ("wapbl_add_buf thread %d.%d adding buf %p "
+		    "with %d bytes %d bcount\n",
+		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
+		    bp->b_bcount));
+	}
+	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
+	mutex_exit(&wl->wl_mtx);
+
+	bp->b_flags |= B_LOCKED;
+}
+
+static void
+wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
+{
+	/* Drop bp from the transaction accounting; caller holds wl_mtx. */
+	KASSERT(mutex_owned(&wl->wl_mtx));
+	KASSERT(bp->b_cflags & BC_BUSY);
+	wapbl_jlock_assert(wl);
+
+#if 0
+	/*
+	 * XXX this might be an issue for swapfiles.
+	 * see uvm_swap.c:1725
+	 *
+	 * XXXdeux: see above
+	 */
+	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
+#endif
+	KASSERT(bp->b_flags & B_LOCKED);
+
+	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
+	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
+	    "%d bytes %d bcount\n",
+	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
+
+	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
+	wl->wl_bufbytes -= bp->b_bufsize;
+	KASSERT(wl->wl_bcount >= bp->b_bcount);
+	wl->wl_bcount -= bp->b_bcount;
+	KASSERT(wl->wl_bufcount > 0);
+	wl->wl_bufcount--;
+	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
+	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
+	LIST_REMOVE(bp, b_wapbllist);
+
+	bp->b_flags &= ~B_LOCKED;
+}
+
+/* called from brelsel() in vfs_bio among other places */
+void
+wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
+{
+	/* Take wl_mtx around wapbl_remove_buf_locked(), which requires it. */
+	mutex_enter(&wl->wl_mtx);
+	wapbl_remove_buf_locked(wl, bp);
+	mutex_exit(&wl->wl_mtx);
+}
+
+/*
+ * Adjust the log's byte totals for a buffer whose size changed: the
+ * difference between bp's current b_bufsize/b_bcount and the supplied
+ * oldsz/oldcnt is added to wl_bufbytes/wl_bcount.  Buffers that are
+ * not marked B_LOCKED are left alone.
+ */
+void
+wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
+{
+
+	KASSERT(bp->b_cflags & BC_BUSY);
+
+	/*
+	 * XXX: why does this depend on B_LOCKED?  otherwise the buf
+	 * is not for a transaction?  if so, why is this called in the
+	 * first place?
+	 */
+	if ((bp->b_flags & B_LOCKED) == 0)
+		return;
+
+	mutex_enter(&wl->wl_mtx);
+	wl->wl_bufbytes += bp->b_bufsize - oldsz;
+	wl->wl_bcount += bp->b_bcount - oldcnt;
+	mutex_exit(&wl->wl_mtx);
+}
+
+#endif /* _KERNEL */
+
+/****************************************************************/
+/* Some utility inlines */
+
+/*
+ * Advance the circular-log offset `old' by `delta' bytes and return the
+ * resulting offset.  The circular area spans [off, off + size); a result
+ * that would reach off + size wraps around by subtracting size.  An
+ * offset of 0 is special: advancing from 0 by a nonzero delta starts
+ * counting at `off'.
+ */
+static __inline off_t
+wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
+{
+	off_t next;
+
+	/* Inputs must lie in the accepted ranges. */
+	KASSERT(delta <= size);
+	KASSERT((old == 0) || (old >= off));
+	KASSERT(old < (size + off));
+
+	if ((old == 0) && (delta != 0)) {
+		next = off + delta;
+	} else if ((old + delta) >= (size + off)) {
+		/* Ran past the end of the circular area: wrap. */
+		next = (old + delta) - size;
+	} else {
+		next = old + delta;
+	}
+
+	/* Properties that follow from the rules above. */
+	KASSERT((delta != 0) || (next == old));
+	KASSERT((delta == 0) || (next != 0));
+	KASSERT((delta != (size)) || (next == old));
+
+	/* The result must lie in the accepted range as well. */
+	KASSERT((next == 0) || (next >= off));
+	KASSERT(next < (size + off));
+	return next;
+}
+
+/*
+ * Return the number of bytes in use in a circular area of `avail'
+ * bytes given its head and tail offsets.  A tail of 0 means nothing is
+ * in use (head must then be 0 too).
+ */
+static __inline size_t
+wapbl_space_used(size_t avail, off_t head, off_t tail)
+{
+
+	if (tail != 0)
+		return ((head + (avail - 1) - tail) % avail) + 1;
+
+	KASSERT(head == 0);
+	return 0;
+}
+
+/*
+ * Return the number of bytes not in use in a circular area of `avail'
+ * bytes: avail minus wapbl_space_used().
+ */
+static __inline size_t
+wapbl_space_free(size_t avail, off_t head, off_t tail)
+{
+	size_t used = wapbl_space_used(avail, head, tail);
+
+	return avail - used;
+}
+
+/*
+ * Move the head at *headp forward by `delta' bytes.  If the tail at
+ * *tailp was 0 and the new head is nonzero, the tail is set to `off'.
+ * Both *headp and *tailp are written back.
+ */
+static __inline void
+wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
+		   off_t *tailp)
+{
+	const off_t oldtail = *tailp;
+	off_t newhead;
+
+	KASSERT(delta <= wapbl_space_free(size, *headp, oldtail));
+	newhead = wapbl_advance(size, off, *headp, delta);
+
+	*headp = newhead;
+	*tailp = ((oldtail == 0) && (newhead != 0)) ? (off_t)off : oldtail;
+}
+
+/*
+ * Move the tail at *tailp forward by `delta' bytes.  If the tail then
+ * meets the head, both are reset to 0.  Both *headp and *tailp are
+ * written back.
+ */
+static __inline void
+wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
+		   off_t *tailp)
+{
+	const off_t oldhead = *headp;
+	off_t newtail;
+
+	KASSERT(delta <= wapbl_space_used(size, oldhead, *tailp));
+	newtail = wapbl_advance(size, off, *tailp, delta);
+
+	if (oldhead == newtail) {
+		*headp = 0;
+		*tailp = 0;
+	} else {
+		*headp = oldhead;
+		*tailp = newtail;
+	}
+}
+
+#ifdef _KERNEL
+
+/****************************************************************/
+
+/*
+ * Remove transactions whose buffers are completely flushed to disk.
+ * Will block until at least minfree space is available.
+ * only intended to be called from inside wapbl_flush and therefore
+ * does not protect against commit races with itself or with flush.
+ *
+ * The caller must hold wl->wl_rwlock as a writer.  If waitonly is
+ * nonzero the routine returns after waiting, without writing a commit
+ * header or moving the tail.
+ *
+ * Returns 0 on success, EIO if the wait ended because the log has a
+ * nonzero error count while still short of reclaimable bytes, or the
+ * error returned by wapbl_write_commit().
+ */
+static int
+wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
+{
+	size_t delta;
+	size_t avail;
+	off_t head;
+	off_t tail;
+	int error = 0;
+
+	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
+	KASSERT(rw_write_held(&wl->wl_rwlock));
+
+	mutex_enter(&wl->wl_mtx);
+
+	/*
+	 * First check to see if we have to do a commit
+	 * at all.
+	 */
+	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
+	if (minfree < avail) {
+		mutex_exit(&wl->wl_mtx);
+		return 0;
+	}
+
+	/* Only the shortfall has to come from reclaimable bytes. */
+	minfree -= avail;
+	while ((wl->wl_error_count == 0) &&
+	    (wl->wl_reclaimable_bytes < minfree)) {
+		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
+		   ("wapbl_truncate: sleeping on %p wl=%p bytes=%zu "
+		    "minfree=%zu\n",
+		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
+		    minfree));
+
+		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
+	}
+	if (wl->wl_reclaimable_bytes < minfree) {
+		KASSERT(wl->wl_error_count);
+		/* XXX maybe get actual error from buffer instead someday? */
+		error = EIO;
+	}
+	head = wl->wl_head;
+	tail = wl->wl_tail;
+	delta = wl->wl_reclaimable_bytes;
+
+	/*
+	 * If all of the entries are flushed, then be sure to keep
+	 * the reserved bytes reserved.  Watch out for discarded transactions,
+	 * which could leave more bytes reserved than are reclaimable.
+	 */
+	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
+	    (delta >= wl->wl_reserved_bytes)) {
+		delta -= wl->wl_reserved_bytes;
+	}
+	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
+			   &tail);
+	KDASSERT(wl->wl_reserved_bytes <=
+		wapbl_space_used(wl->wl_circ_size, head, tail));
+	mutex_exit(&wl->wl_mtx);
+
+	if (error)
+		return error;
+
+	if (waitonly)
+		return 0;
+
+	/*
+	 * This is where head, tail and delta are unprotected
+	 * from races against itself or flush.  This is ok since
+	 * we only call this routine from inside flush itself.
+	 *
+	 * XXX: how can it race against itself when accessed only
+	 * from behind the write-locked rwlock?
+	 */
+	error = wapbl_write_commit(wl, head, tail);
+	if (error)
+		return error;
+
+	/* The commit is on disk; publish the new head and tail. */
+	wl->wl_head = head;
+	wl->wl_tail = tail;
+
+	mutex_enter(&wl->wl_mtx);
+	KASSERT(wl->wl_reclaimable_bytes >= delta);
+	wl->wl_reclaimable_bytes -= delta;
+	mutex_exit(&wl->wl_mtx);
+	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
+	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
+	    curproc->p_pid, curlwp->l_lid, delta));
+
+	return 0;
+}
+
+/****************************************************************/
+
+/*
+ * I/O completion callback for buffers written out by wapbl_flush(),
+ * which stores this function in b_iodone and the owning wapbl_entry
+ * in b_private.
+ *
+ * Decrements the entry's outstanding buffer count.  Once the entry
+ * has no buffers left, fully written entries are removed from the
+ * front of wl->wl_entries, their bytes are added to
+ * wl->wl_reclaimable_bytes and waiters on wl_reclaimable_cv are woken.
+ * A buffer error marks the log as errored.  The buffer is released
+ * with brelse() in all cases.
+ */
+void
+wapbl_biodone(struct buf *bp)
+{
+	struct wapbl_entry *we = bp->b_private;
+	struct wapbl *wl = we->we_wapbl;
+
+	/*
+	 * Handle possible flushing of buffers after log has been
+	 * decommissioned.
+	 */
+	if (!wl) {
+		KASSERT(we->we_bufcount > 0);
+		we->we_bufcount--;
+#ifdef WAPBL_DEBUG_BUFBYTES
+		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
+		we->we_unsynced_bufbytes -= bp->b_bufsize;
+#endif
+
+		/* Last buffer of an orphaned entry: free the entry itself. */
+		if (we->we_bufcount == 0) {
+#ifdef WAPBL_DEBUG_BUFBYTES
+			KASSERT(we->we_unsynced_bufbytes == 0);
+#endif
+			wapbl_free(we);
+		}
+
+		brelse(bp, 0);
+		return;
+	}
+
+#ifdef ohbother
+	KDASSERT(bp->b_flags & B_DONE);
+	KDASSERT(!(bp->b_flags & B_DELWRI));
+	KDASSERT(bp->b_flags & B_ASYNC);
+	KDASSERT(bp->b_flags & B_BUSY);
+	KDASSERT(!(bp->b_flags & B_LOCKED));
+	KDASSERT(!(bp->b_flags & B_READ));
+	KDASSERT(!(bp->b_flags & B_INVAL));
+	KDASSERT(!(bp->b_flags & B_NOCACHE));
+#endif
+
+	if (bp->b_error) {
+#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
+		XXXpooka: interfaces not fully updated
+		Note: this was not enabled in the original patch
+		against netbsd4 either.  I don't know if comment
+		above is true or not.
+
+		/*
+		 * If an error occurs, report the error and leave the
+		 * buffer as a delayed write on the LRU queue.
+		 * restarting the write would likely result in
+		 * an error spinloop, so let it be done harmlessly
+		 * by the syncer.
+		 */
+		bp->b_flags &= ~(B_DONE);
+		simple_unlock(&bp->b_interlock);
+
+		if (we->we_error == 0) {
+			mutex_enter(&wl->wl_mtx);
+			wl->wl_error_count++;
+			mutex_exit(&wl->wl_mtx);
+			cv_broadcast(&wl->wl_reclaimable_cv);
+		}
+		we->we_error = bp->b_error;
+		bp->b_error = 0;
+		brelse(bp);
+		return;
+#else
+		/* For now, just mark the log permanently errored out */
+
+		mutex_enter(&wl->wl_mtx);
+		if (wl->wl_error_count == 0) {
+			wl->wl_error_count++;
+			/* Wake waiters so they can notice the error count. */
+			cv_broadcast(&wl->wl_reclaimable_cv);
+		}
+		mutex_exit(&wl->wl_mtx);
+#endif
+	}
+
+	mutex_enter(&wl->wl_mtx);
+
+	/* One fewer buffer outstanding for this entry. */
+	KASSERT(we->we_bufcount > 0);
+	we->we_bufcount--;
+#ifdef WAPBL_DEBUG_BUFBYTES
+	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
+	we->we_unsynced_bufbytes -= bp->b_bufsize;
+	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
+	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
+#endif
+
+	/*
+	 * If the current transaction can be reclaimed, start
+	 * at the beginning and reclaim any consecutive reclaimable
+	 * transactions.  If we successfully reclaim anything,
+	 * then wakeup anyone waiting for the reclaim.
+	 */
+	if (we->we_bufcount == 0) {
+		size_t delta = 0;
+		int errcnt = 0;
+#ifdef WAPBL_DEBUG_BUFBYTES
+		KDASSERT(we->we_unsynced_bufbytes == 0);
+#endif
+		/*
+		 * clear any posted error, since the buffer it came from
+		 * has successfully flushed by now
+		 */
+		/* Note: `we' is reused as the queue cursor from here on. */
+		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
+		       (we->we_bufcount == 0)) {
+			delta += we->we_reclaimable_bytes;
+			if (we->we_error)
+				errcnt++;
+			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
+			wapbl_free(we);
+		}
+
+		if (delta) {
+			wl->wl_reclaimable_bytes += delta;
+			KASSERT(wl->wl_error_count >= errcnt);
+			wl->wl_error_count -= errcnt;
+			cv_broadcast(&wl->wl_reclaimable_cv);
+		}
+	}
+
+	mutex_exit(&wl->wl_mtx);
+	brelse(bp, 0);
+}
+
+/*
+ * Write transactions to disk + start I/O for contents
+ *
+ * Takes wl->wl_rwlock as a writer, invokes the wl_flush callback,
+ * makes room in the log with wapbl_truncate(), writes the block,
+ * revocation and inode records followed by a commit header, and then
+ * starts asynchronous writes (bawrite) of every buffer on wl->wl_bufs
+ * with wapbl_biodone() as the completion callback.
+ *
+ * If waitfor is zero and no buffers are queued, returns 0 without
+ * taking the rwlock.  If waitfor is nonzero, wapbl_truncate() is called
+ * again at the end to wait for the log to drain.
+ *
+ * Returns 0 on success or an error from one of the write/truncate
+ * steps; on error the wl_flush_abort callback is invoked.
+ */
+int
+wapbl_flush(struct wapbl *wl, int waitfor)
+{
+	struct buf *bp;
+	struct wapbl_entry *we;
+	off_t off;
+	off_t head;
+	off_t tail;
+	size_t delta = 0;
+	size_t flushsize;
+	size_t reserved;
+	int error = 0;
+
+	/*
+	 * Do a quick check to see if a full flush can be skipped
+	 * This assumes that the flush callback does not need to be called
+	 * unless there are other outstanding bufs.
+	 */
+	if (!waitfor) {
+		size_t nbufs;
+		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
+						   protect the KASSERTS */
+		nbufs = wl->wl_bufcount;
+		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
+		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
+		mutex_exit(&wl->wl_mtx);
+		if (nbufs == 0)
+			return 0;
+	}
+
+	/*
+	 * XXX we may consider using LK_UPGRADE here
+	 * if we want to call flush from inside a transaction
+	 */
+	rw_enter(&wl->wl_rwlock, RW_WRITER);
+	wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
+	    wl->wl_dealloccnt);
+
+	/*
+	 * Now that we are fully locked and flushed,
+	 * do another check for nothing to do.
+	 */
+	if (wl->wl_bufcount == 0) {
+		goto out;
+	}
+
+#if 0
+	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
+		     ("wapbl_flush thread %d.%d flushing entries with "
+		      "bufcount=%zu bufbytes=%zu\n",
+		      curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
+		      wl->wl_bufbytes));
+#endif
+
+	/* Calculate amount of space needed to flush */
+	flushsize = wapbl_transaction_len(wl);
+
+	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
+		/*
+		 * XXX this could be handled more gracefully, perhaps place
+		 * only a partial transaction in the log and allow the
+		 * remaining to flush without the protection of the journal.
+		 */
+		panic("wapbl_flush: current transaction too big to flush\n");
+	}
+
+	/* Make sure the log has room for flushsize bytes. */
+	error = wapbl_truncate(wl, flushsize, 0);
+	if (error)
+		goto out2;
+
+	/* Write the transaction records starting at the current head. */
+	off = wl->wl_head;
+	KASSERT((off == 0) || ((off >= wl->wl_circ_off) && 
+	                      (off < wl->wl_circ_off + wl->wl_circ_size)));
+	error = wapbl_write_blocks(wl, &off);
+	if (error)
+		goto out2;
+	error = wapbl_write_revocations(wl, &off);
+	if (error)
+		goto out2;
+	error = wapbl_write_inodes(wl, &off);
+	if (error)
+		goto out2;
+
+	/* Space to keep reserved for the inode list, if any is registered. */
+	reserved = 0;
+	if (wl->wl_inohashcnt)
+		reserved = wapbl_transaction_inodes_len(wl);
+
+	head = wl->wl_head;
+	tail = wl->wl_tail;
+
+	/* The computed head must match where the record writes ended. */
+	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
+	    &head, &tail);
+#ifdef WAPBL_DEBUG
+	if (head != off) {
+		panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
+		      " off=%"PRIdMAX" flush=%zu\n",
+		      (intmax_t)head, (intmax_t)tail, (intmax_t)off,
+		      flushsize);
+	}
+#else
+	KASSERT(head == off);
+#endif
+
+	/* Opportunistically move the tail forward if we can */
+	if (!wapbl_lazy_truncate) {
+		mutex_enter(&wl->wl_mtx);
+		delta = wl->wl_reclaimable_bytes;
+		mutex_exit(&wl->wl_mtx);
+		wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
+		    &head, &tail);
+	}
+
+	error = wapbl_write_commit(wl, head, tail);
+	if (error)
+		goto out2;
+
+	/* poolme?  or kmemme? */
+	we = wapbl_calloc(1, sizeof(*we));
+
+#ifdef WAPBL_DEBUG_BUFBYTES
+	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
+		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
+		 " unsynced=%zu"
+		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
+		 "inodes=%d\n",
+		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
+		 wapbl_space_used(wl->wl_circ_size, head, tail),
+		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
+		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
+		 wl->wl_inohashcnt));
+#else
+	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
+		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
+		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
+		 "inodes=%d\n",
+		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
+		 wapbl_space_used(wl->wl_circ_size, head, tail),
+		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
+		 wl->wl_dealloccnt, wl->wl_inohashcnt));
+#endif
+
+
+	/* Lock order used here: bufcache_lock first, then wl_mtx. */
+	mutex_enter(&bufcache_lock);
+	mutex_enter(&wl->wl_mtx);
+
+	/* The commit is written; publish the new log state. */
+	wl->wl_reserved_bytes = reserved;
+	wl->wl_head = head;
+	wl->wl_tail = tail;
+	KASSERT(wl->wl_reclaimable_bytes >= delta);
+	wl->wl_reclaimable_bytes -= delta;
+	wl->wl_dealloccnt = 0;
+#ifdef WAPBL_DEBUG_BUFBYTES
+	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
+#endif
+
+	/* Queue an entry describing this transaction's buffers. */
+	we->we_wapbl = wl;
+	we->we_bufcount = wl->wl_bufcount;
+#ifdef WAPBL_DEBUG_BUFBYTES
+	we->we_unsynced_bufbytes = wl->wl_bufbytes;
+#endif
+	we->we_reclaimable_bytes = flushsize;
+	we->we_error = 0;
+	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
+
+	/*
+	 * this flushes bufs in reverse order than they were queued
+	 * it shouldn't matter, but if we care we could use TAILQ instead.
+	 * XXX Note they will get put on the lru queue when they flush
+	 * so we might actually want to change this to preserve order.
+	 */
+	while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
+		/* On nonzero return from bbusy(), re-read the list head. */
+		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
+			continue;
+		}
+		bp->b_iodone = wapbl_biodone;
+		bp->b_private = we;
+		bremfree(bp);
+		wapbl_remove_buf_locked(wl, bp);
+		/* Both locks are dropped around the write itself. */
+		mutex_exit(&wl->wl_mtx);
+		mutex_exit(&bufcache_lock);
+		bawrite(bp);
+		mutex_enter(&bufcache_lock);
+		mutex_enter(&wl->wl_mtx);
+	}
+	mutex_exit(&wl->wl_mtx);
+	mutex_exit(&bufcache_lock);
+
+#if 0
+	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
+		     ("wapbl_flush thread %d.%d done flushing entries...\n",
+		     curproc->p_pid, curlwp->l_lid));
+#endif
+
+ out:
+
+	/*
+	 * If the waitfor flag is set, don't return until everything is
+	 * fully flushed and the on disk log is empty.
+	 */
+	if (waitfor) {
+		error = wapbl_truncate(wl, wl->wl_circ_size - 
+			wl->wl_reserved_bytes, wapbl_lazy_truncate);
+	}
+
+ out2:
+	/* Any failure above is reported to the flush-abort callback. */
+	if (error) {
+		wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
+		    wl->wl_dealloclens, wl->wl_dealloccnt);
+	}
+
+#ifdef WAPBL_DEBUG_PRINT
+	if (error) {
+		pid_t pid = -1;
+		lwpid_t lid = -1;
+		if (curproc)
+			pid = curproc->p_pid;
+		if (curlwp)
+			lid = curlwp->l_lid;
+		mutex_enter(&wl->wl_mtx);
+#ifdef WAPBL_DEBUG_BUFBYTES
+		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
+		    ("wapbl_flush: thread %d.%d aborted flush: "
+		    "error = %d\n"
+		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
+		    "deallocs=%d inodes=%d\n"
+		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
+		    "unsynced=%zu\n",
+		    pid, lid, error, wl->wl_bufcount,
+		    wl->wl_bufbytes, wl->wl_bcount,
+		    wl->wl_dealloccnt, wl->wl_inohashcnt,
+		    wl->wl_error_count, wl->wl_reclaimable_bytes,
+		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
+		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
+			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
+			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
+			     "error = %d, unsynced = %zu\n",
+			     we->we_bufcount, we->we_reclaimable_bytes,
+			     we->we_error, we->we_unsynced_bufbytes));
+		}
+#else
+		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
+		    ("wapbl_flush: thread %d.%d aborted flush: "
+		     "error = %d\n"
+		     "\tbufcount=%zu bufbytes=%zu bcount=%zu "
+		     "deallocs=%d inodes=%d\n"
+		     "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
+		     pid, lid, error, wl->wl_bufcount,
+		     wl->wl_bufbytes, wl->wl_bcount,
+		     wl->wl_dealloccnt, wl->wl_inohashcnt,
+		     wl->wl_error_count, wl->wl_reclaimable_bytes,
+		     wl->wl_reserved_bytes));
+		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
+			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
+			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
+			     "error = %d\n", we->we_bufcount,
+			     we->we_reclaimable_bytes, we->we_error));
+		}
+#endif
+		mutex_exit(&wl->wl_mtx);
+	}
+#endif
+
+	rw_exit(&wl->wl_rwlock);
+	return error;
+}
+
+/****************************************************************/
+
+/*
+ * Assert that the journal rwlock is held.  With WAPBL_DEBUG_SERIALIZE
+ * defined only a write hold is accepted; otherwise a read or a write
+ * hold will do.
+ */
+void
+wapbl_jlock_assert(struct wapbl *wl)
+{
+
+#ifndef WAPBL_DEBUG_SERIALIZE
+	KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
+#else
+	KASSERT(rw_write_held(&wl->wl_rwlock));
+#endif
+}
+
+/*
+ * With WAPBL_DEBUG_SERIALIZE defined, assert that the journal rwlock
+ * is not write-held.  Otherwise this checks nothing.
+ */
+void
+wapbl_junlock_assert(struct wapbl *wl)
+{
+
+#ifdef WAPBL_DEBUG_SERIALIZE
+	KASSERT(rw_write_held(&wl->wl_rwlock) == 0);
+#endif
+}
+
+/****************************************************************/
+
+/*
+ * Print a description of the log state through the supplied
+ * printf-style function `pr'.  The summary fields and the list of
+ * queued entries are always printed; if `full' is nonzero the queued
+ * buffers, the pending deallocations and the registered inodes are
+ * listed as well.
+ *
+ * locks missing
+ */
+void
+wapbl_print(struct wapbl *wl,
+		int full,
+		void (*pr)(const char *, ...))
+{
+	struct buf *bp;
+	struct wapbl_entry *we;
+	(*pr)("wapbl %p", wl);
+	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
+	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
+	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
+	      wl->wl_circ_size, wl->wl_circ_off,
+	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
+	/* Arguments are in the same order as the labels. */
+	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
+	      wl->wl_fs_dev_bshift, wl->wl_log_dev_bshift);
+#ifdef WAPBL_DEBUG_BUFBYTES
+	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
+	      "reserved = %zu errcnt = %d unsynced = %zu\n",
+	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
+	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
+	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
+#else
+	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
+	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
+	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
+	      wl->wl_error_count);
+#endif
+	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
+	      wl->wl_dealloccnt, wl->wl_dealloclim);
+	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
+	      wl->wl_inohashcnt, wl->wl_inohashmask);
+	(*pr)("entries:\n");
+	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
+#ifdef WAPBL_DEBUG_BUFBYTES
+		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
+		      "unsynced = %zu\n",
+		      we->we_bufcount, we->we_reclaimable_bytes,
+		      we->we_error, we->we_unsynced_bufbytes);
+#else
+		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
+		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
+#endif
+	}
+	if (full) {
+		int cnt = 0;
+		(*pr)("bufs =");
+		/* Six pointers per line; no trailing comma on the last. */
+		LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
+			if (!LIST_NEXT(bp, b_wapbllist)) {
+				(*pr)(" %p", bp);
+			} else if ((++cnt % 6) == 0) {
+				(*pr)(" %p,\n\t", bp);
+			} else {
+				(*pr)(" %p,", bp);
+			}
+		}
+		(*pr)("\n");
+
+		(*pr)("dealloced blks = ");
+		{
+			int i;
+			cnt = 0;
+			/* Four blk:len pairs per line. */
+			for (i = 0; i < wl->wl_dealloccnt; i++) {
+				(*pr)(" %"PRId64":%d,",
+				      wl->wl_deallocblks[i],
+				      wl->wl_dealloclens[i]);
+				if ((++cnt % 4) == 0) {
+					(*pr)("\n\t");
+				}
+			}
+		}
+		(*pr)("\n");
+
+		(*pr)("registered inodes = ");
+		{
+			int i;
+			cnt = 0;
+			/* Visit every hash bucket, including index == mask. */
+			for (i = 0; i <= wl->wl_inohashmask; i++) {
+				struct wapbl_ino_head *wih;
+				struct wapbl_ino *wi;
+
+				wih = &wl->wl_inohash[i];
+				LIST_FOREACH(wi, wih, wi_hash) {
+					if (wi->wi_ino == 0)
+						continue;
+					(*pr)(" %"PRId32"/0%06"PRIo32",",
+					    wi->wi_ino, wi->wi_mode);
+					if ((++cnt % 4) == 0) {
+						(*pr)("\n\t");
+					}
+				}
+			}
+			(*pr)("\n");
+		}
+	}
+}
+
+#if defined(WAPBL_DEBUG) || defined(DDB)
+/*
+ * Print the full state of a log with printf.  When built with
+ * WAPBL_DEBUG, a NULL wl selects wapbl_debug_wl instead.  Does nothing
+ * if there is still no log to print.
+ */
+void
+wapbl_dump(struct wapbl *wl)
+{
+#if defined(WAPBL_DEBUG)
+	if (wl == NULL)
+		wl = wapbl_debug_wl;
+#endif
+	if (wl != NULL)
+		wapbl_print(wl, 1, printf);
+}
+#endif
+
+/****************************************************************/
+
+/*
+ * Record a deallocated block range (blk, len) in the next free slot of
+ * the log's deallocation arrays.  The journal lock must be held and
+ * there must be room below wl_dealloclim.
+ */
+void
+wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
+{
+
+	wapbl_jlock_assert(wl);
+
+	/* XXX should eventually instead tie this into resource estimation */
+	/* XXX this KASSERT needs locking/mutex analysis */
+	KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
+
+	/* Fill both parallel arrays at the same index, then bump it. */
+	wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
+	wl->wl_dealloclens[wl->wl_dealloccnt++] = len;
+
+	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
+	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
+}
+
+/****************************************************************/
+
+/*
+ * Set up inode tracking for a log: create the inode hash table and,
+ * if this is the first reference, initialize the shared wapbl_ino pool.
+ */
+static void
+wapbl_inodetrk_init(struct wapbl *wl, u_int size)
+{
+
+	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
+
+	/* Only the first reference creates the pool. */
+	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) != 1)
+		return;
+
+	pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
+	    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
+}
+
+/*
+ * Tear down inode tracking for a log: release the hash table and, if
+ * this was the last reference, destroy the shared wapbl_ino pool.
+ * No inodes may still be registered.
+ */
+static void
+wapbl_inodetrk_free(struct wapbl *wl)
+{
+
+	/* XXX this KASSERT needs locking/mutex analysis */
+	KASSERT(wl->wl_inohashcnt == 0);
+	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
+
+	/* Only the last reference destroys the pool. */
+	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) != 0)
+		return;
+
+	pool_destroy(&wapbl_ino_pool);
+}
+
+/*
+ * Look up the tracking record for inode number `ino'.  Returns the
+ * record, or a null pointer if the inode is not registered.  The caller
+ * must hold wl->wl_mtx.
+ */
+static struct wapbl_ino *
+wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
+{
+	struct wapbl_ino_head *bucket;
+	struct wapbl_ino *wi;
+
+	KASSERT(mutex_owned(&wl->wl_mtx));
+
+	bucket = &wl->wl_inohash[ino & wl->wl_inohashmask];
+	LIST_FOREACH(wi, bucket, wi_hash) {
+		if (wi->wi_ino == ino)
+			break;
+	}
+	/* wi is null here when the bucket was exhausted without a match. */
+	return wi;
+}
+
+/*
+ * Register inode `ino' with mode `mode' in the log's inode hash.
+ * Nothing changes if the inode is already registered.
+ */
+void
+wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
+{
+	struct wapbl_ino_head *bucket;
+	struct wapbl_ino *fresh;
+
+	/* Allocate before taking the mutex. */
+	fresh = pool_get(&wapbl_ino_pool, PR_WAITOK);
+
+	mutex_enter(&wl->wl_mtx);
+	if (wapbl_inodetrk_get(wl, ino) != NULL) {
+		/* Already tracked: hand the unused record back. */
+		mutex_exit(&wl->wl_mtx);
+		pool_put(&wapbl_ino_pool, fresh);
+		return;
+	}
+
+	fresh->wi_ino = ino;
+	fresh->wi_mode = mode;
+	bucket = &wl->wl_inohash[ino & wl->wl_inohashmask];
+	LIST_INSERT_HEAD(bucket, fresh, wi_hash);
+	wl->wl_inohashcnt++;
+	WAPBL_PRINTF(WAPBL_PRINT_INODE,
+	    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
+	mutex_exit(&wl->wl_mtx);
+}
+
+/*
+ * Remove inode `ino' from the log's inode hash, if it is registered,
+ * and return its record to the pool.  The `mode' argument is not used.
+ */
+void
+wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
+{
+	struct wapbl_ino *found;
+
+	mutex_enter(&wl->wl_mtx);
+	found = wapbl_inodetrk_get(wl, ino);
+	if (found == NULL) {
+		mutex_exit(&wl->wl_mtx);
+		return;
+	}
+
+	WAPBL_PRINTF(WAPBL_PRINT_INODE,
+	    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
+	KASSERT(wl->wl_inohashcnt > 0);
+	wl->wl_inohashcnt--;
+	LIST_REMOVE(found, wi_hash);
+	mutex_exit(&wl->wl_mtx);
+
+	/* The record goes back to the pool after the mutex is dropped. */
+	pool_put(&wapbl_ino_pool, found);
+}
+
+/****************************************************************/
+
+/*
+ * Return the number of bytes the registered-inode list occupies in the
+ * log: a whole number of log-device blocks, and never less than one.
+ */
+static __inline size_t
+wapbl_transaction_inodes_len(struct wapbl *wl)
+{
+	const int blocklen = 1<<wl->wl_log_dev_bshift;
+	/* Number of inode slots that fit in one inodelist block */
+	const int inodes_per_blk =
+	    (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
+	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
+
+	KASSERT(inodes_per_blk > 0);
+
+	return MAX(1, howmany(wl->wl_inohashcnt, inodes_per_blk))*blocklen;
+}
+
+
+/*
+ * Calculate amount of space a transaction will take on disk:
+ * the buffered data bytes, the blocklist headers for the buffers and
+ * for the deallocations, and the inode list.
+ */
+static size_t
+wapbl_transaction_len(struct wapbl *wl)
+{
+	const int blocklen = 1<<wl->wl_log_dev_bshift;
+	/* Number of block descriptors that fit in one blocklist block */
+	const int blks_per_hdr =
+	    (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
+	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
+	size_t total;
+
+	KASSERT(blks_per_hdr > 0);
+
+	total = wapbl_transaction_inodes_len(wl);
+	total += howmany(wl->wl_dealloccnt, blks_per_hdr)*blocklen;
+	total += howmany(wl->wl_bufcount, blks_per_hdr)*blocklen;
+	total += wl->wl_bcount;
+
+	return total;
+}
+
+/*
+ * Perform commit operation
+ *
+ * Fills in the in-memory commit header with the given head and tail,
+ * writes it to the log device at wl_logpbn + (generation % 2), and
+ * then increments the generation number.  A cache sync is requested
+ * from the device before and after the header write; a failure of
+ * either sync is only logged.
+ *
+ * Note that generation number incrementation needs to
+ * be protected against racing with other invocations
+ * of wapbl_write_commit.  This is ok since this routine
+ * is only invoked from wapbl_flush and from wapbl_truncate,
+ * which asserts that the journal rwlock is write-held.
+ *
+ * Returns 0 on success or the error from writing the header.
+ */
+static int
+wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
+{
+	struct wapbl_wc_header *wc = wl->wl_wc_header;
+	struct timespec ts;
+	int error;
+	int force = 1;
+
+	/* XXX Calc checksum here, instead we do this for now */
+	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
+	if (error) {
+		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
+		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
+		    "returned %d\n", wl->wl_devvp->v_rdev, error));
+	}
+
+	wc->wc_head = head;
+	wc->wc_tail = tail;
+	wc->wc_checksum = 0;
+	wc->wc_version = 1;
+	getnanotime(&ts);
+	wc->wc_time = ts.tv_sec;
+	wc->wc_timensec = ts.tv_nsec;
+
+	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
+	    ("wapbl_write_commit: head = %"PRIdMAX" tail = %"PRIdMAX"\n",
+	    (intmax_t)head, (intmax_t)tail));
+
+	/*
+	 * XXX if generation will rollover, then first zero
+	 * over second commit header before trying to write both headers.
+	 */
+
+	/* Successive generations alternate between the two header slots. */
+	error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
+	    wl->wl_logpbn + wc->wc_generation % 2);
+	if (error)
+		return error;
+
+	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
+	if (error) {
+		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
+		    ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
+		    "returned %d\n", wl->wl_devvp->v_rdev, error));
+	}
+
+	/*
+	 * If the generation number was zero, write it out a second time.
+	 * This handles initialization and generation number rollover
+	 */
+	if (wc->wc_generation++ == 0) {
+		error = wapbl_write_commit(wl, head, tail);
+		/*
+		 * This panic should be able to be removed if we do the
+		 * zero'ing mentioned above, and we are certain to roll
+		 * back generation number on failure.
+		 */
+		if (error)
+			panic("wapbl_write_commit: error writing duplicate "
+			      "log header: %d\n", error);
+	}
+	return 0;
+}
+
+/*
+ * Write the buffers on wl->wl_bufs into the circular log starting at
+ * *offp.  The list is processed in groups of at most bph buffers; for
+ * each group a blocklist header is written, followed by the data of
+ * each buffer in the group.
+ *
+ * Returns 0 on success, with the new offset value stored in *offp, or
+ * the error from wapbl_circ_write(), in which case *offp is left
+ * unchanged.  The journal rwlock must be write-held.
+ */
+static int
+wapbl_write_blocks(struct wapbl *wl, off_t *offp)
+{
+	struct wapbl_wc_blocklist *wc =
+	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
+	int blocklen = 1<<wl->wl_log_dev_bshift;
+	int bph;
+	struct buf *bp;
+	off_t off = *offp;
+	int error;
+
+	KASSERT(rw_write_held(&wl->wl_rwlock));
+
+	/* Number of block descriptors that fit in one blocklist header */
+	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
+	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
+
+	bp = LIST_FIRST(&wl->wl_bufs);
+
+	while (bp) {
+		int cnt;
+		/* Remember where this group starts for the second pass. */
+		struct buf *obp = bp;
+
+		KASSERT(bp->b_flags & B_LOCKED);
+
+		/* First pass: describe up to bph buffers in the header. */
+		wc->wc_type = WAPBL_WC_BLOCKS;
+		wc->wc_len = blocklen;
+		wc->wc_blkcount = 0;
+		while (bp && (wc->wc_blkcount < bph)) {
+			/*
+			 * Make sure all the physical block numbers are up to
+			 * date.  If this is not always true on a given
+			 * filesystem, then VOP_BMAP must be called.  We
+			 * could call VOP_BMAP here, or else in the filesystem
+			 * specific flush callback, although neither of those
+			 * solutions allow us to take the vnode lock.  If a
+			 * filesystem requires that we must take the vnode lock
+			 * to call VOP_BMAP, then we can probably do it in
+			 * bwrite when the vnode lock should already be held
+			 * by the invoking code.
+			 */
+			KASSERT((bp->b_vp->v_type == VBLK) ||
+				 (bp->b_blkno != bp->b_lblkno));
+			KASSERT(bp->b_blkno > 0);
+
+			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
+			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
+			wc->wc_len += bp->b_bcount;
+			wc->wc_blkcount++;
+			bp = LIST_NEXT(bp, b_wapbllist);
+		}
+		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
+		    ("wapbl_write_blocks: len = %u off = %"PRIdMAX"\n",
+		    wc->wc_len, (intmax_t)off));
+
+		error = wapbl_circ_write(wl, wc, blocklen, &off);
+		if (error)
+			return error;
+		/* Second pass: write the data of the same group of buffers. */
+		bp = obp;
+		cnt = 0;
+		while (bp && (cnt++ < bph)) {
+			error = wapbl_circ_write(wl, bp->b_data,
+			    bp->b_bcount, &off);
+			if (error)
+				return error;
+			bp = LIST_NEXT(bp, b_wapbllist);
+		}
+	}
+	*offp = off;
+	return 0;
+}
+
+/*
+ * Write the pending deallocations to the circular log as revocation
+ * records, starting at *offp.  Each record is one log block holding up
+ * to bph (block, length) pairs.  Nothing is written if there are no
+ * deallocations.
+ *
+ * Returns 0 on success with *offp advanced, or the error from
+ * wapbl_circ_write() with *offp left unchanged.
+ */
+static int
+wapbl_write_revocations(struct wapbl *wl, off_t *offp)
+{
+	struct wapbl_wc_blocklist *wc =
+	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
+	int blocklen = 1<<wl->wl_log_dev_bshift;
+	off_t off = *offp;
+	int bph;
+	int error;
+	int i;
+
+	if (wl->wl_dealloccnt == 0)
+		return 0;
+
+	/* Number of block descriptors that fit in one blocklist header */
+	bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
+	    sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
+
+	for (i = 0; i < wl->wl_dealloccnt; ) {
+		wc->wc_type = WAPBL_WC_REVOCATIONS;
+		wc->wc_len = blocklen;
+		wc->wc_blkcount = 0;
+
+		/* Fill this record until it is full or the list runs out. */
+		for (; (i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph);
+		    i++) {
+			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
+			    wl->wl_deallocblks[i];
+			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
+			    wl->wl_dealloclens[i];
+			wc->wc_blkcount++;
+		}
+
+		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
+		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
+		    wc->wc_len, (intmax_t)off));
+		error = wapbl_circ_write(wl, wc, blocklen, &off);
+		if (error)
+			return error;
+	}
+
+	*offp = off;
+	return 0;
+}
+
+/*
+ * Write the registered inodes to the circular log as inode-list
+ * records, starting at *offp.  Each record is one log block holding up
+ * to iph (inode number, mode) pairs; wc_clear is set only on the first
+ * record.  At least one record is always written, even when no inodes
+ * are registered.
+ *
+ * Returns 0 on success with *offp advanced, or the error from
+ * wapbl_circ_write() with *offp left unchanged.
+ */
+static int
+wapbl_write_inodes(struct wapbl *wl, off_t *offp)
+{
+	struct wapbl_wc_inodelist *wc =
+	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
+	int i;
+	int blocklen = 1<<wl->wl_log_dev_bshift;
+	off_t off = *offp;
+	int error;
+
+	struct wapbl_ino_head *wih;
+	struct wapbl_ino *wi;
+	int iph;
+
+	/* Number of inode slots that fit in one inodelist block */
+	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
+	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
+
+	/*
+	 * i counts inodes emitted so far; wih is the next hash bucket to
+	 * visit and wi the current position within a bucket's chain.
+	 */
+	i = 0;
+	wih = &wl->wl_inohash[0];
+	wi = 0;
+	do {
+		wc->wc_type = WAPBL_WC_INODES;
+		wc->wc_len = blocklen;
+		wc->wc_inocnt = 0;
+		wc->wc_clear = (i == 0);
+		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
+			/* Skip ahead to the next non-empty bucket. */
+			while (!wi) {
+				KASSERT((wih - &wl->wl_inohash[0])
+				    <= wl->wl_inohashmask);
+				wi = LIST_FIRST(wih++);
+			}
+			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
+			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
+			wc->wc_inocnt++;
+			i++;
+			wi = LIST_NEXT(wi, wi_hash);
+		}
+		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
+		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
+		    wc->wc_len, (intmax_t)off));
+		error = wapbl_circ_write(wl, wc, blocklen, &off);
+		if (error)
+			return error;
+	} while (i < wl->wl_inohashcnt);
+	
+	*offp = off;
+	return 0;
+}
+
+#endif /* _KERNEL */
+
+/****************************************************************/
+
+#ifdef _KERNEL
+/* Pool that wapbl_blk records are allocated from in the kernel. */
+static struct pool wapbl_blk_pool;
+/*
+ * Number of block hashes currently using wapbl_blk_pool.  Unsigned
+ * because it is updated with atomic_inc_uint_nv()/atomic_dec_uint_nv().
+ */
+static unsigned int wapbl_blk_pool_refcount;
+#endif
+/* One entry of the replay block hash, keyed by block number. */
+struct wapbl_blk {
+	LIST_ENTRY(wapbl_blk) wb_hash;
+	daddr_t wb_blk;	/* Block number (the hash key) */
+	off_t wb_off; /* Offset of this block in the log */
+};
+/* Smallest size wapbl_blkhash_init() will use for the hash. */
+#define	WAPBL_BLKPOOL_MIN 83
+
+/*
+ * Create the replay block hash for wr, sized for at least `size'
+ * entries (and never fewer than WAPBL_BLKPOOL_MIN).  In the kernel the
+ * hash comes from hashinit() and the shared wapbl_blk pool is created
+ * on first use; outside the kernel the table is allocated and
+ * initialized by hand.
+ */
+static void
+wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
+{
+	if (size < WAPBL_BLKPOOL_MIN)
+		size = WAPBL_BLKPOOL_MIN;
+	KASSERT(wr->wr_blkhash == 0);
+#ifdef _KERNEL
+	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
+	if (atomic_inc_uint_nv(&wapbl_blk_pool_refcount) == 1) {
+		pool_init(&wapbl_blk_pool, sizeof(struct wapbl_blk), 0, 0, 0,
+		    "wapblblkpl", &pool_allocator_nointr, IPL_NONE);
+	}
+#else /* ! _KERNEL */
+	/* Manually implement hashinit */
+	{
+		unsigned long i;
+		unsigned long hashsize;
+
+		/* Round the bucket count up to a power of two. */
+		for (hashsize = 1; hashsize < size; hashsize <<= 1)
+			continue;
+		wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
+		/*
+		 * Initialize every bucket.  The bound is hashsize itself:
+		 * the mask is not set yet, and mask == hashsize - 1 would
+		 * leave the last bucket out.
+		 */
+		for (i = 0; i < hashsize; i++)
+			LIST_INIT(&wr->wr_blkhash[i]);
+		wr->wr_blkhashmask = hashsize - 1;
+	}
+#endif /* ! _KERNEL */
+}
+
+/*
+ * Release the replay block hash of wr.  The hash must already be
+ * empty.  In the kernel the shared wapbl_blk pool is destroyed when
+ * its last reference goes away.
+ */
+static void
+wapbl_blkhash_free(struct wapbl_replay *wr)
+{
+	KASSERT(wr->wr_blkhashcnt == 0);
+#ifndef _KERNEL
+	wapbl_free(wr->wr_blkhash);
+#else /* _KERNEL */
+	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
+	if (atomic_dec_uint_nv(&wapbl_blk_pool_refcount) == 0)
+		pool_destroy(&wapbl_blk_pool);
+#endif /* _KERNEL */
+}
+
+/*
+ * Look up block number `blk' in the replay block hash.  Returns the
+ * matching entry, or a null pointer if there is none.
+ */
+static struct wapbl_blk *
+wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
+{
+	struct wapbl_blk_head *bucket = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
+	struct wapbl_blk *wb;
+
+	LIST_FOREACH(wb, bucket, wb_hash) {
+		if (wb->wb_blk == blk)
+			break;
+	}
+	/* wb is null here when the bucket was exhausted without a match. */
+	return wb;
+}
+/* Map blk to log offset off, replacing the offset of an existing entry. */
+static void
+wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
+{
+	struct wapbl_blk *ent = wapbl_blkhash_get(wr, blk);
+
+	if (ent) {
+		/* Already tracked: only the recorded offset changes. */
+		KASSERT(ent->wb_blk == blk);
+		ent->wb_off = off;
+		return;
+	}
+#ifdef _KERNEL
+	ent = pool_get(&wapbl_blk_pool, PR_WAITOK);
+#else /* ! _KERNEL */
+	ent = wapbl_malloc(sizeof(*ent));
+#endif /* ! _KERNEL */
+	ent->wb_blk = blk;
+	ent->wb_off = off;
+	LIST_INSERT_HEAD(&wr->wr_blkhash[blk & wr->wr_blkhashmask], ent,
+	    wb_hash);
+	wr->wr_blkhashcnt++;
+}
+/* Drop blk from the block hash, if present, and release its entry. */
+static void
+wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
+{
+	struct wapbl_blk *ent = wapbl_blkhash_get(wr, blk);
+	if (!ent)
+		return;		/* not tracked: nothing to do */
+	KASSERT(wr->wr_blkhashcnt > 0);
+	wr->wr_blkhashcnt--;
+	LIST_REMOVE(ent, wb_hash);
+#ifdef _KERNEL
+	pool_put(&wapbl_blk_pool, ent);
+#else /* ! _KERNEL */
+	wapbl_free(ent);
+#endif /* ! _KERNEL */
+}
+/* Empty every bucket of the block hash, releasing each entry. */
+static void
+wapbl_blkhash_clear(struct wapbl_replay *wr)
+{
+	struct wapbl_blk *ent;
+	int bucket;
+	for (bucket = 0; bucket <= wr->wr_blkhashmask; bucket++) {
+		/* Pop from the head until this chain is empty. */
+		while ((ent = LIST_FIRST(&wr->wr_blkhash[bucket])) != 0) {
+			KASSERT(wr->wr_blkhashcnt > 0);
+			wr->wr_blkhashcnt--;
+			LIST_REMOVE(ent, wb_hash);
+#ifdef _KERNEL
+			pool_put(&wapbl_blk_pool, ent);
+#else /* ! _KERNEL */
+			wapbl_free(ent);
+#endif /* ! _KERNEL */
+		}
+	}
+	KASSERT(wr->wr_blkhashcnt == 0);
+}
+
+/****************************************************************/
+/* Read len bytes of the circular log at *offp into data; 0 or an error. */
+static int
+wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
+{
+	size_t slen;	/* bytes between off and the end of the circular area */
+	struct wapbl_wc_header *wc = &wr->wr_wc_header;
+	off_t off = *offp;
+	int error;
+	/* len must be a whole number of log device blocks. */
+	KASSERT(((len >> wc->wc_log_dev_bshift) <<
+	    wc->wc_log_dev_bshift) == len);
+	if (off < wc->wc_circ_off)	/* clamp to the start of the area */
+		off = wc->wc_circ_off;
+	slen = wc->wc_circ_off + wc->wc_circ_size - off;
+	if (slen < len) {	/* request wraps: read up to the end first */
+		error = wapbl_read(data, slen, wr->wr_devvp,
+		    wr->wr_logpbn + (off >> wc->wc_log_dev_bshift));
+		if (error)
+			return error;
+		data = (uint8_t *)data + slen;
+		len -= slen;
+		off = wc->wc_circ_off;	/* remainder starts at the front */
+	}
+	error = wapbl_read(data, len, wr->wr_devvp,
+	    wr->wr_logpbn + (off >> wc->wc_log_dev_bshift));
+	if (error)
+		return error;
+	off += len;
+	if (off >= wc->wc_circ_off + wc->wc_circ_size)
+		off = wc->wc_circ_off;
+	*offp = off;	/* only updated when every read succeeded */
+	return 0;
+}
+/* Move *offp forward by len bytes inside the circular log area, wrapping. */
+static void
+wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
+{
+	struct wapbl_wc_header *hdr = &wr->wr_wc_header;
+	off_t pos = *offp;
+	size_t tail;	/* bytes left before the end of the area */
+
+	/* len must be a multiple of the log device block size. */
+	KASSERT(((len >> hdr->wc_log_dev_bshift) <<
+	    hdr->wc_log_dev_bshift) == len);
+	if (pos < hdr->wc_circ_off)
+		pos = hdr->wc_circ_off;
+	tail = hdr->wc_circ_off + hdr->wc_circ_size - pos;
+	if (tail < len) {	/* crosses the end of the area */
+		pos = hdr->wc_circ_off;
+		len -= tail;
+	}
+	pos += len;
+	if (pos >= hdr->wc_circ_off + hdr->wc_circ_size)
+		pos = hdr->wc_circ_off;
+	*offp = pos;
+}
+
+/****************************************************************/
+/* Open the log at (vp, off) for replay: 0 and *wrp set, or an errno. */
+int
+wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
+	daddr_t off, size_t count, size_t blksize)
+{
+	struct wapbl_replay *wr;
+	int error;
+	struct vnode *devvp;
+	daddr_t logpbn;
+	uint8_t *scratch;	/* MAXBSIZE buffer, handed over to wr below */
+	struct wapbl_wc_header *wch;	/* header copy selected for use */
+	struct wapbl_wc_header *wch2;	/* second header copy */
+	/* Use this until we read the actual log header */
+	int log_dev_bshift = DEV_BSHIFT;
+	size_t used;
+	/* NOTE: count is only used by the debug printf and the #if 0 check. */
+	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
+	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
+	    vp, off, count, blksize));
+	/* Reject a negative offset or a blksize that is not N * DEV_BSIZE. */
+	if (off < 0)
+		return EINVAL;
+
+	if (blksize < DEV_BSIZE)
+		return EINVAL;
+	if (blksize % DEV_BSIZE)
+		return EINVAL;
+
+#ifdef _KERNEL
+#if 0
+	/* XXX vp->v_size isn't reliably set for VBLK devices,
+	 * especially root.  However, we might still want to verify
+	 * that the full load is readable */
+	if ((off + count) * blksize > vp->v_size)
+		return EINVAL;
+#endif
+	/* Map the log's block in vp to a device vnode and block number. */
+	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
+		return error;
+	}
+#else /* ! _KERNEL */
+	devvp = vp;	/* non-kernel build: vp and off are used as given */
+	logpbn = off;
+#endif /* ! _KERNEL */
+
+	scratch = wapbl_malloc(MAXBSIZE);
+	/* Read two log device blocks: one header copy in each. */
+	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
+	if (error)
+		goto errout;
+
+	wch = (struct wapbl_wc_header *)scratch;
+	wch2 =
+	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
+	/* XXX verify checksums and magic numbers */
+	if (wch->wc_type != WAPBL_WC_HEADER) {
+		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
+		error = EFTYPE;
+		goto errout;
+	}
+	/* Prefer the second copy when it carries the higher generation. */
+	if (wch2->wc_generation > wch->wc_generation)
+		wch = wch2;
+
+	wr = wapbl_calloc(1, sizeof(*wr));
+
+	wr->wr_logvp = vp;
+	wr->wr_devvp = devvp;
+	wr->wr_logpbn = logpbn;
+
+	wr->wr_scratch = scratch;	/* wapbl_replay_stop() frees it now */
+
+	memcpy(&wr->wr_wc_header, wch, sizeof(wr->wr_wc_header));
+
+	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
+
+	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
+	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
+	    " len=%"PRId64" used=%zu\n",
+	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
+	    wch->wc_circ_size, used));
+	/* Size the hash from the log space in use, then scan the log. */
+	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
+	error = wapbl_replay_prescan(wr);
+	if (error) {
+		wapbl_replay_stop(wr);
+		wapbl_replay_free(wr);
+		return error;
+	}
+
+	error = wapbl_replay_get_inodes(wr);
+	if (error) {
+		wapbl_replay_stop(wr);
+		wapbl_replay_free(wr);
+		return error;
+	}
+
+	*wrp = wr;
+	return 0;
+
+ errout:	/* failures before wr exists: only scratch to release */
+	wapbl_free(scratch);
+	return error;
+}
+
+void
+wapbl_replay_stop(struct wapbl_replay *wr)
+{
+	/* Close a replay handle; wapbl_replay_free() releases wr itself. */
+	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
+
+	KDASSERT(wapbl_replay_isopen(wr));
+
+	/* Empty the block hash before destroying it. */
+	wapbl_blkhash_clear(wr);
+	wapbl_blkhash_free(wr);
+
+	wapbl_free(wr->wr_scratch);
+	wr->wr_scratch = 0;
+	wr->wr_logvp = 0;
+}
+
+void
+wapbl_replay_free(struct wapbl_replay *wr)
+{
+	/* Release a replay handle that is no longer open. */
+	KDASSERT(!wapbl_replay_isopen(wr));
+
+	if (wr->wr_inodes != 0)		/* inode list is optional */
+		wapbl_free(wr->wr_inodes);
+	wapbl_free(wr);
+}
+
+int
+wapbl_replay_isopen1(struct wapbl_replay *wr)
+{
+	/* Thin wrapper: returns wapbl_replay_isopen(wr) unchanged. */
+	return wapbl_replay_isopen(wr);
+}
+/* Walk the log from tail to head, filling the block hash and inode bounds. */
+static int
+wapbl_replay_prescan(struct wapbl_replay *wr)
+{
+	off_t off;
+	struct wapbl_wc_header *wch = &wr->wr_wc_header;
+	int error;
+
+	int logblklen = 1<<wch->wc_log_dev_bshift;	/* record header read size */
+	int fsblklen = 1<<wch->wc_fs_dev_bshift;	/* size of one hashed block */
+
+	wapbl_blkhash_clear(wr);
+
+	off = wch->wc_tail;
+	while (off != wch->wc_head) {	/* one record per iteration */
+		struct wapbl_wc_null *wcn;
+		off_t saveoff = off;	/* where this record starts */
+		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
+		if (error)
+			goto errout;
+		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
+		switch (wcn->wc_type) {
+		case WAPBL_WC_BLOCKS:
+			{
+				struct wapbl_wc_blocklist *wc =
+				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
+				int i;
+				for (i = 0; i < wc->wc_blkcount; i++) {
+					int j, n;
+					/*
+					 * Enter each physical block into the
+					 * hashtable independently
+					 */
+					n = wc->wc_blocks[i].wc_dlen >>
+					    wch->wc_fs_dev_bshift;
+					for (j = 0; j < n; j++) {
+						wapbl_blkhash_ins(wr,
+						    wc->wc_blocks[i].wc_daddr + j,
+						    off);
+						wapbl_circ_advance(wr,
+						    fsblklen, &off);
+					}
+				}
+			}
+			break;
+
+		case WAPBL_WC_REVOCATIONS:
+			{
+				struct wapbl_wc_blocklist *wc =
+				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
+				int i;
+				for (i = 0; i < wc->wc_blkcount; i++) {
+					int j, n;
+					/*
+					 * Remove any blocks found from the
+					 * hashtable
+					 */
+					n = wc->wc_blocks[i].wc_dlen >>
+					    wch->wc_fs_dev_bshift;
+					for (j = 0; j < n; j++) {
+						wapbl_blkhash_rem(wr,
+						   wc->wc_blocks[i].wc_daddr + j);
+					}
+				}
+			}
+			break;
+
+		case WAPBL_WC_INODES:
+			{
+				struct wapbl_wc_inodelist *wc =
+				    (struct wapbl_wc_inodelist *)wr->wr_scratch;
+				/*
+				 * Keep track of where we found this so that
+				 * wapbl_replay_get_inodes() can use it later
+				 */
+				if (wc->wc_clear) {
+					wr->wr_inodestail = saveoff;
+					wr->wr_inodescnt = 0;
+				}
+				if (wr->wr_inodestail)
+					wr->wr_inodeshead = off;
+				wr->wr_inodescnt += wc->wc_inocnt;
+			}
+			break;
+		default:
+			printf("Unrecognized wapbl type: 0x%08x\n",
+			       wcn->wc_type);
+ 			error = EFTYPE;
+			goto errout;
+		}
+		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
+		if (off != saveoff) {	/* wc_len mismatch */
+			printf("wapbl_replay: corrupted records\n");
+			error = EFTYPE;
+			goto errout;
+		}
+	}
+	return 0;
+
+ errout:	/* leave the hash empty on failure */
+	wapbl_blkhash_clear(wr);
+	return error;
+}
+/* Copy the inode records found between wr_inodestail/head to wr_inodes. */
+static int
+wapbl_replay_get_inodes(struct wapbl_replay *wr)
+{
+	off_t off;
+	struct wapbl_wc_header *wch = &wr->wr_wc_header;
+	int logblklen = 1<<wch->wc_log_dev_bshift;
+	int cnt= 0;	/* entries copied into wr_inodes so far */
+
+	KDASSERT(wapbl_replay_isopen(wr));
+
+	if (wr->wr_inodescnt == 0)	/* nothing to collect */
+		return 0;
+
+	KASSERT(!wr->wr_inodes);
+
+	wr->wr_inodes = wapbl_malloc(wr->wr_inodescnt*sizeof(wr->wr_inodes[0]));
+
+	off = wr->wr_inodestail;
+
+	while (off != wr->wr_inodeshead) {
+		struct wapbl_wc_null *wcn;
+		int error;
+		off_t saveoff = off;	/* where this record starts */
+		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
+		if (error) {
+			wapbl_free(wr->wr_inodes);
+			wr->wr_inodes = 0;
+			return error;
+		}
+		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
+		switch (wcn->wc_type) {
+		case WAPBL_WC_BLOCKS:
+		case WAPBL_WC_REVOCATIONS:
+			break;
+		case WAPBL_WC_INODES:
+			{
+				struct wapbl_wc_inodelist *wc =
+				    (struct wapbl_wc_inodelist *)wr->wr_scratch;
+				/*
+				 * A record with wc_clear set restarts the
+				 * list: discard what was copied so far
+				 */
+				if (wc->wc_clear) {
+					cnt = 0;
+				}
+				/* This memcpy assumes that wr_inodes is
+				 * laid out the same as wc_inodes. */
+				memcpy(&wr->wr_inodes[cnt], wc->wc_inodes,
+				       wc->wc_inocnt*sizeof(wc->wc_inodes[0]));
+				cnt += wc->wc_inocnt;
+			}
+			break;
+		default:
+			KASSERT(0);
+		}
+		off = saveoff;	/* rewind, then skip the whole record */
+		wapbl_circ_advance(wr, wcn->wc_len, &off);
+	}
+	KASSERT(cnt == wr->wr_inodescnt);
+	return 0;
+}
+/* DEBUG only: compare hashed log blocks with the same blocks on fsdevvp. */
+#ifdef DEBUG
+int
+wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
+{
+	off_t off;
+	struct wapbl_wc_header *wch = &wr->wr_wc_header;
+	int mismatchcnt = 0;
+	int logblklen = 1<<wch->wc_log_dev_bshift;
+	int fsblklen = 1<<wch->wc_fs_dev_bshift;
+	void *scratch1 = wapbl_malloc(MAXBSIZE);	/* block as logged */
+	void *scratch2 = wapbl_malloc(MAXBSIZE);	/* block as on fsdevvp */
+	int error = 0;
+	/* Returns 0, a read error, or EFTYPE if any block differed. */
+	KDASSERT(wapbl_replay_isopen(wr));
+
+	off = wch->wc_tail;
+	while (off != wch->wc_head) {
+		struct wapbl_wc_null *wcn;
+#ifdef DEBUG	/* always true here: the whole function is under DEBUG */
+		off_t saveoff = off;
+#endif
+		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
+		if (error)
+			goto out;
+		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
+		switch (wcn->wc_type) {
+		case WAPBL_WC_BLOCKS:
+			{
+				struct wapbl_wc_blocklist *wc =
+				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
+				int i;
+				for (i = 0; i < wc->wc_blkcount; i++) {
+					int foundcnt = 0;	/* read only under #if 0 */
+					int dirtycnt = 0;	/* read only under #if 0 */
+					int j, n;
+					/*
+					 * Check each physical block against
+					 * the hashtable independently
+					 */
+					n = wc->wc_blocks[i].wc_dlen >>
+					    wch->wc_fs_dev_bshift;
+					for (j = 0; j < n; j++) {
+						struct wapbl_blk *wb =
+						   wapbl_blkhash_get(wr,
+						   wc->wc_blocks[i].wc_daddr + j);
+						if (wb && (wb->wb_off == off)) {
+							foundcnt++;
+							error =
+							    wapbl_circ_read(wr,
+							    scratch1, fsblklen,
+							    &off);
+							if (error)
+								goto out;
+							error =
+							    wapbl_read(scratch2,
+							    fsblklen, fsdevvp,
+							    wb->wb_blk);
+							if (error)
+								goto out;
+							if (memcmp(scratch1,
+								   scratch2,
+								   fsblklen)) {
+								printf(
+		"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
+		wb->wb_blk, (intmax_t)off);
+								dirtycnt++;
+								mismatchcnt++;
+							}
+						} else {
+							wapbl_circ_advance(wr,
+							    fsblklen, &off);
+						}
+					}
+#if 0
+					/*
+					 * If all of the blocks in an entry
+					 * are clean, then remove all of its
+					 * blocks from the hashtable since they
+					 * never will need replay.
+					 */
+					if ((foundcnt != 0) &&
+					    (dirtycnt == 0)) {
+						off = saveoff;
+						wapbl_circ_advance(wr,
+						    logblklen, &off);
+						for (j = 0; j < n; j++) {
+							struct wapbl_blk *wb =
+							   wapbl_blkhash_get(wr,
+							   wc->wc_blocks[i].wc_daddr + j);
+							if (wb &&
+							  (wb->wb_off == off)) {
+								wapbl_blkhash_rem(wr, wb->wb_blk);
+							}
+							wapbl_circ_advance(wr,
+							    fsblklen, &off);
+						}
+					}
+#endif
+				}
+			}
+			break;
+		case WAPBL_WC_REVOCATIONS:
+		case WAPBL_WC_INODES:
+			break;
+		default:
+			KASSERT(0);
+		}
+#ifdef DEBUG
+		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
+		KASSERT(off == saveoff);
+#endif
+	}
+ out:
+	wapbl_free(scratch1);
+	wapbl_free(scratch2);
+	if (!error && mismatchcnt)
+		error = EFTYPE;
+	return error;
+}
+#endif
+/* Write each log block that the hash points at out to fsdevvp. */
+int
+wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
+{
+	off_t off;
+	struct wapbl_wc_header *wch = &wr->wr_wc_header;
+	int logblklen = 1<<wch->wc_log_dev_bshift;
+	int fsblklen = 1<<wch->wc_fs_dev_bshift;
+	void *scratch1 = wapbl_malloc(MAXBSIZE);	/* one block in transit */
+	int error = 0;
+	/* Returns 0, or the first read/write error encountered. */
+	KDASSERT(wapbl_replay_isopen(wr));
+
+	/*
+	 * This parses the journal for replay, although it could
+	 * just as easily walk the hashtable instead.
+	 */
+
+	off = wch->wc_tail;
+	while (off != wch->wc_head) {
+		struct wapbl_wc_null *wcn;
+#ifdef DEBUG
+		off_t saveoff = off;
+#endif
+		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
+		if (error)
+			goto out;
+		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
+		switch (wcn->wc_type) {
+		case WAPBL_WC_BLOCKS:
+			{
+				struct wapbl_wc_blocklist *wc =
+				    (struct wapbl_wc_blocklist *)wr->wr_scratch;
+				int i;
+				for (i = 0; i < wc->wc_blkcount; i++) {
+					int j, n;
+					/*
+					 * Check each physical block against
+					 * the hashtable independently
+					 */
+					n = wc->wc_blocks[i].wc_dlen >>
+					    wch->wc_fs_dev_bshift;
+					for (j = 0; j < n; j++) {
+						struct wapbl_blk *wb =
+						   wapbl_blkhash_get(wr,
+						   wc->wc_blocks[i].wc_daddr + j);
+						if (wb && (wb->wb_off == off)) {
+							error = wapbl_circ_read(
+							    wr, scratch1,
+							    fsblklen, &off);
+							if (error)
+								goto out;
+							error =
+							   wapbl_write(scratch1,
+							   fsblklen, fsdevvp,
+							   wb->wb_blk);
+							if (error)
+								goto out;
+						} else {
+							wapbl_circ_advance(wr,
+							    fsblklen, &off);
+						}
+					}
+				}
+			}
+			break;
+		case WAPBL_WC_REVOCATIONS:
+		case WAPBL_WC_INODES:
+			break;
+		default:
+			KASSERT(0);
+		}
+#ifdef DEBUG
+		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
+		KASSERT(off == saveoff);
+#endif
+	}
+ out:
+	wapbl_free(scratch1);
+	return error;
+}
+/* Overwrite in data any of the len bytes at blk that the log also holds. */
+int
+wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
+{
+	struct wapbl_wc_header *wch = &wr->wr_wc_header;
+	int fsblklen = 1<<wch->wc_fs_dev_bshift;
+	uint8_t *dst = data;
+
+	KDASSERT(wapbl_replay_isopen(wr));
+	KASSERT((len % fsblklen) == 0);
+
+	/* Step through the request one fsblklen-sized block at a time. */
+	for (; len != 0; len -= fsblklen, dst += fsblklen, blk++) {
+		struct wapbl_blk *ent = wapbl_blkhash_get(wr, blk);
+		off_t logoff;
+		int error;
+
+		if (!ent)
+			continue;	/* not hashed: leave these bytes as is */
+		logoff = ent->wb_off;
+		error = wapbl_circ_read(wr, dst, fsblklen, &logoff);
+		if (error)
+			return error;
+	}
+	return 0;
+}
diff --git a/sys/kern/vnode_if.c b/sys/kern/vnode_if.c
index 44f4f7f419b5..830c8055eb9f 100644
--- a/sys/kern/vnode_if.c
+++ b/sys/kern/vnode_if.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: vnode_if.c,v 1.76 2008/01/25 14:32:46 ad Exp $	*/
+/*	$NetBSD: vnode_if.c,v 1.77 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*
  * Warning: DO NOT EDIT! This file is automatically generated!
@@ -40,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vnode_if.c,v 1.76 2008/01/25 14:32:46 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vnode_if.c,v 1.77 2008/07/31 05:38:05 simonb Exp $");
 
 
 #include "opt_vnode_lockdebug.h"
@@ -802,6 +802,7 @@ VOP_FSYNC(struct vnode *vp,
 	mpsafe = (vp->v_vflag & VV_MPSAFE);
 	if (!mpsafe) { KERNEL_LOCK(1, curlwp); }
 	error = (VCALL(vp, VOFFSET(vop_fsync), &a));
+
 	if (!mpsafe) { KERNEL_UNLOCK_ONE(curlwp); }
 	return error;
 }
diff --git a/sys/miscfs/genfs/genfs_io.c b/sys/miscfs/genfs/genfs_io.c
index 3cce9ed6704d..240371f1a825 100644
--- a/sys/miscfs/genfs/genfs_io.c
+++ b/sys/miscfs/genfs/genfs_io.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: genfs_io.c,v 1.8 2008/06/04 12:41:40 ad Exp $	*/
+/*	$NetBSD: genfs_io.c,v 1.9 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.8 2008/06/04 12:41:40 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.9 2008/07/31 05:38:05 simonb Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -589,8 +589,22 @@ loopdone:
 	 */
 
 	if (!error && sawhole && blockalloc) {
-		error = GOP_ALLOC(vp, startoffset, npages << PAGE_SHIFT, 0,
-		    cred);
+		/*
+		 * XXX: This assumes that we come here only via
+		 * the mmio path
+		 */
+		if (vp->v_mount->mnt_wapbl && write) {
+			error = WAPBL_BEGIN(vp->v_mount);
+		}
+
+		if (!error) {
+			error = GOP_ALLOC(vp, startoffset,
+			    npages << PAGE_SHIFT, 0, cred);
+			if (vp->v_mount->mnt_wapbl && write) {
+				WAPBL_END(vp->v_mount);
+			}
+		}
+
 		UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d",
 		    startoffset, npages << PAGE_SHIFT, error,0);
 		if (!error) {
diff --git a/sys/rump/fs/lib/libffs/Makefile b/sys/rump/fs/lib/libffs/Makefile
index 4e186faf6359..c074e16c97c0 100644
--- a/sys/rump/fs/lib/libffs/Makefile
+++ b/sys/rump/fs/lib/libffs/Makefile
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.4 2008/07/29 13:17:42 pooka Exp $
+#	$NetBSD: Makefile,v 1.5 2008/07/31 05:38:05 simonb Exp $
 #
 
 .PATH:  ${.CURDIR}/../../../../ufs/ffs
@@ -7,9 +7,9 @@ LIB=	rumpfs_ffs
 
 SRCS=	ffs_alloc.c ffs_balloc.c ffs_bswap.c ffs_inode.c		\
 	ffs_softdep.stub.c ffs_subr.c ffs_tables.c ffs_vfsops.c		\
-	ffs_vnops.c ffs_snapshot.c
+	ffs_vnops.c ffs_snapshot.c ffs_wapbl.c
 
-CPPFLAGS+=	-DFFS_NO_SNAPSHOT -DFFS_EI
+CPPFLAGS+=	-DFFS_NO_SNAPSHOT -DFFS_EI -DWAPBL
 CFLAGS+=	-Wno-pointer-sign
 
 .include <bsd.lib.mk>
diff --git a/sys/rump/fs/lib/libufs/Makefile b/sys/rump/fs/lib/libufs/Makefile
index e6575abb1336..7c9d9ec58a58 100644
--- a/sys/rump/fs/lib/libufs/Makefile
+++ b/sys/rump/fs/lib/libufs/Makefile
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.5 2008/07/29 13:17:47 pooka Exp $
+#	$NetBSD: Makefile,v 1.6 2008/07/31 05:38:05 simonb Exp $
 #
 
 .PATH:  ${.CURDIR}/../../../../ufs/ufs
@@ -6,9 +6,9 @@
 LIB=	rumpfs_ufs
 
 SRCS=	ufs_bmap.c ufs_dirhash.c ufs_ihash.c ufs_inode.c ufs_lookup.c	\
-	ufs_vfsops.c ufs_vnops.c
+	ufs_vfsops.c ufs_vnops.c ufs_wapbl.c
 
-CPPFLAGS+=	-DUFS_DIRHASH -DFFS_EI
+CPPFLAGS+=	-DUFS_DIRHASH -DFFS_EI -DWAPBL
 
 .include <bsd.lib.mk>
 .include <bsd.klinks.mk>
diff --git a/sys/rump/librump/rumpkern/Makefile.rumpkern b/sys/rump/librump/rumpkern/Makefile.rumpkern
index 2cadd1a098f5..644f221356fa 100644
--- a/sys/rump/librump/rumpkern/Makefile.rumpkern
+++ b/sys/rump/librump/rumpkern/Makefile.rumpkern
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile.rumpkern,v 1.2 2008/07/30 01:32:47 oster Exp $
+#	$NetBSD: Makefile.rumpkern,v 1.3 2008/07/31 05:38:05 simonb Exp $
 #
 
 .include "${RUMPTOP}/Makefile.rump"
@@ -26,7 +26,7 @@ SRCS+=	clock_subr.c kern_descrip.c kern_stub.c param.c	\
 	subr_bufq.c subr_hash.c subr_prf2.c subr_specificdata.c		\
 	subr_time.c subr_workqueue.c sys_descrip.c sys_generic.c vfs_bio.c \
 	vfs_cache.c vfs_getcwd.c vfs_hooks.c vfs_init.c vfs_lookup.c	\
-	vfs_subr.c vfs_vnops.c vfs_syscalls.c vnode_if.c \
+	vfs_subr.c vfs_vnops.c vfs_syscalls.c vfs_wapbl.c vnode_if.c \
 	subr_kobj.c kern_module.c
 
 # sys/miscfs
diff --git a/sys/rump/librump/rumpkern/rump.c b/sys/rump/librump/rumpkern/rump.c
index 44fa4ace070c..ffdc9dfee3c7 100644
--- a/sys/rump/librump/rumpkern/rump.c
+++ b/sys/rump/librump/rumpkern/rump.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: rump.c,v 1.48 2008/07/29 13:17:47 pooka Exp $	*/
+/*	$NetBSD: rump.c,v 1.49 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*
  * Copyright (c) 2007 Antti Kantee.  All Rights Reserved.
@@ -32,6 +32,7 @@
 #include <sys/filedesc.h>
 #include <sys/kauth.h>
 #include <sys/kmem.h>
+#include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/queue.h>
@@ -39,7 +40,7 @@
 #include <sys/select.h>
 #include <sys/vnode.h>
 #include <sys/vfs_syscalls.h>
-#include <sys/module.h>
+#include <sys/wapbl.h>
 
 #include <miscfs/specfs/specdev.h>
 
@@ -135,6 +136,7 @@ rump_init()
 	module_init();
 	vfsinit();
 	bufinit();
+	wapbl_init();
 
 	rumpvfs_init();
 
diff --git a/sys/sys/Makefile b/sys/sys/Makefile
index e8a98d8001ed..7920033b183b 100644
--- a/sys/sys/Makefile
+++ b/sys/sys/Makefile
@@ -1,4 +1,4 @@
-#	$NetBSD: Makefile,v 1.109 2008/06/04 14:31:15 ad Exp $
+#	$NetBSD: Makefile,v 1.110 2008/07/31 05:38:05 simonb Exp $
 
 .include <bsd.sys.mk>
 
@@ -19,12 +19,13 @@ INCS=	acct.h agpio.h aio.h ansi.h ataio.h atomic.h audioio.h \
 	joystick.h \
 	kcore.h kgdb.h kmem.h ksem.h ksyms.h ktrace.h \
 	lkm.h localedef.h lock.h lockf.h lwp.h lwpctl.h \
-	malloc.h mallocvar.h mbuf.h md4.h \
-	md5.h midiio.h mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \
+	malloc.h mallocvar.h mbuf.h md4.h md5.h midiio.h \
+	mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \
 	namei.h null.h \
 	param.h pipe.h pmc.h poll.h pool.h power.h proc.h \
 	protosw.h pset.h ptrace.h queue.h \
-	ras.h rb.h reboot.h radioio.h resource.h resourcevar.h rmd160.h rnd.h rwlock.h \
+	ras.h rb.h reboot.h radioio.h resource.h resourcevar.h rmd160.h \
+	rnd.h rwlock.h \
 	scanio.h sched.h scsiio.h select.h selinfo.h sem.h sha1.h sha2.h \
 	shm.h siginfo.h signal.h signalvar.h sigtypes.h simplelock.h \
 	sleepq.h socket.h \
@@ -36,7 +37,7 @@ INCS=	acct.h agpio.h aio.h ansi.h ataio.h atomic.h audioio.h \
 	ttydefaults.h ttydev.h types.h \
 	ucontext.h ucred.h uio.h un.h unistd.h unpcb.h user.h utsname.h uuid.h \
 	vadvise.h verified_exec.h vmmeter.h vnode.h vnode_if.h \
-	wait.h wdog.h
+	wait.h wapbl.h wdog.h
 
 INCSYMLINKS=\
 	sys/exec_elf.h /usr/include/elf.h \
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index f5135b5392a7..3b8dbd70c6ad 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -1,4 +1,4 @@
-/*     $NetBSD: buf.h,v 1.109 2008/06/09 15:42:01 ad Exp $ */
+/*     $NetBSD: buf.h,v 1.110 2008/07/31 05:38:05 simonb Exp $ */
 
 /*-
  * Copyright (c) 1999, 2000, 2007 The NetBSD Foundation, Inc.
@@ -162,6 +162,7 @@ struct buf {
 	LIST_ENTRY(buf)		b_hash;		/* c: hash chain */
 	LIST_ENTRY(buf)		b_vnbufs;	/* c: associated vnode */
 	TAILQ_ENTRY(buf)	b_freelist;	/* c: position if not active */
+	LIST_ENTRY(buf)		b_wapbllist;	/* c: transaction buffer list */
 	daddr_t			b_lblkno;	/* c: logical block number */
 	int			b_freelistindex;/* c: free list index (BQ_) */
 	u_int			b_cflags;	/* c: BC_* flags */
@@ -244,6 +245,7 @@ do {									\
 #define B_CLRBUF	0x01	/* Request allocated buffer be cleared. */
 #define B_SYNC		0x02	/* Do all allocations synchronously. */
 #define B_METAONLY	0x04	/* Return indirect block buffer. */
+#define B_CONTIG	0x08	/* Allocate file contiguously. */
 
 /* Flags to bread(), breadn() and breada(). */
 #define B_MODIFY	0x01	/* Hint: caller might modify buffer */
diff --git a/sys/sys/fstypes.h b/sys/sys/fstypes.h
index f6a8df4f6d08..ba55187c4b21 100644
--- a/sys/sys/fstypes.h
+++ b/sys/sys/fstypes.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: fstypes.h,v 1.23 2008/05/06 18:43:45 ad Exp $	*/
+/*	$NetBSD: fstypes.h,v 1.24 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*
  * Copyright (c) 1989, 1991, 1993
@@ -87,7 +87,6 @@ typedef struct fhandle	fhandle_t;
 #define	__MNT_UNUSED2	0x00200000
 #define	__MNT_UNUSED3	0x00800000
 #define	__MNT_UNUSED4	0x01000000
-#define	__MNT_UNUSED5	0x02000000
 
 #define	MNT_RDONLY	0x00000001	/* read only filesystem */
 #define	MNT_SYNCHRONOUS	0x00000002	/* file system written synchronously */
@@ -98,6 +97,7 @@ typedef struct fhandle	fhandle_t;
 #define	MNT_ASYNC	0x00000040	/* file system written asynchronously */
 #define	MNT_NOCOREDUMP	0x00008000	/* don't write core dumps to this FS */
 #define MNT_IGNORE	0x00100000	/* don't show entry in df */
+#define	MNT_LOG		0x02000000	/* Use logging */
 #define MNT_NOATIME	0x04000000	/* Never update access times in fs */
 #define MNT_SYMPERM	0x20000000	/* recognize symlink permission */
 #define MNT_NODEVMTIME	0x40000000	/* Never update mod times for devs */
@@ -116,7 +116,8 @@ typedef struct fhandle	fhandle_t;
 	{ MNT_NOATIME,		0,	"noatime" }, \
 	{ MNT_SYMPERM,		0,	"symperm" }, \
 	{ MNT_NODEVMTIME,	0,	"nodevmtime" }, \
-	{ MNT_SOFTDEP,		0,	"soft dependencies" },
+	{ MNT_SOFTDEP,		0,	"soft dependencies" }, \
+	{ MNT_LOG,		0,	"log" },
 
 /*
  * exported mount flags.
@@ -176,7 +177,8 @@ typedef struct fhandle	fhandle_t;
      MNT_EXPUBLIC | \
      MNT_LOCAL | \
      MNT_QUOTA | \
-     MNT_ROOTFS)
+     MNT_ROOTFS | \
+     MNT_LOG)
 
 /*
  * External filesystem control flags.
@@ -223,7 +225,7 @@ typedef struct fhandle	fhandle_t;
 	"\35MNT_EXPUBLIC" \
 	"\34MNT_EXNORESPORT" \
 	"\33MNT_NOATIME" \
-	"\32MNT_UNUSED" \
+	"\32MNT_LOG" \
 	"\31MNT_UNUSED" \
 	"\30MNT_UNUSED" \
 	"\27MNT_GETARGS" \
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index fe4a665b69a0..a52c65106c1a 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: mount.h,v 1.180 2008/07/30 18:10:38 pooka Exp $	*/
+/*	$NetBSD: mount.h,v 1.181 2008/07/31 05:38:05 simonb Exp $	*/
 
 /*
  * Copyright (c) 1989, 1991, 1993
@@ -121,6 +121,11 @@ struct mount {
 	specificdata_reference
 			mnt_specdataref;	/* subsystem specific data */
 	kmutex_t	mnt_updating;		/* to serialize updates */
+	struct wapbl_ops
+			*mnt_wapbl_op;		/* logging ops */
+	struct wapbl	*mnt_wapbl;		/* log info */
+	struct wapbl_replay
+			*mnt_wapbl_replay;	/* replay support XXX: what? */
 };
 
 /*
@@ -278,6 +283,45 @@ int	fsname##_extattrctl(struct mount *, int, struct vnode *, int,	\
 		const char *);						\
 int	fsname##_suspendctl(struct mount *, int)
 
+/*
+ * Operations vector so wapbl can be wrapped into a filesystem lkm; the
+ * WAPBL_*() wrappers below evaluate (MP) twice.  XXX Eventually, we want
+ * to move this down into the filesystems so that this isn't needed.
+ */
+struct wapbl_ops {
+	void (*wo_wapbl_discard)(struct wapbl *);
+	int (*wo_wapbl_replay_isopen)(struct wapbl_replay *);
+	int (*wo_wapbl_replay_read)(struct wapbl_replay *, void *, daddr_t, long);
+	void (*wo_wapbl_add_buf)(struct wapbl *, struct buf *);
+	void (*wo_wapbl_remove_buf)(struct wapbl *, struct buf *);
+	void (*wo_wapbl_resize_buf)(struct wapbl *, struct buf *, long, long);
+	int (*wo_wapbl_begin)(struct wapbl *, const char *, int); /* file, line */
+	void (*wo_wapbl_end)(struct wapbl *);
+	void (*wo_wapbl_junlock_assert)(struct wapbl *);
+	void (*wo_wapbl_biodone)(struct buf *);
+};
+#define WAPBL_DISCARD(MP)						\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_discard)((MP)->mnt_wapbl)
+#define WAPBL_REPLAY_ISOPEN(MP)						\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_replay_isopen)((MP)->mnt_wapbl_replay)
+#define WAPBL_REPLAY_READ(MP, DATA, BLK, LEN)				\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_replay_read)((MP)->mnt_wapbl_replay,	\
+    (DATA), (BLK), (LEN))
+#define WAPBL_ADD_BUF(MP, BP)						\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_add_buf)((MP)->mnt_wapbl, (BP))
+#define WAPBL_REMOVE_BUF(MP, BP)					\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_remove_buf)((MP)->mnt_wapbl, (BP))
+#define WAPBL_RESIZE_BUF(MP, BP, OLDSZ, OLDCNT)				\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_resize_buf)((MP)->mnt_wapbl, (BP),	\
+    (OLDSZ), (OLDCNT))
+#define WAPBL_BEGIN(MP)							\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_begin)((MP)->mnt_wapbl,		\
+    __FILE__, __LINE__)
+#define WAPBL_END(MP)							\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_end)((MP)->mnt_wapbl)
+#define WAPBL_JUNLOCK_ASSERT(MP)					\
+    (*(MP)->mnt_wapbl_op->wo_wapbl_junlock_assert)((MP)->mnt_wapbl)
+
 struct vfs_hooks {
 	void	(*vh_unmount)(struct mount *);
 	LIST_ENTRY(vfs_hooks) vfs_hooks_list;
diff --git a/sys/sys/stat.h b/sys/sys/stat.h
index 5267ddf83300..d806f6c82000 100644
--- a/sys/sys/stat.h
+++ b/sys/sys/stat.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: stat.h,v 1.56 2007/10/19 15:58:52 christos Exp $	*/
+/*	$NetBSD: stat.h,v 1.57 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -214,6 +214,7 @@ struct stat {
 #define	SF_APPEND	0x00040000	/* writes to file may only append */
 /*	SF_NOUNLINK	0x00100000	   [NOT IMPLEMENTED] */
 #define	SF_SNAPSHOT	0x00200000	/* snapshot inode */
+#define	SF_LOG		0x00400000	/* WAPBL log file inode */
 
 #ifdef _KERNEL
 /*
diff --git a/sys/sys/statvfs.h b/sys/sys/statvfs.h
index bae423d5a4f9..1db40cf3ea16 100644
--- a/sys/sys/statvfs.h
+++ b/sys/sys/statvfs.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: statvfs.h,v 1.14 2008/04/28 20:24:11 martin Exp $	 */
+/*	$NetBSD: statvfs.h,v 1.15 2008/07/31 05:38:06 simonb Exp $	 */
 
 /*-
  * Copyright (c) 2004 The NetBSD Foundation, Inc.
@@ -118,6 +118,7 @@ struct statvfs {
 #define	ST_SYMPERM	MNT_SYMPERM
 #define	ST_NODEVMTIME	MNT_NODEVMTIME
 #define	ST_SOFTDEP	MNT_SOFTDEP
+#define	ST_LOG		MNT_LOG
 
 #define	ST_EXRDONLY	MNT_EXRDONLY
 #define	ST_EXPORTED	MNT_EXPORTED
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 02a430ce8b07..5c659bbc1e11 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: vnode.h,v 1.196 2008/06/24 11:21:46 ad Exp $	*/
+/*	$NetBSD: vnode.h,v 1.197 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*-
  * Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -296,6 +296,7 @@ struct vattr {
 #define	IO_NORMAL	0x00800		/* operate on regular data */
 #define	IO_EXT		0x01000		/* operate on extended attributes */
 #define	IO_DIRECT	0x02000		/* direct I/O hint */
+#define	IO_JOURNALLOCKED 0x04000	/* journal is already locked */
 #define	IO_ADV_MASK	0x00003		/* access pattern hint */
 
 #define	IO_ADV_SHIFT	0
@@ -342,6 +343,7 @@ extern const int	vttoif_tab[];
 #define	FSYNC_DATAONLY	0x0002		/* fsync: hint: sync file data only */
 #define	FSYNC_RECLAIM	0x0004		/* fsync: hint: vnode is being reclaimed */
 #define	FSYNC_LAZY	0x0008		/* fsync: lazy sync (trickle) */
+#define	FSYNC_NOLOG	0x0010		/* fsync: do not flush the log */
 #define	FSYNC_CACHE	0x0100		/* fsync: flush disk caches too */
 #define	FSYNC_VFS	0x0200		/* fsync: via FSYNC_VFS() */
 
diff --git a/sys/sys/wapbl.h b/sys/sys/wapbl.h
new file mode 100644
index 000000000000..b985e906d83f
--- /dev/null
+++ b/sys/sys/wapbl.h
@@ -0,0 +1,381 @@
+/*	$NetBSD: wapbl.h,v 1.2 2008/07/31 05:38:06 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_WAPBL_H
+#define	_SYS_WAPBL_H
+
+#include <sys/mutex.h>
+
+#include <miscfs/specfs/specdev.h>
+
+/* This header file describes the api and data structures for
+ * write ahead physical block logging (WAPBL) support.
+ */
+
+#if defined(_KERNEL_OPT)
+#include "opt_wapbl.h"
+#endif
+
+#ifdef WAPBL_DEBUG
+#ifndef WAPBL_DEBUG_PRINT
+#define	WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN)
+#endif
+
+#if 0
+#define	WAPBL_DEBUG_BUFBYTES
+#define	WAPBL_DEBUG_SERIALIZE
+#endif
+
+#endif
+
+#ifdef WAPBL_DEBUG_PRINT
+
+enum {
+	WAPBL_PRINT_OPEN = 0x1,
+	WAPBL_PRINT_FLUSH = 0x2,
+	WAPBL_PRINT_TRUNCATE = 0x4,
+	WAPBL_PRINT_TRANSACTION = 0x8,
+	WAPBL_PRINT_BUFFER = 0x10,
+	WAPBL_PRINT_BUFFER2 = 0x20,
+	WAPBL_PRINT_ALLOC = 0x40,
+	WAPBL_PRINT_INODE = 0x80,
+	WAPBL_PRINT_WRITE = 0x100,
+	WAPBL_PRINT_IO = 0x200,
+	WAPBL_PRINT_REPLAY = 0x400,
+	WAPBL_PRINT_ERROR = 0x800,
+	WAPBL_PRINT_DISCARD = 0x1000,
+	WAPBL_PRINT_BIODONE = 0x2000,
+};
+
+#define	WAPBL_PRINTF(mask, a) if (wapbl_debug_print & (mask)) printf  a
+extern int wapbl_debug_print;
+#else
+#define	WAPBL_PRINTF(mask, a)
+#endif
+
+/****************************************************************/
+
+/* The WAPBL journal layout.
+ * 
+ * The journal consists of a header followed by a circular buffer
+ * region.  The circular data area is described by the header
+ * wc_circ_off, wc_circ_size, wc_head and wc_tail fields as bytes
+ * from the start of the journal header.  New records are inserted
+ * at wc_head and the oldest valid record can be found at wc_tail.
+ * When ((wc_head == wc_tail) && (wc_head == 0)), the journal is empty.
+ * The condition of ((wc_head == wc_tail) && (wc_head != 0))
+ * indicates a full journal, although this condition is rare.
+ *
+ * The journal header as well as its records are marked by a 32bit
+ * type tag and length for ease of parsing.  Journal records are
+ * padded so as to fall on journal device block boundaries.
+ * (XXX i think there is currently a bug wrt WC_BLOCKS not ending
+ * correctly on a journal device block boundary. this would need
+ * to be fixed if the journal blocksize does not match filesystem.)
+ */
+
+/*
+ * The following are the 4 record types used by the journal:
+ * Each tag indicates journal data organized by one of the
+ * structures used below.
+ */
+enum {
+	WAPBL_WC_HEADER = 0x5741424c,	/* "WABL", struct wapbl_wc_header */
+	WAPBL_WC_INODES,		/* struct wapbl_wc_inodelist */
+	WAPBL_WC_REVOCATIONS,		/* struct wapbl_wc_blocklist */
+	WAPBL_WC_BLOCKS,		/* struct wapbl_wc_blocklist */
+};
+
+/* null entry (on disk) */
+/* This structure isn't used directly, but shares its header
+ * layout with all the other log structures for the purpose
+ * of reading a log structure and determining its type
+ */
+struct wapbl_wc_null {
+	uint32_t	wc_type;	/* WAPBL_WC_* */
+	int32_t		wc_len;
+	uint8_t		wc_spare[0];	/* actually longer */
+};
+
+/* journal header (on-disk)
+ * This record is found at the start of the
+ * journal, but not within the circular buffer region.  As well as
+ * describing the journal parameters and matching filesystem, it
+ * additionally serves as the atomic update record for journal
+ * updates.
+ */
+struct wapbl_wc_header {
+	uint32_t	wc_type;	/* WAPBL_WC_HEADER log magic number */
+	int32_t		wc_len;		/* length of this journal entry */
+	uint32_t	wc_checksum;
+	uint32_t	wc_generation;
+	int32_t		wc_fsid[2];
+	uint64_t	wc_time;
+	uint32_t	wc_timensec;
+	uint32_t	wc_version;
+	uint32_t	wc_log_dev_bshift;
+	uint32_t	wc_fs_dev_bshift;
+	int64_t		wc_head;
+	int64_t		wc_tail;
+	int64_t		wc_circ_off;	/* offset of circ buffer region */
+	int64_t		wc_circ_size;	/* size of circular buffer region */
+	uint8_t		wc_spare[0];	/* actually longer */
+};
+
+/* list of blocks (on disk)
+ * This record is used to describe a set of filesystem blocks,
+ * and is used with two type tags, WAPBL_WC_BLOCKS and
+ * WAPBL_WC_REVOCATIONS.
+ * 
+ * For WAPBL_WC_BLOCKS, a copy of each listed block can be found
+ * starting at the next log device blocksize boundary, i.e. one log
+ * device block after the start of the record.  This contains
+ * the bulk of the filesystem journal data which is written using
+ * these records before being written into the filesystem.
+ *
+ * The WAPBL_WC_REVOCATIONS record is used to indicate that any
+ * previously listed blocks should not be written into the filesystem.
+ * This is important so that deallocated and reallocated data blocks
+ * do not get overwritten with stale data from the journal.  The
+ * revocation records do not contain a copy of any actual block data.
+ */
+struct wapbl_wc_blocklist {
+	uint32_t	wc_type; /* WAPBL_WC_{REVOCATIONS,BLOCKS} */
+	int32_t		wc_len;
+	int32_t		wc_blkcount;
+	int32_t		wc_unused;
+	struct {
+		int64_t	wc_daddr;
+		int32_t	wc_unused;
+		int32_t	wc_dlen;
+	} wc_blocks[0];			/* actually longer */
+};
+
+/* list of inodes (on disk)
+ * This record is used to describe the set of inodes which
+ * may be allocated but are unlinked.  Inodes end up listed here
+ * while they are in the process of being initialized and
+ * deinitialized.  Inodes unlinked while in use by a process
+ * will be listed here and the actual deletion must be completed
+ * on journal replay.
+ */
+struct wapbl_wc_inodelist {
+	uint32_t	wc_type; /* WAPBL_WC_INODES */
+	int32_t		wc_len;
+	int32_t		wc_inocnt;
+	int32_t		wc_clear;	/* set if previously listed inodes 
+					   should be ignored */
+	struct {
+		uint32_t wc_inumber;
+		uint32_t wc_imode;
+	} wc_inodes[0];		/* actually longer */
+};
+
+/****************************************************************/
+
+#include <sys/queue.h>
+#include <sys/vnode.h>
+#include <sys/buf.h>
+
+typedef void (*wapbl_flush_fn_t)(struct mount *, daddr_t *, int *, int);
+
+#ifdef _KERNEL
+
+struct wapbl_entry;
+struct wapbl_wc_header;
+struct wapbl_replay;
+struct wapbl;
+
+/*
+ * This structure holds per transaction log information
+ */
+struct wapbl_entry {
+	struct wapbl *we_wapbl;
+	SIMPLEQ_ENTRY(wapbl_entry) we_entries;
+	size_t we_bufcount;		/* Count of unsynced buffers */
+	size_t we_reclaimable_bytes;	/* Number of on-disk bytes for this
+					   transaction */
+	int	we_error;
+#ifdef WAPBL_DEBUG_BUFBYTES
+	size_t we_unsynced_bufbytes;	/* Byte count of unsynced buffers */
+#endif
+};
+
+void	wapbl_init(void);
+
+/* Start using a log */
+int	wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t,
+		    size_t, size_t, struct wapbl_replay *,
+		    wapbl_flush_fn_t, wapbl_flush_fn_t);
+
+/* Discard the current transaction, potentially dangerous */
+void	wapbl_discard(struct wapbl *);
+
+/* stop using a log */
+int	wapbl_stop(struct wapbl *, int);
+
+/*
+ * Begin a new transaction or increment transaction recursion
+ * level if called while a transaction is already in progress
+ * by the current process.
+ */
+int	wapbl_begin(struct wapbl *, const char *, int);
+
+
+/* End a transaction or decrement the transaction recursion level */
+void	wapbl_end(struct wapbl *);
+
+/*
+ * Add a new buffer to the current transaction.  The buffer's
+ * data will be copied to the current transaction log and the
+ * buffer will be marked B_LOCKED so that it will not be
+ * flushed to disk by the syncer or reallocated.
+ */
+void	wapbl_add_buf(struct wapbl *, struct buf *);
+
+/* Remove a buffer from the current transaction. */
+void	wapbl_remove_buf(struct wapbl *, struct buf *);
+
+void	wapbl_resize_buf(struct wapbl *, struct buf *, long, long);
+
+/*
+ * This will flush all completed transactions to disk and
+ * start asynchronous writes on the associated buffers
+ */
+int	wapbl_flush(struct wapbl *, int);
+
+/*
+ * Inodes that are allocated but have zero link count
+ * must be registered with the current transaction
+ * so they may be recorded in the log and cleaned up later.
+ * registration/unregistration of ino numbers already registered is ok.
+ */
+void	wapbl_register_inode(struct wapbl *, ino_t, mode_t);
+void	wapbl_unregister_inode(struct wapbl *, ino_t, mode_t);
+
+/*
+ * Metadata block deallocations must be registered so
+ * that revocation records can be written and to prevent
+ * the corresponding blocks from being reused as data
+ * blocks until the log is on disk.
+ */
+void	wapbl_register_deallocation(struct wapbl *, daddr_t, int);
+
+void	wapbl_jlock_assert(struct wapbl *wl);
+void	wapbl_junlock_assert(struct wapbl *wl);
+
+void	wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...));
+
+#if defined(WAPBL_DEBUG) || defined(DDB)
+void	wapbl_dump(struct wapbl *);
+#endif
+
+void	wapbl_biodone(struct buf *);
+
+extern struct wapbl_ops wapbl_ops;
+
+static __inline struct mount *
+wapbl_vptomp(struct vnode *vp)
+{
+	struct mount *mp;
+
+	mp = NULL;
+	if (vp != NULL) {
+		if (vp->v_type == VBLK)
+			mp = vp->v_specmountpoint;
+		else
+			mp = vp->v_mount;
+	}
+
+	return mp;
+}
+
+static __inline bool
+wapbl_vphaswapbl(struct vnode *vp)
+{
+	struct mount *mp;
+
+	if (vp == NULL)
+		return false;
+
+	mp = wapbl_vptomp(vp);
+	if (mp && mp->mnt_wapbl)
+		return true;
+	else
+		return false;
+}
+
+#endif /* _KERNEL */
+
+/****************************************************************/
+/* Replay support */
+
+struct wapbl_replay {
+	struct vnode *wr_logvp;
+	struct vnode *wr_devvp;
+	daddr_t wr_logpbn;
+
+	struct wapbl_wc_header wr_wc_header;
+	void *wr_scratch;
+
+	LIST_HEAD(wapbl_blk_head, wapbl_blk) *wr_blkhash;
+	u_long wr_blkhashmask;
+	int wr_blkhashcnt;
+
+	off_t wr_inodeshead;
+	off_t wr_inodestail;
+	int wr_inodescnt;
+	struct {
+		uint32_t wr_inumber;
+		uint32_t wr_imode;
+	} *wr_inodes;
+};
+
+#define	wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0)
+
+int	wapbl_replay_isopen1(struct wapbl_replay *);
+int	wapbl_replay_start(struct wapbl_replay **, struct vnode *,
+	daddr_t, size_t, size_t);
+void	wapbl_replay_stop(struct wapbl_replay *);
+void	wapbl_replay_free(struct wapbl_replay *);
+int	wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
+int	wapbl_replay_write(struct wapbl_replay *, struct vnode *);
+int	wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long);
+
+/****************************************************************/
+
+/* Supply this to provide i/o support */
+int wapbl_write(void *, size_t, struct vnode *, daddr_t);
+int wapbl_read(void *, size_t, struct vnode *, daddr_t);
+
+/****************************************************************/
+
+#endif /* !_SYS_WAPBL_H */
diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
index 9006ca6dca15..1bbf8a6b9e8e 100644
--- a/sys/ufs/ffs/ffs_alloc.c
+++ b/sys/ufs/ffs/ffs_alloc.c
@@ -1,4 +1,33 @@
-/*	$NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $	*/
+/*	$NetBSD: ffs_alloc.c,v 1.111 2008/07/31 05:38:06 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 2002 Networks Associates Technology, Inc.
@@ -41,7 +70,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.111 2008/07/31 05:38:06 simonb Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
@@ -51,13 +80,14 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
-#include <sys/proc.h>
-#include <sys/vnode.h>
-#include <sys/mount.h>
-#include <sys/kernel.h>
-#include <sys/syslog.h>
-#include <sys/kauth.h>
 #include <sys/fstrans.h>
+#include <sys/kauth.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/syslog.h>
+#include <sys/vnode.h>
+#include <sys/wapbl.h>
 
 #include <miscfs/specfs/specdev.h>
 #include <ufs/ufs/quota.h>
@@ -65,21 +95,22 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
 
-static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int);
-static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t);
+static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int);
+static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int);
 #ifdef XXXUBC
 static daddr_t ffs_clusteralloc(struct inode *, int, daddr_t, int);
 #endif
 static ino_t ffs_dirpref(struct inode *);
 static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
 static void ffs_fserr(struct fs *, u_int, const char *);
-static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int,
-    daddr_t (*)(struct inode *, int, daddr_t, int));
-static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int);
+static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int,
+    daddr_t (*)(struct inode *, int, daddr_t, int, int));
+static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int);
 static int32_t ffs_mapsearch(struct fs *, struct cg *,
 				      daddr_t, int);
 #if defined(DIAGNOSTIC) || defined(DEBUG)
@@ -118,7 +149,7 @@ extern const u_char * const fragtbl[];
  * => releases um_lock before returning
  */
 int
-ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
+ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags,
     kauth_cred_t cred, daddr_t *bnp)
 {
 	struct ufsmount *ump;
@@ -174,13 +205,14 @@ ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
 		return (error);
 	mutex_enter(&ump->um_lock);
 #endif
+
 	if (bpref >= fs->fs_size)
 		bpref = 0;
 	if (bpref == 0)
 		cg = ino_to_cg(fs, ip->i_number);
 	else
 		cg = dtog(fs, bpref);
-	bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg);
+	bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg);
 	if (bno > 0) {
 		DIP_ADD(ip, blocks, btodb(size));
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
@@ -193,6 +225,20 @@ ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
 	 */
 	(void) chkdq(ip, -btodb(size), cred, FORCE);
 #endif
+	if (flags & B_CONTIG) {
+		/*
+		 * XXX ump->um_lock handling is "suspect" at best.
+		 * For the case where ffs_hashalloc() fails early
+		 * in the B_CONTIG case we reach here with um_lock
+		 * already unlocked, so we can't release it again
+		 * like in the normal error path.  See kern/39206.
+		 *
+		 *
+		 * Fail silently - it's up to our caller to report
+		 * errors.
+		 */
+		return (ENOSPC);
+	}
 nospace:
 	mutex_exit(&ump->um_lock);
 	ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full");
@@ -372,14 +418,30 @@ ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize,
 		panic("ffs_realloccg: bad optim");
 		/* NOTREACHED */
 	}
-	bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg);
+	bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg);
 	if (bno > 0) {
-		if (!DOINGSOFTDEP(ITOV(ip)))
-			ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
-			    ip->i_number);
-		if (nsize < request)
-			ffs_blkfree(fs, ip->i_devvp, bno + numfrags(fs, nsize),
-			    (long)(request - nsize), ip->i_number);
+		if (!DOINGSOFTDEP(ITOV(ip))) {
+			if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+			    (ITOV(ip)->v_type != VREG)) {
+				UFS_WAPBL_REGISTER_DEALLOCATION(
+				    ip->i_ump->um_mountp, fsbtodb(fs, bprev),
+				    osize);
+			} else
+				ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
+				    ip->i_number);
+		}
+		if (nsize < request) {
+			if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+			    (ITOV(ip)->v_type != VREG)) {
+				UFS_WAPBL_REGISTER_DEALLOCATION(
+				    ip->i_ump->um_mountp,
+				    fsbtodb(fs, (bno + numfrags(fs, nsize))),
+				    request - nsize);
+			} else
+				ffs_blkfree(fs, ip->i_devvp,
+				    bno + numfrags(fs, nsize),
+				    (long)(request - nsize), ip->i_number);
+		}
 		DIP_ADD(ip, blocks, btodb(nsize - osize));
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (bpp != NULL) {
@@ -443,7 +505,7 @@ struct ctldebug debug15 = { "prtrealloc", &prtrealloc };
 #endif
 
 /*
- * NOTE: when re-enabling this, it must be updated for UFS2.
+ * NOTE: when re-enabling this, it must be updated for UFS2 and WAPBL.
  */
 
 int doasyncfree = 1;
@@ -548,7 +610,7 @@ ffs_reallocblks(void *v)
 	 * Search the block map looking for an allocation of the desired size.
 	 */
 	if ((newblk = (daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref,
-	    len, ffs_clusteralloc)) == 0) {
+	    len, flags, ffs_clusteralloc)) == 0) {
 		mutex_exit(&ump->um_lock);
 		goto fail;
 	}
@@ -696,11 +758,17 @@ ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
 	ino_t ino, ipref;
 	int cg, error;
 
+	UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
+
 	*vpp = NULL;
 	pip = VTOI(pvp);
 	fs = pip->i_fs;
 	ump = pip->i_ump;
 
+	error = UFS_WAPBL_BEGIN(pvp->v_mount);
+	if (error) {
+		return error;
+	}
 	mutex_enter(&ump->um_lock);
 	if (fs->fs_cstotal.cs_nifree == 0)
 		goto noinodes;
@@ -723,12 +791,18 @@ ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
 		if (fs->fs_contigdirs[cg] > 0)
 			fs->fs_contigdirs[cg]--;
 	}
-	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, ffs_nodealloccg);
+	ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg);
 	if (ino == 0)
 		goto noinodes;
+	UFS_WAPBL_END(pvp->v_mount);
 	error = VFS_VGET(pvp->v_mount, ino, vpp);
 	if (error) {
-		ffs_vfree(pvp, ino, mode);
+		int err;
+		err = UFS_WAPBL_BEGIN(pvp->v_mount);
+		if (err == 0)
+			ffs_vfree(pvp, ino, mode);
+		if (err == 0)
+			UFS_WAPBL_END(pvp->v_mount);
 		return (error);
 	}
 	KASSERT((*vpp)->v_type == VNON);
@@ -774,6 +848,7 @@ ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred,
 	return (0);
 noinodes:
 	mutex_exit(&ump->um_lock);
+	UFS_WAPBL_END(pvp->v_mount);
 	ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes");
 	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
 	return (ENOSPC);
@@ -922,7 +997,7 @@ ffs_dirpref(struct inode *pip)
  * => um_lock held on entry and exit
  */
 daddr_t
-ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx,
+ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
     int32_t *bap /* XXX ondisk32 */)
 {
 	struct fs *fs;
@@ -932,6 +1007,26 @@ ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx,
 	KASSERT(mutex_owned(&ip->i_ump->um_lock));
 
 	fs = ip->i_fs;
+
+	/*
+	 * If allocating a contiguous file with B_CONTIG, use the hints
+	 * in the inode extensions to return the desired block.
+	 *
+	 * For metadata (indirect blocks) return the address of where
+	 * the first indirect block resides - we'll scan for the next
+	 * available slot if we need to allocate more than one indirect
+	 * block.  For data, return the address of the actual block
+	 * relative to the address of the first data block.
+	 */
+	if (flags & B_CONTIG) {
+		KASSERT(ip->i_ffs_first_data_blk != 0);
+		KASSERT(ip->i_ffs_first_indir_blk != 0);
+		if (flags & B_METAONLY)
+			return ip->i_ffs_first_indir_blk;
+		else
+			return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
+	}
+
 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
 		if (lbn < NDADDR + NINDIR(fs)) {
 			cg = ino_to_cg(fs, ip->i_number);
@@ -966,7 +1061,8 @@ ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx,
 }
 
 daddr_t
-ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int64_t *bap)
+ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
+    int64_t *bap)
 {
 	struct fs *fs;
 	int cg;
@@ -975,6 +1071,26 @@ ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int64_t *bap)
 	KASSERT(mutex_owned(&ip->i_ump->um_lock));
 
 	fs = ip->i_fs;
+
+	/*
+	 * If allocating a contiguous file with B_CONTIG, use the hints
+	 * in the inode extensions to return the desired block.
+	 *
+	 * For metadata (indirect blocks) return the address of where
+	 * the first indirect block resides - we'll scan for the next
+	 * available slot if we need to allocate more than one indirect
+	 * block.  For data, return the address of the actual block
+	 * relative to the address of the first data block.
+	 */
+	if (flags & B_CONTIG) {
+		KASSERT(ip->i_ffs_first_data_blk != 0);
+		KASSERT(ip->i_ffs_first_indir_blk != 0);
+		if (flags & B_METAONLY)
+			return ip->i_ffs_first_indir_blk;
+		else
+			return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn);
+	}
+
 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
 		if (lbn < NDADDR + NINDIR(fs)) {
 			cg = ino_to_cg(fs, ip->i_number);
@@ -1025,7 +1141,7 @@ ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int64_t *bap)
 static daddr_t
 ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
     int size /* size for data blocks, mode for inodes */,
-    daddr_t (*allocator)(struct inode *, int, daddr_t, int))
+    int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int))
 {
 	struct fs *fs;
 	daddr_t result;
@@ -1035,9 +1151,12 @@ ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
 	/*
 	 * 1: preferred cylinder group
 	 */
-	result = (*allocator)(ip, cg, pref, size);
+	result = (*allocator)(ip, cg, pref, size, flags);
 	if (result)
 		return (result);
+
+	if (flags & B_CONTIG)
+		return (result);
 	/*
 	 * 2: quadratic rehash
 	 */
@@ -1045,7 +1164,7 @@ ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
 		cg += i;
 		if (cg >= fs->fs_ncg)
 			cg -= fs->fs_ncg;
-		result = (*allocator)(ip, cg, 0, size);
+		result = (*allocator)(ip, cg, 0, size, flags);
 		if (result)
 			return (result);
 	}
@@ -1056,7 +1175,7 @@ ffs_hashalloc(struct inode *ip, int cg, daddr_t pref,
 	 */
 	cg = (icg + 2) % fs->fs_ncg;
 	for (i = 2; i < fs->fs_ncg; i++) {
-		result = (*allocator)(ip, cg, 0, size);
+		result = (*allocator)(ip, cg, 0, size, flags);
 		if (result)
 			return (result);
 		cg++;
@@ -1157,7 +1276,7 @@ ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize)
  * and if it is, allocate it.
  */
 static daddr_t
-ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
+ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags)
 {
 	struct ufsmount *ump;
 	struct fs *fs = ip->i_fs;
@@ -1192,7 +1311,7 @@ ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
 		cgp->cg_time = ufs_rw64(time_second, needswap);
 	if (size == fs->fs_bsize) {
 		mutex_enter(&ump->um_lock);
-		blkno = ffs_alloccgblk(ip, bp, bpref);
+		blkno = ffs_alloccgblk(ip, bp, bpref, flags);
 		ACTIVECG_CLR(fs, cg);
 		mutex_exit(&ump->um_lock);
 		bdwrite(bp);
@@ -1216,7 +1335,7 @@ ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
 		if (cgp->cg_cs.cs_nbfree == 0)
 			goto fail;
 		mutex_enter(&ump->um_lock);
-		blkno = ffs_alloccgblk(ip, bp, bpref);
+		blkno = ffs_alloccgblk(ip, bp, bpref, flags);
 		bno = dtogd(fs, blkno);
 		for (i = frags; i < fs->fs_frag; i++)
 			setbit(blksfree, bno + i);
@@ -1276,7 +1395,7 @@ ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
  * blocks may be fragmented by the routine that allocates them.
  */
 static daddr_t
-ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref)
+ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags)
 {
 	struct ufsmount *ump;
 	struct fs *fs = ip->i_fs;
@@ -1304,7 +1423,14 @@ ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref)
 		 */
 		if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
 			goto gotit;
+		/*
+		 * if the requested data block isn't available and we are
+		 * trying to allocate a contiguous file, return an error.
+		 */
+		if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
+			return (0);
 	}
+
 	/*
 	 * Take the next available block in this cylinder group.
 	 */
@@ -1453,7 +1579,7 @@ ffs_clusteralloc(struct inode *ip, int cg, daddr_t bpref, int len)
 	len = blkstofrags(fs, len);
 	mutex_enter(&ump->um_lock);
 	for (i = 0; i < len; i += fs->fs_frag)
-		if ((got = ffs_alloccgblk(ip, bp, bno + i)) != bno + i)
+		if ((got = ffs_alloccgblk(ip, bp, bno + i, flags)) != bno + i)
 			panic("ffs_clusteralloc: lost block");
 	ACTIVECG_CLR(fs, cg);
 	mutex_exit(&ump->um_lock);
@@ -1477,7 +1603,7 @@ fail:
  *      inode in the specified cylinder group.
  */
 static daddr_t
-ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
+ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags)
 {
 	struct ufsmount *ump = ip->i_ump;
 	struct fs *fs = ip->i_fs;
@@ -1492,6 +1618,7 @@ ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
 #endif
 
 	KASSERT(mutex_owned(&ump->um_lock));
+	UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
 
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
 		return (0);
@@ -1542,6 +1669,8 @@ ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
 	panic("ffs_nodealloccg: block not in map");
 	/* NOTREACHED */
 gotit:
+	UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
+	    mode);
 	/*
 	 * Check to see if we need to initialize more inodes.
 	 */
@@ -1593,6 +1722,122 @@ gotit:
 	return (0);
 }
 
+/*
+ * Allocate a block or fragment.
+ *
+ * The specified block or fragment is removed from the
+ * free map, possibly fragmenting a block in the process.
+ *
+ * This implementation should mirror ffs_blkfree
+ *
+ * => um_lock not held on entry or exit
+ */
+int
+ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
+{
+	struct ufsmount *ump = ip->i_ump;
+	struct fs *fs = ip->i_fs;
+	struct cg *cgp;
+	struct buf *bp;
+	int32_t fragno, cgbno;
+	int i, error, cg, blk, frags, bbase;
+	u_int8_t *blksfree;
+	const int needswap = UFS_FSNEEDSWAP(fs);
+
+	if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
+	    fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
+		printf("dev = 0x%x, bno = %" PRId64 " bsize = %d, "
+		       "size = %ld, fs = %s\n",
+		    ip->i_dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
+		panic("blkalloc: bad size");
+	}
+	cg = dtog(fs, bno);
+	if (bno >= fs->fs_size) {
+		printf("bad block %" PRId64 ", ino %" PRId64 "\n", bno,
+		    ip->i_number);
+		ffs_fserr(fs, ip->i_uid, "bad block");
+		return EINVAL;
+	}
+	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+		(int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp);
+	if (error) {
+		brelse(bp, 0);
+		return error;
+	}
+	cgp = (struct cg *)bp->b_data;
+	if (!cg_chkmagic(cgp, needswap)) {
+		brelse(bp, 0);
+		return EIO;
+	}
+	cgp->cg_old_time = ufs_rw32(time_second, needswap);
+	cgp->cg_time = ufs_rw64(time_second, needswap);
+	cgbno = dtogd(fs, bno);
+	blksfree = cg_blksfree(cgp, needswap);
+
+	mutex_enter(&ump->um_lock);
+	if (size == fs->fs_bsize) {
+		fragno = fragstoblks(fs, cgbno);
+		if (!ffs_isblock(fs, blksfree, fragno)) {
+			mutex_exit(&ump->um_lock);
+			brelse(bp, 0);
+			return EBUSY;
+		}
+		ffs_clrblock(fs, blksfree, fragno);
+		ffs_clusteracct(fs, cgp, fragno, -1);
+		ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+		fs->fs_cstotal.cs_nbfree--;
+		fs->fs_cs(fs, cg).cs_nbfree--;
+	} else {
+		bbase = cgbno - fragnum(fs, cgbno);
+
+		frags = numfrags(fs, size);
+		for (i = 0; i < frags; i++) {
+			if (isclr(blksfree, cgbno + i)) {
+				mutex_exit(&ump->um_lock);
+				brelse(bp, 0);
+				return EBUSY;
+			}
+		}
+		/*
+		 * if a complete block is being split, account for it
+		 */
+		fragno = fragstoblks(fs, bbase);
+		if (ffs_isblock(fs, blksfree, fragno)) {
+			ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
+			fs->fs_cstotal.cs_nffree += fs->fs_frag;
+			fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
+			ffs_clusteracct(fs, cgp, fragno, -1);
+			ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
+			fs->fs_cstotal.cs_nbfree--;
+			fs->fs_cs(fs, cg).cs_nbfree--;
+		}
+		/*
+		 * decrement the counts associated with the old frags
+		 */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
+		/*
+		 * allocate the fragment
+		 */
+		for (i = 0; i < frags; i++) {
+			clrbit(blksfree, cgbno + i);
+		}
+		ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
+		fs->fs_cstotal.cs_nffree -= i;
+		fs->fs_cs(fs, cg).cs_nffree -= i;
+		/*
+		 * add back in counts associated with the new frags
+		 */
+		blk = blkmap(fs, blksfree, bbase);
+		ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
+	}
+	fs->fs_fmod = 1;
+	ACTIVECG_CLR(fs, cg);
+	mutex_exit(&ump->um_lock);
+	bdwrite(bp);
+	return 0;
+}
+
 /*
  * Free a block or fragment.
  *
@@ -1817,6 +2062,8 @@ ffs_vfree(struct vnode *vp, ino_t ino, int mode)
 /*
  * Do the actual free operation.
  * The specified inode is placed back in the free map.
+ *
+ * => um_lock not held on entry or exit
  */
 int
 ffs_freefile(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
@@ -1832,6 +2079,8 @@ ffs_freefile(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
 	const int needswap = UFS_FSNEEDSWAP(fs);
 #endif
 
+	UFS_WAPBL_JLOCK_ASSERT(devvp->v_specinfo->si_mountpoint);
+
 	cg = ino_to_cg(fs, ino);
 	if (devvp->v_type != VBLK) {
 		/* devvp is a snapshot */
@@ -1871,6 +2120,8 @@ ffs_freefile(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
 			panic("ifree: freeing free inode");
 	}
 	clrbit(inosused, ino);
+	UFS_WAPBL_UNREGISTER_INODE(devvp->v_specmountpoint,
+	    ino + cg * fs->fs_ipg, mode);
 	if (ino < ufs_rw32(cgp->cg_irotor, needswap))
 		cgp->cg_irotor = ufs_rw32(ino, needswap);
 	ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c
index a2e82d689bcb..661d6210747f 100644
--- a/sys/ufs/ffs/ffs_balloc.c
+++ b/sys/ufs/ffs/ffs_balloc.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: ffs_balloc.c,v 1.50 2008/06/03 09:47:49 hannken Exp $	*/
+/*	$NetBSD: ffs_balloc.c,v 1.51 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*
  * Copyright (c) 2002 Networks Associates Technology, Inc.
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.50 2008/06/03 09:47:49 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.51 2008/07/31 05:38:06 simonb Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_quota.h"
@@ -141,7 +141,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 		if (osize < fs->fs_bsize && osize > 0) {
 			mutex_enter(&ump->um_lock);
 			error = ffs_realloccg(ip, nb,
-				    ffs_blkpref_ufs1(ip, lastlbn, nb,
+				    ffs_blkpref_ufs1(ip, lastlbn, nb, flags,
 					&ip->i_ffs1_db[0]),
 				    osize, (int)fs->fs_bsize, cred, bpp, &newb);
 			if (error)
@@ -222,9 +222,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 				 */
 				mutex_enter(&ump->um_lock);
 				error = ffs_realloccg(ip, lbn,
-				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
-					&ip->i_ffs1_db[0]), osize, nsize, cred,
-					bpp, &newb);
+				    ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
+					&ip->i_ffs1_db[0]),
+				    osize, nsize, cred, bpp, &newb);
 				if (error)
 					return (error);
 				if (DOINGSOFTDEP(vp))
@@ -245,9 +245,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 				nsize = fs->fs_bsize;
 			mutex_enter(&ump->um_lock);
 			error = ffs_alloc(ip, lbn,
-			    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
+			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags,
 				&ip->i_ffs1_db[0]),
-				nsize, cred, &newb);
+			    nsize, flags, cred, &newb);
 			if (error)
 				return (error);
 			if (bpp != NULL) {
@@ -284,9 +284,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 	allocblk = allociblk;
 	if (nb == 0) {
 		mutex_enter(&ump->um_lock);
-		pref = ffs_blkpref_ufs1(ip, lbn, 0, (int32_t *)0);
-		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
-		    &newb);
+		pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL);
+		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+		    flags | B_METAONLY, cred, &newb);
 		if (error)
 			goto fail;
 		nb = newb;
@@ -341,9 +341,10 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 		}
 		mutex_enter(&ump->um_lock);
 		if (pref == 0)
-			pref = ffs_blkpref_ufs1(ip, lbn, 0, (int32_t *)0);
-		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
-		    &newb);
+			pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY,
+			    NULL);
+		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+		    flags | B_METAONLY, cred, &newb);
 		if (error) {
 			brelse(bp, 0);
 			goto fail;
@@ -404,8 +405,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 			goto fail;
 		}
 		mutex_enter(&ump->um_lock);
-		pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, &bap[0]);
-		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
+		pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags,
+		    &bap[0]);
+		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
 		    &newb);
 		if (error) {
 			brelse(bp, 0);
@@ -619,7 +621,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 				error = ffs_realloccg(ip, -1 - nb,
 				    dp->di_extb[nb],
 				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
-				    &dp->di_extb[0]), osize,
+					flags, &dp->di_extb[0]),
+				    osize,
 				    (int)fs->fs_bsize, cred, &bp);
 				if (error)
 					return (error);
@@ -679,8 +682,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 				mutex_enter(&ump->um_lock);
 				error = ffs_realloccg(ip, -1 - lbn,
 				    dp->di_extb[lbn],
-				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
-				    &dp->di_extb[0]), osize, nsize, cred, &bp);
+				    ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+				        &dp->di_extb[0]),
+				    osize, nsize, cred, &bp);
 				if (error)
 					return (error);
 				bp->b_xflags |= BX_ALTDATA;
@@ -696,8 +700,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 				nsize = fs->fs_bsize;
 			mutex_enter(&ump->um_lock);
 			error = ffs_alloc(ip, lbn,
-			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
-			   nsize, cred, &newb);
+			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+			       &dp->di_extb[0]),
+			   nsize, flags, cred, &newb);
 			if (error)
 				return (error);
 			error = ffs_getblk(vp, -1 - lbn, fsbtodb(fs, newb),
@@ -728,7 +733,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 		if (osize < fs->fs_bsize && osize > 0) {
 			mutex_enter(&ump->um_lock);
 			error = ffs_realloccg(ip, nb,
-				    ffs_blkpref_ufs2(ip, lastlbn, nb,
+				    ffs_blkpref_ufs2(ip, lastlbn, nb, flags,
 					&ip->i_ffs2_db[0]),
 				    osize, (int)fs->fs_bsize, cred, bpp, &newb);
 			if (error)
@@ -809,9 +814,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 				 */
 				mutex_enter(&ump->um_lock);
 				error = ffs_realloccg(ip, lbn,
-				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
-					&ip->i_ffs2_db[0]), osize, nsize, cred,
-					bpp, &newb);
+				    ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+					&ip->i_ffs2_db[0]),
+				    osize, nsize, cred, bpp, &newb);
 				if (error)
 					return (error);
 				if (DOINGSOFTDEP(vp))
@@ -832,8 +837,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 				nsize = fs->fs_bsize;
 			mutex_enter(&ump->um_lock);
 			error = ffs_alloc(ip, lbn,
-			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
-				&ip->i_ffs2_db[0]), nsize, cred, &newb);
+			    ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags,
+				&ip->i_ffs2_db[0]),
+			    nsize, flags, cred, &newb);
 			if (error)
 				return (error);
 			if (bpp != NULL) {
@@ -870,9 +876,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 	allocblk = allociblk;
 	if (nb == 0) {
 		mutex_enter(&ump->um_lock);
-		pref = ffs_blkpref_ufs2(ip, lbn, 0, (int64_t *)0);
-		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
-		    &newb);
+		pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL);
+		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+		    flags | B_METAONLY, cred, &newb);
 		if (error)
 			goto fail;
 		nb = newb;
@@ -927,9 +933,10 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 		}
 		mutex_enter(&ump->um_lock);
 		if (pref == 0)
-			pref = ffs_blkpref_ufs2(ip, lbn, 0, (int64_t *)0);
-		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
-		    &newb);
+			pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY,
+			    NULL);
+		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
+		    flags | B_METAONLY, cred, &newb);
 		if (error) {
 			brelse(bp, 0);
 			goto fail;
@@ -990,8 +997,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred,
 			goto fail;
 		}
 		mutex_enter(&ump->um_lock);
-		pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, &bap[0]);
-		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred,
+		pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags,
+		    &bap[0]);
+		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred,
 		    &newb);
 		if (error) {
 			brelse(bp, 0);
diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
index d7b69dcfb359..b17c486f9dff 100644
--- a/sys/ufs/ffs/ffs_extern.h
+++ b/sys/ufs/ffs/ffs_extern.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: ffs_extern.h,v 1.66 2008/06/28 01:34:05 rumble Exp $	*/
+/*	$NetBSD: ffs_extern.h,v 1.67 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993, 1994
@@ -84,9 +84,10 @@ __BEGIN_DECLS
 
 #include <sys/param.h>
 #include <sys/mount.h>
+#include <sys/wapbl.h>
 
 /* ffs_alloc.c */
-int	ffs_alloc(struct inode *, daddr_t, daddr_t , int, kauth_cred_t,
+int	ffs_alloc(struct inode *, daddr_t, daddr_t , int, int, kauth_cred_t,
 		  daddr_t *);
 int	ffs_realloccg(struct inode *, daddr_t, daddr_t, int, int ,
 		      kauth_cred_t, struct buf **, daddr_t *);
@@ -94,8 +95,9 @@ int	ffs_realloccg(struct inode *, daddr_t, daddr_t, int, int ,
 int	ffs_reallocblks(void *);
 #endif
 int	ffs_valloc(struct vnode *, int, kauth_cred_t, struct vnode **);
-daddr_t	ffs_blkpref_ufs1(struct inode *, daddr_t, int, int32_t *);
-daddr_t	ffs_blkpref_ufs2(struct inode *, daddr_t, int, int64_t *);
+daddr_t	ffs_blkpref_ufs1(struct inode *, daddr_t, int, int, int32_t *);
+daddr_t	ffs_blkpref_ufs2(struct inode *, daddr_t, int, int, int64_t *);
+int	ffs_blkalloc(struct inode *, daddr_t, long);
 void	ffs_blkfree(struct fs *, struct vnode *, daddr_t, long, ino_t);
 int	ffs_vfree(struct vnode *, ino_t, int);
 void	ffs_clusteracct(struct fs *, struct cg *, int32_t, int);
@@ -175,6 +177,17 @@ void	softdep_setup_allocindir_page(struct inode *, daddr_t,
 void	softdep_fsync_mountdev(struct vnode *);
 int	softdep_sync_metadata(struct vnode *);
 
+/* Write Ahead Physical Block Logging */
+void	ffs_wapbl_verify_inodes(struct mount *, const char *);
+void	ffs_wapbl_replay_finish(struct mount *);
+int	ffs_wapbl_start(struct mount *);
+int	ffs_wapbl_stop(struct mount *, int);
+int	ffs_wapbl_replay_start(struct mount *, struct fs *, struct vnode *);
+void	ffs_wapbl_blkalloc(struct fs *, struct vnode *, daddr_t, int);
+
+void	ffs_wapbl_sync_metadata(struct mount *, daddr_t *, int *, int);
+void	ffs_wapbl_abort_sync_metadata(struct mount *, daddr_t *, int *, int);
+
 extern int (**ffs_vnodeop_p)(void *);
 extern int (**ffs_specop_p)(void *);
 extern int (**ffs_fifoop_p)(void *);
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index e037ede2aea5..706ac653de92 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -1,4 +1,33 @@
-/*	$NetBSD: ffs_inode.c,v 1.97 2008/06/03 09:47:49 hannken Exp $	*/
+/*	$NetBSD: ffs_inode.c,v 1.98 2008/07/31 05:38:06 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -32,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.97 2008/06/03 09:47:49 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.98 2008/07/31 05:38:06 simonb Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
@@ -41,23 +70,25 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.97 2008/06/03 09:47:49 hannken Exp $
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/mount.h>
-#include <sys/proc.h>
-#include <sys/file.h>
 #include <sys/buf.h>
-#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fstrans.h>
+#include <sys/kauth.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
-#include <sys/trace.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
 #include <sys/resourcevar.h>
-#include <sys/kauth.h>
-#include <sys/fstrans.h>
+#include <sys/trace.h>
+#include <sys/vnode.h>
+#include <sys/wapbl.h>
 
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
@@ -128,6 +159,17 @@ ffs_update(struct vnode *vp, const struct timespec *acc,
 		softdep_update_inodeblock(ip, bp, waitfor);
 	} else if (ip->i_ffs_effnlink != ip->i_nlink)
 		panic("ffs_update: bad link cnt");
+	/* Keep unlinked inode list up to date */
+	KDASSERT(DIP(ip, nlink) == ip->i_nlink);
+	if (ip->i_mode) {
+		if (ip->i_nlink > 0) {
+			UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp,
+			    ip->i_number, ip->i_mode);
+		} else {
+			UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp,
+			    ip->i_number, ip->i_mode);
+		}
+	}
 	if (fs->fs_magic == FS_UFS1_MAGIC) {
 		cp = (char *)bp->b_data +
 		    (ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE);
@@ -411,8 +453,13 @@ ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
 			blocksreleased += count;
 			if (lastiblock[level] < 0) {
 				DIP_ASSIGN(oip, ib[level], 0);
-				ffs_blkfree(fs, oip->i_devvp, bn, fs->fs_bsize,
-				    oip->i_number);
+				if (oip->i_ump->um_mountp->mnt_wapbl) {
+					UFS_WAPBL_REGISTER_DEALLOCATION(
+					    oip->i_ump->um_mountp,
+					    fsbtodb(fs, bn), fs->fs_bsize);
+				} else
+					ffs_blkfree(fs, oip->i_devvp, bn,
+					    fs->fs_bsize, oip->i_number);
 				blocksreleased += nblocks;
 			}
 		}
@@ -434,7 +481,12 @@ ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
 			continue;
 		DIP_ASSIGN(oip, db[i], 0);
 		bsize = blksize(fs, oip, i);
-		ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number);
+		if ((oip->i_ump->um_mountp->mnt_wapbl) &&
+		    (ovp->v_type != VREG)) {
+			UFS_WAPBL_REGISTER_DEALLOCATION(oip->i_ump->um_mountp,
+			    fsbtodb(fs, bn), bsize);
+		} else
+			ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number);
 		blocksreleased += btodb(bsize);
 	}
 	if (lastblock < 0)
@@ -468,8 +520,14 @@ ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
 			 * required for the storage we're keeping.
 			 */
 			bn += numfrags(fs, newspace);
-			ffs_blkfree(fs, oip->i_devvp, bn, oldspace - newspace,
-			    oip->i_number);
+			if ((oip->i_ump->um_mountp->mnt_wapbl) &&
+			    (ovp->v_type != VREG)) {
+				UFS_WAPBL_REGISTER_DEALLOCATION(
+				    oip->i_ump->um_mountp, fsbtodb(fs, bn),
+				    oldspace - newspace);
+			} else
+				ffs_blkfree(fs, oip->i_devvp, bn,
+				    oldspace - newspace, oip->i_number);
 			blocksreleased += btodb(oldspace - newspace);
 		}
 	}
@@ -494,6 +552,7 @@ done:
 	DIP_ADD(oip, blocks, -blocksreleased);
 	genfs_node_unlock(ovp);
 	oip->i_flag |= IN_CHANGE;
+	UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0);
 #ifdef QUOTA
 	(void) chkdq(oip, -blocksreleased, NOCRED, 0);
 #endif
@@ -621,7 +680,13 @@ ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
 				allerror = error;
 			blocksreleased += blkcount;
 		}
-		ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize, ip->i_number);
+		if ((ip->i_ump->um_mountp->mnt_wapbl) &&
+		    ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) {
+			UFS_WAPBL_REGISTER_DEALLOCATION(ip->i_ump->um_mountp,
+			    fsbtodb(fs, nb), fs->fs_bsize);
+		} else
+			ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize,
+			    ip->i_number);
 		blocksreleased += nblocks;
 	}
 
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 78da484a0038..d375c7c13bbb 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -1,4 +1,33 @@
-/*	$NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp $	*/
+/*	$NetBSD: ffs_vfsops.c,v 1.231 2008/07/31 05:38:06 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 1989, 1991, 1993, 1994
@@ -32,12 +61,13 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.231 2008/07/31 05:38:06 simonb Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
 #include "opt_quota.h"
 #include "opt_softdep.h"
+#include "opt_wapbl.h"
 #endif
 
 #include <sys/param.h>
@@ -61,6 +91,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp
 #include <sys/sysctl.h>
 #include <sys/conf.h>
 #include <sys/kauth.h>
+#include <sys/wapbl.h>
 #include <sys/fstrans.h>
 #include <sys/module.h>
 
@@ -73,6 +104,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
@@ -243,11 +275,17 @@ ffs_mountroot(void)
 		vrele(rootvp);
 		return (error);
 	}
+
+	/*
+	 * We always need to be able to mount the root file system.
+	 */
+	mp->mnt_flag |= MNT_FORCE;
 	if ((error = ffs_mountfs(rootvp, mp, l)) != 0) {
 		vfs_unbusy(mp, false, NULL);
 		vfs_destroy(mp);
 		return (error);
 	}
+	mp->mnt_flag &= ~MNT_FORCE;
 	mutex_enter(&mountlist_lock);
 	CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
 	mutex_exit(&mountlist_lock);
@@ -261,6 +299,8 @@ ffs_mountroot(void)
 	return (0);
 }
 
+static int dolog;	/* nonzero: ffs_mount() sets MNT_LOG on entry */
+
 /*
  * VFS Operations.
  *
@@ -278,6 +318,9 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
 	int error = 0, flags, update;
 	mode_t accessmode;
 
+	if (dolog)
+		mp->mnt_flag |= MNT_LOG;
+
 	if (*data_len < sizeof *args)
 		return EINVAL;
 
@@ -378,13 +421,31 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
 		return (error);
 	}
 
+#ifdef WAPBL
+	/*
+	 * WAPBL can only be enabled on a r/w mount.  It is mutually
+	 * exclusive with softdep: if both are set, softdep is dropped.
+	 */
+	if ((mp->mnt_flag & MNT_RDONLY) && !(mp->mnt_iflag & IMNT_WANTRDWR)) {
+		mp->mnt_flag &= ~MNT_LOG;
+	}
+	if ((mp->mnt_flag & (MNT_SOFTDEP | MNT_LOG)) ==
+			(MNT_SOFTDEP | MNT_LOG)) {
+		printf("%s fs is journalled, ignoring soft update mode\n",
+			VFSTOUFS(mp)->um_fs->fs_fsmnt);
+		mp->mnt_flag &= ~MNT_SOFTDEP;
+	}
+#else /* !WAPBL */
+	mp->mnt_flag &= ~MNT_LOG;
+#endif /* !WAPBL */
+
 	if (!update) {
 		int xflags;
 
 		if (mp->mnt_flag & MNT_RDONLY)
 			xflags = FREAD;
 		else
-			xflags = FREAD|FWRITE;
+			xflags = FREAD | FWRITE;
 		error = VOP_OPEN(devvp, xflags, FSCRED);
 		if (error)
 			goto fail;
@@ -439,6 +500,8 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
 				fs->fs_pendingblocks = 0;
 				fs->fs_pendinginodes = 0;
 			}
+			if (error == 0)
+				error = UFS_WAPBL_BEGIN(mp);
 			if (error == 0 &&
 			    ffs_cgupdate(ump, MNT_WAIT) == 0 &&
 			    fs->fs_clean & FS_WASCLEAN) {
@@ -447,8 +510,24 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
 				fs->fs_clean = FS_ISCLEAN;
 				(void) ffs_sbupdate(ump, MNT_WAIT);
 			}
+			if (error == 0)
+				UFS_WAPBL_END(mp);
 			if (error)
 				return (error);
+		}
+
+#ifdef WAPBL
+		if ((mp->mnt_flag & MNT_LOG) == 0) {
+			error = ffs_wapbl_stop(mp, mp->mnt_flag & MNT_FORCE);
+			if (error)
+				return error;
+		}
+#endif /* WAPBL */
+
+		if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
+			/*
+			 * Finish change from r/w to r/o
+			 */
 			fs->fs_ronly = 1;
 			fs->fs_fmod = 0;
 		}
@@ -508,9 +587,30 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
 				if (error)
 					return (error);
 			}
+#ifdef WAPBL
+			if (fs->fs_flags & FS_DOWAPBL) {
+				printf("%s: replaying log to disk\n",
+				    fs->fs_fsmnt);
+				KDASSERT(mp->mnt_wapbl_replay);
+				error = wapbl_replay_write(mp->mnt_wapbl_replay,
+							   devvp);
+				if (error) {
+					return error;
+				}
+				wapbl_replay_stop(mp->mnt_wapbl_replay);
+				fs->fs_clean = FS_WASCLEAN;
+			}
+#endif /* WAPBL */
 			if (fs->fs_snapinum[0] != 0)
 				ffs_snapshot_mount(mp);
 		}
+
+#ifdef WAPBL
+		error = ffs_wapbl_start(mp);
+		if (error)
+			return error;
+#endif /* WAPBL */
+
 		if (args->fspec == NULL)
 			return EINVAL;
 		if ((mp->mnt_flag & (MNT_SOFTDEP | MNT_ASYNC)) ==
@@ -531,17 +631,24 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
 	else
 		fs->fs_flags &= ~FS_DOSOFTDEP;
 	if (fs->fs_fmod != 0) {	/* XXX */
+		int err;
+
 		fs->fs_fmod = 0;
 		if (fs->fs_clean & FS_WASCLEAN)
 			fs->fs_time = time_second;
 		else {
-			printf("%s: file system not clean (fs_clean=%x); please fsck(8)\n",
-			    mp->mnt_stat.f_mntfromname, fs->fs_clean);
+			printf("%s: file system not clean (fs_clean=%#x); "
+			    "please fsck(8)\n", mp->mnt_stat.f_mntfromname,
+			    fs->fs_clean);
 			printf("%s: lost blocks %" PRId64 " files %d\n",
 			    mp->mnt_stat.f_mntfromname, fs->fs_pendingblocks,
 			    fs->fs_pendinginodes);
 		}
-		(void) ffs_cgupdate(ump, MNT_WAIT);
+		err = UFS_WAPBL_BEGIN(mp);
+		if (err == 0) {
+			(void) ffs_cgupdate(ump, MNT_WAIT);
+			UFS_WAPBL_END(mp);
+		}
 	}
 	return (error);
 
@@ -659,7 +766,7 @@ ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l)
 			return (error);
 		}
 		error = ffs_appleufs_validate(fs->fs_fsmnt,
-			(struct appleufslabel *)bp->b_data,NULL);
+			(struct appleufslabel *)bp->b_data, NULL);
 		if (error == 0)
 			ump->um_flags |= UFS_ISAPPLEUFS;
 		brelse(bp, 0);
@@ -686,6 +793,17 @@ ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l)
 	ffs_oldfscompat_read(fs, ump, sblockloc);
 	mutex_enter(&ump->um_lock);
 	ump->um_maxfilesize = fs->fs_maxfilesize;
+
+	if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) {
+		uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n",
+		    mp->mnt_stat.f_mntonname, fs->fs_flags,
+		    (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+		if ((mp->mnt_flag & MNT_FORCE) == 0) {
+			mutex_exit(&ump->um_lock);
+			return (EINVAL);
+		}
+	}
+
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
@@ -839,6 +957,17 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 	if (error)
 		return error;
 
+	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK);
+	memset(ump, 0, sizeof *ump);
+	mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE);
+	error = ffs_snapshot_init(ump);
+	if (error)
+		goto out;
+	ump->um_ops = &ffs_ufsops;
+
+#ifdef WAPBL
+ sbagain:
+#endif
 	/*
 	 * Try reading the superblock in each of its possible locations.
 	 */
@@ -916,15 +1045,7 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 
 	fs = malloc((u_long)sbsize, M_UFSMNT, M_WAITOK);
 	memcpy(fs, bp->b_data, sbsize);
-
-	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK);
-	memset(ump, 0, sizeof *ump);
-	mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE);
-	error = ffs_snapshot_init(ump);
-	if (error)
-		goto out;
 	ump->um_fs = fs;
-	ump->um_ops = &ffs_ufsops;
 
 #ifdef FFS_EI
 	if (needswap) {
@@ -934,9 +1055,52 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 #endif
 		fs->fs_flags &= ~FS_SWAPPED;
 
+#ifdef WAPBL
+	if ((mp->mnt_wapbl_replay == 0) && (fs->fs_flags & FS_DOWAPBL)) {
+		error = ffs_wapbl_replay_start(mp, fs, devvp);
+		if (error)
+			goto out;
+
+		if (!ronly) {
+			/* XXX fsmnt may be stale. */
+			printf("%s: replaying log to disk\n", fs->fs_fsmnt);
+			error = wapbl_replay_write(mp->mnt_wapbl_replay, devvp);
+			if (error)
+				goto out;
+			wapbl_replay_stop(mp->mnt_wapbl_replay);
+			fs->fs_clean = FS_WASCLEAN;
+		} else {
+			/* XXX fsmnt may be stale */
+			printf("%s: replaying log to memory\n", fs->fs_fsmnt);
+		}
+
+		/* Force a re-read of the superblock */
+		brelse(bp, BC_INVAL);
+		bp = NULL;
+		free(fs, M_UFSMNT);
+		fs = NULL;
+		goto sbagain;
+	}
+#else /* !WAPBL */
+	if ((fs->fs_flags & FS_DOWAPBL) && (mp->mnt_flag & MNT_FORCE) == 0) {
+		error = EPERM;
+		goto out;
+	}
+#endif /* !WAPBL */
+
 	ffs_oldfscompat_read(fs, ump, sblockloc);
 	ump->um_maxfilesize = fs->fs_maxfilesize;
 
+	if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) {
+		uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n",
+		    mp->mnt_stat.f_mntonname, fs->fs_flags,
+		    (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting");
+		if ((mp->mnt_flag & MNT_FORCE) == 0) {
+			error = EINVAL;
+			goto out;
+		}
+	}
+
 	if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) {
 		fs->fs_pendingblocks = 0;
 		fs->fs_pendinginodes = 0;
@@ -966,7 +1130,7 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 		if (error)
 			goto out;
 		error = ffs_appleufs_validate(fs->fs_fsmnt,
-			(struct appleufslabel *)bp->b_data,NULL);
+			(struct appleufslabel *)bp->b_data, NULL);
 		if (error == 0) {
 			ump->um_flags |= UFS_ISAPPLEUFS;
 		}
@@ -980,6 +1144,36 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 	}
 #endif
 
+#if 0
+/*
+ * XXX This code changes the behaviour of mounting dirty filesystems, to
+ * XXX require "mount -f ..." to mount them.  This doesn't match what
+ * XXX mount(8) describes and is disabled for now.
+ */
+	/*
+	 * If the file system is not clean, don't allow it to be mounted
+	 * unless MNT_FORCE is specified.  (Note: MNT_FORCE is always set
+	 * for the root file system.)
+	 */
+	if (fs->fs_flags & FS_DOWAPBL) {
+		/*
+		 * wapbl normally expects to be FS_WASCLEAN when the FS_DOWAPBL
+		 * bit is set, although there's a window in unmount where it
+		 * could be FS_ISCLEAN
+		 */
+		if ((mp->mnt_flag & MNT_FORCE) == 0 &&
+		    (fs->fs_clean & (FS_WASCLEAN | FS_ISCLEAN)) == 0) {
+			error = EPERM;
+			goto out;
+		}
+	} else
+		if ((fs->fs_clean & FS_ISCLEAN) == 0 &&
+		    (mp->mnt_flag & MNT_FORCE) == 0) {
+			error = EPERM;
+			goto out;
+		}
+#endif
+
 	/*
 	 * verify that we can access the last block in the fs
 	 * if we're mounting read/write.
@@ -999,10 +1193,12 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 	}
 
 	fs->fs_ronly = ronly;
-	if (ronly == 0) {
-		fs->fs_clean <<= 1;
-		fs->fs_fmod = 1;
-	}
+	/* Don't bump fs_clean if we're replaying journal */
+	if (!((fs->fs_flags & FS_DOWAPBL) && (fs->fs_clean & FS_WASCLEAN)))
+		if (ronly == 0) {
+			fs->fs_clean <<= 1;
+			fs->fs_fmod = 1;
+		}
 	size = fs->fs_cssize;
 	blks = howmany(size, fs->fs_fsize);
 	if (fs->fs_contigsumsize > 0)
@@ -1095,6 +1291,24 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 			goto out;
 		}
 	}
+
+#ifdef WAPBL
+	if (!ronly) {
+		KDASSERT(fs->fs_ronly == 0);
+		/*
+		 * ffs_wapbl_start() needs mp->mnt_stat initialised if it
+		 * needs to create a new log file in-filesystem.
+		 */
+		ffs_statvfs(mp, &mp->mnt_stat);
+
+		error = ffs_wapbl_start(mp);
+		if (error) {
+			free(fs->fs_csp, M_UFSMNT);
+			goto out;
+		}
+	}
+#endif /* WAPBL */
+
 	if (ronly == 0 && fs->fs_snapinum[0] != 0)
 		ffs_snapshot_mount(mp);
 #ifdef UFS_EXTATTR
@@ -1115,6 +1329,15 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
 #endif /* UFS_EXTATTR */
 	return (0);
 out:
+#ifdef WAPBL
+	if (mp->mnt_wapbl_replay) {
+		if (wapbl_replay_isopen(mp->mnt_wapbl_replay))
+			wapbl_replay_stop(mp->mnt_wapbl_replay);
+		wapbl_replay_free(mp->mnt_wapbl_replay);
+		mp->mnt_wapbl_replay = 0;
+	}
+#endif
+
 	fstrans_unmount(mp);
 	if (fs)
 		free(fs, M_UFSMNT);
@@ -1175,7 +1398,7 @@ ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, daddr_t sblockloc)
 	fs->fs_csaddr = fs->fs_old_csaddr;
 	fs->fs_sblockloc = sblockloc;
 
-        fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL);
+	fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL);
 
 	if (fs->fs_old_postblformat == FS_42POSTBLFMT) {
 		fs->fs_old_nrpos = 8;
@@ -1256,6 +1479,9 @@ ffs_unmount(struct mount *mp, int mntflags)
 	struct ufsmount *ump = VFSTOUFS(mp);
 	struct fs *fs = ump->um_fs;
 	int error, flags, penderr;
+#ifdef WAPBL
+	extern int doforce;
+#endif
 
 	penderr = 0;
 	flags = 0;
@@ -1284,25 +1510,42 @@ ffs_unmount(struct mount *mp, int mntflags)
 		penderr = 1;
 	}
 	mutex_exit(&ump->um_lock);
-	if (fs->fs_ronly == 0 &&
-	    ffs_cgupdate(ump, MNT_WAIT) == 0 &&
-	    fs->fs_clean & FS_WASCLEAN) {
-		/*
-		 * XXXX don't mark fs clean in the case of softdep
-		 * pending block errors, until they are fixed.
-		 */
-		if (penderr == 0) {
-			if (mp->mnt_flag & MNT_SOFTDEP)
-				fs->fs_flags &= ~FS_DOSOFTDEP;
-			fs->fs_clean = FS_ISCLEAN;
+	error = UFS_WAPBL_BEGIN(mp);
+	if (error == 0)
+		if (fs->fs_ronly == 0 &&
+		    ffs_cgupdate(ump, MNT_WAIT) == 0 &&
+		    fs->fs_clean & FS_WASCLEAN) {
+			/*
+			 * XXXX don't mark fs clean in the case of softdep
+			 * pending block errors, until they are fixed.
+			 */
+			if (penderr == 0) {
+				if (mp->mnt_flag & MNT_SOFTDEP)
+					fs->fs_flags &= ~FS_DOSOFTDEP;
+				fs->fs_clean = FS_ISCLEAN;
+			}
+			fs->fs_fmod = 0;
+			(void) ffs_sbupdate(ump, MNT_WAIT);
 		}
-		fs->fs_fmod = 0;
-		(void) ffs_sbupdate(ump, MNT_WAIT);
+	if (error == 0)
+		UFS_WAPBL_END(mp);
+#ifdef WAPBL
+	KASSERT(!(mp->mnt_wapbl_replay && mp->mnt_wapbl));
+	if (mp->mnt_wapbl_replay) {
+		KDASSERT(fs->fs_ronly);
+		wapbl_replay_stop(mp->mnt_wapbl_replay);
+		wapbl_replay_free(mp->mnt_wapbl_replay);
+		mp->mnt_wapbl_replay = 0;
 	}
+	error = ffs_wapbl_stop(mp, doforce && (mntflags & MNT_FORCE));
+	if (error) {
+		return error;
+	}
+#endif /* WAPBL */
 	if (ump->um_devvp->v_type != VBAD)
 		ump->um_devvp->v_specmountpoint = NULL;
 	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
-	(void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE,
+	(void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE,
 		NOCRED);
 	vput(ump->um_devvp);
 	free(fs->fs_csp, M_UFSMNT);
@@ -1335,7 +1578,7 @@ ffs_flushfiles(struct mount *mp, int flags, struct lwp *l)
 #ifdef QUOTA
 	if (mp->mnt_flag & MNT_QUOTA) {
 		int i;
-		if ((error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) != 0)
+		if ((error = vflush(mp, NULLVP, SKIPSYSTEM | flags)) != 0)
 			return (error);
 		for (i = 0; i < MAXQUOTAS; i++) {
 			if (ump->um_quotas[i] == NULLVP)
@@ -1363,6 +1606,19 @@ ffs_flushfiles(struct mount *mp, int flags, struct lwp *l)
 	vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_FSYNC(ump->um_devvp, l->l_cred, FSYNC_WAIT, 0, 0);
 	VOP_UNLOCK(ump->um_devvp, 0);
+	if (flags & FORCECLOSE) /* XXXDBJ */
+		error = 0;
+
+#ifdef WAPBL
+	if (error)
+		return error;
+	if (mp->mnt_wapbl) {
+		error = wapbl_flush(mp->mnt_wapbl, 1);
+		if (flags & FORCECLOSE)
+			error = 0;
+	}
+#endif
+
 	return (error);
 }
 
@@ -1447,10 +1703,11 @@ loop:
 			continue;
 		mutex_enter(&vp->v_interlock);
 		ip = VTOI(vp);
-		if (ip == NULL || (vp->v_iflag & (VI_XLOCK|VI_CLEAN)) != 0 ||
+		/* XXXpooka: why wapbl check? */
+		if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 ||
 		    vp->v_type == VNON || ((ip->i_flag &
 		    (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 &&
-		    LIST_EMPTY(&vp->v_dirtyblkhd) &&
+		    (LIST_EMPTY(&vp->v_dirtyblkhd) || (mp->mnt_wapbl)) &&
 		    UVM_OBJ_IS_CLEAN(&vp->v_uobj)))
 		{
 			mutex_exit(&vp->v_interlock);
@@ -1471,11 +1728,16 @@ loop:
 			}
 			continue;
 		}
-		if (vp->v_type == VREG && waitfor == MNT_LAZY)
-			error = ffs_update(vp, NULL, NULL, 0);
-		else
-			error = VOP_FSYNC(vp, cred,
-			    waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0);
+		if (vp->v_type == VREG && waitfor == MNT_LAZY) {
+			error = UFS_WAPBL_BEGIN(vp->v_mount);
+			if (!error) {
+				error = ffs_update(vp, NULL, NULL, 0);
+				UFS_WAPBL_END(vp->v_mount);
+			}
+		} else {
+			error = VOP_FSYNC(vp, cred, FSYNC_NOLOG |
+			    (waitfor == MNT_WAIT ? FSYNC_WAIT : 0), 0, 0);
+		}
 		if (error)
 			allerror = error;
 		vput(vp);
@@ -1498,10 +1760,11 @@ loop:
 	    !LIST_EMPTY(&ump->um_devvp->v_dirtyblkhd))) {
 		vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY);
 		if ((error = VOP_FSYNC(ump->um_devvp, cred,
-		    waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0)
+		    (waitfor == MNT_WAIT ? FSYNC_WAIT : 0) | FSYNC_NOLOG,
+		    0, 0)) != 0)
 			allerror = error;
 		VOP_UNLOCK(ump->um_devvp, 0);
-		if (allerror == 0 && waitfor == MNT_WAIT) {
+		if (allerror == 0 && waitfor == MNT_WAIT && !mp->mnt_wapbl) {
 			mutex_enter(&mntvnode_lock);
 			goto loop;
 		}
@@ -1515,9 +1778,24 @@ loop:
 	if (fs->fs_fmod != 0) {
 		fs->fs_fmod = 0;
 		fs->fs_time = time_second;
-		if ((error = ffs_cgupdate(ump, waitfor)))
+		error = UFS_WAPBL_BEGIN(mp);
+		if (error)
+			allerror = error;
+		else {
+			if ((error = ffs_cgupdate(ump, waitfor)))
+				allerror = error;
+				UFS_WAPBL_END(mp);
+		}
+	}
+
+#ifdef WAPBL
+	if (mp->mnt_wapbl) {
+		error = wapbl_flush(mp->mnt_wapbl, 0);
+		if (error)
 			allerror = error;
 	}
+#endif
+
 	fstrans_done(mp);
 	vnfree(mvp);
 	return (allerror);
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 071a78b9ceed..881476b4166a 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -1,4 +1,33 @@
-/*	$NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $	*/
+/*	$NetBSD: ffs_vnops.c,v 1.100 2008/07/31 05:38:06 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -32,7 +61,12 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.100 2008/07/31 05:38:06 simonb Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#include "opt_wapbl.h"
+#endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -48,6 +82,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $");
 #include <sys/pool.h>
 #include <sys/signalvar.h>
 #include <sys/kauth.h>
+#include <sys/wapbl.h>
 #include <sys/fstrans.h>
 
 #include <miscfs/fifofs/fifo.h>
@@ -58,6 +93,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $");
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_wapbl.h>
 
 #include <ufs/ffs/fs.h>
 #include <ufs/ffs/ffs_extern.h>
@@ -246,6 +282,9 @@ ffs_fsync(void *v)
 	int bsize;
 	daddr_t blk_high;
 	struct vnode *vp;
+#ifdef WAPBL
+	struct mount *mp;
+#endif
 
 	vp = ap->a_vp;
 
@@ -255,7 +294,11 @@ ffs_fsync(void *v)
 	 */
 	if ((ap->a_offlo == 0 && ap->a_offhi == 0) || DOINGSOFTDEP(vp) ||
 	    (vp->v_type != VREG)) {
-		error = ffs_full_fsync(vp, ap->a_flags);
+		int flags = ap->a_flags;
+
+		if (vp->v_type == VBLK)
+			flags |= FSYNC_VFS;
+		error = ffs_full_fsync(vp, flags);
 		goto out;
 	}
 
@@ -276,6 +319,36 @@ ffs_fsync(void *v)
 		goto out;
 	}
 
+#ifdef WAPBL
+	mp = wapbl_vptomp(vp);
+	if (mp->mnt_wapbl) {
+		if (ap->a_flags & FSYNC_DATAONLY) {
+			fstrans_done(vp->v_mount);
+			return 0;
+		}
+		error = 0;
+		if (vp->v_tag == VT_UFS && VTOI(vp)->i_flag &
+		    (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY |
+				 IN_MODIFIED | IN_ACCESSED)) {
+			error = UFS_WAPBL_BEGIN(mp);
+			if (error) {
+				fstrans_done(vp->v_mount);
+				return error;
+			}
+			error = ffs_update(vp, NULL, NULL,
+				(ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
+			UFS_WAPBL_END(mp);
+		}
+		if (error || (ap->a_flags & FSYNC_NOLOG)) {
+			fstrans_done(vp->v_mount);
+			return error;
+		}
+		error = wapbl_flush(mp->mnt_wapbl, 0);
+		fstrans_done(vp->v_mount);
+		return error;
+	}
+#endif /* WAPBL */
+
 	/*
 	 * Then, flush indirect blocks.
 	 */
@@ -350,7 +423,7 @@ ffs_full_fsync(struct vnode *vp, int flags)
 	 */
 
 	if (vp->v_type == VREG || vp->v_type == VBLK) {
-		if ((flags & FSYNC_VFS) != 0)
+		if ((flags & FSYNC_VFS) != 0 && vp->v_specmountpoint != NULL)
 			mp = vp->v_specmountpoint;
 		else
 			mp = vp->v_mount;
@@ -360,8 +433,55 @@ ffs_full_fsync(struct vnode *vp, int flags)
 			PGO_FREE : 0));
 		if (error)
 			return error;
-	} else
+	} else {
+		mp = vp->v_mount;
 		mutex_exit(&vp->v_interlock);
+	}
+
+#ifdef WAPBL
+	if (mp && mp->mnt_wapbl) {
+		error = 0;
+		if (flags & FSYNC_DATAONLY)
+			return error;
+
+		if (VTOI(vp) && (VTOI(vp)->i_flag &
+		    (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY |
+				 IN_MODIFIED | IN_ACCESSED))) {
+			error = UFS_WAPBL_BEGIN(mp);
+			if (error)
+				return error;
+			error = ffs_update(vp, NULL, NULL,
+				(flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
+			UFS_WAPBL_END(mp);
+		}
+		if (error || (flags & FSYNC_NOLOG))
+			return error;
+		/*
+		 * Don't flush the log if the vnode being flushed
+		 * contains no dirty buffers that could be in the log.
+		 */
+		if (!((flags & FSYNC_RECLAIM) &&
+		    LIST_EMPTY(&vp->v_dirtyblkhd))) {
+			error = wapbl_flush(mp->mnt_wapbl, 0);
+			if (error)
+				return error;
+		}
+
+		/*
+		 * XXX temporary workaround for "dirty bufs" panic in
+		 * vinvalbuf.  need a full fix for the v_numoutput
+		 * waiters issues.
+		 */
+		if (flags & FSYNC_WAIT) {
+			mutex_enter(&vp->v_interlock);
+			while (vp->v_numoutput)
+				cv_wait(&vp->v_cv, &vp->v_interlock);
+			mutex_exit(&vp->v_interlock);
+		}
+
+		return error;
+	}
+#endif /* WAPBL */
 
 	passes = NIADDR + 1;
 	skipmeta = 0;
@@ -453,8 +573,10 @@ loop:
 
 	if (error == 0 && flags & FSYNC_CACHE) {
 		int i = 0;
-		if ((flags & FSYNC_VFS) == 0)
+		if ((flags & FSYNC_VFS) == 0) {
+			KASSERT(VTOI(vp) != NULL);
 			vp = VTOI(vp)->i_devvp;
+		}
 		VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE, curlwp->l_cred);
 	}
 
diff --git a/sys/ufs/ffs/ffs_wapbl.c b/sys/ufs/ffs/ffs_wapbl.c
new file mode 100644
index 000000000000..e91050533413
--- /dev/null
+++ b/sys/ufs/ffs/ffs_wapbl.c
@@ -0,0 +1,858 @@
+/*	$NetBSD: ffs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_ffs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/file.h>
+#include <sys/disk.h>
+#include <sys/disklabel.h>
+#include <sys/ioctl.h>
+#include <sys/errno.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+
+#undef	WAPBL_DEBUG
+#ifdef WAPBL_DEBUG
+int ffs_wapbl_debug = 1;
+#define DPRINTF(fmt, args...)						\
+do {									\
+	if (ffs_wapbl_debug)						\
+		printf("%s:%d "fmt, __func__ , __LINE__, ##args);	\
+} while (/* CONSTCOND */0)
+#else
+#define	DPRINTF(fmt, args...)						\
+do {									\
+	/* nothing */							\
+} while (/* CONSTCOND */0)
+#endif
+
+static int wapbl_log_position(struct mount *, struct fs *, struct vnode *,
+    daddr_t *, size_t *, size_t *, uint64_t *);
+static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *,
+    daddr_t *, size_t *, size_t *, uint64_t *);
+static void wapbl_find_log_start(struct mount *, struct vnode *, off_t,
+    daddr_t *, daddr_t *, size_t *);
+static int wapbl_remove_log(struct mount *);
+static int wapbl_allocate_log_file(struct mount *, struct vnode *);
+
+/*
+ * Invoked after a log has been replayed to disk: walk the inodes
+ * recorded in the replay state, free those still in mode zero, and
+ * then detach the replay state from the mount and free it.
+ */
+void
+ffs_wapbl_replay_finish(struct mount *mp)
+{
+	struct wapbl_replay *wr = mp->mnt_wapbl_replay;
+	int i;
+	int error;
+
+	if (!wr)	/* no replay state attached; nothing to finish */
+		return;
+
+	KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
+
+	for (i = 0; i < wr->wr_inodescnt; i++) {
+		struct vnode *vp;
+		struct inode *ip;
+		error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp);
+		if (error) {
+			printf("ffs_wapbl_replay_finish: "
+			    "unable to cleanup inode %" PRIu32 "\n",
+			    wr->wr_inodes[i].wr_inumber);
+			continue;	/* best effort: skip this inode */
+		}
+		ip = VTOI(vp);
+		KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number);
+		printf("ffs_wapbl_replay_finish: "
+		    "cleaning inode %" PRIu64 " size=%" PRIu64 " mode=%o nlink=%d\n",
+		    ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink);
+		KASSERT(ip->i_nlink == 0);
+
+		/*
+		 * The journal may have left partially allocated inodes in mode
+		 * zero.  This may occur if a crash occurs between the node
+		 * allocation in ffs_nodeallocg and when the node is properly
+		 * initialized in ufs_makeinode.  If so, just deallocate them.
+		 */
+		if (ip->i_mode == 0) {
+			UFS_WAPBL_BEGIN(mp);	/* XXX result unchecked */
+			ffs_vfree(vp, ip->i_number, wr->wr_inodes[i].wr_imode);
+			UFS_WAPBL_END(mp);
+		}
+		vput(vp);
+	}
+	mp->mnt_wapbl_replay = 0;	/* detach before freeing */
+	wapbl_replay_free(wr);
+}
+
+/* Callback handed to wapbl_start(): free queued blocks, then ffs_cgupdate(). */
+void
+ffs_wapbl_sync_metadata(struct mount *mp, daddr_t *deallocblks,
+    int *dealloclens, int dealloccnt)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	int i, error;
+
+#ifdef WAPBL_DEBUG_INODES
+	ufs_wapbl_verify_inodes(mp, "ffs_wapbl_sync_metadata");
+#endif
+
+	for (i = 0; i< dealloccnt; i++) {
+		/*
+		 * ffs_blkfree() reports no errors; it might silently fail
+		 * if it cannot read the cylinder group block.
+		 */
+		ffs_blkfree(fs, ump->um_devvp,
+		    dbtofsb(fs, deallocblks[i]), dealloclens[i], -1);
+	}
+
+	fs->fs_fmod = 0;
+	fs->fs_time = time_second;
+	error = ffs_cgupdate(ump, 0);
+	KASSERT(error == 0);	/* XXX failure is asserted, not handled */
+}
+
+void
+ffs_wapbl_abort_sync_metadata(struct mount *mp, daddr_t *deallocblks,
+    int *dealloclens, int dealloccnt)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	int i;
+	/* Abort callback: take back the blocks freed by the sync callback. */
+	/*
+	 * I suppose we could dig around for an in-use inode, but
+	 * it's not really used by ffs_blkalloc, so we just fake
+	 * the couple of fields that it touches.
+	 */
+	struct inode in;
+	in.i_fs = fs;
+	in.i_devvp = ump->um_devvp;
+	in.i_dev = ump->um_dev;
+	in.i_number = -1;	/* placeholder: no real inode backs "in" */
+	in.i_uid = 0;
+	for (i = 0; i < dealloccnt; i++) {
+		/*
+		 * Since the blkfree in ffs_wapbl_sync_metadata() may have
+		 * failed, this blkalloc might fail as well, so don't check its
+		 * error.  If that blkfree succeeded, this shouldn't fail since
+		 * the buffer will be locked in the current transaction.
+		 */
+		ffs_blkalloc(&in, dbtofsb(fs, deallocblks[i]),
+		    dealloclens[i]);
+	}
+}
+
+static int
+wapbl_remove_log(struct mount *mp)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	struct vnode *vp;
+	struct inode *ip;
+	ino_t log_ino;
+	int error;
+	/* Discard the journal recorded in the superblock; 0 or EINVAL. */
+	/* If all the log locators are 0, just clean up */
+	if (fs->fs_journallocs[0] == 0 &&
+	    fs->fs_journallocs[1] == 0 &&
+	    fs->fs_journallocs[2] == 0 &&
+	    fs->fs_journallocs[3] == 0) {
+		DPRINTF("empty locators, just clear\n");
+		goto done;
+	}
+
+	switch (fs->fs_journal_location) {
+	case UFS_WAPBL_JOURNALLOC_NONE:
+		/* nothing! */
+		DPRINTF("no log\n");
+		break;
+
+	case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+		log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO];
+		DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino);
+
+		/* if no existing log inode, just clear all fields and bail */
+		if (log_ino == 0)
+			goto done;
+		error = VFS_VGET(mp, log_ino, &vp);
+		if (error != 0) {
+			printf("ffs_wapbl: vget failed %d\n",
+			    error);
+			/* clear out log info on error */
+			goto done;
+		}
+		ip = VTOI(vp);
+		KASSERT(log_ino == ip->i_number);
+		if ((ip->i_flags & SF_LOG) == 0) {
+			printf("ffs_wapbl: try to clear non-log inode "
+			    "%" PRId64 "\n", log_ino);
+			vput(vp);
+			/* clear out log info on error */
+			goto done;
+		}
+
+		/*
+		 * Remove the log inode by setting its link count back
+		 * to zero, then go on to clear the superblock fields.
+		 */
+		ip->i_ffs_effnlink = 0;
+		ip->i_nlink = 0;
+		DIP_ASSIGN(ip, nlink, 0);
+		if (DOINGSOFTDEP(vp))
+			softdep_change_linkcnt(ip);
+		vput(vp);
+		/* FALLTHROUGH - the next case only logs and breaks */
+	case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+		DPRINTF("end-of-partition log\n");
+		/* no extra work required */
+		break;
+
+	default:
+		printf("ffs_wapbl: unknown journal type %d\n",
+		    fs->fs_journal_location);
+		return EINVAL;	/* superblock journal fields left untouched */
+	}
+
+
+done:
+	/* Clear out all previous knowledge of journal */
+	fs->fs_journal_version = 0;
+	fs->fs_journal_location = 0;
+	fs->fs_journal_flags = 0;
+	fs->fs_journallocs[0] = 0;
+	fs->fs_journallocs[1] = 0;
+	fs->fs_journallocs[2] = 0;
+	fs->fs_journallocs[3] = 0;
+	(void) ffs_sbupdate(ump, MNT_WAIT);	/* write error ignored */
+
+	return 0;
+}
+
+int
+ffs_wapbl_start(struct mount *mp)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	struct vnode *devvp = ump->um_devvp;
+	daddr_t off;
+	size_t count;
+	size_t blksize;
+	uint64_t extradata;
+	int error;
+	/* Enable logging if MNT_LOG is set, then finish any pending replay. */
+	if (mp->mnt_wapbl == 0) {
+		if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) {
+			/* Clear out any existing journal file */
+			error = wapbl_remove_log(mp);
+			if (error != 0)
+				return error;
+		}
+
+		if (mp->mnt_flag & MNT_LOG) {
+			KDASSERT(fs->fs_ronly == 0);
+
+			error = wapbl_log_position(mp, fs, devvp, &off,
+			    &count, &blksize, &extradata);
+			if (error)
+				return error;
+
+			/* XXX any other consistency checks here? */
+			if (blksize != DEV_BSIZE) {
+				printf("%s: bad blocksize %zu\n", __func__,
+				    blksize);
+				return EINVAL;
+			}
+
+			error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off,
+			    count, blksize, mp->mnt_wapbl_replay,
+			    ffs_wapbl_sync_metadata,
+			    ffs_wapbl_abort_sync_metadata);
+			if (error)
+				return error;
+
+			mp->mnt_wapbl_op = &wapbl_ops;
+
+#ifdef WAPBL_DEBUG
+			printf("%s: enabling logging\n", fs->fs_fsmnt);
+#endif
+
+			if ((fs->fs_flags & FS_DOWAPBL) == 0) {
+				UFS_WAPBL_BEGIN(mp); /* XXX result unchecked */
+				fs->fs_flags |= FS_DOWAPBL;
+				error = ffs_sbupdate(ump, MNT_WAIT);
+				if (error) {
+					UFS_WAPBL_END(mp);
+					ffs_wapbl_stop(mp, MNT_FORCE);
+					return error;
+				}
+				UFS_WAPBL_END(mp);
+				error = wapbl_flush(mp->mnt_wapbl, 1);
+				if (error) {
+					ffs_wapbl_stop(mp, MNT_FORCE);
+					return error;
+				}
+			}
+		} else if (fs->fs_flags & FS_DOWAPBL) {
+			fs->fs_fmod = 1;
+			fs->fs_flags &= ~FS_DOWAPBL;	/* logging is off */
+		}
+	}
+
+	/*
+	 * It is recommended that you finish replay with logging enabled.
+	 * However, even if logging is not enabled, the remaining log
+	 * replay should be safely recoverable with an fsck, so perform
+	 * it anyway.
+	 */
+	if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) {
+		int saveflag = mp->mnt_flag & MNT_RDONLY;
+		/*
+		 * Make sure MNT_RDONLY is not set so that the inode
+		 * cleanup in ufs_inactive will actually do its work.
+		 */
+		mp->mnt_flag &= ~MNT_RDONLY;
+		ffs_wapbl_replay_finish(mp);
+		mp->mnt_flag |= saveflag;	/* restore MNT_RDONLY if it was set */
+		KASSERT(fs->fs_ronly == 0);
+	}
+
+	return 0;
+}
+
+int
+ffs_wapbl_stop(struct mount *mp, int force)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	int error;
+	/* Stop logging on mp; with force set, tear down even after errors. */
+	if (mp->mnt_wapbl) {
+		KDASSERT(fs->fs_ronly == 0);
+
+		/*
+		 * Make sure that turning off FS_DOWAPBL is the only
+		 * change in the final flush, since otherwise
+		 * a transaction may reorder writes.
+		 */
+		error = wapbl_flush(mp->mnt_wapbl, 1);
+		if (error && !force)
+			return error;
+		if (error && force)
+			goto forceout;
+		error = UFS_WAPBL_BEGIN(mp);
+		if (error && !force)
+			return error;
+		if (error && force)
+			goto forceout;
+		KASSERT(fs->fs_flags & FS_DOWAPBL);
+
+		fs->fs_flags &= ~FS_DOWAPBL;
+		error = ffs_sbupdate(ump, MNT_WAIT);
+		KASSERT(error == 0);	/* XXX a bit drastic! */
+		UFS_WAPBL_END(mp);
+	forceout:
+		error = wapbl_stop(mp->mnt_wapbl, force);
+		if (error) {
+			KASSERT(!force);
+			fs->fs_flags |= FS_DOWAPBL;	/* still logging */
+			return error;
+		}
+		fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */
+		mp->mnt_wapbl = 0;
+
+#ifdef WAPBL_DEBUG
+		printf("%s: disabled logging\n", fs->fs_fsmnt);
+#endif
+	}
+
+	return 0;
+}
+
+int
+ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp)
+{
+	int error;
+	daddr_t off;
+	size_t count;
+	size_t blksize;
+	uint64_t extradata;	/* filled in by the call below, not used */
+	/* Locate the journal and hand it to wapbl_replay_start(). */
+	error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize,
+	    &extradata);
+
+	if (error)
+		return error;
+
+	error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off,
+		count, blksize);
+	if (error)
+		return error;
+
+	mp->mnt_wapbl_op = &wapbl_ops;
+
+	return 0;
+}
+
+/*
+ * If the superblock doesn't already have a recorded journal location
+ * then we allocate the journal in one of two positions:
+ *
+ *  - At the end of the partition after the filesystem if there's
+ *    enough space.  "Enough space" is defined as >= 1MB of journal
+ *    per 1GB of filesystem or 64MB, whichever is smaller.
+ *
+ *  - Inside the filesystem.  We try to allocate a contiguous journal
+ *    based on the total filesystem size - the target is 1MB of journal
+ *    per 1GB of filesystem, up to a maximum journal size of 64MB.  As
+ *    a worst case allowing for fragmentation, we'll allocate a journal
+ *    1/4 of the desired size but never smaller than 1MB.
+ *
+ *    XXX In the future if we allow for non-contiguous journal files we
+ *    can tighten the above restrictions.
+ *
+ * XXX
+ * This seems like a lot of duplication both here and in some of
+ * the userland tools (fsck_ffs, dumpfs, tunefs) with similar
+ * "switch (fs_journal_location)" constructs.  Can we centralise
+ * this sort of code somehow/somewhere?
+ */
+static int
+wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp,
+    daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct partinfo dpart;
+	daddr_t logstart, logend, desired_logsize;
+	size_t blksize;
+	int error;
+	/* A journal already recorded in the superblock is returned as-is. */
+	if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
+		switch (fs->fs_journal_location) {
+		case UFS_WAPBL_JOURNALLOC_END_PARTITION:
+			DPRINTF("found existing end-of-partition log\n");
+			*startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR];
+			*countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
+			*blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ];
+			DPRINTF(" start = %" PRId64 ", size = %zu, "
+			    "blksize = %zu\n", *startp, *countp, *blksizep);
+			return 0;	/* NOTE: *extradatap is left unset */
+
+		case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
+			DPRINTF("found existing in-filesystem log\n");
+			*startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR];
+			*countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+			*blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+			DPRINTF(" start = %" PRId64 ", size = %zu, "
+			    "blksize = %zu\n", *startp, *countp, *blksizep);
+			return 0;	/* NOTE: *extradatap is left unset */
+
+		default:
+			printf("ffs_wapbl: unknown journal type %d\n",
+			    fs->fs_journal_location);
+			return EINVAL;
+		}
+	}
+
+	desired_logsize =
+	    lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE;
+	DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024);
+	desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
+	desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
+	DPRINTF("adjusted desired log size = %" PRId64 " kB\n",
+	    desired_logsize / 1024);
+
+	/* Is there space after the filesystem on the partition for a log? */
+	logstart = fsbtodb(fs, fs->fs_size);
+	error = VOP_IOCTL(devvp, DIOCGPART, &dpart, FREAD, FSCRED);
+	if (!error) {
+		logend  = dpart.part->p_size;
+		blksize = dpart.disklab->d_secsize;
+	} else {
+		struct dkwedge_info dkw;
+		error = VOP_IOCTL(devvp, DIOCGWEDGEINFO, &dkw, FREAD, FSCRED);
+		if (error)
+			return error;
+
+		blksize = DEV_BSIZE;
+		logend = dkw.dkw_size;
+	}
+	/* logend/logstart count disk blocks; desired_logsize is in bytes */
+	if ((logend - logstart) * DEV_BSIZE >= desired_logsize) {
+		KDASSERT(blksize != 0);
+		DPRINTF("enough space, use end-of-partition log\n");
+
+		*startp = logstart;
+		*countp = (logend - logstart);
+		*blksizep = blksize;
+		*extradatap = 0;
+
+		/* update superblock with log location */
+		fs->fs_journal_version = UFS_WAPBL_VERSION;
+		fs->fs_journal_location = UFS_WAPBL_JOURNALLOC_END_PARTITION;
+		fs->fs_journal_flags = 0;
+		fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp;
+		fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp;
+		fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep;
+		fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap;
+
+		error = ffs_sbupdate(ump, MNT_WAIT);
+		return error;
+	}
+	DPRINTF("end-of-partition has only %" PRId64 " free\n",
+	    logend - logstart);
+
+	error = wapbl_create_infs_log(mp, fs, devvp, startp, countp, blksizep,
+	    extradatap);
+
+	ffs_sync(mp, 1, FSCRED);	/* XXX result ignored */
+
+	return error;
+}
+
+/*
+ * Try to create a journal log inside the filesystem (as an SF_LOG inode).
+ */
+static int
+wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp,
+    daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap)
+{
+	struct vnode *vp, *rvp;
+	struct inode *ip;
+	int error;
+
+	if ((error = VFS_ROOT(mp, &rvp)) != 0)
+		return error;
+
+	if ((error = UFS_VALLOC(rvp, 0 | S_IFREG, NOCRED, &vp)) != 0) {
+		vput(rvp);
+		return error;
+	}
+	vput(rvp);	/* root vnode was only needed for UFS_VALLOC() */
+
+	vp->v_type = VREG;
+	ip = VTOI(vp);
+	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
+	ip->i_mode = 0 | IFREG;
+	DIP_ASSIGN(ip, mode, ip->i_mode);
+	ip->i_flags = SF_LOG;
+	DIP_ASSIGN(ip, flags, ip->i_flags);
+	ip->i_ffs_effnlink = 1;
+	ip->i_nlink = 1;
+	DIP_ASSIGN(ip, nlink, 1);
+	if (DOINGSOFTDEP(vp))
+		softdep_change_linkcnt(ip);
+	ffs_update(vp, NULL, NULL, UPDATE_WAIT);	/* XXX result unchecked */
+
+	if ((error = wapbl_allocate_log_file(mp, vp)) != 0) {
+		/*
+		 * If we couldn't allocate the space for the log file,
+		 * remove the inode by setting its link count back to
+		 * zero and bail.
+		 */
+		ip->i_ffs_effnlink = 0;
+		ip->i_nlink = 0;
+		DIP_ASSIGN(ip, nlink, 0);
+		if (DOINGSOFTDEP(vp))
+			softdep_change_linkcnt(ip);
+		vput(vp);
+
+		return error;
+	}
+
+	/*
+	 * Now that we have the place-holder inode for the journal,
+	 * we don't need the vnode ever again.
+	 */
+	vput(vp);
+	/* report the location wapbl_allocate_log_file() recorded in fs */
+	*startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR];
+	*countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+	*blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
+	*extradatap = fs->fs_journallocs[UFS_WAPBL_INFS_INO];
+
+	return 0;
+}
+
+static int
+wapbl_allocate_log_file(struct mount *mp, struct vnode *vp)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	daddr_t addr, indir_addr;
+	off_t logsize;
+	size_t size;
+	int error;
+	/* Give vp a contiguous log extent and record it in the superblock. */
+	logsize = 0;	/* 0 lets wapbl_find_log_start() pick a size */
+	/* check if there's a suggested log size */
+	if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG &&
+	    fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM)
+		logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
+
+	if (vp->v_size > 0) {
+		printf("%s: file size (%" PRId64 ") non zero\n", __func__,
+		    vp->v_size);
+		return EEXIST;
+	}
+	wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size);
+	if (addr == 0) {	/* no usable extent was found */
+		printf("%s: log not allocated, largest extent is "
+		    "%" PRId64 "MB\n", __func__,
+		    lblktosize(fs, size) / (1024 * 1024));
+		return ENOSPC;
+	}
+
+	logsize = lblktosize(fs, size);	/* final log size */
+
+	VTOI(vp)->i_ffs_first_data_blk = addr;
+	VTOI(vp)->i_ffs_first_indir_blk = indir_addr;
+
+	error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED);
+	if (error) {
+		printf("%s: GOP_ALLOC error %d\n", __func__, error);
+		return error;
+	}
+
+	fs->fs_journal_version = UFS_WAPBL_VERSION;
+	fs->fs_journal_location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM;
+	fs->fs_journal_flags = 0;
+	fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] =
+	    lfragtosize(fs, addr) / DEV_BSIZE;
+	fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = logsize / DEV_BSIZE;
+	fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = DEV_BSIZE;
+	fs->fs_journallocs[UFS_WAPBL_INFS_INO] = VTOI(vp)->i_number;
+
+	error = ffs_sbupdate(ump, MNT_WAIT);
+	return error;
+}
+
+/*
+ * Find a suitable location for the journal in the filesystem.
+ *
+ * Our strategy here is to look for a contiguous block of free space
+ * at least "logsize" bytes in size (plus room for any indirect blocks).
+ * We start at the middle of the filesystem and check each cylinder
+ * group working outwards.  If "logsize" bytes are not available as a
+ * single contiguous chunk, then return the address and size of the
+ * largest chunk found.
+ *
+ * XXX
+ * At what stage does the search fail?  Is failing when the largest space
+ * we could find is less than a quarter of the requested space reasonable?
+ * If the search fails entirely, a block address of "0" indicates this.
+ */
+static void
+wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize,
+    daddr_t *addr, daddr_t *indir_addr, size_t *size)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct fs *fs = ump->um_fs;
+	struct vnode *devvp = ump->um_devvp;
+	struct cg *cgp;
+	struct buf *bp;
+	uint8_t *blksfree;
+	daddr_t blkno, best_addr, start_addr;
+	daddr_t desired_blks, min_desired_blks;
+	daddr_t freeblks, best_blks;
+	int bpcg, cg, error, fixedsize, indir_blks, n, s;
+#ifdef FFS_EI
+	const int needswap = UFS_FSNEEDSWAP(fs);
+#endif
+
+	if (logsize == 0) {
+		fixedsize = 0;	/* We can adjust the size if tight */
+		logsize = lfragtosize(fs, fs->fs_dsize) /
+		    UFS_WAPBL_JOURNAL_SCALE;
+		DPRINTF("suggested log size = %" PRId64 "\n", logsize);
+		logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE);
+		logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE);
+		DPRINTF("adjusted log size = %" PRId64 "\n", logsize);
+	} else {
+		fixedsize = 1;	/* caller asked for exactly this size */
+		DPRINTF("fixed log size = %" PRId64 "\n", logsize);
+	}
+
+	desired_blks = logsize / fs->fs_bsize;	/* bytes -> fs blocks */
+	DPRINTF("desired blocks = %" PRId64 "\n", desired_blks);
+
+	/* add in number of indirect blocks needed */
+	indir_blks = 0;
+	if (desired_blks >= NDADDR) {
+		struct indir indirs[NIADDR + 2];
+		int num;
+
+		error = ufs_getlbns(vp, desired_blks, indirs, &num);
+		if (error) {
+			printf("%s: ufs_getlbns failed, error %d!\n",
+			    __func__, error);
+			goto bad;
+		}
+
+		switch (num) {
+		case 2:
+			indir_blks = 1;		/* 1st level indirect */
+			break;
+		case 3:
+			indir_blks = 1 +	/* 1st level indirect */
+			    1 +			/* 2nd level indirect */
+			    indirs[1].in_off + 1; /* extra 1st level indirect */
+			break;
+		default:
+			printf("%s: unexpected numlevels %d from ufs_getlbns\n",
+			    __func__, num);
+			*size = 0;	/* "bad" sets this again */
+			goto bad;
+		}
+		desired_blks += indir_blks;
+	}
+	DPRINTF("desired blocks = %" PRId64 " (including indirect)\n",
+	    desired_blks);
+
+	/*
+	 * If a specific size wasn't requested, allow for a smaller log
+	 * if we're really tight for space...
+	 */
+	min_desired_blks = desired_blks;
+	if (!fixedsize)
+		min_desired_blks = desired_blks / 4;
+
+	/* Look at number of blocks per CG.  If it's too small, bail early. */
+	bpcg = fragstoblks(fs, fs->fs_fpg);
+	if (min_desired_blks > bpcg) {
+		printf("ffs_wapbl: cylinder group size of %" PRId64 " MB "
+		    " is not big enough for journal\n",
+		    lblktosize(fs, bpcg) / (1024 * 1024));
+		goto bad;
+	}
+
+	/*
+	 * Start with the middle cylinder group, and search outwards in
+	 * both directions until we either find the requested log size
+	 * or reach the start/end of the file system.  If we reach the
+	 * start/end without finding enough space for the full requested
+	 * log size, use the largest extent found if it is large enough
+	 * to satisfy our minimum size.
+	 *
+	 * XXX
+	 * Can we just use the cluster contigsum stuff (esp on UFS2)
+	 * here to simplify this search code?
+	 */
+	best_addr = 0;
+	best_blks = 0;
+	for (cg = fs->fs_ncg / 2, s = 0, n = 1;
+	    best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg;
+	    s++, n = -n, cg += n * s) {	/* mid, mid-1, mid+1, mid-2, ... */
+		DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg);
+		error = bread(devvp, fsbtodb(fs, cgtod(fs, cg)),
+		    fs->fs_cgsize, FSCRED, 0, &bp);
+		cgp = (struct cg *)bp->b_data;
+		if (error || !cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
+			brelse(bp, 0);	/* assumes bp is valid even on error */
+			continue;
+		}
+		/* NOTE(review): needswap is only declared under FFS_EI */
+		blksfree = cg_blksfree(cgp, needswap);
+		/* scan this cg for the longest run of free blocks */
+		for (blkno = 0; blkno < bpcg;) {
+			/* look for next free block */
+			/* XXX use scanc() and fragtbl[] here? */
+			for (; blkno < bpcg - min_desired_blks; blkno++)
+				if (ffs_isblock(fs, blksfree, blkno))
+					break;
+
+			/* past end of search space in this CG? */
+			if (blkno >= bpcg - min_desired_blks)
+				break;
+
+			/* count how many free blocks in this extent */
+			start_addr = blkno;
+			for (freeblks = 0; blkno < bpcg; blkno++, freeblks++)
+				if (!ffs_isblock(fs, blksfree, blkno))
+					break;
+
+			if (freeblks > best_blks) {
+				best_blks = freeblks;
+				best_addr = blkstofrags(fs, start_addr) +
+				    cgbase(fs, cg);
+
+				if (freeblks >= desired_blks) {
+					DPRINTF("found len %" PRId64
+					    " at offset %" PRId64 " in gc\n",
+					    freeblks, start_addr);
+					break;	/* cg loop ends via best_blks */
+				}
+			}
+		}
+		brelse(bp, 0);
+	}
+	DPRINTF("best found len = %" PRId64 ", wanted %" PRId64
+	    " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr);
+
+	if (best_blks < min_desired_blks) {
+		*addr = 0;	/* nothing big enough: report failure */
+		*indir_addr = 0;
+	} else {
+		/* put indirect blocks at start, and data blocks after */
+		*addr = best_addr + blkstofrags(fs, indir_blks);
+		*indir_addr = best_addr;
+	}
+	*size = min(desired_blks, best_blks) - indir_blks; /* data blocks only */
+	return;
+
+bad:
+	*addr = 0;
+	*indir_addr = 0;
+	*size = 0;
+	return;
+}
diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h
index 4d0ee8bc5f29..e2763799dbb7 100644
--- a/sys/ufs/ffs/fs.h
+++ b/sys/ufs/ffs/fs.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: fs.h,v 1.49 2007/12/25 18:33:49 perry Exp $	*/
+/*	$NetBSD: fs.h,v 1.50 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1993
@@ -327,7 +327,12 @@ struct fs {
 	int32_t	 fs_old_cpc;		/* cyl per cycle in postbl */
 /* this area is otherwise allocated unless fs_old_flags & FS_FLAGS_UPDATED */
 	int32_t	 fs_maxbsize;		/* maximum blocking factor permitted */
-	int64_t	 fs_sparecon64[17];	/* old rotation block list head */
+	uint8_t	 fs_journal_version;	/* journal format version */
+	uint8_t	 fs_journal_location;	/* journal location type */
+	uint8_t	 fs_journal_reserved[2];/* reserved for future use */
+	uint32_t fs_journal_flags;	/* journal flags */
+	uint64_t fs_journallocs[4];	/* location info for journal */
+	int64_t	 fs_sparecon64[12];	/* reserved for future use */
 	int64_t	 fs_sblockloc;		/* byte offset of standard superblock */
 	struct	csum_total fs_cstotal;	/* cylinder summary information */
 	int64_t  fs_time;		/* last time written */
@@ -406,13 +411,17 @@ struct fs {
 /*
  * File system flags
  */
-#define	FS_UNCLEAN	0x01	/* file system not clean at mount (unused) */
-#define	FS_DOSOFTDEP	0x02	/* file system using soft dependencies */
-#define FS_NEEDSFSCK	0x04	/* needs sync fsck (FreeBSD compat, unused) */
-#define FS_INDEXDIRS	0x08	/* kernel supports indexed directories */
-#define FS_ACLS		0x10	/* file system has ACLs enabled */
-#define FS_MULTILABEL	0x20	/* file system is MAC multi-label */
+#define	FS_UNCLEAN	0x001	/* file system not clean at mount (unused) */
+#define	FS_DOSOFTDEP	0x002	/* file system using soft dependencies */
+#define FS_NEEDSFSCK	0x004	/* needs sync fsck (FreeBSD compat, unused) */
+#define FS_INDEXDIRS	0x008	/* kernel supports indexed directories */
+#define FS_ACLS		0x010	/* file system has ACLs enabled */
+#define FS_MULTILABEL	0x020	/* file system is MAC multi-label */
 #define FS_FLAGS_UPDATED 0x80	/* flags have been moved to new location */
+#define FS_DOWAPBL	0x100	/* Write ahead physical block logging */
+
+/* File system flags that are ok for NetBSD if set in fs_flags */
+#define FS_KNOWN_FLAGS	(FS_DOSOFTDEP | FS_DOWAPBL)
 
 /*
  * File system internal flags, also in fs_flags.
diff --git a/sys/ufs/files.ufs b/sys/ufs/files.ufs
index df10a491d90e..88078047c6ca 100644
--- a/sys/ufs/files.ufs
+++ b/sys/ufs/files.ufs
@@ -1,4 +1,4 @@
-#	$NetBSD: files.ufs,v 1.17 2007/12/12 02:56:03 lukem Exp $
+#	$NetBSD: files.ufs,v 1.18 2008/07/31 05:38:06 simonb Exp $
 
 deffs	fs_ffs.h			FFS
 deffs					EXT2FS
@@ -34,6 +34,7 @@ file	ufs/ffs/ffs_subr.c		ffs | mfs | ext2fs
 file	ufs/ffs/ffs_tables.c		ffs | mfs | ext2fs
 file	ufs/ffs/ffs_vfsops.c		ffs | mfs | ext2fs
 file	ufs/ffs/ffs_vnops.c		ffs | mfs | ext2fs
+file	ufs/ffs/ffs_wapbl.c		ffs & wapbl
 file	ufs/ffs/ffs_appleufs.c		ffs & apple_ufs
 
 file	ufs/lfs/lfs_alloc.c		lfs
@@ -62,3 +63,4 @@ file	ufs/ufs/ufs_lookup.c		ffs | lfs | mfs | ext2fs
 file	ufs/ufs/ufs_quota.c		quota & (ffs | lfs | mfs | ext2fs)
 file	ufs/ufs/ufs_vfsops.c		ffs | lfs | mfs | ext2fs
 file	ufs/ufs/ufs_vnops.c		ffs | lfs | mfs | ext2fs
+file	ufs/ufs/ufs_wapbl.c		ffs & wapbl
diff --git a/sys/ufs/ufs/Makefile b/sys/ufs/ufs/Makefile
index adb42dd39a56..e288c1cdc14c 100644
--- a/sys/ufs/ufs/Makefile
+++ b/sys/ufs/ufs/Makefile
@@ -1,8 +1,8 @@
-#	$NetBSD: Makefile,v 1.5 2005/12/11 12:25:28 christos Exp $
+#	$NetBSD: Makefile,v 1.6 2008/07/31 05:38:06 simonb Exp $
 
 INCSDIR= /usr/include/ufs/ufs
 
 INCS=	dinode.h dir.h extattr.h inode.h quota.h ufs_bswap.h ufs_extern.h \
-	ufsmount.h
+	ufs_wapbl.h ufsmount.h
 
 .include <bsd.kinc.mk>
diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h
index 8447acd1fa36..c85552d41a5c 100644
--- a/sys/ufs/ufs/inode.h
+++ b/sys/ufs/ufs/inode.h
@@ -1,4 +1,4 @@
-/*	$NetBSD: inode.h,v 1.51 2008/01/09 16:15:23 ad Exp $	*/
+/*	$NetBSD: inode.h,v 1.52 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*
  * Copyright (c) 1982, 1989, 1993
@@ -51,6 +51,9 @@
  */
 struct ffs_inode_ext {
 	daddr_t *ffs_snapblklist;	/* Collect expunged snapshot blocks. */
+	/* The next two fields are used by contiguous allocation code only. */
+	daddr_t ffs_first_data_blk;	/* first data block on disk. */
+	daddr_t ffs_first_indir_blk;	/* first indirect block on disk. */
 };
 
 struct ext2fs_inode_ext {
@@ -113,6 +116,8 @@ struct inode {
 		struct  lfs_inode_ext *lfs;
 	} inode_ext;
 #define	i_snapblklist		inode_ext.ffs.ffs_snapblklist
+#define	i_ffs_first_data_blk	inode_ext.ffs.ffs_first_data_blk
+#define	i_ffs_first_indir_blk	inode_ext.ffs.ffs_first_indir_blk
 #define	i_e2fs_last_lblk	inode_ext.e2fs.ext2fs_last_lblk
 #define	i_e2fs_last_blk		inode_ext.e2fs.ext2fs_last_blk
 	/*
@@ -219,7 +224,7 @@ struct inode {
 #define	IN_CLEANING	0x0100		/* LFS: file is being cleaned */
 #define	IN_ADIROP	0x0200		/* LFS: dirop in progress */
 #define IN_SPACECOUNTED	0x0400		/* Blocks to be freed in free count. */
-#define IN_PAGING       0x1000          /* LFS: file is on paging queue */
+#define IN_PAGING       0x1000		/* LFS: file is on paging queue */
 
 #if defined(_KERNEL)
 
diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c
index 383d57013b1f..4ea34181c362 100644
--- a/sys/ufs/ufs/ufs_inode.c
+++ b/sys/ufs/ufs/ufs_inode.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_inode.c,v 1.75 2008/01/17 10:39:15 ad Exp $	*/
+/*	$NetBSD: ufs_inode.c,v 1.76 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*
  * Copyright (c) 1991, 1993
@@ -37,11 +37,12 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.75 2008/01/17 10:39:15 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.76 2008/07/31 05:38:06 simonb Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
 #include "opt_quota.h"
+#include "opt_wapbl.h"
 #endif
 
 #include <sys/param.h>
@@ -52,12 +53,14 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.75 2008/01/17 10:39:15 ad Exp $");
 #include <sys/kernel.h>
 #include <sys/namei.h>
 #include <sys/kauth.h>
+#include <sys/wapbl.h>
 #include <sys/fstrans.h>
 #include <sys/kmem.h>
 
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
@@ -84,6 +87,9 @@ ufs_inactive(void *v)
 	struct mount *transmp;
 	mode_t mode;
 	int error = 0;
+	int logged = 0;
+
+	UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount);
 
 	transmp = vp->v_mount;
 	fstrans_start(transmp, FSTRANS_SHARED);
@@ -96,6 +102,10 @@ ufs_inactive(void *v)
 		softdep_releasefile(ip);
 
 	if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
+		error = UFS_WAPBL_BEGIN(vp->v_mount);
+		if (error)
+			goto out;
+		logged = 1;
 #ifdef QUOTA
 		(void)chkiq(ip, -1, NOCRED, 0);
 #endif
@@ -103,7 +113,35 @@ ufs_inactive(void *v)
 		ufs_extattr_vnode_inactive(vp, curlwp);
 #endif
 		if (ip->i_size != 0) {
-			error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED);
+			/*
+			 * When journaling, only truncate one indirect block
+			 * at a time
+			 */
+			if (vp->v_mount->mnt_wapbl) {
+				uint64_t incr = MNINDIR(ip->i_ump) <<
+				    vp->v_mount->mnt_fs_bshift; /* Power of 2 */
+				uint64_t base = NDADDR <<
+				    vp->v_mount->mnt_fs_bshift;
+				while (!error && ip->i_size > base + incr) {
+					/*
+					 * round down to next full indirect
+					 * block boundary.
+					 */
+					uint64_t nsize = base +
+					    ((ip->i_size - base - 1) &
+					    ~(incr - 1));
+					error = UFS_TRUNCATE(vp, nsize, 0,
+					    NOCRED);
+					if (error)
+						break;
+					UFS_WAPBL_END(vp->v_mount);
+					error = UFS_WAPBL_BEGIN(vp->v_mount);
+					if (error)
+						goto out;
+				}
+			}
+			if (!error)
+				error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED);
 		}
 		/*
 		 * Setting the mode to zero needs to wait for the inode
@@ -125,8 +163,16 @@ ufs_inactive(void *v)
 	}
 
 	if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
+		if (!logged++) {
+			int err;
+			err = UFS_WAPBL_BEGIN(vp->v_mount);
+			if (err)
+				goto out;
+		}
 		UFS_UPDATE(vp, NULL, NULL, 0);
 	}
+	if (logged)
+		UFS_WAPBL_END(vp->v_mount);
 out:
 	/*
 	 * If we are done with the inode, reclaim it
@@ -149,6 +195,10 @@ ufs_reclaim(struct vnode *vp)
 	if (prtactive && vp->v_usecount > 1)
 		vprint("ufs_reclaim: pushing active", vp);
 
+	if (!UFS_WAPBL_BEGIN(vp->v_mount)) {
+		UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
+		UFS_WAPBL_END(vp->v_mount);
+	}
 	UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
 
 	/*
diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c
index 9303494fc4f3..9f2b39555aed 100644
--- a/sys/ufs/ufs/ufs_lookup.c
+++ b/sys/ufs/ufs/ufs_lookup.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp $	*/
+/*	$NetBSD: ufs_lookup.c,v 1.99 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*
  * Copyright (c) 1989, 1993
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.99 2008/07/31 05:38:06 simonb Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_ffs.h"
@@ -53,6 +53,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp
 #include <sys/vnode.h>
 #include <sys/kernel.h>
 #include <sys/kauth.h>
+#include <sys/wapbl.h>
 #include <sys/fstrans.h>
 #include <sys/proc.h>
 #include <sys/kmem.h>
@@ -65,6 +66,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 #include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_wapbl.h>
 
 #include "fs_ffs.h"
 
@@ -158,7 +160,7 @@ ufs_lookup(void *v)
 		return (error);
 
 	if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
-	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
+	    (nameiop == DELETE || nameiop == RENAME))
 		return (EROFS);
 
 	/*
@@ -495,6 +497,7 @@ found:
 		dp->i_size = dp->i_offset + DIRSIZ(FSFMT(vdp), ep, needswap);
 		DIP_ASSIGN(dp, size, dp->i_size);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
+		UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
 	}
 	brelse(bp, 0);
 
@@ -690,11 +693,12 @@ ufs_dirbadentry(struct vnode *dp, struct direct *ep, int entryoffsetinblock)
 		DIRSIZ(FSFMT(dp), ep, needswap) ||
 	    namlen > FFS_MAXNAMLEN) {
 		/*return (1); */
-		printf("First bad, reclen=%x, DIRSIZ=%lu, namlen=%d, flags=%x "
-			"entryoffsetinblock=%d, dirblksiz = %d\n",
+		printf("First bad, reclen=%#x, DIRSIZ=%lu, namlen=%d, "
+			"flags=%#x, entryoffsetinblock=%d, dirblksiz = %d\n",
 			ufs_rw16(ep->d_reclen, needswap),
 			(u_long)DIRSIZ(FSFMT(dp), ep, needswap),
-			namlen, dp->v_mount->mnt_flag, entryoffsetinblock,dirblksiz);
+			namlen, dp->v_mount->mnt_flag, entryoffsetinblock,
+			dirblksiz);
 		goto bad;
 	}
 	if (ep->d_ino == 0)
@@ -762,6 +766,8 @@ ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp,
 	const int needswap = UFS_MPNEEDSWAP(ump);
 	int dirblksiz = ump->um_dirblksiz;
 
+	UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
+
 	error = 0;
 	cr = cnp->cn_cred;
 	l = curlwp;
@@ -882,6 +888,7 @@ ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp,
 		dp->i_size = dp->i_offset + dp->i_count;
 		DIP_ASSIGN(dp, size, dp->i_size);
 		dp->i_flag |= IN_CHANGE | IN_UPDATE;
+		UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
 	}
 	/*
 	 * Get the block containing the space for the new directory entry.
@@ -1014,6 +1021,7 @@ ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp,
 		if (DOINGSOFTDEP(dvp) && (tvp != NULL))
 			vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
 	}
+	UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
 	return (error);
 }
 
@@ -1040,6 +1048,8 @@ ufs_dirremove(struct vnode *dvp, struct inode *ip, int flags, int isrmdir)
 	const int needswap = UFS_MPNEEDSWAP(dp->i_ump);
 #endif
 
+	UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
+
 	if (flags & DOWHITEOUT) {
 		/*
 		 * Whiteout entry: set d_ino to WINO.
@@ -1105,6 +1115,7 @@ out:
 			ip->i_nlink--;
 			DIP_ASSIGN(ip, nlink, ip->i_nlink);
 			ip->i_flag |= IN_CHANGE;
+			UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0);
 		}
 		error = VOP_BWRITE(bp);
 	}
@@ -1118,6 +1129,7 @@ out:
 	if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 &&
 	    ip->i_ffs_effnlink == 0)
 		ffs_snapgone(ip);
+	UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0);
 #endif
 	return (error);
 }
@@ -1151,6 +1163,7 @@ ufs_dirrewrite(struct inode *dp, struct inode *oip, ino_t newinum, int newtype,
 		oip->i_nlink--;
 		DIP_ASSIGN(oip, nlink, oip->i_nlink);
 		oip->i_flag |= IN_CHANGE;
+		UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP);
 		error = VOP_BWRITE(bp);
 	}
 	dp->i_flag |= iflags;
@@ -1162,6 +1175,7 @@ ufs_dirrewrite(struct inode *dp, struct inode *oip, ino_t newinum, int newtype,
 	 */
 	if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_ffs_effnlink == 0)
 		ffs_snapgone(oip);
+	UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
 #endif
 	return (error);
 }
@@ -1333,8 +1347,8 @@ ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp,
 	const int bsize = 1 << bshift;
 	off_t eof;
 
-	blks = kmem_alloc((1+dirrablks) * sizeof(daddr_t), KM_SLEEP);
-	blksizes = kmem_alloc((1+dirrablks) * sizeof(int), KM_SLEEP);
+	blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP);
+	blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP);
 	ip = VTOI(vp);
 	KASSERT(vp->v_size == ip->i_size);
 	GOP_SIZE(vp, vp->v_size, &eof, 0);
@@ -1370,7 +1384,7 @@ ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp,
 	*bpp = bp;
 
  out:
-	kmem_free(blks, (1+dirrablks) * sizeof(daddr_t));
-	kmem_free(blksizes, (1+dirrablks) * sizeof(int));
+	kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t));
+	kmem_free(blksizes, (1 + dirrablks) * sizeof(int));
 	return error;
 }
diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c
index ce722980c027..9e9d81c1fe5a 100644
--- a/sys/ufs/ufs/ufs_readwrite.c
+++ b/sys/ufs/ufs/ufs_readwrite.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_readwrite.c,v 1.88 2008/05/16 09:22:01 hannken Exp $	*/
+/*	$NetBSD: ufs_readwrite.c,v 1.89 2008/07/31 05:38:06 simonb Exp $	*/
 
 /*-
  * Copyright (c) 1993
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.88 2008/05/16 09:22:01 hannken Exp $");
+__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.89 2008/07/31 05:38:06 simonb Exp $");
 
 #ifdef LFS_READWRITE
 #define	FS			struct lfs
@@ -43,6 +43,9 @@ __KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.88 2008/05/16 09:22:01 hannken E
 #define	WRITE_S			"lfs_write"
 #define	fs_bsize		lfs_bsize
 #define	fs_bmask		lfs_bmask
+#define	UFS_WAPBL_BEGIN(mp)	0
+#define	UFS_WAPBL_END(mp)	do { } while (0)
+#define	UFS_WAPBL_UPDATE(vp, access, modify, flags)	do { } while (0)
 #else
 #define	FS			struct fs
 #define	I_FS			i_fs
@@ -177,8 +180,15 @@ READ(void *v)
  out:
 	if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
 		ip->i_flag |= IN_ACCESS;
-		if ((ap->a_ioflag & IO_SYNC) == IO_SYNC)
+		if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) {
+			error = UFS_WAPBL_BEGIN(vp->v_mount);
+			if (error) {
+				fstrans_done(vp->v_mount);
+				return error;
+			}
 			error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+			UFS_WAPBL_END(vp->v_mount);
+		}
 	}
 
 	fstrans_done(vp->v_mount);
@@ -283,6 +293,15 @@ WRITE(void *v)
 	error = 0;
 
 	usepc = vp->v_type == VREG;
+
+	if ((ioflag & IO_JOURNALLOCKED) == 0) {
+		error = UFS_WAPBL_BEGIN(vp->v_mount);
+		if (error) {
+			fstrans_done(vp->v_mount);
+			return error;
+		}
+	}
+
 #ifdef LFS_READWRITE
 	async = true;
 	lfs_check(vp, LFS_UNUSED_LBN, 0);
@@ -511,8 +530,11 @@ out:
 		uio->uio_resid = resid;
 	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
 		error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
+	else
+		UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
 	KASSERT(vp->v_size == ip->i_size);
-
+	if ((ioflag & IO_JOURNALLOCKED) == 0)
+		UFS_WAPBL_END(vp->v_mount);
 	fstrans_done(vp->v_mount);
 
 	return (error);
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 241a96a86cac..c749518f6c26 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -1,4 +1,33 @@
-/*	$NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $	*/
+/*	$NetBSD: ufs_vnops.c,v 1.167 2008/07/31 05:38:06 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993, 1995
@@ -37,7 +66,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.167 2008/07/31 05:38:06 simonb Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
@@ -60,6 +89,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $");
 #include <sys/dirent.h>
 #include <sys/lockf.h>
 #include <sys/kauth.h>
+#include <sys/wapbl.h>
 #include <sys/fstrans.h>
 
 #include <miscfs/specfs/specdev.h>
@@ -70,6 +100,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $");
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_bswap.h>
 #include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
@@ -105,13 +136,20 @@ ufs_create(void *v)
 	} */ *ap = v;
 	int	error;
 
+	/*
+	 * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+	 * ufs_makeinode
+	 */
 	fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
 	error =
 	    ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 			  ap->a_dvp, ap->a_vpp, ap->a_cnp);
-	fstrans_done(ap->a_dvp->v_mount);
-	if (error)
+	if (error) {
+		fstrans_done(ap->a_dvp->v_mount);
 		return (error);
+	}
+	UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
+	fstrans_done(ap->a_dvp->v_mount);
 	VN_KNOTE(ap->a_dvp, NOTE_WRITE);
 	return (0);
 }
@@ -138,6 +176,11 @@ ufs_mknod(void *v)
 
 	vap = ap->a_vap;
 	vpp = ap->a_vpp;
+
+	/*
+	 * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+	 * ufs_makeinode
+	 */
 	fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
 	if ((error =
 	    ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
@@ -161,6 +204,8 @@ ufs_mknod(void *v)
 			ip->i_ffs2_rdev = ufs_rw64(vap->va_rdev,
 			    UFS_MPNEEDSWAP(ump));
 	}
+	UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0);
+	UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
 	/*
 	 * Remove inode so that it will be reloaded by VFS_VGET and
 	 * checked to see if it is an alias of an existing entry in
@@ -394,8 +439,8 @@ ufs_setattr(void *v)
 			goto out;
 		}
 		if (kauth_cred_geteuid(cred) != ip->i_uid &&
-		    (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
-		    NULL)))
+		    (error = kauth_authorize_generic(cred,
+		    KAUTH_GENERIC_ISSUSER, NULL)))
 			goto out;
 		if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
 		    NULL) == 0) {
@@ -411,6 +456,9 @@ ufs_setattr(void *v)
 				error = EPERM;
 				goto out;
 			}
+			error = UFS_WAPBL_BEGIN(vp->v_mount);
+			if (error)
+				goto out;
 			ip->i_flags = vap->va_flags;
 			DIP_ASSIGN(ip, flags, ip->i_flags);
 		} else {
@@ -424,11 +472,16 @@ ufs_setattr(void *v)
 				error = EPERM;
 				goto out;
 			}
+			error = UFS_WAPBL_BEGIN(vp->v_mount);
+			if (error)
+				goto out;
 			ip->i_flags &= SF_SETTABLE;
 			ip->i_flags |= (vap->va_flags & UF_SETTABLE);
 			DIP_ASSIGN(ip, flags, ip->i_flags);
 		}
 		ip->i_flag |= IN_CHANGE;
+		UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+		UFS_WAPBL_END(vp->v_mount);
 		if (vap->va_flags & (IMMUTABLE | APPEND)) {
 			error = 0;
 			goto out;
@@ -446,7 +499,11 @@ ufs_setattr(void *v)
 			error = EROFS;
 			goto out;
 		}
+		error = UFS_WAPBL_BEGIN(vp->v_mount);
+		if (error)
+			goto out;
 		error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l);
+		UFS_WAPBL_END(vp->v_mount);
 		if (error)
 			goto out;
 	}
@@ -466,14 +523,46 @@ ufs_setattr(void *v)
 			break;
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY) {
-				 error = EROFS;
-				 goto out;
+				error = EROFS;
+				goto out;
 			}
 			if ((ip->i_flags & SF_SNAPSHOT) != 0) {
 				error = EPERM;
 				goto out;
 			}
-			error = UFS_TRUNCATE(vp, vap->va_size, 0, cred);
+			error = UFS_WAPBL_BEGIN(vp->v_mount);
+			if (error)
+				goto out;
+			/*
+			 * When journaling, only truncate one indirect block
+			 * at a time.
+			 */
+			if (vp->v_mount->mnt_wapbl) {
+				uint64_t incr = MNINDIR(ip->i_ump) <<
+				    vp->v_mount->mnt_fs_bshift; /* Power of 2 */
+				uint64_t base = NDADDR <<
+				    vp->v_mount->mnt_fs_bshift;
+				while (!error && ip->i_size > base + incr &&
+				    ip->i_size > vap->va_size + incr) {
+					/*
+					 * round down to next full indirect
+					 * block boundary.
+					 */
+					uint64_t nsize = base +
+					    ((ip->i_size - base - 1) &
+					    ~(incr - 1));
+					error = UFS_TRUNCATE(vp, nsize, 0,
+					    cred);
+					if (error == 0) {
+						UFS_WAPBL_END(vp->v_mount);
+						error =
+						   UFS_WAPBL_BEGIN(vp->v_mount);
+					}
+				}
+			}
+			if (!error)
+				error = UFS_TRUNCATE(vp, vap->va_size, 0, cred);
+			UFS_WAPBL_END(vp->v_mount);
 			if (error)
 				goto out;
 			break;
@@ -494,11 +583,14 @@ ufs_setattr(void *v)
 			goto out;
 		}
 		if (kauth_cred_geteuid(cred) != ip->i_uid &&
-		    (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER,
-		    NULL)) &&
+		    (error = kauth_authorize_generic(cred,
+		    KAUTH_GENERIC_ISSUSER, NULL)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, cred))))
 			goto out;
+		error = UFS_WAPBL_BEGIN(vp->v_mount);
+		if (error)
+			goto out;
 		if (vap->va_atime.tv_sec != VNOVAL)
 			if (!(vp->v_mount->mnt_flag & MNT_NOATIME))
 				ip->i_flag |= IN_ACCESS;
@@ -510,6 +602,7 @@ ufs_setattr(void *v)
 			ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec;
 		}
 		error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0);
+		UFS_WAPBL_END(vp->v_mount);
 		if (error)
 			goto out;
 	}
@@ -525,7 +618,11 @@ ufs_setattr(void *v)
 			error = EPERM;
 			goto out;
 		}
+		error = UFS_WAPBL_BEGIN(vp->v_mount);
+		if (error)
+			goto out;
 		error = ufs_chmod(vp, (int)vap->va_mode, cred, l);
+		UFS_WAPBL_END(vp->v_mount);
 	}
 	VN_KNOTE(vp, NOTE_ATTRIB);
 out:
@@ -543,6 +640,8 @@ ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l)
 	struct inode	*ip;
 	int		error, ismember = 0;
 
+	UFS_WAPBL_JLOCK_ASSERT(vp->v_mount);
+
 	ip = VTOI(vp);
 	if (kauth_cred_geteuid(cred) != ip->i_uid &&
 	    (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL)))
@@ -558,6 +657,7 @@ ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l)
 	ip->i_mode |= (mode & ALLPERMS);
 	ip->i_flag |= IN_CHANGE;
 	DIP_ASSIGN(ip, mode, ip->i_mode);
+	UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
 	return (0);
 }
 
@@ -626,6 +726,7 @@ ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred,
  good:
 #endif /* QUOTA */
 	ip->i_flag |= IN_CHANGE;
+	UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
 	return (0);
 }
 
@@ -649,8 +750,13 @@ ufs_remove(void *v)
 	if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) ||
 	    (VTOI(dvp)->i_flags & APPEND))
 		error = EPERM;
-	else
-		error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
+	else {
+		error = UFS_WAPBL_BEGIN(dvp->v_mount);
+		if (error == 0) {
+			error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
+			UFS_WAPBL_END(dvp->v_mount);
+		}
+	}
 	VN_KNOTE(vp, NOTE_DELETE);
 	VN_KNOTE(dvp, NOTE_WRITE);
 	if (dvp == vp)
@@ -720,6 +826,11 @@ ufs_link(void *v)
 		error = EPERM;
 		goto out1;
 	}
+	error = UFS_WAPBL_BEGIN(vp->v_mount);
+	if (error) {
+		VOP_ABORTOP(dvp, cnp);
+		goto out1;
+	}
 	ip->i_ffs_effnlink++;
 	ip->i_nlink++;
 	DIP_ASSIGN(ip, nlink, ip->i_nlink);
@@ -738,10 +849,12 @@ ufs_link(void *v)
 		ip->i_nlink--;
 		DIP_ASSIGN(ip, nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
+		UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP);
 		if (DOINGSOFTDEP(vp))
 			softdep_change_linkcnt(ip);
 	}
 	PNBUF_PUT(cnp->cn_pnbuf);
+	UFS_WAPBL_END(vp->v_mount);
  out1:
 	if (dvp != vp)
 		VOP_UNLOCK(vp, 0);
@@ -865,6 +978,11 @@ ufs_rename(void *v)
 	struct direct		*newdir;
 	int			doingdirectory, oldparent, newparent, error;
 
+#ifdef WAPBL
+	if (ap->a_tdvp->v_mount->mnt_wapbl)
+		return wapbl_ufs_rename(v);
+#endif
+
 	tvp = ap->a_tvp;
 	tdvp = ap->a_tdvp;
 	fvp = ap->a_fvp;
@@ -1297,6 +1415,9 @@ ufs_mkdir(void *v)
 	 */
 	if ((error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, ap->a_vpp)) != 0)
 		goto out;
+	error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount);
+	if (error)
+		goto out;
 	tvp = *ap->a_vpp;
 	ip = VTOI(tvp);
 	ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
@@ -1307,6 +1428,7 @@ ufs_mkdir(void *v)
 	if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		PNBUF_PUT(cnp->cn_pnbuf);
 		UFS_VFREE(tvp, ip->i_number, dmode);
+		UFS_WAPBL_END(dvp->v_mount);
 		fstrans_done(dvp->v_mount);
 		vput(tvp);
 		vput(dvp);
@@ -1412,11 +1534,13 @@ ufs_mkdir(void *v)
  bad:
 	if (error == 0) {
 		VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
+		UFS_WAPBL_END(dvp->v_mount);
 	} else {
 		dp->i_ffs_effnlink--;
 		dp->i_nlink--;
 		DIP_ASSIGN(dp, nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
+		UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
 		if (DOINGSOFTDEP(dvp))
 			softdep_change_linkcnt(dp);
 		/*
@@ -1431,8 +1555,10 @@ ufs_mkdir(void *v)
 		/* If IN_ADIROP, account for it */
 		lfs_unmark_vnode(tvp);
 #endif
+		UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP);
 		if (DOINGSOFTDEP(tvp))
 			softdep_change_linkcnt(ip);
+		UFS_WAPBL_END(dvp->v_mount);
 		vput(tvp);
 	}
  out:
@@ -1496,6 +1622,9 @@ ufs_rmdir(void *v)
 		error = EPERM;
 		goto out;
 	}
+	error = UFS_WAPBL_BEGIN(dvp->v_mount);
+	if (error)
+		goto out;
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
@@ -1515,6 +1644,7 @@ ufs_rmdir(void *v)
 			softdep_change_linkcnt(dp);
 			softdep_change_linkcnt(ip);
 		}
+		UFS_WAPBL_END(dvp->v_mount);
 		goto out;
 	}
 	VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
@@ -1531,6 +1661,7 @@ ufs_rmdir(void *v)
 		dp->i_ffs_effnlink--;
 		DIP_ASSIGN(dp, nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
+		UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
 		ip->i_nlink--;
 		ip->i_ffs_effnlink--;
 		DIP_ASSIGN(ip, nlink, ip->i_nlink);
@@ -1538,6 +1669,11 @@ ufs_rmdir(void *v)
 		error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred);
 	}
 	cache_purge(vp);
+	/*
+	 * Unlock the log while we still have reference to unlinked
+	 * directory vp so that it will not get locked for recycling
+	 */
+	UFS_WAPBL_END(dvp->v_mount);
 #ifdef UFS_DIRHASH
 	if (ip->i_dirhash != NULL)
 		ufsdirhash_free(ip);
@@ -1576,6 +1712,10 @@ ufs_symlink(void *v)
 	int		len, error;
 
 	vpp = ap->a_vpp;
+	/*
+	 * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful
+	 * ufs_makeinode
+	 */
 	fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED);
 	error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 			      vpp, ap->a_cnp);
@@ -1591,10 +1731,12 @@ ufs_symlink(void *v)
 		DIP_ASSIGN(ip, size, len);
 		uvm_vnp_setsize(vp, ip->i_size);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
+		UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
 	} else
 		error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0,
-		    UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, NULL,
-		    NULL);
+		    UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED,
+		    ap->a_cnp->cn_cred, NULL, NULL);
+	UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp);
 	if (error)
 		vput(vp);
 out:
@@ -2096,6 +2238,8 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
 	struct vnode	*tvp;
 	int		error, ismember = 0;
 
+	UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount);
+
 	pdir = VTOI(dvp);
 #ifdef DIAGNOSTIC
 	if ((cnp->cn_flags & HASBUF) == 0)
@@ -2115,9 +2259,22 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
 	DIP_ASSIGN(ip, gid, ip->i_gid);
 	ip->i_uid = kauth_cred_geteuid(cnp->cn_cred);
 	DIP_ASSIGN(ip, uid, ip->i_uid);
+	error = UFS_WAPBL_BEGIN1(dvp->v_mount, dvp);
+	if (error) {
+		/*
+		 * Note, we can't UFS_VFREE(tvp) here like we should
+		 * because we can't write to the disk.  Instead, we leave
+		 * the vnode dangling from the journal.
+		 */
+		vput(tvp);
+		PNBUF_PUT(cnp->cn_pnbuf);
+		vput(dvp);
+		return (error);
+	}
 #ifdef QUOTA
 	if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		UFS_VFREE(tvp, ip->i_number, mode);
+		UFS_WAPBL_END1(dvp->v_mount, dvp);
 		vput(tvp);
 		PNBUF_PUT(cnp->cn_pnbuf);
 		vput(dvp);
@@ -2175,9 +2332,11 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
 	/* If IN_ADIROP, account for it */
 	lfs_unmark_vnode(tvp);
 #endif
+	UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0);
 	if (DOINGSOFTDEP(tvp))
 		softdep_change_linkcnt(ip);
 	tvp->v_type = VNON;		/* explodes later if VBLK */
+	UFS_WAPBL_END1(dvp->v_mount, dvp);
 	vput(tvp);
 	PNBUF_PUT(cnp->cn_pnbuf);
 	vput(dvp);
@@ -2228,7 +2387,8 @@ ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags,
         }
 
 out:
-        return error;
+	UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
+	return error;
 }
 
 void
diff --git a/sys/ufs/ufs/ufs_wapbl.c b/sys/ufs/ufs/ufs_wapbl.c
new file mode 100644
index 000000000000..663c6e7a02c6
--- /dev/null
+++ b/sys/ufs/ufs/ufs_wapbl.c
@@ -0,0 +1,805 @@
+/*  $NetBSD: ufs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $ */
+
+/*-
+ * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993, 1995
+ *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ufs_vnops.c	8.28 (Berkeley) 7/31/95
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ufs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $");
+
+#if defined(_KERNEL_OPT)
+#include "opt_quota.h"
+#include "fs_lfs.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/namei.h>
+#include <sys/resourcevar.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+#include <sys/lockf.h>
+#include <sys/kauth.h>
+#include <sys/wapbl.h>
+#include <sys/fstrans.h>
+
+#include <miscfs/specfs/specdev.h>
+#include <miscfs/fifofs/fifo.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/ufsmount.h>
+#include <ufs/ufs/ufs_bswap.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufs_wapbl.h>
+#include <ufs/ext2fs/ext2fs_extern.h>
+#include <ufs/lfs/lfs_extern.h>
+
+#include <uvm/uvm.h>
+
+/* XXX lifted from ufs_lookup.c: true when the mount lacks IMNT_DTYPE */
+#define	FSFMT(vp)	(((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0)
+
+/*
+ * A virgin directory (no blushing please): the "." and ".." entries.
+ */
+static const struct dirtemplate mastertemplate = {
+	0,	12,		DT_DIR,	1,	".",
+	0,	DIRBLKSIZ - 12,	DT_DIR,	2,	".."
+};
+
+/*
+ * Rename vnode operation
+ * 	rename("foo", "bar");
+ * is essentially
+ *	unlink("bar");
+ *	link("foo", "bar");
+ *	unlink("foo");
+ * but ``atomically''.  Can't do full commit without saving state in the
+ * inode on disk which isn't feasible at this time.  Best we can do is
+ * always guarantee the target exists.
+ *
+ * Basic algorithm is:
+ *
+ * 1) Bump link count on source while we're linking it to the
+ *    target.  This also ensures the inode won't be deleted out
+ *    from underneath us while we work (it may be truncated by
+ *    a concurrent `trunc' or `open' for creation).
+ * 2) Link source to destination.  If destination already exists,
+ *    delete it first.
+ * 3) Unlink source reference to inode if still around. If a
+ *    directory was moved and the parent of the destination
+ *    is different from the source, patch the ".." entry in the
+ *    directory.
+ *
+ * WAPBL NOTE: wapbl_ufs_rename derived from ufs_rename in ufs_vnops.c
+ * ufs_vnops.c netbsd cvs revision 1.108
+ * which has the berkeley copyright above
+ * changes introduced to ufs_rename since netbsd cvs revision 1.164
+ * will need to be ported into wapbl_ufs_rename
+ */
+int	/* 0 on success, else an errno value */
+wapbl_ufs_rename(void *v)
+{
+	struct vop_rename_args  /* {
+		struct vnode		*a_fdvp;
+		struct vnode		*a_fvp;
+		struct componentname	*a_fcnp;
+		struct vnode		*a_tdvp;
+		struct vnode		*a_tvp;
+		struct componentname	*a_tcnp;
+	} */ *ap = v;
+	struct vnode		*tvp, *tdvp, *fvp, *fdvp;
+	struct componentname	*tcnp, *fcnp;
+	struct inode		*ip, *txp, *fxp, *tdp, *fdp;
+	struct mount		*mp;
+	struct direct		*newdir;
+	int			doingdirectory, oldparent, newparent, error;
+
+	int32_t	  saved_f_count;	/* saved_f_*: lookup state of fdp */
+	doff_t	  saved_f_diroff;
+	doff_t	  saved_f_offset;
+	u_int32_t saved_f_reclen;
+	int32_t	  saved_t_count;	/* saved_t_*: lookup state of tdp */
+	doff_t	  saved_t_endoff;
+	doff_t	  saved_t_diroff;
+	doff_t	  saved_t_offset;
+	u_int32_t saved_t_reclen;
+
+	tvp = ap->a_tvp;
+	tdvp = ap->a_tdvp;
+	fvp = ap->a_fvp;
+	fdvp = ap->a_fdvp;
+	tcnp = ap->a_tcnp;
+	fcnp = ap->a_fcnp;
+	doingdirectory = oldparent = newparent = error = 0;
+
+#ifdef DIAGNOSTIC
+	if ((tcnp->cn_flags & HASBUF) == 0 ||
+	    (fcnp->cn_flags & HASBUF) == 0)
+		panic("ufs_rename: no name");
+#endif
+	/*
+	 * Check for cross-device rename.
+	 */
+	if ((fvp->v_mount != tdvp->v_mount) ||
+	    (tvp && (fvp->v_mount != tvp->v_mount))) {
+		error = EXDEV;
+ abortit:	/* early exit: drops every vnode that was passed in */
+		VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */
+		if (tdvp == tvp)
+			vrele(tdvp);
+		else
+			vput(tdvp);
+		if (tvp)
+			vput(tvp);
+		VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */
+		vrele(fdvp);
+		vrele(fvp);
+		return (error);
+	}
+
+	/*
+	 * Protected target => EPERM; if fvp == tvp just delete the source name.
+	 */
+	if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) ||
+	    (VTOI(tdvp)->i_flags & APPEND))) {
+		error = EPERM;
+		goto abortit;
+	}
+	if (fvp == tvp) {
+		if (fvp->v_type == VDIR) {
+			error = EINVAL;
+			goto abortit;
+		}
+
+		/* Release destination completely. */
+		VOP_ABORTOP(tdvp, tcnp);
+		vput(tdvp);
+		vput(tvp);
+
+		/* Delete source. */
+		vrele(fvp);
+		fcnp->cn_flags &= ~(MODMASK | SAVESTART);
+		fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+		fcnp->cn_nameiop = DELETE;
+		vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+		if ((error = relookup(fdvp, &fvp, fcnp))) {
+			vput(fdvp);
+			return (error);
+		}
+		return (VOP_REMOVE(fdvp, fvp, fcnp));
+	}
+	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
+		goto abortit;
+	fdp = VTOI(fdvp);
+	ip = VTOI(fvp);
+	if ((nlink_t) ip->i_nlink >= LINK_MAX) {
+		VOP_UNLOCK(fvp, 0);
+		error = EMLINK;
+		goto abortit;
+	}
+	if ((ip->i_flags & (IMMUTABLE | APPEND)) ||
+		(fdp->i_flags & APPEND)) {
+		VOP_UNLOCK(fvp, 0);
+		error = EPERM;
+		goto abortit;
+	}
+	if ((ip->i_mode & IFMT) == IFDIR) {
+		/*
+		 * Avoid ".", "..", and aliases of "." for obvious reasons.
+		 */
+		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
+		    fdp == ip ||
+		    (fcnp->cn_flags & ISDOTDOT) ||
+		    (tcnp->cn_flags & ISDOTDOT) ||
+		    (ip->i_flag & IN_RENAME)) {
+			VOP_UNLOCK(fvp, 0);
+			error = EINVAL;
+			goto abortit;
+		}
+		ip->i_flag |= IN_RENAME;	/* cleared again at out2 */
+		doingdirectory = 1;
+	}
+	oldparent = fdp->i_number;
+	VN_KNOTE(fdvp, NOTE_WRITE);		/* XXXLUKEM/XXX: right place? */
+
+	/*
+	 * When the target exists, both the directory
+	 * and target vnodes are returned locked.
+	 */
+	tdp = VTOI(tdvp);
+	txp = NULL;
+	if (tvp)
+		txp = VTOI(tvp);
+
+	mp = fdvp->v_mount;
+	fstrans_start(mp, FSTRANS_SHARED);	/* undone by fstrans_done() at out2 */
+
+	/*
+	 * If ".." must be changed (ie the directory gets a new
+	 * parent) then the source directory must not be in the
+	 * directory hierarchy above the target, as this would
+	 * orphan everything below the source directory. Also
+	 * the user must have write permission in the source so
+	 * as to be able to change "..". We must repeat the call
+	 * to namei, as the parent directory is unlocked by the
+	 * call to checkpath().
+	 */
+	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred);
+	VOP_UNLOCK(fvp, 0);
+	if (oldparent != tdp->i_number)
+		newparent = tdp->i_number;
+	if (doingdirectory && newparent) {
+		if (error)	/* write access check above */
+			goto out;
+		if (txp != NULL)
+			vput(tvp);
+		txp = NULL;
+		vref(tdvp);	/* compensate for the ref checkpath loses */
+		if ((error = ufs_checkpath(ip, tdp, tcnp->cn_cred)) != 0) {
+			vrele(tdvp);
+			tdp = NULL;
+			goto out;
+		}
+		tcnp->cn_flags &= ~SAVESTART;
+		tdp = NULL;
+		vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+		error = relookup(tdvp, &tvp, tcnp);
+		if (error != 0) {
+			vput(tdvp);
+			goto out;
+		}
+		tdp = VTOI(tdvp);
+		if (tvp)
+			txp = VTOI(tvp);
+	}
+
+	/*
+	 * XXX handle case where fdvp is parent of tdvp,
+	 * by unlocking tdvp and regrabbing it with vget after?
+	 */
+
+	/* save directory lookup information in case tdvp == fdvp */
+	saved_t_count  = tdp->i_count;
+	saved_t_endoff = tdp->i_endoff;
+	saved_t_diroff = tdp->i_diroff;
+	saved_t_offset = tdp->i_offset;
+	saved_t_reclen = tdp->i_reclen;
+
+	/*
+	 * This was moved up to before the journal lock to
+	 * avoid potential deadlock
+	 */
+	fcnp->cn_flags &= ~(MODMASK | SAVESTART);
+	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
+	if (newparent) {
+		vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY);
+		if ((error = relookup(fdvp, &fvp, fcnp))) {
+			vput(fdvp);
+			vrele(ap->a_fvp);
+			goto out2;
+		}
+	} else {
+		error = VOP_LOOKUP(fdvp, &fvp, fcnp);
+		if (error && (error != EJUSTRETURN)) {
+			vrele(ap->a_fvp);
+			goto out2;
+		}
+		error = 0;
+	}
+	if (fvp != NULL) {
+		fxp = VTOI(fvp);
+		fdp = VTOI(fdvp);
+	} else {
+		/*
+		 * From name has disappeared.
+		 */
+		if (doingdirectory)
+			panic("rename: lost dir entry");
+		vrele(ap->a_fvp);
+		error = ENOENT;	/* XXX ufs_rename sets "0" here */
+		goto out2;
+	}
+	vrele(ap->a_fvp);
+
+	/* save directory lookup information in case tdvp == fdvp */
+	saved_f_count  = fdp->i_count;
+	saved_f_diroff = fdp->i_diroff;
+	saved_f_offset = fdp->i_offset;
+	saved_f_reclen = fdp->i_reclen;
+
+	/* restore directory lookup information in case tdvp == fdvp */
+	tdp->i_offset = saved_t_offset;
+	tdp->i_reclen = saved_t_reclen;
+	tdp->i_count  = saved_t_count;
+	tdp->i_endoff = saved_t_endoff;
+	tdp->i_diroff = saved_t_diroff;
+
+	error = UFS_WAPBL_BEGIN(fdvp->v_mount);
+	if (error)
+		goto out2;	/* NOTE(review): out2 does not release fdvp/fvp - confirm */
+
+	/*
+	 * 1) Bump link count while we're moving stuff
+	 *    around.  If we crash somewhere before
+	 *    completing our work, the link count
+	 *    may be wrong, but correctable.
+	 */
+	ip->i_ffs_effnlink++;
+	ip->i_nlink++;
+	DIP_ASSIGN(ip, nlink, ip->i_nlink);
+	ip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(fvp))
+		softdep_change_linkcnt(ip);
+	if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) {
+		goto bad;
+	}
+
+	/*
+	 * 2) If target doesn't exist, link the target
+	 *    to the source and unlink the source.
+	 *    Otherwise, rewrite the target directory
+	 *    entry to reference the source inode and
+	 *    expunge the original entry's existence.
+	 */
+	if (txp == NULL) {
+		if (tdp->i_dev != ip->i_dev)
+			panic("rename: EXDEV");
+		/*
+		 * Account for ".." in new directory.
+		 * When source and destination have the same
+		 * parent we don't fool with the link count.
+		 */
+		if (doingdirectory && newparent) {
+			if ((nlink_t)tdp->i_nlink >= LINK_MAX) {
+				error = EMLINK;
+				goto bad;
+			}
+			tdp->i_ffs_effnlink++;
+			tdp->i_nlink++;
+			DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+			tdp->i_flag |= IN_CHANGE;
+			if (DOINGSOFTDEP(tdvp))
+				softdep_change_linkcnt(tdp);
+			if ((error = UFS_UPDATE(tdvp, NULL, NULL,
+			    UPDATE_DIROP)) != 0) {
+				tdp->i_ffs_effnlink--;
+				tdp->i_nlink--;
+				DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+				tdp->i_flag |= IN_CHANGE;
+				if (DOINGSOFTDEP(tdvp))
+					softdep_change_linkcnt(tdp);
+				goto bad;
+			}
+		}
+		newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
+		ufs_makedirentry(ip, tcnp, newdir);
+		error = ufs_direnter(tdvp, NULL, newdir, tcnp, NULL);
+		pool_cache_put(ufs_direct_cache, newdir);
+		if (error != 0) {
+			if (doingdirectory && newparent) {
+				tdp->i_ffs_effnlink--;
+				tdp->i_nlink--;
+				DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+				tdp->i_flag |= IN_CHANGE;
+				if (DOINGSOFTDEP(tdvp))
+					softdep_change_linkcnt(tdp);
+				(void)UFS_UPDATE(tdvp, NULL, NULL,
+						 UPDATE_WAIT | UPDATE_DIROP);
+			}
+			goto bad;
+		}
+		VN_KNOTE(tdvp, NOTE_WRITE);
+	} else {
+		if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev)
+			panic("rename: EXDEV");
+		/*
+		 * Short circuit rename(foo, foo).
+		 */
+		if (txp->i_number == ip->i_number)
+			panic("rename: same file");
+		/*
+		 * If the parent directory is "sticky", then the user must
+		 * own the parent directory, or the destination of the rename,
+		 * otherwise the destination may not be changed (except by
+		 * root). This implements append-only directories.
+		 */
+		if ((tdp->i_mode & S_ISTXT) &&
+		    kauth_authorize_generic(tcnp->cn_cred,
+		     KAUTH_GENERIC_ISSUSER, NULL) != 0 &&
+		    kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid &&
+		    txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) {
+			error = EPERM;
+			goto bad;
+		}
+		/*
+		 * Target must be empty if a directory and have no links
+		 * to it. Also, ensure source and target are compatible
+		 * (both directories, or both not directories).
+		 */
+		if ((txp->i_mode & IFMT) == IFDIR) {
+			if (txp->i_ffs_effnlink > 2 ||
+			    !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) {
+				error = ENOTEMPTY;
+				goto bad;
+			}
+			if (!doingdirectory) {
+				error = ENOTDIR;
+				goto bad;
+			}
+			cache_purge(tdvp);
+		} else if (doingdirectory) {
+			error = EISDIR;
+			goto bad;
+		}
+		if ((error = ufs_dirrewrite(tdp, txp, ip->i_number,
+		    IFTODT(ip->i_mode), doingdirectory && newparent ?
+		    newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0)
+			goto bad;
+		if (doingdirectory) {
+			if (!newparent) {
+				tdp->i_ffs_effnlink--;
+				if (DOINGSOFTDEP(tdvp))
+					softdep_change_linkcnt(tdp);
+			}
+			txp->i_ffs_effnlink--;
+			if (DOINGSOFTDEP(tvp))
+				softdep_change_linkcnt(txp);
+		}
+		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
+			/*
+			 * Truncate inode. The only stuff left in the directory
+			 * is "." and "..". The "." reference is inconsequential
+			 * since we are quashing it. We have removed the "."
+			 * reference and the reference in the parent directory,
+			 * but there may be other hard links. The soft
+			 * dependency code will arrange to do these operations
+			 * after the parent directory entry has been deleted on
+			 * disk, so when running with that code we avoid doing
+			 * them now.
+			 */
+			if (!newparent) {
+				tdp->i_nlink--;
+				DIP_ASSIGN(tdp, nlink, tdp->i_nlink);
+				tdp->i_flag |= IN_CHANGE;
+				UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
+			}
+			txp->i_nlink--;
+			DIP_ASSIGN(txp, nlink, txp->i_nlink);
+			txp->i_flag |= IN_CHANGE;
+			if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC,
+			    tcnp->cn_cred)))
+				goto bad;
+		}
+		VN_KNOTE(tdvp, NOTE_WRITE);
+		VN_KNOTE(tvp, NOTE_DELETE);
+	}
+
+	/* restore directory lookup information in case tdvp == fdvp */
+	fdp->i_offset = saved_f_offset;
+	fdp->i_reclen = saved_f_reclen;
+	fdp->i_count  = saved_f_count;
+	fdp->i_diroff = saved_f_diroff;
+
+	/*
+	 * Handle case where the directory we need to remove may have
+	 * been moved when the directory insertion above performed compaction.
+	 * or when i_count may be wrong due to insertion before this entry.
+	 */
+	if ((tdp->i_number == fdp->i_number) &&
+		(((saved_f_offset >= saved_t_offset) &&
+			(saved_f_offset < saved_t_offset + saved_t_count)) ||
+		((saved_f_offset - saved_f_count >= saved_t_offset) &&
+			(saved_f_offset - saved_f_count <
+			 saved_t_offset + saved_t_count)))) {
+		struct buf *bp;
+		struct direct *ep;
+		struct ufsmount *ump = fdp->i_ump;
+		doff_t endsearch;	/* offset to end directory search */
+		int dirblksiz = ump->um_dirblksiz;
+		const int needswap = UFS_MPNEEDSWAP(ump);
+		u_long bmask;
+		int namlen, entryoffsetinblock;
+		char *dirbuf;
+
+		bmask = fdvp->v_mount->mnt_stat.f_iosize - 1;
+
+		/*
+		 * the fcnp entry will be somewhere between the start of
+		 * compaction and the original location.
+		 */
+		fdp->i_offset = saved_t_offset;
+		error = ufs_blkatoff(fdvp, (off_t)fdp->i_offset, &dirbuf, &bp,
+		    false);
+		if (error)
+			goto bad;
+
+		/*
+		 * keep existing fdp->i_count in case
+		 * compaction started at the same location as the fcnp entry.
+		 */
+		endsearch = saved_f_offset + saved_f_reclen;
+		entryoffsetinblock = 0;
+		while (fdp->i_offset < endsearch) {
+			int reclen;
+
+			/*
+			 * If necessary, get the next directory block.
+			 */
+			if ((fdp->i_offset & bmask) == 0) {
+				if (bp != NULL)
+					brelse(bp, 0);
+				error = ufs_blkatoff(fdvp, (off_t)fdp->i_offset,
+				    &dirbuf, &bp, false);
+				if (error)
+					goto bad;
+				entryoffsetinblock = 0;
+			}
+
+			KASSERT(bp != NULL);
+			ep = (struct direct *)(dirbuf + entryoffsetinblock);
+			reclen = ufs_rw16(ep->d_reclen, needswap);
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+			if (FSFMT(fdvp) && needswap == 0)
+				namlen = ep->d_type;
+			else
+				namlen = ep->d_namlen;
+#else
+			if (FSFMT(fdvp) && needswap != 0)
+				namlen = ep->d_type;
+			else
+				namlen = ep->d_namlen;
+#endif
+			if ((ep->d_ino != 0) &&
+			    (ufs_rw32(ep->d_ino, needswap) != WINO) &&
+			    (namlen == fcnp->cn_namelen) &&
+			    memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) {
+				fdp->i_reclen = reclen;
+				break;
+			}
+			fdp->i_offset += reclen;
+			fdp->i_count = reclen;
+			entryoffsetinblock += reclen;
+		}
+
+		KASSERT(fdp->i_offset <= endsearch);
+
+		/*
+		 * If fdp->i_offset points to start of a directory block,
+		 * set fdp->i_count so ufs_dirremove() doesn't compact over
+		 * a directory block boundary.
+		 */
+		if ((fdp->i_offset & (dirblksiz - 1)) == 0)
+			fdp->i_count = 0;
+
+		brelse(bp, 0);
+	}
+
+	/*
+	 * 3) Unlink the source.
+	 */
+	/*
+	 * Ensure that the directory entry still exists and has not
+	 * changed while the new name has been entered. If the source is
+	 * a file then the entry may have been unlinked or renamed. In
+	 * either case there is no further work to be done. If the source
+	 * is a directory then it cannot have been rmdir'ed; The IN_RENAME
+	 * flag ensures that it cannot be moved by another rename or removed
+	 * by a rmdir.
+	 */
+	if (fxp != ip) {
+		if (doingdirectory)
+			panic("rename: lost dir entry");
+	} else {
+		/*
+		 * If the source is a directory with a
+		 * new parent, the link count of the old
+		 * parent directory must be decremented
+		 * and ".." set to point to the new parent.
+		 */
+		if (doingdirectory && newparent) {
+			KASSERT(fdp != NULL);
+			fxp->i_offset = mastertemplate.dot_reclen;	/* just past "." */
+			ufs_dirrewrite(fxp, fdp, newparent, DT_DIR, 0, IN_CHANGE);
+			cache_purge(fdvp);
+		}
+		error = ufs_dirremove(fdvp, fxp, fcnp->cn_flags, 0);
+		fxp->i_flag &= ~IN_RENAME;
+	}
+	VN_KNOTE(fvp, NOTE_RENAME);
+	goto done;
+
+ out:	/* error exits taken before the source re-lookup */
+	vrele(fvp);
+	vrele(fdvp);
+	goto out2;
+
+	/* exit routines from steps 1 & 2 */
+ bad:
+	if (doingdirectory)
+		ip->i_flag &= ~IN_RENAME;
+	ip->i_ffs_effnlink--;
+	ip->i_nlink--;
+	DIP_ASSIGN(ip, nlink, ip->i_nlink);
+	ip->i_flag |= IN_CHANGE;
+	ip->i_flag &= ~IN_RENAME;	/* repeats the clear above for directories */
+	UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
+	if (DOINGSOFTDEP(fvp))
+		softdep_change_linkcnt(ip);
+ done:	/* normal completion joins the "bad" path here */
+	UFS_WAPBL_END(fdvp->v_mount);
+	vput(fdvp);
+	vput(fvp);
+ out2:
+	/*
+	 * clear IN_RENAME - some exit paths happen too early to go
+	 * through the cleanup done in the "bad" case above, so we
+	 * always do this mini-cleanup here.
+	 */
+	ip->i_flag &= ~IN_RENAME;
+
+	if (txp)
+		vput(ITOV(txp));
+	if (tdp) {
+		if (newparent)
+			vput(ITOV(tdp));
+		else
+			vrele(ITOV(tdp));
+	}
+
+	fstrans_done(mp);
+	return (error);
+}
+
+#ifdef WAPBL_DEBUG_INODES
+/*
+ * Walk vp's dirty buffer list at splbio and panic if a buffer that is not
+ * B_BUSY lacks B_DELWRI or B_LOCKED.  "who" is spliced into the panic
+ * text ("" for file vnodes, "devvp " for the device vnode) so messages
+ * match the two call sites.  Caller holds vp->v_interlock.
+ */
+static void
+ufs_wapbl_verify_bufs(struct vnode *vp, const char *who)
+{
+	struct buf *bp, *nbp;
+	int s;
+
+	s = splbio();
+	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
+		nbp = LIST_NEXT(bp, b_vnbufs);
+		simple_lock(&bp->b_interlock);
+		if ((bp->b_flags & B_BUSY)) {
+			simple_unlock(&bp->b_interlock);
+			continue;
+		}
+		if ((bp->b_flags & B_DELWRI) == 0)
+			panic("wapbl_verify: %snot dirty, bp %p", who, bp);
+		if ((bp->b_flags & B_LOCKED) == 0)
+			panic("wapbl_verify: %snot locked, bp %p", who, bp);
+		simple_unlock(&bp->b_interlock);
+	}
+	splx(s);
+}
+
+/*
+ * Debug-only consistency check, built under WAPBL_DEBUG_INODES.
+ * Panics if any inode on the mount still has IN_CHANGE or IN_UPDATE
+ * set, and (KDASSERT) checks i_nlink against i_ffs_effnlink.  Then
+ * checks the dirty buffers of each vnode and of the device vnode.
+ * Vnodes of type VNON are skipped.  "str" is accepted from the
+ * callers but is not used here.
+ */
+void
+ufs_wapbl_verify_inodes(struct mount *mp, const char *str)
+{
+	struct vnode *vp;
+	struct inode *ip;
+
+	simple_lock(&mntvnode_slock);
+ loop:
+	TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) {
+		/*
+		 * If the vnode that we are about to sync is no longer
+		 * associated with this mount point, start over.
+		 */
+		if (vp->v_mount != mp)
+			goto loop;
+		simple_lock(&vp->v_interlock);
+		ip = VTOI(vp);
+		if (vp->v_type == VNON) {
+			simple_unlock(&vp->v_interlock);
+			continue;
+		}
+		/* verify that update has been called on all inodes */
+		if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) {
+			panic("wapbl_verify: mp %p: dirty vnode %p (inode %p): 0x%x\n",
+				mp, vp, ip, ip->i_flag);
+		}
+		KDASSERT(ip->i_nlink == ip->i_ffs_effnlink);
+
+		/* walk the buffers without the mount vnode list lock held */
+		simple_unlock(&mntvnode_slock);
+		ufs_wapbl_verify_bufs(vp, "");
+		simple_unlock(&vp->v_interlock);
+		simple_lock(&mntvnode_slock);
+	}
+	simple_unlock(&mntvnode_slock);
+
+	/* the device vnode's dirty buffers must satisfy the same rule */
+	vp = VFSTOUFS(mp)->um_devvp;
+	simple_lock(&vp->v_interlock);
+	ufs_wapbl_verify_bufs(vp, "devvp ");
+	simple_unlock(&vp->v_interlock);
+}
+#endif /* WAPBL_DEBUG_INODES */
diff --git a/sys/ufs/ufs/ufs_wapbl.h b/sys/ufs/ufs/ufs_wapbl.h
new file mode 100644
index 000000000000..2ec1abcee339
--- /dev/null
+++ b/sys/ufs/ufs/ufs_wapbl.h
@@ -0,0 +1,176 @@
+/*	$NetBSD: ufs_wapbl.h,v 1.2 2008/07/31 05:38:07 simonb Exp $	*/
+
+/*-
+ * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Wasabi Systems, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _UFS_UFS_UFS_WAPBL_H_
+#define _UFS_UFS_UFS_WAPBL_H_
+
+#if defined(_KERNEL_OPT)
+#include "opt_wapbl.h"
+#endif
+
+/*
+ * Information for the journal location stored in the superblock.
+ * We store the journal version, some flags, the journal location
+ * type, and some location specific "locators" that identify where
+ * the log itself is located.
+ */
+
+/* fs->fs_journal_version */
+#define	UFS_WAPBL_VERSION			1
+
+/* fs->fs_journal_location */
+#define	UFS_WAPBL_JOURNALLOC_NONE		0
+
+#define	UFS_WAPBL_JOURNALLOC_END_PARTITION	1
+#define	 UFS_WAPBL_EPART_ADDR			  0 /* locator slots */
+#define	 UFS_WAPBL_EPART_COUNT			  1
+#define	 UFS_WAPBL_EPART_BLKSZ			  2
+#define	 UFS_WAPBL_EPART_UNUSED			  3
+
+#define	UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM	2
+#define	 UFS_WAPBL_INFS_ADDR			  0 /* locator slots */
+#define	 UFS_WAPBL_INFS_COUNT			  1
+#define	 UFS_WAPBL_INFS_BLKSZ			  2
+#define	 UFS_WAPBL_INFS_INO			  3
+
+/* fs->fs_journal_flags */
+#define	UFS_WAPBL_FLAGS_CREATE_LOG		0x1
+#define	UFS_WAPBL_FLAGS_CLEAR_LOG		0x2
+
+
+/*
+ * The journal size is limited to between 1MB and 64MB.
+ * The default journal size is the filesystem size divided by
+ * the scale factor - this is 1M of journal per 1GB of filesystem
+ * space.
+ *
+ * XXX: Is 64MB too limiting?  If user explicitly asks for more, allow it?
+ */
+#define	UFS_WAPBL_JOURNAL_SCALE			1024
+#define	UFS_WAPBL_MIN_JOURNAL_SIZE		(1024 * 1024)
+#define	UFS_WAPBL_MAX_JOURNAL_SIZE		(64 * 1024 * 1024)
+
+
+#if defined(WAPBL)
+
+#if defined(WAPBL_DEBUG)
+#define WAPBL_DEBUG_INODES
+#endif
+
+int	wapbl_ufs_rename(void *v);
+
+#ifdef WAPBL_DEBUG_INODES
+void	ufs_wapbl_verify_inodes(struct mount *, const char *);
+#endif
+
+static __inline int	/* 0, or wapbl_begin()'s error with no refs taken */
+ufs_wapbl_begin2(struct mount *mp, struct vnode *vp1, struct vnode *vp2,
+		 const char *file, int line)
+{
+	if (mp->mnt_wapbl) {
+		int error;
+		/* Take the refs only after success so a failed begin leaks none. */
+		error = wapbl_begin(mp->mnt_wapbl, file, line);
+		if (error)
+			return error;
+		if (vp1)	/* each ref is dropped by ufs_wapbl_end2() */
+			vref(vp1);
+		if (vp2)
+			vref(vp2);
+#ifdef WAPBL_DEBUG_INODES
+		if (mp->mnt_wapbl->wl_lock.lk_exclusivecount == 1)
+			ufs_wapbl_verify_inodes(mp, "wapbl_begin");
+#endif
+	}
+	return 0;
+}
+
+static __inline void	/* ends the transaction, then drops the vnode refs */
+ufs_wapbl_end2(struct mount *mp, struct vnode *vp1, struct vnode *vp2)
+{
+	if (!mp->mnt_wapbl)	/* no wapbl on this mount: nothing to do */
+		return;
+#ifdef WAPBL_DEBUG_INODES
+	if (mp->mnt_wapbl->wl_lock.lk_exclusivecount == 1)
+		ufs_wapbl_verify_inodes(mp, "wapbl_end");
+#endif
+	wapbl_end(mp->mnt_wapbl);
+	if (vp2)		/* vp2 is released before vp1 */
+		vrele(vp2);
+	if (vp1)
+		vrele(vp1);
+}
+
+#define	UFS_WAPBL_BEGIN(mp)						\
+	ufs_wapbl_begin2(mp, 0, 0, __FUNCTION__, __LINE__)
+#define	UFS_WAPBL_BEGIN1(mp, v1)					\
+	ufs_wapbl_begin2(mp, v1, 0, __FUNCTION__, __LINE__)
+#define	UFS_WAPBL_END(mp)	ufs_wapbl_end2(mp, 0, 0)
+#define	UFS_WAPBL_END1(mp, v1)	ufs_wapbl_end2(mp, v1, 0)
+/* Calls UFS_UPDATE only when the vnode's mount has mnt_wapbl set. */
+#define UFS_WAPBL_UPDATE(vp, access, modify, flags)			\
+	if ((vp)->v_mount->mnt_wapbl) {					\
+		UFS_UPDATE(vp, access, modify, flags);			\
+	}
+/* Below, mp is parenthesized so any pointer expression may be passed. */
+#ifdef UFS_WAPBL_DEBUG_JLOCK
+#define	UFS_WAPBL_JLOCK_ASSERT(mp)					\
+	if ((mp)->mnt_wapbl) wapbl_jlock_assert((mp)->mnt_wapbl)
+#define	UFS_WAPBL_JUNLOCK_ASSERT(mp)					\
+	if ((mp)->mnt_wapbl) wapbl_junlock_assert((mp)->mnt_wapbl)
+#else
+#define	UFS_WAPBL_JLOCK_ASSERT(mp)
+#define	UFS_WAPBL_JUNLOCK_ASSERT(mp)
+#endif
+/* The wrappers below do nothing when the mount has no mnt_wapbl. */
+#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode)				\
+	if ((mp)->mnt_wapbl) wapbl_register_inode((mp)->mnt_wapbl, ino, mode)
+#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode)			\
+	if ((mp)->mnt_wapbl) wapbl_unregister_inode((mp)->mnt_wapbl, ino, mode)
+
+#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len)			\
+	if ((mp)->mnt_wapbl) wapbl_register_deallocation((mp)->mnt_wapbl, blk, len)
+
+#else /* ! WAPBL */
+#define UFS_WAPBL_BEGIN(mp) 0	/* evaluates to 0, i.e. no error */
+#define UFS_WAPBL_BEGIN1(mp, v1) 0
+#define UFS_WAPBL_END(mp)	do { } while (0)
+#define UFS_WAPBL_END1(mp, v1)	/* NOTE(review): empty, unlike UFS_WAPBL_END */
+#define UFS_WAPBL_UPDATE(vp, access, modify, flags)	do { } while (0)
+#define UFS_WAPBL_JLOCK_ASSERT(mp)	/* remaining stubs expand to nothing */
+#define UFS_WAPBL_JUNLOCK_ASSERT(mp)
+#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode)
+#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode)
+#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len)
+#endif
+
+#endif /* !_UFS_UFS_UFS_WAPBL_H_ */
diff --git a/usr.sbin/dumpfs/dumpfs.c b/usr.sbin/dumpfs/dumpfs.c
index ceb7d45a5869..242b27050bac 100644
--- a/usr.sbin/dumpfs/dumpfs.c
+++ b/usr.sbin/dumpfs/dumpfs.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: dumpfs.c,v 1.49 2008/07/21 13:36:58 lukem Exp $	*/
+/*	$NetBSD: dumpfs.c,v 1.50 2008/07/31 05:38:07 simonb Exp $	*/
 
 /*
  * Copyright (c) 1983, 1992, 1993
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1983, 1992, 1993\
 #if 0
 static char sccsid[] = "@(#)dumpfs.c	8.5 (Berkeley) 4/29/95";
 #else
-__RCSID("$NetBSD: dumpfs.c,v 1.49 2008/07/21 13:36:58 lukem Exp $");
+__RCSID("$NetBSD: dumpfs.c,v 1.50 2008/07/31 05:38:07 simonb Exp $");
 #endif
 #endif /* not lint */
 
@@ -379,6 +379,13 @@ print_superblock(struct fs *fs, uint16_t *opostbl,
 		    fs->fs_old_csshift, fs->fs_old_csmask);
 	printf("\ncgrotor\t%d\tfmod\t%d\tronly\t%d\tclean\t0x%02x\n",
 	    fs->fs_cgrotor, fs->fs_fmod, fs->fs_ronly, fs->fs_clean);
+	printf("wapbl version 0x%x\tlocation %u\tflags 0x%x\n",
+	    fs->fs_journal_version, fs->fs_journal_location,
+	    fs->fs_journal_flags);
+	printf("wapbl loc0 %" PRIu64 "\tloc1 %" PRIu64,	/* locN = fs_journallocs[N] */
+	    fs->fs_journallocs[0], fs->fs_journallocs[1]);
+	printf("\tloc2 %" PRIu64 "\tloc3 %" PRIu64 "\n",
+	    fs->fs_journallocs[2], fs->fs_journallocs[3]);
 	printf("flags\t");
 	if (fs->fs_flags == 0)
 		printf("none");
@@ -396,8 +403,11 @@ print_superblock(struct fs *fs, uint16_t *opostbl,
 		printf("multilabel ");
 	if (fs->fs_flags & FS_FLAGS_UPDATED)
 		printf("fs_flags expanded ");
-	fsflags = fs->fs_flags & ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK | FS_INDEXDIRS |
-			FS_ACLS | FS_MULTILABEL | FS_FLAGS_UPDATED);
+	if (fs->fs_flags & FS_DOWAPBL)
+		printf("wapbl ");
+	fsflags = fs->fs_flags & ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK |
+			FS_INDEXDIRS | FS_ACLS | FS_MULTILABEL |
+			FS_FLAGS_UPDATED | FS_DOWAPBL);
 	if (fsflags != 0)
 		printf("unknown flags (%#x)", fsflags);
 	printf("\nfsmnt\t%s\n", fs->fs_fsmnt);