If LFS_DO_ROLLFORWARD is defined, roll forward from the older checkpoint

on mount, through the newer checkpoint and on through any newer partial-segments that may have been written but not checkpointed because of an intervening crash. LFS_DO_ROLLFORWARD is not defined by default.
2000-11-27 03:33:57 +00:00 · 2000-11-27 03:33:57 +00:00 · 0055236dda
parent 25491e6a22
commit 0055236dda
6 changed files with 775 additions and 88 deletions
--- a/sys/ufs/lfs/lfs_alloc.c
+++ b/sys/ufs/lfs/lfs_alloc.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_alloc.c,v 1.43 2000/09/09 04:49:54 perseant Exp $	*/
+/*	$NetBSD: lfs_alloc.c,v 1.44 2000/11/27 03:33:57 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -95,6 +95,166 @@
 extern int lfs_dirvcount;
 extern struct lock ufs_hashlock;

+static int extend_ifile(struct lfs *, struct ucred *);
+static int lfs_ialloc(struct lfs *, struct vnode *, ino_t, int, struct vnode **);
+
+/*
+ * Allocate a particular inode with a particular version number, freeing
+ * any previous versions of this inode that may have gone before.
+ * Used by the roll-forward code.
+ *
+ * Returns 0 with the vnode stored in *vpp on success.  On failure *vpp is
+ * set to NULLVP and an errno is returned: EEXIST if the inode already
+ * exists with a newer version than the one requested, ENOENT if the inode
+ * could not be found on the Ifile free list, or whatever error
+ * extend_ifile() or lfs_ialloc() reported.
+ *
+ * XXX this function does not have appropriate locking to be used on a live fs;
+ * XXX but something similar could probably be used for an "undelete" call.
+ */
+int
+lfs_rf_valloc(struct lfs *fs, ino_t ino, int version, struct proc *p,
+	      struct vnode **vpp)
+{
+	IFILE *ifp;
+	struct buf *bp;
+	struct vnode *vp;
+	struct inode *ip;
+	ino_t tino, oldnext;
+	int error;
+
+	/* Make sure no failure return leaves garbage in the caller's pointer */
+	*vpp = NULLVP;
+
+	/*
+	 * First, just try a vget. If the version number is the one we want,
+	 * we don't have to do anything else.  If the version number is wrong,
+	 * take appropriate action.
+	 */
+	error = VFS_VGET(fs->lfs_ivnode->v_mount, ino, &vp);
+	if (error == 0) {
+		/* printf("lfs_rf_valloc[1]: ino %d vp %p\n", ino, vp); */
+
+		*vpp = vp;
+		ip = VTOI(vp);
+		if (ip->i_ffs_gen == version)
+			return 0;
+		else if (ip->i_ffs_gen < version) {
+			/* Older incarnation: drop its contents, then reuse it */
+			VOP_TRUNCATE(vp, (off_t)0, 0, NOCRED, p);
+			ip->i_ffs_gen = version;
+			LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED | IN_UPDATE);
+			return 0;
+		} else {
+			/* printf("ino %d: asked for version %d but got %d\n",
+			       ino, version, ip->i_ffs_gen); */
+			vput(vp);
+			*vpp = NULLVP;
+			return EEXIST;
+		}
+	}
+
+	/*
+	 * The inode is not in use.  Find it on the free list.
+	 */
+	/*
+	 * If the Ifile is too short to contain this inum, extend it.
+	 * Give up if the Ifile cannot be grown: the size would never
+	 * change and this loop would otherwise spin forever.
+	 */
+	while (VTOI(fs->lfs_ivnode)->i_ffs_size <=
+	       dbtob(fsbtodb(fs, ino / fs->lfs_ifpb + fs->lfs_cleansz +
+			     fs->lfs_segtabsz))) {
+		if ((error = extend_ifile(fs, NOCRED)) != 0)
+			return error;
+	}
+
+	/* Remember this entry's successor and stamp the wanted version */
+	LFS_IENTRY(ifp, fs, ino, bp);
+	oldnext = ifp->if_nextfree;
+	ifp->if_version = version;
+	brelse(bp);
+
+	if (ino == fs->lfs_free) {
+		/* It is the list head: just advance the head */
+		fs->lfs_free = oldnext;
+	} else {
+		/*
+		 * Walk the list looking for the predecessor of ino.  The
+		 * loop exits with bp still held for the entry it stopped on.
+		 */
+		tino = fs->lfs_free;
+		while(1) {
+			LFS_IENTRY(ifp, fs, tino, bp);
+			if (ifp->if_nextfree == ino ||
+			    ifp->if_nextfree == LFS_UNUSED_INUM)
+				break;
+			tino = ifp->if_nextfree;
+			brelse(bp);
+		}
+		if (ifp->if_nextfree == LFS_UNUSED_INUM) {
+			/* Ran off the end of the list without finding ino */
+			brelse(bp);
+			return ENOENT;
+		}
+		/* Unlink ino by pointing its predecessor past it */
+		ifp->if_nextfree = oldnext;
+		VOP_BWRITE(bp);
+	}
+
+	error = lfs_ialloc(fs, fs->lfs_ivnode, ino, version, &vp);
+	if (error == 0) {
+		/*
+		 * Make it VREG so we can put blocks on it.  We will change
+		 * this later if it turns out to be some other kind of file.
+		 */
+		ip = VTOI(vp);
+		ip->i_ffs_mode = IFREG;
+		ip->i_ffs_nlink = 1;
+		ip->i_ffs_effnlink = 1;
+		ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, &vp);
+		ip = VTOI(vp);
+
+		/* printf("lfs_rf_valloc: ino %d vp %p\n", ino, vp); */
+
+		/* The dirop-nature of this vnode is past */
+		(void)lfs_vunref(vp);
+		--lfs_dirvcount;
+		vp->v_flag &= ~VDIROP;
+		--fs->lfs_nadirop;
+		ip->i_flag &= ~IN_ADIROP;
+	}
+	*vpp = vp;
+	return error;
+}
+
+/*
+ * Grow the Ifile by one block and put the inode entries of the new block
+ * at the front of the free-inode list; the previous list head becomes the
+ * successor of the last new entry.
+ *
+ * Returns 0 on success, or the error from VOP_BALLOC, in which case
+ * neither the Ifile size nor the free list has been modified.
+ */
+static int
+extend_ifile(struct lfs *fs, struct ucred *cred)
+{
+	struct vnode *ivp = fs->lfs_ivnode;
+	struct inode *iip;
+	struct buf *bp;
+	IFILE *entry;
+	ufs_daddr_t newblk, cur, limit;
+	ino_t prevhead;
+	int error;
+
+	(void)lfs_vref(ivp);
+	vn_lock(ivp, LK_EXCLUSIVE | LK_RETRY);
+	iip = VTOI(ivp);
+
+	/* The new block is appended at the current end of the Ifile */
+	newblk = lblkno(fs, iip->i_ffs_size);
+	error = VOP_BALLOC(ivp, iip->i_ffs_size, fs->lfs_bsize, cred, 0, &bp);
+	if (error != 0) {
+		VOP_UNLOCK(ivp, 0);
+		lfs_vunref(ivp);
+		return (error);
+	}
+	iip->i_ffs_size += fs->lfs_bsize;
+	uvm_vnp_setsize(ivp, iip->i_ffs_size);
+	(void)uvm_vnp_uncache(ivp);
+	VOP_UNLOCK(ivp, 0);
+
+	/* Inode number of the first entry held by the new block */
+	cur = (newblk - fs->lfs_segtabsz - fs->lfs_cleansz) * fs->lfs_ifpb;
+	prevhead = fs->lfs_free;
+	fs->lfs_free = cur;
+#ifdef DIAGNOSTIC
+	if (fs->lfs_free == LFS_UNUSED_INUM)
+		panic("inode 0 allocated [2]");
+#endif /* DIAGNOSTIC */
+	limit = cur + fs->lfs_ifpb;
+
+	/* Chain every new entry to the inode number that follows it */
+	entry = (struct ifile *)bp->b_data;
+	while (cur < limit) {
+		entry->if_version = 1;
+		entry->if_daddr = LFS_UNUSED_DADDR;
+		entry->if_nextfree = ++cur;
+		++entry;
+	}
+	/* The last new entry leads on to the old head of the list */
+	(entry - 1)->if_nextfree = prevhead;
+	(void) VOP_BWRITE(bp); /* Ifile */
+	lfs_vunref(ivp);
+
+	return 0;
+}
+
 /* Allocate a new inode. */
 /* ARGSUSED */
 /* VOP_BWRITE 2i times */
@ -111,14 +271,9 @@ lfs_valloc(v)
 	struct lfs *fs;
 	struct buf *bp;
 	struct ifile *ifp;
-	struct inode *ip;
-	struct vnode *vp;
-	ufs_daddr_t blkno;
 	ino_t new_ino;
-	u_long i, max;
 	int error;
 	int new_gen;
-	extern int lfs_dirvcount;

 	fs = VTOI(ap->a_pvp)->i_lfs;
 	if (fs->lfs_ronly)
@ -132,7 +287,7 @@ lfs_valloc(v)
 	 * written to disk.
 	 *
 	 * XXX this sucks.  We should instead encode the head of the free
-	 * list into the CLEANERINFO block of the Ifile.
+	 * list into the CLEANERINFO block of the Ifile. [XXX v2]
 	 */
 	lfs_seglock(fs, SEGM_PROT);

@ -147,7 +302,7 @@ lfs_valloc(v)
 	}
 #endif /* DIAGNOSTIC */
 #ifdef ALLOCPRINT
-	printf("lfs_ialloc: allocate inode %d\n", new_ino);
+	printf("lfs_valloc: allocate inode %d\n", new_ino);
 #endif
 	
 	/*
@ -156,52 +311,18 @@ lfs_valloc(v)
 	 */
 	LFS_IENTRY(ifp, fs, new_ino, bp);
 	if (ifp->if_daddr != LFS_UNUSED_DADDR)
-		panic("lfs_ialloc: inuse inode %d on the free list", new_ino);
+		panic("lfs_valloc: inuse inode %d on the free list", new_ino);
 	fs->lfs_free = ifp->if_nextfree;
 	new_gen = ifp->if_version; /* version was updated by vfree */
-#ifdef LFS_DEBUG_NEXTFREE
-	ifp->if_nextfree = 0;
-	(void) VOP_BWRITE(bp); /* Ifile */
-#else
 	brelse(bp);
-#endif

 	/* Extend IFILE so that the next lfs_valloc will succeed. */
 	if (fs->lfs_free == LFS_UNUSED_INUM) {
-		vp = fs->lfs_ivnode;
-		(void)lfs_vref(vp);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-		ip = VTOI(vp);
-		blkno = lblkno(fs, ip->i_ffs_size);
-		if ((error = VOP_BALLOC(vp, ip->i_ffs_size, fs->lfs_bsize,
-					ap->a_cred, 0, &bp)) != 0) {
-			VOP_UNLOCK(vp, 0);
-			lfs_segunlock(fs);
+		if ((error = extend_ifile(fs, ap->a_cred)) != 0) {
 			fs->lfs_free = new_ino;
-			return (error);
+			lfs_segunlock(fs);
+			return error;
 		}
-		ip->i_ffs_size += fs->lfs_bsize;
-		uvm_vnp_setsize(vp, ip->i_ffs_size);
-		(void)uvm_vnp_uncache(vp);
-		VOP_UNLOCK(vp, 0);
-
-		i = (blkno - fs->lfs_segtabsz - fs->lfs_cleansz) *
-			fs->lfs_ifpb;
-		fs->lfs_free = i;
-#ifdef DIAGNOSTIC
-		if(fs->lfs_free == LFS_UNUSED_INUM)
-			panic("inode 0 allocated [2]");
-#endif /* DIAGNOSTIC */
-		max = i + fs->lfs_ifpb;
-		for (ifp = (struct ifile *)bp->b_data; i < max; ++ifp) {
-			ifp->if_version = 1;
-			ifp->if_daddr = LFS_UNUSED_DADDR;
-			ifp->if_nextfree = ++i;
-		}
-		ifp--;
-		ifp->if_nextfree = LFS_UNUSED_INUM;
-		(void) VOP_BWRITE(bp); /* Ifile */
-		lfs_vunref(vp);
 	}
 #ifdef DIAGNOSTIC
 	if(fs->lfs_free == LFS_UNUSED_INUM)
@ -210,13 +331,27 @@ lfs_valloc(v)
 	
 	lfs_segunlock(fs);

-	if ((error = getnewvnode(VT_LFS, ap->a_pvp->v_mount,
-	    lfs_vnodeop_p, &vp)) != 0)
+	return lfs_ialloc(fs, ap->a_pvp, new_ino, new_gen, ap->a_vpp);
+}
+
+static int
+lfs_ialloc(struct lfs *fs, struct vnode *pvp, ino_t new_ino, int new_gen,
+	   struct vnode **vpp)
+{
+	struct inode *ip;
+	struct vnode *vp;
+	IFILE *ifp;
+	struct buf *bp;
+	int error;
+
+	error = getnewvnode(VT_LFS, pvp->v_mount, lfs_vnodeop_p, &vp);
+	/* printf("lfs_ialloc: ino %d vp %p error %d\n", new_ino, vp, error);*/
+	if (error)
 		goto errout;

 	lockmgr(&ufs_hashlock, LK_EXCLUSIVE, 0);
 	/* Create an inode to associate with the vnode. */
-	lfs_vcreate(ap->a_pvp->v_mount, new_ino, vp);
+	lfs_vcreate(pvp->v_mount, new_ino, vp);
 	
 	ip = VTOI(vp);
 	/* Zero out the direct and indirect block addresses. */
@ -224,19 +359,22 @@ lfs_valloc(v)
 	ip->i_din.ffs_din.di_inumber = new_ino;
 	
 	/* Set a new generation number for this inode. */
-	ip->i_ffs_gen = new_gen;
+	if (new_gen)
+		ip->i_ffs_gen = new_gen;
 	
 	/* Insert into the inode hash table. */
 	ufs_ihashins(ip);
 	lockmgr(&ufs_hashlock, LK_RELEASE, 0);
-	
+
 	error = ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p, &vp);
+	ip = VTOI(vp);
 	if (error) {
 		vput(vp);
 		goto errout;
 	}
+	/* printf("lfs_ialloc[2]: ino %d vp %p\n", new_ino, vp);*/
 	
-	*ap->a_vpp = vp;
+	*vpp = vp;
 #if 1
 	if(!(vp->v_flag & VDIROP)) {
 		(void)lfs_vref(vp);
@ -264,6 +402,7 @@ lfs_valloc(v)
 	fs->lfs_free = new_ino;
 	(void) VOP_BWRITE(bp); /* Ifile */

+	*vpp = NULLVP;
 	return (error);
 }

@ -302,7 +441,7 @@ lfs_vcreate(mp, ino, vp)
 	ip->i_ffs_blocks = 0;
 	ip->i_lfs_effnblks = 0;
 	ip->i_flag = 0;
-	LFS_SET_UINO(ip, IN_MODIFIED);
+	LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED);
 }

 /* Free an inode. */
--- a/sys/ufs/lfs/lfs_bio.c
+++ b/sys/ufs/lfs/lfs_bio.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_bio.c,v 1.32 2000/11/17 19:14:41 perseant Exp $	*/
+/*	$NetBSD: lfs_bio.c,v 1.33 2000/11/27 03:33:57 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -448,27 +448,30 @@ lfs_check(vp, blkno, flags)
 			wakeup(&fs->lfs_dirops);
 	}

-	while  (locked_queue_count > LFS_WAIT_BUFS
-		|| locked_queue_bytes > LFS_WAIT_BYTES)
+	while (locked_queue_count > LFS_WAIT_BUFS
+	       || locked_queue_bytes > LFS_WAIT_BYTES)
 	{
 		if(lfs_dostats)
 			++lfs_stats.wait_exceeded;
-#ifdef DEBUG_LFS
+#ifdef DEBUG
 		printf("lfs_check: waiting: count=%d, bytes=%ld\n",
 			locked_queue_count, locked_queue_bytes);
 #endif
 		error = tsleep(&locked_queue_count, PCATCH | PUSER,
 			       "buffers", hz * LFS_BUFWAIT);
+		if (error != EWOULDBLOCK)
+			break;
 		/*
 		 * lfs_flush might not flush all the buffers, if some of the
-		 * inodes were locked.  Try flushing again to keep us from
-		 * blocking indefinitely.
+		 * inodes were locked or if most of them were Ifile blocks
+		 * and we weren't asked to checkpoint.  Try flushing again
+		 * to keep us from blocking indefinitely.
 		 */
 		if (locked_queue_count > LFS_MAX_BUFS ||
 		    locked_queue_bytes > LFS_MAX_BYTES)
 		{
 			++fs->lfs_writer;
-			lfs_flush(fs, flags);
+			lfs_flush(fs, flags | SEGM_CKP);
 			if(--fs->lfs_writer==0)
 				wakeup(&fs->lfs_dirops);
 		}
--- a/sys/ufs/lfs/lfs_inode.c
+++ b/sys/ufs/lfs/lfs_inode.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_inode.c,v 1.47 2000/11/21 00:00:31 perseant Exp $	*/
+/*	$NetBSD: lfs_inode.c,v 1.48 2000/11/27 03:33:57 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -99,7 +99,8 @@ extern long locked_queue_bytes;

 static int lfs_update_seguse(struct lfs *, long, size_t);
 static int lfs_indirtrunc (struct inode *, ufs_daddr_t, ufs_daddr_t,
-			   ufs_daddr_t, int, long *, long *, long *, size_t *);
+			   ufs_daddr_t, int, long *, long *, long *, size_t *,
+			   struct proc *);
 static int lfs_blkfree (struct lfs *, daddr_t, size_t, long *, size_t *);
 static int lfs_vtruncbuf(struct vnode *, daddr_t, int, int);

@ -392,7 +393,7 @@ lfs_truncate(v)
 			error = lfs_indirtrunc(oip, indir_lbn[level],
 					       bn, lastiblock[level],
 					       level, &count, &rcount,
-					       &lastseg, &bc);
+					       &lastseg, &bc, ap->a_p);
 			if (error)
 				allerror = error;
 			real_released += rcount;
@ -543,7 +544,7 @@ lfs_update_seguse(struct lfs *fs, long lastseg, size_t num)
 static int
 lfs_indirtrunc(struct inode *ip, ufs_daddr_t lbn, daddr_t dbn,
 	       ufs_daddr_t lastbn, int level, long *countp,
-	       long *rcountp, long *lastsegp, size_t *bcp)
+	       long *rcountp, long *lastsegp, size_t *bcp, struct proc *p)
 {
 	int i;
 	struct buf *bp;
@ -582,7 +583,7 @@ lfs_indirtrunc(struct inode *ip, ufs_daddr_t lbn, daddr_t dbn,
 		trace(TR_BREADHIT, pack(vp, fs->lfs_bsize), lbn);
 	} else {
 		trace(TR_BREADMISS, pack(vp, fs->lfs_bsize), lbn);
-		curproc->p_stats->p_ru.ru_inblock++;	/* pay for read */
+		p->p_stats->p_ru.ru_inblock++;	/* pay for read */
 		bp->b_flags |= B_READ;
 		if (bp->b_bcount > bp->b_bufsize)
 			panic("lfs_indirtrunc: bad buffer size");
@ -620,7 +621,7 @@ lfs_indirtrunc(struct inode *ip, ufs_daddr_t lbn, daddr_t dbn,
 			error = lfs_indirtrunc(ip, nlbn, nb,
 					       (ufs_daddr_t)-1, level - 1,
 					       &blkcount, &rblkcount,
-					       lastsegp, bcp);
+					       lastsegp, bcp, p);
 			if (error)
 				allerror = error;
 			blocksreleased += blkcount;
@ -641,7 +642,7 @@ lfs_indirtrunc(struct inode *ip, ufs_daddr_t lbn, daddr_t dbn,
 		if (nb != 0) {
 			error = lfs_indirtrunc(ip, nlbn, nb,
 					       last, level - 1, &blkcount,
-					       &rblkcount, lastsegp, bcp);
+					       &rblkcount, lastsegp, bcp, p);
 			if (error)
 				allerror = error;
 			real_released += rblkcount;
--- a/sys/ufs/lfs/lfs_segment.c
+++ b/sys/ufs/lfs/lfs_segment.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_segment.c,v 1.62 2000/11/17 19:14:41 perseant Exp $	*/
+/*	$NetBSD: lfs_segment.c,v 1.63 2000/11/27 03:33:57 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -587,22 +587,21 @@ lfs_segwrite(mp, flags)

 	did_ckp = 0;
 	if (do_ckp || fs->lfs_doifile) {
-	redo:
-		vp = fs->lfs_ivnode;
+		do {
+			vp = fs->lfs_ivnode;

-		vget(vp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
+			vget(vp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);

-		ip = VTOI(vp);
-		if (vp->v_dirtyblkhd.lh_first != NULL)
-			lfs_writefile(fs, sp, vp);
-		if (ip->i_flag & IN_ALLMOD)
-			++did_ckp;
-		(void) lfs_writeinode(fs, sp, ip);
+			ip = VTOI(vp);
+			if (vp->v_dirtyblkhd.lh_first != NULL)
+				lfs_writefile(fs, sp, vp);
+			if (ip->i_flag & IN_ALLMOD)
+				++did_ckp;
+			(void) lfs_writeinode(fs, sp, ip);
+			
+			vput(vp);
+		} while (lfs_writeseg(fs, sp) && do_ckp);

-		vput(vp);
-
-		if (lfs_writeseg(fs, sp) && do_ckp)
-			goto redo;
 		/* The ifile should now be all clear */
 		LFS_CLR_UINO(ip, IN_ALLMOD);
 	} else {
@ -814,6 +813,12 @@ lfs_writeinode(fs, sp, ip)
 			     IN_UPDATE);
 		if (ip->i_lfs_effnblks == ip->i_ffs_blocks)
 			LFS_CLR_UINO(ip, IN_MODIFIED);
+#ifdef DEBUG_LFS
+		else
+			printf("lfs_writeinode: ino %d: real blks=%d, "
+			       "eff=%d\n", ip->i_number, ip->i_ffs_blocks,
+			       ip->i_lfs_effnblks);
+#endif
 	}

 	if(ip->i_number == LFS_IFILE_INUM) /* We know sp->idp == NULL */
--- a/sys/ufs/lfs/lfs_syscalls.c
+++ b/sys/ufs/lfs/lfs_syscalls.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_syscalls.c,v 1.53 2000/11/22 22:11:34 perseant Exp $	*/
+/*	$NetBSD: lfs_syscalls.c,v 1.54 2000/11/27 03:33:57 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -988,6 +988,19 @@ lfs_fastvget(mp, ino, daddr, vpp, dinp, need_unlock)
 	dev = ump->um_dev;
 	*need_unlock = 0;

+	/*
+	 * Wait until the filesystem is fully mounted before allowing vget
+	 * to complete.  This prevents possible problems with roll-forward.
+	 */
+	while(ump->um_lfs->lfs_flags & LFS_NOTYET) {
+		tsleep(&ump->um_lfs->lfs_flags, PRIBIO+1, "lfs_fnotyet", 0);
+	}
+	/*
+	 * This is playing fast and loose.  Someone may have the inode
+	 * locked, in which case they are going to be distinctly unhappy
+	 * if we trash something.
+	 */
+
 	error = lfs_fasthashget(dev, ino, need_unlock, vpp);
 	if (error != 0 || *vpp != NULL)
 		return (error);
--- a/sys/ufs/lfs/lfs_vfsops.c
+++ b/sys/ufs/lfs/lfs_vfsops.c
@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_vfsops.c,v 1.59 2000/11/14 00:42:55 perseant Exp $	*/
+/*	$NetBSD: lfs_vfsops.c,v 1.60 2000/11/27 03:33:57 perseant Exp $	*/

 /*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
@ -139,6 +139,9 @@ struct vfsops lfs_vfsops = {

 struct pool lfs_inode_pool;

+extern int locked_queue_count;
+extern long locked_queue_bytes;
+
 /*
 * Initialize the filesystem, most work done by ufs_init.
 */
@ -319,6 +322,400 @@ lfs_mount(mp, path, data, ndp, p)
 	return (0);
 }

+#ifdef LFS_DO_ROLLFORWARD
+/*
+ * Roll-forward code.
+ */
+
+/*
+ * Load the appropriate indirect block, and change the appropriate pointer.
+ * Mark the block dirty.  Do segment and avail accounting.
+ *
+ * Points logical block lbn of inode ino (generation `version') at disk
+ * address ndaddr, where `size' bytes of its data were found.  The inode
+ * is obtained through lfs_rf_valloc().  Returns 0 on success, or the
+ * error from lfs_rf_valloc(), VOP_BALLOC() or the first ufs_bmaparray().
+ */
+static int
+update_meta(struct lfs *fs, ino_t ino, int version, ufs_daddr_t lbn,
+	    daddr_t ndaddr, size_t size, struct proc *p)
+{
+	int error;
+	struct vnode *vp;
+	struct inode *ip;
+	daddr_t odaddr, ooff;
+	/*
+	 * ufs_bmaparray() records one entry for the inode-level pointer
+	 * plus one per level of indirection, so NIADDR entries are not
+	 * enough for a block behind the triple-indirect pointer.
+	 */
+	struct indir a[NIADDR + 2], *ap;
+	struct buf *bp;
+	SEGUSE *sup;
+	int num;
+	off_t boff;
+
+	/* Byte offset of lbn, computed in 64 bits so large files don't wrap */
+	boff = (off_t)lbn << fs->lfs_bshift;
+
+	if ((error = lfs_rf_valloc(fs, ino, version, p, &vp)) != 0) {
+		printf("update_meta: ino %d: lfs_rf_valloc returned %d\n", ino,
+		       error);
+		return error;
+	}
+
+	if ((error = VOP_BALLOC(vp, boff, size, NOCRED, 0, &bp)) != 0) {
+		vput(vp);
+		return (error);
+	}
+	/* No need to write, the block is already on disk */
+	if (bp->b_flags & B_DELWRI) {
+		LFS_UNLOCK_BUF(bp);
+		fs->lfs_avail += btodb(bp->b_bcount);
+	}
+	bp->b_flags |= B_INVAL;
+	brelse(bp);
+
+	/*
+	 * Extend the file, if it is not large enough already.
+	 * XXX this is not exactly right, we don't know how much of the
+	 * XXX last block is actually used.  We hope that an inode will
+	 * XXX appear later to give the correct size.
+	 */
+	ip = VTOI(vp);
+	if (ip->i_ffs_size <= boff) {
+		if (lbn < NDADDR)
+			ip->i_ffs_size = boff +
+				((off_t)size - (off_t)fs->lfs_fsize) + 1;
+		else
+			ip->i_ffs_size = boff + 1;
+	}
+
+	error = ufs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL);
+	if (error) {
+		printf("update_meta: ufs_bmaparray returned %d\n", error);
+		vput(vp);
+		return error;
+	}
+	/*
+	 * num selects where the pointer to this block lives: in the
+	 * inode's direct array (0), in its indirect array (1), or in an
+	 * indirect block that has to be read in (anything else).
+	 */
+	switch (num) {
+	    case 0:
+		ooff = ip->i_ffs_db[lbn];
+		if (ooff == UNWRITTEN)
+			ip->i_ffs_blocks += btodb(size);
+		ip->i_ffs_db[lbn] = ndaddr;
+		break;
+	    case 1:
+		ooff = ip->i_ffs_ib[a[0].in_off];
+		if (ooff == UNWRITTEN)
+			ip->i_ffs_blocks += btodb(size);
+		ip->i_ffs_ib[a[0].in_off] = ndaddr;
+		break;
+	    default:
+		ap = &a[num - 1];
+		if (bread(vp, ap->in_lbn, fs->lfs_bsize, NOCRED, &bp))
+			panic("update_meta: bread bno %d", ap->in_lbn);
+
+		ooff = ((ufs_daddr_t *)bp->b_data)[ap->in_off];
+		if (ooff == UNWRITTEN)
+			ip->i_ffs_blocks += btodb(size);
+		((ufs_daddr_t *)bp->b_data)[ap->in_off] = ndaddr;
+		(void) VOP_BWRITE(bp);
+	}
+	LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED | IN_UPDATE);
+
+	/* Update segment usage information. */
+	if (odaddr > 0) {
+		/* The old copy of the block no longer counts as live */
+		LFS_SEGENTRY(sup, fs, datosn(fs, odaddr), bp);
+#ifdef DIAGNOSTIC
+		if (sup->su_nbytes < size) {
+			panic("update_meta: negative bytes "
+			      "(segment %d short by %ld)\n",
+			      datosn(fs, odaddr), (long)size - sup->su_nbytes);
+		}
+#endif
+		sup->su_nbytes -= size;
+		VOP_BWRITE(bp);
+	}
+	LFS_SEGENTRY(sup, fs, datosn(fs, ndaddr), bp);
+	sup->su_nbytes += size;
+	VOP_BWRITE(bp);
+
+	/* Fix this so it can be released */
+	/* ip->i_lfs_effnblks = ip->i_ffs_blocks; */
+
+	/* Now look again to make sure it worked */
+	ufs_bmaparray(vp, lbn, &odaddr, &a[0], &num, NULL );
+	if (odaddr != ndaddr)
+		printf("update_meta: failed setting ino %d lbn %d to %x\n",
+		       ino, lbn, ndaddr);
+
+	vput(vp);
+	return 0;
+}
+
+/*
+ * Roll forward the inode block found at device address `offset'.
+ *
+ * Every dinode in the block whose inode number is above LFS_IFILE_INUM
+ * is looked up (or created) with lfs_rf_valloc(); its in-core inode is
+ * truncated to the on-disk size if the sizes differ and takes over the
+ * dinode fields that precede di_db[0] plus flags, generation, uid and
+ * gid.  The Ifile entry is then pointed at this inode block and the
+ * segment usage byte counts are adjusted if the inode moved to another
+ * segment.
+ *
+ * Returns the bread() error if the inode block cannot be read, 0
+ * otherwise.  An inode for which lfs_rf_valloc() fails is reported and
+ * skipped.
+ */
+static int
+update_inoblk(struct lfs *fs, daddr_t offset, struct ucred *cred,
+	      struct proc *p)
+{
+	struct vnode *devvp, *vp;
+	struct inode *ip;
+	struct dinode *dip;
+	struct buf *dbp, *ibp;
+	int error, i;
+	daddr_t daddr;
+	IFILE *ifp;
+	SEGUSE *sup;
+
+	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+
+	/*
+	 * Get the inode, update times and perms.
+	 * DO NOT update disk blocks, we do that separately.
+	 */
+	error = bread(devvp, offset, fs->lfs_bsize, cred, &dbp);
+	if (error) {
+		printf("update_inoblk: bread returned %d\n", error);
+		/* bread hands back the buffer even on failure; release it */
+		brelse(dbp);
+		return error;
+	}
+	/*
+	 * Visit the dinodes from the last slot down to the first, by index
+	 * so that no pointer in front of the buffer is ever formed.
+	 */
+	for (i = INOPB(fs) - 1; i >= 0; --i) {
+		dip = ((struct dinode *)(dbp->b_data)) + i;
+		if(dip->di_inumber > LFS_IFILE_INUM) {
+			/* printf("ino %d version %d\n", dip->di_inumber,
+			       dip->di_gen); */
+			error = lfs_rf_valloc(fs, dip->di_inumber, dip->di_gen,
+					      p, &vp);
+			if (error) {
+				printf("update_inoblk: lfs_rf_valloc returned %d\n", error);
+				continue;
+			}
+			ip = VTOI(vp);
+			if (dip->di_size != ip->i_ffs_size)
+				VOP_TRUNCATE(vp, dip->di_size, 0, NOCRED, p);
+			/* Get mode, link count, size, and times */
+			memcpy(&ip->i_din.ffs_din, dip,
+			       offsetof(struct dinode, di_db[0]));
+
+			/* Then the rest, except di_blocks */
+			ip->i_ffs_flags = dip->di_flags;
+			ip->i_ffs_gen = dip->di_gen;
+			ip->i_ffs_uid = dip->di_uid;
+			ip->i_ffs_gid = dip->di_gid;
+
+			ip->i_ffs_effnlink = dip->di_nlink;
+
+			LFS_SET_UINO(ip, IN_CHANGE | IN_MODIFIED | IN_UPDATE);
+
+			/* Re-initialize to get type right */
+			ufs_vinit(vp->v_mount, lfs_specop_p, lfs_fifoop_p,
+				  &vp);
+			vput(vp);
+
+			/* Record change in location */
+			LFS_IENTRY(ifp, fs, dip->di_inumber, ibp);
+			daddr = ifp->if_daddr;
+			ifp->if_daddr = dbp->b_blkno;
+			(void) VOP_BWRITE(ibp); /* Ifile */
+			/* And do segment accounting */
+			if (datosn(fs, daddr) != datosn(fs, dbp->b_blkno)) {
+				if (daddr > 0) {
+					/* Old copy lived in another segment */
+					LFS_SEGENTRY(sup, fs, datosn(fs, daddr),
+						     ibp);
+					sup->su_nbytes -= DINODE_SIZE;
+					VOP_BWRITE(ibp);
+				}
+				LFS_SEGENTRY(sup, fs, datosn(fs, dbp->b_blkno),
+					     ibp);
+				sup->su_nbytes += DINODE_SIZE;
+				VOP_BWRITE(ibp);
+			}
+		}
+	}
+	dbp->b_flags |= B_AGE;
+	brelse(dbp);
+
+	return 0;
+}
+
+#define CHECK_CKSUM   0x0001  /* Check the checksum to make sure it's valid */
+#define CHECK_UPDATE  0x0002  /* Update Ifile for new data blocks / inodes */
+
+/*
+ * Examine the partial segment whose summary block is at device address
+ * `offset' (stepping over a superblock first if the segment starts with
+ * one).
+ *
+ * With CHECK_CKSUM set in flags, the summary checksum is verified, empty
+ * partial segments and ones created before fs->lfs_tstamp are rejected,
+ * and the data checksum is recomputed from the first u_long of every
+ * block and compared with the summary.  With CHECK_UPDATE set, every
+ * inode block is handed to update_inoblk(), every data block with a
+ * non-negative logical block number belonging to an inode above
+ * LFS_IFILE_INUM is handed to update_meta(), and the space covered is
+ * subtracted from fs->lfs_avail.
+ *
+ * If pseg_flags is not NULL the summary's ss_flags are stored through it.
+ *
+ * Returns the address at which the next partial segment is expected, or
+ * -1 if this one could not be read or did not pass the checks.
+ */
+static daddr_t
+check_segsum(struct lfs *fs, daddr_t offset,
+	     struct ucred *cred, int flags, int *pseg_flags, struct proc *p)
+{
+	struct vnode *devvp;
+	struct buf *bp, *dbp;
+	int error, nblocks, ninos, i, j;
+	SEGSUM *ssp;
+	u_long *dp, *datap; /* XXX u_int32_t */
+	daddr_t *iaddr, oldoffset;
+	FINFO *fip;
+	SEGUSE *sup;
+	size_t size;
+
+	/* Only meaningful under CHECK_CKSUM; keep them defined regardless */
+	nblocks = 0;
+	dp = datap = NULL;
+
+	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
+	/*
+	 * If the segment has a superblock and we're at the top
+	 * of the segment, skip the superblock.
+	 */
+	if(sntoda(fs, datosn(fs, offset)) == offset) {
+		LFS_SEGENTRY(sup, fs, datosn(fs, offset), bp);
+		if(sup->su_flags & SEGUSE_SUPERBLOCK)
+			offset += btodb(LFS_SBPAD);
+		brelse(bp);
+	}
+
+	/* Read in the segment summary */
+	error = bread(devvp, offset, LFS_SUMMARY_SIZE, cred, &bp);
+	if(error) {
+		/* The buffer comes back even when the read fails */
+		brelse(bp);
+		return -1;
+	}
+
+	/* Check summary checksum */
+	ssp = (SEGSUM *)bp->b_data;
+	if(flags & CHECK_CKSUM) {
+		if(ssp->ss_sumsum != cksum(&ssp->ss_datasum,
+					   LFS_SUMMARY_SIZE -
+					   sizeof(ssp->ss_sumsum))) {
+#ifdef DEBUG_LFS_RFW
+			printf("Sumsum error at 0x%x\n", offset);
+#endif
+			offset = -1;
+			goto err1;
+		}
+		if (ssp->ss_nfinfo == 0 && ssp->ss_ninos == 0) {
+#ifdef DEBUG_LFS_RFW
+			printf("Empty pseg at 0x%x\n", offset);
+#endif
+			offset = -1;
+			goto err1;
+		}
+		if (ssp->ss_create < fs->lfs_tstamp) {
+#ifdef DEBUG_LFS_RFW
+			printf("Old data at 0x%x\n", offset);
+#endif
+			offset = -1;
+			goto err1;
+		}
+	}
+	if(pseg_flags)
+		*pseg_flags = ssp->ss_flags;
+	oldoffset = offset;
+	offset += btodb(LFS_SUMMARY_SIZE);
+
+	/* Inode block addresses are read from the end of the summary, backwards */
+	ninos = howmany(ssp->ss_ninos, INOPB(fs));
+	iaddr = (daddr_t *)(bp->b_data + LFS_SUMMARY_SIZE - sizeof(daddr_t));
+	if(flags & CHECK_CKSUM) {
+		/* Count blocks */
+		nblocks = 0;
+		fip = (FINFO *)(bp->b_data + sizeof(SEGSUM));
+		for(i = 0; i < ssp->ss_nfinfo; ++i) {
+			nblocks += fip->fi_nblocks;
+			if(fip->fi_nblocks <= 0)
+				break;
+			fip = (FINFO *)(((char *)fip) + sizeof(FINFO) +
+					(fip->fi_nblocks - 1) *
+					sizeof(ufs_daddr_t));
+		}
+		nblocks += ninos;
+		/* Create the sum array */
+		datap = dp = (u_long *)malloc(nblocks * sizeof(u_long),
+					      M_SEGMENT, M_WAITOK);
+	}
+
+	/* Handle individual blocks */
+	fip = (FINFO *)(bp->b_data + sizeof(SEGSUM));
+	for(i = 0; i < ssp->ss_nfinfo || ninos; ++i) {
+		/* Inode block? */
+		if(ninos && *iaddr == offset) {
+			if(flags & CHECK_CKSUM) {
+				/* Read in the head and add to the buffer */
+				error = bread(devvp, offset, fs->lfs_bsize,
+					      cred, &dbp);
+				if(error) {
+					brelse(dbp);
+					offset = -1;
+					goto err2;
+				}
+				(*dp++) = ((u_long *)(dbp->b_data))[0];
+				dbp->b_flags |= B_AGE;
+				brelse(dbp);
+			}
+			if(flags & CHECK_UPDATE) {
+				if ((error = update_inoblk(fs, offset, cred, p))
+				    != 0) {
+					offset = -1;
+					goto err2;
+				}
+			}
+			offset += fsbtodb(fs,1);
+			--iaddr;
+			--ninos;
+			--i; /* compensate */
+			continue;
+		}
+		/* printf("check: blocks from ino %d version %d\n",
+		       fip->fi_ino, fip->fi_version); */
+		size = fs->lfs_bsize;
+		for(j = 0; j < fip->fi_nblocks; ++j) {
+			/* Only the last block of a file may be short */
+			if (j == fip->fi_nblocks - 1)
+				size = fip->fi_lastlength;
+			if(flags & CHECK_CKSUM) {
+				error = bread(devvp, offset, size, cred, &dbp);
+				if(error) {
+					brelse(dbp);
+					offset = -1;
+					goto err2;
+				}
+				(*dp++) = ((u_long *)(dbp->b_data))[0];
+				dbp->b_flags |= B_AGE;
+				brelse(dbp);
+			}
+			/* Account for and update any direct blocks */
+			if((flags & CHECK_UPDATE) &&
+			   fip->fi_ino > LFS_IFILE_INUM &&
+			   fip->fi_blocks[j] >= 0) {
+				update_meta(fs, fip->fi_ino, fip->fi_version,
+					    fip->fi_blocks[j], offset, size, p);
+			}
+			offset += btodb(size);
+		}
+		fip = (FINFO *)(((char *)fip) + sizeof(FINFO)
+				+ (fip->fi_nblocks - 1) * sizeof(ufs_daddr_t));
+	}
+	/* Checksum the array, compare */
+	if((flags & CHECK_CKSUM) &&
+	   ssp->ss_datasum != cksum(datap, nblocks * sizeof(u_long)))
+	{
+		printf("Datasum error at 0x%x (wanted %x got %x)\n", offset,
+		       ssp->ss_datasum, cksum(datap, nblocks *
+					      sizeof(u_long)));
+		offset = -1;
+		goto err2;
+	}
+
+	/* If we're at the end of the segment, move to the next */
+	if(datosn(fs, offset + btodb(LFS_SUMMARY_SIZE + fs->lfs_bsize)) !=
+	   datosn(fs, offset)) {
+		if (datosn(fs, offset) == datosn(fs, ssp->ss_next)) {
+			offset = -1;
+			goto err2;
+		}
+		offset = ssp->ss_next;
+#ifdef DEBUG_LFS_RFW
+		printf("LFS roll forward: moving on to offset 0x%x "
+		       " -> segment %d\n", offset, datosn(fs,offset));
+#endif
+	}
+
+	if (flags & CHECK_UPDATE) {
+		fs->lfs_avail -= (offset - oldoffset);
+		/* Don't clog the buffer queue */
+		if (locked_queue_count > LFS_MAX_BUFS ||
+		    locked_queue_bytes > LFS_MAX_BYTES) {
+			++fs->lfs_writer;
+			lfs_flush(fs, SEGM_CKP);
+			if(--fs->lfs_writer==0)
+				wakeup(&fs->lfs_dirops);
+		}
+	}
+
+    err2:
+	if(flags & CHECK_CKSUM)
+		free(datap, M_SEGMENT);
+    err1:
+	bp->b_flags |= B_AGE;
+	brelse(bp);
+
+	return offset;
+}
+#endif /* LFS_DO_ROLLFORWARD */
+
 /*
 * Common code for mount and mountroot
 * LFS specific
@ -330,7 +727,7 @@ lfs_mountfs(devvp, mp, p)
 	struct proc *p;
 {
 	extern struct vnode *rootvp;
-	struct dlfs *dfs, *adfs;
+	struct dlfs *tdfs, *dfs, *adfs;
 	struct lfs *fs;
 	struct ufsmount *ump;
 	struct vnode *vp;
@ -341,6 +738,11 @@ lfs_mountfs(devvp, mp, p)
 	struct ucred *cred;
 	CLEANERINFO *cip;
        SEGUSE *sup;
+#ifdef LFS_DO_ROLLFORWARD
+	int flags, dirty;
+	daddr_t offset, oldoffset, lastgoodpseg;
+	int sn, curseg;
+#endif

 	cred = p ? p->p_ucred : NOCRED;
 	/*
@ -389,6 +791,7 @@ lfs_mountfs(devvp, mp, p)
 	 * using the older of the two.  This is necessary to ensure that
 	 * the filesystem is valid if it was not unmounted cleanly.
 	 */
+
 	if (dfs->dlfs_sboffs[1] &&
 	    dfs->dlfs_sboffs[1]-(LFS_LABELPAD/size) > LFS_SBPAD/size)
 	{
@ -398,7 +801,18 @@ lfs_mountfs(devvp, mp, p)
 		adfs = (struct dlfs *)abp->b_data;

 		if (adfs->dlfs_tstamp < dfs->dlfs_tstamp) /* XXX 1s? */
-			dfs = adfs;
+			tdfs = adfs;
+		else
+			tdfs = dfs;
+
+		/* Check the basics. */
+		if (tdfs->dlfs_magic != LFS_MAGIC ||
+		    tdfs->dlfs_bsize > MAXBSIZE ||
+	    	    tdfs->dlfs_version > LFS_VERSION ||
+	    	    tdfs->dlfs_bsize < sizeof(struct dlfs)) {
+			error = EINVAL;		/* XXX needs translation */
+			goto out;
+		}
 	} else {
 		printf("lfs_mountfs: invalid alt superblock daddr=0x%x\n",
 			dfs->dlfs_sboffs[1]);
@ -408,12 +822,23 @@ lfs_mountfs(devvp, mp, p)

 	/* Allocate the mount structure, copy the superblock into it. */
 	fs = malloc(sizeof(struct lfs), M_UFSMNT, M_WAITOK);
-	memcpy(&fs->lfs_dlfs, dfs, sizeof(struct dlfs));
+	memcpy(&fs->lfs_dlfs, tdfs, sizeof(struct dlfs));
+
+#ifdef LFS_DO_ROLLFORWARD
+	/* Before rolling forward, lock so vget will sleep for other procs */
+	fs->lfs_flags = LFS_NOTYET;
+	fs->lfs_rfpid = p->p_pid;
+#else
+	fs->lfs_flags = 0;
+#endif
+
 	ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK);
 	memset((caddr_t)ump, 0, sizeof *ump);
 	ump->um_lfs = fs;
-	if (sizeof(struct lfs) < LFS_SBPAD)			/* XXX why? */
+	if (sizeof(struct lfs) < LFS_SBPAD) {			/* XXX why? */
 		bp->b_flags |= B_INVAL;
+		abp->b_flags |= B_INVAL;
+	}
 	brelse(bp);
 	bp = NULL;
 	brelse(abp);
@ -477,6 +902,97 @@ lfs_mountfs(devvp, mp, p)
 	VREF(vp);
 	vput(vp);

+#ifdef LFS_DO_ROLLFORWARD
+	/*
+	 * Roll forward.
+	 */
+	/*
+	 * Phase I:
+	 * Find the address of the last good partial segment that was written
+	 * after the checkpoint.  Mark the segments in question dirty, so
+	 * they won't be reallocated.
+	 */
+	lastgoodpseg = oldoffset = offset = fs->lfs_offset;
+	flags = 0x0;
+#ifdef DEBUG_LFS_RFW
+	printf("LFS roll forward phase 1: starting at offset 0x%x\n", offset);
+#endif
+	LFS_SEGENTRY(sup, fs, datosn(fs, offset), bp);
+	if (!(sup->su_flags & SEGUSE_DIRTY))
+		--fs->lfs_nclean;
+	sup->su_flags |= SEGUSE_DIRTY;
+	(void) VOP_BWRITE(bp);
+	while ((offset = check_segsum(fs, offset, cred, CHECK_CKSUM, &flags,
+				      p)) > 0) {
+		if(sntoda(fs, oldoffset) != sntoda(fs, offset)) {
+        		LFS_SEGENTRY(sup, fs, datosn(fs, oldoffset), bp); 
+			if (!(sup->su_flags & SEGUSE_DIRTY))
+				--fs->lfs_nclean;
+        		sup->su_flags |= SEGUSE_DIRTY;
+        		(void) VOP_BWRITE(bp);
+		}
+
+#ifdef DEBUG_LFS_RFW
+		printf("LFS roll forward phase 1: offset=0x%x\n", offset);
+		if(flags & SS_DIROP) {
+			printf("lfs_mountfs: dirops at 0x%x\n", oldoffset);
+			if(!(flags & SS_CONT))
+				printf("lfs_mountfs: dirops end at 0x%x\n",
+				       oldoffset);
+		}
+#endif
+		if(!(flags & SS_CONT))
+			lastgoodpseg = offset;
+		oldoffset = offset;
+	}
+#ifdef DEBUG_LFS_RFW
+	if (flags & SS_CONT) {
+		printf("LFS roll forward: warning: incomplete dirops discarded\n");
+	}
+	printf("LFS roll forward phase 1: completed: lastgoodpseg=0x%x\n",
+	       lastgoodpseg);
+#endif
+
+	/* Don't accidentally overwrite what we're trying to preserve */
+	offset = fs->lfs_offset;
+	fs->lfs_offset = lastgoodpseg;
+	fs->lfs_curseg = sntoda(fs, datosn(fs, fs->lfs_offset));
+	for (sn = curseg = datosn(fs, fs->lfs_curseg);;) {
+		sn = (sn + 1) % fs->lfs_nseg;
+		if (sn == curseg)
+			panic("lfs_mountfs: no clean segments");
+		LFS_SEGENTRY(sup, fs, sn, bp);
+		dirty = (sup->su_flags & SEGUSE_DIRTY);
+		brelse(bp);
+		if (!dirty)
+			break;
+	}
+	fs->lfs_nextseg = sntoda(fs, sn);
+
+	/*
+	 * Phase II: Roll forward from the first superblock.
+	 */
+	while (offset != lastgoodpseg) {
+#ifdef DEBUG_LFS_RFW
+		printf("LFS roll forward phase 2: 0x%x\n", offset);
+#endif
+		oldoffset = offset;
+		offset = check_segsum(fs, offset, cred, CHECK_UPDATE, NULL, p);
+	}
+
+	/*
+	 * Finish: flush our changes to disk.
+	 */
+	lfs_segwrite(fs->lfs_ivnode->v_mount, SEGM_CKP | SEGM_SYNC);
+
+#ifdef DEBUG_LFS_RFW
+	printf("LFS roll forward complete\n");
+#endif
+	/* Allow vget now that roll-forward is complete */
+	fs->lfs_flags &= ~(LFS_NOTYET);
+	wakeup(&fs->lfs_flags);
+#endif /* LFS_DO_ROLLFORWARD */
+
 	/*
 	 * Initialize the ifile cleaner info with information from 
 	 * the superblock.
@ -675,6 +1191,14 @@ lfs_vget(mp, ino, vpp)

 	ump = VFSTOUFS(mp);
 	dev = ump->um_dev;
+	fs = ump->um_lfs;
+
+	/*
+	 * If the filesystem is not completely mounted yet, suspend
+	 * any access requests (wait for roll-forward to complete).
+	 */
+	while((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid)
+		tsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0);

 	if ((*vpp = ufs_ihashget(dev, ino, LK_EXCLUSIVE)) != NULL)
 		return (0);
@ -692,10 +1216,10 @@ lfs_vget(mp, ino, vpp)
 	} while (lockmgr(&ufs_hashlock, LK_EXCLUSIVE|LK_SLEEPFAIL, 0));

 	/* Translate the inode number to a disk address. */
-	fs = ump->um_lfs;
 	if (ino == LFS_IFILE_INUM)
 		daddr = fs->lfs_idaddr;
 	else {
+		/* XXX bounds-check this too */
 		LFS_IENTRY(ifp, fs, ino, bp);
 		daddr = ifp->if_daddr;
 #ifdef LFS_ATIME_IFILE
@ -703,6 +1227,8 @@ lfs_vget(mp, ino, vpp)
 #endif
 		brelse(bp);
 		if (daddr == LFS_UNUSED_DADDR) {
+			*vpp = NULLVP;
+			ungetnewvnode(vp);
 			lockmgr(&ufs_hashlock, LK_RELEASE, 0);
 			return (ENOENT);
 		}