diff --git a/distrib/sets/lists/comp/mi b/distrib/sets/lists/comp/mi index 710a445dbc29..a1e9af15c227 100644 --- a/distrib/sets/lists/comp/mi +++ b/distrib/sets/lists/comp/mi @@ -1,4 +1,4 @@ -# $NetBSD: mi,v 1.1170 2008/07/29 13:17:40 pooka Exp $ +# $NetBSD: mi,v 1.1171 2008/07/31 05:38:04 simonb Exp $ # # Note: don't delete entries from here - mark them as "obsolete" instead. # @@ -2069,6 +2069,7 @@ ./usr/include/sys/vnode_if.h comp-c-include ./usr/include/sys/vsio.h comp-obsolete obsolete ./usr/include/sys/wait.h comp-c-include +./usr/include/sys/wapbl.h comp-c-include ./usr/include/sys/wdog.h comp-c-include ./usr/include/sysexits.h comp-c-include ./usr/include/syslog.h comp-c-include @@ -2100,6 +2101,7 @@ ./usr/include/ufs/ufs/quota.h comp-c-include ./usr/include/ufs/ufs/ufs_bswap.h comp-c-include ./usr/include/ufs/ufs/ufs_extern.h comp-c-include +./usr/include/ufs/ufs/ufs_wapbl.h comp-c-include ./usr/include/ufs/ufs/ufsmount.h comp-c-include ./usr/include/ulimit.h comp-c-include ./usr/include/unctrl.h comp-c-include diff --git a/include/mntopts.h b/include/mntopts.h index c30385fa2517..6dac10e8dea6 100644 --- a/include/mntopts.h +++ b/include/mntopts.h @@ -1,4 +1,4 @@ -/* $NetBSD: mntopts.h,v 1.10 2006/10/31 08:12:46 mjf Exp $ */ +/* $NetBSD: mntopts.h,v 1.11 2008/07/31 05:38:04 simonb Exp $ */ /*- * Copyright (c) 1994 @@ -55,6 +55,7 @@ struct mntopt { #define MOPT_NOATIME { "atime", 1, MNT_NOATIME, 0 } #define MOPT_SYMPERM { "symperm", 0, MNT_SYMPERM, 0 } #define MOPT_SOFTDEP { "softdep", 0, MNT_SOFTDEP, 0 } +#define MOPT_LOG { "log", 0, MNT_LOG, 0 } #define MOPT_IGNORE { "hidden", 0, MNT_IGNORE, 0 } /* Control flags. 
*/ diff --git a/sbin/fsck_ffs/Makefile b/sbin/fsck_ffs/Makefile index 57a72d416d8b..2cb2cb2dd22b 100644 --- a/sbin/fsck_ffs/Makefile +++ b/sbin/fsck_ffs/Makefile @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.35 2008/05/04 15:37:19 tsutsui Exp $ +# $NetBSD: Makefile,v 1.36 2008/07/31 05:38:04 simonb Exp $ # @(#)Makefile 8.2 (Berkeley) 4/27/95 .include @@ -19,6 +19,10 @@ SRCS+= progress.c .PATH: ${NETBSDSRCDIR}/sys/ufs/ffs ${FSCK} +SRCS+= vfs_wapbl.c wapbl.c +.PATH: ${NETBSDSRCDIR}/sys/kern +CPPFLAGS+=-DWAPBL_DEBUG_PRINT=0 + LDADD+=-lutil DPADD+=${LIBUTIL} diff --git a/sbin/fsck_ffs/extern.h b/sbin/fsck_ffs/extern.h index 0ff3e46edf18..66a0f29cba3e 100644 --- a/sbin/fsck_ffs/extern.h +++ b/sbin/fsck_ffs/extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: extern.h,v 1.22 2005/06/27 01:25:35 christos Exp $ */ +/* $NetBSD: extern.h,v 1.23 2008/07/31 05:38:04 simonb Exp $ */ /* * Copyright (c) 1994 James A. Jegers @@ -27,7 +27,7 @@ void adjust(struct inodesc *, int); daddr_t allocblk(long); ino_t allocdir(ino_t, ino_t, int); -ino_t allocino(ino_t request, int type); +ino_t allocino(ino_t, int); void blkerror(ino_t, const char *, daddr_t); int bread(int, char *, daddr_t, long); void bufinit(void); @@ -82,7 +82,12 @@ void setinodebuf(ino_t); int setup(const char *); void voidquit(int); -void swap_cg(struct cg *, struct cg *); -void copyback_cg(struct bufarea *); -void sb_oldfscompat_write(struct fs *, struct fs *); -void sb_oldfscompat_read(struct fs *, struct fs **); +void replay_wapbl(void); +void cleanup_wapbl(void); +int read_wapbl(char *, long, daddr_t); +int is_journal_inode(ino_t); + +void swap_cg(struct cg *, struct cg *); +void copyback_cg(struct bufarea *); +void sb_oldfscompat_write(struct fs *, struct fs *); +void sb_oldfscompat_read(struct fs *, struct fs **); diff --git a/sbin/fsck_ffs/fsck_ffs.8 b/sbin/fsck_ffs/fsck_ffs.8 index c241648df419..22e341e5c0a2 100644 --- a/sbin/fsck_ffs/fsck_ffs.8 +++ b/sbin/fsck_ffs/fsck_ffs.8 @@ -1,4 +1,4 @@ -.\" $NetBSD: fsck_ffs.8,v 1.40 
2005/01/19 16:41:04 wiz Exp $ +.\" $NetBSD: fsck_ffs.8,v 1.41 2008/07/31 05:38:04 simonb Exp $ .\" .\" Copyright (c) 1980, 1989, 1991, 1993 .\" The Regents of the University of California. All rights reserved. @@ -198,7 +198,7 @@ possible without user interaction. Conversion in preen mode is best used when all the file systems are being converted at once. The format of a file system can be determined from the -second line of output from +third line of output from .Xr dumpfs 8 . .It Fl d Print debugging output. diff --git a/sbin/fsck_ffs/pass4.c b/sbin/fsck_ffs/pass4.c index 9105b7d704a6..fab824ade249 100644 --- a/sbin/fsck_ffs/pass4.c +++ b/sbin/fsck_ffs/pass4.c @@ -1,4 +1,4 @@ -/* $NetBSD: pass4.c,v 1.24 2008/02/23 21:41:48 christos Exp $ */ +/* $NetBSD: pass4.c,v 1.25 2008/07/31 05:38:04 simonb Exp $ */ /* * Copyright (c) 1980, 1986, 1993 @@ -34,7 +34,7 @@ #if 0 static char sccsid[] = "@(#)pass4.c 8.4 (Berkeley) 4/28/95"; #else -__RCSID("$NetBSD: pass4.c,v 1.24 2008/02/23 21:41:48 christos Exp $"); +__RCSID("$NetBSD: pass4.c,v 1.25 2008/07/31 05:38:04 simonb Exp $"); #endif #endif /* not lint */ @@ -89,7 +89,14 @@ pass4(void) case DFOUND: n = info->ino_linkcnt; if (n) { - adjust(&idesc, (short)n); + if (is_journal_inode(inumber)) { + if (debug) + printf( + "skipping unreferenced journal inode %" PRId64 "\n", inumber); + break; + } else { + adjust(&idesc, (short)n); + } break; } for (zlnp = zlnhead; zlnp; zlnp = zlnp->next) diff --git a/sbin/fsck_ffs/setup.c b/sbin/fsck_ffs/setup.c index fdf53f633741..6a49ecb36c74 100644 --- a/sbin/fsck_ffs/setup.c +++ b/sbin/fsck_ffs/setup.c @@ -1,4 +1,4 @@ -/* $NetBSD: setup.c,v 1.82 2008/02/23 21:41:48 christos Exp $ */ +/* $NetBSD: setup.c,v 1.83 2008/07/31 05:38:04 simonb Exp $ */ /* * Copyright (c) 1980, 1986, 1993 @@ -34,7 +34,7 @@ #if 0 static char sccsid[] = "@(#)setup.c 8.10 (Berkeley) 5/9/95"; #else -__RCSID("$NetBSD: setup.c,v 1.82 2008/02/23 21:41:48 christos Exp $"); +__RCSID("$NetBSD: setup.c,v 1.83 2008/07/31 
05:38:04 simonb Exp $"); #endif #endif /* not lint */ @@ -159,6 +159,25 @@ setup(const char *dev) doskipclean = 0; pwarn("USING ALTERNATE SUPERBLOCK AT %d\n", bflag); } + if (sblock->fs_flags & FS_DOWAPBL) { + if (preen) { + if (!quiet) + pwarn("file system is journaled; not checking\n"); + return (-1); + } + if (!quiet) + pwarn("** File system is journaled; replaying journal\n"); + replay_wapbl(); + doskipclean = 0; + sblock->fs_flags &= ~FS_DOWAPBL; + sbdirty(); + /* Although we may have updated the superblock from the + * journal, we are still going to do a full check, so we + * don't bother to re-read the superblock from the journal. + * XXX, instead we could re-read the superblock and then not + * force doskipclean = 0 + */ + } if (debug) printf("clean = %d\n", sblock->fs_clean); if (doswap) @@ -218,6 +237,13 @@ setup(const char *dev) /* * Check and potentially fix certain fields in the super block. */ + if (sblock->fs_flags & ~(FS_KNOWN_FLAGS)) { + pfatal("UNKNOWN FLAGS=0x%08x IN SUPERBLOCK", sblock->fs_flags); + if (reply("CLEAR") == 1) { + sblock->fs_flags &= FS_KNOWN_FLAGS; + sbdirty(); + } + } if (sblock->fs_optim != FS_OPTTIME && sblock->fs_optim != FS_OPTSPACE) { pfatal("UNDEFINED OPTIMIZATION IN SUPERBLOCK"); if (reply("SET TO DEFAULT") == 1) { diff --git a/sbin/fsck_ffs/utilities.c b/sbin/fsck_ffs/utilities.c index 7551c20f28ab..06429296344a 100644 --- a/sbin/fsck_ffs/utilities.c +++ b/sbin/fsck_ffs/utilities.c @@ -1,4 +1,4 @@ -/* $NetBSD: utilities.c,v 1.55 2008/02/23 21:41:48 christos Exp $ */ +/* $NetBSD: utilities.c,v 1.56 2008/07/31 05:38:04 simonb Exp $ */ /* * Copyright (c) 1980, 1986, 1993 @@ -34,7 +34,7 @@ #if 0 static char sccsid[] = "@(#)utilities.c 8.6 (Berkeley) 5/19/95"; #else -__RCSID("$NetBSD: utilities.c,v 1.55 2008/02/23 21:41:48 christos Exp $"); +__RCSID("$NetBSD: utilities.c,v 1.56 2008/07/31 05:38:04 simonb Exp $"); #endif #endif /* not lint */ @@ -322,6 +322,7 @@ ckfini(void) if (debug) printf("cache missed %ld of %ld (%d%%)\n", 
diskreads, totalreads, (int)(diskreads * 100 / totalreads)); + cleanup_wapbl(); (void)close(fsreadfd); (void)close(fswritefd); } @@ -335,7 +336,8 @@ bread(int fd, char *buf, daddr_t blk, long size) offset = blk; offset *= dev_bsize; - if (pread(fd, buf, (int)size, offset) == size) + if ((pread(fd, buf, (int)size, offset) == size) && + read_wapbl(buf, size, blk) == 0) return (0); rwerror("READ", blk); errs = 0; diff --git a/sbin/fsck_ffs/wapbl.c b/sbin/fsck_ffs/wapbl.c new file mode 100644 index 000000000000..8e68b0750a87 --- /dev/null +++ b/sbin/fsck_ffs/wapbl.c @@ -0,0 +1,202 @@ +/* $NetBSD: wapbl.c,v 1.2 2008/07/31 05:38:04 simonb Exp $ */ + +/*- + * Copyright (c) 2005,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* This file contains fsck support for wapbl + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: wapbl.c,v 1.2 2008/07/31 05:38:04 simonb Exp $"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "fsck.h" +#include "fsutil.h" +#include "extern.h" +#include "exitvalues.h" + +int +wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + WAPBL_PRINTF(WAPBL_PRINT_IO, + ("wapbl_write: %zd bytes at block %"PRId64" on fd 0x%x\n", + len, pbn, fswritefd)); + bwrite(fswritefd, data, pbn, len); + return 0; +} + +int +wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + WAPBL_PRINTF(WAPBL_PRINT_IO, + ("wapbl_read: %zd bytes at block %"PRId64" on fd 0x%x\n", + len, pbn, fsreadfd)); + bread(fsreadfd, data, pbn, len); + return 0; +} + +struct wapbl_replay *wapbl_replay; + +void +replay_wapbl(void) +{ + uint64_t addr, count, blksize; + int error; + + if (debug) + wapbl_debug_print = WAPBL_PRINT_ERROR | WAPBL_PRINT_REPLAY; + if (debug > 1) + wapbl_debug_print |= WAPBL_PRINT_IO; + + if (sblock->fs_journal_version != UFS_WAPBL_VERSION) { + pfatal("INVALID JOURNAL VERSION %d", + sblock->fs_journal_version); + if (reply("CONTINUE") == 0) { + exit(FSCK_EXIT_CHECK_FAILED); + } + return; + } + + switch (sblock->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_NONE: + 
pfatal("INVALID JOURNAL LOCATION 'NONE'"); + if (reply("CONTINUE") == 0) { + exit(FSCK_EXIT_CHECK_FAILED); + } + return; + + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + addr = sblock->fs_journallocs[UFS_WAPBL_EPART_ADDR]; + count = sblock->fs_journallocs[UFS_WAPBL_EPART_COUNT]; + blksize = sblock->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; + break; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + addr = sblock->fs_journallocs[UFS_WAPBL_INFS_ADDR]; + count = sblock->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + blksize = sblock->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + break; + + default: + pfatal("INVALID JOURNAL LOCATION %d", + sblock->fs_journal_location); + if (reply("CONTINUE") == 0) { + exit(FSCK_EXIT_CHECK_FAILED); + } + return; + } + + error = wapbl_replay_start(&wapbl_replay, 0, addr, count, blksize); + if (error) { + pfatal("UNABLE TO READ JOURNAL FOR REPLAY"); + if (reply("CONTINUE") == 0) { + exit(FSCK_EXIT_CHECK_FAILED); + } + return; + } + if (!nflag) { + error = wapbl_replay_write(wapbl_replay, 0); + if (error) { + pfatal("UNABLE TO REPLAY JOURNAL BLOCKS"); + if (reply("CONTINUE") == 0) { + exit(FSCK_EXIT_CHECK_FAILED); + } + } else { + wapbl_replay_stop(wapbl_replay); + } + } + { + int i; + for (i = 0; i < wapbl_replay->wr_inodescnt; i++) { + WAPBL_PRINTF(WAPBL_PRINT_REPLAY,("wapbl_replay: " + "not cleaning inode %"PRIu32" mode %"PRIo32"\n", + wapbl_replay->wr_inodes[i].wr_inumber, + wapbl_replay->wr_inodes[i].wr_imode)); + } + } +} + +void +cleanup_wapbl(void) +{ + + if (wapbl_replay) { + if (wapbl_replay_isopen(wapbl_replay)) + wapbl_replay_stop(wapbl_replay); + wapbl_replay_free(wapbl_replay); + wapbl_replay = 0; + } +} + +int +read_wapbl(char *buf, long size, daddr_t blk) +{ + + if (!wapbl_replay || !wapbl_replay_isopen(wapbl_replay)) + return 0; + return wapbl_replay_read(wapbl_replay, buf, blk, size); +} + +int +is_journal_inode(ino_t ino) +{ + union dinode *dp; + + dp = ginode(ino); + if ((iswap32(DIP(dp, flags)) & SF_LOG) != 0 && + 
sblock->fs_journal_version == UFS_WAPBL_VERSION && + sblock->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM && + sblock->fs_journallocs[UFS_WAPBL_INFS_INO] == ino) + return 1; + + return 0; +} diff --git a/sbin/fsdb/Makefile b/sbin/fsdb/Makefile index 7a2615e1a8ed..1aa3bdd375b4 100644 --- a/sbin/fsdb/Makefile +++ b/sbin/fsdb/Makefile @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.22 2008/05/04 15:37:19 tsutsui Exp $ +# $NetBSD: Makefile,v 1.23 2008/07/31 05:38:04 simonb Exp $ # @(#)Makefile 8.1 (Berkeley) 6/5/93 .include @@ -16,6 +16,10 @@ FSCK_FFS=${NETBSDSRCDIR}/sbin/fsck_ffs CPPFLAGS+= -I${FSCK} -I${FSCK_FFS} .PATH: ${FSCK} ${FSCK_FFS} ${NETBSDSRCDIR}/sys/ufs/ffs +SRCS+= vfs_wapbl.c wapbl.c +.PATH: ${NETBSDSRCDIR}/sys/kern +CPPFLAGS+=-DWAPBL_DEBUG_PRINT=0 + LDADD+= -lutil -ledit -ltermcap .ifndef HOSTPROG DPADD+= ${LIBUTIL} ${LIBEDIT} ${LIBTERMCAP} diff --git a/sbin/mount_ffs/mount_ffs.c b/sbin/mount_ffs/mount_ffs.c index 9c46df364ba4..d6a23166b95d 100644 --- a/sbin/mount_ffs/mount_ffs.c +++ b/sbin/mount_ffs/mount_ffs.c @@ -1,4 +1,4 @@ -/* $NetBSD: mount_ffs.c,v 1.23 2008/07/20 01:20:22 lukem Exp $ */ +/* $NetBSD: mount_ffs.c,v 1.24 2008/07/31 05:38:04 simonb Exp $ */ /*- * Copyright (c) 1993, 1994 @@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1993, 1994\ #if 0 static char sccsid[] = "@(#)mount_ufs.c 8.4 (Berkeley) 4/26/95"; #else -__RCSID("$NetBSD: mount_ffs.c,v 1.23 2008/07/20 01:20:22 lukem Exp $"); +__RCSID("$NetBSD: mount_ffs.c,v 1.24 2008/07/31 05:38:04 simonb Exp $"); #endif #endif /* not lint */ @@ -70,6 +70,7 @@ static const struct mntopt mopts[] = { MOPT_NODEVMTIME, MOPT_FORCE, MOPT_SOFTDEP, + MOPT_LOG, MOPT_GETARGS, MOPT_NULL, }; diff --git a/sbin/tunefs/tunefs.8 b/sbin/tunefs/tunefs.8 index df5095e835dd..0aac0e340f5b 100644 --- a/sbin/tunefs/tunefs.8 +++ b/sbin/tunefs/tunefs.8 @@ -1,4 +1,4 @@ -.\" $NetBSD: tunefs.8,v 1.36 2004/12/20 10:28:47 hubertf Exp $ +.\" $NetBSD: tunefs.8,v 1.37 2008/07/31 05:38:04 simonb Exp $ .\" .\" Copyright (c) 
1983, 1991, 1993 .\" The Regents of the University of California. All rights reserved. @@ -41,6 +41,7 @@ .Op Fl e Ar maxbpg .Op Fl g Ar avgfilesize .Op Fl h Ar avgfpdir +.Op Fl l Ar logsize .Op Fl m Ar minfree .Bk -words .\" .Op Fl n Ar soft_dependency_enabling @@ -97,6 +98,13 @@ this parameter should be set higher. This specifies the expected average file size. .It Fl h Ar avgfpdir This specifies the expected number of files per directory. +.It Fl l Ar logsize +This value specifies the size of the in-filesystem journaling log file. +The default journaling log file size is described in +.Xr wapbl 4 . +Specifying a size of zero will cause the in-filesystem journaling log file +to be removed the next time the filesystem is mounted. +The size of an existing in-filesystem journaling log file can not be changed. .It Fl m Ar minfree This value specifies the percentage of space held back from normal users; the minimum free space threshold. @@ -145,6 +153,7 @@ or .Li time . .El .Sh SEE ALSO +.Xr wapbl 4 , .Xr fs 5 , .Xr dumpfs 8 , .Xr fsck_ffs 8 , diff --git a/sbin/tunefs/tunefs.c b/sbin/tunefs/tunefs.c index b4ac66d9cfcb..ca7c14daa42d 100644 --- a/sbin/tunefs/tunefs.c +++ b/sbin/tunefs/tunefs.c @@ -1,4 +1,4 @@ -/* $NetBSD: tunefs.c,v 1.34 2008/07/20 01:20:23 lukem Exp $ */ +/* $NetBSD: tunefs.c,v 1.35 2008/07/31 05:38:04 simonb Exp $ */ /* * Copyright (c) 1983, 1993 @@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1983, 1993\ #if 0 static char sccsid[] = "@(#)tunefs.c 8.3 (Berkeley) 5/3/95"; #else -__RCSID("$NetBSD: tunefs.c,v 1.34 2008/07/20 01:20:23 lukem Exp $"); +__RCSID("$NetBSD: tunefs.c,v 1.35 2008/07/31 05:38:04 simonb Exp $"); #endif #endif /* not lint */ @@ -48,9 +48,9 @@ __RCSID("$NetBSD: tunefs.c,v 1.34 2008/07/20 01:20:23 lukem Exp $"); */ #include -#include #include #include +#include #include @@ -85,15 +85,16 @@ static off_t sblock_try[] = SBLOCKSEARCH; static void bwrite(daddr_t, char *, int, const char *); static void bread(daddr_t, char *, int, const 
char *); -static int getnum(const char *, const char *, int, int); +static void change_log_info(long long); static void getsb(struct fs *, const char *); static int openpartition(const char *, int, char *, size_t); +static void show_log_info(void); static void usage(void); int main(int argc, char *argv[]) { -#define OPTSTRINGBASE "AFNe:g:h:m:o:" +#define OPTSTRINGBASE "AFNe:g:h:l:m:o:" #ifdef TUNEFS_SOFTDEP int softdep; #define OPTSTRING OPTSTRINGBASE ## "n:" @@ -105,10 +106,12 @@ main(int argc, char *argv[]) char device[MAXPATHLEN]; int maxbpg, minfree, optim; int avgfilesize, avgfpdir; + long long logfilesize; Aflag = Fflag = Nflag = 0; maxbpg = minfree = optim = -1; avgfilesize = avgfpdir = -1; + logfilesize = -1; #ifdef TUNEFS_SOFTDEP softdep = -1; #endif @@ -131,25 +134,30 @@ main(int argc, char *argv[]) break; case 'e': - maxbpg = getnum(optarg, + maxbpg = strsuftoll( "maximum blocks per file in a cylinder group", - 1, INT_MAX); + optarg, 1, INT_MAX); break; case 'g': - avgfilesize = getnum(optarg, - "average file size", 1, INT_MAX); - break; - - case 'h': - avgfpdir = getnum(optarg, - "expected number of files per directory", + avgfilesize = strsuftoll("average file size", optarg, 1, INT_MAX); break; + case 'h': + avgfpdir = strsuftoll( + "expected number of files per directory", + optarg, 1, INT_MAX); + break; + + case 'l': + logfilesize = strsuftoll("journal log file size", + optarg, 0, INT_MAX); + break; + case 'm': - minfree = getnum(optarg, - "minimum percentage of free space", 0, 99); + minfree = strsuftoll("minimum percentage of free space", + optarg, 0, 99); break; #ifdef TUNEFS_SOFTDEP @@ -254,6 +262,9 @@ main(int argc, char *argv[]) CHANGEVAL(sblock.fs_avgfpdir, avgfpdir, "expected number of files per directory", ""); + if (logfilesize >= 0) + change_log_info(logfilesize); + if (Nflag) { fprintf(stdout, "tunefs: current settings of %s\n", special); fprintf(stdout, "\tmaximum contiguous block count %d\n", @@ -274,6 +285,7 @@ main(int argc, char 
*argv[]) fprintf(stdout, "\texpected number of files per directory: %d\n", sblock.fs_avgfpdir); + show_log_info(); fprintf(stdout, "tunefs: no changes made\n"); exit(0); } @@ -290,20 +302,123 @@ main(int argc, char *argv[]) exit(0); } -static int -getnum(const char *num, const char *desc, int min, int max) +static void +show_log_info(void) { - long n; - char *ep; + const char *loc; + uint64_t size, blksize; + int print; - n = strtol(num, &ep, 10); - if (ep[0] != '\0') - errx(1, "Invalid number `%s' for %s", num, desc); - if ((int) n < min) - errx(1, "%s `%s' too small (minimum is %d)", desc, num, min); - if ((int) n > max) - errx(1, "%s `%s' too large (maximum is %d)", desc, num, max); - return ((int)n); + switch (sblock.fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_NONE: + print = blksize = 0; + /* nothing */ + break; + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + loc = "end of partition"; + size = sblock.fs_journallocs[UFS_WAPBL_EPART_COUNT]; + blksize = sblock.fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; + print = 1; + break; + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + loc = "in filesystem"; + size = sblock.fs_journallocs[UFS_WAPBL_INFS_COUNT]; + blksize = sblock.fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + print = 1; + break; + default: + loc = "unknown"; + size = blksize = 0; + print = 1; + break; + } + + if (print) { + fprintf(stdout, "\tjournal log file location: %s\n", loc); + fprintf(stdout, "\tjournal log file size: %" PRIu64 "\n", + size * blksize); + fprintf(stdout, "\tjournal log flags:"); + if (sblock.fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG) + fprintf(stdout, " create-log"); + if (sblock.fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) + fprintf(stdout, " clear-log"); + fprintf(stdout, "\n"); + } +} + +static void +change_log_info(long long logfilesize) +{ + /* + * NOTES: + * - only operate on in-filesystem log sizes + * - can't change size of existing log + * - if current is same, no action + * - if current is zero and new is non-zero, set flag to create
log + * on next mount + * - if current is non-zero and new is zero, set flag to clear log + * on next mount + */ + int in_fs_log; + uint64_t old_size; + + old_size = 0; + switch (sblock.fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + in_fs_log = 0; + old_size = sblock.fs_journallocs[UFS_WAPBL_EPART_COUNT] * + sblock.fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; + break; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + in_fs_log = 1; + old_size = sblock.fs_journallocs[UFS_WAPBL_INFS_COUNT] * + sblock.fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + break; + + case UFS_WAPBL_JOURNALLOC_NONE: + default: + in_fs_log = 0; + old_size = 0; + break; + } + + if (!in_fs_log && old_size > 0) /* old_size == 0 means no log yet: fall through so one can be created */ + errx(1, "Can't change size of non-in-filesystem log"); + + if (old_size == logfilesize && logfilesize > 0) { + /* no action */ + warnx("log file size remains unchanged at %lld", logfilesize); + return; + } + + if (logfilesize == 0) { + /* + * Don't clear out the locators - the kernel might need + * these to find the log! Just set the "clear the log" + * flag and let the kernel do the rest.
+ */ + sblock.fs_journal_flags |= UFS_WAPBL_FLAGS_CLEAR_LOG; + sblock.fs_journal_flags &= ~UFS_WAPBL_FLAGS_CREATE_LOG; + warnx("log file size cleared from %" PRIu64 "", old_size); + return; + } + + if (old_size == 0) { + /* create new log of desired size next mount */ + sblock.fs_journal_location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM; + sblock.fs_journallocs[UFS_WAPBL_INFS_ADDR] = 0; + sblock.fs_journallocs[UFS_WAPBL_INFS_COUNT] = logfilesize; + sblock.fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = 0; + sblock.fs_journallocs[UFS_WAPBL_INFS_INO] = 0; + sblock.fs_journal_flags |= UFS_WAPBL_FLAGS_CREATE_LOG; + sblock.fs_journal_flags &= ~UFS_WAPBL_FLAGS_CLEAR_LOG; + warnx("log file size set to %lld", logfilesize); + } else { + errx(1, + "Can't change existing log size from %" PRIu64 " to %lld", + old_size, logfilesize); + } } static void @@ -315,6 +430,7 @@ usage(void) fprintf(stderr, "\t-e maximum blocks per file in a cylinder group\n"); fprintf(stderr, "\t-g average file size\n"); fprintf(stderr, "\t-h expected number of files per directory\n"); + fprintf(stderr, "\t-l journal log file size (`0' to clear journal)\n"); fprintf(stderr, "\t-m minimum percentage of free space\n"); #ifdef TUNEFS_SOFTDEP fprintf(stderr, "\t-n soft dependencies (`enable' or `disable')\n"); diff --git a/sys/conf/files b/sys/conf/files index 97a6135f38af..cad99941baf5 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $NetBSD: files,v 1.910 2008/07/16 20:06:19 pooka Exp $ +# $NetBSD: files,v 1.911 2008/07/31 05:38:04 simonb Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -110,6 +110,10 @@ defflag opt_fileassoc.h FILEASSOC defflag opt_gre.h GRE_DEBUG +# Write Ahead Physical Block Logging +defflag opt_wapbl.h WAPBL WAPBL_DEBUG +defparam opt_wapbl.h WAPBL_DEBUG_PRINT + # compatibility options # defflag opt_compat_netbsd.h COMPAT_40 @@ -1475,6 +1479,7 @@ file kern/vfs_subr.c file kern/vfs_syscalls.c file kern/vfs_trans.c file kern/vfs_vnops.c +file kern/vfs_wapbl.c wapbl file 
kern/vfs_xattr.c file kern/vnode_if.c file miscfs/deadfs/dead_vnops.c diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 14881a98588a..bd35e0f2b180 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,4 +1,4 @@ -/* $NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $ */ +/* $NetBSD: init_main.c,v 1.361 2008/07/31 05:38:05 simonb Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. @@ -97,7 +97,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $"); +__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.361 2008/07/31 05:38:05 simonb Exp $"); #include "opt_ipsec.h" #include "opt_ntp.h" @@ -108,6 +108,7 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $") #include "opt_fileassoc.h" #include "opt_ktrace.h" #include "opt_pax.h" +#include "opt_wapbl.h" #include "rnd.h" #include "sysmon_envsys.h" @@ -192,6 +193,9 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.360 2008/06/18 09:06:27 yamt Exp $") #include #endif #include +#ifdef WAPBL +#include +#endif #include #include @@ -570,6 +574,11 @@ main(void) /* Initialize the UUID system calls. */ uuid_init(); +#ifdef WAPBL + /* Initialize write-ahead physical block logging. */ + wapbl_init(); +#endif + /* * Create process 1 (init(8)). We do this now, as Unix has * historically had init be process 1, and changing this would diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index c428304cd8b8..d79f3029f276 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_bio.c,v 1.207 2008/07/14 16:22:42 hannken Exp $ */ +/* $NetBSD: vfs_bio.c,v 1.208 2008/07/31 05:38:05 simonb Exp $ */ /*- * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc. @@ -6,6 +6,8 @@ * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -107,7 +109,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.207 2008/07/14 16:22:42 hannken Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.208 2008/07/31 05:38:05 simonb Exp $"); #include "fs_ffs.h" #include "opt_bufcache.h" @@ -126,6 +128,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.207 2008/07/14 16:22:42 hannken Exp $" #include #include #include +#include #include @@ -714,8 +717,23 @@ bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred, /* Wait for the read to complete, and return result. */ error = biowait(bp); - if (error == 0 && (flags & B_MODIFY) != 0) + if (error == 0 && (flags & B_MODIFY) != 0) /* XXXX before the next code block or after? */ error = fscow_run(bp, true); + + if (!error) { + struct mount *mp = wapbl_vptomp(vp); + + if (mp && mp->mnt_wapbl_replay && + WAPBL_REPLAY_ISOPEN(mp)) { + error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, + bp->b_bcount); + if (error) { + mutex_enter(&bufcache_lock); + SET(bp->b_cflags, BC_INVAL); + mutex_exit(&bufcache_lock); + } + } + } return error; } @@ -793,6 +811,13 @@ bwrite(buf_t *bp) mp = NULL; } + if (mp && mp->mnt_wapbl) { + if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { + bdwrite(bp); + return 0; + } + } + /* * Remember buffer type, to switch on it later. 
If the write was * synchronous, but the file system was mounted with MNT_ASYNC, @@ -897,6 +922,14 @@ bdwrite(buf_t *bp) return; } + if (wapbl_vphaswapbl(bp->b_vp)) { + struct mount *mp = wapbl_vptomp(bp->b_vp); + + if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { + WAPBL_ADD_BUF(mp, bp); + } + } + /* * If the block hasn't been seen before: * (1) Mark it as having been seen, @@ -1028,6 +1061,16 @@ brelsel(buf_t *bp, int set) if (bioopsp != NULL) (*bioopsp->io_deallocate)(bp); + if (ISSET(bp->b_flags, B_LOCKED)) { + if (wapbl_vphaswapbl(vp = bp->b_vp)) { + struct mount *mp = wapbl_vptomp(vp); + + KASSERT(bp->b_iodone + != mp->mnt_wapbl_op->wo_wapbl_biodone); + WAPBL_REMOVE_BUF(mp, bp); + } + } + mutex_enter(bp->b_objlock); CLR(bp->b_oflags, BO_DONE|BO_DELWRI); if ((vp = bp->b_vp) != NULL) { @@ -1224,19 +1267,22 @@ geteblk(int size) int allocbuf(buf_t *bp, int size, int preserve) { - vsize_t oldsize, desired_size; void *addr; + vsize_t oldsize, desired_size; + int oldcount; int delta; desired_size = buf_roundsize(size); if (desired_size > MAXBSIZE) printf("allocbuf: buffer larger than MAXBSIZE requested"); + oldcount = bp->b_bcount; + bp->b_bcount = size; oldsize = bp->b_bufsize; if (oldsize == desired_size) - return 0; + goto out; /* * If we want a buffer of a different size, re-allocate the @@ -1274,6 +1320,11 @@ allocbuf(buf_t *bp, int size, int preserve) } } mutex_exit(&bufcache_lock); + + out: + if (wapbl_vphaswapbl(bp->b_vp)) + WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); + return 0; } diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index 3bfc11300dd2..055c11af7f8d 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_lookup.c,v 1.108 2008/05/06 18:43:44 ad Exp $ */ +/* $NetBSD: vfs_lookup.c,v 1.109 2008/07/31 05:38:05 simonb Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -37,7 +37,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.108 2008/05/06 18:43:44 ad Exp 
$"); +__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.109 2008/07/31 05:38:05 simonb Exp $"); #include "opt_magiclinks.h" @@ -956,8 +956,10 @@ relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) if (cnp->cn_nameptr[0] == '\0') panic("relookup: null name"); +#ifdef ohcrap if (cnp->cn_flags & ISDOTDOT) panic("relookup: lookup on dot-dot"); +#endif /* * We now have a segment name to search for, and a directory to search. diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 17bb5ee4d51d..3ad7b9a616a3 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_subr.c,v 1.354 2008/07/27 15:08:37 pooka Exp $ */ +/* $NetBSD: vfs_subr.c,v 1.355 2008/07/31 05:38:05 simonb Exp $ */ /*- * Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc. @@ -81,7 +81,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.354 2008/07/27 15:08:37 pooka Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.355 2008/07/31 05:38:05 simonb Exp $"); #include "opt_ddb.h" #include "opt_compat_netbsd.h" @@ -106,6 +106,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.354 2008/07/27 15:08:37 pooka Exp $") #include #include #include +#include #include #include @@ -1804,8 +1805,13 @@ vclean(vnode_t *vp, int flags) */ if (flags & DOCLOSE) { error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); - if (error != 0) + if (error != 0) { + /* XXX, fix vn_start_write's grab of mp and use that. 
*/ + + if (wapbl_vphaswapbl(vp)) + WAPBL_DISCARD(wapbl_vptomp(vp)); error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); + } KASSERT(error == 0); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index f6be934a7487..b48faf864f29 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_syscalls.c,v 1.369 2008/06/24 11:21:46 ad Exp $ */ +/* $NetBSD: vfs_syscalls.c,v 1.370 2008/07/31 05:38:05 simonb Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. @@ -63,7 +63,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.369 2008/06/24 11:21:46 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls.c,v 1.370 2008/07/31 05:38:05 simonb Exp $"); #include "opt_compat_netbsd.h" #include "opt_compat_43.h" @@ -208,12 +208,13 @@ mount_update(struct lwp *l, struct vnode *vp, const char *path, int flags, mp->mnt_flag &= ~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP | - MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP); + MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP | + MNT_LOG); mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP | MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP | - MNT_IGNORE); + MNT_LOG | MNT_IGNORE); error = VFS_MOUNT(mp, path, data, data_len); @@ -367,7 +368,7 @@ mount_domount(struct lwp *l, struct vnode **vpp, struct vfsops *vfsops, (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP | MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP | - MNT_IGNORE | MNT_RDONLY); + MNT_LOG | MNT_IGNORE | MNT_RDONLY); error = VFS_MOUNT(mp, path, data, data_len); mp->mnt_flag &= ~MNT_OP_FLAGS; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 900ddbbdf813..e168d2427c50 100644 --- 
a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,4 +1,4 @@ -/* $NetBSD: vfs_vnops.c,v 1.158 2008/06/02 16:08:41 ad Exp $ */ +/* $NetBSD: vfs_vnops.c,v 1.159 2008/07/31 05:38:05 simonb Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -37,7 +37,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.158 2008/06/02 16:08:41 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.159 2008/07/31 05:38:05 simonb Exp $"); #include "fs_union.h" #include "veriexec.h" @@ -61,6 +61,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.158 2008/06/02 16:08:41 ad Exp $"); #include #include #include +#include #include @@ -692,6 +693,11 @@ vn_lock(struct vnode *vp, int flags) LK_CANRECURSE)) == 0); +#ifdef DIAGNOSTIC + if (wapbl_vphaswapbl(vp)) + WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp)); +#endif + do { /* * XXX PR 37706 forced unmount of file systems is unsafe. diff --git a/sys/kern/vfs_wapbl.c b/sys/kern/vfs_wapbl.c new file mode 100644 index 000000000000..c9792cd17283 --- /dev/null +++ b/sys/kern/vfs_wapbl.c @@ -0,0 +1,2783 @@ +/* $NetBSD: vfs_wapbl.c,v 1.2 2008/07/31 05:38:05 simonb Exp $ */ + +/*- + * Copyright (c) 2003,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This implements file system independent write ahead filesystem logging. + */ +#include +__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.2 2008/07/31 05:38:05 simonb Exp $"); + +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if WAPBL_UVM_ALLOC +#include +#endif + +#include + +MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging"); +#define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK) +#define wapbl_free(a) free((a), M_WAPBL) +#define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO) + +#else /* !_KERNEL */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KDASSERT(x) assert(x) +#define KASSERT(x) assert(x) +#define wapbl_malloc(s) malloc(s) +#define wapbl_free(a) free(a) +#define wapbl_calloc(n, s) calloc((n), (s)) + +#endif /* !_KERNEL */ + +/* + * INTERNAL DATA STRUCTURES + */ + +/* + * This structure holds per-mount log information. 
+ * + * Legend: a = atomic access only + * r = read-only after init + * l = rwlock held + * m = mutex held + * u = unlocked access ok + * b = bufcache_lock held + */ +struct wapbl { + struct vnode *wl_logvp; /* r: log here */ + struct vnode *wl_devvp; /* r: log on this device */ + struct mount *wl_mount; /* r: mountpoint wl is associated with */ + daddr_t wl_logpbn; /* r: Physical block number of start of log */ + int wl_log_dev_bshift; /* r: logarithm of device block size of log + device */ + int wl_fs_dev_bshift; /* r: logarithm of device block size of + filesystem device */ + + unsigned wl_lock_count; /* a: Count of transactions in progress */ + + size_t wl_circ_size; /* r: Number of bytes in buffer of log */ + size_t wl_circ_off; /* r: Number of bytes reserved at start */ + + size_t wl_bufcount_max; /* r: Number of buffers reserved for log */ + size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */ + + off_t wl_head; /* l: Byte offset of log head */ + off_t wl_tail; /* l: Byte offset of log tail */ + /* + * head == tail == 0 means log is empty + * head == tail != 0 means log is full + * see assertions in wapbl_advance() for other boundary conditions. + * only truncate moves the tail, except when flush sets it to + * wl_header_size only flush moves the head, except when truncate + * sets it to 0. + */ + + struct wapbl_wc_header *wl_wc_header; /* l */ + void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */ + + kmutex_t wl_mtx; /* u: short-term lock */ + krwlock_t wl_rwlock; /* u: File system transaction lock */ + + /* + * Must be held while accessing + * wl_count or wl_bufs or head or tail + */ + + /* + * Callback called from within the flush routine to flush any extra + * bits. Note that flush may be skipped without calling this if + * there are no outstanding buffers in the transaction. 
+ */ + wapbl_flush_fn_t wl_flush; /* r */ + wapbl_flush_fn_t wl_flush_abort;/* r */ + + size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ + size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ + size_t wl_bcount; /* m: Total bcount of wl_bufs */ + + LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ + + kcondvar_t wl_reclaimable_cv; /* m (obviously) */ + size_t wl_reclaimable_bytes; /* m: Amount of space available for + reclamation by truncate */ + int wl_error_count; /* m: # of wl_entries with errors */ + size_t wl_reserved_bytes; /* never truncate log smaller than this */ + +#ifdef WAPBL_DEBUG_BUFBYTES + size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ +#endif + + daddr_t *wl_deallocblks;/* l: address of block */ + int *wl_dealloclens; /* l: size of block (fragments, kom ihåg) */ + int wl_dealloccnt; /* l: total count */ + int wl_dealloclim; /* l: max count */ + + /* hashtable of inode numbers for allocated but unlinked inodes */ + /* synch ??? 
*/ + LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash; + u_long wl_inohashmask; + int wl_inohashcnt; + + SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction + accounting */ +}; + +#ifdef WAPBL_DEBUG_PRINT +int wapbl_debug_print = WAPBL_DEBUG_PRINT; +#endif + +/****************************************************************/ +#ifdef _KERNEL + +#ifdef WAPBL_DEBUG +struct wapbl *wapbl_debug_wl; +#endif + +static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); +static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); +static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); +static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); +#endif /* _KERNEL */ + +static int wapbl_replay_prescan(struct wapbl_replay *wr); +static int wapbl_replay_get_inodes(struct wapbl_replay *wr); + +static __inline size_t wapbl_space_free(size_t avail, off_t head, + off_t tail); +static __inline size_t wapbl_space_used(size_t avail, off_t head, + off_t tail); + +#ifdef _KERNEL + +#define WAPBL_INODETRK_SIZE 83 +static int wapbl_ino_pool_refcount; +static struct pool wapbl_ino_pool; +struct wapbl_ino { + LIST_ENTRY(wapbl_ino) wi_hash; + ino_t wi_ino; + mode_t wi_mode; +}; + +static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); +static void wapbl_inodetrk_free(struct wapbl *wl); +static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); + +static size_t wapbl_transaction_len(struct wapbl *wl); +static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); + +/* + * This is useful for debugging. If set, the log will + * only be truncated when necessary. 
+ */ +int wapbl_lazy_truncate = 0; + +struct wapbl_ops wapbl_ops = { + .wo_wapbl_discard = wapbl_discard, + .wo_wapbl_replay_isopen = wapbl_replay_isopen1, + .wo_wapbl_replay_read = wapbl_replay_read, + .wo_wapbl_add_buf = wapbl_add_buf, + .wo_wapbl_remove_buf = wapbl_remove_buf, + .wo_wapbl_resize_buf = wapbl_resize_buf, + .wo_wapbl_begin = wapbl_begin, + .wo_wapbl_end = wapbl_end, + .wo_wapbl_junlock_assert= wapbl_junlock_assert, + + /* XXX: the following is only used to say "this is a wapbl buf" */ + .wo_wapbl_biodone = wapbl_biodone, +}; + +void +wapbl_init() +{ + + malloc_type_attach(M_WAPBL); +} + +int +wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, + daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, + wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) +{ + struct wapbl *wl; + struct vnode *devvp; + daddr_t logpbn; + int error; + int log_dev_bshift = DEV_BSHIFT; + int fs_dev_bshift = DEV_BSHIFT; + int run; + + WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 + " count=%zu blksize=%zu\n", vp, off, count, blksize)); + + if (log_dev_bshift > fs_dev_bshift) { + WAPBL_PRINTF(WAPBL_PRINT_OPEN, + ("wapbl: log device's block size cannot be larger " + "than filesystem's\n")); + /* + * Not currently implemented, although it could be if + * needed someday. + */ + return ENOSYS; + } + + if (off < 0) + return EINVAL; + + if (blksize < DEV_BSIZE) + return EINVAL; + if (blksize % DEV_BSIZE) + return EINVAL; + + /* XXXTODO: verify that the full load is writable */ + + /* + * XXX check for minimum log size + * minimum is governed by minimum amount of space + * to complete a transaction. 
(probably truncate) + */ + /* XXX for now pick something minimal */ + if ((count * blksize) < MAXPHYS) { + return ENOSPC; + } + + if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { + return error; + } + + wl = wapbl_calloc(1, sizeof(*wl)); + rw_init(&wl->wl_rwlock); + mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); + cv_init(&wl->wl_reclaimable_cv, "wapblrec"); + LIST_INIT(&wl->wl_bufs); + SIMPLEQ_INIT(&wl->wl_entries); + + wl->wl_logvp = vp; + wl->wl_devvp = devvp; + wl->wl_mount = mp; + wl->wl_logpbn = logpbn; + wl->wl_log_dev_bshift = log_dev_bshift; + wl->wl_fs_dev_bshift = fs_dev_bshift; + + wl->wl_flush = flushfn; + wl->wl_flush_abort = flushabortfn; + + /* Reserve two log device blocks for the commit headers */ + wl->wl_circ_off = 2<wl_log_dev_bshift; + wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); + /* truncate the log usage to a multiple of log_dev_bshift */ + wl->wl_circ_size >>= wl->wl_log_dev_bshift; + wl->wl_circ_size <<= wl->wl_log_dev_bshift; + + /* + * wl_bufbytes_max limits the size of the in memory transaction space. + * - Since buffers are allocated and accounted for in units of + * PAGE_SIZE it is required to be a multiple of PAGE_SIZE + * (i.e. 1<wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); + + /* Round wl_bufbytes_max to the largest power of two constraint */ + wl->wl_bufbytes_max >>= PAGE_SHIFT; + wl->wl_bufbytes_max <<= PAGE_SHIFT; + wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; + wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; + wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; + wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; + + /* XXX maybe use filesystem fragment size instead of 1024 */ + /* XXX fix actual number of buffers reserved per filesystem. 
*/ + wl->wl_bufcount_max = (nbuf / 2) * 1024; + + /* XXX tie this into resource estimation */ + wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max); + +#if WAPBL_UVM_ALLOC + wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map, + round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim)); + KASSERT(wl->wl_deallocblks != NULL); + wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map, + round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim)); + KASSERT(wl->wl_dealloclens != NULL); +#else + wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) * + wl->wl_dealloclim); + wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) * + wl->wl_dealloclim); +#endif + + wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); + + /* Initialize the commit header */ + { + struct wapbl_wc_header *wc; + size_t len = 1<wl_log_dev_bshift; + wc = wapbl_calloc(1, len); + wc->wc_type = WAPBL_WC_HEADER; + wc->wc_len = len; + wc->wc_circ_off = wl->wl_circ_off; + wc->wc_circ_size = wl->wl_circ_size; + /* XXX wc->wc_fsid */ + wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; + wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; + wl->wl_wc_header = wc; + wl->wl_wc_scratch = wapbl_malloc(len); + } + + /* + * if there was an existing set of unlinked but + * allocated inodes, preserve it in the new + * log. + */ + if (wr && wr->wr_inodescnt) { + int i; + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, + ("wapbl_start: reusing log with %d inodes\n", + wr->wr_inodescnt)); + + /* + * Its only valid to reuse the replay log if its + * the same as the new log we just opened. 
+ */ + KDASSERT(!wapbl_replay_isopen(wr)); + KASSERT(devvp->v_rdev == wr->wr_devvp->v_rdev); + KASSERT(logpbn == wr->wr_logpbn); + KASSERT(wl->wl_circ_size == wr->wr_wc_header.wc_circ_size); + KASSERT(wl->wl_circ_off == wr->wr_wc_header.wc_circ_off); + KASSERT(wl->wl_log_dev_bshift == + wr->wr_wc_header.wc_log_dev_bshift); + KASSERT(wl->wl_fs_dev_bshift == + wr->wr_wc_header.wc_fs_dev_bshift); + + wl->wl_wc_header->wc_generation = + wr->wr_wc_header.wc_generation + 1; + + for (i = 0; i < wr->wr_inodescnt; i++) + wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, + wr->wr_inodes[i].wr_imode); + + /* Make sure new transaction won't overwrite old inodes list */ + KDASSERT(wapbl_transaction_len(wl) <= + wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, + wr->wr_inodestail)); + + wl->wl_head = wl->wl_tail = wr->wr_inodeshead; + wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = + wapbl_transaction_len(wl); + + error = wapbl_write_inodes(wl, &wl->wl_head); + if (error) + goto errout; + + KASSERT(wl->wl_head != wl->wl_tail); + KASSERT(wl->wl_head != 0); + } + + error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); + if (error) { + goto errout; + } + + *wlp = wl; +#if defined(WAPBL_DEBUG) + wapbl_debug_wl = wl; +#endif + + return 0; + errout: + wapbl_discard(wl); + wapbl_free(wl->wl_wc_scratch); + wapbl_free(wl->wl_wc_header); +#if WAPBL_UVM_ALLOC + /* + * The size freed must equal the size allocated earlier in this + * function: sizeof(element) * count. sizeof(a * b) would only + * yield the size of a single integer, i.e. one page after rounding. + */ + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, + round_page(sizeof(*wl->wl_deallocblks) * + wl->wl_dealloclim)); + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, + round_page(sizeof(*wl->wl_dealloclens) * + wl->wl_dealloclim)); +#else + wapbl_free(wl->wl_deallocblks); + wapbl_free(wl->wl_dealloclens); +#endif + wapbl_inodetrk_free(wl); + wapbl_free(wl); + + return error; +} + +/* + * Like wapbl_flush, only discards the transaction + * completely + */ + +void +wapbl_discard(struct wapbl *wl) +{ + struct wapbl_entry *we; + struct buf *bp; + int i; + + /* + * XXX we may consider using upgrade
here + * if we want to call flush from inside a transaction + */ + rw_enter(&wl->wl_rwlock, RW_WRITER); + wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, + wl->wl_dealloccnt); + +#ifdef WAPBL_DEBUG_PRINT + { + struct wapbl_entry *we; + pid_t pid = -1; + lwpid_t lid = -1; + if (curproc) + pid = curproc->p_pid; + if (curlwp) + lid = curlwp->l_lid; +#ifdef WAPBL_DEBUG_BUFBYTES + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("wapbl_discard: thread %d.%d discarding " + "transaction\n" + "\tbufcount=%zu bufbytes=%zu bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %u, reclaimable=%zu reserved=%zu " + "unsynced=%zu\n", + pid, lid, wl->wl_bufcount, wl->wl_bufbytes, + wl->wl_bcount, wl->wl_dealloccnt, + wl->wl_inohashcnt, wl->wl_error_count, + wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, + wl->wl_unsynced_bufbytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d, unsynced = %zu\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error, we->we_unsynced_bufbytes)); + } +#else /* !WAPBL_DEBUG_BUFBYTES */ + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("wapbl_discard: thread %d.%d discarding transaction\n" + "\tbufcount=%zu bufbytes=%zu bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", + pid, lid, wl->wl_bufcount, wl->wl_bufbytes, + wl->wl_bcount, wl->wl_dealloccnt, + wl->wl_inohashcnt, wl->wl_error_count, + wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error)); + } +#endif /* !WAPBL_DEBUG_BUFBYTES */ + } +#endif /* WAPBL_DEBUG_PRINT */ + + for (i = 0; i <= wl->wl_inohashmask; i++) { + struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + + wih = &wl->wl_inohash[i]; + while ((wi = LIST_FIRST(wih)) != 
NULL) { + LIST_REMOVE(wi, wi_hash); + pool_put(&wapbl_ino_pool, wi); + KASSERT(wl->wl_inohashcnt > 0); + wl->wl_inohashcnt--; + } + } + + /* + * clean buffer list + */ + mutex_enter(&bufcache_lock); + mutex_enter(&wl->wl_mtx); + while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { + if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { + /* + * The buffer will be unlocked and + * removed from the transaction in brelse + */ + mutex_exit(&wl->wl_mtx); + brelsel(bp, 0); + mutex_enter(&wl->wl_mtx); + } + } + mutex_exit(&wl->wl_mtx); + mutex_exit(&bufcache_lock); + + /* + * Remove references to this wl from wl_entries, free any which + * no longer have buffers, others will be freed in wapbl_biodone + * when they no longer have any buffers. + */ + while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { + SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); + /* XXX should we be accumulating wl_error_count + * and increasing reclaimable bytes ? */ + we->we_wapbl = NULL; + if (we->we_bufcount == 0) { +#ifdef WAPBL_DEBUG_BUFBYTES + KASSERT(we->we_unsynced_bufbytes == 0); +#endif + wapbl_free(we); + } + } + + /* Discard list of deallocs */ + wl->wl_dealloccnt = 0; + /* XXX should we clear wl_reserved_bytes? 
*/ + + KASSERT(wl->wl_bufbytes == 0); + KASSERT(wl->wl_bcount == 0); + KASSERT(wl->wl_bufcount == 0); + KASSERT(LIST_EMPTY(&wl->wl_bufs)); + KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); + KASSERT(wl->wl_inohashcnt == 0); + + rw_exit(&wl->wl_rwlock); +} + +/* + * Shut down the log and release everything hanging off wl. + * + * The log is flushed with waitfor set. If the flush fails, or if + * unlinked inodes are still being tracked afterwards, the outcome + * depends on force: non-zero discards the remaining log state and + * carries on, zero returns the flush error (or EBUSY for the inode + * case) and leaves wl intact. Returns 0 once wl has been freed. + */ +int +wapbl_stop(struct wapbl *wl, int force) +{ + struct vnode *vp; + int error; + + WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); + error = wapbl_flush(wl, 1); + if (error) { + if (force) + wapbl_discard(wl); + else + return error; + } + + /* Unlinked inodes persist after a flush */ + if (wl->wl_inohashcnt) { + if (force) { + wapbl_discard(wl); + } else { + return EBUSY; + } + } + + KASSERT(wl->wl_bufbytes == 0); + KASSERT(wl->wl_bcount == 0); + KASSERT(wl->wl_bufcount == 0); + KASSERT(LIST_EMPTY(&wl->wl_bufs)); + KASSERT(wl->wl_dealloccnt == 0); + KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); + KASSERT(wl->wl_inohashcnt == 0); + + vp = wl->wl_logvp; + + wapbl_free(wl->wl_wc_scratch); + wapbl_free(wl->wl_wc_header); +#if WAPBL_UVM_ALLOC + /* + * The size freed must equal the size wapbl_start() allocated: + * sizeof(element) * count. sizeof(a * b) would only yield the + * size of a single integer, i.e. one page after rounding. + */ + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, + round_page(sizeof(*wl->wl_deallocblks) * + wl->wl_dealloclim)); + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, + round_page(sizeof(*wl->wl_dealloclens) * + wl->wl_dealloclim)); +#else + wapbl_free(wl->wl_deallocblks); + wapbl_free(wl->wl_dealloclens); +#endif + wapbl_inodetrk_free(wl); + + cv_destroy(&wl->wl_reclaimable_cv); + mutex_destroy(&wl->wl_mtx); + rw_destroy(&wl->wl_rwlock); + wapbl_free(wl); + + return 0; +} + +static int +wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) +{ + struct pstats *pstats = curlwp->l_proc->p_stats; + struct buf *bp; + int error; + + KASSERT((flags & ~(B_WRITE | B_READ)) == 0); + KASSERT(devvp->v_type == VBLK); + + if ((flags & (B_WRITE | B_READ)) == B_WRITE) { + mutex_enter(&devvp->v_interlock); + devvp->v_numoutput++; + mutex_exit(&devvp->v_interlock); + pstats->p_ru.ru_oublock++; + } else { + pstats->p_ru.ru_inblock++; + } + + bp =
getiobuf(devvp, true); + bp->b_flags = flags; + bp->b_cflags = BC_BUSY; /* silly & dubious */ + bp->b_dev = devvp->v_rdev; + bp->b_data = data; + bp->b_bufsize = bp->b_resid = bp->b_bcount = len; + bp->b_blkno = pbn; + + WAPBL_PRINTF(WAPBL_PRINT_IO, + ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n", + BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount, + bp->b_blkno, bp->b_dev)); + + VOP_STRATEGY(devvp, bp); + + error = biowait(bp); + putiobuf(bp); + + if (error) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_doio: %s %zu bytes at block %" PRId64 + " on dev 0x%x failed with error %d\n", + (((flags & (B_WRITE | B_READ)) == B_WRITE) ? + "write" : "read"), + len, pbn, devvp->v_rdev, error)); + } + + return error; +} + +int +wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + return wapbl_doio(data, len, devvp, pbn, B_WRITE); +} + +int +wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + return wapbl_doio(data, len, devvp, pbn, B_READ); +} + +/* + * Off is byte offset returns new offset for next write + * handles log wraparound + */ +static int +wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) +{ + size_t slen; + off_t off = *offp; + int error; + + KDASSERT(((len >> wl->wl_log_dev_bshift) << + wl->wl_log_dev_bshift) == len); + + if (off < wl->wl_circ_off) + off = wl->wl_circ_off; + slen = wl->wl_circ_off + wl->wl_circ_size - off; + if (slen < len) { + error = wapbl_write(data, slen, wl->wl_devvp, + wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); + if (error) + return error; + data = (uint8_t *)data + slen; + len -= slen; + off = wl->wl_circ_off; + } + error = wapbl_write(data, len, wl->wl_devvp, + wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); + if (error) + return error; + off += len; + if (off >= wl->wl_circ_off + wl->wl_circ_size) + off = wl->wl_circ_off; + *offp = off; + return 0; +} + +/****************************************************************/ + +int +wapbl_begin(struct 
wapbl *wl, const char *file, int line) +{ + int doflush; + unsigned lockcount; + krw_t op; + + KDASSERT(wl); + +/* + * XXX: The original code calls for the use of a RW_READER lock + * here, but it turns out there are performance issues with high + * metadata-rate workloads (e.g. multiple simultaneous tar + * extractions). For now, we force the lock to be RW_WRITER, + * since that currently has the best performance characteristics + * (even for a single tar-file extraction). + * + */ +#define WAPBL_DEBUG_SERIALIZE 1 + +#ifdef WAPBL_DEBUG_SERIALIZE + op = RW_WRITER; +#else + op = RW_READER; +#endif + + /* + * XXX this needs to be made much more sophisticated. + * perhaps each wapbl_begin could reserve a specified + * number of buffers and bytes. + */ + mutex_enter(&wl->wl_mtx); + lockcount = wl->wl_lock_count; + doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > + wl->wl_bufbytes_max / 2) || + ((wl->wl_bufcount + (lockcount * 10)) > + wl->wl_bufcount_max / 2) || + (wapbl_transaction_len(wl) > wl->wl_circ_size / 2); + mutex_exit(&wl->wl_mtx); + + if (doflush) { + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("force flush lockcnt=%d bufbytes=%zu " + "(max=%zu) bufcount=%zu (max=%zu)\n", + lockcount, wl->wl_bufbytes, + wl->wl_bufbytes_max, wl->wl_bufcount, + wl->wl_bufcount_max)); + } + + if (doflush) { + int error = wapbl_flush(wl, 0); + if (error) + return error; + } + + rw_enter(&wl->wl_rwlock, op); + mutex_enter(&wl->wl_mtx); + wl->wl_lock_count++; + mutex_exit(&wl->wl_mtx); + +#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) + WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, + ("wapbl_begin thread %d.%d with bufcount=%zu " + "bufbytes=%zu bcount=%zu at %s:%d\n", + curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, file, line)); +#endif + + return 0; +} + +void +wapbl_end(struct wapbl *wl) +{ + +#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) + WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, + ("wapbl_end thread %d.%d with 
bufcount=%zu " + "bufbytes=%zu bcount=%zu\n", + curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount)); +#endif + + mutex_enter(&wl->wl_mtx); + KASSERT(wl->wl_lock_count > 0); + wl->wl_lock_count--; + mutex_exit(&wl->wl_mtx); + + rw_exit(&wl->wl_rwlock); +} + +void +wapbl_add_buf(struct wapbl *wl, struct buf * bp) +{ + + KASSERT(bp->b_cflags & BC_BUSY); + KASSERT(bp->b_vp); + + wapbl_jlock_assert(wl); + +#if 0 + /* + * XXX this might be an issue for swapfiles. + * see uvm_swap.c:1702 + * + * XXX2 why require it then? leap of semantics? + */ + KASSERT((bp->b_cflags & BC_NOCACHE) == 0); +#endif + + mutex_enter(&wl->wl_mtx); + if (bp->b_flags & B_LOCKED) { + LIST_REMOVE(bp, b_wapbllist); + WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, + ("wapbl_add_buf thread %d.%d re-adding buf %p " + "with %d bytes %d bcount\n", + curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, + bp->b_bcount)); + } else { + /* unlocked by dirty buffers shouldn't exist */ + KASSERT(!(bp->b_oflags & BO_DELWRI)); + wl->wl_bufbytes += bp->b_bufsize; + wl->wl_bcount += bp->b_bcount; + wl->wl_bufcount++; + WAPBL_PRINTF(WAPBL_PRINT_BUFFER, + ("wapbl_add_buf thread %d.%d adding buf %p " + "with %d bytes %d bcount\n", + curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, + bp->b_bcount)); + } + LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); + mutex_exit(&wl->wl_mtx); + + bp->b_flags |= B_LOCKED; +} + +static void +wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) +{ + + KASSERT(mutex_owned(&wl->wl_mtx)); + KASSERT(bp->b_cflags & BC_BUSY); + wapbl_jlock_assert(wl); + +#if 0 + /* + * XXX this might be an issue for swapfiles. 
+ * see uvm_swap.c:1725 + * + * XXXdeux: see above + */ + KASSERT((bp->b_flags & BC_NOCACHE) == 0); +#endif + KASSERT(bp->b_flags & B_LOCKED); + + WAPBL_PRINTF(WAPBL_PRINT_BUFFER, + ("wapbl_remove_buf thread %d.%d removing buf %p with " + "%d bytes %d bcount\n", + curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount)); + + KASSERT(wl->wl_bufbytes >= bp->b_bufsize); + wl->wl_bufbytes -= bp->b_bufsize; + KASSERT(wl->wl_bcount >= bp->b_bcount); + wl->wl_bcount -= bp->b_bcount; + KASSERT(wl->wl_bufcount > 0); + wl->wl_bufcount--; + KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); + KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); + LIST_REMOVE(bp, b_wapbllist); + + bp->b_flags &= ~B_LOCKED; +} + +/* called from brelsel() in vfs_bio among other places */ +void +wapbl_remove_buf(struct wapbl * wl, struct buf *bp) +{ + + mutex_enter(&wl->wl_mtx); + wapbl_remove_buf_locked(wl, bp); + mutex_exit(&wl->wl_mtx); +} + +void +wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt) +{ + + KASSERT(bp->b_cflags & BC_BUSY); + + /* + * XXX: why does this depend on B_LOCKED? otherwise the buf + * is not for a transaction? if so, why is this called in the + * first place? + */ + if (bp->b_flags & B_LOCKED) { + mutex_enter(&wl->wl_mtx); + wl->wl_bufbytes += bp->b_bufsize - oldsz; + wl->wl_bcount += bp->b_bcount - oldcnt; + mutex_exit(&wl->wl_mtx); + } +} + +#endif /* _KERNEL */ + +/****************************************************************/ +/* Some utility inlines */ + +/* This is used to advance the pointer at old to new value at old+delta */ +static __inline off_t +wapbl_advance(size_t size, size_t off, off_t old, size_t delta) +{ + off_t new; + + /* Define acceptable ranges for inputs. 
*/ + KASSERT(delta <= size); + KASSERT((old == 0) || (old >= off)); + KASSERT(old < (size + off)); + + if ((old == 0) && (delta != 0)) + new = off + delta; + else if ((old + delta) < (size + off)) + new = old + delta; + else + new = (old + delta) - size; + + /* Note some interesting axioms */ + KASSERT((delta != 0) || (new == old)); + KASSERT((delta == 0) || (new != 0)); + KASSERT((delta != (size)) || (new == old)); + + /* Define acceptable ranges for output. */ + KASSERT((new == 0) || (new >= off)); + KASSERT(new < (size + off)); + return new; +} + +static __inline size_t +wapbl_space_used(size_t avail, off_t head, off_t tail) +{ + + if (tail == 0) { + KASSERT(head == 0); + return 0; + } + return ((head + (avail - 1) - tail) % avail) + 1; +} + +static __inline size_t +wapbl_space_free(size_t avail, off_t head, off_t tail) +{ + + return avail - wapbl_space_used(avail, head, tail); +} + +static __inline void +wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, + off_t *tailp) +{ + off_t head = *headp; + off_t tail = *tailp; + + KASSERT(delta <= wapbl_space_free(size, head, tail)); + head = wapbl_advance(size, off, head, delta); + if ((tail == 0) && (head != 0)) + tail = off; + *headp = head; + *tailp = tail; +} + +static __inline void +wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, + off_t *tailp) +{ + off_t head = *headp; + off_t tail = *tailp; + + KASSERT(delta <= wapbl_space_used(size, head, tail)); + tail = wapbl_advance(size, off, tail, delta); + if (head == tail) { + head = tail = 0; + } + *headp = head; + *tailp = tail; +} + +#ifdef _KERNEL + +/****************************************************************/ + +/* + * Remove transactions whose buffers are completely flushed to disk. + * Will block until at least minfree space is available. + * only intended to be called from inside wapbl_flush and therefore + * does not protect against commit races with itself or with flush. 
+ */ +static int +wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) +{ + size_t delta; + size_t avail; + off_t head; + off_t tail; + int error = 0; + + KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); + KASSERT(rw_write_held(&wl->wl_rwlock)); + + mutex_enter(&wl->wl_mtx); + + /* + * First check to see if we have to do a commit + * at all. + */ + avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail); + if (minfree < avail) { + mutex_exit(&wl->wl_mtx); + return 0; + } + minfree -= avail; + while ((wl->wl_error_count == 0) && + (wl->wl_reclaimable_bytes < minfree)) { + WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, + ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd " + "minfree=%zd\n", + &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes, + minfree)); + + cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx); + } + if (wl->wl_reclaimable_bytes < minfree) { + KASSERT(wl->wl_error_count); + /* XXX maybe get actual error from buffer instead someday? */ + error = EIO; + } + head = wl->wl_head; + tail = wl->wl_tail; + delta = wl->wl_reclaimable_bytes; + + /* If all of of the entries are flushed, then be sure to keep + * the reserved bytes reserved. Watch out for discarded transactions, + * which could leave more bytes reserved than are reclaimable. + */ + if (SIMPLEQ_EMPTY(&wl->wl_entries) && + (delta >= wl->wl_reserved_bytes)) { + delta -= wl->wl_reserved_bytes; + } + wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, + &tail); + KDASSERT(wl->wl_reserved_bytes <= + wapbl_space_used(wl->wl_circ_size, head, tail)); + mutex_exit(&wl->wl_mtx); + + if (error) + return error; + + if (waitonly) + return 0; + + /* + * This is where head, tail and delta are unprotected + * from races against itself or flush. This is ok since + * we only call this routine from inside flush itself. + * + * XXX: how can it race against itself when accessed only + * from behind the write-locked rwlock? 
+ */ + error = wapbl_write_commit(wl, head, tail); + if (error) + return error; + + wl->wl_head = head; + wl->wl_tail = tail; + + mutex_enter(&wl->wl_mtx); + KASSERT(wl->wl_reclaimable_bytes >= delta); + wl->wl_reclaimable_bytes -= delta; + mutex_exit(&wl->wl_mtx); + WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE, + ("wapbl_truncate thread %d.%d truncating %zu bytes\n", + curproc->p_pid, curlwp->l_lid, delta)); + + return 0; +} + +/****************************************************************/ + +void +wapbl_biodone(struct buf *bp) +{ + struct wapbl_entry *we = bp->b_private; + struct wapbl *wl = we->we_wapbl; + + /* + * Handle possible flushing of buffers after log has been + * decomissioned. + */ + if (!wl) { + KASSERT(we->we_bufcount > 0); + we->we_bufcount--; +#ifdef WAPBL_DEBUG_BUFBYTES + KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); + we->we_unsynced_bufbytes -= bp->b_bufsize; +#endif + + if (we->we_bufcount == 0) { +#ifdef WAPBL_DEBUG_BUFBYTES + KASSERT(we->we_unsynced_bufbytes == 0); +#endif + wapbl_free(we); + } + + brelse(bp, 0); + return; + } + +#ifdef ohbother + KDASSERT(bp->b_flags & B_DONE); + KDASSERT(!(bp->b_flags & B_DELWRI)); + KDASSERT(bp->b_flags & B_ASYNC); + KDASSERT(bp->b_flags & B_BUSY); + KDASSERT(!(bp->b_flags & B_LOCKED)); + KDASSERT(!(bp->b_flags & B_READ)); + KDASSERT(!(bp->b_flags & B_INVAL)); + KDASSERT(!(bp->b_flags & B_NOCACHE)); +#endif + + if (bp->b_error) { +#ifdef notyet /* Can't currently handle possible dirty buffer reuse */ + XXXpooka: interfaces not fully updated + Note: this was not enabled in the original patch + against netbsd4 either. I don't know if comment + above is true or not. + + /* + * If an error occurs, report the error and leave the + * buffer as a delayed write on the LRU queue. + * restarting the write would likely result in + * an error spinloop, so let it be done harmlessly + * by the syncer. 
+ */ + bp->b_flags &= ~(B_DONE); + simple_unlock(&bp->b_interlock); + + if (we->we_error == 0) { + mutex_enter(&wl->wl_mtx); + wl->wl_error_count++; + mutex_exit(&wl->wl_mtx); + cv_broadcast(&wl->wl_reclaimable_cv); + } + we->we_error = bp->b_error; + bp->b_error = 0; + brelse(bp); + return; +#else + /* For now, just mark the log permanently errored out */ + + mutex_enter(&wl->wl_mtx); + if (wl->wl_error_count == 0) { + wl->wl_error_count++; + cv_broadcast(&wl->wl_reclaimable_cv); + } + mutex_exit(&wl->wl_mtx); +#endif + } + + mutex_enter(&wl->wl_mtx); + + KASSERT(we->we_bufcount > 0); + we->we_bufcount--; +#ifdef WAPBL_DEBUG_BUFBYTES + KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); + we->we_unsynced_bufbytes -= bp->b_bufsize; + KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize); + wl->wl_unsynced_bufbytes -= bp->b_bufsize; +#endif + + /* + * If the current transaction can be reclaimed, start + * at the beginning and reclaim any consecutive reclaimable + * transactions. If we successfully reclaim anything, + * then wakeup anyone waiting for the reclaim. 
+ */ + if (we->we_bufcount == 0) { + size_t delta = 0; + int errcnt = 0; +#ifdef WAPBL_DEBUG_BUFBYTES + KDASSERT(we->we_unsynced_bufbytes == 0); +#endif + /* + * clear any posted error, since the buffer it came from + * has successfully flushed by now + */ + while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && + (we->we_bufcount == 0)) { + delta += we->we_reclaimable_bytes; + if (we->we_error) + errcnt++; + SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); + wapbl_free(we); + } + + if (delta) { + wl->wl_reclaimable_bytes += delta; + KASSERT(wl->wl_error_count >= errcnt); + wl->wl_error_count -= errcnt; + cv_broadcast(&wl->wl_reclaimable_cv); + } + } + + mutex_exit(&wl->wl_mtx); + brelse(bp, 0); +} + +/* + * Write transactions to disk + start I/O for contents + */ +int +wapbl_flush(struct wapbl *wl, int waitfor) +{ + struct buf *bp; + struct wapbl_entry *we; + off_t off; + off_t head; + off_t tail; + size_t delta = 0; + size_t flushsize; + size_t reserved; + int error = 0; + + /* + * Do a quick check to see if a full flush can be skipped + * This assumes that the flush callback does not need to be called + * unless there are other outstanding bufs. + */ + if (!waitfor) { + size_t nbufs; + mutex_enter(&wl->wl_mtx); /* XXX need mutex here to + protect the KASSERTS */ + nbufs = wl->wl_bufcount; + KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); + KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); + mutex_exit(&wl->wl_mtx); + if (nbufs == 0) + return 0; + } + + /* + * XXX we may consider using LK_UPGRADE here + * if we want to call flush from inside a transaction + */ + rw_enter(&wl->wl_rwlock, RW_WRITER); + wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, + wl->wl_dealloccnt); + + /* + * Now that we are fully locked and flushed, + * do another check for nothing to do. 
+ */ + if (wl->wl_bufcount == 0) { + goto out; + } + +#if 0 + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush thread %d.%d flushing entries with " + "bufcount=%zu bufbytes=%zu\n", + curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, + wl->wl_bufbytes)); +#endif + + /* Calculate amount of space needed to flush */ + flushsize = wapbl_transaction_len(wl); + + if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { + /* + * XXX this could be handled more gracefully, perhaps place + * only a partial transaction in the log and allow the + * remaining to flush without the protection of the journal. + */ + panic("wapbl_flush: current transaction too big to flush\n"); + } + + error = wapbl_truncate(wl, flushsize, 0); + if (error) + goto out2; + + off = wl->wl_head; + KASSERT((off == 0) || ((off >= wl->wl_circ_off) && + (off < wl->wl_circ_off + wl->wl_circ_size))); + error = wapbl_write_blocks(wl, &off); + if (error) + goto out2; + error = wapbl_write_revocations(wl, &off); + if (error) + goto out2; + error = wapbl_write_inodes(wl, &off); + if (error) + goto out2; + + reserved = 0; + if (wl->wl_inohashcnt) + reserved = wapbl_transaction_inodes_len(wl); + + head = wl->wl_head; + tail = wl->wl_tail; + + wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, + &head, &tail); +#ifdef WAPBL_DEBUG + if (head != off) { + panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX + " off=%"PRIdMAX" flush=%zu\n", + (intmax_t)head, (intmax_t)tail, (intmax_t)off, + flushsize); + } +#else + KASSERT(head == off); +#endif + + /* Opportunistically move the tail forward if we can */ + if (!wapbl_lazy_truncate) { + mutex_enter(&wl->wl_mtx); + delta = wl->wl_reclaimable_bytes; + mutex_exit(&wl->wl_mtx); + wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, + &head, &tail); + } + + error = wapbl_write_commit(wl, head, tail); + if (error) + goto out2; + + /* poolme? or kmemme? 
*/ + we = wapbl_calloc(1, sizeof(*we)); + +#ifdef WAPBL_DEBUG_BUFBYTES + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" + " unsynced=%zu" + "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " + "inodes=%d\n", + curproc->p_pid, curlwp->l_lid, flushsize, delta, + wapbl_space_used(wl->wl_circ_size, head, tail), + wl->wl_unsynced_bufbytes, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, + wl->wl_inohashcnt)); +#else + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" + "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " + "inodes=%d\n", + curproc->p_pid, curlwp->l_lid, flushsize, delta, + wapbl_space_used(wl->wl_circ_size, head, tail), + wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, + wl->wl_dealloccnt, wl->wl_inohashcnt)); +#endif + + + mutex_enter(&bufcache_lock); + mutex_enter(&wl->wl_mtx); + + wl->wl_reserved_bytes = reserved; + wl->wl_head = head; + wl->wl_tail = tail; + KASSERT(wl->wl_reclaimable_bytes >= delta); + wl->wl_reclaimable_bytes -= delta; + wl->wl_dealloccnt = 0; +#ifdef WAPBL_DEBUG_BUFBYTES + wl->wl_unsynced_bufbytes += wl->wl_bufbytes; +#endif + + we->we_wapbl = wl; + we->we_bufcount = wl->wl_bufcount; +#ifdef WAPBL_DEBUG_BUFBYTES + we->we_unsynced_bufbytes = wl->wl_bufbytes; +#endif + we->we_reclaimable_bytes = flushsize; + we->we_error = 0; + SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); + + /* + * this flushes bufs in reverse order than they were queued + * it shouldn't matter, but if we care we could use TAILQ instead. + * XXX Note they will get put on the lru queue when they flush + * so we might actually want to change this to preserve order. 
+ */ + while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { + if (bbusy(bp, 0, 0, &wl->wl_mtx)) { + continue; + } + bp->b_iodone = wapbl_biodone; + bp->b_private = we; + bremfree(bp); + wapbl_remove_buf_locked(wl, bp); + mutex_exit(&wl->wl_mtx); + mutex_exit(&bufcache_lock); + bawrite(bp); + mutex_enter(&bufcache_lock); + mutex_enter(&wl->wl_mtx); + } + mutex_exit(&wl->wl_mtx); + mutex_exit(&bufcache_lock); + +#if 0 + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush thread %d.%d done flushing entries...\n", + curproc->p_pid, curlwp->l_lid)); +#endif + + out: + + /* + * If the waitfor flag is set, don't return until everything is + * fully flushed and the on disk log is empty. + */ + if (waitfor) { + error = wapbl_truncate(wl, wl->wl_circ_size - + wl->wl_reserved_bytes, wapbl_lazy_truncate); + } + + out2: + if (error) { + wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, + wl->wl_dealloclens, wl->wl_dealloccnt); + } + +#ifdef WAPBL_DEBUG_PRINT + if (error) { + pid_t pid = -1; + lwpid_t lid = -1; + if (curproc) + pid = curproc->p_pid; + if (curlwp) + lid = curlwp->l_lid; + mutex_enter(&wl->wl_mtx); +#ifdef WAPBL_DEBUG_BUFBYTES + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_flush: thread %d.%d aborted flush: " + "error = %d\n" + "\tbufcount=%zu bufbytes=%zu bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %d, reclaimable=%zu reserved=%zu " + "unsynced=%zu\n", + pid, lid, error, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, + wl->wl_dealloccnt, wl->wl_inohashcnt, + wl->wl_error_count, wl->wl_reclaimable_bytes, + wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d, unsynced = %zu\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error, we->we_unsynced_bufbytes)); + } +#else + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_flush: thread %d.%d aborted flush: " + "error = %d\n" + "\tbufcount=%zu bufbytes=%zu 
bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", + pid, lid, error, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, + wl->wl_dealloccnt, wl->wl_inohashcnt, + wl->wl_error_count, wl->wl_reclaimable_bytes, + wl->wl_reserved_bytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d\n", we->we_bufcount, + we->we_reclaimable_bytes, we->we_error)); + } +#endif + mutex_exit(&wl->wl_mtx); + } +#endif + + rw_exit(&wl->wl_rwlock); + return error; +} + +/****************************************************************/ + +void +wapbl_jlock_assert(struct wapbl *wl) +{ + +#ifdef WAPBL_DEBUG_SERIALIZE + KASSERT(rw_write_held(&wl->wl_rwlock)); +#else + KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock)); +#endif +} + +void +wapbl_junlock_assert(struct wapbl *wl) +{ + +#ifdef WAPBL_DEBUG_SERIALIZE + KASSERT(!rw_write_held(&wl->wl_rwlock)); +#endif +} + +/****************************************************************/ + +/* locks missing */ +void +wapbl_print(struct wapbl *wl, + int full, + void (*pr)(const char *, ...)) +{ + struct buf *bp; + struct wapbl_entry *we; + (*pr)("wapbl %p", wl); + (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", + wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); + (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", + wl->wl_circ_size, wl->wl_circ_off, + (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); + (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", + wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); +#ifdef WAPBL_DEBUG_BUFBYTES + (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " + "reserved = %zu errcnt = %d unsynced = %zu\n", + wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, + wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, + wl->wl_error_count, wl->wl_unsynced_bufbytes); +#else + (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu 
reclaimable = %zu " + "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, + wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, + wl->wl_error_count); +#endif + (*pr)("\tdealloccnt = %d, dealloclim = %d\n", + wl->wl_dealloccnt, wl->wl_dealloclim); + (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", + wl->wl_inohashcnt, wl->wl_inohashmask); + (*pr)("entries:\n"); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { +#ifdef WAPBL_DEBUG_BUFBYTES + (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " + "unsynced = %zu\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error, we->we_unsynced_bufbytes); +#else + (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", + we->we_bufcount, we->we_reclaimable_bytes, we->we_error); +#endif + } + if (full) { + int cnt = 0; + (*pr)("bufs ="); + LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { + if (!LIST_NEXT(bp, b_wapbllist)) { + (*pr)(" %p", bp); + } else if ((++cnt % 6) == 0) { + (*pr)(" %p,\n\t", bp); + } else { + (*pr)(" %p,", bp); + } + } + (*pr)("\n"); + + (*pr)("dealloced blks = "); + { + int i; + cnt = 0; + for (i = 0; i < wl->wl_dealloccnt; i++) { + (*pr)(" %"PRId64":%d,", + wl->wl_deallocblks[i], + wl->wl_dealloclens[i]); + if ((++cnt % 4) == 0) { + (*pr)("\n\t"); + } + } + } + (*pr)("\n"); + + (*pr)("registered inodes = "); + { + int i; + cnt = 0; + for (i = 0; i <= wl->wl_inohashmask; i++) { + struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + + wih = &wl->wl_inohash[i]; + LIST_FOREACH(wi, wih, wi_hash) { + if (wi->wi_ino == 0) + continue; + (*pr)(" %"PRId32"/0%06"PRIo32",", + wi->wi_ino, wi->wi_mode); + if ((++cnt % 4) == 0) { + (*pr)("\n\t"); + } + } + } + (*pr)("\n"); + } + } +} + +#if defined(WAPBL_DEBUG) || defined(DDB) +void +wapbl_dump(struct wapbl *wl) +{ +#if defined(WAPBL_DEBUG) + if (!wl) + wl = wapbl_debug_wl; +#endif + if (!wl) + return; + wapbl_print(wl, 1, printf); +} +#endif + +/****************************************************************/ + +void 
+wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) +{ + + wapbl_jlock_assert(wl); + + /* XXX should eventually instead tie this into resource estimation */ + /* XXX this KASSERT needs locking/mutex analysis */ + KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim); + wl->wl_deallocblks[wl->wl_dealloccnt] = blk; + wl->wl_dealloclens[wl->wl_dealloccnt] = len; + wl->wl_dealloccnt++; + WAPBL_PRINTF(WAPBL_PRINT_ALLOC, + ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); +} + +/****************************************************************/ + +static void +wapbl_inodetrk_init(struct wapbl *wl, u_int size) +{ + + wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); + if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { + pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, + "wapblinopl", &pool_allocator_nointr, IPL_NONE); + } +} + +static void +wapbl_inodetrk_free(struct wapbl *wl) +{ + + /* XXX this KASSERT needs locking/mutex analysis */ + KASSERT(wl->wl_inohashcnt == 0); + hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); + if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { + pool_destroy(&wapbl_ino_pool); + } +} + +static struct wapbl_ino * +wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) +{ + struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + + KASSERT(mutex_owned(&wl->wl_mtx)); + + wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; + LIST_FOREACH(wi, wih, wi_hash) { + if (ino == wi->wi_ino) + return wi; + } + return 0; +} + +void +wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) +{ + struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + + wi = pool_get(&wapbl_ino_pool, PR_WAITOK); + + mutex_enter(&wl->wl_mtx); + if (wapbl_inodetrk_get(wl, ino) == NULL) { + wi->wi_ino = ino; + wi->wi_mode = mode; + wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; + LIST_INSERT_HEAD(wih, wi, wi_hash); + wl->wl_inohashcnt++; + WAPBL_PRINTF(WAPBL_PRINT_INODE, + ("wapbl_register_inode: 
ino=%"PRId64"\n", ino)); + mutex_exit(&wl->wl_mtx); + } else { + mutex_exit(&wl->wl_mtx); + pool_put(&wapbl_ino_pool, wi); + } +} + +void +wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) +{ + struct wapbl_ino *wi; + + mutex_enter(&wl->wl_mtx); + wi = wapbl_inodetrk_get(wl, ino); + if (wi) { + WAPBL_PRINTF(WAPBL_PRINT_INODE, + ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); + KASSERT(wl->wl_inohashcnt > 0); + wl->wl_inohashcnt--; + LIST_REMOVE(wi, wi_hash); + mutex_exit(&wl->wl_mtx); + + pool_put(&wapbl_ino_pool, wi); + } else { + mutex_exit(&wl->wl_mtx); + } +} + +/****************************************************************/ + +static __inline size_t +wapbl_transaction_inodes_len(struct wapbl *wl) +{ + int blocklen = 1<wl_log_dev_bshift; + int iph; + + /* Calculate number of inodes described in a inodelist header */ + iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / + sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); + + KASSERT(iph > 0); + + return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen; +} + + +/* Calculate amount of space a transaction will take on disk */ +static size_t +wapbl_transaction_len(struct wapbl *wl) +{ + int blocklen = 1<wl_log_dev_bshift; + size_t len; + int bph; + + /* Calculate number of blocks described in a blocklist header */ + bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / + sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); + + KASSERT(bph > 0); + + len = wl->wl_bcount; + len += howmany(wl->wl_bufcount, bph)*blocklen; + len += howmany(wl->wl_dealloccnt, bph)*blocklen; + len += wapbl_transaction_inodes_len(wl); + + return len; +} + +/* + * Perform commit operation + * + * Note that generation number incrementation needs to + * be protected against racing with other invocations + * of wapbl_commit. 
This is ok since this routine + * is only invoked from wapbl_flush + */ +static int +wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) +{ + struct wapbl_wc_header *wc = wl->wl_wc_header; + struct timespec ts; + int error; + int force = 1; + + /* XXX Calc checksum here, instead we do this for now */ + error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); + if (error) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " + "returned %d\n", wl->wl_devvp->v_rdev, error)); + } + + wc->wc_head = head; + wc->wc_tail = tail; + wc->wc_checksum = 0; + wc->wc_version = 1; + getnanotime(&ts); + wc->wc_time = ts.tv_sec;; + wc->wc_timensec = ts.tv_nsec; + + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", + (intmax_t)head, (intmax_t)tail)); + + /* + * XXX if generation will rollover, then first zero + * over second commit header before trying to write both headers. + */ + + error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, + wl->wl_logpbn + wc->wc_generation % 2); + if (error) + return error; + + error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); + if (error) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " + "returned %d\n", wl->wl_devvp->v_rdev, error)); + } + + /* + * If the generation number was zero, write it out a second time. + * This handles initialization and generation number rollover + */ + if (wc->wc_generation++ == 0) { + error = wapbl_write_commit(wl, head, tail); + /* + * This panic should be able to be removed if we do the + * zero'ing mentioned above, and we are certain to roll + * back generation number on failure. 
+ */ + if (error) + panic("wapbl_write_commit: error writing duplicate " + "log header: %d\n", error); + } + return 0; +} + +/* Returns new offset value */ +static int +wapbl_write_blocks(struct wapbl *wl, off_t *offp) +{ + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; + int blocklen = 1<wl_log_dev_bshift; + int bph; + struct buf *bp; + off_t off = *offp; + int error; + + KASSERT(rw_write_held(&wl->wl_rwlock)); + + bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / + sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); + + bp = LIST_FIRST(&wl->wl_bufs); + + while (bp) { + int cnt; + struct buf *obp = bp; + + KASSERT(bp->b_flags & B_LOCKED); + + wc->wc_type = WAPBL_WC_BLOCKS; + wc->wc_len = blocklen; + wc->wc_blkcount = 0; + while (bp && (wc->wc_blkcount < bph)) { + /* + * Make sure all the physical block numbers are up to + * date. If this is not always true on a given + * filesystem, then VOP_BMAP must be called. We + * could call VOP_BMAP here, or else in the filesystem + * specific flush callback, although neither of those + * solutions allow us to take the vnode lock. If a + * filesystem requires that we must take the vnode lock + * to call VOP_BMAP, then we can probably do it in + * bwrite when the vnode lock should already be held + * by the invoking code. 
+ */ + KASSERT((bp->b_vp->v_type == VBLK) || + (bp->b_blkno != bp->b_lblkno)); + KASSERT(bp->b_blkno > 0); + + wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; + wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; + wc->wc_len += bp->b_bcount; + wc->wc_blkcount++; + bp = LIST_NEXT(bp, b_wapbllist); + } + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_blocks: len = %u off = %"PRIdMAX"\n", + wc->wc_len, (intmax_t)off)); + + error = wapbl_circ_write(wl, wc, blocklen, &off); + if (error) + return error; + bp = obp; + cnt = 0; + while (bp && (cnt++ < bph)) { + error = wapbl_circ_write(wl, bp->b_data, + bp->b_bcount, &off); + if (error) + return error; + bp = LIST_NEXT(bp, b_wapbllist); + } + } + *offp = off; + return 0; +} + +static int +wapbl_write_revocations(struct wapbl *wl, off_t *offp) +{ + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; + int i; + int blocklen = 1<wl_log_dev_bshift; + int bph; + off_t off = *offp; + int error; + + if (wl->wl_dealloccnt == 0) + return 0; + + bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / + sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); + + i = 0; + while (i < wl->wl_dealloccnt) { + wc->wc_type = WAPBL_WC_REVOCATIONS; + wc->wc_len = blocklen; + wc->wc_blkcount = 0; + while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { + wc->wc_blocks[wc->wc_blkcount].wc_daddr = + wl->wl_deallocblks[i]; + wc->wc_blocks[wc->wc_blkcount].wc_dlen = + wl->wl_dealloclens[i]; + wc->wc_blkcount++; + i++; + } + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", + wc->wc_len, (intmax_t)off)); + error = wapbl_circ_write(wl, wc, blocklen, &off); + if (error) + return error; + } + *offp = off; + return 0; +} + +static int +wapbl_write_inodes(struct wapbl *wl, off_t *offp) +{ + struct wapbl_wc_inodelist *wc = + (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; + int i; + int blocklen = 1<wl_log_dev_bshift; + off_t off = *offp; + int error; + + 
struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + int iph; + + iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / + sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); + + i = 0; + wih = &wl->wl_inohash[0]; + wi = 0; + do { + wc->wc_type = WAPBL_WC_INODES; + wc->wc_len = blocklen; + wc->wc_inocnt = 0; + wc->wc_clear = (i == 0); + while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { + while (!wi) { + KASSERT((wih - &wl->wl_inohash[0]) + <= wl->wl_inohashmask); + wi = LIST_FIRST(wih++); + } + wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; + wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; + wc->wc_inocnt++; + i++; + wi = LIST_NEXT(wi, wi_hash); + } + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", + wc->wc_len, (intmax_t)off)); + error = wapbl_circ_write(wl, wc, blocklen, &off); + if (error) + return error; + } while (i < wl->wl_inohashcnt); + + *offp = off; + return 0; +} + +#endif /* _KERNEL */ + +/****************************************************************/ + +#ifdef _KERNEL +static struct pool wapbl_blk_pool; +static int wapbl_blk_pool_refcount; +#endif +struct wapbl_blk { + LIST_ENTRY(wapbl_blk) wb_hash; + daddr_t wb_blk; + off_t wb_off; /* Offset of this block in the log */ +}; +#define WAPBL_BLKPOOL_MIN 83 + +static void +wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) +{ + if (size < WAPBL_BLKPOOL_MIN) + size = WAPBL_BLKPOOL_MIN; + KASSERT(wr->wr_blkhash == 0); +#ifdef _KERNEL + wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); + if (atomic_inc_uint_nv(&wapbl_blk_pool_refcount) == 1) { + pool_init(&wapbl_blk_pool, sizeof(struct wapbl_blk), 0, 0, 0, + "wapblblkpl", &pool_allocator_nointr, IPL_NONE); + } +#else /* ! 
_KERNEL */ + /* Manually implement hashinit */ + { + int i; + unsigned long hashsize; + for (hashsize = 1; hashsize < size; hashsize <<= 1) + continue; + wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash)); + for (i = 0; i < wr->wr_blkhashmask; i++) + LIST_INIT(&wr->wr_blkhash[i]); + wr->wr_blkhashmask = hashsize - 1; + } +#endif /* ! _KERNEL */ +} + +static void +wapbl_blkhash_free(struct wapbl_replay *wr) +{ + KASSERT(wr->wr_blkhashcnt == 0); +#ifdef _KERNEL + hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); + if (atomic_dec_uint_nv(&wapbl_blk_pool_refcount) == 0) { + pool_destroy(&wapbl_blk_pool); + } +#else /* ! _KERNEL */ + wapbl_free(wr->wr_blkhash); +#endif /* ! _KERNEL */ +} + +static struct wapbl_blk * +wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) +{ + struct wapbl_blk_head *wbh; + struct wapbl_blk *wb; + wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; + LIST_FOREACH(wb, wbh, wb_hash) { + if (blk == wb->wb_blk) + return wb; + } + return 0; +} + +static void +wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) +{ + struct wapbl_blk_head *wbh; + struct wapbl_blk *wb; + wb = wapbl_blkhash_get(wr, blk); + if (wb) { + KASSERT(wb->wb_blk == blk); + wb->wb_off = off; + } else { +#ifdef _KERNEL + wb = pool_get(&wapbl_blk_pool, PR_WAITOK); +#else /* ! _KERNEL */ + wb = wapbl_malloc(sizeof(*wb)); +#endif /* ! _KERNEL */ + wb->wb_blk = blk; + wb->wb_off = off; + wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; + LIST_INSERT_HEAD(wbh, wb, wb_hash); + wr->wr_blkhashcnt++; + } +} + +static void +wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) +{ + struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); + if (wb) { + KASSERT(wr->wr_blkhashcnt > 0); + wr->wr_blkhashcnt--; + LIST_REMOVE(wb, wb_hash); +#ifdef _KERNEL + pool_put(&wapbl_blk_pool, wb); +#else /* ! _KERNEL */ + wapbl_free(wb); +#endif /* ! 
_KERNEL */ + } +} + +static void +wapbl_blkhash_clear(struct wapbl_replay *wr) +{ + int i; + for (i = 0; i <= wr->wr_blkhashmask; i++) { + struct wapbl_blk *wb; + + while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { + KASSERT(wr->wr_blkhashcnt > 0); + wr->wr_blkhashcnt--; + LIST_REMOVE(wb, wb_hash); +#ifdef _KERNEL + pool_put(&wapbl_blk_pool, wb); +#else /* ! _KERNEL */ + wapbl_free(wb); +#endif /* ! _KERNEL */ + } + } + KASSERT(wr->wr_blkhashcnt == 0); +} + +/****************************************************************/ + +static int +wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) +{ + size_t slen; + struct wapbl_wc_header *wc = &wr->wr_wc_header; + off_t off = *offp; + int error; + + KASSERT(((len >> wc->wc_log_dev_bshift) << + wc->wc_log_dev_bshift) == len); + if (off < wc->wc_circ_off) + off = wc->wc_circ_off; + slen = wc->wc_circ_off + wc->wc_circ_size - off; + if (slen < len) { + error = wapbl_read(data, slen, wr->wr_devvp, + wr->wr_logpbn + (off >> wc->wc_log_dev_bshift)); + if (error) + return error; + data = (uint8_t *)data + slen; + len -= slen; + off = wc->wc_circ_off; + } + error = wapbl_read(data, len, wr->wr_devvp, + wr->wr_logpbn + (off >> wc->wc_log_dev_bshift)); + if (error) + return error; + off += len; + if (off >= wc->wc_circ_off + wc->wc_circ_size) + off = wc->wc_circ_off; + *offp = off; + return 0; +} + +static void +wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) +{ + size_t slen; + struct wapbl_wc_header *wc = &wr->wr_wc_header; + off_t off = *offp; + + KASSERT(((len >> wc->wc_log_dev_bshift) << + wc->wc_log_dev_bshift) == len); + + if (off < wc->wc_circ_off) + off = wc->wc_circ_off; + slen = wc->wc_circ_off + wc->wc_circ_size - off; + if (slen < len) { + len -= slen; + off = wc->wc_circ_off; + } + off += len; + if (off >= wc->wc_circ_off + wc->wc_circ_size) + off = wc->wc_circ_off; + *offp = off; +} + +/****************************************************************/ + +int 
+wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, + daddr_t off, size_t count, size_t blksize) +{ + struct wapbl_replay *wr; + int error; + struct vnode *devvp; + daddr_t logpbn; + uint8_t *scratch; + struct wapbl_wc_header *wch; + struct wapbl_wc_header *wch2; + /* Use this until we read the actual log header */ + int log_dev_bshift = DEV_BSHIFT; + size_t used; + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, + ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", + vp, off, count, blksize)); + + if (off < 0) + return EINVAL; + + if (blksize < DEV_BSIZE) + return EINVAL; + if (blksize % DEV_BSIZE) + return EINVAL; + +#ifdef _KERNEL +#if 0 + /* XXX vp->v_size isn't reliably set for VBLK devices, + * especially root. However, we might still want to verify + * that the full load is readable */ + if ((off + count) * blksize > vp->v_size) + return EINVAL; +#endif + + if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { + return error; + } +#else /* ! _KERNEL */ + devvp = vp; + logpbn = off; +#endif /* ! 
_KERNEL */ + + scratch = wapbl_malloc(MAXBSIZE); + + error = wapbl_read(scratch, 2<wc_type != WAPBL_WC_HEADER) { + printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); + error = EFTYPE; + goto errout; + } + + if (wch2->wc_generation > wch->wc_generation) + wch = wch2; + + wr = wapbl_calloc(1, sizeof(*wr)); + + wr->wr_logvp = vp; + wr->wr_devvp = devvp; + wr->wr_logpbn = logpbn; + + wr->wr_scratch = scratch; + + memcpy(&wr->wr_wc_header, wch, sizeof(wr->wr_wc_header)); + + used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, + ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 + " len=%"PRId64" used=%zu\n", + wch->wc_head, wch->wc_tail, wch->wc_circ_off, + wch->wc_circ_size, used)); + + wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); + error = wapbl_replay_prescan(wr); + if (error) { + wapbl_replay_stop(wr); + wapbl_replay_free(wr); + return error; + } + + error = wapbl_replay_get_inodes(wr); + if (error) { + wapbl_replay_stop(wr); + wapbl_replay_free(wr); + return error; + } + + *wrp = wr; + return 0; + + errout: + wapbl_free(scratch); + return error; +} + +void +wapbl_replay_stop(struct wapbl_replay *wr) +{ + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n")); + + KDASSERT(wapbl_replay_isopen(wr)); + + wapbl_free(wr->wr_scratch); + wr->wr_scratch = 0; + + wr->wr_logvp = 0; + + wapbl_blkhash_clear(wr); + wapbl_blkhash_free(wr); +} + +void +wapbl_replay_free(struct wapbl_replay *wr) +{ + + KDASSERT(!wapbl_replay_isopen(wr)); + + if (wr->wr_inodes) + wapbl_free(wr->wr_inodes); + wapbl_free(wr); +} + +int +wapbl_replay_isopen1(struct wapbl_replay *wr) +{ + + return wapbl_replay_isopen(wr); +} + +static int +wapbl_replay_prescan(struct wapbl_replay *wr) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int error; + + int logblklen = 1<wc_log_dev_bshift; + int fsblklen = 1<wc_fs_dev_bshift; + + wapbl_blkhash_clear(wr); + + off = wch->wc_tail; + while (off 
!= wch->wc_head) { + struct wapbl_wc_null *wcn; + off_t saveoff = off; + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) + goto errout; + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + for (i = 0; i < wc->wc_blkcount; i++) { + int j, n; + /* + * Enter each physical block into the + * hashtable independently + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + wapbl_blkhash_ins(wr, + wc->wc_blocks[i].wc_daddr + j, + off); + wapbl_circ_advance(wr, + fsblklen, &off); + } + } + } + break; + + case WAPBL_WC_REVOCATIONS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + for (i = 0; i < wc->wc_blkcount; i++) { + int j, n; + /* + * Remove any blocks found from the + * hashtable + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + wapbl_blkhash_rem(wr, + wc->wc_blocks[i].wc_daddr + j); + } + } + } + break; + + case WAPBL_WC_INODES: + { + struct wapbl_wc_inodelist *wc = + (struct wapbl_wc_inodelist *)wr->wr_scratch; + /* + * Keep track of where we found this so we + * can use it later + */ + if (wc->wc_clear) { + wr->wr_inodestail = saveoff; + wr->wr_inodescnt = 0; + } + if (wr->wr_inodestail) + wr->wr_inodeshead = off; + wr->wr_inodescnt += wc->wc_inocnt; + } + break; + default: + printf("Unrecognized wapbl type: 0x%08x\n", + wcn->wc_type); + error = EFTYPE; + goto errout; + } + wapbl_circ_advance(wr, wcn->wc_len, &saveoff); + if (off != saveoff) { + printf("wapbl_replay: corrupted records\n"); + error = EFTYPE; + goto errout; + } + } + return 0; + + errout: + wapbl_blkhash_clear(wr); + return error; +} + +static int +wapbl_replay_get_inodes(struct wapbl_replay *wr) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int logblklen = 1<wc_log_dev_bshift; + int 
cnt= 0; + + KDASSERT(wapbl_replay_isopen(wr)); + + if (wr->wr_inodescnt == 0) + return 0; + + KASSERT(!wr->wr_inodes); + + wr->wr_inodes = wapbl_malloc(wr->wr_inodescnt*sizeof(wr->wr_inodes[0])); + + off = wr->wr_inodestail; + + while (off != wr->wr_inodeshead) { + struct wapbl_wc_null *wcn; + int error; + off_t saveoff = off; + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) { + wapbl_free(wr->wr_inodes); + wr->wr_inodes = 0; + return error; + } + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + case WAPBL_WC_REVOCATIONS: + break; + case WAPBL_WC_INODES: + { + struct wapbl_wc_inodelist *wc = + (struct wapbl_wc_inodelist *)wr->wr_scratch; + /* + * Keep track of where we found this so we + * can use it later + */ + if (wc->wc_clear) { + cnt = 0; + } + /* This memcpy assumes that wr_inodes is + * laid out the same as wc_inodes. */ + memcpy(&wr->wr_inodes[cnt], wc->wc_inodes, + wc->wc_inocnt*sizeof(wc->wc_inodes[0])); + cnt += wc->wc_inocnt; + } + break; + default: + KASSERT(0); + } + off = saveoff; + wapbl_circ_advance(wr, wcn->wc_len, &off); + } + KASSERT(cnt == wr->wr_inodescnt); + return 0; +} + +#ifdef DEBUG +int +wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int mismatchcnt = 0; + int logblklen = 1<wc_log_dev_bshift; + int fsblklen = 1<wc_fs_dev_bshift; + void *scratch1 = wapbl_malloc(MAXBSIZE); + void *scratch2 = wapbl_malloc(MAXBSIZE); + int error = 0; + + KDASSERT(wapbl_replay_isopen(wr)); + + off = wch->wc_tail; + while (off != wch->wc_head) { + struct wapbl_wc_null *wcn; +#ifdef DEBUG + off_t saveoff = off; +#endif + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) + goto out; + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + 
for (i = 0; i < wc->wc_blkcount; i++) { + int foundcnt = 0; + int dirtycnt = 0; + int j, n; + /* + * Check each physical block into the + * hashtable independently + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + struct wapbl_blk *wb = + wapbl_blkhash_get(wr, + wc->wc_blocks[i].wc_daddr + j); + if (wb && (wb->wb_off == off)) { + foundcnt++; + error = + wapbl_circ_read(wr, + scratch1, fsblklen, + &off); + if (error) + goto out; + error = + wapbl_read(scratch2, + fsblklen, fsdevvp, + wb->wb_blk); + if (error) + goto out; + if (memcmp(scratch1, + scratch2, + fsblklen)) { + printf( + "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", + wb->wb_blk, (intmax_t)off); + dirtycnt++; + mismatchcnt++; + } + } else { + wapbl_circ_advance(wr, + fsblklen, &off); + } + } +#if 0 + /* + * If all of the blocks in an entry + * are clean, then remove all of its + * blocks from the hashtable since they + * never will need replay. + */ + if ((foundcnt != 0) && + (dirtycnt == 0)) { + off = saveoff; + wapbl_circ_advance(wr, + logblklen, &off); + for (j = 0; j < n; j++) { + struct wapbl_blk *wb = + wapbl_blkhash_get(wr, + wc->wc_blocks[i].wc_daddr + j); + if (wb && + (wb->wb_off == off)) { + wapbl_blkhash_rem(wr, wb->wb_blk); + } + wapbl_circ_advance(wr, + fsblklen, &off); + } + } +#endif + } + } + break; + case WAPBL_WC_REVOCATIONS: + case WAPBL_WC_INODES: + break; + default: + KASSERT(0); + } +#ifdef DEBUG + wapbl_circ_advance(wr, wcn->wc_len, &saveoff); + KASSERT(off == saveoff); +#endif + } + out: + wapbl_free(scratch1); + wapbl_free(scratch2); + if (!error && mismatchcnt) + error = EFTYPE; + return error; +} +#endif + +int +wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int logblklen = 1<wc_log_dev_bshift; + int fsblklen = 1<wc_fs_dev_bshift; + void *scratch1 = wapbl_malloc(MAXBSIZE); + int error = 0; + + KDASSERT(wapbl_replay_isopen(wr)); + + 
/* + * This parses the journal for replay, although it could + * just as easily walk the hashtable instead. + */ + + off = wch->wc_tail; + while (off != wch->wc_head) { + struct wapbl_wc_null *wcn; +#ifdef DEBUG + off_t saveoff = off; +#endif + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) + goto out; + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + for (i = 0; i < wc->wc_blkcount; i++) { + int j, n; + /* + * Check each physical block against + * the hashtable independently + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + struct wapbl_blk *wb = + wapbl_blkhash_get(wr, + wc->wc_blocks[i].wc_daddr + j); + if (wb && (wb->wb_off == off)) { + error = wapbl_circ_read( + wr, scratch1, + fsblklen, &off); + if (error) + goto out; + error = + wapbl_write(scratch1, + fsblklen, fsdevvp, + wb->wb_blk); + if (error) + goto out; + } else { + wapbl_circ_advance(wr, + fsblklen, &off); + } + } + } + } + break; + case WAPBL_WC_REVOCATIONS: + case WAPBL_WC_INODES: + break; + default: + KASSERT(0); + } +#ifdef DEBUG + wapbl_circ_advance(wr, wcn->wc_len, &saveoff); + KASSERT(off == saveoff); +#endif + } + out: + wapbl_free(scratch1); + return error; +} + +int +wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) +{ + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int fsblklen = 1<wc_fs_dev_bshift; + + KDASSERT(wapbl_replay_isopen(wr)); + + KASSERT((len % fsblklen) == 0); + + while (len != 0) { + struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); + if (wb) { + off_t off = wb->wb_off; + int error; + error = wapbl_circ_read(wr, data, fsblklen, &off); + if (error) + return error; + } + data = (uint8_t *)data + fsblklen; + len -= fsblklen; + blk++; + } + return 0; +} diff --git a/sys/kern/vnode_if.c b/sys/kern/vnode_if.c index 
44f4f7f419b5..830c8055eb9f 100644 --- a/sys/kern/vnode_if.c +++ b/sys/kern/vnode_if.c @@ -1,4 +1,4 @@ -/* $NetBSD: vnode_if.c,v 1.76 2008/01/25 14:32:46 ad Exp $ */ +/* $NetBSD: vnode_if.c,v 1.77 2008/07/31 05:38:05 simonb Exp $ */ /* * Warning: DO NOT EDIT! This file is automatically generated! @@ -40,7 +40,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: vnode_if.c,v 1.76 2008/01/25 14:32:46 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vnode_if.c,v 1.77 2008/07/31 05:38:05 simonb Exp $"); #include "opt_vnode_lockdebug.h" @@ -802,6 +802,7 @@ VOP_FSYNC(struct vnode *vp, mpsafe = (vp->v_vflag & VV_MPSAFE); if (!mpsafe) { KERNEL_LOCK(1, curlwp); } error = (VCALL(vp, VOFFSET(vop_fsync), &a)); + if (!mpsafe) { KERNEL_UNLOCK_ONE(curlwp); } return error; } diff --git a/sys/miscfs/genfs/genfs_io.c b/sys/miscfs/genfs/genfs_io.c index 3cce9ed6704d..240371f1a825 100644 --- a/sys/miscfs/genfs/genfs_io.c +++ b/sys/miscfs/genfs/genfs_io.c @@ -1,4 +1,4 @@ -/* $NetBSD: genfs_io.c,v 1.8 2008/06/04 12:41:40 ad Exp $ */ +/* $NetBSD: genfs_io.c,v 1.9 2008/07/31 05:38:05 simonb Exp $ */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -31,7 +31,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.8 2008/06/04 12:41:40 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.9 2008/07/31 05:38:05 simonb Exp $"); #include #include @@ -589,8 +589,22 @@ loopdone: */ if (!error && sawhole && blockalloc) { - error = GOP_ALLOC(vp, startoffset, npages << PAGE_SHIFT, 0, - cred); + /* + * XXX: This assumes that we come here only via + * the mmio path + */ + if (vp->v_mount->mnt_wapbl && write) { + error = WAPBL_BEGIN(vp->v_mount); + } + + if (!error) { + error = GOP_ALLOC(vp, startoffset, + npages << PAGE_SHIFT, 0, cred); + if (vp->v_mount->mnt_wapbl && write) { + WAPBL_END(vp->v_mount); + } + } + UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d", startoffset, npages << PAGE_SHIFT, error,0); if (!error) { diff --git a/sys/rump/fs/lib/libffs/Makefile b/sys/rump/fs/lib/libffs/Makefile index 
4e186faf6359..c074e16c97c0 100644 --- a/sys/rump/fs/lib/libffs/Makefile +++ b/sys/rump/fs/lib/libffs/Makefile @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.4 2008/07/29 13:17:42 pooka Exp $ +# $NetBSD: Makefile,v 1.5 2008/07/31 05:38:05 simonb Exp $ # .PATH: ${.CURDIR}/../../../../ufs/ffs @@ -7,9 +7,9 @@ LIB= rumpfs_ffs SRCS= ffs_alloc.c ffs_balloc.c ffs_bswap.c ffs_inode.c \ ffs_softdep.stub.c ffs_subr.c ffs_tables.c ffs_vfsops.c \ - ffs_vnops.c ffs_snapshot.c + ffs_vnops.c ffs_snapshot.c ffs_wapbl.c -CPPFLAGS+= -DFFS_NO_SNAPSHOT -DFFS_EI +CPPFLAGS+= -DFFS_NO_SNAPSHOT -DFFS_EI -DWAPBL CFLAGS+= -Wno-pointer-sign .include diff --git a/sys/rump/fs/lib/libufs/Makefile b/sys/rump/fs/lib/libufs/Makefile index e6575abb1336..7c9d9ec58a58 100644 --- a/sys/rump/fs/lib/libufs/Makefile +++ b/sys/rump/fs/lib/libufs/Makefile @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.5 2008/07/29 13:17:47 pooka Exp $ +# $NetBSD: Makefile,v 1.6 2008/07/31 05:38:05 simonb Exp $ # .PATH: ${.CURDIR}/../../../../ufs/ufs @@ -6,9 +6,9 @@ LIB= rumpfs_ufs SRCS= ufs_bmap.c ufs_dirhash.c ufs_ihash.c ufs_inode.c ufs_lookup.c \ - ufs_vfsops.c ufs_vnops.c + ufs_vfsops.c ufs_vnops.c ufs_wapbl.c -CPPFLAGS+= -DUFS_DIRHASH -DFFS_EI +CPPFLAGS+= -DUFS_DIRHASH -DFFS_EI -DWAPBL .include .include diff --git a/sys/rump/librump/rumpkern/Makefile.rumpkern b/sys/rump/librump/rumpkern/Makefile.rumpkern index 2cadd1a098f5..644f221356fa 100644 --- a/sys/rump/librump/rumpkern/Makefile.rumpkern +++ b/sys/rump/librump/rumpkern/Makefile.rumpkern @@ -1,4 +1,4 @@ -# $NetBSD: Makefile.rumpkern,v 1.2 2008/07/30 01:32:47 oster Exp $ +# $NetBSD: Makefile.rumpkern,v 1.3 2008/07/31 05:38:05 simonb Exp $ # .include "${RUMPTOP}/Makefile.rump" @@ -26,7 +26,7 @@ SRCS+= clock_subr.c kern_descrip.c kern_stub.c param.c \ subr_bufq.c subr_hash.c subr_prf2.c subr_specificdata.c \ subr_time.c subr_workqueue.c sys_descrip.c sys_generic.c vfs_bio.c \ vfs_cache.c vfs_getcwd.c vfs_hooks.c vfs_init.c vfs_lookup.c \ - vfs_subr.c vfs_vnops.c vfs_syscalls.c 
vnode_if.c \ + vfs_subr.c vfs_vnops.c vfs_syscalls.c vfs_wapbl.c vnode_if.c \ subr_kobj.c kern_module.c # sys/miscfs diff --git a/sys/rump/librump/rumpkern/rump.c b/sys/rump/librump/rumpkern/rump.c index 44fa4ace070c..ffdc9dfee3c7 100644 --- a/sys/rump/librump/rumpkern/rump.c +++ b/sys/rump/librump/rumpkern/rump.c @@ -1,4 +1,4 @@ -/* $NetBSD: rump.c,v 1.48 2008/07/29 13:17:47 pooka Exp $ */ +/* $NetBSD: rump.c,v 1.49 2008/07/31 05:38:05 simonb Exp $ */ /* * Copyright (c) 2007 Antti Kantee. All Rights Reserved. @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -39,7 +40,7 @@ #include #include #include -#include +#include #include @@ -135,6 +136,7 @@ rump_init() module_init(); vfsinit(); bufinit(); + wapbl_init(); rumpvfs_init(); diff --git a/sys/sys/Makefile b/sys/sys/Makefile index e8a98d8001ed..7920033b183b 100644 --- a/sys/sys/Makefile +++ b/sys/sys/Makefile @@ -1,4 +1,4 @@ -# $NetBSD: Makefile,v 1.109 2008/06/04 14:31:15 ad Exp $ +# $NetBSD: Makefile,v 1.110 2008/07/31 05:38:05 simonb Exp $ .include @@ -19,12 +19,13 @@ INCS= acct.h agpio.h aio.h ansi.h ataio.h atomic.h audioio.h \ joystick.h \ kcore.h kgdb.h kmem.h ksem.h ksyms.h ktrace.h \ lkm.h localedef.h lock.h lockf.h lwp.h lwpctl.h \ - malloc.h mallocvar.h mbuf.h md4.h \ - md5.h midiio.h mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \ + malloc.h mallocvar.h mbuf.h md4.h md5.h midiio.h \ + mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \ namei.h null.h \ param.h pipe.h pmc.h poll.h pool.h power.h proc.h \ protosw.h pset.h ptrace.h queue.h \ - ras.h rb.h reboot.h radioio.h resource.h resourcevar.h rmd160.h rnd.h rwlock.h \ + ras.h rb.h reboot.h radioio.h resource.h resourcevar.h rmd160.h \ + rnd.h rwlock.h \ scanio.h sched.h scsiio.h select.h selinfo.h sem.h sha1.h sha2.h \ shm.h siginfo.h signal.h signalvar.h sigtypes.h simplelock.h \ sleepq.h socket.h \ @@ -36,7 +37,7 @@ INCS= acct.h agpio.h aio.h ansi.h ataio.h atomic.h audioio.h \ 
ttydefaults.h ttydev.h types.h \ ucontext.h ucred.h uio.h un.h unistd.h unpcb.h user.h utsname.h uuid.h \ vadvise.h verified_exec.h vmmeter.h vnode.h vnode_if.h \ - wait.h wdog.h + wait.h wapbl.h wdog.h INCSYMLINKS=\ sys/exec_elf.h /usr/include/elf.h \ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index f5135b5392a7..3b8dbd70c6ad 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,4 +1,4 @@ -/* $NetBSD: buf.h,v 1.109 2008/06/09 15:42:01 ad Exp $ */ +/* $NetBSD: buf.h,v 1.110 2008/07/31 05:38:05 simonb Exp $ */ /*- * Copyright (c) 1999, 2000, 2007 The NetBSD Foundation, Inc. @@ -162,6 +162,7 @@ struct buf { LIST_ENTRY(buf) b_hash; /* c: hash chain */ LIST_ENTRY(buf) b_vnbufs; /* c: associated vnode */ TAILQ_ENTRY(buf) b_freelist; /* c: position if not active */ + LIST_ENTRY(buf) b_wapbllist; /* c: transaction buffer list */ daddr_t b_lblkno; /* c: logical block number */ int b_freelistindex;/* c: free list index (BQ_) */ u_int b_cflags; /* c: BC_* flags */ @@ -244,6 +245,7 @@ do { \ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #define B_METAONLY 0x04 /* Return indirect block buffer. */ +#define B_CONTIG 0x08 /* Allocate file contiguously. */ /* Flags to bread(), breadn() and breada(). 
*/ #define B_MODIFY 0x01 /* Hint: caller might modify buffer */ diff --git a/sys/sys/fstypes.h b/sys/sys/fstypes.h index f6a8df4f6d08..ba55187c4b21 100644 --- a/sys/sys/fstypes.h +++ b/sys/sys/fstypes.h @@ -1,4 +1,4 @@ -/* $NetBSD: fstypes.h,v 1.23 2008/05/06 18:43:45 ad Exp $ */ +/* $NetBSD: fstypes.h,v 1.24 2008/07/31 05:38:05 simonb Exp $ */ /* * Copyright (c) 1989, 1991, 1993 @@ -87,7 +87,6 @@ typedef struct fhandle fhandle_t; #define __MNT_UNUSED2 0x00200000 #define __MNT_UNUSED3 0x00800000 #define __MNT_UNUSED4 0x01000000 -#define __MNT_UNUSED5 0x02000000 #define MNT_RDONLY 0x00000001 /* read only filesystem */ #define MNT_SYNCHRONOUS 0x00000002 /* file system written synchronously */ @@ -98,6 +97,7 @@ typedef struct fhandle fhandle_t; #define MNT_ASYNC 0x00000040 /* file system written asynchronously */ #define MNT_NOCOREDUMP 0x00008000 /* don't write core dumps to this FS */ #define MNT_IGNORE 0x00100000 /* don't show entry in df */ +#define MNT_LOG 0x02000000 /* Use logging */ #define MNT_NOATIME 0x04000000 /* Never update access times in fs */ #define MNT_SYMPERM 0x20000000 /* recognize symlink permission */ #define MNT_NODEVMTIME 0x40000000 /* Never update mod times for devs */ @@ -116,7 +116,8 @@ typedef struct fhandle fhandle_t; { MNT_NOATIME, 0, "noatime" }, \ { MNT_SYMPERM, 0, "symperm" }, \ { MNT_NODEVMTIME, 0, "nodevmtime" }, \ - { MNT_SOFTDEP, 0, "soft dependencies" }, + { MNT_SOFTDEP, 0, "soft dependencies" }, \ + { MNT_LOG, 0, "log" }, /* * exported mount flags. @@ -176,7 +177,8 @@ typedef struct fhandle fhandle_t; MNT_EXPUBLIC | \ MNT_LOCAL | \ MNT_QUOTA | \ - MNT_ROOTFS) + MNT_ROOTFS | \ + MNT_LOG) /* * External filesystem control flags. 
@@ -223,7 +225,7 @@ typedef struct fhandle fhandle_t; "\35MNT_EXPUBLIC" \ "\34MNT_EXNORESPORT" \ "\33MNT_NOATIME" \ - "\32MNT_UNUSED" \ + "\32MNT_LOG" \ "\31MNT_UNUSED" \ "\30MNT_UNUSED" \ "\27MNT_GETARGS" \ diff --git a/sys/sys/mount.h b/sys/sys/mount.h index fe4a665b69a0..a52c65106c1a 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -1,4 +1,4 @@ -/* $NetBSD: mount.h,v 1.180 2008/07/30 18:10:38 pooka Exp $ */ +/* $NetBSD: mount.h,v 1.181 2008/07/31 05:38:05 simonb Exp $ */ /* * Copyright (c) 1989, 1991, 1993 @@ -121,6 +121,11 @@ struct mount { specificdata_reference mnt_specdataref; /* subsystem specific data */ kmutex_t mnt_updating; /* to serialize updates */ + struct wapbl_ops + *mnt_wapbl_op; /* logging ops */ + struct wapbl *mnt_wapbl; /* log info */ + struct wapbl_replay + *mnt_wapbl_replay; /* replay support XXX: what? */ }; /* @@ -278,6 +283,45 @@ int fsname##_extattrctl(struct mount *, int, struct vnode *, int, \ const char *); \ int fsname##_suspendctl(struct mount *, int) +/* + * This operations vector is so wapbl can be wrapped into a filesystem lkm. + * XXX Eventually, we want to move this functionality + * down into the filesystems themselves so that this isn't needed. 
+ */ +struct wapbl_ops { + void (*wo_wapbl_discard)(struct wapbl *); + int (*wo_wapbl_replay_isopen)(struct wapbl_replay *); + int (*wo_wapbl_replay_read)(struct wapbl_replay *, void *, daddr_t, long); + void (*wo_wapbl_add_buf)(struct wapbl *, struct buf *); + void (*wo_wapbl_remove_buf)(struct wapbl *, struct buf *); + void (*wo_wapbl_resize_buf)(struct wapbl *, struct buf *, long, long); + int (*wo_wapbl_begin)(struct wapbl *, const char *, int); + void (*wo_wapbl_end)(struct wapbl *); + void (*wo_wapbl_junlock_assert)(struct wapbl *); + void (*wo_wapbl_biodone)(struct buf *); +}; +#define WAPBL_DISCARD(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_discard)((MP)->mnt_wapbl) +#define WAPBL_REPLAY_ISOPEN(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_replay_isopen)((MP)->mnt_wapbl_replay) +#define WAPBL_REPLAY_READ(MP, DATA, BLK, LEN) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_replay_read)((MP)->mnt_wapbl_replay, \ + (DATA), (BLK), (LEN)) +#define WAPBL_ADD_BUF(MP, BP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_add_buf)((MP)->mnt_wapbl, (BP)) +#define WAPBL_REMOVE_BUF(MP, BP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_remove_buf)((MP)->mnt_wapbl, (BP)) +#define WAPBL_RESIZE_BUF(MP, BP, OLDSZ, OLDCNT) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_resize_buf)((MP)->mnt_wapbl, (BP), \ + (OLDSZ), (OLDCNT)) +#define WAPBL_BEGIN(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_begin)((MP)->mnt_wapbl, \ + __FILE__, __LINE__) +#define WAPBL_END(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_end)((MP)->mnt_wapbl) +#define WAPBL_JUNLOCK_ASSERT(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_junlock_assert)((MP)->mnt_wapbl) + struct vfs_hooks { void (*vh_unmount)(struct mount *); LIST_ENTRY(vfs_hooks) vfs_hooks_list; diff --git a/sys/sys/stat.h b/sys/sys/stat.h index 5267ddf83300..d806f6c82000 100644 --- a/sys/sys/stat.h +++ b/sys/sys/stat.h @@ -1,4 +1,4 @@ -/* $NetBSD: stat.h,v 1.56 2007/10/19 15:58:52 christos Exp $ */ +/* $NetBSD: stat.h,v 1.57 2008/07/31 05:38:06 simonb Exp $ */ /*- * Copyright (c) 1982, 1986, 1989, 1993 @@ -214,6 +214,7 @@ 
struct stat { #define SF_APPEND 0x00040000 /* writes to file may only append */ /* SF_NOUNLINK 0x00100000 [NOT IMPLEMENTED] */ #define SF_SNAPSHOT 0x00200000 /* snapshot inode */ +#define SF_LOG 0x00400000 /* WAPBL log file inode */ #ifdef _KERNEL /* diff --git a/sys/sys/statvfs.h b/sys/sys/statvfs.h index bae423d5a4f9..1db40cf3ea16 100644 --- a/sys/sys/statvfs.h +++ b/sys/sys/statvfs.h @@ -1,4 +1,4 @@ -/* $NetBSD: statvfs.h,v 1.14 2008/04/28 20:24:11 martin Exp $ */ +/* $NetBSD: statvfs.h,v 1.15 2008/07/31 05:38:06 simonb Exp $ */ /*- * Copyright (c) 2004 The NetBSD Foundation, Inc. @@ -118,6 +118,7 @@ struct statvfs { #define ST_SYMPERM MNT_SYMPERM #define ST_NODEVMTIME MNT_NODEVMTIME #define ST_SOFTDEP MNT_SOFTDEP +#define ST_LOG MNT_LOG #define ST_EXRDONLY MNT_EXRDONLY #define ST_EXPORTED MNT_EXPORTED diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 02a430ce8b07..5c659bbc1e11 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -1,4 +1,4 @@ -/* $NetBSD: vnode.h,v 1.196 2008/06/24 11:21:46 ad Exp $ */ +/* $NetBSD: vnode.h,v 1.197 2008/07/31 05:38:06 simonb Exp $ */ /*- * Copyright (c) 2008 The NetBSD Foundation, Inc. 
@@ -296,6 +296,7 @@ struct vattr { #define IO_NORMAL 0x00800 /* operate on regular data */ #define IO_EXT 0x01000 /* operate on extended attributes */ #define IO_DIRECT 0x02000 /* direct I/O hint */ +#define IO_JOURNALLOCKED 0x04000 /* journal is already locked */ #define IO_ADV_MASK 0x00003 /* access pattern hint */ #define IO_ADV_SHIFT 0 @@ -342,6 +343,7 @@ extern const int vttoif_tab[]; #define FSYNC_DATAONLY 0x0002 /* fsync: hint: sync file data only */ #define FSYNC_RECLAIM 0x0004 /* fsync: hint: vnode is being reclaimed */ #define FSYNC_LAZY 0x0008 /* fsync: lazy sync (trickle) */ +#define FSYNC_NOLOG 0x0010 /* fsync: do not flush the log */ #define FSYNC_CACHE 0x0100 /* fsync: flush disk caches too */ #define FSYNC_VFS 0x0200 /* fsync: via FSYNC_VFS() */ diff --git a/sys/sys/wapbl.h b/sys/sys/wapbl.h new file mode 100644 index 000000000000..b985e906d83f --- /dev/null +++ b/sys/sys/wapbl.h @@ -0,0 +1,381 @@ +/* $NetBSD: wapbl.h,v 1.2 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2003,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _SYS_WAPBL_H +#define _SYS_WAPBL_H + +#include + +#include + +/* This header file describes the api and data structures for + * write ahead physical block logging (WAPBL) support. + */ + +#if defined(_KERNEL_OPT) +#include "opt_wapbl.h" +#endif + +#ifdef WAPBL_DEBUG +#ifndef WAPBL_DEBUG_PRINT +#define WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN) +#endif + +#if 0 +#define WAPBL_DEBUG_BUFBYTES +#define WAPBL_DEBUG_SERIALIZE +#endif + +#endif + +#ifdef WAPBL_DEBUG_PRINT + +enum { + WAPBL_PRINT_OPEN = 0x1, + WAPBL_PRINT_FLUSH = 0x2, + WAPBL_PRINT_TRUNCATE = 0x4, + WAPBL_PRINT_TRANSACTION = 0x8, + WAPBL_PRINT_BUFFER = 0x10, + WAPBL_PRINT_BUFFER2 = 0x20, + WAPBL_PRINT_ALLOC = 0x40, + WAPBL_PRINT_INODE = 0x80, + WAPBL_PRINT_WRITE = 0x100, + WAPBL_PRINT_IO = 0x200, + WAPBL_PRINT_REPLAY = 0x400, + WAPBL_PRINT_ERROR = 0x800, + WAPBL_PRINT_DISCARD = 0x1000, + WAPBL_PRINT_BIODONE = 0x2000, +}; + +#define WAPBL_PRINTF(mask, a) if (wapbl_debug_print & (mask)) printf a +extern int wapbl_debug_print; +#else +#define WAPBL_PRINTF(mask, a) +#endif + +/****************************************************************/ + +/* The WAPBL journal layout. + * + * The journal consists of a header followed by a circular buffer + * region. The circular data area is described by the header + * wc_circ_off, wc_circ_size, wc_head and wc_tail fields as bytes + * from the start of the journal header. 
New records are inserted + * at wc_head and the oldest valid record can be found at wc_tail. + * When ((wc_head == wc_tail) && (wc_head == 0)), the journal is empty. + * The condition of ((wc_head == wc_tail) && (wc_head != 0)) + * indicates a full journal, although this condition is rare. + * + * The journal header as well as its records are marked by a 32bit + * type tag and length for ease of parsing. Journal records are + * padded so as to fall on journal device block boundaries. + * (XXX i think there is currently a bug wrt WC_BLOCKS not ending + * correctly on a journal device block boundary. this would need + * to be fixed if the journal blocksize does not match filesystem.) + */ + +/* + * The following are the 4 record types used by the journal: + * Each tag indicates journal data organized by one of the + * structures used below. + */ +enum { + WAPBL_WC_HEADER = 0x5741424c, /* "WABL", struct wapbl_wc_header */ + WAPBL_WC_INODES, /* struct wapbl_wc_inodelist */ + WAPBL_WC_REVOCATIONS, /* struct wapbl_wc_blocklist */ + WAPBL_WC_BLOCKS, /* struct wapbl_wc_blocklist */ +}; + +/* null entry (on disk) */ +/* This structure isn't used directly, but shares its header + * layout with all the other log structures for the purpose + * of reading a log structure and determining its type + */ +struct wapbl_wc_null { + uint32_t wc_type; /* WAPBL_WC_* */ + int32_t wc_len; + uint8_t wc_spare[0]; /* actually longer */ +}; + +/* journal header (on-disk) + * This record is found at the start of the + * journal, but not within the circular buffer region. As well as + * describing the journal parameters and matching filesystem, it + * additionally serves as the atomic update record for journal + * updates. 
+ */ +struct wapbl_wc_header { + uint32_t wc_type; /* WAPBL_WC_HEADER log magic number */ + int32_t wc_len; /* length of this journal entry */ + uint32_t wc_checksum; + uint32_t wc_generation; + int32_t wc_fsid[2]; + uint64_t wc_time; + uint32_t wc_timensec; + uint32_t wc_version; + uint32_t wc_log_dev_bshift; + uint32_t wc_fs_dev_bshift; + int64_t wc_head; + int64_t wc_tail; + int64_t wc_circ_off; /* offset of circ buffer region */ + int64_t wc_circ_size; /* size of circular buffer region */ + uint8_t wc_spare[0]; /* actually longer */ +}; + +/* list of blocks (on disk) + * This record is used to describe a set of filesystem blocks, + * and is used with two type tags, WAPBL_WC_BLOCKS and + * WAPBL_WC_REVOCATIONS. + * + * For WAPBL_WC_BLOCKS, a copy of each listed block can be found + * starting at the next log device blocksize boundary, that is, + * one log device block after the start of the record. This contains + * the bulk of the filesystem journal data which is written using + * these records before being written into the filesystem. + * + * The WAPBL_WC_REVOCATIONS record is used to indicate that any + * previously listed blocks should not be written into the filesystem. + * This is important so that deallocated and reallocated data blocks + * do not get overwritten with stale data from the journal. The + * revocation records do not contain a copy of any actual block data. + */ +struct wapbl_wc_blocklist { + uint32_t wc_type; /* WAPBL_WC_{REVOCATIONS,BLOCKS} */ + int32_t wc_len; + int32_t wc_blkcount; + int32_t wc_unused; + struct { + int64_t wc_daddr; + int32_t wc_unused; + int32_t wc_dlen; + } wc_blocks[0]; /* actually longer */ +}; + +/* list of inodes (on disk) + * This record is used to describe the set of inodes which + * may be allocated but are unlinked. Inodes end up listed here + * while they are in the process of being initialized and + * deinitialized.
Inodes unlinked while in use by a process + * will be listed here and the actual deletion must be completed + * on journal replay. + */ +struct wapbl_wc_inodelist { + uint32_t wc_type; /* WAPBL_WC_INODES */ + int32_t wc_len; + int32_t wc_inocnt; + int32_t wc_clear; /* set if previously listed inodes + should be ignored */ + struct { + uint32_t wc_inumber; + uint32_t wc_imode; + } wc_inodes[0]; /* actually longer */ +}; + +/****************************************************************/ + +#include +#include +#include + +typedef void (*wapbl_flush_fn_t)(struct mount *, daddr_t *, int *, int); + +#ifdef _KERNEL + +struct wapbl_entry; +struct wapbl_wc_header; +struct wapbl_replay; +struct wapbl; + +/* + * This structure holds per transaction log information + */ +struct wapbl_entry { + struct wapbl *we_wapbl; + SIMPLEQ_ENTRY(wapbl_entry) we_entries; + size_t we_bufcount; /* Count of unsynced buffers */ + size_t we_reclaimable_bytes; /* Number of on-disk bytes for this + transaction */ + int we_error; +#ifdef WAPBL_DEBUG_BUFBYTES + size_t we_unsynced_bufbytes; /* Byte count of unsynced buffers */ +#endif +}; + +void wapbl_init(void); + +/* Start using a log */ +int wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t, + size_t, size_t, struct wapbl_replay *, + wapbl_flush_fn_t, wapbl_flush_fn_t); + +/* Discard the current transaction, potentially dangerous */ +void wapbl_discard(struct wapbl *); + +/* stop using a log */ +int wapbl_stop(struct wapbl *, int); + +/* + * Begin a new transaction or increment transaction recursion + * level if called while a transaction is already in progress + * by the current process. + */ +int wapbl_begin(struct wapbl *, const char *, int); + + +/* End a transaction or decrement the transaction recursion level */ +void wapbl_end(struct wapbl *); + +/* + * Add a new buffer to the current transaction.
The buffer's + * data will be copied to the current transaction log and the + * buffer will be marked B_LOCKED so that it will not be + * flushed to disk by the syncer or reallocated. + */ +void wapbl_add_buf(struct wapbl *, struct buf *); + +/* Remove a buffer from the current transaction. */ +void wapbl_remove_buf(struct wapbl *, struct buf *); + +void wapbl_resize_buf(struct wapbl *, struct buf *, long, long); + +/* + * This will flush all completed transactions to disk and + * start asynchronous writes on the associated buffers + */ +int wapbl_flush(struct wapbl *, int); + +/* + * Inodes that are allocated but have zero link count + * must be registered with the current transaction + * so they may be recorded in the log and cleaned up later. + * Registration/unregistration of ino numbers already registered is ok. + */ +void wapbl_register_inode(struct wapbl *, ino_t, mode_t); +void wapbl_unregister_inode(struct wapbl *, ino_t, mode_t); + +/* + * Metadata block deallocations must be registered so + * that revocation records can be written and to prevent + * the corresponding blocks from being reused as data + * blocks until the log is on disk.
+ */ +void wapbl_register_deallocation(struct wapbl *, daddr_t, int); + +void wapbl_jlock_assert(struct wapbl *wl); +void wapbl_junlock_assert(struct wapbl *wl); + +void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...)); + +#if defined(WAPBL_DEBUG) || defined(DDB) +void wapbl_dump(struct wapbl *); +#endif + +void wapbl_biodone(struct buf *); + +extern struct wapbl_ops wapbl_ops; + +static __inline struct mount * +wapbl_vptomp(struct vnode *vp) +{ + struct mount *mp; + + mp = NULL; + if (vp != NULL) { + if (vp->v_type == VBLK) + mp = vp->v_specmountpoint; + else + mp = vp->v_mount; + } + + return mp; +} + +static __inline bool +wapbl_vphaswapbl(struct vnode *vp) +{ + struct mount *mp; + + if (vp == NULL) + return false; + + mp = wapbl_vptomp(vp); + if (mp && mp->mnt_wapbl) + return true; + else + return false; +} + +#endif /* _KERNEL */ + +/****************************************************************/ +/* Replay support */ + +struct wapbl_replay { + struct vnode *wr_logvp; + struct vnode *wr_devvp; + daddr_t wr_logpbn; + + struct wapbl_wc_header wr_wc_header; + void *wr_scratch; + + LIST_HEAD(wapbl_blk_head, wapbl_blk) *wr_blkhash; + u_long wr_blkhashmask; + int wr_blkhashcnt; + + off_t wr_inodeshead; + off_t wr_inodestail; + int wr_inodescnt; + struct { + uint32_t wr_inumber; + uint32_t wr_imode; + } *wr_inodes; +}; + +#define wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0) + +int wapbl_replay_isopen1(struct wapbl_replay *); +int wapbl_replay_start(struct wapbl_replay **, struct vnode *, + daddr_t, size_t, size_t); +void wapbl_replay_stop(struct wapbl_replay *); +void wapbl_replay_free(struct wapbl_replay *); +int wapbl_replay_verify(struct wapbl_replay *, struct vnode *); +int wapbl_replay_write(struct wapbl_replay *, struct vnode *); +int wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long); + +/****************************************************************/ + +/* Supply this to provide i/o support */ +int wapbl_write(void *, 
size_t, struct vnode *, daddr_t); +int wapbl_read(void *, size_t, struct vnode *, daddr_t); + +/****************************************************************/ + +#endif /* !_SYS_WAPBL_H */ diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 9006ca6dca15..1bbf8a6b9e8e 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -1,4 +1,33 @@ -/* $NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $ */ +/* $NetBSD: ffs_alloc.c,v 1.111 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 2002 Networks Associates Technology, Inc. @@ -41,7 +70,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.111 2008/07/31 05:38:06 simonb Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" @@ -51,13 +80,14 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $ #include #include #include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -65,21 +95,22 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.110 2008/07/11 05:31:44 simonb Exp $ #include #include #include +#include #include #include -static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int); -static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t); +static daddr_t ffs_alloccg(struct inode *, int, daddr_t, int, int); +static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int); #ifdef XXXUBC static daddr_t ffs_clusteralloc(struct inode *, int, daddr_t, int); #endif static ino_t ffs_dirpref(struct inode *); static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int); static void ffs_fserr(struct fs *, u_int, const char *); -static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, - daddr_t (*)(struct inode *, int, daddr_t, int)); -static daddr_t ffs_nodealloccg(struct inode *, int, 
daddr_t, int); +static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int, + daddr_t (*)(struct inode *, int, daddr_t, int, int)); +static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int); static int32_t ffs_mapsearch(struct fs *, struct cg *, daddr_t, int); #if defined(DIAGNOSTIC) || defined(DEBUG) @@ -118,7 +149,7 @@ extern const u_char * const fragtbl[]; * => releases um_lock before returning */ int -ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, +ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, int flags, kauth_cred_t cred, daddr_t *bnp) { struct ufsmount *ump; @@ -174,13 +205,14 @@ ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, return (error); mutex_enter(&ump->um_lock); #endif + if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); - bno = ffs_hashalloc(ip, cg, bpref, size, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, size, flags, ffs_alloccg); if (bno > 0) { DIP_ADD(ip, blocks, btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; @@ -193,6 +225,20 @@ ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size, */ (void) chkdq(ip, -btodb(size), cred, FORCE); #endif + if (flags & B_CONTIG) { + /* + * XXX ump->um_lock handling is "suspect" at best. + * For the case where ffs_hashalloc() fails early + * in the B_CONTIG case we reach here with um_lock + * already unlocked, so we can't release it again + * like in the normal error path. See kern/39206. + * + * + * Fail silently - it's up to our caller to report + * errors. 
+ */ + return (ENOSPC); + } nospace: mutex_exit(&ump->um_lock); ffs_fserr(fs, kauth_cred_geteuid(cred), "file system full"); @@ -372,14 +418,30 @@ ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bpref, int osize, panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } - bno = ffs_hashalloc(ip, cg, bpref, request, ffs_alloccg); + bno = ffs_hashalloc(ip, cg, bpref, request, 0, ffs_alloccg); if (bno > 0) { - if (!DOINGSOFTDEP(ITOV(ip))) - ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, - ip->i_number); - if (nsize < request) - ffs_blkfree(fs, ip->i_devvp, bno + numfrags(fs, nsize), - (long)(request - nsize), ip->i_number); + if (!DOINGSOFTDEP(ITOV(ip))) { + if ((ip->i_ump->um_mountp->mnt_wapbl) && + (ITOV(ip)->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + ip->i_ump->um_mountp, fsbtodb(fs, bprev), + osize); + } else + ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize, + ip->i_number); + } + if (nsize < request) { + if ((ip->i_ump->um_mountp->mnt_wapbl) && + (ITOV(ip)->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + ip->i_ump->um_mountp, + fsbtodb(fs, (bno + numfrags(fs, nsize))), + request - nsize); + } else + ffs_blkfree(fs, ip->i_devvp, + bno + numfrags(fs, nsize), + (long)(request - nsize), ip->i_number); + } DIP_ADD(ip, blocks, btodb(nsize - osize)); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (bpp != NULL) { @@ -443,7 +505,7 @@ struct ctldebug debug15 = { "prtrealloc", &prtrealloc }; #endif /* - * NOTE: when re-enabling this, it must be updated for UFS2. + * NOTE: when re-enabling this, it must be updated for UFS2 and WAPBL. */ int doasyncfree = 1; @@ -548,7 +610,7 @@ ffs_reallocblks(void *v) * Search the block map looking for an allocation of the desired size. 
*/ if ((newblk = (daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref, - len, ffs_clusteralloc)) == 0) { + len, flags, ffs_clusteralloc)) == 0) { mutex_exit(&ump->um_lock); goto fail; } @@ -696,11 +758,17 @@ ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t ino, ipref; int cg, error; + UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount); + *vpp = NULL; pip = VTOI(pvp); fs = pip->i_fs; ump = pip->i_ump; + error = UFS_WAPBL_BEGIN(pvp->v_mount); + if (error) { + return error; + } mutex_enter(&ump->um_lock); if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; @@ -723,12 +791,18 @@ ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } - ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, ffs_nodealloccg); + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, ffs_nodealloccg); if (ino == 0) goto noinodes; + UFS_WAPBL_END(pvp->v_mount); error = VFS_VGET(pvp->v_mount, ino, vpp); if (error) { - ffs_vfree(pvp, ino, mode); + int err; + err = UFS_WAPBL_BEGIN(pvp->v_mount); + if (err == 0) + ffs_vfree(pvp, ino, mode); + if (err == 0) + UFS_WAPBL_END(pvp->v_mount); return (error); } KASSERT((*vpp)->v_type == VNON); @@ -774,6 +848,7 @@ ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, return (0); noinodes: mutex_exit(&ump->um_lock); + UFS_WAPBL_END(pvp->v_mount); ffs_fserr(fs, kauth_cred_geteuid(cred), "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); return (ENOSPC); @@ -922,7 +997,7 @@ ffs_dirpref(struct inode *pip) * => um_lock held on entry and exit */ daddr_t -ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, +ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags, int32_t *bap /* XXX ondisk32 */) { struct fs *fs; @@ -932,6 +1007,26 @@ ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, KASSERT(mutex_owned(&ip->i_ump->um_lock)); fs = ip->i_fs; + + /* + * If allocating a contiguous file with B_CONTIG, use the hints + * in the inode 
extensions to return the desired block. + * + * For metadata (indirect blocks) return the address of where + * the first indirect block resides - we'll scan for the next + * available slot if we need to allocate more than one indirect + * block. For data, return the address of the actual block + * relative to the address of the first data block. + */ + if (flags & B_CONTIG) { + KASSERT(ip->i_ffs_first_data_blk != 0); + KASSERT(ip->i_ffs_first_indir_blk != 0); + if (flags & B_METAONLY) + return ip->i_ffs_first_indir_blk; + else + return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn); + } + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { if (lbn < NDADDR + NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); @@ -966,7 +1061,8 @@ ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, } daddr_t -ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int64_t *bap) +ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags, + int64_t *bap) { struct fs *fs; int cg; @@ -975,6 +1071,26 @@ ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int64_t *bap) KASSERT(mutex_owned(&ip->i_ump->um_lock)); fs = ip->i_fs; + + /* + * If allocating a contiguous file with B_CONTIG, use the hints + * in the inode extensions to return the desired block. + * + * For metadata (indirect blocks) return the address of where + * the first indirect block resides - we'll scan for the next + * available slot if we need to allocate more than one indirect + * block. For data, return the address of the actual block + * relative to the address of the first data block. 
+ */ + if (flags & B_CONTIG) { + KASSERT(ip->i_ffs_first_data_blk != 0); + KASSERT(ip->i_ffs_first_indir_blk != 0); + if (flags & B_METAONLY) + return ip->i_ffs_first_indir_blk; + else + return ip->i_ffs_first_data_blk + blkstofrags(fs, lbn); + } + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { if (lbn < NDADDR + NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); @@ -1025,7 +1141,7 @@ ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int64_t *bap) static daddr_t ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, int size /* size for data blocks, mode for inodes */, - daddr_t (*allocator)(struct inode *, int, daddr_t, int)) + int flags, daddr_t (*allocator)(struct inode *, int, daddr_t, int, int)) { struct fs *fs; daddr_t result; @@ -1035,9 +1151,12 @@ ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, /* * 1: preferred cylinder group */ - result = (*allocator)(ip, cg, pref, size); + result = (*allocator)(ip, cg, pref, size, flags); if (result) return (result); + + if (flags & B_CONTIG) + return (result); /* * 2: quadratic rehash */ @@ -1045,7 +1164,7 @@ ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, flags); if (result) return (result); } @@ -1056,7 +1175,7 @@ ffs_hashalloc(struct inode *ip, int cg, daddr_t pref, */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { - result = (*allocator)(ip, cg, 0, size); + result = (*allocator)(ip, cg, 0, size, flags); if (result) return (result); cg++; @@ -1157,7 +1276,7 @@ ffs_fragextend(struct inode *ip, int cg, daddr_t bprev, int osize, int nsize) * and if it is, allocate it. 
*/ static daddr_t -ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) +ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size, int flags) { struct ufsmount *ump; struct fs *fs = ip->i_fs; @@ -1192,7 +1311,7 @@ ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) cgp->cg_time = ufs_rw64(time_second, needswap); if (size == fs->fs_bsize) { mutex_enter(&ump->um_lock); - blkno = ffs_alloccgblk(ip, bp, bpref); + blkno = ffs_alloccgblk(ip, bp, bpref, flags); ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); bdwrite(bp); @@ -1216,7 +1335,7 @@ ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) if (cgp->cg_cs.cs_nbfree == 0) goto fail; mutex_enter(&ump->um_lock); - blkno = ffs_alloccgblk(ip, bp, bpref); + blkno = ffs_alloccgblk(ip, bp, bpref, flags); bno = dtogd(fs, blkno); for (i = frags; i < fs->fs_frag; i++) setbit(blksfree, bno + i); @@ -1276,7 +1395,7 @@ ffs_alloccg(struct inode *ip, int cg, daddr_t bpref, int size) * blocks may be fragmented by the routine that allocates them. */ static daddr_t -ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref) +ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int flags) { struct ufsmount *ump; struct fs *fs = ip->i_fs; @@ -1304,7 +1423,14 @@ ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref) */ if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) goto gotit; + /* + * if the requested data block isn't available and we are + * trying to allocate a contiguous file, return an error. + */ + if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG) + return (0); } + /* * Take the next available block in this cylinder group. 
*/ @@ -1453,7 +1579,7 @@ ffs_clusteralloc(struct inode *ip, int cg, daddr_t bpref, int len) len = blkstofrags(fs, len); mutex_enter(&ump->um_lock); for (i = 0; i < len; i += fs->fs_frag) - if ((got = ffs_alloccgblk(ip, bp, bno + i)) != bno + i) + if ((got = ffs_alloccgblk(ip, bp, bno + i, flags)) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECG_CLR(fs, cg); mutex_exit(&ump->um_lock); @@ -1477,7 +1603,7 @@ fail: * inode in the specified cylinder group. */ static daddr_t -ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode) +ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode, int flags) { struct ufsmount *ump = ip->i_ump; struct fs *fs = ip->i_fs; @@ -1492,6 +1618,7 @@ ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode) #endif KASSERT(mutex_owned(&ump->um_lock)); + UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp); if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); @@ -1542,6 +1669,8 @@ ffs_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode) panic("ffs_nodealloccg: block not in map"); /* NOTREACHED */ gotit: + UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref, + mode); /* * Check to see if we need to initialize more inodes. */ @@ -1593,6 +1722,122 @@ gotit: return (0); } +/* + * Allocate a block or fragment. + * + * The specified block or fragment is removed from the + * free map, possibly fragmenting a block in the process. 
+ * + * This implementation should mirror ffs_blkfree + * + * => um_lock not held on entry or exit + */ +int +ffs_blkalloc(struct inode *ip, daddr_t bno, long size) +{ + struct ufsmount *ump = ip->i_ump; + struct fs *fs = ip->i_fs; + struct cg *cgp; + struct buf *bp; + int32_t fragno, cgbno; + int i, error, cg, blk, frags, bbase; + u_int8_t *blksfree; + const int needswap = UFS_FSNEEDSWAP(fs); + + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || + fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { + printf("dev = 0x%x, bno = %" PRId64 " bsize = %d, " + "size = %ld, fs = %s\n", + ip->i_dev, bno, fs->fs_bsize, size, fs->fs_fsmnt); + panic("blkalloc: bad size"); + } + cg = dtog(fs, bno); + if (bno >= fs->fs_size) { + printf("bad block %" PRId64 ", ino %" PRId64 "\n", bno, + ip->i_number); + ffs_fserr(fs, ip->i_uid, "bad block"); + return EINVAL; + } + error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, B_MODIFY, &bp); + if (error) { + brelse(bp, 0); + return error; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp, needswap)) { + brelse(bp, 0); + return EIO; + } + cgp->cg_old_time = ufs_rw32(time_second, needswap); + cgp->cg_time = ufs_rw64(time_second, needswap); + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp, needswap); + + mutex_enter(&ump->um_lock); + if (size == fs->fs_bsize) { + fragno = fragstoblks(fs, cgbno); + if (!ffs_isblock(fs, blksfree, fragno)) { + mutex_exit(&ump->um_lock); + brelse(bp, 0); + return EBUSY; + } + ffs_clrblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cg).cs_nbfree--; + } else { + bbase = cgbno - fragnum(fs, cgbno); + + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isclr(blksfree, cgbno + i)) { + mutex_exit(&ump->um_lock); + brelse(bp, 0); + return EBUSY; + } + } + /* + * if a complete block is being split, account for it + */ + fragno = 
fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap); + fs->fs_cstotal.cs_nffree += fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, -1); + ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap); + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cg).cs_nbfree--; + } + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap); + /* + * allocate the fragment + */ + for (i = 0; i < frags; i++) { + clrbit(blksfree, cgbno + i); + } + ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap); + fs->fs_cstotal.cs_nffree -= i; + fs->fs_cs(fs, cg).cs_nffree -= i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap); + } + fs->fs_fmod = 1; + ACTIVECG_CLR(fs, cg); + mutex_exit(&ump->um_lock); + bdwrite(bp); + return 0; +} + /* * Free a block or fragment. * @@ -1817,6 +2062,8 @@ ffs_vfree(struct vnode *vp, ino_t ino, int mode) /* * Do the actual free operation. * The specified inode is placed back in the free map. 
+ * + * => um_lock not held on entry or exit */ int ffs_freefile(struct fs *fs, struct vnode *devvp, ino_t ino, int mode) @@ -1832,6 +2079,8 @@ ffs_freefile(struct fs *fs, struct vnode *devvp, ino_t ino, int mode) const int needswap = UFS_FSNEEDSWAP(fs); #endif + UFS_WAPBL_JLOCK_ASSERT(devvp->v_specinfo->si_mountpoint); + cg = ino_to_cg(fs, ino); if (devvp->v_type != VBLK) { /* devvp is a snapshot */ @@ -1871,6 +2120,8 @@ ffs_freefile(struct fs *fs, struct vnode *devvp, ino_t ino, int mode) panic("ifree: freeing free inode"); } clrbit(inosused, ino); + UFS_WAPBL_UNREGISTER_INODE(devvp->v_specmountpoint, + ino + cg * fs->fs_ipg, mode); if (ino < ufs_rw32(cgp->cg_irotor, needswap)) cgp->cg_irotor = ufs_rw32(ino, needswap); ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap); diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index a2e82d689bcb..661d6210747f 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_balloc.c,v 1.50 2008/06/03 09:47:49 hannken Exp $ */ +/* $NetBSD: ffs_balloc.c,v 1.51 2008/07/31 05:38:06 simonb Exp $ */ /* * Copyright (c) 2002 Networks Associates Technology, Inc. 
@@ -41,7 +41,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.50 2008/06/03 09:47:49 hannken Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ffs_balloc.c,v 1.51 2008/07/31 05:38:06 simonb Exp $"); #if defined(_KERNEL_OPT) #include "opt_quota.h" @@ -141,7 +141,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, if (osize < fs->fs_bsize && osize > 0) { mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, nb, - ffs_blkpref_ufs1(ip, lastlbn, nb, + ffs_blkpref_ufs1(ip, lastlbn, nb, flags, &ip->i_ffs1_db[0]), osize, (int)fs->fs_bsize, cred, bpp, &newb); if (error) @@ -222,9 +222,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, */ mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, lbn, - ffs_blkpref_ufs1(ip, lbn, (int)lbn, - &ip->i_ffs1_db[0]), osize, nsize, cred, - bpp, &newb); + ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, + &ip->i_ffs1_db[0]), + osize, nsize, cred, bpp, &newb); if (error) return (error); if (DOINGSOFTDEP(vp)) @@ -245,9 +245,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, nsize = fs->fs_bsize; mutex_enter(&ump->um_lock); error = ffs_alloc(ip, lbn, - ffs_blkpref_ufs1(ip, lbn, (int)lbn, + ffs_blkpref_ufs1(ip, lbn, (int)lbn, flags, &ip->i_ffs1_db[0]), - nsize, cred, &newb); + nsize, flags, cred, &newb); if (error) return (error); if (bpp != NULL) { @@ -284,9 +284,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, allocblk = allociblk; if (nb == 0) { mutex_enter(&ump->um_lock); - pref = ffs_blkpref_ufs1(ip, lbn, 0, (int32_t *)0); - error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, - &newb); + pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); if (error) goto fail; nb = newb; @@ -341,9 +341,10 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, } mutex_enter(&ump->um_lock); if (pref == 0) - pref = 
ffs_blkpref_ufs1(ip, lbn, 0, (int32_t *)0); - error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, - &newb); + pref = ffs_blkpref_ufs1(ip, lbn, 0, flags | B_METAONLY, + NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); if (error) { brelse(bp, 0); goto fail; @@ -404,8 +405,9 @@ ffs_balloc_ufs1(struct vnode *vp, off_t off, int size, kauth_cred_t cred, goto fail; } mutex_enter(&ump->um_lock); - pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, &bap[0]); - error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, + pref = ffs_blkpref_ufs1(ip, lbn, indirs[num].in_off, flags, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb); if (error) { brelse(bp, 0); @@ -619,7 +621,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, error = ffs_realloccg(ip, -1 - nb, dp->di_extb[nb], ffs_blkpref_ufs2(ip, lastlbn, (int)nb, - &dp->di_extb[0]), osize, + flags, &dp->di_extb[0]), + osize, (int)fs->fs_bsize, cred, &bp); if (error) return (error); @@ -679,8 +682,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, -1 - lbn, dp->di_extb[lbn], - ffs_blkpref_ufs2(ip, lbn, (int)lbn, - &dp->di_extb[0]), osize, nsize, cred, &bp); + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &dp->di_extb[0]), + osize, nsize, cred, &bp); if (error) return (error); bp->b_xflags |= BX_ALTDATA; @@ -696,8 +700,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, nsize = fs->fs_bsize; mutex_enter(&ump->um_lock); error = ffs_alloc(ip, lbn, - ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), - nsize, cred, &newb); + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &dp->di_extb[0]), + nsize, flags, cred, &newb); if (error) return (error); error = ffs_getblk(vp, -1 - lbn, fsbtodb(fs, newb), @@ -728,7 +733,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, if (osize < 
fs->fs_bsize && osize > 0) { mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, nb, - ffs_blkpref_ufs2(ip, lastlbn, nb, + ffs_blkpref_ufs2(ip, lastlbn, nb, flags, &ip->i_ffs2_db[0]), osize, (int)fs->fs_bsize, cred, bpp, &newb); if (error) @@ -809,9 +814,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, */ mutex_enter(&ump->um_lock); error = ffs_realloccg(ip, lbn, - ffs_blkpref_ufs2(ip, lbn, (int)lbn, - &ip->i_ffs2_db[0]), osize, nsize, cred, - bpp, &newb); + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &ip->i_ffs2_db[0]), + osize, nsize, cred, bpp, &newb); if (error) return (error); if (DOINGSOFTDEP(vp)) @@ -832,8 +837,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, nsize = fs->fs_bsize; mutex_enter(&ump->um_lock); error = ffs_alloc(ip, lbn, - ffs_blkpref_ufs2(ip, lbn, (int)lbn, - &ip->i_ffs2_db[0]), nsize, cred, &newb); + ffs_blkpref_ufs2(ip, lbn, (int)lbn, flags, + &ip->i_ffs2_db[0]), + nsize, flags, cred, &newb); if (error) return (error); if (bpp != NULL) { @@ -870,9 +876,9 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, allocblk = allociblk; if (nb == 0) { mutex_enter(&ump->um_lock); - pref = ffs_blkpref_ufs2(ip, lbn, 0, (int64_t *)0); - error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, - &newb); + pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); if (error) goto fail; nb = newb; @@ -927,9 +933,10 @@ ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, } mutex_enter(&ump->um_lock); if (pref == 0) - pref = ffs_blkpref_ufs2(ip, lbn, 0, (int64_t *)0); - error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, - &newb); + pref = ffs_blkpref_ufs2(ip, lbn, 0, flags | B_METAONLY, + NULL); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | B_METAONLY, cred, &newb); if (error) { brelse(bp, 0); goto fail; @@ -990,8 +997,9 @@ 
ffs_balloc_ufs2(struct vnode *vp, off_t off, int size, kauth_cred_t cred, goto fail; } mutex_enter(&ump->um_lock); - pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, &bap[0]); - error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, + pref = ffs_blkpref_ufs2(ip, lbn, indirs[num].in_off, flags, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, flags, cred, &newb); if (error) { brelse(bp, 0); diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index d7b69dcfb359..b17c486f9dff 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -1,4 +1,4 @@ -/* $NetBSD: ffs_extern.h,v 1.66 2008/06/28 01:34:05 rumble Exp $ */ +/* $NetBSD: ffs_extern.h,v 1.67 2008/07/31 05:38:06 simonb Exp $ */ /*- * Copyright (c) 1991, 1993, 1994 @@ -84,9 +84,10 @@ __BEGIN_DECLS #include #include +#include /* ffs_alloc.c */ -int ffs_alloc(struct inode *, daddr_t, daddr_t , int, kauth_cred_t, +int ffs_alloc(struct inode *, daddr_t, daddr_t , int, int, kauth_cred_t, daddr_t *); int ffs_realloccg(struct inode *, daddr_t, daddr_t, int, int , kauth_cred_t, struct buf **, daddr_t *); @@ -94,8 +95,9 @@ int ffs_realloccg(struct inode *, daddr_t, daddr_t, int, int , int ffs_reallocblks(void *); #endif int ffs_valloc(struct vnode *, int, kauth_cred_t, struct vnode **); -daddr_t ffs_blkpref_ufs1(struct inode *, daddr_t, int, int32_t *); -daddr_t ffs_blkpref_ufs2(struct inode *, daddr_t, int, int64_t *); +daddr_t ffs_blkpref_ufs1(struct inode *, daddr_t, int, int, int32_t *); +daddr_t ffs_blkpref_ufs2(struct inode *, daddr_t, int, int, int64_t *); +int ffs_blkalloc(struct inode *, daddr_t, long); void ffs_blkfree(struct fs *, struct vnode *, daddr_t, long, ino_t); int ffs_vfree(struct vnode *, ino_t, int); void ffs_clusteracct(struct fs *, struct cg *, int32_t, int); @@ -175,6 +177,17 @@ void softdep_setup_allocindir_page(struct inode *, daddr_t, void softdep_fsync_mountdev(struct vnode *); int softdep_sync_metadata(struct vnode *); +/* Write Ahead Physical 
Block Logging */ +void ffs_wapbl_verify_inodes(struct mount *, const char *); +void ffs_wapbl_replay_finish(struct mount *); +int ffs_wapbl_start(struct mount *); +int ffs_wapbl_stop(struct mount *, int); +int ffs_wapbl_replay_start(struct mount *, struct fs *, struct vnode *); +void ffs_wapbl_blkalloc(struct fs *, struct vnode *, daddr_t, int); + +void ffs_wapbl_sync_metadata(struct mount *, daddr_t *, int *, int); +void ffs_wapbl_abort_sync_metadata(struct mount *, daddr_t *, int *, int); + extern int (**ffs_vnodeop_p)(void *); extern int (**ffs_specop_p)(void *); extern int (**ffs_fifoop_p)(void *); diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index e037ede2aea5..706ac653de92 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -1,4 +1,33 @@ -/* $NetBSD: ffs_inode.c,v 1.97 2008/06/03 09:47:49 hannken Exp $ */ +/* $NetBSD: ffs_inode.c,v 1.98 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -32,7 +61,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.97 2008/06/03 09:47:49 hannken Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.98 2008/07/31 05:38:06 simonb Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" @@ -41,23 +70,25 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.97 2008/06/03 09:47:49 hannken Exp $ #include #include -#include -#include -#include #include -#include +#include +#include +#include #include #include -#include +#include +#include #include -#include -#include +#include +#include +#include #include #include #include #include #include +#include #include #include @@ -128,6 +159,17 @@ ffs_update(struct vnode *vp, const struct timespec *acc, softdep_update_inodeblock(ip, bp, waitfor); } else if (ip->i_ffs_effnlink != ip->i_nlink) panic("ffs_update: bad link cnt"); + /* Keep unlinked inode list up to date */ + KDASSERT(DIP(ip, nlink) == ip->i_nlink); + if (ip->i_mode) { + if (ip->i_nlink > 0) { + UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp, + ip->i_number, ip->i_mode); + } else { + UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, + ip->i_number, ip->i_mode); + } + } if (fs->fs_magic == FS_UFS1_MAGIC) { cp = (char *)bp->b_data + (ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE); @@ -411,8 +453,13 @@ ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) blocksreleased += count; if (lastiblock[level] < 0) { DIP_ASSIGN(oip, 
ib[level], 0); - ffs_blkfree(fs, oip->i_devvp, bn, fs->fs_bsize, - oip->i_number); + if (oip->i_ump->um_mountp->mnt_wapbl) { + UFS_WAPBL_REGISTER_DEALLOCATION( + oip->i_ump->um_mountp, + fsbtodb(fs, bn), fs->fs_bsize); + } else + ffs_blkfree(fs, oip->i_devvp, bn, + fs->fs_bsize, oip->i_number); blocksreleased += nblocks; } } @@ -434,7 +481,12 @@ ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) continue; DIP_ASSIGN(oip, db[i], 0); bsize = blksize(fs, oip, i); - ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number); + if ((oip->i_ump->um_mountp->mnt_wapbl) && + (ovp->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION(oip->i_ump->um_mountp, + fsbtodb(fs, bn), bsize); + } else + ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number); blocksreleased += btodb(bsize); } if (lastblock < 0) @@ -468,8 +520,14 @@ ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred) * required for the storage we're keeping. */ bn += numfrags(fs, newspace); - ffs_blkfree(fs, oip->i_devvp, bn, oldspace - newspace, - oip->i_number); + if ((oip->i_ump->um_mountp->mnt_wapbl) && + (ovp->v_type != VREG)) { + UFS_WAPBL_REGISTER_DEALLOCATION( + oip->i_ump->um_mountp, fsbtodb(fs, bn), + oldspace - newspace); + } else + ffs_blkfree(fs, oip->i_devvp, bn, + oldspace - newspace, oip->i_number); blocksreleased += btodb(oldspace - newspace); } } @@ -494,6 +552,7 @@ done: DIP_ADD(oip, blocks, -blocksreleased); genfs_node_unlock(ovp); oip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0); #ifdef QUOTA (void) chkdq(oip, -blocksreleased, NOCRED, 0); #endif @@ -621,7 +680,13 @@ ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn, allerror = error; blocksreleased += blkcount; } - ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize, ip->i_number); + if ((ip->i_ump->um_mountp->mnt_wapbl) && + ((level > SINGLE) || (ITOV(ip)->v_type != VREG))) { + UFS_WAPBL_REGISTER_DEALLOCATION(ip->i_ump->um_mountp, + fsbtodb(fs, nb), fs->fs_bsize); + 
} else + ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize, + ip->i_number); blocksreleased += nblocks; } diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 78da484a0038..d375c7c13bbb 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,4 +1,33 @@ -/* $NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp $ */ +/* $NetBSD: ffs_vfsops.c,v 1.231 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ /* * Copyright (c) 1989, 1991, 1993, 1994 @@ -32,12 +61,13 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.231 2008/07/31 05:38:06 simonb Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" #include "opt_softdep.h" +#include "opt_wapbl.h" #endif #include @@ -61,6 +91,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp #include #include #include +#include #include #include @@ -73,6 +104,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.230 2008/06/28 01:34:05 rumble Exp #include #include #include +#include #include #include @@ -243,11 +275,17 @@ ffs_mountroot(void) vrele(rootvp); return (error); } + + /* + * We always need to be able to mount the root file system. + */ + mp->mnt_flag |= MNT_FORCE; if ((error = ffs_mountfs(rootvp, mp, l)) != 0) { vfs_unbusy(mp, false, NULL); vfs_destroy(mp); return (error); } + mp->mnt_flag &= ~MNT_FORCE; mutex_enter(&mountlist_lock); CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); mutex_exit(&mountlist_lock); @@ -261,6 +299,8 @@ ffs_mountroot(void) return (0); } +static int dolog; + /* * VFS Operations. * @@ -278,6 +318,9 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) int error = 0, flags, update; mode_t accessmode; + if (dolog) + mp->mnt_flag |= MNT_LOG; + if (*data_len < sizeof *args) return EINVAL; @@ -378,13 +421,31 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) return (error); } +#ifdef WAPBL + /* + * WAPBL can only be enabled on a r/w mount + * that does not use softdep. 
+ */ + if ((mp->mnt_flag & MNT_RDONLY) && !(mp->mnt_iflag & IMNT_WANTRDWR)) { + mp->mnt_flag &= ~MNT_LOG; + } + if ((mp->mnt_flag & (MNT_SOFTDEP | MNT_LOG)) == + (MNT_SOFTDEP | MNT_LOG)) { + printf("%s fs is journalled, ignoring soft update mode\n", + VFSTOUFS(mp)->um_fs->fs_fsmnt); + mp->mnt_flag &= ~MNT_SOFTDEP; + } +#else /* !WAPBL */ + mp->mnt_flag &= ~MNT_LOG; +#endif /* !WAPBL */ + if (!update) { int xflags; if (mp->mnt_flag & MNT_RDONLY) xflags = FREAD; else - xflags = FREAD|FWRITE; + xflags = FREAD | FWRITE; error = VOP_OPEN(devvp, xflags, FSCRED); if (error) goto fail; @@ -439,6 +500,8 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } + if (error == 0) + error = UFS_WAPBL_BEGIN(mp); if (error == 0 && ffs_cgupdate(ump, MNT_WAIT) == 0 && fs->fs_clean & FS_WASCLEAN) { @@ -447,8 +510,24 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) fs->fs_clean = FS_ISCLEAN; (void) ffs_sbupdate(ump, MNT_WAIT); } + if (error == 0) + UFS_WAPBL_END(mp); if (error) return (error); + } + +#ifdef WAPBL + if ((mp->mnt_flag & MNT_LOG) == 0) { + error = ffs_wapbl_stop(mp, mp->mnt_flag & MNT_FORCE); + if (error) + return error; + } +#endif /* WAPBL */ + + if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { + /* + * Finish change from r/w to r/o + */ fs->fs_ronly = 1; fs->fs_fmod = 0; } @@ -508,9 +587,30 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) if (error) return (error); } +#ifdef WAPBL + if (fs->fs_flags & FS_DOWAPBL) { + printf("%s: replaying log to disk\n", + fs->fs_fsmnt); + KDASSERT(mp->mnt_wapbl_replay); + error = wapbl_replay_write(mp->mnt_wapbl_replay, + devvp); + if (error) { + return error; + } + wapbl_replay_stop(mp->mnt_wapbl_replay); + fs->fs_clean = FS_WASCLEAN; + } +#endif /* WAPBL */ if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); } + +#ifdef WAPBL + error = ffs_wapbl_start(mp); + if (error) + return error; 
+#endif /* WAPBL */ + if (args->fspec == NULL) return EINVAL; if ((mp->mnt_flag & (MNT_SOFTDEP | MNT_ASYNC)) == @@ -531,17 +631,24 @@ ffs_mount(struct mount *mp, const char *path, void *data, size_t *data_len) else fs->fs_flags &= ~FS_DOSOFTDEP; if (fs->fs_fmod != 0) { /* XXX */ + int err; + fs->fs_fmod = 0; if (fs->fs_clean & FS_WASCLEAN) fs->fs_time = time_second; else { - printf("%s: file system not clean (fs_clean=%x); please fsck(8)\n", - mp->mnt_stat.f_mntfromname, fs->fs_clean); + printf("%s: file system not clean (fs_clean=%#x); " + "please fsck(8)\n", mp->mnt_stat.f_mntfromname, + fs->fs_clean); printf("%s: lost blocks %" PRId64 " files %d\n", mp->mnt_stat.f_mntfromname, fs->fs_pendingblocks, fs->fs_pendinginodes); } - (void) ffs_cgupdate(ump, MNT_WAIT); + err = UFS_WAPBL_BEGIN(mp); + if (err == 0) { + (void) ffs_cgupdate(ump, MNT_WAIT); + UFS_WAPBL_END(mp); + } } return (error); @@ -659,7 +766,7 @@ ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) return (error); } error = ffs_appleufs_validate(fs->fs_fsmnt, - (struct appleufslabel *)bp->b_data,NULL); + (struct appleufslabel *)bp->b_data, NULL); if (error == 0) ump->um_flags |= UFS_ISAPPLEUFS; brelse(bp, 0); @@ -686,6 +793,17 @@ ffs_reload(struct mount *mp, kauth_cred_t cred, struct lwp *l) ffs_oldfscompat_read(fs, ump, sblockloc); mutex_enter(&ump->um_lock); ump->um_maxfilesize = fs->fs_maxfilesize; + + if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { + uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", + mp->mnt_stat.f_mntonname, fs->fs_flags, + (mp->mnt_flag & MNT_FORCE) ? 
"" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + mutex_exit(&ump->um_lock); + return (EINVAL); + } + } + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; @@ -839,6 +957,17 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) if (error) return error; + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); + memset(ump, 0, sizeof *ump); + mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE); + error = ffs_snapshot_init(ump); + if (error) + goto out; + ump->um_ops = &ffs_ufsops; + +#ifdef WAPBL + sbagain: +#endif /* * Try reading the superblock in each of its possible locations. */ @@ -916,15 +1045,7 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) fs = malloc((u_long)sbsize, M_UFSMNT, M_WAITOK); memcpy(fs, bp->b_data, sbsize); - - ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); - memset(ump, 0, sizeof *ump); - mutex_init(&ump->um_lock, MUTEX_DEFAULT, IPL_NONE); - error = ffs_snapshot_init(ump); - if (error) - goto out; ump->um_fs = fs; - ump->um_ops = &ffs_ufsops; #ifdef FFS_EI if (needswap) { @@ -934,9 +1055,52 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) #endif fs->fs_flags &= ~FS_SWAPPED; +#ifdef WAPBL + if ((mp->mnt_wapbl_replay == 0) && (fs->fs_flags & FS_DOWAPBL)) { + error = ffs_wapbl_replay_start(mp, fs, devvp); + if (error) + goto out; + + if (!ronly) { + /* XXX fsmnt may be stale. 
*/ + printf("%s: replaying log to disk\n", fs->fs_fsmnt); + error = wapbl_replay_write(mp->mnt_wapbl_replay, devvp); + if (error) + goto out; + wapbl_replay_stop(mp->mnt_wapbl_replay); + fs->fs_clean = FS_WASCLEAN; + } else { + /* XXX fsmnt may be stale */ + printf("%s: replaying log to memory\n", fs->fs_fsmnt); + } + + /* Force a re-read of the superblock */ + brelse(bp, BC_INVAL); + bp = NULL; + free(fs, M_UFSMNT); + fs = NULL; + goto sbagain; + } +#else /* !WAPBL */ + if ((fs->fs_flags & FS_DOWAPBL) && (mp->mnt_flag & MNT_FORCE) == 0) { + error = EPERM; + goto out; + } +#endif /* !WAPBL */ + ffs_oldfscompat_read(fs, ump, sblockloc); ump->um_maxfilesize = fs->fs_maxfilesize; + if (fs->fs_flags & ~(FS_KNOWN_FLAGS | FS_INTERNAL)) { + uprintf("%s: unknown ufs flags: 0x%08"PRIx32"%s\n", + mp->mnt_stat.f_mntonname, fs->fs_flags, + (mp->mnt_flag & MNT_FORCE) ? "" : ", not mounting"); + if ((mp->mnt_flag & MNT_FORCE) == 0) { + error = EINVAL; + goto out; + } + } + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; @@ -966,7 +1130,7 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) if (error) goto out; error = ffs_appleufs_validate(fs->fs_fsmnt, - (struct appleufslabel *)bp->b_data,NULL); + (struct appleufslabel *)bp->b_data, NULL); if (error == 0) { ump->um_flags |= UFS_ISAPPLEUFS; } @@ -980,6 +1144,36 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) } #endif +#if 0 +/* + * XXX This code changes the behaviour of mounting dirty filesystems, to + * XXX require "mount -f ..." to mount them. This doesn't match what + * XXX mount(8) describes and is disabled for now. + */ + /* + * If the file system is not clean, don't allow it to be mounted + * unless MNT_FORCE is specified. (Note: MNT_FORCE is always set + * for the root file system.) 
+ */ + if (fs->fs_flags & FS_DOWAPBL) { + /* + * wapbl normally expects to be FS_WASCLEAN when the FS_DOWAPBL + * bit is set, although there's a window in unmount where it + * could be FS_ISCLEAN + */ + if ((mp->mnt_flag & MNT_FORCE) == 0 && + (fs->fs_clean & (FS_WASCLEAN | FS_ISCLEAN)) == 0) { + error = EPERM; + goto out; + } + } else + if ((fs->fs_clean & FS_ISCLEAN) == 0 && + (mp->mnt_flag & MNT_FORCE) == 0) { + error = EPERM; + goto out; + } +#endif + /* * verify that we can access the last block in the fs * if we're mounting read/write. @@ -999,10 +1193,12 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) } fs->fs_ronly = ronly; - if (ronly == 0) { - fs->fs_clean <<= 1; - fs->fs_fmod = 1; - } + /* Don't bump fs_clean if we're replaying journal */ + if (!((fs->fs_flags & FS_DOWAPBL) && (fs->fs_clean & FS_WASCLEAN))) + if (ronly == 0) { + fs->fs_clean <<= 1; + fs->fs_fmod = 1; + } size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) @@ -1095,6 +1291,24 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) goto out; } } + +#ifdef WAPBL + if (!ronly) { + KDASSERT(fs->fs_ronly == 0); + /* + * ffs_wapbl_start() needs mp->mnt_stat initialised if it + * needs to create a new log file in-filesystem. 
+ */ + ffs_statvfs(mp, &mp->mnt_stat); + + error = ffs_wapbl_start(mp); + if (error) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } + } +#endif /* WAPBL */ + if (ronly == 0 && fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); #ifdef UFS_EXTATTR @@ -1115,6 +1329,15 @@ ffs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l) #endif /* UFS_EXTATTR */ return (0); out: +#ifdef WAPBL + if (mp->mnt_wapbl_replay) { + if (wapbl_replay_isopen(mp->mnt_wapbl_replay)) + wapbl_replay_stop(mp->mnt_wapbl_replay); + wapbl_replay_free(mp->mnt_wapbl_replay); + mp->mnt_wapbl_replay = 0; + } +#endif + fstrans_unmount(mp); if (fs) free(fs, M_UFSMNT); @@ -1175,7 +1398,7 @@ ffs_oldfscompat_read(struct fs *fs, struct ufsmount *ump, daddr_t sblockloc) fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_sblockloc = sblockloc; - fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL); + fs->fs_flags = fs->fs_old_flags | (fs->fs_flags & FS_INTERNAL); if (fs->fs_old_postblformat == FS_42POSTBLFMT) { fs->fs_old_nrpos = 8; @@ -1256,6 +1479,9 @@ ffs_unmount(struct mount *mp, int mntflags) struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs = ump->um_fs; int error, flags, penderr; +#ifdef WAPBL + extern int doforce; +#endif penderr = 0; flags = 0; @@ -1284,25 +1510,42 @@ ffs_unmount(struct mount *mp, int mntflags) penderr = 1; } mutex_exit(&ump->um_lock); - if (fs->fs_ronly == 0 && - ffs_cgupdate(ump, MNT_WAIT) == 0 && - fs->fs_clean & FS_WASCLEAN) { - /* - * XXXX don't mark fs clean in the case of softdep - * pending block errors, until they are fixed. - */ - if (penderr == 0) { - if (mp->mnt_flag & MNT_SOFTDEP) - fs->fs_flags &= ~FS_DOSOFTDEP; - fs->fs_clean = FS_ISCLEAN; + error = UFS_WAPBL_BEGIN(mp); + if (error == 0) + if (fs->fs_ronly == 0 && + ffs_cgupdate(ump, MNT_WAIT) == 0 && + fs->fs_clean & FS_WASCLEAN) { + /* + * XXXX don't mark fs clean in the case of softdep + * pending block errors, until they are fixed. 
+ */ + if (penderr == 0) { + if (mp->mnt_flag & MNT_SOFTDEP) + fs->fs_flags &= ~FS_DOSOFTDEP; + fs->fs_clean = FS_ISCLEAN; + } + fs->fs_fmod = 0; + (void) ffs_sbupdate(ump, MNT_WAIT); } - fs->fs_fmod = 0; - (void) ffs_sbupdate(ump, MNT_WAIT); + if (error == 0) + UFS_WAPBL_END(mp); +#ifdef WAPBL + KASSERT(!(mp->mnt_wapbl_replay && mp->mnt_wapbl)); + if (mp->mnt_wapbl_replay) { + KDASSERT(fs->fs_ronly); + wapbl_replay_stop(mp->mnt_wapbl_replay); + wapbl_replay_free(mp->mnt_wapbl_replay); + mp->mnt_wapbl_replay = 0; } + error = ffs_wapbl_stop(mp, doforce && (mntflags & MNT_FORCE)); + if (error) { + return error; + } +#endif /* WAPBL */ if (ump->um_devvp->v_type != VBAD) ump->um_devvp->v_specmountpoint = NULL; vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); - (void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD|FWRITE, + (void)VOP_CLOSE(ump->um_devvp, fs->fs_ronly ? FREAD : FREAD | FWRITE, NOCRED); vput(ump->um_devvp); free(fs->fs_csp, M_UFSMNT); @@ -1335,7 +1578,7 @@ ffs_flushfiles(struct mount *mp, int flags, struct lwp *l) #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; - if ((error = vflush(mp, NULLVP, SKIPSYSTEM|flags)) != 0) + if ((error = vflush(mp, NULLVP, SKIPSYSTEM | flags)) != 0) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) @@ -1363,6 +1606,19 @@ ffs_flushfiles(struct mount *mp, int flags, struct lwp *l) vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, l->l_cred, FSYNC_WAIT, 0, 0); VOP_UNLOCK(ump->um_devvp, 0); + if (flags & FORCECLOSE) /* XXXDBJ */ + error = 0; + +#ifdef WAPBL + if (error) + return error; + if (mp->mnt_wapbl) { + error = wapbl_flush(mp->mnt_wapbl, 1); + if (flags & FORCECLOSE) + error = 0; + } +#endif + return (error); } @@ -1447,10 +1703,11 @@ loop: continue; mutex_enter(&vp->v_interlock); ip = VTOI(vp); - if (ip == NULL || (vp->v_iflag & (VI_XLOCK|VI_CLEAN)) != 0 || + /* XXXpooka: why wapbl check? 
*/ + if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 || vp->v_type == VNON || ((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && - LIST_EMPTY(&vp->v_dirtyblkhd) && + (LIST_EMPTY(&vp->v_dirtyblkhd) || (mp->mnt_wapbl)) && UVM_OBJ_IS_CLEAN(&vp->v_uobj))) { mutex_exit(&vp->v_interlock); @@ -1471,11 +1728,16 @@ loop: } continue; } - if (vp->v_type == VREG && waitfor == MNT_LAZY) - error = ffs_update(vp, NULL, NULL, 0); - else - error = VOP_FSYNC(vp, cred, - waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0); + if (vp->v_type == VREG && waitfor == MNT_LAZY) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (!error) { + error = ffs_update(vp, NULL, NULL, 0); + UFS_WAPBL_END(vp->v_mount); + } + } else { + error = VOP_FSYNC(vp, cred, FSYNC_NOLOG | + (waitfor == MNT_WAIT ? FSYNC_WAIT : 0), 0, 0); + } if (error) allerror = error; vput(vp); @@ -1498,10 +1760,11 @@ loop: !LIST_EMPTY(&ump->um_devvp->v_dirtyblkhd))) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, cred, - waitfor == MNT_WAIT ? FSYNC_WAIT : 0, 0, 0)) != 0) + (waitfor == MNT_WAIT ? 
FSYNC_WAIT : 0) | FSYNC_NOLOG, + 0, 0)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp, 0); - if (allerror == 0 && waitfor == MNT_WAIT) { + if (allerror == 0 && waitfor == MNT_WAIT && !mp->mnt_wapbl) { mutex_enter(&mntvnode_lock); goto loop; } @@ -1515,9 +1778,24 @@ loop: if (fs->fs_fmod != 0) { fs->fs_fmod = 0; fs->fs_time = time_second; - if ((error = ffs_cgupdate(ump, waitfor))) + error = UFS_WAPBL_BEGIN(mp); + if (error) + allerror = error; + else { + if ((error = ffs_cgupdate(ump, waitfor))) + allerror = error; + UFS_WAPBL_END(mp); + } + } + +#ifdef WAPBL + if (mp->mnt_wapbl) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) allerror = error; } +#endif + fstrans_done(mp); vnfree(mvp); return (allerror); diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 071a78b9ceed..881476b4166a 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -1,4 +1,33 @@ -/* $NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $ */ +/* $NetBSD: ffs_vnops.c,v 1.100 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 1982, 1986, 1989, 1993 @@ -32,7 +61,12 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.100 2008/07/31 05:38:06 simonb Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#include "opt_wapbl.h" +#endif #include #include @@ -48,6 +82,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $"); #include #include #include +#include #include #include @@ -58,6 +93,7 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.99 2008/04/29 18:18:09 ad Exp $"); #include #include #include +#include #include #include @@ -246,6 +282,9 @@ ffs_fsync(void *v) int bsize; daddr_t blk_high; struct vnode *vp; +#ifdef WAPBL + struct mount *mp; +#endif vp = ap->a_vp; @@ -255,7 +294,11 @@ ffs_fsync(void *v) */ if ((ap->a_offlo == 0 && ap->a_offhi == 0) || DOINGSOFTDEP(vp) || (vp->v_type != VREG)) { - error = ffs_full_fsync(vp, ap->a_flags); + int flags = ap->a_flags; + + if (vp->v_type == VBLK) + flags |= FSYNC_VFS; + error = ffs_full_fsync(vp, flags); goto out; } @@ -276,6 +319,36 @@ ffs_fsync(void *v) goto out; } +#ifdef WAPBL + mp = wapbl_vptomp(vp); + if (mp->mnt_wapbl) { + if (ap->a_flags & FSYNC_DATAONLY) { + 
fstrans_done(vp->v_mount); + return 0; + } + error = 0; + if (vp->v_tag == VT_UFS && VTOI(vp)->i_flag & + (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | + IN_MODIFIED | IN_ACCESSED)) { + error = UFS_WAPBL_BEGIN(mp); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + error = ffs_update(vp, NULL, NULL, + (ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); + UFS_WAPBL_END(mp); + } + if (error || (ap->a_flags & FSYNC_NOLOG)) { + fstrans_done(vp->v_mount); + return error; + } + error = wapbl_flush(mp->mnt_wapbl, 0); + fstrans_done(vp->v_mount); + return error; + } +#endif /* WAPBL */ + /* * Then, flush indirect blocks. */ @@ -350,7 +423,7 @@ ffs_full_fsync(struct vnode *vp, int flags) */ if (vp->v_type == VREG || vp->v_type == VBLK) { - if ((flags & FSYNC_VFS) != 0) + if ((flags & FSYNC_VFS) != 0 && vp->v_specmountpoint != NULL) mp = vp->v_specmountpoint; else mp = vp->v_mount; @@ -360,8 +433,55 @@ ffs_full_fsync(struct vnode *vp, int flags) PGO_FREE : 0)); if (error) return error; - } else + } else { + mp = vp->v_mount; mutex_exit(&vp->v_interlock); + } + +#ifdef WAPBL + if (mp && mp->mnt_wapbl) { + error = 0; + if (flags & FSYNC_DATAONLY) + return error; + + if (VTOI(vp) && (VTOI(vp)->i_flag & + (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY | + IN_MODIFIED | IN_ACCESSED))) { + error = UFS_WAPBL_BEGIN(mp); + if (error) + return error; + error = ffs_update(vp, NULL, NULL, + (flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); + UFS_WAPBL_END(mp); + } + if (error || (flags & FSYNC_NOLOG)) + return error; + /* + * Don't flush the log if the vnode being flushed + * contains no dirty buffers that could be in the log. + */ + if (!((flags & FSYNC_RECLAIM) && + LIST_EMPTY(&vp->v_dirtyblkhd))) { + error = wapbl_flush(mp->mnt_wapbl, 0); + if (error) + return error; + } + + /* + * XXX temporary workaround for "dirty bufs" panic in + * vinvalbuf. need a full fix for the v_numoutput + * waiters issues. 
+ */ + if (flags & FSYNC_WAIT) { + mutex_enter(&vp->v_interlock); + while (vp->v_numoutput) + cv_wait(&vp->v_cv, &vp->v_interlock); + mutex_exit(&vp->v_interlock); + } + + return error; + } +#endif /* WAPBL */ passes = NIADDR + 1; skipmeta = 0; @@ -453,8 +573,10 @@ loop: if (error == 0 && flags & FSYNC_CACHE) { int i = 0; - if ((flags & FSYNC_VFS) == 0) + if ((flags & FSYNC_VFS) == 0) { + KASSERT(VTOI(vp) != NULL); vp = VTOI(vp)->i_devvp; + } VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE, curlwp->l_cred); } diff --git a/sys/ufs/ffs/ffs_wapbl.c b/sys/ufs/ffs/ffs_wapbl.c new file mode 100644 index 000000000000..e91050533413 --- /dev/null +++ b/sys/ufs/ffs/ffs_wapbl.c @@ -0,0 +1,858 @@ +/* $NetBSD: ffs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ffs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_ffs.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#undef WAPBL_DEBUG +#ifdef WAPBL_DEBUG +int ffs_wapbl_debug = 1; +#define DPRINTF(fmt, args...) \ +do { \ + if (ffs_wapbl_debug) \ + printf("%s:%d "fmt, __func__ , __LINE__, ##args); \ +} while (/* CONSTCOND */0) +#else +#define DPRINTF(fmt, args...) 
\ +do { \ + /* nothing */ \ +} while (/* CONSTCOND */0) +#endif + +static int wapbl_log_position(struct mount *, struct fs *, struct vnode *, + daddr_t *, size_t *, size_t *, uint64_t *); +static int wapbl_create_infs_log(struct mount *, struct fs *, struct vnode *, + daddr_t *, size_t *, size_t *, uint64_t *); +static void wapbl_find_log_start(struct mount *, struct vnode *, off_t, + daddr_t *, daddr_t *, size_t *); +static int wapbl_remove_log(struct mount *); +static int wapbl_allocate_log_file(struct mount *, struct vnode *); + +/* + * This function is invoked after a log is replayed to + * disk to perform logical cleanup actions as described by + * the log + */ +void +ffs_wapbl_replay_finish(struct mount *mp) +{ + struct wapbl_replay *wr = mp->mnt_wapbl_replay; + int i; + int error; + + if (!wr) + return; + + KDASSERT((mp->mnt_flag & MNT_RDONLY) == 0); + + for (i = 0; i < wr->wr_inodescnt; i++) { + struct vnode *vp; + struct inode *ip; + error = VFS_VGET(mp, wr->wr_inodes[i].wr_inumber, &vp); + if (error) { + printf("ffs_wapbl_replay_finish: " + "unable to cleanup inode %" PRIu32 "\n", + wr->wr_inodes[i].wr_inumber); + continue; + } + ip = VTOI(vp); + KDASSERT(wr->wr_inodes[i].wr_inumber == ip->i_number); + printf("ffs_wapbl_replay_finish: " + "cleaning inode %" PRIu64 " size=%" PRIu64 " mode=%o nlink=%d\n", + ip->i_number, ip->i_size, ip->i_mode, ip->i_nlink); + KASSERT(ip->i_nlink == 0); + + /* + * The journal may have left partially allocated inodes in mode + * zero. This may occur if a crash occurs between the node + * allocation in ffs_nodeallocg and when the node is properly + * initialized in ufs_makeinode. If so, just deallocate them. 
+ */ + if (ip->i_mode == 0) { + UFS_WAPBL_BEGIN(mp); + ffs_vfree(vp, ip->i_number, wr->wr_inodes[i].wr_imode); + UFS_WAPBL_END(mp); + } + vput(vp); + } + mp->mnt_wapbl_replay = 0; + wapbl_replay_free(wr); +} + +/* Callback for wapbl */ +void +ffs_wapbl_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i, error; + +#ifdef WAPBL_DEBUG_INODES + ufs_wapbl_verify_inodes(mp, "ffs_wapbl_sync_metadata"); +#endif + + for (i = 0; i< dealloccnt; i++) { + /* + * blkfree errors are unreported, might silently fail + * if it cannot read the cylinder group block + */ + ffs_blkfree(fs, ump->um_devvp, + dbtofsb(fs, deallocblks[i]), dealloclens[i], -1); + } + + fs->fs_fmod = 0; + fs->fs_time = time_second; + error = ffs_cgupdate(ump, 0); + KASSERT(error == 0); +} + +void +ffs_wapbl_abort_sync_metadata(struct mount *mp, daddr_t *deallocblks, + int *dealloclens, int dealloccnt) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int i; + + /* + * I suppose we could dig around for an in-use inode, but + * it's not really used by ffs_blkalloc, so we just fake + * the couple of fields that it touches. + */ + struct inode in; + in.i_fs = fs; + in.i_devvp = ump->um_devvp; + in.i_dev = ump->um_dev; + in.i_number = -1; + in.i_uid = 0; + for (i = 0; i < dealloccnt; i++) { + /* + * Since the above blkfree may have failed, this blkalloc might + * fail as well, so don't check its error. Note that if the + * blkfree succeeded above, then this shouldn't fail because + * the buffer will be locked in the current transaction. 
+ */ + ffs_blkalloc(&in, dbtofsb(fs, deallocblks[i]), + dealloclens[i]); + } +} + +static int +wapbl_remove_log(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *vp; + struct inode *ip; + ino_t log_ino; + int error; + + /* If all the log locators are 0, just clean up */ + if (fs->fs_journallocs[0] == 0 && + fs->fs_journallocs[1] == 0 && + fs->fs_journallocs[2] == 0 && + fs->fs_journallocs[3] == 0) { + DPRINTF("empty locators, just clear\n"); + goto done; + } + + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_NONE: + /* nothing! */ + DPRINTF("no log\n"); + break; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + log_ino = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; + DPRINTF("in-fs log, ino = %" PRId64 "\n",log_ino); + + /* if no existing log inode, just clear all fields and bail */ + if (log_ino == 0) + goto done; + error = VFS_VGET(mp, log_ino, &vp); + if (error != 0) { + printf("ffs_wapbl: vget failed %d\n", + error); + /* clear out log info on error */ + goto done; + } + ip = VTOI(vp); + KASSERT(log_ino == ip->i_number); + if ((ip->i_flags & SF_LOG) == 0) { + printf("ffs_wapbl: try to clear non-log inode " + "%" PRId64 "\n", log_ino); + vput(vp); + /* clear out log info on error */ + goto done; + } + + /* + * remove the log inode by setting its link count back + * to zero and bail. 
+ */ + ip->i_ffs_effnlink = 0; + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + vput(vp); + + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("end-of-partition log\n"); + /* no extra work required */ + break; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + return EINVAL; + } + + +done: + /* Clear out all previous knowledge of journal */ + fs->fs_journal_version = 0; + fs->fs_journal_location = 0; + fs->fs_journal_flags = 0; + fs->fs_journallocs[0] = 0; + fs->fs_journallocs[1] = 0; + fs->fs_journallocs[2] = 0; + fs->fs_journallocs[3] = 0; + (void) ffs_sbupdate(ump, MNT_WAIT); + + return 0; +} + +int +ffs_wapbl_start(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + int error; + + if (mp->mnt_wapbl == 0) { + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CLEAR_LOG) { + /* Clear out any existing journal file */ + error = wapbl_remove_log(mp); + if (error != 0) + return error; + } + + if (mp->mnt_flag & MNT_LOG) { + KDASSERT(fs->fs_ronly == 0); + + error = wapbl_log_position(mp, fs, devvp, &off, + &count, &blksize, &extradata); + if (error) + return error; + + /* XXX any other consistency checks here? 
*/ + if (blksize != DEV_BSIZE) { + printf("%s: bad blocksize %zd\n", __func__, + blksize); + return EINVAL; + } + + error = wapbl_start(&mp->mnt_wapbl, mp, devvp, off, + count, blksize, mp->mnt_wapbl_replay, + ffs_wapbl_sync_metadata, + ffs_wapbl_abort_sync_metadata); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + +#ifdef WAPBL_DEBUG + printf("%s: enabling logging\n", fs->fs_fsmnt); +#endif + + if ((fs->fs_flags & FS_DOWAPBL) == 0) { + UFS_WAPBL_BEGIN(mp); + fs->fs_flags |= FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + if (error) { + UFS_WAPBL_END(mp); + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + UFS_WAPBL_END(mp); + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error) { + ffs_wapbl_stop(mp, MNT_FORCE); + return error; + } + } + } else if (fs->fs_flags & FS_DOWAPBL) { + fs->fs_fmod = 1; + fs->fs_flags &= ~FS_DOWAPBL; + } + } + + /* + * It is recommended that you finish replay with logging enabled. + * However, even if logging is not enabled, the remaining log + * replay should be safely recoverable with an fsck, so perform + * it anyway. + */ + if ((fs->fs_ronly == 0) && mp->mnt_wapbl_replay) { + int saveflag = mp->mnt_flag & MNT_RDONLY; + /* + * Make sure MNT_RDONLY is not set so that the inode + * cleanup in ufs_inactive will actually do its work. + */ + mp->mnt_flag &= ~MNT_RDONLY; + ffs_wapbl_replay_finish(mp); + mp->mnt_flag |= saveflag; + KASSERT(fs->fs_ronly == 0); + } + + return 0; +} + +int +ffs_wapbl_stop(struct mount *mp, int force) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + int error; + + if (mp->mnt_wapbl) { + KDASSERT(fs->fs_ronly == 0); + + /* + * Make sure turning off FS_DOWAPBL is only removed + * as the only change in the final flush since otherwise + * a transaction may reorder writes. 
+ */ + error = wapbl_flush(mp->mnt_wapbl, 1); + if (error && !force) + return error; + if (error && force) + goto forceout; + error = UFS_WAPBL_BEGIN(mp); + if (error && !force) + return error; + if (error && force) + goto forceout; + KASSERT(fs->fs_flags & FS_DOWAPBL); + + fs->fs_flags &= ~FS_DOWAPBL; + error = ffs_sbupdate(ump, MNT_WAIT); + KASSERT(error == 0); /* XXX a bit drastic! */ + UFS_WAPBL_END(mp); + forceout: + error = wapbl_stop(mp->mnt_wapbl, force); + if (error) { + KASSERT(!force); + fs->fs_flags |= FS_DOWAPBL; + return error; + } + fs->fs_flags &= ~FS_DOWAPBL; /* Repeat in case of forced error */ + mp->mnt_wapbl = 0; + +#ifdef WAPBL_DEBUG + printf("%s: disabled logging\n", fs->fs_fsmnt); +#endif + } + + return 0; +} + +int +ffs_wapbl_replay_start(struct mount *mp, struct fs *fs, struct vnode *devvp) +{ + int error; + daddr_t off; + size_t count; + size_t blksize; + uint64_t extradata; + + error = wapbl_log_position(mp, fs, devvp, &off, &count, &blksize, + &extradata); + + if (error) + return error; + + error = wapbl_replay_start(&mp->mnt_wapbl_replay, devvp, off, + count, blksize); + if (error) + return error; + + mp->mnt_wapbl_op = &wapbl_ops; + + return 0; +} + +/* + * If the superblock doesn't already have a recorded journal location + * then we allocate the journal in one of two positions: + * + * - At the end of the partition after the filesystem if there's + * enough space. "Enough space" is defined as >= 1MB of journal + * per 1GB of filesystem or 64MB, whichever is smaller. + * + * - Inside the filesystem. We try to allocate a contiguous journal + * based on the total filesystem size - the target is 1MB of journal + * per 1GB of filesystem, up to a maximum journal size of 64MB. As + * a worst case allowing for fragmentation, we'll allocate a journal + * 1/4 of the desired size but never smaller than 1MB. + * + * XXX In the future if we allow for non-contiguous journal files we + * can tighten the above restrictions. 
+ * + * XXX + * There seems to be a lot of duplication both here and in some of + * the userland tools (fsck_ffs, dumpfs, tunefs) with similar + * "switch (fs_journal_location)" constructs. Can we centralise + * this sort of code somehow/somewhere? + */ +static int +wapbl_log_position(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct partinfo dpart; + daddr_t logstart, logend, desired_logsize; + size_t blksize; + int error; + + if (fs->fs_journal_version == UFS_WAPBL_VERSION) { + switch (fs->fs_journal_location) { + case UFS_WAPBL_JOURNALLOC_END_PARTITION: + DPRINTF("found existing end-of-partition log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_EPART_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_EPART_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zd, " + "blksize = %zd\n", *startp, *countp, *blksizep); + return 0; + + case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM: + DPRINTF("found existing in-filesystem log\n"); + *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + DPRINTF(" start = %" PRId64 ", size = %zd, " + "blksize = %zd\n", *startp, *countp, *blksizep); + return 0; + + default: + printf("ffs_wapbl: unknown journal type %d\n", + fs->fs_journal_location); + return EINVAL; + } + } + + desired_logsize = + lfragtosize(fs, fs->fs_size) / UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("desired log size = %" PRId64 " kB\n", desired_logsize / 1024); + desired_logsize = max(desired_logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + desired_logsize = min(desired_logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted desired log size = %" PRId64 " kB\n", + desired_logsize / 1024); + + /* Is there space after the filesystem on partition for log? 
*/ + logstart = fsbtodb(fs, fs->fs_size); + error = VOP_IOCTL(devvp, DIOCGPART, &dpart, FREAD, FSCRED); + if (!error) { + logend = dpart.part->p_size; + blksize = dpart.disklab->d_secsize; + } else { + struct dkwedge_info dkw; + error = VOP_IOCTL(devvp, DIOCGWEDGEINFO, &dkw, FREAD, FSCRED); + if (error) + return error; + + blksize = DEV_BSIZE; + logend = dkw.dkw_size; + } + + if ((logend - logstart) >= desired_logsize) { + KDASSERT(blksize != 0); + DPRINTF("enough space, use end-of-partition log\n"); + + *startp = logstart; + *countp = (logend - logstart); + *blksizep = blksize; + *extradatap = 0; + + /* update superblock with log location */ + fs->fs_journal_version = UFS_WAPBL_VERSION; + fs->fs_journal_location = UFS_WAPBL_JOURNALLOC_END_PARTITION; + fs->fs_journal_flags = 0; + fs->fs_journallocs[UFS_WAPBL_EPART_ADDR] = *startp; + fs->fs_journallocs[UFS_WAPBL_EPART_COUNT] = *countp; + fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ] = *blksizep; + fs->fs_journallocs[UFS_WAPBL_EPART_UNUSED] = *extradatap; + + error = ffs_sbupdate(ump, MNT_WAIT); + return error; + } + DPRINTF("end-of-partition has only %" PRId64 " free\n", + logend - logstart); + + error = wapbl_create_infs_log(mp, fs, devvp, startp, countp, blksizep, + extradatap); + + ffs_sync(mp, 1, FSCRED); + + return error; +} + +/* + * Try to create a journal log inside the filesystem. 
+ */ +static int +wapbl_create_infs_log(struct mount *mp, struct fs *fs, struct vnode *devvp, + daddr_t *startp, size_t *countp, size_t *blksizep, uint64_t *extradatap) +{ + struct vnode *vp, *rvp; + struct inode *ip; + int error; + + if ((error = VFS_ROOT(mp, &rvp)) != 0) + return error; + + if ((error = UFS_VALLOC(rvp, 0 | S_IFREG, NOCRED, &vp)) != 0) { + vput(rvp); + return error; + } + vput(rvp); + + vp->v_type = VREG; + ip = VTOI(vp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = 0 | IFREG; + DIP_ASSIGN(ip, mode, ip->i_mode); + ip->i_flags = SF_LOG; + DIP_ASSIGN(ip, flags, ip->i_flags); + ip->i_ffs_effnlink = 1; + ip->i_nlink = 1; + DIP_ASSIGN(ip, nlink, 1); + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + ffs_update(vp, NULL, NULL, UPDATE_WAIT); + + if ((error = wapbl_allocate_log_file(mp, vp)) != 0) { + /* + * If we couldn't allocate the space for the log file, + * remove the inode by setting its link count back to + * zero and bail. + */ + ip->i_ffs_effnlink = 0; + ip->i_nlink = 0; + DIP_ASSIGN(ip, nlink, 0); + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + vput(vp); + + return error; + } + + /* + * Now that we have the place-holder inode for the journal, + * we don't need the vnode ever again. 
+ */ + vput(vp); + + *startp = fs->fs_journallocs[UFS_WAPBL_INFS_ADDR]; + *countp = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + *blksizep = fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]; + *extradatap = fs->fs_journallocs[UFS_WAPBL_INFS_INO]; + + return 0; +} + +int +wapbl_allocate_log_file(struct mount *mp, struct vnode *vp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + daddr_t addr, indir_addr; + off_t logsize; + size_t size; + int error; + + logsize = 0; + /* check if there's a suggested log size */ + if (fs->fs_journal_flags & UFS_WAPBL_FLAGS_CREATE_LOG && + fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) + logsize = fs->fs_journallocs[UFS_WAPBL_INFS_COUNT]; + + if (vp->v_size > 0) { + printf("%s: file size (%" PRId64 ") non zero\n", __func__, + vp->v_size); + return EEXIST; + } + wapbl_find_log_start(mp, vp, logsize, &addr, &indir_addr, &size); + if (addr == 0) { + printf("%s: log not allocated, largest extent is " + "%" PRId64 "MB\n", __func__, + lblktosize(fs, size) / (1024 * 1024)); + return ENOSPC; + } + + logsize = lblktosize(fs, size); /* final log size */ + + VTOI(vp)->i_ffs_first_data_blk = addr; + VTOI(vp)->i_ffs_first_indir_blk = indir_addr; + + error = GOP_ALLOC(vp, 0, logsize, B_CONTIG, FSCRED); + if (error) { + printf("%s: GOP_ALLOC error %d\n", __func__, error); + return error; + } + + fs->fs_journal_version = UFS_WAPBL_VERSION; + fs->fs_journal_location = UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM; + fs->fs_journal_flags = 0; + fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] = + lfragtosize(fs, addr) / DEV_BSIZE; + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] = logsize / DEV_BSIZE; + fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ] = DEV_BSIZE; + fs->fs_journallocs[UFS_WAPBL_INFS_INO] = VTOI(vp)->i_number; + + error = ffs_sbupdate(ump, MNT_WAIT); + return error; +} + +/* + * Find a suitable location for the journal in the filesystem. 
+ * + * Our strategy here is to look for a contiguous block of free space + * at least "logsize" bytes in size (plus room for any indirect blocks). + * We start at the middle of the filesystem and check each cylinder + * group working outwards. If "logsize" bytes are not available as a + * single contiguous chunk, then return the address and size of the + * largest chunk found. + * + * XXX + * At what stage does the search fail? Is it reasonable to fail if the + * largest space we could find is less than a quarter of the requested + * space? If the search fails entirely, return a block address of "0" + * to indicate this. + */ +static void +wapbl_find_log_start(struct mount *mp, struct vnode *vp, off_t logsize, + daddr_t *addr, daddr_t *indir_addr, size_t *size) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs = ump->um_fs; + struct vnode *devvp = ump->um_devvp; + struct cg *cgp; + struct buf *bp; + uint8_t *blksfree; + daddr_t blkno, best_addr, start_addr; + daddr_t desired_blks, min_desired_blks; + daddr_t freeblks, best_blks; + int bpcg, cg, error, fixedsize, indir_blks, n, s; +#ifdef FFS_EI + const int needswap = UFS_FSNEEDSWAP(fs); +#endif + + if (logsize == 0) { + fixedsize = 0; /* We can adjust the size if tight */ + logsize = lfragtosize(fs, fs->fs_dsize) / + UFS_WAPBL_JOURNAL_SCALE; + DPRINTF("suggested log size = %" PRId64 "\n", logsize); + logsize = max(logsize, UFS_WAPBL_MIN_JOURNAL_SIZE); + logsize = min(logsize, UFS_WAPBL_MAX_JOURNAL_SIZE); + DPRINTF("adjusted log size = %" PRId64 "\n", logsize); + } else { + fixedsize = 1; + DPRINTF("fixed log size = %" PRId64 "\n", logsize); + } + + desired_blks = logsize / fs->fs_bsize; + DPRINTF("desired blocks = %" PRId64 "\n", desired_blks); + + /* add in number of indirect blocks needed */ + indir_blks = 0; + if (desired_blks >= NDADDR) { + struct indir indirs[NIADDR + 2]; + int num; + + error = ufs_getlbns(vp, desired_blks, indirs, &num); + if (error) { + printf("%s: ufs_getlbns failed, error %d!\n", + __func__, error); + goto 
bad; + } + + switch (num) { + case 2: + indir_blks = 1; /* 1st level indirect */ + break; + case 3: + indir_blks = 1 + /* 1st level indirect */ + 1 + /* 2nd level indirect */ + indirs[1].in_off + 1; /* extra 1st level indirect */ + break; + default: + printf("%s: unexpected numlevels %d from ufs_getlbns\n", + __func__, num); + *size = 0; + goto bad; + } + desired_blks += indir_blks; + } + DPRINTF("desired blocks = %" PRId64 " (including indirect)\n", + desired_blks); + + /* + * If a specific size wasn't requested, allow for a smaller log + * if we're really tight for space... + */ + min_desired_blks = desired_blks; + if (!fixedsize) + min_desired_blks = desired_blks / 4; + + /* Look at number of blocks per CG. If it's too small, bail early. */ + bpcg = fragstoblks(fs, fs->fs_fpg); + if (min_desired_blks > bpcg) { + printf("ffs_wapbl: cylinder group size of %" PRId64 " MB " + " is not big enough for journal\n", + lblktosize(fs, bpcg) / (1024 * 1024)); + goto bad; + } + + /* + * Start with the middle cylinder group, and search outwards in + * both directions until we either find the requested log size + * or reach the start/end of the file system. If we reach the + * start/end without finding enough space for the full requested + * log size, use the largest extent found if it is large enough + * to satisfy our minimum size. + * + * XXX + * Can we just use the cluster contigsum stuff (esp on UFS2) + * here to simplify this search code? 
+ */ + best_addr = 0; + best_blks = 0; + for (cg = fs->fs_ncg / 2, s = 0, n = 1; + best_blks < desired_blks && cg >= 0 && cg < fs->fs_ncg; + s++, n = -n, cg += n * s) { + DPRINTF("check cg %d of %d\n", cg, fs->fs_ncg); + error = bread(devvp, fsbtodb(fs, cgtod(fs, cg)), + fs->fs_cgsize, FSCRED, 0, &bp); + cgp = (struct cg *)bp->b_data; + if (error || !cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) { + brelse(bp, 0); + continue; + } + + blksfree = cg_blksfree(cgp, needswap); + + for (blkno = 0; blkno < bpcg;) { + /* look for next free block */ + /* XXX use scanc() and fragtbl[] here? */ + for (; blkno < bpcg - min_desired_blks; blkno++) + if (ffs_isblock(fs, blksfree, blkno)) + break; + + /* past end of search space in this CG? */ + if (blkno >= bpcg - min_desired_blks) + break; + + /* count how many free blocks in this extent */ + start_addr = blkno; + for (freeblks = 0; blkno < bpcg; blkno++, freeblks++) + if (!ffs_isblock(fs, blksfree, blkno)) + break; + + if (freeblks > best_blks) { + best_blks = freeblks; + best_addr = blkstofrags(fs, start_addr) + + cgbase(fs, cg); + + if (freeblks >= desired_blks) { + DPRINTF("found len %" PRId64 + " at offset %" PRId64 " in gc\n", + freeblks, start_addr); + break; + } + } + } + brelse(bp, 0); + } + DPRINTF("best found len = %" PRId64 ", wanted %" PRId64 + " at addr %" PRId64 "\n", best_blks, desired_blks, best_addr); + + if (best_blks < min_desired_blks) { + *addr = 0; + *indir_addr = 0; + } else { + /* put indirect blocks at start, and data blocks after */ + *addr = best_addr + blkstofrags(fs, indir_blks); + *indir_addr = best_addr; + } + *size = min(desired_blks, best_blks) - indir_blks; + return; + +bad: + *addr = 0; + *indir_addr = 0; + *size = 0; + return; +} diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h index 4d0ee8bc5f29..e2763799dbb7 100644 --- a/sys/ufs/ffs/fs.h +++ b/sys/ufs/ffs/fs.h @@ -1,4 +1,4 @@ -/* $NetBSD: fs.h,v 1.49 2007/12/25 18:33:49 perry Exp $ */ +/* $NetBSD: fs.h,v 1.50 2008/07/31 05:38:06 simonb Exp $ */ /* 
* Copyright (c) 1982, 1986, 1993 @@ -327,7 +327,12 @@ struct fs { int32_t fs_old_cpc; /* cyl per cycle in postbl */ /* this area is otherwise allocated unless fs_old_flags & FS_FLAGS_UPDATED */ int32_t fs_maxbsize; /* maximum blocking factor permitted */ - int64_t fs_sparecon64[17]; /* old rotation block list head */ + uint8_t fs_journal_version; /* journal format version */ + uint8_t fs_journal_location; /* journal location type */ + uint8_t fs_journal_reserved[2];/* reserved for future use */ + uint32_t fs_journal_flags; /* journal flags */ + uint64_t fs_journallocs[4]; /* location info for journal */ + int64_t fs_sparecon64[12]; /* reserved for future use */ int64_t fs_sblockloc; /* byte offset of standard superblock */ struct csum_total fs_cstotal; /* cylinder summary information */ int64_t fs_time; /* last time written */ @@ -406,13 +411,17 @@ struct fs { /* * File system flags */ -#define FS_UNCLEAN 0x01 /* file system not clean at mount (unused) */ -#define FS_DOSOFTDEP 0x02 /* file system using soft dependencies */ -#define FS_NEEDSFSCK 0x04 /* needs sync fsck (FreeBSD compat, unused) */ -#define FS_INDEXDIRS 0x08 /* kernel supports indexed directories */ -#define FS_ACLS 0x10 /* file system has ACLs enabled */ -#define FS_MULTILABEL 0x20 /* file system is MAC multi-label */ +#define FS_UNCLEAN 0x001 /* file system not clean at mount (unused) */ +#define FS_DOSOFTDEP 0x002 /* file system using soft dependencies */ +#define FS_NEEDSFSCK 0x004 /* needs sync fsck (FreeBSD compat, unused) */ +#define FS_INDEXDIRS 0x008 /* kernel supports indexed directories */ +#define FS_ACLS 0x010 /* file system has ACLs enabled */ +#define FS_MULTILABEL 0x020 /* file system is MAC multi-label */ #define FS_FLAGS_UPDATED 0x80 /* flags have been moved to new location */ +#define FS_DOWAPBL 0x100 /* Write ahead physical block logging */ + +/* File system flags that are ok for NetBSD if set in fs_flags */ +#define FS_KNOWN_FLAGS (FS_DOSOFTDEP | FS_DOWAPBL) /* * File system 
internal flags, also in fs_flags. diff --git a/sys/ufs/files.ufs b/sys/ufs/files.ufs index df10a491d90e..88078047c6ca 100644 --- a/sys/ufs/files.ufs +++ b/sys/ufs/files.ufs @@ -1,4 +1,4 @@ -# $NetBSD: files.ufs,v 1.17 2007/12/12 02:56:03 lukem Exp $ +# $NetBSD: files.ufs,v 1.18 2008/07/31 05:38:06 simonb Exp $ deffs fs_ffs.h FFS deffs EXT2FS @@ -34,6 +34,7 @@ file ufs/ffs/ffs_subr.c ffs | mfs | ext2fs file ufs/ffs/ffs_tables.c ffs | mfs | ext2fs file ufs/ffs/ffs_vfsops.c ffs | mfs | ext2fs file ufs/ffs/ffs_vnops.c ffs | mfs | ext2fs +file ufs/ffs/ffs_wapbl.c ffs & wapbl file ufs/ffs/ffs_appleufs.c ffs & apple_ufs file ufs/lfs/lfs_alloc.c lfs @@ -62,3 +63,4 @@ file ufs/ufs/ufs_lookup.c ffs | lfs | mfs | ext2fs file ufs/ufs/ufs_quota.c quota & (ffs | lfs | mfs | ext2fs) file ufs/ufs/ufs_vfsops.c ffs | lfs | mfs | ext2fs file ufs/ufs/ufs_vnops.c ffs | lfs | mfs | ext2fs +file ufs/ufs/ufs_wapbl.c ffs & wapbl diff --git a/sys/ufs/ufs/Makefile b/sys/ufs/ufs/Makefile index adb42dd39a56..e288c1cdc14c 100644 --- a/sys/ufs/ufs/Makefile +++ b/sys/ufs/ufs/Makefile @@ -1,8 +1,8 @@ -# $NetBSD: Makefile,v 1.5 2005/12/11 12:25:28 christos Exp $ +# $NetBSD: Makefile,v 1.6 2008/07/31 05:38:06 simonb Exp $ INCSDIR= /usr/include/ufs/ufs INCS= dinode.h dir.h extattr.h inode.h quota.h ufs_bswap.h ufs_extern.h \ - ufsmount.h + ufs_wapbl.h ufsmount.h .include diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index 8447acd1fa36..c85552d41a5c 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -1,4 +1,4 @@ -/* $NetBSD: inode.h,v 1.51 2008/01/09 16:15:23 ad Exp $ */ +/* $NetBSD: inode.h,v 1.52 2008/07/31 05:38:06 simonb Exp $ */ /* * Copyright (c) 1982, 1989, 1993 @@ -51,6 +51,9 @@ */ struct ffs_inode_ext { daddr_t *ffs_snapblklist; /* Collect expunged snapshot blocks. */ + /* The following two fields are used by contiguous allocation code only. */ + daddr_t ffs_first_data_blk; /* first data block on disk. */ + daddr_t ffs_first_indir_blk; /* first indirect block on disk. 
*/ }; struct ext2fs_inode_ext { @@ -113,6 +116,8 @@ struct inode { struct lfs_inode_ext *lfs; } inode_ext; #define i_snapblklist inode_ext.ffs.ffs_snapblklist +#define i_ffs_first_data_blk inode_ext.ffs.ffs_first_data_blk +#define i_ffs_first_indir_blk inode_ext.ffs.ffs_first_indir_blk #define i_e2fs_last_lblk inode_ext.e2fs.ext2fs_last_lblk #define i_e2fs_last_blk inode_ext.e2fs.ext2fs_last_blk /* @@ -219,7 +224,7 @@ struct inode { #define IN_CLEANING 0x0100 /* LFS: file is being cleaned */ #define IN_ADIROP 0x0200 /* LFS: dirop in progress */ #define IN_SPACECOUNTED 0x0400 /* Blocks to be freed in free count. */ -#define IN_PAGING 0x1000 /* LFS: file is on paging queue */ +#define IN_PAGING 0x1000 /* LFS: file is on paging queue */ #if defined(_KERNEL) diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c index 383d57013b1f..4ea34181c362 100644 --- a/sys/ufs/ufs/ufs_inode.c +++ b/sys/ufs/ufs/ufs_inode.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_inode.c,v 1.75 2008/01/17 10:39:15 ad Exp $ */ +/* $NetBSD: ufs_inode.c,v 1.76 2008/07/31 05:38:06 simonb Exp $ */ /* * Copyright (c) 1991, 1993 @@ -37,11 +37,12 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.75 2008/01/17 10:39:15 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.76 2008/07/31 05:38:06 simonb Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" #include "opt_quota.h" +#include "opt_wapbl.h" #endif #include @@ -52,12 +53,14 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.75 2008/01/17 10:39:15 ad Exp $"); #include #include #include +#include #include #include #include #include #include +#include #ifdef UFS_DIRHASH #include #endif @@ -84,6 +87,9 @@ ufs_inactive(void *v) struct mount *transmp; mode_t mode; int error = 0; + int logged = 0; + + UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount); transmp = vp->v_mount; fstrans_start(transmp, FSTRANS_SHARED); @@ -96,6 +102,10 @@ ufs_inactive(void *v) softdep_releasefile(ip); if (ip->i_nlink <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + error = 
UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + logged = 1; #ifdef QUOTA (void)chkiq(ip, -1, NOCRED, 0); #endif @@ -103,7 +113,35 @@ ufs_inactive(void *v) ufs_extattr_vnode_inactive(vp, curlwp); #endif if (ip->i_size != 0) { - error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED); + /* + * When journaling, only truncate one indirect block + * at a time + */ + if (vp->v_mount->mnt_wapbl) { + uint64_t incr = MNINDIR(ip->i_ump) << + vp->v_mount->mnt_fs_bshift; /* Power of 2 */ + uint64_t base = NDADDR << + vp->v_mount->mnt_fs_bshift; + while (!error && ip->i_size > base + incr) { + /* + * round down to next full indirect + * block boundary. + */ + uint64_t nsize = base + + ((ip->i_size - base - 1) & + ~(incr - 1)); + error = UFS_TRUNCATE(vp, nsize, 0, + NOCRED); + if (error) + break; + UFS_WAPBL_END(vp->v_mount); + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + } + } + if (!error) + error = UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED); } /* * Setting the mode to zero needs to wait for the inode @@ -125,8 +163,16 @@ ufs_inactive(void *v) } if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) { + if (!logged++) { + int err; + err = UFS_WAPBL_BEGIN(vp->v_mount); + if (err) + goto out; + } UFS_UPDATE(vp, NULL, NULL, 0); } + if (logged) + UFS_WAPBL_END(vp->v_mount); out: /* * If we are done with the inode, reclaim it @@ -149,6 +195,10 @@ ufs_reclaim(struct vnode *vp) if (prtactive && vp->v_usecount > 1) vprint("ufs_reclaim: pushing active", vp); + if (!UFS_WAPBL_BEGIN(vp->v_mount)) { + UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); + UFS_WAPBL_END(vp->v_mount); + } UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE); /* diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c index 9303494fc4f3..9f2b39555aed 100644 --- a/sys/ufs/ufs/ufs_lookup.c +++ b/sys/ufs/ufs/ufs_lookup.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp $ */ +/* $NetBSD: ufs_lookup.c,v 1.99 2008/07/31 05:38:06 simonb Exp $ */ /* * Copyright (c) 1989, 1993 @@ -37,7 
+37,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.99 2008/07/31 05:38:06 simonb Exp $"); #ifdef _KERNEL_OPT #include "opt_ffs.h" @@ -53,6 +53,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp #include #include #include +#include #include #include #include @@ -65,6 +66,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.98 2008/06/05 09:32:29 hannken Exp #include #include #include +#include #include "fs_ffs.h" @@ -158,7 +160,7 @@ ufs_lookup(void *v) return (error); if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && - (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + (nameiop == DELETE || nameiop == RENAME)) return (EROFS); /* @@ -495,6 +497,7 @@ found: dp->i_size = dp->i_offset + DIRSIZ(FSFMT(vdp), ep, needswap); DIP_ASSIGN(dp, size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; + UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); } brelse(bp, 0); @@ -690,11 +693,12 @@ ufs_dirbadentry(struct vnode *dp, struct direct *ep, int entryoffsetinblock) DIRSIZ(FSFMT(dp), ep, needswap) || namlen > FFS_MAXNAMLEN) { /*return (1); */ - printf("First bad, reclen=%x, DIRSIZ=%lu, namlen=%d, flags=%x " - "entryoffsetinblock=%d, dirblksiz = %d\n", + printf("First bad, reclen=%#x, DIRSIZ=%lu, namlen=%d, " + "flags=%#x, entryoffsetinblock=%d, dirblksiz = %d\n", ufs_rw16(ep->d_reclen, needswap), (u_long)DIRSIZ(FSFMT(dp), ep, needswap), - namlen, dp->v_mount->mnt_flag, entryoffsetinblock,dirblksiz); + namlen, dp->v_mount->mnt_flag, entryoffsetinblock, + dirblksiz); goto bad; } if (ep->d_ino == 0) @@ -762,6 +766,8 @@ ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp, const int needswap = UFS_MPNEEDSWAP(ump); int dirblksiz = ump->um_dirblksiz; + UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); + error = 0; cr = cnp->cn_cred; l = curlwp; @@ -882,6 +888,7 @@ ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct 
direct *dirp, dp->i_size = dp->i_offset + dp->i_count; DIP_ASSIGN(dp, size, dp->i_size); dp->i_flag |= IN_CHANGE | IN_UPDATE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); } /* * Get the block containing the space for the new directory entry. @@ -1014,6 +1021,7 @@ ufs_direnter(struct vnode *dvp, struct vnode *tvp, struct direct *dirp, if (DOINGSOFTDEP(dvp) && (tvp != NULL)) vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); } + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); return (error); } @@ -1040,6 +1048,8 @@ ufs_dirremove(struct vnode *dvp, struct inode *ip, int flags, int isrmdir) const int needswap = UFS_MPNEEDSWAP(dp->i_ump); #endif + UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount); + if (flags & DOWHITEOUT) { /* * Whiteout entry: set d_ino to WINO. @@ -1105,6 +1115,7 @@ out: ip->i_nlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0); } error = VOP_BWRITE(bp); } @@ -1118,6 +1129,7 @@ out: if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 && ip->i_ffs_effnlink == 0) ffs_snapgone(ip); + UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0); #endif return (error); } @@ -1151,6 +1163,7 @@ ufs_dirrewrite(struct inode *dp, struct inode *oip, ino_t newinum, int newtype, oip->i_nlink--; DIP_ASSIGN(oip, nlink, oip->i_nlink); oip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP); error = VOP_BWRITE(bp); } dp->i_flag |= iflags; @@ -1162,6 +1175,7 @@ ufs_dirrewrite(struct inode *dp, struct inode *oip, ino_t newinum, int newtype, */ if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_ffs_effnlink == 0) ffs_snapgone(oip); + UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP); #endif return (error); } @@ -1333,8 +1347,8 @@ ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp, const int bsize = 1 << bshift; off_t eof; - blks = kmem_alloc((1+dirrablks) * sizeof(daddr_t), KM_SLEEP); - blksizes = kmem_alloc((1+dirrablks) * sizeof(int), KM_SLEEP); + blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), 
KM_SLEEP); + blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP); ip = VTOI(vp); KASSERT(vp->v_size == ip->i_size); GOP_SIZE(vp, vp->v_size, &eof, 0); @@ -1370,7 +1384,7 @@ ufs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp, *bpp = bp; out: - kmem_free(blks, (1+dirrablks) * sizeof(daddr_t)); - kmem_free(blksizes, (1+dirrablks) * sizeof(int)); + kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t)); + kmem_free(blksizes, (1 + dirrablks) * sizeof(int)); return error; } diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index ce722980c027..9e9d81c1fe5a 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -1,4 +1,4 @@ -/* $NetBSD: ufs_readwrite.c,v 1.88 2008/05/16 09:22:01 hannken Exp $ */ +/* $NetBSD: ufs_readwrite.c,v 1.89 2008/07/31 05:38:06 simonb Exp $ */ /*- * Copyright (c) 1993 @@ -32,7 +32,7 @@ */ #include -__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.88 2008/05/16 09:22:01 hannken Exp $"); +__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.89 2008/07/31 05:38:06 simonb Exp $"); #ifdef LFS_READWRITE #define FS struct lfs @@ -43,6 +43,9 @@ __KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.88 2008/05/16 09:22:01 hannken E #define WRITE_S "lfs_write" #define fs_bsize lfs_bsize #define fs_bmask lfs_bmask +#define UFS_WAPBL_BEGIN(mp) 0 +#define UFS_WAPBL_END(mp) do { } while (0) +#define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0) #else #define FS struct fs #define I_FS i_fs @@ -177,8 +180,15 @@ READ(void *v) out: if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { ip->i_flag |= IN_ACCESS; - if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + UFS_WAPBL_END(vp->v_mount); + } } fstrans_done(vp->v_mount); @@ -283,6 +293,15 @@ WRITE(void *v) error = 0; usepc = vp->v_type == VREG; + + if ((ioflag & 
IO_JOURNALLOCKED) == 0) { + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + fstrans_done(vp->v_mount); + return error; + } + } + #ifdef LFS_READWRITE async = true; lfs_check(vp, LFS_UNUSED_LBN, 0); @@ -511,8 +530,11 @@ out: uio->uio_resid = resid; } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT); + else + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); KASSERT(vp->v_size == ip->i_size); - + if ((ioflag & IO_JOURNALLOCKED) == 0) + UFS_WAPBL_END(vp->v_mount); fstrans_done(vp->v_mount); return (error); diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 241a96a86cac..c749518f6c26 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,4 +1,33 @@ -/* $NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $ */ +/* $NetBSD: ufs_vnops.c,v 1.167 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 1982, 1986, 1989, 1993, 1995 @@ -37,7 +66,7 @@ */ #include -__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.167 2008/07/31 05:38:06 simonb Exp $"); #if defined(_KERNEL_OPT) #include "opt_ffs.h" @@ -60,6 +89,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $"); #include #include #include +#include #include #include @@ -70,6 +100,7 @@ __KERNEL_RCSID(0, "$NetBSD: ufs_vnops.c,v 1.166 2008/06/02 16:00:33 ad Exp $"); #include #include #include +#include #ifdef UFS_DIRHASH #include #endif @@ -105,13 +136,20 @@ ufs_create(void *v) } */ *ap = v; int error; + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED); error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); - fstrans_done(ap->a_dvp->v_mount); - if (error) + if (error) { + fstrans_done(ap->a_dvp->v_mount); return (error); + } + UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp); + fstrans_done(ap->a_dvp->v_mount); VN_KNOTE(ap->a_dvp, NOTE_WRITE); return (0); } @@ -138,6 +176,11 @@ ufs_mknod(void *v) vap = ap->a_vap; vpp = ap->a_vpp; + + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ fstrans_start(ap->a_dvp->v_mount, FSTRANS_SHARED); if ((error = ufs_makeinode(MAKEIMODE(vap->va_type, 
vap->va_mode), @@ -161,6 +204,8 @@ ufs_mknod(void *v) ip->i_ffs2_rdev = ufs_rw64(vap->va_rdev, UFS_MPNEEDSWAP(ump)); } + UFS_WAPBL_UPDATE(*vpp, NULL, NULL, 0); + UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp); /* * Remove inode so that it will be reloaded by VFS_VGET and * checked to see if it is an alias of an existing entry in @@ -394,8 +439,8 @@ ufs_setattr(void *v) goto out; } if (kauth_cred_geteuid(cred) != ip->i_uid && - (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, - NULL))) + (error = kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL))) goto out; if (kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL) == 0) { @@ -411,6 +456,9 @@ ufs_setattr(void *v) error = EPERM; goto out; } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; ip->i_flags = vap->va_flags; DIP_ASSIGN(ip, flags, ip->i_flags); } else { @@ -424,11 +472,16 @@ ufs_setattr(void *v) error = EPERM; goto out; } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); DIP_ASSIGN(ip, flags, ip->i_flags); } ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + UFS_WAPBL_END(vp->v_mount); if (vap->va_flags & (IMMUTABLE | APPEND)) { error = 0; goto out; @@ -446,7 +499,11 @@ ufs_setattr(void *v) error = EROFS; goto out; } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, l); + UFS_WAPBL_END(vp->v_mount); if (error) goto out; } @@ -466,14 +523,46 @@ ufs_setattr(void *v) break; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) { - error = EROFS; - goto out; + error = EROFS; + goto out; } if ((ip->i_flags & SF_SNAPSHOT) != 0) { error = EPERM; goto out; } - error = UFS_TRUNCATE(vp, vap->va_size, 0, cred); + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; + /* + * When journaling, only truncate one indirect block + * at a time. 
+ */ + if (vp->v_mount->mnt_wapbl) { + uint64_t incr = MNINDIR(ip->i_ump) << + vp->v_mount->mnt_fs_bshift; /* Power of 2 */ + uint64_t base = NDADDR << + vp->v_mount->mnt_fs_bshift; + while (!error && ip->i_size > base + incr && + ip->i_size > vap->va_size + incr) { + /* + * round down to next full indirect + * block boundary. + */ + uint64_t nsize = base + + ((ip->i_size - base - 1) & + ~(incr - 1)); + error = UFS_TRUNCATE(vp, nsize, 0, + cred); + if (error == 0) { + UFS_WAPBL_END(vp->v_mount); + error = + UFS_WAPBL_BEGIN(vp->v_mount); + } + } + } + if (!error) + error = UFS_TRUNCATE(vp, vap->va_size, 0, cred); + UFS_WAPBL_END(vp->v_mount); if (error) goto out; break; @@ -494,11 +583,14 @@ ufs_setattr(void *v) goto out; } if (kauth_cred_geteuid(cred) != ip->i_uid && - (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, - NULL)) && + (error = kauth_authorize_generic(cred, + KAUTH_GENERIC_ISSUSER, NULL)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred)))) goto out; + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; if (vap->va_atime.tv_sec != VNOVAL) if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) ip->i_flag |= IN_ACCESS; @@ -510,6 +602,7 @@ ufs_setattr(void *v) ip->i_ffs2_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, &vap->va_atime, &vap->va_mtime, 0); + UFS_WAPBL_END(vp->v_mount); if (error) goto out; } @@ -525,7 +618,11 @@ ufs_setattr(void *v) error = EPERM; goto out; } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) + goto out; error = ufs_chmod(vp, (int)vap->va_mode, cred, l); + UFS_WAPBL_END(vp->v_mount); } VN_KNOTE(vp, NOTE_ATTRIB); out: @@ -543,6 +640,8 @@ ufs_chmod(struct vnode *vp, int mode, kauth_cred_t cred, struct lwp *l) struct inode *ip; int error, ismember = 0; + UFS_WAPBL_JLOCK_ASSERT(vp->v_mount); + ip = VTOI(vp); if (kauth_cred_geteuid(cred) != ip->i_uid && (error = kauth_authorize_generic(cred, KAUTH_GENERIC_ISSUSER, NULL))) @@ -558,6 +657,7 @@ ufs_chmod(struct vnode 
*vp, int mode, kauth_cred_t cred, struct lwp *l) ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; DIP_ASSIGN(ip, mode, ip->i_mode); + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); return (0); } @@ -626,6 +726,7 @@ ufs_chown(struct vnode *vp, uid_t uid, gid_t gid, kauth_cred_t cred, good: #endif /* QUOTA */ ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); return (0); } @@ -649,8 +750,13 @@ ufs_remove(void *v) if (vp->v_type == VDIR || (ip->i_flags & (IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) error = EPERM; - else - error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); + else { + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error == 0) { + error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); + UFS_WAPBL_END(dvp->v_mount); + } + } VN_KNOTE(vp, NOTE_DELETE); VN_KNOTE(dvp, NOTE_WRITE); if (dvp == vp) @@ -720,6 +826,11 @@ ufs_link(void *v) error = EPERM; goto out1; } + error = UFS_WAPBL_BEGIN(vp->v_mount); + if (error) { + VOP_ABORTOP(dvp, cnp); + goto out1; + } ip->i_ffs_effnlink++; ip->i_nlink++; DIP_ASSIGN(ip, nlink, ip->i_nlink); @@ -738,10 +849,12 @@ ufs_link(void *v) ip->i_nlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, UPDATE_DIROP); if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); } PNBUF_PUT(cnp->cn_pnbuf); + UFS_WAPBL_END(vp->v_mount); out1: if (dvp != vp) VOP_UNLOCK(vp, 0); @@ -865,6 +978,11 @@ ufs_rename(void *v) struct direct *newdir; int doingdirectory, oldparent, newparent, error; +#ifdef WAPBL + if (ap->a_tdvp->v_mount->mnt_wapbl) + return wapbl_ufs_rename(v); +#endif + tvp = ap->a_tvp; tdvp = ap->a_tdvp; fvp = ap->a_fvp; @@ -1297,6 +1415,9 @@ ufs_mkdir(void *v) */ if ((error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, ap->a_vpp)) != 0) goto out; + error = UFS_WAPBL_BEGIN(ap->a_dvp->v_mount); + if (error) + goto out; tvp = *ap->a_vpp; ip = VTOI(tvp); ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); @@ -1307,6 +1428,7 @@ ufs_mkdir(void *v) if ((error = chkiq(ip, 1, 
cnp->cn_cred, 0))) { PNBUF_PUT(cnp->cn_pnbuf); UFS_VFREE(tvp, ip->i_number, dmode); + UFS_WAPBL_END(dvp->v_mount); fstrans_done(dvp->v_mount); vput(tvp); vput(dvp); @@ -1412,11 +1534,13 @@ ufs_mkdir(void *v) bad: if (error == 0) { VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + UFS_WAPBL_END(dvp->v_mount); } else { dp->i_ffs_effnlink--; dp->i_nlink--; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); if (DOINGSOFTDEP(dvp)) softdep_change_linkcnt(dp); /* @@ -1431,8 +1555,10 @@ ufs_mkdir(void *v) /* If IN_ADIROP, account for it */ lfs_unmark_vnode(tvp); #endif + UFS_WAPBL_UPDATE(tvp, NULL, NULL, UPDATE_DIROP); if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); + UFS_WAPBL_END(dvp->v_mount); vput(tvp); } out: @@ -1496,6 +1622,9 @@ ufs_rmdir(void *v) error = EPERM; goto out; } + error = UFS_WAPBL_BEGIN(dvp->v_mount); + if (error) + goto out; /* * Delete reference to directory before purging * inode. If we crash in between, the directory @@ -1515,6 +1644,7 @@ ufs_rmdir(void *v) softdep_change_linkcnt(dp); softdep_change_linkcnt(ip); } + UFS_WAPBL_END(dvp->v_mount); goto out; } VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); @@ -1531,6 +1661,7 @@ ufs_rmdir(void *v) dp->i_ffs_effnlink--; DIP_ASSIGN(dp, nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP); ip->i_nlink--; ip->i_ffs_effnlink--; DIP_ASSIGN(ip, nlink, ip->i_nlink); @@ -1538,6 +1669,11 @@ ufs_rmdir(void *v) error = UFS_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred); } cache_purge(vp); + /* + * Unlock the log while we still have reference to unlinked + * directory vp so that it will not get locked for recycling + */ + UFS_WAPBL_END(dvp->v_mount); #ifdef UFS_DIRHASH if (ip->i_dirhash != NULL) ufsdirhash_free(ip); @@ -1576,6 +1712,10 @@ ufs_symlink(void *v) int len, error; vpp = ap->a_vpp; + /* + * UFS_WAPBL_BEGIN1(dvp->v_mount, dvp) performed by successful + * ufs_makeinode + */ fstrans_start(ap->a_dvp->v_mount, 
FSTRANS_SHARED); error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); @@ -1591,10 +1731,12 @@ ufs_symlink(void *v) DIP_ASSIGN(ip, size, len); uvm_vnp_setsize(vp, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, - UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, NULL, - NULL); + UIO_SYSSPACE, IO_NODELOCKED | IO_JOURNALLOCKED, + ap->a_cnp->cn_cred, NULL, NULL); + UFS_WAPBL_END1(ap->a_dvp->v_mount, ap->a_dvp); if (error) vput(vp); out: @@ -2096,6 +2238,8 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, struct vnode *tvp; int error, ismember = 0; + UFS_WAPBL_JUNLOCK_ASSERT(dvp->v_mount); + pdir = VTOI(dvp); #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) @@ -2115,9 +2259,22 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, DIP_ASSIGN(ip, gid, ip->i_gid); ip->i_uid = kauth_cred_geteuid(cnp->cn_cred); DIP_ASSIGN(ip, uid, ip->i_uid); + error = UFS_WAPBL_BEGIN1(dvp->v_mount, dvp); + if (error) { + /* + * Note, we can't VOP_VFREE(tvp) here like we should + * because we can't write to the disk. Instead, we leave + * the vnode dangling from the journal. 
+ */ + vput(tvp); + PNBUF_PUT(cnp->cn_pnbuf); + vput(dvp); + return (error); + } #ifdef QUOTA if ((error = chkiq(ip, 1, cnp->cn_cred, 0))) { UFS_VFREE(tvp, ip->i_number, mode); + UFS_WAPBL_END1(dvp->v_mount, dvp); vput(tvp); PNBUF_PUT(cnp->cn_pnbuf); vput(dvp); @@ -2175,9 +2332,11 @@ ufs_makeinode(int mode, struct vnode *dvp, struct vnode **vpp, /* If IN_ADIROP, account for it */ lfs_unmark_vnode(tvp); #endif + UFS_WAPBL_UPDATE(tvp, NULL, NULL, 0); if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(ip); tvp->v_type = VNON; /* explodes later if VBLK */ + UFS_WAPBL_END1(dvp->v_mount, dvp); vput(tvp); PNBUF_PUT(cnp->cn_pnbuf); vput(dvp); @@ -2228,7 +2387,8 @@ ufs_gop_alloc(struct vnode *vp, off_t off, off_t len, int flags, } out: - return error; + UFS_WAPBL_UPDATE(vp, NULL, NULL, 0); + return error; } void diff --git a/sys/ufs/ufs/ufs_wapbl.c b/sys/ufs/ufs/ufs_wapbl.c new file mode 100644 index 000000000000..663c6e7a02c6 --- /dev/null +++ b/sys/ufs/ufs/ufs_wapbl.c @@ -0,0 +1,805 @@ +/* $NetBSD: ufs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $ */ + +/*- + * Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 1982, 1986, 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.28 (Berkeley) 7/31/95 + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: ufs_wapbl.c,v 1.2 2008/07/31 05:38:06 simonb Exp $"); + +#if defined(_KERNEL_OPT) +#include "opt_quota.h" +#include "fs_lfs.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* XXX following lifted from ufs_lookup.c */ +#define FSFMT(vp) (((vp)->v_mount->mnt_iflag & IMNT_DTYPE) == 0) + +/* + * A virgin directory (no blushing please). + */ +static const struct dirtemplate mastertemplate = { + 0, 12, DT_DIR, 1, ".", + 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." +}; + +/* + * Rename vnode operation + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. 
+ * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. + * + * WAPBL NOTE: wapbl_ufs_rename derived from ufs_rename in ufs_vnops.c + * ufs_vnops.c netbsd cvs revision 1.108 + * which has the berkeley copyright above + * changes introduced to ufs_rename since netbsd cvs revision 1.164 + * will need to be ported into wapbl_ufs_rename + */ +int +wapbl_ufs_rename(void *v) +{ + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap = v; + struct vnode *tvp, *tdvp, *fvp, *fdvp; + struct componentname *tcnp, *fcnp; + struct inode *ip, *txp, *fxp, *tdp, *fdp; + struct mount *mp; + struct direct *newdir; + int doingdirectory, oldparent, newparent, error; + + int32_t saved_f_count; + doff_t saved_f_diroff; + doff_t saved_f_offset; + u_int32_t saved_f_reclen; + int32_t saved_t_count; + doff_t saved_t_endoff; + doff_t saved_t_diroff; + doff_t saved_t_offset; + u_int32_t saved_t_reclen; + + tvp = ap->a_tvp; + tdvp = ap->a_tdvp; + fvp = ap->a_fvp; + fdvp = ap->a_fdvp; + tcnp = ap->a_tcnp; + fcnp = ap->a_fcnp; + doingdirectory = oldparent = newparent = error = 0; + +#ifdef DIAGNOSTIC + if ((tcnp->cn_flags & HASBUF) == 0 || + (fcnp->cn_flags & HASBUF) == 0) + panic("ufs_rename: no name"); +#endif + /* + * Check for cross-device rename. 
+ */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + abortit: + VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ + vrele(fdvp); + vrele(fvp); + return (error); + } + + /* + * Check if just deleting a link name. + */ + if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto abortit; + } + if (fvp == tvp) { + if (fvp->v_type == VDIR) { + error = EINVAL; + goto abortit; + } + + /* Release destination completely. */ + VOP_ABORTOP(tdvp, tcnp); + vput(tdvp); + vput(tvp); + + /* Delete source. */ + vrele(fvp); + fcnp->cn_flags &= ~(MODMASK | SAVESTART); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + fcnp->cn_nameiop = DELETE; + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp))) { + vput(fdvp); + return (error); + } + return (VOP_REMOVE(fdvp, fvp, fcnp)); + } + if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0) + goto abortit; + fdp = VTOI(fdvp); + ip = VTOI(fvp); + if ((nlink_t) ip->i_nlink >= LINK_MAX) { + VOP_UNLOCK(fvp, 0); + error = EMLINK; + goto abortit; + } + if ((ip->i_flags & (IMMUTABLE | APPEND)) || + (fdp->i_flags & APPEND)) { + VOP_UNLOCK(fvp, 0); + error = EPERM; + goto abortit; + } + if ((ip->i_mode & IFMT) == IFDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + fdp == ip || + (fcnp->cn_flags & ISDOTDOT) || + (tcnp->cn_flags & ISDOTDOT) || + (ip->i_flag & IN_RENAME)) { + VOP_UNLOCK(fvp, 0); + error = EINVAL; + goto abortit; + } + ip->i_flag |= IN_RENAME; + doingdirectory = 1; + } + oldparent = fdp->i_number; + VN_KNOTE(fdvp, NOTE_WRITE); /* XXXLUKEM/XXX: right place? */ + + /* + * When the target exists, both the directory + * and target vnodes are returned locked. 
+ */ + tdp = VTOI(tdvp); + txp = NULL; + if (tvp) + txp = VTOI(tvp); + + mp = fdvp->v_mount; + fstrans_start(mp, FSTRANS_SHARED); + + /* + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory hierarchy above the target, as this would + * orphan everything below the source directory. Also + * the user must have write permission in the source so + * as to be able to change "..". We must repeat the call + * to namei, as the parent directory is unlocked by the + * call to checkpath(). + */ + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred); + VOP_UNLOCK(fvp, 0); + if (oldparent != tdp->i_number) + newparent = tdp->i_number; + if (doingdirectory && newparent) { + if (error) /* write access check above */ + goto out; + if (txp != NULL) + vput(tvp); + txp = NULL; + vref(tdvp); /* compensate for the ref checkpath loses */ + if ((error = ufs_checkpath(ip, tdp, tcnp->cn_cred)) != 0) { + vrele(tdvp); + tdp = NULL; + goto out; + } + tcnp->cn_flags &= ~SAVESTART; + tdp = NULL; + vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); + error = relookup(tdvp, &tvp, tcnp); + if (error != 0) { + vput(tdvp); + goto out; + } + tdp = VTOI(tdvp); + if (tvp) + txp = VTOI(tvp); + } + + /* + * XXX handle case where fdvp is parent of tdvp, + * by unlocking tdvp and regrabbing it with vget after? 
+ */ + + /* save directory lookup information in case tdvp == fdvp */ + saved_t_count = tdp->i_count; + saved_t_endoff = tdp->i_endoff; + saved_t_diroff = tdp->i_diroff; + saved_t_offset = tdp->i_offset; + saved_t_reclen = tdp->i_reclen; + + /* + * This was moved up to before the journal lock to + * avoid potential deadlock + */ + fcnp->cn_flags &= ~(MODMASK | SAVESTART); + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + if (newparent) { + vn_lock(fdvp, LK_EXCLUSIVE | LK_RETRY); + if ((error = relookup(fdvp, &fvp, fcnp))) { + vput(fdvp); + vrele(ap->a_fvp); + goto out2; + } + } else { + error = VOP_LOOKUP(fdvp, &fvp, fcnp); + if (error && (error != EJUSTRETURN)) { + vrele(ap->a_fvp); + goto out2; + } + error = 0; + } + if (fvp != NULL) { + fxp = VTOI(fvp); + fdp = VTOI(fdvp); + } else { + /* + * From name has disappeared. + */ + if (doingdirectory) + panic("rename: lost dir entry"); + vrele(ap->a_fvp); + error = ENOENT; /* XXX ufs_rename sets "0" here */ + goto out2; + } + vrele(ap->a_fvp); + + /* save directory lookup information in case tdvp == fdvp */ + saved_f_count = fdp->i_count; + saved_f_diroff = fdp->i_diroff; + saved_f_offset = fdp->i_offset; + saved_f_reclen = fdp->i_reclen; + + /* restore directory lookup information in case tdvp == fdvp */ + tdp->i_offset = saved_t_offset; + tdp->i_reclen = saved_t_reclen; + tdp->i_count = saved_t_count; + tdp->i_endoff = saved_t_endoff; + tdp->i_diroff = saved_t_diroff; + + error = UFS_WAPBL_BEGIN(fdvp->v_mount); + if (error) + goto out2; + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. 
+ */ + ip->i_ffs_effnlink++; + ip->i_nlink++; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_change_linkcnt(ip); + if ((error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP)) != 0) { + goto bad; + } + + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (txp == NULL) { + if (tdp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. + */ + if (doingdirectory && newparent) { + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + tdp->i_ffs_effnlink++; + tdp->i_nlink++; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + if ((error = UFS_UPDATE(tdvp, NULL, NULL, + UPDATE_DIROP)) != 0) { + tdp->i_ffs_effnlink--; + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + goto bad; + } + } + newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK); + ufs_makedirentry(ip, tcnp, newdir); + error = ufs_direnter(tdvp, NULL, newdir, tcnp, NULL); + pool_cache_put(ufs_direct_cache, newdir); + if (error != 0) { + if (doingdirectory && newparent) { + tdp->i_ffs_effnlink--; + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + (void)UFS_UPDATE(tdvp, NULL, NULL, + UPDATE_WAIT | UPDATE_DIROP); + } + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + } else { + if (txp->i_dev != tdp->i_dev || txp->i_dev != ip->i_dev) + panic("rename: EXDEV"); + /* + * Short circuit rename(foo, foo). 
+ */ + if (txp->i_number == ip->i_number) + panic("rename: same file"); + /* + * If the parent directory is "sticky", then the user must + * own the parent directory, or the destination of the rename, + * otherwise the destination may not be changed (except by + * root). This implements append-only directories. + */ + if ((tdp->i_mode & S_ISTXT) && + kauth_authorize_generic(tcnp->cn_cred, + KAUTH_GENERIC_ISSUSER, NULL) != 0 && + kauth_cred_geteuid(tcnp->cn_cred) != tdp->i_uid && + txp->i_uid != kauth_cred_geteuid(tcnp->cn_cred)) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). + */ + if ((txp->i_mode & IFMT) == IFDIR) { + if (txp->i_ffs_effnlink > 2 || + !ufs_dirempty(txp, tdp->i_number, tcnp->cn_cred)) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + if ((error = ufs_dirrewrite(tdp, txp, ip->i_number, + IFTODT(ip->i_mode), doingdirectory && newparent ? + newparent : doingdirectory, IN_CHANGE | IN_UPDATE)) != 0) + goto bad; + if (doingdirectory) { + if (!newparent) { + tdp->i_ffs_effnlink--; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + } + txp->i_ffs_effnlink--; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(txp); + } + if (doingdirectory && !DOINGSOFTDEP(tvp)) { + /* + * Truncate inode. The only stuff left in the directory + * is "." and "..". The "." reference is inconsequential + * since we are quashing it. We have removed the "." + * reference and the reference in the parent directory, + * but there may be other hard links. The soft + * dependency code will arrange to do these operations + * after the parent directory entry has been deleted on + * disk, so when running with that code we avoid doing + * them now. 
+ */ + if (!newparent) { + tdp->i_nlink--; + DIP_ASSIGN(tdp, nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0); + } + txp->i_nlink--; + DIP_ASSIGN(txp, nlink, txp->i_nlink); + txp->i_flag |= IN_CHANGE; + if ((error = UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC, + tcnp->cn_cred))) + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + VN_KNOTE(tvp, NOTE_DELETE); + } + + /* restore directory lookup information in case tdvp == fdvp */ + fdp->i_offset = saved_f_offset; + fdp->i_reclen = saved_f_reclen; + fdp->i_count = saved_f_count; + fdp->i_diroff = saved_f_diroff; + + /* + * Handle case where the directory we need to remove may have + * been moved when the directory insertion above performed compaction. + * or when i_count may be wrong due to insertion before this entry. + */ + if ((tdp->i_number == fdp->i_number) && + (((saved_f_offset >= saved_t_offset) && + (saved_f_offset < saved_t_offset + saved_t_count)) || + ((saved_f_offset - saved_f_count >= saved_t_offset) && + (saved_f_offset - saved_f_count < + saved_t_offset + saved_t_count)))) { + struct buf *bp; + struct direct *ep; + struct ufsmount *ump = fdp->i_ump; + doff_t endsearch; /* offset to end directory search */ + int dirblksiz = ump->um_dirblksiz; + const int needswap = UFS_MPNEEDSWAP(ump); + u_long bmask; + int namlen, entryoffsetinblock; + char *dirbuf; + + bmask = fdvp->v_mount->mnt_stat.f_iosize - 1; + + /* + * the fcnp entry will be somewhere between the start of + * compaction and the original location. + */ + fdp->i_offset = saved_t_offset; + error = ufs_blkatoff(fdvp, (off_t)fdp->i_offset, &dirbuf, &bp, + false); + if (error) + goto bad; + + /* + * keep existing fdp->i_count in case + * compaction started at the same location as the fcnp entry. + */ + endsearch = saved_f_offset + saved_f_reclen; + entryoffsetinblock = 0; + while (fdp->i_offset < endsearch) { + int reclen; + + /* + * If necessary, get the next directory block. 
+ */ + if ((fdp->i_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp, 0); + error = ufs_blkatoff(fdvp, (off_t)fdp->i_offset, + &dirbuf, &bp, false); + if (error) + goto bad; + entryoffsetinblock = 0; + } + + KASSERT(bp != NULL); + ep = (struct direct *)(dirbuf + entryoffsetinblock); + reclen = ufs_rw16(ep->d_reclen, needswap); + +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (FSFMT(fdvp) && needswap == 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#else + if (FSFMT(fdvp) && needswap != 0) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +#endif + if ((ep->d_ino != 0) && + (ufs_rw32(ep->d_ino, needswap) != WINO) && + (namlen == fcnp->cn_namelen) && + memcmp(ep->d_name, fcnp->cn_nameptr, namlen) == 0) { + fdp->i_reclen = reclen; + break; + } + fdp->i_offset += reclen; + fdp->i_count = reclen; + entryoffsetinblock += reclen; + } + + KASSERT(fdp->i_offset <= endsearch); + + /* + * If fdp->i_offset points to start of a directory block, + * set fdp->i_count so ufs_dirremove() doesn't compact over + * a directory block boundary. + */ + if ((fdp->i_offset & (dirblksiz - 1)) == 0) + fdp->i_count = 0; + + brelse(bp, 0); + } + + /* + * 3) Unlink the source. + */ + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; The IRENAME + * flag ensures that it cannot be moved by another rename or removed + * by a rmdir. + */ + if (fxp != ip) { + if (doingdirectory) + panic("rename: lost dir entry"); + } else { + /* + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. 
+ */ + if (doingdirectory && newparent) { + KASSERT(fdp != NULL); + fxp->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(fxp, fdp, newparent, DT_DIR, 0, IN_CHANGE); + cache_purge(fdvp); + } + error = ufs_dirremove(fdvp, fxp, fcnp->cn_flags, 0); + fxp->i_flag &= ~IN_RENAME; + } + VN_KNOTE(fvp, NOTE_RENAME); + goto done; + + out: + vrele(fvp); + vrele(fdvp); + goto out2; + + /* exit routines from steps 1 & 2 */ + bad: + if (doingdirectory) + ip->i_flag &= ~IN_RENAME; + ip->i_ffs_effnlink--; + ip->i_nlink--; + DIP_ASSIGN(ip, nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + ip->i_flag &= ~IN_RENAME; + UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0); + if (DOINGSOFTDEP(fvp)) + softdep_change_linkcnt(ip); + done: + UFS_WAPBL_END(fdvp->v_mount); + vput(fdvp); + vput(fvp); + out2: + /* + * clear IN_RENAME - some exit paths happen too early to go + * through the cleanup done in the "bad" case above, so we + * always do this mini-cleanup here. + */ + ip->i_flag &= ~IN_RENAME; + + if (txp) + vput(ITOV(txp)); + if (tdp) { + if (newparent) + vput(ITOV(tdp)); + else + vrele(ITOV(tdp)); + } + + fstrans_done(mp); + return (error); +} + +#ifdef WAPBL_DEBUG_INODES +void +ufs_wapbl_verify_inodes(struct mount *mp, const char *str) +{ + struct vnode *vp, *nvp; + struct inode *ip; + + simple_lock(&mntvnode_slock); + loop: + TAILQ_FOREACH_REVERSE(vp, &mp->mnt_vnodelist, vnodelst, v_mntvnodes) { + /* + * If the vnode that we are about to sync is no longer + * associated with this mount point, start over. 
+ */ + if (vp->v_mount != mp) + goto loop; + simple_lock(&vp->v_interlock); + nvp = TAILQ_NEXT(vp, v_mntvnodes); + ip = VTOI(vp); + if (vp->v_type == VNON) { + simple_unlock(&vp->v_interlock); + continue; + } + /* verify that update has been called on all inodes */ + if (ip->i_flag & (IN_CHANGE | IN_UPDATE)) { + panic("wapbl_verify: mp %p: dirty vnode %p (inode %p): 0x%x\n", + mp, vp, ip, ip->i_flag); + } + KDASSERT(ip->i_nlink == ip->i_ffs_effnlink); + + simple_unlock(&mntvnode_slock); + { + int s; + struct buf *bp; + struct buf *nbp; + s = splbio(); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + simple_lock(&bp->b_interlock); + if ((bp->b_flags & B_BUSY)) { + simple_unlock(&bp->b_interlock); + continue; + } + if ((bp->b_flags & B_DELWRI) == 0) + panic("wapbl_verify: not dirty, bp %p", bp); + if ((bp->b_flags & B_LOCKED) == 0) + panic("wapbl_verify: not locked, bp %p", bp); + simple_unlock(&bp->b_interlock); + } + splx(s); + } + simple_unlock(&vp->v_interlock); + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + + vp = VFSTOUFS(mp)->um_devvp; + simple_lock(&vp->v_interlock); + { + int s; + struct buf *bp; + struct buf *nbp; + s = splbio(); + for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = LIST_NEXT(bp, b_vnbufs); + simple_lock(&bp->b_interlock); + if ((bp->b_flags & B_BUSY)) { + simple_unlock(&bp->b_interlock); + continue; + } + if ((bp->b_flags & B_DELWRI) == 0) + panic("wapbl_verify: devvp not dirty, bp %p", bp); + if ((bp->b_flags & B_LOCKED) == 0) + panic("wapbl_verify: devvp not locked, bp %p", bp); + simple_unlock(&bp->b_interlock); + } + splx(s); + } + simple_unlock(&vp->v_interlock); +} +#endif /* WAPBL_DEBUG_INODES */ diff --git a/sys/ufs/ufs/ufs_wapbl.h b/sys/ufs/ufs/ufs_wapbl.h new file mode 100644 index 000000000000..2ec1abcee339 --- /dev/null +++ b/sys/ufs/ufs/ufs_wapbl.h @@ -0,0 +1,176 @@ +/* $NetBSD: ufs_wapbl.h,v 1.2 2008/07/31 05:38:07 simonb Exp $ */ + +/*- + * 
Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#ifndef _UFS_UFS_UFS_WAPBL_H_ +#define _UFS_UFS_UFS_WAPBL_H_ + +#if defined(_KERNEL_OPT) +#include "opt_wapbl.h" +#endif + +/* + * Information for the journal location stored in the superblock. + * We store the journal version, some flags, the journal location + * type, and some location specific "locators" that identify where + * the log itself is located. 
+ */ + +/* fs->fs_journal_version */ +#define UFS_WAPBL_VERSION 1 + +/* fs->fs_journal_location */ +#define UFS_WAPBL_JOURNALLOC_NONE 0 + +#define UFS_WAPBL_JOURNALLOC_END_PARTITION 1 +#define UFS_WAPBL_EPART_ADDR 0 /* locator slots */ +#define UFS_WAPBL_EPART_COUNT 1 +#define UFS_WAPBL_EPART_BLKSZ 2 +#define UFS_WAPBL_EPART_UNUSED 3 + +#define UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM 2 +#define UFS_WAPBL_INFS_ADDR 0 /* locator slots */ +#define UFS_WAPBL_INFS_COUNT 1 +#define UFS_WAPBL_INFS_BLKSZ 2 +#define UFS_WAPBL_INFS_INO 3 + +/* fs->fs_journal_flags */ +#define UFS_WAPBL_FLAGS_CREATE_LOG 0x1 +#define UFS_WAPBL_FLAGS_CLEAR_LOG 0x2 + + +/* + * The journal size is limited to between 1MB and 64MB. + * The default journal size is the filesystem size divided by + * the scale factor - this is 1M of journal per 1GB of filesystem + * space. + * + * XXX: Is 64MB too limiting? If user explicitly asks for more, allow it? + */ +#define UFS_WAPBL_JOURNAL_SCALE 1024 +#define UFS_WAPBL_MIN_JOURNAL_SIZE (1024 * 1024) +#define UFS_WAPBL_MAX_JOURNAL_SIZE (64 * 1024 * 1024) + + +#if defined(WAPBL) + +#if defined(WAPBL_DEBUG) +#define WAPBL_DEBUG_INODES +#endif + +int wapbl_ufs_rename(void *v); + +#ifdef WAPBL_DEBUG_INODES +void ufs_wapbl_verify_inodes(struct mount *, const char *); +#endif +/* Begin a journal transaction; vp1/vp2 are vref'd only if that succeeds. */ +static __inline int +ufs_wapbl_begin2(struct mount *mp, struct vnode *vp1, struct vnode *vp2, + const char *file, int line) +{ + if (mp->mnt_wapbl) { + int error; + + error = wapbl_begin(mp->mnt_wapbl, file, line); + if (error) /* no vnode refs taken yet, so nothing to undo */ + return error; + if (vp1) + vref(vp1); + if (vp2) + vref(vp2); +#ifdef WAPBL_DEBUG_INODES + if (mp->mnt_wapbl->wl_lock.lk_exclusivecount == 1) + ufs_wapbl_verify_inodes(mp, "wapbl_begin"); +#endif + } + return 0; +} + +static __inline void +ufs_wapbl_end2(struct mount *mp, struct vnode *vp1, struct vnode *vp2) +{ + if (mp->mnt_wapbl) { +#ifdef WAPBL_DEBUG_INODES + if (mp->mnt_wapbl->wl_lock.lk_exclusivecount == 1) + ufs_wapbl_verify_inodes(mp, "wapbl_end"); +#endif +
wapbl_end(mp->mnt_wapbl); + if (vp2) + vrele(vp2); + if (vp1) + vrele(vp1); + } +} + +#define UFS_WAPBL_BEGIN(mp) \ + ufs_wapbl_begin2(mp, 0, 0, __FUNCTION__, __LINE__) +#define UFS_WAPBL_BEGIN1(mp, v1) \ + ufs_wapbl_begin2(mp, v1, 0, __FUNCTION__, __LINE__) +#define UFS_WAPBL_END(mp) ufs_wapbl_end2(mp, 0, 0) +#define UFS_WAPBL_END1(mp, v1) ufs_wapbl_end2(mp, v1, 0) + +#define UFS_WAPBL_UPDATE(vp, access, modify, flags) \ + if ((vp)->v_mount->mnt_wapbl) { \ + UFS_UPDATE(vp, access, modify, flags); \ + } + +#ifdef UFS_WAPBL_DEBUG_JLOCK +#define UFS_WAPBL_JLOCK_ASSERT(mp) \ + if (mp->mnt_wapbl) wapbl_jlock_assert(mp->mnt_wapbl) +#define UFS_WAPBL_JUNLOCK_ASSERT(mp) \ + if (mp->mnt_wapbl) wapbl_junlock_assert(mp->mnt_wapbl) +#else +#define UFS_WAPBL_JLOCK_ASSERT(mp) +#define UFS_WAPBL_JUNLOCK_ASSERT(mp) +#endif + +#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) \ + if (mp->mnt_wapbl) wapbl_register_inode(mp->mnt_wapbl, ino, mode) +#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) \ + if (mp->mnt_wapbl) wapbl_unregister_inode(mp->mnt_wapbl, ino, mode) + +#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len) \ + if (mp->mnt_wapbl) wapbl_register_deallocation(mp->mnt_wapbl, blk, len) + +#else /* ! 
WAPBL */
+/*
+ * Journalling compiled out: the BEGIN forms always evaluate to 0 and
+ * every statement-like macro is an empty do { } while (0).
+ */
+#define	UFS_WAPBL_BEGIN(mp) 0
+#define	UFS_WAPBL_BEGIN1(mp, v1) 0
+#define	UFS_WAPBL_END(mp)	do { } while (0)
+#define	UFS_WAPBL_END1(mp, v1)	do { } while (0)
+#define	UFS_WAPBL_UPDATE(vp, access, modify, flags)	do { } while (0)
+#define	UFS_WAPBL_JLOCK_ASSERT(mp)	do { } while (0)
+#define	UFS_WAPBL_JUNLOCK_ASSERT(mp)	do { } while (0)
+#define	UFS_WAPBL_REGISTER_INODE(mp, ino, mode)	do { } while (0)
+#define	UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode)	do { } while (0)
+#define	UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len)	do { } while (0)
+#endif
+
+#endif /* !_UFS_UFS_UFS_WAPBL_H_ */
diff --git a/usr.sbin/dumpfs/dumpfs.c b/usr.sbin/dumpfs/dumpfs.c
index ceb7d45a5869..242b27050bac 100644
--- a/usr.sbin/dumpfs/dumpfs.c
+++ b/usr.sbin/dumpfs/dumpfs.c
@@ -1,4 +1,4 @@
-/*	$NetBSD: dumpfs.c,v 1.49 2008/07/21 13:36:58 lukem Exp $	*/
+/*	$NetBSD: dumpfs.c,v 1.50 2008/07/31 05:38:07 simonb Exp $	*/
 
 /*
  * Copyright (c) 1983, 1992, 1993
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1983, 1992, 1993\
 #if 0
 static char sccsid[] = "@(#)dumpfs.c	8.5 (Berkeley) 4/29/95";
 #else
-__RCSID("$NetBSD: dumpfs.c,v 1.49 2008/07/21 13:36:58 lukem Exp $");
+__RCSID("$NetBSD: dumpfs.c,v 1.50 2008/07/31 05:38:07 simonb Exp $");
 #endif
 #endif /* not lint */
 
@@ -379,6 +379,13 @@ print_superblock(struct fs *fs, uint16_t *opostbl,
 	    fs->fs_old_csshift, fs->fs_old_csmask);
 	printf("\ncgrotor\t%d\tfmod\t%d\tronly\t%d\tclean\t0x%02x\n",
 	    fs->fs_cgrotor, fs->fs_fmod, fs->fs_ronly, fs->fs_clean);
+	printf("wapbl version 0x%x\tlocation %u\tflags 0x%x\n",
+	    fs->fs_journal_version, fs->fs_journal_location,
+	    fs->fs_journal_flags);
+	printf("wapbl loc0 %" PRIu64 "\tloc1 %" PRIu64,
+	    fs->fs_journallocs[0], fs->fs_journallocs[1]);
+	printf("\tloc2 %" PRIu64 "\tloc3 %" PRIu64 "\n",
+	    fs->fs_journallocs[2], fs->fs_journallocs[3]);
 	printf("flags\t");
 	if (fs->fs_flags == 0)
 		printf("none");
@@ -396,8 +403,11 @@ print_superblock(struct fs *fs, uint16_t *opostbl,
 		printf("multilabel ");
 	if (fs->fs_flags & FS_FLAGS_UPDATED)
 		printf("fs_flags expanded ");
-	fsflags = fs->fs_flags & ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK | FS_INDEXDIRS |
-			FS_ACLS | FS_MULTILABEL | FS_FLAGS_UPDATED);
+	if (fs->fs_flags & FS_DOWAPBL)
+		printf("wapbl ");
+	fsflags = fs->fs_flags & ~(FS_UNCLEAN | FS_DOSOFTDEP | FS_NEEDSFSCK |
+			FS_INDEXDIRS | FS_ACLS | FS_MULTILABEL |
+			FS_FLAGS_UPDATED | FS_DOWAPBL);
 	if (fsflags != 0)
 		printf("unknown flags (%#x)", fsflags);
 	printf("\nfsmnt\t%s\n", fs->fs_fsmnt);