NetBSD/sys/kern/vfs_wapbl.c
ad 430f67aa17 PR kern/39564 wapbl performance issues with disk cache flushing
PR kern/40361 WAPBL locking panic in -current
PR kern/40361 WAPBL locking panic in -current
PR kern/40470 WAPBL corrupts ext2fs
PR kern/40562 busy loop in ffs_sync when unmounting a file system
PR kern/40525 panic: ffs_valloc: dup alloc

- A fix for an issue that can lead to "ffs_valloc: dup" due to dirty cg
  buffers being invalidated. Problem discovered and patch by dholland@.

- If the syncer fails to lazily sync a vnode due to lock contention,
  retry 1 second later instead of 30 seconds later.

- Flush inode atime updates every ~10 seconds (this makes most sense with
  logging). Presently they didn't hit the disk for read-only files or
  devices until the file system was unmounted. It would be better to trickle
  the updates out but that would require more extensive changes.

- Fix issues with file system corruption, busy looping and other nasty
  problems when logging and non-logging file systems are intermixed,
  with one being the root file system.

- For logging, do not flush metadata on an inode-at-a-time basis if the sync
  has been requested by ioflush. Previously, we could try hundreds of log
  sync operations a second due to inode update activity, causing the syncer
  to fall behind and metadata updates to be serialized across the entire
  file system. Instead, burst out metadata and log flushes at a minimum
  interval of every 10 seconds on an active file system (happens more often
  if the log becomes full). Note this does not change the operation of
  fsync() etc.

- With the flush issue fixed, re-enable concurrent metadata updates in
  vfs_wapbl.c.
2009-02-22 20:10:25 +00:00

2680 lines
68 KiB
C

/* $NetBSD: vfs_wapbl.c,v 1.23 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This implements file system independent write ahead filesystem logging.
*/
#define WAPBL_INTERNAL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.23 2009/02/22 20:10:25 ad Exp $");
#include <sys/param.h>
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/atomic.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>
#include <miscfs/specfs/specdev.h>
#if 0 /* notyet */
#define wapbl_malloc(s) kmem_alloc((s), KM_SLEEP)
#define wapbl_free(a, s) kmem_free((a), (s))
#define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
#else
MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");
#define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
#define wapbl_free(a, s) free((a), M_WAPBL)
#define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)
#endif
#else /* !_KERNEL */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/wapbl.h>
#include <sys/wapbl_replay.h>
#define KDASSERT(x) assert(x)
#define KASSERT(x) assert(x)
#define wapbl_malloc(s) malloc(s)
#define wapbl_free(a, s) free(a)
#define wapbl_calloc(n, s) calloc((n), (s))
#endif /* !_KERNEL */
/*
* INTERNAL DATA STRUCTURES
*/
/*
* This structure holds per-mount log information.
*
* Legend: a = atomic access only
* r = read-only after init
* l = rwlock held
* m = mutex held
* u = unlocked access ok
* b = bufcache_lock held
*/
struct wapbl {
struct vnode *wl_logvp; /* r: log here */
struct vnode *wl_devvp; /* r: log on this device */
struct mount *wl_mount; /* r: mountpoint wl is associated with */
daddr_t wl_logpbn; /* r: Physical block number of start of log */
int wl_log_dev_bshift; /* r: logarithm of device block size of log
device */
int wl_fs_dev_bshift; /* r: logarithm of device block size of
filesystem device */
unsigned wl_lock_count; /* m: Count of transactions in progress */
size_t wl_circ_size; /* r: Number of bytes in buffer of log */
size_t wl_circ_off; /* r: Number of bytes reserved at start */
size_t wl_bufcount_max; /* r: Number of buffers reserved for log */
size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */
off_t wl_head; /* l: Byte offset of log head */
off_t wl_tail; /* l: Byte offset of log tail */
/*
* head == tail == 0 means log is empty
* head == tail != 0 means log is full
* see assertions in wapbl_advance() for other boundary conditions.
* only truncate moves the tail, except when flush sets it to
* wl_header_size only flush moves the head, except when truncate
* sets it to 0.
*/
struct wapbl_wc_header *wl_wc_header; /* l */
void *wl_wc_scratch; /* l: scratch space (XXX: por que?!?) */
kmutex_t wl_mtx; /* u: short-term lock */
krwlock_t wl_rwlock; /* u: File system transaction lock */
/*
* Must be held while accessing
* wl_count or wl_bufs or head or tail
*/
/*
* Callback called from within the flush routine to flush any extra
* bits. Note that flush may be skipped without calling this if
* there are no outstanding buffers in the transaction.
*/
#if _KERNEL
wapbl_flush_fn_t wl_flush; /* r */
wapbl_flush_fn_t wl_flush_abort;/* r */
#endif
size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */
size_t wl_bufcount; /* m: Count of buffers in wl_bufs */
size_t wl_bcount; /* m: Total bcount of wl_bufs */
LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */
kcondvar_t wl_reclaimable_cv; /* m (obviously) */
size_t wl_reclaimable_bytes; /* m: Amount of space available for
reclamation by truncate */
int wl_error_count; /* m: # of wl_entries with errors */
size_t wl_reserved_bytes; /* never truncate log smaller than this */
#ifdef WAPBL_DEBUG_BUFBYTES
size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif
daddr_t *wl_deallocblks;/* l: address of block */
int *wl_dealloclens; /* l: size of block */
int wl_dealloccnt; /* l: total count */
int wl_dealloclim; /* l: max count */
/* hashtable of inode numbers for allocated but unlinked inodes */
/* synch ??? */
LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
u_long wl_inohashmask;
int wl_inohashcnt;
SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
accounting */
};
#ifdef WAPBL_DEBUG_PRINT
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif
/****************************************************************/
#ifdef _KERNEL
#ifdef WAPBL_DEBUG
struct wapbl *wapbl_debug_wl;
#endif
static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */
static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
static __inline size_t wapbl_space_free(size_t avail, off_t head,
off_t tail);
static __inline size_t wapbl_space_used(size_t avail, off_t head,
off_t tail);
#ifdef _KERNEL
#define WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
struct wapbl_ino {
LIST_ENTRY(wapbl_ino) wi_hash;
ino_t wi_ino;
mode_t wi_mode;
};
static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
static size_t wapbl_transaction_len(struct wapbl *wl);
static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
#if 0
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif
static int wapbl_replay_isopen1(struct wapbl_replay *);
/*
* This is useful for debugging. If set, the log will
* only be truncated when necessary.
*/
int wapbl_lazy_truncate = 0;
struct wapbl_ops wapbl_ops = {
.wo_wapbl_discard = wapbl_discard,
.wo_wapbl_replay_isopen = wapbl_replay_isopen1,
.wo_wapbl_replay_can_read = wapbl_replay_can_read,
.wo_wapbl_replay_read = wapbl_replay_read,
.wo_wapbl_add_buf = wapbl_add_buf,
.wo_wapbl_remove_buf = wapbl_remove_buf,
.wo_wapbl_resize_buf = wapbl_resize_buf,
.wo_wapbl_begin = wapbl_begin,
.wo_wapbl_end = wapbl_end,
.wo_wapbl_junlock_assert= wapbl_junlock_assert,
/* XXX: the following is only used to say "this is a wapbl buf" */
.wo_wapbl_biodone = wapbl_biodone,
};
void
wapbl_init()
{
malloc_type_attach(M_WAPBL);
}
static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
int error, i;
WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));
/*
* Its only valid to reuse the replay log if its
* the same as the new log we just opened.
*/
KDASSERT(!wapbl_replay_isopen(wr));
KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
KASSERT(wl->wl_logpbn == wr->wr_logpbn);
KASSERT(wl->wl_circ_size == wr->wr_circ_size);
KASSERT(wl->wl_circ_off == wr->wr_circ_off);
KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);
wl->wl_wc_header->wc_generation = wr->wr_generation + 1;
for (i = 0; i < wr->wr_inodescnt; i++)
wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
wr->wr_inodes[i].wr_imode);
/* Make sure new transaction won't overwrite old inodes list */
KDASSERT(wapbl_transaction_len(wl) <=
wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
wr->wr_inodestail));
wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
wapbl_transaction_len(wl);
error = wapbl_write_inodes(wl, &wl->wl_head);
if (error)
return error;
KASSERT(wl->wl_head != wl->wl_tail);
KASSERT(wl->wl_head != 0);
return 0;
}
int
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
{
struct wapbl *wl;
struct vnode *devvp;
daddr_t logpbn;
int error;
int log_dev_bshift = DEV_BSHIFT;
int fs_dev_bshift = DEV_BSHIFT;
int run;
WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
" count=%zu blksize=%zu\n", vp, off, count, blksize));
if (log_dev_bshift > fs_dev_bshift) {
WAPBL_PRINTF(WAPBL_PRINT_OPEN,
("wapbl: log device's block size cannot be larger "
"than filesystem's\n"));
/*
* Not currently implemented, although it could be if
* needed someday.
*/
return ENOSYS;
}
if (off < 0)
return EINVAL;
if (blksize < DEV_BSIZE)
return EINVAL;
if (blksize % DEV_BSIZE)
return EINVAL;
/* XXXTODO: verify that the full load is writable */
/*
* XXX check for minimum log size
* minimum is governed by minimum amount of space
* to complete a transaction. (probably truncate)
*/
/* XXX for now pick something minimal */
if ((count * blksize) < MAXPHYS) {
return ENOSPC;
}
if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
return error;
}
wl = wapbl_calloc(1, sizeof(*wl));
rw_init(&wl->wl_rwlock);
mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&wl->wl_reclaimable_cv, "wapblrec");
LIST_INIT(&wl->wl_bufs);
SIMPLEQ_INIT(&wl->wl_entries);
wl->wl_logvp = vp;
wl->wl_devvp = devvp;
wl->wl_mount = mp;
wl->wl_logpbn = logpbn;
wl->wl_log_dev_bshift = log_dev_bshift;
wl->wl_fs_dev_bshift = fs_dev_bshift;
wl->wl_flush = flushfn;
wl->wl_flush_abort = flushabortfn;
/* Reserve two log device blocks for the commit headers */
wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
/* truncate the log usage to a multiple of log_dev_bshift */
wl->wl_circ_size >>= wl->wl_log_dev_bshift;
wl->wl_circ_size <<= wl->wl_log_dev_bshift;
/*
* wl_bufbytes_max limits the size of the in memory transaction space.
* - Since buffers are allocated and accounted for in units of
* PAGE_SIZE it is required to be a multiple of PAGE_SIZE
* (i.e. 1<<PAGE_SHIFT)
* - Since the log device has to be written in units of
* 1<<wl_log_dev_bshift it is required to be a mulitple of
* 1<<wl_log_dev_bshift.
* - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
* it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
* Therefore it must be multiple of the least common multiple of those
* three quantities. Fortunately, all of those quantities are
* guaranteed to be a power of two, and the least common multiple of
* a set of numbers which are all powers of two is simply the maximum
* of those numbers. Finally, the maximum logarithm of a power of two
* is the same as the log of the maximum power of two. So we can do
* the following operations to size wl_bufbytes_max:
*/
/* XXX fix actual number of pages reserved per filesystem. */
wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
/* Round wl_bufbytes_max to the largest power of two constraint */
wl->wl_bufbytes_max >>= PAGE_SHIFT;
wl->wl_bufbytes_max <<= PAGE_SHIFT;
wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
/* XXX maybe use filesystem fragment size instead of 1024 */
/* XXX fix actual number of buffers reserved per filesystem. */
wl->wl_bufcount_max = (nbuf / 2) * 1024;
/* XXX tie this into resource estimation */
wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max);
wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) *
wl->wl_dealloclim);
wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) *
wl->wl_dealloclim);
wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
/* Initialize the commit header */
{
struct wapbl_wc_header *wc;
size_t len = 1 << wl->wl_log_dev_bshift;
wc = wapbl_calloc(1, len);
wc->wc_type = WAPBL_WC_HEADER;
wc->wc_len = len;
wc->wc_circ_off = wl->wl_circ_off;
wc->wc_circ_size = wl->wl_circ_size;
/* XXX wc->wc_fsid */
wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
wl->wl_wc_header = wc;
wl->wl_wc_scratch = wapbl_malloc(len);
}
/*
* if there was an existing set of unlinked but
* allocated inodes, preserve it in the new
* log.
*/
if (wr && wr->wr_inodescnt) {
error = wapbl_start_flush_inodes(wl, wr);
if (error)
goto errout;
}
error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
if (error) {
goto errout;
}
*wlp = wl;
#if defined(WAPBL_DEBUG)
wapbl_debug_wl = wl;
#endif
return 0;
errout:
wapbl_discard(wl);
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_deallocblks,
sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
wapbl_free(wl->wl_dealloclens,
sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
wapbl_inodetrk_free(wl);
wapbl_free(wl, sizeof(*wl));
return error;
}
/*
* Like wapbl_flush, only discards the transaction
* completely
*/
void
wapbl_discard(struct wapbl *wl)
{
struct wapbl_entry *we;
struct buf *bp;
int i;
/*
* XXX we may consider using upgrade here
* if we want to call flush from inside a transaction
*/
rw_enter(&wl->wl_rwlock, RW_WRITER);
wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
wl->wl_dealloccnt);
#ifdef WAPBL_DEBUG_PRINT
{
struct wapbl_entry *we;
pid_t pid = -1;
lwpid_t lid = -1;
if (curproc)
pid = curproc->p_pid;
if (curlwp)
lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
("wapbl_discard: thread %d.%d discarding "
"transaction\n"
"\tbufcount=%zu bufbytes=%zu bcount=%zu "
"deallocs=%d inodes=%d\n"
"\terrcnt = %u, reclaimable=%zu reserved=%zu "
"unsynced=%zu\n",
pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
wl->wl_bcount, wl->wl_dealloccnt,
wl->wl_inohashcnt, wl->wl_error_count,
wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
wl->wl_unsynced_bufbytes));
SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
("\tentry: bufcount = %zu, reclaimable = %zu, "
"error = %d, unsynced = %zu\n",
we->we_bufcount, we->we_reclaimable_bytes,
we->we_error, we->we_unsynced_bufbytes));
}
#else /* !WAPBL_DEBUG_BUFBYTES */
WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
("wapbl_discard: thread %d.%d discarding transaction\n"
"\tbufcount=%zu bufbytes=%zu bcount=%zu "
"deallocs=%d inodes=%d\n"
"\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
wl->wl_bcount, wl->wl_dealloccnt,
wl->wl_inohashcnt, wl->wl_error_count,
wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
("\tentry: bufcount = %zu, reclaimable = %zu, "
"error = %d\n",
we->we_bufcount, we->we_reclaimable_bytes,
we->we_error));
}
#endif /* !WAPBL_DEBUG_BUFBYTES */
}
#endif /* WAPBL_DEBUG_PRINT */
for (i = 0; i <= wl->wl_inohashmask; i++) {
struct wapbl_ino_head *wih;
struct wapbl_ino *wi;
wih = &wl->wl_inohash[i];
while ((wi = LIST_FIRST(wih)) != NULL) {
LIST_REMOVE(wi, wi_hash);
pool_put(&wapbl_ino_pool, wi);
KASSERT(wl->wl_inohashcnt > 0);
wl->wl_inohashcnt--;
}
}
/*
* clean buffer list
*/
mutex_enter(&bufcache_lock);
mutex_enter(&wl->wl_mtx);
while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
/*
* The buffer will be unlocked and
* removed from the transaction in brelse
*/
mutex_exit(&wl->wl_mtx);
brelsel(bp, 0);
mutex_enter(&wl->wl_mtx);
}
}
mutex_exit(&wl->wl_mtx);
mutex_exit(&bufcache_lock);
/*
* Remove references to this wl from wl_entries, free any which
* no longer have buffers, others will be freed in wapbl_biodone
* when they no longer have any buffers.
*/
while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
/* XXX should we be accumulating wl_error_count
* and increasing reclaimable bytes ? */
we->we_wapbl = NULL;
if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
KASSERT(we->we_unsynced_bufbytes == 0);
#endif
wapbl_free(we, sizeof(*we));
}
}
/* Discard list of deallocs */
wl->wl_dealloccnt = 0;
/* XXX should we clear wl_reserved_bytes? */
KASSERT(wl->wl_bufbytes == 0);
KASSERT(wl->wl_bcount == 0);
KASSERT(wl->wl_bufcount == 0);
KASSERT(LIST_EMPTY(&wl->wl_bufs));
KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
KASSERT(wl->wl_inohashcnt == 0);
rw_exit(&wl->wl_rwlock);
}
int
wapbl_stop(struct wapbl *wl, int force)
{
struct vnode *vp;
int error;
WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
error = wapbl_flush(wl, 1);
if (error) {
if (force)
wapbl_discard(wl);
else
return error;
}
/* Unlinked inodes persist after a flush */
if (wl->wl_inohashcnt) {
if (force) {
wapbl_discard(wl);
} else {
return EBUSY;
}
}
KASSERT(wl->wl_bufbytes == 0);
KASSERT(wl->wl_bcount == 0);
KASSERT(wl->wl_bufcount == 0);
KASSERT(LIST_EMPTY(&wl->wl_bufs));
KASSERT(wl->wl_dealloccnt == 0);
KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
KASSERT(wl->wl_inohashcnt == 0);
vp = wl->wl_logvp;
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
wapbl_free(wl->wl_deallocblks,
sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim);
wapbl_free(wl->wl_dealloclens,
sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim);
wapbl_inodetrk_free(wl);
cv_destroy(&wl->wl_reclaimable_cv);
mutex_destroy(&wl->wl_mtx);
rw_destroy(&wl->wl_rwlock);
wapbl_free(wl, sizeof(*wl));
return 0;
}
static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
struct pstats *pstats = curlwp->l_proc->p_stats;
struct buf *bp;
int error;
KASSERT((flags & ~(B_WRITE | B_READ)) == 0);
KASSERT(devvp->v_type == VBLK);
if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
mutex_enter(&devvp->v_interlock);
devvp->v_numoutput++;
mutex_exit(&devvp->v_interlock);
pstats->p_ru.ru_oublock++;
} else {
pstats->p_ru.ru_inblock++;
}
bp = getiobuf(devvp, true);
bp->b_flags = flags;
bp->b_cflags = BC_BUSY; /* silly & dubious */
bp->b_dev = devvp->v_rdev;
bp->b_data = data;
bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
bp->b_blkno = pbn;
WAPBL_PRINTF(WAPBL_PRINT_IO,
("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n",
BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
bp->b_blkno, bp->b_dev));
VOP_STRATEGY(devvp, bp);
error = biowait(bp);
putiobuf(bp);
if (error) {
WAPBL_PRINTF(WAPBL_PRINT_ERROR,
("wapbl_doio: %s %zu bytes at block %" PRId64
" on dev 0x%x failed with error %d\n",
(((flags & (B_WRITE | B_READ)) == B_WRITE) ?
"write" : "read"),
len, pbn, devvp->v_rdev, error));
}
return error;
}
int
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{
return wapbl_doio(data, len, devvp, pbn, B_WRITE);
}
int
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
{
return wapbl_doio(data, len, devvp, pbn, B_READ);
}
/*
* Off is byte offset returns new offset for next write
* handles log wraparound
*/
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
size_t slen;
off_t off = *offp;
int error;
KDASSERT(((len >> wl->wl_log_dev_bshift) <<
wl->wl_log_dev_bshift) == len);
if (off < wl->wl_circ_off)
off = wl->wl_circ_off;
slen = wl->wl_circ_off + wl->wl_circ_size - off;
if (slen < len) {
error = wapbl_write(data, slen, wl->wl_devvp,
wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
if (error)
return error;
data = (uint8_t *)data + slen;
len -= slen;
off = wl->wl_circ_off;
}
error = wapbl_write(data, len, wl->wl_devvp,
wl->wl_logpbn + (off >> wl->wl_log_dev_bshift));
if (error)
return error;
off += len;
if (off >= wl->wl_circ_off + wl->wl_circ_size)
off = wl->wl_circ_off;
*offp = off;
return 0;
}
/****************************************************************/
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
int doflush;
unsigned lockcount;
KDASSERT(wl);
/*
* XXX this needs to be made much more sophisticated.
* perhaps each wapbl_begin could reserve a specified
* number of buffers and bytes.
*/
mutex_enter(&wl->wl_mtx);
lockcount = wl->wl_lock_count;
doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
wl->wl_bufbytes_max / 2) ||
((wl->wl_bufcount + (lockcount * 10)) >
wl->wl_bufcount_max / 2) ||
(wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
mutex_exit(&wl->wl_mtx);
if (doflush) {
WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
("force flush lockcnt=%d bufbytes=%zu "
"(max=%zu) bufcount=%zu (max=%zu)\n",
lockcount, wl->wl_bufbytes,
wl->wl_bufbytes_max, wl->wl_bufcount,
wl->wl_bufcount_max));
}
if (doflush) {
int error = wapbl_flush(wl, 0);
if (error)
return error;
}
rw_enter(&wl->wl_rwlock, RW_READER);
mutex_enter(&wl->wl_mtx);
wl->wl_lock_count++;
mutex_exit(&wl->wl_mtx);
#if defined(WAPBL_DEBUG_PRINT)
WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
("wapbl_begin thread %d.%d with bufcount=%zu "
"bufbytes=%zu bcount=%zu at %s:%d\n",
curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif
return 0;
}
void
wapbl_end(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG_PRINT)
WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
("wapbl_end thread %d.%d with bufcount=%zu "
"bufbytes=%zu bcount=%zu\n",
curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
wl->wl_bufbytes, wl->wl_bcount));
#endif
mutex_enter(&wl->wl_mtx);
KASSERT(wl->wl_lock_count > 0);
wl->wl_lock_count--;
mutex_exit(&wl->wl_mtx);
rw_exit(&wl->wl_rwlock);
}
void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{
KASSERT(bp->b_cflags & BC_BUSY);
KASSERT(bp->b_vp);
wapbl_jlock_assert(wl);
#if 0
/*
* XXX this might be an issue for swapfiles.
* see uvm_swap.c:1702
*
* XXX2 why require it then? leap of semantics?
*/
KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif
mutex_enter(&wl->wl_mtx);
if (bp->b_flags & B_LOCKED) {
LIST_REMOVE(bp, b_wapbllist);
WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
("wapbl_add_buf thread %d.%d re-adding buf %p "
"with %d bytes %d bcount\n",
curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
bp->b_bcount));
} else {
/* unlocked by dirty buffers shouldn't exist */
KASSERT(!(bp->b_oflags & BO_DELWRI));
wl->wl_bufbytes += bp->b_bufsize;
wl->wl_bcount += bp->b_bcount;
wl->wl_bufcount++;
WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
("wapbl_add_buf thread %d.%d adding buf %p "
"with %d bytes %d bcount\n",
curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
bp->b_bcount));
}
LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
mutex_exit(&wl->wl_mtx);
bp->b_flags |= B_LOCKED;
}
static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{
KASSERT(mutex_owned(&wl->wl_mtx));
KASSERT(bp->b_cflags & BC_BUSY);
wapbl_jlock_assert(wl);
#if 0
/*
* XXX this might be an issue for swapfiles.
* see uvm_swap.c:1725
*
* XXXdeux: see above
*/
KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
KASSERT(bp->b_flags & B_LOCKED);
WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
("wapbl_remove_buf thread %d.%d removing buf %p with "
"%d bytes %d bcount\n",
curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));
KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
wl->wl_bufbytes -= bp->b_bufsize;
KASSERT(wl->wl_bcount >= bp->b_bcount);
wl->wl_bcount -= bp->b_bcount;
KASSERT(wl->wl_bufcount > 0);
wl->wl_bufcount--;
KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
LIST_REMOVE(bp, b_wapbllist);
bp->b_flags &= ~B_LOCKED;
}
/* called from brelsel() in vfs_bio among other places */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{
mutex_enter(&wl->wl_mtx);
wapbl_remove_buf_locked(wl, bp);
mutex_exit(&wl->wl_mtx);
}
void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{
KASSERT(bp->b_cflags & BC_BUSY);
/*
* XXX: why does this depend on B_LOCKED? otherwise the buf
* is not for a transaction? if so, why is this called in the
* first place?
*/
if (bp->b_flags & B_LOCKED) {
mutex_enter(&wl->wl_mtx);
wl->wl_bufbytes += bp->b_bufsize - oldsz;
wl->wl_bcount += bp->b_bcount - oldcnt;
mutex_exit(&wl->wl_mtx);
}
}
#endif /* _KERNEL */
/****************************************************************/
/* Some utility inlines */
/* This is used to advance the pointer at old to new value at old+delta */
static __inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
off_t new;
/* Define acceptable ranges for inputs. */
KASSERT(delta <= size);
KASSERT((old == 0) || (old >= off));
KASSERT(old < (size + off));
if ((old == 0) && (delta != 0))
new = off + delta;
else if ((old + delta) < (size + off))
new = old + delta;
else
new = (old + delta) - size;
/* Note some interesting axioms */
KASSERT((delta != 0) || (new == old));
KASSERT((delta == 0) || (new != 0));
KASSERT((delta != (size)) || (new == old));
/* Define acceptable ranges for output. */
KASSERT((new == 0) || (new >= off));
KASSERT(new < (size + off));
return new;
}
static __inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{
if (tail == 0) {
KASSERT(head == 0);
return 0;
}
return ((head + (avail - 1) - tail) % avail) + 1;
}
static __inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{
return avail - wapbl_space_used(avail, head, tail);
}
static __inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
off_t *tailp)
{
off_t head = *headp;
off_t tail = *tailp;
KASSERT(delta <= wapbl_space_free(size, head, tail));
head = wapbl_advance(size, off, head, delta);
if ((tail == 0) && (head != 0))
tail = off;
*headp = head;
*tailp = tail;
}
static __inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
off_t *tailp)
{
off_t head = *headp;
off_t tail = *tailp;
KASSERT(delta <= wapbl_space_used(size, head, tail));
tail = wapbl_advance(size, off, tail, delta);
if (head == tail) {
head = tail = 0;
}
*headp = head;
*tailp = tail;
}
#ifdef _KERNEL
/****************************************************************/
/*
* Remove transactions whose buffers are completely flushed to disk.
* Will block until at least minfree space is available.
* only intended to be called from inside wapbl_flush and therefore
* does not protect against commit races with itself or with flush.
*/
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
size_t delta;
size_t avail;
off_t head;
off_t tail;
int error = 0;
KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
KASSERT(rw_write_held(&wl->wl_rwlock));
mutex_enter(&wl->wl_mtx);
/*
* First check to see if we have to do a commit
* at all.
*/
avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
if (minfree < avail) {
mutex_exit(&wl->wl_mtx);
return 0;
}
minfree -= avail;
while ((wl->wl_error_count == 0) &&
(wl->wl_reclaimable_bytes < minfree)) {
WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
"minfree=%zd\n",
&wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
minfree));
cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
}
if (wl->wl_reclaimable_bytes < minfree) {
KASSERT(wl->wl_error_count);
/* XXX maybe get actual error from buffer instead someday? */
error = EIO;
}
head = wl->wl_head;
tail = wl->wl_tail;
delta = wl->wl_reclaimable_bytes;
/* If all of of the entries are flushed, then be sure to keep
* the reserved bytes reserved. Watch out for discarded transactions,
* which could leave more bytes reserved than are reclaimable.
*/
if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
(delta >= wl->wl_reserved_bytes)) {
delta -= wl->wl_reserved_bytes;
}
wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
&tail);
KDASSERT(wl->wl_reserved_bytes <=
wapbl_space_used(wl->wl_circ_size, head, tail));
mutex_exit(&wl->wl_mtx);
if (error)
return error;
if (waitonly)
return 0;
/*
* This is where head, tail and delta are unprotected
* from races against itself or flush. This is ok since
* we only call this routine from inside flush itself.
*
* XXX: how can it race against itself when accessed only
* from behind the write-locked rwlock?
*/
error = wapbl_write_commit(wl, head, tail);
if (error)
return error;
wl->wl_head = head;
wl->wl_tail = tail;
mutex_enter(&wl->wl_mtx);
KASSERT(wl->wl_reclaimable_bytes >= delta);
wl->wl_reclaimable_bytes -= delta;
mutex_exit(&wl->wl_mtx);
WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
("wapbl_truncate thread %d.%d truncating %zu bytes\n",
curproc->p_pid, curlwp->l_lid, delta));
return 0;
}
/****************************************************************/
void
wapbl_biodone(struct buf *bp)
{
struct wapbl_entry *we = bp->b_private;
struct wapbl *wl = we->we_wapbl;
/*
* Handle possible flushing of buffers after log has been
* decomissioned.
*/
if (!wl) {
KASSERT(we->we_bufcount > 0);
we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif
if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
KASSERT(we->we_unsynced_bufbytes == 0);
#endif
wapbl_free(we, sizeof(*we));
}
brelse(bp, 0);
return;
}
#ifdef ohbother
KDASSERT(bp->b_flags & B_DONE);
KDASSERT(!(bp->b_flags & B_DELWRI));
KDASSERT(bp->b_flags & B_ASYNC);
KDASSERT(bp->b_flags & B_BUSY);
KDASSERT(!(bp->b_flags & B_LOCKED));
KDASSERT(!(bp->b_flags & B_READ));
KDASSERT(!(bp->b_flags & B_INVAL));
KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif
if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
XXXpooka: interfaces not fully updated
Note: this was not enabled in the original patch
against netbsd4 either. I don't know if comment
above is true or not.
/*
* If an error occurs, report the error and leave the
* buffer as a delayed write on the LRU queue.
* restarting the write would likely result in
* an error spinloop, so let it be done harmlessly
* by the syncer.
*/
bp->b_flags &= ~(B_DONE);
simple_unlock(&bp->b_interlock);
if (we->we_error == 0) {
mutex_enter(&wl->wl_mtx);
wl->wl_error_count++;
mutex_exit(&wl->wl_mtx);
cv_broadcast(&wl->wl_reclaimable_cv);
}
we->we_error = bp->b_error;
bp->b_error = 0;
brelse(bp);
return;
#else
/* For now, just mark the log permanently errored out */
mutex_enter(&wl->wl_mtx);
if (wl->wl_error_count == 0) {
wl->wl_error_count++;
cv_broadcast(&wl->wl_reclaimable_cv);
}
mutex_exit(&wl->wl_mtx);
#endif
}
mutex_enter(&wl->wl_mtx);
KASSERT(we->we_bufcount > 0);
we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
we->we_unsynced_bufbytes -= bp->b_bufsize;
KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif
/*
* If the current transaction can be reclaimed, start
* at the beginning and reclaim any consecutive reclaimable
* transactions. If we successfully reclaim anything,
* then wakeup anyone waiting for the reclaim.
*/
if (we->we_bufcount == 0) {
size_t delta = 0;
int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
/*
* clear any posted error, since the buffer it came from
* has successfully flushed by now
*/
while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
(we->we_bufcount == 0)) {
delta += we->we_reclaimable_bytes;
if (we->we_error)
errcnt++;
SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
wapbl_free(we, sizeof(*we));
}
if (delta) {
wl->wl_reclaimable_bytes += delta;
KASSERT(wl->wl_error_count >= errcnt);
wl->wl_error_count -= errcnt;
cv_broadcast(&wl->wl_reclaimable_cv);
}
}
mutex_exit(&wl->wl_mtx);
brelse(bp, 0);
}
/*
* Write transactions to disk + start I/O for contents
*/
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
struct buf *bp;
struct wapbl_entry *we;
off_t off;
off_t head;
off_t tail;
size_t delta = 0;
size_t flushsize;
size_t reserved;
int error = 0;
/*
* Do a quick check to see if a full flush can be skipped
* This assumes that the flush callback does not need to be called
* unless there are other outstanding bufs.
*/
if (!waitfor) {
size_t nbufs;
mutex_enter(&wl->wl_mtx); /* XXX need mutex here to
protect the KASSERTS */
nbufs = wl->wl_bufcount;
KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
mutex_exit(&wl->wl_mtx);
if (nbufs == 0)
return 0;
}
/*
* XXX we may consider using LK_UPGRADE here
* if we want to call flush from inside a transaction
*/
rw_enter(&wl->wl_rwlock, RW_WRITER);
wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens,
wl->wl_dealloccnt);
/*
* Now that we are fully locked and flushed,
* do another check for nothing to do.
*/
if (wl->wl_bufcount == 0) {
goto out;
}
#if 0
WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
("wapbl_flush thread %d.%d flushing entries with "
"bufcount=%zu bufbytes=%zu\n",
curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
wl->wl_bufbytes));
#endif
/* Calculate amount of space needed to flush */
flushsize = wapbl_transaction_len(wl);
if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
/*
* XXX this could be handled more gracefully, perhaps place
* only a partial transaction in the log and allow the
* remaining to flush without the protection of the journal.
*/
panic("wapbl_flush: current transaction too big to flush\n");
}
error = wapbl_truncate(wl, flushsize, 0);
if (error)
goto out2;
off = wl->wl_head;
KASSERT((off == 0) || ((off >= wl->wl_circ_off) &&
(off < wl->wl_circ_off + wl->wl_circ_size)));
error = wapbl_write_blocks(wl, &off);
if (error)
goto out2;
error = wapbl_write_revocations(wl, &off);
if (error)
goto out2;
error = wapbl_write_inodes(wl, &off);
if (error)
goto out2;
reserved = 0;
if (wl->wl_inohashcnt)
reserved = wapbl_transaction_inodes_len(wl);
head = wl->wl_head;
tail = wl->wl_tail;
wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
&head, &tail);
#ifdef WAPBL_DEBUG
if (head != off) {
panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX
" off=%"PRIdMAX" flush=%zu\n",
(intmax_t)head, (intmax_t)tail, (intmax_t)off,
flushsize);
}
#else
KASSERT(head == off);
#endif
/* Opportunistically move the tail forward if we can */
if (!wapbl_lazy_truncate) {
mutex_enter(&wl->wl_mtx);
delta = wl->wl_reclaimable_bytes;
mutex_exit(&wl->wl_mtx);
wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
&head, &tail);
}
error = wapbl_write_commit(wl, head, tail);
if (error)
goto out2;
we = wapbl_calloc(1, sizeof(*we));
#ifdef WAPBL_DEBUG_BUFBYTES
WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
" unsynced=%zu"
"\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
"inodes=%d\n",
curproc->p_pid, curlwp->l_lid, flushsize, delta,
wapbl_space_used(wl->wl_circ_size, head, tail),
wl->wl_unsynced_bufbytes, wl->wl_bufcount,
wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
wl->wl_inohashcnt));
#else
WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
"\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
"inodes=%d\n",
curproc->p_pid, curlwp->l_lid, flushsize, delta,
wapbl_space_used(wl->wl_circ_size, head, tail),
wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif
mutex_enter(&bufcache_lock);
mutex_enter(&wl->wl_mtx);
wl->wl_reserved_bytes = reserved;
wl->wl_head = head;
wl->wl_tail = tail;
KASSERT(wl->wl_reclaimable_bytes >= delta);
wl->wl_reclaimable_bytes -= delta;
wl->wl_dealloccnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif
we->we_wapbl = wl;
we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
we->we_reclaimable_bytes = flushsize;
we->we_error = 0;
SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);
/*
* this flushes bufs in reverse order than they were queued
* it shouldn't matter, but if we care we could use TAILQ instead.
* XXX Note they will get put on the lru queue when they flush
* so we might actually want to change this to preserve order.
*/
while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) {
if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
continue;
}
bp->b_iodone = wapbl_biodone;
bp->b_private = we;
bremfree(bp);
wapbl_remove_buf_locked(wl, bp);
mutex_exit(&wl->wl_mtx);
mutex_exit(&bufcache_lock);
bawrite(bp);
mutex_enter(&bufcache_lock);
mutex_enter(&wl->wl_mtx);
}
mutex_exit(&wl->wl_mtx);
mutex_exit(&bufcache_lock);
#if 0
WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
("wapbl_flush thread %d.%d done flushing entries...\n",
curproc->p_pid, curlwp->l_lid));
#endif
out:
/*
* If the waitfor flag is set, don't return until everything is
* fully flushed and the on disk log is empty.
*/
if (waitfor) {
error = wapbl_truncate(wl, wl->wl_circ_size -
wl->wl_reserved_bytes, wapbl_lazy_truncate);
}
out2:
if (error) {
wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks,
wl->wl_dealloclens, wl->wl_dealloccnt);
}
#ifdef WAPBL_DEBUG_PRINT
if (error) {
pid_t pid = -1;
lwpid_t lid = -1;
if (curproc)
pid = curproc->p_pid;
if (curlwp)
lid = curlwp->l_lid;
mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
WAPBL_PRINTF(WAPBL_PRINT_ERROR,
("wapbl_flush: thread %d.%d aborted flush: "
"error = %d\n"
"\tbufcount=%zu bufbytes=%zu bcount=%zu "
"deallocs=%d inodes=%d\n"
"\terrcnt = %d, reclaimable=%zu reserved=%zu "
"unsynced=%zu\n",
pid, lid, error, wl->wl_bufcount,
wl->wl_bufbytes, wl->wl_bcount,
wl->wl_dealloccnt, wl->wl_inohashcnt,
wl->wl_error_count, wl->wl_reclaimable_bytes,
wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
WAPBL_PRINTF(WAPBL_PRINT_ERROR,
("\tentry: bufcount = %zu, reclaimable = %zu, "
"error = %d, unsynced = %zu\n",
we->we_bufcount, we->we_reclaimable_bytes,
we->we_error, we->we_unsynced_bufbytes));
}
#else
WAPBL_PRINTF(WAPBL_PRINT_ERROR,
("wapbl_flush: thread %d.%d aborted flush: "
"error = %d\n"
"\tbufcount=%zu bufbytes=%zu bcount=%zu "
"deallocs=%d inodes=%d\n"
"\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
pid, lid, error, wl->wl_bufcount,
wl->wl_bufbytes, wl->wl_bcount,
wl->wl_dealloccnt, wl->wl_inohashcnt,
wl->wl_error_count, wl->wl_reclaimable_bytes,
wl->wl_reserved_bytes));
SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
WAPBL_PRINTF(WAPBL_PRINT_ERROR,
("\tentry: bufcount = %zu, reclaimable = %zu, "
"error = %d\n", we->we_bufcount,
we->we_reclaimable_bytes, we->we_error));
}
#endif
mutex_exit(&wl->wl_mtx);
}
#endif
rw_exit(&wl->wl_rwlock);
return error;
}
/****************************************************************/
void
wapbl_jlock_assert(struct wapbl *wl)
{
KASSERT(rw_lock_held(&wl->wl_rwlock));
}
void
wapbl_junlock_assert(struct wapbl *wl)
{
KASSERT(!rw_write_held(&wl->wl_rwlock));
}
/****************************************************************/
/* locks missing */
void
wapbl_print(struct wapbl *wl,
int full,
void (*pr)(const char *, ...))
{
struct buf *bp;
struct wapbl_entry *we;
(*pr)("wapbl %p", wl);
(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
wl->wl_circ_size, wl->wl_circ_off,
(intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
"reserved = %zu errcnt = %d unsynced = %zu\n",
wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
"reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
wl->wl_error_count);
#endif
(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
wl->wl_dealloccnt, wl->wl_dealloclim);
(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
wl->wl_inohashcnt, wl->wl_inohashmask);
(*pr)("entries:\n");
SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
"unsynced = %zu\n",
we->we_bufcount, we->we_reclaimable_bytes,
we->we_error, we->we_unsynced_bufbytes);
#else
(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
}
if (full) {
int cnt = 0;
(*pr)("bufs =");
LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
if (!LIST_NEXT(bp, b_wapbllist)) {
(*pr)(" %p", bp);
} else if ((++cnt % 6) == 0) {
(*pr)(" %p,\n\t", bp);
} else {
(*pr)(" %p,", bp);
}
}
(*pr)("\n");
(*pr)("dealloced blks = ");
{
int i;
cnt = 0;
for (i = 0; i < wl->wl_dealloccnt; i++) {
(*pr)(" %"PRId64":%d,",
wl->wl_deallocblks[i],
wl->wl_dealloclens[i]);
if ((++cnt % 4) == 0) {
(*pr)("\n\t");
}
}
}
(*pr)("\n");
(*pr)("registered inodes = ");
{
int i;
cnt = 0;
for (i = 0; i <= wl->wl_inohashmask; i++) {
struct wapbl_ino_head *wih;
struct wapbl_ino *wi;
wih = &wl->wl_inohash[i];
LIST_FOREACH(wi, wih, wi_hash) {
if (wi->wi_ino == 0)
continue;
(*pr)(" %"PRId32"/0%06"PRIo32",",
wi->wi_ino, wi->wi_mode);
if ((++cnt % 4) == 0) {
(*pr)("\n\t");
}
}
}
(*pr)("\n");
}
}
}
#if defined(WAPBL_DEBUG) || defined(DDB)
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
if (!wl)
wl = wapbl_debug_wl;
#endif
if (!wl)
return;
wapbl_print(wl, 1, printf);
}
#endif
/****************************************************************/
void
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len)
{
wapbl_jlock_assert(wl);
/* XXX should eventually instead tie this into resource estimation */
/* XXX this KASSERT needs locking/mutex analysis */
KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim);
wl->wl_deallocblks[wl->wl_dealloccnt] = blk;
wl->wl_dealloclens[wl->wl_dealloccnt] = len;
wl->wl_dealloccnt++;
WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len));
}
/****************************************************************/
static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{
wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
"wapblinopl", &pool_allocator_nointr, IPL_NONE);
}
}
static void
wapbl_inodetrk_free(struct wapbl *wl)
{
/* XXX this KASSERT needs locking/mutex analysis */
KASSERT(wl->wl_inohashcnt == 0);
hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
pool_destroy(&wapbl_ino_pool);
}
}
static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
struct wapbl_ino_head *wih;
struct wapbl_ino *wi;
KASSERT(mutex_owned(&wl->wl_mtx));
wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
LIST_FOREACH(wi, wih, wi_hash) {
if (ino == wi->wi_ino)
return wi;
}
return 0;
}
void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
struct wapbl_ino_head *wih;
struct wapbl_ino *wi;
wi = pool_get(&wapbl_ino_pool, PR_WAITOK);
mutex_enter(&wl->wl_mtx);
if (wapbl_inodetrk_get(wl, ino) == NULL) {
wi->wi_ino = ino;
wi->wi_mode = mode;
wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
LIST_INSERT_HEAD(wih, wi, wi_hash);
wl->wl_inohashcnt++;
WAPBL_PRINTF(WAPBL_PRINT_INODE,
("wapbl_register_inode: ino=%"PRId64"\n", ino));
mutex_exit(&wl->wl_mtx);
} else {
mutex_exit(&wl->wl_mtx);
pool_put(&wapbl_ino_pool, wi);
}
}
void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
struct wapbl_ino *wi;
mutex_enter(&wl->wl_mtx);
wi = wapbl_inodetrk_get(wl, ino);
if (wi) {
WAPBL_PRINTF(WAPBL_PRINT_INODE,
("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
KASSERT(wl->wl_inohashcnt > 0);
wl->wl_inohashcnt--;
LIST_REMOVE(wi, wi_hash);
mutex_exit(&wl->wl_mtx);
pool_put(&wapbl_ino_pool, wi);
} else {
mutex_exit(&wl->wl_mtx);
}
}
/****************************************************************/
static __inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
int blocklen = 1<<wl->wl_log_dev_bshift;
int iph;
/* Calculate number of inodes described in a inodelist header */
iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
KASSERT(iph > 0);
return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen;
}
/* Calculate amount of space a transaction will take on disk */
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
int blocklen = 1<<wl->wl_log_dev_bshift;
size_t len;
int bph;
/* Calculate number of blocks described in a blocklist header */
bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
KASSERT(bph > 0);
len = wl->wl_bcount;
len += howmany(wl->wl_bufcount, bph)*blocklen;
len += howmany(wl->wl_dealloccnt, bph)*blocklen;
len += wapbl_transaction_inodes_len(wl);
return len;
}
/*
* Perform commit operation
*
* Note that generation number incrementation needs to
* be protected against racing with other invocations
* of wapbl_commit. This is ok since this routine
* is only invoked from wapbl_flush
*/
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
struct wapbl_wc_header *wc = wl->wl_wc_header;
struct timespec ts;
int error;
int force = 1;
/* XXX Calc checksum here, instead we do this for now */
error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
if (error) {
WAPBL_PRINTF(WAPBL_PRINT_ERROR,
("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
"returned %d\n", wl->wl_devvp->v_rdev, error));
}
wc->wc_head = head;
wc->wc_tail = tail;
wc->wc_checksum = 0;
wc->wc_version = 1;
getnanotime(&ts);
wc->wc_time = ts.tv_sec;
wc->wc_timensec = ts.tv_nsec;
WAPBL_PRINTF(WAPBL_PRINT_WRITE,
("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
(intmax_t)head, (intmax_t)tail));
/*
* XXX if generation will rollover, then first zero
* over second commit header before trying to write both headers.
*/
error = wapbl_write(wc, wc->wc_len, wl->wl_devvp,
wl->wl_logpbn + wc->wc_generation % 2);
if (error)
return error;
error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);
if (error) {
WAPBL_PRINTF(WAPBL_PRINT_ERROR,
("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x "
"returned %d\n", wl->wl_devvp->v_rdev, error));
}
/*
* If the generation number was zero, write it out a second time.
* This handles initialization and generation number rollover
*/
if (wc->wc_generation++ == 0) {
error = wapbl_write_commit(wl, head, tail);
/*
* This panic should be able to be removed if we do the
* zero'ing mentioned above, and we are certain to roll
* back generation number on failure.
*/
if (error)
panic("wapbl_write_commit: error writing duplicate "
"log header: %d\n", error);
}
return 0;
}
/* Returns new offset value */
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
struct wapbl_wc_blocklist *wc =
(struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
int blocklen = 1<<wl->wl_log_dev_bshift;
int bph;
struct buf *bp;
off_t off = *offp;
int error;
size_t padding;
KASSERT(rw_write_held(&wl->wl_rwlock));
bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
bp = LIST_FIRST(&wl->wl_bufs);
while (bp) {
int cnt;
struct buf *obp = bp;
KASSERT(bp->b_flags & B_LOCKED);
wc->wc_type = WAPBL_WC_BLOCKS;
wc->wc_len = blocklen;
wc->wc_blkcount = 0;
while (bp && (wc->wc_blkcount < bph)) {
/*
* Make sure all the physical block numbers are up to
* date. If this is not always true on a given
* filesystem, then VOP_BMAP must be called. We
* could call VOP_BMAP here, or else in the filesystem
* specific flush callback, although neither of those
* solutions allow us to take the vnode lock. If a
* filesystem requires that we must take the vnode lock
* to call VOP_BMAP, then we can probably do it in
* bwrite when the vnode lock should already be held
* by the invoking code.
*/
KASSERT((bp->b_vp->v_type == VBLK) ||
(bp->b_blkno != bp->b_lblkno));
KASSERT(bp->b_blkno > 0);
wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
wc->wc_len += bp->b_bcount;
wc->wc_blkcount++;
bp = LIST_NEXT(bp, b_wapbllist);
}
if (wc->wc_len % blocklen != 0) {
padding = blocklen - wc->wc_len % blocklen;
wc->wc_len += padding;
} else {
padding = 0;
}
WAPBL_PRINTF(WAPBL_PRINT_WRITE,
("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
wc->wc_len, padding, (intmax_t)off));
error = wapbl_circ_write(wl, wc, blocklen, &off);
if (error)
return error;
bp = obp;
cnt = 0;
while (bp && (cnt++ < bph)) {
error = wapbl_circ_write(wl, bp->b_data,
bp->b_bcount, &off);
if (error)
return error;
bp = LIST_NEXT(bp, b_wapbllist);
}
if (padding) {
void *zero;
zero = wapbl_malloc(padding);
memset(zero, 0, padding);
error = wapbl_circ_write(wl, zero, padding, &off);
wapbl_free(zero, padding);
if (error)
return error;
}
}
*offp = off;
return 0;
}
static int
wapbl_write_revocations(struct wapbl *wl, off_t *offp)
{
struct wapbl_wc_blocklist *wc =
(struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
int i;
int blocklen = 1<<wl->wl_log_dev_bshift;
int bph;
off_t off = *offp;
int error;
if (wl->wl_dealloccnt == 0)
return 0;
bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
i = 0;
while (i < wl->wl_dealloccnt) {
wc->wc_type = WAPBL_WC_REVOCATIONS;
wc->wc_len = blocklen;
wc->wc_blkcount = 0;
while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) {
wc->wc_blocks[wc->wc_blkcount].wc_daddr =
wl->wl_deallocblks[i];
wc->wc_blocks[wc->wc_blkcount].wc_dlen =
wl->wl_dealloclens[i];
wc->wc_blkcount++;
i++;
}
WAPBL_PRINTF(WAPBL_PRINT_WRITE,
("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
wc->wc_len, (intmax_t)off));
error = wapbl_circ_write(wl, wc, blocklen, &off);
if (error)
return error;
}
*offp = off;
return 0;
}
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
struct wapbl_wc_inodelist *wc =
(struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
int i;
int blocklen = 1 << wl->wl_log_dev_bshift;
off_t off = *offp;
int error;
struct wapbl_ino_head *wih;
struct wapbl_ino *wi;
int iph;
iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);
i = 0;
wih = &wl->wl_inohash[0];
wi = 0;
do {
wc->wc_type = WAPBL_WC_INODES;
wc->wc_len = blocklen;
wc->wc_inocnt = 0;
wc->wc_clear = (i == 0);
while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
while (!wi) {
KASSERT((wih - &wl->wl_inohash[0])
<= wl->wl_inohashmask);
wi = LIST_FIRST(wih++);
}
wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
wc->wc_inocnt++;
i++;
wi = LIST_NEXT(wi, wi_hash);
}
WAPBL_PRINTF(WAPBL_PRINT_WRITE,
("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
wc->wc_len, (intmax_t)off));
error = wapbl_circ_write(wl, wc, blocklen, &off);
if (error)
return error;
} while (i < wl->wl_inohashcnt);
*offp = off;
return 0;
}
#endif /* _KERNEL */
/****************************************************************/
struct wapbl_blk {
LIST_ENTRY(wapbl_blk) wb_hash;
daddr_t wb_blk;
off_t wb_off; /* Offset of this block in the log */
};
#define WAPBL_BLKPOOL_MIN 83
static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
if (size < WAPBL_BLKPOOL_MIN)
size = WAPBL_BLKPOOL_MIN;
KASSERT(wr->wr_blkhash == 0);
#ifdef _KERNEL
wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
/* Manually implement hashinit */
{
int i;
unsigned long hashsize;
for (hashsize = 1; hashsize < size; hashsize <<= 1)
continue;
wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash));
for (i = 0; i < wr->wr_blkhashmask; i++)
LIST_INIT(&wr->wr_blkhash[i]);
wr->wr_blkhashmask = hashsize - 1;
}
#endif /* ! _KERNEL */
}
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
wapbl_free(wr->wr_blkhash,
(wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
static struct wapbl_blk *
wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
{
struct wapbl_blk_head *wbh;
struct wapbl_blk *wb;
wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
LIST_FOREACH(wb, wbh, wb_hash) {
if (blk == wb->wb_blk)
return wb;
}
return 0;
}
static void
wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
{
struct wapbl_blk_head *wbh;
struct wapbl_blk *wb;
wb = wapbl_blkhash_get(wr, blk);
if (wb) {
KASSERT(wb->wb_blk == blk);
wb->wb_off = off;
} else {
wb = wapbl_malloc(sizeof(*wb));
wb->wb_blk = blk;
wb->wb_off = off;
wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
LIST_INSERT_HEAD(wbh, wb, wb_hash);
wr->wr_blkhashcnt++;
}
}
static void
wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
{
struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
if (wb) {
KASSERT(wr->wr_blkhashcnt > 0);
wr->wr_blkhashcnt--;
LIST_REMOVE(wb, wb_hash);
wapbl_free(wb, sizeof(*wb));
}
}
static void
wapbl_blkhash_clear(struct wapbl_replay *wr)
{
int i;
for (i = 0; i <= wr->wr_blkhashmask; i++) {
struct wapbl_blk *wb;
while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
KASSERT(wr->wr_blkhashcnt > 0);
wr->wr_blkhashcnt--;
LIST_REMOVE(wb, wb_hash);
wapbl_free(wb, sizeof(*wb));
}
}
KASSERT(wr->wr_blkhashcnt == 0);
}
/****************************************************************/
static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
size_t slen;
off_t off = *offp;
int error;
KASSERT(((len >> wr->wr_log_dev_bshift) <<
wr->wr_log_dev_bshift) == len);
if (off < wr->wr_circ_off)
off = wr->wr_circ_off;
slen = wr->wr_circ_off + wr->wr_circ_size - off;
if (slen < len) {
error = wapbl_read(data, slen, wr->wr_devvp,
wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
if (error)
return error;
data = (uint8_t *)data + slen;
len -= slen;
off = wr->wr_circ_off;
}
error = wapbl_read(data, len, wr->wr_devvp,
wr->wr_logpbn + (off >> wr->wr_log_dev_bshift));
if (error)
return error;
off += len;
if (off >= wr->wr_circ_off + wr->wr_circ_size)
off = wr->wr_circ_off;
*offp = off;
return 0;
}
static void
wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
{
size_t slen;
off_t off = *offp;
KASSERT(((len >> wr->wr_log_dev_bshift) <<
wr->wr_log_dev_bshift) == len);
if (off < wr->wr_circ_off)
off = wr->wr_circ_off;
slen = wr->wr_circ_off + wr->wr_circ_size - off;
if (slen < len) {
len -= slen;
off = wr->wr_circ_off;
}
off += len;
if (off >= wr->wr_circ_off + wr->wr_circ_size)
off = wr->wr_circ_off;
*offp = off;
}
/****************************************************************/
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
daddr_t off, size_t count, size_t blksize)
{
struct wapbl_replay *wr;
int error;
struct vnode *devvp;
daddr_t logpbn;
uint8_t *scratch;
struct wapbl_wc_header *wch;
struct wapbl_wc_header *wch2;
/* Use this until we read the actual log header */
int log_dev_bshift = DEV_BSHIFT;
size_t used;
WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
vp, off, count, blksize));
if (off < 0)
return EINVAL;
if (blksize < DEV_BSIZE)
return EINVAL;
if (blksize % DEV_BSIZE)
return EINVAL;
#ifdef _KERNEL
#if 0
/* XXX vp->v_size isn't reliably set for VBLK devices,
* especially root. However, we might still want to verify
* that the full load is readable */
if ((off + count) * blksize > vp->v_size)
return EINVAL;
#endif
if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
return error;
}
#else /* ! _KERNEL */
devvp = vp;
logpbn = off;
#endif /* ! _KERNEL */
scratch = wapbl_malloc(MAXBSIZE);
error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, logpbn);
if (error)
goto errout;
wch = (struct wapbl_wc_header *)scratch;
wch2 =
(struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
/* XXX verify checksums and magic numbers */
if (wch->wc_type != WAPBL_WC_HEADER) {
printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
error = EFTYPE;
goto errout;
}
if (wch2->wc_generation > wch->wc_generation)
wch = wch2;
wr = wapbl_calloc(1, sizeof(*wr));
wr->wr_logvp = vp;
wr->wr_devvp = devvp;
wr->wr_logpbn = logpbn;
wr->wr_scratch = scratch;
wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
wr->wr_circ_off = wch->wc_circ_off;
wr->wr_circ_size = wch->wc_circ_size;
wr->wr_generation = wch->wc_generation;
used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);
WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
" len=%"PRId64" used=%zu\n",
wch->wc_head, wch->wc_tail, wch->wc_circ_off,
wch->wc_circ_size, used));
wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));
error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
if (error) {
wapbl_replay_stop(wr);
wapbl_replay_free(wr);
return error;
}
*wrp = wr;
return 0;
errout:
wapbl_free(scratch, MAXBSIZE);
return error;
}
void
wapbl_replay_stop(struct wapbl_replay *wr)
{
if (!wapbl_replay_isopen(wr))
return;
WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));
wapbl_free(wr->wr_scratch, MAXBSIZE);
wr->wr_scratch = NULL;
wr->wr_logvp = NULL;
wapbl_blkhash_clear(wr);
wapbl_blkhash_free(wr);
}
void
wapbl_replay_free(struct wapbl_replay *wr)
{
KDASSERT(!wapbl_replay_isopen(wr));
if (wr->wr_inodes)
wapbl_free(wr->wr_inodes,
wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
wapbl_free(wr, sizeof(*wr));
}
#ifdef _KERNEL
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{
return wapbl_replay_isopen(wr);
}
#endif
static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
struct wapbl_wc_blocklist *wc =
(struct wapbl_wc_blocklist *)wr->wr_scratch;
int fsblklen = 1 << wr->wr_fs_dev_bshift;
int i, j, n;
for (i = 0; i < wc->wc_blkcount; i++) {
/*
* Enter each physical block into the hashtable independently.
*/
n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
for (j = 0; j < n; j++) {
wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j,
*offp);
wapbl_circ_advance(wr, fsblklen, offp);
}
}
}
static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
struct wapbl_wc_blocklist *wc =
(struct wapbl_wc_blocklist *)wr->wr_scratch;
int i, j, n;
for (i = 0; i < wc->wc_blkcount; i++) {
/*
* Remove any blocks found from the hashtable.
*/
n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
for (j = 0; j < n; j++)
wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j);
}
}
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
struct wapbl_wc_inodelist *wc =
(struct wapbl_wc_inodelist *)wr->wr_scratch;
void *new_inodes;
const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);
KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));
/*
* Keep track of where we found this so location won't be
* overwritten.
*/
if (wc->wc_clear) {
wr->wr_inodestail = oldoff;
wr->wr_inodescnt = 0;
if (wr->wr_inodes != NULL) {
wapbl_free(wr->wr_inodes, oldsize);
wr->wr_inodes = NULL;
}
}
wr->wr_inodeshead = newoff;
if (wc->wc_inocnt == 0)
return;
new_inodes = wapbl_malloc((wr->wr_inodescnt + wc->wc_inocnt) *
sizeof(wr->wr_inodes[0]));
if (wr->wr_inodes != NULL) {
memcpy(new_inodes, wr->wr_inodes, oldsize);
wapbl_free(wr->wr_inodes, oldsize);
}
wr->wr_inodes = new_inodes;
memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
wr->wr_inodescnt += wc->wc_inocnt;
}
static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
off_t off;
int error;
int logblklen = 1 << wr->wr_log_dev_bshift;
wapbl_blkhash_clear(wr);
off = tail;
while (off != head) {
struct wapbl_wc_null *wcn;
off_t saveoff = off;
error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
if (error)
goto errout;
wcn = (struct wapbl_wc_null *)wr->wr_scratch;
switch (wcn->wc_type) {
case WAPBL_WC_BLOCKS:
wapbl_replay_process_blocks(wr, &off);
break;
case WAPBL_WC_REVOCATIONS:
wapbl_replay_process_revocations(wr);
break;
case WAPBL_WC_INODES:
wapbl_replay_process_inodes(wr, saveoff, off);
break;
default:
printf("Unrecognized wapbl type: 0x%08x\n",
wcn->wc_type);
error = EFTYPE;
goto errout;
}
wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
if (off != saveoff) {
printf("wapbl_replay: corrupted records\n");
error = EFTYPE;
goto errout;
}
}
return 0;
errout:
wapbl_blkhash_clear(wr);
return error;
}
#if 0
int
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
off_t off;
int mismatchcnt = 0;
int logblklen = 1 << wr->wr_log_dev_bshift;
int fsblklen = 1 << wr->wr_fs_dev_bshift;
void *scratch1 = wapbl_malloc(MAXBSIZE);
void *scratch2 = wapbl_malloc(MAXBSIZE);
int error = 0;
KDASSERT(wapbl_replay_isopen(wr));
off = wch->wc_tail;
while (off != wch->wc_head) {
struct wapbl_wc_null *wcn;
#ifdef DEBUG
off_t saveoff = off;
#endif
error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
if (error)
goto out;
wcn = (struct wapbl_wc_null *)wr->wr_scratch;
switch (wcn->wc_type) {
case WAPBL_WC_BLOCKS:
{
struct wapbl_wc_blocklist *wc =
(struct wapbl_wc_blocklist *)wr->wr_scratch;
int i;
for (i = 0; i < wc->wc_blkcount; i++) {
int foundcnt = 0;
int dirtycnt = 0;
int j, n;
/*
* Check each physical block into the
* hashtable independently
*/
n = wc->wc_blocks[i].wc_dlen >>
wch->wc_fs_dev_bshift;
for (j = 0; j < n; j++) {
struct wapbl_blk *wb =
wapbl_blkhash_get(wr,
wc->wc_blocks[i].wc_daddr + j);
if (wb && (wb->wb_off == off)) {
foundcnt++;
error =
wapbl_circ_read(wr,
scratch1, fsblklen,
&off);
if (error)
goto out;
error =
wapbl_read(scratch2,
fsblklen, fsdevvp,
wb->wb_blk);
if (error)
goto out;
if (memcmp(scratch1,
scratch2,
fsblklen)) {
printf(
"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
wb->wb_blk, (intmax_t)off);
dirtycnt++;
mismatchcnt++;
}
} else {
wapbl_circ_advance(wr,
fsblklen, &off);
}
}
#if 0
/*
* If all of the blocks in an entry
* are clean, then remove all of its
* blocks from the hashtable since they
* never will need replay.
*/
if ((foundcnt != 0) &&
(dirtycnt == 0)) {
off = saveoff;
wapbl_circ_advance(wr,
logblklen, &off);
for (j = 0; j < n; j++) {
struct wapbl_blk *wb =
wapbl_blkhash_get(wr,
wc->wc_blocks[i].wc_daddr + j);
if (wb &&
(wb->wb_off == off)) {
wapbl_blkhash_rem(wr, wb->wb_blk);
}
wapbl_circ_advance(wr,
fsblklen, &off);
}
}
#endif
}
}
break;
case WAPBL_WC_REVOCATIONS:
case WAPBL_WC_INODES:
break;
default:
KASSERT(0);
}
#ifdef DEBUG
wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
KASSERT(off == saveoff);
#endif
}
out:
wapbl_free(scratch1, MAXBSIZE);
wapbl_free(scratch2, MAXBSIZE);
if (!error && mismatchcnt)
error = EFTYPE;
return error;
}
#endif
int
wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
{
struct wapbl_blk *wb;
size_t i;
off_t off;
void *scratch;
int error = 0;
int fsblklen = 1 << wr->wr_fs_dev_bshift;
KDASSERT(wapbl_replay_isopen(wr));
scratch = wapbl_malloc(MAXBSIZE);
for (i = 0; i < wr->wr_blkhashmask; ++i) {
LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
off = wb->wb_off;
error = wapbl_circ_read(wr, scratch, fsblklen, &off);
if (error)
break;
error = wapbl_write(scratch, fsblklen, fsdevvp,
wb->wb_blk);
if (error)
break;
}
}
wapbl_free(scratch, MAXBSIZE);
return error;
}
int
wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
{
int fsblklen = 1 << wr->wr_fs_dev_bshift;
KDASSERT(wapbl_replay_isopen(wr));
KASSERT((len % fsblklen) == 0);
while (len != 0) {
struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
if (wb)
return 1;
len -= fsblklen;
}
return 0;
}
int
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
{
int fsblklen = 1 << wr->wr_fs_dev_bshift;
KDASSERT(wapbl_replay_isopen(wr));
KASSERT((len % fsblklen) == 0);
while (len != 0) {
struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
if (wb) {
off_t off = wb->wb_off;
int error;
error = wapbl_circ_read(wr, data, fsblklen, &off);
if (error)
return error;
}
data = (uint8_t *)data + fsblklen;
len -= fsblklen;
blk++;
}
return 0;
}