/* $NetBSD: vfs_wapbl.c,v 1.8 2008/11/17 19:36:11 joerg Exp $ */ /*- * Copyright (c) 2003,2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * This implements file system independent write ahead filesystem logging. 
*/
#define WAPBL_INTERNAL

/*
 * NOTE(review): every #include in this section appears to have lost its
 * header name (looks like angle-bracket text was stripped when the file
 * was extracted) -- restore the names from the upstream revision before
 * building.
 */
#include
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.8 2008/11/17 19:36:11 joerg Exp $");

#include

#ifdef _KERNEL
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#if WAPBL_UVM_ALLOC
#include
#endif

#include

MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging");

/* Kernel allocation wrappers: all allocations sleep (M_WAITOK). */
#define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK)
#define wapbl_free(a) free((a), M_WAPBL)
#define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO)

#else /* !_KERNEL */
#include
#include
#include
#include
#include
#include

#include
#include

/* Userland build: map kernel assertions and allocators onto libc. */
#define KDASSERT(x) assert(x)
#define KASSERT(x) assert(x)
#define wapbl_malloc(s) malloc(s)
#define wapbl_free(a) free(a)
#define wapbl_calloc(n, s) calloc((n), (s))

#endif /* !_KERNEL */

/*
 * INTERNAL DATA STRUCTURES
 */

/*
 * This structure holds per-mount log information.
 *
 * Legend:	a = atomic access only
 *		r = read-only after init
 *		l = rwlock held
 *		m = mutex held
 *		u = unlocked access ok
 *		b = bufcache_lock held
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size;	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * head == tail == 0 means log is empty
	 * head == tail != 0 means log is full
	 * see assertions in wapbl_advance() for other boundary conditions.
	 * only truncate moves the tail, except when flush sets it to
	 * wl_header_size
	 * only flush moves the head, except when truncate sets it to 0.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: why?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
#if _KERNEL
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	LIST_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

	daddr_t *wl_deallocblks;/* l:	address of block */
	int *wl_dealloclens;	/* l:	size of block (fragments, remember) */
	int wl_dealloccnt;	/* l:	total count */
	int wl_dealloclim;	/* l:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction
						   accounting */
};

#ifdef WAPBL_DEBUG_PRINT
/* Run-time mask of debug message classes, seeded from the build option. */
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
#endif

/****************************************************************/
#ifdef _KERNEL

#ifdef WAPBL_DEBUG
/* Most recently started log; assigned in wapbl_start(). */
struct wapbl *wapbl_debug_wl;
#endif

static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
#endif /* _KERNEL */

static int wapbl_replay_prescan(struct wapbl_replay *wr);
static int wapbl_replay_get_inodes(struct wapbl_replay *wr);

static __inline size_t wapbl_space_free(size_t avail, off_t head, off_t tail);
static __inline size_t wapbl_space_used(size_t avail, off_t head, off_t tail);

#ifdef _KERNEL

/* Size hint handed to wapbl_inodetrk_init() by wapbl_start(). */
#define WAPBL_INODETRK_SIZE 83
static int wapbl_ino_pool_refcount;
static struct pool wapbl_ino_pool;
/* One tracked (allocated but unlinked) inode; linked on a wl_inohash chain. */
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;
	ino_t wi_ino;
	mode_t wi_mode;
};

static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
static void wapbl_inodetrk_free(struct wapbl *wl);
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);

static size_t wapbl_transaction_len(struct wapbl *wl);
static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);

#ifdef DEBUG
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
#endif

static int wapbl_replay_isopen1(struct wapbl_replay *);

/*
 * This is useful for debugging.  If set, the log will
 * only be truncated when necessary.
*/ int wapbl_lazy_truncate = 0; struct wapbl_ops wapbl_ops = { .wo_wapbl_discard = wapbl_discard, .wo_wapbl_replay_isopen = wapbl_replay_isopen1, .wo_wapbl_replay_can_read = wapbl_replay_can_read, .wo_wapbl_replay_read = wapbl_replay_read, .wo_wapbl_add_buf = wapbl_add_buf, .wo_wapbl_remove_buf = wapbl_remove_buf, .wo_wapbl_resize_buf = wapbl_resize_buf, .wo_wapbl_begin = wapbl_begin, .wo_wapbl_end = wapbl_end, .wo_wapbl_junlock_assert= wapbl_junlock_assert, /* XXX: the following is only used to say "this is a wapbl buf" */ .wo_wapbl_biodone = wapbl_biodone, }; void wapbl_init() { malloc_type_attach(M_WAPBL); } int wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) { struct wapbl *wl; struct vnode *devvp; daddr_t logpbn; int error; int log_dev_bshift = DEV_BSHIFT; int fs_dev_bshift = DEV_BSHIFT; int run; WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 " count=%zu blksize=%zu\n", vp, off, count, blksize)); if (log_dev_bshift > fs_dev_bshift) { WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl: log device's block size cannot be larger " "than filesystem's\n")); /* * Not currently implemented, although it could be if * needed someday. */ return ENOSYS; } if (off < 0) return EINVAL; if (blksize < DEV_BSIZE) return EINVAL; if (blksize % DEV_BSIZE) return EINVAL; /* XXXTODO: verify that the full load is writable */ /* * XXX check for minimum log size * minimum is governed by minimum amount of space * to complete a transaction. 
(probably truncate) */ /* XXX for now pick something minimal */ if ((count * blksize) < MAXPHYS) { return ENOSPC; } if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { return error; } wl = wapbl_calloc(1, sizeof(*wl)); rw_init(&wl->wl_rwlock); mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); cv_init(&wl->wl_reclaimable_cv, "wapblrec"); LIST_INIT(&wl->wl_bufs); SIMPLEQ_INIT(&wl->wl_entries); wl->wl_logvp = vp; wl->wl_devvp = devvp; wl->wl_mount = mp; wl->wl_logpbn = logpbn; wl->wl_log_dev_bshift = log_dev_bshift; wl->wl_fs_dev_bshift = fs_dev_bshift; wl->wl_flush = flushfn; wl->wl_flush_abort = flushabortfn; /* Reserve two log device blocks for the commit headers */ wl->wl_circ_off = 2<wl_log_dev_bshift; wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); /* truncate the log usage to a multiple of log_dev_bshift */ wl->wl_circ_size >>= wl->wl_log_dev_bshift; wl->wl_circ_size <<= wl->wl_log_dev_bshift; /* * wl_bufbytes_max limits the size of the in memory transaction space. * - Since buffers are allocated and accounted for in units of * PAGE_SIZE it is required to be a multiple of PAGE_SIZE * (i.e. 1<wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2); /* Round wl_bufbytes_max to the largest power of two constraint */ wl->wl_bufbytes_max >>= PAGE_SHIFT; wl->wl_bufbytes_max <<= PAGE_SHIFT; wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; /* XXX maybe use filesystem fragment size instead of 1024 */ /* XXX fix actual number of buffers reserved per filesystem. 
*/ wl->wl_bufcount_max = (nbuf / 2) * 1024; /* XXX tie this into resource estimation */ wl->wl_dealloclim = 2 * btodb(wl->wl_bufbytes_max); #if WAPBL_UVM_ALLOC wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map, round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim)); KASSERT(wl->wl_deallocblks != NULL); wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map, round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim)); KASSERT(wl->wl_dealloclens != NULL); #else wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim); wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim); #endif wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); /* Initialize the commit header */ { struct wapbl_wc_header *wc; size_t len = 1<wl_log_dev_bshift; wc = wapbl_calloc(1, len); wc->wc_type = WAPBL_WC_HEADER; wc->wc_len = len; wc->wc_circ_off = wl->wl_circ_off; wc->wc_circ_size = wl->wl_circ_size; /* XXX wc->wc_fsid */ wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; wl->wl_wc_header = wc; wl->wl_wc_scratch = wapbl_malloc(len); } /* * if there was an existing set of unlinked but * allocated inodes, preserve it in the new * log. */ if (wr && wr->wr_inodescnt) { int i; WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt)); /* * Its only valid to reuse the replay log if its * the same as the new log we just opened. 
*/ KDASSERT(!wapbl_replay_isopen(wr)); KASSERT(devvp->v_rdev == wr->wr_devvp->v_rdev); KASSERT(logpbn == wr->wr_logpbn); KASSERT(wl->wl_circ_size == wr->wr_wc_header.wc_circ_size); KASSERT(wl->wl_circ_off == wr->wr_wc_header.wc_circ_off); KASSERT(wl->wl_log_dev_bshift == wr->wr_wc_header.wc_log_dev_bshift); KASSERT(wl->wl_fs_dev_bshift == wr->wr_wc_header.wc_fs_dev_bshift); wl->wl_wc_header->wc_generation = wr->wr_wc_header.wc_generation + 1; for (i = 0; i < wr->wr_inodescnt; i++) wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, wr->wr_inodes[i].wr_imode); /* Make sure new transaction won't overwrite old inodes list */ KDASSERT(wapbl_transaction_len(wl) <= wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, wr->wr_inodestail)); wl->wl_head = wl->wl_tail = wr->wr_inodeshead; wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = wapbl_transaction_len(wl); error = wapbl_write_inodes(wl, &wl->wl_head); if (error) goto errout; KASSERT(wl->wl_head != wl->wl_tail); KASSERT(wl->wl_head != 0); } error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); if (error) { goto errout; } *wlp = wl; #if defined(WAPBL_DEBUG) wapbl_debug_wl = wl; #endif return 0; errout: wapbl_discard(wl); wapbl_free(wl->wl_wc_scratch); wapbl_free(wl->wl_wc_header); #if WAPBL_UVM_ALLOC uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, round_page(sizeof(*wl->wl_deallocblks * wl->wl_dealloclim))); uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, round_page(sizeof(*wl->wl_dealloclens * wl->wl_dealloclim))); #else wapbl_free(wl->wl_deallocblks); wapbl_free(wl->wl_dealloclens); #endif wapbl_inodetrk_free(wl); wapbl_free(wl); return error; } /* * Like wapbl_flush, only discards the transaction * completely */ void wapbl_discard(struct wapbl *wl) { struct wapbl_entry *we; struct buf *bp; int i; /* * XXX we may consider using upgrade here * if we want to call flush from inside a transaction */ rw_enter(&wl->wl_rwlock, RW_WRITER); wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, 
wl->wl_dealloclens, wl->wl_dealloccnt); #ifdef WAPBL_DEBUG_PRINT { struct wapbl_entry *we; pid_t pid = -1; lwpid_t lid = -1; if (curproc) pid = curproc->p_pid; if (curlwp) lid = curlwp->l_lid; #ifdef WAPBL_DEBUG_BUFBYTES WAPBL_PRINTF(WAPBL_PRINT_DISCARD, ("wapbl_discard: thread %d.%d discarding " "transaction\n" "\tbufcount=%zu bufbytes=%zu bcount=%zu " "deallocs=%d inodes=%d\n" "\terrcnt = %u, reclaimable=%zu reserved=%zu " "unsynced=%zu\n", pid, lid, wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, wl->wl_inohashcnt, wl->wl_error_count, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { WAPBL_PRINTF(WAPBL_PRINT_DISCARD, ("\tentry: bufcount = %zu, reclaimable = %zu, " "error = %d, unsynced = %zu\n", we->we_bufcount, we->we_reclaimable_bytes, we->we_error, we->we_unsynced_bufbytes)); } #else /* !WAPBL_DEBUG_BUFBYTES */ WAPBL_PRINTF(WAPBL_PRINT_DISCARD, ("wapbl_discard: thread %d.%d discarding transaction\n" "\tbufcount=%zu bufbytes=%zu bcount=%zu " "deallocs=%d inodes=%d\n" "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", pid, lid, wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, wl->wl_inohashcnt, wl->wl_error_count, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { WAPBL_PRINTF(WAPBL_PRINT_DISCARD, ("\tentry: bufcount = %zu, reclaimable = %zu, " "error = %d\n", we->we_bufcount, we->we_reclaimable_bytes, we->we_error)); } #endif /* !WAPBL_DEBUG_BUFBYTES */ } #endif /* WAPBL_DEBUG_PRINT */ for (i = 0; i <= wl->wl_inohashmask; i++) { struct wapbl_ino_head *wih; struct wapbl_ino *wi; wih = &wl->wl_inohash[i]; while ((wi = LIST_FIRST(wih)) != NULL) { LIST_REMOVE(wi, wi_hash); pool_put(&wapbl_ino_pool, wi); KASSERT(wl->wl_inohashcnt > 0); wl->wl_inohashcnt--; } } /* * clean buffer list */ mutex_enter(&bufcache_lock); mutex_enter(&wl->wl_mtx); while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { if (bbusy(bp, 
0, 0, &wl->wl_mtx) == 0) { /* * The buffer will be unlocked and * removed from the transaction in brelse */ mutex_exit(&wl->wl_mtx); brelsel(bp, 0); mutex_enter(&wl->wl_mtx); } } mutex_exit(&wl->wl_mtx); mutex_exit(&bufcache_lock); /* * Remove references to this wl from wl_entries, free any which * no longer have buffers, others will be freed in wapbl_biodone * when they no longer have any buffers. */ while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); /* XXX should we be accumulating wl_error_count * and increasing reclaimable bytes ? */ we->we_wapbl = NULL; if (we->we_bufcount == 0) { #ifdef WAPBL_DEBUG_BUFBYTES KASSERT(we->we_unsynced_bufbytes == 0); #endif wapbl_free(we); } } /* Discard list of deallocs */ wl->wl_dealloccnt = 0; /* XXX should we clear wl_reserved_bytes? */ KASSERT(wl->wl_bufbytes == 0); KASSERT(wl->wl_bcount == 0); KASSERT(wl->wl_bufcount == 0); KASSERT(LIST_EMPTY(&wl->wl_bufs)); KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); KASSERT(wl->wl_inohashcnt == 0); rw_exit(&wl->wl_rwlock); } int wapbl_stop(struct wapbl *wl, int force) { struct vnode *vp; int error; WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); error = wapbl_flush(wl, 1); if (error) { if (force) wapbl_discard(wl); else return error; } /* Unlinked inodes persist after a flush */ if (wl->wl_inohashcnt) { if (force) { wapbl_discard(wl); } else { return EBUSY; } } KASSERT(wl->wl_bufbytes == 0); KASSERT(wl->wl_bcount == 0); KASSERT(wl->wl_bufcount == 0); KASSERT(LIST_EMPTY(&wl->wl_bufs)); KASSERT(wl->wl_dealloccnt == 0); KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); KASSERT(wl->wl_inohashcnt == 0); vp = wl->wl_logvp; wapbl_free(wl->wl_wc_scratch); wapbl_free(wl->wl_wc_header); #if WAPBL_UVM_ALLOC uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, round_page(sizeof(*wl->wl_deallocblks * wl->wl_dealloclim))); uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, round_page(sizeof(*wl->wl_dealloclens * wl->wl_dealloclim))); 
#else wapbl_free(wl->wl_deallocblks); wapbl_free(wl->wl_dealloclens); #endif wapbl_inodetrk_free(wl); cv_destroy(&wl->wl_reclaimable_cv); mutex_destroy(&wl->wl_mtx); rw_destroy(&wl->wl_rwlock); wapbl_free(wl); return 0; } static int wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) { struct pstats *pstats = curlwp->l_proc->p_stats; struct buf *bp; int error; KASSERT((flags & ~(B_WRITE | B_READ)) == 0); KASSERT(devvp->v_type == VBLK); if ((flags & (B_WRITE | B_READ)) == B_WRITE) { mutex_enter(&devvp->v_interlock); devvp->v_numoutput++; mutex_exit(&devvp->v_interlock); pstats->p_ru.ru_oublock++; } else { pstats->p_ru.ru_inblock++; } bp = getiobuf(devvp, true); bp->b_flags = flags; bp->b_cflags = BC_BUSY; /* silly & dubious */ bp->b_dev = devvp->v_rdev; bp->b_data = data; bp->b_bufsize = bp->b_resid = bp->b_bcount = len; bp->b_blkno = pbn; WAPBL_PRINTF(WAPBL_PRINT_IO, ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n", BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount, bp->b_blkno, bp->b_dev)); VOP_STRATEGY(devvp, bp); error = biowait(bp); putiobuf(bp); if (error) { WAPBL_PRINTF(WAPBL_PRINT_ERROR, ("wapbl_doio: %s %zu bytes at block %" PRId64 " on dev 0x%x failed with error %d\n", (((flags & (B_WRITE | B_READ)) == B_WRITE) ? 
"write" : "read"), len, pbn, devvp->v_rdev, error)); } return error; } int wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) { return wapbl_doio(data, len, devvp, pbn, B_WRITE); } int wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) { return wapbl_doio(data, len, devvp, pbn, B_READ); } /* * Off is byte offset returns new offset for next write * handles log wraparound */ static int wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) { size_t slen; off_t off = *offp; int error; KDASSERT(((len >> wl->wl_log_dev_bshift) << wl->wl_log_dev_bshift) == len); if (off < wl->wl_circ_off) off = wl->wl_circ_off; slen = wl->wl_circ_off + wl->wl_circ_size - off; if (slen < len) { error = wapbl_write(data, slen, wl->wl_devvp, wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); if (error) return error; data = (uint8_t *)data + slen; len -= slen; off = wl->wl_circ_off; } error = wapbl_write(data, len, wl->wl_devvp, wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); if (error) return error; off += len; if (off >= wl->wl_circ_off + wl->wl_circ_size) off = wl->wl_circ_off; *offp = off; return 0; } /****************************************************************/ int wapbl_begin(struct wapbl *wl, const char *file, int line) { int doflush; unsigned lockcount; krw_t op; KDASSERT(wl); /* * XXX: The original code calls for the use of a RW_READER lock * here, but it turns out there are performance issues with high * metadata-rate workloads (e.g. multiple simultaneous tar * extractions). For now, we force the lock to be RW_WRITER, * since that currently has the best performance characteristics * (even for a single tar-file extraction). * */ #define WAPBL_DEBUG_SERIALIZE 1 #ifdef WAPBL_DEBUG_SERIALIZE op = RW_WRITER; #else op = RW_READER; #endif /* * XXX this needs to be made much more sophisticated. * perhaps each wapbl_begin could reserve a specified * number of buffers and bytes. 
	 */
	/*
	 * Decide, under wl_mtx, whether to flush before starting: flush when
	 * the buffered bytes or buffer count (plus an allowance of MAXPHYS
	 * bytes / 10 buffers per transaction already in progress) exceed
	 * half of their limits, or when the pending transaction would take
	 * more than half of the circular log.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
		   wl->wl_bufbytes_max / 2) ||
		  ((wl->wl_bufcount + (lockcount * 10)) >
		   wl->wl_bufcount_max / 2) ||
		  (wapbl_transaction_len(wl) > wl->wl_circ_size / 2);
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max));
	}

	if (doflush) {
		/* Non-waiting flush; its failure aborts the begin. */
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	/* Take the transaction lock, then count ourselves as a holder. */
	rw_enter(&wl->wl_rwlock, op);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}

/*
 * Leave a transaction entered with wapbl_begin(): drop the holder count
 * under wl_mtx, then release wl_rwlock.
 */
void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}

/*
 * Put a busy buffer at the head of the current transaction's buffer list
 * and mark it B_LOCKED.  A buffer that is already B_LOCKED is only moved
 * to the head of the list; otherwise the transaction's byte and buffer
 * counters are increased by the buffer's sizes.
 */
void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then?  leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		/* Already in the transaction: unlink so it can be re-queued. */
		LIST_REMOVE(bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		   ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked but dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		   ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}

/*
 * Take a B_LOCKED buffer out of the current transaction: subtract its
 * sizes from the transaction counters, unlink it and clear B_LOCKED.
 * The caller must hold wl_mtx.
 */
static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	   ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	/* The three counters must reach zero together. */
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	LIST_REMOVE(bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}

/* called from brelsel() in vfs_bio among other places */
/* Locking wrapper around wapbl_remove_buf_locked(). */
void
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
{

	mutex_enter(&wl->wl_mtx);
	wapbl_remove_buf_locked(wl, bp);
	mutex_exit(&wl->wl_mtx);
}

/*
 * Adjust the transaction's byte counters after bp changed size; oldsz and
 * oldcnt are the previous b_bufsize and b_bcount.
 */
void
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
{

	KASSERT(bp->b_cflags & BC_BUSY);

	/*
	 * XXX:
why does this depend on B_LOCKED? otherwise the buf * is not for a transaction? if so, why is this called in the * first place? */ if (bp->b_flags & B_LOCKED) { mutex_enter(&wl->wl_mtx); wl->wl_bufbytes += bp->b_bufsize - oldsz; wl->wl_bcount += bp->b_bcount - oldcnt; mutex_exit(&wl->wl_mtx); } } #endif /* _KERNEL */ /****************************************************************/ /* Some utility inlines */ /* This is used to advance the pointer at old to new value at old+delta */ static __inline off_t wapbl_advance(size_t size, size_t off, off_t old, size_t delta) { off_t new; /* Define acceptable ranges for inputs. */ KASSERT(delta <= size); KASSERT((old == 0) || (old >= off)); KASSERT(old < (size + off)); if ((old == 0) && (delta != 0)) new = off + delta; else if ((old + delta) < (size + off)) new = old + delta; else new = (old + delta) - size; /* Note some interesting axioms */ KASSERT((delta != 0) || (new == old)); KASSERT((delta == 0) || (new != 0)); KASSERT((delta != (size)) || (new == old)); /* Define acceptable ranges for output. 
*/ KASSERT((new == 0) || (new >= off)); KASSERT(new < (size + off)); return new; } static __inline size_t wapbl_space_used(size_t avail, off_t head, off_t tail) { if (tail == 0) { KASSERT(head == 0); return 0; } return ((head + (avail - 1) - tail) % avail) + 1; } static __inline size_t wapbl_space_free(size_t avail, off_t head, off_t tail) { return avail - wapbl_space_used(avail, head, tail); } static __inline void wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp, off_t *tailp) { off_t head = *headp; off_t tail = *tailp; KASSERT(delta <= wapbl_space_free(size, head, tail)); head = wapbl_advance(size, off, head, delta); if ((tail == 0) && (head != 0)) tail = off; *headp = head; *tailp = tail; } static __inline void wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp, off_t *tailp) { off_t head = *headp; off_t tail = *tailp; KASSERT(delta <= wapbl_space_used(size, head, tail)); tail = wapbl_advance(size, off, tail, delta); if (head == tail) { head = tail = 0; } *headp = head; *tailp = tail; } #ifdef _KERNEL /****************************************************************/ /* * Remove transactions whose buffers are completely flushed to disk. * Will block until at least minfree space is available. * only intended to be called from inside wapbl_flush and therefore * does not protect against commit races with itself or with flush. */ static int wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly) { size_t delta; size_t avail; off_t head; off_t tail; int error = 0; KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes)); KASSERT(rw_write_held(&wl->wl_rwlock)); mutex_enter(&wl->wl_mtx); /* * First check to see if we have to do a commit * at all. 
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	/* From here on minfree is the shortfall that must be reclaimed. */
	minfree -= avail;
	/* Sleep until enough is reclaimable or a write error is posted. */
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		   ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
			   &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/* waitonly: the caller only wanted to wait; do not commit. */
	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	/* Commit is on disk: publish the new pointers and consume delta. */
	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}

/****************************************************************/

/*
 * I/O completion hook for a buffer that belongs to a log entry
 * (bp->b_private is the wapbl_entry).  Drops the entry's buffer count,
 * and once entries at the front of wl_entries have no buffers left, frees
 * them, credits their bytes to wl_reclaimable_bytes and wakes waiters on
 * wl_reclaimable_cv.  The buffer is released with brelse() in all live
 * code paths.
 */
void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl = we->we_wapbl;

	/*
	 * Handle possible flushing of buffers after log has been
	 * decommissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
		we->we_unsynced_bufbytes -= bp->b_bufsize;
#endif

		/* Last buffer of an orphaned entry: the entry dies here. */
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			wapbl_free(we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_flags & B_DONE);
	KDASSERT(!(bp->b_flags & B_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_flags & B_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_flags & B_INVAL));
	KDASSERT(!(bp->b_flags & B_NOCACHE));
#endif

	if (bp->b_error) {
#ifdef notyet /* Can't currently handle possible dirty buffer reuse */
		XXXpooka: interfaces not fully updated
		Note: this was not enabled in the original patch
		against netbsd4 either.  I don't know if comment
		above is true or not.

		/*
		 * If an error occurs, report the error and leave the
		 * buffer as a delayed write on the LRU queue.
		 * restarting the write would likely result in
		 * an error spinloop, so let it be done harmlessly
		 * by the syncer.
		 */
		bp->b_flags &= ~(B_DONE);
		simple_unlock(&bp->b_interlock);

		if (we->we_error == 0) {
			mutex_enter(&wl->wl_mtx);
			wl->wl_error_count++;
			mutex_exit(&wl->wl_mtx);
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		we->we_error = bp->b_error;
		bp->b_error = 0;
		brelse(bp);
		return;
#else
		/* For now, just mark the log permanently errored out */

		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
#endif
	}

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize);
	we->we_unsynced_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize);
	wl->wl_unsynced_bufbytes -= bp->b_bufsize;
#endif

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		/* Note: 'we' is reused as the cursor over wl_entries. */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			wapbl_free(we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
	brelse(bp, 0);
}

/*
 * Write transactions to disk + start I/O for contents
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not
need to be called * unless there are other outstanding bufs. */ if (!waitfor) { size_t nbufs; mutex_enter(&wl->wl_mtx); /* XXX need mutex here to protect the KASSERTS */ nbufs = wl->wl_bufcount; KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); mutex_exit(&wl->wl_mtx); if (nbufs == 0) return 0; } /* * XXX we may consider using LK_UPGRADE here * if we want to call flush from inside a transaction */ rw_enter(&wl->wl_rwlock, RW_WRITER); wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, wl->wl_dealloccnt); /* * Now that we are fully locked and flushed, * do another check for nothing to do. */ if (wl->wl_bufcount == 0) { goto out; } #if 0 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, ("wapbl_flush thread %d.%d flushing entries with " "bufcount=%zu bufbytes=%zu\n", curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, wl->wl_bufbytes)); #endif /* Calculate amount of space needed to flush */ flushsize = wapbl_transaction_len(wl); if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { /* * XXX this could be handled more gracefully, perhaps place * only a partial transaction in the log and allow the * remaining to flush without the protection of the journal. */ panic("wapbl_flush: current transaction too big to flush\n"); } error = wapbl_truncate(wl, flushsize, 0); if (error) goto out2; off = wl->wl_head; KASSERT((off == 0) || ((off >= wl->wl_circ_off) && (off < wl->wl_circ_off + wl->wl_circ_size))); error = wapbl_write_blocks(wl, &off); if (error) goto out2; error = wapbl_write_revocations(wl, &off); if (error) goto out2; error = wapbl_write_inodes(wl, &off); if (error) goto out2; reserved = 0; if (wl->wl_inohashcnt) reserved = wapbl_transaction_inodes_len(wl); head = wl->wl_head; tail = wl->wl_tail; wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, &head, &tail); #ifdef WAPBL_DEBUG if (head != off) { panic("lost head! 
head=%"PRIdMAX" tail=%" PRIdMAX " off=%"PRIdMAX" flush=%zu\n", (intmax_t)head, (intmax_t)tail, (intmax_t)off, flushsize); } #else KASSERT(head == off); #endif /* Opportunistically move the tail forward if we can */ if (!wapbl_lazy_truncate) { mutex_enter(&wl->wl_mtx); delta = wl->wl_reclaimable_bytes; mutex_exit(&wl->wl_mtx); wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head, &tail); } error = wapbl_write_commit(wl, head, tail); if (error) goto out2; /* poolme? or kmemme? */ we = wapbl_calloc(1, sizeof(*we)); #ifdef WAPBL_DEBUG_BUFBYTES WAPBL_PRINTF(WAPBL_PRINT_FLUSH, ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" " unsynced=%zu" "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " "inodes=%d\n", curproc->p_pid, curlwp->l_lid, flushsize, delta, wapbl_space_used(wl->wl_circ_size, head, tail), wl->wl_unsynced_bufbytes, wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, wl->wl_inohashcnt)); #else WAPBL_PRINTF(WAPBL_PRINT_FLUSH, ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " "inodes=%d\n", curproc->p_pid, curlwp->l_lid, flushsize, delta, wapbl_space_used(wl->wl_circ_size, head, tail), wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, wl->wl_inohashcnt)); #endif mutex_enter(&bufcache_lock); mutex_enter(&wl->wl_mtx); wl->wl_reserved_bytes = reserved; wl->wl_head = head; wl->wl_tail = tail; KASSERT(wl->wl_reclaimable_bytes >= delta); wl->wl_reclaimable_bytes -= delta; wl->wl_dealloccnt = 0; #ifdef WAPBL_DEBUG_BUFBYTES wl->wl_unsynced_bufbytes += wl->wl_bufbytes; #endif we->we_wapbl = wl; we->we_bufcount = wl->wl_bufcount; #ifdef WAPBL_DEBUG_BUFBYTES we->we_unsynced_bufbytes = wl->wl_bufbytes; #endif we->we_reclaimable_bytes = flushsize; we->we_error = 0; SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); /* * this flushes bufs in reverse order than they were queued * it shouldn't matter, but if we care we could use TAILQ instead. 
* XXX Note they will get put on the lru queue when they flush * so we might actually want to change this to preserve order. */ while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { if (bbusy(bp, 0, 0, &wl->wl_mtx)) { continue; } bp->b_iodone = wapbl_biodone; bp->b_private = we; bremfree(bp); wapbl_remove_buf_locked(wl, bp); mutex_exit(&wl->wl_mtx); mutex_exit(&bufcache_lock); bawrite(bp); mutex_enter(&bufcache_lock); mutex_enter(&wl->wl_mtx); } mutex_exit(&wl->wl_mtx); mutex_exit(&bufcache_lock); #if 0 WAPBL_PRINTF(WAPBL_PRINT_FLUSH, ("wapbl_flush thread %d.%d done flushing entries...\n", curproc->p_pid, curlwp->l_lid)); #endif out: /* * If the waitfor flag is set, don't return until everything is * fully flushed and the on disk log is empty. */ if (waitfor) { error = wapbl_truncate(wl, wl->wl_circ_size - wl->wl_reserved_bytes, wapbl_lazy_truncate); } out2: if (error) { wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, wl->wl_dealloccnt); } #ifdef WAPBL_DEBUG_PRINT if (error) { pid_t pid = -1; lwpid_t lid = -1; if (curproc) pid = curproc->p_pid; if (curlwp) lid = curlwp->l_lid; mutex_enter(&wl->wl_mtx); #ifdef WAPBL_DEBUG_BUFBYTES WAPBL_PRINTF(WAPBL_PRINT_ERROR, ("wapbl_flush: thread %d.%d aborted flush: " "error = %d\n" "\tbufcount=%zu bufbytes=%zu bcount=%zu " "deallocs=%d inodes=%d\n" "\terrcnt = %d, reclaimable=%zu reserved=%zu " "unsynced=%zu\n", pid, lid, error, wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, wl->wl_inohashcnt, wl->wl_error_count, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { WAPBL_PRINTF(WAPBL_PRINT_ERROR, ("\tentry: bufcount = %zu, reclaimable = %zu, " "error = %d, unsynced = %zu\n", we->we_bufcount, we->we_reclaimable_bytes, we->we_error, we->we_unsynced_bufbytes)); } #else WAPBL_PRINTF(WAPBL_PRINT_ERROR, ("wapbl_flush: thread %d.%d aborted flush: " "error = %d\n" "\tbufcount=%zu bufbytes=%zu bcount=%zu " "deallocs=%d 
inodes=%d\n" "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", pid, lid, error, wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, wl->wl_inohashcnt, wl->wl_error_count, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { WAPBL_PRINTF(WAPBL_PRINT_ERROR, ("\tentry: bufcount = %zu, reclaimable = %zu, " "error = %d\n", we->we_bufcount, we->we_reclaimable_bytes, we->we_error)); } #endif mutex_exit(&wl->wl_mtx); } #endif rw_exit(&wl->wl_rwlock); return error; } /****************************************************************/ void wapbl_jlock_assert(struct wapbl *wl) { #ifdef WAPBL_DEBUG_SERIALIZE KASSERT(rw_write_held(&wl->wl_rwlock)); #else KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock)); #endif } void wapbl_junlock_assert(struct wapbl *wl) { #ifdef WAPBL_DEBUG_SERIALIZE KASSERT(!rw_write_held(&wl->wl_rwlock)); #endif } /****************************************************************/ /* locks missing */ void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...)) { struct buf *bp; struct wapbl_entry *we; (*pr)("wapbl %p", wl); (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", wl->wl_circ_size, wl->wl_circ_off, (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); #ifdef WAPBL_DEBUG_BUFBYTES (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " "reserved = %zu errcnt = %d unsynced = %zu\n", wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, wl->wl_error_count, wl->wl_unsynced_bufbytes); #else (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, 
wl->wl_error_count); #endif (*pr)("\tdealloccnt = %d, dealloclim = %d\n", wl->wl_dealloccnt, wl->wl_dealloclim); (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", wl->wl_inohashcnt, wl->wl_inohashmask); (*pr)("entries:\n"); SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { #ifdef WAPBL_DEBUG_BUFBYTES (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " "unsynced = %zu\n", we->we_bufcount, we->we_reclaimable_bytes, we->we_error, we->we_unsynced_bufbytes); #else (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", we->we_bufcount, we->we_reclaimable_bytes, we->we_error); #endif } if (full) { int cnt = 0; (*pr)("bufs ="); LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { if (!LIST_NEXT(bp, b_wapbllist)) { (*pr)(" %p", bp); } else if ((++cnt % 6) == 0) { (*pr)(" %p,\n\t", bp); } else { (*pr)(" %p,", bp); } } (*pr)("\n"); (*pr)("dealloced blks = "); { int i; cnt = 0; for (i = 0; i < wl->wl_dealloccnt; i++) { (*pr)(" %"PRId64":%d,", wl->wl_deallocblks[i], wl->wl_dealloclens[i]); if ((++cnt % 4) == 0) { (*pr)("\n\t"); } } } (*pr)("\n"); (*pr)("registered inodes = "); { int i; cnt = 0; for (i = 0; i <= wl->wl_inohashmask; i++) { struct wapbl_ino_head *wih; struct wapbl_ino *wi; wih = &wl->wl_inohash[i]; LIST_FOREACH(wi, wih, wi_hash) { if (wi->wi_ino == 0) continue; (*pr)(" %"PRId32"/0%06"PRIo32",", wi->wi_ino, wi->wi_mode); if ((++cnt % 4) == 0) { (*pr)("\n\t"); } } } (*pr)("\n"); } } } #if defined(WAPBL_DEBUG) || defined(DDB) void wapbl_dump(struct wapbl *wl) { #if defined(WAPBL_DEBUG) if (!wl) wl = wapbl_debug_wl; #endif if (!wl) return; wapbl_print(wl, 1, printf); } #endif /****************************************************************/ void wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) { wapbl_jlock_assert(wl); /* XXX should eventually instead tie this into resource estimation */ /* XXX this KASSERT needs locking/mutex analysis */ KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim); wl->wl_deallocblks[wl->wl_dealloccnt] = blk; 
wl->wl_dealloclens[wl->wl_dealloccnt] = len; wl->wl_dealloccnt++; WAPBL_PRINTF(WAPBL_PRINT_ALLOC, ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); } /****************************************************************/ static void wapbl_inodetrk_init(struct wapbl *wl, u_int size) { wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask); if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) { pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0, "wapblinopl", &pool_allocator_nointr, IPL_NONE); } } static void wapbl_inodetrk_free(struct wapbl *wl) { /* XXX this KASSERT needs locking/mutex analysis */ KASSERT(wl->wl_inohashcnt == 0); hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask); if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) { pool_destroy(&wapbl_ino_pool); } } static struct wapbl_ino * wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) { struct wapbl_ino_head *wih; struct wapbl_ino *wi; KASSERT(mutex_owned(&wl->wl_mtx)); wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; LIST_FOREACH(wi, wih, wi_hash) { if (ino == wi->wi_ino) return wi; } return 0; } void wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) { struct wapbl_ino_head *wih; struct wapbl_ino *wi; wi = pool_get(&wapbl_ino_pool, PR_WAITOK); mutex_enter(&wl->wl_mtx); if (wapbl_inodetrk_get(wl, ino) == NULL) { wi->wi_ino = ino; wi->wi_mode = mode; wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; LIST_INSERT_HEAD(wih, wi, wi_hash); wl->wl_inohashcnt++; WAPBL_PRINTF(WAPBL_PRINT_INODE, ("wapbl_register_inode: ino=%"PRId64"\n", ino)); mutex_exit(&wl->wl_mtx); } else { mutex_exit(&wl->wl_mtx); pool_put(&wapbl_ino_pool, wi); } } void wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) { struct wapbl_ino *wi; mutex_enter(&wl->wl_mtx); wi = wapbl_inodetrk_get(wl, ino); if (wi) { WAPBL_PRINTF(WAPBL_PRINT_INODE, ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); KASSERT(wl->wl_inohashcnt > 0); wl->wl_inohashcnt--; LIST_REMOVE(wi, wi_hash); 
mutex_exit(&wl->wl_mtx); pool_put(&wapbl_ino_pool, wi); } else { mutex_exit(&wl->wl_mtx); } } /****************************************************************/ static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl) { int blocklen = 1<wl_log_dev_bshift; int iph; /* Calculate number of inodes described in a inodelist header */ iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); KASSERT(iph > 0); return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen; } /* Calculate amount of space a transaction will take on disk */ static size_t wapbl_transaction_len(struct wapbl *wl) { int blocklen = 1<wl_log_dev_bshift; size_t len; int bph; /* Calculate number of blocks described in a blocklist header */ bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); KASSERT(bph > 0); len = wl->wl_bcount; len += howmany(wl->wl_bufcount, bph)*blocklen; len += howmany(wl->wl_dealloccnt, bph)*blocklen; len += wapbl_transaction_inodes_len(wl); return len; } /* * Perform commit operation * * Note that generation number incrementation needs to * be protected against racing with other invocations * of wapbl_commit. 
This is ok since this routine * is only invoked from wapbl_flush */ static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) { struct wapbl_wc_header *wc = wl->wl_wc_header; struct timespec ts; int error; int force = 1; /* XXX Calc checksum here, instead we do this for now */ error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); if (error) { WAPBL_PRINTF(WAPBL_PRINT_ERROR, ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " "returned %d\n", wl->wl_devvp->v_rdev, error)); } wc->wc_head = head; wc->wc_tail = tail; wc->wc_checksum = 0; wc->wc_version = 1; getnanotime(&ts); wc->wc_time = ts.tv_sec;; wc->wc_timensec = ts.tv_nsec; WAPBL_PRINTF(WAPBL_PRINT_WRITE, ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", (intmax_t)head, (intmax_t)tail)); /* * XXX if generation will rollover, then first zero * over second commit header before trying to write both headers. */ error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, wl->wl_logpbn + wc->wc_generation % 2); if (error) return error; error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); if (error) { WAPBL_PRINTF(WAPBL_PRINT_ERROR, ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " "returned %d\n", wl->wl_devvp->v_rdev, error)); } /* * If the generation number was zero, write it out a second time. * This handles initialization and generation number rollover */ if (wc->wc_generation++ == 0) { error = wapbl_write_commit(wl, head, tail); /* * This panic should be able to be removed if we do the * zero'ing mentioned above, and we are certain to roll * back generation number on failure. 
*/ if (error) panic("wapbl_write_commit: error writing duplicate " "log header: %d\n", error); } return 0; } /* Returns new offset value */ static int wapbl_write_blocks(struct wapbl *wl, off_t *offp) { struct wapbl_wc_blocklist *wc = (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; int blocklen = 1<wl_log_dev_bshift; int bph; struct buf *bp; off_t off = *offp; int error; size_t padding; KASSERT(rw_write_held(&wl->wl_rwlock)); bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); bp = LIST_FIRST(&wl->wl_bufs); while (bp) { int cnt; struct buf *obp = bp; KASSERT(bp->b_flags & B_LOCKED); wc->wc_type = WAPBL_WC_BLOCKS; wc->wc_len = blocklen; wc->wc_blkcount = 0; while (bp && (wc->wc_blkcount < bph)) { /* * Make sure all the physical block numbers are up to * date. If this is not always true on a given * filesystem, then VOP_BMAP must be called. We * could call VOP_BMAP here, or else in the filesystem * specific flush callback, although neither of those * solutions allow us to take the vnode lock. If a * filesystem requires that we must take the vnode lock * to call VOP_BMAP, then we can probably do it in * bwrite when the vnode lock should already be held * by the invoking code. 
*/ KASSERT((bp->b_vp->v_type == VBLK) || (bp->b_blkno != bp->b_lblkno)); KASSERT(bp->b_blkno > 0); wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; wc->wc_len += bp->b_bcount; wc->wc_blkcount++; bp = LIST_NEXT(bp, b_wapbllist); } if (wc->wc_len % blocklen != 0) { padding = blocklen - wc->wc_len % blocklen; wc->wc_len += padding; } else { padding = 0; } WAPBL_PRINTF(WAPBL_PRINT_WRITE, ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n", wc->wc_len, padding, (intmax_t)off)); error = wapbl_circ_write(wl, wc, blocklen, &off); if (error) return error; bp = obp; cnt = 0; while (bp && (cnt++ < bph)) { error = wapbl_circ_write(wl, bp->b_data, bp->b_bcount, &off); if (error) return error; bp = LIST_NEXT(bp, b_wapbllist); } if (padding) { void *zero; zero = wapbl_malloc(padding); memset(zero, 0, padding); error = wapbl_circ_write(wl, zero, padding, &off); wapbl_free(zero); if (error) return error; } } *offp = off; return 0; } static int wapbl_write_revocations(struct wapbl *wl, off_t *offp) { struct wapbl_wc_blocklist *wc = (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; int i; int blocklen = 1<wl_log_dev_bshift; int bph; off_t off = *offp; int error; if (wl->wl_dealloccnt == 0) return 0; bph = (blocklen - offsetof(struct wapbl_wc_blocklist, wc_blocks)) / sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); i = 0; while (i < wl->wl_dealloccnt) { wc->wc_type = WAPBL_WC_REVOCATIONS; wc->wc_len = blocklen; wc->wc_blkcount = 0; while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { wc->wc_blocks[wc->wc_blkcount].wc_daddr = wl->wl_deallocblks[i]; wc->wc_blocks[wc->wc_blkcount].wc_dlen = wl->wl_dealloclens[i]; wc->wc_blkcount++; i++; } WAPBL_PRINTF(WAPBL_PRINT_WRITE, ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", wc->wc_len, (intmax_t)off)); error = wapbl_circ_write(wl, wc, blocklen, &off); if (error) return error; } *offp = off; return 0; } static int wapbl_write_inodes(struct wapbl *wl, 
off_t *offp) { struct wapbl_wc_inodelist *wc = (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; int i; int blocklen = 1<wl_log_dev_bshift; off_t off = *offp; int error; struct wapbl_ino_head *wih; struct wapbl_ino *wi; int iph; iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) / sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); i = 0; wih = &wl->wl_inohash[0]; wi = 0; do { wc->wc_type = WAPBL_WC_INODES; wc->wc_len = blocklen; wc->wc_inocnt = 0; wc->wc_clear = (i == 0); while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { while (!wi) { KASSERT((wih - &wl->wl_inohash[0]) <= wl->wl_inohashmask); wi = LIST_FIRST(wih++); } wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; wc->wc_inocnt++; i++; wi = LIST_NEXT(wi, wi_hash); } WAPBL_PRINTF(WAPBL_PRINT_WRITE, ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", wc->wc_len, (intmax_t)off)); error = wapbl_circ_write(wl, wc, blocklen, &off); if (error) return error; } while (i < wl->wl_inohashcnt); *offp = off; return 0; } #endif /* _KERNEL */ /****************************************************************/ #ifdef _KERNEL static struct pool wapbl_blk_pool; static int wapbl_blk_pool_refcount; #endif struct wapbl_blk { LIST_ENTRY(wapbl_blk) wb_hash; daddr_t wb_blk; off_t wb_off; /* Offset of this block in the log */ }; #define WAPBL_BLKPOOL_MIN 83 static void wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) { if (size < WAPBL_BLKPOOL_MIN) size = WAPBL_BLKPOOL_MIN; KASSERT(wr->wr_blkhash == 0); #ifdef _KERNEL wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask); if (atomic_inc_uint_nv(&wapbl_blk_pool_refcount) == 1) { pool_init(&wapbl_blk_pool, sizeof(struct wapbl_blk), 0, 0, 0, "wapblblkpl", &pool_allocator_nointr, IPL_NONE); } #else /* ! 
_KERNEL */ /* Manually implement hashinit */ { int i; unsigned long hashsize; for (hashsize = 1; hashsize < size; hashsize <<= 1) continue; wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash)); for (i = 0; i < wr->wr_blkhashmask; i++) LIST_INIT(&wr->wr_blkhash[i]); wr->wr_blkhashmask = hashsize - 1; } #endif /* ! _KERNEL */ } static void wapbl_blkhash_free(struct wapbl_replay *wr) { KASSERT(wr->wr_blkhashcnt == 0); #ifdef _KERNEL hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask); if (atomic_dec_uint_nv(&wapbl_blk_pool_refcount) == 0) { pool_destroy(&wapbl_blk_pool); } #else /* ! _KERNEL */ wapbl_free(wr->wr_blkhash); #endif /* ! _KERNEL */ } static struct wapbl_blk * wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) { struct wapbl_blk_head *wbh; struct wapbl_blk *wb; wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; LIST_FOREACH(wb, wbh, wb_hash) { if (blk == wb->wb_blk) return wb; } return 0; } static void wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) { struct wapbl_blk_head *wbh; struct wapbl_blk *wb; wb = wapbl_blkhash_get(wr, blk); if (wb) { KASSERT(wb->wb_blk == blk); wb->wb_off = off; } else { #ifdef _KERNEL wb = pool_get(&wapbl_blk_pool, PR_WAITOK); #else /* ! _KERNEL */ wb = wapbl_malloc(sizeof(*wb)); #endif /* ! _KERNEL */ wb->wb_blk = blk; wb->wb_off = off; wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; LIST_INSERT_HEAD(wbh, wb, wb_hash); wr->wr_blkhashcnt++; } } static void wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) { struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); if (wb) { KASSERT(wr->wr_blkhashcnt > 0); wr->wr_blkhashcnt--; LIST_REMOVE(wb, wb_hash); #ifdef _KERNEL pool_put(&wapbl_blk_pool, wb); #else /* ! _KERNEL */ wapbl_free(wb); #endif /* ! 
_KERNEL */ } } static void wapbl_blkhash_clear(struct wapbl_replay *wr) { int i; for (i = 0; i <= wr->wr_blkhashmask; i++) { struct wapbl_blk *wb; while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { KASSERT(wr->wr_blkhashcnt > 0); wr->wr_blkhashcnt--; LIST_REMOVE(wb, wb_hash); #ifdef _KERNEL pool_put(&wapbl_blk_pool, wb); #else /* ! _KERNEL */ wapbl_free(wb); #endif /* ! _KERNEL */ } } KASSERT(wr->wr_blkhashcnt == 0); } /****************************************************************/ static int wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) { size_t slen; struct wapbl_wc_header *wc = &wr->wr_wc_header; off_t off = *offp; int error; KASSERT(((len >> wc->wc_log_dev_bshift) << wc->wc_log_dev_bshift) == len); if (off < wc->wc_circ_off) off = wc->wc_circ_off; slen = wc->wc_circ_off + wc->wc_circ_size - off; if (slen < len) { error = wapbl_read(data, slen, wr->wr_devvp, wr->wr_logpbn + (off >> wc->wc_log_dev_bshift)); if (error) return error; data = (uint8_t *)data + slen; len -= slen; off = wc->wc_circ_off; } error = wapbl_read(data, len, wr->wr_devvp, wr->wr_logpbn + (off >> wc->wc_log_dev_bshift)); if (error) return error; off += len; if (off >= wc->wc_circ_off + wc->wc_circ_size) off = wc->wc_circ_off; *offp = off; return 0; } static void wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) { size_t slen; struct wapbl_wc_header *wc = &wr->wr_wc_header; off_t off = *offp; KASSERT(((len >> wc->wc_log_dev_bshift) << wc->wc_log_dev_bshift) == len); if (off < wc->wc_circ_off) off = wc->wc_circ_off; slen = wc->wc_circ_off + wc->wc_circ_size - off; if (slen < len) { len -= slen; off = wc->wc_circ_off; } off += len; if (off >= wc->wc_circ_off + wc->wc_circ_size) off = wc->wc_circ_off; *offp = off; } /****************************************************************/ int wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, daddr_t off, size_t count, size_t blksize) { struct wapbl_replay *wr; int error; struct vnode 
*devvp; daddr_t logpbn; uint8_t *scratch; struct wapbl_wc_header *wch; struct wapbl_wc_header *wch2; /* Use this until we read the actual log header */ int log_dev_bshift = DEV_BSHIFT; size_t used; WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", vp, off, count, blksize)); if (off < 0) return EINVAL; if (blksize < DEV_BSIZE) return EINVAL; if (blksize % DEV_BSIZE) return EINVAL; #ifdef _KERNEL #if 0 /* XXX vp->v_size isn't reliably set for VBLK devices, * especially root. However, we might still want to verify * that the full load is readable */ if ((off + count) * blksize > vp->v_size) return EINVAL; #endif if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { return error; } #else /* ! _KERNEL */ devvp = vp; logpbn = off; #endif /* ! _KERNEL */ scratch = wapbl_malloc(MAXBSIZE); error = wapbl_read(scratch, 2<wc_type != WAPBL_WC_HEADER) { printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); error = EFTYPE; goto errout; } if (wch2->wc_generation > wch->wc_generation) wch = wch2; wr = wapbl_calloc(1, sizeof(*wr)); wr->wr_logvp = vp; wr->wr_devvp = devvp; wr->wr_logpbn = logpbn; wr->wr_scratch = scratch; memcpy(&wr->wr_wc_header, wch, sizeof(wr->wr_wc_header)); used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 " len=%"PRId64" used=%zu\n", wch->wc_head, wch->wc_tail, wch->wc_circ_off, wch->wc_circ_size, used)); wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); error = wapbl_replay_prescan(wr); if (error) { wapbl_replay_stop(wr); wapbl_replay_free(wr); return error; } error = wapbl_replay_get_inodes(wr); if (error) { wapbl_replay_stop(wr); wapbl_replay_free(wr); return error; } *wrp = wr; return 0; errout: wapbl_free(scratch); return error; } void wapbl_replay_stop(struct wapbl_replay *wr) { if (!wapbl_replay_isopen(wr)) return; WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop 
called\n")); wapbl_free(wr->wr_scratch); wr->wr_scratch = 0; wr->wr_logvp = 0; wapbl_blkhash_clear(wr); wapbl_blkhash_free(wr); } void wapbl_replay_free(struct wapbl_replay *wr) { KDASSERT(!wapbl_replay_isopen(wr)); if (wr->wr_inodes) wapbl_free(wr->wr_inodes); wapbl_free(wr); } #ifdef _KERNEL int wapbl_replay_isopen1(struct wapbl_replay *wr) { return wapbl_replay_isopen(wr); } #endif static int wapbl_replay_prescan(struct wapbl_replay *wr) { off_t off; struct wapbl_wc_header *wch = &wr->wr_wc_header; int error; int logblklen = 1<wc_log_dev_bshift; int fsblklen = 1<wc_fs_dev_bshift; wapbl_blkhash_clear(wr); off = wch->wc_tail; while (off != wch->wc_head) { struct wapbl_wc_null *wcn; off_t saveoff = off; error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); if (error) goto errout; wcn = (struct wapbl_wc_null *)wr->wr_scratch; switch (wcn->wc_type) { case WAPBL_WC_BLOCKS: { struct wapbl_wc_blocklist *wc = (struct wapbl_wc_blocklist *)wr->wr_scratch; int i; for (i = 0; i < wc->wc_blkcount; i++) { int j, n; /* * Enter each physical block into the * hashtable independently */ n = wc->wc_blocks[i].wc_dlen >> wch->wc_fs_dev_bshift; for (j = 0; j < n; j++) { wapbl_blkhash_ins(wr, wc->wc_blocks[i].wc_daddr + j, off); wapbl_circ_advance(wr, fsblklen, &off); } } } break; case WAPBL_WC_REVOCATIONS: { struct wapbl_wc_blocklist *wc = (struct wapbl_wc_blocklist *)wr->wr_scratch; int i; for (i = 0; i < wc->wc_blkcount; i++) { int j, n; /* * Remove any blocks found from the * hashtable */ n = wc->wc_blocks[i].wc_dlen >> wch->wc_fs_dev_bshift; for (j = 0; j < n; j++) { wapbl_blkhash_rem(wr, wc->wc_blocks[i].wc_daddr + j); } } } break; case WAPBL_WC_INODES: { struct wapbl_wc_inodelist *wc = (struct wapbl_wc_inodelist *)wr->wr_scratch; /* * Keep track of where we found this so we * can use it later */ if (wc->wc_clear) { wr->wr_inodestail = saveoff; wr->wr_inodescnt = 0; } if (wr->wr_inodestail) wr->wr_inodeshead = off; wr->wr_inodescnt += wc->wc_inocnt; } break; default: 
printf("Unrecognized wapbl type: 0x%08x\n", wcn->wc_type); error = EFTYPE; goto errout; } wapbl_circ_advance(wr, wcn->wc_len, &saveoff); if (off != saveoff) { printf("wapbl_replay: corrupted records\n"); error = EFTYPE; goto errout; } } return 0; errout: wapbl_blkhash_clear(wr); return error; } static int wapbl_replay_get_inodes(struct wapbl_replay *wr) { off_t off; struct wapbl_wc_header *wch = &wr->wr_wc_header; int logblklen = 1<wc_log_dev_bshift; int cnt= 0; KDASSERT(wapbl_replay_isopen(wr)); if (wr->wr_inodescnt == 0) return 0; KASSERT(!wr->wr_inodes); wr->wr_inodes = wapbl_malloc(wr->wr_inodescnt*sizeof(wr->wr_inodes[0])); off = wr->wr_inodestail; while (off != wr->wr_inodeshead) { struct wapbl_wc_null *wcn; int error; off_t saveoff = off; error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); if (error) { wapbl_free(wr->wr_inodes); wr->wr_inodes = 0; return error; } wcn = (struct wapbl_wc_null *)wr->wr_scratch; switch (wcn->wc_type) { case WAPBL_WC_BLOCKS: case WAPBL_WC_REVOCATIONS: break; case WAPBL_WC_INODES: { struct wapbl_wc_inodelist *wc = (struct wapbl_wc_inodelist *)wr->wr_scratch; /* * Keep track of where we found this so we * can use it later */ if (wc->wc_clear) { cnt = 0; } /* This memcpy assumes that wr_inodes is * laid out the same as wc_inodes. 
*/ memcpy(&wr->wr_inodes[cnt], wc->wc_inodes, wc->wc_inocnt*sizeof(wc->wc_inodes[0])); cnt += wc->wc_inocnt; } break; default: KASSERT(0); } off = saveoff; wapbl_circ_advance(wr, wcn->wc_len, &off); } KASSERT(cnt == wr->wr_inodescnt); return 0; } #ifdef DEBUG int wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) { off_t off; struct wapbl_wc_header *wch = &wr->wr_wc_header; int mismatchcnt = 0; int logblklen = 1<wc_log_dev_bshift; int fsblklen = 1<wc_fs_dev_bshift; void *scratch1 = wapbl_malloc(MAXBSIZE); void *scratch2 = wapbl_malloc(MAXBSIZE); int error = 0; KDASSERT(wapbl_replay_isopen(wr)); off = wch->wc_tail; while (off != wch->wc_head) { struct wapbl_wc_null *wcn; #ifdef DEBUG off_t saveoff = off; #endif error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); if (error) goto out; wcn = (struct wapbl_wc_null *)wr->wr_scratch; switch (wcn->wc_type) { case WAPBL_WC_BLOCKS: { struct wapbl_wc_blocklist *wc = (struct wapbl_wc_blocklist *)wr->wr_scratch; int i; for (i = 0; i < wc->wc_blkcount; i++) { int foundcnt = 0; int dirtycnt = 0; int j, n; /* * Check each physical block into the * hashtable independently */ n = wc->wc_blocks[i].wc_dlen >> wch->wc_fs_dev_bshift; for (j = 0; j < n; j++) { struct wapbl_blk *wb = wapbl_blkhash_get(wr, wc->wc_blocks[i].wc_daddr + j); if (wb && (wb->wb_off == off)) { foundcnt++; error = wapbl_circ_read(wr, scratch1, fsblklen, &off); if (error) goto out; error = wapbl_read(scratch2, fsblklen, fsdevvp, wb->wb_blk); if (error) goto out; if (memcmp(scratch1, scratch2, fsblklen)) { printf( "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", wb->wb_blk, (intmax_t)off); dirtycnt++; mismatchcnt++; } } else { wapbl_circ_advance(wr, fsblklen, &off); } } #if 0 /* * If all of the blocks in an entry * are clean, then remove all of its * blocks from the hashtable since they * never will need replay. 
*/ if ((foundcnt != 0) && (dirtycnt == 0)) { off = saveoff; wapbl_circ_advance(wr, logblklen, &off); for (j = 0; j < n; j++) { struct wapbl_blk *wb = wapbl_blkhash_get(wr, wc->wc_blocks[i].wc_daddr + j); if (wb && (wb->wb_off == off)) { wapbl_blkhash_rem(wr, wb->wb_blk); } wapbl_circ_advance(wr, fsblklen, &off); } } #endif } } break; case WAPBL_WC_REVOCATIONS: case WAPBL_WC_INODES: break; default: KASSERT(0); } #ifdef DEBUG wapbl_circ_advance(wr, wcn->wc_len, &saveoff); KASSERT(off == saveoff); #endif } out: wapbl_free(scratch1); wapbl_free(scratch2); if (!error && mismatchcnt) error = EFTYPE; return error; } #endif int wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) { off_t off; struct wapbl_wc_header *wch = &wr->wr_wc_header; int logblklen = 1<wc_log_dev_bshift; int fsblklen = 1<wc_fs_dev_bshift; void *scratch1 = wapbl_malloc(MAXBSIZE); int error = 0; KDASSERT(wapbl_replay_isopen(wr)); /* * This parses the journal for replay, although it could * just as easily walk the hashtable instead. 
*/ off = wch->wc_tail; while (off != wch->wc_head) { struct wapbl_wc_null *wcn; #ifdef DEBUG off_t saveoff = off; #endif error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); if (error) goto out; wcn = (struct wapbl_wc_null *)wr->wr_scratch; switch (wcn->wc_type) { case WAPBL_WC_BLOCKS: { struct wapbl_wc_blocklist *wc = (struct wapbl_wc_blocklist *)wr->wr_scratch; int i; for (i = 0; i < wc->wc_blkcount; i++) { int j, n; /* * Check each physical block against * the hashtable independently */ n = wc->wc_blocks[i].wc_dlen >> wch->wc_fs_dev_bshift; for (j = 0; j < n; j++) { struct wapbl_blk *wb = wapbl_blkhash_get(wr, wc->wc_blocks[i].wc_daddr + j); if (wb && (wb->wb_off == off)) { error = wapbl_circ_read( wr, scratch1, fsblklen, &off); if (error) goto out; error = wapbl_write(scratch1, fsblklen, fsdevvp, wb->wb_blk); if (error) goto out; } else { wapbl_circ_advance(wr, fsblklen, &off); } } } } break; case WAPBL_WC_REVOCATIONS: case WAPBL_WC_INODES: break; default: KASSERT(0); } #ifdef DEBUG wapbl_circ_advance(wr, wcn->wc_len, &saveoff); KASSERT(off == saveoff); #endif } out: wapbl_free(scratch1); return error; } int wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len) { struct wapbl_wc_header *wch = &wr->wr_wc_header; int fsblklen = 1<wc_fs_dev_bshift; KDASSERT(wapbl_replay_isopen(wr)); KASSERT((len % fsblklen) == 0); while (len != 0) { struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); if (wb) return 1; len -= fsblklen; } return 0; } int wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) { struct wapbl_wc_header *wch = &wr->wr_wc_header; int fsblklen = 1<wc_fs_dev_bshift; KDASSERT(wapbl_replay_isopen(wr)); KASSERT((len % fsblklen) == 0); while (len != 0) { struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); if (wb) { off_t off = wb->wb_off; int error; error = wapbl_circ_read(wr, data, fsblklen, &off); if (error) return error; } data = (uint8_t *)data + fsblklen; len -= fsblklen; blk++; } return 0; }