sys: Membar audit around reference count releases.
If two threads are using an object that is freed when the reference
count goes to zero, we need to ensure that all memory operations
related to the object happen before freeing the object.
Using an atomic_dec_uint_nv(&refcnt) == 0 ensures that only one
thread takes responsibility for freeing, but it's not enough to
ensure that the other thread's memory operations happen before the
freeing.
Consider:
Thread A Thread B
obj->foo = 42; obj->baz = 73;
mumble(&obj->bar); grumble(&obj->quux);
/* membar_exit(); */ /* membar_exit(); */
atomic_dec -- not last atomic_dec -- last
/* membar_enter(); */
KASSERT(invariant(obj->foo,
obj->bar));
free_stuff(obj);
The memory barriers ensure that
obj->foo = 42;
mumble(&obj->bar);
in thread A happens before
KASSERT(invariant(obj->foo, obj->bar));
free_stuff(obj);
in thread B. Without them, this ordering is not guaranteed.
So in general it is necessary to do
membar_exit();
if (atomic_dec_uint_nv(&obj->refcnt) != 0)
return;
membar_enter();
to release a reference, for the `last one out hit the lights' style
of reference counting. (This is in contrast to the style where one
thread blocks new references and then waits under a lock for existing
ones to drain with a condvar -- no membar needed thanks to mutex(9).)
I searched for atomic_dec to find all these. Obviously we ought to
have a better abstraction for this because there's so much copypasta.
This is a stop-gap measure to fix actual bugs until we have that. It
would be nice if an abstraction could gracefully handle the different
styles of reference counting in use -- some years ago I drafted an
API for this, but making it cover everything got a little out of hand
(particularly with struct vnode::v_usecount) and I ended up setting
it aside to work on psref/localcount instead for better scalability.
I got bored of adding #ifdef __HAVE_ATOMIC_AS_MEMBAR everywhere, so I
only put it on things that look performance-critical on 5sec review.
We should really adopt membar_enter_preatomic/membar_exit_postatomic
or something (except they are applicable only to atomic r/m/w, not to
atomic_load/store_*, making the naming annoying) and get rid of all
the ifdefs.
2022-03-12 18:32:30 +03:00
|
|
|
/* $NetBSD: vfs_wapbl.c,v 1.110 2022/03/12 15:32:32 riastradh Exp $ */
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
/*-
|
2009-02-22 23:10:25 +03:00
|
|
|
* Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
|
2008-07-31 09:38:04 +04:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* This code is derived from software contributed to The NetBSD Foundation
|
|
|
|
* by Wasabi Systems, Inc.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
|
|
|
|
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
|
|
|
|
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This implements file system independent write ahead filesystem logging.
|
|
|
|
*/
|
2008-11-10 23:12:13 +03:00
|
|
|
|
|
|
|
#define WAPBL_INTERNAL
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
#include <sys/cdefs.h>
|
sys: Membar audit around reference count releases.
If two threads are using an object that is freed when the reference
count goes to zero, we need to ensure that all memory operations
related to the object happen before freeing the object.
Using an atomic_dec_uint_nv(&refcnt) == 0 ensures that only one
thread takes responsibility for freeing, but it's not enough to
ensure that the other thread's memory operations happen before the
freeing.
Consider:
Thread A Thread B
obj->foo = 42; obj->baz = 73;
mumble(&obj->bar); grumble(&obj->quux);
/* membar_exit(); */ /* membar_exit(); */
atomic_dec -- not last atomic_dec -- last
/* membar_enter(); */
KASSERT(invariant(obj->foo,
obj->bar));
free_stuff(obj);
The memory barriers ensure that
obj->foo = 42;
mumble(&obj->bar);
in thread A happens before
KASSERT(invariant(obj->foo, obj->bar));
free_stuff(obj);
in thread B. Without them, this ordering is not guaranteed.
So in general it is necessary to do
membar_exit();
if (atomic_dec_uint_nv(&obj->refcnt) != 0)
return;
membar_enter();
to release a reference, for the `last one out hit the lights' style
of reference counting. (This is in contrast to the style where one
thread blocks new references and then waits under a lock for existing
ones to drain with a condvar -- no membar needed thanks to mutex(9).)
I searched for atomic_dec to find all these. Obviously we ought to
have a better abstraction for this because there's so much copypasta.
This is a stop-gap measure to fix actual bugs until we have that. It
would be nice if an abstraction could gracefully handle the different
styles of reference counting in use -- some years ago I drafted an
API for this, but making it cover everything got a little out of hand
(particularly with struct vnode::v_usecount) and I ended up setting
it aside to work on psref/localcount instead for better scalability.
I got bored of adding #ifdef __HAVE_ATOMIC_AS_MEMBAR everywhere, so I
only put it on things that look performance-critical on 5sec review.
We should really adopt membar_enter_preatomic/membar_exit_postatomic
or something (except they are applicable only to atomic r/m/w, not to
atomic_load/store_*, making the naming annoying) and get rid of all
the ifdefs.
2022-03-12 18:32:30 +03:00
|
|
|
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.110 2022/03/12 15:32:32 riastradh Exp $");
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
#include <sys/param.h>
|
2010-02-23 23:51:25 +03:00
|
|
|
#include <sys/bitops.h>
|
2016-05-07 09:38:47 +03:00
|
|
|
#include <sys/time.h>
|
|
|
|
#include <sys/wapbl.h>
|
|
|
|
#include <sys/wapbl_replay.h>
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
#ifdef _KERNEL
|
2016-05-07 09:38:47 +03:00
|
|
|
|
|
|
|
#include <sys/atomic.h>
|
|
|
|
#include <sys/conf.h>
|
2017-03-17 06:16:29 +03:00
|
|
|
#include <sys/evcnt.h>
|
2016-05-07 09:38:47 +03:00
|
|
|
#include <sys/file.h>
|
|
|
|
#include <sys/kauth.h>
|
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/module.h>
|
|
|
|
#include <sys/mount.h>
|
|
|
|
#include <sys/mutex.h>
|
2008-07-31 09:38:04 +04:00
|
|
|
#include <sys/namei.h>
|
|
|
|
#include <sys/proc.h>
|
2016-05-07 09:38:47 +03:00
|
|
|
#include <sys/resourcevar.h>
|
2011-01-08 23:37:05 +03:00
|
|
|
#include <sys/sysctl.h>
|
2008-07-31 09:38:04 +04:00
|
|
|
#include <sys/uio.h>
|
|
|
|
#include <sys/vnode.h>
|
|
|
|
|
|
|
|
#include <miscfs/specfs/specdev.h>
|
|
|
|
|
2012-01-28 22:02:56 +04:00
|
|
|
#define wapbl_alloc(s) kmem_alloc((s), KM_SLEEP)
|
|
|
|
#define wapbl_free(a, s) kmem_free((a), (s))
|
|
|
|
#define wapbl_calloc(n, s) kmem_zalloc((n)*(s), KM_SLEEP)
|
2008-07-31 09:38:04 +04:00
|
|
|
|
2011-01-08 23:37:05 +03:00
|
|
|
static int wapbl_flush_disk_cache = 1;
|
|
|
|
static int wapbl_verbose_commit = 0;
|
2017-04-11 00:36:05 +03:00
|
|
|
static int wapbl_allow_dpofua = 0; /* switched off by default for now */
|
2017-04-11 00:34:37 +03:00
|
|
|
static int wapbl_journal_iobufs = 4;
|
2011-01-08 23:37:05 +03:00
|
|
|
|
2013-09-15 12:11:33 +04:00
|
|
|
static inline size_t wapbl_space_free(size_t, off_t, off_t);
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
#else /* !_KERNEL */
|
2016-05-07 09:38:47 +03:00
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
#include <assert.h>
|
|
|
|
#include <errno.h>
|
|
|
|
#include <stdbool.h>
|
2016-05-07 09:38:47 +03:00
|
|
|
#include <stdio.h>
|
2008-07-31 09:38:04 +04:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#define KDASSERT(x) assert(x)
|
|
|
|
#define KASSERT(x) assert(x)
|
2012-01-28 22:02:56 +04:00
|
|
|
#define wapbl_alloc(s) malloc(s)
|
2009-01-31 12:33:36 +03:00
|
|
|
#define wapbl_free(a, s) free(a)
|
2008-07-31 09:38:04 +04:00
|
|
|
#define wapbl_calloc(n, s) calloc((n), (s))
|
|
|
|
|
|
|
|
#endif /* !_KERNEL */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* INTERNAL DATA STRUCTURES
|
|
|
|
*/
|
|
|
|
|
2017-03-17 06:17:07 +03:00
|
|
|
/*
|
2008-07-31 09:38:04 +04:00
|
|
|
* This structure holds per-mount log information.
|
|
|
|
*
|
|
|
|
* Legend: a = atomic access only
|
|
|
|
* r = read-only after init
|
|
|
|
* l = rwlock held
|
|
|
|
* m = mutex held
|
2010-11-09 19:30:26 +03:00
|
|
|
* lm = rwlock held writing or mutex held
|
2008-07-31 09:38:04 +04:00
|
|
|
* u = unlocked access ok
|
|
|
|
* b = bufcache_lock held
|
|
|
|
*/
|
2014-09-05 09:57:21 +04:00
|
|
|
LIST_HEAD(wapbl_ino_head, wapbl_ino);
|
2008-07-31 09:38:04 +04:00
|
|
|
/*
 * Per-mount WAPBL journal state.  One instance exists for each mounted
 * file system with journaling enabled.  See the locking legend above
 * for the meaning of the r/l/m/lm/u/b access annotations.
 */
struct wapbl {
	struct vnode *wl_logvp;	/* r:	log here */
	struct vnode *wl_devvp;	/* r:	log on this device */
	struct mount *wl_mount;	/* r:	mountpoint wl is associated with */
	daddr_t wl_logpbn;	/* r:	Physical block number of start of log */
	int wl_log_dev_bshift;	/* r:	logarithm of device block size of log
					device */
	int wl_fs_dev_bshift;	/* r:	logarithm of device block size of
					filesystem device */

	unsigned wl_lock_count;	/* m:	Count of transactions in progress */

	size_t wl_circ_size; 	/* r:	Number of bytes in buffer of log */
	size_t wl_circ_off;	/* r:	Number of bytes reserved at start */

	size_t wl_bufcount_max;	/* r:	Number of buffers reserved for log */
	size_t wl_bufbytes_max;	/* r:	Number of buf bytes reserved for log */

	off_t wl_head;		/* l:	Byte offset of log head */
	off_t wl_tail;		/* l:	Byte offset of log tail */
	/*
	 * WAPBL log layout, stored on wl_devvp at wl_logpbn:
	 *
	 *  ___________________ wl_circ_size __________________
	 * /                                                   \
	 * +---------+---------+-------+--------------+--------+
	 * [ commit0 | commit1 | CCWCW | EEEEEEEEEEEE | CCCWCW ]
	 * +---------+---------+-------+--------------+--------+
	 *       wl_circ_off --^       ^-- wl_head    ^-- wl_tail
	 *
	 * commit0 and commit1 are commit headers.  A commit header has
	 * a generation number, indicating which of the two headers is
	 * more recent, and an assignment of head and tail pointers.
	 * The rest is a circular queue of log records, starting at
	 * the byte offset wl_circ_off.
	 *
	 * E marks empty space for records.
	 * W marks records for block writes issued but waiting.
	 * C marks completed records.
	 *
	 * wapbl_flush writes new records to empty `E' spaces after
	 * wl_head from the current transaction in memory.
	 *
	 * wapbl_truncate advances wl_tail past any completed `C'
	 * records, freeing them up for use.
	 *
	 * head == tail == 0 means log is empty.
	 * head == tail != 0 means log is full.
	 *
	 * See assertions in wapbl_advance() for other boundary
	 * conditions.
	 *
	 * Only wapbl_flush moves the head, except when wapbl_truncate
	 * sets it to 0 to indicate that the log is empty.
	 *
	 * Only wapbl_truncate moves the tail, except when wapbl_flush
	 * sets it to wl_circ_off to indicate that the log is full.
	 */

	struct wapbl_wc_header *wl_wc_header;	/* l	*/
	void *wl_wc_scratch;	/* l:	scratch space (XXX: por que?!?) */

	kmutex_t wl_mtx;	/* u:	short-term lock */
	krwlock_t wl_rwlock;	/* u:	File system transaction lock */

	/*
	 * Must be held while accessing
	 * wl_count or wl_bufs or head or tail
	 */

#if _KERNEL
	/*
	 * Callback called from within the flush routine to flush any extra
	 * bits.  Note that flush may be skipped without calling this if
	 * there are no outstanding buffers in the transaction.
	 */
	wapbl_flush_fn_t wl_flush;	/* r	*/
	wapbl_flush_fn_t wl_flush_abort;/* r	*/

	/* Event counters */
	char wl_ev_group[EVCNT_STRING_MAX];	/* r	*/
	struct evcnt wl_ev_commit;		/* l	*/
	struct evcnt wl_ev_journalwrite;	/* l	*/
	struct evcnt wl_ev_jbufs_bio_nowait;	/* l	*/
	struct evcnt wl_ev_metawrite;		/* lm	*/
	struct evcnt wl_ev_cacheflush;		/* l	*/
#endif

	size_t wl_bufbytes;	/* m:	Byte count of pages in wl_bufs */
	size_t wl_bufcount;	/* m:	Count of buffers in wl_bufs */
	size_t wl_bcount;	/* m:	Total bcount of wl_bufs */

	TAILQ_HEAD(, buf) wl_bufs; /* m:	Buffers in current transaction */

	kcondvar_t wl_reclaimable_cv;	/* m (obviously) */
	size_t wl_reclaimable_bytes; /* m:	Amount of space available for
						reclamation by truncate */
	int wl_error_count;	/* m:	# of wl_entries with errors */
	size_t wl_reserved_bytes; /* never truncate log smaller than this */

#ifdef WAPBL_DEBUG_BUFBYTES
	size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif

#if _KERNEL
	int wl_brperjblock;	/* r Block records per journal block */
#endif

	TAILQ_HEAD(, wapbl_dealloc) wl_dealloclist;	/* lm:	list head */
	int wl_dealloccnt;			/* lm:	total count */
	int wl_dealloclim;			/* r:	max count */

	/* hashtable of inode numbers for allocated but unlinked inodes */
	/* synch ??? */
	struct wapbl_ino_head *wl_inohash;
	u_long wl_inohashmask;
	int wl_inohashcnt;

	SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* m:	On disk transaction
						   accounting */

	/* buffers for wapbl_buffered_write() */
	TAILQ_HEAD(, buf) wl_iobufs;		/* l: Free or filling bufs */
	TAILQ_HEAD(, buf) wl_iobufs_busy;	/* l: In-transit bufs */

	int wl_dkcache;		/* r: 	disk cache flags */
#define WAPBL_USE_FUA(wl)	\
		(wapbl_allow_dpofua && ISSET((wl)->wl_dkcache, DKCACHE_FUA))
#define WAPBL_JFLAGS(wl)	\
		(WAPBL_USE_FUA(wl) ? (wl)->wl_jwrite_flags : 0)
#define WAPBL_JDATA_FLAGS(wl)	\
		(WAPBL_JFLAGS(wl) & B_MEDIA_DPO)	/* only DPO */
	int wl_jwrite_flags;	/* r: 	journal write flags */
};
|
|
|
|
|
|
|
|
#ifdef WAPBL_DEBUG_PRINT
|
|
|
|
int wapbl_debug_print = WAPBL_DEBUG_PRINT;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
#ifdef _KERNEL
|
|
|
|
|
|
|
|
#ifdef WAPBL_DEBUG
|
|
|
|
struct wapbl *wapbl_debug_wl;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail);
|
|
|
|
static int wapbl_write_blocks(struct wapbl *wl, off_t *offp);
|
|
|
|
static int wapbl_write_revocations(struct wapbl *wl, off_t *offp);
|
|
|
|
static int wapbl_write_inodes(struct wapbl *wl, off_t *offp);
|
|
|
|
#endif /* _KERNEL */
|
|
|
|
|
2008-11-19 01:21:48 +03:00
|
|
|
static int wapbl_replay_process(struct wapbl_replay *wr, off_t, off_t);
|
2008-07-31 09:38:04 +04:00
|
|
|
|
2010-02-06 15:10:59 +03:00
|
|
|
static inline size_t wapbl_space_used(size_t avail, off_t head,
|
2008-07-31 09:38:04 +04:00
|
|
|
off_t tail);
|
|
|
|
|
|
|
|
#ifdef _KERNEL
|
|
|
|
|
2012-01-28 22:02:56 +04:00
|
|
|
static struct pool wapbl_entry_pool;
|
2016-10-01 16:15:45 +03:00
|
|
|
static struct pool wapbl_dealloc_pool;
|
2012-01-28 22:02:56 +04:00
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
#define WAPBL_INODETRK_SIZE 83
|
|
|
|
static int wapbl_ino_pool_refcount;
|
|
|
|
static struct pool wapbl_ino_pool;
|
|
|
|
/*
 * Entry in the per-mount hash table (wl_inohash) tracking inodes that
 * are allocated but unlinked, so replay can finish removing them.
 */
struct wapbl_ino {
	LIST_ENTRY(wapbl_ino) wi_hash;	/* hash chain linkage */
	ino_t wi_ino;			/* inode number */
	mode_t wi_mode;			/* inode mode, logged with the number */
};
|
|
|
|
|
|
|
|
static void wapbl_inodetrk_init(struct wapbl *wl, u_int size);
|
|
|
|
static void wapbl_inodetrk_free(struct wapbl *wl);
|
|
|
|
static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino);
|
|
|
|
|
|
|
|
static size_t wapbl_transaction_len(struct wapbl *wl);
|
2010-02-06 15:10:59 +03:00
|
|
|
static inline size_t wapbl_transaction_inodes_len(struct wapbl *wl);
|
2008-07-31 09:38:04 +04:00
|
|
|
|
2016-11-10 23:56:32 +03:00
|
|
|
static void wapbl_deallocation_free(struct wapbl *, struct wapbl_dealloc *,
|
|
|
|
bool);
|
|
|
|
|
2017-03-05 16:57:29 +03:00
|
|
|
static void wapbl_evcnt_init(struct wapbl *);
|
|
|
|
static void wapbl_evcnt_free(struct wapbl *);
|
|
|
|
|
2017-04-05 23:38:53 +03:00
|
|
|
static void wapbl_dkcache_init(struct wapbl *);
|
|
|
|
|
2008-11-18 22:31:35 +03:00
|
|
|
#if 0
|
2008-11-10 23:12:13 +03:00
|
|
|
int wapbl_replay_verify(struct wapbl_replay *, struct vnode *);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static int wapbl_replay_isopen1(struct wapbl_replay *);
|
|
|
|
|
2018-12-11 00:19:33 +03:00
|
|
|
/*
 * Operations vector exported to file systems; maps the generic
 * wo_wapbl_* hooks onto this implementation's entry points.
 */
const struct wapbl_ops wapbl_ops = {
	.wo_wapbl_discard	= wapbl_discard,
	.wo_wapbl_replay_isopen	= wapbl_replay_isopen1,
	.wo_wapbl_replay_can_read = wapbl_replay_can_read,
	.wo_wapbl_replay_read	= wapbl_replay_read,
	.wo_wapbl_add_buf	= wapbl_add_buf,
	.wo_wapbl_remove_buf	= wapbl_remove_buf,
	.wo_wapbl_resize_buf	= wapbl_resize_buf,
	.wo_wapbl_begin		= wapbl_begin,
	.wo_wapbl_end		= wapbl_end,
	.wo_wapbl_junlock_assert= wapbl_junlock_assert,
	.wo_wapbl_jlock_assert	= wapbl_jlock_assert,

	/* XXX: the following is only used to say "this is a wapbl buf" */
	.wo_wapbl_biodone	= wapbl_biodone,
};
|
|
|
|
|
2020-03-17 00:20:09 +03:00
|
|
|
/*
 * Create the vfs.wapbl sysctl subtree and its tunables:
 * flush_disk_cache, verbose_commit, allow_dpofua, journal_iobufs.
 * Each step bails out early if node creation fails.
 */
SYSCTL_SETUP(wapbl_sysctl_init, "wapbl sysctl")
{
	int rv;
	const struct sysctlnode *rnode, *cnode;

	/* Parent node: vfs.wapbl */
	rv = sysctl_createv(clog, 0, NULL, &rnode,
	    CTLFLAG_PERMANENT,
	    CTLTYPE_NODE, "wapbl",
	    SYSCTL_DESCR("WAPBL journaling options"),
	    NULL, 0, NULL, 0,
	    CTL_VFS, CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "flush_disk_cache",
	    SYSCTL_DESCR("flush disk cache"),
	    NULL, 0, &wapbl_flush_disk_cache, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "verbose_commit",
	    SYSCTL_DESCR("show time and size of wapbl log commits"),
	    NULL, 0, &wapbl_verbose_commit, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "allow_dpofua",
	    SYSCTL_DESCR("allow use of FUA/DPO instead of cache flush if available"),
	    NULL, 0, &wapbl_allow_dpofua, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	rv = sysctl_createv(clog, 0, &rnode, &cnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
	    CTLTYPE_INT, "journal_iobufs",
	    SYSCTL_DESCR("count of bufs used for journal I/O (max async count)"),
	    NULL, 0, &wapbl_journal_iobufs, 0,
	    CTL_CREATE, CTL_EOL);
	if (rv)
		return;

	return;
}
|
|
|
|
|
|
|
|
/*
 * wapbl_init:
 *
 *	Module initialization: create the pools backing transaction
 *	entries and deallocation records.  Paired with wapbl_fini().
 */
static void
wapbl_init(void)
{

	/* Transaction entries may be allocated from interrupt-adjacent
	 * context; use the kmem allocator at IPL_VM. */
	pool_init(&wapbl_entry_pool, sizeof(struct wapbl_entry), 0, 0, 0,
	    "wapblentrypl", &pool_allocator_kmem, IPL_VM);
	/* Deallocation records are only taken from thread context. */
	pool_init(&wapbl_dealloc_pool, sizeof(struct wapbl_dealloc), 0, 0, 0,
	    "wapbldealloc", &pool_allocator_nointr, IPL_NONE);
}
|
|
|
|
|
|
|
|
/*
 * wapbl_fini:
 *
 *	Module teardown: destroy the pools created in wapbl_init(),
 *	in reverse order of creation.  Always succeeds.
 */
static int
wapbl_fini(void)
{

	pool_destroy(&wapbl_dealloc_pool);
	pool_destroy(&wapbl_entry_pool);

	return 0;
}
|
|
|
|
|
2017-03-05 16:57:29 +03:00
|
|
|
/*
 * wapbl_evcnt_init(wl)
 *
 *	Attach this mount's event counters.  The group name embeds the
 *	file system's fsid so counters from different wapbl mounts can
 *	be distinguished (e.g. in vmstat -e output).
 */
static void
wapbl_evcnt_init(struct wapbl *wl)
{
	snprintf(wl->wl_ev_group, sizeof(wl->wl_ev_group),
	    "wapbl fsid 0x%x/0x%x",
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[0],
	    wl->wl_mount->mnt_stat.f_fsidx.__fsid_val[1]
	);

	evcnt_attach_dynamic(&wl->wl_ev_commit, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "commit");
	evcnt_attach_dynamic(&wl->wl_ev_journalwrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal write total");
	evcnt_attach_dynamic(&wl->wl_ev_jbufs_bio_nowait, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "journal write finished async");
	evcnt_attach_dynamic(&wl->wl_ev_metawrite, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "metadata async write");
	evcnt_attach_dynamic(&wl->wl_ev_cacheflush, EVCNT_TYPE_MISC,
	    NULL, wl->wl_ev_group, "cache flush");
}
|
|
|
|
|
|
|
|
/*
 * wapbl_evcnt_free(wl)
 *
 *	Detach every event counter attached by wapbl_evcnt_init().
 */
static void
wapbl_evcnt_free(struct wapbl *wl)
{
	evcnt_detach(&wl->wl_ev_commit);
	evcnt_detach(&wl->wl_ev_journalwrite);
	evcnt_detach(&wl->wl_ev_jbufs_bio_nowait);
	evcnt_detach(&wl->wl_ev_metawrite);
	evcnt_detach(&wl->wl_ev_cacheflush);
}
|
|
|
|
|
2017-04-05 23:38:53 +03:00
|
|
|
/*
 * wapbl_dkcache_init(wl)
 *
 *	Query the log device's cache capabilities (DIOCGCACHE) into
 *	wl_dkcache and derive the journal write flags: prefer FUA over
 *	explicit cache flushes, and add DPO when the device offers it.
 *	If the ioctl fails we conservatively assume a write-back cache.
 */
static void
wapbl_dkcache_init(struct wapbl *wl)
{
	int error;

	/* Get disk cache flags */
	error = VOP_IOCTL(wl->wl_devvp, DIOCGCACHE, &wl->wl_dkcache,
	    FWRITE, FSCRED);
	if (error) {
		/* behave as if there was a write cache */
		wl->wl_dkcache = DKCACHE_WRITE;
	}

	/* Use FUA instead of cache flush if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_FUA))
		wl->wl_jwrite_flags |= B_MEDIA_FUA;

	/* Use DPO for journal writes if available */
	if (ISSET(wl->wl_dkcache, DKCACHE_DPO))
		wl->wl_jwrite_flags |= B_MEDIA_DPO;
}
|
|
|
|
|
2009-02-18 16:12:00 +03:00
|
|
|
/*
 * wapbl_start_flush_inodes(wl, wr)
 *
 *	When mounting over an old log that still records allocated but
 *	unlinked inodes, carry that inode list forward into the new log:
 *	bump the commit generation, re-register each inode, position
 *	head/tail at the old inode records, and write the inode list so
 *	nothing is lost before the first new transaction commits.
 *
 *	Returns 0 on success, or the error from wapbl_write_inodes().
 */
static int
wapbl_start_flush_inodes(struct wapbl *wl, struct wapbl_replay *wr)
{
	int error, i;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_start: reusing log with %d inodes\n", wr->wr_inodescnt));

	/*
	 * Its only valid to reuse the replay log if its
	 * the same as the new log we just opened.
	 */
	KDASSERT(!wapbl_replay_isopen(wr));
	KASSERT(wl->wl_devvp->v_type == VBLK);
	KASSERT(wr->wr_devvp->v_type == VBLK);
	KASSERT(wl->wl_devvp->v_rdev == wr->wr_devvp->v_rdev);
	KASSERT(wl->wl_logpbn == wr->wr_logpbn);
	KASSERT(wl->wl_circ_size == wr->wr_circ_size);
	KASSERT(wl->wl_circ_off == wr->wr_circ_off);
	KASSERT(wl->wl_log_dev_bshift == wr->wr_log_dev_bshift);
	KASSERT(wl->wl_fs_dev_bshift == wr->wr_fs_dev_bshift);

	/* Newer generation than the replayed log's commit header. */
	wl->wl_wc_header->wc_generation = wr->wr_generation + 1;

	for (i = 0; i < wr->wr_inodescnt; i++)
		wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber,
		    wr->wr_inodes[i].wr_imode);

	/* Make sure new transaction won't overwrite old inodes list */
	KDASSERT(wapbl_transaction_len(wl) <=
	    wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead,
	    wr->wr_inodestail));

	wl->wl_head = wl->wl_tail = wr->wr_inodeshead;
	wl->wl_reclaimable_bytes = wl->wl_reserved_bytes =
	    wapbl_transaction_len(wl);

	error = wapbl_write_inodes(wl, &wl->wl_head);
	if (error)
		return error;

	/* Writing the inode list must have advanced the head. */
	KASSERT(wl->wl_head != wl->wl_tail);
	KASSERT(wl->wl_head != 0);

	return 0;
}
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
int
|
|
|
|
wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp,
|
|
|
|
daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr,
|
|
|
|
wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn)
|
|
|
|
{
|
|
|
|
struct wapbl *wl;
|
|
|
|
struct vnode *devvp;
|
|
|
|
daddr_t logpbn;
|
|
|
|
int error;
|
2010-02-23 23:51:25 +03:00
|
|
|
int log_dev_bshift = ilog2(blksize);
|
2010-02-27 01:24:07 +03:00
|
|
|
int fs_dev_bshift = log_dev_bshift;
|
2008-07-31 09:38:04 +04:00
|
|
|
int run;
|
|
|
|
|
|
|
|
WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64
|
|
|
|
" count=%zu blksize=%zu\n", vp, off, count, blksize));
|
|
|
|
|
|
|
|
if (log_dev_bshift > fs_dev_bshift) {
|
|
|
|
WAPBL_PRINTF(WAPBL_PRINT_OPEN,
|
|
|
|
("wapbl: log device's block size cannot be larger "
|
|
|
|
"than filesystem's\n"));
|
|
|
|
/*
|
|
|
|
* Not currently implemented, although it could be if
|
|
|
|
* needed someday.
|
|
|
|
*/
|
|
|
|
return ENOSYS;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (off < 0)
|
|
|
|
return EINVAL;
|
|
|
|
|
|
|
|
if (blksize < DEV_BSIZE)
|
|
|
|
return EINVAL;
|
|
|
|
if (blksize % DEV_BSIZE)
|
|
|
|
return EINVAL;
|
|
|
|
|
|
|
|
/* XXXTODO: verify that the full load is writable */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX check for minimum log size
|
|
|
|
* minimum is governed by minimum amount of space
|
|
|
|
* to complete a transaction. (probably truncate)
|
|
|
|
*/
|
|
|
|
/* XXX for now pick something minimal */
|
|
|
|
if ((count * blksize) < MAXPHYS) {
|
|
|
|
return ENOSPC;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) {
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
wl = wapbl_calloc(1, sizeof(*wl));
|
|
|
|
rw_init(&wl->wl_rwlock);
|
|
|
|
mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE);
|
|
|
|
cv_init(&wl->wl_reclaimable_cv, "wapblrec");
|
2017-04-10 22:52:38 +03:00
|
|
|
TAILQ_INIT(&wl->wl_bufs);
|
2008-07-31 09:38:04 +04:00
|
|
|
SIMPLEQ_INIT(&wl->wl_entries);
|
|
|
|
|
|
|
|
wl->wl_logvp = vp;
|
|
|
|
wl->wl_devvp = devvp;
|
|
|
|
wl->wl_mount = mp;
|
|
|
|
wl->wl_logpbn = logpbn;
|
|
|
|
wl->wl_log_dev_bshift = log_dev_bshift;
|
|
|
|
wl->wl_fs_dev_bshift = fs_dev_bshift;
|
|
|
|
|
|
|
|
wl->wl_flush = flushfn;
|
|
|
|
wl->wl_flush_abort = flushabortfn;
|
|
|
|
|
|
|
|
/* Reserve two log device blocks for the commit headers */
|
|
|
|
wl->wl_circ_off = 2<<wl->wl_log_dev_bshift;
|
2010-02-27 19:51:03 +03:00
|
|
|
wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off);
|
2008-07-31 09:38:04 +04:00
|
|
|
/* truncate the log usage to a multiple of log_dev_bshift */
|
|
|
|
wl->wl_circ_size >>= wl->wl_log_dev_bshift;
|
|
|
|
wl->wl_circ_size <<= wl->wl_log_dev_bshift;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* wl_bufbytes_max limits the size of the in memory transaction space.
|
|
|
|
* - Since buffers are allocated and accounted for in units of
|
|
|
|
* PAGE_SIZE it is required to be a multiple of PAGE_SIZE
|
|
|
|
* (i.e. 1<<PAGE_SHIFT)
|
|
|
|
* - Since the log device has to be written in units of
|
|
|
|
* 1<<wl_log_dev_bshift it is required to be a mulitple of
|
|
|
|
* 1<<wl_log_dev_bshift.
|
|
|
|
* - Since filesystem will provide data in units of 1<<wl_fs_dev_bshift,
|
|
|
|
* it is convenient to be a multiple of 1<<wl_fs_dev_bshift.
|
|
|
|
* Therefore it must be multiple of the least common multiple of those
|
|
|
|
* three quantities. Fortunately, all of those quantities are
|
|
|
|
* guaranteed to be a power of two, and the least common multiple of
|
|
|
|
* a set of numbers which are all powers of two is simply the maximum
|
|
|
|
* of those numbers. Finally, the maximum logarithm of a power of two
|
|
|
|
* is the same as the log of the maximum power of two. So we can do
|
|
|
|
* the following operations to size wl_bufbytes_max:
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* XXX fix actual number of pages reserved per filesystem. */
|
|
|
|
wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc() / 2);
|
|
|
|
|
|
|
|
/* Round wl_bufbytes_max to the largest power of two constraint */
|
|
|
|
wl->wl_bufbytes_max >>= PAGE_SHIFT;
|
|
|
|
wl->wl_bufbytes_max <<= PAGE_SHIFT;
|
|
|
|
wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift;
|
|
|
|
wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift;
|
|
|
|
wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift;
|
|
|
|
wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift;
|
|
|
|
|
|
|
|
/* XXX maybe use filesystem fragment size instead of 1024 */
|
|
|
|
/* XXX fix actual number of buffers reserved per filesystem. */
|
2017-06-08 04:23:01 +03:00
|
|
|
wl->wl_bufcount_max = (buf_nbuf() / 2) * 1024;
|
2008-07-31 09:38:04 +04:00
|
|
|
|
2016-09-22 19:20:56 +03:00
|
|
|
wl->wl_brperjblock = ((1<<wl->wl_log_dev_bshift)
|
|
|
|
- offsetof(struct wapbl_wc_blocklist, wc_blocks)) /
|
|
|
|
sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]);
|
|
|
|
KASSERT(wl->wl_brperjblock > 0);
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
/* XXX tie this into resource estimation */
|
2011-02-16 22:43:05 +03:00
|
|
|
wl->wl_dealloclim = wl->wl_bufbytes_max / mp->mnt_stat.f_bsize / 2;
|
2016-11-10 23:56:32 +03:00
|
|
|
TAILQ_INIT(&wl->wl_dealloclist);
|
2017-03-17 06:17:07 +03:00
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE);
|
|
|
|
|
2017-03-05 16:57:29 +03:00
|
|
|
wapbl_evcnt_init(wl);
|
|
|
|
|
2017-04-05 23:38:53 +03:00
|
|
|
wapbl_dkcache_init(wl);
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
/* Initialize the commit header */
|
|
|
|
{
|
|
|
|
struct wapbl_wc_header *wc;
|
2008-11-19 01:21:48 +03:00
|
|
|
size_t len = 1 << wl->wl_log_dev_bshift;
|
2008-07-31 09:38:04 +04:00
|
|
|
wc = wapbl_calloc(1, len);
|
|
|
|
wc->wc_type = WAPBL_WC_HEADER;
|
|
|
|
wc->wc_len = len;
|
|
|
|
wc->wc_circ_off = wl->wl_circ_off;
|
|
|
|
wc->wc_circ_size = wl->wl_circ_size;
|
|
|
|
/* XXX wc->wc_fsid */
|
|
|
|
wc->wc_log_dev_bshift = wl->wl_log_dev_bshift;
|
|
|
|
wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift;
|
|
|
|
wl->wl_wc_header = wc;
|
2012-01-28 22:02:56 +04:00
|
|
|
wl->wl_wc_scratch = wapbl_alloc(len);
|
2008-07-31 09:38:04 +04:00
|
|
|
}
|
|
|
|
|
2017-04-11 00:34:37 +03:00
|
|
|
TAILQ_INIT(&wl->wl_iobufs);
|
|
|
|
TAILQ_INIT(&wl->wl_iobufs_busy);
|
|
|
|
for (int i = 0; i < wapbl_journal_iobufs; i++) {
|
|
|
|
struct buf *bp;
|
|
|
|
|
|
|
|
if ((bp = geteblk(MAXPHYS)) == NULL)
|
|
|
|
goto errout;
|
|
|
|
|
|
|
|
mutex_enter(&bufcache_lock);
|
|
|
|
mutex_enter(devvp->v_interlock);
|
|
|
|
bgetvp(devvp, bp);
|
|
|
|
mutex_exit(devvp->v_interlock);
|
|
|
|
mutex_exit(&bufcache_lock);
|
|
|
|
|
|
|
|
bp->b_dev = devvp->v_rdev;
|
|
|
|
|
|
|
|
TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
|
|
|
|
}
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
/*
|
|
|
|
* if there was an existing set of unlinked but
|
|
|
|
* allocated inodes, preserve it in the new
|
|
|
|
* log.
|
|
|
|
*/
|
|
|
|
if (wr && wr->wr_inodescnt) {
|
2008-11-20 03:17:08 +03:00
|
|
|
error = wapbl_start_flush_inodes(wl, wr);
|
2008-07-31 09:38:04 +04:00
|
|
|
if (error)
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail);
|
|
|
|
if (error) {
|
|
|
|
goto errout;
|
|
|
|
}
|
|
|
|
|
|
|
|
*wlp = wl;
|
|
|
|
#if defined(WAPBL_DEBUG)
|
|
|
|
wapbl_debug_wl = wl;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
errout:
|
|
|
|
wapbl_discard(wl);
|
2009-01-31 12:33:36 +03:00
|
|
|
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
|
|
|
|
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
|
2017-04-11 00:34:37 +03:00
|
|
|
while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
|
|
|
|
struct buf *bp;
|
|
|
|
|
|
|
|
bp = TAILQ_FIRST(&wl->wl_iobufs);
|
|
|
|
TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
|
|
|
|
brelse(bp, BC_INVAL);
|
|
|
|
}
|
2008-07-31 09:38:04 +04:00
|
|
|
wapbl_inodetrk_free(wl);
|
2009-01-31 12:33:36 +03:00
|
|
|
wapbl_free(wl, sizeof(*wl));
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Like wapbl_flush, only discards the transaction
|
|
|
|
* completely
|
|
|
|
*/
|
|
|
|
|
|
|
|
void
wapbl_discard(struct wapbl *wl)
{
	struct wapbl_entry *we;
	struct wapbl_dealloc *wd;
	struct buf *bp;
	int i;

	/*
	 * XXX we may consider using upgrade here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	/* Let the file system see the deallocations before we drop them. */
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

#ifdef WAPBL_DEBUG_PRINT
	{
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding "
		    "transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
		    wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else /* !WAPBL_DEBUG_BUFBYTES */
		WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
		    ("wapbl_discard: thread %d.%d discarding transaction\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %u, reclaimable=%zu reserved=%zu\n",
		    pid, lid, wl->wl_bufcount, wl->wl_bufbytes,
		    wl->wl_bcount, wl->wl_dealloccnt,
		    wl->wl_inohashcnt, wl->wl_error_count,
		    wl->wl_reclaimable_bytes, wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_DISCARD,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error));
		}
#endif /* !WAPBL_DEBUG_BUFBYTES */
	}
#endif /* WAPBL_DEBUG_PRINT */

	/* Drop every tracked unlinked inode from the hash table. */
	for (i = 0; i <= wl->wl_inohashmask; i++) {
		struct wapbl_ino_head *wih;
		struct wapbl_ino *wi;

		wih = &wl->wl_inohash[i];
		while ((wi = LIST_FIRST(wih)) != NULL) {
			LIST_REMOVE(wi, wi_hash);
			pool_put(&wapbl_ino_pool, wi);
			KASSERT(wl->wl_inohashcnt > 0);
			wl->wl_inohashcnt--;
		}
	}

	/*
	 * clean buffer list
	 *
	 * Lock order: bufcache_lock before wl_mtx, matching the rest
	 * of the buffer cache code.
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);
	while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) {
			KASSERT(bp->b_flags & B_LOCKED);
			KASSERT(bp->b_oflags & BO_DELWRI);
			/*
			 * Buffer is already on BQ_LOCKED queue.
			 * The buffer will be unlocked and
			 * removed from the transaction in brelsel()
			 */
			mutex_exit(&wl->wl_mtx);
			bremfree(bp);
			brelsel(bp, BC_INVAL);
			mutex_enter(&wl->wl_mtx);
		}
	}

	/*
	 * Remove references to this wl from wl_entries, free any which
	 * no longer have buffers, others will be freed in wapbl_biodone()
	 * when they no longer have any buffers.
	 */
	while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) {
		SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
		/* XXX should we be accumulating wl_error_count
		 * and increasing reclaimable bytes ? */
		we->we_wapbl = NULL;
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}
	}

	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

	/* Discard list of deallocs */
	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL)
		wapbl_deallocation_free(wl, wd, true);

	/* XXX should we clear wl_reserved_bytes? */

	/* Everything tracked by this transaction must be gone now. */
	KASSERT(wl->wl_bufbytes == 0);
	KASSERT(wl->wl_bcount == 0);
	KASSERT(wl->wl_bufcount == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
	KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
	KASSERT(wl->wl_inohashcnt == 0);
	KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
	KASSERT(wl->wl_dealloccnt == 0);

	rw_exit(&wl->wl_rwlock);
}
|
|
|
|
|
|
|
|
int
|
|
|
|
wapbl_stop(struct wapbl *wl, int force)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n"));
|
|
|
|
error = wapbl_flush(wl, 1);
|
|
|
|
if (error) {
|
|
|
|
if (force)
|
|
|
|
wapbl_discard(wl);
|
|
|
|
else
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Unlinked inodes persist after a flush */
|
|
|
|
if (wl->wl_inohashcnt) {
|
|
|
|
if (force) {
|
|
|
|
wapbl_discard(wl);
|
|
|
|
} else {
|
|
|
|
return EBUSY;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
KASSERT(wl->wl_bufbytes == 0);
|
|
|
|
KASSERT(wl->wl_bcount == 0);
|
|
|
|
KASSERT(wl->wl_bufcount == 0);
|
2017-04-10 22:52:38 +03:00
|
|
|
KASSERT(TAILQ_EMPTY(&wl->wl_bufs));
|
2008-07-31 09:38:04 +04:00
|
|
|
KASSERT(wl->wl_dealloccnt == 0);
|
|
|
|
KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries));
|
|
|
|
KASSERT(wl->wl_inohashcnt == 0);
|
2016-11-10 23:56:32 +03:00
|
|
|
KASSERT(TAILQ_EMPTY(&wl->wl_dealloclist));
|
2016-10-01 16:15:45 +03:00
|
|
|
KASSERT(wl->wl_dealloccnt == 0);
|
2017-04-11 00:34:37 +03:00
|
|
|
KASSERT(TAILQ_EMPTY(&wl->wl_iobufs_busy));
|
2008-07-31 09:38:04 +04:00
|
|
|
|
2009-01-31 12:33:36 +03:00
|
|
|
wapbl_free(wl->wl_wc_scratch, wl->wl_wc_header->wc_len);
|
|
|
|
wapbl_free(wl->wl_wc_header, wl->wl_wc_header->wc_len);
|
2017-04-11 00:34:37 +03:00
|
|
|
while (!TAILQ_EMPTY(&wl->wl_iobufs)) {
|
|
|
|
struct buf *bp;
|
|
|
|
|
|
|
|
bp = TAILQ_FIRST(&wl->wl_iobufs);
|
|
|
|
TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);
|
|
|
|
brelse(bp, BC_INVAL);
|
|
|
|
}
|
2008-07-31 09:38:04 +04:00
|
|
|
wapbl_inodetrk_free(wl);
|
|
|
|
|
2017-03-05 16:57:29 +03:00
|
|
|
wapbl_evcnt_free(wl);
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
cv_destroy(&wl->wl_reclaimable_cv);
|
|
|
|
mutex_destroy(&wl->wl_mtx);
|
|
|
|
rw_destroy(&wl->wl_rwlock);
|
2009-01-31 12:33:36 +03:00
|
|
|
wapbl_free(wl, sizeof(*wl));
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/****************************************************************/
|
|
|
|
/*
|
|
|
|
* Unbuffered disk I/O
|
|
|
|
*/
|
|
|
|
|
2017-04-11 00:34:37 +03:00
|
|
|
static void
|
|
|
|
wapbl_doio_accounting(struct vnode *devvp, int flags)
|
2008-07-31 09:38:04 +04:00
|
|
|
{
|
|
|
|
struct pstats *pstats = curlwp->l_proc->p_stats;
|
|
|
|
|
|
|
|
if ((flags & (B_WRITE | B_READ)) == B_WRITE) {
|
2011-06-12 07:35:36 +04:00
|
|
|
mutex_enter(devvp->v_interlock);
|
2008-07-31 09:38:04 +04:00
|
|
|
devvp->v_numoutput++;
|
2011-06-12 07:35:36 +04:00
|
|
|
mutex_exit(devvp->v_interlock);
|
2008-07-31 09:38:04 +04:00
|
|
|
pstats->p_ru.ru_oublock++;
|
|
|
|
} else {
|
|
|
|
pstats->p_ru.ru_inblock++;
|
|
|
|
}
|
|
|
|
|
2017-04-11 00:34:37 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * wapbl_doio(data, len, devvp, pbn, flags)
 *
 *	Issue one synchronous I/O of len bytes between data and
 *	physical block pbn on the block device devvp, in the direction
 *	given by flags (B_WRITE or B_READ).  Returns the biowait()
 *	error, 0 on success.
 */
static int
wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags)
{
	struct buf *bp;
	int error;

	KASSERT(devvp->v_type == VBLK);

	wapbl_doio_accounting(devvp, flags);

	/* Borrow an iobuf and point it at the caller's memory. */
	bp = getiobuf(devvp, true);
	bp->b_flags = flags;
	bp->b_cflags |= BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_dev = devvp->v_rdev;
	bp->b_data = data;
	bp->b_bufsize = bp->b_resid = bp->b_bcount = len;
	bp->b_blkno = pbn;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	WAPBL_PRINTF(WAPBL_PRINT_IO,
	    ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%"PRIx64"\n",
	    BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount,
	    bp->b_blkno, bp->b_dev));

	VOP_STRATEGY(devvp, bp);

	/* Synchronous: wait for completion before releasing the buf. */
	error = biowait(bp);
	putiobuf(bp);

	if (error) {
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_doio: %s %zu bytes at block %" PRId64
		    " on dev 0x%"PRIx64" failed with error %d\n",
		    (((flags & (B_WRITE | B_READ)) == B_WRITE) ?
		     "write" : "read"),
		    len, pbn, devvp->v_rdev, error));
	}

	return error;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_write(data, len, devvp, pbn)
|
|
|
|
*
|
|
|
|
* Synchronously write len bytes from data to physical block pbn
|
|
|
|
* on devvp.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
int
|
|
|
|
wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
|
|
|
|
{
|
|
|
|
|
|
|
|
return wapbl_doio(data, len, devvp, pbn, B_WRITE);
|
|
|
|
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_read(data, len, devvp, pbn)
|
|
|
|
*
|
|
|
|
* Synchronously read len bytes into data from physical block pbn
|
|
|
|
* on devvp.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
int
|
|
|
|
wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn)
|
|
|
|
{
|
|
|
|
|
|
|
|
return wapbl_doio(data, len, devvp, pbn, B_READ);
|
|
|
|
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/****************************************************************/
|
|
|
|
/*
|
|
|
|
* Buffered disk writes -- try to coalesce writes and emit
|
|
|
|
* MAXPHYS-aligned blocks.
|
|
|
|
*/
|
|
|
|
|
2017-04-11 00:34:37 +03:00
|
|
|
/*
 * wapbl_buffered_write_async(wl, bp)
 *
 *	Send buffer for asynchronous write.  bp must be the first
 *	buffer on wl->wl_iobufs; it is moved to wl->wl_iobufs_busy and
 *	remains there until wapbl_buffered_flush() reaps it.
 */
static void
wapbl_buffered_write_async(struct wapbl *wl, struct buf *bp)
{
	wapbl_doio_accounting(wl->wl_devvp, bp->b_flags);

	KASSERT(TAILQ_FIRST(&wl->wl_iobufs) == bp);
	TAILQ_REMOVE(&wl->wl_iobufs, bp, b_wapbllist);

	bp->b_flags |= B_WRITE;
	bp->b_cflags |= BC_BUSY;	/* mandatory, asserted by biowait() */
	bp->b_oflags = 0;
	/* b_resid holds the number of bytes accumulated so far. */
	bp->b_bcount = bp->b_resid;
	BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);

	VOP_STRATEGY(wl->wl_devvp, bp);

	wl->wl_ev_journalwrite.ev_count++;

	/* Track the in-flight write so the flush path can biowait() it. */
	TAILQ_INSERT_TAIL(&wl->wl_iobufs_busy, bp, b_wapbllist);
}
|
|
|
|
|
2012-12-08 11:24:42 +04:00
|
|
|
/*
 * wapbl_buffered_flush(wl, full)
 *
 *	Flush any buffered writes from wapbl_buffered_write.
 *
 *	If full is false, reap at least one in-flight buffer --
 *	preferring one that has already completed (BO_DONE) so we
 *	avoid sleeping when possible -- and return.  If full is true,
 *	wait for every in-flight buffer.  Returns the last biowait()
 *	error observed, 0 on success.
 */
static int
wapbl_buffered_flush(struct wapbl *wl, bool full)
{
	int error = 0;
	struct buf *bp, *bnext;
	bool only_done = true, found = false;

	/* if there is outstanding buffered write, send it now */
	if ((bp = TAILQ_FIRST(&wl->wl_iobufs)) && bp->b_resid > 0)
		wapbl_buffered_write_async(wl, bp);

	/* wait for I/O to complete */
again:
	TAILQ_FOREACH_SAFE(bp, &wl->wl_iobufs_busy, b_wapbllist, bnext) {
		if (!full && only_done) {
			/* skip unfinished */
			if (!ISSET(bp->b_oflags, BO_DONE))
				continue;
		}

		/* Count writes that finished before we had to wait. */
		if (ISSET(bp->b_oflags, BO_DONE))
			wl->wl_ev_jbufs_bio_nowait.ev_count++;

		TAILQ_REMOVE(&wl->wl_iobufs_busy, bp, b_wapbllist);
		error = biowait(bp);

		/* reset for reuse */
		bp->b_blkno = bp->b_resid = bp->b_flags = 0;
		TAILQ_INSERT_TAIL(&wl->wl_iobufs, bp, b_wapbllist);
		found = true;

		if (!full)
			break;
	}

	/*
	 * Partial flush found no completed buffer: retry, this time
	 * willing to sleep on an unfinished one.
	 */
	if (!found && only_done && !TAILQ_EMPTY(&wl->wl_iobufs_busy)) {
		only_done = false;
		goto again;
	}

	return error;
}
|
|
|
|
|
|
|
|
/*
 * wapbl_buffered_write(data, len, wl, pbn, bflags)
 *
 *	Write len bytes from data to physical block pbn on
 *	wl->wl_devvp.  The write may not complete until
 *	wapbl_buffered_flush.
 *
 *	Data is coalesced into the current iobuf while the target
 *	blocks stay contiguous; a non-adjacent pbn, or filling the
 *	buffer, triggers an asynchronous submission.  bflags is OR'd
 *	into b_flags when a fresh buffer is started.
 */
static int
wapbl_buffered_write(void *data, size_t len, struct wapbl *wl, daddr_t pbn,
    int bflags)
{
	size_t resid;
	struct buf *bp;

again:
	bp = TAILQ_FIRST(&wl->wl_iobufs);

	if (bp == NULL) {
		/* No more buffers, wait for any previous I/O to finish. */
		wapbl_buffered_flush(wl, false);

		/* The partial flush always returns at least one buffer. */
		bp = TAILQ_FIRST(&wl->wl_iobufs);
		KASSERT(bp != NULL);
	}

	/*
	 * If not adjacent to buffered data flush first. Disk block
	 * address is always valid for non-empty buffer.
	 */
	if ((bp->b_resid > 0 && pbn != bp->b_blkno + btodb(bp->b_resid))) {
		wapbl_buffered_write_async(wl, bp);
		goto again;
	}

	/*
	 * If this write goes to an empty buffer we have to
	 * save the disk block address first.
	 */
	if (bp->b_blkno == 0) {
		bp->b_blkno = pbn;
		bp->b_flags |= bflags;
	}

	/*
	 * Remaining space so this buffer ends on a buffer size boundary.
	 *
	 * Cannot become less or equal zero as the buffer would have been
	 * flushed on the last call then.
	 */
	resid = bp->b_bufsize - dbtob(bp->b_blkno % btodb(bp->b_bufsize)) -
	    bp->b_resid;
	KASSERT(resid > 0);
	KASSERT(dbtob(btodb(resid)) == resid);

	if (len < resid)
		resid = len;

	/* Append this chunk after the data already buffered. */
	memcpy((uint8_t *)bp->b_data + bp->b_resid, data, resid);
	bp->b_resid += resid;

	if (len >= resid) {
		/* Just filled the buf, or data did not fit */
		wapbl_buffered_write_async(wl, bp);

		data = (uint8_t *)data + resid;
		len -= resid;
		pbn += btodb(resid);

		if (len > 0)
			goto again;
	}

	return 0;
}
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
/*
 * wapbl_circ_write(wl, data, len, offp)
 *
 *	Write len bytes from data to the circular queue of wl, starting
 *	at linear byte offset *offp, and returning the new linear byte
 *	offset in *offp.
 *
 *	If the starting linear byte offset precedes wl->wl_circ_off,
 *	the write instead begins at wl->wl_circ_off.  XXX WTF?  This
 *	should be a KASSERT, not a conditional.
 *
 *	The write is buffered in wl and must be flushed with
 *	wapbl_buffered_flush before it will be submitted to the disk.
 */
static int
wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	/* len must be a whole number of log device blocks. */
	KDASSERT(((len >> wl->wl_log_dev_bshift) <<
	    wl->wl_log_dev_bshift) == len);

	if (off < wl->wl_circ_off)
		off = wl->wl_circ_off;
	/* Bytes remaining before the end of the circular region. */
	slen = wl->wl_circ_off + wl->wl_circ_size - off;
	if (slen < len) {
		/* Wraps: write the tail piece up to the end first. */
		pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
		error = wapbl_buffered_write(data, slen, wl, pbn,
		    WAPBL_JDATA_FLAGS(wl));
		if (error)
			return error;
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wl->wl_circ_off;
	}
	pbn = wl->wl_logpbn + (off >> wl->wl_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wl->wl_log_dev_bshift);
#endif
	error = wapbl_buffered_write(data, len, wl, pbn,
	    WAPBL_JDATA_FLAGS(wl));
	if (error)
		return error;
	off += len;
	if (off >= wl->wl_circ_off + wl->wl_circ_size)
		off = wl->wl_circ_off;
	*offp = off;
	return 0;
}
|
|
|
|
|
|
|
|
/****************************************************************/
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* WAPBL transactions: entering, adding/removing bufs, and exiting
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
/*
 * wapbl_begin(wl, file, line)
 *
 *	Enter a transaction on wl, taking wl->wl_rwlock as reader.
 *	If the accumulated log state looks more than half full by any
 *	of several heuristics, flush the log first.  file/line are for
 *	debug printing only.  Returns 0, or the wapbl_flush() error.
 */
int
wapbl_begin(struct wapbl *wl, const char *file, int line)
{
	int doflush;
	unsigned lockcount;

	KDASSERT(wl);

	/*
	 * XXX this needs to be made much more sophisticated.
	 * perhaps each wapbl_begin could reserve a specified
	 * number of buffers and bytes.
	 */
	mutex_enter(&wl->wl_mtx);
	lockcount = wl->wl_lock_count;
	/*
	 * Estimate worst-case growth: each concurrent transaction may
	 * add up to MAXPHYS bytes / 10 buffers before it ends.
	 */
	doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) >
	    wl->wl_bufbytes_max / 2) ||
	    ((wl->wl_bufcount + (lockcount * 10)) >
	    wl->wl_bufcount_max / 2) ||
	    (wapbl_transaction_len(wl) > wl->wl_circ_size / 2) ||
	    (wl->wl_dealloccnt >= (wl->wl_dealloclim / 2));
	mutex_exit(&wl->wl_mtx);

	if (doflush) {
		WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		    ("force flush lockcnt=%d bufbytes=%zu "
		    "(max=%zu) bufcount=%zu (max=%zu) "
		    "dealloccnt %d (lim=%d)\n",
		    lockcount, wl->wl_bufbytes,
		    wl->wl_bufbytes_max, wl->wl_bufcount,
		    wl->wl_bufcount_max,
		    wl->wl_dealloccnt, wl->wl_dealloclim));
	}

	if (doflush) {
		int error = wapbl_flush(wl, 0);
		if (error)
			return error;
	}

	/* Reader lock: many transactions may run concurrently. */
	rw_enter(&wl->wl_rwlock, RW_READER);
	mutex_enter(&wl->wl_mtx);
	wl->wl_lock_count++;
	mutex_exit(&wl->wl_mtx);

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_begin thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu at %s:%d\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount, file, line));
#endif

	return 0;
}
|
|
|
|
|
|
|
|
/*
 * wapbl_end(wl)
 *
 *	Leave a transaction entered with wapbl_begin: drop the
 *	transaction count and release the reader lock on wl_rwlock.
 */
void
wapbl_end(struct wapbl *wl)
{

#if defined(WAPBL_DEBUG_PRINT)
	WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
	    ("wapbl_end thread %d.%d with bufcount=%zu "
	    "bufbytes=%zu bcount=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes, wl->wl_bcount));
#endif

	/*
	 * XXX this could be handled more gracefully, perhaps place
	 * only a partial transaction in the log and allow the
	 * remaining to flush without the protection of the journal.
	 */
	KASSERTMSG((wapbl_transaction_len(wl) <=
	    (wl->wl_circ_size - wl->wl_reserved_bytes)),
	    "wapbl_end: current transaction too big to flush");

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_lock_count > 0);
	wl->wl_lock_count--;
	mutex_exit(&wl->wl_mtx);

	rw_exit(&wl->wl_rwlock);
}
|
|
|
|
|
|
|
|
/*
 * wapbl_add_buf(wl, bp)
 *
 *	Add the busy buffer bp to the current transaction of wl, or
 *	move it to the tail of the transaction queue if it is already
 *	part of one (B_LOCKED set).  The accounting counters are only
 *	updated for buffers that are new to the transaction.  Marks bp
 *	B_LOCKED on return.
 */
void
wapbl_add_buf(struct wapbl *wl, struct buf * bp)
{

	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(bp->b_vp);

	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1702
	 *
	 * XXX2 why require it then? leap of semantics?
	 */
	KASSERT((bp->b_cflags & BC_NOCACHE) == 0);
#endif

	mutex_enter(&wl->wl_mtx);
	if (bp->b_flags & B_LOCKED) {
		/* Already accounted: just requeue at the tail below. */
		TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER2,
		    ("wapbl_add_buf thread %d.%d re-adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	} else {
		/* unlocked by dirty buffers shouldn't exist */
		KASSERT(!(bp->b_oflags & BO_DELWRI));
		wl->wl_bufbytes += bp->b_bufsize;
		wl->wl_bcount += bp->b_bcount;
		wl->wl_bufcount++;
		WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
		    ("wapbl_add_buf thread %d.%d adding buf %p "
		    "with %d bytes %d bcount\n",
		    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize,
		    bp->b_bcount));
	}
	TAILQ_INSERT_TAIL(&wl->wl_bufs, bp, b_wapbllist);
	mutex_exit(&wl->wl_mtx);

	bp->b_flags |= B_LOCKED;
}
|
|
|
|
|
|
|
|
/*
 * wapbl_remove_buf_locked(wl, bp)
 *
 *	Remove the buffer bp from wl's transaction queue and subtract
 *	it from the accounting counters.  Clears B_LOCKED on bp.
 *	Caller must hold wl->wl_mtx.
 */
static void
wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp)
{

	KASSERT(mutex_owned(&wl->wl_mtx));
	KASSERT(bp->b_cflags & BC_BUSY);
	wapbl_jlock_assert(wl);

#if 0
	/*
	 * XXX this might be an issue for swapfiles.
	 * see uvm_swap.c:1725
	 *
	 * XXXdeux: see above
	 */
	KASSERT((bp->b_flags & BC_NOCACHE) == 0);
#endif
	KASSERT(bp->b_flags & B_LOCKED);

	WAPBL_PRINTF(WAPBL_PRINT_BUFFER,
	    ("wapbl_remove_buf thread %d.%d removing buf %p with "
	    "%d bytes %d bcount\n",
	    curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, bp->b_bcount));

	/* Undo the accounting done in wapbl_add_buf. */
	KASSERT(wl->wl_bufbytes >= bp->b_bufsize);
	wl->wl_bufbytes -= bp->b_bufsize;
	KASSERT(wl->wl_bcount >= bp->b_bcount);
	wl->wl_bcount -= bp->b_bcount;
	KASSERT(wl->wl_bufcount > 0);
	wl->wl_bufcount--;
	/* The three counters must reach zero together. */
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
	KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
	TAILQ_REMOVE(&wl->wl_bufs, bp, b_wapbllist);

	bp->b_flags &= ~B_LOCKED;
}
|
|
|
|
|
|
|
|
/* called from brelsel() in vfs_bio among other places */
|
|
|
|
void
|
|
|
|
wapbl_remove_buf(struct wapbl * wl, struct buf *bp)
|
|
|
|
{
|
|
|
|
|
|
|
|
mutex_enter(&wl->wl_mtx);
|
|
|
|
wapbl_remove_buf_locked(wl, bp);
|
|
|
|
mutex_exit(&wl->wl_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
wapbl_resize_buf(struct wapbl *wl, struct buf *bp, long oldsz, long oldcnt)
|
|
|
|
{
|
|
|
|
|
|
|
|
KASSERT(bp->b_cflags & BC_BUSY);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX: why does this depend on B_LOCKED? otherwise the buf
|
|
|
|
* is not for a transaction? if so, why is this called in the
|
|
|
|
* first place?
|
|
|
|
*/
|
|
|
|
if (bp->b_flags & B_LOCKED) {
|
|
|
|
mutex_enter(&wl->wl_mtx);
|
|
|
|
wl->wl_bufbytes += bp->b_bufsize - oldsz;
|
|
|
|
wl->wl_bcount += bp->b_bcount - oldcnt;
|
|
|
|
mutex_exit(&wl->wl_mtx);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* _KERNEL */
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
/* Some utility inlines */
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
 * wapbl_space_used(avail, head, tail)
 *
 *	Number of bytes used in a circular queue of avail total bytes,
 *	from tail to head.  Offset 0 encodes the empty queue (both
 *	head and tail are 0 in that case).
 */
static inline size_t
wapbl_space_used(size_t avail, off_t head, off_t tail)
{
	off_t span;

	if (tail == 0) {
		/* Empty queue: head must be 0 as well. */
		KASSERT(head == 0);
		return 0;
	}
	span = head + (avail - 1) - tail;
	return (span % avail) + 1;
}
|
|
|
|
|
|
|
|
#ifdef _KERNEL
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
 * wapbl_advance(size, off, oldoff, delta)
 *
 *	Given a byte offset oldoff into a circular queue of size bytes
 *	starting at off, return a new byte offset oldoff + delta into
 *	the circular queue.  Offset 0 encodes the empty queue;
 *	advancing out of it starts from the queue base off.
 */
static inline off_t
wapbl_advance(size_t size, size_t off, off_t oldoff, size_t delta)
{
	off_t result;

	/* Define acceptable ranges for inputs. */
	KASSERT(delta <= (size_t)size);
	KASSERT((oldoff == 0) || ((size_t)oldoff >= off));
	KASSERT(oldoff < (off_t)(size + off));

	if (oldoff == 0 && delta != 0) {
		/* Leaving the empty state: count from the queue base. */
		result = off + delta;
	} else if ((oldoff + delta) < (size + off)) {
		result = oldoff + delta;
	} else {
		/* Wrap around the end of the circular region. */
		result = (oldoff + delta) - size;
	}

	/* Note some interesting axioms */
	KASSERT((delta != 0) || (result == oldoff));
	KASSERT((delta == 0) || (result != 0));
	KASSERT((delta != (size)) || (result == oldoff));

	/* Define acceptable ranges for output. */
	KASSERT((result == 0) || ((size_t)result >= off));
	KASSERT((size_t)result < (size + off));
	return result;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
 * wapbl_space_free(avail, head, tail)
 *
 *	Number of bytes free in a circular queue of avail total bytes,
 *	in which everything from tail to head is used.
 */
static inline size_t
wapbl_space_free(size_t avail, off_t head, off_t tail)
{
	const size_t used = wapbl_space_used(avail, head, tail);

	return avail - used;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
 * wapbl_advance_head(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new
 *	head and tail offsets in *headp and *tailp resulting from
 *	adding delta bytes of data to the head.
 */
static inline void
wapbl_advance_head(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t newhead = *headp;
	off_t newtail = *tailp;

	KASSERT(delta <= wapbl_space_free(size, newhead, newtail));

	newhead = wapbl_advance(size, off, newhead, delta);

	/*
	 * If the queue was empty (tail == 0) and now holds data, the
	 * tail starts at the queue base.
	 */
	if (newtail == 0 && newhead != 0)
		newtail = off;

	*headp = newhead;
	*tailp = newtail;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
 * wapbl_advance_tail(size, off, delta, headp, tailp)
 *
 *	In a circular queue of size bytes starting at off, given the
 *	old head and tail offsets *headp and *tailp, store the new
 *	head and tail offsets in *headp and *tailp resulting from
 *	removing delta bytes of data from the tail.
 */
static inline void
wapbl_advance_tail(size_t size, size_t off, size_t delta, off_t *headp,
    off_t *tailp)
{
	off_t newhead = *headp;
	off_t newtail = *tailp;

	KASSERT(delta <= wapbl_space_used(size, newhead, newtail));

	newtail = wapbl_advance(size, off, newtail, delta);

	/* Tail caught up with head: the queue is now empty. */
	if (newhead == newtail)
		newhead = newtail = 0;

	*headp = newhead;
	*tailp = newtail;
}
|
|
|
|
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
|
|
|
/*
 * wapbl_truncate(wl, minfree)
 *
 *	Wait until at least minfree bytes are available in the log.
 *
 *	If it was necessary to wait for writes to complete,
 *	advance the circular queue tail to reflect the new write
 *	completions and issue a write commit to the log.
 *
 *	=> Caller must hold wl->wl_rwlock writer lock.
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	minfree -= avail;
	/*
	 * Sleep until either enough bytes become reclaimable (signalled
	 * from wapbl_biodone via wl_reclaimable_cv) or the log goes into
	 * a permanent error state.
	 */
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		/* We can only get here on error; report it. */
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
		wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	/* Commit succeeded; publish the new queue pointers. */
	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
|
|
|
/*
 * wapbl_biodone(bp)
 *
 *	I/O completion callback for journalled metadata buffers written
 *	via wapbl_flush.  Decrements the owning transaction entry's
 *	buffer count and, when a transaction fully completes, reclaims
 *	consecutive completed transactions at the head of wl_entries and
 *	wakes waiters on wl_reclaimable_cv.
 */
void
wapbl_biodone(struct buf *bp)
{
	struct wapbl_entry *we = bp->b_private;
	struct wapbl *wl;
#ifdef WAPBL_DEBUG_BUFBYTES
	const int bufsize = bp->b_bufsize;
#endif

	/* we_wapbl is read under bufcache_lock; it may already be NULL. */
	mutex_enter(&bufcache_lock);
	wl = we->we_wapbl;
	mutex_exit(&bufcache_lock);

	/*
	 * Handle possible flushing of buffers after log has been
	 * decomissioned.
	 */
	if (!wl) {
		KASSERT(we->we_bufcount > 0);
		we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
		KASSERT(we->we_unsynced_bufbytes >= bufsize);
		we->we_unsynced_bufbytes -= bufsize;
#endif

		/* Last buffer of an orphaned entry: free the entry. */
		if (we->we_bufcount == 0) {
#ifdef WAPBL_DEBUG_BUFBYTES
			KASSERT(we->we_unsynced_bufbytes == 0);
#endif
			pool_put(&wapbl_entry_pool, we);
		}

		brelse(bp, 0);
		return;
	}

#ifdef ohbother
	KDASSERT(bp->b_oflags & BO_DONE);
	KDASSERT(!(bp->b_oflags & BO_DELWRI));
	KDASSERT(bp->b_flags & B_ASYNC);
	KDASSERT(bp->b_cflags & BC_BUSY);
	KDASSERT(!(bp->b_flags & B_LOCKED));
	KDASSERT(!(bp->b_flags & B_READ));
	KDASSERT(!(bp->b_cflags & BC_INVAL));
	KDASSERT(!(bp->b_cflags & BC_NOCACHE));
#endif

	if (bp->b_error) {
		/*
		 * If an error occurs, it would be nice to leave the buffer
		 * as a delayed write on the LRU queue so that we can retry
		 * it later. But buffercache(9) can't handle dirty buffer
		 * reuse, so just mark the log permanently errored out.
		 */
		mutex_enter(&wl->wl_mtx);
		if (wl->wl_error_count == 0) {
			wl->wl_error_count++;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
		mutex_exit(&wl->wl_mtx);
	}

	/*
	 * Make sure that the buf doesn't retain the media flags, so that
	 * e.g. wapbl_allow_fuadpo has immediate effect on any following I/O.
	 * The flags will be set again if needed by another I/O.
	 */
	bp->b_flags &= ~B_MEDIA_FLAGS;

	/*
	 * Release the buffer here. wapbl_flush() may wait for the
	 * log to become empty and we better unbusy the buffer before
	 * wapbl_flush() returns.
	 */
	brelse(bp, 0);

	mutex_enter(&wl->wl_mtx);

	KASSERT(we->we_bufcount > 0);
	we->we_bufcount--;
#ifdef WAPBL_DEBUG_BUFBYTES
	KASSERT(we->we_unsynced_bufbytes >= bufsize);
	we->we_unsynced_bufbytes -= bufsize;
	KASSERT(wl->wl_unsynced_bufbytes >= bufsize);
	wl->wl_unsynced_bufbytes -= bufsize;
#endif
	wl->wl_ev_metawrite.ev_count++;

	/*
	 * If the current transaction can be reclaimed, start
	 * at the beginning and reclaim any consecutive reclaimable
	 * transactions.  If we successfully reclaim anything,
	 * then wakeup anyone waiting for the reclaim.
	 */
	if (we->we_bufcount == 0) {
		size_t delta = 0;
		int errcnt = 0;
#ifdef WAPBL_DEBUG_BUFBYTES
		KDASSERT(we->we_unsynced_bufbytes == 0);
#endif
		/*
		 * clear any posted error, since the buffer it came from
		 * has successfully flushed by now
		 */
		while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) &&
		       (we->we_bufcount == 0)) {
			delta += we->we_reclaimable_bytes;
			if (we->we_error)
				errcnt++;
			SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries);
			pool_put(&wapbl_entry_pool, we);
		}

		if (delta) {
			wl->wl_reclaimable_bytes += delta;
			KASSERT(wl->wl_error_count >= errcnt);
			wl->wl_error_count -= errcnt;
			cv_broadcast(&wl->wl_reclaimable_cv);
		}
	}

	mutex_exit(&wl->wl_mtx);
}
|
|
|
|
|
|
|
|
/*
 * wapbl_flush(wl, wait)
 *
 *	Flush pending block writes, deallocations, and inodes from
 *	the current transaction in memory to the log on disk:
 *
 *	1. Call the file system's wl_flush callback to flush any
 *	   per-file-system pending updates.
 *	2. Wait for enough space in the log for the current transaction.
 *	3. Synchronously write the new log records, advancing the
 *	   circular queue head.
 *	4. Issue the pending block writes asynchronously, now that they
 *	   are recorded in the log and can be replayed after crash.
 *	5. If wait is true, wait for all writes to complete and for the
 *	   log to become empty.
 *
 *	On failure, call the file system's wl_flush_abort callback.
 */
int
wapbl_flush(struct wapbl *wl, int waitfor)
{
	struct buf *bp;
	struct wapbl_entry *we;
	off_t off;
	off_t head;
	off_t tail;
	size_t delta = 0;
	size_t flushsize;
	size_t reserved;
	int error = 0;

	/*
	 * Do a quick check to see if a full flush can be skipped
	 * This assumes that the flush callback does not need to be called
	 * unless there are other outstanding bufs.
	 */
	if (!waitfor) {
		size_t nbufs;
		mutex_enter(&wl->wl_mtx);	/* XXX need mutex here to
						   protect the KASSERTS */
		nbufs = wl->wl_bufcount;
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0));
		KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0));
		mutex_exit(&wl->wl_mtx);
		if (nbufs == 0)
			return 0;
	}

	/*
	 * XXX we may consider using LK_UPGRADE here
	 * if we want to call flush from inside a transaction
	 */
	rw_enter(&wl->wl_rwlock, RW_WRITER);
	wl->wl_flush(wl->wl_mount, TAILQ_FIRST(&wl->wl_dealloclist));

	/*
	 * Now that we are exclusively locked and the file system has
	 * issued any deferred block writes for this transaction, check
	 * whether there are any blocks to write to the log.  If not,
	 * skip waiting for space or writing any log entries.
	 *
	 * XXX Shouldn't this also check wl_dealloccnt and
	 * wl_inohashcnt?  Perhaps wl_dealloccnt doesn't matter if the
	 * file system didn't produce any blocks as a consequence of
	 * it, but the same does not seem to be so of wl_inohashcnt.
	 */
	if (wl->wl_bufcount == 0) {
		goto wait_out;
	}

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d flushing entries with "
	    "bufcount=%zu bufbytes=%zu\n",
	    curproc->p_pid, curlwp->l_lid, wl->wl_bufcount,
	    wl->wl_bufbytes));
#endif

	/* Calculate amount of space needed to flush */
	flushsize = wapbl_transaction_len(wl);
	if (wapbl_verbose_commit) {
		struct timespec ts;
		getnanotime(&ts);
		printf("%s: %lld.%09ld this transaction = %zu bytes\n",
		    __func__, (long long)ts.tv_sec,
		    (long)ts.tv_nsec, flushsize);
	}

	if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) {
		/*
		 * XXX this could be handled more gracefully, perhaps place
		 * only a partial transaction in the log and allow the
		 * remaining to flush without the protection of the journal.
		 */
		panic("wapbl_flush: current transaction too big to flush");
	}

	/* Make sure the circular queue has room for this transaction. */
	error = wapbl_truncate(wl, flushsize);
	if (error)
		goto out;

	off = wl->wl_head;
	KASSERT((off == 0) || (off >= wl->wl_circ_off));
	KASSERT((off == 0) || (off < wl->wl_circ_off + wl->wl_circ_size));
	/* Write the log records: blocks, revocations, then inodes. */
	error = wapbl_write_blocks(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_revocations(wl, &off);
	if (error)
		goto out;
	error = wapbl_write_inodes(wl, &off);
	if (error)
		goto out;

	reserved = 0;
	if (wl->wl_inohashcnt)
		reserved = wapbl_transaction_inodes_len(wl);

	head = wl->wl_head;
	tail = wl->wl_tail;

	wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize,
	    &head, &tail);

	/* The computed head must agree with where the writes ended up. */
	KASSERTMSG(head == off,
	    "lost head! head=%"PRIdMAX" tail=%" PRIdMAX
	    " off=%"PRIdMAX" flush=%zu",
	    (intmax_t)head, (intmax_t)tail, (intmax_t)off,
	    flushsize);

	/* Opportunistically move the tail forward if we can */
	mutex_enter(&wl->wl_mtx);
	delta = wl->wl_reclaimable_bytes;
	mutex_exit(&wl->wl_mtx);
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta,
	    &head, &tail);

	error = wapbl_write_commit(wl, head, tail);
	if (error)
		goto out;

	we = pool_get(&wapbl_entry_pool, PR_WAITOK);

#ifdef WAPBL_DEBUG_BUFBYTES
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 " unsynced=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_unsynced_bufbytes, wl->wl_bufcount,
		 wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt,
		 wl->wl_inohashcnt));
#else
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
		("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu"
		 "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d "
		 "inodes=%d\n",
		 curproc->p_pid, curlwp->l_lid, flushsize, delta,
		 wapbl_space_used(wl->wl_circ_size, head, tail),
		 wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
		 wl->wl_dealloccnt, wl->wl_inohashcnt));
#endif

	/*
	 * Commit is on disk; publish the new queue state and hand the
	 * outstanding buffers over to the new transaction entry.
	 * Lock order: bufcache_lock before wl_mtx.
	 */
	mutex_enter(&bufcache_lock);
	mutex_enter(&wl->wl_mtx);

	wl->wl_reserved_bytes = reserved;
	wl->wl_head = head;
	wl->wl_tail = tail;
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	KDASSERT(wl->wl_dealloccnt == 0);
#ifdef WAPBL_DEBUG_BUFBYTES
	wl->wl_unsynced_bufbytes += wl->wl_bufbytes;
#endif

	we->we_wapbl = wl;
	we->we_bufcount = wl->wl_bufcount;
#ifdef WAPBL_DEBUG_BUFBYTES
	we->we_unsynced_bufbytes = wl->wl_bufbytes;
#endif
	we->we_reclaimable_bytes = flushsize;
	we->we_error = 0;
	SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries);

	/*
	 * This flushes bufs in order than they were queued, so the LRU
	 * order is preserved.
	 */
	while ((bp = TAILQ_FIRST(&wl->wl_bufs)) != NULL) {
		if (bbusy(bp, 0, 0, &wl->wl_mtx)) {
			/* bbusy dropped wl_mtx while sleeping; retry. */
			continue;
		}
		bp->b_iodone = wapbl_biodone;
		bp->b_private = we;

		bremfree(bp);
		wapbl_remove_buf_locked(wl, bp);
		/* Drop both locks around the actual I/O submission. */
		mutex_exit(&wl->wl_mtx);
		mutex_exit(&bufcache_lock);
		bawrite(bp);
		mutex_enter(&bufcache_lock);
		mutex_enter(&wl->wl_mtx);
	}
	mutex_exit(&wl->wl_mtx);
	mutex_exit(&bufcache_lock);

#if 0
	WAPBL_PRINTF(WAPBL_PRINT_FLUSH,
	    ("wapbl_flush thread %d.%d done flushing entries...\n",
	    curproc->p_pid, curlwp->l_lid));
#endif

 wait_out:

	/*
	 * If the waitfor flag is set, don't return until everything is
	 * fully flushed and the on disk log is empty.
	 */
	if (waitfor) {
		error = wapbl_truncate(wl, wl->wl_circ_size -
		    wl->wl_reserved_bytes);
	}

 out:
	if (error) {
		wl->wl_flush_abort(wl->wl_mount,
		    TAILQ_FIRST(&wl->wl_dealloclist));
	}

#ifdef WAPBL_DEBUG_PRINT
	if (error) {
		pid_t pid = -1;
		lwpid_t lid = -1;
		if (curproc)
			pid = curproc->p_pid;
		if (curlwp)
			lid = curlwp->l_lid;
		mutex_enter(&wl->wl_mtx);
#ifdef WAPBL_DEBUG_BUFBYTES
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu "
		    "unsynced=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d, unsynced = %zu\n",
			    we->we_bufcount, we->we_reclaimable_bytes,
			    we->we_error, we->we_unsynced_bufbytes));
		}
#else
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_flush: thread %d.%d aborted flush: "
		    "error = %d\n"
		    "\tbufcount=%zu bufbytes=%zu bcount=%zu "
		    "deallocs=%d inodes=%d\n"
		    "\terrcnt = %d, reclaimable=%zu reserved=%zu\n",
		    pid, lid, error, wl->wl_bufcount,
		    wl->wl_bufbytes, wl->wl_bcount,
		    wl->wl_dealloccnt, wl->wl_inohashcnt,
		    wl->wl_error_count, wl->wl_reclaimable_bytes,
		    wl->wl_reserved_bytes));
		SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
			WAPBL_PRINTF(WAPBL_PRINT_ERROR,
			    ("\tentry: bufcount = %zu, reclaimable = %zu, "
			    "error = %d\n", we->we_bufcount,
			    we->we_reclaimable_bytes, we->we_error));
		}
#endif
		mutex_exit(&wl->wl_mtx);
	}
#endif

	rw_exit(&wl->wl_rwlock);
	return error;
}
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
|
|
|
/*
 * wapbl_jlock_assert(wl)
 *
 *	Assert that the caller holds the journal lock (wl_rwlock, in
 *	either reader or writer mode).
 */
void
wapbl_jlock_assert(struct wapbl *wl)
{

	KASSERT(rw_lock_held(&wl->wl_rwlock));
}
|
|
|
|
|
|
|
|
/*
 * wapbl_junlock_assert(wl)
 *
 *	Assert that the caller does not hold the journal lock as writer.
 *	Note this does not detect a reader hold.
 */
void
wapbl_junlock_assert(struct wapbl *wl)
{

	KASSERT(!rw_write_held(&wl->wl_rwlock));
}
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
|
|
|
/*
 * wapbl_print(wl, full, pr)
 *
 *	Dump the state of the journal wl via the printf-like callback
 *	pr.  If full is nonzero, additionally list the queued bufs,
 *	pending deallocations, registered inodes, and I/O buffers.
 *	Intended for debugging/ddb use.
 */
/* locks missing */
void
wapbl_print(struct wapbl *wl,
    int full,
    void (*pr)(const char *, ...))
{
	struct buf *bp;
	struct wapbl_entry *we;
	(*pr)("wapbl %p", wl);
	(*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n",
	      wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn);
	(*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n",
	      wl->wl_circ_size, wl->wl_circ_off,
	      (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail);
	(*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n",
	      wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift);
#ifdef WAPBL_DEBUG_BUFBYTES
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d unsynced = %zu\n",
	      wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount,
	      wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count, wl->wl_unsynced_bufbytes);
#else
	(*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu "
	      "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes,
	      wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes,
	      wl->wl_error_count);
#endif
	(*pr)("\tdealloccnt = %d, dealloclim = %d\n",
	      wl->wl_dealloccnt, wl->wl_dealloclim);
	(*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n",
	      wl->wl_inohashcnt, wl->wl_inohashmask);
	(*pr)("entries:\n");
	SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) {
#ifdef WAPBL_DEBUG_BUFBYTES
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, "
		      "unsynced = %zu\n",
		      we->we_bufcount, we->we_reclaimable_bytes,
		      we->we_error, we->we_unsynced_bufbytes);
#else
		(*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n",
		      we->we_bufcount, we->we_reclaimable_bytes, we->we_error);
#endif
	}
	if (full) {
		int cnt = 0;
		(*pr)("bufs =");
		TAILQ_FOREACH(bp, &wl->wl_bufs, b_wapbllist) {
			if (!TAILQ_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("dealloced blks = ");
		{
			struct wapbl_dealloc *wd;
			cnt = 0;
			TAILQ_FOREACH(wd, &wl->wl_dealloclist, wd_entries) {
				(*pr)(" %"PRId64":%d,",
				      wd->wd_blkno,
				      wd->wd_len);
				if ((++cnt % 4) == 0) {
					(*pr)("\n\t");
				}
			}
		}
		(*pr)("\n");

		(*pr)("registered inodes = ");
		{
			int i;
			cnt = 0;
			for (i = 0; i <= wl->wl_inohashmask; i++) {
				struct wapbl_ino_head *wih;
				struct wapbl_ino *wi;

				wih = &wl->wl_inohash[i];
				LIST_FOREACH(wi, wih, wi_hash) {
					/* ino 0 marks an unused slot. */
					if (wi->wi_ino == 0)
						continue;
					(*pr)(" %"PRIu64"/0%06"PRIo32",",
					      wi->wi_ino, wi->wi_mode);
					if ((++cnt % 4) == 0) {
						(*pr)("\n\t");
					}
				}
			}
			(*pr)("\n");
		}

		(*pr)("iobufs free =");
		TAILQ_FOREACH(bp, &wl->wl_iobufs, b_wapbllist) {
			if (!TAILQ_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");

		(*pr)("iobufs busy =");
		TAILQ_FOREACH(bp, &wl->wl_iobufs_busy, b_wapbllist) {
			if (!TAILQ_NEXT(bp, b_wapbllist)) {
				(*pr)(" %p", bp);
			} else if ((++cnt % 6) == 0) {
				(*pr)(" %p,\n\t", bp);
			} else {
				(*pr)(" %p,", bp);
			}
		}
		(*pr)("\n");
	}
}
|
|
|
|
|
|
|
|
#if defined(WAPBL_DEBUG) || defined(DDB)
/*
 * wapbl_dump(wl)
 *
 *	Full debug dump of the journal wl to the console.  With
 *	WAPBL_DEBUG, a NULL wl falls back to the global debug journal.
 */
void
wapbl_dump(struct wapbl *wl)
{
#if defined(WAPBL_DEBUG)
	if (wl == NULL)
		wl = wapbl_debug_wl;
#endif
	if (wl == NULL)
		return;

	wapbl_print(wl, 1, printf);
}
#endif
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
2016-10-28 23:38:12 +03:00
|
|
|
/*
 * wapbl_register_deallocation(wl, blk, len, force, cookiep)
 *
 *	Record a pending block deallocation (blk, len) in the current
 *	transaction.  Returns EAGAIN if the per-transaction limit is
 *	reached and force is false; with force, proceeds over the limit
 *	(logging a warning).  On success optionally stores an opaque
 *	cookie in *cookiep for wapbl_unregister_deallocation.
 *
 *	=> Caller must hold the journal lock (see wapbl_jlock_assert).
 */
int
wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len, bool force,
    void **cookiep)
{
	struct wapbl_dealloc *wd;
	int error = 0;

	wapbl_jlock_assert(wl);

	mutex_enter(&wl->wl_mtx);

	if (__predict_false(wl->wl_dealloccnt >= wl->wl_dealloclim)) {
		if (!force) {
			error = EAGAIN;
			goto out;
		}

		/*
		 * Forced registration can only be used when:
		 * 1) the caller can't cope with failure
		 * 2) the path can be triggered only bounded, small
		 *    times per transaction
		 * If this is not fullfilled, and the path would be triggered
		 * many times, this could overflow maximum transaction size
		 * and panic later.
		 */
		printf("%s: forced dealloc registration over limit: %d >= %d\n",
		    wl->wl_mount->mnt_stat.f_mntonname,
		    wl->wl_dealloccnt, wl->wl_dealloclim);
	}

	/*
	 * Reserve a slot, then drop the mutex: pool_get(PR_WAITOK) may
	 * sleep and must not be called with wl_mtx held.
	 */
	wl->wl_dealloccnt++;
	mutex_exit(&wl->wl_mtx);

	wd = pool_get(&wapbl_dealloc_pool, PR_WAITOK);
	wd->wd_blkno = blk;
	wd->wd_len = len;

	mutex_enter(&wl->wl_mtx);
	TAILQ_INSERT_TAIL(&wl->wl_dealloclist, wd, wd_entries);

	if (cookiep)
		*cookiep = wd;

 out:
	mutex_exit(&wl->wl_mtx);

	WAPBL_PRINTF(WAPBL_PRINT_ALLOC,
	    ("wapbl_register_deallocation: blk=%"PRId64" len=%d error=%d\n",
	    blk, len, error));

	return error;
}
|
|
|
|
|
2016-11-10 23:56:32 +03:00
|
|
|
static void
|
|
|
|
wapbl_deallocation_free(struct wapbl *wl, struct wapbl_dealloc *wd,
|
|
|
|
bool locked)
|
|
|
|
{
|
|
|
|
KASSERT(!locked
|
|
|
|
|| rw_lock_held(&wl->wl_rwlock) || mutex_owned(&wl->wl_mtx));
|
|
|
|
|
|
|
|
if (!locked)
|
|
|
|
mutex_enter(&wl->wl_mtx);
|
|
|
|
|
|
|
|
TAILQ_REMOVE(&wl->wl_dealloclist, wd, wd_entries);
|
|
|
|
wl->wl_dealloccnt--;
|
|
|
|
|
|
|
|
if (!locked)
|
|
|
|
mutex_exit(&wl->wl_mtx);
|
|
|
|
|
|
|
|
pool_put(&wapbl_dealloc_pool, wd);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * wapbl_unregister_deallocation(wl, cookie)
 *
 *	Cancel a pending deallocation previously registered with
 *	wapbl_register_deallocation; cookie is the value stored via
 *	cookiep there.  Takes wl_mtx internally.
 */
void
wapbl_unregister_deallocation(struct wapbl *wl, void *cookie)
{
	KASSERT(cookie != NULL);
	wapbl_deallocation_free(wl, cookie, false);
}
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
/****************************************************************/
|
|
|
|
|
|
|
|
/*
 * wapbl_inodetrk_init(wl, size)
 *
 *	Initialize wl's inode-tracking hash table with the given number
 *	of buckets.  The first journal to initialize tracking also
 *	creates the global wapbl_ino_pool (reference-counted; torn down
 *	by the last wapbl_inodetrk_free).
 */
static void
wapbl_inodetrk_init(struct wapbl *wl, u_int size)
{

	wl->wl_inohash = hashinit(size, HASH_LIST, true, &wl->wl_inohashmask);
	/* First reference initializes the shared pool. */
	if (atomic_inc_uint_nv(&wapbl_ino_pool_refcount) == 1) {
		pool_init(&wapbl_ino_pool, sizeof(struct wapbl_ino), 0, 0, 0,
		    "wapblinopl", &pool_allocator_nointr, IPL_NONE);
	}
}
|
|
|
|
|
|
|
|
/*
 * wapbl_inodetrk_free(wl)
 *
 *	Tear down wl's inode-tracking hash table and release one
 *	reference on the global wapbl_ino_pool; the last reference out
 *	destroys the pool.
 */
static void
wapbl_inodetrk_free(struct wapbl *wl)
{

	/* XXX this KASSERT needs locking/mutex analysis */
	KASSERT(wl->wl_inohashcnt == 0);
	hashdone(wl->wl_inohash, HASH_LIST, wl->wl_inohashmask);
	/*
	 * Membar pairing for `last one out hit the lights' refcounting:
	 * membar_exit before the atomic decrement ensures our prior uses
	 * of the pool happen before anyone destroys it; membar_enter
	 * after taking the last reference ensures other threads' uses
	 * happen before we destroy it.
	 */
	membar_exit();
	if (atomic_dec_uint_nv(&wapbl_ino_pool_refcount) == 0) {
		membar_enter();
		pool_destroy(&wapbl_ino_pool);
	}
}
|
|
|
|
|
|
|
|
/*
 * wapbl_inodetrk_get(wl, ino)
 *
 *	Look up the tracked-inode record for ino in wl's inode hash.
 *	Returns the record, or NULL (0) if ino is not being tracked.
 *
 *	=> Caller must hold wl->wl_mtx.
 */
static struct wapbl_ino *
wapbl_inodetrk_get(struct wapbl *wl, ino_t ino)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	KASSERT(mutex_owned(&wl->wl_mtx));

	/* Hash on the low bits of the inode number. */
	wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
	LIST_FOREACH(wi, wih, wi_hash) {
		if (ino == wi->wi_ino)
			return wi;
	}
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * wapbl_register_inode(wl, ino, mode)
 *
 *	Record that inode ino (with mode) was allocated, so that it is
 *	logged with the transaction.  If ino is already registered, the
 *	call is a no-op.
 */
void
wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;

	/*
	 * Allocate before taking wl_mtx -- pool_get may sleep, and the
	 * record may turn out to be unneeded (freed below).
	 */
	wi = pool_get(&wapbl_ino_pool, PR_WAITOK);

	mutex_enter(&wl->wl_mtx);
	if (wapbl_inodetrk_get(wl, ino) == NULL) {
		wi->wi_ino = ino;
		wi->wi_mode = mode;
		wih = &wl->wl_inohash[ino & wl->wl_inohashmask];
		LIST_INSERT_HEAD(wih, wi, wi_hash);
		wl->wl_inohashcnt++;
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_register_inode: ino=%"PRId64"\n", ino));
		mutex_exit(&wl->wl_mtx);
	} else {
		/* Already tracked; discard the preallocated record. */
		mutex_exit(&wl->wl_mtx);
		pool_put(&wapbl_ino_pool, wi);
	}
}
|
|
|
|
|
|
|
|
/*
 * wapbl_unregister_inode(wl, ino, mode)
 *
 *	Undo a wapbl_register_inode: remove ino from wl's tracked-inode
 *	hash if present.  No-op if ino is not registered.
 */
void
wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode)
{
	struct wapbl_ino *wi;

	mutex_enter(&wl->wl_mtx);
	wi = wapbl_inodetrk_get(wl, ino);
	if (wi) {
		WAPBL_PRINTF(WAPBL_PRINT_INODE,
		    ("wapbl_unregister_inode: ino=%"PRId64"\n", ino));
		KASSERT(wl->wl_inohashcnt > 0);
		wl->wl_inohashcnt--;
		LIST_REMOVE(wi, wi_hash);
		mutex_exit(&wl->wl_mtx);

		/* Free outside the lock; only this thread sees wi now. */
		pool_put(&wapbl_ino_pool, wi);
	} else {
		mutex_exit(&wl->wl_mtx);
	}
}
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_transaction_inodes_len(wl)
|
|
|
|
*
|
|
|
|
* Calculate the number of bytes required for inode registration
|
|
|
|
* log records in wl.
|
|
|
|
*/
|
2010-02-06 15:10:59 +03:00
|
|
|
static inline size_t
wapbl_transaction_inodes_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	int iph;

	/*
	 * Calculate number of inodes described in a inodelist header:
	 * how many wc_inodes[] entries fit in one log block after the
	 * fixed header fields.
	 */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	KASSERT(iph > 0);

	/* At least one header block is always written (wc_clear record). */
	return MAX(1, howmany(wl->wl_inohashcnt, iph)) * blocklen;
}
|
|
|
|
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_transaction_len(wl)
|
|
|
|
*
|
|
|
|
* Calculate number of bytes required for all log records in wl.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
static size_t
wapbl_transaction_len(struct wapbl *wl)
{
	int blocklen = 1<<wl->wl_log_dev_bshift;
	size_t len;

	/*
	 * Sum of: raw buffer data, blocklist headers (one per
	 * wl_brperjblock buffers), revocation headers (one per
	 * wl_brperjblock deallocations), and inode registration records.
	 */
	len = wl->wl_bcount;
	len += howmany(wl->wl_bufcount, wl->wl_brperjblock) * blocklen;
	len += howmany(wl->wl_dealloccnt, wl->wl_brperjblock) * blocklen;
	len += wapbl_transaction_inodes_len(wl);

	return len;
}
|
|
|
|
|
2011-12-02 16:38:59 +04:00
|
|
|
/*
|
2016-05-07 23:16:38 +03:00
|
|
|
* wapbl_cache_sync(wl, msg)
|
|
|
|
*
|
|
|
|
* Issue DIOCCACHESYNC to wl->wl_devvp.
|
|
|
|
*
|
|
|
|
* If sysctl(vfs.wapbl.verbose_commit) >= 2, print a message
|
|
|
|
* including msg about the duration of the cache sync.
|
2011-12-02 16:38:59 +04:00
|
|
|
*/
|
|
|
|
static int
wapbl_cache_sync(struct wapbl *wl, const char *msg)
{
	const bool verbose = wapbl_verbose_commit >= 2;
	struct bintime start_time;
	int force = 1;
	int error;

	/* Skip full cache sync if disabled */
	if (!wapbl_flush_disk_cache) {
		return 0;
	}
	if (verbose) {
		bintime(&start_time);
	}
	error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force,
	    FWRITE, FSCRED);
	if (error) {
		/* Log but still fall through to the timing/stats code. */
		WAPBL_PRINTF(WAPBL_PRINT_ERROR,
		    ("wapbl_cache_sync: DIOCCACHESYNC on dev 0x%jx "
		    "returned %d\n", (uintmax_t)wl->wl_devvp->v_rdev, error));
	}
	if (verbose) {
		struct bintime d;
		struct timespec ts;

		/* Report how long the device took to sync its cache. */
		bintime(&d);
		bintime_sub(&d, &start_time);
		bintime2timespec(&d, &ts);
		printf("wapbl_cache_sync: %s: dev 0x%jx %ju.%09lu\n",
		    msg, (uintmax_t)wl->wl_devvp->v_rdev,
		    (uintmax_t)ts.tv_sec, ts.tv_nsec);
	}

	wl->wl_ev_cacheflush.ev_count++;

	return error;
}
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
/*
|
2016-05-07 23:16:38 +03:00
|
|
|
* wapbl_write_commit(wl, head, tail)
|
|
|
|
*
|
|
|
|
* Issue a disk cache sync to wait for all pending writes to the
|
|
|
|
* log to complete, and then synchronously commit the current
|
|
|
|
* circular queue head and tail to the log, in the next of two
|
|
|
|
* locations for commit headers on disk.
|
2008-07-31 09:38:04 +04:00
|
|
|
*
|
2016-05-07 23:16:38 +03:00
|
|
|
* Increment the generation number. If the generation number
|
|
|
|
* rolls over to zero, then a subsequent commit would appear to
|
|
|
|
* have an older generation than this one -- in that case, issue a
|
|
|
|
* duplicate commit to avoid this.
|
|
|
|
*
|
|
|
|
* => Caller must have exclusive access to wl, either by holding
|
|
|
|
* wl->wl_rwlock for writer or by being wapbl_start before anyone
|
|
|
|
* else has seen wl.
|
2008-07-31 09:38:04 +04:00
|
|
|
*/
|
|
|
|
static int
wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail)
{
	struct wapbl_wc_header *wc = wl->wl_wc_header;
	struct timespec ts;
	int error;
	daddr_t pbn;

	/* Push any buffered journal writes out to the device first. */
	error = wapbl_buffered_flush(wl, true);
	if (error)
		return error;
	/*
	 * Flush disk cache to ensure that blocks we've written are actually
	 * written to the stable storage before the commit header.
	 * This flushes to disk not only journal blocks, but also all
	 * metadata blocks, written asynchronously since previous commit.
	 *
	 * XXX Calc checksum here, instead we do this for now
	 */
	wapbl_cache_sync(wl, "1");

	/* Fill in the commit header for this transaction. */
	wc->wc_head = head;
	wc->wc_tail = tail;
	wc->wc_checksum = 0;
	wc->wc_version = 1;
	getnanotime(&ts);
	wc->wc_time = ts.tv_sec;
	wc->wc_timensec = ts.tv_nsec;

	WAPBL_PRINTF(WAPBL_PRINT_WRITE,
	    ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n",
	    (intmax_t)head, (intmax_t)tail));

	/*
	 * write the commit header.
	 *
	 * XXX if generation will rollover, then first zero
	 * over second commit header before trying to write both headers.
	 */

	/* Alternate between the two on-disk commit header slots. */
	pbn = wl->wl_logpbn + (wc->wc_generation % 2);
#ifdef _KERNEL
	pbn = btodb(pbn << wc->wc_log_dev_bshift);
#endif
	error = wapbl_buffered_write(wc, wc->wc_len, wl, pbn, WAPBL_JFLAGS(wl));
	if (error)
		return error;
	error = wapbl_buffered_flush(wl, true);
	if (error)
		return error;

	/*
	 * Flush disk cache to ensure that the commit header is actually
	 * written before meta data blocks. Commit block is written using
	 * FUA when enabled, in that case this flush is not needed.
	 */
	if (!WAPBL_USE_FUA(wl))
		wapbl_cache_sync(wl, "2");

	/*
	 * If the generation number was zero, write it out a second time.
	 * This handles initialization and generation number rollover
	 */
	if (wc->wc_generation++ == 0) {
		error = wapbl_write_commit(wl, head, tail);
		/*
		 * This panic should be able to be removed if we do the
		 * zero'ing mentioned above, and we are certain to roll
		 * back generation number on failure.
		 */
		if (error)
			panic("wapbl_write_commit: error writing duplicate "
			    "log header: %d", error);
	}

	wl->wl_ev_commit.ev_count++;

	return 0;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_write_blocks(wl, offp)
|
|
|
|
*
|
|
|
|
* Write all pending physical blocks in the current transaction
|
|
|
|
* from wapbl_add_buf to the log on disk, adding to the circular
|
|
|
|
* queue head at byte offset *offp, and returning the new head's
|
|
|
|
* byte offset in *offp.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
static int
wapbl_write_blocks(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	struct buf *bp;
	off_t off = *offp;
	int error;
	size_t padding;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	bp = TAILQ_FIRST(&wl->wl_bufs);

	while (bp) {
		int cnt;
		struct buf *obp = bp;	/* remember chunk start for pass 2 */

		KASSERT(bp->b_flags & B_LOCKED);

		/*
		 * Pass 1: build a blocklist header describing up to
		 * wl_brperjblock buffers.
		 */
		wc->wc_type = WAPBL_WC_BLOCKS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		wc->wc_unused = 0;
		while (bp && (wc->wc_blkcount < wl->wl_brperjblock)) {
			/*
			 * Make sure all the physical block numbers are up to
			 * date.  If this is not always true on a given
			 * filesystem, then VOP_BMAP must be called.  We
			 * could call VOP_BMAP here, or else in the filesystem
			 * specific flush callback, although neither of those
			 * solutions allow us to take the vnode lock.  If a
			 * filesystem requires that we must take the vnode lock
			 * to call VOP_BMAP, then we can probably do it in
			 * bwrite when the vnode lock should already be held
			 * by the invoking code.
			 */
			KASSERT((bp->b_vp->v_type == VBLK) ||
			    (bp->b_blkno != bp->b_lblkno));
			KASSERT(bp->b_blkno > 0);

			wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount;
			wc->wc_len += bp->b_bcount;
			wc->wc_blkcount++;
			bp = TAILQ_NEXT(bp, b_wapbllist);
		}
		/* Pad the record out to a whole log block. */
		if (wc->wc_len % blocklen != 0) {
			padding = blocklen - wc->wc_len % blocklen;
			wc->wc_len += padding;
		} else {
			padding = 0;
		}

		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_blocks: len = %u (padding %zu) off = %"PRIdMAX"\n",
		    wc->wc_len, padding, (intmax_t)off));

		/* Write the header, then... */
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
		/*
		 * Pass 2: rewind to the chunk start and write the data of
		 * the same buffers the header just described.
		 */
		bp = obp;
		cnt = 0;
		while (bp && (cnt++ < wl->wl_brperjblock)) {
			error = wapbl_circ_write(wl, bp->b_data,
			    bp->b_bcount, &off);
			if (error)
				return error;
			bp = TAILQ_NEXT(bp, b_wapbllist);
		}
		/* Write zero padding so the next record is block-aligned. */
		if (padding) {
			void *zero;

			zero = wapbl_alloc(padding);
			memset(zero, 0, padding);
			error = wapbl_circ_write(wl, zero, padding, &off);
			wapbl_free(zero, padding);
			if (error)
				return error;
		}
	}
	*offp = off;
	return 0;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_write_revocations(wl, offp)
|
|
|
|
*
|
|
|
|
* Write all pending deallocations in the current transaction from
|
|
|
|
* wapbl_register_deallocation to the log on disk, adding to the
|
|
|
|
* circular queue's head at byte offset *offp, and returning the
|
|
|
|
* new head's byte offset in *offp.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
static int
wapbl_write_revocations(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wl->wl_wc_scratch;
	struct wapbl_dealloc *wd, *lwd;
	int blocklen = 1<<wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	KASSERT(rw_write_held(&wl->wl_rwlock));

	if (wl->wl_dealloccnt == 0)
		return 0;

	while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
		/*
		 * Pack up to wl_brperjblock deallocations into one
		 * revocation record.
		 */
		wc->wc_type = WAPBL_WC_REVOCATIONS;
		wc->wc_len = blocklen;
		wc->wc_blkcount = 0;
		wc->wc_unused = 0;
		while (wd && (wc->wc_blkcount < wl->wl_brperjblock)) {
			wc->wc_blocks[wc->wc_blkcount].wc_daddr =
			    wd->wd_blkno;
			wc->wc_blocks[wc->wc_blkcount].wc_dlen =
			    wd->wd_len;
			wc->wc_blkcount++;

			wd = TAILQ_NEXT(wd, wd_entries);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;

		/* free all successfully written deallocs */
		lwd = wd;	/* first entry NOT covered by this record */
		while ((wd = TAILQ_FIRST(&wl->wl_dealloclist)) != NULL) {
			if (wd == lwd)
				break;
			wapbl_deallocation_free(wl, wd, true);
		}
	}
	*offp = off;
	return 0;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_write_inodes(wl, offp)
|
|
|
|
*
|
|
|
|
* Write all pending inode allocations in the current transaction
|
|
|
|
* from wapbl_register_inode to the log on disk, adding to the
|
|
|
|
* circular queue's head at byte offset *offp and returning the
|
|
|
|
* new head's byte offset in *offp.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
static int
wapbl_write_inodes(struct wapbl *wl, off_t *offp)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wl->wl_wc_scratch;
	int i;
	int blocklen = 1 << wl->wl_log_dev_bshift;
	off_t off = *offp;
	int error;

	struct wapbl_ino_head *wih;
	struct wapbl_ino *wi;
	int iph;

	/* Number of inode entries that fit in one inodelist block. */
	iph = (blocklen - offsetof(struct wapbl_wc_inodelist, wc_inodes)) /
	    sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]);

	i = 0;
	wih = &wl->wl_inohash[0];
	wi = 0;
	do {
		wc->wc_type = WAPBL_WC_INODES;
		wc->wc_len = blocklen;
		wc->wc_inocnt = 0;
		/* First record resets the replay-side inode list. */
		wc->wc_clear = (i == 0);
		while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) {
			/* Advance to the next non-empty hash bucket. */
			while (!wi) {
				KASSERT((wih - &wl->wl_inohash[0])
				    <= wl->wl_inohashmask);
				wi = LIST_FIRST(wih++);
			}
			wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino;
			wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode;
			wc->wc_inocnt++;
			i++;
			wi = LIST_NEXT(wi, wi_hash);
		}
		WAPBL_PRINTF(WAPBL_PRINT_WRITE,
		    ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n",
		    wc->wc_len, (intmax_t)off));
		error = wapbl_circ_write(wl, wc, blocklen, &off);
		if (error)
			return error;
	} while (i < wl->wl_inohashcnt);

	*offp = off;
	return 0;
}
|
|
|
|
|
|
|
|
#endif /* _KERNEL */
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
|
|
|
/*
 * Replay-time index entry: records where in the log the most recent
 * copy of a given filesystem block lives.
 */
struct wapbl_blk {
	LIST_ENTRY(wapbl_blk) wb_hash;	/* hash chain linkage */
	daddr_t wb_blk;			/* disk block address (hash key) */
	off_t wb_off; /* Offset of this block in the log */
};
/* Minimum bucket count for the replay block hash. */
#define	WAPBL_BLKPOOL_MIN	83
|
|
|
|
|
|
|
|
/*
 * wapbl_blkhash_init(wr, size)
 *
 *	Allocate and initialize wr's replay block hash with at least
 *	size buckets (minimum WAPBL_BLKPOOL_MIN, rounded up to a power
 *	of two in the userland path).
 */
static void
wapbl_blkhash_init(struct wapbl_replay *wr, u_int size)
{
	if (size < WAPBL_BLKPOOL_MIN)
		size = WAPBL_BLKPOOL_MIN;
	KASSERT(wr->wr_blkhash == 0);
#ifdef _KERNEL
	wr->wr_blkhash = hashinit(size, HASH_LIST, true, &wr->wr_blkhashmask);
#else /* ! _KERNEL */
	/* Manually implement hashinit */
	{
		unsigned long i, hashsize;
		/* Round up to the next power of two for a mask-based hash. */
		for (hashsize = 1; hashsize < size; hashsize <<= 1)
			continue;
		wr->wr_blkhash = wapbl_alloc(hashsize * sizeof(*wr->wr_blkhash));
		for (i = 0; i < hashsize; i++)
			LIST_INIT(&wr->wr_blkhash[i]);
		wr->wr_blkhashmask = hashsize - 1;
	}
#endif /* ! _KERNEL */
}
|
|
|
|
|
|
|
|
/*
 * wapbl_blkhash_free(wr)
 *
 *	Free the replay block hash table.  The table must already be
 *	empty (see wapbl_blkhash_clear).
 */
static void
wapbl_blkhash_free(struct wapbl_replay *wr)
{
	KASSERT(wr->wr_blkhashcnt == 0);
#ifdef _KERNEL
	hashdone(wr->wr_blkhash, HASH_LIST, wr->wr_blkhashmask);
#else /* ! _KERNEL */
	wapbl_free(wr->wr_blkhash,
	    (wr->wr_blkhashmask + 1) * sizeof(*wr->wr_blkhash));
#endif /* ! _KERNEL */
}
|
|
|
|
|
|
|
|
/*
 * wapbl_blkhash_get(wr, blk)
 *
 *	Look up disk block blk in the replay hash.  Returns the entry,
 *	or NULL (0) if the block is not in the log index.
 */
static struct wapbl_blk *
wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
	LIST_FOREACH(wb, wbh, wb_hash) {
		if (blk == wb->wb_blk)
			return wb;
	}
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * wapbl_blkhash_ins(wr, blk, off)
 *
 *	Record that the latest copy of disk block blk lives at log
 *	offset off.  If blk is already indexed, just update its offset
 *	(later journal entries supersede earlier ones).
 */
static void
wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off)
{
	struct wapbl_blk_head *wbh;
	struct wapbl_blk *wb;
	wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wb->wb_blk == blk);
		wb->wb_off = off;
	} else {
		wb = wapbl_alloc(sizeof(*wb));
		wb->wb_blk = blk;
		wb->wb_off = off;
		wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask];
		LIST_INSERT_HEAD(wbh, wb, wb_hash);
		wr->wr_blkhashcnt++;
	}
}
|
|
|
|
|
|
|
|
/*
 * wapbl_blkhash_rem(wr, blk)
 *
 *	Remove disk block blk from the replay index, if present.  Used
 *	when a revocation record invalidates earlier journal copies.
 */
static void
wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk)
{
	struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
	if (wb) {
		KASSERT(wr->wr_blkhashcnt > 0);
		wr->wr_blkhashcnt--;
		LIST_REMOVE(wb, wb_hash);
		wapbl_free(wb, sizeof(*wb));
	}
}
|
|
|
|
|
|
|
|
/*
 * wapbl_blkhash_clear(wr)
 *
 *	Remove and free every entry in the replay block hash, leaving
 *	the (empty) table allocated.
 */
static void
wapbl_blkhash_clear(struct wapbl_replay *wr)
{
	unsigned long i;
	for (i = 0; i <= wr->wr_blkhashmask; i++) {
		struct wapbl_blk *wb;

		while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) {
			KASSERT(wr->wr_blkhashcnt > 0);
			wr->wr_blkhashcnt--;
			LIST_REMOVE(wb, wb_hash);
			wapbl_free(wb, sizeof(*wb));
		}
	}
	KASSERT(wr->wr_blkhashcnt == 0);
}
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_circ_read(wr, data, len, offp)
|
|
|
|
*
|
|
|
|
* Read len bytes into data from the circular queue of wr,
|
|
|
|
* starting at the linear byte offset *offp, and returning the new
|
|
|
|
* linear byte offset in *offp.
|
|
|
|
*
|
|
|
|
* If the starting linear byte offset precedes wr->wr_circ_off,
|
|
|
|
* the read instead begins at wr->wr_circ_off. XXX WTF? This
|
|
|
|
* should be a KASSERT, not a conditional.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
static int
wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;
	int error;
	daddr_t pbn;

	/* len must be a whole number of log device blocks. */
	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	/* Bytes remaining before the circular region wraps. */
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		/* Read the tail segment up to the wrap point... */
		pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
		pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
		error = wapbl_read(data, slen, wr->wr_devvp, pbn);
		if (error)
			return error;
		/* ...then continue from the start of the circular area. */
		data = (uint8_t *)data + slen;
		len -= slen;
		off = wr->wr_circ_off;
	}
	pbn = wr->wr_logpbn + (off >> wr->wr_log_dev_bshift);
#ifdef _KERNEL
	pbn = btodb(pbn << wr->wr_log_dev_bshift);
#endif
	error = wapbl_read(data, len, wr->wr_devvp, pbn);
	if (error)
		return error;
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
	return 0;
}
|
|
|
|
|
2016-05-07 23:16:38 +03:00
|
|
|
/*
|
|
|
|
* wapbl_circ_advance(wr, len, offp)
|
|
|
|
*
|
|
|
|
* Compute the linear byte offset of the circular queue of wr that
|
|
|
|
* is len bytes past *offp, and store it in *offp.
|
|
|
|
*
|
|
|
|
* This is as if wapbl_circ_read, but without actually reading
|
|
|
|
* anything.
|
|
|
|
*
|
|
|
|
* If the starting linear byte offset precedes wr->wr_circ_off, it
|
|
|
|
* is taken to be wr->wr_circ_off instead. XXX WTF? This should
|
|
|
|
* be a KASSERT, not a conditional.
|
|
|
|
*/
|
2008-07-31 09:38:04 +04:00
|
|
|
static void
wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp)
{
	size_t slen;
	off_t off = *offp;

	/* len must be a whole number of log device blocks. */
	KASSERT(((len >> wr->wr_log_dev_bshift) <<
	    wr->wr_log_dev_bshift) == len);

	if (off < wr->wr_circ_off)
		off = wr->wr_circ_off;
	/* Bytes remaining before the circular region wraps. */
	slen = wr->wr_circ_off + wr->wr_circ_size - off;
	if (slen < len) {
		len -= slen;
		off = wr->wr_circ_off;
	}
	off += len;
	if (off >= wr->wr_circ_off + wr->wr_circ_size)
		off = wr->wr_circ_off;
	*offp = off;
}
|
|
|
|
|
|
|
|
/****************************************************************/
|
|
|
|
|
|
|
|
/*
 * wapbl_replay_start(wrp, vp, off, count, blksize)
 *
 *	Open the journal at block off of vp (count blocks of blksize
 *	bytes) for replay: read and validate the commit headers, pick
 *	the newer generation, build the block index for the journalled
 *	region, and return the replay state in *wrp.
 *
 *	Returns 0 on success, EINVAL on bad parameters, EFTYPE if the
 *	on-disk magic is not recognized, or an I/O error.
 */
int
wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp,
	daddr_t off, size_t count, size_t blksize)
{
	struct wapbl_replay *wr;
	int error;
	struct vnode *devvp;
	daddr_t logpbn;
	uint8_t *scratch;
	struct wapbl_wc_header *wch;
	struct wapbl_wc_header *wch2;
	/* Use this until we read the actual log header */
	int log_dev_bshift = ilog2(blksize);
	size_t used;
	daddr_t pbn;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n",
	    vp, off, count, blksize));

	if (off < 0)
		return EINVAL;

	if (blksize < DEV_BSIZE)
		return EINVAL;
	if (blksize % DEV_BSIZE)
		return EINVAL;

#ifdef _KERNEL
#if 0
	/* XXX vp->v_size isn't reliably set for VBLK devices,
	 * especially root.  However, we might still want to verify
	 * that the full load is readable */
	if ((off + count) * blksize > vp->v_size)
		return EINVAL;
#endif
	/* Translate the file offset to the underlying device + block. */
	if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) {
		return error;
	}
#else /* ! _KERNEL */
	devvp = vp;
	logpbn = off;
#endif /* ! _KERNEL */

	scratch = wapbl_alloc(MAXBSIZE);

	pbn = logpbn;
#ifdef _KERNEL
	pbn = btodb(pbn << log_dev_bshift);
#endif
	/* Read both commit header slots in one go. */
	error = wapbl_read(scratch, 2<<log_dev_bshift, devvp, pbn);
	if (error)
		goto errout;

	wch = (struct wapbl_wc_header *)scratch;
	wch2 =
	    (struct wapbl_wc_header *)(scratch + (1<<log_dev_bshift));
	/* XXX verify checksums and magic numbers */
	if (wch->wc_type != WAPBL_WC_HEADER) {
		printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type);
		error = EFTYPE;
		goto errout;
	}

	/* Use whichever header slot carries the newer generation. */
	if (wch2->wc_generation > wch->wc_generation)
		wch = wch2;

	wr = wapbl_calloc(1, sizeof(*wr));

	wr->wr_logvp = vp;
	wr->wr_devvp = devvp;
	wr->wr_logpbn = logpbn;

	wr->wr_scratch = scratch;

	wr->wr_log_dev_bshift = wch->wc_log_dev_bshift;
	wr->wr_fs_dev_bshift = wch->wc_fs_dev_bshift;
	wr->wr_circ_off = wch->wc_circ_off;
	wr->wr_circ_size = wch->wc_circ_size;
	wr->wr_generation = wch->wc_generation;

	used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail);

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY,
	    ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64
	    " len=%"PRId64" used=%zu\n",
	    wch->wc_head, wch->wc_tail, wch->wc_circ_off,
	    wch->wc_circ_size, used));

	/* Size the block index to the amount of journalled data. */
	wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift));

	error = wapbl_replay_process(wr, wch->wc_head, wch->wc_tail);
	if (error) {
		wapbl_replay_stop(wr);
		wapbl_replay_free(wr);
		return error;
	}

	*wrp = wr;
	return 0;

 errout:
	wapbl_free(scratch, MAXBSIZE);
	return error;
}
|
|
|
|
|
|
|
|
/*
 * wapbl_replay_stop(wr)
 *
 *	Release the resources held by an open replay state: scratch
 *	buffer, log vnode reference, and block index.  No-op if wr is
 *	not open.  wr itself is freed by wapbl_replay_free.
 */
void
wapbl_replay_stop(struct wapbl_replay *wr)
{

	if (!wapbl_replay_isopen(wr))
		return;

	WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n"));

	wapbl_free(wr->wr_scratch, MAXBSIZE);
	wr->wr_scratch = NULL;

	wr->wr_logvp = NULL;

	wapbl_blkhash_clear(wr);
	wapbl_blkhash_free(wr);
}
|
|
|
|
|
|
|
|
/*
 * wapbl_replay_free(wr)
 *
 *	Free the replay state itself, including the saved inode list.
 *	wr must already be closed (wapbl_replay_stop).
 */
void
wapbl_replay_free(struct wapbl_replay *wr)
{

	KDASSERT(!wapbl_replay_isopen(wr));

	if (wr->wr_inodes)
		wapbl_free(wr->wr_inodes,
		    wr->wr_inodescnt * sizeof(wr->wr_inodes[0]));
	wapbl_free(wr, sizeof(*wr));
}
|
|
|
|
|
2008-11-10 23:12:13 +03:00
|
|
|
#ifdef _KERNEL
|
2008-07-31 09:38:04 +04:00
|
|
|
/*
 * wapbl_replay_isopen1(wr)
 *
 *	Out-of-line wrapper for the wapbl_replay_isopen test.
 */
int
wapbl_replay_isopen1(struct wapbl_replay *wr)
{

	return wapbl_replay_isopen(wr);
}
|
2008-11-10 23:12:13 +03:00
|
|
|
#endif
|
2008-07-31 09:38:04 +04:00
|
|
|
|
2015-08-09 10:40:59 +03:00
|
|
|
/*
|
|
|
|
* calculate the disk address for the i'th block in the wc_blockblist
|
|
|
|
* offset by j blocks of size blen.
|
|
|
|
*
|
|
|
|
* wc_daddr is always a kernel disk address in DEV_BSIZE units that
|
|
|
|
* was written to the journal.
|
|
|
|
*
|
|
|
|
* The kernel needs that address plus the offset in DEV_BSIZE units.
|
|
|
|
*
|
|
|
|
* Userland needs that address plus the offset in blen units.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
static daddr_t
wapbl_block_daddr(struct wapbl_wc_blocklist *wc, int i, int j, int blen)
{
	daddr_t pbn;

#ifdef _KERNEL
	/* Kernel: base DEV_BSIZE address plus offset in DEV_BSIZE units. */
	pbn = wc->wc_blocks[i].wc_daddr + btodb(j * blen);
#else
	/* Userland: address in blen units plus offset in blen units. */
	pbn = dbtob(wc->wc_blocks[i].wc_daddr) / blen + j;
#endif

	return pbn;
}
|
|
|
|
|
2008-11-18 14:37:37 +03:00
|
|
|
/*
 * wapbl_replay_process_blocks(wr, offp)
 *
 *	Index a WAPBL_WC_BLOCKS record in wr->wr_scratch: for each
 *	described filesystem block, record the log offset of its data
 *	and advance *offp past that data.
 */
static void
wapbl_replay_process_blocks(struct wapbl_replay *wr, off_t *offp)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Enter each physical block into the hashtable independently.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++) {
			wapbl_blkhash_ins(wr, wapbl_block_daddr(wc, i, j, fsblklen),
			    *offp);
			wapbl_circ_advance(wr, fsblklen, offp);
		}
	}
}
|
|
|
|
|
|
|
|
/*
 * wapbl_replay_process_revocations(wr)
 *
 *	Apply a WAPBL_WC_REVOCATIONS record in wr->wr_scratch: drop
 *	every revoked filesystem block from the replay index so stale
 *	journal copies are not written back.
 */
static void
wapbl_replay_process_revocations(struct wapbl_replay *wr)
{
	struct wapbl_wc_blocklist *wc =
	    (struct wapbl_wc_blocklist *)wr->wr_scratch;
	int fsblklen = 1 << wr->wr_fs_dev_bshift;
	int i, j, n;

	for (i = 0; i < wc->wc_blkcount; i++) {
		/*
		 * Remove any blocks found from the hashtable.
		 */
		n = wc->wc_blocks[i].wc_dlen >> wr->wr_fs_dev_bshift;
		for (j = 0; j < n; j++)
			wapbl_blkhash_rem(wr, wapbl_block_daddr(wc, i, j, fsblklen));
	}
}
|
|
|
|
|
|
|
|
/*
 * wapbl_replay_process_inodes: absorb one WAPBL_WC_INODES record into
 * the accumulated replay inode list.
 *
 * oldoff is the journal offset where this record begins, newoff the
 * offset just past it; they are remembered in wr_inodestail /
 * wr_inodeshead so the record's location is not overwritten.
 */
static void
wapbl_replay_process_inodes(struct wapbl_replay *wr, off_t oldoff, off_t newoff)
{
	struct wapbl_wc_inodelist *wc =
	    (struct wapbl_wc_inodelist *)wr->wr_scratch;
	void *new_inodes;
	/*
	 * Byte size of the list as it stands on entry; used for the
	 * frees below.  Note it is captured *before* wr_inodescnt may
	 * be zeroed by the wc_clear branch.
	 */
	const size_t oldsize = wr->wr_inodescnt * sizeof(wr->wr_inodes[0]);

	KASSERT(sizeof(wr->wr_inodes[0]) == sizeof(wc->wc_inodes[0]));

	/*
	 * Keep track of where we found this so location won't be
	 * overwritten.
	 */
	if (wc->wc_clear) {
		/* Record restarts the list: discard everything prior. */
		wr->wr_inodestail = oldoff;
		wr->wr_inodescnt = 0;
		if (wr->wr_inodes != NULL) {
			wapbl_free(wr->wr_inodes, oldsize);
			wr->wr_inodes = NULL;
		}
	}
	wr->wr_inodeshead = newoff;
	if (wc->wc_inocnt == 0)
		return;

	/*
	 * Grow the array (alloc + copy + free; wapbl_alloc has no
	 * realloc) and append the record's wc_inocnt entries.
	 */
	new_inodes = wapbl_alloc((wr->wr_inodescnt + wc->wc_inocnt) *
	    sizeof(wr->wr_inodes[0]));
	if (wr->wr_inodes != NULL) {
		memcpy(new_inodes, wr->wr_inodes, oldsize);
		wapbl_free(wr->wr_inodes, oldsize);
	}
	wr->wr_inodes = new_inodes;
	memcpy(&wr->wr_inodes[wr->wr_inodescnt], wc->wc_inodes,
	    wc->wc_inocnt * sizeof(wr->wr_inodes[0]));
	wr->wr_inodescnt += wc->wc_inocnt;
}
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
/*
 * wapbl_replay_process: walk the journal records from tail to head and
 * build the replay state (block hashtable and inode list).
 *
 * Returns 0 on success or EFTYPE (or an I/O error) on failure, in
 * which case the hashtable is cleared again.
 */
static int
wapbl_replay_process(struct wapbl_replay *wr, off_t head, off_t tail)
{
	off_t off;
	int error;

	int logblklen = 1 << wr->wr_log_dev_bshift;

	wapbl_blkhash_clear(wr);

	off = tail;
	while (off != head) {
		struct wapbl_wc_null *wcn;
		/* Remember where this record's header starts. */
		off_t saveoff = off;
		error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
		if (error)
			goto errout;
		wcn = (struct wapbl_wc_null *)wr->wr_scratch;
		switch (wcn->wc_type) {
		case WAPBL_WC_BLOCKS:
			/* Advances off past the record's data blocks. */
			wapbl_replay_process_blocks(wr, &off);
			break;

		case WAPBL_WC_REVOCATIONS:
			wapbl_replay_process_revocations(wr);
			break;

		case WAPBL_WC_INODES:
			wapbl_replay_process_inodes(wr, saveoff, off);
			break;

		default:
			printf("Unrecognized wapbl type: 0x%08x\n",
			    wcn->wc_type);
			error = EFTYPE;
			goto errout;
		}
		/*
		 * Cross-check: advancing the saved offset by the
		 * record's own length must land exactly where the
		 * per-type processing left off; otherwise the journal
		 * is corrupt.
		 */
		wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
		if (off != saveoff) {
			printf("wapbl_replay: corrupted records\n");
			error = EFTYPE;
			goto errout;
		}
	}
	return 0;

 errout:
	wapbl_blkhash_clear(wr);
	return error;
}
|
|
|
|
|
2008-11-18 22:31:35 +03:00
|
|
|
#if 0
|
2008-07-31 09:38:04 +04:00
|
|
|
int
|
|
|
|
wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp)
|
|
|
|
{
|
|
|
|
off_t off;
|
|
|
|
int mismatchcnt = 0;
|
2008-11-19 01:21:48 +03:00
|
|
|
int logblklen = 1 << wr->wr_log_dev_bshift;
|
|
|
|
int fsblklen = 1 << wr->wr_fs_dev_bshift;
|
2012-01-28 22:02:56 +04:00
|
|
|
void *scratch1 = wapbl_alloc(MAXBSIZE);
|
|
|
|
void *scratch2 = wapbl_alloc(MAXBSIZE);
|
2008-07-31 09:38:04 +04:00
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
KDASSERT(wapbl_replay_isopen(wr));
|
|
|
|
|
|
|
|
off = wch->wc_tail;
|
|
|
|
while (off != wch->wc_head) {
|
|
|
|
struct wapbl_wc_null *wcn;
|
|
|
|
#ifdef DEBUG
|
|
|
|
off_t saveoff = off;
|
|
|
|
#endif
|
|
|
|
error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
wcn = (struct wapbl_wc_null *)wr->wr_scratch;
|
|
|
|
switch (wcn->wc_type) {
|
|
|
|
case WAPBL_WC_BLOCKS:
|
|
|
|
{
|
|
|
|
struct wapbl_wc_blocklist *wc =
|
|
|
|
(struct wapbl_wc_blocklist *)wr->wr_scratch;
|
|
|
|
int i;
|
|
|
|
for (i = 0; i < wc->wc_blkcount; i++) {
|
|
|
|
int foundcnt = 0;
|
|
|
|
int dirtycnt = 0;
|
|
|
|
int j, n;
|
|
|
|
/*
|
|
|
|
* Check each physical block into the
|
|
|
|
* hashtable independently
|
|
|
|
*/
|
|
|
|
n = wc->wc_blocks[i].wc_dlen >>
|
|
|
|
wch->wc_fs_dev_bshift;
|
|
|
|
for (j = 0; j < n; j++) {
|
|
|
|
struct wapbl_blk *wb =
|
|
|
|
wapbl_blkhash_get(wr,
|
2015-08-09 10:40:59 +03:00
|
|
|
wapbl_block_daddr(wc, i, j, fsblklen));
|
2008-07-31 09:38:04 +04:00
|
|
|
if (wb && (wb->wb_off == off)) {
|
|
|
|
foundcnt++;
|
|
|
|
error =
|
|
|
|
wapbl_circ_read(wr,
|
|
|
|
scratch1, fsblklen,
|
|
|
|
&off);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
error =
|
|
|
|
wapbl_read(scratch2,
|
|
|
|
fsblklen, fsdevvp,
|
|
|
|
wb->wb_blk);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
if (memcmp(scratch1,
|
|
|
|
scratch2,
|
|
|
|
fsblklen)) {
|
|
|
|
printf(
|
|
|
|
"wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n",
|
|
|
|
wb->wb_blk, (intmax_t)off);
|
|
|
|
dirtycnt++;
|
|
|
|
mismatchcnt++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
wapbl_circ_advance(wr,
|
|
|
|
fsblklen, &off);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#if 0
|
|
|
|
/*
|
|
|
|
* If all of the blocks in an entry
|
|
|
|
* are clean, then remove all of its
|
|
|
|
* blocks from the hashtable since they
|
|
|
|
* never will need replay.
|
|
|
|
*/
|
|
|
|
if ((foundcnt != 0) &&
|
|
|
|
(dirtycnt == 0)) {
|
|
|
|
off = saveoff;
|
|
|
|
wapbl_circ_advance(wr,
|
|
|
|
logblklen, &off);
|
|
|
|
for (j = 0; j < n; j++) {
|
|
|
|
struct wapbl_blk *wb =
|
|
|
|
wapbl_blkhash_get(wr,
|
2015-08-09 10:40:59 +03:00
|
|
|
wapbl_block_daddr(wc, i, j, fsblklen));
|
2008-07-31 09:38:04 +04:00
|
|
|
if (wb &&
|
|
|
|
(wb->wb_off == off)) {
|
|
|
|
wapbl_blkhash_rem(wr, wb->wb_blk);
|
|
|
|
}
|
|
|
|
wapbl_circ_advance(wr,
|
|
|
|
fsblklen, &off);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case WAPBL_WC_REVOCATIONS:
|
|
|
|
case WAPBL_WC_INODES:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
KASSERT(0);
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
|
|
wapbl_circ_advance(wr, wcn->wc_len, &saveoff);
|
|
|
|
KASSERT(off == saveoff);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
out:
|
2009-01-31 12:33:36 +03:00
|
|
|
wapbl_free(scratch1, MAXBSIZE);
|
|
|
|
wapbl_free(scratch2, MAXBSIZE);
|
2008-07-31 09:38:04 +04:00
|
|
|
if (!error && mismatchcnt)
|
|
|
|
error = EFTYPE;
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
int
|
|
|
|
wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp)
|
|
|
|
{
|
2008-11-18 01:08:09 +03:00
|
|
|
struct wapbl_blk *wb;
|
|
|
|
size_t i;
|
|
|
|
off_t off;
|
|
|
|
void *scratch;
|
2008-07-31 09:38:04 +04:00
|
|
|
int error = 0;
|
2008-11-19 01:21:48 +03:00
|
|
|
int fsblklen = 1 << wr->wr_fs_dev_bshift;
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
KDASSERT(wapbl_replay_isopen(wr));
|
|
|
|
|
2012-01-28 22:02:56 +04:00
|
|
|
scratch = wapbl_alloc(MAXBSIZE);
|
2008-07-31 09:38:04 +04:00
|
|
|
|
2010-09-10 14:14:55 +04:00
|
|
|
for (i = 0; i <= wr->wr_blkhashmask; ++i) {
|
2008-11-18 01:08:09 +03:00
|
|
|
LIST_FOREACH(wb, &wr->wr_blkhash[i], wb_hash) {
|
|
|
|
off = wb->wb_off;
|
|
|
|
error = wapbl_circ_read(wr, scratch, fsblklen, &off);
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
error = wapbl_write(scratch, fsblklen, fsdevvp,
|
|
|
|
wb->wb_blk);
|
|
|
|
if (error)
|
|
|
|
break;
|
2008-07-31 09:38:04 +04:00
|
|
|
}
|
|
|
|
}
|
2008-11-18 01:08:09 +03:00
|
|
|
|
2009-01-31 12:33:36 +03:00
|
|
|
wapbl_free(scratch, MAXBSIZE);
|
2008-07-31 09:38:04 +04:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2008-11-11 11:29:58 +03:00
|
|
|
int
|
|
|
|
wapbl_replay_can_read(struct wapbl_replay *wr, daddr_t blk, long len)
|
|
|
|
{
|
2008-11-19 01:21:48 +03:00
|
|
|
int fsblklen = 1 << wr->wr_fs_dev_bshift;
|
2008-11-11 11:29:58 +03:00
|
|
|
|
|
|
|
KDASSERT(wapbl_replay_isopen(wr));
|
|
|
|
KASSERT((len % fsblklen) == 0);
|
|
|
|
|
|
|
|
while (len != 0) {
|
|
|
|
struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
|
|
|
|
if (wb)
|
|
|
|
return 1;
|
|
|
|
len -= fsblklen;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-07-31 09:38:04 +04:00
|
|
|
int
|
|
|
|
wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len)
|
|
|
|
{
|
2008-11-19 01:21:48 +03:00
|
|
|
int fsblklen = 1 << wr->wr_fs_dev_bshift;
|
2008-07-31 09:38:04 +04:00
|
|
|
|
|
|
|
KDASSERT(wapbl_replay_isopen(wr));
|
|
|
|
|
|
|
|
KASSERT((len % fsblklen) == 0);
|
|
|
|
|
|
|
|
while (len != 0) {
|
|
|
|
struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk);
|
|
|
|
if (wb) {
|
|
|
|
off_t off = wb->wb_off;
|
|
|
|
int error;
|
|
|
|
error = wapbl_circ_read(wr, data, fsblklen, &off);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
data = (uint8_t *)data + fsblklen;
|
|
|
|
len -= fsblklen;
|
|
|
|
blk++;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2010-04-21 20:51:24 +04:00
|
|
|
|
2010-04-21 23:50:57 +04:00
|
|
|
#ifdef _KERNEL

MODULE(MODULE_CLASS_VFS, wapbl, NULL);

/*
 * wapbl_modcmd: module(9) control entry point for the wapbl VFS
 * module; dispatches init and fini requests.
 */
static int
wapbl_modcmd(modcmd_t cmd, void *arg)
{

	if (cmd == MODULE_CMD_INIT) {
		wapbl_init();
		return 0;
	}
	if (cmd == MODULE_CMD_FINI)
		return wapbl_fini();
	return ENOTTY;
}
#endif /* _KERNEL */
|