533 lines
15 KiB
Groff
533 lines
15 KiB
Groff
.\" $NetBSD: wapbl.9,v 1.15 2017/03/18 19:01:01 riastradh Exp $
|
|
.\"
|
|
.\" Copyright (c) 2015 The NetBSD Foundation, Inc.
|
|
.\" All rights reserved.
|
|
.\"
|
|
.\" This code is derived from software contributed to The NetBSD Foundation
|
|
.\" by Taylor R. Campbell.
|
|
.\"
|
|
.\" Redistribution and use in source and binary forms, with or without
|
|
.\" modification, are permitted provided that the following conditions
|
|
.\" are met:
|
|
.\" 1. Redistributions of source code must retain the above copyright
|
|
.\" notice, this list of conditions and the following disclaimer.
|
|
.\" 2. Redistributions in binary form must reproduce the above copyright
|
|
.\" notice, this list of conditions and the following disclaimer in the
|
|
.\" documentation and/or other materials provided with the distribution.
|
|
.\"
|
|
.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
|
|
.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
|
|
.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
.\" POSSIBILITY OF SUCH DAMAGE.
|
|
.\"
|
|
.Dd March 26, 2015
|
|
.Dt WAPBL 9
|
|
.Os
|
|
.Sh NAME
|
|
.Nm WAPBL ,
|
|
.Nm wapbl_start ,
|
|
.Nm wapbl_stop ,
|
|
.Nm wapbl_begin ,
|
|
.Nm wapbl_end ,
|
|
.Nm wapbl_flush ,
|
|
.Nm wapbl_discard ,
|
|
.Nm wapbl_add_buf ,
|
|
.Nm wapbl_remove_buf ,
|
|
.Nm wapbl_resize_buf ,
|
|
.Nm wapbl_register_inode ,
|
|
.Nm wapbl_unregister_inode ,
|
|
.Nm wapbl_register_deallocation ,
|
|
.Nm wapbl_jlock_assert ,
|
|
.Nm wapbl_junlock_assert
|
|
.Nd write-ahead physical block logging for file systems
|
|
.Sh SYNOPSIS
|
|
.In sys/wapbl.h
|
|
.Vt typedef void (*wapbl_flush_fn_t)(struct mount *, daddr_t *, int *, int) ;
|
|
.Ft int
|
|
.Fn wapbl_start "struct wapbl **wlp" "struct mount *mp" "struct vnode *devvp" \
|
|
"daddr_t off" "size_t count" "size_t blksize" \
|
|
"struct wapbl_replay *wr" \
|
|
"wapbl_flush_fn_t flushfn" "wapbl_flush_fn_t flushabortfn"
|
|
.Ft int
|
|
.Fn wapbl_stop "struct wapbl *wl" "int force"
|
|
.Ft int
|
|
.Fn wapbl_begin "struct wapbl *wl" "const char *file" "int line"
|
|
.Ft void
|
|
.Fn wapbl_end "struct wapbl *wl"
|
|
.Ft int
|
|
.Fn wapbl_flush "struct wapbl *wl" "int wait"
|
|
.Ft void
|
|
.Fn wapbl_discard "struct wapbl *wl"
|
|
.Ft void
|
|
.Fn wapbl_add_buf "struct wapbl *wl" "struct buf *bp"
|
|
.Ft void
|
|
.Fn wapbl_remove_buf "struct wapbl *wl" "struct buf *bp"
|
|
.Ft void
|
|
.Fn wapbl_resize_buf "struct wapbl *wl" "struct buf *bp" "long oldsz" \
|
|
"long oldcnt"
|
|
.Ft void
|
|
.Fn wapbl_register_inode "struct wapbl *wl" "ino_t ino" "mode_t mode"
|
|
.Ft void
|
|
.Fn wapbl_unregister_inode "struct wapbl *wl" "ino_t ino" "mode_t mode"
|
|
.Ft void
|
|
.Fn wapbl_register_deallocation "struct wapbl *wl" "daddr_t blk" "int len"
|
|
.Ft void
|
|
.Fn wapbl_jlock_assert "struct wapbl *wl"
|
|
.Ft void
|
|
.Fn wapbl_junlock_assert "struct wapbl *wl"
|
|
.Sh DESCRIPTION
|
|
.Nm ,
|
|
or
|
|
.Em write-ahead physical block logging ,
|
|
is an abstraction for file systems to write physical blocks in the
|
|
.Xr buffercache 9
|
|
to a bounded-size log first before their real destinations on disk.
|
|
The name means:
|
|
.Bl -tag -width "physical block" -offset abcd
|
|
.It logging
|
|
batches of writes are issued atomically via a log
|
|
.It physical block
|
|
only physical blocks, not logical file system operations, are stored in
|
|
the log
|
|
.It write-ahead
|
|
before writing a block to disk, its new content, rather than its old
|
|
content for roll-back, is recorded in the log
|
|
.El
|
|
.Pp
|
|
When a file system using
|
|
.Nm
|
|
issues writes (as in
|
|
.Xr bwrite 9
|
|
or
|
|
.Xr bdwrite 9 ) ,
|
|
they are grouped in batches called
|
|
.Em transactions
|
|
in memory, which are serialized to be consistent with program order
|
|
before
|
|
.Nm
|
|
submits them to disk atomically.
|
|
.Pp
|
|
Thus, within a transaction, after one write, another write need not
|
|
wait for disk I/O, and if the system is interrupted, e.g. by a crash or
|
|
by power failure, either both writes will appear on disk, or neither
|
|
will.
|
|
.Pp
|
|
When a transaction is full, it is written to a circular buffer on
|
|
disk called the
|
|
.Em log .
|
|
When the transaction has been written to disk, every write in the
|
|
transaction is submitted to disk asynchronously.
|
|
Finally, the file system may issue new writes via
|
|
.Nm
|
|
once enough writes submitted to disk have completed.
|
|
.Pp
|
|
After interruption, such as a crash or power failure, some writes
|
|
issued by the file system may not have completed.
|
|
However, the log is written consistently with program order and before
|
|
file system writes are submitted to disk.
|
|
Hence a consistent program-order view of the file system can be
|
|
attained by resubmitting the writes that were successfully stored in
|
|
the log using
|
|
.Xr wapbl_replay 9 .
|
|
This may not be the same as the state just before interruption \(em writes in
|
|
transactions that did not reach the disk will be excluded.
|
|
.Pp
|
|
For a file system to use
|
|
.Nm ,
|
|
its
|
|
.Xr VFS_MOUNT 9
|
|
method should first replay any journal on disk using
|
|
.Xr wapbl_replay 9 ,
|
|
and then, if the mount is read/write, initialize
|
|
.Nm
|
|
for the mount by calling
|
|
.Fn wapbl_start .
|
|
The
|
|
.Xr VFS_UNMOUNT 9
|
|
method should call
|
|
.Fn wapbl_stop .
|
|
.Pp
|
|
Before issuing any
|
|
.Xr buffercache 9
|
|
writes, the file system must acquire a shared lock on the current
|
|
.Nm
|
|
transaction with
|
|
.Fn wapbl_begin ,
|
|
which may sleep until there is room in the transaction for new writes.
|
|
After issuing the writes, the file system must release its shared lock
|
|
on the transaction with
|
|
.Fn wapbl_end .
|
|
Either all writes issued between
|
|
.Fn wapbl_begin
|
|
and
|
|
.Fn wapbl_end
|
|
will complete, or none of them will.
|
|
.Pp
|
|
File systems may also witness an
|
|
.Em exclusive
|
|
lock on the current transaction when
|
|
.Nm
|
|
is flushing the transaction to disk, or aborting a flush, and invokes a
|
|
file system's callback.
|
|
File systems can assert that the transaction is locked with
|
|
.Fn wapbl_jlock_assert ,
|
|
or not
|
|
.Em exclusively
|
|
locked, with
|
|
.Fn wapbl_junlock_assert .
|
|
.Pp
|
|
If a file system requires multiple transactions to initialize an
|
|
inode, and needs to destroy partially initialized inodes during replay,
|
|
it can register them by
|
|
.Vt ino_t
|
|
inode number before initialization with
|
|
.Fn wapbl_register_inode
|
|
and unregister them with
|
|
.Fn wapbl_unregister_inode
|
|
once initialization is complete.
|
|
.Nm
|
|
does not actually concern itself with whether the objects identified by
|
|
.Vt ino_t
|
|
values are
|
|
.Sq inodes
|
|
or
|
|
.Sq quaggas
|
|
or anything else \(em file systems may use this to list any objects
|
|
keyed by
|
|
.Vt ino_t
|
|
value in the log.
|
|
.Pp
|
|
When a file system frees resources on disk and issues writes to reflect
|
|
the fact, it cannot then reuse the resources until the writes have
|
|
reached the disk.
|
|
However, as far as the
|
|
.Xr buffercache 9
|
|
is concerned, as soon as the file system issues the writes, they will
|
|
appear to have been written.
|
|
So the file system must not attempt to reuse the resource until the
|
|
current
|
|
.Nm
|
|
transaction has been flushed to disk.
|
|
.Pp
|
|
The file system can defer freeing a resource by calling
|
|
.Fn wapbl_register_deallocation
|
|
to record the disk address of the resource and length in bytes of the
|
|
resource.
|
|
Then, when
|
|
.Nm
|
|
next flushes the transaction to disk, it will pass an array of the disk
|
|
addresses and lengths in bytes to a file-system-supplied callback.
|
|
(Again,
|
|
.Nm
|
|
does not care whether the
|
|
.Sq disk address
|
|
or
|
|
.Sq length in bytes
|
|
is actually that; it will pass along
|
|
.Vt daddr_t
|
|
and
|
|
.Vt int
|
|
values.)
|
|
.Sh FUNCTIONS
|
|
.Bl -tag -width abcd
|
|
.It Fn wapbl_start wlp mp devvp off count blksize wr flushfn flushabortfn
|
|
Start using
|
|
.Nm
|
|
for the file system mounted at
|
|
.Fa mp ,
|
|
storing a log of
|
|
.Fa count
|
|
disk sectors at disk address
|
|
.Fa off
|
|
on the block device
|
|
.Fa devvp ,
|
|
writing blocks in units of
|
|
.Fa blksize
|
|
bytes.
|
|
On success, stores an opaque
|
|
.Vt "struct wapbl *"
|
|
cookie in
|
|
.Li * Ns Fa wlp
|
|
for use with the other
|
|
.Nm
|
|
routines and returns zero.
|
|
On failure, returns an error number.
|
|
.Pp
|
|
If the file system had replayed the log with
|
|
.Xr wapbl_replay 9 ,
|
|
then
|
|
.Fa wr
|
|
must be the
|
|
.Vt "struct wapbl_replay *"
|
|
cookie used to replay it, and
|
|
.Fn wapbl_start
|
|
will register any inodes that were in the log as if with
|
|
.Fn wapbl_register_inode ;
|
|
otherwise
|
|
.Fa wr
|
|
must be
|
|
.Dv NULL .
|
|
.Pp
|
|
.Fa flushfn
|
|
is a callback that
|
|
.Nm
|
|
will invoke as
|
|
.Fa flushfn ( Fa mp , Fa deallocblks , Fa dealloclens , Fa dealloccnt )
|
|
just before it flushes a transaction to disk, with an exclusive
|
|
lock held on the transaction, where
|
|
.Fa mp
|
|
is the mount point passed to
|
|
.Fn wapbl_start ,
|
|
.Fa deallocblks
|
|
is an array of
|
|
.Fa dealloccnt
|
|
disk addresses, and
|
|
.Fa dealloclens
|
|
is an array of
|
|
.Fa dealloccnt
|
|
lengths, corresponding to the addresses and lengths the file system
|
|
passed to
|
|
.Fn wapbl_register_deallocation .
|
|
If flushing the transaction to disk fails,
|
|
.Nm
|
|
will call
|
|
.Fa flushabortfn
|
|
with the same arguments to undo any effects that
|
|
.Fa flushfn
|
|
had.
|
|
.It Fn wapbl_stop wl force
|
|
Flush the current transaction to disk and stop using
|
|
.Nm .
|
|
If flushing the transaction fails and
|
|
.Fa force
|
|
is zero,
|
|
return an error.
|
|
If flushing the transaction fails and
|
|
.Fa force
|
|
is nonzero, discard the transaction, permanently losing any writes in
|
|
it.
|
|
If flushing the transaction is successful or if
|
|
.Fa force
|
|
is nonzero,
|
|
free memory associated with
|
|
.Fa wl
|
|
and return zero.
|
|
.It Fn wapbl_begin wl file line
|
|
Wait for space in the current transaction for new writes, flushing it
|
|
if necessary, and acquire a shared lock on it.
|
|
.Pp
|
|
The lock is not exclusive: other threads may acquire shared locks on
|
|
the transaction too.
|
|
The lock is not recursive: a thread may not acquire it again without
|
|
calling
|
|
.Fn wapbl_end
|
|
first.
|
|
.Pp
|
|
May sleep.
|
|
.Pp
|
|
.Fa file
|
|
and
|
|
.Fa line
|
|
are the file name and line number of the caller for debugging
|
|
purposes.
|
|
.It Fn wapbl_end wl
|
|
Release a shared lock on the transaction acquired with
|
|
.Fn wapbl_begin .
|
|
.It Fn wapbl_flush wl wait
|
|
Flush the current transaction to disk.
|
|
If
|
|
.Fa wait
|
|
is nonzero, wait for all writes in the current transaction to
|
|
complete.
|
|
.Pp
|
|
The current transaction must not be locked.
|
|
.It Fn wapbl_discard wl
|
|
Discard the current transaction, permanently losing any writes in it.
|
|
.Pp
|
|
The current transaction must not be locked.
|
|
.It Fn wapbl_add_buf wl bp
|
|
Add the buffer
|
|
.Fa bp
|
|
to the current transaction, which must be locked, because someone has
|
|
asked to write it.
|
|
.Pp
|
|
This is meant to be called from within
|
|
.Xr buffercache 9 ,
|
|
not by file systems directly.
|
|
.It Fn wapbl_remove_buf wl bp
|
|
Remove the buffer
|
|
.Fa bp ,
|
|
which must have been added using
|
|
.Fn wapbl_add_buf ,
|
|
from the current transaction, which must be locked, because it has been
|
|
invalidated (or XXX ???).
|
|
.Pp
|
|
This is meant to be called from within
|
|
.Xr buffercache 9 ,
|
|
not by file systems directly.
|
|
.It Fn wapbl_resize_buf wl bp oldsz oldcnt
|
|
Note that the buffer
|
|
.Fa bp ,
|
|
which must have been added using
|
|
.Fn wapbl_add_buf ,
|
|
has changed size, where
|
|
.Fa oldsz
|
|
is the previous allocated size in bytes and
|
|
.Fa oldcnt
|
|
is the previous number of valid bytes in
|
|
.Fa bp .
|
|
.Pp
|
|
This is meant to be called from within
|
|
.Xr buffercache 9 ,
|
|
not by file systems directly.
|
|
.It Fn wapbl_register_inode wl ino mode
|
|
Register
|
|
.Fa ino
|
|
with the mode
|
|
.Fa mode
|
|
as commencing initialization.
|
|
.It Fn wapbl_unregister_inode wl ino mode
|
|
Unregister
|
|
.Fa ino ,
|
|
which must have previously been registered with
|
|
.Fn wapbl_register_inode
|
|
using the same
|
|
.Fa mode ,
|
|
now that its initialization has completed.
|
|
.It Fn wapbl_register_deallocation wl blk len
|
|
Register
|
|
.Fa len
|
|
bytes at the disk address
|
|
.Fa blk
|
|
as ready for deallocation, so that they will be passed to the
|
|
.Fa flushfn
|
|
that was given to
|
|
.Fn wapbl_start .
|
|
.It Fn wapbl_jlock_assert wl
|
|
Assert that the current transaction is locked.
|
|
.Pp
|
|
Note that it might not be locked by the current thread: this assertion
|
|
passes if
|
|
.Em any
|
|
thread has it locked.
|
|
.It Fn wapbl_junlock_assert wl
|
|
Assert that the current transaction is not exclusively locked by the
|
|
current thread.
|
|
.Pp
|
|
Users of
|
|
.Nm
|
|
observe exclusive locks only in the
|
|
.Fa flushfn
|
|
and
|
|
.Fa flushabortfn
|
|
callbacks to
|
|
.Fn wapbl_start .
|
|
Outside of such contexts, the transaction is never exclusively locked,
|
|
even between
|
|
.Fn wapbl_begin
|
|
and
|
|
.Fn wapbl_end .
|
|
.Pp
|
|
There is no way to assert that the current transaction is not locked at
|
|
all \(em i.e., that the caller may acquire a shared lock on the
|
|
transaction with
|
|
.Fn wapbl_begin
|
|
without danger of deadlock.
|
|
.El
|
|
.Sh CODE REFERENCES
|
|
The
|
|
.Nm
|
|
subsystem is implemented in
|
|
.Pa sys/kern/vfs_wapbl.c ,
|
|
with hooks in
|
|
.Pa sys/kern/vfs_bio.c .
|
|
.Sh SEE ALSO
|
|
.Xr buffercache 9 ,
|
|
.Xr vfsops 9 ,
|
|
.Xr wapbl_replay 9
|
|
.Sh BUGS
|
|
.Nm
|
|
works only for file system metadata managed via the
|
|
.Xr buffercache 9 ,
|
|
and provides no way to log writes via the page cache, as in
|
|
.Xr VOP_GETPAGES 9 ,
|
|
.Xr VOP_PUTPAGES 9 ,
|
|
and
|
|
.Xr ubc_uiomove 9 ,
|
|
which is normally used for file data.
|
|
.Pp
|
|
Not only is
|
|
.Nm
|
|
unable to log writes via the page cache, it is also unable to defer
|
|
.Xr buffercache 9
|
|
writes until cached pages have been written.
|
|
This manifests as the well-known garbage-data-appended-after-crash bug
|
|
in FFS: when appending to a file, the pages containing new data may not
|
|
reach the disk before the inode update reporting its new size.
|
|
After a crash, the inode update will be on disk, but the new data will
|
|
not be \(em instead, whatever garbage data was in the free space will
|
|
appear to have been appended to the file.
|
|
.Nm
|
|
exacerbates the problem by increasing the throughput of metadata
|
|
writes, because it can issue many metadata writes asynchronously that
|
|
FFS without
|
|
.Nm
|
|
would need to issue synchronously in order for
|
|
.Xr fsck 8
|
|
to work.
|
|
.Pp
|
|
The criteria for when the transaction must be flushed to disk before
|
|
.Fn wapbl_begin
|
|
returns are heuristic, i.e. wrong.
|
|
There is no way for a file system to communicate to
|
|
.Fn wapbl_begin
|
|
how many buffers, inodes, and deallocations it will issue via
|
|
.Nm
|
|
in the transaction.
|
|
.Pp
|
|
.Nm
|
|
mainly supports write-ahead, and has only limited support for rolling
|
|
back operations, in the form of
|
|
.Fn wapbl_register_inode
|
|
and
|
|
.Fn wapbl_unregister_inode .
|
|
Consequently, for example, large writes appending to a file, which
|
|
require multiple disk block allocations and an inode update, must
|
|
occur in a single transaction \(em there is no way to roll back the
|
|
disk block allocations if the write fails in the middle, e.g. because
|
|
of a fault in the middle of the user buffer.
|
|
.Pp
|
|
.Fn wapbl_jlock_assert
|
|
does not guarantee that the current thread has the current transaction
|
|
locked.
|
|
.Fn wapbl_junlock_assert
|
|
does not guarantee that the current thread does not have the current
|
|
transaction locked at all.
|
|
.Pp
|
|
There is only one
|
|
.Nm
|
|
transaction for each file system at any given time, and only one
|
|
.Nm
|
|
log on disk.
|
|
Consequently, all writes are serialized.
|
|
Extending
|
|
.Nm
|
|
to support multiple logs per file system, partitioned according to an
|
|
appropriate scheme, is left as an exercise for the reader.
|
|
.Pp
|
|
There is no reason for
|
|
.Nm
|
|
to require its own hooks in
|
|
.Xr buffercache 9 .
|
|
.Pp
|
|
The on-disk format used by
|
|
.Nm
|
|
is undocumented.
|