PR kern/39564 wapbl performance issues with disk cache flushing

PR kern/40361 WAPBL locking panic in -current
PR kern/40361 WAPBL locking panic in -current
PR kern/40470 WAPBL corrupts ext2fs
PR kern/40562 busy loop in ffs_sync when unmounting a file system
PR kern/40525 panic: ffs_valloc: dup alloc

- A fix for an issue that can lead to the "ffs_valloc: dup alloc" panic due
  to dirty cg buffers being invalidated. Problem discovered and patch by
  dholland@.

- If the syncer fails to lazily sync a vnode due to lock contention,
  retry 1 second later instead of 30 seconds later.

- Flush inode atime updates every ~10 seconds (this makes most sense with
  logging). Previously they did not hit the disk for read-only files or
  devices until the file system was unmounted. It would be better to trickle
  the updates out, but that would require more extensive changes.

- Fix issues with file system corruption, busy looping and other nasty
  problems when logging and non-logging file systems are intermixed,
  with one being the root file system.

- For logging, do not flush metadata on an inode-at-a-time basis if the sync
  has been requested by ioflush. Previously, we could try hundreds of log
  sync operations a second due to inode update activity, causing the syncer
  to fall behind and metadata updates to be serialized across the entire
  file system. Instead, burst out metadata and log flushes at a minimum
  interval of every 10 seconds on an active file system (happens more often
  if the log becomes full). Note this does not change the operation of
  fsync() etc.

- With the flush issue fixed, re-enable concurrent metadata updates in
  vfs_wapbl.c.
This commit is contained in:
ad 2009-02-22 20:10:25 +00:00
parent 4534498c64
commit 430f67aa17
6 changed files with 328 additions and 90 deletions

View File

@ -1,7 +1,7 @@
/* $NetBSD: vfs_wapbl.c,v 1.22 2009/02/18 13:22:10 yamt Exp $ */ /* $NetBSD: vfs_wapbl.c,v 1.23 2009/02/22 20:10:25 ad Exp $ */
/*- /*-
* Copyright (c) 2003,2008 The NetBSD Foundation, Inc. * Copyright (c) 2003, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved. * All rights reserved.
* *
* This code is derived from software contributed to The NetBSD Foundation * This code is derived from software contributed to The NetBSD Foundation
@ -36,7 +36,7 @@
#define WAPBL_INTERNAL #define WAPBL_INTERNAL
#include <sys/cdefs.h> #include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.22 2009/02/18 13:22:10 yamt Exp $"); __KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.23 2009/02/22 20:10:25 ad Exp $");
#include <sys/param.h> #include <sys/param.h>
@ -770,27 +770,9 @@ wapbl_begin(struct wapbl *wl, const char *file, int line)
{ {
int doflush; int doflush;
unsigned lockcount; unsigned lockcount;
krw_t op;
KDASSERT(wl); KDASSERT(wl);
/*
* XXX: The original code calls for the use of a RW_READER lock
* here, but it turns out there are performance issues with high
* metadata-rate workloads (e.g. multiple simultaneous tar
* extractions). For now, we force the lock to be RW_WRITER,
* since that currently has the best performance characteristics
* (even for a single tar-file extraction).
*
*/
#define WAPBL_DEBUG_SERIALIZE 1
#ifdef WAPBL_DEBUG_SERIALIZE
op = RW_WRITER;
#else
op = RW_READER;
#endif
/* /*
* XXX this needs to be made much more sophisticated. * XXX this needs to be made much more sophisticated.
* perhaps each wapbl_begin could reserve a specified * perhaps each wapbl_begin could reserve a specified
@ -820,12 +802,12 @@ wapbl_begin(struct wapbl *wl, const char *file, int line)
return error; return error;
} }
rw_enter(&wl->wl_rwlock, op); rw_enter(&wl->wl_rwlock, RW_READER);
mutex_enter(&wl->wl_mtx); mutex_enter(&wl->wl_mtx);
wl->wl_lock_count++; wl->wl_lock_count++;
mutex_exit(&wl->wl_mtx); mutex_exit(&wl->wl_mtx);
#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) #if defined(WAPBL_DEBUG_PRINT)
WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
("wapbl_begin thread %d.%d with bufcount=%zu " ("wapbl_begin thread %d.%d with bufcount=%zu "
"bufbytes=%zu bcount=%zu at %s:%d\n", "bufbytes=%zu bcount=%zu at %s:%d\n",
@ -840,7 +822,7 @@ void
wapbl_end(struct wapbl *wl) wapbl_end(struct wapbl *wl)
{ {
#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) #if defined(WAPBL_DEBUG_PRINT)
WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION,
("wapbl_end thread %d.%d with bufcount=%zu " ("wapbl_end thread %d.%d with bufcount=%zu "
"bufbytes=%zu bcount=%zu\n", "bufbytes=%zu bcount=%zu\n",
@ -1552,20 +1534,14 @@ void
wapbl_jlock_assert(struct wapbl *wl) wapbl_jlock_assert(struct wapbl *wl)
{ {
#ifdef WAPBL_DEBUG_SERIALIZE KASSERT(rw_lock_held(&wl->wl_rwlock));
KASSERT(rw_write_held(&wl->wl_rwlock));
#else
KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock));
#endif
} }
void void
wapbl_junlock_assert(struct wapbl *wl) wapbl_junlock_assert(struct wapbl *wl)
{ {
#ifdef WAPBL_DEBUG_SERIALIZE
KASSERT(!rw_write_held(&wl->wl_rwlock)); KASSERT(!rw_write_held(&wl->wl_rwlock));
#endif
} }
/****************************************************************/ /****************************************************************/

View File

@ -1,4 +1,33 @@
/* $NetBSD: sync_subr.c,v 1.35 2009/01/17 07:02:35 yamt Exp $ */ /* $NetBSD: sync_subr.c,v 1.36 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* /*
* Copyright 1997 Marshall Kirk McKusick. All Rights Reserved. * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved.
@ -32,7 +61,7 @@
*/ */
#include <sys/cdefs.h> #include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.35 2009/01/17 07:02:35 yamt Exp $"); __KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.36 2009/02/22 20:10:25 ad Exp $");
#include <sys/param.h> #include <sys/param.h>
#include <sys/systm.h> #include <sys/systm.h>
@ -59,6 +88,7 @@ time_t syncdelay = 30; /* max time to delay syncing data */
time_t filedelay = 30; /* time to delay syncing files */ time_t filedelay = 30; /* time to delay syncing files */
time_t dirdelay = 15; /* time to delay syncing directories */ time_t dirdelay = 15; /* time to delay syncing directories */
time_t metadelay = 10; /* time to delay syncing metadata */ time_t metadelay = 10; /* time to delay syncing metadata */
time_t lockdelay = 1; /* time to delay if locking fails */
kmutex_t syncer_mutex; /* used to freeze syncer, long term */ kmutex_t syncer_mutex; /* used to freeze syncer, long term */
static kmutex_t syncer_data_lock; /* short term lock on data structures */ static kmutex_t syncer_data_lock; /* short term lock on data structures */
@ -196,6 +226,7 @@ sched_sync(void *v)
struct synclist *slp; struct synclist *slp;
struct vnode *vp; struct vnode *vp;
long starttime; long starttime;
bool synced;
updateproc = curlwp; updateproc = curlwp;
@ -206,8 +237,7 @@ sched_sync(void *v)
starttime = time_second; starttime = time_second;
/* /*
* Push files whose dirty time has expired. Be careful * Push files whose dirty time has expired.
* of interrupt race on slp queue.
*/ */
slp = &syncer_workitem_pending[syncer_delayno]; slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1; syncer_delayno += 1;
@ -216,10 +246,12 @@ sched_sync(void *v)
while ((vp = TAILQ_FIRST(slp)) != NULL) { while ((vp = TAILQ_FIRST(slp)) != NULL) {
/* We are locking in the wrong direction. */ /* We are locking in the wrong direction. */
synced = false;
if (mutex_tryenter(&vp->v_interlock)) { if (mutex_tryenter(&vp->v_interlock)) {
mutex_exit(&syncer_data_lock); mutex_exit(&syncer_data_lock);
if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT | if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT |
LK_INTERLOCK) == 0) { LK_INTERLOCK) == 0) {
synced = true;
(void) VOP_FSYNC(vp, curlwp->l_cred, (void) VOP_FSYNC(vp, curlwp->l_cred,
FSYNC_LAZY, 0, 0); FSYNC_LAZY, 0, 0);
vput(vp); vput(vp);
@ -227,15 +259,36 @@ sched_sync(void *v)
mutex_enter(&syncer_data_lock); mutex_enter(&syncer_data_lock);
} }
/* XXXAD The vnode may have been recycled. */ /*
* XXX The vnode may have been recycled, in which
* case it may have a new identity.
*/
if (TAILQ_FIRST(slp) == vp) { if (TAILQ_FIRST(slp) == vp) {
/* /*
* Put us back on the worklist. The worklist * Put us back on the worklist. The worklist
* routine will remove us from our current * routine will remove us from our current
* position and then add us back in at a later * position and then add us back in at a later
* position. * position.
*
* Try again sooner rather than later if
* we were unable to lock the vnode. Lock
* failure should not prevent us from doing
* the sync "soon".
*
* If we locked it yet arrive here, it's
* likely that lazy sync is in progress and
* so the vnode still has dirty metadata.
* syncdelay is mainly to get this vnode out
* of the way so we do not consider it again
* "soon" in this loop, so the delay time is
* not critical as long as it is not "soon".
* While write-back strategy is the file
* system's domain, we expect write-back to
* occur no later than syncdelay seconds
* into the future.
*/ */
vn_syncer_add1(vp, syncdelay); vn_syncer_add1(vp,
synced ? syncdelay : lockdelay);
} }
} }
@ -247,8 +300,10 @@ sched_sync(void *v)
if (bioopsp != NULL) if (bioopsp != NULL)
(*bioopsp->io_sync)(NULL); (*bioopsp->io_sync)(NULL);
/*
* Wait until there are more workitems to process.
*/
mutex_exit(&syncer_mutex); mutex_exit(&syncer_mutex);
mutex_enter(&syncer_data_lock); mutex_enter(&syncer_data_lock);
if (rushjob > 0) { if (rushjob > 0) {
/* /*

View File

@ -1,4 +1,33 @@
/* $NetBSD: sync_vnops.c,v 1.25 2008/05/06 18:43:44 ad Exp $ */ /* $NetBSD: sync_vnops.c,v 1.26 2009/02/22 20:10:25 ad Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/* /*
* Copyright 1997 Marshall Kirk McKusick. All Rights Reserved. * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved.
@ -32,7 +61,7 @@
*/ */
#include <sys/cdefs.h> #include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sync_vnops.c,v 1.25 2008/05/06 18:43:44 ad Exp $"); __KERNEL_RCSID(0, "$NetBSD: sync_vnops.c,v 1.26 2009/02/22 20:10:25 ad Exp $");
#include <sys/param.h> #include <sys/param.h>
#include <sys/proc.h> #include <sys/proc.h>
@ -62,6 +91,18 @@ const struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
const struct vnodeopv_desc sync_vnodeop_opv_desc = const struct vnodeopv_desc sync_vnodeop_opv_desc =
{ &sync_vnodeop_p, sync_vnodeop_entries }; { &sync_vnodeop_p, sync_vnodeop_entries };
/*
 * Pick the worklist delay to use for a mount's sync vnode.
 *
 * A mount with a WAPBL log attached (mnt_wapbl != NULL) gets the shorter
 * metadata delay, because the sync vnode is what bursts out its metadata
 * updates; every other mount gets the general sync delay.
 */
static inline int
sync_delay(struct mount *mp)
{

	if (mp->mnt_wapbl == NULL)
		return syncdelay;
	return metadelay;
}
/* /*
* Create a new filesystem syncer vnode for the specified mount point. * Create a new filesystem syncer vnode for the specified mount point.
*/ */
@ -70,8 +111,8 @@ vfs_allocate_syncvnode(mp)
struct mount *mp; struct mount *mp;
{ {
struct vnode *vp; struct vnode *vp;
static long start, incr, next; static int start, incr, next;
int error; int error, vdelay;
/* Allocate a new vnode */ /* Allocate a new vnode */
if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0)
@ -98,7 +139,8 @@ vfs_allocate_syncvnode(mp)
next = start; next = start;
} }
mutex_enter(&vp->v_interlock); mutex_enter(&vp->v_interlock);
vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); vdelay = sync_delay(mp);
vn_syncer_add_to_worklist(vp, vdelay > 0 ? next % vdelay : 0);
mutex_exit(&vp->v_interlock); mutex_exit(&vp->v_interlock);
mp->mnt_syncer = vp; mp->mnt_syncer = vp;
return (0); return (0);
@ -149,7 +191,7 @@ sync_fsync(v)
* Move ourselves to the back of the sync list. * Move ourselves to the back of the sync list.
*/ */
mutex_enter(&syncvp->v_interlock); mutex_enter(&syncvp->v_interlock);
vn_syncer_add_to_worklist(syncvp, syncdelay); vn_syncer_add_to_worklist(syncvp, sync_delay(mp));
mutex_exit(&syncvp->v_interlock); mutex_exit(&syncvp->v_interlock);
/* /*

View File

@ -1,4 +1,4 @@
/* $NetBSD: ffs_alloc.c,v 1.120 2009/01/11 02:45:56 christos Exp $ */ /* $NetBSD: ffs_alloc.c,v 1.121 2009/02/22 20:10:25 ad Exp $ */
/*- /*-
* Copyright (c) 2008 The NetBSD Foundation, Inc. * Copyright (c) 2008 The NetBSD Foundation, Inc.
@ -70,7 +70,7 @@
*/ */
#include <sys/cdefs.h> #include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.120 2009/01/11 02:45:56 christos Exp $"); __KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.121 2009/02/22 20:10:25 ad Exp $");
#if defined(_KERNEL_OPT) #if defined(_KERNEL_OPT)
#include "opt_ffs.h" #include "opt_ffs.h"
@ -1284,7 +1284,7 @@ retry:
if (ibp != NULL && if (ibp != NULL &&
initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) { initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
/* Another thread allocated more inodes so we retry the test. */ /* Another thread allocated more inodes so we retry the test. */
brelse(ibp, BC_INVAL); brelse(ibp, 0);
ibp = NULL; ibp = NULL;
} }
/* /*
@ -1396,7 +1396,7 @@ gotit:
if (bp != NULL) if (bp != NULL)
brelse(bp, 0); brelse(bp, 0);
if (ibp != NULL) if (ibp != NULL)
brelse(ibp, BC_INVAL); brelse(ibp, 0);
mutex_enter(&ump->um_lock); mutex_enter(&ump->um_lock);
return (0); return (0);
} }

View File

@ -1,4 +1,4 @@
/* $NetBSD: ffs_vfsops.c,v 1.241 2008/11/13 11:09:45 ad Exp $ */ /* $NetBSD: ffs_vfsops.c,v 1.242 2009/02/22 20:10:25 ad Exp $ */
/*- /*-
* Copyright (c) 2008 The NetBSD Foundation, Inc. * Copyright (c) 2008 The NetBSD Foundation, Inc.
@ -61,7 +61,7 @@
*/ */
#include <sys/cdefs.h> #include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.241 2008/11/13 11:09:45 ad Exp $"); __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.242 2009/02/22 20:10:25 ad Exp $");
#if defined(_KERNEL_OPT) #if defined(_KERNEL_OPT)
#include "opt_ffs.h" #include "opt_ffs.h"
@ -111,6 +111,8 @@ __KERNEL_RCSID(0, "$NetBSD: ffs_vfsops.c,v 1.241 2008/11/13 11:09:45 ad Exp $");
MODULE(MODULE_CLASS_VFS, ffs, NULL); MODULE(MODULE_CLASS_VFS, ffs, NULL);
static int ffs_vfs_fsync(vnode_t *, int);
static struct sysctllog *ffs_sysctl_log; static struct sysctllog *ffs_sysctl_log;
/* how many times ffs_init() was called */ /* how many times ffs_init() was called */
@ -151,7 +153,7 @@ struct vfsops ffs_vfsops = {
ffs_suspendctl, ffs_suspendctl,
genfs_renamelock_enter, genfs_renamelock_enter,
genfs_renamelock_exit, genfs_renamelock_exit,
ffs_full_fsync, ffs_vfs_fsync,
ffs_vnodeopv_descs, ffs_vnodeopv_descs,
0, 0,
{ NULL, NULL }, { NULL, NULL },
@ -1697,11 +1699,22 @@ loop:
continue; continue;
mutex_enter(&vp->v_interlock); mutex_enter(&vp->v_interlock);
ip = VTOI(vp); ip = VTOI(vp);
/* XXXpooka: why wapbl check? */
/*
* We deliberately update inode times here. This will
* prevent a massive queue of updates accumulating, only
* to be handled by a call to unmount.
*
* XXX It would be better to have the syncer trickle these
* out. Adjustment needed to allow registering vnodes for
* sync when the vnode is clean, but the inode dirty. Or
* have ufs itself trickle out inode updates.
*/
if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 || if (ip == NULL || (vp->v_iflag & (VI_XLOCK | VI_CLEAN)) != 0 ||
vp->v_type == VNON || ((ip->i_flag & vp->v_type == VNON || ((ip->i_flag &
(IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY |
(LIST_EMPTY(&vp->v_dirtyblkhd) || (mp->mnt_wapbl)) && IN_MODIFIED | IN_ACCESSED)) == 0 &&
LIST_EMPTY(&vp->v_dirtyblkhd) &&
UVM_OBJ_IS_CLEAN(&vp->v_uobj))) UVM_OBJ_IS_CLEAN(&vp->v_uobj)))
{ {
mutex_exit(&vp->v_interlock); mutex_exit(&vp->v_interlock);
@ -2138,3 +2151,152 @@ ffs_suspendctl(struct mount *mp, int cmd)
return EINVAL; return EINVAL;
} }
} }
/*
 * Synch vnode for a mounted file system.  This is called for foreign
 * vnodes, i.e. non-ffs.
 *
 * vp must be a block device vnode (VBLK) that has a file system mounted
 * on it (v_specmountpoint != NULL); both conditions are asserted.
 *
 * flags is a mask of FSYNC_* bits:
 *   FSYNC_WAIT      flush pages synchronously and wait for all
 *                   outstanding writes on vp to drain.
 *   FSYNC_DATAONLY,
 *   FSYNC_LAZY,
 *   FSYNC_NOLOG     on a logging (WAPBL) mount, return right after the
 *                   page flush without touching the log.
 *   FSYNC_CACHE     on success, ask the device to flush its write cache.
 *
 * Returns 0 on success, or the error reported by VOP_PUTPAGES(),
 * wapbl_flush(), bwrite() or softdep_sync_metadata().
 */
static int
ffs_vfs_fsync(vnode_t *vp, int flags)
{
	int error, passes, skipmeta, pflags;
	buf_t *bp, *nbp;
	struct mount *mp;

	KASSERT(vp->v_type == VBLK);
	KASSERT(vp->v_specmountpoint != NULL);

	mp = vp->v_specmountpoint;
	if ((mp->mnt_flag & MNT_SOFTDEP) != 0)
		softdep_fsync_mountdev(vp);

	/*
	 * Flush all dirty data associated with the vnode.
	 *
	 * NOTE(review): VOP_PUTPAGES() is entered with v_interlock held and
	 * presumably drops it before returning -- the code below takes the
	 * interlock again without an intervening mutex_exit().  Confirm.
	 */
	pflags = PGO_ALLPAGES | PGO_CLEANIT;
	if ((flags & FSYNC_WAIT) != 0)
		pflags |= PGO_SYNCIO;
	mutex_enter(&vp->v_interlock);
	error = VOP_PUTPAGES(vp, 0, 0, pflags);
	if (error)
		return error;

#ifdef WAPBL
	/* mp was already dereferenced above, so it cannot be NULL here. */
	if (mp->mnt_wapbl != NULL) {
		/*
		 * Don't bother writing out metadata if the syncer is
		 * making the request.  We will let the sync vnode
		 * write it out in a single burst through a call to
		 * VFS_SYNC().
		 */
		if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY | FSYNC_NOLOG)) != 0)
			return 0;

		/*
		 * Don't flush the log if the vnode being flushed
		 * contains no dirty buffers that could be in the log.
		 */
		if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
			error = wapbl_flush(mp->mnt_wapbl, 0);
			if (error)
				return error;
		}

		/* Wait for writes already in flight on this vnode. */
		if ((flags & FSYNC_WAIT) != 0) {
			mutex_enter(&vp->v_interlock);
			while (vp->v_numoutput)
				cv_wait(&vp->v_cv, &vp->v_interlock);
			mutex_exit(&vp->v_interlock);
		}

		return 0;
	}
#endif /* WAPBL */

	/*
	 * Write out metadata for non-logging file systems.  This block can
	 * be simplified once softdep goes.
	 *
	 * For a waiting sync the first sweep skips buffers with a negative
	 * logical block number; a second sweep then writes everything.
	 */
	passes = NIADDR + 1;
	skipmeta = 0;
	if (flags & FSYNC_WAIT)
		skipmeta = 1;

loop:
	mutex_enter(&bufcache_lock);
	/* Clear the per-sweep marker on every dirty buffer. */
	LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
		bp->b_cflags &= ~BC_SCANNED;
	}
	for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = LIST_NEXT(bp, b_vnbufs);
		if (bp->b_cflags & (BC_BUSY | BC_SCANNED))
			continue;
		if ((bp->b_oflags & BO_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		if (skipmeta && bp->b_lblkno < 0)
			continue;
		bp->b_cflags |= BC_BUSY | BC_VFLUSH | BC_SCANNED;
		mutex_exit(&bufcache_lock);
		/*
		 * On our final pass through, do all I/O synchronously
		 * so that we can find out if our flush is failing
		 * because of write errors.
		 */
		if (passes > 0 || !(flags & FSYNC_WAIT))
			(void) bawrite(bp);
		else if ((error = bwrite(bp)) != 0)
			return (error);
		/*
		 * Since we unlocked during the I/O, we need
		 * to start from a known point.
		 */
		mutex_enter(&bufcache_lock);
		nbp = LIST_FIRST(&vp->v_dirtyblkhd);
	}
	mutex_exit(&bufcache_lock);
	if (skipmeta) {
		skipmeta = 0;
		goto loop;
	}

	if ((flags & FSYNC_WAIT) != 0) {
		mutex_enter(&vp->v_interlock);
		while (vp->v_numoutput) {
			cv_wait(&vp->v_cv, &vp->v_interlock);
		}
		mutex_exit(&vp->v_interlock);

		/*
		 * Ensure that any filesystem metadata associated
		 * with the vnode has been written.
		 */
		if ((error = softdep_sync_metadata(vp)) != 0)
			return (error);

		if (!LIST_EMPTY(&vp->v_dirtyblkhd)) {
			/*
			 * Block devices associated with filesystems may
			 * have new I/O requests posted for them even if
			 * the vnode is locked, so no amount of trying will
			 * get them clean.  Thus we give block devices a
			 * good effort, then just give up.  For all other file
			 * types, go around and try again until it is clean.
			 */
			if (passes > 0) {
				passes--;
				goto loop;
			}
#ifdef DIAGNOSTIC
			if (vp->v_type != VBLK)
				vprint("ffs_fsync: dirty", vp);
#endif
		}
	}

	if (error == 0 && (flags & FSYNC_CACHE) != 0) {
		/*
		 * The ioctl is handed the address of this int, so it must
		 * hold a defined value; it was previously passed
		 * uninitialized.  NOTE(review): presumably this is the
		 * "force" argument of DIOCCACHESYNC -- confirm against
		 * sys/dkio.h.
		 */
		int i = 0;

		(void)VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE,
		    kauth_cred_get());
	}

	return error;
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: ffs_vnops.c,v 1.109 2009/02/01 17:36:43 ad Exp $ */ /* $NetBSD: ffs_vnops.c,v 1.110 2009/02/22 20:10:25 ad Exp $ */
/*- /*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc. * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
@ -61,7 +61,7 @@
*/ */
#include <sys/cdefs.h> #include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.109 2009/02/01 17:36:43 ad Exp $"); __KERNEL_RCSID(0, "$NetBSD: ffs_vnops.c,v 1.110 2009/02/22 20:10:25 ad Exp $");
#if defined(_KERNEL_OPT) #if defined(_KERNEL_OPT)
#include "opt_ffs.h" #include "opt_ffs.h"
@ -319,7 +319,13 @@ ffs_fsync(void *v)
#ifdef WAPBL #ifdef WAPBL
mp = wapbl_vptomp(vp); mp = wapbl_vptomp(vp);
if (mp->mnt_wapbl) { if (mp->mnt_wapbl) {
if (ap->a_flags & FSYNC_DATAONLY) { /*
* Don't bother writing out metadata if the syncer is
* making the request. We will let the sync vnode
* write it out in a single burst through a call to
* VFS_SYNC().
*/
if ((ap->a_flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0) {
fstrans_done(vp->v_mount); fstrans_done(vp->v_mount);
return 0; return 0;
} }
@ -336,7 +342,7 @@ ffs_fsync(void *v)
(ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); (ap->a_flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
UFS_WAPBL_END(mp); UFS_WAPBL_END(mp);
} }
if (error || (ap->a_flags & FSYNC_NOLOG)) { if (error || (ap->a_flags & FSYNC_NOLOG) != 0) {
fstrans_done(vp->v_mount); fstrans_done(vp->v_mount);
return error; return error;
} }
@ -393,43 +399,38 @@ out:
} }
/* /*
* Synch an open file. Called for VOP_FSYNC() and VFS_FSYNC(). * Synch an open file. Called for VOP_FSYNC().
*
* BEWARE: THIS ROUTINE ACCEPTS BOTH FFS AND NON-FFS VNODES.
*/ */
/* ARGSUSED */ /* ARGSUSED */
int int
ffs_full_fsync(struct vnode *vp, int flags) ffs_full_fsync(struct vnode *vp, int flags)
{ {
struct buf *bp, *nbp; struct buf *bp, *nbp;
int error, passes, skipmeta, inodedeps_only, waitfor; int error, passes, skipmeta, inodedeps_only, waitfor, i;
struct mount *mp; struct mount *mp;
KASSERT(VTOI(vp) != NULL);
KASSERT(vp->v_tag == VT_UFS);
error = 0; error = 0;
if ((flags & FSYNC_VFS) != 0) { mp = vp->v_mount;
KASSERT(vp->v_specmountpoint != NULL); if (vp->v_type == VBLK && vp->v_specmountpoint != NULL) {
mp = vp->v_specmountpoint; mp = vp->v_specmountpoint;
KASSERT(vp->v_type == VBLK); if ((mp->mnt_flag & MNT_SOFTDEP) != 0)
softdep_fsync_mountdev(vp);
} else { } else {
mp = vp->v_mount; mp = vp->v_mount;
KASSERT(vp->v_tag == VT_UFS);
} }
if (vp->v_type == VBLK &&
vp->v_specmountpoint != NULL &&
(vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP))
softdep_fsync_mountdev(vp);
mutex_enter(&vp->v_interlock); mutex_enter(&vp->v_interlock);
inodedeps_only = DOINGSOFTDEP(vp) && (flags & FSYNC_RECLAIM) inodedeps_only = DOINGSOFTDEP(vp) && (flags & FSYNC_RECLAIM)
&& UVM_OBJ_IS_CLEAN(&vp->v_uobj) && LIST_EMPTY(&vp->v_dirtyblkhd); && UVM_OBJ_IS_CLEAN(&vp->v_uobj) && LIST_EMPTY(&vp->v_dirtyblkhd);
/* /*
* Flush all dirty data associated with a vnode. * Flush all dirty data associated with the vnode.
*/ */
if (vp->v_type == VREG || vp->v_type == VBLK) { if (vp->v_type == VREG || vp->v_type == VBLK) {
int pflags = PGO_ALLPAGES | PGO_CLEANIT; int pflags = PGO_ALLPAGES | PGO_CLEANIT;
@ -447,21 +448,25 @@ ffs_full_fsync(struct vnode *vp, int flags)
#ifdef WAPBL #ifdef WAPBL
if (mp && mp->mnt_wapbl) { if (mp && mp->mnt_wapbl) {
error = 0; /*
if (flags & FSYNC_DATAONLY) * Don't bother writing out metadata if the syncer is
return error; * making the request. We will let the sync vnode
* write it out in a single burst through a call to
* VFS_SYNC().
*/
if ((flags & (FSYNC_DATAONLY | FSYNC_LAZY)) != 0)
return 0;
if ((flags & FSYNC_VFS) == 0 && VTOI(vp) != NULL && if ((VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
(VTOI(vp)->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE
| IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) { | IN_MODIFY | IN_MODIFIED | IN_ACCESSED)) != 0) {
error = UFS_WAPBL_BEGIN(mp); error = UFS_WAPBL_BEGIN(mp);
if (error) if (error)
return error; return error;
error = ffs_update(vp, NULL, NULL, error = ffs_update(vp, NULL, NULL,
(flags & FSYNC_WAIT) ? UPDATE_WAIT : 0); (flags & FSYNC_WAIT) ? UPDATE_WAIT : 0);
UFS_WAPBL_END(mp); UFS_WAPBL_END(mp);
} }
if (error || (flags & FSYNC_NOLOG)) if (error || (flags & FSYNC_NOLOG) != 0)
return error; return error;
/* /*
@ -476,7 +481,7 @@ ffs_full_fsync(struct vnode *vp, int flags)
if ((flags & FSYNC_WAIT) != 0) { if ((flags & FSYNC_WAIT) != 0) {
mutex_enter(&vp->v_interlock); mutex_enter(&vp->v_interlock);
while (vp->v_numoutput) while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, &vp->v_interlock); cv_wait(&vp->v_cv, &vp->v_interlock);
mutex_exit(&vp->v_interlock); mutex_exit(&vp->v_interlock);
} }
@ -485,6 +490,10 @@ ffs_full_fsync(struct vnode *vp, int flags)
} }
#endif /* WAPBL */ #endif /* WAPBL */
/*
* Write out metadata for non-logging file systems. This block can
* be simplified once softdep goes.
*/
passes = NIADDR + 1; passes = NIADDR + 1;
skipmeta = 0; skipmeta = 0;
if (flags & FSYNC_WAIT) if (flags & FSYNC_WAIT)
@ -565,17 +574,11 @@ loop:
waitfor = 0; waitfor = 0;
else else
waitfor = (flags & FSYNC_WAIT) != 0 ? UPDATE_WAIT : 0; waitfor = (flags & FSYNC_WAIT) != 0 ? UPDATE_WAIT : 0;
error = ffs_update(vp, NULL, NULL, waitfor);
if ((flags & FSYNC_VFS) == 0)
error = ffs_update(vp, NULL, NULL, waitfor);
if (error == 0 && (flags & FSYNC_CACHE) != 0) { if (error == 0 && (flags & FSYNC_CACHE) != 0) {
int i = 0; (void)VOP_IOCTL(VTOI(vp)->i_devvp, DIOCCACHESYNC, &i, FWRITE,
if ((flags & FSYNC_VFS) == 0) { kauth_cred_get());
KASSERT(VTOI(vp) != NULL);
vp = VTOI(vp)->i_devvp;
}
VOP_IOCTL(vp, DIOCCACHESYNC, &i, FWRITE, curlwp->l_cred);
} }
return error; return error;