* Remove PGO_RECLAIM during lfs_putpages()'s call to genfs_putpages(),
  to avoid a livelock in the latter when reclaiming a vnode with
  dirty pages.

* Add a new segment flag, SEGM_RECLAIM, to note when a segment is
  being written for vnode reclamation, and record which inode is being
  reclaimed, to aid in forensic debugging.
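
The new bookkeeping is small; the following standalone model (not the
kernel code -- the structs are pared down to the fields involved, with
names taken from the diff below) shows how the flag and the recorded
inode number reach the on-disk segment summary:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SEGM_RECLAIM 0x0080 /* segment written to reclaim a vnode */
    #define SS_RECLAIM   0x10   /* on-disk mark of the same fact */

    struct lfs     { uint32_t lfs_reclino; };  /* inode being reclaimed */
    struct segment { uint16_t seg_flags; };    /* in-core segment state */
    struct segsum  { uint16_t ss_flags; uint32_t ss_reclino; };

    /* As in the lfs_writeseg() hunk below: copy the reclaim mark into
     * the summary so the segment can be identified later during
     * forensic debugging. */
    static void
    note_reclaim(const struct lfs *fs, const struct segment *sp,
        struct segsum *ssp)
    {
        if (sp->seg_flags & SEGM_RECLAIM) {
            ssp->ss_flags |= SS_RECLAIM;
            ssp->ss_reclino = fs->lfs_reclino;
        }
    }

    int
    main(void)
    {
        struct lfs fs = { .lfs_reclino = 1234 };
        struct segment sp = { .seg_flags = SEGM_RECLAIM };
        struct segsum ss = { 0, 0 };

        note_reclaim(&fs, &sp, &ss);
        printf("ss_flags=%#x ss_reclino=%" PRIu32 "\n",
            ss.ss_flags, ss.ss_reclino);
        return 0;
    }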

* Add a new segment flag, SEGM_SINGLE, so that opportunistic writes
  can write a single segment's worth of blocks and then stop, rather
  than writing all the way up to the cleaner's reserved number of
  segments.
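
The stop condition is simply "the write has advanced past the segment
it started in"; lfs_seglock() now records the starting segment for
this purpose.  A standalone sketch of the loop shape (the writeseg()
stand-in here is a toy, not lfs_writeseg()):

    #include <stdio.h>

    #define SEGM_SINGLE 0x0100 /* opportunistic: stop after one segment */

    struct lfs { int lfs_curseg, lfs_startseg; };

    /* Toy stand-in for lfs_writeseg(): pretend the third call fills
     * the current partial segment and advances to the next one. */
    static void
    writeseg(struct lfs *fs, int i)
    {
        if (i == 2)
            fs->lfs_curseg++;
    }

    int
    main(void)
    {
        struct lfs fs = { .lfs_curseg = 7 };
        int seg_flags = SEGM_SINGLE;

        fs.lfs_startseg = fs.lfs_curseg; /* as lfs_seglock() now does */
        for (int i = 0; i < 10; i++) {
            writeseg(&fs, i);
            /* Opportunistic writers quit once one segment's worth of
             * blocks is written, instead of writing all the way up to
             * the cleaner's reserve. */
            if ((seg_flags & SEGM_SINGLE) &&
                fs.lfs_curseg != fs.lfs_startseg) {
                printf("one segment written; stopping at i=%d\n", i);
                break;
            }
        }
        return 0;
    }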

* Add assert statements to check that mutex ownership is what it
  ought to be, mostly in lfs_putpages; fix the problems these
  uncovered.

* Don't clear VU_DIROP until the inode actually makes its way to disk,
  avoiding a problem where dirop inodes could become separated
  (uncovered by a modified version of the "ckckp" forensic regression
  test).
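
The patch implements this with a new inode flag, IN_CDIROP:
lfs_writeinode() sets it once the inode has been copied into a
segment, and lfs_unmark_dirop() clears VU_DIROP only for inodes that
carry it.  A standalone model of that handshake (flag values from the
diff; the vnode and inode are merged into one struct for brevity):

    #include <stdio.h>

    #define IN_ADIROP 0x0200 /* dirop in progress */
    #define IN_CDIROP 0x4000 /* dirop completed, pending i/o */
    #define VU_DIROP  0x01   /* vnode participates in a dirop */

    struct inode { int i_flag; int v_uflag; };

    /* lfs_writeinode(): the inode is on its way to disk, so its
     * dirop may be retired once the write completes. */
    static void
    write_inode(struct inode *ip)
    {
        ip->i_flag |= IN_CDIROP;
    }

    /* lfs_unmark_dirop(): only inodes that actually reached disk
     * (IN_CDIROP set, no new dirop started) lose VU_DIROP, so a
     * group of dirop inodes can no longer become separated. */
    static void
    unmark_dirop(struct inode *ip)
    {
        if ((ip->i_flag & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) {
            ip->v_uflag &= ~VU_DIROP;
            ip->i_flag &= ~IN_CDIROP;
        }
    }

    int
    main(void)
    {
        struct inode ip = { .i_flag = 0, .v_uflag = VU_DIROP };

        unmark_dirop(&ip); /* too early: inode never written */
        printf("before write: VU_DIROP=%d\n",
            (ip.v_uflag & VU_DIROP) != 0);

        write_inode(&ip);
        unmark_dirop(&ip); /* now the flag may be cleared */
        printf("after write:  VU_DIROP=%d\n",
            (ip.v_uflag & VU_DIROP) != 0);
        return 0;
    }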

* Move the vfs_getopsbyname() call into lfs_writerd.  Prepare code to
  make lfs_writerd notice when there are no more LFSs and exit,
  dropping the reference, so that, in theory, the module can be
  unloaded.  This code is not enabled, since it causes a crash on
  exit.

* Set IN_MODIFIED on inodes flushed by lfs_flush_dirops.  Really we
  only need to set IN_MODIFIED if we are going to write them again
  (e.g., to write pages); need to think about this more.

Finally, several changes to help avoid "no clean segments" panics:

* In lfs_bmapv, note when a vnode is loaded only to discover whether
  its blocks are live, so it can immediately be recycled.  Since the
  cleaner will try to choose ~empty segments over full ones, this
  prevents the cleaner from (1) filling the vnode cache with junk, and
  (2) squeezing any unwritten writes to disk and running the fs out of
  segments.
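
A standalone model of the marking scheme (the LFSI_BMAP value comes
from the diff; vrecycle() is stubbed, and the vnode struct is
invented for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    #define LFSI_BMAP 0x10 /* loaded only for a bmapv liveness check */

    struct vnode { int lfs_iflags; bool cached; };

    /* Stub for the kernel's vrecycle(): throw the vnode back so it
     * does not crowd useful entries out of the vnode cache. */
    static void
    vrecycle(struct vnode *vp)
    {
        vp->cached = false;
    }

    /* lfs_bmapv(): if the vnode had to be loaded just to answer the
     * cleaner's "are these blocks live?" query, mark it... */
    static void
    bmapv_load(struct vnode *vp, bool was_in_cache)
    {
        if (!was_in_cache)
            vp->lfs_iflags |= LFSI_BMAP;
    }

    /* ...and on the final unref, recycle marked vnodes at once. */
    static void
    bmapv_unref(struct vnode *vp)
    {
        if (vp->lfs_iflags & LFSI_BMAP)
            vrecycle(vp);
    }

    int
    main(void)
    {
        struct vnode vp = { .lfs_iflags = 0, .cached = true };

        bmapv_load(&vp, false);
        bmapv_unref(&vp);
        printf("still cached: %d\n", vp.cached); /* 0: recycled */
        return 0;
    }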

* Overestimate by half (a factor of 3/2) the amount of metadata that
  will be required to fill the clean segments.  This will make the
  disk appear smaller, but should help avoid a "no clean segments"
  panic.
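
Concretely, the new LFS_EST_CMETA scales the old estimate -- dmeta
per dirty segment, times the number of clean segments -- by
CM_MAG_NUM/CM_MAG_DEN = 3/2.  A worked example with illustrative
numbers (not taken from a real filesystem):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CM_MAG_NUM 3 /* overestimate metadata by 3/2, i.e. by half */
    #define CM_MAG_DEN 2

    /* Clean blocks expected to be consumed by metadata:
     * (3/2) * (dmeta / #dirty segments) * #clean segments. */
    static int32_t
    est_cmeta(int64_t dmeta, int32_t nclean, int32_t nseg)
    {
        return (int32_t)((CM_MAG_NUM * (dmeta * nclean)) /
            (CM_MAG_DEN * (nseg - nclean)));
    }

    int
    main(void)
    {
        /* 1000 metadata blocks over 50 dirty segments, 30 clean:
         * old estimate 1000*30/50 = 600, new 3*30000/100 = 900. */
        printf("est_cmeta = %" PRId32 " blocks\n",
            est_cmeta(1000, 30, 80));
        return 0;
    }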

* Rearrange lfs_writerd.  In particular, lfs_writerd now pays
  attention to the number of clean segments available, and holds off
  writing until there is room.
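
A pared-down sketch of the new per-filesystem decision in the daemon
loop (see the lfs_vfsops.c hunks below; LFS_STARVED_FOR_SEGS and the
cleaner wakeup are stubbed, and the struct fields are invented):

    #include <stdbool.h>
    #include <stdio.h>

    struct lfs { int nclean, reserved; bool pdflush; };

    /* Stub for the kernel's LFS_STARVED_FOR_SEGS() macro. */
    static bool
    starved_for_segs(const struct lfs *fs)
    {
        return fs->nclean <= fs->reserved;
    }

    static void wakeup_cleaner(struct lfs *fs) { (void)fs; puts("wake cleaner"); }
    static void flush_fs(struct lfs *fs)       { (void)fs; puts("flush (SEGM_SINGLE)"); }

    /* One pass of the rearranged writer daemon: hold off writing
     * while there is no room, waking the cleaner instead; otherwise
     * perform the requested opportunistic flush. */
    static void
    writerd_pass(struct lfs *fs)
    {
        if (starved_for_segs(fs)) {
            wakeup_cleaner(fs);
            return; /* try again on the next pass */
        }
        if (fs->pdflush) {
            fs->pdflush = false;
            flush_fs(fs);
        }
    }

    int
    main(void)
    {
        struct lfs fs = { .nclean = 2, .reserved = 4, .pdflush = true };

        writerd_pass(&fs); /* starved: only wakes the cleaner */
        fs.nclean = 10;
        writerd_pass(&fs); /* room now: the flush proceeds */
        return 0;
    }
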
perseant 2012-01-02 22:10:44 +00:00
parent b9d601ff9f
commit f9b3466d45
10 changed files with 483 additions and 207 deletions

lfs.h

@ -1,4 +1,4 @@
/* $NetBSD: lfs.h,v 1.134 2011/07/11 08:27:40 hannken Exp $ */
/* $NetBSD: lfs.h,v 1.135 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -592,6 +592,7 @@ struct segsum_v1 {
#define SS_CONT 0x02 /* more partials to finish this write*/
#define SS_CLEAN 0x04 /* written by the cleaner */
#define SS_RFW 0x08 /* written by the roll-forward agent */
#define SS_RECLAIM 0x10 /* written to reclaim a vnode */
u_int16_t ss_flags; /* 24: used for directory operations */
u_int16_t ss_pad; /* 26: extra space */
/* FINFO's and inode daddr's... */
@ -608,7 +609,8 @@ struct segsum {
u_int16_t ss_nfinfo; /* 20: number of file info structures */
u_int16_t ss_ninos; /* 22: number of inodes in summary */
u_int16_t ss_flags; /* 24: used for directory operations */
u_int8_t ss_pad[6]; /* 26: extra space */
u_int8_t ss_pad[2]; /* 26: extra space */
u_int32_t ss_reclino; /* 28: inode being reclaimed */
u_int64_t ss_serial; /* 32: serial number */
u_int64_t ss_create; /* 40: time stamp */
/* FINFO's and inode daddr's... */
@ -840,6 +842,8 @@ struct lfs {
int lfs_nowrap; /* Suspend log wrap */
int lfs_wrappass; /* Allow first log wrap requester to pass */
int lfs_wrapstatus; /* Wrap status */
int lfs_reclino; /* Inode being reclaimed */
int lfs_startseg; /* Segment we started writing at */
LIST_HEAD(, segdelta) lfs_segdhd; /* List of pending trunc accounting events */
};
@ -945,13 +949,15 @@ struct segment {
u_int32_t seg_number; /* number of this segment */
int32_t *start_lbp; /* beginning lbn for this set */
#define SEGM_CKP 0x01 /* doing a checkpoint */
#define SEGM_CLEAN 0x02 /* cleaner call; don't sort */
#define SEGM_SYNC 0x04 /* wait for segment */
#define SEGM_PROT 0x08 /* don't inactivate at segunlock */
#define SEGM_PAGEDAEMON 0x10 /* pagedaemon called us */
#define SEGM_WRITERD 0x20 /* LFS writed called us */
#define SEGM_FORCE_CKP 0x40 /* Force checkpoint right away */
#define SEGM_CKP 0x0001 /* doing a checkpoint */
#define SEGM_CLEAN 0x0002 /* cleaner call; don't sort */
#define SEGM_SYNC 0x0004 /* wait for segment */
#define SEGM_PROT 0x0008 /* don't inactivate at segunlock */
#define SEGM_PAGEDAEMON 0x0010 /* pagedaemon called us */
#define SEGM_WRITERD 0x0020 /* LFS writed called us */
#define SEGM_FORCE_CKP 0x0040 /* Force checkpoint right away */
#define SEGM_RECLAIM 0x0080 /* Writing to reclaim vnode */
#define SEGM_SINGLE 0x0100 /* Opportunistic writevnodes */
u_int16_t seg_flags; /* run-time flags for this segment */
u_int32_t seg_iocount; /* number of ios pending */
int ndupino; /* number of duplicate inodes */
@ -992,6 +998,7 @@ struct lfs_inode_ext {
#define LFSI_DELETED 0x02
#define LFSI_WRAPBLOCK 0x04
#define LFSI_WRAPWAIT 0x08
#define LFSI_BMAP 0x10
u_int32_t lfs_iflags; /* Inode flags */
daddr_t lfs_hiblk; /* Highest lbn held by inode */
#ifdef _KERNEL
@ -1017,10 +1024,16 @@ struct lfs_inode_ext {
* Macros for determining free space on the disk, with the variable metadata
* of segment summaries and inode blocks taken into account.
*/
/* Estimate number of clean blocks not available for writing */
#define LFS_EST_CMETA(F) (int32_t)((((F)->lfs_dmeta * \
(int64_t)(F)->lfs_nclean) / \
((F)->lfs_nseg - (F)->lfs_nclean)))
/*
* Estimate number of clean blocks not available for writing because
* they will contain metadata or overhead. This is calculated as
* (dmeta / # dirty segments) * (# clean segments).
*/
#define CM_MAG_NUM 3
#define CM_MAG_DEN 2
#define LFS_EST_CMETA(F) (int32_t)(( \
(CM_MAG_NUM * ((F)->lfs_dmeta * (int64_t)(F)->lfs_nclean)) / \
(CM_MAG_DEN * ((F)->lfs_nseg - (F)->lfs_nclean))))
/* Estimate total size of the disk not including metadata */
#define LFS_EST_NONMETA(F) ((F)->lfs_dsize - (F)->lfs_dmeta - LFS_EST_CMETA(F))

lfs_bio.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $ */
/* $NetBSD: lfs_bio.c,v 1.121 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.121 2012/01/02 22:10:44 perseant Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -96,6 +96,7 @@ int lfs_subsys_pages = 0L; /* Total number LFS-written pages */
int lfs_fs_pagetrip = 0; /* # of pages to trip per-fs write */
int lfs_writing = 0; /* Set if already kicked off a writer
because of buffer space */
int locked_queue_waiters = 0; /* Number of processes waiting on lq */
/* Lock and condition variables for above. */
kcondvar_t locked_queue_cv;
@ -160,8 +161,12 @@ lfs_reservebuf(struct lfs *fs, struct vnode *vp,
lfs_flush(fs, 0, 0);
DLOG((DLOG_AVAIL, "lfs_reservebuf: waiting: count=%d, bytes=%ld\n",
locked_queue_count, locked_queue_bytes));
++locked_queue_waiters;
error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
hz * LFS_BUFWAIT);
--locked_queue_waiters;
if (error && error != EWOULDBLOCK) {
mutex_exit(&lfs_lock);
return error;
@ -171,8 +176,11 @@ lfs_reservebuf(struct lfs *fs, struct vnode *vp,
locked_queue_rcount += n;
locked_queue_rbytes += bytes;
if (n < 0)
if (n < 0 && locked_queue_waiters > 0) {
DLOG((DLOG_AVAIL, "lfs_reservebuf: broadcast: count=%d, bytes=%ld\n",
locked_queue_count, locked_queue_bytes));
cv_broadcast(&locked_queue_cv);
}
mutex_exit(&lfs_lock);
@ -461,7 +469,7 @@ lfs_bwrite_ext(struct buf *bp, int flags)
*/
if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) {
bp->b_oflags &= ~BO_DELWRI;
bp->b_flags |= B_READ;
bp->b_flags |= B_READ; /* XXX is this right? --ks */
bp->b_error = 0;
mutex_enter(&bufcache_lock);
LFS_UNLOCK_BUF(bp);
@ -535,6 +543,7 @@ lfs_flush_fs(struct lfs *fs, int flags)
if (lfs_dostats)
++lfs_stats.flush_invoked;
fs->lfs_pdflush = 0;
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "fldirop");
lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
@ -689,10 +698,10 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags)
/* If there are too many pending dirops, we have to flush them. */
if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
flags |= SEGM_CKP;
}
if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
mutex_exit(&lfs_lock);
lfs_flush_dirops(fs);
mutex_enter(&lfs_lock);
} else if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
lfs_subsys_pages > LFS_MAX_PAGES ||
fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
@ -717,8 +726,10 @@ lfs_check(struct vnode *vp, daddr_t blkno, int flags)
++lfs_stats.wait_exceeded;
DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n",
locked_queue_count, locked_queue_bytes));
++locked_queue_waiters;
error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
hz * LFS_BUFWAIT);
--locked_queue_waiters;
if (error != EWOULDBLOCK)
break;

lfs_extern.h

@ -1,4 +1,4 @@
/* $NetBSD: lfs_extern.h,v 1.96 2008/06/28 01:34:05 rumble Exp $ */
/* $NetBSD: lfs_extern.h,v 1.97 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -240,8 +240,8 @@ int lfs_gop_alloc(struct vnode *, off_t, off_t, int, kauth_cred_t);
void lfs_gop_size(struct vnode *, off_t, off_t *, int);
int lfs_putpages_ext(void *, int);
int lfs_gatherpages(struct vnode *);
void lfs_flush_dirops(struct lfs *);
void lfs_flush_pchain(struct lfs *);
int lfs_flush_dirops(struct lfs *);
int lfs_flush_pchain(struct lfs *);
int lfs_bwrite (void *);
int lfs_fsync (void *);

lfs_segment.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $ */
/* $NetBSD: lfs_segment.c,v 1.223 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.223 2012/01/02 22:10:44 perseant Exp $");
#ifdef DEBUG
# define vndebug(vp, str) do { \
@ -202,6 +202,9 @@ lfs_vflush(struct vnode *vp)
relock = 0;
top:
KASSERT(mutex_owned(vp->v_interlock) == false);
KASSERT(mutex_owned(&lfs_lock) == false);
KASSERT(mutex_owned(&bufcache_lock) == false);
ASSERT_NO_SEGLOCK(fs);
if (ip->i_flag & IN_CLEANING) {
ivndebug(vp,"vflush/in_cleaning");
@ -280,7 +283,10 @@ lfs_vflush(struct vnode *vp)
mutex_exit(vp->v_interlock);
/* Protect against VI_XLOCK deadlock in vinvalbuf() */
lfs_seglock(fs, SEGM_SYNC);
lfs_seglock(fs, SEGM_SYNC | ((vp->v_iflag & VI_XLOCK) ? SEGM_RECLAIM : 0));
if (vp->v_iflag & VI_XLOCK) {
fs->lfs_reclino = ip->i_number;
}
/* If we're supposed to flush a freed inode, just toss it */
if (ip->i_lfs_iflags & LFSI_DELETED) {
@ -380,11 +386,12 @@ lfs_vflush(struct vnode *vp)
do {
if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
relock = lfs_writefile(fs, sp, vp);
if (relock) {
if (relock && vp != fs->lfs_ivnode) {
/*
* Might have to wait for the
* cleaner to run; but we're
* still not done with this vnode.
* XXX we can do better than this.
*/
KDASSERT(ip->i_number != LFS_IFILE_INUM);
lfs_writeinode(fs, sp, ip);
@ -486,9 +493,16 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
* After this, pages might be busy
* due to our own previous putpages.
* Start actual segment write here to avoid deadlock.
* If we were just writing one segment and we've done
* that, break out.
*/
mutex_exit(&mntvnode_lock);
(void)lfs_writeseg(fs, sp);
if (lfs_writeseg(fs, sp) &&
(sp->seg_flags & SEGM_SINGLE) &&
fs->lfs_curseg != fs->lfs_startseg) {
DLOG((DLOG_VNODE, "lfs_writevnodes: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
break;
}
goto loop;
}
@ -626,6 +640,10 @@ lfs_segwrite(struct mount *mp, int flags)
*/
do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
/* We can't do a partial write and checkpoint at the same time. */
if (do_ckp)
flags &= ~SEGM_SINGLE;
lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
sp = fs->lfs_sp;
if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
@ -645,6 +663,11 @@ lfs_segwrite(struct mount *mp, int flags)
else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
do {
um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
if ((sp->seg_flags & SEGM_SINGLE) &&
fs->lfs_curseg != fs->lfs_startseg) {
DLOG((DLOG_SEG, "lfs_segwrite: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
break;
}
if (do_ckp || fs->lfs_dirops == 0) {
if (!writer_set) {
@ -1025,6 +1048,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
{
struct buf *bp;
struct ufs1_dinode *cdp;
struct vnode *vp = ITOV(ip);
daddr_t daddr;
int32_t *daddrp; /* XXX ondisk32 */
int i, ndx;
@ -1033,7 +1057,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
int count;
ASSERT_SEGLOCK(fs);
if (!(ip->i_flag & IN_ALLMOD))
if (!(ip->i_flag & IN_ALLMOD) && !(vp->v_uflag & VU_DIROP))
return (0);
/* Can't write ifile when writer is not set */
@ -1047,7 +1071,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
* solid.
*/
count = 0;
while (ip->i_number == LFS_IFILE_INUM) {
while (vp == fs->lfs_ivnode) {
int redo = 0;
if (sp->idp == NULL && sp->ibp == NULL &&
@ -1112,7 +1136,7 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
}
/* Check VU_DIROP in case there is a new file with no data blocks */
if (ITOV(ip)->v_uflag & VU_DIROP)
if (vp->v_uflag & VU_DIROP)
((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
/* Update the inode times and copy the inode onto the inode page. */
@ -1138,6 +1162,18 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
cdp = ((struct ufs1_dinode *)bp->b_data) + (sp->ninodes % INOPB(fs));
*cdp = *ip->i_din.ffs1_din;
/*
* This inode is on its way to disk; clear its VU_DIROP status when
* the write is complete.
*/
if (vp->v_uflag & VU_DIROP) {
if (!(sp->seg_flags & SEGM_CLEAN))
ip->i_flag |= IN_CDIROP;
else {
DLOG((DLOG_DIROP, "lfs_writeinode: not clearing dirop for cleaned ino %d\n", (int)ip->i_number));
}
}
/*
* If cleaning, link counts and directory file sizes cannot change,
* since those would be directory operations---even if the file
@ -1146,9 +1182,9 @@ lfs_writeinode(struct lfs *fs, struct segment *sp, struct inode *ip)
* current values the next time we clean.
*/
if (sp->seg_flags & SEGM_CLEAN) {
if (ITOV(ip)->v_uflag & VU_DIROP) {
if (vp->v_uflag & VU_DIROP) {
cdp->di_nlink = ip->i_lfs_odnlink;
/* if (ITOV(ip)->v_type == VDIR) */
/* if (vp->v_type == VDIR) */
cdp->di_size = ip->i_lfs_osize;
}
} else {
@ -1988,6 +2024,12 @@ lfs_writeseg(struct lfs *fs, struct segment *sp)
if (sp->seg_flags & SEGM_CLEAN)
ssp->ss_flags |= SS_CLEAN;
/* Note if we are writing to reclaim */
if (sp->seg_flags & SEGM_RECLAIM) {
ssp->ss_flags |= SS_RECLAIM;
ssp->ss_reclino = fs->lfs_reclino;
}
devvp = VTOI(fs->lfs_ivnode)->i_devvp;
/* Update the segment usage information. */
@ -2720,7 +2762,6 @@ lfs_shellsort(struct buf **bp_array, int32_t *lb_array, int nmemb, int size)
int
lfs_vref(struct vnode *vp)
{
int error;
struct lfs *fs;
KASSERT(mutex_owned(vp->v_interlock));
@ -2734,12 +2775,13 @@ lfs_vref(struct vnode *vp)
* being able to flush all of the pages from this vnode, which
* will cause it to panic. So, return 0 if a flush is in progress.
*/
error = vget(vp, LK_NOWAIT);
if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
++fs->lfs_flushvp_fakevref;
return 0;
}
return error;
if (IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
++fs->lfs_flushvp_fakevref;
mutex_exit(vp->v_interlock);
return 0;
}
return vget(vp, LK_NOWAIT);
}
/*

lfs_subr.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $ */
/* $NetBSD: lfs_subr.c,v 1.77 2012/01/02 22:10:44 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.77 2012/01/02 22:10:44 perseant Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -335,6 +335,7 @@ lfs_seglock(struct lfs *fs, unsigned long flags)
*/
mutex_enter(&lfs_lock);
++fs->lfs_iocount;
fs->lfs_startseg = fs->lfs_curseg;
mutex_exit(&lfs_lock);
return 0;
}
@ -361,7 +362,7 @@ lfs_unmark_dirop(struct lfs *fs)
for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
nip = TAILQ_NEXT(ip, i_lfs_dchain);
vp = ITOV(ip);
if ((VTOI(vp)->i_flag & (IN_ADIROP | IN_ALLMOD)) == 0) {
if ((ip->i_flag & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) {
--lfs_dirvcount;
--fs->lfs_dirvcount;
vp->v_uflag &= ~VU_DIROP;
@ -372,6 +373,7 @@ lfs_unmark_dirop(struct lfs *fs)
vrele(vp);
mutex_enter(&lfs_lock);
fs->lfs_unlockvp = NULL;
ip->i_flag &= ~IN_CDIROP;
}
}
@ -437,8 +439,7 @@ lfs_segunlock(struct lfs *fs)
mutex_enter(&lfs_lock);
KASSERT(LFS_SEGLOCK_HELD(fs));
if (fs->lfs_seglock == 1) {
if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 &&
LFS_STARVED_FOR_SEGS(fs) == 0)
if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0)
do_unmark_dirop = 1;
mutex_exit(&lfs_lock);
sync = sp->seg_flags & SEGM_SYNC;

lfs_syscalls.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $ */
/* $NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008
@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $");
#ifndef LFS
# define LFS /* for prototypes in syscallargs.h */
@ -291,6 +291,17 @@ lfs_markv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov,
*/
if (v_daddr != LFS_UNUSED_DADDR) {
lfs_vunref(vp);
/*
* If the vnode has LFSI_BMAP, it was
* not found in the cache. Dump it so
* we can reuse the vnode.
* XXX If we knew what segment we were
* XXX supposed to be looking for, we
* XXX would be able to be more selective
* XXX here.
*/
if (ip->i_lfs_iflags & LFSI_BMAP)
vrecycle(vp, NULL, NULL);
numrefed--;
}
@ -760,6 +771,7 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
continue;
} else {
KASSERT(VOP_ISLOCKED(vp));
VTOI(vp)->i_lfs_iflags |= LFSI_BMAP;
VOP_UNLOCK(vp);
numrefed++;
}
@ -814,6 +826,9 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp, BLOCK_INFO *blkiov, int blkcnt)
*/
if (v_daddr != LFS_UNUSED_DADDR) {
lfs_vunref(vp);
/* Recycle as above. */
if (ip->i_lfs_iflags & LFSI_BMAP)
vrecycle(vp, NULL, NULL);
numrefed--;
}

lfs_vfsops.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $ */
/* $NetBSD: lfs_vfsops.c,v 1.292 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.292 2012/01/02 22:10:45 perseant Exp $");
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
@ -129,6 +129,7 @@ extern const struct vnodeopv_desc lfs_specop_opv_desc;
extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
pid_t lfs_writer_daemon = 0;
lwpid_t lfs_writer_lid = 0;
int lfs_do_flush = 0;
#ifdef LFS_KERNEL_RFW
int lfs_do_rfw = 0;
@ -399,85 +400,151 @@ struct pool lfs_lbnentry_pool;
static void
lfs_writerd(void *arg)
{
struct mount *mp, *nmp;
struct lfs *fs;
int fsflags;
int loopcount;
lfs_writer_daemon = curproc->p_pid;
struct mount *mp, *nmp;
struct lfs *fs;
struct vfsops *vfs = NULL;
int fsflags;
int loopcount;
int skipc;
int lfsc;
int wrote_something = 0;
mutex_enter(&lfs_lock);
for (;;) {
mtsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", hz/10,
&lfs_lock);
lfs_writer_daemon = curproc->p_pid;
lfs_writer_lid = curlwp->l_lid;
mutex_exit(&lfs_lock);
/*
* Look through the list of LFSs to see if any of them
* have requested pageouts.
*/
mutex_enter(&mountlist_lock);
for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
mp = nmp) {
if (vfs_busy(mp, &nmp)) {
continue;
}
if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
sizeof(mp->mnt_stat.f_fstypename)) == 0) {
fs = VFSTOUFS(mp)->um_lfs;
mutex_enter(&lfs_lock);
fsflags = 0;
if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP) &&
fs->lfs_dirops == 0)
fsflags |= SEGM_CKP;
if (fs->lfs_pdflush) {
DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
fs->lfs_pdflush = 0;
lfs_flush_fs(fs, fsflags);
mutex_exit(&lfs_lock);
} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "wrdirop");
lfs_flush_pchain(fs);
lfs_writer_leave(fs);
} else
mutex_exit(&lfs_lock);
}
vfs_unbusy(mp, false, &nmp);
}
mutex_exit(&mountlist_lock);
/* Take an extra reference to the LFS vfsops. */
vfs = vfs_getopsbyname(MOUNT_LFS);
mutex_enter(&lfs_lock);
for (;;) {
KASSERT(mutex_owned(&lfs_lock));
if (wrote_something == 0)
mtsleep(&lfs_writer_daemon, PVM, "lfswriter", hz/10 + 1,
&lfs_lock);
KASSERT(mutex_owned(&lfs_lock));
loopcount = 0;
wrote_something = 0;
/*
* If global state wants a flush, flush everything.
*/
mutex_enter(&lfs_lock);
loopcount = 0;
if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
locked_queue_bytes > LFS_MAX_BYTES ||
lfs_subsys_pages > LFS_MAX_PAGES) {
if (lfs_do_flush) {
DLOG((DLOG_FLUSH, "daemon: lfs_do_flush\n"));
DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n"));
}
if (locked_queue_count > LFS_MAX_BUFS) {
DLOG((DLOG_FLUSH, "daemon: lqc = %d, max %d\n",
DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n",
locked_queue_count, LFS_MAX_BUFS));
}
if (locked_queue_bytes > LFS_MAX_BYTES) {
DLOG((DLOG_FLUSH, "daemon: lqb = %ld, max %ld\n",
DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n",
locked_queue_bytes, LFS_MAX_BYTES));
}
if (lfs_subsys_pages > LFS_MAX_PAGES) {
DLOG((DLOG_FLUSH, "daemon: lssp = %d, max %d\n",
DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n",
lfs_subsys_pages, LFS_MAX_PAGES));
}
lfs_flush(NULL, SEGM_WRITERD, 0);
lfs_do_flush = 0;
KASSERT(mutex_owned(&lfs_lock));
continue;
}
}
/* NOTREACHED */
KASSERT(mutex_owned(&lfs_lock));
mutex_exit(&lfs_lock);
/*
* Look through the list of LFSs to see if any of them
* have requested pageouts.
*/
mutex_enter(&mountlist_lock);
lfsc = 0;
skipc = 0;
for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
mp = nmp) {
if (vfs_busy(mp, &nmp)) {
++skipc;
continue;
}
KASSERT(!mutex_owned(&lfs_lock));
if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
sizeof(mp->mnt_stat.f_fstypename)) == 0) {
++lfsc;
fs = VFSTOUFS(mp)->um_lfs;
int32_t ooffset = 0;
fsflags = SEGM_SINGLE;
mutex_enter(&lfs_lock);
ooffset = fs->lfs_offset;
if (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) {
/* Don't try to write if we're suspended */
mutex_exit(&lfs_lock);
vfs_unbusy(mp, false, &nmp);
continue;
}
if (LFS_STARVED_FOR_SEGS(fs)) {
mutex_exit(&lfs_lock);
DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n"));
lfs_wakeup_cleaner(fs);
vfs_unbusy(mp, false, &nmp);
continue;
}
if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP) &&
fs->lfs_dirops == 0) {
fsflags &= ~SEGM_SINGLE;
fsflags |= SEGM_CKP;
DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n"));
lfs_flush_fs(fs, fsflags);
} else if (fs->lfs_pdflush) {
DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
lfs_flush_fs(fs, fsflags);
} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "wrdirop");
lfs_flush_pchain(fs);
lfs_writer_leave(fs);
mutex_enter(&lfs_lock);
}
if (fs->lfs_offset != ooffset)
++wrote_something;
mutex_exit(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
vfs_unbusy(mp, false, &nmp);
}
if (lfsc + skipc == 0) {
#ifdef notyet
mutex_enter(&lfs_lock);
lfs_writer_daemon = 0;
lfs_writer_lid = 0;
mutex_exit(&lfs_lock);
mutex_exit(&mountlist_lock);
break;
#endif
}
mutex_exit(&mountlist_lock);
mutex_enter(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
KASSERT(!mutex_owned(&mountlist_lock));
/* Give up our extra reference so the module can be unloaded. */
mutex_enter(&vfs_list_lock);
if (vfs != NULL)
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
}
/*
@ -1063,16 +1130,12 @@ lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
vput(vp);
/* Start the pagedaemon-anticipating daemon */
if (lfs_writer_daemon == 0 && kthread_create(PRI_BIO, 0, NULL,
mutex_enter(&lfs_lock);
if (lfs_writer_daemon == 0 && lfs_writer_lid == 0 &&
kthread_create(PRI_BIO, 0, NULL,
lfs_writerd, NULL, NULL, "lfs_writer") != 0)
panic("fork lfs_writer");
/*
* XXX: Get extra reference to LFS vfsops. This prevents unload,
* but also prevents kernel panic due to text being unloaded
* from below lfs_writerd. When lfs_writerd can exit, remove
* this!!!
*/
vfs_getopsbyname(MOUNT_LFS);
mutex_exit(&lfs_lock);
printf("WARNING: the log-structured file system is experimental\n"
"WARNING: it may cause system crashes and/or corrupt data\n");
@ -1576,6 +1639,7 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
struct lfs *fs = ip->i_lfs;
struct segment *sp = fs->lfs_sp;
UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
const char * failreason = NULL;
ASSERT_SEGLOCK(fs);
@ -1591,8 +1655,10 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
* We must write everything, however, if our vnode is being
* reclaimed.
*/
if (LFS_STARVED_FOR_SEGS(fs) && vp != fs->lfs_flushvp)
goto tryagain;
if (LFS_STARVED_FOR_SEGS(fs) && !(vp->v_iflag & VI_XLOCK)) {
failreason = "Starved for segs and not flushing vp";
goto tryagain;
}
/*
* Sometimes things slip past the filters in lfs_putpages,
@ -1610,9 +1676,16 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
*
* XXXUBC that last statement is an oversimplification of course.
*/
if (!LFS_SEGLOCK_HELD(fs) ||
(ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) ||
(pgs[0]->offset & fs->lfs_bmask) != 0) {
if (!LFS_SEGLOCK_HELD(fs)) {
failreason = "Seglock not held";
goto tryagain;
}
if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
failreason = "Inode with no_gop_write";
goto tryagain;
}
if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
failreason = "Bad page offset";
goto tryagain;
}
@ -1632,6 +1705,7 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
KASSERT(eof >= 0);
if (startoffset >= eof) {
failreason = "Offset beyond EOF";
goto tryagain;
} else
bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
@ -1646,9 +1720,11 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
pgs[i]->flags &= ~PG_DELWRI;
pgs[i]->flags |= PG_PAGEOUT;
uvm_pageout_start(1);
mutex_enter(vp->v_interlock);
mutex_enter(&uvm_pageqlock);
uvm_pageunwire(pgs[i]);
mutex_exit(&uvm_pageqlock);
mutex_exit(vp->v_interlock);
}
}
@ -1768,7 +1844,7 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
/*
* LFS doesn't like async I/O here, dies with
* and assert in lfs_bwrite(). Is that assert
* an assert in lfs_bwrite(). Is that assert
* valid? I retained non-async behaviour when
* converted this to use nestiobuf --pooka
*/
@ -1805,6 +1881,10 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
lfs_flush(fs, 0, 1);
mutex_exit(&lfs_lock);
}
if ((sp->seg_flags & SEGM_SINGLE) && fs->lfs_curseg != fs->lfs_startseg)
return EAGAIN;
return (0);
tryagain:
@ -1815,18 +1895,13 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
mutex_enter(vp->v_interlock);
/* Tell why we're here, if we know */
if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
DLOG((DLOG_PAGE, "lfs_gop_write: clean pages dirtied\n"));
} else if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
DLOG((DLOG_PAGE, "lfs_gop_write: not on block boundary\n"));
} else if (haveeof && startoffset >= eof) {
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
} else if (LFS_STARVED_FOR_SEGS(fs)) {
DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n"));
} else {
DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n"));
if (failreason != NULL) {
DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason));
}
if (haveeof && startoffset >= eof) {
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
}
mutex_enter(&uvm_pageqlock);
@ -1898,14 +1973,14 @@ lfs_vinit(struct mount *mp, struct vnode **vpp)
i == 0)
continue;
if (ip->i_ffs1_db[i] != 0) {
inconsistent:
lfs_dump_dinode(ip->i_din.ffs1_din);
panic("inconsistent inode");
panic("inconsistent inode (direct)");
}
}
for ( ; i < NDADDR + NIADDR; i++) {
if (ip->i_ffs1_ib[i - NDADDR] != 0) {
goto inconsistent;
lfs_dump_dinode(ip->i_din.ffs1_din);
panic("inconsistent inode (indirect)");
}
}
#endif /* DEBUG */

lfs_vnops.c

@ -1,4 +1,4 @@
/* $NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $ */
/* $NetBSD: lfs_vnops.c,v 1.239 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -60,7 +60,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.239 2012/01/02 22:10:45 perseant Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
@ -363,6 +363,17 @@ lfs_inactive(void *v)
return 0;
}
#ifdef DEBUG
/*
* This might happen on unmount.
* XXX If it happens at any other time, it should be a panic.
*/
if (ap->a_vp->v_uflag & VU_DIROP) {
struct inode *ip = VTOI(ap->a_vp);
printf("lfs_inactive: inactivating VU_DIROP? ino = %d\n", (int)ip->i_number);
}
#endif /* DEBUG */
return ufs_inactive(v);
}
@ -438,7 +449,7 @@ lfs_set_dirop(struct vnode *dvp, struct vnode *vp)
}
++fs->lfs_dirops;
fs->lfs_doifile = 1;
/* fs->lfs_doifile = 1; */ /* XXX why? --ks */
mutex_exit(&lfs_lock);
/* Hold a reference so SET_ENDOP will be happy */
@ -544,13 +555,15 @@ lfs_mark_vnode(struct vnode *vp)
if (!(ip->i_flag & IN_ADIROP)) {
if (!(vp->v_uflag & VU_DIROP)) {
mutex_enter(vp->v_interlock);
(void)lfs_vref(vp);
if (lfs_vref(vp) != 0)
panic("lfs_mark_vnode: could not vref");
++lfs_dirvcount;
++fs->lfs_dirvcount;
TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
vp->v_uflag |= VU_DIROP;
}
++fs->lfs_nadirop;
ip->i_flag &= ~IN_CDIROP;
ip->i_flag |= IN_ADIROP;
} else
KASSERT(vp->v_uflag & VU_DIROP);
@ -1153,7 +1166,8 @@ lfs_strategy(void *v)
struct vnode *vp;
struct inode *ip;
daddr_t tbn;
int i, sn, error, slept;
#define MAXLOOP 25
int i, sn, error, slept, loopcount;
bp = ap->a_bp;
vp = ap->a_vp;
@ -1185,6 +1199,7 @@ lfs_strategy(void *v)
}
slept = 1;
loopcount = 0;
mutex_enter(&lfs_lock);
while (slept && fs->lfs_seglock) {
mutex_exit(&lfs_lock);
@ -1213,12 +1228,19 @@ lfs_strategy(void *v)
PRId64 "\n", ip->i_number, bp->b_lblkno));
mutex_enter(&lfs_lock);
if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
/* Cleaner can't wait for itself */
mtsleep(&fs->lfs_iocount,
(PRIBIO + 1) | PNORELOCK,
"clean2", 0,
&lfs_lock);
/*
* Cleaner can't wait for itself.
* Instead, wait for the blocks
* to be written to disk.
* XXX we need pribio in the test
* XXX here.
*/
mtsleep(&fs->lfs_iocount,
(PRIBIO + 1) | PNORELOCK,
"clean2", hz/10 + 1,
&lfs_lock);
slept = 1;
++loopcount;
break;
} else if (fs->lfs_seglock) {
mtsleep(&fs->lfs_seglock,
@ -1232,6 +1254,10 @@ lfs_strategy(void *v)
}
}
mutex_enter(&lfs_lock);
if (loopcount > MAXLOOP) {
printf("lfs_strategy: breaking out of clean2 loop\n");
break;
}
}
mutex_exit(&lfs_lock);
@ -1240,37 +1266,39 @@ lfs_strategy(void *v)
return (0);
}
void
/*
* Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
* Technically this is a checkpoint (the on-disk state is valid)
* even though we are leaving out all the file data.
*/
int
lfs_flush_dirops(struct lfs *fs)
{
struct inode *ip, *nip;
struct vnode *vp;
extern int lfs_dostats;
struct segment *sp;
int flags = 0;
int error = 0;
ASSERT_MAYBE_SEGLOCK(fs);
KASSERT(fs->lfs_nadirop == 0);
if (fs->lfs_ronly)
return;
return EROFS;
mutex_enter(&lfs_lock);
if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
mutex_exit(&lfs_lock);
return;
return 0;
} else
mutex_exit(&lfs_lock);
if (lfs_dostats)
++lfs_stats.flush_invoked;
/*
* Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
* Technically this is a checkpoint (the on-disk state is valid)
* even though we are leaving out all the file data.
*/
lfs_imtime(fs);
lfs_seglock(fs, SEGM_CKP);
lfs_seglock(fs, flags);
sp = fs->lfs_sp;
/*
@ -1293,6 +1321,8 @@ lfs_flush_dirops(struct lfs *fs)
vp = ITOV(ip);
KASSERT((ip->i_flag & IN_ADIROP) == 0);
KASSERT(vp->v_uflag & VU_DIROP);
KASSERT(!(vp->v_iflag & VI_XLOCK));
/*
* All writes to directories come from dirops; all
@ -1300,9 +1330,7 @@ lfs_flush_dirops(struct lfs *fs)
* cache, which we're not touching. Reads to files
* and/or directories will not be affected by writing
* directory blocks inodes and file inodes. So we don't
* really need to lock. If we don't lock, though,
* make sure that we don't clear IN_MODIFIED
* unnecessarily.
* really need to lock.
*/
if (vp->v_iflag & VI_XLOCK) {
mutex_enter(&lfs_lock);
@ -1313,23 +1341,36 @@ lfs_flush_dirops(struct lfs *fs)
*/
if (vp->v_type != VREG &&
((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
lfs_writefile(fs, sp, vp);
error = lfs_writefile(fs, sp, vp);
if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
!(ip->i_flag & IN_ALLMOD)) {
mutex_enter(&lfs_lock);
LFS_SET_UINO(ip, IN_MODIFIED);
mutex_exit(&lfs_lock);
}
if (error && (sp->seg_flags & SEGM_SINGLE)) {
mutex_enter(&lfs_lock);
error = EAGAIN;
break;
}
}
KDASSERT(ip->i_number != LFS_IFILE_INUM);
(void) lfs_writeinode(fs, sp, ip);
error = lfs_writeinode(fs, sp, ip);
mutex_enter(&lfs_lock);
if (error && (sp->seg_flags & SEGM_SINGLE)) {
error = EAGAIN;
break;
}
/*
* XXX
* LK_EXCLOTHER is dead -- what is intended here?
* if (waslocked == LK_EXCLOTHER)
* LFS_SET_UINO(ip, IN_MODIFIED);
* We might need to update these inodes again,
* for example, if they have data blocks to write.
* Make sure that after this flush, they are still
* marked IN_MODIFIED so that we don't forget to
* write them.
*/
/* XXX only for non-directories? --KS */
LFS_SET_UINO(ip, IN_MODIFIED);
}
mutex_exit(&lfs_lock);
/* We've written all the dirops there are */
@ -1337,6 +1378,8 @@ lfs_flush_dirops(struct lfs *fs)
lfs_finalize_fs_seguse(fs);
(void) lfs_writeseg(fs, sp);
lfs_segunlock(fs);
return error;
}
/*
@ -1346,29 +1389,30 @@ lfs_flush_dirops(struct lfs *fs)
* for any reason, just skip it; if we have to wait for the cleaner,
* abort. The writer daemon will call us again later.
*/
void
int
lfs_flush_pchain(struct lfs *fs)
{
struct inode *ip, *nip;
struct vnode *vp;
extern int lfs_dostats;
struct segment *sp;
int error;
int error, error2;
ASSERT_NO_SEGLOCK(fs);
if (fs->lfs_ronly)
return;
return EROFS;
mutex_enter(&lfs_lock);
if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
mutex_exit(&lfs_lock);
return;
return 0;
} else
mutex_exit(&lfs_lock);
/* Get dirops out of the way */
lfs_flush_dirops(fs);
if ((error = lfs_flush_dirops(fs)) != 0)
return error;
if (lfs_dostats)
++lfs_stats.flush_invoked;
@ -1422,12 +1466,12 @@ lfs_flush_pchain(struct lfs *fs)
mutex_exit(&lfs_lock);
}
KDASSERT(ip->i_number != LFS_IFILE_INUM);
(void) lfs_writeinode(fs, sp, ip);
error2 = lfs_writeinode(fs, sp, ip);
VOP_UNLOCK(vp);
lfs_vunref(vp);
if (error == EAGAIN) {
if (error == EAGAIN || error2 == EAGAIN) {
lfs_writeseg(fs, sp);
mutex_enter(&lfs_lock);
break;
@ -1437,6 +1481,8 @@ lfs_flush_pchain(struct lfs *fs)
mutex_exit(&lfs_lock);
(void) lfs_writeseg(fs, sp);
lfs_segunlock(fs);
return 0;
}
/*
@ -1682,7 +1728,8 @@ segwait_common:
/* Wait for the log to wrap, if asked */
if (*(int *)ap->a_data) {
mutex_enter(ap->a_vp->v_interlock);
lfs_vref(ap->a_vp);
if (lfs_vref(ap->a_vp) != 0)
panic("LFCNWRAPPASS: lfs_vref failed");
VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n");
error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
@ -1746,6 +1793,7 @@ lfs_getpages(void *v)
static void
wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
{
KASSERT(mutex_owned(vp->v_interlock));
if ((pg->flags & PG_BUSY) == 0)
return; /* Nothing to wait for! */
@ -1786,6 +1834,7 @@ static void
write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
int seglocked, const char *label)
{
KASSERT(mutex_owned(vp->v_interlock));
#ifndef BUSYWAIT
struct inode *ip = VTOI(vp);
struct segment *sp = fs->lfs_sp;
@ -1814,12 +1863,15 @@ write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
mutex_enter(vp->v_interlock);
wait_for_page(vp, pg, label);
}
if (label != NULL && count > 1)
printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
label, (count > 0 ? "looping, " : ""), count);
if (label != NULL && count > 1) {
DLOG((DLOG_PAGE, "lfs_putpages[%d]: %s: %sn = %d\n",
curproc->p_pid, label, (count > 0 ? "looping, " : ""),
count));
}
#else
preempt(1);
#endif
KASSERT(mutex_owned(vp->v_interlock));
}
/*
@ -1849,6 +1901,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
int pagedaemon = (curlwp == uvm.pagedaemon_lwp);
KASSERT(mutex_owned(vp->v_interlock));
ASSERT_MAYBE_SEGLOCK(fs);
top:
by_list = (vp->v_uobj.uo_npages <=
@ -1891,6 +1944,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
*/
nonexistent = dirty = 0;
for (i = 0; i == 0 || i < pages_per_block; i++) {
KASSERT(mutex_owned(vp->v_interlock));
if (by_list && pages_per_block <= 1) {
pgs[i] = pg = curpg;
} else {
@ -1916,13 +1970,16 @@ check_dirty(struct lfs *fs, struct vnode *vp,
DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
if (pgp)
*pgp = pg;
KASSERT(mutex_owned(vp->v_interlock));
return -1;
}
while (pg->flags & PG_BUSY) {
wait_for_page(vp, pg, NULL);
KASSERT(mutex_owned(vp->v_interlock));
if (i > 0)
uvm_page_unbusy(pgs, i);
KASSERT(mutex_owned(vp->v_interlock));
goto top;
}
pg->flags |= PG_BUSY;
@ -1944,6 +2001,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
any_dirty += dirty;
KASSERT(nonexistent == 0);
KASSERT(mutex_owned(vp->v_interlock));
/*
* If any are dirty make all dirty; unbusy them,
@ -1952,8 +2010,10 @@ check_dirty(struct lfs *fs, struct vnode *vp,
* they're on their way to disk.
*/
for (i = 0; i == 0 || i < pages_per_block; i++) {
KASSERT(mutex_owned(vp->v_interlock));
pg = pgs[i];
KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
KASSERT(pg->flags & PG_BUSY);
if (dirty) {
pg->flags &= ~PG_CLEAN;
if (flags & PGO_FREE) {
@ -1985,6 +2045,7 @@ check_dirty(struct lfs *fs, struct vnode *vp,
}
}
KASSERT(mutex_owned(vp->v_interlock));
return any_dirty;
}
@ -2048,9 +2109,11 @@ lfs_putpages(void *v)
struct segment *sp;
off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
off_t off, max_endoffset;
bool seglocked, sync, pagedaemon;
bool seglocked, sync, pagedaemon, reclaim;
struct vm_page *pg, *busypg;
UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
int oreclaim = 0;
int donewriting = 0;
#ifdef DEBUG
int debug_n_again, debug_n_dirtyclean;
#endif
@ -2059,8 +2122,11 @@ lfs_putpages(void *v)
ip = VTOI(vp);
fs = ip->i_lfs;
sync = (ap->a_flags & PGO_SYNCIO) != 0;
reclaim = (ap->a_flags & PGO_RECLAIM) != 0;
pagedaemon = (curlwp == uvm.pagedaemon_lwp);
KASSERT(mutex_owned(vp->v_interlock));
/* Putpages does nothing for metadata. */
if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
mutex_exit(vp->v_interlock);
@ -2086,6 +2152,8 @@ lfs_putpages(void *v)
TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
}
mutex_exit(&lfs_lock);
KASSERT(!mutex_owned(vp->v_interlock));
return 0;
}
@ -2093,12 +2161,15 @@ lfs_putpages(void *v)
/*
* Ignore requests to free pages past EOF but in the same block
* as EOF, unless the request is synchronous. (If the request is
* sync, it comes from lfs_truncate.)
* XXXUBC Make these pages look "active" so the pagedaemon won't
* XXXUBC bother us with them again.
* as EOF, unless the vnode is being reclaimed or the request
* is synchronous. (If the request is sync, it comes from
* lfs_truncate.)
*
* To avoid being flooded with this request, make these pages
* look "active".
*/
if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
if (!sync && !reclaim &&
ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
origoffset = ap->a_offlo;
for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
pg = uvm_pagelookup(&vp->v_uobj, off);
@ -2154,8 +2225,13 @@ lfs_putpages(void *v)
* If not cleaning, just send the pages through genfs_putpages
* to be returned to the pool.
*/
if (!(ap->a_flags & PGO_CLEANIT))
return genfs_putpages(v);
if (!(ap->a_flags & PGO_CLEANIT)) {
DLOG((DLOG_PAGE, "lfs_putpages: no cleanit vn %p ino %d (flags %x)\n",
vp, (int)ip->i_number, ap->a_flags));
int r = genfs_putpages(v);
KASSERT(!mutex_owned(vp->v_interlock));
return r;
}
/* Set PGO_BUSYFAIL to avoid deadlocks */
ap->a_flags |= PGO_BUSYFAIL;
@ -2169,6 +2245,7 @@ lfs_putpages(void *v)
#endif
do {
int r;
KASSERT(mutex_owned(vp->v_interlock));
/* Count the number of dirty pages */
r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
@ -2191,8 +2268,10 @@ lfs_putpages(void *v)
r = genfs_do_putpages(vp, startoffset, endoffset,
ap->a_flags & ~PGO_SYNCIO, &busypg);
ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
if (r != EDEADLK)
return r;
if (r != EDEADLK) {
KASSERT(!mutex_owned(vp->v_interlock));
return r;
}
/* One of the pages was busy. Start over. */
mutex_enter(vp->v_interlock);
@ -2204,8 +2283,8 @@ lfs_putpages(void *v)
#ifdef DEBUG
if (debug_n_dirtyclean > TOOMANY)
printf("lfs_putpages: dirtyclean: looping, n = %d\n",
debug_n_dirtyclean);
DLOG((DLOG_PAGE, "lfs_putpages: dirtyclean: looping, n = %d\n",
debug_n_dirtyclean));
#endif
/*
@ -2228,6 +2307,7 @@ lfs_putpages(void *v)
wakeup(&lfs_writer_daemon);
mutex_exit(&lfs_lock);
preempt();
KASSERT(!mutex_owned(vp->v_interlock));
return EWOULDBLOCK;
}
@ -2239,26 +2319,28 @@ lfs_putpages(void *v)
*/
if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
(vp->v_uflag & VU_DIROP)) {
int locked;
DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
/* XXX VOP_ISLOCKED() may not be used for lock decisions. */
locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
lfs_writer_enter(fs, "ppdirop");
/* Note if we hold the vnode locked */
if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
{
DLOG((DLOG_PAGE, "lfs_putpages: dirop inode already locked\n"));
} else {
DLOG((DLOG_PAGE, "lfs_putpages: dirop inode not locked\n"));
}
mutex_exit(vp->v_interlock);
lfs_writer_enter(fs, "ppdirop");
if (locked)
VOP_UNLOCK(vp); /* XXX why? */
mutex_enter(&lfs_lock);
lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
mutex_exit(&lfs_lock);
if (locked)
VOP_LOCK(vp, LK_EXCLUSIVE);
mutex_enter(vp->v_interlock);
lfs_writer_leave(fs);
/* XXX the flush should have taken care of this one too! */
/* The flush will have cleaned out this vnode as well,
no need to do more to it. */
}
/*
@ -2286,8 +2368,10 @@ lfs_putpages(void *v)
if (!seglocked) {
mutex_exit(vp->v_interlock);
error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
if (error != 0)
return error;
if (error != 0) {
KASSERT(!mutex_owned(vp->v_interlock));
return error;
}
mutex_enter(vp->v_interlock);
lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
}
@ -2295,6 +2379,12 @@ lfs_putpages(void *v)
KASSERT(sp->vp == NULL);
sp->vp = vp;
/* Note segments written by reclaim; only for debugging */
if ((vp->v_iflag & VI_XLOCK) != 0) {
sp->seg_flags |= SEGM_RECLAIM;
fs->lfs_reclino = ip->i_number;
}
/*
* Ensure that the partial segment is marked SS_DIROP if this
* vnode is a DIROP.
@ -2313,10 +2403,11 @@ lfs_putpages(void *v)
#endif
do {
busypg = NULL;
KASSERT(mutex_owned(vp->v_interlock));
if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
ap->a_flags, 0, &busypg) < 0) {
mutex_exit(vp->v_interlock);
/* XXX why? --ks */
mutex_enter(vp->v_interlock);
write_and_wait(fs, vp, busypg, seglocked, NULL);
if (!seglocked) {
@ -2330,8 +2421,12 @@ lfs_putpages(void *v)
}
busypg = NULL;
KASSERT(!mutex_owned(&uvm_pageqlock));
oreclaim = (ap->a_flags & PGO_RECLAIM);
ap->a_flags &= ~PGO_RECLAIM;
error = genfs_do_putpages(vp, startoffset, endoffset,
ap->a_flags, &busypg);
ap->a_flags |= oreclaim;
if (error == EDEADLK || error == EAGAIN) {
DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
@ -2339,20 +2434,40 @@ lfs_putpages(void *v)
ip->i_number, fs->lfs_offset,
dtosn(fs, fs->lfs_offset)));
mutex_enter(vp->v_interlock);
write_and_wait(fs, vp, busypg, seglocked, "again");
if (oreclaim) {
mutex_enter(vp->v_interlock);
write_and_wait(fs, vp, busypg, seglocked, "again");
mutex_exit(vp->v_interlock);
} else {
if ((sp->seg_flags & SEGM_SINGLE) &&
fs->lfs_curseg != fs->lfs_startseg)
donewriting = 1;
}
} else if (error) {
DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
" %d ino %d off %x (seg %d)\n", error,
(int)ip->i_number, fs->lfs_offset,
dtosn(fs, fs->lfs_offset)));
}
/* genfs_do_putpages loses the interlock */
#ifdef DEBUG
++debug_n_again;
#endif
} while (error == EDEADLK);
if (oreclaim && error == EAGAIN) {
DLOG((DLOG_PAGE, "vp %p ino %d vi_flags %x a_flags %x avoiding vclean panic\n",
vp, (int)ip->i_number, vp->v_iflag, ap->a_flags));
mutex_enter(vp->v_interlock);
}
if (error == EDEADLK)
mutex_enter(vp->v_interlock);
} while (error == EDEADLK || (oreclaim && error == EAGAIN));
#ifdef DEBUG
if (debug_n_again > TOOMANY)
printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);
DLOG((DLOG_PAGE, "lfs_putpages: again: looping, n = %d\n", debug_n_again));
#endif
KASSERT(sp != NULL && sp->vp == vp);
if (!seglocked) {
if (!seglocked && !donewriting) {
sp->vp = NULL;
/* Write indirect blocks as well */
@ -2376,8 +2491,10 @@ lfs_putpages(void *v)
* If we were called from lfs_writefile, we don't need to clean up
* the FIP or unlock the segment lock. We're done.
*/
if (seglocked)
if (seglocked) {
KASSERT(!mutex_owned(vp->v_interlock));
return error;
}
/* Clean up FIP and send it to disk. */
lfs_release_finfo(fs);
@ -2417,6 +2534,7 @@ lfs_putpages(void *v)
}
mutex_exit(vp->v_interlock);
}
KASSERT(!mutex_owned(vp->v_interlock));
return error;
}

inode.h

@ -1,4 +1,4 @@
/* $NetBSD: inode.h,v 1.58 2011/07/12 02:22:13 dholland Exp $ */
/* $NetBSD: inode.h,v 1.59 2012/01/02 22:10:45 perseant Exp $ */
/*
* Copyright (c) 1982, 1989, 1993
@ -242,7 +242,7 @@ struct inode {
#define IN_ADIROP 0x0200 /* LFS: dirop in progress */
#define IN_SPACECOUNTED 0x0400 /* Blocks to be freed in free count. */
#define IN_PAGING 0x1000 /* LFS: file is on paging queue */
#define IN_CDIROP 0x4000 /* LFS: dirop completed pending i/o */
#if defined(_KERNEL)
/*

ufs_readwrite.c

@ -1,4 +1,4 @@
/* $NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $ */
/* $NetBSD: ufs_readwrite.c,v 1.101 2012/01/02 22:10:45 perseant Exp $ */
/*-
* Copyright (c) 1993
@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $");
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.101 2012/01/02 22:10:45 perseant Exp $");
#ifdef LFS_READWRITE
#define FS struct lfs
@ -294,6 +294,7 @@ WRITE(void *v)
#ifdef LFS_READWRITE
async = true;
lfs_availwait(fs, btofsb(fs, uio->uio_resid));
lfs_check(vp, LFS_UNUSED_LBN, 0);
#endif /* !LFS_READWRITE */
if (!usepc)