Improvements to LFS's paging mechanism, to wit:

* Acknowledge that sometimes there are more dirty pages to be written to
  disk than clean segments.  When we reach the danger line,
  lfs_gop_write() now returns EAGAIN.  The caller of VOP_PUTPAGES(), if
  it holds the segment lock, drops it and waits for the cleaner to make
  room before continuing.

* Note and avoid a three-way deadlock in lfs_putpages (a writer holding
  a page busy blocks on the cleaner while the cleaner blocks on the
  segment lock while lfs_putpages blocks on the page).
This commit is contained in:
perseant 2006-03-24 20:05:32 +00:00
parent 1acb7de56f
commit dddf5c5171
6 changed files with 198 additions and 71 deletions

View File

@ -1,4 +1,4 @@
/* $NetBSD: lfs.h,v 1.96 2006/03/17 23:21:01 tls Exp $ */
/* $NetBSD: lfs.h,v 1.97 2006/03/24 20:05:32 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -120,6 +120,9 @@
(((uvmexp.active + uvmexp.inactive + uvmexp.free) * uvmexp.filemax) >> 8)
#define LFS_BUFWAIT 2 /* How long to wait if over *_WAIT_* */
/* How starved can we be before we start holding back page writes */
#define LFS_STARVED_FOR_SEGS(fs) ((fs)->lfs_nclean < (fs)->lfs_minfreeseg / 2 + 1)
/*
* Reserved blocks for lfs_malloc
*/

View File

@ -1,4 +1,4 @@
/* $NetBSD: lfs_extern.h,v 1.75 2006/01/14 17:41:17 yamt Exp $ */
/* $NetBSD: lfs_extern.h,v 1.76 2006/03/24 20:05:32 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -183,7 +183,7 @@ struct ufs1_dinode *lfs_ifind(struct lfs *, ino_t, struct buf *);
void lfs_imtime(struct lfs *);
int lfs_vflush(struct vnode *);
int lfs_segwrite(struct mount *, int);
void lfs_writefile(struct lfs *, struct segment *, struct vnode *);
int lfs_writefile(struct lfs *, struct segment *, struct vnode *);
int lfs_writeinode(struct lfs *, struct segment *, struct inode *);
int lfs_gatherblock(struct segment *, struct buf *, int *);
int lfs_gather(struct lfs *, struct segment *, struct vnode *, int (*match )(struct lfs *, struct buf *));
@ -212,6 +212,7 @@ void *lfs_malloc(struct lfs *, size_t, int);
void lfs_free(struct lfs *, void *, int);
int lfs_seglock(struct lfs *, unsigned long);
void lfs_segunlock(struct lfs *);
void lfs_segunlock_relock(struct lfs *);
int lfs_writer_enter(struct lfs *, const char *);
void lfs_writer_leave(struct lfs *);

View File

@ -1,4 +1,4 @@
/* $NetBSD: lfs_segment.c,v 1.170 2006/03/17 23:21:01 tls Exp $ */
/* $NetBSD: lfs_segment.c,v 1.171 2006/03/24 20:05:32 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.170 2006/03/17 23:21:01 tls Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.171 2006/03/24 20:05:32 perseant Exp $");
#ifdef DEBUG
# define vndebug(vp, str) do { \
@ -130,6 +130,15 @@ static void lfs_cluster_callback(struct buf *);
((fs)->lfs_fsbpseg - ((fs)->lfs_offset - (fs)->lfs_curseg) > \
fragstofsb((fs), (fs)->lfs_frag))
/*
* Figure out whether we should do a checkpoint write or go ahead with
* an ordinary write.
*/
#define LFS_SHOULD_CHECKPOINT(fs, flags) \
(fs->lfs_nactive > LFS_MAX_ACTIVE || \
(flags & SEGM_CKP) || \
fs->lfs_nclean < LFS_MAX_ACTIVE)
int lfs_match_fake(struct lfs *, struct buf *);
void lfs_newseg(struct lfs *);
/* XXX ondisk32 */
@ -194,13 +203,13 @@ lfs_vflush(struct vnode *vp)
struct buf *bp, *nbp, *tbp, *tnbp;
int error, s;
int flushed;
#if 0
int redo;
#endif
int relock;
ip = VTOI(vp);
fs = VFSTOUFS(vp->v_mount)->um_lfs;
relock = 0;
top:
ASSERT_NO_SEGLOCK(fs);
if (ip->i_flag & IN_CLEANING) {
ivndebug(vp,"vflush/in_cleaning");
@ -317,8 +326,7 @@ lfs_vflush(struct vnode *vp)
}
SET_FLUSHING(fs,vp);
if (fs->lfs_nactive > LFS_MAX_ACTIVE ||
(fs->lfs_sp->seg_flags & SEGM_CKP)) {
if (LFS_SHOULD_CHECKPOINT(fs, fs->lfs_sp->seg_flags)) {
error = lfs_segwrite(vp->v_mount, SEGM_CKP | SEGM_SYNC);
CLR_FLUSHING(fs,vp);
lfs_segunlock(fs);
@ -352,28 +360,25 @@ lfs_vflush(struct vnode *vp)
}
#endif
#if 1
do {
do {
if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL)
lfs_writefile(fs, sp, vp);
if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
relock = lfs_writefile(fs, sp, vp);
if (relock) {
/*
* Might have to wait for the
* cleaner to run; but we're
* still not done with this vnode.
*/
lfs_writeseg(fs, sp);
lfs_segunlock(fs);
lfs_segunlock_relock(fs);
goto top;
}
}
} while (lfs_writeinode(fs, sp, ip));
} while (lfs_writeseg(fs, sp) && ip->i_number == LFS_IFILE_INUM);
#else
if (flushed && vp != fs->lfs_ivnode)
lfs_writeseg(fs, sp);
else do {
simple_lock(&fs->lfs_interlock);
fs->lfs_flags &= ~LFS_IFDIRTY;
simple_unlock(&fs->lfs_interlock);
lfs_writefile(fs, sp, vp);
redo = lfs_writeinode(fs, sp, ip);
redo += lfs_writeseg(fs, sp);
simple_lock(&fs->lfs_interlock);
redo += (fs->lfs_flags & LFS_IFDIRTY);
simple_unlock(&fs->lfs_interlock);
} while (redo && vp == fs->lfs_ivnode);
#endif
if (lfs_dostats) {
++lfs_stats.nwrites;
if (sp->seg_flags & SEGM_SYNC)
@ -422,6 +427,7 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
struct inode *ip;
struct vnode *vp, *nvp;
int inodes_written = 0, only_cleaning;
int error = 0;
ASSERT_SEGLOCK(fs);
#ifndef LFS_NO_BACKVP_HACK
@ -498,7 +504,24 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
((ip->i_flag & IN_ALLMOD) == IN_CLEANING);
if (ip->i_number != LFS_IFILE_INUM) {
lfs_writefile(fs, sp, vp);
error = lfs_writefile(fs, sp, vp);
if (error) {
lfs_vunref(vp);
if (error == EAGAIN) {
/*
* This error from lfs_putpages
* indicates we need to drop
* the segment lock and start
* over after the cleaner has
* had a chance to run.
*/
lfs_writeseg(fs, sp);
break;
}
error = 0; /* XXX not quite right */
continue;
}
if (!VPISEMPTY(vp)) {
if (WRITEINPROG(vp)) {
ivndebug(vp,"writevnodes/write2");
@ -516,7 +539,7 @@ lfs_writevnodes(struct lfs *fs, struct mount *mp, struct segment *sp, int op)
else
lfs_vunref(vp);
}
return inodes_written;
return error;
}
/*
@ -536,6 +559,7 @@ lfs_segwrite(struct mount *mp, int flags)
int writer_set = 0;
int dirty;
int redo;
int um_error;
fs = VFSTOUFS(mp)->um_lfs;
ASSERT_MAYBE_SEGLOCK(fs);
@ -550,7 +574,8 @@ lfs_segwrite(struct mount *mp, int flags)
* the maximum possible number of buffers which can be described in a
* single summary block.
*/
do_ckp = (flags & SEGM_CKP) || fs->lfs_nactive > LFS_MAX_ACTIVE;
do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
sp = fs->lfs_sp;
@ -566,22 +591,23 @@ lfs_segwrite(struct mount *mp, int flags)
if (sp->seg_flags & SEGM_CLEAN)
lfs_writevnodes(fs, mp, sp, VN_CLEAN);
else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
lfs_writevnodes(fs, mp, sp, VN_REG);
if (!fs->lfs_dirops || !fs->lfs_flushvp) {
error = lfs_writer_enter(fs, "lfs writer");
if (error) {
DLOG((DLOG_SEG, "segwrite mysterious error\n"));
/* XXX why not segunlock? */
pool_put(&fs->lfs_bpppool, sp->bpp);
sp->bpp = NULL;
pool_put(&fs->lfs_segpool, sp);
sp = fs->lfs_sp = NULL;
return (error);
do {
um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
if (!fs->lfs_dirops || !fs->lfs_flushvp) {
if (!writer_set) {
lfs_writer_enter(fs, "lfs writer");
writer_set = 1;
}
error = lfs_writevnodes(fs, mp, sp, VN_DIROP);
if (um_error == 0)
um_error = error;
((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
}
writer_set = 1;
lfs_writevnodes(fs, mp, sp, VN_DIROP);
((SEGSUM *)(sp->segsum))->ss_flags &= ~(SS_CONT);
}
if (do_ckp && um_error) {
lfs_segunlock_relock(fs);
sp = fs->lfs_sp;
}
} while (do_ckp && um_error != 0);
}
/*
@ -640,8 +666,13 @@ lfs_segwrite(struct mount *mp, int flags)
ip = VTOI(vp);
if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL)
if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
/*
* Ifile has no pages, so we don't need
* to check error return here.
*/
lfs_writefile(fs, sp, vp);
}
if (ip->i_flag & IN_ALLMOD)
++did_ckp;
@ -717,7 +748,7 @@ lfs_segwrite(struct mount *mp, int flags)
/*
* Write the dirty blocks associated with a vnode.
*/
void
int
lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
{
struct buf *bp;
@ -725,8 +756,10 @@ lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
struct inode *ip;
IFILE *ifp;
int i, frag;
int error;
ASSERT_SEGLOCK(fs);
error = 0;
ip = VTOI(vp);
if (sp->seg_bytes_left < fs->lfs_bsize ||
@ -772,8 +805,8 @@ lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
*/
if (!IS_FLUSHING(fs, vp)) {
simple_lock(&vp->v_interlock);
VOP_PUTPAGES(vp, 0, 0,
PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED);
error = VOP_PUTPAGES(vp, 0, 0,
PGO_CLEANIT | PGO_ALLPAGES | PGO_LOCKED);
}
}
@ -819,6 +852,8 @@ lfs_writefile(struct lfs *fs, struct segment *sp, struct vnode *vp)
sp->sum_bytes_left += FINFOSIZE;
--((SEGSUM *)(sp->segsum))->ss_nfinfo;
}
return error;
}
int

View File

@ -1,4 +1,4 @@
/* $NetBSD: lfs_subr.c,v 1.56 2006/01/14 17:41:17 yamt Exp $ */
/* $NetBSD: lfs_subr.c,v 1.57 2006/03/24 20:05:32 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.56 2006/01/14 17:41:17 yamt Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.57 2006/03/24 20:05:32 perseant Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -458,18 +458,20 @@ lfs_segunlock(struct lfs *fs)
simple_lock(&fs->lfs_interlock);
LOCK_ASSERT(LFS_SEGLOCK_HELD(fs));
if (fs->lfs_seglock == 1) {
if ((sp->seg_flags & SEGM_PROT) == 0)
if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 &&
LFS_STARVED_FOR_SEGS(fs) == 0)
do_unmark_dirop = 1;
simple_unlock(&fs->lfs_interlock);
sync = sp->seg_flags & SEGM_SYNC;
ckp = sp->seg_flags & SEGM_CKP;
if (sp->bpp != sp->cbpp) {
/* Free allocated segment summary */
fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize);
bp = *sp->bpp;
lfs_freebuf(fs, bp);
} else
DLOG((DLOG_SEG, "lfs_segunlock: unlock to 0 with no summary"));
/* We should have a segment summary, and nothing else */
KASSERT(sp->cbpp == sp->bpp + 1);
/* Free allocated segment summary */
fs->lfs_offset -= btofsb(fs, fs->lfs_sumsize);
bp = *sp->bpp;
lfs_freebuf(fs, bp);
pool_put(&fs->lfs_bpppool, sp->bpp);
sp->bpp = NULL;
@ -607,3 +609,43 @@ lfs_writer_leave(struct lfs *fs)
if (dowakeup)
wakeup(&fs->lfs_dirops);
}
/*
 * Unlock, wait for the cleaner, then relock to where we were before.
 * To be used only at a fairly high level, to address a paucity of free
 * segments propagated back from lfs_gop_write().
 *
 * NOTE(review): the caller may hold the segment lock recursively; this
 * routine fully releases all n levels and re-acquires the same count on
 * the way out, so the caller's lock depth is preserved.
 */
void
lfs_segunlock_relock(struct lfs *fs)
{
	/* Current recursion depth of the segment lock; restored below. */
	int n = fs->lfs_seglock;
	u_int16_t seg_flags;

	/* Nothing to do if we don't actually hold the segment lock. */
	if (n == 0)
		return;

	/* Write anything we've already gathered to disk */
	lfs_writeseg(fs, fs->lfs_sp);

	/* Save segment flags for later */
	seg_flags = fs->lfs_sp->seg_flags;

	fs->lfs_sp->seg_flags |= SEGM_PROT; /* Don't unmark dirop nodes */
	/* Drop every level of the (possibly recursive) segment lock. */
	while(fs->lfs_seglock)
		lfs_segunlock(fs);

	/*
	 * Wait for the cleaner.  Kick it first so it notices the
	 * segments we just wrote, then sleep until enough segments are
	 * clean (LFS_STARVED_FOR_SEGS becomes false).  The sleep is on
	 * lfs_avail under the interlock; the cleaner wakes us as space
	 * becomes available.
	 */
	wakeup(&lfs_allclean_wakeup);
	wakeup(&fs->lfs_nextseg);
	simple_lock(&fs->lfs_interlock);
	while (LFS_STARVED_FOR_SEGS(fs))
		ltsleep(&fs->lfs_avail, PRIBIO, "relock", 0,
			&fs->lfs_interlock);
	simple_unlock(&fs->lfs_interlock);

	/* Put the segment lock back the way it was. */
	/* Re-acquire all n levels with the flags saved before SEGM_PROT
	 * was ORed in, so the new segment starts with the caller's
	 * original flags. */
	while(n--)
		lfs_seglock(fs, seg_flags);

	return;
}

View File

@ -1,4 +1,4 @@
/* $NetBSD: lfs_vfsops.c,v 1.194 2006/03/17 23:21:01 tls Exp $ */
/* $NetBSD: lfs_vfsops.c,v 1.195 2006/03/24 20:05:32 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.194 2006/03/17 23:21:01 tls Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.195 2006/03/24 20:05:32 perseant Exp $");
#if defined(_KERNEL_OPT)
#include "opt_quota.h"
@ -1363,6 +1363,9 @@ lfs_unmount(struct mount *mp, int mntflags, struct lwp *l)
ump = VFSTOUFS(mp);
fs = ump->um_lfs;
/* Write everything we've got */
lfs_segwrite(mp, SEGM_CKP);
/* wake up the cleaner so it can die */
wakeup(&fs->lfs_nextseg);
wakeup(&lfs_allclean_wakeup);
@ -1957,6 +1960,15 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
/* The Ifile lives in the buffer cache */
KASSERT(vp != fs->lfs_ivnode);
/*
* We don't want to fill the disk before the cleaner has a chance
* to make room for us. If we're in danger of doing that, fail
* with EAGAIN. The caller will have to notice this, unlock
* so the cleaner can run, relock and try again.
*/
if (LFS_STARVED_FOR_SEGS(fs))
goto tryagain;
/*
* Sometimes things slip past the filters in lfs_putpages,
* and the pagedaemon tries to write pages---problem is
@ -1991,6 +2003,7 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
error = 0;
pg = pgs[0];
startoffset = pg->offset;
KASSERT(eof >= 0);
if (startoffset >= eof) {
goto tryagain;
} else
@ -2199,6 +2212,8 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
else if (LFS_STARVED_FOR_SEGS(fs))
DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n"));
else
DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n"));
@ -2213,7 +2228,8 @@ lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
}
uvm_pageactivate(pg);
pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
DLOG((DLOG_PAGE, "pg[%d] = %p\n", i, pg));
DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
vp, pg->offset));
DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags));
DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags));
DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon));

View File

@ -1,4 +1,4 @@
/* $NetBSD: lfs_vnops.c,v 1.157 2005/12/11 12:25:26 christos Exp $ */
/* $NetBSD: lfs_vnops.c,v 1.158 2006/03/24 20:05:32 perseant Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.157 2005/12/11 12:25:26 christos Exp $");
__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.158 2006/03/24 20:05:32 perseant Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -1326,6 +1326,7 @@ lfs_fcntl(void *v)
oclean = cip->clean;
LFS_SYNC_CLEANERINFO(cip, fs, bp, 1);
lfs_segwrite(ap->a_vp->v_mount, SEGM_FORCE_CKP);
fs->lfs_sp->seg_flags |= SEGM_PROT;
lfs_segunlock(fs);
lfs_writer_leave(fs);
@ -1478,6 +1479,22 @@ check_dirty(struct lfs *fs, struct vnode *vp,
}
}
KASSERT(pg != NULL);
/*
* If we're holding the segment lock, we can deadlock
* against a process that has our page and is waiting
* for the cleaner, while the cleaner waits for the
* segment lock. Just bail in that case.
*/
if ((pg->flags & PG_BUSY) && LFS_SEGLOCK_HELD(fs)) {
if (by_list) {
if (i > 0)
uvm_page_unbusy(pgs, i);
DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way deadlock\n"));
return -1;
}
}
while (pg->flags & PG_BUSY) {
pg->flags |= PG_WANTED;
UVM_UNLOCK_AND_WAIT(pg, &vp->v_interlock, 0,
@ -1653,7 +1670,8 @@ lfs_putpages(void *v)
/*
* Ignore requests to free pages past EOF but in the same block
* as EOF, unless the request is synchronous. (XXX why sync?)
* as EOF, unless the request is synchronous. (If the request is
* sync, it comes from lfs_truncate.)
* XXXUBC Make these pages look "active" so the pagedaemon won't
* XXXUBC bother us with them again.
*/
@ -1723,8 +1741,13 @@ lfs_putpages(void *v)
int r;
/* If no pages are dirty, we can just use genfs_putpages. */
if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
ap->a_flags, 1) != 0)
r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
ap->a_flags, 1);
if (r < 0) {
simple_unlock(&vp->v_interlock);
return EDEADLK;
}
if (r > 0)
break;
/*
@ -1861,9 +1884,15 @@ lfs_putpages(void *v)
* well.
*/
again:
check_dirty(fs, vp, startoffset, endoffset, blkeof, ap->a_flags, 0);
if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
ap->a_flags, 0) < 0) {
simple_unlock(&vp->v_interlock);
sp->vp = NULL;
return EDEADLK;
}
if ((error = genfs_putpages(v)) == EDEADLK) {
error = genfs_putpages(v);
if (error == EDEADLK || error == EAGAIN) {
DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
" EDEADLK [2] ino %d off %x (seg %d)\n",
ip->i_number, fs->lfs_offset,
@ -1892,7 +1921,8 @@ again:
/* We've lost the interlock. Start over. */
simple_lock(&vp->v_interlock);
goto again;
if (error == EDEADLK)
goto again;
}
KASSERT(sp->vp == vp);