2003-01-18 11:51:40 +03:00
|
|
|
/* $NetBSD: sync_subr.c,v 1.12 2003/01/18 09:18:07 thorpej Exp $ */
|
1999-11-15 21:49:07 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Copyright 1997 Marshall Kirk McKusick. All Rights Reserved.
|
|
|
|
*
|
|
|
|
* This code is derived from work done by Greg Ganger at the
|
|
|
|
* University of Michigan.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. None of the names of McKusick, Ganger, or the University of Michigan
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
2001-11-10 16:33:40 +03:00
|
|
|
#include <sys/cdefs.h>
|
2003-01-18 11:51:40 +03:00
|
|
|
__KERNEL_RCSID(0, "$NetBSD: sync_subr.c,v 1.12 2003/01/18 09:18:07 thorpej Exp $");
|
2001-11-10 16:33:40 +03:00
|
|
|
|
1999-11-15 21:49:07 +03:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
|
|
|
#include <sys/kernel.h>
|
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <sys/mount.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include <sys/vnode.h>
|
|
|
|
#include <sys/buf.h>
|
|
|
|
#include <sys/errno.h>
|
|
|
|
#include <sys/malloc.h>
|
|
|
|
|
|
|
|
#include <miscfs/genfs/genfs.h>
|
|
|
|
#include <miscfs/syncfs/syncfs.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Defines and variables for the syncer process.
|
|
|
|
*/
|
|
|
|
int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
|
|
|
|
time_t syncdelay = 30; /* max time to delay syncing data */
|
|
|
|
time_t filedelay = 30; /* time to delay syncing files */
|
2000-10-06 23:08:00 +04:00
|
|
|
time_t dirdelay = 15; /* time to delay syncing directories */
|
1999-11-15 21:49:07 +03:00
|
|
|
time_t metadelay = 10; /* time to delay syncing metadata */
|
|
|
|
|
|
|
|
struct lock syncer_lock; /* used to freeze syncer */
|
|
|
|
|
|
|
|
static int rushjob; /* number of slots to run ASAP */
|
|
|
|
static int stat_rush_requests; /* number of times I/O speeded up */
|
|
|
|
|
|
|
|
static int syncer_delayno = 0;
|
|
|
|
static long syncer_last;
|
|
|
|
static struct synclist *syncer_workitem_pending;
|
2003-01-18 11:51:40 +03:00
|
|
|
struct lwp *updateproc = NULL;
|
1999-11-15 21:49:07 +03:00
|
|
|
|
|
|
|
void
|
|
|
|
vn_initialize_syncerd()
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
syncer_last = SYNCER_MAXDELAY + 2;
|
|
|
|
|
|
|
|
syncer_workitem_pending = malloc(syncer_last * sizeof (struct synclist),
|
|
|
|
M_VNODE, M_WAITOK);
|
|
|
|
|
|
|
|
for (i = 0; i < syncer_last; i++)
|
|
|
|
LIST_INIT(&syncer_workitem_pending[i]);
|
|
|
|
|
|
|
|
lockinit(&syncer_lock, PVFS, "synclk", 0, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The workitem queue.
|
|
|
|
*
|
|
|
|
* It is useful to delay writes of file data and filesystem metadata
|
|
|
|
* for tens of seconds so that quickly created and deleted files need
|
|
|
|
* not waste disk bandwidth being created and removed. To realize this,
|
|
|
|
* we append vnodes to a "workitem" queue. When running with a soft
|
|
|
|
* updates implementation, most pending metadata dependencies should
|
|
|
|
* not wait for more than a few seconds. Thus, metadata writes to mounted block devices
|
|
|
|
* are delayed only about a half the time that file data is delayed.
|
|
|
|
* Similarly, directory updates are more critical, so are only delayed
|
|
|
|
* about a third the time that file data is delayed. Thus, there are
|
|
|
|
* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
|
|
|
|
* one each second (driven off the filesystem syncer process). The
|
|
|
|
* syncer_delayno variable indicates the next queue that is to be processed.
|
|
|
|
* Items that need to be processed soon are placed in this queue:
|
|
|
|
*
|
|
|
|
* syncer_workitem_pending[syncer_delayno]
|
|
|
|
*
|
|
|
|
* A delay of fifteen seconds is done by placing the request fifteen
|
|
|
|
* entries later in the queue:
|
|
|
|
*
|
|
|
|
* syncer_workitem_pending[(syncer_delayno + 15) % syncer_last]
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add an item to the syncer work queue.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vn_syncer_add_to_worklist(vp, delay)
|
|
|
|
struct vnode *vp;
|
|
|
|
int delay;
|
|
|
|
{
|
|
|
|
int s, slot;
|
|
|
|
|
|
|
|
s = splbio();
|
|
|
|
|
|
|
|
if (vp->v_flag & VONWORKLST) {
|
|
|
|
LIST_REMOVE(vp, v_synclist);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (delay > syncer_maxdelay - 2)
|
|
|
|
delay = syncer_maxdelay - 2;
|
|
|
|
slot = (syncer_delayno + delay) % syncer_last;
|
|
|
|
|
|
|
|
LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
|
|
|
|
vp->v_flag |= VONWORKLST;
|
|
|
|
splx(s);
|
|
|
|
}
|
|
|
|
|
2000-07-09 04:59:03 +04:00
|
|
|
/*
|
|
|
|
* Remove an item fromthe syncer work queue.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vn_syncer_remove_from_worklist(vp)
|
|
|
|
struct vnode *vp;
|
|
|
|
{
|
|
|
|
int s;
|
|
|
|
|
|
|
|
s = splbio();
|
|
|
|
|
|
|
|
if (vp->v_flag & VONWORKLST) {
|
2001-12-06 07:29:55 +03:00
|
|
|
vp->v_flag &= ~VONWORKLST;
|
2000-07-09 04:59:03 +04:00
|
|
|
LIST_REMOVE(vp, v_synclist);
|
|
|
|
}
|
|
|
|
|
|
|
|
splx(s);
|
|
|
|
}
|
|
|
|
|
1999-11-15 21:49:07 +03:00
|
|
|
/*
|
|
|
|
* System filesystem synchronizer daemon.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
sched_sync(v)
|
|
|
|
void *v;
|
|
|
|
{
|
|
|
|
struct synclist *slp;
|
|
|
|
struct vnode *vp;
|
|
|
|
long starttime;
|
|
|
|
int s;
|
|
|
|
|
2003-01-18 11:51:40 +03:00
|
|
|
updateproc = curlwp;
|
1999-11-15 21:49:07 +03:00
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
starttime = time.tv_sec;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Push files whose dirty time has expired. Be careful
|
|
|
|
* of interrupt race on slp queue.
|
|
|
|
*/
|
|
|
|
s = splbio();
|
|
|
|
slp = &syncer_workitem_pending[syncer_delayno];
|
|
|
|
syncer_delayno += 1;
|
|
|
|
if (syncer_delayno >= syncer_last)
|
|
|
|
syncer_delayno = 0;
|
|
|
|
splx(s);
|
|
|
|
|
|
|
|
lockmgr(&syncer_lock, LK_EXCLUSIVE, NULL);
|
|
|
|
|
|
|
|
while ((vp = LIST_FIRST(slp)) != NULL) {
|
a whole bunch of changes to improve performance and robustness under load:
- remove special treatment of pager_map mappings in pmaps. this is
required now, since I've removed the globals that expose the address range.
pager_map now uses pmap_kenter_pa() instead of pmap_enter(), so there's
no longer any need to special-case it.
- eliminate struct uvm_vnode by moving its fields into struct vnode.
- rewrite the pageout path. the pager is now responsible for handling the
high-level requests instead of only getting control after a bunch of work
has already been done on its behalf. this will allow us to UBCify LFS,
which needs tighter control over its pages than other filesystems do.
writing a page to disk no longer requires making it read-only, which
allows us to write wired pages without causing all kinds of havoc.
- use a new PG_PAGEOUT flag to indicate that a page should be freed
on behalf of the pagedaemon when it's unlocked. this flag is very similar
to PG_RELEASED, but unlike PG_RELEASED, PG_PAGEOUT can be cleared if the
pageout fails due to eg. an indirect-block buffer being locked.
this allows us to remove the "version" field from struct vm_page,
and together with shrinking "loan_count" from 32 bits to 16,
struct vm_page is now 4 bytes smaller.
- no longer use PG_RELEASED for swap-backed pages. if the page is busy
because it's being paged out, we can't release the swap slot to be
reallocated until that write is complete, but unlike with vnodes we
don't keep a count of in-progress writes so there's no good way to
know when the write is done. instead, when we need to free a busy
swap-backed page, just sleep until we can get it busy ourselves.
- implement a fast-path for extending writes which allows us to avoid
zeroing new pages. this substantially reduces cpu usage.
- encapsulate the data used by the genfs code in a struct genfs_node,
which must be the first element of the filesystem-specific vnode data
for filesystems which use genfs_{get,put}pages().
- eliminate many of the UVM pagerops, since they aren't needed anymore
now that the pager "put" operation is a higher-level operation.
- enhance the genfs code to allow NFS to use the genfs_{get,put}pages
instead of a modified copy.
- clean up struct vnode by removing all the fields that used to be used by
the vfs_cluster.c code (which we don't use anymore with UBC).
- remove kmem_object and mb_object since they were useless.
instead of allocating pages to these objects, we now just allocate
pages with no object. such pages are mapped in the kernel until they
are freed, so we can use the mapping to find the page to free it.
this allows us to remove splvm() protection in several places.
The sum of all these changes improves write throughput on my
decstation 5000/200 to within 1% of the rate of NetBSD 1.5
and reduces the elapsed time for "make release" of a NetBSD 1.5
source tree on my 128MB pc to 10% less than a 1.5 kernel took.
2001-09-16 00:36:31 +04:00
|
|
|
if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
|
1999-11-15 21:49:07 +03:00
|
|
|
(void) VOP_FSYNC(vp, curproc->p_ucred,
|
2000-09-20 02:01:59 +04:00
|
|
|
FSYNC_LAZY, 0, 0, curproc);
|
1999-11-15 21:49:07 +03:00
|
|
|
VOP_UNLOCK(vp, 0);
|
|
|
|
}
|
|
|
|
s = splbio();
|
|
|
|
if (LIST_FIRST(slp) == vp) {
|
2000-11-27 11:39:39 +03:00
|
|
|
|
1999-11-15 21:49:07 +03:00
|
|
|
/*
|
|
|
|
* Put us back on the worklist. The worklist
|
|
|
|
* routine will remove us from our current
|
|
|
|
* position and then add us back in at a later
|
|
|
|
* position.
|
|
|
|
*/
|
2000-11-27 11:39:39 +03:00
|
|
|
|
1999-11-15 21:49:07 +03:00
|
|
|
vn_syncer_add_to_worklist(vp, syncdelay);
|
|
|
|
}
|
|
|
|
splx(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do soft update processing.
|
|
|
|
*/
|
|
|
|
if (bioops.io_sync)
|
|
|
|
(*bioops.io_sync)(NULL);
|
|
|
|
|
|
|
|
lockmgr(&syncer_lock, LK_RELEASE, NULL);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The variable rushjob allows the kernel to speed up the
|
|
|
|
* processing of the filesystem syncer process. A rushjob
|
|
|
|
* value of N tells the filesystem syncer to process the next
|
|
|
|
* N seconds worth of work on its queue ASAP. Currently rushjob
|
|
|
|
* is used by the soft update code to speed up the filesystem
|
|
|
|
* syncer process when the incore state is getting so far
|
|
|
|
* ahead of the disk that the kernel memory pool is being
|
|
|
|
* threatened with exhaustion.
|
|
|
|
*/
|
|
|
|
if (rushjob > 0) {
|
|
|
|
rushjob--;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If it has taken us less than a second to process the
|
|
|
|
* current work, then wait. Otherwise start right over
|
|
|
|
* again. We can still lose time if any single round
|
|
|
|
* takes more than two seconds, but it does not really
|
|
|
|
* matter as we are just trying to generally pace the
|
|
|
|
* filesystem activity.
|
|
|
|
*/
|
|
|
|
if (time.tv_sec == starttime)
|
2000-11-27 11:39:39 +03:00
|
|
|
tsleep(&rushjob, PPAUSE, "syncer", hz);
|
1999-11-15 21:49:07 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Request the syncer daemon to speed up its work.
|
|
|
|
* We never push it to speed up more than half of its
|
|
|
|
* normal turn time, otherwise it could take over the cpu.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
speedup_syncer()
|
|
|
|
{
|
2000-11-27 11:39:39 +03:00
|
|
|
if (rushjob >= syncdelay / 2) {
|
|
|
|
return (0);
|
1999-11-15 21:49:07 +03:00
|
|
|
}
|
2000-11-27 11:39:39 +03:00
|
|
|
|
|
|
|
rushjob++;
|
|
|
|
wakeup(&rushjob);
|
|
|
|
stat_rush_requests += 1;
|
|
|
|
return (1);
|
1999-11-15 21:49:07 +03:00
|
|
|
}
|