/* $NetBSD: vfs_subr.c,v 1.332 2008/02/05 14:19:52 ad Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
/*
* External virtual filesystem routines.
*
* This file contains vfs subroutines which are heavily dependent on
* the kernel and are not suitable for standalone use. Examples include
* routines involved in vnode and mount point management.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.332 2008/02/05 14:19:52 ad Exp $");

#include "opt_ddb.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_43.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/namei.h>
#include <sys/ucred.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/syscallargs.h>
#include <sys/device.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/kthread.h>

#include <miscfs/specfs/specdev.h>
#include <miscfs/syncfs/syncfs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_ddb.h>

#include <sys/sysctl.h>

extern int dovfsusermount; /* 1 => permit any user to mount filesystems */
extern int vfs_magiclinks; /* 1 => expand "magic" symlinks */
static vnodelst_t vnode_free_list = TAILQ_HEAD_INITIALIZER(vnode_free_list);
static vnodelst_t vnode_hold_list = TAILQ_HEAD_INITIALIZER(vnode_hold_list);
static vnodelst_t vrele_list = TAILQ_HEAD_INITIALIZER(vrele_list);
static int vrele_pending;
static kmutex_t vrele_lock;
static kcondvar_t vrele_cv;
static lwp_t *vrele_lwp;
static pool_cache_t vnode_cache;
MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes");
/*
* Local declarations.
*/
static void vrele_thread(void *);
static void insmntque(vnode_t *, struct mount *);
static int getdevvp(dev_t, vnode_t **, enum vtype);
static vnode_t *getcleanvnode(void);
void vpanic(vnode_t *, const char *);
#ifdef DIAGNOSTIC
void
vpanic(vnode_t *vp, const char *msg)
{
vprint(NULL, vp);
panic("%s\n", msg);
}
#else
#define vpanic(vp, msg) /* nothing */
#endif
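/*
* Initialize the vnode management code: create the vnode pool cache
* and start the deferred release (vrele) thread.
*/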
void
vn_init1(void)
{
vnode_cache = pool_cache_init(sizeof(struct vnode), 0, 0, 0, "vnodepl",
NULL, IPL_NONE, NULL, NULL, NULL);
KASSERT(vnode_cache != NULL);
/* Create deferred release thread. */
mutex_init(&vrele_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&vrele_cv, "vrele");
if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL, vrele_thread,
NULL, &vrele_lwp, "vrele"))
panic("fork vrele");
}
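/*
* Reduce the number of allocated vnodes to the given target by
* recycling vnodes from the free lists. Returns EBUSY if no clean
* vnode can be reclaimed.
*/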
int
vfs_drainvnodes(long target, struct lwp *l)
{
while (numvnodes > target) {
vnode_t *vp;
mutex_enter(&vnode_free_list_lock);
vp = getcleanvnode();
if (vp == NULL)
return EBUSY; /* give up */
ungetnewvnode(vp);
}
return 0;
}
/*
* Grab a vnode from the freelist and clean it.
*/
vnode_t *
getcleanvnode(void)
{
vnode_t *vp;
vnodelst_t *listhd;
KASSERT(mutex_owned(&vnode_free_list_lock));
retry:
listhd = &vnode_free_list;
try_nextlist:
TAILQ_FOREACH(vp, listhd, v_freelist) {
/*
* It's safe to test v_usecount and v_iflag
* without holding the interlock here, since
* these vnodes should never appear on the
* lists.
*/
if (vp->v_usecount != 0) {
vpanic(vp, "free vnode isn't");
}
if ((vp->v_iflag & VI_CLEAN) != 0) {
vpanic(vp, "clean vnode on freelist");
}
if (vp->v_freelisthd != listhd) {
printf("vnode sez %p, listhd %p\n", vp->v_freelisthd, listhd);
vpanic(vp, "list head mismatch");
}
if (!mutex_tryenter(&vp->v_interlock))
continue;
/*
* Our lwp might hold the underlying vnode
* locked, so don't try to reclaim a VI_LAYER
* node if it's locked.
*/
if ((vp->v_iflag & VI_XLOCK) == 0 &&
((vp->v_iflag & VI_LAYER) == 0 || VOP_ISLOCKED(vp) == 0)) {
break;
}
mutex_exit(&vp->v_interlock);
}
if (vp == NULL) {
if (listhd == &vnode_free_list) {
listhd = &vnode_hold_list;
goto try_nextlist;
}
mutex_exit(&vnode_free_list_lock);
return NULL;
}
/* Remove it from the freelist. */
TAILQ_REMOVE(listhd, vp, v_freelist);
vp->v_freelisthd = NULL;
mutex_exit(&vnode_free_list_lock);
/*
* The vnode is still associated with a file system, so we must
* clean it out before reusing it. We need to add a reference
* before doing this. If the vnode gains another reference while
* being cleaned out then we lose - retry.
*/
vp->v_usecount++;
vclean(vp, DOCLOSE);
if (vp->v_usecount == 1) {
/* We're about to dirty it. */
vp->v_iflag &= ~VI_CLEAN;
mutex_exit(&vp->v_interlock);
if (vp->v_type == VBLK || vp->v_type == VCHR) {
spec_node_destroy(vp);
}
vp->v_type = VNON;
} else {
/*
* Don't return to freelist - the holder of the last
* reference will destroy it.
*/
KASSERT(vp->v_usecount > 1);
vp->v_usecount--;
mutex_exit(&vp->v_interlock);
mutex_enter(&vnode_free_list_lock);
goto retry;
}
if (vp->v_data != NULL || vp->v_uobj.uo_npages != 0 ||
!TAILQ_EMPTY(&vp->v_uobj.memq)) {
vpanic(vp, "cleaned vnode isn't");
}
if (vp->v_numoutput != 0) {
vpanic(vp, "clean vnode has pending I/O's");
}
if ((vp->v_iflag & VI_ONWORKLST) != 0) {
vpanic(vp, "clean vnode on syncer list");
}
return vp;
}
/*
* Mark a mount point as busy, and gain a new reference to it. Used to
* synchronize access and to delay unmounting.
*
* => Interlock is not released on failure.
* => If no interlock, the caller is expected to already hold a reference
* on the mount.
* => If interlocked, the interlock must prevent the last reference to
* the mount from disappearing.
*/
int
vfs_busy(struct mount *mp, const krw_t op, kmutex_t *interlock)
{
KASSERT(mp->mnt_refcnt > 0);
atomic_inc_uint(&mp->mnt_refcnt);
if (interlock != NULL) {
mutex_exit(interlock);
}
if (mp->mnt_writer == curlwp) {
mp->mnt_recursecnt++;
} else {
rw_enter(&mp->mnt_lock, op);
if (op == RW_WRITER) {
KASSERT(mp->mnt_writer == NULL);
mp->mnt_writer = curlwp;
}
}
if ((mp->mnt_iflag & IMNT_GONE) != 0) {
vfs_unbusy(mp, false);
if (interlock != NULL) {
mutex_enter(interlock);
}
return ENOENT;
}
return 0;
}
/*
* As vfs_busy(), but return immediately if the mount cannot be
* locked without waiting.
*/
int
vfs_trybusy(struct mount *mp, krw_t op, kmutex_t *interlock)
{
KASSERT(mp->mnt_refcnt > 0);
if (mp->mnt_writer == curlwp) {
mp->mnt_recursecnt++;
} else {
if (!rw_tryenter(&mp->mnt_lock, op)) {
return EBUSY;
}
if (op == RW_WRITER) {
KASSERT(mp->mnt_writer == NULL);
mp->mnt_writer = curlwp;
}
}
atomic_inc_uint(&mp->mnt_refcnt);
if ((mp->mnt_iflag & IMNT_GONE) != 0) {
vfs_unbusy(mp, false);
return ENOENT;
}
if (interlock != NULL) {
mutex_exit(interlock);
}
return 0;
}
/*
* Unlock a busy filesystem and drop reference to it. If 'keepref' is
* true, unlock but preserve the reference.
*/
void
vfs_unbusy(struct mount *mp, bool keepref)
{
KASSERT(mp->mnt_refcnt > 0);
if (mp->mnt_writer == curlwp) {
KASSERT(rw_write_held(&mp->mnt_lock));
if (mp->mnt_recursecnt != 0) {
mp->mnt_recursecnt--;
} else {
mp->mnt_writer = NULL;
rw_exit(&mp->mnt_lock);
}
} else {
rw_exit(&mp->mnt_lock);
}
if (!keepref) {
vfs_destroy(mp);
}
}
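/*
* Typical usage of the busy/unbusy protocol (a sketch; mirrors the
* callers later in this file, e.g. getnewvnode()):
*
*	error = vfs_busy(mp, RW_READER, NULL);
*	if (error != 0)
*		return error;	(the mount is being unmounted)
*	...operate on the mount...
*	vfs_unbusy(mp, false);
*/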
/*
* Lookup a filesystem type, and if found allocate and initialize
* a mount structure for it.
*
* Devname is usually updated by mount(8) after booting.
*/
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
struct mount **mpp)
{
struct vfsops *vfsp = NULL;
struct mount *mp;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(vfsp, &vfs_list, vfs_list)
if (!strncmp(vfsp->vfs_name, fstypename,
sizeof(mp->mnt_stat.f_fstypename)))
break;
if (vfsp == NULL) {
mutex_exit(&vfs_list_lock);
return (ENODEV);
}
vfsp->vfs_refcount++;
mutex_exit(&vfs_list_lock);
mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
if (mp == NULL)
return ENOMEM;
mp->mnt_refcnt = 1;
rw_init(&mp->mnt_lock);
mutex_init(&mp->mnt_renamelock, MUTEX_DEFAULT, IPL_NONE);
(void)vfs_busy(mp, RW_WRITER, NULL);
TAILQ_INIT(&mp->mnt_vnodelist);
mp->mnt_op = vfsp;
mp->mnt_flag = MNT_RDONLY;
mp->mnt_vnodecovered = NULL;
(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
sizeof(mp->mnt_stat.f_fstypename));
mp->mnt_stat.f_mntonname[0] = '/';
mp->mnt_stat.f_mntonname[1] = '\0';
mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
'\0';
(void)copystr(devname, mp->mnt_stat.f_mntfromname,
sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
mount_initspecific(mp);
*mpp = mp;
return (0);
}
/*
* Routines having to do with the management of the vnode table.
*/
extern int (**dead_vnodeop_p)(void *);
/*
* Return the next vnode from the free list.
*/
int
getnewvnode(enum vtagtype tag, struct mount *mp, int (**vops)(void *),
vnode_t **vpp)
{
struct uvm_object *uobj;
static int toggle;
vnode_t *vp;
int error = 0, tryalloc;
try_again:
if (mp != NULL) {
/*
* Mark filesystem busy while we're creating a
* vnode. If unmount is in progress, this will
* wait; if the unmount succeeds (only if umount
* -f), this will return an error. If the
* unmount fails, we'll keep going afterwards.
*/
error = vfs_busy(mp, RW_READER, NULL);
if (error)
return error;
}
/*
* We must choose whether to allocate a new vnode or recycle an
* existing one. The criterion for allocating a new one is that
* the total number of vnodes is less than the number desired or
* there are no vnodes on either free list. Generally we only
* want to recycle vnodes that have no buffers associated with
* them, so we look first on the vnode_free_list. If it is empty,
* we next consider vnodes with referencing buffers on the
* vnode_hold_list. The toggle ensures that half the time we
* will use a buffer from the vnode_hold_list, and half the time
* we will allocate a new one unless the list has grown to twice
* the desired size. We are reticent to recycle vnodes from the
* vnode_hold_list because we will lose the identity of all its
* referencing buffers.
*/
vp = NULL;
mutex_enter(&vnode_free_list_lock);
toggle ^= 1;
if (numvnodes > 2 * desiredvnodes)
toggle = 0;
tryalloc = numvnodes < desiredvnodes ||
(TAILQ_FIRST(&vnode_free_list) == NULL &&
(TAILQ_FIRST(&vnode_hold_list) == NULL || toggle));
if (tryalloc) {
numvnodes++;
mutex_exit(&vnode_free_list_lock);
if ((vp = vnalloc(NULL)) == NULL) {
mutex_enter(&vnode_free_list_lock);
numvnodes--;
} else
vp->v_usecount = 1;
}
if (vp == NULL) {
vp = getcleanvnode();
if (vp == NULL) {
if (mp != NULL) {
vfs_unbusy(mp, false);
}
if (tryalloc) {
printf("WARNING: unable to allocate new "
"vnode, retrying...\n");
(void) tsleep(&lbolt, PRIBIO, "newvn", hz);
goto try_again;
}
tablefull("vnode", "increase kern.maxvnodes or NVNODE");
*vpp = 0;
return (ENFILE);
}
vp->v_iflag = 0;
vp->v_vflag = 0;
vp->v_uflag = 0;
vp->v_socket = NULL;
1994-05-17 08:21:49 +04:00
}
KASSERT(vp->v_usecount == 1);
KASSERT(vp->v_freelisthd == NULL);
KASSERT(LIST_EMPTY(&vp->v_nclist));
KASSERT(LIST_EMPTY(&vp->v_dnclist));
vp->v_type = VNON;
vp->v_vnlock = &vp->v_lock;
vp->v_tag = tag;
vp->v_op = vops;
insmntque(vp, mp);
*vpp = vp;
vp->v_data = 0;
/*
* initialize uvm_object within vnode.
*/
uobj = &vp->v_uobj;
KASSERT(uobj->pgops == &uvm_vnodeops);
KASSERT(uobj->uo_npages == 0);
KASSERT(TAILQ_FIRST(&uobj->memq) == NULL);
vp->v_size = vp->v_writesize = VSIZENOTSET;
if (mp != NULL) {
if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
vp->v_vflag |= VV_MPSAFE;
vfs_unbusy(mp, true);
}
return (0);
}
/*
* This is really just the reverse of getnewvnode(). Needed for
* VFS_VGET functions that may need to push back a vnode in case
* of a locking race.
*/
void
ungetnewvnode(vnode_t *vp)
{
KASSERT(vp->v_usecount == 1);
KASSERT(vp->v_data == NULL);
KASSERT(vp->v_freelisthd == NULL);
mutex_enter(&vp->v_interlock);
vp->v_iflag |= VI_CLEAN;
vrelel(vp, 0);
}
/*
* Allocate a new, uninitialized vnode. If 'mp' is non-NULL, this is a
* marker vnode and we are prepared to wait for the allocation.
*/
vnode_t *
vnalloc(struct mount *mp)
{
vnode_t *vp;
vp = pool_cache_get(vnode_cache, (mp != NULL ? PR_WAITOK : PR_NOWAIT));
if (vp == NULL) {
return NULL;
}
memset(vp, 0, sizeof(*vp));
UVM_OBJ_INIT(&vp->v_uobj, &uvm_vnodeops, 0);
cv_init(&vp->v_cv, "vnode");
/*
* done by memset() above.
* LIST_INIT(&vp->v_nclist);
* LIST_INIT(&vp->v_dnclist);
*/
if (mp != NULL) {
vp->v_mount = mp;
vp->v_type = VBAD;
vp->v_iflag = VI_MARKER;
} else {
rw_init(&vp->v_lock.vl_lock);
}
return vp;
}
/*
* Free an unused, unreferenced vnode.
*/
void
vnfree(vnode_t *vp)
{
KASSERT(vp->v_usecount == 0);
if ((vp->v_iflag & VI_MARKER) == 0) {
rw_destroy(&vp->v_lock.vl_lock);
mutex_enter(&vnode_free_list_lock);
numvnodes--;
mutex_exit(&vnode_free_list_lock);
}
UVM_OBJ_DESTROY(&vp->v_uobj);
cv_destroy(&vp->v_cv);
pool_cache_put(vnode_cache, vp);
}
/*
* Remove a vnode from its freelist.
*/
static inline void
vremfree(vnode_t *vp)
{
KASSERT(mutex_owned(&vp->v_interlock));
KASSERT(vp->v_usecount == 0);
/*
* Note that the reference count must not change until
* the vnode is removed.
*/
mutex_enter(&vnode_free_list_lock);
if (vp->v_holdcnt > 0) {
KASSERT(vp->v_freelisthd == &vnode_hold_list);
} else {
KASSERT(vp->v_freelisthd == &vnode_free_list);
}
TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
vp->v_freelisthd = NULL;
mutex_exit(&vnode_free_list_lock);
}
/*
* Move a vnode from one mount queue to another.
*/
static void
insmntque(vnode_t *vp, struct mount *mp)
{
struct mount *omp;
#ifdef DIAGNOSTIC
if ((mp != NULL) &&
(mp->mnt_iflag & IMNT_UNMOUNT) &&
!(mp->mnt_flag & MNT_SOFTDEP) &&
vp->v_tag != VT_VFS) {
panic("insmntque into dying filesystem");
}
#endif
mutex_enter(&mntvnode_lock);
/*
* Delete from old mount point vnode list, if on one.
*/
if ((omp = vp->v_mount) != NULL)
TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vp, v_mntvnodes);
/*
* Insert into list of vnodes for the new mount point, if
* available. The caller must take a reference on the mount
* structure and donate it to the vnode.
*/
if ((vp->v_mount = mp) != NULL)
TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vp, v_mntvnodes);
mutex_exit(&mntvnode_lock);
if (omp != NULL) {
/* Release reference to old mount. */
vfs_destroy(omp);
}
}
/*
* Create a vnode for a block device.
* Used for root filesystem and swap areas.
* Also used for memory file system special devices.
*/
int
bdevvp(dev_t dev, vnode_t **vpp)
{

return (getdevvp(dev, vpp, VBLK));
}
/*
* Create a vnode for a character device.
* Used for kernfs and some console handling.
*/
int
cdevvp(dev_t dev, vnode_t **vpp)
{

return (getdevvp(dev, vpp, VCHR));
}
/*
* Create a vnode for a device.
* Used by bdevvp (block device) for root file system etc.,
* and by cdevvp (character device) for console and kernfs.
*/
static int
getdevvp(dev_t dev, vnode_t **vpp, enum vtype type)
{
vnode_t *vp;
vnode_t *nvp;
int error;

if (dev == NODEV) {
*vpp = NULL;
return (0);
}
error = getnewvnode(VT_NON, NULL, spec_vnodeop_p, &nvp);
if (error) {
*vpp = NULL;
return (error);
}
vp = nvp;
vp->v_type = type;
vp->v_vflag |= VV_MPSAFE;
uvm_vnp_setsize(vp, 0);
spec_node_init(vp, dev);
*vpp = vp;
return (0);
}
/*
* Grab a particular vnode from the free list, increment its
* reference count and lock it. If the vnode lock bit is set the
* vnode is being eliminated in vgone. In that case, we can not
* grab the vnode, so the process is awakened when the transition is
* completed, and an error returned to indicate that the vnode is no
* longer usable (possibly having been changed to a new file system type).
*/
int
vget(vnode_t *vp, int flags)
{
int error;

KASSERT((vp->v_iflag & VI_MARKER) == 0);
if ((flags & LK_INTERLOCK) == 0)
mutex_enter(&vp->v_interlock);
/*
* Before adding a reference, we must remove the vnode
* from its freelist.
*/
if (vp->v_usecount == 0) {
vremfree(vp);
}
if (++vp->v_usecount == 0) {
vpanic(vp, "vget: usecount overflow");
}
/*
* If the vnode is in the process of being cleaned out for
* another use, we wait for the cleaning to finish and then
* return failure. Cleaning is determined by checking if
* the VI_XLOCK or VI_FREEING flags are set.
*/
if ((vp->v_iflag & (VI_XLOCK | VI_FREEING)) != 0) {
if ((flags & LK_NOWAIT) != 0) {
vrelel(vp, 0);
return EBUSY;
}
vwait(vp, VI_XLOCK | VI_FREEING);
vrelel(vp, 0);
return ENOENT;
}
if (flags & LK_TYPE_MASK) {
error = vn_lock(vp, flags | LK_INTERLOCK);
if (error != 0) {
vrele(vp);
}
return error;
}
mutex_exit(&vp->v_interlock);
return 0;
}
/*
* vput(), just unlock and vrele()
*/
void
vput(vnode_t *vp)
{

KASSERT((vp->v_iflag & VI_MARKER) == 0);
VOP_UNLOCK(vp, 0);
vrele(vp);
}
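/*
* A sketch of the usual reference-and-lock pattern built on vget()
* and vput() (assumes the caller wants the vnode locked exclusively):
*
*	if (vget(vp, LK_EXCLUSIVE) != 0)
*		...the vnode went away; handle the failure...
*	...use the referenced, locked vnode...
*	vput(vp);
*/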
/*
* Vnode release. If reference count drops to zero, call inactive
* routine and either return to freelist or free to the pool.
*/
void
vrelel(vnode_t *vp, int flags)
{
bool recycle, defer;
int error;

KASSERT(mutex_owned(&vp->v_interlock));
KASSERT((vp->v_iflag & VI_MARKER) == 0);
KASSERT(vp->v_freelisthd == NULL);

if (vp->v_op == dead_vnodeop_p && (vp->v_iflag & VI_CLEAN) == 0) {
vpanic(vp, "dead but not clean");
}
/*
* If not the last reference, just drop the reference count
* and unlock.
*/
if (vp->v_usecount > 1) {
vp->v_usecount--;
vp->v_iflag |= VI_INACTREDO;
mutex_exit(&vp->v_interlock);
return;
}
if (vp->v_usecount <= 0 || vp->v_writecount != 0) {
vpanic(vp, "vput: bad ref count");
}

/*
* If not clean, deactivate the vnode, but preserve
* our reference across the call to VOP_INACTIVE().
*/
retry:
if ((vp->v_iflag & VI_CLEAN) == 0) {
recycle = false;
/*
* XXX This ugly block can be largely eliminated if
* locking is pushed down into the file systems.
*/
if (curlwp == uvm.pagedaemon_lwp) {
/* The pagedaemon can't wait around; defer. */
defer = true;
} else if (curlwp == vrele_lwp) {
/* We have to try harder. */
vp->v_iflag &= ~VI_INACTREDO;
error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
LK_RETRY);
if (error != 0) {
/* XXX */
vpanic(vp, "vrele: unable to lock %p");
}
defer = false;
} else if ((vp->v_iflag & VI_LAYER) != 0) {
/*
* Acquiring the stack's lock in vclean() even
* for an honest vput/vrele is dangerous because
* our caller may hold other vnode locks; defer.
*/
defer = true;
} else {
/* If we can't acquire the lock, then defer. */
vp->v_iflag &= ~VI_INACTREDO;
error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK |
LK_NOWAIT);
if (error != 0) {
defer = true;
mutex_enter(&vp->v_interlock);
} else {
defer = false;
}
}
if (defer) {
/*
* Defer reclaim to the kthread; it's not safe to
* clean it here. We donate it our last reference.
*/
KASSERT(mutex_owned(&vp->v_interlock));
KASSERT((vp->v_iflag & VI_INACTPEND) == 0);
vp->v_iflag |= VI_INACTPEND;
mutex_enter(&vrele_lock);
TAILQ_INSERT_TAIL(&vrele_list, vp, v_freelist);
if (++vrele_pending > (desiredvnodes >> 8))
cv_signal(&vrele_cv);
mutex_exit(&vrele_lock);
mutex_exit(&vp->v_interlock);
return;
}
#ifdef DIAGNOSTIC
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
vprint("vrelel: missing VOP_CLOSE()", vp);
}
#endif
/*
* The vnode can gain another reference while being
* deactivated. If VOP_INACTIVE() indicates that
* the described file has been deleted, then recycle
* the vnode irrespective of additional references.
* Another thread may be waiting to re-use the on-disk
* inode.
*
* Note that VOP_INACTIVE() will drop the vnode lock.
*/
VOP_INACTIVE(vp, &recycle);
mutex_enter(&vp->v_interlock);
if (!recycle) {
if (vp->v_usecount > 1) {
vp->v_usecount--;
mutex_exit(&vp->v_interlock);
return;
}
/*
* If we grew another reference while
* VOP_INACTIVE() was underway, retry.
*/
if ((vp->v_iflag & VI_INACTREDO) != 0) {
goto retry;
}
}
/* Take care of space accounting. */
if (vp->v_iflag & VI_EXECMAP) {
atomic_add_int(&uvmexp.execpages,
-vp->v_uobj.uo_npages);
atomic_add_int(&uvmexp.filepages,
vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP|VI_MAPPED);
vp->v_vflag &= ~VV_MAPPED;
/*
* Recycle the vnode if the file is now unused (unlinked),
* otherwise just free it.
*/
if (recycle) {
vclean(vp, DOCLOSE);
}
KASSERT(vp->v_usecount > 0);
}
if (--vp->v_usecount != 0) {
/* Gained another reference while being reclaimed. */
mutex_exit(&vp->v_interlock);
return;
}
if ((vp->v_iflag & VI_CLEAN) != 0) {
/*
* It's clean so destroy it. It isn't referenced
* anywhere since it has been reclaimed.
*/
KASSERT(vp->v_holdcnt == 0);
KASSERT(vp->v_writecount == 0);
mutex_exit(&vp->v_interlock);
insmntque(vp, NULL);
if (vp->v_type == VBLK || vp->v_type == VCHR) {
spec_node_destroy(vp);
}
vnfree(vp);
} else {
/*
* Otherwise, put it back onto the freelist. It
* can't be destroyed while still associated with
* a file system.
*/
mutex_enter(&vnode_free_list_lock);
if (vp->v_holdcnt > 0) {
vp->v_freelisthd = &vnode_hold_list;
} else {
vp->v_freelisthd = &vnode_free_list;
}
TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
mutex_exit(&vnode_free_list_lock);
mutex_exit(&vp->v_interlock);
}
}
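/*
* vrele: drop a reference to a vnode, taking the interlock and
* letting vrelel() do the work.
*/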
void
vrele(vnode_t *vp)
{
KASSERT((vp->v_iflag & VI_MARKER) == 0);
mutex_enter(&vp->v_interlock);
vrelel(vp, 0);
}
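/*
* Deferred release thread: processes vnodes queued on vrele_list by
* vrelel() when the releasing context could not safely lock them.
*/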
static void
vrele_thread(void *cookie)
{
vnode_t *vp;
for (;;) {
mutex_enter(&vrele_lock);
while (TAILQ_EMPTY(&vrele_list)) {
cv_timedwait(&vrele_cv, &vrele_lock, hz);
}
vp = TAILQ_FIRST(&vrele_list);
TAILQ_REMOVE(&vrele_list, vp, v_freelist);
vrele_pending--;
mutex_exit(&vrele_lock);
/*
* If not the last reference, then ignore the vnode
* and look for more work.
*/
mutex_enter(&vp->v_interlock);
KASSERT((vp->v_iflag & VI_INACTPEND) != 0);
vp->v_iflag &= ~VI_INACTPEND;
if (vp->v_usecount > 1) {
vp->v_usecount--;
mutex_exit(&vp->v_interlock);
continue;
}
vrelel(vp, 0);
}
}
/*
* Page or buffer structure gets a reference.
* Called with v_interlock held.
*/
void
vholdl(vnode_t *vp)
{

KASSERT(mutex_owned(&vp->v_interlock));
KASSERT((vp->v_iflag & VI_MARKER) == 0);
if (vp->v_holdcnt++ == 0 && vp->v_usecount == 0) {
mutex_enter(&vnode_free_list_lock);
KASSERT(vp->v_freelisthd == &vnode_free_list);
TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
vp->v_freelisthd = &vnode_hold_list;
TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
mutex_exit(&vnode_free_list_lock);
}
}
/*
* Page or buffer structure frees a reference.
* Called with v_interlock held.
*/
void
holdrelel(vnode_t *vp)
{

KASSERT(mutex_owned(&vp->v_interlock));
KASSERT((vp->v_iflag & VI_MARKER) == 0);

if (vp->v_holdcnt <= 0) {
vpanic(vp, "holdrelel: holdcnt vp %p");
}
vp->v_holdcnt--;
if (vp->v_holdcnt == 0 && vp->v_usecount == 0) {
mutex_enter(&vnode_free_list_lock);
KASSERT(vp->v_freelisthd == &vnode_hold_list);
TAILQ_REMOVE(vp->v_freelisthd, vp, v_freelist);
vp->v_freelisthd = &vnode_free_list;
TAILQ_INSERT_TAIL(vp->v_freelisthd, vp, v_freelist);
mutex_exit(&vnode_free_list_lock);
}
}
/*
* Vnode reference, where a reference is already held by some other
* object (for example, a file structure).
*/
void
vref(vnode_t *vp)
{

KASSERT((vp->v_iflag & VI_MARKER) == 0);
mutex_enter(&vp->v_interlock);
if (vp->v_usecount <= 0) {
vpanic(vp, "vref used where vget required");
}
if (++vp->v_usecount == 0) {
vpanic(vp, "vref: usecount overflow");
}
mutex_exit(&vp->v_interlock);
}
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If FORCECLOSE is not specified, there should not be any active ones,
* return error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*/
#ifdef DEBUG
int busyprt = 0; /* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif

int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
vnode_t *vp, *mvp;
int busy = 0;

/* Allocate a marker vnode. */
if ((mvp = vnalloc(mp)) == NULL)
return (ENOMEM);
mutex_enter(&mntvnode_lock);
/*
* NOTE: not using the TAILQ_FOREACH here since in this loop vgone()
* and vclean() are called
*/
for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
vmark(mvp, vp);
if (vp->v_mount != mp || vismarker(vp))
continue;
/*
* Skip over a selected vnode.
*/
if (vp == skipvp)
continue;
mutex_enter(&vp->v_interlock);
/*
* Ignore clean but still referenced vnodes.
*/
if ((vp->v_iflag & VI_CLEAN) != 0) {
mutex_exit(&vp->v_interlock);
continue;
}
/*
* Skip over vnodes marked VV_SYSTEM.
*/
if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
mutex_exit(&vp->v_interlock);
continue;
}
/*
* If WRITECLOSE is set, only flush out regular file
* vnodes open for writing.
*/
if ((flags & WRITECLOSE) &&
(vp->v_writecount == 0 || vp->v_type != VREG)) {
mutex_exit(&vp->v_interlock);
continue;
}
1994-05-17 08:21:49 +04:00
/*
* With v_usecount == 0, all we need to do is clear
* out the vnode data structures and we are done.
*/
if (vp->v_usecount == 0) {
mutex_exit(&mntvnode_lock);
vremfree(vp);
vp->v_usecount++;
vclean(vp, DOCLOSE);
vrelel(vp, 0);
mutex_enter(&mntvnode_lock);
continue;
}
/*
* If FORCECLOSE is set, forcibly close the vnode.
* For block or character devices, revert to an
* anonymous device. For all other files, just
* kill them.
*/
if (flags & FORCECLOSE) {
mutex_exit(&mntvnode_lock);
vp->v_usecount++;
if (vp->v_type != VBLK && vp->v_type != VCHR) {
vclean(vp, DOCLOSE);
vrelel(vp, 0);
} else {
vclean(vp, 0);
vp->v_op = spec_vnodeop_p; /* XXXSMP */
mutex_exit(&vp->v_interlock);
/*
* The vnode isn't clean, but still resides
* on the mount list. Remove it. XXX This
* is a bit dodgy.
*/
insmntque(vp, NULL);
vrele(vp);
}
mutex_enter(&mntvnode_lock);
continue;
}
#ifdef DEBUG
if (busyprt)
vprint("vflush: busy vnode", vp);
#endif
mutex_exit(&vp->v_interlock);
busy++;
}
mutex_exit(&mntvnode_lock);
vnfree(mvp);
if (busy)
return (EBUSY);
return (0);
}
/*
* Disassociate the underlying file system from a vnode.
*
* Must be called with the interlock held, and will return with it held.
*/
void
vclean(vnode_t *vp, int flags)
{
lwp_t *l = curlwp;
bool recycle, active;
int error;

KASSERT(mutex_owned(&vp->v_interlock));
KASSERT((vp->v_iflag & VI_MARKER) == 0);
KASSERT(vp->v_usecount != 0);

/* If cleaning is already in progress wait until done and return. */
if (vp->v_iflag & VI_XLOCK) {
vwait(vp, VI_XLOCK);
return;
}
/* If already clean, nothing to do. */
if ((vp->v_iflag & VI_CLEAN) != 0) {
return;
}
/*
* Prevent the vnode from being recycled or brought into use
* while we clean it out.
*/
vp->v_iflag |= VI_XLOCK;
if (vp->v_iflag & VI_EXECMAP) {
atomic_add_int(&uvmexp.execpages, -vp->v_uobj.uo_npages);
atomic_add_int(&uvmexp.filepages, vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
active = (vp->v_usecount > 1);
/* XXXAD should not lock vnode under layer */
VOP_LOCK(vp, LK_EXCLUSIVE | LK_INTERLOCK);
/*
* Clean out any cached data associated with the vnode.
* If purging an active vnode, it must be closed and
* deactivated before being reclaimed. Note that the
* VOP_INACTIVE will unlock the vnode.
*/
if (flags & DOCLOSE) {
error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
if (error != 0)
error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
KASSERT(error == 0);
KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) {
spec_node_revoke(vp);
}
}
if (active) {
VOP_INACTIVE(vp, &recycle);
} else {
/*
* Any other processes trying to obtain this lock must first
* wait for VI_XLOCK to clear, then call the new lock operation.
*/
VOP_UNLOCK(vp, 0);
}
/* Disassociate the underlying file system from the vnode. */
if (VOP_RECLAIM(vp)) {
vpanic(vp, "vclean: cannot reclaim");
}
KASSERT(vp->v_uobj.uo_npages == 0);
if (vp->v_type == VREG && vp->v_ractx != NULL) {
uvm_ra_freectx(vp->v_ractx);
vp->v_ractx = NULL;
}
cache_purge(vp);
/* Done with purge, notify sleepers of the grim news. */
vp->v_op = dead_vnodeop_p;
vp->v_tag = VT_NON;
mutex_enter(&vp->v_interlock);
vp->v_vnlock = &vp->v_lock;
KNOTE(&vp->v_klist, NOTE_REVOKE);
vp->v_iflag &= ~(VI_XLOCK | VI_FREEING);
vp->v_vflag &= ~VV_LOCKSWORK;
if ((flags & DOCLOSE) != 0) {
vp->v_iflag |= VI_CLEAN;
}
cv_broadcast(&vp->v_cv);
KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}
/*
* Recycle an unused vnode to the front of the free list.
* Release the passed interlock if the vnode will be recycled.
*/
int
vrecycle(vnode_t *vp, kmutex_t *inter_lkp, struct lwp *l)
{

KASSERT((vp->v_iflag & VI_MARKER) == 0);
mutex_enter(&vp->v_interlock);
if (vp->v_usecount != 0) {
mutex_exit(&vp->v_interlock);
return (0);
}
if (inter_lkp)
mutex_exit(inter_lkp);
vremfree(vp);
vp->v_usecount++;
vclean(vp, DOCLOSE);
vrelel(vp, 0);
return (1);
}
/*
* Eliminate all activity associated with a vnode in preparation for
* reuse. Drops a reference from the vnode.
*/
void
vgone(vnode_t *vp)
{

mutex_enter(&vp->v_interlock);
vclean(vp, DOCLOSE);
vrelel(vp, 0);
}
/*
* Lookup a vnode by device number.
*/
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{
vnode_t *vp;
int rc = 0;

mutex_enter(&specfs_lock);
for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
if (dev != vp->v_rdev || type != vp->v_type)
continue;
*vpp = vp;
rc = 1;
break;
}
mutex_exit(&specfs_lock);
return (rc);
}
/*
* Revoke all the vnodes corresponding to the specified minor number
* range (endpoints inclusive) of the specified major.
*/
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
vnode_t *vp, **vpp;
dev_t dev;
int mn;
vp = NULL; /* XXX gcc */
mutex_enter(&specfs_lock);
for (mn = minl; mn <= minh; mn++) {
dev = makedev(maj, mn);
vpp = &specfs_hash[SPECHASH(dev)];
for (vp = *vpp; vp != NULL;) {
mutex_enter(&vp->v_interlock);
if ((vp->v_iflag & VI_CLEAN) != 0 ||
dev != vp->v_rdev || type != vp->v_type) {
mutex_exit(&vp->v_interlock);
vp = vp->v_specnext;
continue;
}
mutex_exit(&specfs_lock);
if (vget(vp, LK_INTERLOCK) == 0) {
VOP_REVOKE(vp, REVOKEALL);
vrele(vp);
}
mutex_enter(&specfs_lock);
vp = *vpp;
}
}
mutex_exit(&specfs_lock);
}
/*
* Calculate the total number of references to a special device.
*/
int
vcount(vnode_t *vp)
{
int count;

mutex_enter(&specfs_lock);
mutex_enter(&vp->v_interlock);
if (vp->v_specnode == NULL) {
count = vp->v_usecount - ((vp->v_iflag & VI_INACTPEND) != 0);
mutex_exit(&vp->v_interlock);
mutex_exit(&specfs_lock);
return (count);
}
mutex_exit(&vp->v_interlock);
count = vp->v_specnode->sn_dev->sd_opencnt;
mutex_exit(&specfs_lock);
return (count);
}
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
void
vrevoke(vnode_t *vp)
{
vnode_t *vq, **vpp;
enum vtype type;
dev_t dev;
KASSERT(vp->v_usecount > 0);
mutex_enter(&vp->v_interlock);
if ((vp->v_iflag & VI_CLEAN) != 0) {
mutex_exit(&vp->v_interlock);
return;
} else {
dev = vp->v_rdev;
type = vp->v_type;
mutex_exit(&vp->v_interlock);
}
vpp = &specfs_hash[SPECHASH(dev)];
mutex_enter(&specfs_lock);
for (vq = *vpp; vq != NULL;) {
if ((vq->v_iflag & VI_CLEAN) != 0 ||
vq->v_rdev != dev || vq->v_type != type) {
vq = vq->v_specnext;
continue;
}
mutex_enter(&vq->v_interlock);
mutex_exit(&specfs_lock);
if (vq->v_usecount == 0) {
vremfree(vq);
}
vq->v_usecount++;
vclean(vq, DOCLOSE);
vrelel(vq, 0);
mutex_enter(&specfs_lock);
vq = *vpp;
}
mutex_exit(&specfs_lock);
}
/*
* sysctl helper routine to return list of supported fstypes
*/
static int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
char *where = oldp;
struct vfsops *v;
size_t needed, left, slen;
int error, first;
if (newp != NULL)
return (EPERM);
if (namelen != 0)
return (EINVAL);
first = 1;
error = 0;
needed = 0;
left = *oldlenp;
sysctl_unlock();
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (where == NULL)
needed += strlen(v->vfs_name) + 1;
else {
memset(bf, 0, sizeof(bf));
if (first) {
strncpy(bf, v->vfs_name, sizeof(bf));
first = 0;
} else {
bf[0] = ' ';
strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
}
bf[sizeof(bf)-1] = '\0';
slen = strlen(bf);
if (left < slen + 1)
break;
/* +1 to copy out the trailing NUL byte */
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
error = copyout(bf, where, slen + 1);
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (error)
break;
where += slen;
needed += slen;
left -= slen;
}
}
mutex_exit(&vfs_list_lock);
sysctl_relock();
*oldlenp = needed;
return (error);
}
/*
* Top level filesystem related information gathering.
*/
SYSCTL_SETUP(sysctl_vfs_setup, "sysctl vfs subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "vfs", NULL,
NULL, 0, NULL, 0,
CTL_VFS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "generic",
SYSCTL_DESCR("Non-specific vfs related information"),
NULL, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "usermount",
SYSCTL_DESCR("Whether unprivileged users may mount "
"filesystems"),
NULL, 0, &dovfsusermount, 0,
CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "fstypes",
SYSCTL_DESCR("List of file systems present"),
sysctl_vfs_generic_fstypes, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "magiclinks",
SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
NULL, 0, &vfs_magiclinks, 0,
CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
}
int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP 10
/*
* Dump vnode list (via sysctl).
* Copyout address of vnode followed by vnode.
*/
/* ARGSUSED */
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
char *where = oldp;
size_t *sizep = oldlenp;
struct mount *mp, *nmp;
vnode_t *vp, *mvp, vbuf;
char *bp = where, *savebp;
char *ewhere;
int error;
if (namelen != 0)
return (EOPNOTSUPP);
if (newp != NULL)
return (EPERM);
#define VPTRSZ sizeof(vnode_t *)
#define VNODESZ sizeof(vnode_t)
if (where == NULL) {
*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
return (0);
}
ewhere = where + *sizep;
sysctl_unlock();
mutex_enter(&mountlist_lock);
for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
mp = nmp) {
if (vfs_trybusy(mp, RW_READER, &mountlist_lock)) {
nmp = CIRCLEQ_NEXT(mp, mnt_list);
continue;
}
savebp = bp;
/* Allocate a marker vnode. */
if ((mvp = vnalloc(mp)) == NULL) {
sysctl_relock();
return (ENOMEM);
}
mutex_enter(&mntvnode_lock);
for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = vunmark(mvp)) {
vmark(mvp, vp);
/*
* Check that the vp is still associated with
* this filesystem. RACE: could have been
* recycled onto the same filesystem.
*/
if (vp->v_mount != mp || vismarker(vp))
continue;
if (bp + VPTRSZ + VNODESZ > ewhere) {
(void)vunmark(mvp);
mutex_exit(&mntvnode_lock);
vnfree(mvp);
sysctl_relock();
*sizep = bp - where;
return (ENOMEM);
}
memcpy(&vbuf, vp, VNODESZ);
mutex_exit(&mntvnode_lock);
if ((error = copyout(vp, bp, VPTRSZ)) ||
(error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
mutex_enter(&mntvnode_lock);
(void)vunmark(mvp);
mutex_exit(&mntvnode_lock);
vnfree(mvp);
sysctl_relock();
return (error);
}
bp += VPTRSZ + VNODESZ;
mutex_enter(&mntvnode_lock);
}
mutex_exit(&mntvnode_lock);
mutex_enter(&mountlist_lock);
nmp = CIRCLEQ_NEXT(mp, mnt_list);
vfs_unbusy(mp, false);
vnfree(mvp);
}
mutex_exit(&mountlist_lock);
sysctl_relock();
*sizep = bp - where;
return (0);
}
/*
* Remove clean vnodes from a mountpoint's vnode list.
*/
void
vfs_scrubvnlist(struct mount *mp)
{
vnode_t *vp, *nvp;
retry:
mutex_enter(&mntvnode_lock);
for (vp = TAILQ_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
nvp = TAILQ_NEXT(vp, v_mntvnodes);
mutex_enter(&vp->v_interlock);
if ((vp->v_iflag & VI_CLEAN) != 0) {
TAILQ_REMOVE(&mp->mnt_vnodelist, vp, v_mntvnodes);
vp->v_mount = NULL;
mutex_exit(&mntvnode_lock);
mutex_exit(&vp->v_interlock);
vfs_destroy(mp);
goto retry;
}
mutex_exit(&vp->v_interlock);
}
mutex_exit(&mntvnode_lock);
}
/*
* Check to see if a filesystem is mounted on a block device.
*/
int
vfs_mountedon(vnode_t *vp)
{
vnode_t *vq;
int error = 0;

if (vp->v_type != VBLK)
return ENOTBLK;
if (vp->v_specmountpoint != NULL)
return (EBUSY);
mutex_enter(&specfs_lock);
for (vq = specfs_hash[SPECHASH(vp->v_rdev)]; vq != NULL;
vq = vq->v_specnext) {
if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
continue;
if (vq->v_specmountpoint != NULL) {
error = EBUSY;
break;
}
}
mutex_exit(&specfs_lock);
return (error);
}
/*
* Unmount all file systems.
* We traverse the list in reverse order under the assumption that doing so
* will avoid needing to worry about dependencies.
*/
void
vfs_unmountall(struct lwp *l)
{
struct mount *mp, *nmp;
int allerror, error;
printf("unmounting file systems...");
for (allerror = 0, mp = CIRCLEQ_LAST(&mountlist);
!CIRCLEQ_EMPTY(&mountlist);
mp = nmp) {
nmp = CIRCLEQ_PREV(mp, mnt_list);
#ifdef DEBUG
printf("\nunmounting %s (%s)...",
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
/*
* XXX Freeze syncer. Must do this before locking the
* mount point. See dounmount() for details.
*/
2007-02-10 00:55:00 +03:00
mutex_enter(&syncer_mutex);
if (vfs_busy(mp, RW_WRITER, NULL)) {
2007-02-10 00:55:00 +03:00
mutex_exit(&syncer_mutex);
continue;
}
2005-12-11 15:16:03 +03:00
if ((error = dounmount(mp, MNT_FORCE, l)) != 0) {
1996-10-13 06:32:29 +04:00
printf("unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
allerror = 1;
}
}
printf(" done\n");
if (allerror)
printf("WARNING: some file systems would not unmount\n");
}
/*
* Sync and unmount file systems before shutting down.
*/
void
vfs_shutdown(void)
{
struct lwp *l;
/* XXX we're certainly not running in lwp0's context! */
l = curlwp;
if (l == NULL)
l = &lwp0;
printf("syncing disks... ");
/* remove user processes from run queue */
suspendsched();
(void) spl0();
/* avoid coming back this way again if we panic. */
doing_shutdown = 1;
sys_sync(l, NULL, NULL);
/* Wait for sync to finish. */
if (buf_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
Debugger();
#endif
printf("giving up\n");
return;
} else
printf("done\n");
/*
* If we've panic'd, don't make the situation potentially
* worse by unmounting the file systems.
*/
if (panicstr != NULL)
return;
/* Release inodes held by texts before update. */
#ifdef notdef
vnshutdown();
#endif
/* Unmount file systems. */
vfs_unmountall(l);
}
/*
* Mount the root file system. If the operator didn't specify a
* file system to use, try all possible file systems until one
* succeeds.
*/
int
vfs_mountroot(void)
{
struct vfsops *v;
int error = ENODEV;
if (root_device == NULL)
panic("vfs_mountroot: root device unknown");
switch (device_class(root_device)) {
case DV_IFNET:
if (rootdev != NODEV)
panic("vfs_mountroot: rootdev set for DV_IFNET "
"(0x%08x -> %d,%d)", rootdev,
major(rootdev), minor(rootdev));
break;
case DV_DISK:
if (rootdev == NODEV)
panic("vfs_mountroot: rootdev not set for DV_DISK");
if (bdevvp(rootdev, &rootvp))
panic("vfs_mountroot: can't get vnode for rootdev");
error = VOP_OPEN(rootvp, FREAD, FSCRED);
if (error) {
printf("vfs_mountroot: can't open root device\n");
return (error);
}
break;
default:
printf("%s: inappropriate for root file system\n",
root_device->dv_xname);
return (ENODEV);
}
/*
* If user specified a file system, use it.
*/
if (mountroot != NULL) {
error = (*mountroot)();
goto done;
}
/*
* Try each file system currently configured into the kernel.
*/
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (v->vfs_mountroot == NULL)
continue;
#ifdef DEBUG
aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
error = (*v->vfs_mountroot)();
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (!error) {
aprint_normal("root file system type: %s\n",
v->vfs_name);
break;
}
}
mutex_exit(&vfs_list_lock);
if (v == NULL) {
printf("no file system for %s", root_device->dv_xname);
if (device_class(root_device) == DV_DISK)
printf(" (dev 0x%x)", rootdev);
printf("\n");
error = EFTYPE;
}
done:
if (error && device_class(root_device) == DV_DISK) {
VOP_CLOSE(rootvp, FREAD, FSCRED);
vrele(rootvp);
}
return (error);
}
/*
* Sham lock manager for vnodes. This is a temporary measure.
*/
int
vlockmgr(struct vnlock *vl, int flags)
{
KASSERT((flags & ~(LK_CANRECURSE | LK_NOWAIT | LK_TYPE_MASK)) == 0);
switch (flags & LK_TYPE_MASK) {
case LK_SHARED:
if (rw_tryenter(&vl->vl_lock, RW_READER)) {
return 0;
}
if ((flags & LK_NOWAIT) != 0) {
return EBUSY;
}
rw_enter(&vl->vl_lock, RW_READER);
return 0;
case LK_EXCLUSIVE:
if (rw_tryenter(&vl->vl_lock, RW_WRITER)) {
return 0;
}
if ((vl->vl_canrecurse || (flags & LK_CANRECURSE) != 0) &&
rw_write_held(&vl->vl_lock)) {
vl->vl_recursecnt++;
return 0;
}
if ((flags & LK_NOWAIT) != 0) {
return EBUSY;
}
rw_enter(&vl->vl_lock, RW_WRITER);
return 0;
case LK_RELEASE:
if (vl->vl_recursecnt != 0) {
KASSERT(rw_write_held(&vl->vl_lock));
vl->vl_recursecnt--;
return 0;
}
rw_exit(&vl->vl_lock);
return 0;
default:
panic("vlockmgr: flags %x", flags);
}
}
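/*
* Report whether the sham lock is currently held: LK_EXCLUSIVE,
* LK_SHARED, or 0 if not held at all.
*/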
int
vlockstatus(struct vnlock *vl)
{
if (rw_write_held(&vl->vl_lock)) {
return LK_EXCLUSIVE;
}
if (rw_read_held(&vl->vl_lock)) {
return LK_SHARED;
}
return 0;
}