.\" $NetBSD: vnode.9,v 1.53 2011/06/14 00:56:02 rmind Exp $ .\" .\" Copyright (c) 2001, 2005, 2006 The NetBSD Foundation, Inc. .\" All rights reserved. .\" .\" This code is derived from software contributed to The NetBSD Foundation .\" by Gregory McGarry. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS .\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED .\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR .\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS .\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS .\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN .\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) .\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGE. 
.\" .Dd June 14, 2011 .Dt VNODE 9 .Os .Sh NAME .Nm vnode , .Nm vref , .Nm vrele , .Nm vrele_async , .Nm vget , .Nm vput , .Nm vhold , .Nm holdrele , .Nm getnewvnode , .Nm ungetnewvnode , .Nm vrecycle , .Nm vgone , .Nm vgonel , .Nm vflush , .Nm vaccess , .Nm bdevvp , .Nm cdevvp , .Nm vfinddev , .Nm vdevgone , .Nm vwakeup , .Nm vflushbuf , .Nm vinvalbuf , .Nm vtruncbuf , .Nm vprint .Nd kernel representation of a file or directory .Sh SYNOPSIS .In sys/param.h .In sys/vnode.h .Ft void .Fn vref "struct vnode *vp" .Ft void .Fn vrele "struct vnode *vp" .Ft void .Fn vrele_async "struct vnode *vp" .Ft int .Fn vget "struct vnode *vp" "int lockflag" .Ft void .Fn vput "struct vnode *vp" .Ft void .Fn vhold "struct vnode *vp" .Ft void .Fn holdrele "struct vnode *vp" .Ft int .Fn getnewvnode "enum vtagtype tag" "struct mount *mp" "int (**vops)(void *)" \ "kmutex_t *slock" "struct vnode **vpp" .Ft void .Fn ungetnewvnode "struct vnode *vp" .Ft int .Fn vrecycle "struct vnode *vp" "struct simplelock *inter_lkp" "struct lwp *l" .Ft void .Fn vgone "struct vnode *vp" .Ft void .Fn vgonel "struct vnode *vp" "struct lwp *l" .Ft int .Fn vflush "struct mount *mp" "struct vnode *skipvp" "int flags" .Ft int .Fn vaccess "enum vtype type" "mode_t file_mode" "uid_t uid" "gid_t gid" "mode_t acc_mode" "kauth_cred_t cred" .Ft int .Fn bdevvp "dev_t dev" "struct vnode **vpp" .Ft int .Fn cdevvp "dev_t dev" "struct vnode **vpp" .Ft int .Fn vfinddev "dev_t dev" "enum vtype" "struct vnode **vpp" .Ft void .Fn vdevgone "int maj" "int minl" "int minh" "enum vtype type" .Ft void .Fn vwakeup "struct buf *bp" .Ft int .Fn vflushbuf "struct vnode *vp" "int sync" .Ft int .Fn vinvalbuf "struct vnode *vp" "int flags" "kauth_cred_t cred" "struct lwp *l" "int slpflag" "int slptimeo" .Ft int .Fn vtruncbuf "struct vnode *vp" "daddr_t lbn" "int slpflag" "int slptimeo" .Ft void .Fn vprint "const char *label" "struct vnode *vp" .Sh DESCRIPTION The vnode is the focus of all file activity in .Nx . 
There is a unique vnode allocated for each active file, directory, mounted-on file, fifo, domain socket, symbolic link and device. The kernel has no concept of a file's underlying structure and so it relies on the information stored in the vnode to describe the file. Thus, the vnode associated with a file holds all the administration information pertaining to it. .Pp When a process requests an operation on a file, the .Xr vfs 9 interface passes control to a file system type dependent function to carry out the operation. If the file system type dependent function finds that a vnode representing the file is not in main memory, it dynamically allocates a new vnode from the system main memory pool. Once allocated, the vnode is attached to the data structure pointer associated with the cause of the vnode allocation and it remains resident in the main memory until the system decides that it is no longer needed and can be recycled. .Pp The vnode has the following structure: .Bd -literal struct vnode { struct uvm_object v_uobj; /* the VM object */ kcondvar_t v_cv; /* synchronization */ voff_t v_size; /* size of file */ voff_t v_writesize; /* new size after write */ int v_iflag; /* VI_* flags */ int v_vflag; /* VV_* flags */ int v_uflag; /* VU_* flags */ int v_numoutput; /* # of pending writes */ int v_writecount; /* ref count of writers */ int v_holdcnt; /* page & buffer refs */ int v_synclist_slot; /* synclist slot index */ struct mount *v_mount; /* ptr to vfs we are in */ int (**v_op)(void *); /* vnode operations vector */ TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ struct vnodelst *v_freelisthd; /* which freelist? 
*/ TAILQ_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ struct buflists v_cleanblkhd; /* clean blocklist head */ struct buflists v_dirtyblkhd; /* dirty blocklist head */ TAILQ_ENTRY(vnode) v_synclist; /* vnodes with dirty bufs */ LIST_HEAD(, namecache) v_dnclist; /* namecaches (children) */ LIST_HEAD(, namecache) v_nclist; /* namecaches (parent) */ union { struct mount *vu_mountedhere;/* ptr to vfs (VDIR) */ struct socket *vu_socket; /* unix ipc (VSOCK) */ struct specnode *vu_specnode; /* device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ struct uvm_ractx *vu_ractx; /* read-ahead ctx (VREG) */ } v_un; enum vtype v_type; /* vnode type */ enum vtagtype v_tag; /* type of underlying data */ struct vnlock v_lock; /* lock for this vnode */ void *v_data; /* private data for fs */ struct klist v_klist; /* notes attached to vnode */ }; .Ed .Pp Most members of the vnode structure should be treated as opaque and only manipulated using the proper functions. There are some rather common exceptions detailed throughout this page. .Pp Files and file systems are inextricably linked with the virtual memory system and .Em v_uobj contains the data maintained by the virtual memory system. For compatibility with code written before the integration of .Xr uvm 9 into .Nx , C-preprocessor directives are used to alias the members of .Em v_uobj . .Pp Vnode flags are recorded by .Em v_flag . Valid flags are: .Pp .Bl -tag -offset indent -width VONWORKLST -compact .It VROOT This vnode is the root of its file system. .It VTEXT This vnode is a pure text prototype. .It VSYSTEM This vnode is being used by the kernel; only used to skip quota files in .Fn vflush . .It VISTTY This vnode represents a tty; used when reading dead vnodes. .It VEXECMAP This vnode has executable mappings. .It VWRITEMAP This vnode might have PROT_WRITE user mappings. .It VWRITEMAPDIRTY This vnode might have dirty pages due to VWRITEMAP .It VLOCKSWORK This vnode's file system supports locking. 
.It VXLOCK This vnode is currently locked to change its underlying type. .It VXWANT A process is waiting for this vnode. .It VBWAIT Waiting for output associated with this vnode to complete. .It VALIASED This vnode has an alias. .It VDIROP This vnode is involved in a directory operation. This flag is used exclusively by LFS. .It VLAYER This vnode is on a layered file system. .It VONWORKLST This vnode is on the syncer work-list. .It VFREEING This vnode is being freed. .It VMAPPED This vnode might have user mappings. .El .Pp The VXLOCK flag is used to prevent multiple processes from entering the vnode reclamation code. It is also used as a flag to indicate that reclamation is in progress. The VXWANT flag is set by threads that wish to be awakened when reclamation is finished. Before .Em v_flag can be modified, the .Em v_interlock simplelock must be acquired. See .Xr lock 9 for details on the kernel locking API. .Pp Each vnode has three reference counts: .Em v_usecount , .Em v_writecount and .Em v_holdcnt . The first is the number of active references within the kernel to the vnode. This count is maintained by .Fn vref , .Fn vrele , .Fn vrele_async , and .Fn vput . The second is the number of active references within the kernel to the vnode performing write access to the file. It is maintained by the .Xr open 2 and .Xr close 2 system calls. The third is the number of references within the kernel requiring the vnode to remain active and not be recycled. This count is maintained by .Fn vhold and .Fn holdrele . When both the .Em v_usecount and .Em v_holdcnt reach zero, the vnode is recycled to the freelist and may be reused for another file. The transition to and from the freelist is handled by .Fn getnewvnode , .Fn ungetnewvnode and .Fn vrecycle . Access to .Em v_usecount , .Em v_writecount and .Em v_holdcnt is also protected by the .Em v_interlock simplelock. .Pp The number of pending synchronous and asynchronous writes on the vnode is recorded in .Em v_numoutput . 
It is used by .Xr fsync 2 to wait for all writes to complete before returning to the user. Its value must only be modified at splbio (see .Xr spl 9 ) . It does not track the number of dirty buffers attached to the vnode. .Pp .Em v_dnclist and .Em v_nclist are used by .Xr namecache 9 to maintain the list of associated entries so that .Xr cache_purge 9 can purge them. .Pp The link to the file system which owns the vnode is recorded by .Em v_mount . See .Xr vfsops 9 for further information on file system mount status. .Pp The .Em v_op pointer points to its vnode operations vector. This vector describes what operations can be done to the file associated with the vnode. The system maintains one vnode operations vector for each file system type configured into the kernel. The vnode operations vector contains a pointer to a function for each operation supported by the file system. See .Xr vnodeops 9 for a description of vnode operations. .Pp When not in use, vnodes are kept on the freelist through .Em v_freelist . The vnodes still reference valid files but may be reused to refer to a new file at any time. When a valid vnode which is on the freelist is used again, the user must call .Fn vget to increment the reference count and retrieve it from the freelist. When a user wants a new vnode for another file, .Fn getnewvnode is invoked to remove a vnode from the freelist and initialize it for the new file. .Pp The type of object the vnode represents is recorded by .Em v_type . It is used by generic code to perform checks to ensure operations are performed on valid file system objects. Valid types are: .Pp .Bl -tag -offset indent -width VFIFO -compact .It VNON The vnode has no type. .It VREG The vnode represents a regular file. .It VDIR The vnode represents a directory. .It VBLK The vnode represents a block special device. .It VCHR The vnode represents a character special device. .It VLNK The vnode represents a symbolic link. .It VSOCK The vnode represents a socket. 
.It VFIFO The vnode represents a pipe. .It VBAD The vnode represents a bad file (not currently used). .El .Pp Vnode tag types are used by external programs only (e.g., .Xr pstat 8 ) , and should never be inspected by the kernel. Its use is deprecated since new .Em v_tag values cannot be defined for loadable file systems. The .Em v_tag member is read-only. Valid tag types are: .Pp .Bl -tag -offset indent -width "VT_FILECORE " -compact .It VT_NON non file system .It VT_UFS universal file system .It VT_NFS network file system .It VT_MFS memory file system .It VT_MSDOSFS FAT file system .It VT_LFS log-structured file system .It VT_LOFS loopback file system .It VT_FDESC file descriptor file system .It VT_NULL null file system layer .It VT_UMAP uid/gid remapping file system layer .It VT_KERNFS kernel interface file system .It VT_PROCFS process interface file system .It VT_AFS AFS file system .It VT_ISOFS ISO 9660 file system(s) .It VT_UNION union file system .It VT_ADOSFS Amiga file system .It VT_EXT2FS Linux's ext2 file system .It VT_CODA Coda file system .It VT_FILECORE filecore file system .It VT_NTFS Microsoft NT's file system .It VT_VFS virtual file system .It VT_OVERLAY overlay file system .It VT_SMBFS SMB file system .It VT_PTYFS pseudo-terminal device file system .It VT_TMPFS efficient memory file system .It VT_UDF universal disk format file system .It VT_SYSVBFS systemV boot file system .El .Pp All vnode locking operations use .Em v_lock . This lock is acquired by calling .Xr vn_lock 9 and released by calling .Xr VOP_UNLOCK 9 . The reason for this asymmetry is that .Xr vn_lock 9 is a wrapper for .Xr VOP_LOCK 9 with extra checks, while the unlocking step usually does not need additional checks and thus has no wrapper. .Pp The vnode locking operation is complicated because it is used for many purposes. Sometimes it is used to bundle a series of vnode operations (see .Xr vnodeops 9 ) into an atomic group. 
Many file systems rely on it to prevent race conditions in updating file system type specific data structures rather than using their own private locks. The vnode lock can operate as a multiple-reader (shared-access lock) or single-writer lock (exclusive access lock), however many current file system implementations were written assuming only single-writer locking. Multiple-reader locking functions equivalently only in the presence of big-lock SMP locking or a uni-processor machine. The lock may be held while sleeping. While the .Em v_lock is acquired, the holder is guaranteed that the vnode will not be reclaimed or invalidated. Most file system functions require that you hold the vnode lock on entry. See .Xr lock 9 for details on the kernel locking API. .Pp Each file system underlying a vnode allocates its own private area and hangs it from .Em v_data . .Pp Most functions discussed in this page that operate on vnodes cannot be called from interrupt context. The members .Em v_numoutput , .Em v_holdcnt , .Em v_dirtyblkhd , .Em v_cleanblkhd , .Em v_freelist , and .Em v_synclist are modified in interrupt context and must be protected by .Xr splbio 9 unless it is certain that there is no chance an interrupt handler will modify them. The vnode lock must not be acquired within interrupt context. .Sh FUNCTIONS .Bl -tag -width compact .It Fn vref "vp" Increment .Em v_usecount of the vnode .Em vp . Any kernel thread system which uses a vnode (e.g., during the operation of some algorithm or to store in a data structure) should call .Fn vref . .It Fn vrele "vp" Decrement .Em v_usecount of unlocked vnode .Em vp . Any code in the system which is using a vnode should call .Fn vrele when it is finished with the vnode. If .Em v_usecount of the vnode reaches zero and .Em v_holdcnt is greater than zero, the vnode is placed on the holdlist. If both .Em v_usecount and .Em v_holdcnt are zero, the vnode is placed on the freelist. 
.It Fn vrele_async "vp" Will asynchronously release the vnode in a different context than the caller, sometime after the call. .It Fn vget "vp" "lockflag" Reclaim vnode .Fa vp from the freelist, increment its reference count and lock it. The argument .Fa lockflag specifies the .Xr lockmgr 9 flags used to lock the vnode. If the VXLOCK is set in .Fa vp Ns 's .Em v_flag , vnode .Fa vp is being recycled in .Fn vgone and the calling thread sleeps until the transition is complete. When it is awakened, an error is returned to indicate that the vnode is no longer usable (possibly having been recycled to a new file system type). .It Fn vput "vp" Unlock vnode .Fa vp and decrement its .Em v_usecount . Depending on the reference counts, move the vnode to the holdlist or the freelist. This operation is functionally equivalent to calling .Xr VOP_UNLOCK 9 followed by .Fn vrele . .It Fn vhold "vp" Mark the vnode .Fa vp as active by incrementing .Em vp-\*[Gt]v_holdcnt and moving the vnode from the freelist to the holdlist. Once on the holdlist, the vnode will not be recycled until it is released with .Fn holdrele . .It Fn holdrele "vp" Mark the vnode .Fa vp as inactive by decrementing .Em vp-\*[Gt]v_holdcnt and moving the vnode from the holdlist to the freelist. .It Fn getnewvnode "tag" "mp" "vops" "slock" "vpp" Retrieve the next vnode from the freelist. .Fn getnewvnode must choose whether to allocate a new vnode or recycle an existing one. The criterion for allocating a new one is that the total number of vnodes is less than the number desired or there are no vnodes on either free list. Generally only vnodes that have no buffers associated with them are recycled and the next vnode from the freelist is retrieved. If the freelist is empty, vnodes on the holdlist are considered. The new vnode is returned in the address specified by .Fa vpp . .Pp The argument .Fa mp is the mount point for the file system that requested the new vnode. 
Before retrieving the new vnode, the file system is checked to see whether it is busy (such as currently unmounting). An error is returned if the file system is unmounted. .Pp The argument .Fa tag is the vnode tag assigned to .Fa *vpp-\*[Gt]v_tag . The argument .Fa vops is the vnode operations vector of the file system requesting the new vnode. If a vnode is successfully retrieved zero is returned, otherwise an appropriate error code is returned. If .Fa slock is not NULL, it specifies the lock to share for .Em v_interlock . A reference will be held on the lock and the sharing noted. The reference will be released and the lock unshared when the vnode gets recycled. If NULL (regular case), the vnode will use its own interlock. .It Fn ungetnewvnode "vp" Undo the operation of .Fn getnewvnode . The argument .Fa vp is the vnode to return to the freelist. This function is needed for .Xr VFS_VGET 9 which may need to push back a vnode in case of a locking race condition. .It Fn vrecycle "vp" "inter_lkp" "l" Recycle the unused vnode .Fa vp to the front of the freelist. .Fn vrecycle is a null operation if the reference count is greater than zero. .It Fn vgone "vp" Eliminate all activity associated with the unlocked vnode .Fa vp in preparation for recycling. .It Fn vgonel "vp" "l" Eliminate all activity associated with the locked vnode .Fa vp in preparation for recycling. .It Fn vflush "mp" "skipvp" "flags" Remove any vnodes in the vnode table belonging to mount point .Fa mp . If .Fa skipvp is not NULL it is exempt from being flushed. The argument .Fa flags is a set of flags modifying the operation of .Fn vflush . If FORCECLOSE is not specified, there should not be any active vnodes and the error .Er EBUSY is returned if any are found (this is a user error, not a system error). If FORCECLOSE is specified, active vnodes that are found are detached. If WRITECLOSE is set, only flush out regular file vnodes open for writing. SKIPSYSTEM causes any vnodes marked VSYSTEM to be skipped. 
.It Fn vaccess "type" "file_mode" "uid" "gid" "acc_mode" "cred" Do access checking by comparing the file's permissions to the caller's desired access type .Fa acc_mode and credentials .Fa cred . .It Fn bdevvp "dev" "vpp" Create a vnode for a block device. .Fn bdevvp is used for root file systems, swap areas and for memory file system special devices. .It Fn cdevvp "dev" "vpp" Create a vnode for a character device. .Fn cdevvp is used for the console and kernfs special devices. .It Fn vfinddev "dev" "vtype" "vpp" Look up a vnode by device number. The vnode is referenced and returned in the address specified by .Fa vpp . .It Fn vdevgone "maj" "minl" "minh" "type" Reclaim all vnodes that correspond to the specified minor number range .Fa minl to .Fa minh (endpoints inclusive) of the specified major .Fa maj . .It Fn vwakeup "bp" Update outstanding I/O count .Em vp-\*[Gt]v_numoutput for the vnode .Em bp-\*[Gt]b_vp and do a wakeup if requested and .Em vp-\*[Gt]v_flag has VBWAIT set. .It Fn vflushbuf "vp" "sync" Flush all dirty buffers to disk for the file with the locked vnode .Fa vp . The argument .Fa sync specifies whether the I/O should be synchronous and .Fn vflushbuf will sleep until .Em vp-\*[Gt]v_numoutput is zero and .Em vp-\*[Gt]v_dirtyblkhd is empty. .It Fn vinvalbuf "vp" "flags" "cred" "l" "slpflag" "slptimeo" Flush out and invalidate all buffers associated with locked vnode .Fa vp . The arguments .Fa l and .Fa cred specify the calling process and its credentials. The .Xr ltsleep 9 flag and timeout are specified by the arguments .Fa slpflag and .Fa slptimeo respectively. If the operation is successful zero is returned, otherwise an appropriate error code is returned. .It Fn vtruncbuf "vp" "lbn" "slpflag" "slptimeo" Destroy any in-core buffers past the file truncation length for the locked vnode .Fa vp . The truncation length is specified by .Fa lbn . 
.Fn vtruncbuf will sleep while the I/O is performed. The .Xr ltsleep 9 flag and timeout are specified by the arguments .Fa slpflag and .Fa slptimeo respectively. If the operation is successful zero is returned, otherwise an appropriate error code is returned. .It Fn vprint "label" "vp" This function is used by the kernel to dump vnode information during a panic. It is only used if the kernel option DIAGNOSTIC is compiled into the kernel. The argument .Fa label is a string to prefix the information dump of vnode .Fa vp . .El .Sh CODE REFERENCES The vnode framework is implemented within the file .Pa sys/kern/vfs_subr.c . .Sh SEE ALSO .Xr intro 9 , .Xr lock 9 , .Xr namecache 9 , .Xr namei 9 , .Xr uvm 9 , .Xr vattr 9 , .Xr vfs 9 , .Xr vfsops 9 , .Xr vnodeops 9 , .Xr vnsubr 9 .Sh BUGS The locking protocol is inconsistent. Many vnode operations are passed locked vnodes on entry but release the lock before they exit. The locking protocol is used in some places to attempt to make a series of operations atomic (e.g., access check then operation). This does not work for non-local file systems that do not support locking (e.g., NFS). The .Nm interface would benefit from a simpler locking protocol.