NetBSD/sys/kern/kern_descrip.c

1829 lines
42 KiB
C
Raw Normal View History

2009-08-16 15:00:20 +04:00
/* $NetBSD: kern_descrip.c,v 1.199 2009/08/16 11:00:20 yamt Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
1994-05-17 08:21:49 +04:00
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
1994-05-17 08:21:49 +04:00
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
1994-05-17 08:21:49 +04:00
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
1998-03-01 05:20:01 +03:00
* @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
1994-05-17 08:21:49 +04:00
*/
/*
* File descriptor management.
*/
2001-11-12 18:25:01 +03:00
#include <sys/cdefs.h>
2009-08-16 15:00:20 +04:00
__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.199 2009/08/16 11:00:20 yamt Exp $");
2001-11-12 18:25:01 +03:00
1994-05-17 08:21:49 +04:00
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/pool.h>
#include <sys/unistd.h>
1994-05-17 08:21:49 +04:00
#include <sys/resourcevar.h>
1996-03-31 01:24:38 +03:00
#include <sys/conf.h>
#include <sys/event.h>
2006-05-15 01:15:11 +04:00
#include <sys/kauth.h>
2007-11-29 21:15:14 +03:00
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
static int file_ctor(void *, void *, int);
static void file_dtor(void *, void *);
static int fdfile_ctor(void *, void *, int);
static void fdfile_dtor(void *, void *);
static int filedesc_ctor(void *, void *, int);
static void filedesc_dtor(void *, void *);
static int filedescopen(dev_t, int, int, lwp_t *);
kmutex_t filelist_lock; /* lock on filehead */
2001-02-26 23:24:30 +03:00
struct filelist filehead; /* head of list of open files */
u_int nfiles; /* actual number of open files */
static pool_cache_t filedesc_cache;
static pool_cache_t file_cache;
static pool_cache_t fdfile_cache;
1994-05-17 08:21:49 +04:00
const struct cdevsw filedesc_cdevsw = {
filedescopen, noclose, noread, nowrite, noioctl,
nostop, notty, nopoll, nommap, nokqfilter, D_OTHER | D_MPSAFE,
};
/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
/*
* Initialize the descriptor system.
*/
void
fd_sys_init(void)
{
mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
KASSERT(file_cache != NULL);
fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
NULL);
KASSERT(fdfile_cache != NULL);
filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
NULL);
KASSERT(filedesc_cache != NULL);
}
static bool
fd_isused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
KASSERT(fd < fdp->fd_dt->dt_nfiles);
return (fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0;
}
/*
* Verify that the bitmaps match the descriptor table.
*/
static inline void
fd_checkmaps(filedesc_t *fdp)
{
#ifdef DEBUG
fdtab_t *dt;
u_int fd;
dt = fdp->fd_dt;
if (fdp->fd_refcnt == -1) {
/*
* fd_free tears down the table without maintaining its bitmap.
*/
return;
}
for (fd = 0; fd < dt->dt_nfiles; fd++) {
if (fd < NDFDFILE) {
KASSERT(dt->dt_ff[fd] ==
(fdfile_t *)fdp->fd_dfdfile[fd]);
}
if (dt->dt_ff[fd] == NULL) {
KASSERT(!fd_isused(fdp, fd));
} else if (dt->dt_ff[fd]->ff_file != NULL) {
KASSERT(fd_isused(fdp, fd));
}
}
#else /* DEBUG */
/* nothing */
#endif /* DEBUG */
}
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
int i, off, maxoff;
uint32_t sub;
KASSERT(mutex_owned(&fdp->fd_lock));
fd_checkmaps(fdp);
if (want > bits)
return -1;
off = want >> NDENTRYSHIFT;
i = want & NDENTRYMASK;
if (i) {
sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
if (sub != ~0)
goto found;
off++;
}
maxoff = NDLOSLOTS(bits);
while (off < maxoff) {
if ((sub = bitmap[off]) != ~0)
goto found;
off++;
}
return (-1);
found:
return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
static int
fd_last_set(filedesc_t *fd, int last)
{
int off, i;
fdfile_t **ff = fd->fd_dt->dt_ff;
uint32_t *bitmap = fd->fd_lomap;
KASSERT(mutex_owned(&fd->fd_lock));
fd_checkmaps(fd);
off = (last - 1) >> NDENTRYSHIFT;
while (off >= 0 && !bitmap[off])
off--;
if (off < 0)
return (-1);
2005-02-27 00:34:55 +03:00
i = ((off + 1) << NDENTRYSHIFT) - 1;
if (i >= last)
i = last - 1;
/* XXX should use bitmap */
while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
i--;
return (i);
}
static inline void
fd_used(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) == 0);
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(!ff->ff_allocated);
ff->ff_allocated = 1;
fdp->fd_lomap[off] |= 1 << (fd & NDENTRYMASK);
if (__predict_false(fdp->fd_lomap[off] == ~0)) {
KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
(1 << (off & NDENTRYMASK))) == 0);
fdp->fd_himap[off >> NDENTRYSHIFT] |= 1 << (off & NDENTRYMASK);
}
if ((int)fd > fdp->fd_lastfile) {
fdp->fd_lastfile = fd;
}
fd_checkmaps(fdp);
}
static inline void
fd_unused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
/*
* Don't assert the lock is held here, as we may be copying
* the table during exec() and it is not needed there.
* procfs and sysctl are locked out by proc::p_reflock.
*
* KASSERT(mutex_owned(&fdp->fd_lock));
*/
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(ff->ff_allocated);
if (fd < fdp->fd_freefile) {
fdp->fd_freefile = fd;
}
if (fdp->fd_lomap[off] == ~0) {
KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
(1 << (off & NDENTRYMASK))) != 0);
fdp->fd_himap[off >> NDENTRYSHIFT] &=
~(1 << (off & NDENTRYMASK));
}
KASSERT((fdp->fd_lomap[off] & (1 << (fd & NDENTRYMASK))) != 0);
fdp->fd_lomap[off] &= ~(1 << (fd & NDENTRYMASK));
ff->ff_allocated = 0;
KASSERT(fd <= fdp->fd_lastfile);
if (fd == fdp->fd_lastfile) {
fdp->fd_lastfile = fd_last_set(fdp, fd);
}
fd_checkmaps(fdp);
}
/*
* Look up the file structure corresponding to a file descriptor
* and return the file, holding a reference on the descriptor.
*/
inline file_t *
fd_getfile(unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
/*
* Look up the fdfile structure representing this descriptor.
* We are doing this unlocked. See fd_tryexpand().
*/
fdp = curlwp->l_fd;
dt = fdp->fd_dt;
if (__predict_false(fd >= dt->dt_nfiles)) {
return NULL;
}
ff = dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (__predict_false(ff == NULL)) {
return NULL;
}
/* Now get a reference to the descriptor. */
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
ff->ff_refcnt++;
} else {
/*
* Multi threaded: issue a memory barrier to ensure that we
* acquire the file pointer _after_ adding a reference. If
* no memory barrier, we could fetch a stale pointer.
*/
atomic_inc_uint(&ff->ff_refcnt);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_enter();
#endif
}
/*
* If the file is not open or is being closed then put the
* reference back.
*/
fp = ff->ff_file;
if (__predict_true(fp != NULL)) {
return fp;
}
fd_putfile(fd);
return NULL;
}
1994-05-17 08:21:49 +04:00
/*
* Release a reference to a file descriptor acquired with fd_getfile().
1994-05-17 08:21:49 +04:00
*/
void
fd_putfile(unsigned fd)
1994-05-17 08:21:49 +04:00
{
filedesc_t *fdp;
fdfile_t *ff;
u_int u, v;
2001-02-26 23:24:30 +03:00
fdp = curlwp->l_fd;
ff = fdp->fd_dt->dt_ff[fd];
1994-05-17 08:21:49 +04:00
KASSERT(fd < fdp->fd_dt->dt_nfiles);
KASSERT(ff != NULL);
KASSERT((ff->ff_refcnt & FR_MASK) > 0);
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
fd_close(fd);
return;
}
ff->ff_refcnt--;
return;
}
/*
* Ensure that any use of the file is complete and globally
* visible before dropping the final reference. If no membar,
* the current CPU could still access memory associated with
* the file after it has been freed or recycled by another
* CPU.
*/
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_exit();
#endif
/*
* Be optimistic and start out with the assumption that no other
* threads are trying to close the descriptor. If the CAS fails,
* we lost a race and/or it's being closed.
*/
for (u = ff->ff_refcnt & FR_MASK;; u = v) {
v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
if (__predict_true(u == v)) {
return;
}
if (__predict_false((v & FR_CLOSING) != 0)) {
break;
}
}
/* Another thread is waiting to close the file: join it. */
(void)fd_close(fd);
1994-05-17 08:21:49 +04:00
}
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a vnode.
1994-05-17 08:21:49 +04:00
*/
1996-02-04 05:15:01 +03:00
int
fd_getvnode(unsigned fd, file_t **fpp)
{
vnode_t *vp;
file_t *fp;
fp = fd_getfile(fd);
if (__predict_false(fp == NULL)) {
return EBADF;
}
if (__predict_false(fp->f_type != DTYPE_VNODE)) {
fd_putfile(fd);
return EINVAL;
}
vp = fp->f_data;
if (__predict_false(vp->v_type == VBAD)) {
/* XXX Is this case really necessary? */
fd_putfile(fd);
return EBADF;
1994-05-17 08:21:49 +04:00
}
*fpp = fp;
return 0;
1994-05-17 08:21:49 +04:00
}
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a socket.
*/
int
fd_getsock(unsigned fd, struct socket **sop)
{
file_t *fp;
fp = fd_getfile(fd);
if (__predict_false(fp == NULL)) {
return EBADF;
}
if (__predict_false(fp->f_type != DTYPE_SOCKET)) {
fd_putfile(fd);
return ENOTSOCK;
}
*sop = fp->f_data;
return 0;
}
/*
* Look up the file structure corresponding to a file descriptor
* and return it with a reference held on the file, not the
* descriptor.
*
* This is heavyweight and only used when accessing descriptors
* from a foreign process. The caller must ensure that `p' does
* not exit or fork across this call.
*
* To release the file (not descriptor) reference, use closef().
*/
file_t *
fd_getfile2(proc_t *p, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
fdp = p->p_fd;
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
if (fd >= dt->dt_nfiles) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((ff = dt->dt_ff[fd]) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((fp = ff->ff_file) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&fdp->fd_lock);
return fp;
}
1994-05-17 08:21:49 +04:00
/*
* Internal form of close. Must be called with a reference to the
* descriptor, and will drop the reference. When all descriptor
* references are dropped, releases the descriptor slot and a single
* reference to the file structure.
1994-05-17 08:21:49 +04:00
*/
1996-02-04 05:15:01 +03:00
int
fd_close(unsigned fd)
{
struct flock lf;
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
proc_t *p;
lwp_t *l;
u_int refcnt;
l = curlwp;
2003-01-18 13:06:22 +03:00
p = l->l_proc;
fdp = l->l_fd;
ff = fdp->fd_dt->dt_ff[fd];
1994-05-17 08:21:49 +04:00
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
2004-01-05 03:36:49 +03:00
mutex_enter(&fdp->fd_lock);
KASSERT((ff->ff_refcnt & FR_MASK) > 0);
if (__predict_false(ff->ff_file == NULL)) {
/*
* Another user of the file is already closing, and is
* waiting for other users of the file to drain. Release
* our reference, and wake up the closer.
*/
atomic_dec_uint(&ff->ff_refcnt);
cv_broadcast(&ff->ff_closing);
mutex_exit(&fdp->fd_lock);
2004-01-05 03:36:49 +03:00
/*
* An application error, so pretend that the descriptor
* was already closed. We can't safely wait for it to
* be closed without potentially deadlocking.
*/
1994-05-17 08:21:49 +04:00
return (EBADF);
}
KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
/*
* There may be multiple users of this file within the process.
* Notify existing and new users that the file is closing. This
* will prevent them from adding additional uses to this file
* while we are closing it.
*/
fp = ff->ff_file;
ff->ff_file = NULL;
ff->ff_exclose = false;
1994-05-17 08:21:49 +04:00
/*
* We expect the caller to hold a descriptor reference - drop it.
* The reference count may increase beyond zero at this point due
* to an erroneous descriptor reference by an application, but
* fd_getfile() will notice that the file is being closed and drop
* the reference again.
*/
if (fdp->fd_refcnt == 1) {
/* Single threaded. */
refcnt = --(ff->ff_refcnt);
} else {
/* Multi threaded. */
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_producer();
#endif
refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
}
if (__predict_false(refcnt != 0)) {
/*
* Wait for other references to drain. This is typically
* an application error - the descriptor is being closed
* while still in use.
*
*/
atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
/*
* Remove any knotes attached to the file. A knote
* attached to the descriptor can hold references on it.
*/
mutex_exit(&fdp->fd_lock);
if (!SLIST_EMPTY(&ff->ff_knlist)) {
knote_fdclose(fd);
}
/* Try to drain out descriptor references. */
(*fp->f_ops->fo_drain)(fp);
mutex_enter(&fdp->fd_lock);
/*
* We need to see the count drop to zero at least once,
* in order to ensure that all pre-existing references
* have been drained. New references past this point are
* of no interest.
*/
while ((ff->ff_refcnt & FR_MASK) != 0) {
cv_wait(&ff->ff_closing, &fdp->fd_lock);
}
atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
} else {
/* If no references, there must be no knotes. */
KASSERT(SLIST_EMPTY(&ff->ff_knlist));
1994-05-17 08:21:49 +04:00
}
/*
* POSIX record locking dictates that any close releases ALL
* locks owned by this process. This is handled by setting
* a flag in the unlock to free ONLY locks obeying POSIX
* semantics, and not to free BSD-style file locks.
* If the descriptor was in a message, POSIX-style locks
* aren't passed with the descriptor.
*/
if (__predict_false((p->p_flag & PK_ADVLOCK) != 0 &&
fp->f_type == DTYPE_VNODE)) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
mutex_exit(&fdp->fd_lock);
(void)VOP_ADVLOCK(fp->f_data, p, F_UNLCK, &lf, F_POSIX);
mutex_enter(&fdp->fd_lock);
}
/* Free descriptor slot. */
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
/* Now drop reference to the file itself. */
return closef(fp);
1994-05-17 08:21:49 +04:00
}
/*
* Duplicate a file descriptor.
*/
int
fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
{
proc_t *p;
int error;
p = curproc;
while ((error = fd_alloc(p, minfd, newp)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
curlwp->l_fd->fd_dt->dt_ff[*newp]->ff_exclose = exclose;
fd_affix(p, fp, *newp);
return 0;
}
/*
* dup2 operation.
*/
1996-02-04 05:15:01 +03:00
int
fd_dup2(file_t *fp, unsigned new)
{
filedesc_t *fdp;
fdfile_t *ff;
fdtab_t *dt;
1994-05-17 08:21:49 +04:00
fdp = curlwp->l_fd;
/*
* Ensure there are enough slots in the descriptor table,
* and allocate an fdfile_t up front in case we need it.
*/
while (new >= fdp->fd_dt->dt_nfiles) {
fd_tryexpand(curproc);
}
ff = pool_cache_get(fdfile_cache, PR_WAITOK);
/*
* If there is already a file open, close it. If the file is
* half open, wait for it to be constructed before closing it.
* XXX Potential for deadlock here?
*/
mutex_enter(&fdp->fd_lock);
while (fd_isused(fdp, new)) {
mutex_exit(&fdp->fd_lock);
if (fd_getfile(new) != NULL) {
(void)fd_close(new);
} else {
/*
* Crummy, but unlikely to happen.
* Can occur if we interrupt another
* thread while it is opening a file.
*/
kpause("dup2", false, 1, NULL);
}
mutex_enter(&fdp->fd_lock);
}
dt = fdp->fd_dt;
if (dt->dt_ff[new] == NULL) {
KASSERT(new >= NDFDFILE);
dt->dt_ff[new] = ff;
ff = NULL;
}
fd_used(fdp, new);
mutex_exit(&fdp->fd_lock);
/* Slot is now allocated. Insert copy of the file. */
fd_affix(curproc, fp, new);
if (ff != NULL) {
pool_cache_put(fdfile_cache, ff);
}
return 0;
1994-05-17 08:21:49 +04:00
}
/*
* Drop reference to a file structure.
1994-05-17 08:21:49 +04:00
*/
1996-02-04 05:15:01 +03:00
int
closef(file_t *fp)
{
struct flock lf;
int error;
/*
* Drop reference. If referenced elsewhere it's still open
* and we have nothing more to do.
*/
mutex_enter(&fp->f_lock);
KASSERT(fp->f_count > 0);
if (--fp->f_count > 0) {
mutex_exit(&fp->f_lock);
return 0;
}
KASSERT(fp->f_count == 0);
mutex_exit(&fp->f_lock);
/* We held the last reference - release locks, close and free. */
if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void)VOP_ADVLOCK(fp->f_data, fp, F_UNLCK, &lf, F_FLOCK);
}
if (fp->f_ops != NULL) {
error = (*fp->f_ops->fo_close)(fp);
} else {
error = 0;
}
KASSERT(fp->f_count == 0);
KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
return error;
1994-05-17 08:21:49 +04:00
}
/*
* Allocate a file descriptor for the process.
*/
1996-02-04 05:15:01 +03:00
int
fd_alloc(proc_t *p, int want, int *result)
1994-05-17 08:21:49 +04:00
{
filedesc_t *fdp;
int i, lim, last, error;
u_int off, new;
fdtab_t *dt;
KASSERT(p == curproc || p == &proc0);
2001-02-26 23:24:30 +03:00
fdp = p->p_fd;
1994-05-17 08:21:49 +04:00
/*
* Search for a free descriptor starting at the higher
* of want or fd_freefile.
1994-05-17 08:21:49 +04:00
*/
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
last = min(dt->dt_nfiles, lim);
for (;;) {
if ((i = want) < fdp->fd_freefile)
i = fdp->fd_freefile;
off = i >> NDENTRYSHIFT;
new = fd_next_zero(fdp, fdp->fd_himap, off,
(last + NDENTRIES - 1) >> NDENTRYSHIFT);
if (new == -1)
break;
i = fd_next_zero(fdp, &fdp->fd_lomap[new],
new > off ? 0 : i & NDENTRYMASK, NDENTRIES);
if (i == -1) {
2005-02-27 00:34:55 +03:00
/*
* Free file descriptor in this block was
* below want, try again with higher want.
*/
want = (new + 1) << NDENTRYSHIFT;
continue;
}
i += (new << NDENTRYSHIFT);
if (i >= last) {
break;
}
if (dt->dt_ff[i] == NULL) {
KASSERT(i >= NDFDFILE);
dt->dt_ff[i] = pool_cache_get(fdfile_cache, PR_WAITOK);
1994-05-17 08:21:49 +04:00
}
KASSERT(dt->dt_ff[i]->ff_refcnt == 0);
KASSERT(dt->dt_ff[i]->ff_file == NULL);
fd_used(fdp, i);
if (want <= fdp->fd_freefile) {
fdp->fd_freefile = i;
}
*result = i;
KASSERT(i >= NDFDFILE ||
dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
return 0;
}
1994-05-17 08:21:49 +04:00
/* No space in current array. Let the caller expand and retry. */
error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
mutex_exit(&fdp->fd_lock);
return error;
1994-05-17 08:21:49 +04:00
}
/*
* Allocate memory for a descriptor table.
*/
static fdtab_t *
fd_dtab_alloc(int n)
{
fdtab_t *dt;
size_t sz;
KASSERT(n > NDFILE);
sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
dt = kmem_alloc(sz, KM_SLEEP);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
dt->dt_nfiles = n;
dt->dt_link = NULL;
return dt;
}
/*
* Free a descriptor table, and all tables linked for deferred free.
*/
static void
fd_dtab_free(fdtab_t *dt)
{
fdtab_t *next;
size_t sz;
do {
next = dt->dt_link;
KASSERT(dt->dt_nfiles > NDFILE);
sz = sizeof(*dt) +
(dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
kmem_free(dt, sz);
dt = next;
} while (dt != NULL);
}
/*
* Allocate descriptor bitmap.
*/
static void
fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
{
uint8_t *ptr;
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
*lo = (uint32_t *)ptr;
*hi = (uint32_t *)(ptr + szlo);
}
/*
* Free descriptor bitmap.
*/
static void
fd_map_free(int n, uint32_t *lo, uint32_t *hi)
{
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
kmem_free(lo, szlo + szhi);
}
/*
* Expand a process' descriptor table.
*/
void
fd_tryexpand(proc_t *p)
{
filedesc_t *fdp;
int i, numfiles, oldnfiles;
fdtab_t *newdt, *dt;
uint32_t *newhimap, *newlomap;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
newhimap = NULL;
newlomap = NULL;
oldnfiles = fdp->fd_dt->dt_nfiles;
if (oldnfiles < NDEXTENT)
numfiles = NDEXTENT;
else
numfiles = 2 * oldnfiles;
newdt = fd_dtab_alloc(numfiles);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_alloc(numfiles, &newlomap, &newhimap);
}
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
if (dt->dt_nfiles != oldnfiles) {
/* fdp changed; caller must retry */
mutex_exit(&fdp->fd_lock);
fd_dtab_free(newdt);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_free(numfiles, newlomap, newhimap);
}
return;
}
/* Copy the existing descriptor table and zero the new portion. */
i = sizeof(fdfile_t *) * oldnfiles;
memcpy(newdt->dt_ff, dt->dt_ff, i);
2009-05-29 02:17:04 +04:00
memset((uint8_t *)newdt->dt_ff + i, 0,
numfiles * sizeof(fdfile_t *) - i);
/*
* Link old descriptor array into list to be discarded. We defer
* freeing until the last reference to the descriptor table goes
* away (usually process exit). This allows us to do lockless
* lookups in fd_getfile().
*/
if (oldnfiles > NDFILE) {
if (fdp->fd_refcnt > 1) {
newdt->dt_link = dt;
} else {
fd_dtab_free(dt);
}
}
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newhimap, fdp->fd_himap, i);
memset((uint8_t *)newhimap + i, 0,
NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newlomap, fdp->fd_lomap, i);
memset((uint8_t *)newlomap + i, 0,
NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
}
fdp->fd_himap = newhimap;
fdp->fd_lomap = newlomap;
}
/*
* All other modifications must become globally visible before
* the change to fd_dt. See fd_getfile().
*/
membar_producer();
fdp->fd_dt = newdt;
KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
}
1994-05-17 08:21:49 +04:00
/*
* Create a new open file structure and allocate a file descriptor
* for the current process.
1994-05-17 08:21:49 +04:00
*/
1996-02-04 05:15:01 +03:00
int
fd_allocfile(file_t **resultfp, int *resultfd)
1994-05-17 08:21:49 +04:00
{
kauth_cred_t cred;
file_t *fp;
proc_t *p;
int error;
1994-05-17 08:21:49 +04:00
p = curproc;
while ((error = fd_alloc(p, 0, resultfd)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
fp = pool_cache_get(file_cache, PR_WAITOK);
if (fp == NULL) {
return ENFILE;
}
KASSERT(fp->f_count == 0);
completely rework the way that orphaned sockets that are being fdpassed via SCM_RIGHTS messages are dealt with: 1. unp_gc: make this a kthread. 2. unp_detach: go not call unp_gc directly. instead, wake up unp_gc kthread. 3. unp_scan: do not close files here. instead, put them on a global list for unp_gc to close, along with a per-file "deferred close count". if file is already enqueued for close, just increment deferred close count. this eliminates the recursive calls. 3. unp_gc: scan files on global deferred close list. close each file N times, as specified by deferred close count in file. continue processing list until it becomes empty (closing may cause additional files to be queued for close). 4. unp_gc: add additional bit to mark files we are scanning. set during initial scan of global file list that currently clears FMARK/FDEFER. during later scans, never examine / garbage collect descriptors that we have not marked during the earlier scan. do not proceed with this initial scan until all deferred closes have been processed. be careful with locking to ensure no races are introduced between deferred close and file scan. 5. unp_gc: use dummy file_t to mark position in list when scanning. allow us to drop filelist_lock. in turn allows us to eliminate kmem_alloc() and safely close files, etc. 6. prohibit transfer of descriptors within SCM_RIGHTS messages if (num_files_in_transit > maxfiles / unp_rights_ratio) 7. fd_allocfile: ensure recycled filse don't get scanned. this is 97% work done by andrew doran, with a couple of minor bug fixes and a lot of testing by yours truly.
2009-03-11 09:05:29 +03:00
KASSERT(fp->f_msgcount == 0);
KASSERT(fp->f_unpcount == 0);
/* Replace cached credentials if not what we need. */
cred = curlwp->l_cred;
if (__predict_false(cred != fp->f_cred)) {
kauth_cred_free(fp->f_cred);
kauth_cred_hold(cred);
fp->f_cred = cred;
1994-05-17 08:21:49 +04:00
}
completely rework the way that orphaned sockets that are being fdpassed via SCM_RIGHTS messages are dealt with: 1. unp_gc: make this a kthread. 2. unp_detach: go not call unp_gc directly. instead, wake up unp_gc kthread. 3. unp_scan: do not close files here. instead, put them on a global list for unp_gc to close, along with a per-file "deferred close count". if file is already enqueued for close, just increment deferred close count. this eliminates the recursive calls. 3. unp_gc: scan files on global deferred close list. close each file N times, as specified by deferred close count in file. continue processing list until it becomes empty (closing may cause additional files to be queued for close). 4. unp_gc: add additional bit to mark files we are scanning. set during initial scan of global file list that currently clears FMARK/FDEFER. during later scans, never examine / garbage collect descriptors that we have not marked during the earlier scan. do not proceed with this initial scan until all deferred closes have been processed. be careful with locking to ensure no races are introduced between deferred close and file scan. 5. unp_gc: use dummy file_t to mark position in list when scanning. allow us to drop filelist_lock. in turn allows us to eliminate kmem_alloc() and safely close files, etc. 6. prohibit transfer of descriptors within SCM_RIGHTS messages if (num_files_in_transit > maxfiles / unp_rights_ratio) 7. fd_allocfile: ensure recycled filse don't get scanned. this is 97% work done by andrew doran, with a couple of minor bug fixes and a lot of testing by yours truly.
2009-03-11 09:05:29 +03:00
/*
* Don't allow recycled files to be scanned.
* See uipc_usrreq.c.
completely rework the way that orphaned sockets that are being fdpassed via SCM_RIGHTS messages are dealt with: 1. unp_gc: make this a kthread. 2. unp_detach: go not call unp_gc directly. instead, wake up unp_gc kthread. 3. unp_scan: do not close files here. instead, put them on a global list for unp_gc to close, along with a per-file "deferred close count". if file is already enqueued for close, just increment deferred close count. this eliminates the recursive calls. 3. unp_gc: scan files on global deferred close list. close each file N times, as specified by deferred close count in file. continue processing list until it becomes empty (closing may cause additional files to be queued for close). 4. unp_gc: add additional bit to mark files we are scanning. set during initial scan of global file list that currently clears FMARK/FDEFER. during later scans, never examine / garbage collect descriptors that we have not marked during the earlier scan. do not proceed with this initial scan until all deferred closes have been processed. be careful with locking to ensure no races are introduced between deferred close and file scan. 5. unp_gc: use dummy file_t to mark position in list when scanning. allow us to drop filelist_lock. in turn allows us to eliminate kmem_alloc() and safely close files, etc. 6. prohibit transfer of descriptors within SCM_RIGHTS messages if (num_files_in_transit > maxfiles / unp_rights_ratio) 7. fd_allocfile: ensure recycled filse don't get scanned. this is 97% work done by andrew doran, with a couple of minor bug fixes and a lot of testing by yours truly.
2009-03-11 09:05:29 +03:00
*/
if (__predict_false((fp->f_flag & FSCAN) != 0)) {
completely rework the way that orphaned sockets that are being fdpassed via SCM_RIGHTS messages are dealt with: 1. unp_gc: make this a kthread. 2. unp_detach: go not call unp_gc directly. instead, wake up unp_gc kthread. 3. unp_scan: do not close files here. instead, put them on a global list for unp_gc to close, along with a per-file "deferred close count". if file is already enqueued for close, just increment deferred close count. this eliminates the recursive calls. 3. unp_gc: scan files on global deferred close list. close each file N times, as specified by deferred close count in file. continue processing list until it becomes empty (closing may cause additional files to be queued for close). 4. unp_gc: add additional bit to mark files we are scanning. set during initial scan of global file list that currently clears FMARK/FDEFER. during later scans, never examine / garbage collect descriptors that we have not marked during the earlier scan. do not proceed with this initial scan until all deferred closes have been processed. be careful with locking to ensure no races are introduced between deferred close and file scan. 5. unp_gc: use dummy file_t to mark position in list when scanning. allow us to drop filelist_lock. in turn allows us to eliminate kmem_alloc() and safely close files, etc. 6. prohibit transfer of descriptors within SCM_RIGHTS messages if (num_files_in_transit > maxfiles / unp_rights_ratio) 7. fd_allocfile: ensure recycled filse don't get scanned. this is 97% work done by andrew doran, with a couple of minor bug fixes and a lot of testing by yours truly.
2009-03-11 09:05:29 +03:00
mutex_enter(&fp->f_lock);
atomic_and_uint(&fp->f_flag, ~FSCAN);
mutex_exit(&fp->f_lock);
}
fp->f_advice = 0;
fp->f_offset = 0;
*resultfp = fp;
return 0;
}
/*
* Successful creation of a new descriptor: make visible to the process.
*/
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
fdfile_t *ff;
filedesc_t *fdp;
KASSERT(p == curproc || p == &proc0);
/* Add a reference to the file structure. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
1994-05-17 08:21:49 +04:00
/*
* Insert the new file into the descriptor slot.
*
* The memory barriers provided by lock activity in this routine
* ensure that any updates to the file structure become globally
* visible before the file becomes visible to other LWPs in the
* current process.
1994-05-17 08:21:49 +04:00
*/
fdp = p->p_fd;
ff = fdp->fd_dt->dt_ff[fd];
KASSERT(ff != NULL);
KASSERT(ff->ff_file == NULL);
KASSERT(ff->ff_allocated);
KASSERT(fd_isused(fdp, fd));
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
/* No need to lock in order to make file initially visible. */
ff->ff_file = fp;
}
/*
* Abort creation of a new descriptor: free descriptor slot and file.
*/
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
ff = fdp->fd_dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
mutex_enter(&fdp->fd_lock);
KASSERT(fd_isused(fdp, fd));
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
if (fp != NULL) {
KASSERT(fp->f_count == 0);
KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
}
1994-05-17 08:21:49 +04:00
}
static int
file_ctor(void *arg, void *obj, int flags)
{
file_t *fp = obj;
memset(fp, 0, sizeof(*fp));
mutex_enter(&filelist_lock);
if (__predict_false(nfiles >= maxfiles)) {
mutex_exit(&filelist_lock);
tablefull("file", "increase kern.maxfiles or MAXFILES");
return ENFILE;
}
nfiles++;
LIST_INSERT_HEAD(&filehead, fp, f_list);
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
fp->f_cred = curlwp->l_cred;
kauth_cred_hold(fp->f_cred);
mutex_exit(&filelist_lock);
return 0;
}
static void
file_dtor(void *arg, void *obj)
{
file_t *fp = obj;
mutex_enter(&filelist_lock);
nfiles--;
LIST_REMOVE(fp, f_list);
mutex_exit(&filelist_lock);
kauth_cred_free(fp->f_cred);
mutex_destroy(&fp->f_lock);
}
static int
fdfile_ctor(void *arg, void *obj, int flags)
{
fdfile_t *ff = obj;
memset(ff, 0, sizeof(*ff));
cv_init(&ff->ff_closing, "fdclose");
return 0;
}
static void
fdfile_dtor(void *arg, void *obj)
{
fdfile_t *ff = obj;
cv_destroy(&ff->ff_closing);
}
file_t *
fgetdummy(void)
{
file_t *fp;
fp = kmem_alloc(sizeof(*fp), KM_SLEEP);
if (fp != NULL) {
memset(fp, 0, sizeof(*fp));
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
}
return fp;
}
void
fputdummy(file_t *fp)
{
mutex_destroy(&fp->f_lock);
kmem_free(fp, sizeof(*fp));
}
/*
* Create an initial filedesc structure.
*/
filedesc_t *
fd_init(filedesc_t *fdp)
{
#ifdef DIAGNOSTIC
unsigned fd;
#endif
if (__predict_true(fdp == NULL)) {
fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
} else {
/* XXXRUMP KASSERT(fdp == &filedesc0); */
filedesc_ctor(NULL, fdp, PR_WAITOK);
}
#ifdef DIAGNOSTIC
KASSERT(fdp->fd_lastfile == -1);
KASSERT(fdp->fd_lastkqfile == -1);
KASSERT(fdp->fd_knhash == NULL);
KASSERT(fdp->fd_freefile == 0);
KASSERT(fdp->fd_exclose == false);
KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (fd = 0; fd < NDFDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
(fdfile_t *)fdp->fd_dfdfile[fd]);
}
for (fd = NDFDFILE; fd < NDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
}
KASSERT(fdp->fd_himap == fdp->fd_dhimap);
KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
#endif /* DIAGNOSTIC */
fdp->fd_refcnt = 1;
2009-08-16 15:00:20 +04:00
fd_checkmaps(fdp);
return fdp;
}
/*
* Initialize a file descriptor table.
*/
static int
filedesc_ctor(void *arg, void *obj, int flag)
{
filedesc_t *fdp = obj;
fdfile_t **ffp;
int i;
memset(fdp, 0, sizeof(*fdp));
mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
fdp->fd_lastfile = -1;
fdp->fd_lastkqfile = -1;
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
*ffp = (fdfile_t *)fdp->fd_dfdfile[i];
(void)fdfile_ctor(NULL, fdp->fd_dfdfile[i], PR_WAITOK);
}
return 0;
}
static void
filedesc_dtor(void *arg, void *obj)
{
filedesc_t *fdp = obj;
int i;
for (i = 0; i < NDFDFILE; i++) {
fdfile_dtor(NULL, fdp->fd_dfdfile[i]);
}
mutex_destroy(&fdp->fd_lock);
}
/*
* Make p2 share p1's filedesc structure.
*/
void
fd_share(struct proc *p2)
{
filedesc_t *fdp;
fdp = curlwp->l_fd;
p2->p_fd = fdp;
atomic_inc_uint(&fdp->fd_refcnt);
}
/*
* Acquire a hold on a filedesc structure.
*/
void
fd_hold(void)
{
atomic_inc_uint(&curlwp->l_fd->fd_refcnt);
}
1994-05-17 08:21:49 +04:00
/*
* Copy a filedesc structure.
*/
filedesc_t *
fd_copy(void)
1994-05-17 08:21:49 +04:00
{
filedesc_t *newfdp, *fdp;
fdfile_t *ff, **ffp, **nffp, *ff2;
int i, j, numfiles, lastfile, newlast;
file_t *fp;
fdtab_t *newdt;
1994-05-17 08:21:49 +04:00
fdp = curproc->p_fd;
newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
1994-05-17 08:21:49 +04:00
newfdp->fd_refcnt = 1;
#ifdef DIAGNOSTIC
KASSERT(newfdp->fd_lastfile == -1);
KASSERT(newfdp->fd_lastkqfile == -1);
KASSERT(newfdp->fd_knhash == NULL);
KASSERT(newfdp->fd_freefile == 0);
KASSERT(newfdp->fd_exclose == false);
KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (i = 0; i < NDFDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
(fdfile_t *)&newfdp->fd_dfdfile[i]);
}
for (i = NDFDFILE; i < NDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
}
#endif /* DIAGNOSTIC */
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
numfiles = fdp->fd_dt->dt_nfiles;
lastfile = fdp->fd_lastfile;
1994-05-17 08:21:49 +04:00
/*
* If the number of open files fits in the internal arrays
* of the open file structure, use them, otherwise allocate
* additional memory for the number of descriptors currently
* in use.
*/
if (lastfile < NDFILE) {
i = NDFILE;
newdt = newfdp->fd_dt;
KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
} else {
1994-05-17 08:21:49 +04:00
/*
* Compute the smallest multiple of NDEXTENT needed
* for the file descriptors currently in use,
* allowing the table to shrink.
1994-05-17 08:21:49 +04:00
*/
i = numfiles;
while (i >= 2 * NDEXTENT && i > lastfile * 2) {
i /= 2;
}
KASSERT(i > NDFILE);
newdt = fd_dtab_alloc(i);
newfdp->fd_dt = newdt;
memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
NDFDFILE * sizeof(fdfile_t **));
memset(newdt->dt_ff + NDFDFILE, 0,
(i - NDFDFILE) * sizeof(fdfile_t **));
}
if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
newfdp->fd_himap = newfdp->fd_dhimap;
newfdp->fd_lomap = newfdp->fd_dlomap;
} else {
fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
KASSERT(i >= NDENTRIES * NDENTRIES);
memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
}
newfdp->fd_freefile = fdp->fd_freefile;
newfdp->fd_exclose = fdp->fd_exclose;
ffp = fdp->fd_dt->dt_ff;
nffp = newdt->dt_ff;
newlast = -1;
for (i = 0; i <= (int)lastfile; i++, ffp++, nffp++) {
KASSERT(i >= NDFDFILE ||
*nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
ff = *ffp;
if (ff == NULL || (fp = ff->ff_file) == NULL) {
/* Descriptor unused, or descriptor half open. */
KASSERT(!fd_isused(newfdp, i));
continue;
}
if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
/* kqueue descriptors cannot be copied. */
if (i < newfdp->fd_freefile)
newfdp->fd_freefile = i;
continue;
}
/* It's active: add a reference to the file. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
/* Allocate an fdfile_t to represent it. */
if (i >= NDFDFILE) {
ff2 = pool_cache_get(fdfile_cache, PR_WAITOK);
*nffp = ff2;
} else {
ff2 = newdt->dt_ff[i];
}
ff2->ff_file = fp;
ff2->ff_exclose = ff->ff_exclose;
ff2->ff_allocated = true;
/* Fix up bitmaps. */
j = i >> NDENTRYSHIFT;
KASSERT((newfdp->fd_lomap[j] & (1 << (i & NDENTRYMASK))) == 0);
newfdp->fd_lomap[j] |= 1 << (i & NDENTRYMASK);
if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
(1 << (j & NDENTRYMASK))) == 0);
newfdp->fd_himap[j >> NDENTRYSHIFT] |=
1 << (j & NDENTRYMASK);
}
newlast = i;
}
KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
newfdp->fd_lastfile = newlast;
fd_checkmaps(newfdp);
mutex_exit(&fdp->fd_lock);
1994-05-17 08:21:49 +04:00
return (newfdp);
}
/*
* Release a filedesc structure.
*/
void
fd_free(void)
1994-05-17 08:21:49 +04:00
{
fdfile_t *ff;
file_t *fp;
int fd, nf;
fdtab_t *dt;
lwp_t * const l = curlwp;
filedesc_t * const fdp = l->l_fd;
const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
1994-05-17 08:21:49 +04:00
KASSERT(fdp->fd_dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_exit();
#endif
if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
return;
/*
* Close any files that the process holds open.
*/
dt = fdp->fd_dt;
fd_checkmaps(fdp);
#ifdef DEBUG
fdp->fd_refcnt = -1; /* see fd_checkmaps */
#endif
for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
ff = dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE ||
ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (ff == NULL)
continue;
if ((fp = ff->ff_file) != NULL) {
/*
* Must use fd_close() here if there is
* a reference from kqueue or we might have posix
* advisory locks.
*/
if (__predict_true(ff->ff_refcnt == 0) &&
(noadvlock || fp->f_type != DTYPE_VNODE)) {
ff->ff_file = NULL;
ff->ff_exclose = false;
ff->ff_allocated = false;
closef(fp);
} else {
ff->ff_refcnt++;
fd_close(fd);
}
}
KASSERT(ff->ff_refcnt == 0);
KASSERT(ff->ff_file == NULL);
KASSERT(!ff->ff_exclose);
KASSERT(!ff->ff_allocated);
if (fd >= NDFDFILE) {
pool_cache_put(fdfile_cache, ff);
dt->dt_ff[fd] = NULL;
}
}
/*
* Clean out the descriptor table for the next user and return
* to the cache.
*/
if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
fd_dtab_free(fdp->fd_dt);
/* Otherwise, done above. */
memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
(NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
fdp->fd_dt = &fdp->fd_dtbuiltin;
}
if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
KASSERT(fdp->fd_himap != fdp->fd_dhimap);
KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
2005-11-30 01:52:02 +03:00
}
if (__predict_false(fdp->fd_knhash != NULL)) {
hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
fdp->fd_knhash = NULL;
fdp->fd_knhashmask = 0;
} else {
KASSERT(fdp->fd_knhashmask == 0);
2005-11-30 01:52:02 +03:00
}
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_lastkqfile = -1;
fdp->fd_lastfile = -1;
fdp->fd_freefile = 0;
fdp->fd_exclose = false;
memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
offsetof(filedesc_t, fd_startzero));
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
#ifdef DEBUG
fdp->fd_refcnt = 0; /* see fd_checkmaps */
#endif
fd_checkmaps(fdp);
pool_cache_put(filedesc_cache, fdp);
}
1994-05-17 08:21:49 +04:00
/*
* File Descriptor pseudo-device driver (/dev/fd/).
*
* Opening minor device N dup()s the file (if any) connected to file
* descriptor N belonging to the calling process. Note that this driver
* consists of only the ``open()'' routine, because all subsequent
* references to this file will be direct to the other driver.
*/
static int
filedescopen(dev_t dev, int mode, int type, lwp_t *l)
1994-05-17 08:21:49 +04:00
{
1994-12-14 21:40:27 +03:00
/*
* XXX Kludge: set dupfd to contain the value of the
2002-04-28 01:31:41 +04:00
* the file descriptor being sought for duplication. The error
1994-12-14 21:40:27 +03:00
* return ensures that the vnode for this device will be released
* by vn_open. Open will detect this special error and take the
* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
* will simply report the error.
*/
2005-12-11 15:16:03 +03:00
l->l_dupfd = minor(dev); /* XXX */
return EDUPFD;
}
1994-12-14 21:40:27 +03:00
/*
* Duplicate the specified descriptor to a free descriptor.
*/
int
fd_dupopen(int old, int *new, int mode, int error)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
if ((fp = fd_getfile(old)) == NULL) {
return EBADF;
}
fdp = curlwp->l_fd;
dt = fdp->fd_dt;
ff = dt->dt_ff[old];
/*
1994-12-14 21:40:27 +03:00
* There are two cases of interest here.
*
* For EDUPFD simply dup (dfd) to file descriptor
1994-12-14 21:40:27 +03:00
* (indx) and return.
*
* For EMOVEFD steal away the file structure from (dfd) and
1994-12-14 21:40:27 +03:00
* store it in (indx). (dfd) is effectively closed by
* this operation.
*
* Any other error code is just returned.
*/
1994-12-14 21:40:27 +03:00
switch (error) {
case EDUPFD:
1994-12-14 21:40:27 +03:00
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
error = EACCES;
break;
}
/* Copy it. */
error = fd_dup(fp, 0, new, ff->ff_exclose);
break;
1994-05-17 08:21:49 +04:00
case EMOVEFD:
/* Copy it. */
error = fd_dup(fp, 0, new, ff->ff_exclose);
if (error != 0) {
break;
}
/* Steal away the file pointer from 'old'. */
(void)fd_close(old);
return 0;
1994-12-14 21:40:27 +03:00
}
fd_putfile(old);
return error;
}
1994-05-17 08:21:49 +04:00
/*
* Sets descriptor owner. If the owner is a process, 'pgid'
* is set to positive value, process ID. If the owner is process group,
* 'pgid' is set to -pg_id.
*/
int
fsetown(pid_t *pgid, u_long cmd, const void *data)
{
int id = *(const int *)data;
int error;
switch (cmd) {
case TIOCSPGRP:
if (id < 0)
return (EINVAL);
id = -id;
break;
default:
break;
}
if (id > 0 && !pfind(id))
return (ESRCH);
else if (id < 0 && (error = pgid_in_session(curproc, -id)))
return (error);
*pgid = id;
return (0);
}
/*
* Return descriptor owner information. If the value is positive,
* it's process ID. If it's negative, it's process group ID and
* needs the sign removed before use.
*/
int
fgetown(pid_t pgid, u_long cmd, void *data)
{
switch (cmd) {
case TIOCGPGRP:
*(int *)data = -pgid;
break;
default:
*(int *)data = pgid;
break;
}
return (0);
}
/*
* Send signal to descriptor owner, either process or process group.
*/
void
fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
{
2005-02-27 00:34:55 +03:00
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
if (pgid == 0) {
return;
}
KSI_INIT(&ksi);
ksi.ksi_signo = signo;
ksi.ksi_code = code;
ksi.ksi_band = band;
mutex_enter(proc_lock);
if (pgid > 0) {
struct proc *p1;
p1 = p_find(pgid, PFIND_LOCKED);
if (p1 != NULL) {
kpsignal(p1, &ksi, fdescdata);
}
} else {
struct pgrp *pgrp;
KASSERT(pgid < 0);
pgrp = pg_find(-pgid, PFIND_LOCKED);
if (pgrp != NULL) {
kpgsignal(pgrp, &ksi, fdescdata, 0);
}
}
mutex_exit(proc_lock);
}
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
void *data)
{
2005-02-13 02:14:03 +03:00
fp->f_flag = flag;
fp->f_type = DTYPE_MISC;
fp->f_ops = fops;
fp->f_data = data;
curlwp->l_dupfd = fd;
fd_affix(curproc, fp, fd);
return EMOVEFD;
}
int
fnullop_fcntl(file_t *fp, u_int cmd, void *data)
{
if (cmd == F_SETFL)
return 0;
return EOPNOTSUPP;
}
int
fnullop_poll(file_t *fp, int which)
{
return 0;
}
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{
return 0;
}
void
fnullop_drain(file_t *fp)
{
}
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{
return EOPNOTSUPP;
}
int
fbadop_stat(file_t *fp, struct stat *sb)
{
return EOPNOTSUPP;
}
int
fbadop_close(file_t *fp)
{
return EOPNOTSUPP;
}