Implement direct select/poll support, currently effective for the socket and
pipe subsystems.  This avoids the overhead of a second selscan() on wake-up,
and thus improves performance on certain workloads (especially when polling
on many file descriptors).  Also, clean up the sys/fd_set.h header and
improve its macros.

Welcome to 5.99.36!
This commit is contained in:
rmind 2010-07-08 12:23:31 +00:00
parent 2bb038d66a
commit ce9dfd6a44
6 changed files with 173 additions and 70 deletions

View File

@ -1,11 +1,11 @@
/* $NetBSD: sys_select.c,v 1.22 2010/04/25 15:55:24 ad Exp $ */
/* $NetBSD: sys_select.c,v 1.23 2010/07/08 12:23:31 rmind Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
* by Andrew Doran and Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -84,7 +84,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.22 2010/04/25 15:55:24 ad Exp $");
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.23 2010/07/08 12:23:31 rmind Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@ -109,7 +109,12 @@ __KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.22 2010/04/25 15:55:24 ad Exp $");
/* Flags for lwp::l_selflag. */
#define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
#define SEL_SCANNING 1 /* polling descriptors */
#define SEL_BLOCKING 2 /* about to block on select_cv */
#define SEL_BLOCKING 2 /* blocking and waiting for event */
#define SEL_EVENT 3 /* interrupted, events set directly */
/* Operations: either select() or poll(). */
#define SELOP_SELECT 1
#define SELOP_POLL 2
/*
* Per-cluster state for select()/poll(). For a system with fewer
@ -125,10 +130,16 @@ typedef struct selcluster {
uint32_t sc_mask;
} selcluster_t;
static inline int selscan(char *, u_int, register_t *);
static inline int pollscan(struct pollfd *, u_int, register_t *);
static inline int selscan(char *, const int, const size_t, register_t *);
static inline int pollscan(struct pollfd *, const int, register_t *);
static void selclear(void);
/*
 * Poll event masks tested for each of the three select() bit-fields.
 * The index is the set number used by selscan() and sel_setevents():
 * 0 and 1 test for normal read/write readiness (plus hang-up and error),
 * 2 tests for POLLRDBAND.
 */
static const int sel_flag[] = {
POLLRDNORM | POLLHUP | POLLERR,
POLLWRNORM | POLLHUP | POLLERR,
POLLRDBAND
};
static syncobj_t select_sobj = {
SOBJ_SLEEPQ_FIFO,
sleepq_unsleep,
@ -137,7 +148,7 @@ static syncobj_t select_sobj = {
syncobj_noowner,
};
static selcluster_t *selcluster[SELCLUSTERS];
static selcluster_t *selcluster[SELCLUSTERS] __read_mostly;
/*
* Select system call.
@ -206,8 +217,8 @@ sys___select50(struct lwp *l, const struct sys___select50_args *uap,
* sel_do_scan: common code to perform the scan on descriptors.
*/
static int
sel_do_scan(void *fds, u_int nfds, struct timespec *ts, sigset_t *mask,
register_t *retval, int selpoll)
sel_do_scan(const int op, void *fds, const int nf, const size_t ni,
struct timespec *ts, sigset_t *mask, register_t *retval)
{
lwp_t * const l = curlwp;
proc_t * const p = l->l_proc;
@ -237,6 +248,14 @@ sel_do_scan(void *fds, u_int nfds, struct timespec *ts, sigset_t *mask,
lock = sc->sc_lock;
l->l_selcluster = sc;
SLIST_INIT(&l->l_selwait);
l->l_selret = 0;
if (op == SELOP_SELECT) {
l->l_selbits = (char *)fds + ni * 3;
l->l_selni = ni;
} else {
l->l_selbits = NULL;
}
for (;;) {
int ncoll;
@ -250,28 +269,51 @@ sel_do_scan(void *fds, u_int nfds, struct timespec *ts, sigset_t *mask,
l->l_selflag = SEL_SCANNING;
ncoll = sc->sc_ncoll;
if (selpoll) {
error = selscan((char *)fds, nfds, retval);
if (op == SELOP_SELECT) {
error = selscan((char *)fds, nf, ni, retval);
} else {
error = pollscan((struct pollfd *)fds, nfds, retval);
error = pollscan((struct pollfd *)fds, nf, retval);
}
if (error || *retval)
break;
if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
break;
/*
* Acquire the lock and perform the (re)checks. Note, if a
* collision has occurred, then our state does not matter,
* as we must perform a re-scan. Therefore, check it first.
*/
state_check:
mutex_spin_enter(lock);
if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
if (__predict_false(sc->sc_ncoll != ncoll)) {
/* Collision: perform re-scan. */
mutex_spin_exit(lock);
continue;
}
if (__predict_true(l->l_selflag == SEL_EVENT)) {
/* Events occurred, they are set directly. */
mutex_spin_exit(lock);
KASSERT(l->l_selret != 0);
*retval = l->l_selret;
break;
}
if (__predict_true(l->l_selflag == SEL_RESET)) {
/* Events occurred, but a re-scan is requested. */
mutex_spin_exit(lock);
continue;
}
KASSERT(l->l_selflag == SEL_SCANNING);
/* Nothing happened, therefore sleep. */
l->l_selflag = SEL_BLOCKING;
l->l_kpriority = true;
sleepq_enter(&sc->sc_sleepq, l, lock);
sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
error = sleepq_block(timo, true);
if (error != 0)
if (error != 0) {
break;
}
/* Awoken: need to check the state. */
goto state_check;
}
selclear();
@ -326,7 +368,7 @@ selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
getbits(ex, 2);
#undef getbits
error = sel_do_scan(bits, nd, ts, mask, retval, 1);
error = sel_do_scan(SELOP_SELECT, bits, nd, ni, ts, mask, retval);
if (error == 0 && u_in != NULL)
error = copyout(bits + ni * 3, u_in, ni);
if (error == 0 && u_ou != NULL)
@ -340,30 +382,32 @@ selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
}
static inline int
selscan(char *bits, u_int nfd, register_t *retval)
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
POLLWRNORM | POLLHUP | POLLERR,
POLLRDBAND };
fd_mask *ibitp, *obitp;
int msk, i, j, fd, ni, n;
fd_mask ibits, obits;
int msk, i, j, fd, n;
file_t *fp;
ni = howmany(nfd, NFDBITS) * sizeof(fd_mask);
ibitp = (fd_mask *)(bits + ni * 0);
obitp = (fd_mask *)(bits + ni * 3);
n = 0;
for (msk = 0; msk < 3; msk++) {
for (i = 0; i < nfd; i += NFDBITS) {
fd_mask ibits, obits;
ibits = *ibitp++;
obits = 0;
while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
ibits &= ~(1 << j);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
/*
* Setup an argument to selrecord(), which is
* a file descriptor number.
*/
curlwp->l_selrec = fd;
if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
obits |= (1 << j);
n++;
}
@ -464,7 +508,7 @@ pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
if (error)
goto fail;
error = sel_do_scan(fds, nfds, ts, mask, retval, 0);
error = sel_do_scan(SELOP_POLL, fds, nfds, ni, ts, mask, retval);
if (error == 0)
error = copyout(fds, u_fds, ni);
fail:
@ -474,12 +518,11 @@ pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
}
static inline int
pollscan(struct pollfd *fds, u_int nfd, register_t *retval)
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
int i, n;
file_t *fp;
int i, n = 0;
n = 0;
for (i = 0; i < nfd; i++, fds++) {
if (fds->fd < 0) {
fds->revents = 0;
@ -487,6 +530,12 @@ pollscan(struct pollfd *fds, u_int nfd, register_t *retval)
fds->revents = POLLNVAL;
n++;
} else {
/*
* Perform poll: registers select request or returns
* the events which are set. Setup an argument for
* selrecord(), which is a pointer to struct pollfd.
*/
curlwp->l_selrec = (uintptr_t)fds;
fds->revents = (*fp->f_ops->fo_poll)(fp,
fds->events | POLLERR | POLLHUP);
if (fds->revents != 0)
@ -498,7 +547,6 @@ pollscan(struct pollfd *fds, u_int nfd, register_t *retval)
return (0);
}
/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{
@ -539,27 +587,72 @@ selrecord(lwp_t *selector, struct selinfo *sip)
other = sip->sel_lwp;
if (other == selector) {
/* `selector' has already claimed it. */
/* 1. We (selector) already claimed to be the first LWP. */
KASSERT(sip->sel_cluster = sc);
} else if (other == NULL) {
/*
* First named waiter, although there may be unnamed
* waiters (collisions). Issue a memory barrier to
* ensure that we access sel_lwp (above) before other
* fields - this guards against a call to selclear().
* 2. No first LWP, therefore we (selector) are the first.
*
* There may be unnamed waiters (collisions). Issue a memory
* barrier to ensure that we access sel_lwp (above) before
* other fields - this guards against a call to selclear().
*/
membar_enter();
sip->sel_lwp = selector;
SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
/* Copy the argument, which is for selnotify(). */
sip->sel_fdinfo = selector->l_selrec;
/* Replace selinfo's lock with the chosen cluster's lock. */
sip->sel_cluster = sc;
} else {
/* Multiple waiters: record a collision. */
/* 3. Multiple waiters: record a collision. */
sip->sel_collision |= sc->sc_mask;
KASSERT(sip->sel_cluster != NULL);
}
}
/*
 * sel_setevents: a helper function for selnotify(), to set the events
 * for LWP sleeping in selcommon() or pollcommon().
 *
 * => l: the LWP recorded in the selinfo (sip->sel_lwp).
 * => sip: the selinfo being notified; sel_fdinfo holds the value copied
 *    by selrecord() from l_selrec - a descriptor number for select(),
 *    or a pointer to the struct pollfd for poll().
 * => events: poll events being reported; zero requests a full re-scan.
 *
 * On return, l_selflag is either SEL_EVENT (the result was stored
 * directly and l_selret was incremented) or SEL_RESET (the LWP has to
 * re-scan its descriptors).
 */
static inline void
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
	const int oflag = l->l_selflag;

	/*
	 * If we require re-scan or it was required by somebody else,
	 * then just (re)set SEL_RESET and return.
	 */
	if (__predict_false(events == 0 || oflag == SEL_RESET)) {
		l->l_selflag = SEL_RESET;
		return;
	}

	/*
	 * Direct set.  Note: select state of LWP is locked.  First,
	 * determine whether it is selcommon() or pollcommon().
	 */
	if (l->l_selbits != NULL) {
		/*
		 * select(): l_selbits points at the first of three
		 * consecutive bit-fields, each l_selni bytes long.
		 */
		fd_mask *fds = (fd_mask *)l->l_selbits;
		const size_t ni = l->l_selni;
		const int fd = sip->sel_fdinfo;
		const int idx = fd >> __NFDSHIFT;
		/* Single bit for this descriptor within its fd_mask word. */
		const fd_mask fbit = (fd_mask)(1U << (fd & __NFDMASK));
		int nset = 0;
		int n;

		for (n = 0; n < 3; n++) {
			/* Only mark the sets whose event class was reported. */
			if ((sel_flag[n] & events) != 0) {
				fds[idx] |= fbit;
				nset++;
			}
			fds = (fd_mask *)((char *)fds + ni);
		}

		/*
		 * NOTE(review): this does not check whether the caller
		 * actually asked for this descriptor in each matching
		 * set - confirm against the input bit-fields.
		 */
		if (nset == 0) {
			/*
			 * The events match none of the select() sets, so
			 * there is nothing to report directly.  Fall back
			 * to a full re-scan rather than signalling an
			 * event with no bit set.
			 */
			l->l_selflag = SEL_RESET;
			return;
		}
	} else {
		/* poll(): sel_fdinfo is the pollfd entry to update. */
		struct pollfd *pfd = (void *)sip->sel_fdinfo;

		/*
		 * NOTE(review): events are not masked with pfd->events
		 * here - confirm that callers only report requested ones.
		 */
		pfd->revents |= events;
	}

	/* Indicate direct set and note the event (cluster lock is held). */
	l->l_selflag = SEL_EVENT;
	l->l_selret++;
}
/*
* Do a wakeup when a selectable event occurs. Concurrency issues:
*
@ -590,14 +683,18 @@ selnotify(struct selinfo *sip, int events, long knhint)
mutex_spin_enter(lock);
/* Still there? */
if (sip->sel_lwp != NULL) {
/*
* Set the events for our LWP and indicate that.
* Otherwise, request for a full re-scan.
*/
l = sip->sel_lwp;
oflag = l->l_selflag;
sel_setevents(l, sip, events);
/*
* If thread is sleeping, wake it up. If it's not
* yet asleep, it will notice the change in state
* and will re-poll the descriptors.
*/
oflag = l->l_selflag;
l->l_selflag = SEL_RESET;
if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
KASSERT(l->l_wchan == sc);
sleepq_unsleep(l, false);
@ -710,7 +807,7 @@ selinit(struct selinfo *sip)
* Concurrency issues: we only need guard against a call to selclear()
* by a thread exiting sel_do_scan(). The caller has prevented further
* references being made to the selinfo record via selrecord(), and it
* won't call selwakeup() again.
* will not call selnotify() again.
*/
void
seldestroy(struct selinfo *sip)

View File

@ -1,4 +1,4 @@
/* $NetBSD: fd_set.h,v 1.2 2005/12/11 12:25:20 christos Exp $ */
/* $NetBSD: fd_set.h,v 1.3 2010/07/08 12:23:31 rmind Exp $ */
/*-
* Copyright (c) 1992, 1993
@ -39,44 +39,45 @@
#include <machine/int_types.h>
/*
* Implementation dependent defines, hidden from user space. X/Open does not
* specify them.
* Implementation dependent defines, hidden from user space.
* POSIX does not specify them.
*/
#define __NBBY 8 /* number of bits in a byte */
typedef __int32_t __fd_mask;
/* bits per mask */
#define __NFDBITS ((unsigned int)sizeof(__fd_mask) * __NBBY)
typedef __int32_t __fd_mask;
#define __howmany(x, y) (((x) + ((y) - 1)) / (y))
/* 32 = 2 ^ 5 */
#define __NFDBITS (32)
#define __NFDSHIFT (5)
#define __NFDMASK (__NFDBITS - 1)
/*
* Select uses bit masks of file descriptors in longs. These macros
* manipulate such bit fields (the filesystem macros use chars).
* FD_SETSIZE may be defined by the user, but the default here should
* be enough for most uses.
* Select uses bit fields of file descriptors. These macros manipulate
* such bit fields. Note: FD_SETSIZE may be defined by the user.
*/
#ifndef FD_SETSIZE
#define FD_SETSIZE 256
#endif
#define __NFD_SIZE (((FD_SETSIZE) + (__NFDBITS - 1)) / __NFDBITS)
typedef struct fd_set {
__fd_mask fds_bits[__howmany(FD_SETSIZE, __NFDBITS)];
__fd_mask fds_bits[__NFD_SIZE];
} fd_set;
#define FD_SET(n, p) \
((p)->fds_bits[(n)/__NFDBITS] |= (1 << ((n) % __NFDBITS)))
((p)->fds_bits[(n) >> __NFDSHIFT] |= (1 << ((n) & __NFDMASK)))
#define FD_CLR(n, p) \
((p)->fds_bits[(n)/__NFDBITS] &= ~(1 << ((n) % __NFDBITS)))
((p)->fds_bits[(n) >> __NFDSHIFT] &= ~(1 << ((n) & __NFDMASK)))
#define FD_ISSET(n, p) \
((p)->fds_bits[(n)/__NFDBITS] & (1 << ((n) % __NFDBITS)))
((p)->fds_bits[(n) >> __NFDSHIFT] & (1 << ((n) & __NFDMASK)))
#if __GNUC_PREREQ__(2, 95)
#define FD_ZERO(p) (void)__builtin_memset((p), 0, sizeof(*(p)))
#else
#define FD_ZERO(p) do { \
fd_set *__fds = (p); \
unsigned int __i; \
for (__i = 0; __i < __howmany(FD_SETSIZE, __NFDBITS); __i++) \
for (__i = 0; __i < __NFD_SIZE; __i++) \
__fds->fds_bits[__i] = 0; \
} while (/* CONSTCOND */ 0)
#endif /* GCC 2.95 */
@ -86,11 +87,8 @@ typedef struct fd_set {
*/
#if defined(_NETBSD_SOURCE)
#define fd_mask __fd_mask
#define NFDBITS __NFDBITS
#ifndef howmany
#define howmany(a, b) __howmany(a, b)
#endif
#define fd_mask __fd_mask
#define NFDBITS __NFDBITS
#if __GNUC_PREREQ__(2, 95)
#define FD_COPY(f, t) (void)__builtin_memcpy((t), (f), sizeof(*(f)))
@ -98,7 +96,7 @@ typedef struct fd_set {
#define FD_COPY(f, t) do { \
fd_set *__f = (f), *__t = (t); \
unsigned int __i; \
for (__i = 0; __i < __howmany(FD_SETSIZE, __NFDBITS); __i++) \
for (__i = 0; __i < __NFD_SIZE; __i++) \
__t->fds_bits[__i] = __f->fds_bits[__i]; \
} while (/* CONSTCOND */ 0)
#endif /* GCC 2.95 */

View File

@ -1,4 +1,4 @@
/* $NetBSD: lwp.h,v 1.136 2010/07/07 01:30:38 chs Exp $ */
/* $NetBSD: lwp.h,v 1.137 2010/07/08 12:23:31 rmind Exp $ */
/*-
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010
@ -134,11 +134,17 @@ struct lwp {
int l_prflag; /* p: process level flags */
u_int l_refcnt; /* p: reference count on this LWP */
lwpid_t l_lid; /* (: LWP identifier; local to proc */
int l_selflag; /* S: select() flags */
SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */
struct selcluster *l_selcluster;/* !: associated select data */
char *l_name; /* (: name, optional */
/* State of select() or poll() */
int l_selflag; /* S: polling state flags */
SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */
int l_selret; /* S: return value of select/poll */
uintptr_t l_selrec; /* (: argument for selrecord() */
struct selcluster *l_selcluster;/* (: associated cluster data */
void * l_selbits; /* (: select() bit-field */
size_t l_selni; /* (: size of a single bit-field */
/* Signals */
int l_sigrestore; /* p: need to restore old sig mask */
sigset_t l_sigwaitset; /* p: signals being waited for */

View File

@ -1,4 +1,4 @@
/* $NetBSD: param.h,v 1.370 2010/07/07 01:37:35 chs Exp $ */
/* $NetBSD: param.h,v 1.371 2010/07/08 12:23:31 rmind Exp $ */
/*-
* Copyright (c) 1982, 1986, 1989, 1993
@ -63,7 +63,7 @@
* 2.99.9 (299000900)
*/
#define __NetBSD_Version__ 599003500 /* NetBSD 5.99.35 */
#define __NetBSD_Version__ 599003600 /* NetBSD 5.99.36 */
#define __NetBSD_Prereq__(M,m,p) (((((M) * 100000000) + \
(m) * 1000000) + (p) * 100) <= __NetBSD_Version__)

View File

@ -1,4 +1,4 @@
/* $NetBSD: selinfo.h,v 1.7 2010/04/25 15:55:24 ad Exp $ */
/* $NetBSD: selinfo.h,v 1.8 2010/07/08 12:23:31 rmind Exp $ */
/*-
* Copyright (c) 2008, 2010 The NetBSD Foundation, Inc.
@ -74,6 +74,7 @@ struct selinfo {
struct klist sel_klist; /* knotes attached to this selinfo */
void *sel_cluster; /* current cluster association */
struct lwp *sel_lwp; /* first LWP to be notified */
uintptr_t sel_fdinfo; /* selected descriptor by first LWP */
SLIST_ENTRY(selinfo) sel_chain; /* entry on LWP's list of selinfo */
uint32_t sel_collision; /* mask of colliding cpus */
uint32_t sel_reserved[3];/* reserved for future expansion */

View File

@ -1,4 +1,4 @@
/* $NetBSD: types.h,v 1.86 2009/03/07 21:59:25 ad Exp $ */
/* $NetBSD: types.h,v 1.87 2010/07/08 12:23:31 rmind Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993, 1994
@ -311,7 +311,8 @@ typedef _BSD_USECONDS_T_ useconds_t;
#ifdef _NETBSD_SOURCE
#include <sys/fd_set.h>
#define NBBY __NBBY
#define NBBY 8
typedef struct kauth_cred *kauth_cred_t;