haiku/src/system/kernel/wait_for_objects.cpp
Ingo Weinhold 4048494ce4 axeld + bonefish:
* Implemented automatic syscall restarts:
  - A syscall can indicate that it has been interrupted and can be
    restarted by setting a respective bit in thread::flags. It can
    store parameters it wants to be preserved for the restart in
    thread::syscall_restart::parameters. Another thread::flags bit
    indicates whether it has been restarted.
  - handle_signals() clears the restart flag, if the handled signal
    has a handler function installed and SA_RESTART is not set. Another
    thread flag (THREAD_FLAGS_DONT_RESTART_SYSCALL) can prevent syscalls
    from being restarted, even if they could be (not used yet, but we
    might want to use it in resume_thread(), so that we stay
    behaviorally compatible with BeOS).
  - The architecture specific syscall handler restarts the syscall, if
    the restart flag is set. Implemented for x86 only.
  - Added some support functions in the private <syscall_restart.h> to
    simplify the syscall restart code in the syscalls (see the sketch
    after this list).
  - Adjusted all syscalls that can potentially be restarted accordingly.
  - _user_ioctl() sets new thread flag THREAD_FLAGS_IOCTL_SYSCALL while
    calling the underlying FS's/driver's hook, so that syscall restarts
    can also be supported there.
* thread_at_kernel_exit() now invokes handle_signals() in a loop, as
  long as the latter indicates that the thread shall be suspended, so
  that, after waking up, signals received in the meantime are handled
  before the thread returns to userland. Adjusted handle_signals()
  accordingly -- when encountering a suspending signal we don't check
  for further signals.
* Fixed sigsuspend(): Suspending the thread and rescheduling doesn't
  result in the correct behavior. Instead we employ a temporary
  condition variable and interruptibly wait on it. The POSIX test
  suite test passes now.
* Made the switch_sem[_etc]() behavior on interruption consistent.
  Depending on when the signal arrived (before the call or when already
  waiting) the first semaphore would or wouldn't be released. Now we
  consistently release it.
* Refactored the _user_{read,write}[v]() syscalls. Use a common function
  for either pair. The iovec version no longer fails if anything could
  be read/written at all. It also checks whether a complete vector
  could be read/written, so that we won't skip data if the underlying
  FS/driver couldn't read/write more ATM.
* Some refactoring in the x86 syscall handler: The int 99 and sysenter
  handlers use a common subroutine to avoid code duplication.
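
For illustration, the timeout/restart pattern from <syscall_restart.h>, as
used by the syscalls in this file, looks roughly like this (a minimal
sketch -- the syscall and its wait_for_frob() backend are made up; only the
two syscall_restart_handle_timeout_*() helpers are real):

	// hypothetical syscall using the restart helpers
	ssize_t
	_user_frobnicate(int id, bigtime_t timeout)
	{
		// converts the relative timeout to an absolute one; on a
		// restart it restores the timeout saved by the first attempt
		syscall_restart_handle_timeout_pre(timeout);

		ssize_t result = wait_for_frob(id, timeout);
			// hypothetical backend that may return B_INTERRUPTED

		// on interruption this stores the timeout in
		// thread::syscall_restart::parameters and sets the restart flag
		syscall_restart_handle_timeout_post(result, timeout);
		return result;
	}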



git-svn-id: file:///srv/svn/repos/haiku/haiku/trunk@23983 a95241bf-73f2-0310-859d-f6bbb57e9c96
2008-02-17 15:48:30 +00:00


/*
 * Copyright 2007-2008, Ingo Weinhold, bonefish@cs.tu-berlin.de.
 * Copyright 2002-2008, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 */

#include <fs/select_sync_pool.h>
#include <wait_for_objects.h>
#include <new>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/select.h>
#include <OS.h>
#include <Select.h>
#include <AutoDeleter.h>
#include <fs/fd.h>
#include <port.h>
#include <sem.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <thread.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
//#define TRACE_WAIT_FOR_OBJECTS
#ifdef TRACE_WAIT_FOR_OBJECTS
# define PRINT(x) dprintf x
# define FUNCTION(x) dprintf x
#else
# define PRINT(x) ;
# define FUNCTION(x) ;
#endif
using std::nothrow;

struct select_sync_pool_entry
	: DoublyLinkedListLinkImpl<select_sync_pool_entry> {
	selectsync	*sync;
	uint16		events;
};

typedef DoublyLinkedList<select_sync_pool_entry> SelectSyncPoolEntryList;

struct select_sync_pool {
	SelectSyncPoolEntryList entries;
};

struct select_ops {
	status_t (*select)(int32 object, struct select_info* info, bool kernel);
	status_t (*deselect)(int32 object, struct select_info* info, bool kernel);
};

static const select_ops kSelectOps[] = {
	// B_OBJECT_TYPE_FD
	{
		select_fd,
		deselect_fd
	},
	// B_OBJECT_TYPE_SEMAPHORE
	{
		select_sem,
		deselect_sem
	},
	// B_OBJECT_TYPE_PORT
	{
		select_port,
		deselect_port
	},
	// B_OBJECT_TYPE_THREAD
	{
		select_thread,
		deselect_thread
	}
};

static const uint32 kSelectOpsCount = sizeof(kSelectOps) / sizeof(select_ops);

/*!
	Clears all bits in the fd_set - since we are using variable sized
	arrays in the kernel, we can't use the FD_ZERO() macro provided by
	sys/select.h for this task.
	All other FD_xxx() macros are safe to use, though.
*/
static inline void
fd_zero(fd_set *set, int numFDs)
{
	if (set != NULL)
		memset(set, 0, _howmany(numFDs, NFDBITS) * sizeof(fd_mask));
}

static status_t
create_select_sync(int numFDs, select_sync*& _sync)
{
	// create sync structure
	select_sync* sync = new(nothrow) select_sync;
	if (sync == NULL)
		return B_NO_MEMORY;
	ObjectDeleter<select_sync> syncDeleter(sync);

	// create info set
	sync->set = new(nothrow) select_info[numFDs];
	if (sync->set == NULL)
		return B_NO_MEMORY;
	ArrayDeleter<select_info> setDeleter(sync->set);

	// create select event semaphore
	sync->sem = create_sem(0, "select");
	if (sync->sem < 0)
		return sync->sem;

	// create lock
	status_t error = benaphore_init(&sync->lock, "select sync");
	if (error != B_OK) {
		delete_sem(sync->sem);
		return error;
	}

	sync->count = numFDs;
	sync->ref_count = 1;

	for (int i = 0; i < numFDs; i++) {
		sync->set[i].next = NULL;
		sync->set[i].sync = sync;
	}

	setDeleter.Detach();
	syncDeleter.Detach();

	_sync = sync;
	return B_OK;
}

void
put_select_sync(select_sync* sync)
{
	FUNCTION(("put_select_sync(%p): -> %ld\n", sync, sync->ref_count - 1));

	if (atomic_add(&sync->ref_count, -1) == 1) {
		delete_sem(sync->sem);
		benaphore_destroy(&sync->lock);
		delete[] sync->set;
		delete sync;
	}
}

static int
common_select(int numFDs, fd_set *readSet, fd_set *writeSet, fd_set *errorSet,
	bigtime_t timeout, const sigset_t *sigMask, bool kernel)
{
	status_t status = B_OK;
	int fd;

	FUNCTION(("[%ld] common_select(%d, %p, %p, %p, %lld, %p, %d)\n",
		find_thread(NULL), numFDs, readSet, writeSet, errorSet, timeout,
		sigMask, kernel));

	// check if fds are valid before doing anything
	for (fd = 0; fd < numFDs; fd++) {
		if (((readSet && FD_ISSET(fd, readSet))
				|| (writeSet && FD_ISSET(fd, writeSet))
				|| (errorSet && FD_ISSET(fd, errorSet)))
			&& !fd_is_valid(fd, kernel))
			return B_FILE_ERROR;
	}

	// allocate sync object
	select_sync* sync;
	status = create_select_sync(numFDs, sync);
	if (status != B_OK)
		return status;

	// start selecting file descriptors
	BenaphoreLocker locker(sync->lock);

	for (fd = 0; fd < numFDs; fd++) {
		sync->set[fd].selected_events = 0;
		sync->set[fd].events = 0;

		if (readSet && FD_ISSET(fd, readSet))
			sync->set[fd].selected_events = SELECT_FLAG(B_SELECT_READ);
		if (writeSet && FD_ISSET(fd, writeSet))
			sync->set[fd].selected_events |= SELECT_FLAG(B_SELECT_WRITE);
		if (errorSet && FD_ISSET(fd, errorSet))
			sync->set[fd].selected_events |= SELECT_FLAG(B_SELECT_ERROR);

		if (sync->set[fd].selected_events != 0) {
			select_fd(fd, sync->set + fd, kernel);
				// array position is the same as the fd for select()
		}
	}

	locker.Unlock();

	// set new signal mask
	sigset_t oldSigMask;
	if (sigMask != NULL)
		sigprocmask(SIG_SETMASK, sigMask, &oldSigMask);

	// wait for something to happen
	status = acquire_sem_etc(sync->sem, 1,
		B_CAN_INTERRUPT | (timeout != -1 ? B_ABSOLUTE_TIMEOUT : 0), timeout);

	// restore the old signal mask
	if (sigMask != NULL)
		sigprocmask(SIG_SETMASK, &oldSigMask, NULL);

	PRINT(("common_select(): acquire_sem_etc() returned: %lx\n", status));

	// deselect file descriptors
	locker.Lock();

	for (fd = 0; fd < numFDs; fd++)
		deselect_fd(fd, sync->set + fd, kernel);

	PRINT(("common_select(): events deselected\n"));

	// collect the events that happened in the meantime
	int count = 0;

	if (status == B_INTERRUPTED) {
		// We must not clear the sets in this case, as applications may
		// rely on their contents.
		locker.Unlock();
		put_select_sync(sync);
		return B_INTERRUPTED;
	}

	// Clear the sets to store the received events
	// (we can't use the FD_ZERO() macro, because we have variable sized
	// arrays; the other FD_xxx() macros are safe, though).
	fd_zero(readSet, numFDs);
	fd_zero(writeSet, numFDs);
	fd_zero(errorSet, numFDs);

	if (status == B_OK) {
		for (count = 0, fd = 0; fd < numFDs; fd++) {
			if (readSet && sync->set[fd].events & SELECT_FLAG(B_SELECT_READ)) {
				FD_SET(fd, readSet);
				count++;
			}
			if (writeSet
				&& sync->set[fd].events & SELECT_FLAG(B_SELECT_WRITE)) {
				FD_SET(fd, writeSet);
				count++;
			}
			if (errorSet
				&& sync->set[fd].events & SELECT_FLAG(B_SELECT_ERROR)) {
				FD_SET(fd, errorSet);
				count++;
			}
		}
	}

	// B_TIMED_OUT and B_WOULD_BLOCK are supposed to return 0

	locker.Unlock();
	put_select_sync(sync);

	return count;
}

static int
common_poll(struct pollfd *fds, nfds_t numFDs, bigtime_t timeout, bool kernel)
{
	status_t status = B_OK;
	int count = 0;
	uint32 i;

	// allocate sync object
	select_sync* sync;
	status = create_select_sync(numFDs, sync);
	if (status != B_OK)
		return status;

	// start polling file descriptors (by selecting them)
	BenaphoreLocker locker(sync->lock);

	for (i = 0; i < numFDs; i++) {
		int fd = fds[i].fd;

		// initialize events masks
		sync->set[i].selected_events = (fds[i].events & ~POLLNVAL)
			| POLLERR | POLLHUP;
		sync->set[i].events = 0;

		if (select_fd(fd, sync->set + i, kernel) == B_OK) {
			fds[i].revents = 0;
			if (sync->set[i].selected_events != 0)
				count++;
		} else
			fds[i].revents = POLLNVAL;
	}

	if (count < 1) {
		count = B_BAD_VALUE;
		goto err;
	}

	locker.Unlock();

	status = acquire_sem_etc(sync->sem, 1,
		B_CAN_INTERRUPT | (timeout != -1 ? B_ABSOLUTE_TIMEOUT : 0), timeout);

	// deselect file descriptors
	locker.Lock();

	for (i = 0; i < numFDs; i++)
		deselect_fd(fds[i].fd, sync->set + i, kernel);

	// collect the events that happened in the meantime
	switch (status) {
		case B_OK:
			for (count = 0, i = 0; i < numFDs; i++) {
				if (fds[i].revents == POLLNVAL)
					continue;

				// POLLxxx flags and B_SELECT_xxx flags are compatible
				fds[i].revents = sync->set[i].events
					& sync->set[i].selected_events;
				if (fds[i].revents != 0)
					count++;
			}
			break;
		case B_INTERRUPTED:
			count = B_INTERRUPTED;
			break;
		default:
			// B_TIMED_OUT and B_WOULD_BLOCK
			count = 0;
	}

err:
	locker.Unlock();
	put_select_sync(sync);

	return count;
}

static ssize_t
common_wait_for_objects(object_wait_info* infos, int numInfos, uint32 flags,
	bigtime_t timeout, bool kernel)
{
	status_t status = B_OK;

	// allocate sync object
	select_sync* sync;
	status = create_select_sync(numInfos, sync);
	if (status != B_OK)
		return status;

	// start selecting objects
	BenaphoreLocker locker(sync->lock);

	ssize_t count = 0;
	bool invalid = false;
	for (int i = 0; i < numInfos; i++) {
		uint16 type = infos[i].type;
		int32 object = infos[i].object;

		// initialize events masks
		sync->set[i].selected_events = infos[i].events
			| B_EVENT_INVALID | B_EVENT_ERROR | B_EVENT_DISCONNECTED;
		sync->set[i].events = 0;

		if (type < kSelectOpsCount
			&& kSelectOps[type].select(object, sync->set + i, kernel) == B_OK) {
			infos[i].events = 0;
			if (sync->set[i].selected_events != 0)
				count++;
		} else {
			sync->set[i].selected_events = B_EVENT_INVALID;
			infos[i].events = B_EVENT_INVALID;
			invalid = true;
		}
	}
	locker.Unlock();

	if (count < 1) {
		put_select_sync(sync);
		return B_BAD_VALUE;
	}

	if (!invalid) {
		status = acquire_sem_etc(sync->sem, 1, B_CAN_INTERRUPT | flags,
			timeout);
	}

	// deselect objects
	locker.Lock();

	for (int i = 0; i < numInfos; i++) {
		uint16 type = infos[i].type;
		if (type < kSelectOpsCount && (infos[i].events & B_EVENT_INVALID) == 0)
			kSelectOps[type].deselect(infos[i].object, sync->set + i, kernel);
	}

	// collect the events that happened in the meantime
	switch (status) {
		case B_OK:
			count = 0;
			for (int i = 0; i < numInfos; i++) {
				infos[i].events = sync->set[i].events
					& sync->set[i].selected_events;
				if (infos[i].events != 0)
					count++;
			}
			break;
		default:
			// B_INTERRUPTED, B_TIMED_OUT, and B_WOULD_BLOCK
			count = status;
			break;
	}

	locker.Unlock();
	put_select_sync(sync);

	return count;
}


// #pragma mark - kernel private


status_t
notify_select_events(select_info* info, uint16 events)
{
	FUNCTION(("notify_select_events(%p (%p), 0x%x)\n", info, info->sync,
		events));

	if (info == NULL
		|| info->sync == NULL
		|| info->sync->sem < B_OK)
		return B_BAD_VALUE;

	info->events |= events;

	// only wake up the waiting select()/poll() call if the events
	// match one of the selected ones
	if (info->selected_events & events)
		return release_sem_etc(info->sync->sem, 1, B_DO_NOT_RESCHEDULE);

	return B_OK;
}

void
notify_select_events_list(select_info* list, uint16 events)
{
	struct select_info* info = list;
	while (info != NULL) {
		notify_select_events(info, events);
		info = info->next;
	}
}


// #pragma mark - public kernel API


status_t
notify_select_event(struct selectsync *sync, uint8 event)
{
	return notify_select_events((select_info*)sync, SELECT_FLAG(event));
}


// #pragma mark - private kernel exported API


static select_sync_pool_entry *
find_select_sync_pool_entry(select_sync_pool *pool, selectsync *sync)
{
	for (SelectSyncPoolEntryList::Iterator it = pool->entries.GetIterator();
			it.HasNext();) {
		select_sync_pool_entry *entry = it.Next();
		if (entry->sync == sync)
			return entry;
	}

	return NULL;
}

static status_t
add_select_sync_pool_entry(select_sync_pool *pool, selectsync *sync,
	uint8 event)
{
	// check whether the entry already exists
	select_sync_pool_entry *entry = find_select_sync_pool_entry(pool, sync);
	if (!entry) {
		entry = new (std::nothrow) select_sync_pool_entry;
		if (!entry)
			return B_NO_MEMORY;

		entry->sync = sync;
		entry->events = 0;
		pool->entries.Add(entry);
	}

	entry->events |= SELECT_FLAG(event);

	return B_OK;
}

status_t
add_select_sync_pool_entry(select_sync_pool **_pool, selectsync *sync,
	uint8 event)
{
	// create the pool, if necessary
	select_sync_pool *pool = *_pool;
	if (!pool) {
		pool = new (std::nothrow) select_sync_pool;
		if (!pool)
			return B_NO_MEMORY;

		*_pool = pool;
	}

	// add the entry
	status_t error = add_select_sync_pool_entry(pool, sync, event);

	// cleanup
	if (pool->entries.IsEmpty()) {
		delete pool;
		*_pool = NULL;
	}

	return error;
}

status_t
remove_select_sync_pool_entry(select_sync_pool **_pool, selectsync *sync,
	uint8 event)
{
	select_sync_pool *pool = *_pool;
	if (!pool)
		return B_ENTRY_NOT_FOUND;

	// clear the event flag of the concerned entries
	bool found = false;
	for (SelectSyncPoolEntryList::Iterator it = pool->entries.GetIterator();
			it.HasNext();) {
		select_sync_pool_entry *entry = it.Next();
		if (entry->sync == sync) {
			found = true;
			entry->events &= ~SELECT_FLAG(event);

			// remove the entry, if no longer needed
			if (entry->events == 0) {
				it.Remove();
				delete entry;
			}
		}
	}

	if (!found)
		return B_ENTRY_NOT_FOUND;

	// delete the pool, if no longer needed
	if (pool->entries.IsEmpty()) {
		delete pool;
		*_pool = NULL;
	}

	return B_OK;
}

void
delete_select_sync_pool(select_sync_pool *pool)
{
	if (!pool)
		return;

	while (select_sync_pool_entry *entry = pool->entries.Head()) {
		pool->entries.Remove(entry);
		delete entry;
	}

	delete pool;
}

void
notify_select_event_pool(select_sync_pool *pool, uint8 event)
{
	if (!pool)
		return;

	FUNCTION(("notify_select_event_pool(%p, %u)\n", pool, event));

	for (SelectSyncPoolEntryList::Iterator it = pool->entries.GetIterator();
			it.HasNext();) {
		select_sync_pool_entry *entry = it.Next();
		if (entry->events & SELECT_FLAG(event))
			notify_select_event(entry->sync, event);
	}
}
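
/*	Example of how a driver can use the pool API above to support several
	concurrent select() callers on one device. A minimal sketch under
	assumed hook signatures (gFrobPool and the frob_*() hooks are made up;
	the pool functions are the ones defined in this file):

	static select_sync_pool* gFrobPool = NULL;

	static status_t
	frob_select(void* cookie, uint8 event, selectsync* sync)
	{
		return add_select_sync_pool_entry(&gFrobPool, sync, event);
	}

	static status_t
	frob_deselect(void* cookie, uint8 event, selectsync* sync)
	{
		return remove_select_sync_pool_entry(&gFrobPool, sync, event);
	}

	static void
	frob_data_arrived()
	{
		// notifies every entry that selected B_SELECT_READ
		notify_select_event_pool(gFrobPool, B_SELECT_READ);
	}
*/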


// #pragma mark - Kernel POSIX layer


ssize_t
_kern_select(int numFDs, fd_set *readSet, fd_set *writeSet, fd_set *errorSet,
	bigtime_t timeout, const sigset_t *sigMask)
{
	if (timeout >= 0)
		timeout += system_time();

	return common_select(numFDs, readSet, writeSet, errorSet, timeout,
		sigMask, true);
}


ssize_t
_kern_poll(struct pollfd *fds, int numFDs, bigtime_t timeout)
{
	if (timeout >= 0)
		timeout += system_time();

	return common_poll(fds, numFDs, timeout, true);
}

ssize_t
_kern_wait_for_objects(object_wait_info* infos, int numInfos, uint32 flags,
	bigtime_t timeout)
{
	return common_wait_for_objects(infos, numInfos, flags, timeout, true);
}


// #pragma mark - User syscalls


ssize_t
_user_select(int numFDs, fd_set *userReadSet, fd_set *userWriteSet,
	fd_set *userErrorSet, bigtime_t timeout, const sigset_t *userSigMask)
{
	fd_set *readSet = NULL, *writeSet = NULL, *errorSet = NULL;
	uint32 bytes = _howmany(numFDs, NFDBITS) * sizeof(fd_mask);
	sigset_t sigMask;
	int result;

	syscall_restart_handle_timeout_pre(timeout);

	if (numFDs < 0)
		return B_BAD_VALUE;

	if ((userReadSet != NULL && !IS_USER_ADDRESS(userReadSet))
		|| (userWriteSet != NULL && !IS_USER_ADDRESS(userWriteSet))
		|| (userErrorSet != NULL && !IS_USER_ADDRESS(userErrorSet))
		|| (userSigMask != NULL && !IS_USER_ADDRESS(userSigMask)))
		return B_BAD_ADDRESS;

	// copy parameters

	if (userReadSet != NULL) {
		readSet = (fd_set *)malloc(bytes);
		if (readSet == NULL)
			return B_NO_MEMORY;

		if (user_memcpy(readSet, userReadSet, bytes) < B_OK) {
			result = B_BAD_ADDRESS;
			goto err;
		}
	}

	if (userWriteSet != NULL) {
		writeSet = (fd_set *)malloc(bytes);
		if (writeSet == NULL) {
			result = B_NO_MEMORY;
			goto err;
		}
		if (user_memcpy(writeSet, userWriteSet, bytes) < B_OK) {
			result = B_BAD_ADDRESS;
			goto err;
		}
	}

	if (userErrorSet != NULL) {
		errorSet = (fd_set *)malloc(bytes);
		if (errorSet == NULL) {
			result = B_NO_MEMORY;
			goto err;
		}
		if (user_memcpy(errorSet, userErrorSet, bytes) < B_OK) {
			result = B_BAD_ADDRESS;
			goto err;
		}
	}

	if (userSigMask != NULL)
		sigMask = *userSigMask;

	result = common_select(numFDs, readSet, writeSet, errorSet, timeout,
		userSigMask ? &sigMask : NULL, false);

	// copy back results

	if (result >= B_OK
		&& ((readSet != NULL
				&& user_memcpy(userReadSet, readSet, bytes) < B_OK)
			|| (writeSet != NULL
				&& user_memcpy(userWriteSet, writeSet, bytes) < B_OK)
			|| (errorSet != NULL
				&& user_memcpy(userErrorSet, errorSet, bytes) < B_OK))) {
		result = B_BAD_ADDRESS;
	} else
		syscall_restart_handle_timeout_post(result, timeout);

err:
	free(readSet);
	free(writeSet);
	free(errorSet);

	return result;
}

ssize_t
_user_poll(struct pollfd *userfds, int numFDs, bigtime_t timeout)
{
	struct pollfd *fds;
	size_t bytes;
	int result;

	syscall_restart_handle_timeout_pre(timeout);

	if (numFDs < 0)
		return B_BAD_VALUE;

	if (userfds == NULL || !IS_USER_ADDRESS(userfds))
		return B_BAD_ADDRESS;

	// copy parameters
	fds = (struct pollfd *)malloc(bytes = numFDs * sizeof(struct pollfd));
	if (fds == NULL)
		return B_NO_MEMORY;

	if (user_memcpy(fds, userfds, bytes) < B_OK) {
		result = B_BAD_ADDRESS;
		goto err;
	}

	result = common_poll(fds, numFDs, timeout, false);

	// copy back results
	if (result >= B_OK && user_memcpy(userfds, fds, bytes) < B_OK)
		result = B_BAD_ADDRESS;
	else
		syscall_restart_handle_timeout_post(result, timeout);

err:
	free(fds);
	return result;
}

ssize_t
_user_wait_for_objects(object_wait_info* userInfos, int numInfos, uint32 flags,
	bigtime_t timeout)
{
	syscall_restart_handle_timeout_pre(flags, timeout);

	if (numInfos < 0)
		return B_BAD_VALUE;

	if (userInfos == NULL || !IS_USER_ADDRESS(userInfos))
		return B_BAD_ADDRESS;

	int bytes = sizeof(object_wait_info) * numInfos;
	object_wait_info* infos = (object_wait_info*)malloc(bytes);
	if (infos == NULL)
		return B_NO_MEMORY;

	// copy parameters to kernel space, call the function, and copy the
	// results back
	ssize_t result;
	if (user_memcpy(infos, userInfos, bytes) == B_OK) {
		result = common_wait_for_objects(infos, numInfos, flags, timeout,
			false);

		if (result >= 0 && user_memcpy(userInfos, infos, bytes) != B_OK)
			result = B_BAD_ADDRESS;
		else
			syscall_restart_handle_timeout_post(result, timeout);
	} else
		result = B_BAD_ADDRESS;

	free(infos);

	return result;
}