aio-posix: extract ppoll(2) and epoll(7) fd monitoring
The ppoll(2) and epoll(7) file descriptor monitoring implementations are mixed with the core util/aio-posix.c code. Before adding another implementation for Linux io_uring, extract out the existing ones so there is a clear interface and the core code is simpler. The new interface is AioContext->fdmon_ops, a pointer to a FDMonOps struct. See the patch for details. Semantic changes: 1. ppoll(2) now reflects events from pollfds[] back into AioHandlers while we're still on the clock for adaptive polling. This was already happening for epoll(7), so if it's really an issue then we'll need to fix both in the future. 2. epoll(7)'s fallback to ppoll(2) while external events are disabled was broken when the number of fds exceeded the epoll(7) upgrade threshold. I guess this code path simply wasn't tested and no one noticed the bug. I didn't go out of my way to fix it but the correct code is simpler than preserving the bug. I also took some liberties in removing the unnecessary AioContext->epoll_available (just check AioContext->epollfd != -1 instead) and AioContext->epoll_enabled (it's implicit if our AioContext->fdmon_ops callbacks are being invoked) fields. Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Link: https://lore.kernel.org/r/20200305170806.1313245-4-stefanha@redhat.com Message-Id: <20200305170806.1313245-4-stefanha@redhat.com>
This commit is contained in:
parent
3aa221b382
commit
1f050a4690
@ -1885,6 +1885,8 @@ L: qemu-block@nongnu.org
|
|||||||
S: Supported
|
S: Supported
|
||||||
F: util/async.c
|
F: util/async.c
|
||||||
F: util/aio-*.c
|
F: util/aio-*.c
|
||||||
|
F: util/aio-*.h
|
||||||
|
F: util/fdmon-*.c
|
||||||
F: block/io.c
|
F: block/io.c
|
||||||
F: migration/block*
|
F: migration/block*
|
||||||
F: include/block/aio.h
|
F: include/block/aio.h
|
||||||
|
@ -52,6 +52,38 @@ struct ThreadPool;
|
|||||||
struct LinuxAioState;
|
struct LinuxAioState;
|
||||||
struct LuringState;
|
struct LuringState;
|
||||||
|
|
||||||
|
/* Callbacks for file descriptor monitoring implementations */
|
||||||
|
typedef struct {
|
||||||
|
/*
|
||||||
|
* update:
|
||||||
|
* @ctx: the AioContext
|
||||||
|
* @node: the handler
|
||||||
|
* @is_new: is the file descriptor new, i.e. not yet being monitored?
|
||||||
|
*
|
||||||
|
* Add/remove/modify a monitored file descriptor. There are three cases:
|
||||||
|
* 1. node->pfd.events == 0 means remove the file descriptor.
|
||||||
|
* 2. !is_new means modify an already monitored file descriptor.
|
||||||
|
* 3. is_new means add a new file descriptor.
|
||||||
|
*
|
||||||
|
* Called with ctx->list_lock acquired.
|
||||||
|
*/
|
||||||
|
void (*update)(AioContext *ctx, AioHandler *node, bool is_new);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* wait:
|
||||||
|
* @ctx: the AioContext
|
||||||
|
* @ready_list: list for handlers that become ready
|
||||||
|
* @timeout: maximum duration to wait, in nanoseconds
|
||||||
|
*
|
||||||
|
* Wait for file descriptors to become ready and place them on ready_list.
|
||||||
|
*
|
||||||
|
* Called with ctx->list_lock incremented but not locked.
|
||||||
|
*
|
||||||
|
* Returns: number of ready file descriptors.
|
||||||
|
*/
|
||||||
|
int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
|
||||||
|
} FDMonOps;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Each aio_bh_poll() call carves off a slice of the BH list, so that newly
|
* Each aio_bh_poll() call carves off a slice of the BH list, so that newly
|
||||||
* scheduled BHs are not processed until the next aio_bh_poll() call. All
|
* scheduled BHs are not processed until the next aio_bh_poll() call. All
|
||||||
@ -173,8 +205,8 @@ struct AioContext {
|
|||||||
|
|
||||||
/* epoll(7) state used when built with CONFIG_EPOLL */
|
/* epoll(7) state used when built with CONFIG_EPOLL */
|
||||||
int epollfd;
|
int epollfd;
|
||||||
bool epoll_enabled;
|
|
||||||
bool epoll_available;
|
const FDMonOps *fdmon_ops;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -5,6 +5,8 @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
|
|||||||
util-obj-y += main-loop.o
|
util-obj-y += main-loop.o
|
||||||
util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
|
util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
|
||||||
util-obj-$(CONFIG_POSIX) += aio-posix.o
|
util-obj-$(CONFIG_POSIX) += aio-posix.o
|
||||||
|
util-obj-$(CONFIG_POSIX) += fdmon-poll.o
|
||||||
|
util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
|
||||||
util-obj-$(CONFIG_POSIX) += compatfd.o
|
util-obj-$(CONFIG_POSIX) += compatfd.o
|
||||||
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
|
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
|
||||||
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
|
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
|
||||||
|
286
util/aio-posix.c
286
util/aio-posix.c
@ -20,191 +20,17 @@
|
|||||||
#include "qemu/sockets.h"
|
#include "qemu/sockets.h"
|
||||||
#include "qemu/cutils.h"
|
#include "qemu/cutils.h"
|
||||||
#include "trace.h"
|
#include "trace.h"
|
||||||
#ifdef CONFIG_EPOLL_CREATE1
|
#include "aio-posix.h"
|
||||||
#include <sys/epoll.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
struct AioHandler
|
void aio_add_ready_handler(AioHandlerList *ready_list,
|
||||||
{
|
AioHandler *node,
|
||||||
GPollFD pfd;
|
int revents)
|
||||||
IOHandler *io_read;
|
|
||||||
IOHandler *io_write;
|
|
||||||
AioPollFn *io_poll;
|
|
||||||
IOHandler *io_poll_begin;
|
|
||||||
IOHandler *io_poll_end;
|
|
||||||
void *opaque;
|
|
||||||
bool is_external;
|
|
||||||
QLIST_ENTRY(AioHandler) node;
|
|
||||||
QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
|
|
||||||
QLIST_ENTRY(AioHandler) node_deleted;
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Add a handler to a ready list */
|
|
||||||
static void add_ready_handler(AioHandlerList *ready_list,
|
|
||||||
AioHandler *node,
|
|
||||||
int revents)
|
|
||||||
{
|
{
|
||||||
QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
|
QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
|
||||||
node->pfd.revents = revents;
|
node->pfd.revents = revents;
|
||||||
QLIST_INSERT_HEAD(ready_list, node, node_ready);
|
QLIST_INSERT_HEAD(ready_list, node, node_ready);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_EPOLL_CREATE1
|
|
||||||
|
|
||||||
/* The fd number threshold to switch to epoll */
|
|
||||||
#define EPOLL_ENABLE_THRESHOLD 64
|
|
||||||
|
|
||||||
static void aio_epoll_disable(AioContext *ctx)
|
|
||||||
{
|
|
||||||
ctx->epoll_enabled = false;
|
|
||||||
if (!ctx->epoll_available) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
ctx->epoll_available = false;
|
|
||||||
close(ctx->epollfd);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline int epoll_events_from_pfd(int pfd_events)
|
|
||||||
{
|
|
||||||
return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
|
|
||||||
(pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
|
|
||||||
(pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
|
|
||||||
(pfd_events & G_IO_ERR ? EPOLLERR : 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool aio_epoll_try_enable(AioContext *ctx)
|
|
||||||
{
|
|
||||||
AioHandler *node;
|
|
||||||
struct epoll_event event;
|
|
||||||
|
|
||||||
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
|
|
||||||
int r;
|
|
||||||
if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
event.events = epoll_events_from_pfd(node->pfd.events);
|
|
||||||
event.data.ptr = node;
|
|
||||||
r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
|
|
||||||
if (r) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ctx->epoll_enabled = true;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
|
|
||||||
{
|
|
||||||
struct epoll_event event;
|
|
||||||
int r;
|
|
||||||
int ctl;
|
|
||||||
|
|
||||||
if (!ctx->epoll_enabled) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!node->pfd.events) {
|
|
||||||
ctl = EPOLL_CTL_DEL;
|
|
||||||
} else {
|
|
||||||
event.data.ptr = node;
|
|
||||||
event.events = epoll_events_from_pfd(node->pfd.events);
|
|
||||||
ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
|
|
||||||
}
|
|
||||||
|
|
||||||
r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
|
|
||||||
if (r) {
|
|
||||||
aio_epoll_disable(ctx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
|
|
||||||
int64_t timeout)
|
|
||||||
{
|
|
||||||
GPollFD pfd = {
|
|
||||||
.fd = ctx->epollfd,
|
|
||||||
.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
|
|
||||||
};
|
|
||||||
AioHandler *node;
|
|
||||||
int i, ret = 0;
|
|
||||||
struct epoll_event events[128];
|
|
||||||
|
|
||||||
if (timeout > 0) {
|
|
||||||
ret = qemu_poll_ns(&pfd, 1, timeout);
|
|
||||||
if (ret > 0) {
|
|
||||||
timeout = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (timeout <= 0 || ret > 0) {
|
|
||||||
ret = epoll_wait(ctx->epollfd, events,
|
|
||||||
ARRAY_SIZE(events),
|
|
||||||
timeout);
|
|
||||||
if (ret <= 0) {
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
for (i = 0; i < ret; i++) {
|
|
||||||
int ev = events[i].events;
|
|
||||||
int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
|
|
||||||
(ev & EPOLLOUT ? G_IO_OUT : 0) |
|
|
||||||
(ev & EPOLLHUP ? G_IO_HUP : 0) |
|
|
||||||
(ev & EPOLLERR ? G_IO_ERR : 0);
|
|
||||||
|
|
||||||
node = events[i].data.ptr;
|
|
||||||
add_ready_handler(ready_list, node, revents);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out:
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool aio_epoll_enabled(AioContext *ctx)
|
|
||||||
{
|
|
||||||
/* Fall back to ppoll when external clients are disabled. */
|
|
||||||
return !aio_external_disabled(ctx) && ctx->epoll_enabled;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
|
|
||||||
unsigned npfd, int64_t timeout)
|
|
||||||
{
|
|
||||||
if (!ctx->epoll_available) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (aio_epoll_enabled(ctx)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (npfd >= EPOLL_ENABLE_THRESHOLD) {
|
|
||||||
if (aio_epoll_try_enable(ctx)) {
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
aio_epoll_disable(ctx);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
|
|
||||||
int64_t timeout)
|
|
||||||
{
|
|
||||||
assert(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool aio_epoll_enabled(AioContext *ctx)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
|
|
||||||
unsigned npfd, int64_t timeout)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
|
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
|
||||||
{
|
{
|
||||||
AioHandler *node;
|
AioHandler *node;
|
||||||
@ -314,10 +140,10 @@ void aio_set_fd_handler(AioContext *ctx,
|
|||||||
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
|
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
|
||||||
|
|
||||||
if (new_node) {
|
if (new_node) {
|
||||||
aio_epoll_update(ctx, new_node, is_new);
|
ctx->fdmon_ops->update(ctx, new_node, is_new);
|
||||||
} else if (node) {
|
} else if (node) {
|
||||||
/* Unregister deleted fd_handler */
|
/* Unregister deleted fd_handler */
|
||||||
aio_epoll_update(ctx, node, false);
|
ctx->fdmon_ops->update(ctx, node, false);
|
||||||
}
|
}
|
||||||
qemu_lockcnt_unlock(&ctx->list_lock);
|
qemu_lockcnt_unlock(&ctx->list_lock);
|
||||||
aio_notify(ctx);
|
aio_notify(ctx);
|
||||||
@ -532,52 +358,6 @@ void aio_dispatch(AioContext *ctx)
|
|||||||
timerlistgroup_run_timers(&ctx->tlg);
|
timerlistgroup_run_timers(&ctx->tlg);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* These thread-local variables are used only in a small part of aio_poll
|
|
||||||
* around the call to the poll() system call. In particular they are not
|
|
||||||
* used while aio_poll is performing callbacks, which makes it much easier
|
|
||||||
* to think about reentrancy!
|
|
||||||
*
|
|
||||||
* Stack-allocated arrays would be perfect but they have size limitations;
|
|
||||||
* heap allocation is expensive enough that we want to reuse arrays across
|
|
||||||
* calls to aio_poll(). And because poll() has to be called without holding
|
|
||||||
* any lock, the arrays cannot be stored in AioContext. Thread-local data
|
|
||||||
* has none of the disadvantages of these three options.
|
|
||||||
*/
|
|
||||||
static __thread GPollFD *pollfds;
|
|
||||||
static __thread AioHandler **nodes;
|
|
||||||
static __thread unsigned npfd, nalloc;
|
|
||||||
static __thread Notifier pollfds_cleanup_notifier;
|
|
||||||
|
|
||||||
static void pollfds_cleanup(Notifier *n, void *unused)
|
|
||||||
{
|
|
||||||
g_assert(npfd == 0);
|
|
||||||
g_free(pollfds);
|
|
||||||
g_free(nodes);
|
|
||||||
nalloc = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void add_pollfd(AioHandler *node)
|
|
||||||
{
|
|
||||||
if (npfd == nalloc) {
|
|
||||||
if (nalloc == 0) {
|
|
||||||
pollfds_cleanup_notifier.notify = pollfds_cleanup;
|
|
||||||
qemu_thread_atexit_add(&pollfds_cleanup_notifier);
|
|
||||||
nalloc = 8;
|
|
||||||
} else {
|
|
||||||
g_assert(nalloc <= INT_MAX);
|
|
||||||
nalloc *= 2;
|
|
||||||
}
|
|
||||||
pollfds = g_renew(GPollFD, pollfds, nalloc);
|
|
||||||
nodes = g_renew(AioHandler *, nodes, nalloc);
|
|
||||||
}
|
|
||||||
nodes[npfd] = node;
|
|
||||||
pollfds[npfd] = (GPollFD) {
|
|
||||||
.fd = node->pfd.fd,
|
|
||||||
.events = node->pfd.events,
|
|
||||||
};
|
|
||||||
npfd++;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
|
static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
|
||||||
{
|
{
|
||||||
bool progress = false;
|
bool progress = false;
|
||||||
@ -689,8 +469,6 @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
|
|||||||
bool aio_poll(AioContext *ctx, bool blocking)
|
bool aio_poll(AioContext *ctx, bool blocking)
|
||||||
{
|
{
|
||||||
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
|
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
|
||||||
AioHandler *node;
|
|
||||||
int i;
|
|
||||||
int ret = 0;
|
int ret = 0;
|
||||||
bool progress;
|
bool progress;
|
||||||
int64_t timeout;
|
int64_t timeout;
|
||||||
@ -723,26 +501,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
|
|||||||
* system call---a single round of run_poll_handlers_once suffices.
|
* system call---a single round of run_poll_handlers_once suffices.
|
||||||
*/
|
*/
|
||||||
if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
|
if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
|
||||||
assert(npfd == 0);
|
ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
|
||||||
|
|
||||||
/* fill pollfds */
|
|
||||||
|
|
||||||
if (!aio_epoll_enabled(ctx)) {
|
|
||||||
QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
|
|
||||||
if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
|
|
||||||
&& aio_node_check(ctx, node->is_external)) {
|
|
||||||
add_pollfd(node);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* wait until next event */
|
|
||||||
if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
|
|
||||||
npfd = 0; /* pollfds[] is not being used */
|
|
||||||
ret = aio_epoll(ctx, &ready_list, timeout);
|
|
||||||
} else {
|
|
||||||
ret = qemu_poll_ns(pollfds, npfd, timeout);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (blocking) {
|
if (blocking) {
|
||||||
@ -791,19 +550,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if we have any readable fds, dispatch event */
|
|
||||||
if (ret > 0) {
|
|
||||||
for (i = 0; i < npfd; i++) {
|
|
||||||
int revents = pollfds[i].revents;
|
|
||||||
|
|
||||||
if (revents) {
|
|
||||||
add_ready_handler(&ready_list, nodes[i], revents);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
npfd = 0;
|
|
||||||
|
|
||||||
progress |= aio_bh_poll(ctx);
|
progress |= aio_bh_poll(ctx);
|
||||||
|
|
||||||
if (ret > 0) {
|
if (ret > 0) {
|
||||||
@ -821,23 +567,15 @@ bool aio_poll(AioContext *ctx, bool blocking)
|
|||||||
|
|
||||||
void aio_context_setup(AioContext *ctx)
|
void aio_context_setup(AioContext *ctx)
|
||||||
{
|
{
|
||||||
#ifdef CONFIG_EPOLL_CREATE1
|
ctx->fdmon_ops = &fdmon_poll_ops;
|
||||||
assert(!ctx->epollfd);
|
ctx->epollfd = -1;
|
||||||
ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
|
|
||||||
if (ctx->epollfd == -1) {
|
fdmon_epoll_setup(ctx);
|
||||||
fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
|
|
||||||
ctx->epoll_available = false;
|
|
||||||
} else {
|
|
||||||
ctx->epoll_available = true;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aio_context_destroy(AioContext *ctx)
|
void aio_context_destroy(AioContext *ctx)
|
||||||
{
|
{
|
||||||
#ifdef CONFIG_EPOLL_CREATE1
|
fdmon_epoll_disable(ctx);
|
||||||
aio_epoll_disable(ctx);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
|
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
|
||||||
|
61
util/aio-posix.h
Normal file
61
util/aio-posix.h
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/*
|
||||||
|
* AioContext POSIX event loop implementation internal APIs
|
||||||
|
*
|
||||||
|
* Copyright IBM, Corp. 2008
|
||||||
|
* Copyright Red Hat, Inc. 2020
|
||||||
|
*
|
||||||
|
* Authors:
|
||||||
|
* Anthony Liguori <aliguori@us.ibm.com>
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2. See
|
||||||
|
* the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
* Contributions after 2012-01-13 are licensed under the terms of the
|
||||||
|
* GNU GPL, version 2 or (at your option) any later version.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef AIO_POSIX_H
|
||||||
|
#define AIO_POSIX_H
|
||||||
|
|
||||||
|
#include "block/aio.h"
|
||||||
|
|
||||||
|
struct AioHandler {
|
||||||
|
GPollFD pfd;
|
||||||
|
IOHandler *io_read;
|
||||||
|
IOHandler *io_write;
|
||||||
|
AioPollFn *io_poll;
|
||||||
|
IOHandler *io_poll_begin;
|
||||||
|
IOHandler *io_poll_end;
|
||||||
|
void *opaque;
|
||||||
|
bool is_external;
|
||||||
|
QLIST_ENTRY(AioHandler) node;
|
||||||
|
QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
|
||||||
|
QLIST_ENTRY(AioHandler) node_deleted;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Add a handler to a ready list */
|
||||||
|
void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
|
||||||
|
int revents);
|
||||||
|
|
||||||
|
extern const FDMonOps fdmon_poll_ops;
|
||||||
|
|
||||||
|
#ifdef CONFIG_EPOLL_CREATE1
|
||||||
|
bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
|
||||||
|
void fdmon_epoll_setup(AioContext *ctx);
|
||||||
|
void fdmon_epoll_disable(AioContext *ctx);
|
||||||
|
#else
|
||||||
|
/* Without CONFIG_EPOLL_CREATE1 there is nothing to upgrade to */
static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
{
    return false;
}
|
||||||
|
|
||||||
|
/* No-op when built without CONFIG_EPOLL_CREATE1 */
static inline void fdmon_epoll_setup(AioContext *ctx)
{
}
|
||||||
|
|
||||||
|
/* No-op when built without CONFIG_EPOLL_CREATE1 */
static inline void fdmon_epoll_disable(AioContext *ctx)
{
}
|
||||||
|
#endif /* !CONFIG_EPOLL_CREATE1 */
|
||||||
|
|
||||||
|
#endif /* AIO_POSIX_H */
|
151
util/fdmon-epoll.c
Normal file
151
util/fdmon-epoll.c
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||||
|
/*
|
||||||
|
* epoll(7) file descriptor monitoring
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include <sys/epoll.h>
|
||||||
|
#include "qemu/rcu_queue.h"
|
||||||
|
#include "aio-posix.h"
|
||||||
|
|
||||||
|
/* The fd number threshold to switch to epoll */
|
||||||
|
#define EPOLL_ENABLE_THRESHOLD 64
|
||||||
|
|
||||||
|
/*
 * Stop using epoll(7) for @ctx.
 *
 * Closes the epoll instance if one is open (marking it closed with -1) and
 * unconditionally points ctx->fdmon_ops back at the poll(2) implementation.
 */
void fdmon_epoll_disable(AioContext *ctx)
{
    const bool epoll_is_open = ctx->epollfd >= 0;

    if (epoll_is_open) {
        close(ctx->epollfd);
        ctx->epollfd = -1;
    }

    /* From here on fd monitoring goes through poll(2) again */
    ctx->fdmon_ops = &fdmon_poll_ops;
}
|
||||||
|
|
||||||
|
static inline int epoll_events_from_pfd(int pfd_events)
|
||||||
|
{
|
||||||
|
return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
|
||||||
|
(pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
|
||||||
|
(pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
|
||||||
|
(pfd_events & G_IO_ERR ? EPOLLERR : 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * FDMonOps.update for epoll(7): add, modify or remove @node's fd in the
 * epoll instance.
 *
 * node->pfd.events == 0 removes the fd; otherwise @is_new selects between
 * adding and modifying. If epoll_ctl() fails, epoll is disabled for @ctx,
 * which switches monitoring back to poll(2).
 */
static void fdmon_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    /*
     * Left unset for EPOLL_CTL_DEL; epoll_ctl(2) documents the event
     * argument as ignored for that operation.
     */
    struct epoll_event event;
    int op;

    if (node->pfd.events) {
        event.data.ptr = node;
        event.events = epoll_events_from_pfd(node->pfd.events);
        op = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
    } else {
        op = EPOLL_CTL_DEL;
    }

    if (epoll_ctl(ctx->epollfd, op, node->pfd.fd, &event) != 0) {
        fdmon_epoll_disable(ctx);
    }
}
|
||||||
|
|
||||||
|
/*
 * FDMonOps.wait for epoll(7).
 *
 * While external clients are disabled this defers entirely to the poll(2)
 * implementation. Otherwise a positive @timeout (nanoseconds) is honoured by
 * waiting on the epoll fd itself with qemu_poll_ns(); the events are then
 * collected with epoll_wait() using a zero timeout. A zero or negative
 * @timeout is passed straight to epoll_wait().
 *
 * Every reported handler is placed on @ready_list with its events converted
 * back to G_IO_* flags.
 *
 * Returns: the epoll_wait() result, or the qemu_poll_ns() result when that
 * wait ended without the epoll fd becoming ready.
 */
static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
                            int64_t timeout)
{
    GPollFD epoll_pfd = {
        .fd = ctx->epollfd,
        .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
    };
    struct epoll_event events[128];
    int nready;
    int idx;

    /* Fall back while external clients are disabled */
    if (atomic_read(&ctx->external_disable_cnt)) {
        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
    }

    if (timeout > 0) {
        int polled = qemu_poll_ns(&epoll_pfd, 1, timeout);

        if (polled <= 0) {
            /* Timed out or failed: nothing to fetch from epoll */
            return polled;
        }
        /* The epoll fd is ready, so do not block again below */
        timeout = 0;
    }

    nready = epoll_wait(ctx->epollfd, events, ARRAY_SIZE(events), timeout);

    /* nready <= 0 skips the loop and is returned unchanged */
    for (idx = 0; idx < nready; idx++) {
        int ev = events[idx].events;
        AioHandler *handler = events[idx].data.ptr;
        int revents = 0;

        if (ev & EPOLLIN) {
            revents |= G_IO_IN;
        }
        if (ev & EPOLLOUT) {
            revents |= G_IO_OUT;
        }
        if (ev & EPOLLHUP) {
            revents |= G_IO_HUP;
        }
        if (ev & EPOLLERR) {
            revents |= G_IO_ERR;
        }

        aio_add_ready_handler(ready_list, handler, revents);
    }

    return nready;
}
|
||||||
|
|
||||||
|
static const FDMonOps fdmon_epoll_ops = {
|
||||||
|
.update = fdmon_epoll_update,
|
||||||
|
.wait = fdmon_epoll_wait,
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Register every live handler that has requested events with the epoll
 * instance and, if all registrations succeed, switch @ctx to the epoll
 * callbacks.
 *
 * Returns: true on success; false as soon as one epoll_ctl() call fails, in
 * which case ctx->fdmon_ops is left untouched.
 */
static bool fdmon_epoll_try_enable(AioContext *ctx)
{
    AioHandler *handler;

    QLIST_FOREACH_RCU(handler, &ctx->aio_handlers, node) {
        struct epoll_event ev;

        /* Skip handlers that are being deleted or monitor nothing */
        if (QLIST_IS_INSERTED(handler, node_deleted) || !handler->pfd.events) {
            continue;
        }

        ev.events = epoll_events_from_pfd(handler->pfd.events);
        ev.data.ptr = handler;

        if (epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, handler->pfd.fd, &ev)) {
            return false;
        }
    }

    ctx->fdmon_ops = &fdmon_epoll_ops;
    return true;
}
|
||||||
|
|
||||||
|
/*
 * Switch @ctx from poll(2) to epoll(7) once @npfd reaches
 * EPOLL_ENABLE_THRESHOLD.
 *
 * No upgrade is attempted when there is no epoll instance or while external
 * clients are disabled. If the attempt itself fails, epoll is disabled for
 * @ctx.
 *
 * Returns: true if ctx->fdmon_ops now points at the epoll implementation.
 */
bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
{
    if (ctx->epollfd < 0) {
        return false;
    }

    /* Do not upgrade while external clients are disabled */
    if (atomic_read(&ctx->external_disable_cnt)) {
        return false;
    }

    if (npfd < EPOLL_ENABLE_THRESHOLD) {
        return false;
    }

    if (fdmon_epoll_try_enable(ctx)) {
        return true;
    }

    fdmon_epoll_disable(ctx);
    return false;
}
|
||||||
|
|
||||||
|
/*
 * Create the epoll(7) instance for @ctx.
 *
 * On failure ctx->epollfd holds -1 and a diagnostic line is written to
 * stderr; fdmon_epoll_try_upgrade() then refuses to upgrade, so @ctx keeps
 * whatever fdmon_ops it already had.
 */
void fdmon_epoll_setup(AioContext *ctx)
{
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
        /* Terminate the line so later stderr output starts on its own line */
        fprintf(stderr, "Failed to create epoll instance: %s\n",
                strerror(errno));
    }
}
|
104
util/fdmon-poll.c
Normal file
104
util/fdmon-poll.c
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||||
|
/*
|
||||||
|
* poll(2) file descriptor monitoring
|
||||||
|
*
|
||||||
|
* Uses ppoll(2) when available, g_poll() otherwise.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "aio-posix.h"
|
||||||
|
#include "qemu/rcu_queue.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* These thread-local variables are used only in fdmon_poll_wait() around the
|
||||||
|
* call to the poll() system call. In particular they are not used while
|
||||||
|
* aio_poll is performing callbacks, which makes it much easier to think about
|
||||||
|
* reentrancy!
|
||||||
|
*
|
||||||
|
* Stack-allocated arrays would be perfect but they have size limitations;
|
||||||
|
* heap allocation is expensive enough that we want to reuse arrays across
|
||||||
|
* calls to aio_poll(). And because poll() has to be called without holding
|
||||||
|
* any lock, the arrays cannot be stored in AioContext. Thread-local data
|
||||||
|
* has none of the disadvantages of these three options.
|
||||||
|
*/
|
||||||
|
static __thread GPollFD *pollfds;
|
||||||
|
static __thread AioHandler **nodes;
|
||||||
|
static __thread unsigned npfd, nalloc;
|
||||||
|
static __thread Notifier pollfds_cleanup_notifier;
|
||||||
|
|
||||||
|
/*
 * Notifier callback registered with qemu_thread_atexit_add() by add_pollfd():
 * release this thread's pollfds[]/nodes[] arrays.
 *
 * The pointers are cleared together with nalloc so the three variables stay
 * consistent: add_pollfd() treats nalloc == 0 as "nothing allocated" and
 * passes the pointers to g_renew(), which must not see freed memory.
 */
static void pollfds_cleanup(Notifier *n, void *unused)
{
    g_assert(npfd == 0);

    g_free(pollfds);
    pollfds = NULL;

    g_free(nodes);
    nodes = NULL;

    nalloc = 0;
}
|
||||||
|
|
||||||
|
/*
 * Append @node to this thread's pollfds[]/nodes[] arrays, growing them when
 * they are full.
 *
 * The first allocation (nalloc == 0) also registers pollfds_cleanup() via
 * qemu_thread_atexit_add(). Capacity starts at 8 and doubles afterwards.
 */
static void add_pollfd(AioHandler *node)
{
    const bool arrays_full = (npfd == nalloc);

    if (arrays_full) {
        if (nalloc != 0) {
            g_assert(nalloc <= INT_MAX);
            nalloc *= 2;
        } else {
            pollfds_cleanup_notifier.notify = pollfds_cleanup;
            qemu_thread_atexit_add(&pollfds_cleanup_notifier);
            nalloc = 8;
        }
        pollfds = g_renew(GPollFD, pollfds, nalloc);
        nodes = g_renew(AioHandler *, nodes, nalloc);
    }

    /* nodes[i] remembers which handler pollfds[i] belongs to */
    nodes[npfd] = node;
    pollfds[npfd] = (GPollFD) {
        .fd = node->pfd.fd,
        .events = node->pfd.events,
    };
    npfd += 1;
}
|
||||||
|
|
||||||
|
/*
 * FDMonOps.wait for poll(2).
 *
 * Builds this thread's pollfds[] from every handler in ctx->aio_handlers that
 * is not being deleted, has requested events and passes aio_node_check(),
 * then waits with qemu_poll_ns() and places each handler with non-zero
 * revents on @ready_list.
 *
 * If the number of fds makes fdmon_epoll_try_upgrade() switch @ctx to
 * epoll(7), the wait is delegated to the newly installed implementation
 * instead.
 *
 * npfd is thread-local and must be 0 again on every return path; the
 * assertion at the top of this function checks that on the next call.
 *
 * Returns: number of ready file descriptors as reported by the wait call.
 */
static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
                           int64_t timeout)
{
    AioHandler *node;
    int ret;

    assert(npfd == 0);

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
                && aio_node_check(ctx, node->is_external)) {
            add_pollfd(node);
        }
    }

    /* epoll(7) is faster above a certain number of fds */
    if (fdmon_epoll_try_upgrade(ctx, npfd)) {
        /*
         * pollfds[] will not be used for this wait. Reset npfd before
         * leaving: this function can run again on this thread (the epoll
         * wait falls back to it while external clients are disabled, and
         * fdmon_epoll_disable() reinstalls it) and asserts npfd == 0.
         */
        npfd = 0;
        return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
    }

    ret = qemu_poll_ns(pollfds, npfd, timeout);
    if (ret > 0) {
        unsigned i;

        for (i = 0; i < npfd; i++) {
            int revents = pollfds[i].revents;

            if (revents) {
                aio_add_ready_handler(ready_list, nodes[i], revents);
            }
        }
    }

    npfd = 0;
    return ret;
}
|
||||||
|
|
||||||
|
/*
 * FDMonOps.update for poll(2): intentionally empty. fdmon_poll_wait()
 * rebuilds pollfds[] from the AioHandler list on every call, so there is no
 * per-fd state to keep in sync here.
 */
static void fdmon_poll_update(AioContext *ctx, AioHandler *node, bool is_new)
{
    /* Do nothing, AioHandler already contains the state we'll need */
}
|
||||||
|
|
||||||
|
const FDMonOps fdmon_poll_ops = {
|
||||||
|
.update = fdmon_poll_update,
|
||||||
|
.wait = fdmon_poll_wait,
|
||||||
|
};
|
Loading…
Reference in New Issue
Block a user