qemu/util/fdmon-epoll.c
Stefan Hajnoczi e62da98527 aio-posix: fix race between epoll upgrade and aio_set_fd_handler()
If another thread calls aio_set_fd_handler() while the IOThread event
loop is upgrading from ppoll(2) to epoll(7) then we might miss new
AioHandlers. The epollfd will not monitor the new AioHandler's fd,
resulting in hangs.

Take the AioHandler list lock while upgrading to epoll. This prevents
AioHandlers from changing while epoll is being set up. If we cannot lock
because we're in a nested event loop, then don't upgrade to epoll (it
will happen next time we're not in a nested call).

The downside to taking the lock is that the aio_set_fd_handler() thread
has to wait until the epoll upgrade is finished, which involves many
epoll_ctl(2) system calls. However, this scenario is rare and I couldn't
think of another solution that is still simple.

Reported-by: Qing Wang <qinwang@redhat.com>
Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2090998
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Fam Zheng <fam@euphon.net>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-Id: <20230323144859.1338495-1-stefanha@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2023-03-27 15:12:17 +02:00

167 lines
4.2 KiB
C

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* epoll(7) file descriptor monitoring
*/
#include "qemu/osdep.h"
#include <sys/epoll.h>
#include "qemu/rcu_queue.h"
#include "aio-posix.h"
/*
 * The fd number threshold to switch to epoll: fdmon_epoll_try_upgrade()
 * refuses to upgrade while its npfd argument is below this value.
 */
#define EPOLL_ENABLE_THRESHOLD 64
/*
 * Stop using epoll for @ctx.
 *
 * Closes the epoll instance if one is open, marks it as absent
 * (ctx->epollfd = -1) and makes fdmon_poll_ops the active fd monitoring
 * implementation again.
 */
void fdmon_epoll_disable(AioContext *ctx)
{
    int epfd = ctx->epollfd;

    if (epfd >= 0) {
        close(epfd);
        ctx->epollfd = -1;
    }

    /* From here on the poll-based ops serve this context */
    ctx->fdmon_ops = &fdmon_poll_ops;
}
/*
 * Translate a G_IO_* event mask (as stored in a GPollFD) into the
 * corresponding EPOLL* event mask.
 *
 * Only the IN, OUT, HUP and ERR bits are carried over; any other bit set
 * in @pfd_events is dropped.
 */
static inline int epoll_events_from_pfd(int pfd_events)
{
    int epoll_events = 0;

    if (pfd_events & G_IO_IN) {
        epoll_events |= EPOLLIN;
    }
    if (pfd_events & G_IO_OUT) {
        epoll_events |= EPOLLOUT;
    }
    if (pfd_events & G_IO_HUP) {
        epoll_events |= EPOLLHUP;
    }
    if (pfd_events & G_IO_ERR) {
        epoll_events |= EPOLLERR;
    }

    return epoll_events;
}
/*
 * Mirror a handler change into the epoll instance of @ctx.
 *
 * @old_node: handler being replaced or removed, NULL when adding
 * @new_node: handler being installed, NULL when removing
 *
 *   new_node == NULL  -> EPOLL_CTL_DEL on old_node's fd
 *   old_node == NULL  -> EPOLL_CTL_ADD on new_node's fd
 *   both non-NULL     -> EPOLL_CTL_MOD on new_node's fd
 *
 * If epoll_ctl() fails, epoll is disabled for this context and monitoring
 * falls back to fdmon_poll_ops (see fdmon_epoll_disable()).
 */
static void fdmon_epoll_update(AioContext *ctx,
                               AioHandler *old_node,
                               AioHandler *new_node)
{
    /* .events stays 0 for the removal case */
    struct epoll_event ev = { .data.ptr = new_node };
    int op;
    int fd;

    if (new_node) {
        ev.events = epoll_events_from_pfd(new_node->pfd.events);
        fd = new_node->pfd.fd;
        op = old_node ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;
    } else {
        fd = old_node->pfd.fd;
        op = EPOLL_CTL_DEL;
    }

    if (epoll_ctl(ctx->epollfd, op, fd, &ev) != 0) {
        fdmon_epoll_disable(ctx);
    }
}
/*
 * Wait for fd activity on @ctx using epoll and queue every handler that
 * became ready onto @ready_list.
 *
 * @timeout: > 0 waits via qemu_poll_ns() on the epoll fd first and then
 *           collects events with a zero-timeout epoll_wait(); <= 0 is
 *           passed to epoll_wait() directly.
 *           NOTE(review): the two-step wait presumably exists because
 *           @timeout is in nanoseconds while epoll_wait() takes
 *           milliseconds - confirm against qemu_poll_ns().
 *
 * Returns the result of whichever wait call ran last: qemu_poll_ns() when
 * it reported no activity (<= 0), otherwise epoll_wait(). While external
 * clients are disabled the call is delegated to fdmon_poll_ops.wait().
 */
static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
                            int64_t timeout)
{
    struct epoll_event events[128];
    int nready;
    int idx;

    /* Fall back while external clients are disabled */
    if (qatomic_read(&ctx->external_disable_cnt)) {
        return fdmon_poll_ops.wait(ctx, ready_list, timeout);
    }

    if (timeout > 0) {
        GPollFD epoll_pfd = {
            .fd = ctx->epollfd,
            .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
        };

        nready = qemu_poll_ns(&epoll_pfd, 1, timeout);
        if (nready <= 0) {
            /* Timed out or failed: nothing to fetch from epoll */
            return nready;
        }

        /* The epoll fd is readable, so fetch events without blocking */
        timeout = 0;
    }

    nready = epoll_wait(ctx->epollfd, events, ARRAY_SIZE(events), timeout);

    /* The loop body is skipped when epoll_wait() returned 0 or an error */
    for (idx = 0; idx < nready; idx++) {
        AioHandler *handler = events[idx].data.ptr;
        int ev = events[idx].events;
        int revents = 0;

        if (ev & EPOLLIN) {
            revents |= G_IO_IN;
        }
        if (ev & EPOLLOUT) {
            revents |= G_IO_OUT;
        }
        if (ev & EPOLLHUP) {
            revents |= G_IO_HUP;
        }
        if (ev & EPOLLERR) {
            revents |= G_IO_ERR;
        }

        aio_add_ready_handler(ready_list, handler, revents);
    }

    return nready;
}
/*
 * epoll-based fd monitoring operations. fdmon_epoll_try_enable() installs
 * this table as ctx->fdmon_ops once every active handler has been added to
 * the epoll instance; fdmon_epoll_disable() switches back to fdmon_poll_ops.
 */
static const FDMonOps fdmon_epoll_ops = {
.update = fdmon_epoll_update,
.wait = fdmon_epoll_wait,
.need_wait = aio_poll_disabled,
};
/*
 * Register every active handler of @ctx with its epoll instance and, if all
 * registrations succeed, make fdmon_epoll_ops the active implementation.
 *
 * Handlers that are queued for deletion or that request no events are
 * skipped.
 *
 * Returns true on success. Returns false as soon as one epoll_ctl() call
 * fails; fds added before the failure are not removed here, the caller is
 * expected to clean up (fdmon_epoll_try_upgrade() disables epoll).
 */
static bool fdmon_epoll_try_enable(AioContext *ctx)
{
    AioHandler *handler;

    QLIST_FOREACH_RCU(handler, &ctx->aio_handlers, node) {
        if (!QLIST_IS_INSERTED(handler, node_deleted) && handler->pfd.events) {
            struct epoll_event ev = {
                .events = epoll_events_from_pfd(handler->pfd.events),
                .data.ptr = handler,
            };

            if (epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD,
                          handler->pfd.fd, &ev) != 0) {
                return false;
            }
        }
    }

    ctx->fdmon_ops = &fdmon_epoll_ops;
    return true;
}
/*
 * Try to switch @ctx from poll-based to epoll-based fd monitoring.
 *
 * @npfd: value compared against EPOLL_ENABLE_THRESHOLD; no upgrade happens
 *        below the threshold.
 *
 * The upgrade is refused (returning false, leaving the context untouched)
 * when there is no epoll instance, while external clients are disabled,
 * when @npfd is below the threshold, or when the handler list lock cannot
 * be taken. If registering the handlers fails, epoll is disabled for the
 * context and false is returned.
 *
 * Returns true if fdmon_epoll_ops is now in use.
 */
bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
{
    bool upgraded;

    /*
     * Cheap preconditions, checked in order: an epoll instance exists,
     * external clients are not disabled, and enough fds are being polled.
     */
    if (ctx->epollfd < 0 ||
        qatomic_read(&ctx->external_disable_cnt) ||
        npfd < EPOLL_ENABLE_THRESHOLD) {
        return false;
    }

    /* The list must not change while we add fds to epoll */
    if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
        return false;
    }
    upgraded = fdmon_epoll_try_enable(ctx);
    qemu_lockcnt_inc_and_unlock(&ctx->list_lock);

    if (upgraded) {
        return true;
    }

    /* Registration failed part-way: drop epoll and stay on poll */
    fdmon_epoll_disable(ctx);
    return false;
}
/*
 * Create the epoll instance for @ctx and store it in ctx->epollfd.
 *
 * On failure ctx->epollfd holds -1 (the epoll_create1() error return) and
 * the error is reported on stderr only; fdmon_epoll_try_upgrade() refuses
 * to upgrade while ctx->epollfd is negative.
 */
void fdmon_epoll_setup(AioContext *ctx)
{
    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (ctx->epollfd == -1) {
        /* Terminate the line so later stderr output is not glued onto it */
        fprintf(stderr, "Failed to create epoll instance: %s\n",
                strerror(errno));
    }
}