/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
|
|
|
|
#ifndef QEMU_AIO_H
#define QEMU_AIO_H

/* liburing is only available when QEMU was configured with io_uring support */
#ifdef CONFIG_LINUX_IO_URING
#include <liburing.h>
#endif
#include "qemu/coroutine-core.h"
#include "qemu/queue.h"
#include "qemu/event_notifier.h"
#include "qemu/thread.h"
#include "qemu/timer.h"
#include "block/graph-lock.h"
#include "hw/qdev-core.h"

/* Opaque handle for an in-flight asynchronous block I/O request. */
typedef struct BlockAIOCB BlockAIOCB;

/*
 * Completion callback invoked when an asynchronous request finishes.
 * @opaque: the user pointer registered together with the request
 * @ret: request result (by QEMU convention 0 on success, negative errno
 *       on failure -- confirm against individual call sites)
 */
typedef void BlockCompletionFunc(void *opaque, int ret);

|
|
|
typedef struct AIOCBInfo {
|
2014-10-07 15:59:14 +04:00
|
|
|
void (*cancel_async)(BlockAIOCB *acb);
|
2012-10-31 19:34:35 +04:00
|
|
|
size_t aiocb_size;
|
2012-10-31 19:34:37 +04:00
|
|
|
} AIOCBInfo;
|
2012-03-12 20:01:48 +04:00
|
|
|
|
2014-10-07 15:59:14 +04:00
|
|
|
struct BlockAIOCB {
|
2012-10-31 19:34:37 +04:00
|
|
|
const AIOCBInfo *aiocb_info;
|
2012-03-12 20:01:48 +04:00
|
|
|
BlockDriverState *bs;
|
2014-10-07 15:59:15 +04:00
|
|
|
BlockCompletionFunc *cb;
|
2012-03-12 20:01:48 +04:00
|
|
|
void *opaque;
|
2014-09-11 09:41:08 +04:00
|
|
|
int refcnt;
|
2012-03-12 20:01:48 +04:00
|
|
|
};
|
|
|
|
|
2012-10-31 19:34:37 +04:00
|
|
|
void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
|
2014-10-07 15:59:15 +04:00
|
|
|
BlockCompletionFunc *cb, void *opaque);
|
2014-09-11 09:41:28 +04:00
|
|
|
void qemu_aio_unref(void *p);
|
2014-09-11 09:41:08 +04:00
|
|
|
void qemu_aio_ref(void *p);
|
2012-03-12 20:01:48 +04:00
|
|
|
|
2012-10-30 02:45:23 +04:00
|
|
|
typedef struct AioHandler AioHandler;
|
2020-02-14 20:17:11 +03:00
|
|
|
typedef QLIST_HEAD(, AioHandler) AioHandlerList;
|
2012-10-30 02:45:23 +04:00
|
|
|
typedef void QEMUBHFunc(void *opaque);
|
2016-12-01 22:26:41 +03:00
|
|
|
typedef bool AioPollFn(void *opaque);
|
2012-10-30 02:45:23 +04:00
|
|
|
typedef void IOHandler(void *opaque);
|
|
|
|
|
/* Opaque engine state types, defined by their respective implementations. */
struct ThreadPool;
struct LinuxAioState;
struct LuringState;

2020-03-05 20:08:05 +03:00
|
|
|
/* Is polling disabled? */
|
|
|
|
bool aio_poll_disabled(AioContext *ctx);
|
|
|
|
|
2020-03-05 20:08:02 +03:00
|
|
|
/* Callbacks for file descriptor monitoring implementations */
|
|
|
|
typedef struct {
|
|
|
|
/*
|
|
|
|
* update:
|
|
|
|
* @ctx: the AioContext
|
2020-03-05 20:08:03 +03:00
|
|
|
* @old_node: the existing handler or NULL if this file descriptor is being
|
|
|
|
* monitored for the first time
|
|
|
|
* @new_node: the new handler or NULL if this file descriptor is being
|
|
|
|
* removed
|
2020-03-05 20:08:02 +03:00
|
|
|
*
|
2020-03-05 20:08:03 +03:00
|
|
|
* Add/remove/modify a monitored file descriptor.
|
2020-03-05 20:08:02 +03:00
|
|
|
*
|
|
|
|
* Called with ctx->list_lock acquired.
|
|
|
|
*/
|
2020-03-05 20:08:03 +03:00
|
|
|
void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
|
2020-03-05 20:08:02 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* wait:
|
|
|
|
* @ctx: the AioContext
|
|
|
|
* @ready_list: list for handlers that become ready
|
|
|
|
* @timeout: maximum duration to wait, in nanoseconds
|
|
|
|
*
|
|
|
|
* Wait for file descriptors to become ready and place them on ready_list.
|
|
|
|
*
|
|
|
|
* Called with ctx->list_lock incremented but not locked.
|
|
|
|
*
|
|
|
|
* Returns: number of ready file descriptors.
|
|
|
|
*/
|
|
|
|
int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
|
2020-03-05 20:08:05 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* need_wait:
|
|
|
|
* @ctx: the AioContext
|
|
|
|
*
|
|
|
|
* Tell aio_poll() when to stop userspace polling early because ->wait()
|
|
|
|
* has fds ready.
|
|
|
|
*
|
|
|
|
* File descriptor monitoring implementations that cannot poll fd readiness
|
|
|
|
* from userspace should use aio_poll_disabled() here. This ensures that
|
|
|
|
* file descriptors are not starved by handlers that frequently make
|
|
|
|
* progress via userspace polling.
|
|
|
|
*
|
|
|
|
* Returns: true if ->wait() should be called, false otherwise.
|
|
|
|
*/
|
|
|
|
bool (*need_wait)(AioContext *ctx);
|
2020-03-05 20:08:02 +03:00
|
|
|
} FDMonOps;
|
|
|
|
|
2020-02-21 12:39:51 +03:00
|
|
|
/*
|
|
|
|
* Each aio_bh_poll() call carves off a slice of the BH list, so that newly
|
|
|
|
* scheduled BHs are not processed until the next aio_bh_poll() call. All
|
|
|
|
* active aio_bh_poll() calls chain their slices together in a list, so that
|
|
|
|
* nested aio_bh_poll() calls process all scheduled bottom halves.
|
|
|
|
*/
|
|
|
|
typedef QSLIST_HEAD(, QEMUBH) BHList;
|
|
|
|
typedef struct BHListSlice BHListSlice;
|
|
|
|
struct BHListSlice {
|
|
|
|
BHList bh_list;
|
|
|
|
QSIMPLEQ_ENTRY(BHListSlice) next;
|
|
|
|
};
|
|
|
|
|
2020-03-05 20:08:04 +03:00
|
|
|
typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
|
|
|
|
|
2013-08-21 19:02:47 +04:00
|
|
|
struct AioContext {
|
2012-09-24 16:57:41 +04:00
|
|
|
GSource source;
|
|
|
|
|
2017-01-12 21:07:59 +03:00
|
|
|
/* Used by AioContext users to protect from multi-threaded access. */
|
2016-10-27 13:49:08 +03:00
|
|
|
QemuRecMutex lock;
|
2014-03-03 14:30:04 +04:00
|
|
|
|
graph-lock: Introduce a lock to protect block graph operations
Block layer graph operations are always run under BQL in the main loop.
This is proved by the assertion qemu_in_main_thread() and its wrapper
macro GLOBAL_STATE_CODE.
However, there are also concurrent coroutines running in other iothreads
that always try to traverse the graph. Currently this is protected
(among various other things) by the AioContext lock, but once this is
removed, we need to make sure that reads do not happen while modifying
the graph.
We distinguish between writer (main loop, under BQL) that modifies the
graph, and readers (all other coroutines running in various AioContext),
that go through the graph edges, reading ->parents and->children.
The writer (main loop) has "exclusive" access, so it first waits for any
current read to finish, and then prevents incoming ones from entering
while it has the exclusive access.
The readers (coroutines in multiple AioContext) are free to access the
graph as long the writer is not modifying the graph. In case it is, they
go in a CoQueue and sleep until the writer is done.
If a coroutine changes AioContext, the counter in the original and new
AioContext are left intact, since the writer does not care where the
reader is, but only if there is one.
As a result, some AioContexts might have a negative reader count, to
balance the positive count of the AioContext that took the lock. This
also means that when an AioContext is deleted it may have a nonzero
reader count. In that case we transfer the count to a global shared
counter so that the writer is always aware of all readers.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Message-Id: <20221207131838.239125-3-kwolf@redhat.com>
Reviewed-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
Reviewed-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2022-12-07 16:18:22 +03:00
|
|
|
/*
|
|
|
|
* Keep track of readers and writers of the block layer graph.
|
|
|
|
* This is essential to avoid performing additions and removal
|
|
|
|
* of nodes and edges from block graph while some
|
|
|
|
* other thread is traversing it.
|
|
|
|
*/
|
|
|
|
BdrvGraphRWlock *bdrv_graph;
|
|
|
|
|
2017-01-12 21:07:59 +03:00
|
|
|
/* The list of registered AIO handlers. Protected by ctx->list_lock. */
|
2020-02-14 20:17:11 +03:00
|
|
|
AioHandlerList aio_handlers;
|
|
|
|
|
|
|
|
/* The list of AIO handlers to be deleted. Protected by ctx->list_lock. */
|
|
|
|
AioHandlerList deleted_aio_handlers;
|
2012-09-13 14:28:51 +04:00
|
|
|
|
AioContext: fix broken ctx->dispatching optimization
This patch rewrites the ctx->dispatching optimization, which was the cause
of some mysterious hangs that could be reproduced on aarch64 KVM only.
The hangs were indirectly caused by aio_poll() and in particular by
flash memory updates's call to blk_write(), which invokes aio_poll().
Fun stuff: they had an extremely short race window, so much that
adding all kind of tracing to either the kernel or QEMU made it
go away (a single printf made it half as reproducible).
On the plus side, the failure mode (a hang until the next keypress)
made it very easy to examine the state of the process with a debugger.
And there was a very nice reproducer from Laszlo, which failed pretty
often (more than half of the time) on any version of QEMU with a non-debug
kernel; it also failed fast, while still in the firmware. So, it could
have been worse.
For some unknown reason they happened only with virtio-scsi, but
that's not important. It's more interesting that they disappeared with
io=native, making thread-pool.c a likely suspect for where the bug arose.
thread-pool.c is also one of the few places which use bottom halves
across threads, by the way.
I hope that no other similar bugs exist, but just in case :) I am
going to describe how the successful debugging went... Since the
likely culprit was the ctx->dispatching optimization, which mostly
affects bottom halves, the first observation was that there are two
qemu_bh_schedule() invocations in the thread pool: the one in the aio
worker and the one in thread_pool_completion_bh. The latter always
causes the optimization to trigger, the former may or may not. In
order to restrict the possibilities, I introduced new functions
qemu_bh_schedule_slow() and qemu_bh_schedule_fast():
/* qemu_bh_schedule_slow: */
ctx = bh->ctx;
bh->idle = 0;
if (atomic_xchg(&bh->scheduled, 1) == 0) {
event_notifier_set(&ctx->notifier);
}
/* qemu_bh_schedule_fast: */
ctx = bh->ctx;
bh->idle = 0;
assert(ctx->dispatching);
atomic_xchg(&bh->scheduled, 1);
Notice how the atomic_xchg is still in qemu_bh_schedule_slow(). This
was already debated a few months ago, so I assumed it to be correct.
In retrospect this was a very good idea, as you'll see later.
Changing thread_pool_completion_bh() to qemu_bh_schedule_fast() didn't
trigger the assertion (as expected). Changing the worker's invocation
to qemu_bh_schedule_slow() didn't hide the bug (another assumption
which luckily held). This already limited heavily the amount of
interaction between the threads, hinting that the problematic events
must have triggered around thread_pool_completion_bh().
As mentioned early, invoking a debugger to examine the state of a
hung process was pretty easy; the iothread was always waiting on a
poll(..., -1) system call. Infinite timeouts are much rarer on x86,
and this could be the reason why the bug was never observed there.
With the buggy sequence more or less resolved to an interaction between
thread_pool_completion_bh() and poll(..., -1), my "tracing" strategy was
to just add a few qemu_clock_get_ns(QEMU_CLOCK_REALTIME) calls, hoping
that the ordering of aio_ctx_prepare(), aio_ctx_dispatch, poll() and
qemu_bh_schedule_fast() would provide some hint. The output was:
(gdb) p last_prepare
$3 = 103885451
(gdb) p last_dispatch
$4 = 103876492
(gdb) p last_poll
$5 = 115909333
(gdb) p last_schedule
$6 = 115925212
Notice how the last call to qemu_poll_ns() came after aio_ctx_dispatch().
This makes little sense unless there is an aio_poll() call involved,
and indeed with a slightly different instrumentation you can see that
there is one:
(gdb) p last_prepare
$3 = 107569679
(gdb) p last_dispatch
$4 = 107561600
(gdb) p last_aio_poll
$5 = 110671400
(gdb) p last_schedule
$6 = 110698917
So the scenario becomes clearer:
iothread VCPU thread
--------------------------------------------------------------------------
aio_ctx_prepare
aio_ctx_check
qemu_poll_ns(timeout=-1)
aio_poll
aio_dispatch
thread_pool_completion_bh
qemu_bh_schedule()
At this point bh->scheduled = 1 and the iothread has not been woken up.
The solution must be close, but this alone should not be a problem,
because the bottom half is only rescheduled to account for rare situations
(see commit 3c80ca1, thread-pool: avoid deadlock in nested aio_poll()
calls, 2014-07-15).
Introducing a third thread---a thread pool worker thread, which
also does qemu_bh_schedule()---does bring out the problematic case.
The third thread must be awakened *after* the callback is complete and
thread_pool_completion_bh has redone the whole loop, explaining the
short race window. And then this is what happens:
thread pool worker
--------------------------------------------------------------------------
<I/O completes>
qemu_bh_schedule()
Tada, bh->scheduled is already 1, so qemu_bh_schedule() does nothing
and the iothread is never woken up. This is where the bh->scheduled
optimization comes into play---it is correct, but removing it would
have masked the bug.
So, what is the bug?
Well, the question asked by the ctx->dispatching optimization ("is any
active aio_poll dispatching?") was wrong. The right question to ask
instead is "is any active aio_poll *not* dispatching", i.e. in the prepare
or poll phases? In that case, the aio_poll is sleeping or might go to
sleep anytime soon, and the EventNotifier must be invoked to wake
it up.
In any other case (including if there is *no* active aio_poll at all!)
we can just wait for the next prepare phase to pick up the event (e.g. a
bottom half); the prepare phase will avoid the blocking and service the
bottom half.
Expressing the invariant with a logic formula, the broken one looked like:
!(exists(thread): in_dispatching(thread)) => !optimize
or equivalently:
!(exists(thread):
in_aio_poll(thread) && in_dispatching(thread)) => !optimize
In the correct one, the negation is in a slightly different place:
(exists(thread):
in_aio_poll(thread) && !in_dispatching(thread)) => !optimize
or equivalently:
(exists(thread): in_prepare_or_poll(thread)) => !optimize
Even if the difference boils down to moving an exclamation mark :)
the implementation is quite different. However, I think the new
one is simpler to understand.
In the old implementation, the "exists" was implemented with a boolean
value. This didn't really support well the case of multiple concurrent
event loops, but I thought that this was okay: aio_poll holds the
AioContext lock so there cannot be concurrent aio_poll invocations, and
I was just considering nested event loops. However, aio_poll _could_
indeed be concurrent with the GSource. This is why I came up with the
wrong invariant.
In the new implementation, "exists" is computed simply by counting how many
threads are in the prepare or poll phases. There are some interesting
points to consider, but the gist of the idea remains:
1) AioContext can be used through GSource as well; as mentioned in the
patch, bit 0 of the counter is reserved for the GSource.
2) the counter need not be updated for a non-blocking aio_poll, because
it won't sleep forever anyway. This is just a matter of checking
the "blocking" variable. This requires some changes to the win32
implementation, but is otherwise not too complicated.
3) as mentioned above, the new implementation will not call aio_notify
when there is *no* active aio_poll at all. The tests have to be
adjusted for this change. The calls to aio_notify in async.c are fine;
they only want to kick aio_poll out of a blocking wait, but need not
do anything if aio_poll is not running.
4) nested aio_poll: these just work with the new implementation; when
a nested event loop is invoked, the outer event loop is never in the
prepare or poll phases. The outer event loop thus has already decremented
the counter.
Reported-by: Richard W. M. Jones <rjones@redhat.com>
Reported-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Message-id: 1437487673-23740-5-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-07-21 17:07:51 +03:00
|
|
|
/* Used to avoid unnecessary event_notifier_set calls in aio_notify;
|
2020-04-07 17:07:45 +03:00
|
|
|
* only written from the AioContext home thread, or under the BQL in
|
|
|
|
* the case of the main AioContext. However, it is read from any
|
|
|
|
* thread so it is still accessed with atomic primitives.
|
|
|
|
*
|
|
|
|
* If this field is 0, everything (file descriptors, bottom halves,
|
|
|
|
* timers) will be re-evaluated before the next blocking poll() or
|
|
|
|
* io_uring wait; therefore, the event_notifier_set call can be
|
|
|
|
* skipped. If it is non-zero, you may need to wake up a concurrent
|
|
|
|
* aio_poll or the glib main event loop, making event_notifier_set
|
|
|
|
* necessary.
|
AioContext: fix broken ctx->dispatching optimization
This patch rewrites the ctx->dispatching optimization, which was the cause
of some mysterious hangs that could be reproduced on aarch64 KVM only.
The hangs were indirectly caused by aio_poll() and in particular by
flash memory updates's call to blk_write(), which invokes aio_poll().
Fun stuff: they had an extremely short race window, so much that
adding all kind of tracing to either the kernel or QEMU made it
go away (a single printf made it half as reproducible).
On the plus side, the failure mode (a hang until the next keypress)
made it very easy to examine the state of the process with a debugger.
And there was a very nice reproducer from Laszlo, which failed pretty
often (more than half of the time) on any version of QEMU with a non-debug
kernel; it also failed fast, while still in the firmware. So, it could
have been worse.
For some unknown reason they happened only with virtio-scsi, but
that's not important. It's more interesting that they disappeared with
io=native, making thread-pool.c a likely suspect for where the bug arose.
thread-pool.c is also one of the few places which use bottom halves
across threads, by the way.
I hope that no other similar bugs exist, but just in case :) I am
going to describe how the successful debugging went... Since the
likely culprit was the ctx->dispatching optimization, which mostly
affects bottom halves, the first observation was that there are two
qemu_bh_schedule() invocations in the thread pool: the one in the aio
worker and the one in thread_pool_completion_bh. The latter always
causes the optimization to trigger, the former may or may not. In
order to restrict the possibilities, I introduced new functions
qemu_bh_schedule_slow() and qemu_bh_schedule_fast():
/* qemu_bh_schedule_slow: */
ctx = bh->ctx;
bh->idle = 0;
if (atomic_xchg(&bh->scheduled, 1) == 0) {
event_notifier_set(&ctx->notifier);
}
/* qemu_bh_schedule_fast: */
ctx = bh->ctx;
bh->idle = 0;
assert(ctx->dispatching);
atomic_xchg(&bh->scheduled, 1);
Notice how the atomic_xchg is still in qemu_bh_schedule_slow(). This
was already debated a few months ago, so I assumed it to be correct.
In retrospect this was a very good idea, as you'll see later.
Changing thread_pool_completion_bh() to qemu_bh_schedule_fast() didn't
trigger the assertion (as expected). Changing the worker's invocation
to qemu_bh_schedule_slow() didn't hide the bug (another assumption
which luckily held). This already limited heavily the amount of
interaction between the threads, hinting that the problematic events
must have triggered around thread_pool_completion_bh().
As mentioned early, invoking a debugger to examine the state of a
hung process was pretty easy; the iothread was always waiting on a
poll(..., -1) system call. Infinite timeouts are much rarer on x86,
and this could be the reason why the bug was never observed there.
With the buggy sequence more or less resolved to an interaction between
thread_pool_completion_bh() and poll(..., -1), my "tracing" strategy was
to just add a few qemu_clock_get_ns(QEMU_CLOCK_REALTIME) calls, hoping
that the ordering of aio_ctx_prepare(), aio_ctx_dispatch, poll() and
qemu_bh_schedule_fast() would provide some hint. The output was:
(gdb) p last_prepare
$3 = 103885451
(gdb) p last_dispatch
$4 = 103876492
(gdb) p last_poll
$5 = 115909333
(gdb) p last_schedule
$6 = 115925212
Notice how the last call to qemu_poll_ns() came after aio_ctx_dispatch().
This makes little sense unless there is an aio_poll() call involved,
and indeed with a slightly different instrumentation you can see that
there is one:
(gdb) p last_prepare
$3 = 107569679
(gdb) p last_dispatch
$4 = 107561600
(gdb) p last_aio_poll
$5 = 110671400
(gdb) p last_schedule
$6 = 110698917
So the scenario becomes clearer:
iothread VCPU thread
--------------------------------------------------------------------------
aio_ctx_prepare
aio_ctx_check
qemu_poll_ns(timeout=-1)
aio_poll
aio_dispatch
thread_pool_completion_bh
qemu_bh_schedule()
At this point bh->scheduled = 1 and the iothread has not been woken up.
The solution must be close, but this alone should not be a problem,
because the bottom half is only rescheduled to account for rare situations
(see commit 3c80ca1, thread-pool: avoid deadlock in nested aio_poll()
calls, 2014-07-15).
Introducing a third thread---a thread pool worker thread, which
also does qemu_bh_schedule()---does bring out the problematic case.
The third thread must be awakened *after* the callback is complete and
thread_pool_completion_bh has redone the whole loop, explaining the
short race window. And then this is what happens:
thread pool worker
--------------------------------------------------------------------------
<I/O completes>
qemu_bh_schedule()
Tada, bh->scheduled is already 1, so qemu_bh_schedule() does nothing
and the iothread is never woken up. This is where the bh->scheduled
optimization comes into play---it is correct, but removing it would
have masked the bug.
So, what is the bug?
Well, the question asked by the ctx->dispatching optimization ("is any
active aio_poll dispatching?") was wrong. The right question to ask
instead is "is any active aio_poll *not* dispatching", i.e. in the prepare
or poll phases? In that case, the aio_poll is sleeping or might go to
sleep anytime soon, and the EventNotifier must be invoked to wake
it up.
In any other case (including if there is *no* active aio_poll at all!)
we can just wait for the next prepare phase to pick up the event (e.g. a
bottom half); the prepare phase will avoid the blocking and service the
bottom half.
Expressing the invariant with a logic formula, the broken one looked like:
!(exists(thread): in_dispatching(thread)) => !optimize
or equivalently:
!(exists(thread):
in_aio_poll(thread) && in_dispatching(thread)) => !optimize
In the correct one, the negation is in a slightly different place:
(exists(thread):
in_aio_poll(thread) && !in_dispatching(thread)) => !optimize
or equivalently:
(exists(thread): in_prepare_or_poll(thread)) => !optimize
Even if the difference boils down to moving an exclamation mark :)
the implementation is quite different. However, I think the new
one is simpler to understand.
In the old implementation, the "exists" was implemented with a boolean
value. This didn't really support well the case of multiple concurrent
event loops, but I thought that this was okay: aio_poll holds the
AioContext lock so there cannot be concurrent aio_poll invocations, and
I was just considering nested event loops. However, aio_poll _could_
indeed be concurrent with the GSource. This is why I came up with the
wrong invariant.
In the new implementation, "exists" is computed simply by counting how many
threads are in the prepare or poll phases. There are some interesting
points to consider, but the gist of the idea remains:
1) AioContext can be used through GSource as well; as mentioned in the
patch, bit 0 of the counter is reserved for the GSource.
2) the counter need not be updated for a non-blocking aio_poll, because
it won't sleep forever anyway. This is just a matter of checking
the "blocking" variable. This requires some changes to the win32
implementation, but is otherwise not too complicated.
3) as mentioned above, the new implementation will not call aio_notify
when there is *no* active aio_poll at all. The tests have to be
adjusted for this change. The calls to aio_notify in async.c are fine;
they only want to kick aio_poll out of a blocking wait, but need not
do anything if aio_poll is not running.
4) nested aio_poll: these just work with the new implementation; when
a nested event loop is invoked, the outer event loop is never in the
prepare or poll phases. The outer event loop thus has already decremented
the counter.
Reported-by: Richard W. M. Jones <rjones@redhat.com>
Reported-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Message-id: 1437487673-23740-5-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-07-21 17:07:51 +03:00
|
|
|
*
|
|
|
|
* Bit 0 is reserved for GSource usage of the AioContext, and is 1
|
2016-07-15 12:44:18 +03:00
|
|
|
* between a call to aio_ctx_prepare and the next call to aio_ctx_check.
|
AioContext: fix broken ctx->dispatching optimization
This patch rewrites the ctx->dispatching optimization, which was the cause
of some mysterious hangs that could be reproduced on aarch64 KVM only.
The hangs were indirectly caused by aio_poll() and in particular by
flash memory updates's call to blk_write(), which invokes aio_poll().
Fun stuff: they had an extremely short race window, so much that
adding all kind of tracing to either the kernel or QEMU made it
go away (a single printf made it half as reproducible).
On the plus side, the failure mode (a hang until the next keypress)
made it very easy to examine the state of the process with a debugger.
And there was a very nice reproducer from Laszlo, which failed pretty
often (more than half of the time) on any version of QEMU with a non-debug
kernel; it also failed fast, while still in the firmware. So, it could
have been worse.
For some unknown reason they happened only with virtio-scsi, but
that's not important. It's more interesting that they disappeared with
io=native, making thread-pool.c a likely suspect for where the bug arose.
thread-pool.c is also one of the few places which use bottom halves
across threads, by the way.
I hope that no other similar bugs exist, but just in case :) I am
going to describe how the successful debugging went... Since the
likely culprit was the ctx->dispatching optimization, which mostly
affects bottom halves, the first observation was that there are two
qemu_bh_schedule() invocations in the thread pool: the one in the aio
worker and the one in thread_pool_completion_bh. The latter always
causes the optimization to trigger, the former may or may not. In
order to restrict the possibilities, I introduced new functions
qemu_bh_schedule_slow() and qemu_bh_schedule_fast():
/* qemu_bh_schedule_slow: */
ctx = bh->ctx;
bh->idle = 0;
if (atomic_xchg(&bh->scheduled, 1) == 0) {
event_notifier_set(&ctx->notifier);
}
/* qemu_bh_schedule_fast: */
ctx = bh->ctx;
bh->idle = 0;
assert(ctx->dispatching);
atomic_xchg(&bh->scheduled, 1);
Notice how the atomic_xchg is still in qemu_bh_schedule_slow(). This
was already debated a few months ago, so I assumed it to be correct.
In retrospect this was a very good idea, as you'll see later.
Changing thread_pool_completion_bh() to qemu_bh_schedule_fast() didn't
trigger the assertion (as expected). Changing the worker's invocation
to qemu_bh_schedule_slow() didn't hide the bug (another assumption
which luckily held). This already limited heavily the amount of
interaction between the threads, hinting that the problematic events
must have triggered around thread_pool_completion_bh().
As mentioned early, invoking a debugger to examine the state of a
hung process was pretty easy; the iothread was always waiting on a
poll(..., -1) system call. Infinite timeouts are much rarer on x86,
and this could be the reason why the bug was never observed there.
With the buggy sequence more or less resolved to an interaction between
thread_pool_completion_bh() and poll(..., -1), my "tracing" strategy was
to just add a few qemu_clock_get_ns(QEMU_CLOCK_REALTIME) calls, hoping
that the ordering of aio_ctx_prepare(), aio_ctx_dispatch, poll() and
qemu_bh_schedule_fast() would provide some hint. The output was:
(gdb) p last_prepare
$3 = 103885451
(gdb) p last_dispatch
$4 = 103876492
(gdb) p last_poll
$5 = 115909333
(gdb) p last_schedule
$6 = 115925212
Notice how the last call to qemu_poll_ns() came after aio_ctx_dispatch().
This makes little sense unless there is an aio_poll() call involved,
and indeed with a slightly different instrumentation you can see that
there is one:
(gdb) p last_prepare
$3 = 107569679
(gdb) p last_dispatch
$4 = 107561600
(gdb) p last_aio_poll
$5 = 110671400
(gdb) p last_schedule
$6 = 110698917
So the scenario becomes clearer:
iothread VCPU thread
--------------------------------------------------------------------------
aio_ctx_prepare
aio_ctx_check
qemu_poll_ns(timeout=-1)
aio_poll
aio_dispatch
thread_pool_completion_bh
qemu_bh_schedule()
At this point bh->scheduled = 1 and the iothread has not been woken up.
The solution must be close, but this alone should not be a problem,
because the bottom half is only rescheduled to account for rare situations
(see commit 3c80ca1, thread-pool: avoid deadlock in nested aio_poll()
calls, 2014-07-15).
Introducing a third thread---a thread pool worker thread, which
also does qemu_bh_schedule()---does bring out the problematic case.
The third thread must be awakened *after* the callback is complete and
thread_pool_completion_bh has redone the whole loop, explaining the
short race window. And then this is what happens:
thread pool worker
--------------------------------------------------------------------------
<I/O completes>
qemu_bh_schedule()
Tada, bh->scheduled is already 1, so qemu_bh_schedule() does nothing
and the iothread is never woken up. This is where the bh->scheduled
optimization comes into play---it is correct, but removing it would
have masked the bug.
So, what is the bug?
Well, the question asked by the ctx->dispatching optimization ("is any
active aio_poll dispatching?") was wrong. The right question to ask
instead is "is any active aio_poll *not* dispatching", i.e. in the prepare
or poll phases? In that case, the aio_poll is sleeping or might go to
sleep anytime soon, and the EventNotifier must be invoked to wake
it up.
In any other case (including if there is *no* active aio_poll at all!)
we can just wait for the next prepare phase to pick up the event (e.g. a
bottom half); the prepare phase will avoid the blocking and service the
bottom half.
Expressing the invariant with a logic formula, the broken one looked like:
!(exists(thread): in_dispatching(thread)) => !optimize
or equivalently:
!(exists(thread):
in_aio_poll(thread) && in_dispatching(thread)) => !optimize
In the correct one, the negation is in a slightly different place:
(exists(thread):
in_aio_poll(thread) && !in_dispatching(thread)) => !optimize
or equivalently:
(exists(thread): in_prepare_or_poll(thread)) => !optimize
Even if the difference boils down to moving an exclamation mark :)
the implementation is quite different. However, I think the new
one is simpler to understand.
In the old implementation, the "exists" was implemented with a boolean
value. This didn't really support well the case of multiple concurrent
event loops, but I thought that this was okay: aio_poll holds the
AioContext lock so there cannot be concurrent aio_poll invocations, and
I was just considering nested event loops. However, aio_poll _could_
indeed be concurrent with the GSource. This is why I came up with the
wrong invariant.
In the new implementation, "exists" is computed simply by counting how many
threads are in the prepare or poll phases. There are some interesting
points to consider, but the gist of the idea remains:
1) AioContext can be used through GSource as well; as mentioned in the
patch, bit 0 of the counter is reserved for the GSource.
2) the counter need not be updated for a non-blocking aio_poll, because
it won't sleep forever anyway. This is just a matter of checking
the "blocking" variable. This requires some changes to the win32
implementation, but is otherwise not too complicated.
3) as mentioned above, the new implementation will not call aio_notify
when there is *no* active aio_poll at all. The tests have to be
adjusted for this change. The calls to aio_notify in async.c are fine;
they only want to kick aio_poll out of a blocking wait, but need not
do anything if aio_poll is not running.
4) nested aio_poll: these just work with the new implementation; when
a nested event loop is invoked, the outer event loop is never in the
prepare or poll phases. The outer event loop thus has already decremented
the counter.
Reported-by: Richard W. M. Jones <rjones@redhat.com>
Reported-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Message-id: 1437487673-23740-5-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-07-21 17:07:51 +03:00
|
|
|
* Bits 1-31 simply count the number of active calls to aio_poll
|
|
|
|
* that are in the prepare or poll phase.
|
|
|
|
*
|
|
|
|
* The GSource and aio_poll must use a different mechanism because
|
|
|
|
* there is no certainty that a call to GSource's prepare callback
|
|
|
|
* (via g_main_context_prepare) is indeed followed by check and
|
|
|
|
* dispatch. It's not clear whether this would be a bug, but let's
|
|
|
|
* play safe and allow it---it will just cause extra calls to
|
|
|
|
* event_notifier_set until the next call to dispatch.
|
|
|
|
*
|
|
|
|
* Instead, the aio_poll calls include both the prepare and the
|
|
|
|
* dispatch phase, hence a simple counter is enough for them.
|
2014-07-07 17:18:04 +04:00
|
|
|
*/
|
AioContext: fix broken ctx->dispatching optimization
This patch rewrites the ctx->dispatching optimization, which was the cause
of some mysterious hangs that could be reproduced on aarch64 KVM only.
The hangs were indirectly caused by aio_poll() and in particular by
flash memory updates's call to blk_write(), which invokes aio_poll().
Fun stuff: they had an extremely short race window, so much that
adding all kind of tracing to either the kernel or QEMU made it
go away (a single printf made it half as reproducible).
On the plus side, the failure mode (a hang until the next keypress)
made it very easy to examine the state of the process with a debugger.
And there was a very nice reproducer from Laszlo, which failed pretty
often (more than half of the time) on any version of QEMU with a non-debug
kernel; it also failed fast, while still in the firmware. So, it could
have been worse.
For some unknown reason they happened only with virtio-scsi, but
that's not important. It's more interesting that they disappeared with
io=native, making thread-pool.c a likely suspect for where the bug arose.
thread-pool.c is also one of the few places which use bottom halves
across threads, by the way.
I hope that no other similar bugs exist, but just in case :) I am
going to describe how the successful debugging went... Since the
likely culprit was the ctx->dispatching optimization, which mostly
affects bottom halves, the first observation was that there are two
qemu_bh_schedule() invocations in the thread pool: the one in the aio
worker and the one in thread_pool_completion_bh. The latter always
causes the optimization to trigger, the former may or may not. In
order to restrict the possibilities, I introduced new functions
qemu_bh_schedule_slow() and qemu_bh_schedule_fast():
/* qemu_bh_schedule_slow: */
ctx = bh->ctx;
bh->idle = 0;
if (atomic_xchg(&bh->scheduled, 1) == 0) {
event_notifier_set(&ctx->notifier);
}
/* qemu_bh_schedule_fast: */
ctx = bh->ctx;
bh->idle = 0;
assert(ctx->dispatching);
atomic_xchg(&bh->scheduled, 1);
Notice how the atomic_xchg is still in qemu_bh_schedule_slow(). This
was already debated a few months ago, so I assumed it to be correct.
In retrospect this was a very good idea, as you'll see later.
Changing thread_pool_completion_bh() to qemu_bh_schedule_fast() didn't
trigger the assertion (as expected). Changing the worker's invocation
to qemu_bh_schedule_slow() didn't hide the bug (another assumption
which luckily held). This already limited heavily the amount of
interaction between the threads, hinting that the problematic events
must have triggered around thread_pool_completion_bh().
As mentioned earlier, invoking a debugger to examine the state of a
hung process was pretty easy; the iothread was always waiting on a
poll(..., -1) system call. Infinite timeouts are much rarer on x86,
and this could be the reason why the bug was never observed there.
With the buggy sequence more or less resolved to an interaction between
thread_pool_completion_bh() and poll(..., -1), my "tracing" strategy was
to just add a few qemu_clock_get_ns(QEMU_CLOCK_REALTIME) calls, hoping
that the ordering of aio_ctx_prepare(), aio_ctx_dispatch(), poll() and
qemu_bh_schedule_fast() would provide some hint. The output was:
(gdb) p last_prepare
$3 = 103885451
(gdb) p last_dispatch
$4 = 103876492
(gdb) p last_poll
$5 = 115909333
(gdb) p last_schedule
$6 = 115925212
Notice how the last call to qemu_poll_ns() came after aio_ctx_dispatch().
This makes little sense unless there is an aio_poll() call involved,
and indeed with a slightly different instrumentation you can see that
there is one:
(gdb) p last_prepare
$3 = 107569679
(gdb) p last_dispatch
$4 = 107561600
(gdb) p last_aio_poll
$5 = 110671400
(gdb) p last_schedule
$6 = 110698917
So the scenario becomes clearer:
iothread VCPU thread
--------------------------------------------------------------------------
aio_ctx_prepare
aio_ctx_check
qemu_poll_ns(timeout=-1)
aio_poll
aio_dispatch
thread_pool_completion_bh
qemu_bh_schedule()
At this point bh->scheduled = 1 and the iothread has not been woken up.
The solution must be close, but this alone should not be a problem,
because the bottom half is only rescheduled to account for rare situations
(see commit 3c80ca1, thread-pool: avoid deadlock in nested aio_poll()
calls, 2014-07-15).
Introducing a third thread---a thread pool worker thread, which
also does qemu_bh_schedule()---does bring out the problematic case.
The third thread must be awakened *after* the callback is complete and
thread_pool_completion_bh has redone the whole loop, explaining the
short race window. And then this is what happens:
thread pool worker
--------------------------------------------------------------------------
<I/O completes>
qemu_bh_schedule()
Tada, bh->scheduled is already 1, so qemu_bh_schedule() does nothing
and the iothread is never woken up. This is where the bh->scheduled
optimization comes into play---it is correct, but removing it would
have masked the bug.
So, what is the bug?
Well, the question asked by the ctx->dispatching optimization ("is any
active aio_poll dispatching?") was wrong. The right question to ask
instead is "is any active aio_poll *not* dispatching", i.e. in the prepare
or poll phases? In that case, the aio_poll is sleeping or might go to
sleep anytime soon, and the EventNotifier must be invoked to wake
it up.
In any other case (including if there is *no* active aio_poll at all!)
we can just wait for the next prepare phase to pick up the event (e.g. a
bottom half); the prepare phase will avoid the blocking and service the
bottom half.
Expressing the invariant with a logic formula, the broken one looked like:
!(exists(thread): in_dispatching(thread)) => !optimize
or equivalently:
!(exists(thread):
in_aio_poll(thread) && in_dispatching(thread)) => !optimize
In the correct one, the negation is in a slightly different place:
(exists(thread):
in_aio_poll(thread) && !in_dispatching(thread)) => !optimize
or equivalently:
(exists(thread): in_prepare_or_poll(thread)) => !optimize
Even if the difference boils down to moving an exclamation mark :)
the implementation is quite different. However, I think the new
one is simpler to understand.
In the old implementation, the "exists" was implemented with a boolean
value. This didn't really support well the case of multiple concurrent
event loops, but I thought that this was okay: aio_poll holds the
AioContext lock so there cannot be concurrent aio_poll invocations, and
I was just considering nested event loops. However, aio_poll _could_
indeed be concurrent with the GSource. This is why I came up with the
wrong invariant.
In the new implementation, "exists" is computed simply by counting how many
threads are in the prepare or poll phases. There are some interesting
points to consider, but the gist of the idea remains:
1) AioContext can be used through GSource as well; as mentioned in the
patch, bit 0 of the counter is reserved for the GSource.
2) the counter need not be updated for a non-blocking aio_poll, because
it won't sleep forever anyway. This is just a matter of checking
the "blocking" variable. This requires some changes to the win32
implementation, but is otherwise not too complicated.
3) as mentioned above, the new implementation will not call aio_notify
when there is *no* active aio_poll at all. The tests have to be
adjusted for this change. The calls to aio_notify in async.c are fine;
they only want to kick aio_poll out of a blocking wait, but need not
do anything if aio_poll is not running.
4) nested aio_poll: these just work with the new implementation; when
a nested event loop is invoked, the outer event loop is never in the
prepare or poll phases. The outer event loop thus has already decremented
the counter.
Reported-by: Richard W. M. Jones <rjones@redhat.com>
Reported-by: Laszlo Ersek <lersek@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
Message-id: 1437487673-23740-5-git-send-email-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2015-07-21 17:07:51 +03:00
|
|
|
uint32_t notify_me;
|
2014-07-07 17:18:04 +04:00
|
|
|
|
2017-01-12 21:07:59 +03:00
|
|
|
/* A lock to protect against concurrent QEMUBH and AioHandler additions and deletions,
|
|
|
|
* and to ensure that no callbacks are removed while we're walking and
|
|
|
|
* dispatching them.
|
2017-01-12 21:07:53 +03:00
|
|
|
*/
|
|
|
|
QemuLockCnt list_lock;
|
2014-07-07 17:18:04 +04:00
|
|
|
|
2020-02-21 12:39:51 +03:00
|
|
|
/* Bottom Halves pending aio_bh_poll() processing */
|
|
|
|
BHList bh_list;
|
|
|
|
|
|
|
|
/* Chained BH list slices for each nested aio_bh_poll() call */
|
|
|
|
QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
|
2012-10-30 02:45:23 +04:00
|
|
|
|
2015-07-21 17:07:53 +03:00
|
|
|
/* Used by aio_notify.
|
|
|
|
*
|
|
|
|
* "notified" is used to avoid expensive event_notifier_test_and_clear
|
|
|
|
* calls. When it is clear, the EventNotifier is clear, or one thread
|
|
|
|
* is going to clear "notified" before processing more events. False
|
|
|
|
* positives are possible, i.e. "notified" could be set even though the
|
|
|
|
* EventNotifier is clear.
|
|
|
|
*
|
|
|
|
* Note that event_notifier_set *cannot* be optimized the same way. For
|
|
|
|
* more information on the problem that would result, see "#ifdef BUG2"
|
|
|
|
* in the docs/aio_notify_accept.promela formal model.
|
|
|
|
*/
|
|
|
|
bool notified;
|
2012-09-24 20:44:14 +04:00
|
|
|
EventNotifier notifier;
|
2013-02-20 14:28:32 +04:00
|
|
|
|
2017-02-13 16:52:19 +03:00
|
|
|
QSLIST_HEAD(, Coroutine) scheduled_coroutines;
|
|
|
|
QEMUBH *co_schedule_bh;
|
|
|
|
|
2022-04-25 10:57:23 +03:00
|
|
|
int thread_pool_min;
|
|
|
|
int thread_pool_max;
|
2017-01-12 21:07:59 +03:00
|
|
|
/* Thread pool for performing work and receiving completion callbacks.
|
|
|
|
* Has its own locking.
|
|
|
|
*/
|
2013-03-07 16:41:47 +04:00
|
|
|
struct ThreadPool *thread_pool;
|
2013-08-21 19:02:49 +04:00
|
|
|
|
2016-07-04 19:33:20 +03:00
|
|
|
#ifdef CONFIG_LINUX_AIO
|
|
|
|
struct LinuxAioState *linux_aio;
|
|
|
|
#endif
|
2020-01-20 17:18:47 +03:00
|
|
|
#ifdef CONFIG_LINUX_IO_URING
|
|
|
|
struct LuringState *linux_io_uring;
|
2020-03-05 20:08:04 +03:00
|
|
|
|
|
|
|
/* State for file descriptor monitoring using Linux io_uring */
|
|
|
|
struct io_uring fdmon_io_uring;
|
|
|
|
AioHandlerSList submit_list;
|
2020-01-20 17:18:47 +03:00
|
|
|
#endif
|
2016-07-04 19:33:20 +03:00
|
|
|
|
2017-01-12 21:07:59 +03:00
|
|
|
/* TimerLists for calling timers - one per clock type. Has its own
|
|
|
|
* locking.
|
|
|
|
*/
|
2013-08-21 19:02:49 +04:00
|
|
|
QEMUTimerListGroup tlg;
|
2015-10-23 06:08:08 +03:00
|
|
|
|
2016-12-01 22:26:42 +03:00
|
|
|
/* Number of AioHandlers without .io_poll() */
|
|
|
|
int poll_disable_cnt;
|
|
|
|
|
2016-12-01 22:26:51 +03:00
|
|
|
/* Polling mode parameters */
|
|
|
|
int64_t poll_ns; /* current polling time in nanoseconds */
|
|
|
|
int64_t poll_max_ns; /* maximum polling time in nanoseconds */
|
|
|
|
int64_t poll_grow; /* polling time growth factor */
|
|
|
|
int64_t poll_shrink; /* polling time shrink factor */
|
2016-12-01 22:26:42 +03:00
|
|
|
|
2021-07-21 12:42:10 +03:00
|
|
|
/* AIO engine parameters */
|
|
|
|
int64_t aio_max_batch; /* maximum number of requests in a batch */
|
|
|
|
|
2020-03-05 20:08:06 +03:00
|
|
|
/*
|
|
|
|
* List of handlers participating in userspace polling. Protected by
|
|
|
|
* ctx->list_lock. Iterated and modified mostly by the event loop thread
|
|
|
|
* from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
|
|
|
|
* only touches the list to delete nodes if ctx->list_lock's count is zero.
|
|
|
|
*/
|
|
|
|
AioHandlerList poll_aio_handlers;
|
|
|
|
|
2016-12-01 22:26:49 +03:00
|
|
|
/* Are we in polling mode or monitoring file descriptors? */
|
|
|
|
bool poll_started;
|
|
|
|
|
2015-10-30 07:06:29 +03:00
|
|
|
/* epoll(7) state used when built with CONFIG_EPOLL */
|
|
|
|
int epollfd;
|
2020-03-05 20:08:02 +03:00
|
|
|
|
|
|
|
const FDMonOps *fdmon_ops;
|
2013-08-21 19:02:47 +04:00
|
|
|
};
|
2012-10-30 02:45:23 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* aio_context_new: Allocate a new AioContext.
|
|
|
|
*
|
|
|
|
* AioContexts provide a mini event-loop that can be waited on synchronously.
|
|
|
|
* They also provide bottom halves, a service to execute a piece of code
|
|
|
|
* as soon as possible.
|
|
|
|
*/
|
2014-09-18 15:30:49 +04:00
|
|
|
AioContext *aio_context_new(Error **errp);
|
2012-10-30 02:45:23 +04:00
|
|
|
|
2012-09-24 16:57:41 +04:00
|
|
|
/**
|
|
|
|
* aio_context_ref:
|
|
|
|
* @ctx: The AioContext to operate on.
|
|
|
|
*
|
|
|
|
* Add a reference to an AioContext.
|
|
|
|
*/
|
|
|
|
void aio_context_ref(AioContext *ctx);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* aio_context_unref:
|
|
|
|
* @ctx: The AioContext to operate on.
|
|
|
|
*
|
|
|
|
* Drop a reference to an AioContext.
|
|
|
|
*/
|
|
|
|
void aio_context_unref(AioContext *ctx);
|
|
|
|
|
2021-04-14 23:02:46 +03:00
|
|
|
/**
|
|
|
|
* aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
|
|
|
|
* run only once and as soon as possible.
|
|
|
|
*
|
|
|
|
* @name: A human-readable identifier for debugging purposes.
|
|
|
|
*/
|
|
|
|
void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
|
|
|
|
const char *name);
|
|
|
|
|
2016-10-03 19:14:15 +03:00
|
|
|
/**
|
|
|
|
* aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
|
|
|
|
* only once and as soon as possible.
|
2021-04-14 23:02:46 +03:00
|
|
|
*
|
|
|
|
* A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
|
|
|
|
* name string.
|
2016-10-03 19:14:15 +03:00
|
|
|
*/
|
2021-04-14 23:02:46 +03:00
|
|
|
#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
|
|
|
|
aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
|
2016-10-03 19:14:15 +03:00
|
|
|
|
2012-10-30 02:45:23 +04:00
|
|
|
/**
|
2021-04-14 23:02:46 +03:00
|
|
|
* aio_bh_new_full: Allocate a new bottom half structure.
|
2012-10-30 02:45:23 +04:00
|
|
|
*
|
|
|
|
* Bottom halves are lightweight callbacks whose invocation is guaranteed
|
|
|
|
* to be wait-free, thread-safe and signal-safe. The #QEMUBH structure
|
|
|
|
* is opaque and must be allocated prior to its use.
|
2021-04-14 23:02:46 +03:00
|
|
|
*
|
|
|
|
* @name: A human-readable identifier for debugging purposes.
|
2023-04-28 00:10:07 +03:00
|
|
|
* @reentrancy_guard: A guard set when entering a cb to prevent
|
|
|
|
* device-reentrancy issues
|
2021-04-14 23:02:46 +03:00
|
|
|
*/
|
|
|
|
QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
|
2023-04-28 00:10:07 +03:00
|
|
|
const char *name, MemReentrancyGuard *reentrancy_guard);
|
2021-04-14 23:02:46 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* aio_bh_new: Allocate a new bottom half structure
|
|
|
|
*
|
|
|
|
* A convenience wrapper for aio_bh_new_full() that uses the cb as the name
|
|
|
|
* string.
|
2012-10-30 02:45:23 +04:00
|
|
|
*/
|
2021-04-14 23:02:46 +03:00
|
|
|
#define aio_bh_new(ctx, cb, opaque) \
|
2023-04-28 00:10:07 +03:00
|
|
|
aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), NULL)
|
|
|
|
|
|
|
|
/**
|
|
|
|
* aio_bh_new_guarded: Allocate a new bottom half structure with a
|
|
|
|
* reentrancy_guard
|
|
|
|
*
|
|
|
|
* A convenience wrapper for aio_bh_new_full() that uses the cb as the name
|
|
|
|
* string.
|
|
|
|
*/
|
|
|
|
#define aio_bh_new_guarded(ctx, cb, opaque, guard) \
|
|
|
|
aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)), guard)
|
2012-10-30 02:45:23 +04:00
|
|
|
|
2012-09-24 20:44:14 +04:00
|
|
|
/**
|
|
|
|
* aio_notify: Force processing of pending events.
|
|
|
|
*
|
|
|
|
* Similar to signaling a condition variable, aio_notify forces
|
2016-12-01 07:30:40 +03:00
|
|
|
* aio_poll to exit, so that the next call will re-examine pending events.
|
|
|
|
* The caller of aio_notify will usually call aio_poll again very soon,
|
2012-09-24 20:44:14 +04:00
|
|
|
* or go through another iteration of the GLib main loop. Hence, aio_notify
|
|
|
|
* also has the side effect of recalculating the sets of file descriptors
|
|
|
|
* that the main loop waits for.
|
|
|
|
*
|
|
|
|
* Calling aio_notify is rarely necessary, because for example scheduling
|
|
|
|
* a bottom half calls it already.
|
|
|
|
*/
|
|
|
|
void aio_notify(AioContext *ctx);
|
|
|
|
|
2015-07-21 17:07:53 +03:00
|
|
|
/**
|
|
|
|
* aio_notify_accept: Acknowledge receiving an aio_notify.
|
|
|
|
*
|
|
|
|
* aio_notify() uses an EventNotifier in order to wake up a sleeping
|
|
|
|
* aio_poll() or g_main_context_iteration(). Calls to aio_notify() are
|
|
|
|
* usually rare, but the AioContext has to clear the EventNotifier on
|
|
|
|
* every aio_poll() or g_main_context_iteration() in order to avoid
|
|
|
|
* busy waiting. This event_notifier_test_and_clear() cannot be done
|
|
|
|
* using the usual aio_context_set_event_notifier(), because it must
|
|
|
|
* be done before processing all events (file descriptors, bottom halves,
|
|
|
|
* timers).
|
|
|
|
*
|
|
|
|
* aio_notify_accept() is an optimized event_notifier_test_and_clear()
|
|
|
|
* that is specific to an AioContext's notifier; it is used internally
|
|
|
|
* to clear the EventNotifier only if aio_notify() had been called.
|
|
|
|
*/
|
|
|
|
void aio_notify_accept(AioContext *ctx);
|
|
|
|
|
2015-09-17 19:24:50 +03:00
|
|
|
/**
|
|
|
|
* aio_bh_call: Executes callback function of the specified BH.
|
|
|
|
*/
|
|
|
|
void aio_bh_call(QEMUBH *bh);
|
|
|
|
|
2012-10-30 02:45:23 +04:00
|
|
|
/**
|
|
|
|
* aio_bh_poll: Poll bottom halves for an AioContext.
|
|
|
|
*
|
|
|
|
* These are internal functions used by the QEMU main loop.
|
2013-07-16 08:28:58 +04:00
|
|
|
* And notice that multiple occurrences of aio_bh_poll cannot
|
|
|
|
* be called concurrently
|
2012-10-30 02:45:23 +04:00
|
|
|
*/
|
|
|
|
int aio_bh_poll(AioContext *ctx);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* qemu_bh_schedule: Schedule a bottom half.
|
|
|
|
*
|
|
|
|
* Scheduling a bottom half interrupts the main loop and causes the
|
|
|
|
* execution of the callback that was passed to qemu_bh_new.
|
|
|
|
*
|
|
|
|
* Bottom halves that are scheduled from a bottom half handler are instantly
|
|
|
|
* invoked. This can create an infinite loop if a bottom half handler
|
|
|
|
* schedules itself.
|
|
|
|
*
|
|
|
|
* @bh: The bottom half to be scheduled.
|
|
|
|
*/
|
|
|
|
void qemu_bh_schedule(QEMUBH *bh);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* qemu_bh_cancel: Cancel execution of a bottom half.
|
|
|
|
*
|
|
|
|
* Canceling execution of a bottom half undoes the effect of calls to
|
|
|
|
* qemu_bh_schedule without freeing its resources yet. While cancellation
|
|
|
|
* itself is also wait-free and thread-safe, it can of course race with the
|
|
|
|
* loop that executes bottom halves unless you are holding the iothread
|
|
|
|
* mutex. This makes it mostly useless if you are not holding the mutex.
|
|
|
|
*
|
|
|
|
* @bh: The bottom half to be canceled.
|
|
|
|
*/
|
|
|
|
void qemu_bh_cancel(QEMUBH *bh);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* qemu_bh_delete: Cancel execution of a bottom half and free its resources.
|
|
|
|
*
|
|
|
|
* Deleting a bottom half frees the memory that was allocated for it by
|
|
|
|
* qemu_bh_new. It also implies canceling the bottom half if it was
|
|
|
|
* scheduled.
|
2013-07-16 08:28:58 +04:00
|
|
|
* This function is asynchronous. The bottom half will do the delete action at the final
|
|
|
|
* end.
|
2012-10-30 02:45:23 +04:00
|
|
|
*
|
|
|
|
* @bh: The bottom half to be deleted.
|
|
|
|
*/
|
|
|
|
void qemu_bh_delete(QEMUBH *bh);
|
|
|
|
|
2012-09-24 16:57:22 +04:00
|
|
|
/* Return whether there are any pending callbacks from the GSource
|
2014-07-09 13:53:08 +04:00
|
|
|
* attached to the AioContext, before g_poll is invoked.
|
|
|
|
*
|
|
|
|
* This is used internally in the implementation of the GSource.
|
|
|
|
*/
|
|
|
|
bool aio_prepare(AioContext *ctx);
|
|
|
|
|
|
|
|
/* Return whether there are any pending callbacks from the GSource
|
|
|
|
* attached to the AioContext, after g_poll is invoked.
|
2012-09-24 16:57:22 +04:00
|
|
|
*
|
|
|
|
* This is used internally in the implementation of the GSource.
|
|
|
|
*/
|
|
|
|
bool aio_pending(AioContext *ctx);
|
|
|
|
|
2014-07-09 13:53:05 +04:00
|
|
|
/* Dispatch any pending callbacks from the GSource attached to the AioContext.
|
|
|
|
*
|
|
|
|
* This is used internally in the implementation of the GSource.
|
|
|
|
*/
|
2017-02-13 16:52:33 +03:00
|
|
|
void aio_dispatch(AioContext *ctx);
|
2014-07-09 13:53:05 +04:00
|
|
|
|
2012-09-24 16:37:53 +04:00
|
|
|
/* Progress in completing AIO work to occur. This can issue new pending
|
|
|
|
* aio as a result of executing I/O completion or bh callbacks.
|
2012-04-12 16:00:55 +04:00
|
|
|
*
|
AioContext: do not rely on aio_poll(ctx, true) result to end a loop
Currently, whenever aio_poll(ctx, true) has completed all pending
work it returns true *and* the next call to aio_poll(ctx, true)
will not block.
This invariant has its roots in qemu_aio_flush()'s implementation
as "while (qemu_aio_wait()) {}". However, qemu_aio_flush() does
not exist anymore and bdrv_drain_all() is implemented differently;
and this invariant is complicated to maintain and subtly different
from the return value of GMainLoop's g_main_context_iteration.
All calls to aio_poll(ctx, true) except one are guarded by a
while() loop checking for a request to be incomplete, or a
BlockDriverState to be idle. The one remaining call (in
iothread.c) uses this to delay the aio_context_release/acquire
pair until the AioContext is quiescent, however:
- we can do the same just by using non-blocking aio_poll,
similar to how vl.c invokes main_loop_wait
- it is buggy, because it does not ensure that the AioContext
is released between an aio_notify and the next time the
iothread goes to sleep. This leads to hangs when stopping
the dataplane thread.
In the end, these semantics are a bad match for the current
users of AioContext. So modify that one exception in iothread.c,
which also fixes the hangs, as well as the testcase so that
it use the same idiom as the actual QEMU code.
Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-07-09 12:49:46 +04:00
|
|
|
* Return whether any progress was made by executing AIO or bottom half
|
|
|
|
* handlers. If @blocking == true, this should always be true except
|
|
|
|
* if someone called aio_notify.
|
2012-09-24 16:37:53 +04:00
|
|
|
*
|
|
|
|
* If there are no pending bottom halves, but there are pending AIO
|
|
|
|
* operations, it may not be possible to make any progress without
|
|
|
|
* blocking. If @blocking is true, this function will wait until one
|
|
|
|
* or more AIO events have completed, to ensure something has moved
|
|
|
|
* before returning.
|
|
|
|
*/
|
2023-09-08 10:54:58 +03:00
|
|
|
bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking);
|
2008-09-22 23:17:18 +04:00
|
|
|
|
|
|
|
/* Register a file descriptor and associated callbacks. Behaves very similarly
|
2015-06-04 09:45:19 +03:00
|
|
|
* to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will
|
2014-07-07 17:18:02 +04:00
|
|
|
* be invoked when using aio_poll().
|
2008-09-22 23:17:18 +04:00
|
|
|
*
|
|
|
|
* Code that invokes AIO completion functions should rely on this function
|
|
|
|
* instead of qemu_set_fd_handler[2].
|
|
|
|
*/
|
2012-09-13 14:28:51 +04:00
|
|
|
void aio_set_fd_handler(AioContext *ctx,
|
|
|
|
int fd,
|
|
|
|
IOHandler *io_read,
|
|
|
|
IOHandler *io_write,
|
2016-12-01 22:26:41 +03:00
|
|
|
AioPollFn *io_poll,
|
aio-posix: split poll check from ready handler
Adaptive polling measures the execution time of the polling check plus
handlers called when a polled event becomes ready. Handlers can take a
significant amount of time, making it look like polling was running for
a long time when in fact the event handler was running for a long time.
For example, on Linux the io_submit(2) syscall invoked when a virtio-blk
device's virtqueue becomes ready can take 10s of microseconds. This
can exceed the default polling interval (32 microseconds) and cause
adaptive polling to stop polling.
By excluding the handler's execution time from the polling check we make
the adaptive polling calculation more accurate. As a result, the event
loop now stays in polling mode where previously it would have fallen
back to file descriptor monitoring.
The following data was collected with virtio-blk num-queues=2
event_idx=off using an IOThread. Before:
168k IOPS, IOThread syscalls:
9837.115 ( 0.020 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, nr: 16, iocbpp: 0x7fcb9f937db0) = 16
9837.158 ( 0.002 ms): IO iothread1/620155 write(fd: 103, buf: 0x556a2ef71b88, count: 8) = 8
9837.161 ( 0.001 ms): IO iothread1/620155 write(fd: 104, buf: 0x556a2ef71b88, count: 8) = 8
9837.163 ( 0.001 ms): IO iothread1/620155 ppoll(ufds: 0x7fcb90002800, nfds: 4, tsp: 0x7fcb9f1342d0, sigsetsize: 8) = 3
9837.164 ( 0.001 ms): IO iothread1/620155 read(fd: 107, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.174 ( 0.001 ms): IO iothread1/620155 read(fd: 105, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.176 ( 0.001 ms): IO iothread1/620155 read(fd: 106, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.209 ( 0.035 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, nr: 32, iocbpp: 0x7fca7d0cebe0) = 32
174k IOPS (+3.6%), IOThread syscalls:
9809.566 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, nr: 32, iocbpp: 0x7fd0cdd62be0) = 32
9809.625 ( 0.001 ms): IO iothread1/623061 write(fd: 103, buf: 0x5647cfba5f58, count: 8) = 8
9809.627 ( 0.002 ms): IO iothread1/623061 write(fd: 104, buf: 0x5647cfba5f58, count: 8) = 8
9809.663 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, nr: 32, iocbpp: 0x7fd0d0388b50) = 32
Notice that ppoll(2) and eventfd read(2) syscalls are eliminated because
the IOThread stays in polling mode instead of falling back to file
descriptor monitoring.
As usual, polling is not implemented on Windows so this patch ignores
the new io_poll_read() callback in aio-win32.c.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20211207132336.36627-2-stefanha@redhat.com
[Fixed up aio_set_event_notifier() calls in
tests/unit/test-fdmon-epoll.c added after this series was queued.
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-12-07 16:23:31 +03:00
|
|
|
IOHandler *io_poll_ready,
|
2012-09-13 14:28:51 +04:00
|
|
|
void *opaque);
|
2012-06-09 05:44:00 +04:00
|
|
|
|
|
|
|
/* Register an event notifier and associated callbacks. Behaves very similarly
|
|
|
|
* to event_notifier_set_handler. Unlike event_notifier_set_handler, these callbacks
|
2014-07-07 17:18:02 +04:00
|
|
|
* will be invoked when using aio_poll().
|
2012-06-09 05:44:00 +04:00
|
|
|
*
|
|
|
|
* Code that invokes AIO completion functions should rely on this function
|
|
|
|
* instead of event_notifier_set_handler.
|
|
|
|
*/
|
2012-09-13 14:28:51 +04:00
|
|
|
void aio_set_event_notifier(AioContext *ctx,
|
|
|
|
EventNotifier *notifier,
|
2016-12-01 22:26:41 +03:00
|
|
|
EventNotifierHandler *io_read,
|
aio-posix: split poll check from ready handler
Adaptive polling measures the execution time of the polling check plus
handlers called when a polled event becomes ready. Handlers can take a
significant amount of time, making it look like polling was running for
a long time when in fact the event handler was running for a long time.
For example, on Linux the io_submit(2) syscall invoked when a virtio-blk
device's virtqueue becomes ready can take 10s of microseconds. This
can exceed the default polling interval (32 microseconds) and cause
adaptive polling to stop polling.
By excluding the handler's execution time from the polling check we make
the adaptive polling calculation more accurate. As a result, the event
loop now stays in polling mode where previously it would have fallen
back to file descriptor monitoring.
The following data was collected with virtio-blk num-queues=2
event_idx=off using an IOThread. Before:
168k IOPS, IOThread syscalls:
9837.115 ( 0.020 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, nr: 16, iocbpp: 0x7fcb9f937db0) = 16
9837.158 ( 0.002 ms): IO iothread1/620155 write(fd: 103, buf: 0x556a2ef71b88, count: 8) = 8
9837.161 ( 0.001 ms): IO iothread1/620155 write(fd: 104, buf: 0x556a2ef71b88, count: 8) = 8
9837.163 ( 0.001 ms): IO iothread1/620155 ppoll(ufds: 0x7fcb90002800, nfds: 4, tsp: 0x7fcb9f1342d0, sigsetsize: 8) = 3
9837.164 ( 0.001 ms): IO iothread1/620155 read(fd: 107, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.174 ( 0.001 ms): IO iothread1/620155 read(fd: 105, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.176 ( 0.001 ms): IO iothread1/620155 read(fd: 106, buf: 0x7fcb9f939cc0, count: 512) = 8
9837.209 ( 0.035 ms): IO iothread1/620155 io_submit(ctx_id: 140512552468480, nr: 32, iocbpp: 0x7fca7d0cebe0) = 32
174k IOPS (+3.6%), IOThread syscalls:
9809.566 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, nr: 32, iocbpp: 0x7fd0cdd62be0) = 32
9809.625 ( 0.001 ms): IO iothread1/623061 write(fd: 103, buf: 0x5647cfba5f58, count: 8) = 8
9809.627 ( 0.002 ms): IO iothread1/623061 write(fd: 104, buf: 0x5647cfba5f58, count: 8) = 8
9809.663 ( 0.036 ms): IO iothread1/623061 io_submit(ctx_id: 140539805028352, nr: 32, iocbpp: 0x7fd0d0388b50) = 32
Notice that ppoll(2) and eventfd read(2) syscalls are eliminated because
the IOThread stays in polling mode instead of falling back to file
descriptor monitoring.
As usual, polling is not implemented on Windows so this patch ignores
the new io_poll_read() callback in aio-win32.c.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Message-id: 20211207132336.36627-2-stefanha@redhat.com
[Fixed up aio_set_event_notifier() calls in
tests/unit/test-fdmon-epoll.c added after this series was queued.
--Stefan]
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-12-07 16:23:31 +03:00
|
|
|
AioPollFn *io_poll,
|
|
|
|
EventNotifierHandler *io_poll_ready);
|
2012-09-13 14:28:51 +04:00
|
|
|
|
2016-12-01 22:26:49 +03:00
|
|
|
/* Set polling begin/end callbacks for an event notifier that has already been
|
|
|
|
* registered with aio_set_event_notifier. Do nothing if the event notifier is
|
|
|
|
* not registered.
|
|
|
|
*/
|
|
|
|
void aio_set_event_notifier_poll(AioContext *ctx,
|
|
|
|
EventNotifier *notifier,
|
|
|
|
EventNotifierHandler *io_poll_begin,
|
|
|
|
EventNotifierHandler *io_poll_end);
|
|
|
|
|
2012-09-24 16:57:41 +04:00
|
|
|
/* Return a GSource that lets the main loop poll the file descriptors attached
|
|
|
|
* to this AioContext.
|
|
|
|
*/
|
|
|
|
GSource *aio_get_g_source(AioContext *ctx);
|
|
|
|
|
2013-03-07 16:41:47 +04:00
|
|
|
/* Return the ThreadPool bound to this AioContext */
|
|
|
|
struct ThreadPool *aio_get_thread_pool(AioContext *ctx);
|
|
|
|
|
linux-aio: properly bubble up errors from initialization
laio_init() can fail for a couple of reasons, which will lead to a NULL
pointer dereference in laio_attach_aio_context().
To solve this, add a aio_setup_linux_aio() function which is called
early in raw_open_common. If this fails, propagate the error up. The
signature of aio_get_linux_aio() was not modified, because it seems
preferable to return the actual errno from the possible failing
initialization calls.
Additionally, when the AioContext changes, we need to associate a
LinuxAioState with the new AioContext. Use the bdrv_attach_aio_context
callback and call the new aio_setup_linux_aio(), which will allocate a
new AioContext if needed, and return errors on failures. If it fails for
any reason, fallback to threaded AIO with an error message, as the
device is already in-use by the guest.
Add an assert that aio_get_linux_aio() cannot return NULL.
Signed-off-by: Nishanth Aravamudan <naravamudan@digitalocean.com>
Message-id: 20180622193700.6523-1-naravamudan@digitalocean.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2018-06-22 22:37:00 +03:00
|
|
|
/* Setup the LinuxAioState bound to this AioContext */
|
|
|
|
struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
|
|
|
|
|
2016-07-04 19:33:20 +03:00
|
|
|
/* Return the LinuxAioState bound to this AioContext */
|
|
|
|
struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
|
|
|
|
|
2020-01-20 17:18:47 +03:00
|
|
|
/* Setup the LuringState bound to this AioContext */
|
|
|
|
struct LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);
|
|
|
|
|
|
|
|
/* Return the LuringState bound to this AioContext */
|
|
|
|
struct LuringState *aio_get_linux_io_uring(AioContext *ctx);
|
2013-08-21 19:02:52 +04:00
|
|
|
/**
|
2018-10-17 11:24:19 +03:00
|
|
|
* aio_timer_new_with_attrs:
|
2013-08-21 19:02:52 +04:00
|
|
|
* @ctx: the aio context
|
|
|
|
* @type: the clock type
|
|
|
|
* @scale: the scale
|
2018-10-17 11:24:19 +03:00
|
|
|
* @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
|
|
|
|
* to assign
|
2013-08-21 19:02:52 +04:00
|
|
|
* @cb: the callback to call on timer expiry
|
|
|
|
* @opaque: the opaque pointer to pass to the callback
|
|
|
|
*
|
2018-10-17 11:24:19 +03:00
|
|
|
* Allocate a new timer (with attributes) attached to the context @ctx.
|
2013-08-21 19:02:52 +04:00
|
|
|
* The function is responsible for memory allocation.
|
|
|
|
*
|
2018-10-17 11:24:19 +03:00
|
|
|
* The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
|
|
|
|
* Use that unless you really need dynamic memory allocation.
|
|
|
|
*
|
|
|
|
* Returns: a pointer to the new timer
|
|
|
|
*/
|
|
|
|
static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
|
|
|
|
QEMUClockType type,
|
|
|
|
int scale, int attributes,
|
|
|
|
QEMUTimerCB *cb, void *opaque)
|
|
|
|
{
|
|
|
|
return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* aio_timer_new:
|
|
|
|
* @ctx: the aio context
|
|
|
|
* @type: the clock type
|
|
|
|
* @scale: the scale
|
|
|
|
* @cb: the callback to call on timer expiry
|
|
|
|
* @opaque: the opaque pointer to pass to the callback
|
|
|
|
*
|
|
|
|
* Allocate a new timer attached to the context @ctx.
|
|
|
|
* See aio_timer_new_with_attrs for details.
|
2013-08-21 19:02:52 +04:00
|
|
|
*
|
|
|
|
* Returns: a pointer to the new timer
|
|
|
|
*/
|
|
|
|
static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
|
|
|
|
int scale,
|
|
|
|
QEMUTimerCB *cb, void *opaque)
|
|
|
|
{
|
2018-10-17 11:24:19 +03:00
|
|
|
return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* aio_timer_init_with_attrs:
|
|
|
|
* @ctx: the aio context
|
|
|
|
* @ts: the timer
|
|
|
|
* @type: the clock type
|
|
|
|
* @scale: the scale
|
|
|
|
* @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values
|
|
|
|
* to assign
|
|
|
|
* @cb: the callback to call on timer expiry
|
|
|
|
* @opaque: the opaque pointer to pass to the callback
|
|
|
|
*
|
|
|
|
* Initialise a new timer (with attributes) attached to the context @ctx.
|
|
|
|
* The caller is responsible for memory allocation.
|
|
|
|
*/
|
|
|
|
static inline void aio_timer_init_with_attrs(AioContext *ctx,
|
|
|
|
QEMUTimer *ts, QEMUClockType type,
|
|
|
|
int scale, int attributes,
|
|
|
|
QEMUTimerCB *cb, void *opaque)
|
|
|
|
{
|
|
|
|
timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
|
2013-08-21 19:02:52 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* aio_timer_init:
|
|
|
|
* @ctx: the aio context
|
|
|
|
* @ts: the timer
|
|
|
|
* @type: the clock type
|
|
|
|
* @scale: the scale
|
|
|
|
* @cb: the callback to call on timer expiry
|
|
|
|
* @opaque: the opaque pointer to pass to the callback
|
|
|
|
*
|
|
|
|
* Initialise a new timer attached to the context @ctx.
|
2018-10-17 11:24:19 +03:00
|
|
|
* See aio_timer_init_with_attrs for details.
|
2013-08-21 19:02:52 +04:00
|
|
|
*/
|
|
|
|
static inline void aio_timer_init(AioContext *ctx,
|
|
|
|
QEMUTimer *ts, QEMUClockType type,
|
|
|
|
int scale,
|
|
|
|
QEMUTimerCB *cb, void *opaque)
|
|
|
|
{
|
2018-10-17 11:24:19 +03:00
|
|
|
timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
|
2013-08-21 19:02:52 +04:00
|
|
|
}
|
|
|
|
|
2014-07-09 13:53:01 +04:00
|
|
|
/**
|
|
|
|
* aio_compute_timeout:
|
|
|
|
* @ctx: the aio context
|
|
|
|
*
|
|
|
|
* Compute the timeout that a blocking aio_poll should use.
|
|
|
|
*/
|
|
|
|
int64_t aio_compute_timeout(AioContext *ctx);
|
|
|
|
|
2017-02-13 16:52:19 +03:00
|
|
|
/**
|
|
|
|
* aio_co_schedule:
|
|
|
|
* @ctx: the aio context
|
|
|
|
* @co: the coroutine
|
|
|
|
*
|
|
|
|
* Start a coroutine on a remote AioContext.
|
|
|
|
*
|
|
|
|
* The coroutine must not be entered by anyone else while aio_co_schedule()
|
|
|
|
* is active. In addition the coroutine must have yielded unless ctx
|
|
|
|
* is the context in which the coroutine is running (i.e. the value of
|
|
|
|
* qemu_get_current_aio_context() from the coroutine itself).
|
|
|
|
*/
|
2022-12-21 16:14:35 +03:00
|
|
|
void aio_co_schedule(AioContext *ctx, Coroutine *co);
|
2017-02-13 16:52:19 +03:00
|
|
|
|
2020-10-05 18:58:52 +03:00
|
|
|
/**
|
|
|
|
* aio_co_reschedule_self:
|
|
|
|
* @new_ctx: the new context
|
|
|
|
*
|
|
|
|
* Move the currently running coroutine to new_ctx. If the coroutine is already
|
|
|
|
* running in new_ctx, do nothing.
|
|
|
|
*/
|
|
|
|
void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
|
|
|
|
|
2017-02-13 16:52:19 +03:00
|
|
|
/**
|
|
|
|
* aio_co_wake:
|
|
|
|
* @co: the coroutine
|
|
|
|
*
|
|
|
|
* Restart a coroutine on the AioContext where it was running last, thus
|
|
|
|
* preventing coroutines from jumping from one context to another when they
|
|
|
|
* go to sleep.
|
|
|
|
*
|
|
|
|
* aio_co_wake may be executed either in coroutine or non-coroutine
|
|
|
|
* context. The coroutine must not be entered by anyone else while
|
|
|
|
* aio_co_wake() is active.
|
|
|
|
*/
|
2022-12-21 16:14:35 +03:00
|
|
|
void aio_co_wake(Coroutine *co);
|
2017-02-13 16:52:19 +03:00
|
|
|
|
2017-04-10 15:07:35 +03:00
|
|
|
/**
|
|
|
|
* aio_co_enter:
|
|
|
|
* @ctx: the context to run the coroutine
|
|
|
|
* @co: the coroutine to run
|
|
|
|
*
|
|
|
|
* Enter a coroutine in the specified AioContext.
|
|
|
|
*/
|
2022-12-21 16:14:35 +03:00
|
|
|
void aio_co_enter(AioContext *ctx, Coroutine *co);
|
2017-04-10 15:07:35 +03:00
|
|
|
|
2016-10-27 13:48:59 +03:00
|
|
|
/**
|
|
|
|
* Return the AioContext whose event loop runs in the current thread.
|
|
|
|
*
|
|
|
|
* If called from an IOThread this will be the IOThread's AioContext. If
|
async: the main AioContext is only "current" if under the BQL
If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work. In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread. aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().
The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different. The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread(). It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.
Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication. The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop. Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().
With this change, the stub qemu_mutex_iothread_locked() must be changed
from true to false. The previous value of true was needed because the
main thread did not have an AioContext in the thread-local variable,
but now it does have one.
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210609122234.544153-1-pbonzini@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
[eblake: tweak commit message per Vladimir's review]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-06-09 15:22:34 +03:00
|
|
|
* called from the main thread or with the "big QEMU lock" taken it
|
|
|
|
* will be the main loop AioContext.
|
2016-10-27 13:48:59 +03:00
|
|
|
*/
|
|
|
|
AioContext *qemu_get_current_aio_context(void);
|
|
|
|
|
async: the main AioContext is only "current" if under the BQL
If we want to wake up a coroutine from a worker thread, aio_co_wake()
currently does not work. In that scenario, aio_co_wake() calls
aio_co_enter(), but there is no current AioContext and therefore
qemu_get_current_aio_context() returns the main thread. aio_co_wake()
then attempts to call aio_context_acquire() instead of going through
aio_co_schedule().
The default case of qemu_get_current_aio_context() was added to cover
synchronous I/O started from the vCPU thread, but the main and vCPU
threads are quite different. The main thread is an I/O thread itself,
only running a more complicated event loop; the vCPU thread instead
is essentially a worker thread that occasionally calls
qemu_mutex_lock_iothread(). It is only in those critical sections
that it acts as if it were the home thread of the main AioContext.
Therefore, this patch detaches qemu_get_current_aio_context() from
iothreads, which is a useless complication. The AioContext pointer
is stored directly in the thread-local variable, including for the
main loop. Worker threads (including vCPU threads) optionally behave
as temporary home threads if they have taken the big QEMU lock,
but if that is not the case they will always schedule coroutines
on remote threads via aio_co_schedule().
With this change, the stub qemu_mutex_iothread_locked() must be changed
from true to false. The previous value of true was needed because the
main thread did not have an AioContext in the thread-local variable,
but now it does have one.
Reported-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210609122234.544153-1-pbonzini@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Tested-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
[eblake: tweak commit message per Vladimir's review]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-06-09 15:22:34 +03:00
|
|
|
void qemu_set_current_aio_context(AioContext *ctx);
|
|
|
|
|
2015-10-30 07:06:28 +03:00
|
|
|
/**
|
|
|
|
* aio_context_setup:
|
|
|
|
* @ctx: the aio context
|
|
|
|
*
|
|
|
|
* Initialize the aio context.
|
|
|
|
*/
|
2016-07-15 13:28:44 +03:00
|
|
|
void aio_context_setup(AioContext *ctx);
|
2015-10-30 07:06:28 +03:00
|
|
|
|
2018-05-17 03:42:43 +03:00
|
|
|
/**
|
|
|
|
* aio_context_destroy:
|
|
|
|
* @ctx: the aio context
|
|
|
|
*
|
|
|
|
* Destroy the aio context.
|
|
|
|
*/
|
|
|
|
void aio_context_destroy(AioContext *ctx);
|
|
|
|
|
2020-05-11 21:36:30 +03:00
|
|
|
/* Used internally, do not call outside AioContext code */
|
|
|
|
void aio_context_use_g_source(AioContext *ctx);
|
|
|
|
|
2016-12-01 22:26:42 +03:00
|
|
|
/**
|
|
|
|
* aio_context_set_poll_params:
|
|
|
|
* @ctx: the aio context
|
|
|
|
* @max_ns: how long to busy poll for, in nanoseconds
|
2016-12-01 22:26:51 +03:00
|
|
|
* @grow: polling time growth factor
|
|
|
|
* @shrink: polling time shrink factor
|
2016-12-01 22:26:42 +03:00
|
|
|
*
|
|
|
|
* Poll mode can be disabled by setting poll_max_ns to 0.
|
|
|
|
*/
|
|
|
|
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
|
2016-12-01 22:26:51 +03:00
|
|
|
int64_t grow, int64_t shrink,
|
2016-12-01 22:26:42 +03:00
|
|
|
Error **errp);
|
|
|
|
|
2021-07-21 12:42:10 +03:00
|
|
|
/**
|
|
|
|
* aio_context_set_aio_params:
|
|
|
|
* @ctx: the aio context
|
|
|
|
* @max_batch: maximum number of requests in a batch, 0 means that the
|
|
|
|
* engine will use its default
|
|
|
|
*/
|
|
|
|
void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
|
|
|
|
Error **errp);
|
|
|
|
|
2022-04-25 10:57:23 +03:00
|
|
|
/**
|
|
|
|
* aio_context_set_thread_pool_params:
|
|
|
|
* @ctx: the aio context
|
|
|
|
* @min: min number of threads to have readily available in the thread pool
|
|
|
|
 * @max: max number of threads the thread pool can contain
|
|
|
|
*/
|
|
|
|
void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
|
|
|
|
int64_t max, Error **errp);
|
2008-09-22 23:17:18 +04:00
|
|
|
#endif
|