/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/aio.h"
#include "qemu/queue.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/event_notifier.h"
#include "qemu/coroutine.h"
#include "qemu/defer-call.h"
#include "qapi/error.h"
#include "sysemu/block-backend.h"

/* Only used for assertions. */
#include "qemu/coroutine_int.h"

#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest. If we get more outstanding requests at a time
 * than this we will get EAGAIN from io_submit which is communicated to
 * the guest as an I/O error.
 */
#define MAX_EVENTS 1024

/* Maximum number of requests in a batch. (default value) */
#define DEFAULT_MAX_BATCH 32
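
/*
 * How the two limits above are used below: MAX_EVENTS caps both the
 * io_setup() queue depth in laio_init() and the number of requests kept
 * in flight by ioq_submit(), while DEFAULT_MAX_BATCH is only the fallback
 * batch size used by laio_max_batch() when the AioContext does not set
 * aio_max_batch.
 */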

struct qemu_laiocb {
    Coroutine *co;
    LinuxAioState *ctx;
    struct iocb iocb;
    ssize_t ret;
    size_t nbytes;
    QEMUIOVector *qiov;
    bool is_read;
    QSIMPLEQ_ENTRY(qemu_laiocb) next;
};

typedef struct {
    unsigned int in_queue;
    unsigned int in_flight;
    bool blocked;
    QSIMPLEQ_HEAD(, qemu_laiocb) pending;
} LaioQueue;

struct LinuxAioState {
    AioContext *aio_context;

    io_context_t ctx;
    EventNotifier e;

    /* No locking required, only accessed from AioContext home thread */
    LaioQueue io_q;
    QEMUBH *completion_bh;
    int event_idx;
    int event_max;
};

static void ioq_submit(LinuxAioState *s);

static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}
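
/*
 * Note: the kernel reports each request's result in the io_event res/res2
 * fields; io_event_ret() above folds them back into a single signed value,
 * i.e. the byte count on success or a negative errno on failure, which
 * qemu_laio_process_completion() compares against the request size.
 */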

/*
 * Completes an AIO request.
 */
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
{
    int ret;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset(laiocb->qiov, ret, 0,
                    laiocb->qiov->size - ret);
            } else {
                ret = -ENOSPC;
            }
        }
    }

    laiocb->ret = ret;

    /*
     * If the coroutine is already entered it must be in ioq_submit() and
     * will notice laio->ret has been filled in when it eventually runs
     * later. Coroutines cannot be entered recursively so avoid doing
     * that!
     */
    assert(laiocb->co->ctx == laiocb->ctx->aio_context);
    if (!qemu_coroutine_entered(laiocb->co)) {
        aio_co_wake(laiocb->co);
    }
}

/**
 * aio_ring buffer which is shared between userspace and kernel.
 *
 * This is copied from linux/fs/aio.c; a common header does not exist,
 * but AIO has existed for ages so we assume the ABI is stable.
 */
struct aio_ring {
    unsigned    id;     /* kernel internal index number */
    unsigned    nr;     /* number of io_events */
    unsigned    head;   /* Written to by userland or by kernel. */
    unsigned    tail;

    unsigned    magic;
    unsigned    compat_features;
    unsigned    incompat_features;
    unsigned    header_length;  /* size of aio_ring */

    struct io_event io_events[];
};

/**
 * io_getevents_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 *
 * Returns the number of completed events and sets a pointer
 * on events array. This function does not update the internal
 * ring buffer, only reads head and tail. When @events has been
 * processed io_getevents_commit() must be called.
 */
static inline unsigned int io_getevents_peek(io_context_t ctx,
                                             struct io_event **events)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;
    unsigned int head = ring->head, tail = ring->tail;
    unsigned int nr;

    nr = tail >= head ? tail - head : ring->nr - head;
    *events = ring->io_events + head;
    /* To avoid speculative loads of s->events[i] before observing tail.
       Paired with smp_wmb() inside linux/fs/aio.c: aio_complete(). */
    smp_rmb();

    return nr;
}

/**
 * io_getevents_commit:
 * @ctx: AIO context
 * @nr: the number of events on which head should be advanced
 *
 * Advances head of a ring buffer.
 */
static inline void io_getevents_commit(io_context_t ctx, unsigned int nr)
{
    struct aio_ring *ring = (struct aio_ring *)ctx;

    if (nr) {
        ring->head = (ring->head + nr) % ring->nr;
    }
}

/**
 * io_getevents_advance_and_peek:
 * @ctx: AIO context
 * @events: pointer on events array, output value
 * @nr: the number of events on which head should be advanced
 *
 * Advances head of a ring buffer and returns number of elements left.
 */
static inline unsigned int
io_getevents_advance_and_peek(io_context_t ctx,
                              struct io_event **events,
                              unsigned int nr)
{
    io_getevents_commit(ctx, nr);
    return io_getevents_peek(ctx, events);
}
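
/*
 * A minimal sketch of how the ring helpers above fit together (peek,
 * process, then commit):
 *
 *     struct io_event *ev;
 *     unsigned int n = io_getevents_peek(ctx, &ev);
 *     ... handle ev[0] .. ev[n - 1] ...
 *     io_getevents_commit(ctx, n);
 *
 * qemu_laio_process_completions() below fuses the commit with the next peek
 * via io_getevents_advance_and_peek() so that a nested event loop can resume
 * from the indices saved in LinuxAioState.
 */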

/**
 * qemu_laio_process_completions:
 * @s: AIO state
 *
 * Fetches completed I/O requests and invokes their callbacks.
 *
 * The function is somewhat tricky because it supports nested event loops, for
 * example when a request callback invokes aio_poll(). In order to do this,
 * indices are kept in LinuxAioState. The function schedules BH completion so
 * it can be called again in a nested event loop. When there are no events
 * left to complete, the BH is canceled.
 */
static void qemu_laio_process_completions(LinuxAioState *s)
{
    struct io_event *events;

    defer_call_begin();

    /* Reschedule so nested event loops see currently pending completions */
    qemu_bh_schedule(s->completion_bh);

    while ((s->event_max = io_getevents_advance_and_peek(s->ctx, &events,
                                                         s->event_idx))) {
        for (s->event_idx = 0; s->event_idx < s->event_max; ) {
            struct iocb *iocb = events[s->event_idx].obj;
            struct qemu_laiocb *laiocb =
                container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[s->event_idx]);

            /* Change counters one-by-one because we can be nested. */
            s->io_q.in_flight--;
            s->event_idx++;
            qemu_laio_process_completion(laiocb);
        }
    }

    qemu_bh_cancel(s->completion_bh);

    /* If we are nested we have to notify the level above that we are done
     * by setting event_max to zero; the upper level will then jump out of its
     * own `for` loop. If we are the last, all counters have dropped to zero. */
    s->event_max = 0;
    s->event_idx = 0;

    defer_call_end();
}

static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
{
    qemu_laio_process_completions(s);

    if (!QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

static void qemu_laio_completion_bh(void *opaque)
{
    LinuxAioState *s = opaque;

    qemu_laio_process_completions_and_submit(s);
}

static void qemu_laio_completion_cb(EventNotifier *e)
{
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    if (event_notifier_test_and_clear(&s->e)) {
        qemu_laio_process_completions_and_submit(s);
    }
}

static bool qemu_laio_poll_cb(void *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);
    struct io_event *events;

    return io_getevents_peek(s->ctx, &events);
}

static void qemu_laio_poll_ready(EventNotifier *opaque)
{
    EventNotifier *e = opaque;
    LinuxAioState *s = container_of(e, LinuxAioState, e);

    qemu_laio_process_completions_and_submit(s);
}
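
/*
 * Note on the split above: qemu_laio_poll_cb() is the side-effect-free poll
 * check (it only peeks at the completion ring), while qemu_laio_poll_ready()
 * does the actual completion processing. Both are registered together with
 * the event notifier in laio_attach_aio_context() below.
 */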

static void ioq_init(LaioQueue *io_q)
{
    QSIMPLEQ_INIT(&io_q->pending);
    io_q->in_queue = 0;
    io_q->in_flight = 0;
    io_q->blocked = false;
}

static void ioq_submit(LinuxAioState *s)
{
    int ret, len;
    struct qemu_laiocb *aiocb;
    struct iocb *iocbs[MAX_EVENTS];
    QSIMPLEQ_HEAD(, qemu_laiocb) completed;

    do {
        if (s->io_q.in_flight >= MAX_EVENTS) {
            break;
        }
        len = 0;
        QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
            iocbs[len++] = &aiocb->iocb;
            if (s->io_q.in_flight + len >= MAX_EVENTS) {
                break;
            }
        }

        ret = io_submit(s->ctx, len, iocbs);
        if (ret == -EAGAIN) {
            break;
        }
        if (ret < 0) {
            /* Fail the first request, retry the rest */
            aiocb = QSIMPLEQ_FIRST(&s->io_q.pending);
            QSIMPLEQ_REMOVE_HEAD(&s->io_q.pending, next);
            s->io_q.in_queue--;
            aiocb->ret = ret;
            qemu_laio_process_completion(aiocb);
            continue;
        }

        s->io_q.in_flight += ret;
        s->io_q.in_queue -= ret;
        aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
        QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
    } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
    s->io_q.blocked = (s->io_q.in_queue > 0);

    if (s->io_q.in_flight) {
        /* We can try to complete something just right away if there are
         * still requests in-flight. */
        qemu_laio_process_completions(s);
        /*
         * Even if we have completed everything (in_flight == 0), the queue
         * can still have pending requests (in_queue > 0). We do not attempt
         * to repeat submission to avoid an IO hang. The reason is simple:
         * s->e is still set and the completion callback will be called
         * shortly, and all pending requests will be submitted from there.
         */
    }
}

static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
{
    uint64_t max_batch = s->aio_context->aio_max_batch ?: DEFAULT_MAX_BATCH;

    /*
     * AIO context can be shared between multiple block devices, so
     * `dev_max_batch` allows reducing the batch size for latency-sensitive
     * devices.
     */
    max_batch = MIN_NON_ZERO(dev_max_batch, max_batch);

    /* limit the batch with the number of available events */
    max_batch = MIN_NON_ZERO(MAX_EVENTS - s->io_q.in_flight, max_batch);

    return max_batch;
}
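
/*
 * Worked example: with aio_max_batch unset (0) and dev_max_batch == 0 the
 * batch size is DEFAULT_MAX_BATCH (32); if only 10 of the MAX_EVENTS slots
 * are still free, the result is further clamped to 10.
 */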

static void laio_deferred_fn(void *opaque)
{
    LinuxAioState *s = opaque;

    if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
        ioq_submit(s);
    }
}

static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
                          int type, uint64_t dev_max_batch)
{
    LinuxAioState *s = laiocb->ctx;
    struct iocb *iocbs = &laiocb->iocb;
    QEMUIOVector *qiov = laiocb->qiov;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_ZONE_APPEND:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_FLUSH:
        io_prep_fdsync(iocbs, fd);
        break;
    /* Currently Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                        __func__, type);
        return -EIO;
    }
    io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));

    QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
    s->io_q.in_queue++;
    if (!s->io_q.blocked) {
        if (s->io_q.in_queue >= laio_max_batch(s, dev_max_batch)) {
            ioq_submit(s);
        } else {
            defer_call(laio_deferred_fn, s);
        }
    }

    return 0;
}
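
/*
 * Note on the tail of laio_do_submit(): a prepared request is only appended
 * to io_q.pending here. It reaches the kernel either right away, once the
 * batch threshold from laio_max_batch() is hit, or later from
 * laio_deferred_fn() via defer_call(), so that requests queued close together
 * can share a single io_submit() call.
 */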

int coroutine_fn laio_co_submit(int fd, uint64_t offset, QEMUIOVector *qiov,
                                int type, uint64_t dev_max_batch)
{
    int ret;
    AioContext *ctx = qemu_get_current_aio_context();
    struct qemu_laiocb laiocb = {
        .co         = qemu_coroutine_self(),
        .nbytes     = qiov ? qiov->size : 0,
        .ctx        = aio_get_linux_aio(ctx),
        .ret        = -EINPROGRESS,
        .is_read    = (type == QEMU_AIO_READ),
        .qiov       = qiov,
    };

    ret = laio_do_submit(fd, &laiocb, offset, type, dev_max_batch);
    if (ret < 0) {
        return ret;
    }

    if (laiocb.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return laiocb.ret;
}
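
/*
 * A minimal usage sketch (hypothetical caller, from coroutine context):
 *
 *     ret = laio_co_submit(fd, offset, &qiov, QEMU_AIO_READ, 0);
 *
 * The coroutine yields until qemu_laio_process_completion() fills in
 * laiocb.ret and wakes it, unless the request already completed during
 * submission (ioq_submit() may harvest completions immediately).
 */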

void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
{
    aio_set_event_notifier(old_context, &s->e, NULL, NULL, NULL);
    qemu_bh_delete(s->completion_bh);
    s->aio_context = NULL;
}

void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
{
    s->aio_context = new_context;
    s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
    aio_set_event_notifier(new_context, &s->e,
                           qemu_laio_completion_cb,
                           qemu_laio_poll_cb,
                           qemu_laio_poll_ready);
}

LinuxAioState *laio_init(Error **errp)
{
    int rc;
    LinuxAioState *s;

    s = g_malloc0(sizeof(*s));
    rc = event_notifier_init(&s->e, false);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to initialize event notifier");
        goto out_free_state;
    }

    rc = io_setup(MAX_EVENTS, &s->ctx);
    if (rc < 0) {
        error_setg_errno(errp, -rc, "failed to create linux AIO context");
        goto out_close_efd;
    }

    ioq_init(&s->io_q);

    return s;

out_close_efd:
    event_notifier_cleanup(&s->e);
out_free_state:
    g_free(s);
    return NULL;
}

void laio_cleanup(LinuxAioState *s)
{
    event_notifier_cleanup(&s->e);

    if (io_destroy(s->ctx) != 0) {
        fprintf(stderr, "%s: destroy AIO context %p failed\n",
                        __func__, &s->ctx);
    }
    g_free(s);
}

bool laio_has_fdsync(int fd)
{
    struct iocb cb;
    struct iocb *cbs[] = {&cb, NULL};

    io_context_t ctx = 0;
    io_setup(1, &ctx);

    /* check if host kernel supports IO_CMD_FDSYNC */
    io_prep_fdsync(&cb, fd);
    int ret = io_submit(ctx, 1, cbs);

    io_destroy(ctx);
    return (ret == -EINVAL) ? false : true;
}
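
/*
 * Note: this probe assumes io_submit() fails with -EINVAL when the running
 * kernel does not recognize the FDSYNC command prepared by io_prep_fdsync();
 * any other outcome (including success) is treated as support, so callers can
 * route QEMU_AIO_FLUSH requests through laio_co_submit().
 */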