From 816a430c517eae48da5a31207ca43151df3203b0 Mon Sep 17 00:00:00 2001
From: Chao Gao
Date: Sun, 10 Jul 2022 20:08:49 +0800
Subject: [PATCH 1/3] util/aio: Defer disabling poll mode as long as possible

When we measure FIO read performance (cache=writethrough, bs=4k,
iodepth=64) in VMs, ~80K/s notifications (e.g., EPT_MISCONFIG) are
observed from guest to qemu.

It turns out those frequent notifications are caused by interference
from worker threads. Worker threads queue bottom halves after completing
IO requests. Pending bottom halves may cause aio_compute_timeout() to
return a zero timeout, which is then passed to try_poll_mode(), or may
cause run_poll_handlers() to return no progress after noticing pending
aio_notify() events. Both cases cause poll_set_started(false) to be
called to disable poll mode. However, in both cases the timeout is
already zero, so the event loop (i.e., aio_poll()) just processes bottom
halves and then starts the next event loop iteration. Disabling poll
mode therefore has no value and only leads to unnecessary notifications
from the guest.

To minimize unnecessary notifications from the guest, defer disabling
poll mode until the event loop is about to block.

With this patch applied, FIO seq-read performance (bs=4k, iodepth=64,
cache=writethrough) in VMs increases from 330K/s to 413K/s IOPS.

Suggested-by: Stefan Hajnoczi
Signed-off-by: Chao Gao
Message-id: 20220710120849.63086-1-chao.gao@intel.com
Signed-off-by: Stefan Hajnoczi
---
 util/aio-posix.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/util/aio-posix.c b/util/aio-posix.c
index 731f3826c0..6cc6256d53 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -585,18 +585,16 @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
     max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
 
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
+        /*
+         * Enable poll mode. It pairs with the poll_set_started() in
+         * aio_poll(), which disables poll mode.
+         */
         poll_set_started(ctx, ready_list, true);
 
         if (run_poll_handlers(ctx, ready_list, max_ns, timeout)) {
             return true;
         }
     }
-
-    if (poll_set_started(ctx, ready_list, false)) {
-        *timeout = 0;
-        return true;
-    }
-
     return false;
 }
 
@@ -657,6 +655,17 @@ bool aio_poll(AioContext *ctx, bool blocking)
      * system call---a single round of run_poll_handlers_once suffices.
      */
     if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
+        /*
+         * Disable poll mode. Poll mode must be disabled before the call
+         * to ctx->fdmon_ops->wait() so that a guest notification can wake
+         * up the IO thread when work becomes pending; this is essential
+         * to avoid hangs and unnecessary latency.
+         */
+        if (poll_set_started(ctx, &ready_list, false)) {
+            timeout = 0;
+            progress = true;
+        }
+
         ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
     }
 
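For orientation, the resulting aio_poll() control flow looks roughly like the sketch below. This is a simplification, not the actual util/aio-posix.c code: the helper names are real, but ready-list setup, polling-time adjustment, and bottom-half dispatch are collapsed into aio_dispatch_ready_handlers() here.

/*
 * Sketch of aio_poll() after this patch (simplified; see
 * util/aio-posix.c for the real implementation).
 */
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
    int64_t timeout = blocking ? aio_compute_timeout(ctx) : 0;

    /* May enable poll mode, but never disables it on a zero timeout */
    bool progress = try_poll_mode(ctx, &ready_list, &timeout);

    if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
        /*
         * Poll mode is disabled only here, right before the event loop
         * may block in ctx->fdmon_ops->wait(). A worker thread's pending
         * bottom half no longer forces poll mode off.
         */
        if (poll_set_started(ctx, &ready_list, false)) {
            timeout = 0;
            progress = true;
        }

        ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
    }

    /* Dispatch the handlers (and bottom halves) that became ready */
    progress |= aio_dispatch_ready_handlers(ctx, &ready_list);

    return progress;
}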
From a937f8e8577babc32b24d4f518cb336c013cd14f Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi
Date: Wed, 2 Nov 2022 14:23:37 -0400
Subject: [PATCH 2/3] virtio-blk: simplify virtio_blk_dma_restart_cb()

virtio_blk_dma_restart_cb() is tricky because the BH must deal with
virtio_blk_data_plane_start()/virtio_blk_data_plane_stop() being called.

There are two issues with the code:

1. virtio_blk_realize() should use qdev_add_vm_change_state_handler()
   instead of qemu_add_vm_change_state_handler(). This ensures that the
   ordering with virtio_init()'s vm change state handler, which calls
   virtio_blk_data_plane_start()/virtio_blk_data_plane_stop(), is
   well-defined. Then blk's AioContext is guaranteed to be up-to-date in
   virtio_blk_dma_restart_cb(), and it's no longer necessary to have a
   special case for virtio_blk_data_plane_start().

2. Only blk_drain() waits for virtio_blk_dma_restart_cb()'s
   blk_inc_in_flight() to be decremented. The bdrv_drain() family of
   functions does not wait for BlockBackend's in_flight counter to reach
   zero. virtio_blk_data_plane_stop() relies on blk_set_aio_context()'s
   implicit drain, but that's a bdrv_drain() and not a blk_drain(). Note
   that virtio_blk_reset() already correctly relies on blk_drain(). If
   virtio_blk_data_plane_stop() switches to blk_drain(), then we can
   properly wait for pending virtio_blk_dma_restart_bh() calls.

Once these issues are taken care of, the code becomes simpler. This
change is in preparation for multiple IOThreads in virtio-blk, where we
need to clean up the multi-threading behavior.

I ran the reproducer from commit 49b44549ace7 ("virtio-blk: On restart,
process queued requests in the proper context") to check that there is
no regression.

Cc: Sergio Lopez
Cc: Kevin Wolf
Cc: Emanuele Giuseppe Esposito
Signed-off-by: Stefan Hajnoczi
Acked-by: Michael S. Tsirkin
Reviewed-by: Emanuele Giuseppe Esposito
Message-id: 20221102182337.252202-1-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 hw/block/dataplane/virtio-blk.c | 17 +++++------
 hw/block/virtio-blk.c           | 46 ++++++++++++++-------------
 include/hw/virtio/virtio-blk.h  |  2 --
 3 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 26f965cabc..b28d81737e 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -237,9 +237,6 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
         goto fail_aio_context;
     }
 
-    /* Process queued requests before the ones in vring */
-    virtio_blk_process_queued_requests(vblk, false);
-
     /* Kick right away to begin processing requests already in vring */
     for (i = 0; i < nvqs; i++) {
         VirtQueue *vq = virtio_get_queue(s->vdev, i);
@@ -272,11 +269,6 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
   fail_host_notifiers:
     k->set_guest_notifiers(qbus->parent, nvqs, false);
   fail_guest_notifiers:
-    /*
-     * If we failed to set up the guest notifiers queued requests will be
-     * processed on the main context.
-     */
-    virtio_blk_process_queued_requests(vblk, false);
     vblk->dataplane_disabled = true;
     s->starting = false;
     vblk->dataplane_started = true;
@@ -325,8 +317,13 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
     aio_context_acquire(s->ctx);
     aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
 
-    /* Drain and try to switch bs back to the QEMU main loop. If other users
-     * keep the BlockBackend in the iothread, that's ok */
+    /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
+    blk_drain(s->conf->conf.blk);
+
+    /*
+     * Try to switch bs back to the QEMU main loop. If other users keep the
+     * BlockBackend in the iothread, that's ok
+     */
     blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context(), NULL);
 
     aio_context_release(s->ctx);
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index f717550fdc..1762517878 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -806,8 +806,10 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
     virtio_blk_handle_vq(s, vq);
 }
 
-void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh)
+static void virtio_blk_dma_restart_bh(void *opaque)
 {
+    VirtIOBlock *s = opaque;
+
     VirtIOBlockReq *req = s->rq;
     MultiReqBuffer mrb = {};
 
@@ -834,43 +836,27 @@ void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh)
     if (mrb.num_reqs) {
         virtio_blk_submit_multireq(s, &mrb);
     }
-    if (is_bh) {
-        blk_dec_in_flight(s->conf.conf.blk);
-    }
+
+    /* Paired with inc in virtio_blk_dma_restart_cb() */
+    blk_dec_in_flight(s->conf.conf.blk);
+
     aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
-static void virtio_blk_dma_restart_bh(void *opaque)
-{
-    VirtIOBlock *s = opaque;
-
-    qemu_bh_delete(s->bh);
-    s->bh = NULL;
-
-    virtio_blk_process_queued_requests(s, true);
-}
-
 static void virtio_blk_dma_restart_cb(void *opaque, bool running,
                                       RunState state)
 {
     VirtIOBlock *s = opaque;
-    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
-    VirtioBusState *bus = VIRTIO_BUS(qbus);
 
     if (!running) {
        return;
    }
 
-    /*
-     * If ioeventfd is enabled, don't schedule the BH here as queued
-     * requests will be processed while starting the data plane.
-     */
-    if (!s->bh && !virtio_bus_ioeventfd_enabled(bus)) {
-        s->bh = aio_bh_new(blk_get_aio_context(s->conf.conf.blk),
-                           virtio_blk_dma_restart_bh, s);
-        blk_inc_in_flight(s->conf.conf.blk);
-        qemu_bh_schedule(s->bh);
-    }
+    /* Paired with dec in virtio_blk_dma_restart_bh() */
+    blk_inc_in_flight(s->conf.conf.blk);
+
+    aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.conf.blk),
+                            virtio_blk_dma_restart_bh, s);
 }
 
 static void virtio_blk_reset(VirtIODevice *vdev)
@@ -1213,7 +1199,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
-    s->change = qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
+    /*
+     * This must be after virtio_init() so virtio_blk_dma_restart_cb() gets
+     * called after ->start_ioeventfd() has already set blk's AioContext.
+     */
+    s->change =
+        qdev_add_vm_change_state_handler(dev, virtio_blk_dma_restart_cb, s);
+
     blk_ram_registrar_init(&s->blk_ram_registrar, s->blk);
     blk_set_dev_ops(s->blk, &virtio_block_ops, s);
 
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 7f589b4146..dafec432ce 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -55,7 +55,6 @@ struct VirtIOBlock {
     VirtIODevice parent_obj;
     BlockBackend *blk;
     void *rq;
-    QEMUBH *bh;
     VirtIOBlkConf conf;
     unsigned short sector_mask;
     bool original_wce;
@@ -93,6 +92,5 @@ typedef struct MultiReqBuffer {
 } MultiReqBuffer;
 
 void virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq);
-void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh);
 
 #endif
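The heart of the new arrangement is the in-flight counter paired across the vm change state callback and a one-shot BH: the callback increments it before scheduling the BH, the BH decrements it when done, and the blk_drain() added to virtio_blk_data_plane_stop() therefore waits for the restart work. Condensed into a standalone sketch (hypothetical names restart_cb()/restart_bh(); the real functions are virtio_blk_dma_restart_cb() and virtio_blk_dma_restart_bh() above):

#include "qemu/osdep.h"
#include "block/aio.h"
#include "sysemu/block-backend.h"
#include "sysemu/runstate.h"
#include "hw/virtio/virtio-blk.h"

static void restart_bh(void *opaque)
{
    VirtIOBlock *s = opaque;

    /* ... resubmit the requests queued in s->rq ... */

    /* Paired with the inc in restart_cb(); lets blk_drain() finish */
    blk_dec_in_flight(s->conf.conf.blk);
}

static void restart_cb(void *opaque, bool running, RunState state)
{
    VirtIOBlock *s = opaque;

    if (!running) {
        return;
    }

    /* Keep blk busy so any blk_drain() waits for restart_bh() to run */
    blk_inc_in_flight(s->conf.conf.blk);

    /* A one-shot BH needs no QEMUBH field to track or delete later */
    aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.conf.blk),
                            restart_bh, s);
}

Because aio_bh_schedule_oneshot() owns the BH's lifetime, the s->bh field and the delete-inside-the-BH dance can go away, which is exactly what the hunks above remove.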
From 4f01a9bb0461e8c11ee0c94d90a504cb7d580a85 Mon Sep 17 00:00:00 2001
From: Peter Krempa
Date: Mon, 23 Jan 2023 13:39:27 +0100
Subject: [PATCH 3/3] block/blkio: Fix inclusion of required headers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the recent header file inclusion rework, the build fails when the
blkio module is enabled:

../block/blkio.c: In function ‘blkio_detach_aio_context’:
../block/blkio.c:321:24: error: implicit declaration of function ‘bdrv_get_aio_context’; did you mean ‘qemu_get_aio_context’? [-Werror=implicit-function-declaration]
  321 |     aio_set_fd_handler(bdrv_get_aio_context(bs),
      |                        ^~~~~~~~~~~~~~~~~~~~
      |                        qemu_get_aio_context
../block/blkio.c:321:24: error: nested extern declaration of ‘bdrv_get_aio_context’ [-Werror=nested-externs]
../block/blkio.c:321:24: error: passing argument 1 of ‘aio_set_fd_handler’ makes pointer from integer without a cast [-Werror=int-conversion]
  321 |     aio_set_fd_handler(bdrv_get_aio_context(bs),
      |                        ^~~~~~~~~~~~~~~~~~~~~~~~
      |                        |
      |                        int
In file included from /home/pipo/git/qemu.git/include/qemu/job.h:33,
                 from /home/pipo/git/qemu.git/include/block/blockjob.h:30,
                 from /home/pipo/git/qemu.git/include/block/block_int-global-state.h:28,
                 from /home/pipo/git/qemu.git/include/block/block_int.h:27,
                 from ../block/blkio.c:13:
/home/pipo/git/qemu.git/include/block/aio.h:476:37: note: expected ‘AioContext *’ but argument is of type ‘int’
  476 | void aio_set_fd_handler(AioContext *ctx,
      |                         ~~~~~~~~~~~~^~~
../block/blkio.c: In function ‘blkio_file_open’:
../block/blkio.c:821:34: error: passing argument 2 of ‘blkio_attach_aio_context’ makes pointer from integer without a cast [-Werror=int-conversion]
  821 |     blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
      |                                  ^~~~~~~~~~~~~~~~~~~~~~~~
      |                                  |
      |                                  int

Fix it by including 'block/block-io.h', which contains the required
declarations.

Fixes: e2c1c34f139f49ef909bb4322607fb8b39002312
Signed-off-by: Peter Krempa
Reviewed-by: Markus Armbruster
Message-id: 2bc956011404a1ab03342aefde0087b5b4762562.1674477350.git.pkrempa@redhat.com
Signed-off-by: Stefan Hajnoczi
---
 block/blkio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blkio.c b/block/blkio.c
index 5eae3adfaf..6ad86b23d1 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -19,6 +19,8 @@
 #include "qemu/module.h"
 #include "exec/memory.h" /* for ram_block_discard_disable() */
 
+#include "block/block-io.h"
+
 /*
  * Keep the QEMU BlockDriver names identical to the libblkio driver names.
  * Using macros instead of typing out the string literals avoids typos.
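For reference, after this patch the affected include block at the top of block/blkio.c reads roughly as follows. This is a sketch assembled from the context lines above; the leading qemu/osdep.h include and the elided neighbours are assumptions, since the diff shows only two of the surrounding lines.

#include "qemu/osdep.h"     /* assumed: first include in every QEMU C file */
/* ... other includes elided ... */
#include "qemu/module.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */

#include "block/block-io.h" /* declares bdrv_get_aio_context() */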