block/nvme: support nested aio_poll()

QEMU block drivers are supposed to support aio_poll() from I/O
completion callback functions. This means completion processing must be
re-entrant.

The standard approach is to schedule a BH during completion processing
and cancel it at the end of processing. If aio_poll() is invoked by a
callback function then the BH will run. The BH continues the suspended
completion processing.

All of this means that request A's cb() can synchronously wait for
request B to complete. Previously the nvme block driver would hang
because it didn't process completions from nested aio_poll().

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Sergio Lopez <slp@redhat.com>
Message-id: 20200617132201.1832152-8-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2020-06-17 14:22:01 +01:00
parent b75fd5f554
commit 7838c67f22
2 changed files with 58 additions and 7 deletions

View File

@ -74,9 +74,11 @@ typedef struct {
int cq_phase; int cq_phase;
int free_req_head; int free_req_head;
NVMeRequest reqs[NVME_NUM_REQS]; NVMeRequest reqs[NVME_NUM_REQS];
bool busy;
int need_kick; int need_kick;
int inflight; int inflight;
/* Thread-safe, no lock necessary */
QEMUBH *completion_bh;
} NVMeQueuePair; } NVMeQueuePair;
/* Memory mapped registers */ /* Memory mapped registers */
@ -140,6 +142,8 @@ struct BDRVNVMeState {
#define NVME_BLOCK_OPT_DEVICE "device" #define NVME_BLOCK_OPT_DEVICE "device"
#define NVME_BLOCK_OPT_NAMESPACE "namespace" #define NVME_BLOCK_OPT_NAMESPACE "namespace"
static void nvme_process_completion_bh(void *opaque);
static QemuOptsList runtime_opts = { static QemuOptsList runtime_opts = {
.name = "nvme", .name = "nvme",
.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
@ -181,6 +185,9 @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
static void nvme_free_queue_pair(NVMeQueuePair *q) static void nvme_free_queue_pair(NVMeQueuePair *q)
{ {
if (q->completion_bh) {
qemu_bh_delete(q->completion_bh);
}
qemu_vfree(q->prp_list_pages); qemu_vfree(q->prp_list_pages);
qemu_vfree(q->sq.queue); qemu_vfree(q->sq.queue);
qemu_vfree(q->cq.queue); qemu_vfree(q->cq.queue);
@ -214,6 +221,8 @@ static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
q->index = idx; q->index = idx;
qemu_co_queue_init(&q->free_req_queue); qemu_co_queue_init(&q->free_req_queue);
q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS); q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
nvme_process_completion_bh, q);
r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
s->page_size * NVME_NUM_REQS, s->page_size * NVME_NUM_REQS,
false, &prp_list_iova); false, &prp_list_iova);
@ -352,11 +361,21 @@ static bool nvme_process_completion(NVMeQueuePair *q)
NvmeCqe *c; NvmeCqe *c;
trace_nvme_process_completion(s, q->index, q->inflight); trace_nvme_process_completion(s, q->index, q->inflight);
if (q->busy || s->plugged) { if (s->plugged) {
trace_nvme_process_completion_queue_busy(s, q->index); trace_nvme_process_completion_queue_plugged(s, q->index);
return false; return false;
} }
q->busy = true;
/*
* Support re-entrancy when a request cb() function invokes aio_poll().
* Pending completions must be visible to aio_poll() so that a cb()
* function can wait for the completion of another request.
*
* The aio_poll() loop will execute our BH and we'll resume completion
* processing there.
*/
qemu_bh_schedule(q->completion_bh);
assert(q->inflight >= 0); assert(q->inflight >= 0);
while (q->inflight) { while (q->inflight) {
int ret; int ret;
@ -384,10 +403,10 @@ static bool nvme_process_completion(NVMeQueuePair *q)
assert(req.cb); assert(req.cb);
nvme_put_free_req_locked(q, preq); nvme_put_free_req_locked(q, preq);
preq->cb = preq->opaque = NULL; preq->cb = preq->opaque = NULL;
q->inflight--;
qemu_mutex_unlock(&q->lock); qemu_mutex_unlock(&q->lock);
req.cb(req.opaque, ret); req.cb(req.opaque, ret);
qemu_mutex_lock(&q->lock); qemu_mutex_lock(&q->lock);
q->inflight--;
progress = true; progress = true;
} }
if (progress) { if (progress) {
@ -396,10 +415,28 @@ static bool nvme_process_completion(NVMeQueuePair *q)
*q->cq.doorbell = cpu_to_le32(q->cq.head); *q->cq.doorbell = cpu_to_le32(q->cq.head);
nvme_wake_free_req_locked(q); nvme_wake_free_req_locked(q);
} }
q->busy = false;
qemu_bh_cancel(q->completion_bh);
return progress; return progress;
} }
static void nvme_process_completion_bh(void *opaque)
{
NVMeQueuePair *q = opaque;
/*
* We're being invoked because a nvme_process_completion() cb() function
* called aio_poll(). The callback may be waiting for further completions
* so notify the device that it has space to fill in more completions now.
*/
smp_mb_release();
*q->cq.doorbell = cpu_to_le32(q->cq.head);
nvme_wake_free_req_locked(q);
nvme_process_completion(q);
}
static void nvme_trace_command(const NvmeCmd *cmd) static void nvme_trace_command(const NvmeCmd *cmd)
{ {
int i; int i;
@ -1309,6 +1346,13 @@ static void nvme_detach_aio_context(BlockDriverState *bs)
{ {
BDRVNVMeState *s = bs->opaque; BDRVNVMeState *s = bs->opaque;
for (int i = 0; i < s->nr_queues; i++) {
NVMeQueuePair *q = s->queues[i];
qemu_bh_delete(q->completion_bh);
q->completion_bh = NULL;
}
aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
false, NULL, NULL); false, NULL, NULL);
} }
@ -1321,6 +1365,13 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
s->aio_context = new_context; s->aio_context = new_context;
aio_set_event_notifier(new_context, &s->irq_notifier, aio_set_event_notifier(new_context, &s->irq_notifier,
false, nvme_handle_event, nvme_poll_cb); false, nvme_handle_event, nvme_poll_cb);
for (int i = 0; i < s->nr_queues; i++) {
NVMeQueuePair *q = s->queues[i];
q->completion_bh =
aio_bh_new(new_context, nvme_process_completion_bh, q);
}
} }
static void nvme_aio_plug(BlockDriverState *bs) static void nvme_aio_plug(BlockDriverState *bs)

View File

@ -158,7 +158,7 @@ nvme_kick(void *s, int queue) "s %p queue %d"
nvme_dma_flush_queue_wait(void *s) "s %p" nvme_dma_flush_queue_wait(void *s) "s %p"
nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d" nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"