nbd: fix the co_queue multi-adding bug
When we tested VM migration between different hosts with NBD devices, we found that sending a cancel command right after drive_mirror had started caused a coroutine re-enter error. The stack was as follows:

(gdb) bt
00) 0x00007fdfc744d885 in raise () from /lib64/libc.so.6
01) 0x00007fdfc744ee61 in abort () from /lib64/libc.so.6
02) 0x00007fdfca467cc5 in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0) at qemu-coroutine.c:118
03) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedb400) at qemu-coroutine-lock.c:59
04) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8, to=0x7fdfcaedb400) at qemu-coroutine.c:96
05) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedb400, opaque=0x0) at qemu-coroutine.c:123
06) 0x00007fdfca467f6c in qemu_co_queue_run_restart (co=0x7fdfcaedbdc0) at qemu-coroutine-lock.c:59
07) 0x00007fdfca467be5 in coroutine_swap (from=0x7fdfcaf3c4e8, to=0x7fdfcaedbdc0) at qemu-coroutine.c:96
08) 0x00007fdfca467cea in qemu_coroutine_enter (co=0x7fdfcaedbdc0, opaque=0x0) at qemu-coroutine.c:123
09) 0x00007fdfca4a1fa4 in nbd_recv_coroutines_enter_all (s=0x7fdfcaef7dd0) at block/nbd-client.c:41
10) 0x00007fdfca4a1ff9 in nbd_teardown_connection (client=0x7fdfcaef7dd0) at block/nbd-client.c:50
11) 0x00007fdfca4a20f0 in nbd_reply_ready (opaque=0x7fdfcaef7dd0) at block/nbd-client.c:92
12) 0x00007fdfca45ed80 in aio_dispatch (ctx=0x7fdfcae15e90) at aio-posix.c:144
13) 0x00007fdfca45ef1b in aio_poll (ctx=0x7fdfcae15e90, blocking=false) at aio-posix.c:222
14) 0x00007fdfca448c34 in aio_ctx_dispatch (source=0x7fdfcae15e90, callback=0x0, user_data=0x0) at async.c:212
15) 0x00007fdfc8f2f69a in g_main_context_dispatch () from /usr/lib64/libglib-2.0.so.0
16) 0x00007fdfca45c391 in glib_pollfds_poll () at main-loop.c:190
17) 0x00007fdfca45c489 in os_host_main_loop_wait (timeout=1483677098) at main-loop.c:235
18) 0x00007fdfca45c57b in main_loop_wait (nonblocking=0) at main-loop.c:484
19) 0x00007fdfca25f403 in main_loop () at vl.c:2249
20) 0x00007fdfca266fc2 in main (argc=42, argv=0x7ffff517d638, envp=0x7ffff517d790) at vl.c:4814

We found that nbd_recv_coroutines_enter_all() (triggered by a cancel command or by the network connection breaking down) can enter a coroutine that is still waiting for the sending lock. If the lock is still held by another coroutine, the entered coroutine is added to the co_queue again; later, when the lock is released, a coroutine re-enter error occurs.

The bug can be fixed simply by delaying the setting of recv_coroutine until the send lock is held, as suggested by Paolo. With this patch applied, we ran the cancel operation during the mirror phase in a loop for more than 5 hours without problems; without it, the coroutine re-enter error occurs within 5 minutes.

Signed-off-by: Bin Wu <wu.wubin@huawei.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Message-id: 1423552846-3896-1-git-send-email-wu.wubin@huawei.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
parent 81705ee485
commit 141cabe6f1
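
For context on the failure path before reading the diff: nbd_recv_coroutines_enter_all() in block/nbd-client.c (frame 09 of the backtrace) wakes every coroutine registered in s->recv_coroutine[]. The sketch below is an approximate reconstruction of that helper as it existed at the time, based on the names in the backtrace and in the hunks; it is not the verbatim source. Because the old code registered a coroutine in recv_coroutine[] before acquiring send_mutex, this loop could enter a coroutine that was still parked on the lock's co_queue, and the later unlock would then try to enter it a second time.

/* Approximate sketch, not verbatim: wake every in-flight request coroutine
 * so it can notice that the connection is being torn down. */
static void nbd_recv_coroutines_enter_all(NbdClientSession *s)
{
    int i;

    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i]) {
            /* Entering a coroutine that is still queued on send_mutex is
             * what leads to the "coroutine re-enter" abort shown above. */
            qemu_coroutine_enter(s->recv_coroutine[i], NULL);
        }
    }
}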
@@ -108,11 +108,22 @@ static int nbd_co_send_request(BlockDriverState *bs,
 {
     NbdClientSession *s = nbd_get_client_session(bs);
     AioContext *aio_context;
-    int rc, ret;
+    int rc, ret, i;
 
     qemu_co_mutex_lock(&s->send_mutex);
+
+    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+        if (s->recv_coroutine[i] == NULL) {
+            s->recv_coroutine[i] = qemu_coroutine_self();
+            break;
+        }
+    }
+
+    assert(i < MAX_NBD_REQUESTS);
+    request->handle = INDEX_TO_HANDLE(s, i);
     s->send_coroutine = qemu_coroutine_self();
     aio_context = bdrv_get_aio_context(bs);
 
     aio_set_fd_handler(aio_context, s->sock,
             nbd_reply_ready, nbd_restart_write, bs);
     if (qiov) {
@@ -168,8 +179,6 @@ static void nbd_co_receive_reply(NbdClientSession *s,
 static void nbd_coroutine_start(NbdClientSession *s,
     struct nbd_request *request)
 {
-    int i;
-
     /* Poor man semaphore. The free_sema is locked when no other request
      * can be accepted, and unlocked after receiving one reply. */
     if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
@@ -178,15 +187,7 @@ static void nbd_coroutine_start(NbdClientSession *s,
     }
     s->in_flight++;
 
-    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-        if (s->recv_coroutine[i] == NULL) {
-            s->recv_coroutine[i] = qemu_coroutine_self();
-            break;
-        }
-    }
-
-    assert(i < MAX_NBD_REQUESTS);
-    request->handle = INDEX_TO_HANDLE(s, i);
+    /* s->recv_coroutine[i] is set as soon as we get the send_lock. */
 }
 
 static void nbd_coroutine_end(NbdClientSession *s,
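
Read together, the hunks leave the opening of nbd_co_send_request() looking roughly like this after the patch. This is a reconstruction from the hunks above, not the full function: the parameter list beyond the first line is an assumption based on the tree of that era, and the actual send path and error handling are elided.

static int nbd_co_send_request(BlockDriverState *bs,
                               struct nbd_request *request,
                               QEMUIOVector *qiov, int offset)
{
    NbdClientSession *s = nbd_get_client_session(bs);
    AioContext *aio_context;
    int rc, ret, i;

    qemu_co_mutex_lock(&s->send_mutex);

    /* The coroutine becomes visible in recv_coroutine[] only once it already
     * holds send_mutex, so nbd_recv_coroutines_enter_all() can no longer
     * enter a coroutine that is still waiting for the lock. */
    for (i = 0; i < MAX_NBD_REQUESTS; i++) {
        if (s->recv_coroutine[i] == NULL) {
            s->recv_coroutine[i] = qemu_coroutine_self();
            break;
        }
    }

    assert(i < MAX_NBD_REQUESTS);
    request->handle = INDEX_TO_HANDLE(s, i);
    s->send_coroutine = qemu_coroutine_self();
    aio_context = bdrv_get_aio_context(bs);

    aio_set_fd_handler(aio_context, s->sock,
            nbd_reply_ready, nbd_restart_write, bs);
    /* ... the request header (and the qiov payload, if any) is then written
     * to s->sock; the remainder of the function is omitted here ... */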