2008-07-03 17:41:03 +04:00
|
|
|
/*
|
|
|
|
* QEMU Block driver for NBD
|
|
|
|
*
|
2019-10-09 11:41:57 +03:00
|
|
|
* Copyright (c) 2019 Virtuozzo International GmbH.
|
2019-06-11 13:27:19 +03:00
|
|
|
* Copyright (C) 2016 Red Hat, Inc.
|
2008-07-03 17:41:03 +04:00
|
|
|
* Copyright (C) 2008 Bull S.A.S.
|
2008-07-08 22:57:05 +04:00
|
|
|
* Author: Laurent Vivier <Laurent.Vivier@bull.net>
|
2008-07-03 17:41:03 +04:00
|
|
|
*
|
|
|
|
* Some parts:
|
|
|
|
* Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
|
* of this software and associated documentation files (the "Software"), to deal
|
|
|
|
* in the Software without restriction, including without limitation the rights
|
|
|
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
|
|
* copies of the Software, and to permit persons to whom the Software is
|
|
|
|
* furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice shall be included in
|
|
|
|
* all copies or substantial portions of the Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
|
|
* THE SOFTWARE.
|
|
|
|
*/
|
|
|
|
|
2016-01-18 21:01:42 +03:00
|
|
|
#include "qemu/osdep.h"
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
#include "trace.h"
|
2012-12-17 21:20:00 +04:00
|
|
|
#include "qemu/uri.h"
|
2018-02-01 14:18:46 +03:00
|
|
|
#include "qemu/option.h"
|
2019-06-11 13:27:19 +03:00
|
|
|
#include "qemu/cutils.h"
|
Include qemu/main-loop.h less
In my "build everything" tree, changing qemu/main-loop.h triggers a
recompile of some 5600 out of 6600 objects (not counting tests and
objects that don't depend on qemu/osdep.h). It includes block/aio.h,
which in turn includes qemu/event_notifier.h, qemu/notify.h,
qemu/processor.h, qemu/qsp.h, qemu/queue.h, qemu/thread-posix.h,
qemu/thread.h, qemu/timer.h, and a few more.
Include qemu/main-loop.h only where it's needed. Touching it now
recompiles only some 1700 objects. For block/aio.h and
qemu/event_notifier.h, these numbers drop from 5600 to 2800. For the
others, they shrink only slightly.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20190812052359.30071-21-armbru@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
2019-08-12 08:23:50 +03:00
|
|
|
#include "qemu/main-loop.h"
|
2020-12-28 18:08:44 +03:00
|
|
|
#include "qemu/atomic.h"
|
2019-06-11 13:27:19 +03:00
|
|
|
|
2018-02-11 12:36:01 +03:00
|
|
|
#include "qapi/qapi-visit-sockets.h"
|
2014-07-18 22:24:59 +04:00
|
|
|
#include "qapi/qmp/qstring.h"
|
block/nbd: use non-blocking connect: fix vm hang on connect()
This makes nbd's connection_co yield during reconnects, so that
reconnect doesn't block the main thread. This is very important in
case of an unavailable nbd server host: connect() call may take a long
time, blocking the main thread (and due to reconnect, it will hang
again and again with small gaps of working time during pauses between
connection attempts).
Realization notes:
- We don't want to implement non-blocking connect() over non-blocking
socket, because getaddrinfo() doesn't have portable non-blocking
realization anyway, so let's just use a thread for both getaddrinfo()
and connect().
- We can't use qio_channel_socket_connect_async (which behaves
similarly and starts a thread to execute connect() call), as it's relying
on someone iterating main loop (g_main_loop_run() or something like
this), which is not always the case.
- We can't use thread_pool_submit_co API, as thread pool waits for all
threads to finish (but we don't want to wait for blocking reconnect
attempt on shutdown.
So, we just create the thread by hand. Some additional difficulties
are:
- We want our connect to avoid blocking drained sections and aio context
switches. To achieve this, we make it possible to "cancel" synchronous
wait for the connect (which is a coroutine yield actually), still,
the thread continues in background, and if successful, its result may be
reused on next reconnect attempt.
- We don't want to wait for reconnect on shutdown, so there is
CONNECT_THREAD_RUNNING_DETACHED thread state, which means that the block
layer is no longer interested in a result, and thread should close new
connected socket on finish and free the state.
How to reproduce the bug, fixed with this commit:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. Start vm with second nbd disk on node2, like this:
./x86_64-softmmu/qemu-system-x86_64 -nodefaults -drive \
file=/work/images/cent7.qcow2 -drive file=nbd+tcp://192.168.100.2 \
-vnc :0 -qmp stdio -m 2G -enable-kvm -vga std
4. Access the vm through vnc (or some other way?), and check that NBD
drive works:
dd if=/dev/sdb of=/dev/null bs=1M count=10
- the command should succeed.
5. Now, let's trigger nbd-reconnect loop in Qemu process. For this:
5.1 Kill NBD server on node1
5.2 run "dd if=/dev/sdb of=/dev/null bs=1M count=10" in the guest
again. The command should fail and a lot of error messages about
failing disk may appear as well.
Now NBD client driver in Qemu tries to reconnect.
Still, VM works well.
6. Make node1 unavailable on NBD port, so connect() from node2 will
last for a long time:
On node1 (Note, that 10809 is just a default NBD port):
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
After some time the guest hangs, and you may check in gdb that Qemu
hangs in connect() call, issued from the main thread. This is the
BUG.
7. Don't forget to drop iptables rule from your node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200812145237.4396-1-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: minor wording and formatting tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-08-12 17:52:37 +03:00
|
|
|
#include "qapi/clone-visitor.h"
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
#include "block/qdict.h"
|
|
|
|
#include "block/nbd.h"
|
|
|
|
#include "block/block_int.h"
|
2021-06-10 13:08:00 +03:00
|
|
|
#include "block/coroutines.h"
|
2008-07-03 17:41:03 +04:00
|
|
|
|
2020-12-28 18:08:44 +03:00
|
|
|
#include "qemu/yank.h"
|
|
|
|
|
2010-08-26 00:48:33 +04:00
|
|
|
#define EN_OPTSTR ":exportname="
|
2019-06-11 13:27:19 +03:00
|
|
|
#define MAX_NBD_REQUESTS 16
|
|
|
|
|
|
|
|
#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs))
|
|
|
|
#define INDEX_TO_HANDLE(bs, index) ((index) ^ (uint64_t)(intptr_t)(bs))
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
Coroutine *coroutine;
|
|
|
|
uint64_t offset; /* original offset of the request */
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
bool receiving; /* sleeping in the yield in nbd_receive_replies */
|
2021-09-02 13:38:05 +03:00
|
|
|
bool reply_possible; /* reply header not yet received */
|
2019-06-11 13:27:19 +03:00
|
|
|
} NBDClientRequest;
|
|
|
|
|
2019-06-18 14:43:22 +03:00
|
|
|
typedef enum NBDClientState {
|
2019-10-09 11:41:57 +03:00
|
|
|
NBD_CLIENT_CONNECTING_WAIT,
|
|
|
|
NBD_CLIENT_CONNECTING_NOWAIT,
|
2019-06-18 14:43:22 +03:00
|
|
|
NBD_CLIENT_CONNECTED,
|
|
|
|
NBD_CLIENT_QUIT
|
|
|
|
} NBDClientState;
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
typedef struct BDRVNBDState {
|
2021-06-10 13:07:55 +03:00
|
|
|
QIOChannel *ioc; /* The current I/O channel */
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDExportInfo info;
|
|
|
|
|
|
|
|
CoMutex send_mutex;
|
|
|
|
CoQueue free_sema;
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
|
|
|
|
CoMutex receive_mutex;
|
2019-06-11 13:27:19 +03:00
|
|
|
int in_flight;
|
2019-06-18 14:43:22 +03:00
|
|
|
NBDClientState state;
|
2019-06-11 13:27:19 +03:00
|
|
|
|
block/nbd: fix reconnect-delay
reconnect-delay has a design flaw: we handle it in the same loop where
we do connection attempt. So, reconnect-delay may be exceeded by
unpredictable time of connection attempt.
Let's instead use separate timer.
How to reproduce the bug:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. On node2 start qemu-io:
./build/qemu-io --image-opts \
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=15
4. Type 'read 0 512' in qemu-io interface to check that connection
works
Be careful: you should make steps 5-7 in a short time, less than 15
seconds.
5. Kill nbd server on node1
6. Run 'read 0 512' in qemu-io interface again, to be sure that nbd
client goes to reconnect loop.
7. On node1 run the following command
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
This will make the connect() call of qemu-io at node2 take a long time.
And you'll see that read command in qemu-io will hang for a long time,
more than 15 seconds specified by reconnect-delay parameter. It's the
bug.
8. Don't forget to drop iptables rule on node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Important note: Step [5] is necessary to reproduce _this_ bug. If we
miss step [5], the read command (step 6) will hang for a long time and
this commit doesn't help, because there will be not long connect() to
unreachable host, but long sendmsg() to unreachable host, which should
be fixed by enabling and adjusting keep-alive on the socket, which is a
thing for further patch set.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200903190301.367620-4-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-09-03 22:03:00 +03:00
|
|
|
QEMUTimer *reconnect_delay_timer;
|
2021-09-06 22:06:48 +03:00
|
|
|
QEMUTimer *open_timer;
|
block/nbd: fix reconnect-delay
reconnect-delay has a design flaw: we handle it in the same loop where
we do connection attempt. So, reconnect-delay may be exceeded by
unpredictable time of connection attempt.
Let's instead use separate timer.
How to reproduce the bug:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. On node2 start qemu-io:
./build/qemu-io --image-opts \
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=15
4. Type 'read 0 512' in qemu-io interface to check that connection
works
Be careful: you should make steps 5-7 in a short time, less than 15
seconds.
5. Kill nbd server on node1
6. Run 'read 0 512' in qemu-io interface again, to be sure that nbd
client goes to reconnect loop.
7. On node1 run the following command
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
This will make the connect() call of qemu-io at node2 take a long time.
And you'll see that read command in qemu-io will hang for a long time,
more than 15 seconds specified by reconnect-delay parameter. It's the
bug.
8. Don't forget to drop iptables rule on node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Important note: Step [5] is necessary to reproduce _this_ bug. If we
miss step [5], the read command (step 6) will hang for a long time and
this commit doesn't help, because there will be not long connect() to
unreachable host, but long sendmsg() to unreachable host, which should
be fixed by enabling and adjusting keep-alive on the socket, which is a
thing for further patch set.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200903190301.367620-4-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-09-03 22:03:00 +03:00
|
|
|
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDClientRequest requests[MAX_NBD_REQUESTS];
|
|
|
|
NBDReply reply;
|
|
|
|
BlockDriverState *bs;
|
2016-08-15 16:29:26 +03:00
|
|
|
|
2019-06-18 14:43:24 +03:00
|
|
|
/* Connection parameters */
|
|
|
|
uint32_t reconnect_delay;
|
2021-09-06 22:06:48 +03:00
|
|
|
uint32_t open_timeout;
|
2017-04-26 10:36:40 +03:00
|
|
|
SocketAddress *saddr;
|
2016-10-25 16:11:34 +03:00
|
|
|
char *export, *tlscredsid;
|
2019-06-18 14:43:24 +03:00
|
|
|
QCryptoTLSCreds *tlscreds;
|
|
|
|
const char *hostname;
|
|
|
|
char *x_dirty_bitmap;
|
2020-10-27 08:05:55 +03:00
|
|
|
bool alloc_depth;
|
block/nbd: use non-blocking connect: fix vm hang on connect()
This makes nbd's connection_co yield during reconnects, so that
reconnect doesn't block the main thread. This is very important in
case of an unavailable nbd server host: connect() call may take a long
time, blocking the main thread (and due to reconnect, it will hang
again and again with small gaps of working time during pauses between
connection attempts).
Realization notes:
- We don't want to implement non-blocking connect() over non-blocking
socket, because getaddrinfo() doesn't have portable non-blocking
realization anyway, so let's just use a thread for both getaddrinfo()
and connect().
- We can't use qio_channel_socket_connect_async (which behaves
similarly and starts a thread to execute connect() call), as it's relying
on someone iterating main loop (g_main_loop_run() or something like
this), which is not always the case.
- We can't use thread_pool_submit_co API, as thread pool waits for all
threads to finish (but we don't want to wait for blocking reconnect
attempt on shutdown.
So, we just create the thread by hand. Some additional difficulties
are:
- We want our connect to avoid blocking drained sections and aio context
switches. To achieve this, we make it possible to "cancel" synchronous
wait for the connect (which is a coroutine yield actually), still,
the thread continues in background, and if successful, its result may be
reused on next reconnect attempt.
- We don't want to wait for reconnect on shutdown, so there is
CONNECT_THREAD_RUNNING_DETACHED thread state, which means that the block
layer is no longer interested in a result, and thread should close new
connected socket on finish and free the state.
How to reproduce the bug, fixed with this commit:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. Start vm with second nbd disk on node2, like this:
./x86_64-softmmu/qemu-system-x86_64 -nodefaults -drive \
file=/work/images/cent7.qcow2 -drive file=nbd+tcp://192.168.100.2 \
-vnc :0 -qmp stdio -m 2G -enable-kvm -vga std
4. Access the vm through vnc (or some other way?), and check that NBD
drive works:
dd if=/dev/sdb of=/dev/null bs=1M count=10
- the command should succeed.
5. Now, let's trigger nbd-reconnect loop in Qemu process. For this:
5.1 Kill NBD server on node1
5.2 run "dd if=/dev/sdb of=/dev/null bs=1M count=10" in the guest
again. The command should fail and a lot of error messages about
failing disk may appear as well.
Now NBD client driver in Qemu tries to reconnect.
Still, VM works well.
6. Make node1 unavailable on NBD port, so connect() from node2 will
last for a long time:
On node1 (Note, that 10809 is just a default NBD port):
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
After some time the guest hangs, and you may check in gdb that Qemu
hangs in connect() call, issued from the main thread. This is the
BUG.
7. Don't forget to drop iptables rule from your node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200812145237.4396-1-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: minor wording and formatting tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-08-12 17:52:37 +03:00
|
|
|
|
2021-06-10 13:07:44 +03:00
|
|
|
NBDClientConnection *conn;
|
2008-07-03 17:41:03 +04:00
|
|
|
} BDRVNBDState;
|
|
|
|
|
2020-12-28 18:08:44 +03:00
|
|
|
static void nbd_yank(void *opaque);
|
2019-10-09 11:41:57 +03:00
|
|
|
|
2021-06-10 13:07:33 +03:00
|
|
|
static void nbd_clear_bdrvstate(BlockDriverState *bs)
|
2019-12-05 06:45:27 +03:00
|
|
|
{
|
2021-06-10 13:07:33 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2021-06-10 13:07:37 +03:00
|
|
|
|
2021-06-10 13:07:46 +03:00
|
|
|
nbd_client_connection_release(s->conn);
|
|
|
|
s->conn = NULL;
|
2021-06-10 13:07:33 +03:00
|
|
|
|
|
|
|
yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
|
|
|
|
|
2022-02-04 14:10:08 +03:00
|
|
|
/* Must not leave timers behind that would access freed data */
|
|
|
|
assert(!s->reconnect_delay_timer);
|
|
|
|
assert(!s->open_timer);
|
|
|
|
|
2019-12-05 06:45:27 +03:00
|
|
|
object_unref(OBJECT(s->tlscreds));
|
|
|
|
qapi_free_SocketAddress(s->saddr);
|
|
|
|
s->saddr = NULL;
|
|
|
|
g_free(s->export);
|
|
|
|
s->export = NULL;
|
|
|
|
g_free(s->tlscredsid);
|
|
|
|
s->tlscredsid = NULL;
|
|
|
|
g_free(s->x_dirty_bitmap);
|
|
|
|
s->x_dirty_bitmap = NULL;
|
|
|
|
}
|
|
|
|
|
2021-06-10 13:08:01 +03:00
|
|
|
static bool nbd_client_connected(BDRVNBDState *s)
|
|
|
|
{
|
|
|
|
return qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTED;
|
|
|
|
}
|
|
|
|
|
2021-09-02 13:38:03 +03:00
|
|
|
static bool nbd_recv_coroutine_wake_one(NBDClientRequest *req)
|
|
|
|
{
|
|
|
|
if (req->receiving) {
|
|
|
|
req->receiving = false;
|
|
|
|
aio_co_wake(req->coroutine);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nbd_recv_coroutines_wake(BDRVNBDState *s, bool all)
|
2021-09-02 13:38:02 +03:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_NBD_REQUESTS; i++) {
|
2021-09-02 13:38:03 +03:00
|
|
|
if (nbd_recv_coroutine_wake_one(&s->requests[i]) && !all) {
|
|
|
|
return;
|
2021-09-02 13:38:02 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-18 14:43:22 +03:00
|
|
|
static void nbd_channel_error(BDRVNBDState *s, int ret)
|
|
|
|
{
|
2021-09-02 13:38:01 +03:00
|
|
|
if (nbd_client_connected(s)) {
|
|
|
|
qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
|
|
|
|
}
|
|
|
|
|
2019-10-09 11:41:57 +03:00
|
|
|
if (ret == -EIO) {
|
2021-06-10 13:08:01 +03:00
|
|
|
if (nbd_client_connected(s)) {
|
2019-10-09 11:41:57 +03:00
|
|
|
s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT :
|
|
|
|
NBD_CLIENT_CONNECTING_NOWAIT;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
s->state = NBD_CLIENT_QUIT;
|
|
|
|
}
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
|
|
|
|
nbd_recv_coroutines_wake(s, true);
|
2019-06-18 14:43:22 +03:00
|
|
|
}
|
|
|
|
|
block/nbd: fix reconnect-delay
reconnect-delay has a design flaw: we handle it in the same loop where
we do connection attempt. So, reconnect-delay may be exceeded by
unpredictable time of connection attempt.
Let's instead use separate timer.
How to reproduce the bug:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. On node2 start qemu-io:
./build/qemu-io --image-opts \
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=15
4. Type 'read 0 512' in qemu-io interface to check that connection
works
Be careful: you should make steps 5-7 in a short time, less than 15
seconds.
5. Kill nbd server on node1
6. Run 'read 0 512' in qemu-io interface again, to be sure that nbd
client goes to reconnect loop.
7. On node1 run the following command
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
This will make the connect() call of qemu-io at node2 take a long time.
And you'll see that read command in qemu-io will hang for a long time,
more than 15 seconds specified by reconnect-delay parameter. It's the
bug.
8. Don't forget to drop iptables rule on node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Important note: Step [5] is necessary to reproduce _this_ bug. If we
miss step [5], the read command (step 6) will hang for a long time and
this commit doesn't help, because there will be not long connect() to
unreachable host, but long sendmsg() to unreachable host, which should
be fixed by enabling and adjusting keep-alive on the socket, which is a
thing for further patch set.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200903190301.367620-4-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-09-03 22:03:00 +03:00
|
|
|
static void reconnect_delay_timer_del(BDRVNBDState *s)
|
|
|
|
{
|
|
|
|
if (s->reconnect_delay_timer) {
|
|
|
|
timer_free(s->reconnect_delay_timer);
|
|
|
|
s->reconnect_delay_timer = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void reconnect_delay_timer_cb(void *opaque)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = opaque;
|
|
|
|
|
2020-12-28 18:08:44 +03:00
|
|
|
if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) {
|
block/nbd: fix reconnect-delay
reconnect-delay has a design flaw: we handle it in the same loop where
we do connection attempt. So, reconnect-delay may be exceeded by
unpredictable time of connection attempt.
Let's instead use separate timer.
How to reproduce the bug:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. On node2 start qemu-io:
./build/qemu-io --image-opts \
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=15
4. Type 'read 0 512' in qemu-io interface to check that connection
works
Be careful: you should make steps 5-7 in a short time, less than 15
seconds.
5. Kill nbd server on node1
6. Run 'read 0 512' in qemu-io interface again, to be sure that nbd
client goes to reconnect loop.
7. On node1 run the following command
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
This will make the connect() call of qemu-io at node2 take a long time.
And you'll see that read command in qemu-io will hang for a long time,
more than 15 seconds specified by reconnect-delay parameter. It's the
bug.
8. Don't forget to drop iptables rule on node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Important note: Step [5] is necessary to reproduce _this_ bug. If we
miss step [5], the read command (step 6) will hang for a long time and
this commit doesn't help, because there will be not long connect() to
unreachable host, but long sendmsg() to unreachable host, which should
be fixed by enabling and adjusting keep-alive on the socket, which is a
thing for further patch set.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200903190301.367620-4-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-09-03 22:03:00 +03:00
|
|
|
s->state = NBD_CLIENT_CONNECTING_NOWAIT;
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
nbd_co_establish_connection_cancel(s->conn);
|
block/nbd: fix reconnect-delay
reconnect-delay has a design flaw: we handle it in the same loop where
we do connection attempt. So, reconnect-delay may be exceeded by
unpredictable time of connection attempt.
Let's instead use separate timer.
How to reproduce the bug:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. On node2 start qemu-io:
./build/qemu-io --image-opts \
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=15
4. Type 'read 0 512' in qemu-io interface to check that connection
works
Be careful: you should make steps 5-7 in a short time, less than 15
seconds.
5. Kill nbd server on node1
6. Run 'read 0 512' in qemu-io interface again, to be sure that nbd
client goes to reconnect loop.
7. On node1 run the following command
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
This will make the connect() call of qemu-io at node2 take a long time.
And you'll see that read command in qemu-io will hang for a long time,
more than 15 seconds specified by reconnect-delay parameter. It's the
bug.
8. Don't forget to drop iptables rule on node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Important note: Step [5] is necessary to reproduce _this_ bug. If we
miss step [5], the read command (step 6) will hang for a long time and
this commit doesn't help, because there will be not long connect() to
unreachable host, but long sendmsg() to unreachable host, which should
be fixed by enabling and adjusting keep-alive on the socket, which is a
thing for further patch set.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200903190301.367620-4-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-09-03 22:03:00 +03:00
|
|
|
while (qemu_co_enter_next(&s->free_sema, NULL)) {
|
|
|
|
/* Resume all queued requests */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
reconnect_delay_timer_del(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void reconnect_delay_timer_init(BDRVNBDState *s, uint64_t expire_time_ns)
|
|
|
|
{
|
2020-12-28 18:08:44 +03:00
|
|
|
if (qatomic_load_acquire(&s->state) != NBD_CLIENT_CONNECTING_WAIT) {
|
block/nbd: fix reconnect-delay
reconnect-delay has a design flaw: we handle it in the same loop where
we do connection attempt. So, reconnect-delay may be exceeded by
unpredictable time of connection attempt.
Let's instead use separate timer.
How to reproduce the bug:
1. Create an image on node1:
qemu-img create -f qcow2 xx 100M
2. Start NBD server on node1:
qemu-nbd xx
3. On node2 start qemu-io:
./build/qemu-io --image-opts \
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=15
4. Type 'read 0 512' in qemu-io interface to check that connection
works
Be careful: you should make steps 5-7 in a short time, less than 15
seconds.
5. Kill nbd server on node1
6. Run 'read 0 512' in qemu-io interface again, to be sure that nbd
client goes to reconnect loop.
7. On node1 run the following command
sudo iptables -A INPUT -p tcp --dport 10809 -j DROP
This will make the connect() call of qemu-io at node2 take a long time.
And you'll see that read command in qemu-io will hang for a long time,
more than 15 seconds specified by reconnect-delay parameter. It's the
bug.
8. Don't forget to drop iptables rule on node1:
sudo iptables -D INPUT -p tcp --dport 10809 -j DROP
Important note: Step [5] is necessary to reproduce _this_ bug. If we
miss step [5], the read command (step 6) will hang for a long time and
this commit doesn't help, because there will be not long connect() to
unreachable host, but long sendmsg() to unreachable host, which should
be fixed by enabling and adjusting keep-alive on the socket, which is a
thing for further patch set.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20200903190301.367620-4-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2020-09-03 22:03:00 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(!s->reconnect_delay_timer);
|
|
|
|
s->reconnect_delay_timer = aio_timer_new(bdrv_get_aio_context(s->bs),
|
|
|
|
QEMU_CLOCK_REALTIME,
|
|
|
|
SCALE_NS,
|
|
|
|
reconnect_delay_timer_cb, s);
|
|
|
|
timer_mod(s->reconnect_delay_timer, expire_time_ns);
|
|
|
|
}
|
|
|
|
|
2019-10-09 11:41:57 +03:00
|
|
|
static void nbd_teardown_connection(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
assert(!s->in_flight);
|
|
|
|
|
2020-07-27 21:47:49 +03:00
|
|
|
if (s->ioc) {
|
2019-10-09 11:41:57 +03:00
|
|
|
qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name),
|
|
|
|
nbd_yank, s->bs);
|
|
|
|
object_unref(OBJECT(s->ioc));
|
|
|
|
s->ioc = NULL;
|
2019-10-09 11:41:57 +03:00
|
|
|
}
|
2020-07-27 21:47:49 +03:00
|
|
|
|
2019-10-09 11:41:57 +03:00
|
|
|
s->state = NBD_CLIENT_QUIT;
|
|
|
|
}
|
2019-06-11 13:27:19 +03:00
|
|
|
|
2021-09-06 22:06:48 +03:00
|
|
|
static void open_timer_del(BDRVNBDState *s)
|
|
|
|
{
|
|
|
|
if (s->open_timer) {
|
|
|
|
timer_free(s->open_timer);
|
|
|
|
s->open_timer = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void open_timer_cb(void *opaque)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = opaque;
|
|
|
|
|
|
|
|
nbd_co_establish_connection_cancel(s->conn);
|
|
|
|
open_timer_del(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void open_timer_init(BDRVNBDState *s, uint64_t expire_time_ns)
|
|
|
|
{
|
|
|
|
assert(!s->open_timer);
|
|
|
|
s->open_timer = aio_timer_new(bdrv_get_aio_context(s->bs),
|
|
|
|
QEMU_CLOCK_REALTIME,
|
|
|
|
SCALE_NS,
|
|
|
|
open_timer_cb, s);
|
|
|
|
timer_mod(s->open_timer, expire_time_ns);
|
|
|
|
}
|
|
|
|
|
2019-10-09 11:41:57 +03:00
|
|
|
static bool nbd_client_connecting(BDRVNBDState *s)
|
|
|
|
{
|
2020-12-28 18:08:44 +03:00
|
|
|
NBDClientState state = qatomic_load_acquire(&s->state);
|
|
|
|
return state == NBD_CLIENT_CONNECTING_WAIT ||
|
|
|
|
state == NBD_CLIENT_CONNECTING_NOWAIT;
|
2019-10-09 11:41:57 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool nbd_client_connecting_wait(BDRVNBDState *s)
|
|
|
|
{
|
2020-12-28 18:08:44 +03:00
|
|
|
return qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT;
|
2019-10-09 11:41:57 +03:00
|
|
|
}
|
|
|
|
|
2021-06-10 13:07:52 +03:00
|
|
|
/*
|
|
|
|
* Update @bs with information learned during a completed negotiation process.
|
|
|
|
* Return failure if the server's advertised options are incompatible with the
|
|
|
|
* client's needs.
|
|
|
|
*/
|
|
|
|
static int nbd_handle_updated_info(BlockDriverState *bs, Error **errp)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (s->x_dirty_bitmap) {
|
|
|
|
if (!s->info.base_allocation) {
|
|
|
|
error_setg(errp, "requested x-dirty-bitmap %s not found",
|
|
|
|
s->x_dirty_bitmap);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (strcmp(s->x_dirty_bitmap, "qemu:allocation-depth") == 0) {
|
|
|
|
s->alloc_depth = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s->info.flags & NBD_FLAG_READ_ONLY) {
|
|
|
|
ret = bdrv_apply_auto_read_only(bs, "NBD export is read-only", errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s->info.flags & NBD_FLAG_SEND_FUA) {
|
|
|
|
bs->supported_write_flags = BDRV_REQ_FUA;
|
|
|
|
bs->supported_zero_flags |= BDRV_REQ_FUA;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
|
|
|
|
bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
|
|
|
|
if (s->info.flags & NBD_FLAG_SEND_FAST_ZERO) {
|
|
|
|
bs->supported_zero_flags |= BDRV_REQ_NO_FALLBACK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
trace_nbd_client_handshake_success(s->export);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-06-10 13:08:00 +03:00
|
|
|
int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
|
|
|
|
Error **errp)
|
2019-10-09 11:41:57 +03:00
|
|
|
{
|
2021-06-10 13:07:58 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2020-07-27 21:47:47 +03:00
|
|
|
int ret;
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
bool blocking = nbd_client_connecting_wait(s);
|
2022-03-03 18:16:09 +03:00
|
|
|
IO_CODE();
|
2019-10-09 11:41:57 +03:00
|
|
|
|
2021-06-10 13:07:58 +03:00
|
|
|
assert(!s->ioc);
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
s->ioc = nbd_co_establish_connection(s->conn, &s->info, blocking, errp);
|
2021-06-10 13:07:58 +03:00
|
|
|
if (!s->ioc) {
|
|
|
|
return -ECONNREFUSED;
|
|
|
|
}
|
|
|
|
|
2021-07-04 01:07:30 +03:00
|
|
|
yank_register_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name), nbd_yank,
|
|
|
|
bs);
|
|
|
|
|
2021-06-10 13:07:58 +03:00
|
|
|
ret = nbd_handle_updated_info(s->bs, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
/*
|
|
|
|
* We have connected, but must fail for other reasons.
|
|
|
|
* Send NBD_CMD_DISC as a courtesy to the server.
|
|
|
|
*/
|
|
|
|
NBDRequest request = { .type = NBD_CMD_DISC };
|
|
|
|
|
|
|
|
nbd_send_request(s->ioc, &request);
|
|
|
|
|
2021-07-04 01:07:30 +03:00
|
|
|
yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name),
|
|
|
|
nbd_yank, bs);
|
2021-06-10 13:07:58 +03:00
|
|
|
object_unref(OBJECT(s->ioc));
|
|
|
|
s->ioc = NULL;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
qio_channel_set_blocking(s->ioc, false, NULL);
|
|
|
|
qio_channel_attach_aio_context(s->ioc, bdrv_get_aio_context(bs));
|
|
|
|
|
|
|
|
/* successfully connected */
|
|
|
|
s->state = NBD_CLIENT_CONNECTED;
|
|
|
|
qemu_co_queue_restart_all(&s->free_sema);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
/* called under s->send_mutex */
|
2021-06-10 13:07:58 +03:00
|
|
|
static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
|
|
|
|
{
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
assert(nbd_client_connecting(s));
|
|
|
|
assert(s->in_flight == 0);
|
2019-10-09 11:41:57 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
if (nbd_client_connecting_wait(s) && s->reconnect_delay &&
|
|
|
|
!s->reconnect_delay_timer)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* It's first reconnect attempt after switching to
|
|
|
|
* NBD_CLIENT_CONNECTING_WAIT
|
|
|
|
*/
|
|
|
|
reconnect_delay_timer_init(s,
|
|
|
|
qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
|
|
|
|
s->reconnect_delay * NANOSECONDS_PER_SECOND);
|
2019-10-09 11:41:57 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now we are sure that nobody is accessing the channel, and no one will
|
|
|
|
* try until we set the state to CONNECTED.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Finalize previous connection if any */
|
|
|
|
if (s->ioc) {
|
2020-09-03 22:02:59 +03:00
|
|
|
qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
|
2020-12-28 18:08:44 +03:00
|
|
|
yank_unregister_function(BLOCKDEV_YANK_INSTANCE(s->bs->node_name),
|
|
|
|
nbd_yank, s->bs);
|
2019-10-09 11:41:57 +03:00
|
|
|
object_unref(OBJECT(s->ioc));
|
|
|
|
s->ioc = NULL;
|
|
|
|
}
|
|
|
|
|
2021-06-10 13:07:58 +03:00
|
|
|
nbd_co_do_establish_connection(s->bs, NULL);
|
2022-02-04 14:10:06 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The reconnect attempt is done (maybe successfully, maybe not), so
|
|
|
|
* we no longer need this timer. Delete it so it will not outlive
|
|
|
|
* this I/O request (so draining removes all timers).
|
|
|
|
*/
|
|
|
|
reconnect_delay_timer_del(s);
|
2019-10-09 11:41:57 +03:00
|
|
|
}
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
|
2019-10-09 11:41:57 +03:00
|
|
|
{
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
int ret;
|
|
|
|
uint64_t ind = HANDLE_TO_INDEX(s, handle), ind2;
|
|
|
|
QEMU_LOCK_GUARD(&s->receive_mutex);
|
2019-10-09 11:41:57 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
while (true) {
|
|
|
|
if (s->reply.handle == handle) {
|
|
|
|
/* We are done */
|
|
|
|
return 0;
|
2019-10-09 11:41:57 +03:00
|
|
|
}
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
if (!nbd_client_connected(s)) {
|
|
|
|
return -EIO;
|
|
|
|
}
|
2019-06-11 13:27:19 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
if (s->reply.handle != 0) {
|
|
|
|
/*
|
|
|
|
* Some other request is being handled now. It should already be
|
|
|
|
* woken by whoever set s->reply.handle (or never wait in this
|
|
|
|
* yield). So, we should not wake it here.
|
|
|
|
*/
|
|
|
|
ind2 = HANDLE_TO_INDEX(s, s->reply.handle);
|
|
|
|
assert(!s->requests[ind2].receiving);
|
2019-06-11 13:27:19 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
s->requests[ind].receiving = true;
|
|
|
|
qemu_co_mutex_unlock(&s->receive_mutex);
|
2019-10-09 11:41:57 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
qemu_coroutine_yield();
|
|
|
|
/*
|
|
|
|
* We may be woken for 3 reasons:
|
|
|
|
* 1. From this function, executing in parallel coroutine, when our
|
|
|
|
* handle is received.
|
|
|
|
* 2. From nbd_channel_error(), when connection is lost.
|
|
|
|
* 3. From nbd_co_receive_one_chunk(), when previous request is
|
|
|
|
* finished and s->reply.handle set to 0.
|
|
|
|
* Anyway, it's OK to lock the mutex and go to the next iteration.
|
|
|
|
*/
|
2019-10-09 11:41:57 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
qemu_co_mutex_lock(&s->receive_mutex);
|
|
|
|
assert(!s->requests[ind].receiving);
|
2019-10-09 11:41:57 +03:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
/* We are under mutex and handle is 0. We have to do the dirty work. */
|
2019-06-11 13:27:19 +03:00
|
|
|
assert(s->reply.handle == 0);
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
ret = nbd_receive_reply(s->bs, s->ioc, &s->reply, NULL);
|
2019-06-11 13:27:19 +03:00
|
|
|
if (ret <= 0) {
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
ret = ret ? ret : -EIO;
|
|
|
|
nbd_channel_error(s, ret);
|
|
|
|
return ret;
|
2019-06-11 13:27:19 +03:00
|
|
|
}
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
if (nbd_reply_is_structured(&s->reply) && !s->info.structured_reply) {
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, -EINVAL);
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
return -EINVAL;
|
2019-06-11 13:27:19 +03:00
|
|
|
}
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
if (s->reply.handle == handle) {
|
|
|
|
/* We are done */
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
ind2 = HANDLE_TO_INDEX(s, s->reply.handle);
|
2021-09-02 13:38:05 +03:00
|
|
|
if (ind2 >= MAX_NBD_REQUESTS || !s->requests[ind2].reply_possible) {
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
nbd_channel_error(s, -EINVAL);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
nbd_recv_coroutine_wake_one(&s->requests[ind2]);
|
2020-01-22 19:45:28 +03:00
|
|
|
}
|
2019-06-11 13:27:19 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int nbd_co_send_request(BlockDriverState *bs,
|
|
|
|
NBDRequest *request,
|
|
|
|
QEMUIOVector *qiov)
|
|
|
|
{
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-18 14:43:22 +03:00
|
|
|
int rc, i = -1;
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
qemu_co_mutex_lock(&s->send_mutex);
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
|
|
|
|
while (s->in_flight == MAX_NBD_REQUESTS ||
|
|
|
|
(!nbd_client_connected(s) && s->in_flight > 0))
|
|
|
|
{
|
2019-06-11 13:27:19 +03:00
|
|
|
qemu_co_queue_wait(&s->free_sema, &s->send_mutex);
|
|
|
|
}
|
2019-06-18 14:43:22 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
if (nbd_client_connecting(s)) {
|
|
|
|
nbd_reconnect_attempt(s);
|
|
|
|
}
|
|
|
|
|
2021-06-10 13:08:01 +03:00
|
|
|
if (!nbd_client_connected(s)) {
|
2019-06-18 14:43:22 +03:00
|
|
|
rc = -EIO;
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:19 +03:00
|
|
|
s->in_flight++;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_NBD_REQUESTS; i++) {
|
|
|
|
if (s->requests[i].coroutine == NULL) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
g_assert(qemu_in_coroutine());
|
|
|
|
assert(i < MAX_NBD_REQUESTS);
|
|
|
|
|
|
|
|
s->requests[i].coroutine = qemu_coroutine_self();
|
|
|
|
s->requests[i].offset = request->from;
|
|
|
|
s->requests[i].receiving = false;
|
2021-09-02 13:38:05 +03:00
|
|
|
s->requests[i].reply_possible = true;
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
request->handle = INDEX_TO_HANDLE(s, i);
|
|
|
|
|
|
|
|
assert(s->ioc);
|
|
|
|
|
|
|
|
if (qiov) {
|
|
|
|
qio_channel_set_cork(s->ioc, true);
|
|
|
|
rc = nbd_send_request(s->ioc, request);
|
2021-06-10 13:08:01 +03:00
|
|
|
if (nbd_client_connected(s) && rc >= 0) {
|
2019-06-11 13:27:19 +03:00
|
|
|
if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov,
|
|
|
|
NULL) < 0) {
|
|
|
|
rc = -EIO;
|
|
|
|
}
|
|
|
|
} else if (rc >= 0) {
|
|
|
|
rc = -EIO;
|
|
|
|
}
|
|
|
|
qio_channel_set_cork(s->ioc, false);
|
|
|
|
} else {
|
|
|
|
rc = nbd_send_request(s->ioc, request);
|
|
|
|
}
|
|
|
|
|
|
|
|
err:
|
|
|
|
if (rc < 0) {
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, rc);
|
|
|
|
if (i != -1) {
|
|
|
|
s->requests[i].coroutine = NULL;
|
|
|
|
s->in_flight--;
|
2019-10-09 11:41:57 +03:00
|
|
|
qemu_co_queue_next(&s->free_sema);
|
|
|
|
}
|
2019-06-11 13:27:19 +03:00
|
|
|
}
|
|
|
|
qemu_co_mutex_unlock(&s->send_mutex);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uint16_t payload_advance16(uint8_t **payload)
|
|
|
|
{
|
|
|
|
*payload += 2;
|
|
|
|
return lduw_be_p(*payload - 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uint32_t payload_advance32(uint8_t **payload)
|
|
|
|
{
|
|
|
|
*payload += 4;
|
|
|
|
return ldl_be_p(*payload - 4);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline uint64_t payload_advance64(uint8_t **payload)
|
|
|
|
{
|
|
|
|
*payload += 8;
|
|
|
|
return ldq_be_p(*payload - 8);
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
static int nbd_parse_offset_hole_payload(BDRVNBDState *s,
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDStructuredReplyChunk *chunk,
|
|
|
|
uint8_t *payload, uint64_t orig_offset,
|
|
|
|
QEMUIOVector *qiov, Error **errp)
|
|
|
|
{
|
|
|
|
uint64_t offset;
|
|
|
|
uint32_t hole_size;
|
|
|
|
|
|
|
|
if (chunk->length != sizeof(offset) + sizeof(hole_size)) {
|
|
|
|
error_setg(errp, "Protocol error: invalid payload for "
|
|
|
|
"NBD_REPLY_TYPE_OFFSET_HOLE");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
offset = payload_advance64(&payload);
|
|
|
|
hole_size = payload_advance32(&payload);
|
|
|
|
|
|
|
|
if (!hole_size || offset < orig_offset || hole_size > qiov->size ||
|
|
|
|
offset > orig_offset + qiov->size - hole_size) {
|
|
|
|
error_setg(errp, "Protocol error: server sent chunk exceeding requested"
|
|
|
|
" region");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2019-06-11 13:27:20 +03:00
|
|
|
if (s->info.min_block &&
|
|
|
|
!QEMU_IS_ALIGNED(hole_size, s->info.min_block)) {
|
2019-06-11 13:27:19 +03:00
|
|
|
trace_nbd_structured_read_compliance("hole");
|
|
|
|
}
|
|
|
|
|
|
|
|
qemu_iovec_memset(qiov, offset - orig_offset, 0, hole_size);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nbd_parse_blockstatus_payload
|
|
|
|
* Based on our request, we expect only one extent in reply, for the
|
|
|
|
* base:allocation context.
|
|
|
|
*/
|
2019-06-11 13:27:20 +03:00
|
|
|
static int nbd_parse_blockstatus_payload(BDRVNBDState *s,
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDStructuredReplyChunk *chunk,
|
|
|
|
uint8_t *payload, uint64_t orig_length,
|
|
|
|
NBDExtent *extent, Error **errp)
|
|
|
|
{
|
|
|
|
uint32_t context_id;
|
|
|
|
|
|
|
|
/* The server succeeded, so it must have sent [at least] one extent */
|
|
|
|
if (chunk->length < sizeof(context_id) + sizeof(*extent)) {
|
|
|
|
error_setg(errp, "Protocol error: invalid payload for "
|
|
|
|
"NBD_REPLY_TYPE_BLOCK_STATUS");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
context_id = payload_advance32(&payload);
|
2019-06-11 13:27:20 +03:00
|
|
|
if (s->info.context_id != context_id) {
|
2019-06-11 13:27:19 +03:00
|
|
|
error_setg(errp, "Protocol error: unexpected context id %d for "
|
|
|
|
"NBD_REPLY_TYPE_BLOCK_STATUS, when negotiated context "
|
|
|
|
"id is %d", context_id,
|
2019-06-11 13:27:20 +03:00
|
|
|
s->info.context_id);
|
2019-06-11 13:27:19 +03:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
extent->length = payload_advance32(&payload);
|
|
|
|
extent->flags = payload_advance32(&payload);
|
|
|
|
|
|
|
|
if (extent->length == 0) {
|
|
|
|
error_setg(errp, "Protocol error: server sent status chunk with "
|
|
|
|
"zero length");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A server sending unaligned block status is in violation of the
|
|
|
|
* protocol, but as qemu-nbd 3.1 is such a server (at least for
|
|
|
|
* POSIX files that are not a multiple of 512 bytes, since qemu
|
|
|
|
* rounds files up to 512-byte multiples but lseek(SEEK_HOLE)
|
|
|
|
* still sees an implicit hole beyond the real EOF), it's nicer to
|
|
|
|
* work around the misbehaving server. If the request included
|
|
|
|
* more than the final unaligned block, truncate it back to an
|
|
|
|
* aligned result; if the request was only the final block, round
|
|
|
|
* up to the full block and change the status to fully-allocated
|
|
|
|
* (always a safe status, even if it loses information).
|
|
|
|
*/
|
2019-06-11 13:27:20 +03:00
|
|
|
if (s->info.min_block && !QEMU_IS_ALIGNED(extent->length,
|
|
|
|
s->info.min_block)) {
|
2019-06-11 13:27:19 +03:00
|
|
|
trace_nbd_parse_blockstatus_compliance("extent length is unaligned");
|
2019-06-11 13:27:20 +03:00
|
|
|
if (extent->length > s->info.min_block) {
|
2019-06-11 13:27:19 +03:00
|
|
|
extent->length = QEMU_ALIGN_DOWN(extent->length,
|
2019-06-11 13:27:20 +03:00
|
|
|
s->info.min_block);
|
2019-06-11 13:27:19 +03:00
|
|
|
} else {
|
2019-06-11 13:27:20 +03:00
|
|
|
extent->length = s->info.min_block;
|
2019-06-11 13:27:19 +03:00
|
|
|
extent->flags = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We used NBD_CMD_FLAG_REQ_ONE, so the server should not have
|
|
|
|
* sent us any more than one extent, nor should it have included
|
|
|
|
* status beyond our request in that extent. However, it's easy
|
|
|
|
* enough to ignore the server's noncompliance without killing the
|
|
|
|
* connection; just ignore trailing extents, and clamp things to
|
|
|
|
* the length of our request.
|
|
|
|
*/
|
|
|
|
if (chunk->length > sizeof(context_id) + sizeof(*extent)) {
|
|
|
|
trace_nbd_parse_blockstatus_compliance("more than one extent");
|
|
|
|
}
|
|
|
|
if (extent->length > orig_length) {
|
|
|
|
extent->length = orig_length;
|
|
|
|
trace_nbd_parse_blockstatus_compliance("extent length too large");
|
|
|
|
}
|
|
|
|
|
2020-10-27 08:05:55 +03:00
|
|
|
/*
|
|
|
|
* HACK: if we are using x-dirty-bitmaps to access
|
|
|
|
* qemu:allocation-depth, treat all depths > 2 the same as 2,
|
|
|
|
* since nbd_client_co_block_status is only expecting the low two
|
|
|
|
* bits to be set.
|
|
|
|
*/
|
|
|
|
if (s->alloc_depth && extent->flags > 2) {
|
|
|
|
extent->flags = 2;
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:19 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nbd_parse_error_payload
|
|
|
|
* on success @errp contains message describing nbd error reply
|
|
|
|
*/
|
|
|
|
static int nbd_parse_error_payload(NBDStructuredReplyChunk *chunk,
|
|
|
|
uint8_t *payload, int *request_ret,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
uint32_t error;
|
|
|
|
uint16_t message_size;
|
|
|
|
|
|
|
|
assert(chunk->type & (1 << 15));
|
|
|
|
|
|
|
|
if (chunk->length < sizeof(error) + sizeof(message_size)) {
|
|
|
|
error_setg(errp,
|
|
|
|
"Protocol error: invalid payload for structured error");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
error = nbd_errno_to_system_errno(payload_advance32(&payload));
|
|
|
|
if (error == 0) {
|
|
|
|
error_setg(errp, "Protocol error: server sent structured error chunk "
|
|
|
|
"with error = 0");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*request_ret = -error;
|
|
|
|
message_size = payload_advance16(&payload);
|
|
|
|
|
|
|
|
if (message_size > chunk->length - sizeof(error) - sizeof(message_size)) {
|
|
|
|
error_setg(errp, "Protocol error: server sent structured error chunk "
|
|
|
|
"with incorrect message size");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* TODO: Add a trace point to mention the server complaint */
|
|
|
|
|
|
|
|
/* TODO handle ERROR_OFFSET */
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
static int nbd_co_receive_offset_data_payload(BDRVNBDState *s,
|
2019-06-11 13:27:19 +03:00
|
|
|
uint64_t orig_offset,
|
|
|
|
QEMUIOVector *qiov, Error **errp)
|
|
|
|
{
|
|
|
|
QEMUIOVector sub_qiov;
|
|
|
|
uint64_t offset;
|
|
|
|
size_t data_size;
|
|
|
|
int ret;
|
|
|
|
NBDStructuredReplyChunk *chunk = &s->reply.structured;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&s->reply));
|
|
|
|
|
|
|
|
/* The NBD spec requires at least one byte of payload */
|
|
|
|
if (chunk->length <= sizeof(offset)) {
|
|
|
|
error_setg(errp, "Protocol error: invalid payload for "
|
|
|
|
"NBD_REPLY_TYPE_OFFSET_DATA");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nbd_read64(s->ioc, &offset, "OFFSET_DATA offset", errp) < 0) {
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
data_size = chunk->length - sizeof(offset);
|
|
|
|
assert(data_size);
|
|
|
|
if (offset < orig_offset || data_size > qiov->size ||
|
|
|
|
offset > orig_offset + qiov->size - data_size) {
|
|
|
|
error_setg(errp, "Protocol error: server sent chunk exceeding requested"
|
|
|
|
" region");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (s->info.min_block && !QEMU_IS_ALIGNED(data_size, s->info.min_block)) {
|
|
|
|
trace_nbd_structured_read_compliance("data");
|
|
|
|
}
|
|
|
|
|
|
|
|
qemu_iovec_init(&sub_qiov, qiov->niov);
|
|
|
|
qemu_iovec_concat(&sub_qiov, qiov, offset - orig_offset, data_size);
|
|
|
|
ret = qio_channel_readv_all(s->ioc, sub_qiov.iov, sub_qiov.niov, errp);
|
|
|
|
qemu_iovec_destroy(&sub_qiov);
|
|
|
|
|
|
|
|
return ret < 0 ? -EIO : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#define NBD_MAX_MALLOC_PAYLOAD 1000
|
|
|
|
static coroutine_fn int nbd_co_receive_structured_payload(
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s, void **payload, Error **errp)
|
2019-06-11 13:27:19 +03:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
uint32_t len;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&s->reply));
|
|
|
|
|
|
|
|
len = s->reply.structured.length;
|
|
|
|
|
|
|
|
if (len == 0) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (payload == NULL) {
|
|
|
|
error_setg(errp, "Unexpected structured payload");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len > NBD_MAX_MALLOC_PAYLOAD) {
|
|
|
|
error_setg(errp, "Payload too large");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*payload = g_new(char, len);
|
|
|
|
ret = nbd_read(s->ioc, *payload, len, "structured payload", errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
g_free(*payload);
|
|
|
|
*payload = NULL;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nbd_co_do_receive_one_chunk
|
|
|
|
* for simple reply:
|
|
|
|
* set request_ret to received reply error
|
|
|
|
* if qiov is not NULL: read payload to @qiov
|
|
|
|
* for structured reply chunk:
|
|
|
|
* if error chunk: read payload, set @request_ret, do not set @payload
|
|
|
|
* else if offset_data chunk: read payload data to @qiov, do not set @payload
|
|
|
|
* else: read payload to @payload
|
|
|
|
*
|
|
|
|
* If function fails, @errp contains corresponding error message, and the
|
|
|
|
* connection with the server is suspect. If it returns 0, then the
|
|
|
|
* transaction succeeded (although @request_ret may be a negative errno
|
|
|
|
* corresponding to the server's error reply), and errp is unchanged.
|
|
|
|
*/
|
|
|
|
static coroutine_fn int nbd_co_do_receive_one_chunk(
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s, uint64_t handle, bool only_structured,
|
2019-06-11 13:27:19 +03:00
|
|
|
int *request_ret, QEMUIOVector *qiov, void **payload, Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
int i = HANDLE_TO_INDEX(s, handle);
|
|
|
|
void *local_payload = NULL;
|
|
|
|
NBDStructuredReplyChunk *chunk;
|
|
|
|
|
|
|
|
if (payload) {
|
|
|
|
*payload = NULL;
|
|
|
|
}
|
|
|
|
*request_ret = 0;
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
nbd_receive_replies(s, handle);
|
2021-06-10 13:08:01 +03:00
|
|
|
if (!nbd_client_connected(s)) {
|
2019-06-11 13:27:19 +03:00
|
|
|
error_setg(errp, "Connection closed");
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
assert(s->ioc);
|
|
|
|
|
|
|
|
assert(s->reply.handle == handle);
|
|
|
|
|
|
|
|
if (nbd_reply_is_simple(&s->reply)) {
|
|
|
|
if (only_structured) {
|
|
|
|
error_setg(errp, "Protocol error: simple reply when structured "
|
|
|
|
"reply chunk was expected");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
*request_ret = -nbd_errno_to_system_errno(s->reply.simple.error);
|
|
|
|
if (*request_ret < 0 || !qiov) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return qio_channel_readv_all(s->ioc, qiov->iov, qiov->niov,
|
|
|
|
errp) < 0 ? -EIO : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* handle structured reply chunk */
|
|
|
|
assert(s->info.structured_reply);
|
|
|
|
chunk = &s->reply.structured;
|
|
|
|
|
|
|
|
if (chunk->type == NBD_REPLY_TYPE_NONE) {
|
|
|
|
if (!(chunk->flags & NBD_REPLY_FLAG_DONE)) {
|
|
|
|
error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk without"
|
|
|
|
" NBD_REPLY_FLAG_DONE flag set");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
if (chunk->length) {
|
|
|
|
error_setg(errp, "Protocol error: NBD_REPLY_TYPE_NONE chunk with"
|
|
|
|
" nonzero length");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (chunk->type == NBD_REPLY_TYPE_OFFSET_DATA) {
|
|
|
|
if (!qiov) {
|
|
|
|
error_setg(errp, "Unexpected NBD_REPLY_TYPE_OFFSET_DATA chunk");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nbd_co_receive_offset_data_payload(s, s->requests[i].offset,
|
|
|
|
qiov, errp);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nbd_reply_type_is_error(chunk->type)) {
|
|
|
|
payload = &local_payload;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_structured_payload(s, payload, errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (nbd_reply_type_is_error(chunk->type)) {
|
|
|
|
ret = nbd_parse_error_payload(chunk, local_payload, request_ret, errp);
|
|
|
|
g_free(local_payload);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nbd_co_receive_one_chunk
|
|
|
|
* Read reply, wake up connection_co and set s->quit if needed.
|
|
|
|
* Return value is a fatal error code or normal nbd reply error code
|
|
|
|
*/
|
|
|
|
static coroutine_fn int nbd_co_receive_one_chunk(
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s, uint64_t handle, bool only_structured,
|
2019-06-11 13:27:19 +03:00
|
|
|
int *request_ret, QEMUIOVector *qiov, NBDReply *reply, void **payload,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int ret = nbd_co_do_receive_one_chunk(s, handle, only_structured,
|
|
|
|
request_ret, qiov, payload, errp);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2019-07-19 20:20:01 +03:00
|
|
|
memset(reply, 0, sizeof(*reply));
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, ret);
|
2019-06-11 13:27:19 +03:00
|
|
|
} else {
|
|
|
|
/* For assert at loop start in nbd_connection_entry */
|
2019-07-19 20:20:01 +03:00
|
|
|
*reply = s->reply;
|
2019-06-11 13:27:19 +03:00
|
|
|
}
|
2019-10-09 11:41:57 +03:00
|
|
|
s->reply.handle = 0;
|
2019-06-11 13:27:19 +03:00
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
nbd_recv_coroutines_wake(s, false);
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef struct NBDReplyChunkIter {
|
|
|
|
int ret;
|
|
|
|
int request_ret;
|
|
|
|
Error *err;
|
|
|
|
bool done, only_structured;
|
|
|
|
} NBDReplyChunkIter;
|
|
|
|
|
|
|
|
static void nbd_iter_channel_error(NBDReplyChunkIter *iter,
|
|
|
|
int ret, Error **local_err)
|
|
|
|
{
|
2019-12-05 20:46:35 +03:00
|
|
|
assert(local_err && *local_err);
|
2019-06-11 13:27:19 +03:00
|
|
|
assert(ret < 0);
|
|
|
|
|
|
|
|
if (!iter->ret) {
|
|
|
|
iter->ret = ret;
|
|
|
|
error_propagate(&iter->err, *local_err);
|
|
|
|
} else {
|
|
|
|
error_free(*local_err);
|
|
|
|
}
|
|
|
|
|
|
|
|
*local_err = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nbd_iter_request_error(NBDReplyChunkIter *iter, int ret)
|
|
|
|
{
|
|
|
|
assert(ret < 0);
|
|
|
|
|
|
|
|
if (!iter->request_ret) {
|
|
|
|
iter->request_ret = ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* NBD_FOREACH_REPLY_CHUNK
|
|
|
|
* The pointer stored in @payload requires g_free() to free it.
|
|
|
|
*/
|
|
|
|
#define NBD_FOREACH_REPLY_CHUNK(s, iter, handle, structured, \
|
|
|
|
qiov, reply, payload) \
|
|
|
|
for (iter = (NBDReplyChunkIter) { .only_structured = structured }; \
|
|
|
|
nbd_reply_chunk_iter_receive(s, &iter, handle, qiov, reply, payload);)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nbd_reply_chunk_iter_receive
|
|
|
|
* The pointer stored in @payload requires g_free() to free it.
|
|
|
|
*/
|
2019-06-11 13:27:20 +03:00
|
|
|
static bool nbd_reply_chunk_iter_receive(BDRVNBDState *s,
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDReplyChunkIter *iter,
|
|
|
|
uint64_t handle,
|
|
|
|
QEMUIOVector *qiov, NBDReply *reply,
|
|
|
|
void **payload)
|
|
|
|
{
|
|
|
|
int ret, request_ret;
|
|
|
|
NBDReply local_reply;
|
|
|
|
NBDStructuredReplyChunk *chunk;
|
|
|
|
Error *local_err = NULL;
|
2021-06-10 13:08:01 +03:00
|
|
|
if (!nbd_client_connected(s)) {
|
2019-06-11 13:27:19 +03:00
|
|
|
error_setg(&local_err, "Connection closed");
|
|
|
|
nbd_iter_channel_error(iter, -EIO, &local_err);
|
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (iter->done) {
|
|
|
|
/* Previous iteration was last. */
|
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (reply == NULL) {
|
|
|
|
reply = &local_reply;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_one_chunk(s, handle, iter->only_structured,
|
|
|
|
&request_ret, qiov, reply, payload,
|
|
|
|
&local_err);
|
|
|
|
if (ret < 0) {
|
|
|
|
nbd_iter_channel_error(iter, ret, &local_err);
|
|
|
|
} else if (request_ret < 0) {
|
|
|
|
nbd_iter_request_error(iter, request_ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Do not execute the body of NBD_FOREACH_REPLY_CHUNK for simple reply. */
|
2021-06-10 13:08:01 +03:00
|
|
|
if (nbd_reply_is_simple(reply) || !nbd_client_connected(s)) {
|
2019-06-11 13:27:19 +03:00
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
chunk = &reply->structured;
|
|
|
|
iter->only_structured = true;
|
|
|
|
|
|
|
|
if (chunk->type == NBD_REPLY_TYPE_NONE) {
|
|
|
|
/* NBD_REPLY_FLAG_DONE is already checked in nbd_co_receive_one_chunk */
|
|
|
|
assert(chunk->flags & NBD_REPLY_FLAG_DONE);
|
|
|
|
goto break_loop;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (chunk->flags & NBD_REPLY_FLAG_DONE) {
|
|
|
|
/* This iteration is last. */
|
|
|
|
iter->done = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Execute the loop body */
|
|
|
|
return true;
|
|
|
|
|
|
|
|
break_loop:
|
|
|
|
s->requests[HANDLE_TO_INDEX(s, handle)].coroutine = NULL;
|
|
|
|
|
|
|
|
qemu_co_mutex_lock(&s->send_mutex);
|
|
|
|
s->in_flight--;
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
qemu_co_queue_next(&s->free_sema);
|
2019-06-11 13:27:19 +03:00
|
|
|
qemu_co_mutex_unlock(&s->send_mutex);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
static int nbd_co_receive_return_code(BDRVNBDState *s, uint64_t handle,
|
2019-06-11 13:27:19 +03:00
|
|
|
int *request_ret, Error **errp)
|
|
|
|
{
|
|
|
|
NBDReplyChunkIter iter;
|
|
|
|
|
|
|
|
NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, NULL, NULL) {
|
|
|
|
/* nbd_reply_chunk_iter_receive does all the work */
|
|
|
|
}
|
|
|
|
|
|
|
|
error_propagate(errp, iter.err);
|
|
|
|
*request_ret = iter.request_ret;
|
|
|
|
return iter.ret;
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
static int nbd_co_receive_cmdread_reply(BDRVNBDState *s, uint64_t handle,
|
2019-06-11 13:27:19 +03:00
|
|
|
uint64_t offset, QEMUIOVector *qiov,
|
|
|
|
int *request_ret, Error **errp)
|
|
|
|
{
|
|
|
|
NBDReplyChunkIter iter;
|
|
|
|
NBDReply reply;
|
|
|
|
void *payload = NULL;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
|
|
|
|
NBD_FOREACH_REPLY_CHUNK(s, iter, handle, s->info.structured_reply,
|
|
|
|
qiov, &reply, &payload)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
NBDStructuredReplyChunk *chunk = &reply.structured;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&reply));
|
|
|
|
|
|
|
|
switch (chunk->type) {
|
|
|
|
case NBD_REPLY_TYPE_OFFSET_DATA:
|
|
|
|
/*
|
|
|
|
* special cased in nbd_co_receive_one_chunk, data is already
|
|
|
|
* in qiov
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
case NBD_REPLY_TYPE_OFFSET_HOLE:
|
|
|
|
ret = nbd_parse_offset_hole_payload(s, &reply.structured, payload,
|
|
|
|
offset, qiov, &local_err);
|
|
|
|
if (ret < 0) {
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, ret);
|
2019-06-11 13:27:19 +03:00
|
|
|
nbd_iter_channel_error(&iter, ret, &local_err);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
if (!nbd_reply_type_is_error(chunk->type)) {
|
|
|
|
/* not allowed reply type */
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, -EINVAL);
|
2019-06-11 13:27:19 +03:00
|
|
|
error_setg(&local_err,
|
|
|
|
"Unexpected reply type: %d (%s) for CMD_READ",
|
|
|
|
chunk->type, nbd_reply_type_lookup(chunk->type));
|
|
|
|
nbd_iter_channel_error(&iter, -EINVAL, &local_err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
g_free(payload);
|
|
|
|
payload = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
error_propagate(errp, iter.err);
|
|
|
|
*request_ret = iter.request_ret;
|
|
|
|
return iter.ret;
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
static int nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
|
2019-06-11 13:27:19 +03:00
|
|
|
uint64_t handle, uint64_t length,
|
|
|
|
NBDExtent *extent,
|
|
|
|
int *request_ret, Error **errp)
|
|
|
|
{
|
|
|
|
NBDReplyChunkIter iter;
|
|
|
|
NBDReply reply;
|
|
|
|
void *payload = NULL;
|
|
|
|
Error *local_err = NULL;
|
|
|
|
bool received = false;
|
|
|
|
|
|
|
|
assert(!extent->length);
|
|
|
|
NBD_FOREACH_REPLY_CHUNK(s, iter, handle, false, NULL, &reply, &payload) {
|
|
|
|
int ret;
|
|
|
|
NBDStructuredReplyChunk *chunk = &reply.structured;
|
|
|
|
|
|
|
|
assert(nbd_reply_is_structured(&reply));
|
|
|
|
|
|
|
|
switch (chunk->type) {
|
|
|
|
case NBD_REPLY_TYPE_BLOCK_STATUS:
|
|
|
|
if (received) {
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, -EINVAL);
|
2019-06-11 13:27:19 +03:00
|
|
|
error_setg(&local_err, "Several BLOCK_STATUS chunks in reply");
|
|
|
|
nbd_iter_channel_error(&iter, -EINVAL, &local_err);
|
|
|
|
}
|
|
|
|
received = true;
|
|
|
|
|
|
|
|
ret = nbd_parse_blockstatus_payload(s, &reply.structured,
|
|
|
|
payload, length, extent,
|
|
|
|
&local_err);
|
|
|
|
if (ret < 0) {
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, ret);
|
2019-06-11 13:27:19 +03:00
|
|
|
nbd_iter_channel_error(&iter, ret, &local_err);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
if (!nbd_reply_type_is_error(chunk->type)) {
|
2019-06-18 14:43:22 +03:00
|
|
|
nbd_channel_error(s, -EINVAL);
|
2019-06-11 13:27:19 +03:00
|
|
|
error_setg(&local_err,
|
|
|
|
"Unexpected reply type: %d (%s) "
|
|
|
|
"for CMD_BLOCK_STATUS",
|
|
|
|
chunk->type, nbd_reply_type_lookup(chunk->type));
|
|
|
|
nbd_iter_channel_error(&iter, -EINVAL, &local_err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
g_free(payload);
|
|
|
|
payload = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!extent->length && !iter.request_ret) {
|
|
|
|
error_setg(&local_err, "Server did not reply with any status extents");
|
|
|
|
nbd_iter_channel_error(&iter, -EIO, &local_err);
|
|
|
|
}
|
|
|
|
|
|
|
|
error_propagate(errp, iter.err);
|
|
|
|
*request_ret = iter.request_ret;
|
|
|
|
return iter.ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nbd_co_request(BlockDriverState *bs, NBDRequest *request,
|
|
|
|
QEMUIOVector *write_qiov)
|
|
|
|
{
|
|
|
|
int ret, request_ret;
|
|
|
|
Error *local_err = NULL;
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
assert(request->type != NBD_CMD_READ);
|
|
|
|
if (write_qiov) {
|
|
|
|
assert(request->type == NBD_CMD_WRITE);
|
|
|
|
assert(request->len == iov_size(write_qiov->iov, write_qiov->niov));
|
|
|
|
} else {
|
|
|
|
assert(request->type != NBD_CMD_WRITE);
|
|
|
|
}
|
|
|
|
|
2019-10-09 11:41:57 +03:00
|
|
|
do {
|
|
|
|
ret = nbd_co_send_request(bs, request, write_qiov);
|
|
|
|
if (ret < 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_return_code(s, request->handle,
|
|
|
|
&request_ret, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
trace_nbd_co_request_fail(request->from, request->len,
|
|
|
|
request->handle, request->flags,
|
|
|
|
request->type,
|
|
|
|
nbd_cmd_lookup(request->type),
|
|
|
|
ret, error_get_pretty(local_err));
|
|
|
|
error_free(local_err);
|
|
|
|
local_err = NULL;
|
|
|
|
}
|
|
|
|
} while (ret < 0 && nbd_client_connecting_wait(s));
|
|
|
|
|
2019-06-11 13:27:19 +03:00
|
|
|
return ret ? ret : request_ret;
|
|
|
|
}
|
|
|
|
|
block: use int64_t instead of uint64_t in driver read handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver read handlers parameters which are already 64bit to
signed type.
While being here, convert also flags parameter to be BdrvRequestFlags.
Now let's consider all callers. Simple
git grep '\->bdrv_\(aio\|co\)_preadv\(_part\)\?'
shows that's there three callers of driver function:
bdrv_driver_preadv() in block/io.c, passes int64_t, checked by
bdrv_check_qiov_request() to be non-negative.
qcow2_load_vmstate() does bdrv_check_qiov_request().
do_perform_cow_read() has uint64_t argument. And a lot of things in
qcow2 driver are uint64_t, so converting it is big job. But we must
not work with requests that don't satisfy bdrv_check_qiov_request(),
so let's just assert it here.
Still, the functions may be called directly, not only by drv->...
Let's check:
git grep '\.bdrv_\(aio\|co\)_preadv\(_part\)\?\s*=' | \
awk '{print $4}' | sed 's/,//' | sed 's/&//' | sort | uniq | \
while read func; do git grep "$func(" | \
grep -v "$func(BlockDriverState"; done
The only one such caller:
QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
...
ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);
in tests/unit/test-bdrv-drain.c, and it's OK obviously.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-4-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: fix typos]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:27:59 +03:00
|
|
|
static int nbd_client_co_preadv(BlockDriverState *bs, int64_t offset,
|
|
|
|
int64_t bytes, QEMUIOVector *qiov,
|
|
|
|
BdrvRequestFlags flags)
|
2019-06-11 13:27:19 +03:00
|
|
|
{
|
|
|
|
int ret, request_ret;
|
|
|
|
Error *local_err = NULL;
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDRequest request = {
|
|
|
|
.type = NBD_CMD_READ,
|
|
|
|
.from = offset,
|
|
|
|
.len = bytes,
|
|
|
|
};
|
|
|
|
|
|
|
|
assert(bytes <= NBD_MAX_BUFFER_SIZE);
|
|
|
|
assert(!flags);
|
|
|
|
|
|
|
|
if (!bytes) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Work around the fact that the block layer doesn't do
|
|
|
|
* byte-accurate sizing yet - if the read exceeds the server's
|
|
|
|
* advertised size because the block layer rounded size up, then
|
|
|
|
* truncate the request to the server and tail-pad with zero.
|
|
|
|
*/
|
2019-06-11 13:27:20 +03:00
|
|
|
if (offset >= s->info.size) {
|
2019-06-11 13:27:19 +03:00
|
|
|
assert(bytes < BDRV_SECTOR_SIZE);
|
|
|
|
qemu_iovec_memset(qiov, 0, 0, bytes);
|
|
|
|
return 0;
|
|
|
|
}
|
2019-06-11 13:27:20 +03:00
|
|
|
if (offset + bytes > s->info.size) {
|
|
|
|
uint64_t slop = offset + bytes - s->info.size;
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
assert(slop < BDRV_SECTOR_SIZE);
|
|
|
|
qemu_iovec_memset(qiov, bytes - slop, 0, slop);
|
|
|
|
request.len -= slop;
|
|
|
|
}
|
|
|
|
|
2019-10-09 11:41:57 +03:00
|
|
|
do {
|
|
|
|
ret = nbd_co_send_request(bs, &request, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_cmdread_reply(s, request.handle, offset, qiov,
|
|
|
|
&request_ret, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
trace_nbd_co_request_fail(request.from, request.len, request.handle,
|
|
|
|
request.flags, request.type,
|
|
|
|
nbd_cmd_lookup(request.type),
|
|
|
|
ret, error_get_pretty(local_err));
|
|
|
|
error_free(local_err);
|
|
|
|
local_err = NULL;
|
|
|
|
}
|
|
|
|
} while (ret < 0 && nbd_client_connecting_wait(s));
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
return ret ? ret : request_ret;
|
|
|
|
}
|
|
|
|
|
block: use int64_t instead of uint64_t in driver write handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver write handlers parameters which are already 64bit to
signed type.
While being here, convert also flags parameter to be BdrvRequestFlags.
Now let's consider all callers. Simple
git grep '\->bdrv_\(aio\|co\)_pwritev\(_part\)\?'
shows that's there three callers of driver function:
bdrv_driver_pwritev() and bdrv_driver_pwritev_compressed() in
block/io.c, both pass int64_t, checked by bdrv_check_qiov_request() to
be non-negative.
qcow2_save_vmstate() does bdrv_check_qiov_request().
Still, the functions may be called directly, not only by drv->...
Let's check:
git grep '\.bdrv_\(aio\|co\)_pwritev\(_part\)\?\s*=' | \
awk '{print $4}' | sed 's/,//' | sed 's/&//' | sort | uniq | \
while read func; do git grep "$func(" | \
grep -v "$func(BlockDriverState"; done
shows several callers:
qcow2:
qcow2_co_truncate() write at most up to @offset, which is checked in
generic qcow2_co_truncate() by bdrv_check_request().
qcow2_co_pwritev_compressed_task() pass the request (or part of the
request) that already went through normal write path, so it should
be OK
qcow:
qcow_co_pwritev_compressed() pass int64_t, it's updated by this patch
quorum:
quorum_co_pwrite_zeroes() pass int64_t and int - OK
throttle:
throttle_co_pwritev_compressed() pass int64_t, it's updated by this
patch
vmdk:
vmdk_co_pwritev_compressed() pass int64_t, it's updated by this
patch
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:28:00 +03:00
|
|
|
static int nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset,
|
|
|
|
int64_t bytes, QEMUIOVector *qiov,
|
|
|
|
BdrvRequestFlags flags)
|
2019-06-11 13:27:19 +03:00
|
|
|
{
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDRequest request = {
|
|
|
|
.type = NBD_CMD_WRITE,
|
|
|
|
.from = offset,
|
|
|
|
.len = bytes,
|
|
|
|
};
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
|
2019-06-11 13:27:19 +03:00
|
|
|
if (flags & BDRV_REQ_FUA) {
|
2019-06-11 13:27:20 +03:00
|
|
|
assert(s->info.flags & NBD_FLAG_SEND_FUA);
|
2019-06-11 13:27:19 +03:00
|
|
|
request.flags |= NBD_CMD_FLAG_FUA;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(bytes <= NBD_MAX_BUFFER_SIZE);
|
|
|
|
|
|
|
|
if (!bytes) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return nbd_co_request(bs, &request, qiov);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
|
block: use int64_t instead of int in driver write_zeroes handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver write_zeroes handlers bytes parameter to int64_t.
The only caller of all updated function is bdrv_co_do_pwrite_zeroes().
bdrv_co_do_pwrite_zeroes() itself is of course OK with widening of
callee parameter type. Also, bdrv_co_do_pwrite_zeroes()'s
max_write_zeroes is limited to INT_MAX. So, updated functions all are
safe, they will not get "bytes" larger than before.
Still, let's look through all updated functions, and add assertions to
the ones which are actually unprepared to values larger than INT_MAX.
For these drivers also set explicit max_pwrite_zeroes limit.
Let's go:
blkdebug: calculations can't overflow, thanks to
bdrv_check_qiov_request() in generic layer. rule_check() and
bdrv_co_pwrite_zeroes() both have 64bit argument.
blklogwrites: pass to blk_log_writes_co_log() with 64bit argument.
blkreplay, copy-on-read, filter-compress: pass to
bdrv_co_pwrite_zeroes() which is OK
copy-before-write: Calls cbw_do_copy_before_write() and
bdrv_co_pwrite_zeroes, both have 64bit argument.
file-posix: both handler calls raw_do_pwrite_zeroes, which is updated.
In raw_do_pwrite_zeroes() calculations are OK due to
bdrv_check_qiov_request(), bytes go to RawPosixAIOData::aio_nbytes
which is uint64_t.
Check also where that uint64_t gets handed:
handle_aiocb_write_zeroes_block() passes a uint64_t[2] to
ioctl(BLKZEROOUT), handle_aiocb_write_zeroes() calls do_fallocate()
which takes off_t (and we compile to always have 64-bit off_t), as
does handle_aiocb_write_zeroes_unmap. All look safe.
gluster: bytes go to GlusterAIOCB::size which is int64_t and to
glfs_zerofill_async works with off_t.
iscsi: Aha, here we deal with iscsi_writesame16_task() that has
uint32_t num_blocks argument and iscsi_writesame16_task() has
uint16_t argument. Make comments, add assertions and clarify
max_pwrite_zeroes calculation.
iscsi_allocmap_() functions already has int64_t argument
is_byte_request_lun_aligned is simple to update, do it.
mirror_top: pass to bdrv_mirror_top_do_write which has uint64_t
argument
nbd: Aha, here we have protocol limitation, and NBDRequest::len is
uint32_t. max_pwrite_zeroes is cleanly set to 32bit value, so we are
OK for now.
nvme: Again, protocol limitation. And no inherent limit for
write-zeroes at all. But from code that calculates cdw12 it's obvious
that we do have limit and alignment. Let's clarify it. Also,
obviously the code is not prepared to handle bytes=0. Let's handle
this case too.
trace events already 64bit
preallocate: pass to handle_write() and bdrv_co_pwrite_zeroes(), both
64bit.
rbd: pass to qemu_rbd_start_co() which is 64bit.
qcow2: offset + bytes and alignment still works good (thanks to
bdrv_check_qiov_request()), so tail calculation is OK
qcow2_subcluster_zeroize() has 64bit argument, should be OK
trace events updated
qed: qed_co_request wants int nb_sectors. Also in code we have size_t
used for request length which may be 32bit. So, let's just keep
INT_MAX as a limit (aligning it down to pwrite_zeroes_alignment) and
don't care.
raw-format: Is OK. raw_adjust_offset and bdrv_co_pwrite_zeroes are both
64bit.
throttle: Both throttle_group_co_io_limits_intercept() and
bdrv_co_pwrite_zeroes() are 64bit.
vmdk: pass to vmdk_pwritev which is 64bit
quorum: pass to quorum_co_pwritev() which is 64bit
Hooray!
At this point all block drivers are prepared to support 64bit
write-zero requests, or have explicitly set max_pwrite_zeroes.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-8-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: use <= rather than < in assertions relying on max_pwrite_zeroes]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:28:03 +03:00
|
|
|
int64_t bytes, BdrvRequestFlags flags)
|
2019-06-11 13:27:19 +03:00
|
|
|
{
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDRequest request = {
|
|
|
|
.type = NBD_CMD_WRITE_ZEROES,
|
|
|
|
.from = offset,
|
block: use int64_t instead of int in driver write_zeroes handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver write_zeroes handlers bytes parameter to int64_t.
The only caller of all updated function is bdrv_co_do_pwrite_zeroes().
bdrv_co_do_pwrite_zeroes() itself is of course OK with widening of
callee parameter type. Also, bdrv_co_do_pwrite_zeroes()'s
max_write_zeroes is limited to INT_MAX. So, updated functions all are
safe, they will not get "bytes" larger than before.
Still, let's look through all updated functions, and add assertions to
the ones which are actually unprepared to values larger than INT_MAX.
For these drivers also set explicit max_pwrite_zeroes limit.
Let's go:
blkdebug: calculations can't overflow, thanks to
bdrv_check_qiov_request() in generic layer. rule_check() and
bdrv_co_pwrite_zeroes() both have 64bit argument.
blklogwrites: pass to blk_log_writes_co_log() with 64bit argument.
blkreplay, copy-on-read, filter-compress: pass to
bdrv_co_pwrite_zeroes() which is OK
copy-before-write: Calls cbw_do_copy_before_write() and
bdrv_co_pwrite_zeroes, both have 64bit argument.
file-posix: both handler calls raw_do_pwrite_zeroes, which is updated.
In raw_do_pwrite_zeroes() calculations are OK due to
bdrv_check_qiov_request(), bytes go to RawPosixAIOData::aio_nbytes
which is uint64_t.
Check also where that uint64_t gets handed:
handle_aiocb_write_zeroes_block() passes a uint64_t[2] to
ioctl(BLKZEROOUT), handle_aiocb_write_zeroes() calls do_fallocate()
which takes off_t (and we compile to always have 64-bit off_t), as
does handle_aiocb_write_zeroes_unmap. All look safe.
gluster: bytes go to GlusterAIOCB::size which is int64_t and to
glfs_zerofill_async works with off_t.
iscsi: Aha, here we deal with iscsi_writesame16_task() that has
uint32_t num_blocks argument and iscsi_writesame16_task() has
uint16_t argument. Make comments, add assertions and clarify
max_pwrite_zeroes calculation.
iscsi_allocmap_() functions already has int64_t argument
is_byte_request_lun_aligned is simple to update, do it.
mirror_top: pass to bdrv_mirror_top_do_write which has uint64_t
argument
nbd: Aha, here we have protocol limitation, and NBDRequest::len is
uint32_t. max_pwrite_zeroes is cleanly set to 32bit value, so we are
OK for now.
nvme: Again, protocol limitation. And no inherent limit for
write-zeroes at all. But from code that calculates cdw12 it's obvious
that we do have limit and alignment. Let's clarify it. Also,
obviously the code is not prepared to handle bytes=0. Let's handle
this case too.
trace events already 64bit
preallocate: pass to handle_write() and bdrv_co_pwrite_zeroes(), both
64bit.
rbd: pass to qemu_rbd_start_co() which is 64bit.
qcow2: offset + bytes and alignment still works good (thanks to
bdrv_check_qiov_request()), so tail calculation is OK
qcow2_subcluster_zeroize() has 64bit argument, should be OK
trace events updated
qed: qed_co_request wants int nb_sectors. Also in code we have size_t
used for request length which may be 32bit. So, let's just keep
INT_MAX as a limit (aligning it down to pwrite_zeroes_alignment) and
don't care.
raw-format: Is OK. raw_adjust_offset and bdrv_co_pwrite_zeroes are both
64bit.
throttle: Both throttle_group_co_io_limits_intercept() and
bdrv_co_pwrite_zeroes() are 64bit.
vmdk: pass to vmdk_pwritev which is 64bit
quorum: pass to quorum_co_pwritev() which is 64bit
Hooray!
At this point all block drivers are prepared to support 64bit
write-zero requests, or have explicitly set max_pwrite_zeroes.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-8-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: use <= rather than < in assertions relying on max_pwrite_zeroes]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:28:03 +03:00
|
|
|
.len = bytes, /* .len is uint32_t actually */
|
2019-06-11 13:27:19 +03:00
|
|
|
};
|
|
|
|
|
block: use int64_t instead of int in driver write_zeroes handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver write_zeroes handlers bytes parameter to int64_t.
The only caller of all updated function is bdrv_co_do_pwrite_zeroes().
bdrv_co_do_pwrite_zeroes() itself is of course OK with widening of
callee parameter type. Also, bdrv_co_do_pwrite_zeroes()'s
max_write_zeroes is limited to INT_MAX. So, updated functions all are
safe, they will not get "bytes" larger than before.
Still, let's look through all updated functions, and add assertions to
the ones which are actually unprepared to values larger than INT_MAX.
For these drivers also set explicit max_pwrite_zeroes limit.
Let's go:
blkdebug: calculations can't overflow, thanks to
bdrv_check_qiov_request() in generic layer. rule_check() and
bdrv_co_pwrite_zeroes() both have 64bit argument.
blklogwrites: pass to blk_log_writes_co_log() with 64bit argument.
blkreplay, copy-on-read, filter-compress: pass to
bdrv_co_pwrite_zeroes() which is OK
copy-before-write: Calls cbw_do_copy_before_write() and
bdrv_co_pwrite_zeroes, both have 64bit argument.
file-posix: both handler calls raw_do_pwrite_zeroes, which is updated.
In raw_do_pwrite_zeroes() calculations are OK due to
bdrv_check_qiov_request(), bytes go to RawPosixAIOData::aio_nbytes
which is uint64_t.
Check also where that uint64_t gets handed:
handle_aiocb_write_zeroes_block() passes a uint64_t[2] to
ioctl(BLKZEROOUT), handle_aiocb_write_zeroes() calls do_fallocate()
which takes off_t (and we compile to always have 64-bit off_t), as
does handle_aiocb_write_zeroes_unmap. All look safe.
gluster: bytes go to GlusterAIOCB::size which is int64_t and to
glfs_zerofill_async works with off_t.
iscsi: Aha, here we deal with iscsi_writesame16_task() that has
uint32_t num_blocks argument and iscsi_writesame16_task() has
uint16_t argument. Make comments, add assertions and clarify
max_pwrite_zeroes calculation.
iscsi_allocmap_() functions already has int64_t argument
is_byte_request_lun_aligned is simple to update, do it.
mirror_top: pass to bdrv_mirror_top_do_write which has uint64_t
argument
nbd: Aha, here we have protocol limitation, and NBDRequest::len is
uint32_t. max_pwrite_zeroes is cleanly set to 32bit value, so we are
OK for now.
nvme: Again, protocol limitation. And no inherent limit for
write-zeroes at all. But from code that calculates cdw12 it's obvious
that we do have limit and alignment. Let's clarify it. Also,
obviously the code is not prepared to handle bytes=0. Let's handle
this case too.
trace events already 64bit
preallocate: pass to handle_write() and bdrv_co_pwrite_zeroes(), both
64bit.
rbd: pass to qemu_rbd_start_co() which is 64bit.
qcow2: offset + bytes and alignment still works good (thanks to
bdrv_check_qiov_request()), so tail calculation is OK
qcow2_subcluster_zeroize() has 64bit argument, should be OK
trace events updated
qed: qed_co_request wants int nb_sectors. Also in code we have size_t
used for request length which may be 32bit. So, let's just keep
INT_MAX as a limit (aligning it down to pwrite_zeroes_alignment) and
don't care.
raw-format: Is OK. raw_adjust_offset and bdrv_co_pwrite_zeroes are both
64bit.
throttle: Both throttle_group_co_io_limits_intercept() and
bdrv_co_pwrite_zeroes() are 64bit.
vmdk: pass to vmdk_pwritev which is 64bit
quorum: pass to quorum_co_pwritev() which is 64bit
Hooray!
At this point all block drivers are prepared to support 64bit
write-zero requests, or have explicitly set max_pwrite_zeroes.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-8-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: use <= rather than < in assertions relying on max_pwrite_zeroes]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:28:03 +03:00
|
|
|
assert(bytes <= UINT32_MAX); /* rely on max_pwrite_zeroes */
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
|
|
|
|
if (!(s->info.flags & NBD_FLAG_SEND_WRITE_ZEROES)) {
|
2019-06-11 13:27:19 +03:00
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (flags & BDRV_REQ_FUA) {
|
2019-06-11 13:27:20 +03:00
|
|
|
assert(s->info.flags & NBD_FLAG_SEND_FUA);
|
2019-06-11 13:27:19 +03:00
|
|
|
request.flags |= NBD_CMD_FLAG_FUA;
|
|
|
|
}
|
|
|
|
if (!(flags & BDRV_REQ_MAY_UNMAP)) {
|
|
|
|
request.flags |= NBD_CMD_FLAG_NO_HOLE;
|
|
|
|
}
|
2019-08-23 17:37:24 +03:00
|
|
|
if (flags & BDRV_REQ_NO_FALLBACK) {
|
|
|
|
assert(s->info.flags & NBD_FLAG_SEND_FAST_ZERO);
|
|
|
|
request.flags |= NBD_CMD_FLAG_FAST_ZERO;
|
|
|
|
}
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
if (!bytes) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
return nbd_co_request(bs, &request, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nbd_client_co_flush(BlockDriverState *bs)
|
|
|
|
{
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDRequest request = { .type = NBD_CMD_FLUSH };
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
if (!(s->info.flags & NBD_FLAG_SEND_FLUSH)) {
|
2019-06-11 13:27:19 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
request.from = 0;
|
|
|
|
request.len = 0;
|
|
|
|
|
|
|
|
return nbd_co_request(bs, &request, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset,
|
block: use int64_t instead of int in driver discard handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver discard handlers bytes parameter to int64_t.
The only caller of all updated function is bdrv_co_pdiscard in
block/io.c. It is already prepared to work with 64bit requests, but
pass at most max(bs->bl.max_pdiscard, INT_MAX) to the driver.
Let's look at all updated functions:
blkdebug: all calculations are still OK, thanks to
bdrv_check_qiov_request().
both rule_check and bdrv_co_pdiscard are 64bit
blklogwrites: pass to blk_loc_writes_co_log which is 64bit
blkreplay, copy-on-read, filter-compress: pass to bdrv_co_pdiscard, OK
copy-before-write: pass to bdrv_co_pdiscard which is 64bit and to
cbw_do_copy_before_write which is 64bit
file-posix: one handler calls raw_account_discard() is 64bit and both
handlers calls raw_do_pdiscard(). Update raw_do_pdiscard, which pass
to RawPosixAIOData::aio_nbytes, which is 64bit (and calls
raw_account_discard())
gluster: somehow, third argument of glfs_discard_async is size_t.
Let's set max_pdiscard accordingly.
iscsi: iscsi_allocmap_set_invalid is 64bit,
!is_byte_request_lun_aligned is 64bit.
list.num is uint32_t. Let's clarify max_pdiscard and
pdiscard_alignment.
mirror_top: pass to bdrv_mirror_top_do_write() which is
64bit
nbd: protocol limitation. max_pdiscard is alredy set strict enough,
keep it as is for now.
nvme: buf.nlb is uint32_t and we do shift. So, add corresponding limits
to nvme_refresh_limits().
preallocate: pass to bdrv_co_pdiscard() which is 64bit.
rbd: pass to qemu_rbd_start_co() which is 64bit.
qcow2: calculations are still OK, thanks to bdrv_check_qiov_request(),
qcow2_cluster_discard() is 64bit.
raw-format: raw_adjust_offset() is 64bit, bdrv_co_pdiscard too.
throttle: pass to bdrv_co_pdiscard() which is 64bit and to
throttle_group_co_io_limits_intercept() which is 64bit as well.
test-block-iothread: bytes argument is unused
Great! Now all drivers are prepared to handle 64bit discard requests,
or else have explicit max_pdiscard limits.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-11-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:28:06 +03:00
|
|
|
int64_t bytes)
|
2019-06-11 13:27:19 +03:00
|
|
|
{
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDRequest request = {
|
|
|
|
.type = NBD_CMD_TRIM,
|
|
|
|
.from = offset,
|
block: use int64_t instead of int in driver discard handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver discard handlers bytes parameter to int64_t.
The only caller of all updated function is bdrv_co_pdiscard in
block/io.c. It is already prepared to work with 64bit requests, but
pass at most max(bs->bl.max_pdiscard, INT_MAX) to the driver.
Let's look at all updated functions:
blkdebug: all calculations are still OK, thanks to
bdrv_check_qiov_request().
both rule_check and bdrv_co_pdiscard are 64bit
blklogwrites: pass to blk_loc_writes_co_log which is 64bit
blkreplay, copy-on-read, filter-compress: pass to bdrv_co_pdiscard, OK
copy-before-write: pass to bdrv_co_pdiscard which is 64bit and to
cbw_do_copy_before_write which is 64bit
file-posix: one handler calls raw_account_discard() is 64bit and both
handlers calls raw_do_pdiscard(). Update raw_do_pdiscard, which pass
to RawPosixAIOData::aio_nbytes, which is 64bit (and calls
raw_account_discard())
gluster: somehow, third argument of glfs_discard_async is size_t.
Let's set max_pdiscard accordingly.
iscsi: iscsi_allocmap_set_invalid is 64bit,
!is_byte_request_lun_aligned is 64bit.
list.num is uint32_t. Let's clarify max_pdiscard and
pdiscard_alignment.
mirror_top: pass to bdrv_mirror_top_do_write() which is
64bit
nbd: protocol limitation. max_pdiscard is alredy set strict enough,
keep it as is for now.
nvme: buf.nlb is uint32_t and we do shift. So, add corresponding limits
to nvme_refresh_limits().
preallocate: pass to bdrv_co_pdiscard() which is 64bit.
rbd: pass to qemu_rbd_start_co() which is 64bit.
qcow2: calculations are still OK, thanks to bdrv_check_qiov_request(),
qcow2_cluster_discard() is 64bit.
raw-format: raw_adjust_offset() is 64bit, bdrv_co_pdiscard too.
throttle: pass to bdrv_co_pdiscard() which is 64bit and to
throttle_group_co_io_limits_intercept() which is 64bit as well.
test-block-iothread: bytes argument is unused
Great! Now all drivers are prepared to handle 64bit discard requests,
or else have explicit max_pdiscard limits.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-11-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:28:06 +03:00
|
|
|
.len = bytes, /* len is uint32_t */
|
2019-06-11 13:27:19 +03:00
|
|
|
};
|
|
|
|
|
block: use int64_t instead of int in driver discard handlers
We are generally moving to int64_t for both offset and bytes parameters
on all io paths.
Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.
We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).
So, convert driver discard handlers bytes parameter to int64_t.
The only caller of all updated function is bdrv_co_pdiscard in
block/io.c. It is already prepared to work with 64bit requests, but
pass at most max(bs->bl.max_pdiscard, INT_MAX) to the driver.
Let's look at all updated functions:
blkdebug: all calculations are still OK, thanks to
bdrv_check_qiov_request().
both rule_check and bdrv_co_pdiscard are 64bit
blklogwrites: pass to blk_loc_writes_co_log which is 64bit
blkreplay, copy-on-read, filter-compress: pass to bdrv_co_pdiscard, OK
copy-before-write: pass to bdrv_co_pdiscard which is 64bit and to
cbw_do_copy_before_write which is 64bit
file-posix: one handler calls raw_account_discard() is 64bit and both
handlers calls raw_do_pdiscard(). Update raw_do_pdiscard, which pass
to RawPosixAIOData::aio_nbytes, which is 64bit (and calls
raw_account_discard())
gluster: somehow, third argument of glfs_discard_async is size_t.
Let's set max_pdiscard accordingly.
iscsi: iscsi_allocmap_set_invalid is 64bit,
!is_byte_request_lun_aligned is 64bit.
list.num is uint32_t. Let's clarify max_pdiscard and
pdiscard_alignment.
mirror_top: pass to bdrv_mirror_top_do_write() which is
64bit
nbd: protocol limitation. max_pdiscard is alredy set strict enough,
keep it as is for now.
nvme: buf.nlb is uint32_t and we do shift. So, add corresponding limits
to nvme_refresh_limits().
preallocate: pass to bdrv_co_pdiscard() which is 64bit.
rbd: pass to qemu_rbd_start_co() which is 64bit.
qcow2: calculations are still OK, thanks to bdrv_check_qiov_request(),
qcow2_cluster_discard() is 64bit.
raw-format: raw_adjust_offset() is 64bit, bdrv_co_pdiscard too.
throttle: pass to bdrv_co_pdiscard() which is 64bit and to
throttle_group_co_io_limits_intercept() which is 64bit as well.
test-block-iothread: bytes argument is unused
Great! Now all drivers are prepared to handle 64bit discard requests,
or else have explicit max_pdiscard limits.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210903102807.27127-11-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-03 13:28:06 +03:00
|
|
|
assert(bytes <= UINT32_MAX); /* rely on max_pdiscard */
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
assert(!(s->info.flags & NBD_FLAG_READ_ONLY));
|
|
|
|
if (!(s->info.flags & NBD_FLAG_SEND_TRIM) || !bytes) {
|
2019-06-11 13:27:19 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return nbd_co_request(bs, &request, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int coroutine_fn nbd_client_co_block_status(
|
|
|
|
BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
|
|
|
|
int64_t *pnum, int64_t *map, BlockDriverState **file)
|
|
|
|
{
|
|
|
|
int ret, request_ret;
|
|
|
|
NBDExtent extent = { 0 };
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
Error *local_err = NULL;
|
|
|
|
|
|
|
|
NBDRequest request = {
|
|
|
|
.type = NBD_CMD_BLOCK_STATUS,
|
|
|
|
.from = offset,
|
2020-04-01 18:01:07 +03:00
|
|
|
.len = MIN(QEMU_ALIGN_DOWN(INT_MAX, bs->bl.request_alignment),
|
2019-06-11 13:27:20 +03:00
|
|
|
MIN(bytes, s->info.size - offset)),
|
2019-06-11 13:27:19 +03:00
|
|
|
.flags = NBD_CMD_FLAG_REQ_ONE,
|
|
|
|
};
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
if (!s->info.base_allocation) {
|
2019-06-11 13:27:19 +03:00
|
|
|
*pnum = bytes;
|
|
|
|
*map = offset;
|
|
|
|
*file = bs;
|
|
|
|
return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Work around the fact that the block layer doesn't do
|
|
|
|
* byte-accurate sizing yet - if the status request exceeds the
|
|
|
|
* server's advertised size because the block layer rounded size
|
|
|
|
* up, we truncated the request to the server (above), or are
|
|
|
|
* called on just the hole.
|
|
|
|
*/
|
2019-06-11 13:27:20 +03:00
|
|
|
if (offset >= s->info.size) {
|
2019-06-11 13:27:19 +03:00
|
|
|
*pnum = bytes;
|
|
|
|
assert(bytes < BDRV_SECTOR_SIZE);
|
|
|
|
/* Intentionally don't report offset_valid for the hole */
|
|
|
|
return BDRV_BLOCK_ZERO;
|
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
if (s->info.min_block) {
|
|
|
|
assert(QEMU_IS_ALIGNED(request.len, s->info.min_block));
|
2019-06-11 13:27:19 +03:00
|
|
|
}
|
2019-10-09 11:41:57 +03:00
|
|
|
do {
|
|
|
|
ret = nbd_co_send_request(bs, &request, NULL);
|
|
|
|
if (ret < 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = nbd_co_receive_blockstatus_reply(s, request.handle, bytes,
|
|
|
|
&extent, &request_ret,
|
|
|
|
&local_err);
|
|
|
|
if (local_err) {
|
|
|
|
trace_nbd_co_request_fail(request.from, request.len, request.handle,
|
|
|
|
request.flags, request.type,
|
|
|
|
nbd_cmd_lookup(request.type),
|
|
|
|
ret, error_get_pretty(local_err));
|
|
|
|
error_free(local_err);
|
|
|
|
local_err = NULL;
|
|
|
|
}
|
|
|
|
} while (ret < 0 && nbd_client_connecting_wait(s));
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
if (ret < 0 || request_ret < 0) {
|
|
|
|
return ret ? ret : request_ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(extent.length);
|
|
|
|
*pnum = extent.length;
|
|
|
|
*map = offset;
|
|
|
|
*file = bs;
|
|
|
|
return (extent.flags & NBD_STATE_HOLE ? 0 : BDRV_BLOCK_DATA) |
|
|
|
|
(extent.flags & NBD_STATE_ZERO ? BDRV_BLOCK_ZERO : 0) |
|
|
|
|
BDRV_BLOCK_OFFSET_VALID;
|
|
|
|
}
|
|
|
|
|
2019-10-01 00:38:20 +03:00
|
|
|
static int nbd_client_reopen_prepare(BDRVReopenState *state,
|
|
|
|
BlockReopenQueue *queue, Error **errp)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = (BDRVNBDState *)state->bs->opaque;
|
|
|
|
|
|
|
|
if ((state->flags & BDRV_O_RDWR) && (s->info.flags & NBD_FLAG_READ_ONLY)) {
|
|
|
|
error_setg(errp, "Can't reopen read-only NBD mount as read/write");
|
|
|
|
return -EACCES;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-12-28 18:08:44 +03:00
|
|
|
static void nbd_yank(void *opaque)
|
|
|
|
{
|
|
|
|
BlockDriverState *bs = opaque;
|
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
|
|
|
|
|
|
|
qatomic_store_release(&s->state, NBD_CLIENT_QUIT);
|
2021-06-10 13:07:55 +03:00
|
|
|
qio_channel_shutdown(QIO_CHANNEL(s->ioc), QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
|
2020-12-28 18:08:44 +03:00
|
|
|
}
|
|
|
|
|
2019-06-11 13:27:19 +03:00
|
|
|
static void nbd_client_close(BlockDriverState *bs)
|
|
|
|
{
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
2019-06-11 13:27:19 +03:00
|
|
|
NBDRequest request = { .type = NBD_CMD_DISC };
|
|
|
|
|
2019-10-09 11:41:57 +03:00
|
|
|
if (s->ioc) {
|
|
|
|
nbd_send_request(s->ioc, &request);
|
|
|
|
}
|
2019-06-11 13:27:19 +03:00
|
|
|
|
|
|
|
nbd_teardown_connection(bs);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2019-06-18 14:43:24 +03:00
|
|
|
/*
|
|
|
|
* Parse nbd_open options
|
|
|
|
*/
|
2019-06-11 13:27:19 +03:00
|
|
|
|
2013-03-07 19:15:11 +04:00
|
|
|
static int nbd_parse_uri(const char *filename, QDict *options)
|
2012-11-04 16:04:24 +04:00
|
|
|
{
|
|
|
|
URI *uri;
|
|
|
|
const char *p;
|
|
|
|
QueryParams *qp = NULL;
|
|
|
|
int ret = 0;
|
2013-03-07 19:15:11 +04:00
|
|
|
bool is_unix;
|
2012-11-04 16:04:24 +04:00
|
|
|
|
|
|
|
uri = uri_parse(filename);
|
|
|
|
if (!uri) {
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* transport */
|
2017-06-13 23:57:26 +03:00
|
|
|
if (!g_strcmp0(uri->scheme, "nbd")) {
|
2013-03-07 19:15:11 +04:00
|
|
|
is_unix = false;
|
2017-06-13 23:57:26 +03:00
|
|
|
} else if (!g_strcmp0(uri->scheme, "nbd+tcp")) {
|
2013-03-07 19:15:11 +04:00
|
|
|
is_unix = false;
|
2017-06-13 23:57:26 +03:00
|
|
|
} else if (!g_strcmp0(uri->scheme, "nbd+unix")) {
|
2013-03-07 19:15:11 +04:00
|
|
|
is_unix = true;
|
2012-11-04 16:04:24 +04:00
|
|
|
} else {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2020-02-12 05:31:01 +03:00
|
|
|
p = uri->path ? uri->path : "";
|
|
|
|
if (p[0] == '/') {
|
|
|
|
p++;
|
|
|
|
}
|
2012-11-04 16:04:24 +04:00
|
|
|
if (p[0]) {
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "export", p);
|
2012-11-04 16:04:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
qp = query_params_parse(uri->query);
|
2013-03-07 19:15:11 +04:00
|
|
|
if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
|
2012-11-04 16:04:24 +04:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2013-03-07 19:15:11 +04:00
|
|
|
if (is_unix) {
|
2012-11-04 16:04:24 +04:00
|
|
|
/* nbd+unix:///export?socket=path */
|
|
|
|
if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "server.type", "unix");
|
|
|
|
qdict_put_str(options, "server.path", qp->p[0].value);
|
2012-11-04 16:04:24 +04:00
|
|
|
} else {
|
2013-06-03 19:54:56 +04:00
|
|
|
QString *host;
|
2016-10-25 16:11:35 +03:00
|
|
|
char *port_str;
|
|
|
|
|
2013-03-18 19:56:05 +04:00
|
|
|
/* nbd[+tcp]://host[:port]/export */
|
2012-11-04 16:04:24 +04:00
|
|
|
if (!uri->server) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
2013-03-15 14:55:29 +04:00
|
|
|
|
2013-06-03 19:54:56 +04:00
|
|
|
/* strip braces from literal IPv6 address */
|
|
|
|
if (uri->server[0] == '[') {
|
|
|
|
host = qstring_from_substr(uri->server, 1,
|
2018-07-27 09:22:04 +03:00
|
|
|
strlen(uri->server) - 1);
|
2013-06-03 19:54:56 +04:00
|
|
|
} else {
|
|
|
|
host = qstring_from_str(uri->server);
|
|
|
|
}
|
|
|
|
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "server.type", "inet");
|
nbd: Tidy up blockdev-add interface
SocketAddress is a simple union, and simple unions are awkward: they
have their variant members wrapped in a "data" object on the wire, and
require additional indirections in C. I intend to limit its use to
existing external interfaces, and convert all internal interfaces to
SocketAddressFlat.
BlockdevOptionsNbd is an external interface using SocketAddress. We
already use SocketAddressFlat elsewhere in blockdev-add. Replace it
by SocketAddressFlat while we can (it's new in 2.9) for simplicity and
consistency. For example,
{ "execute": "blockdev-add",
"arguments": { "node-name": "foo", "driver": "nbd",
"server": { "type": "inet",
"data": { "host": "localhost",
"port": "12345" } } } }
becomes
{ "execute": "blockdev-add",
"arguments": { "node-name": "foo", "driver": "nbd",
"server": { "type": "inet",
"host": "localhost", "port": "12345" } } }
Since the internal interfaces still take SocketAddress, this requires
conversion function socket_address_crumple(). It'll go away when I
update the interfaces.
Unfortunately, SocketAddress is also visible in -drive since 2.8:
-drive if=none,driver=nbd,server.type=inet,server.data.host=127.0.0.1,server.data.port=12345
Nobody should be using it, as it's fairly new and has never been
documented, so adding still more compatibility gunk to keep it working
isn't worth the trouble. You now have to use
-drive if=none,driver=nbd,server.type=inet,server.host=127.0.0.1,server.port=12345
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-id: 1490895797-29094-9-git-send-email-armbru@redhat.com
[mreitz: Change iotest 147 accordingly]
Because of this interface change, iotest 147 has to be adapted.
Unfortunately, we cannot just flatten all of the addresses because
nbd-server-start still takes a plain SocketAddress. Therefore, we need
both and this is most easily achieved by writing the SocketAddress into
the code and flattening it where necessary.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170330221243.17333-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2017-03-30 20:43:16 +03:00
|
|
|
qdict_put(options, "server.host", host);
|
2016-10-25 16:11:35 +03:00
|
|
|
|
|
|
|
port_str = g_strdup_printf("%d", uri->port ?: NBD_DEFAULT_PORT);
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "server.port", port_str);
|
2016-10-25 16:11:35 +03:00
|
|
|
g_free(port_str);
|
2012-11-04 16:04:24 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (qp) {
|
|
|
|
query_params_free(qp);
|
|
|
|
}
|
|
|
|
uri_free(uri);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2016-10-25 16:11:33 +03:00
|
|
|
static bool nbd_has_filename_options_conflict(QDict *options, Error **errp)
|
|
|
|
{
|
|
|
|
const QDictEntry *e;
|
|
|
|
|
|
|
|
for (e = qdict_first(options); e; e = qdict_next(options, e)) {
|
|
|
|
if (!strcmp(e->key, "host") ||
|
|
|
|
!strcmp(e->key, "port") ||
|
|
|
|
!strcmp(e->key, "path") ||
|
2016-10-25 16:11:34 +03:00
|
|
|
!strcmp(e->key, "export") ||
|
|
|
|
strstart(e->key, "server.", NULL))
|
2016-10-25 16:11:33 +03:00
|
|
|
{
|
|
|
|
error_setg(errp, "Option '%s' cannot be used with a file name",
|
|
|
|
e->key);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-03-15 21:47:22 +04:00
|
|
|
static void nbd_parse_filename(const char *filename, QDict *options,
|
|
|
|
Error **errp)
|
2008-07-03 17:41:03 +04:00
|
|
|
{
|
2019-08-24 20:28:12 +03:00
|
|
|
g_autofree char *file = NULL;
|
2011-02-22 18:44:54 +03:00
|
|
|
char *export_name;
|
|
|
|
const char *host_spec;
|
2008-07-03 17:41:03 +04:00
|
|
|
const char *unixpath;
|
|
|
|
|
2016-10-25 16:11:33 +03:00
|
|
|
if (nbd_has_filename_options_conflict(options, errp)) {
|
2013-03-20 22:23:23 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-11-04 16:04:24 +04:00
|
|
|
if (strstr(filename, "://")) {
|
2013-03-15 21:47:22 +04:00
|
|
|
int ret = nbd_parse_uri(filename, options);
|
|
|
|
if (ret < 0) {
|
|
|
|
error_setg(errp, "No valid URL specified");
|
|
|
|
}
|
|
|
|
return;
|
2012-11-04 16:04:24 +04:00
|
|
|
}
|
|
|
|
|
2011-08-21 07:09:37 +04:00
|
|
|
file = g_strdup(filename);
|
2010-08-26 00:48:33 +04:00
|
|
|
|
2011-02-22 18:44:54 +03:00
|
|
|
export_name = strstr(file, EN_OPTSTR);
|
|
|
|
if (export_name) {
|
|
|
|
if (export_name[strlen(EN_OPTSTR)] == 0) {
|
2019-08-24 20:28:12 +03:00
|
|
|
return;
|
2010-08-26 00:48:33 +04:00
|
|
|
}
|
2011-02-22 18:44:54 +03:00
|
|
|
export_name[0] = 0; /* truncate 'file' */
|
|
|
|
export_name += strlen(EN_OPTSTR);
|
2013-03-07 19:15:11 +04:00
|
|
|
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "export", export_name);
|
2010-08-26 00:48:33 +04:00
|
|
|
}
|
|
|
|
|
2011-02-22 18:44:54 +03:00
|
|
|
/* extract the host_spec - fail if it's not nbd:... */
|
|
|
|
if (!strstart(file, "nbd:", &host_spec)) {
|
2013-03-15 21:47:22 +04:00
|
|
|
error_setg(errp, "File name string for NBD must start with 'nbd:'");
|
2019-08-24 20:28:12 +03:00
|
|
|
return;
|
2010-08-26 00:48:33 +04:00
|
|
|
}
|
2008-07-03 17:41:03 +04:00
|
|
|
|
2013-03-07 19:15:11 +04:00
|
|
|
if (!*host_spec) {
|
2019-08-24 20:28:12 +03:00
|
|
|
return;
|
2013-03-07 19:15:11 +04:00
|
|
|
}
|
|
|
|
|
2011-02-22 18:44:54 +03:00
|
|
|
/* are we a UNIX or TCP socket? */
|
|
|
|
if (strstart(host_spec, "unix:", &unixpath)) {
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "server.type", "unix");
|
|
|
|
qdict_put_str(options, "server.path", unixpath);
|
2008-07-03 17:41:03 +04:00
|
|
|
} else {
|
2017-04-26 10:36:37 +03:00
|
|
|
InetSocketAddress *addr = g_new(InetSocketAddress, 1);
|
2013-03-07 19:15:11 +04:00
|
|
|
|
2017-04-26 10:36:37 +03:00
|
|
|
if (inet_parse(addr, host_spec, errp)) {
|
|
|
|
goto out_inet;
|
2013-03-15 14:55:29 +04:00
|
|
|
}
|
2008-07-03 17:41:03 +04:00
|
|
|
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(options, "server.type", "inet");
|
|
|
|
qdict_put_str(options, "server.host", addr->host);
|
|
|
|
qdict_put_str(options, "server.port", addr->port);
|
2017-04-26 10:36:37 +03:00
|
|
|
out_inet:
|
2013-03-07 19:15:11 +04:00
|
|
|
qapi_free_InetSocketAddress(addr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-10-25 16:11:34 +03:00
|
|
|
static bool nbd_process_legacy_socket_options(QDict *output_options,
|
|
|
|
QemuOpts *legacy_opts,
|
|
|
|
Error **errp)
|
2013-03-07 19:15:11 +04:00
|
|
|
{
|
2016-10-25 16:11:34 +03:00
|
|
|
const char *path = qemu_opt_get(legacy_opts, "path");
|
|
|
|
const char *host = qemu_opt_get(legacy_opts, "host");
|
|
|
|
const char *port = qemu_opt_get(legacy_opts, "port");
|
|
|
|
const QDictEntry *e;
|
2013-03-07 19:15:11 +04:00
|
|
|
|
2016-10-25 16:11:34 +03:00
|
|
|
if (!path && !host && !port) {
|
|
|
|
return true;
|
|
|
|
}
|
2016-08-15 16:29:26 +03:00
|
|
|
|
2016-10-25 16:11:34 +03:00
|
|
|
for (e = qdict_first(output_options); e; e = qdict_next(output_options, e))
|
|
|
|
{
|
|
|
|
if (strstart(e->key, "server.", NULL)) {
|
|
|
|
error_setg(errp, "Cannot use 'server' and path/host/port at the "
|
|
|
|
"same time");
|
|
|
|
return false;
|
2013-03-20 22:23:23 +04:00
|
|
|
}
|
2011-02-22 18:44:54 +03:00
|
|
|
}
|
2016-10-25 16:11:34 +03:00
|
|
|
|
|
|
|
if (path && host) {
|
|
|
|
error_setg(errp, "path and host may not be used at the same time");
|
|
|
|
return false;
|
|
|
|
} else if (path) {
|
|
|
|
if (port) {
|
|
|
|
error_setg(errp, "port may not be used without host");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(output_options, "server.type", "unix");
|
|
|
|
qdict_put_str(output_options, "server.path", path);
|
2016-10-25 16:11:34 +03:00
|
|
|
} else if (host) {
|
2017-04-28 00:58:17 +03:00
|
|
|
qdict_put_str(output_options, "server.type", "inet");
|
|
|
|
qdict_put_str(output_options, "server.host", host);
|
|
|
|
qdict_put_str(output_options, "server.port",
|
|
|
|
port ?: stringify(NBD_DEFAULT_PORT));
|
2016-10-25 16:11:30 +03:00
|
|
|
}
|
2013-03-07 19:15:11 +04:00
|
|
|
|
2016-10-25 16:11:34 +03:00
|
|
|
return true;
|
|
|
|
}
|
2013-03-07 19:15:11 +04:00
|
|
|
|
2017-04-26 10:36:40 +03:00
|
|
|
static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options,
|
|
|
|
Error **errp)
|
2016-10-25 16:11:34 +03:00
|
|
|
{
|
2017-04-26 10:36:40 +03:00
|
|
|
SocketAddress *saddr = NULL;
|
2016-10-25 16:11:34 +03:00
|
|
|
QDict *addr = NULL;
|
|
|
|
Visitor *iv = NULL;
|
|
|
|
|
|
|
|
qdict_extract_subqdict(options, &addr, "server.");
|
|
|
|
if (!qdict_size(addr)) {
|
|
|
|
error_setg(errp, "NBD server address missing");
|
|
|
|
goto done;
|
2013-03-07 19:15:11 +04:00
|
|
|
}
|
|
|
|
|
2018-06-14 22:14:33 +03:00
|
|
|
iv = qobject_input_visitor_new_flat_confused(addr, errp);
|
|
|
|
if (!iv) {
|
2016-10-25 16:11:34 +03:00
|
|
|
goto done;
|
|
|
|
}
|
2013-03-18 19:56:05 +04:00
|
|
|
|
2020-07-07 19:06:03 +03:00
|
|
|
if (!visit_type_SocketAddress(iv, NULL, &saddr, errp)) {
|
2016-10-25 16:11:34 +03:00
|
|
|
goto done;
|
|
|
|
}
|
2015-09-16 16:52:22 +03:00
|
|
|
|
2021-06-10 13:07:36 +03:00
|
|
|
if (socket_address_parse_named_fd(saddr, errp) < 0) {
|
|
|
|
qapi_free_SocketAddress(saddr);
|
|
|
|
saddr = NULL;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
2016-10-25 16:11:34 +03:00
|
|
|
done:
|
2018-04-19 18:01:43 +03:00
|
|
|
qobject_unref(addr);
|
2016-10-25 16:11:34 +03:00
|
|
|
visit_free(iv);
|
2015-09-16 16:52:22 +03:00
|
|
|
return saddr;
|
2011-02-22 18:44:54 +03:00
|
|
|
}
|
2010-08-26 00:48:33 +04:00
|
|
|
|
2016-02-10 21:41:12 +03:00
|
|
|
static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp)
|
|
|
|
{
|
|
|
|
Object *obj;
|
|
|
|
QCryptoTLSCreds *creds;
|
|
|
|
|
|
|
|
obj = object_resolve_path_component(
|
|
|
|
object_get_objects_root(), id);
|
|
|
|
if (!obj) {
|
|
|
|
error_setg(errp, "No TLS credentials with id '%s'",
|
|
|
|
id);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
creds = (QCryptoTLSCreds *)
|
|
|
|
object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS);
|
|
|
|
if (!creds) {
|
|
|
|
error_setg(errp, "Object with id '%s' is not TLS credentials",
|
|
|
|
id);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-06-28 19:09:09 +03:00
|
|
|
if (!qcrypto_tls_creds_check_endpoint(creds,
|
|
|
|
QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT,
|
|
|
|
errp)) {
|
2016-02-10 21:41:12 +03:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
object_ref(obj);
|
|
|
|
return creds;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2016-08-15 16:29:24 +03:00
|
|
|
static QemuOptsList nbd_runtime_opts = {
|
|
|
|
.name = "nbd",
|
|
|
|
.head = QTAILQ_HEAD_INITIALIZER(nbd_runtime_opts.head),
|
|
|
|
.desc = {
|
|
|
|
{
|
|
|
|
.name = "host",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "TCP host to connect to",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "port",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "TCP port to connect to",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "path",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "Unix socket path to connect to",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "export",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "Name of the NBD export to open",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
.name = "tls-creds",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "ID of the TLS credentials to use",
|
|
|
|
},
|
2018-07-02 22:14:57 +03:00
|
|
|
{
|
|
|
|
.name = "x-dirty-bitmap",
|
|
|
|
.type = QEMU_OPT_STRING,
|
|
|
|
.help = "experimental: expose named dirty bitmap in place of "
|
|
|
|
"block status",
|
|
|
|
},
|
2019-06-18 14:43:23 +03:00
|
|
|
{
|
|
|
|
.name = "reconnect-delay",
|
|
|
|
.type = QEMU_OPT_NUMBER,
|
|
|
|
.help = "On an unexpected disconnect, the nbd client tries to "
|
|
|
|
"connect again until succeeding or encountering a serious "
|
|
|
|
"error. During the first @reconnect-delay seconds, all "
|
|
|
|
"requests are paused and will be rerun on a successful "
|
|
|
|
"reconnect. After that time, any delayed requests and all "
|
|
|
|
"future requests before a successful reconnect will "
|
|
|
|
"immediately fail. Default 0",
|
|
|
|
},
|
2021-09-06 22:06:48 +03:00
|
|
|
{
|
|
|
|
.name = "open-timeout",
|
|
|
|
.type = QEMU_OPT_NUMBER,
|
|
|
|
.help = "In seconds. If zero, the nbd driver tries the connection "
|
|
|
|
"only once, and fails to open if the connection fails. "
|
|
|
|
"If non-zero, the nbd driver will repeat connection "
|
|
|
|
"attempts until successful or until @open-timeout seconds "
|
|
|
|
"have elapsed. Default 0",
|
|
|
|
},
|
block/nbd: fix segmentation fault when .desc is not null-terminated
The find_desc_by_name() from util/qemu-option.c relies on the .name not being
NULL to call strcmp(). This check becomes unsafe when the list is not
NULL-terminated, which is the case of nbd_runtime_opts in block/nbd.c, and can
result in segmentation fault when strcmp() tries to access an invalid memory:
#0 0x00007fff8c75f7d4 in __strcmp_power9 () from /lib64/libc.so.6
#1 0x00000000102d3ec8 in find_desc_by_name (desc=0x1036d6f0, name=0x28e46670 "server.path") at util/qemu-option.c:166
#2 0x00000000102d93e0 in qemu_opts_absorb_qdict (opts=0x28e47a80, qdict=0x28e469a0, errp=0x7fffec247c98) at util/qemu-option.c:1026
#3 0x000000001012a2e4 in nbd_open (bs=0x28e42290, options=0x28e469a0, flags=24578, errp=0x7fffec247d80) at block/nbd.c:406
#4 0x00000000100144e8 in bdrv_open_driver (bs=0x28e42290, drv=0x1036e070 <bdrv_nbd_unix>, node_name=0x0, options=0x28e469a0, open_flags=24578, errp=0x7fffec247f50) at block.c:1135
#5 0x0000000010015b04 in bdrv_open_common (bs=0x28e42290, file=0x0, options=0x28e469a0, errp=0x7fffec247f50) at block.c:1395
>From gdb, the desc[i].name was not NULL and resulted in strcmp() accessing an
invalid memory:
>>> p desc[5]
$8 = {
name = 0x1037f098 "R27A",
type = 1561964883,
help = 0xc0bbb23e <error: Cannot access memory at address 0xc0bbb23e>,
def_value_str = 0x2 <error: Cannot access memory at address 0x2>
}
>>> p desc[6]
$9 = {
name = 0x103dac78 <__gcov0.do_qemu_init_bdrv_nbd_init> "\001",
type = 272101528,
help = 0x29ec0b754403e31f <error: Cannot access memory at address 0x29ec0b754403e31f>,
def_value_str = 0x81f343b9 <error: Cannot access memory at address 0x81f343b9>
}
This patch fixes the segmentation fault in strcmp() by adding a NULL element at
the end of nbd_runtime_opts.desc list, which is the common practice to most of
other structs like runtime_opts in block/null.c. Thus, the desc[i].name != NULL
check becomes safe because it will not evaluate to true when .desc list reached
its end.
Reported-by: R. Nageswara Sastry <nasastry@in.ibm.com>
Buglink: https://bugs.launchpad.net/qemu/+bug/1727259
Signed-off-by: Murilo Opsfelder Araujo <muriloo@linux.vnet.ibm.com>
Message-Id: <20180105133241.14141-2-muriloo@linux.vnet.ibm.com>
CC: qemu-stable@nongnu.org
Fixes: 7ccc44fd7d1dfa62c4d6f3a680df809d6e7068ce
Signed-off-by: Eric Blake <eblake@redhat.com>
2018-01-05 16:32:41 +03:00
|
|
|
{ /* end of list */ }
|
2016-08-15 16:29:24 +03:00
|
|
|
},
|
|
|
|
};
|
|
|
|
|
2019-06-18 14:43:24 +03:00
|
|
|
static int nbd_process_options(BlockDriverState *bs, QDict *options,
|
|
|
|
Error **errp)
|
2011-02-22 18:44:54 +03:00
|
|
|
{
|
|
|
|
BDRVNBDState *s = bs->opaque;
|
2019-06-18 14:43:24 +03:00
|
|
|
QemuOpts *opts;
|
2016-02-10 21:41:12 +03:00
|
|
|
int ret = -EINVAL;
|
2011-09-08 16:28:59 +04:00
|
|
|
|
2016-08-15 16:29:24 +03:00
|
|
|
opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort);
|
2020-07-07 19:06:03 +03:00
|
|
|
if (!qemu_opts_absorb_qdict(opts, options, errp)) {
|
2016-08-15 16:29:24 +03:00
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2017-04-26 10:36:40 +03:00
|
|
|
/* Translate @host, @port, and @path to a SocketAddress */
|
2016-10-25 16:11:34 +03:00
|
|
|
if (!nbd_process_legacy_socket_options(options, opts, errp)) {
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2011-02-22 18:44:54 +03:00
|
|
|
/* Pop the config into our state object. Exit if invalid. */
|
2016-10-25 16:11:34 +03:00
|
|
|
s->saddr = nbd_config(s, options, errp);
|
|
|
|
if (!s->saddr) {
|
2016-02-10 21:41:12 +03:00
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2016-10-25 16:11:34 +03:00
|
|
|
s->export = g_strdup(qemu_opt_get(opts, "export"));
|
2019-11-14 05:46:34 +03:00
|
|
|
if (s->export && strlen(s->export) > NBD_MAX_STRING_SIZE) {
|
|
|
|
error_setg(errp, "export name too long to send to server");
|
|
|
|
goto error;
|
|
|
|
}
|
2016-10-25 16:11:34 +03:00
|
|
|
|
2016-08-15 16:29:26 +03:00
|
|
|
s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds"));
|
|
|
|
if (s->tlscredsid) {
|
2019-06-18 14:43:24 +03:00
|
|
|
s->tlscreds = nbd_get_tls_creds(s->tlscredsid, errp);
|
|
|
|
if (!s->tlscreds) {
|
2016-02-10 21:41:12 +03:00
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2017-03-30 20:43:09 +03:00
|
|
|
/* TODO SOCKET_ADDRESS_KIND_FD where fd has AF_INET or AF_INET6 */
|
2017-04-26 10:36:40 +03:00
|
|
|
if (s->saddr->type != SOCKET_ADDRESS_TYPE_INET) {
|
2016-02-10 21:41:12 +03:00
|
|
|
error_setg(errp, "TLS only supported over IP sockets");
|
|
|
|
goto error;
|
|
|
|
}
|
2019-06-18 14:43:24 +03:00
|
|
|
s->hostname = s->saddr->u.inet.host;
|
2011-02-22 18:44:54 +03:00
|
|
|
}
|
|
|
|
|
2019-06-18 14:43:24 +03:00
|
|
|
s->x_dirty_bitmap = g_strdup(qemu_opt_get(opts, "x-dirty-bitmap"));
|
2019-11-14 05:46:34 +03:00
|
|
|
if (s->x_dirty_bitmap && strlen(s->x_dirty_bitmap) > NBD_MAX_STRING_SIZE) {
|
|
|
|
error_setg(errp, "x-dirty-bitmap query too long to send to server");
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2019-06-18 14:43:24 +03:00
|
|
|
s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0);
|
2021-09-06 22:06:48 +03:00
|
|
|
s->open_timeout = qemu_opt_get_number(opts, "open-timeout", 0);
|
2019-06-18 14:43:24 +03:00
|
|
|
|
|
|
|
ret = 0;
|
2019-02-01 16:01:34 +03:00
|
|
|
|
2016-02-10 21:41:12 +03:00
|
|
|
error:
|
2016-08-15 16:29:24 +03:00
|
|
|
qemu_opts_del(opts);
|
2016-02-10 21:41:12 +03:00
|
|
|
return ret;
|
2011-10-20 15:16:23 +04:00
|
|
|
}
|
|
|
|
|
2019-06-18 14:43:24 +03:00
|
|
|
static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
|
|
|
|
|
|
|
s->bs = bs;
|
|
|
|
qemu_co_mutex_init(&s->send_mutex);
|
|
|
|
qemu_co_queue_init(&s->free_sema);
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
qemu_co_mutex_init(&s->receive_mutex);
|
2019-06-18 14:43:24 +03:00
|
|
|
|
2020-12-28 18:08:44 +03:00
|
|
|
if (!yank_register_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name), errp)) {
|
|
|
|
return -EEXIST;
|
|
|
|
}
|
|
|
|
|
2021-06-10 13:07:33 +03:00
|
|
|
ret = nbd_process_options(bs, options, errp);
|
|
|
|
if (ret < 0) {
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
2021-06-10 13:07:53 +03:00
|
|
|
s->conn = nbd_client_connection_new(s->saddr, true, s->export,
|
|
|
|
s->x_dirty_bitmap, s->tlscreds);
|
2021-06-10 13:07:37 +03:00
|
|
|
|
2021-09-06 22:06:48 +03:00
|
|
|
if (s->open_timeout) {
|
|
|
|
nbd_client_connection_enable_retry(s->conn);
|
|
|
|
open_timer_init(s, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
|
|
|
|
s->open_timeout * NANOSECONDS_PER_SECOND);
|
|
|
|
}
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
s->state = NBD_CLIENT_CONNECTING_WAIT;
|
2021-06-10 13:08:00 +03:00
|
|
|
ret = nbd_do_establish_connection(bs, errp);
|
2019-06-18 14:43:24 +03:00
|
|
|
if (ret < 0) {
|
2021-06-10 13:07:33 +03:00
|
|
|
goto fail;
|
2019-06-18 14:43:24 +03:00
|
|
|
}
|
|
|
|
|
2022-02-04 14:10:07 +03:00
|
|
|
/*
|
|
|
|
* The connect attempt is done, so we no longer need this timer.
|
|
|
|
* Delete it, because we do not want it to be around when this node
|
|
|
|
* is drained or closed.
|
|
|
|
*/
|
|
|
|
open_timer_del(s);
|
|
|
|
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
nbd_client_connection_enable_retry(s->conn);
|
2019-06-18 14:43:24 +03:00
|
|
|
|
|
|
|
return 0;
|
2021-06-10 13:07:33 +03:00
|
|
|
|
|
|
|
fail:
|
2022-02-04 14:10:07 +03:00
|
|
|
open_timer_del(s);
|
2021-06-10 13:07:33 +03:00
|
|
|
nbd_clear_bdrvstate(bs);
|
|
|
|
return ret;
|
2019-06-18 14:43:24 +03:00
|
|
|
}
|
|
|
|
|
2011-10-21 15:17:14 +04:00
|
|
|
static int nbd_co_flush(BlockDriverState *bs)
|
|
|
|
{
|
2015-02-07 00:06:16 +03:00
|
|
|
return nbd_client_co_flush(bs);
|
2011-10-21 15:17:14 +04:00
|
|
|
}
|
|
|
|
|
2015-02-06 14:24:43 +03:00
|
|
|
static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
|
|
|
|
{
|
2019-06-11 13:27:20 +03:00
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
nbd: Honor server's advertised minimum block size
Commit 79ba8c98 (v2.7) changed the setting of request_alignment
to occur only during bdrv_refresh_limits(), rather than at at
bdrv_open() time; but at the time, NBD was unaffected, because
it still used sector-based callbacks, so the block layer
defaulted NBD to use 512 request_alignment.
Later, commit 70c4fb26 (also v2.7) changed NBD to use byte-based
callbacks, without setting request_alignment. This resulted in
NBD using request_alignment of 1, which works great when the
server supports it (as is the case for qemu-nbd), but falls apart
miserably if the server requires alignment (but only if qemu
actually sends a sub-sector request; qemu-io can do it, but
most qemu operations still perform on sectors or larger).
Even later, the NBD protocol was updated to document that clients
should learn the server's minimum alignment during NBD_OPT_GO;
and recommended that clients should assume a minimum size of 512
unless the server understands NBD_OPT_GO and replied with a smaller
size. Commit 081dd1fe (v2.10) attempted to do that, by assigning
request_alignment to whatever was learned from the server; but
it has two flaws: the assignment is done during bdrv_open() so
it gets unconditionally wiped out back to 1 during any later
bdrv_refresh_limits(); and the code is not using a default of 512
when the server did not report a minimum size.
Fix these issues by moving the assignment to request_alignment
to the right function, and by using a sane default when the
server does not advertise a minimum size.
CC: qemu-stable@nongnu.org
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180215032905.27146-1-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy<vsementsov@virtuozzo.com>
2018-02-15 06:29:05 +03:00
|
|
|
uint32_t min = s->info.min_block;
|
nbd: Implement NBD_INFO_BLOCK_SIZE on client
The upstream NBD Protocol has defined a new extension to allow
the server to advertise block sizes to the client, as well as
a way for the client to inform the server whether it intends to
obey block sizes.
When using the block layer as the client, we will obey block
sizes; but when used as 'qemu-nbd -c' to hand off to the
kernel nbd module as the client, we are still waiting for the
kernel to implement a way for us to learn if it will honor
block sizes (perhaps by an addition to sysfs, rather than an
ioctl), as well as any way to tell the kernel what additional
block sizes to obey (NBD_SET_BLKSIZE appears to be accurate
for the minimum size, but preferred and maximum sizes would
probably be new ioctl()s), so until then, we need to make our
request for block sizes conditional.
When using ioctl(NBD_SET_BLKSIZE) to hand off to the kernel,
use the minimum block size as the sector size if it is larger
than 512, which also has the nice effect of cooperating with
(non-qemu) servers that don't do read-modify-write when
exposing a block device with 4k sectors; it might also allow
us to visit a file larger than 2T on a 32-bit kernel.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-10-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-07-07 23:30:49 +03:00
|
|
|
uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);
|
|
|
|
|
nbd/client: Lower min_block for block-status, unaligned size
We have a latent bug in our NBD client code, tickled by the brand new
nbdkit 1.11.10 block status support:
$ nbdkit --filter=log --filter=truncate -U - \
data data="1" size=511 truncate=64K logfile=/dev/stdout \
--run 'qemu-img convert $nbd /var/tmp/out'
...
qemu-img: block/io.c:2122: bdrv_co_block_status: Assertion `*pnum && QEMU_IS_ALIGNED(*pnum, align) && align > offset - aligned_offset' failed.
The culprit? Our implementation of .bdrv_co_block_status can return
unaligned block status for any server that operates with a lower
actual alignment than what we tell the block layer in
request_alignment, in violation of the block layer's constraints. To
date, we've been unable to trip the bug, because qemu as NBD server
always advertises block sizing (at which point it is a server bug if
the server sends unaligned status - although qemu 3.1 is such a server
and I've sent separate patches for 4.0 both to get the server to obey
the spec, and to let the client to tolerate server oddities at EOF).
But nbdkit does not (yet) advertise block sizing, and therefore is not
in violation of the spec for returning block status at whatever
boundaries it wants, and those unaligned results can occur anywhere
rather than just at EOF. While we are still wise to avoid sending
sub-sector read/write requests to a server of unknown origin, we MUST
consider that a server telling us block status without an advertised
block size is correct. So, we either have to munge unaligned answers
from the server into aligned ones that we hand back to the block
layer, or we have to tell the block layer about a smaller alignment.
Similarly, if the server advertises an image size that is not
sector-aligned, we might as well assume that the server intends to let
us access those tail bytes, and therefore supports a minimum block
size of 1, regardless of whether the server supports block status
(although we still need more patches to fix the problem that with an
unaligned image, we can send read or block status requests that exceed
EOF to the server). Again, qemu as server cannot trip this problem
(because it rounds images to sector alignment), but nbdkit advertised
unaligned size even before it gained block status support.
Solve both alignment problems at once by using better heuristics on
what alignment to report to the block layer when the server did not
give us something to work with. Note that very few NBD servers
implement block status (to date, only qemu and nbdkit are known to do
so); and as the NBD spec mentioned block sizing constraints prior to
documenting block status, it can be assumed that any future
implementations of block status are aware that they must advertise
block size if they want a minimum size other than 1.
We've had a long history of struggles with picking the right alignment
to use in the block layer, as evidenced by the commit message of
fd8d372d (v2.12) that introduced the current choice of forced 512-byte
alignment.
There is no iotest coverage for this fix, because qemu can't provoke
it, and I didn't want to make test 241 dependent on nbdkit.
Fixes: fd8d372d
Reported-by: Richard W.M. Jones <rjones@redhat.com>
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20190329042750.14704-3-eblake@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Tested-by: Richard W.M. Jones <rjones@redhat.com>
2019-03-29 07:27:46 +03:00
|
|
|
/*
|
|
|
|
* If the server did not advertise an alignment:
|
|
|
|
* - a size that is not sector-aligned implies that an alignment
|
|
|
|
* of 1 can be used to access those tail bytes
|
|
|
|
* - advertisement of block status requires an alignment of 1, so
|
|
|
|
* that we don't violate block layer constraints that block
|
|
|
|
* status is always aligned (as we can't control whether the
|
|
|
|
* server will report sub-sector extents, such as a hole at EOF
|
|
|
|
* on an unaligned POSIX file)
|
|
|
|
* - otherwise, assume the server is so old that we are safer avoiding
|
|
|
|
* sub-sector requests
|
|
|
|
*/
|
|
|
|
if (!min) {
|
|
|
|
min = (!QEMU_IS_ALIGNED(s->info.size, BDRV_SECTOR_SIZE) ||
|
|
|
|
s->info.base_allocation) ? 1 : BDRV_SECTOR_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
bs->bl.request_alignment = min;
|
2020-04-01 18:01:08 +03:00
|
|
|
bs->bl.max_pdiscard = QEMU_ALIGN_DOWN(INT_MAX, min);
|
nbd: Implement NBD_INFO_BLOCK_SIZE on client
The upstream NBD Protocol has defined a new extension to allow
the server to advertise block sizes to the client, as well as
a way for the client to inform the server whether it intends to
obey block sizes.
When using the block layer as the client, we will obey block
sizes; but when used as 'qemu-nbd -c' to hand off to the
kernel nbd module as the client, we are still waiting for the
kernel to implement a way for us to learn if it will honor
block sizes (perhaps by an addition to sysfs, rather than an
ioctl), as well as any way to tell the kernel what additional
block sizes to obey (NBD_SET_BLKSIZE appears to be accurate
for the minimum size, but preferred and maximum sizes would
probably be new ioctl()s), so until then, we need to make our
request for block sizes conditional.
When using ioctl(NBD_SET_BLKSIZE) to hand off to the kernel,
use the minimum block size as the sector size if it is larger
than 512, which also has the nice effect of cooperating with
(non-qemu) servers that don't do read-modify-write when
exposing a block device with 4k sectors; it might also allow
us to visit a file larger than 2T on a 32-bit kernel.
Signed-off-by: Eric Blake <eblake@redhat.com>
Message-Id: <20170707203049.534-10-eblake@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-07-07 23:30:49 +03:00
|
|
|
bs->bl.max_pwrite_zeroes = max;
|
|
|
|
bs->bl.max_transfer = max;
|
|
|
|
|
|
|
|
if (s->info.opt_block &&
|
|
|
|
s->info.opt_block > bs->bl.opt_transfer) {
|
|
|
|
bs->bl.opt_transfer = s->info.opt_block;
|
|
|
|
}
|
2015-02-06 14:24:43 +03:00
|
|
|
}
|
|
|
|
|
2008-07-03 17:41:03 +04:00
|
|
|
static void nbd_close(BlockDriverState *bs)
|
|
|
|
{
|
2015-02-07 00:06:16 +03:00
|
|
|
nbd_client_close(bs);
|
2021-06-10 13:07:33 +03:00
|
|
|
nbd_clear_bdrvstate(bs);
|
2008-07-03 17:41:03 +04:00
|
|
|
}
|
|
|
|
|
2020-07-28 00:58:43 +03:00
|
|
|
/*
|
|
|
|
* NBD cannot truncate, but if the caller asks to truncate to the same size, or
|
|
|
|
* to a smaller size with exact=false, there is no reason to fail the
|
|
|
|
* operation.
|
|
|
|
*
|
|
|
|
* Preallocation mode is ignored since it does not seems useful to fail when
|
|
|
|
* we never change anything.
|
|
|
|
*/
|
|
|
|
static int coroutine_fn nbd_co_truncate(BlockDriverState *bs, int64_t offset,
|
|
|
|
bool exact, PreallocMode prealloc,
|
|
|
|
BdrvRequestFlags flags, Error **errp)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = bs->opaque;
|
|
|
|
|
|
|
|
if (offset != s->info.size && exact) {
|
|
|
|
error_setg(errp, "Cannot resize NBD nodes");
|
|
|
|
return -ENOTSUP;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (offset > s->info.size) {
|
|
|
|
error_setg(errp, "Cannot grow NBD nodes");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-07-03 17:41:03 +04:00
|
|
|
static int64_t nbd_getlength(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = bs->opaque;
|
|
|
|
|
2019-06-11 13:27:20 +03:00
|
|
|
return s->info.size;
|
2008-07-03 17:41:03 +04:00
|
|
|
}
|
|
|
|
|
2019-02-01 22:29:28 +03:00
|
|
|
static void nbd_refresh_filename(BlockDriverState *bs)
|
2014-07-18 22:24:59 +04:00
|
|
|
{
|
2016-08-15 16:29:26 +03:00
|
|
|
BDRVNBDState *s = bs->opaque;
|
2016-10-25 16:11:34 +03:00
|
|
|
const char *host = NULL, *port = NULL, *path = NULL;
|
2020-06-08 21:26:38 +03:00
|
|
|
size_t len = 0;
|
2016-10-25 16:11:34 +03:00
|
|
|
|
2017-04-26 10:36:40 +03:00
|
|
|
if (s->saddr->type == SOCKET_ADDRESS_TYPE_INET) {
|
nbd: Tidy up blockdev-add interface
SocketAddress is a simple union, and simple unions are awkward: they
have their variant members wrapped in a "data" object on the wire, and
require additional indirections in C. I intend to limit its use to
existing external interfaces, and convert all internal interfaces to
SocketAddressFlat.
BlockdevOptionsNbd is an external interface using SocketAddress. We
already use SocketAddressFlat elsewhere in blockdev-add. Replace it
by SocketAddressFlat while we can (it's new in 2.9) for simplicity and
consistency. For example,
{ "execute": "blockdev-add",
"arguments": { "node-name": "foo", "driver": "nbd",
"server": { "type": "inet",
"data": { "host": "localhost",
"port": "12345" } } } }
becomes
{ "execute": "blockdev-add",
"arguments": { "node-name": "foo", "driver": "nbd",
"server": { "type": "inet",
"host": "localhost", "port": "12345" } } }
Since the internal interfaces still take SocketAddress, this requires
conversion function socket_address_crumple(). It'll go away when I
update the interfaces.
Unfortunately, SocketAddress is also visible in -drive since 2.8:
-drive if=none,driver=nbd,server.type=inet,server.data.host=127.0.0.1,server.data.port=12345
Nobody should be using it, as it's fairly new and has never been
documented, so adding still more compatibility gunk to keep it working
isn't worth the trouble. You now have to use
-drive if=none,driver=nbd,server.type=inet,server.host=127.0.0.1,server.port=12345
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-id: 1490895797-29094-9-git-send-email-armbru@redhat.com
[mreitz: Change iotest 147 accordingly]
Because of this interface change, iotest 147 has to be adapted.
Unfortunately, we cannot just flatten all of the addresses because
nbd-server-start still takes a plain SocketAddress. Therefore, we need
both and this is most easily achieved by writing the SocketAddress into
the code and flattening it where necessary.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170330221243.17333-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2017-03-30 20:43:16 +03:00
|
|
|
const InetSocketAddress *inet = &s->saddr->u.inet;
|
2016-10-25 16:11:34 +03:00
|
|
|
if (!inet->has_ipv4 && !inet->has_ipv6 && !inet->has_to) {
|
|
|
|
host = inet->host;
|
|
|
|
port = inet->port;
|
|
|
|
}
|
2017-04-26 10:36:40 +03:00
|
|
|
} else if (s->saddr->type == SOCKET_ADDRESS_TYPE_UNIX) {
|
nbd: Tidy up blockdev-add interface
SocketAddress is a simple union, and simple unions are awkward: they
have their variant members wrapped in a "data" object on the wire, and
require additional indirections in C. I intend to limit its use to
existing external interfaces, and convert all internal interfaces to
SocketAddressFlat.
BlockdevOptionsNbd is an external interface using SocketAddress. We
already use SocketAddressFlat elsewhere in blockdev-add. Replace it
by SocketAddressFlat while we can (it's new in 2.9) for simplicity and
consistency. For example,
{ "execute": "blockdev-add",
"arguments": { "node-name": "foo", "driver": "nbd",
"server": { "type": "inet",
"data": { "host": "localhost",
"port": "12345" } } } }
becomes
{ "execute": "blockdev-add",
"arguments": { "node-name": "foo", "driver": "nbd",
"server": { "type": "inet",
"host": "localhost", "port": "12345" } } }
Since the internal interfaces still take SocketAddress, this requires
conversion function socket_address_crumple(). It'll go away when I
update the interfaces.
Unfortunately, SocketAddress is also visible in -drive since 2.8:
-drive if=none,driver=nbd,server.type=inet,server.data.host=127.0.0.1,server.data.port=12345
Nobody should be using it, as it's fairly new and has never been
documented, so adding still more compatibility gunk to keep it working
isn't worth the trouble. You now have to use
-drive if=none,driver=nbd,server.type=inet,server.host=127.0.0.1,server.port=12345
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-id: 1490895797-29094-9-git-send-email-armbru@redhat.com
[mreitz: Change iotest 147 accordingly]
Because of this interface change, iotest 147 has to be adapted.
Unfortunately, we cannot just flatten all of the addresses because
nbd-server-start still takes a plain SocketAddress. Therefore, we need
both and this is most easily achieved by writing the SocketAddress into
the code and flattening it where necessary.
Signed-off-by: Max Reitz <mreitz@redhat.com>
Message-id: 20170330221243.17333-1-mreitz@redhat.com
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
2017-03-30 20:43:16 +03:00
|
|
|
path = s->saddr->u.q_unix.path;
|
|
|
|
} /* else can't represent as pseudo-filename */
|
2014-07-18 22:24:59 +04:00
|
|
|
|
2016-10-25 16:11:34 +03:00
|
|
|
if (path && s->export) {
|
2020-06-08 21:26:38 +03:00
|
|
|
len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
|
|
|
|
"nbd+unix:///%s?socket=%s", s->export, path);
|
2016-10-25 16:11:34 +03:00
|
|
|
} else if (path && !s->export) {
|
2020-06-08 21:26:38 +03:00
|
|
|
len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
|
|
|
|
"nbd+unix://?socket=%s", path);
|
2016-10-25 16:11:34 +03:00
|
|
|
} else if (host && s->export) {
|
2020-06-08 21:26:38 +03:00
|
|
|
len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
|
|
|
|
"nbd://%s:%s/%s", host, port, s->export);
|
2016-10-25 16:11:34 +03:00
|
|
|
} else if (host && !s->export) {
|
2020-06-08 21:26:38 +03:00
|
|
|
len = snprintf(bs->exact_filename, sizeof(bs->exact_filename),
|
|
|
|
"nbd://%s:%s", host, port);
|
|
|
|
}
|
2020-06-23 00:03:55 +03:00
|
|
|
if (len >= sizeof(bs->exact_filename)) {
|
2020-06-08 21:26:38 +03:00
|
|
|
/* Name is too long to represent exactly, so leave it empty. */
|
|
|
|
bs->exact_filename[0] = '\0';
|
nbd: Fix filename generation
Export names may be used with nbd+unix, too, fix nbd_refresh_filename()
accordingly. Also, for nbd+tcp, the documented path schema is
"nbd://host[:port]/export", so use it. Furthermore, as can be seen from
that schema, the port is optional.
That makes six single cases for how the filename can be formatted; it is
not easy to generalize these cases without the resulting statement being
completely unreadable, thus there is simply one snprintf() per case.
Finally, taking the options from BDRVNBDState::socket_opts is wrong,
because those will not contain the export name. Just use
BlockDriverState::options instead.
Reported-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Max Reitz <mreitz@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
2014-10-08 21:55:15 +04:00
|
|
|
}
|
2014-07-18 22:24:59 +04:00
|
|
|
}
|
|
|
|
|
2019-02-01 22:29:21 +03:00
|
|
|
static char *nbd_dirname(BlockDriverState *bs, Error **errp)
|
|
|
|
{
|
|
|
|
/* The generic bdrv_dirname() implementation is able to work out some
|
|
|
|
* directory name for NBD nodes, but that would be wrong. So far there is no
|
|
|
|
* specification for how "export paths" would work, so NBD does not have
|
|
|
|
* directory names. */
|
|
|
|
error_setg(errp, "Cannot generate a base directory for NBD nodes");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2019-02-01 22:29:25 +03:00
|
|
|
static const char *const nbd_strong_runtime_opts[] = {
|
|
|
|
"path",
|
|
|
|
"host",
|
|
|
|
"port",
|
|
|
|
"export",
|
|
|
|
"tls-creds",
|
|
|
|
"server.",
|
|
|
|
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2021-02-05 19:37:12 +03:00
|
|
|
static void nbd_cancel_in_flight(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
|
|
|
|
|
|
|
|
reconnect_delay_timer_del(s);
|
|
|
|
|
|
|
|
if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
|
|
|
|
s->state = NBD_CLIENT_CONNECTING_NOWAIT;
|
|
|
|
qemu_co_queue_restart_all(&s->free_sema);
|
|
|
|
}
|
block/nbd: drop connection_co
OK, that's a big rewrite of the logic.
Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporary decreased in points where we want to allow drained section to
begin. One of these place is in another file: in nbd_read_eof() in
nbd/client.c.
We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.
Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.
The detailed list of changes below (in the sequence of diff hunks).
1. receiving coroutines are woken directly from nbd_channel_error, when
we change s->state
2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
and in nbd_teardown_connection() all requests should already be
finished (and reconnect is done from request). So
nbd_co_establish_connection_cancel() is called from
nbd_cancel_in_flight() (to cancel the request that is doing
nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
(previously we didn't need it, as reconnect delay only should cancel
active requests not the reconnection itself). But now reconnection
itself is done in the separate thread (we now call
nbd_client_connection_enable_retry() in nbd_open()), and we need to
cancel the requests that wait in nbd_co_establish_connection()
now).
2A. We do receive headers in request coroutine. But we also should
dispatch replies for other pending requests. So,
nbd_connection_entry() is turned into nbd_receive_replies(), which
does reply dispatching while it receives other request headers, and
returns when it receives the requested header.
3. All old staff around drained sections and context switch is dropped.
In details:
- we don't need to move connection_co to new aio context, as we
don't have connection_co anymore
- we don't have a fake "request" of connection_co (extra increasing
in_flight), so don't care with it in drain_begin/end
- we don't stop reconnection during drained section anymore. This
means that drain_begin may wait for a long time (up to
reconnect_delay). But that's an improvement and more correct
behavior see below[*]
4. In nbd_teardown_connection() we don't have to wait for
connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
is moved here from removed connection_co.
5. In nbd_co_do_establish_connection() we now should handle
NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
NBD_CLIENT_CONNECTING_NOWAIT, it still should call
nbd_co_establish_connection() (who knows, maybe the connection was
already established by another thread in the background). But we
shouldn't wait: if nbd_co_establish_connection() can't return new
channel immediately the request should fail (we are in
NBD_CLIENT_CONNECTING_NOWAIT state).
6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
other requests in the caller, so here we just assert that fact.
Also delay time is now initialized here: we can easily detect first
attempt and start a timer.
7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
retries are fully handle by thread (nbd/client-connection.c), delay
timer we initialize in nbd_reconnect_attempt(), we don't have to
bother with s->drained and friends. nbd_reconnect_attempt() now
called from nbd_co_send_request().
8. nbd_connection_entry is dropped: reconnect is now handled by
nbd_co_send_request(), receiving reply is now handled by
nbd_receive_replies(): all handled from request coroutines.
9. So, welcome new nbd_receive_replies() called from request coroutine,
that receives reply header instead of nbd_connection_entry().
Like with sending requests, only one coroutine may receive in a
moment. So we introduce receive_mutex, which is locked around
nbd_receive_reply(). It also protects some related fields. Still,
full audit of thread-safety in nbd driver is a separate task.
New function waits for a reply with specified handle being received
and works rather simple:
Under mutex:
- if current handle is 0, do receive by hand. If another handle
received - switch to other request coroutine, release mutex and
yield. Otherwise return success
- if current handle == requested handle, we are done
- otherwise, release mutex and yield
10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call
nbd_reconnect_attempt()
And this logic is protected by s->send_mutex
Also, on failure we don't have to care of removed s->connection_co
11. nbd_co_do_receive_one_chunk(): now instead of yield() and wait for
s->connection_co we just call new nbd_receive_replies().
12. nbd_co_receive_one_chunk(): place where s->reply.handle becomes 0,
which means that handling of the whole reply is finished. Here we
need to wake one of coroutines sleeping in nbd_receive_replies().
If none are sleeping - do nothing. That's another behavior change: we
don't have endless recv() in the idle time. It may be considered as
a drawback. If so, it may be fixed later.
13. nbd_reply_chunk_iter_receive(): don't care about removed
connection_co, just ping in_flight waiters.
14. Don't create connection_co, enable retry in the connection thread
(we don't have own reconnect loop anymore)
15. We now need to add a nbd_co_establish_connection_cancel() call in
nbd_cancel_in_flight(), to cancel the request that is doing a
connection attempt.
[*], ok, now we don't cancel reconnect on drain begin. That's correct:
reconnect feature leads to possibility of long-running requests (up
to reconnect delay). Still, drain begin is not a reason to kill
long requests. We should wait for them.
This also means, that we can again reproduce a dead-lock, described
in 8c517de24a8a1dcbeb54e7e12b5b0fda42a90ace.
Why we are OK with it:
1. Now this is not absolutely-dead dead-lock: the vm is unfrozen
after reconnect delay. Actually 8c517de24a8a1dc fixed a bug in
NBD logic, that was not described in 8c517de24a8a1dc and led to
forever dead-lock. The problem was that nobody woke the free_sema
queue, but drain_begin can't finish until there is a request in
free_sema queue. Now we have a reconnect delay timer that works
well.
2. It's not a problem of the NBD driver, but of the ide code,
because it does drain_begin under the global mutex; the problem
doesn't reproduce when using scsi instead of ide.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-Id: <20210902103805.25686-5-vsementsov@virtuozzo.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
[eblake: grammar and comment tweaks]
Signed-off-by: Eric Blake <eblake@redhat.com>
2021-09-02 13:38:04 +03:00
|
|
|
|
|
|
|
nbd_co_establish_connection_cancel(s->conn);
|
2021-02-05 19:37:12 +03:00
|
|
|
}
|
|
|
|
|
2022-02-04 14:10:11 +03:00
|
|
|
static void nbd_attach_aio_context(BlockDriverState *bs,
|
|
|
|
AioContext *new_context)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = bs->opaque;
|
|
|
|
|
|
|
|
/* The open_timer is used only during nbd_open() */
|
|
|
|
assert(!s->open_timer);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The reconnect_delay_timer is scheduled in I/O paths when the
|
|
|
|
* connection is lost, to cancel the reconnection attempt after a
|
|
|
|
* given time. Once this attempt is done (successfully or not),
|
|
|
|
* nbd_reconnect_attempt() ensures the timer is deleted before the
|
|
|
|
* respective I/O request is resumed.
|
|
|
|
* Since the AioContext can only be changed when a node is drained,
|
|
|
|
* the reconnect_delay_timer cannot be active here.
|
|
|
|
*/
|
|
|
|
assert(!s->reconnect_delay_timer);
|
|
|
|
|
|
|
|
if (s->ioc) {
|
|
|
|
qio_channel_attach_aio_context(s->ioc, new_context);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void nbd_detach_aio_context(BlockDriverState *bs)
|
|
|
|
{
|
|
|
|
BDRVNBDState *s = bs->opaque;
|
|
|
|
|
|
|
|
assert(!s->open_timer);
|
|
|
|
assert(!s->reconnect_delay_timer);
|
|
|
|
|
|
|
|
if (s->ioc) {
|
|
|
|
qio_channel_detach_aio_context(s->ioc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2009-05-10 02:03:42 +04:00
|
|
|
static BlockDriver bdrv_nbd = {
|
2014-05-08 18:34:43 +04:00
|
|
|
.format_name = "nbd",
|
|
|
|
.protocol_name = "nbd",
|
|
|
|
.instance_size = sizeof(BDRVNBDState),
|
|
|
|
.bdrv_parse_filename = nbd_parse_filename,
|
2020-03-26 04:12:18 +03:00
|
|
|
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
|
|
|
|
.create_opts = &bdrv_create_opts_simple,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_file_open = nbd_open,
|
2019-10-01 00:38:20 +03:00
|
|
|
.bdrv_reopen_prepare = nbd_client_reopen_prepare,
|
2016-07-16 02:23:07 +03:00
|
|
|
.bdrv_co_preadv = nbd_client_co_preadv,
|
|
|
|
.bdrv_co_pwritev = nbd_client_co_pwritev,
|
2016-10-14 21:33:18 +03:00
|
|
|
.bdrv_co_pwrite_zeroes = nbd_client_co_pwrite_zeroes,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_close = nbd_close,
|
|
|
|
.bdrv_co_flush_to_os = nbd_co_flush,
|
2016-07-16 02:23:02 +03:00
|
|
|
.bdrv_co_pdiscard = nbd_client_co_pdiscard,
|
2015-02-06 14:24:43 +03:00
|
|
|
.bdrv_refresh_limits = nbd_refresh_limits,
|
2020-07-28 00:58:43 +03:00
|
|
|
.bdrv_co_truncate = nbd_co_truncate,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_getlength = nbd_getlength,
|
2014-07-18 22:24:59 +04:00
|
|
|
.bdrv_refresh_filename = nbd_refresh_filename,
|
2018-03-12 18:21:23 +03:00
|
|
|
.bdrv_co_block_status = nbd_client_co_block_status,
|
2019-02-01 22:29:21 +03:00
|
|
|
.bdrv_dirname = nbd_dirname,
|
2019-02-01 22:29:25 +03:00
|
|
|
.strong_runtime_opts = nbd_strong_runtime_opts,
|
2021-02-05 19:37:12 +03:00
|
|
|
.bdrv_cancel_in_flight = nbd_cancel_in_flight,
|
2022-02-04 14:10:11 +03:00
|
|
|
|
|
|
|
.bdrv_attach_aio_context = nbd_attach_aio_context,
|
|
|
|
.bdrv_detach_aio_context = nbd_detach_aio_context,
|
2012-11-04 16:04:24 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
static BlockDriver bdrv_nbd_tcp = {
|
2014-05-08 18:34:43 +04:00
|
|
|
.format_name = "nbd",
|
|
|
|
.protocol_name = "nbd+tcp",
|
|
|
|
.instance_size = sizeof(BDRVNBDState),
|
|
|
|
.bdrv_parse_filename = nbd_parse_filename,
|
2020-03-26 04:12:18 +03:00
|
|
|
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
|
|
|
|
.create_opts = &bdrv_create_opts_simple,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_file_open = nbd_open,
|
2019-10-01 00:38:20 +03:00
|
|
|
.bdrv_reopen_prepare = nbd_client_reopen_prepare,
|
2016-07-16 02:23:07 +03:00
|
|
|
.bdrv_co_preadv = nbd_client_co_preadv,
|
|
|
|
.bdrv_co_pwritev = nbd_client_co_pwritev,
|
2016-10-14 21:33:18 +03:00
|
|
|
.bdrv_co_pwrite_zeroes = nbd_client_co_pwrite_zeroes,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_close = nbd_close,
|
|
|
|
.bdrv_co_flush_to_os = nbd_co_flush,
|
2016-07-16 02:23:02 +03:00
|
|
|
.bdrv_co_pdiscard = nbd_client_co_pdiscard,
|
2015-02-06 14:24:43 +03:00
|
|
|
.bdrv_refresh_limits = nbd_refresh_limits,
|
2020-07-28 00:58:43 +03:00
|
|
|
.bdrv_co_truncate = nbd_co_truncate,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_getlength = nbd_getlength,
|
2014-07-18 22:24:59 +04:00
|
|
|
.bdrv_refresh_filename = nbd_refresh_filename,
|
2018-03-12 18:21:23 +03:00
|
|
|
.bdrv_co_block_status = nbd_client_co_block_status,
|
2019-02-01 22:29:21 +03:00
|
|
|
.bdrv_dirname = nbd_dirname,
|
2019-02-01 22:29:25 +03:00
|
|
|
.strong_runtime_opts = nbd_strong_runtime_opts,
|
2021-02-05 19:37:12 +03:00
|
|
|
.bdrv_cancel_in_flight = nbd_cancel_in_flight,
|
2022-02-04 14:10:11 +03:00
|
|
|
|
|
|
|
.bdrv_attach_aio_context = nbd_attach_aio_context,
|
|
|
|
.bdrv_detach_aio_context = nbd_detach_aio_context,
|
2012-11-04 16:04:24 +04:00
|
|
|
};
|
|
|
|
|
|
|
|
static BlockDriver bdrv_nbd_unix = {
|
2014-05-08 18:34:43 +04:00
|
|
|
.format_name = "nbd",
|
|
|
|
.protocol_name = "nbd+unix",
|
|
|
|
.instance_size = sizeof(BDRVNBDState),
|
|
|
|
.bdrv_parse_filename = nbd_parse_filename,
|
2020-03-26 04:12:18 +03:00
|
|
|
.bdrv_co_create_opts = bdrv_co_create_opts_simple,
|
|
|
|
.create_opts = &bdrv_create_opts_simple,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_file_open = nbd_open,
|
2019-10-01 00:38:20 +03:00
|
|
|
.bdrv_reopen_prepare = nbd_client_reopen_prepare,
|
2016-07-16 02:23:07 +03:00
|
|
|
.bdrv_co_preadv = nbd_client_co_preadv,
|
|
|
|
.bdrv_co_pwritev = nbd_client_co_pwritev,
|
2016-10-14 21:33:18 +03:00
|
|
|
.bdrv_co_pwrite_zeroes = nbd_client_co_pwrite_zeroes,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_close = nbd_close,
|
|
|
|
.bdrv_co_flush_to_os = nbd_co_flush,
|
2016-07-16 02:23:02 +03:00
|
|
|
.bdrv_co_pdiscard = nbd_client_co_pdiscard,
|
2015-02-06 14:24:43 +03:00
|
|
|
.bdrv_refresh_limits = nbd_refresh_limits,
|
2020-07-28 00:58:43 +03:00
|
|
|
.bdrv_co_truncate = nbd_co_truncate,
|
2014-05-08 18:34:43 +04:00
|
|
|
.bdrv_getlength = nbd_getlength,
|
2014-07-18 22:24:59 +04:00
|
|
|
.bdrv_refresh_filename = nbd_refresh_filename,
|
2018-03-12 18:21:23 +03:00
|
|
|
.bdrv_co_block_status = nbd_client_co_block_status,
|
2019-02-01 22:29:21 +03:00
|
|
|
.bdrv_dirname = nbd_dirname,
|
2019-02-01 22:29:25 +03:00
|
|
|
.strong_runtime_opts = nbd_strong_runtime_opts,
|
2021-02-05 19:37:12 +03:00
|
|
|
.bdrv_cancel_in_flight = nbd_cancel_in_flight,
|
2022-02-04 14:10:11 +03:00
|
|
|
|
|
|
|
.bdrv_attach_aio_context = nbd_attach_aio_context,
|
|
|
|
.bdrv_detach_aio_context = nbd_detach_aio_context,
|
2008-07-03 17:41:03 +04:00
|
|
|
};
|
2009-05-10 02:03:42 +04:00
|
|
|
|
|
|
|
static void bdrv_nbd_init(void)
|
|
|
|
{
|
|
|
|
bdrv_register(&bdrv_nbd);
|
2012-11-04 16:04:24 +04:00
|
|
|
bdrv_register(&bdrv_nbd_tcp);
|
|
|
|
bdrv_register(&bdrv_nbd_unix);
|
2009-05-10 02:03:42 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
block_init(bdrv_nbd_init);
|