2013-07-22 18:01:54 +04:00
|
|
|
/*
|
|
|
|
* RDMA protocol and interfaces
|
|
|
|
*
|
|
|
|
* Copyright IBM, Corp. 2010-2013
|
2016-04-27 13:05:07 +03:00
|
|
|
* Copyright Red Hat, Inc. 2015-2016
|
2013-07-22 18:01:54 +04:00
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Michael R. Hines <mrhines@us.ibm.com>
|
|
|
|
* Jiuxing Liu <jl@us.ibm.com>
|
2016-04-27 13:05:07 +03:00
|
|
|
* Daniel P. Berrange <berrange@redhat.com>
|
2013-07-22 18:01:54 +04:00
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU GPL, version 2 or
|
|
|
|
* later. See the COPYING file in the top-level directory.
|
|
|
|
*
|
|
|
|
*/
|
2019-05-23 17:35:07 +03:00
|
|
|
|
2016-01-26 21:16:54 +03:00
|
|
|
#include "qemu/osdep.h"
|
include/qemu/osdep.h: Don't include qapi/error.h
Commit 57cb38b included qapi/error.h into qemu/osdep.h to get the
Error typedef. Since then, we've moved to include qemu/osdep.h
everywhere. Its file comment explains: "To avoid getting into
possible circular include dependencies, this file should not include
any other QEMU headers, with the exceptions of config-host.h,
compiler.h, os-posix.h and os-win32.h, all of which are doing a
similar job to this file and are under similar constraints."
qapi/error.h doesn't do a similar job, and it doesn't adhere to
similar constraints: it includes qapi-types.h. That's in excess of
100KiB of crap most .c files don't actually need.
Add the typedef to qemu/typedefs.h, and include that instead of
qapi/error.h. Include qapi/error.h in .c files that need it and don't
get it now. Include qapi-types.h in qom/object.h for uint16List.
Update scripts/clean-includes accordingly. Update it further to match
reality: replace config.h by config-target.h, add sysemu/os-posix.h,
sysemu/os-win32.h. Update the list of includes in the qemu/osdep.h
comment quoted above similarly.
This reduces the number of objects depending on qapi/error.h from "all
of them" to less than a third. Unfortunately, the number depending on
qapi-types.h shrinks only a little. More work is needed for that one.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
[Fix compilation without the spice devel packages. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-14 11:01:28 +03:00
|
|
|
#include "qapi/error.h"
|
2016-03-20 20:16:19 +03:00
|
|
|
#include "qemu/cutils.h"
|
2023-04-27 11:35:26 +03:00
|
|
|
#include "exec/target_page.h"
|
2017-04-17 21:32:36 +03:00
|
|
|
#include "rdma.h"
|
2017-04-24 21:07:27 +03:00
|
|
|
#include "migration.h"
|
2023-04-27 11:35:26 +03:00
|
|
|
#include "migration-stats.h"
|
2017-04-20 19:52:18 +03:00
|
|
|
#include "qemu-file.h"
|
2017-04-17 21:26:27 +03:00
|
|
|
#include "ram.h"
|
2015-03-17 20:29:20 +03:00
|
|
|
#include "qemu/error-report.h"
|
2013-07-22 18:01:54 +04:00
|
|
|
#include "qemu/main-loop.h"
|
2019-05-23 17:35:07 +03:00
|
|
|
#include "qemu/module.h"
|
2019-08-12 08:23:46 +03:00
|
|
|
#include "qemu/rcu.h"
|
2013-07-22 18:01:54 +04:00
|
|
|
#include "qemu/sockets.h"
|
|
|
|
#include "qemu/bitmap.h"
|
2015-09-01 16:48:02 +03:00
|
|
|
#include "qemu/coroutine.h"
|
2020-06-26 10:22:35 +03:00
|
|
|
#include "exec/memory.h"
|
2013-07-22 18:01:54 +04:00
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <netdb.h>
|
|
|
|
#include <arpa/inet.h>
|
|
|
|
#include <rdma/rdma_cma.h>
|
2015-02-02 22:53:33 +03:00
|
|
|
#include "trace.h"
|
2020-09-03 23:43:22 +03:00
|
|
|
#include "qom/object.h"
|
2023-03-02 00:41:55 +03:00
|
|
|
#include "options.h"
|
2021-05-25 11:05:52 +03:00
|
|
|
#include <poll.h>
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
#define RDMA_RESOLVE_TIMEOUT_MS 10000
|
|
|
|
|
|
|
|
/* Do not merge data if larger than this. */
|
|
|
|
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
|
|
|
|
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
|
|
|
|
|
|
|
|
#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is only for non-live state being migrated.
|
|
|
|
* Instead of RDMA_WRITE messages, we use RDMA_SEND
|
|
|
|
* messages for that state, which requires a different
|
|
|
|
* delivery design than main memory.
|
|
|
|
*/
|
|
|
|
#define RDMA_SEND_INCREMENT 32768
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Maximum size infiniband SEND message
|
|
|
|
*/
|
|
|
|
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
|
|
|
|
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
|
|
|
|
|
|
|
|
#define RDMA_CONTROL_VERSION_CURRENT 1
|
|
|
|
/*
|
|
|
|
* Capabilities for negotiation.
|
|
|
|
*/
|
|
|
|
#define RDMA_CAPABILITY_PIN_ALL 0x01
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Add the other flags above to this list of known capabilities
|
|
|
|
* as they are introduced.
|
|
|
|
*/
|
|
|
|
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A work request ID is 64-bits and we split up these bits
|
|
|
|
* into 3 parts:
|
|
|
|
*
|
|
|
|
* bits 0-15 : type of control message, 2^16
|
|
|
|
* bits 16-29: ram block index, 2^14
|
|
|
|
* bits 30-63: ram block chunk number, 2^34
|
|
|
|
*
|
|
|
|
* The last two bit ranges are only used for RDMA writes,
|
|
|
|
* in order to track their completion and potentially
|
|
|
|
* also track unregistration status of the message.
|
|
|
|
*/
|
|
|
|
/*
 * Shifts and masks for the three fields packed into a work request ID.
 *
 * The masks are built from a 64-bit constant (UINT64_C) rather than 1UL so
 * that the chunk field (bits 30-63) is not truncated on hosts where
 * unsigned long is only 32 bits wide.  Where unsigned long is already
 * 64 bits the expansion is unchanged.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((UINT64_C(1) << RDMA_WRID_BLOCK_SHIFT) - UINT64_C(1))

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & \
     ((UINT64_C(1) << RDMA_WRID_CHUNK_SHIFT) - UINT64_C(1)))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RDMA migration protocol:
|
|
|
|
* 1. RDMA Writes (data messages, i.e. RAM)
|
|
|
|
* 2. IB Send/Recv (control channel messages)
|
|
|
|
*/
|
|
|
|
enum {
    RDMA_WRID_NONE         = 0,
    RDMA_WRID_RDMA_WRITE   = 1,     /* RDMA write (data message) */
    RDMA_WRID_SEND_CONTROL = 2000,  /* control channel send */
    RDMA_WRID_RECV_CONTROL = 4000,  /* control channel receive */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Work request IDs for IB SEND messages only (not RDMA writes).
|
|
|
|
* This is used by the migration protocol to transmit
|
|
|
|
* control messages (such as device state and registration commands)
|
|
|
|
*
|
|
|
|
* We could use more WRs, but we have enough for now.
|
|
|
|
*/
|
|
|
|
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,      /* number of entries; sizes RDMAContext.wr_data[] */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* SEND/RECV IB Control Messages.
|
|
|
|
*/
|
|
|
|
enum {
    RDMA_CONTROL_NONE                = 0,
    RDMA_CONTROL_ERROR               = 1,
    RDMA_CONTROL_READY               = 2,   /* ready to receive */
    RDMA_CONTROL_QEMU_FILE           = 3,   /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST  = 4,   /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT   = 5,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS            = 6,   /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST    = 7,   /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT     = 8,   /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED   = 9,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST  = 10,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED = 11,  /* unpinning finished */
};
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Memory and MR structures used to represent an IB Send/Recv work request.
|
|
|
|
* This is *not* used for RDMA writes, only IB Send/Recv.
|
|
|
|
*/
|
|
|
|
typedef struct {
|
|
|
|
uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
|
|
|
|
struct ibv_mr *control_mr; /* registration metadata */
|
|
|
|
size_t control_len; /* length of the message */
|
|
|
|
uint8_t *control_curr; /* start of unconsumed bytes */
|
|
|
|
} RDMAWorkRequestData;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Negotiate RDMA capabilities during connection-setup time.
|
|
|
|
*/
|
|
|
|
typedef struct {
    uint32_t version;   /* converted with htonl()/ntohl() for the wire */
    uint32_t flags;     /* converted with htonl()/ntohl() for the wire */
} RDMACapabilities;
|
|
|
|
|
|
|
|
/*
 * Convert both fields of @cap from host to network byte order, in place.
 */
static void caps_to_network(RDMACapabilities *cap)
{
    const uint32_t host_version = cap->version;
    const uint32_t host_flags = cap->flags;

    cap->version = htonl(host_version);
    cap->flags = htonl(host_flags);
}
|
|
|
|
|
|
|
|
/*
 * Convert both fields of @cap from network to host byte order, in place.
 */
static void network_to_caps(RDMACapabilities *cap)
{
    const uint32_t net_version = cap->version;
    const uint32_t net_flags = cap->flags;

    cap->version = ntohl(net_version);
    cap->flags = ntohl(net_flags);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Representation of a RAMBlock from an RDMA perspective.
|
|
|
|
* This is not transmitted, only local.
|
|
|
|
* This and subsequent structures cannot be linked lists
|
|
|
|
* because we're using a single IB message to transmit
|
|
|
|
* the information. It's small anyway, so a list is overkill.
|
|
|
|
*/
|
|
|
|
typedef struct RDMALocalBlock {
    char *block_name;
    /* Local virtual address */
    uint8_t *local_host_addr;
    /* Remote virtual address */
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    /* MRs for chunk-level registration */
    struct ibv_mr **pmr;
    /* MR for non-chunk-level registration */
    struct ibv_mr *mr;
    /* rkeys for chunk-level registration */
    uint32_t *remote_keys;
    /* rkey for non-chunk-level registration */
    uint32_t remote_rkey;
    /* Which block are we */
    int index;
    /* (Only used on dest) */
    unsigned int src_index;
    bool is_ram_block;
    int nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Also represents a RAMblock, but only on the dest.
|
|
|
|
* This gets transmitted by the dest during connection-time
|
|
|
|
* to the source VM and then is used to populate the
|
|
|
|
* corresponding RDMALocalBlock with
|
|
|
|
* the information needed to perform the actual RDMA.
|
|
|
|
*/
|
2015-04-20 18:57:16 +03:00
|
|
|
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;  /* byte-swapped with htonll()/ntohll() */
    uint64_t offset;            /* byte-swapped with htonll()/ntohll() */
    uint64_t length;            /* byte-swapped with htonll()/ntohll() */
    uint32_t remote_rkey;       /* byte-swapped with htonl()/ntohl() */
    uint32_t padding;           /* not touched by the conversion helpers */
} RDMADestBlock;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2017-07-17 14:09:35 +03:00
|
|
|
static const char *control_desc(unsigned int rdma_control)
|
|
|
|
{
|
|
|
|
static const char *strs[] = {
|
|
|
|
[RDMA_CONTROL_NONE] = "NONE",
|
|
|
|
[RDMA_CONTROL_ERROR] = "ERROR",
|
|
|
|
[RDMA_CONTROL_READY] = "READY",
|
|
|
|
[RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
|
|
|
|
[RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
|
|
|
|
[RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
|
|
|
|
[RDMA_CONTROL_COMPRESS] = "COMPRESS",
|
|
|
|
[RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
|
|
|
|
[RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
|
|
|
|
[RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
|
|
|
|
[RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
|
|
|
|
[RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
|
|
|
|
};
|
|
|
|
|
|
|
|
if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
|
|
|
|
return "??BAD CONTROL VALUE??";
|
|
|
|
}
|
|
|
|
|
|
|
|
return strs[rdma_control];
|
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
 * Convert a 64-bit value from host to network byte order.
 *
 * The value is split into two 32-bit halves; the high half is stored
 * first, each half converted with htonl(), and the pair is read back as a
 * single 64-bit quantity through a union.
 */
static uint64_t htonll(uint64_t v)
{
    union { uint32_t half[2]; uint64_t whole; } pun;
    const uint32_t hi = v >> 32;
    const uint32_t lo = v & 0xFFFFFFFFULL;

    pun.half[0] = htonl(hi);
    pun.half[1] = htonl(lo);

    return pun.whole;
}
|
|
|
|
|
2020-10-20 06:10:47 +03:00
|
|
|
/*
 * Convert a 64-bit value from network to host byte order.
 *
 * The value is viewed as two 32-bit halves through a union; the first
 * half becomes the high 32 bits of the result and the second the low
 * 32 bits, each converted with ntohl().
 */
static uint64_t ntohll(uint64_t v)
{
    union { uint32_t half[2]; uint64_t whole; } pun;
    uint64_t hi, lo;

    pun.whole = v;
    hi = ntohl(pun.half[0]);
    lo = ntohl(pun.half[1]);

    return (hi << 32) | lo;
}
|
|
|
|
|
2015-04-20 18:57:16 +03:00
|
|
|
/*
 * Convert an RDMADestBlock from host to network byte order, in place.
 * The padding field is left as it is.
 */
static void dest_block_to_network(RDMADestBlock *db)
{
    /* 32-bit field */
    db->remote_rkey = htonl(db->remote_rkey);

    /* 64-bit fields */
    db->length = htonll(db->length);
    db->offset = htonll(db->offset);
    db->remote_host_addr = htonll(db->remote_host_addr);
}
|
|
|
|
|
2015-04-20 18:57:16 +03:00
|
|
|
/*
 * Convert an RDMADestBlock from network to host byte order, in place.
 * The padding field is left as it is.
 */
static void network_to_dest_block(RDMADestBlock *db)
{
    /* 32-bit field */
    db->remote_rkey = ntohl(db->remote_rkey);

    /* 64-bit fields */
    db->length = ntohll(db->length);
    db->offset = ntohll(db->offset);
    db->remote_host_addr = ntohll(db->remote_host_addr);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Virtual address of the above structures used for transmitting
|
|
|
|
* the RAMBlock descriptions at connection-time.
|
|
|
|
* This structure is *not* transmitted.
|
|
|
|
*/
|
|
|
|
typedef struct RDMALocalBlocks {
|
|
|
|
int nb_blocks;
|
|
|
|
bool init; /* main memory init complete */
|
|
|
|
RDMALocalBlock *block;
|
|
|
|
} RDMALocalBlocks;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Main data structure for RDMA state.
|
|
|
|
* While there is only one copy of this structure being allocated right now,
|
|
|
|
* this is the place where one would start if you wanted to consider
|
|
|
|
* having more than one RDMA connection open at the same time.
|
|
|
|
*/
|
|
|
|
typedef struct RDMAContext {
|
|
|
|
char *host;
|
|
|
|
int port;
|
migration/rdma: destination: create the return patch after the first accept
destination side:
$ build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.1.10:8888
(qemu) migrate_set_capability postcopy-ram on
(qemu)
dest_init RDMA Device opened: kernel name rocep1s0f0 uverbs device name uverbs0, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs0, infiniband class device path /sys/class/infiniband/rocep1s0f0, transport: (2) Ethernet
Segmentation fault (core dumped)
(gdb) bt
#0 qemu_rdma_accept (rdma=0x0) at ../migration/rdma.c:3272
#1 rdma_accept_incoming_migration (opaque=0x0) at ../migration/rdma.c:3986
#2 0x0000563c9e51f02a in aio_dispatch_handler
(ctx=ctx@entry=0x563ca0606010, node=0x563ca12b2150) at ../util/aio-posix.c:329
#3 0x0000563c9e51f752 in aio_dispatch_handlers (ctx=0x563ca0606010) at ../util/aio-posix.c:372
#4 aio_dispatch (ctx=0x563ca0606010) at ../util/aio-posix.c:382
#5 0x0000563c9e4f4d9e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at ../util/async.c:306
#6 0x00007fe96ef3fa9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
#7 0x0000563c9e4ffeb8 in glib_pollfds_poll () at ../util/main-loop.c:231
#8 os_host_main_loop_wait (timeout=12188789) at ../util/main-loop.c:254
#9 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:530
#10 0x0000563c9e3c7211 in qemu_main_loop () at ../softmmu/runstate.c:725
#11 0x0000563c9dfd46fe in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at ../softmmu/main.c:50
The rdma return path will not be created when qemu incoming is starting
since migrate_copy() is false at that moment, then a NULL return path
rdma was referenced if the user enabled postcopy later.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Message-Id: <20210525080552.28259-3-lizhijian@cn.fujitsu.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2021-05-25 11:05:51 +03:00
|
|
|
char *host_port;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2013-08-04 06:54:52 +04:00
|
|
|
RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is used by *_exchange_send() to figure out whether or not
|
|
|
|
* the initial "READY" message has already been received or not.
|
|
|
|
* This is because other functions may potentially poll() and detect
|
|
|
|
* the READY message before send() does, in which case we need to
|
|
|
|
* know if it completed.
|
|
|
|
*/
|
|
|
|
int control_ready_expected;
|
|
|
|
|
|
|
|
/* number of outstanding writes */
|
|
|
|
int nb_sent;
|
|
|
|
|
|
|
|
/* store info about current buffer so that we can
|
|
|
|
merge it with future sends */
|
|
|
|
uint64_t current_addr;
|
|
|
|
uint64_t current_length;
|
|
|
|
/* index of ram block the current buffer belongs to */
|
|
|
|
int current_index;
|
|
|
|
/* index of the chunk in the current ram block */
|
|
|
|
int current_chunk;
|
|
|
|
|
|
|
|
bool pin_all;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* infiniband-specific variables for opening the device
|
|
|
|
* and maintaining connection state and so forth.
|
|
|
|
*
|
|
|
|
* cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
|
|
|
|
* cm_id->verbs, cm_id->channel, and cm_id->qp.
|
|
|
|
*/
|
|
|
|
struct rdma_cm_id *cm_id; /* connection manager ID */
|
|
|
|
struct rdma_cm_id *listen_id;
|
2013-08-13 06:12:43 +04:00
|
|
|
bool connected;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
struct ibv_context *verbs;
|
|
|
|
struct rdma_event_channel *channel;
|
|
|
|
struct ibv_qp *qp; /* queue pair */
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
struct ibv_comp_channel *recv_comp_channel; /* recv completion channel */
|
|
|
|
struct ibv_comp_channel *send_comp_channel; /* send completion channel */
|
2013-07-22 18:01:54 +04:00
|
|
|
struct ibv_pd *pd; /* protection domain */
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
struct ibv_cq *recv_cq; /* recvieve completion queue */
|
|
|
|
struct ibv_cq *send_cq; /* send completion queue */
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If a previous write failed (perhaps because of a failed
|
|
|
|
* memory registration, then do not attempt any future work
|
|
|
|
* and remember the error state.
|
|
|
|
*/
|
2023-09-28 16:19:53 +03:00
|
|
|
bool errored;
|
2023-09-28 16:19:41 +03:00
|
|
|
bool error_reported;
|
|
|
|
bool received_error;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Description of ram blocks used throughout the code.
|
|
|
|
*/
|
|
|
|
RDMALocalBlocks local_ram_blocks;
|
2015-04-20 18:57:16 +03:00
|
|
|
RDMADestBlock *dest_blocks;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-06-11 20:17:26 +03:00
|
|
|
/* Index of the next RAMBlock received during block registration */
|
|
|
|
unsigned int next_src_index;
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
|
|
|
|
* Migration on *destination* started.
|
|
|
|
* Then use coroutine yield function.
|
|
|
|
* Source runs in a thread, so we don't care.
|
|
|
|
*/
|
|
|
|
int migration_started_on_destination;
|
|
|
|
|
|
|
|
int total_registrations;
|
|
|
|
int total_writes;
|
|
|
|
|
|
|
|
int unregister_current, unregister_next;
|
|
|
|
uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
|
|
|
|
|
|
|
|
GHashTable *blockmap;
|
2018-08-06 16:29:28 +03:00
|
|
|
|
|
|
|
/* the RDMAContext for return path */
|
|
|
|
struct RDMAContext *return_path;
|
|
|
|
bool is_return_path;
|
2013-07-22 18:01:54 +04:00
|
|
|
} RDMAContext;
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
|
2020-09-16 21:25:19 +03:00
|
|
|
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)
|
2016-04-27 13:05:07 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct QIOChannelRDMA {
|
|
|
|
QIOChannel parent;
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdmain;
|
|
|
|
RDMAContext *rdmaout;
|
2016-04-27 13:05:07 +03:00
|
|
|
QEMUFile *file;
|
|
|
|
bool blocking; /* XXX we don't actually honour this yet */
|
|
|
|
};
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Main structure for IB Send/Recv control messages.
|
|
|
|
* This gets prepended at the beginning of every Send/Recv.
|
|
|
|
*/
|
|
|
|
typedef struct QEMU_PACKED {
    /* Total length of data portion */
    uint32_t len;
    /* Which control command to perform */
    uint32_t type;
    /* Number of commands in data portion of same type */
    uint32_t repeat;
    /* Not touched by the byte-order conversion helpers */
    uint32_t padding;
} RDMAControlHeader;
|
|
|
|
|
|
|
|
/*
 * Convert the len, type and repeat fields of a control header from host
 * to network byte order, in place.  The padding field is not touched.
 */
static void control_to_network(RDMAControlHeader *control)
{
    control->len = htonl(control->len);
    control->type = htonl(control->type);
    control->repeat = htonl(control->repeat);
}
|
|
|
|
|
|
|
|
/*
 * Convert the len, type and repeat fields of a control header from
 * network to host byte order, in place.  The padding field is not touched.
 */
static void network_to_control(RDMAControlHeader *control)
{
    control->len = ntohl(control->len);
    control->type = ntohl(control->type);
    control->repeat = ntohl(control->repeat);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register a single Chunk.
|
|
|
|
* Information sent by the source VM to inform the dest
|
|
|
|
* to register a single chunk of memory before we can perform
|
|
|
|
* the actual RDMA operation.
|
|
|
|
*/
|
|
|
|
typedef struct QEMU_PACKED {
|
|
|
|
union QEMU_PACKED {
|
2015-06-11 20:17:22 +03:00
|
|
|
uint64_t current_addr; /* offset into the ram_addr_t space */
|
2013-07-22 18:01:54 +04:00
|
|
|
uint64_t chunk; /* chunk to lookup if unregistering */
|
|
|
|
} key;
|
|
|
|
uint32_t current_index; /* which ramblock the chunk belongs to */
|
|
|
|
uint32_t padding;
|
|
|
|
uint64_t chunks; /* how many sequential chunks to register */
|
|
|
|
} RDMARegister;
|
|
|
|
|
2023-09-28 16:19:53 +03:00
|
|
|
/*
 * Report whether @rdma is in the error state.
 *
 * The first time the error state is observed a message is emitted via
 * error_report() and rdma->error_reported is set, so that the message is
 * printed only once per context.
 *
 * Returns the value of rdma->errored.
 */
static bool rdma_errored(RDMAContext *rdma)
{
    if (!rdma->errored) {
        return false;
    }

    if (!rdma->error_reported) {
        error_report("RDMA is in an error state waiting migration"
                     " to abort!");
        rdma->error_reported = true;
    }

    return true;
}
|
|
|
|
|
2015-06-11 20:17:22 +03:00
|
|
|
/*
 * Prepare a registration request for transmission, in place.
 *
 * When the local block named by reg->current_index is a RAM block,
 * key.current_addr is first rebased from the local ram_addr_t space to
 * the destination's, using the offsets recorded for that block on both
 * sides.  All fields except padding are then converted to network byte
 * order.
 */
static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
{
    const uint32_t idx = reg->current_index;
    const RDMALocalBlock *src_block = &rdma->local_ram_blocks.block[idx];

    if (src_block->is_ram_block) {
        /*
         * current_addr as passed in is an address in the local ram_addr_t
         * space, we need to translate this for the destination
         */
        uint64_t addr = reg->key.current_addr;

        addr -= src_block->offset;
        addr += rdma->dest_blocks[idx].offset;
        reg->key.current_addr = addr;
    }

    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(idx);
    reg->chunks = htonll(reg->chunks);
}
|
|
|
|
|
|
|
|
/*
 * Convert a received registration request from network to host byte
 * order, in place.  The padding field is not touched.
 */
static void network_to_register(RDMARegister *reg)
{
    const uint64_t net_addr = reg->key.current_addr;
    const uint32_t net_index = reg->current_index;
    const uint64_t net_chunks = reg->chunks;

    reg->key.current_addr = ntohll(net_addr);
    reg->current_index = ntohl(net_index);
    reg->chunks = ntohll(net_chunks);
}
|
|
|
|
|
|
|
|
typedef struct QEMU_PACKED {
    /* If zero, we will madvise() */
    uint32_t value;
    /* Which ram block index */
    uint32_t block_idx;
    /* Address in remote ram_addr_t space */
    uint64_t offset;
    /* Length of the chunk */
    uint64_t length;
} RDMACompress;
|
|
|
|
|
2015-06-11 20:17:22 +03:00
|
|
|
/*
 * Prepare a compress command for transmission, in place.
 *
 * comp->offset is rebased from the local ram_addr_t space to the
 * destination's, using the offsets recorded for block comp->block_idx on
 * both sides, and every field is converted to network byte order.
 */
static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
{
    uint32_t idx;
    uint64_t off;

    comp->value = htonl(comp->value);

    /*
     * comp->offset as passed in is an address in the local ram_addr_t
     * space, we need to translate this for the destination
     */
    idx = comp->block_idx;
    off = comp->offset;
    off -= rdma->local_ram_blocks.block[idx].offset;
    off += rdma->dest_blocks[idx].offset;

    comp->block_idx = htonl(idx);
    comp->offset = htonll(off);
    comp->length = htonll(comp->length);
}
|
|
|
|
|
|
|
|
/*
 * Convert every field of an RDMACompress from network byte order to
 * host byte order, in place.  No offset translation is done here.
 */
static void network_to_compress(RDMACompress *comp)
{
    /* 64-bit fields */
    comp->length = ntohll(comp->length);
    comp->offset = ntohll(comp->offset);

    /* 32-bit fields */
    comp->block_idx = ntohl(comp->block_idx);
    comp->value = ntohl(comp->value);
}
|
|
|
|
|
|
|
|
/*
 * Registering memory on the destination yields an "rkey"; the source VM
 * has to quote that rkey in order to perform the RDMA operation.
 *
 * result_to_network() and network_to_result() byte-swap rkey and
 * host_addr; the padding field is not touched by either of them.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;
|
|
|
|
|
|
|
|
/*
 * Convert the rkey and host_addr fields of an RDMARegisterResult from
 * host byte order to network byte order, in place.  The padding field
 * is left untouched.
 */
static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}
|
|
|
|
|
|
|
|
/*
 * Convert the rkey and host_addr fields of an RDMARegisterResult from
 * network byte order to host byte order, in place.  The padding field
 * is left untouched.
 */
static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}
|
|
|
|
|
|
|
|
/*
 * Forward declaration only.  The function has internal linkage, so its
 * definition has to live elsewhere in this translation unit.
 */
static int qemu_rdma_exchange_send(RDMAContext *rdma,
                                   RDMAControlHeader *head,
                                   uint8_t *data,
                                   RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));
|
|
|
|
|
2013-09-04 06:32:19 +04:00
|
|
|
static inline uint64_t ram_chunk_index(const uint8_t *start,
|
|
|
|
const uint8_t *host)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
|
|
|
|
}
|
|
|
|
|
2013-09-04 06:32:19 +04:00
|
|
|
static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
|
2013-07-22 18:01:54 +04:00
|
|
|
uint64_t i)
|
|
|
|
{
|
2015-02-28 21:09:43 +03:00
|
|
|
return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
|
|
|
|
(i << RDMA_REG_CHUNK_SHIFT));
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2013-09-04 06:32:19 +04:00
|
|
|
static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
|
|
|
|
uint64_t i)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
|
|
|
|
(1UL << RDMA_REG_CHUNK_SHIFT);
|
|
|
|
|
|
|
|
if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
|
|
|
|
result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:38 +03:00
|
|
|
static void rdma_add_block(RDMAContext *rdma, const char *block_name,
|
|
|
|
void *host_addr,
|
|
|
|
ram_addr_t block_offset, uint64_t length)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
RDMALocalBlocks *local = &rdma->local_ram_blocks;
|
2015-06-11 20:17:25 +03:00
|
|
|
RDMALocalBlock *block;
|
2013-07-22 18:01:54 +04:00
|
|
|
RDMALocalBlock *old = local->block;
|
|
|
|
|
2015-09-14 14:51:31 +03:00
|
|
|
local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (local->nb_blocks) {
|
|
|
|
int x;
|
|
|
|
|
2015-06-11 20:17:25 +03:00
|
|
|
if (rdma->blockmap) {
|
|
|
|
for (x = 0; x < local->nb_blocks; x++) {
|
|
|
|
g_hash_table_remove(rdma->blockmap,
|
|
|
|
(void *)(uintptr_t)old[x].offset);
|
|
|
|
g_hash_table_insert(rdma->blockmap,
|
|
|
|
(void *)(uintptr_t)old[x].offset,
|
|
|
|
&local->block[x]);
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
|
|
|
|
g_free(old);
|
|
|
|
}
|
|
|
|
|
|
|
|
block = &local->block[local->nb_blocks];
|
|
|
|
|
2015-06-11 20:17:21 +03:00
|
|
|
block->block_name = g_strdup(block_name);
|
2013-07-22 18:01:54 +04:00
|
|
|
block->local_host_addr = host_addr;
|
|
|
|
block->offset = block_offset;
|
|
|
|
block->length = length;
|
|
|
|
block->index = local->nb_blocks;
|
2015-06-11 20:17:26 +03:00
|
|
|
block->src_index = ~0U; /* Filled in by the receipt of the block list */
|
2013-07-22 18:01:54 +04:00
|
|
|
block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
|
|
|
|
block->transit_bitmap = bitmap_new(block->nb_chunks);
|
|
|
|
bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
|
|
|
|
block->unregister_bitmap = bitmap_new(block->nb_chunks);
|
|
|
|
bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
|
2015-09-14 14:51:31 +03:00
|
|
|
block->remote_keys = g_new0(uint32_t, block->nb_chunks);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
block->is_ram_block = local->init ? false : true;
|
|
|
|
|
2015-06-11 20:17:25 +03:00
|
|
|
if (rdma->blockmap) {
|
2015-11-10 19:43:04 +03:00
|
|
|
g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
|
2015-06-11 20:17:25 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-06-11 20:17:21 +03:00
|
|
|
trace_rdma_add_block(block_name, local->nb_blocks,
|
|
|
|
(uintptr_t) block->local_host_addr,
|
2015-02-16 19:58:05 +03:00
|
|
|
block->offset, block->length,
|
2015-02-28 21:09:43 +03:00
|
|
|
(uintptr_t) (block->local_host_addr + block->length),
|
2015-02-16 19:58:05 +03:00
|
|
|
BITS_TO_LONGS(block->nb_chunks) *
|
|
|
|
sizeof(unsigned long) * 8,
|
|
|
|
block->nb_chunks);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
local->nb_blocks++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Memory regions need to be registered with the device and queue pairs setup
|
|
|
|
* in advanced before the migration starts. This tells us where the RAM blocks
|
|
|
|
* are so that we can register them individually.
|
|
|
|
*/
|
2019-02-15 20:45:44 +03:00
|
|
|
static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
2019-02-15 20:45:44 +03:00
|
|
|
const char *block_name = qemu_ram_get_idstr(rb);
|
|
|
|
void *host_addr = qemu_ram_get_host_addr(rb);
|
|
|
|
ram_addr_t block_offset = qemu_ram_get_offset(rb);
|
|
|
|
ram_addr_t length = qemu_ram_get_used_length(rb);
|
2023-09-28 16:19:38 +03:00
|
|
|
rdma_add_block(opaque, block_name, host_addr, block_offset, length);
|
|
|
|
return 0;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Identify the RAMBlocks and their quantity. They will be references to
|
|
|
|
* identify chunk boundaries inside each RAMBlock and also be referenced
|
|
|
|
* during dynamic page registration.
|
|
|
|
*/
|
2023-09-28 16:19:38 +03:00
|
|
|
static void qemu_rdma_init_ram_blocks(RDMAContext *rdma)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
RDMALocalBlocks *local = &rdma->local_ram_blocks;
|
2019-03-08 21:51:24 +03:00
|
|
|
int ret;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
assert(rdma->blockmap == NULL);
|
|
|
|
memset(local, 0, sizeof *local);
|
2019-03-08 21:51:24 +03:00
|
|
|
ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
|
2023-09-28 16:19:38 +03:00
|
|
|
assert(!ret);
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
|
2015-09-14 14:51:31 +03:00
|
|
|
rdma->dest_blocks = g_new0(RDMADestBlock,
|
|
|
|
rdma->local_ram_blocks.nb_blocks);
|
2013-07-22 18:01:54 +04:00
|
|
|
local->init = true;
|
|
|
|
}
|
|
|
|
|
2015-06-11 20:17:24 +03:00
|
|
|
/*
|
|
|
|
* Note: If used outside of cleanup, the caller must ensure that the destination
|
|
|
|
* block structures are also updated
|
|
|
|
*/
|
2023-09-28 16:19:29 +03:00
|
|
|
static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
RDMALocalBlocks *local = &rdma->local_ram_blocks;
|
|
|
|
RDMALocalBlock *old = local->block;
|
|
|
|
int x;
|
|
|
|
|
2015-06-11 20:17:24 +03:00
|
|
|
if (rdma->blockmap) {
|
|
|
|
g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
if (block->pmr) {
|
|
|
|
int j;
|
|
|
|
|
|
|
|
for (j = 0; j < block->nb_chunks; j++) {
|
|
|
|
if (!block->pmr[j]) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
ibv_dereg_mr(block->pmr[j]);
|
|
|
|
rdma->total_registrations--;
|
|
|
|
}
|
|
|
|
g_free(block->pmr);
|
|
|
|
block->pmr = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (block->mr) {
|
|
|
|
ibv_dereg_mr(block->mr);
|
|
|
|
rdma->total_registrations--;
|
|
|
|
block->mr = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
g_free(block->transit_bitmap);
|
|
|
|
block->transit_bitmap = NULL;
|
|
|
|
|
|
|
|
g_free(block->unregister_bitmap);
|
|
|
|
block->unregister_bitmap = NULL;
|
|
|
|
|
|
|
|
g_free(block->remote_keys);
|
|
|
|
block->remote_keys = NULL;
|
|
|
|
|
2015-06-11 20:17:21 +03:00
|
|
|
g_free(block->block_name);
|
|
|
|
block->block_name = NULL;
|
|
|
|
|
2015-06-11 20:17:24 +03:00
|
|
|
if (rdma->blockmap) {
|
|
|
|
for (x = 0; x < local->nb_blocks; x++) {
|
|
|
|
g_hash_table_remove(rdma->blockmap,
|
|
|
|
(void *)(uintptr_t)old[x].offset);
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (local->nb_blocks > 1) {
|
|
|
|
|
2015-09-14 14:51:31 +03:00
|
|
|
local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (block->index) {
|
|
|
|
memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (block->index < (local->nb_blocks - 1)) {
|
|
|
|
memcpy(local->block + block->index, old + (block->index + 1),
|
|
|
|
sizeof(RDMALocalBlock) *
|
|
|
|
(local->nb_blocks - (block->index + 1)));
|
2018-05-06 17:54:58 +03:00
|
|
|
for (x = block->index; x < local->nb_blocks - 1; x++) {
|
|
|
|
local->block[x].index--;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
assert(block == local->block);
|
|
|
|
local->block = NULL;
|
|
|
|
}
|
|
|
|
|
2015-06-11 20:17:24 +03:00
|
|
|
trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
|
2015-02-02 22:53:33 +03:00
|
|
|
block->offset, block->length,
|
2015-02-28 21:09:43 +03:00
|
|
|
(uintptr_t)(block->local_host_addr + block->length),
|
2015-02-02 22:53:33 +03:00
|
|
|
BITS_TO_LONGS(block->nb_chunks) *
|
|
|
|
sizeof(unsigned long) * 8, block->nb_chunks);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
g_free(old);
|
|
|
|
|
|
|
|
local->nb_blocks--;
|
|
|
|
|
2015-06-11 20:17:24 +03:00
|
|
|
if (local->nb_blocks && rdma->blockmap) {
|
2013-07-22 18:01:54 +04:00
|
|
|
for (x = 0; x < local->nb_blocks; x++) {
|
2015-02-28 21:09:43 +03:00
|
|
|
g_hash_table_insert(rdma->blockmap,
|
|
|
|
(void *)(uintptr_t)local->block[x].offset,
|
|
|
|
&local->block[x]);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Put in the log file which RDMA device was opened and the details
|
|
|
|
* associated with that device.
|
|
|
|
*/
|
|
|
|
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
|
|
|
|
{
|
2013-08-10 00:05:44 +04:00
|
|
|
struct ibv_port_attr port;
|
|
|
|
|
|
|
|
if (ibv_query_port(verbs, 1, &port)) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Failed to query port information");
|
2013-08-10 00:05:44 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
printf("%s RDMA Device opened: kernel name %s "
|
|
|
|
"uverbs device name %s, "
|
2013-08-10 00:05:44 +04:00
|
|
|
"infiniband_verbs class device path %s, "
|
|
|
|
"infiniband class device path %s, "
|
|
|
|
"transport: (%d) %s\n",
|
2013-07-22 18:01:54 +04:00
|
|
|
who,
|
|
|
|
verbs->device->name,
|
|
|
|
verbs->device->dev_name,
|
|
|
|
verbs->device->dev_path,
|
2013-08-10 00:05:44 +04:00
|
|
|
verbs->device->ibdev_path,
|
|
|
|
port.link_layer,
|
|
|
|
(port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
|
2015-02-28 21:09:41 +03:00
|
|
|
((port.link_layer == IBV_LINK_LAYER_ETHERNET)
|
2013-08-10 00:05:44 +04:00
|
|
|
? "Ethernet" : "Unknown"));
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Put in the log file the RDMA gid addressing information,
|
|
|
|
* useful for folks who have trouble understanding the
|
|
|
|
* RDMA device hierarchy in the kernel.
|
|
|
|
*/
|
|
|
|
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
|
|
|
|
{
|
|
|
|
char sgid[33];
|
|
|
|
char dgid[33];
|
|
|
|
inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
|
|
|
|
inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_dump_gid(who, sgid, dgid);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2013-08-10 00:05:44 +04:00
|
|
|
/*
|
|
|
|
* As of now, IPv6 over RoCE / iWARP is not supported by linux.
|
|
|
|
* We will try the next addrinfo struct, and fail if there are
|
|
|
|
* no other valid addresses to bind against.
|
|
|
|
*
|
|
|
|
* If user is listening on '[::]', then we will not have a opened a device
|
|
|
|
* yet and have no way of verifying if the device is RoCE or not.
|
|
|
|
*
|
|
|
|
* In this case, the source VM will throw an error for ALL types of
|
|
|
|
* connections (both IPv4 and IPv6) if the destination machine does not have
|
|
|
|
* a regular infiniband network available for use.
|
|
|
|
*
|
2013-08-18 21:40:06 +04:00
|
|
|
* The only way to guarantee that an error is thrown for broken kernels is
|
2013-08-10 00:05:44 +04:00
|
|
|
* for the management software to choose a *specific* interface at bind time
|
|
|
|
* and validate what time of hardware it is.
|
|
|
|
*
|
|
|
|
* Unfortunately, this puts the user in a fix:
|
2015-02-28 21:09:41 +03:00
|
|
|
*
|
2013-08-10 00:05:44 +04:00
|
|
|
* If the source VM connects with an IPv4 address without knowing that the
|
|
|
|
* destination has bound to '[::]' the migration will unconditionally fail
|
2015-08-26 14:17:13 +03:00
|
|
|
* unless the management software is explicitly listening on the IPv4
|
2013-08-10 00:05:44 +04:00
|
|
|
* address while using a RoCE-based device.
|
|
|
|
*
|
|
|
|
* If the source VM connects with an IPv6 address, then we're OK because we can
|
|
|
|
* throw an error on the source (and similarly on the destination).
|
2015-02-28 21:09:41 +03:00
|
|
|
*
|
2013-08-10 00:05:44 +04:00
|
|
|
* But in mixed environments, this will be broken for a while until it is fixed
|
|
|
|
* inside linux.
|
|
|
|
*
|
|
|
|
* We do provide a *tiny* bit of help in this function: We can list all of the
|
|
|
|
* devices in the system and check to see if all the devices are RoCE or
|
2015-02-28 21:09:41 +03:00
|
|
|
* Infiniband.
|
2013-08-10 00:05:44 +04:00
|
|
|
*
|
|
|
|
* If we detect that we have a *pure* RoCE environment, then we can safely
|
2013-08-18 21:40:06 +04:00
|
|
|
* thrown an error even if the management software has specified '[::]' as the
|
2013-08-10 00:05:44 +04:00
|
|
|
* bind address.
|
|
|
|
*
|
|
|
|
* However, if there is are multiple hetergeneous devices, then we cannot make
|
|
|
|
* this assumption and the user just has to be sure they know what they are
|
|
|
|
* doing.
|
|
|
|
*
|
|
|
|
* Patches are being reviewed on linux-rdma.
|
|
|
|
*/
|
2017-04-21 15:27:08 +03:00
|
|
|
static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
|
2013-08-10 00:05:44 +04:00
|
|
|
{
|
|
|
|
/* This bug only exists in linux, to our knowledge. */
|
|
|
|
#ifdef CONFIG_LINUX
|
2019-07-01 18:06:04 +03:00
|
|
|
struct ibv_port_attr port_attr;
|
2013-08-10 00:05:44 +04:00
|
|
|
|
2015-02-28 21:09:41 +03:00
|
|
|
/*
|
2013-08-10 00:05:44 +04:00
|
|
|
* Verbs are only NULL if management has bound to '[::]'.
|
2015-02-28 21:09:41 +03:00
|
|
|
*
|
2013-08-10 00:05:44 +04:00
|
|
|
* Let's iterate through all the devices and see if there any pure IB
|
|
|
|
* devices (non-ethernet).
|
2015-02-28 21:09:41 +03:00
|
|
|
*
|
2013-08-10 00:05:44 +04:00
|
|
|
* If not, then we can safely proceed with the migration.
|
2013-08-18 21:40:06 +04:00
|
|
|
* Otherwise, there are no guarantees until the bug is fixed in linux.
|
2013-08-10 00:05:44 +04:00
|
|
|
*/
|
|
|
|
if (!verbs) {
|
2015-02-28 21:09:41 +03:00
|
|
|
int num_devices, x;
|
2020-10-20 06:10:48 +03:00
|
|
|
struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
|
2013-08-10 00:05:44 +04:00
|
|
|
bool roce_found = false;
|
|
|
|
bool ib_found = false;
|
|
|
|
|
|
|
|
for (x = 0; x < num_devices; x++) {
|
|
|
|
verbs = ibv_open_device(dev_list[x]);
|
migration/rdma: Fix or document problematic uses of errno
We use errno after calling Libibverbs functions that are not
documented to set errno (manual page does not mention errno), or where
the documentation is unclear ("returns [...] the value of errno on
failure"). While this could be read as "sets errno and returns it",
a glance at the source code[*] kills that hope:
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
return qp->context->ops.post_send(qp, wr, bad_wr);
}
The callback can be
static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad)
{
/* This version of driver supports RAW QP only.
* Posting WR is done directly in the application.
*/
return EOPNOTSUPP;
}
Neither of them touches errno.
One of these errno uses is easy to fix, so do that now. Several more
will go away later in the series; add temporary FIXME commments.
Three will remain; add TODO comments. TODO, not FIXME, because the
bug might be in Libibverbs documentation.
[*] https://github.com/linux-rdma/rdma-core.git
commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20230928132019.2544702-17-armbru@redhat.com>
2023-09-28 16:19:42 +03:00
|
|
|
/*
|
|
|
|
* ibv_open_device() is not documented to set errno. If
|
|
|
|
* it does, it's somebody else's doc bug. If it doesn't,
|
|
|
|
* the use of errno below is wrong.
|
|
|
|
* TODO Find out whether ibv_open_device() sets errno.
|
|
|
|
*/
|
2015-06-03 02:14:10 +03:00
|
|
|
if (!verbs) {
|
|
|
|
if (errno == EPERM) {
|
|
|
|
continue;
|
|
|
|
} else {
|
2023-09-28 16:19:46 +03:00
|
|
|
error_setg_errno(errp, errno,
|
|
|
|
"could not open RDMA device context");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2015-06-03 02:14:10 +03:00
|
|
|
}
|
|
|
|
}
|
2013-08-10 00:05:44 +04:00
|
|
|
|
|
|
|
if (ibv_query_port(verbs, 1, &port_attr)) {
|
|
|
|
ibv_close_device(verbs);
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp,
|
|
|
|
"RDMA ERROR: Could not query initial IB port");
|
|
|
|
}
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-08-10 00:05:44 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
|
|
|
|
ib_found = true;
|
|
|
|
} else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
|
|
|
|
roce_found = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
ibv_close_device(verbs);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (roce_found) {
|
|
|
|
if (ib_found) {
|
|
|
|
fprintf(stderr, "WARN: migrations may fail:"
|
|
|
|
" IPv6 over RoCE / iWARP in linux"
|
|
|
|
" is broken. But since you appear to have a"
|
|
|
|
" mixed RoCE / IB environment, be sure to only"
|
|
|
|
" migrate over the IB fabric until the kernel "
|
|
|
|
" fixes the bug.\n");
|
|
|
|
} else {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: "
|
|
|
|
"You only have RoCE / iWARP devices in your systems"
|
|
|
|
" and your management software has specified '[::]'"
|
|
|
|
", but IPv6 over RoCE / iWARP is not supported in Linux.");
|
|
|
|
}
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-08-10 00:05:44 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have a verbs context, that means that some other than '[::]' was
|
2015-02-28 21:09:41 +03:00
|
|
|
* used by the management software for binding. In which case we can
|
|
|
|
* actually warn the user about a potentially broken kernel.
|
2013-08-10 00:05:44 +04:00
|
|
|
*/
|
|
|
|
|
|
|
|
/* IB ports start with 1, not 0 */
|
|
|
|
if (ibv_query_port(verbs, 1, &port_attr)) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: Could not query initial IB port");
|
|
|
|
}
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-08-10 00:05:44 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: "
|
|
|
|
"Linux kernel's RoCE / iWARP does not support IPv6 "
|
|
|
|
"(but patches on linux-rdma in progress)");
|
|
|
|
}
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-08-10 00:05:44 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
|
|
|
|
* Figure out which RDMA device corresponds to the requested IP hostname
|
|
|
|
* Also create the initial connection manager identifiers for opening
|
|
|
|
* the connection.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
|
|
|
|
{
|
|
|
|
int ret;
|
2013-08-10 00:05:44 +04:00
|
|
|
struct rdma_addrinfo *res;
|
2013-07-22 18:01:54 +04:00
|
|
|
char port_str[16];
|
|
|
|
struct rdma_cm_event *cm_event;
|
|
|
|
char ip[40] = "unknown";
|
2013-08-10 00:05:44 +04:00
|
|
|
struct rdma_addrinfo *e;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (rdma->host == NULL || !strcmp(rdma->host, "")) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: RDMA hostname has not been set");
|
|
|
|
}
|
2023-09-28 16:19:50 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* create CM channel */
|
|
|
|
rdma->channel = rdma_create_event_channel();
|
|
|
|
if (!rdma->channel) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: could not create CM channel");
|
|
|
|
}
|
2023-09-28 16:19:50 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* create CM id */
|
|
|
|
ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: could not create channel id");
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_resolve_create_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(port_str, 16, "%d", rdma->port);
|
|
|
|
port_str[15] = '\0';
|
|
|
|
|
2013-08-10 00:05:44 +04:00
|
|
|
ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
|
2023-09-28 16:19:50 +03:00
|
|
|
if (ret) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
|
|
|
|
rdma->host);
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_resolve_get_addr;
|
|
|
|
}
|
|
|
|
|
2013-08-10 00:05:43 +04:00
|
|
|
for (e = res; e != NULL; e = e->ai_next) {
|
|
|
|
inet_ntop(e->ai_family,
|
2013-08-10 00:05:44 +04:00
|
|
|
&((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2013-08-10 00:05:44 +04:00
|
|
|
ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
|
2013-08-10 00:05:43 +04:00
|
|
|
RDMA_RESOLVE_TIMEOUT_MS);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret >= 0) {
|
2013-08-19 06:27:08 +04:00
|
|
|
if (e->ai_family == AF_INET6) {
|
2017-04-21 15:27:08 +03:00
|
|
|
ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs, errp);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2013-08-19 06:27:08 +04:00
|
|
|
continue;
|
|
|
|
}
|
2013-08-10 00:05:44 +04:00
|
|
|
}
|
2013-08-10 00:05:43 +04:00
|
|
|
goto route;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2021-05-25 11:05:50 +03:00
|
|
|
rdma_freeaddrinfo(res);
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: could not resolve address %s",
|
|
|
|
rdma->host);
|
|
|
|
}
|
2013-08-10 00:05:43 +04:00
|
|
|
goto err_resolve_get_addr;
|
|
|
|
|
|
|
|
route:
|
2021-05-25 11:05:50 +03:00
|
|
|
rdma_freeaddrinfo(res);
|
2013-07-22 18:01:54 +04:00
|
|
|
qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
|
|
|
|
|
|
|
|
ret = rdma_get_cm_event(rdma->channel, &cm_event);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: could not perform event_addr_resolved");
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_resolve_get_addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp,
|
|
|
|
"RDMA ERROR: result not equal to event_addr_resolved %s",
|
|
|
|
rdma_event_str(cm_event->event));
|
|
|
|
}
|
2021-06-28 10:19:59 +03:00
|
|
|
error_report("rdma_resolve_addr");
|
2014-05-13 16:25:38 +04:00
|
|
|
rdma_ack_cm_event(cm_event);
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_resolve_get_addr;
|
|
|
|
}
|
|
|
|
rdma_ack_cm_event(cm_event);
|
|
|
|
|
|
|
|
/* resolve route */
|
|
|
|
ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: could not resolve rdma route");
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_resolve_get_addr;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = rdma_get_cm_event(rdma->channel, &cm_event);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: could not perform event_route_resolved");
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_resolve_get_addr;
|
|
|
|
}
|
|
|
|
if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: "
|
|
|
|
"result not equal to event_route_resolved: %s",
|
|
|
|
rdma_event_str(cm_event->event));
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
rdma_ack_cm_event(cm_event);
|
|
|
|
goto err_resolve_get_addr;
|
|
|
|
}
|
|
|
|
rdma_ack_cm_event(cm_event);
|
|
|
|
rdma->verbs = rdma->cm_id->verbs;
|
|
|
|
qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
|
|
|
|
qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_resolve_get_addr:
|
|
|
|
rdma_destroy_id(rdma->cm_id);
|
|
|
|
rdma->cm_id = NULL;
|
|
|
|
err_resolve_create_id:
|
|
|
|
rdma_destroy_event_channel(rdma->channel);
|
|
|
|
rdma->channel = NULL;
|
2023-09-28 16:19:50 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create protection domain and completion queues
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
|
|
|
|
{
|
|
|
|
/* allocate pd */
|
|
|
|
rdma->pd = ibv_alloc_pd(rdma->verbs);
|
|
|
|
if (!rdma->pd) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("failed to allocate protection domain");
|
2013-07-22 18:01:54 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
/* create receive completion channel */
|
|
|
|
rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
|
|
|
|
if (!rdma->recv_comp_channel) {
|
|
|
|
error_report("failed to allocate receive completion channel");
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_alloc_pd_cq;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
* Completion queue can be filled by read work requests.
|
2013-07-22 18:01:54 +04:00
|
|
|
*/
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
|
|
|
|
NULL, rdma->recv_comp_channel, 0);
|
|
|
|
if (!rdma->recv_cq) {
|
|
|
|
error_report("failed to allocate receive completion queue");
|
|
|
|
goto err_alloc_pd_cq;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* create send completion channel */
|
|
|
|
rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
|
|
|
|
if (!rdma->send_comp_channel) {
|
|
|
|
error_report("failed to allocate send completion channel");
|
|
|
|
goto err_alloc_pd_cq;
|
|
|
|
}
|
|
|
|
|
|
|
|
rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
|
|
|
|
NULL, rdma->send_comp_channel, 0);
|
|
|
|
if (!rdma->send_cq) {
|
|
|
|
error_report("failed to allocate send completion queue");
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_alloc_pd_cq;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_alloc_pd_cq:
|
|
|
|
if (rdma->pd) {
|
|
|
|
ibv_dealloc_pd(rdma->pd);
|
|
|
|
}
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
if (rdma->recv_comp_channel) {
|
|
|
|
ibv_destroy_comp_channel(rdma->recv_comp_channel);
|
|
|
|
}
|
|
|
|
if (rdma->send_comp_channel) {
|
|
|
|
ibv_destroy_comp_channel(rdma->send_comp_channel);
|
|
|
|
}
|
|
|
|
if (rdma->recv_cq) {
|
|
|
|
ibv_destroy_cq(rdma->recv_cq);
|
|
|
|
rdma->recv_cq = NULL;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
rdma->pd = NULL;
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
rdma->recv_comp_channel = NULL;
|
|
|
|
rdma->send_comp_channel = NULL;
|
2013-07-22 18:01:54 +04:00
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create queue pairs.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
|
|
|
|
{
|
|
|
|
struct ibv_qp_init_attr attr = { 0 };
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
|
|
|
|
attr.cap.max_recv_wr = 3;
|
|
|
|
attr.cap.max_send_sge = 1;
|
|
|
|
attr.cap.max_recv_sge = 1;
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
attr.send_cq = rdma->send_cq;
|
|
|
|
attr.recv_cq = rdma->recv_cq;
|
2013-07-22 18:01:54 +04:00
|
|
|
attr.qp_type = IBV_QPT_RC;
|
|
|
|
|
|
|
|
ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2013-07-22 18:01:54 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
rdma->qp = rdma->cm_id->qp;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-09-10 10:02:54 +03:00
|
|
|
/* Check whether On-Demand Paging is supported by RDAM device */
|
|
|
|
static bool rdma_support_odp(struct ibv_context *dev)
|
|
|
|
{
|
|
|
|
struct ibv_device_attr_ex attr = {0};
|
|
|
|
int ret = ibv_query_device_ex(dev, NULL, &attr);
|
|
|
|
if (ret) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-09-10 10:02:55 +03:00
|
|
|
/*
 * Call ibv_advise_mr() to avoid RNR NAK errors as far as possible.
 * A responder whose MR is registered with ODP sends an RNR NAK back to
 * the requester when it hits a page fault.
 *
 * @wr selects the write-prefetch advice instead of the plain prefetch
 * advice.  The outcome is only traced; a failure is deliberately ignored.
 * When HAVE_IBV_ADVISE_MR is not defined this function does nothing.
 */
static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
                                         uint32_t len,  uint32_t lkey,
                                         const char *name, bool wr)
{
#ifdef HAVE_IBV_ADVISE_MR
    struct ibv_sge sge = {
        .addr = addr,
        .length = len,
        .lkey = lkey,
    };
    int advice = IBV_ADVISE_MR_ADVICE_PREFETCH;
    int rc;

    if (wr) {
        advice = IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE;
    }

    rc = ibv_advise_mr(pd, advice, IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
    /* ignore the error */
    trace_qemu_rdma_advise_mr(name, len, addr, strerror(rc));
#endif
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
 * Register every local RAM block with the protection domain.
 *
 * Each block is first registered with local-write and remote-write
 * access.  If that fails, errno reads ENOTSUP and rdma_support_odp()
 * agrees, registration is retried with IBV_ACCESS_ON_DEMAND added; when
 * the retry succeeds a prefetch advice is issued for the block.
 *
 * Returns 0 once all blocks are registered.  On the first failure the
 * blocks with a lower index are deregistered again and -1 is returned.
 */
static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    int i;

    for (i = 0; i < local->nb_blocks; i++) {
        RDMALocalBlock *blk = &local->block[i];
        int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;

        blk->mr = ibv_reg_mr(rdma->pd, blk->local_host_addr,
                             blk->length, access);
        /*
         * ibv_reg_mr() is not documented to set errno.  If it does,
         * it's somebody else's doc bug.  If it doesn't, the use of
         * errno below is wrong.
         * TODO Find out whether ibv_reg_mr() sets errno.
         */
        if (!blk->mr && errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
            access |= IBV_ACCESS_ON_DEMAND;
            /* register ODP mr */
            blk->mr = ibv_reg_mr(rdma->pd, blk->local_host_addr,
                                 blk->length, access);
            trace_qemu_rdma_register_odp_mr(blk->block_name);

            if (blk->mr) {
                qemu_rdma_advise_prefetch_mr(rdma->pd,
                                             (uintptr_t)blk->local_host_addr,
                                             blk->length,
                                             blk->mr->lkey,
                                             blk->block_name,
                                             true);
            }
        }

        if (!blk->mr) {
            perror("Failed to register local dest ram block!");
            goto rollback;
        }
        rdma->total_registrations++;
    }

    return 0;

rollback:
    /* Block i itself failed; undo blocks i-1 .. 0. */
    while (--i >= 0) {
        ibv_dereg_mr(local->block[i].mr);
        local->block[i].mr = NULL;
        rdma->total_registrations--;
    }

    return -1;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the ram block that corresponds to the page requested to be
|
|
|
|
* transmitted by QEMU.
|
|
|
|
*
|
|
|
|
* Once the block is found, also identify which 'chunk' within that
|
|
|
|
* block that the page belongs to.
|
|
|
|
*/
|
2023-09-28 16:19:39 +03:00
|
|
|
static void qemu_rdma_search_ram_block(RDMAContext *rdma,
|
|
|
|
uintptr_t block_offset,
|
|
|
|
uint64_t offset,
|
|
|
|
uint64_t length,
|
|
|
|
uint64_t *block_index,
|
|
|
|
uint64_t *chunk_index)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
uint64_t current_addr = block_offset + offset;
|
|
|
|
RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
|
|
|
|
(void *) block_offset);
|
|
|
|
assert(block);
|
|
|
|
assert(current_addr >= block->offset);
|
|
|
|
assert((current_addr + length) <= (block->offset + block->length));
|
|
|
|
|
|
|
|
*block_index = block->index;
|
|
|
|
*chunk_index = ram_chunk_index(block->local_host_addr,
|
|
|
|
block->local_host_addr + (current_addr - block->offset));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register a chunk with IB. If the chunk was already registered
|
|
|
|
* previously, then skip.
|
|
|
|
*
|
|
|
|
* Also return the keys associated with the registration needed
|
|
|
|
* to perform the actual RDMA operation.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
|
2015-02-28 21:09:42 +03:00
|
|
|
RDMALocalBlock *block, uintptr_t host_addr,
|
2013-07-22 18:01:54 +04:00
|
|
|
uint32_t *lkey, uint32_t *rkey, int chunk,
|
|
|
|
uint8_t *chunk_start, uint8_t *chunk_end)
|
|
|
|
{
|
|
|
|
if (block->mr) {
|
|
|
|
if (lkey) {
|
|
|
|
*lkey = block->mr->lkey;
|
|
|
|
}
|
|
|
|
if (rkey) {
|
|
|
|
*rkey = block->mr->rkey;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* allocate memory to store chunk MRs */
|
|
|
|
if (!block->pmr) {
|
2015-09-14 14:51:31 +03:00
|
|
|
block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If 'rkey', then we're the destination, so grant access to the source.
|
|
|
|
*
|
|
|
|
* If 'lkey', then we're the source VM, so grant access only to ourselves.
|
|
|
|
*/
|
|
|
|
if (!block->pmr[chunk]) {
|
|
|
|
uint64_t len = chunk_end - chunk_start;
|
2021-09-10 10:02:54 +03:00
|
|
|
int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
|
|
|
|
0;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_register_and_get_keys(len, chunk_start);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2021-09-10 10:02:54 +03:00
|
|
|
block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
|
migration/rdma: Fix or document problematic uses of errno
We use errno after calling Libibverbs functions that are not
documented to set errno (manual page does not mention errno), or where
the documentation is unclear ("returns [...] the value of errno on
failure"). While this could be read as "sets errno and returns it",
a glance at the source code[*] kills that hope:
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
return qp->context->ops.post_send(qp, wr, bad_wr);
}
The callback can be
static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad)
{
/* This version of driver supports RAW QP only.
* Posting WR is done directly in the application.
*/
return EOPNOTSUPP;
}
Neither of them touches errno.
One of these errno uses is easy to fix, so do that now. Several more
will go away later in the series; add temporary FIXME commments.
Three will remain; add TODO comments. TODO, not FIXME, because the
bug might be in Libibverbs documentation.
[*] https://github.com/linux-rdma/rdma-core.git
commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20230928132019.2544702-17-armbru@redhat.com>
2023-09-28 16:19:42 +03:00
|
|
|
/*
|
|
|
|
* ibv_reg_mr() is not documented to set errno. If it does,
|
|
|
|
* it's somebody else's doc bug. If it doesn't, the use of
|
|
|
|
* errno below is wrong.
|
|
|
|
* TODO Find out whether ibv_reg_mr() sets errno.
|
|
|
|
*/
|
2021-09-10 10:02:54 +03:00
|
|
|
if (!block->pmr[chunk] &&
|
|
|
|
errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
|
|
|
|
access |= IBV_ACCESS_ON_DEMAND;
|
|
|
|
/* register ODP mr */
|
|
|
|
block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
|
|
|
|
trace_qemu_rdma_register_odp_mr(block->block_name);
|
2021-09-10 10:02:55 +03:00
|
|
|
|
|
|
|
if (block->pmr[chunk]) {
|
|
|
|
qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
|
|
|
|
len, block->pmr[chunk]->lkey,
|
|
|
|
block->block_name, rkey);
|
|
|
|
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
2021-09-10 10:02:54 +03:00
|
|
|
if (!block->pmr[chunk]) {
|
|
|
|
perror("Failed to register chunk!");
|
|
|
|
fprintf(stderr, "Chunk details: block: %d chunk index %d"
|
|
|
|
" start %" PRIuPTR " end %" PRIuPTR
|
|
|
|
" host %" PRIuPTR
|
|
|
|
" local %" PRIuPTR " registrations: %d\n",
|
|
|
|
block->index, chunk, (uintptr_t)chunk_start,
|
|
|
|
(uintptr_t)chunk_end, host_addr,
|
|
|
|
(uintptr_t)block->local_host_addr,
|
|
|
|
rdma->total_registrations);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
rdma->total_registrations++;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (lkey) {
|
|
|
|
*lkey = block->pmr[chunk]->lkey;
|
|
|
|
}
|
|
|
|
if (rkey) {
|
|
|
|
*rkey = block->pmr[chunk]->rkey;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register (at connection time) the memory used for control
|
|
|
|
* channel messages.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
|
|
|
|
{
|
|
|
|
rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
|
|
|
|
rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
|
|
|
|
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
|
|
|
|
if (rdma->wr_data[idx].control_mr) {
|
|
|
|
rdma->total_registrations++;
|
|
|
|
return 0;
|
|
|
|
}
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("qemu_rdma_reg_control failed");
|
2013-07-22 18:01:54 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform a non-optimized memory unregistration after every transfer
|
2015-06-11 20:17:20 +03:00
|
|
|
* for demonstration purposes, only if pin-all is not requested.
|
2013-07-22 18:01:54 +04:00
|
|
|
*
|
|
|
|
* Potential optimizations:
|
|
|
|
* 1. Start a new thread to run this function continuously
|
|
|
|
- for bit clearing
|
|
|
|
- and for receipt of unregister messages
|
|
|
|
* 2. Use an LRU.
|
|
|
|
* 3. Use workload hints.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
|
|
|
|
{
|
|
|
|
while (rdma->unregistrations[rdma->unregister_current]) {
|
|
|
|
int ret;
|
|
|
|
uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
|
|
|
|
uint64_t chunk =
|
|
|
|
(wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
|
|
|
|
uint64_t index =
|
|
|
|
(wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
|
|
|
|
RDMALocalBlock *block =
|
|
|
|
&(rdma->local_ram_blocks.block[index]);
|
|
|
|
RDMARegister reg = { .current_index = index };
|
|
|
|
RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
|
|
|
|
};
|
|
|
|
RDMAControlHeader head = { .len = sizeof(RDMARegister),
|
|
|
|
.type = RDMA_CONTROL_UNREGISTER_REQUEST,
|
|
|
|
.repeat = 1,
|
|
|
|
};
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_unregister_waiting_proc(chunk,
|
|
|
|
rdma->unregister_current);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
rdma->unregistrations[rdma->unregister_current] = 0;
|
|
|
|
rdma->unregister_current++;
|
|
|
|
|
|
|
|
if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
|
|
|
|
rdma->unregister_current = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Unregistration is speculative (because migration is single-threaded
|
|
|
|
* and we cannot break the protocol's inifinband message ordering).
|
|
|
|
* Thus, if the memory is currently being used for transmission,
|
|
|
|
* then abort the attempt to unregister and try again
|
|
|
|
* later the next time a completion is received for this memory.
|
|
|
|
*/
|
|
|
|
clear_bit(chunk, block->unregister_bitmap);
|
|
|
|
|
|
|
|
if (test_bit(chunk, block->transit_bitmap)) {
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_unregister_waiting_inflight(chunk);
|
2013-07-22 18:01:54 +04:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_unregister_waiting_send(chunk);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
ret = ibv_dereg_mr(block->pmr[chunk]);
|
|
|
|
block->pmr[chunk] = NULL;
|
|
|
|
block->remote_keys[chunk] = 0;
|
|
|
|
|
|
|
|
if (ret != 0) {
|
migration/rdma: Fix or document problematic uses of errno
We use errno after calling Libibverbs functions that are not
documented to set errno (manual page does not mention errno), or where
the documentation is unclear ("returns [...] the value of errno on
failure"). While this could be read as "sets errno and returns it",
a glance at the source code[*] kills that hope:
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
return qp->context->ops.post_send(qp, wr, bad_wr);
}
The callback can be
static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad)
{
/* This version of driver supports RAW QP only.
* Posting WR is done directly in the application.
*/
return EOPNOTSUPP;
}
Neither of them touches errno.
One of these errno uses is easy to fix, so do that now. Several more
will go away later in the series; add temporary FIXME commments.
Three will remain; add TODO comments. TODO, not FIXME, because the
bug might be in Libibverbs documentation.
[*] https://github.com/linux-rdma/rdma-core.git
commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20230928132019.2544702-17-armbru@redhat.com>
2023-09-28 16:19:42 +03:00
|
|
|
/*
|
|
|
|
* FIXME perror() is problematic, bcause ibv_dereg_mr() is
|
|
|
|
* not documented to set errno. Will go away later in
|
|
|
|
* this series.
|
|
|
|
*/
|
2013-07-22 18:01:54 +04:00
|
|
|
perror("unregistration chunk failed");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
rdma->total_registrations--;
|
|
|
|
|
|
|
|
reg.key.chunk = chunk;
|
2015-06-11 20:17:22 +03:00
|
|
|
register_to_network(rdma, ®);
|
2013-07-22 18:01:54 +04:00
|
|
|
ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®,
|
|
|
|
&resp, NULL, NULL);
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_unregister_waiting_complete(chunk);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
|
|
|
|
uint64_t chunk)
|
|
|
|
{
|
|
|
|
uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
|
|
|
|
|
|
|
|
result |= (index << RDMA_WRID_BLOCK_SHIFT);
|
|
|
|
result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Consult the connection manager to see a work request
|
|
|
|
* (of any kind) has completed.
|
|
|
|
* Return the work request ID that completed.
|
|
|
|
*/
|
2023-09-28 16:19:27 +03:00
|
|
|
static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
|
|
|
|
uint64_t *wr_id_out, uint32_t *byte_len)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct ibv_wc wc;
|
|
|
|
uint64_t wr_id;
|
|
|
|
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
ret = ibv_poll_cq(cq, 1, &wc);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (!ret) {
|
|
|
|
*wr_id_out = RDMA_WRID_NONE;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:43 +03:00
|
|
|
error_report("ibv_poll_cq failed");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
|
|
|
|
|
|
|
|
if (wc.status != IBV_WC_SUCCESS) {
|
|
|
|
fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
|
|
|
|
wc.status, ibv_wc_status_str(wc.status));
|
2023-09-28 16:19:30 +03:00
|
|
|
fprintf(stderr, "ibv_poll_cq wrid=%" PRIu64 "!\n", wr_id);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rdma->control_ready_expected &&
|
|
|
|
(wr_id >= RDMA_WRID_RECV_CONTROL)) {
|
2023-09-28 16:19:30 +03:00
|
|
|
trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
|
|
|
|
rdma->nb_sent);
|
2013-07-22 18:01:54 +04:00
|
|
|
rdma->control_ready_expected = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (wr_id == RDMA_WRID_RDMA_WRITE) {
|
|
|
|
uint64_t chunk =
|
|
|
|
(wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
|
|
|
|
uint64_t index =
|
|
|
|
(wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
|
|
|
|
RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
|
|
|
|
|
2023-09-28 16:19:30 +03:00
|
|
|
trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent,
|
2015-02-28 21:09:43 +03:00
|
|
|
index, chunk, block->local_host_addr,
|
|
|
|
(void *)(uintptr_t)block->remote_host_addr);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
clear_bit(chunk, block->transit_bitmap);
|
|
|
|
|
|
|
|
if (rdma->nb_sent > 0) {
|
|
|
|
rdma->nb_sent--;
|
|
|
|
}
|
|
|
|
} else {
|
2023-09-28 16:19:30 +03:00
|
|
|
trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
*wr_id_out = wc.wr_id;
|
2013-08-10 00:05:42 +04:00
|
|
|
if (byte_len) {
|
|
|
|
*byte_len = wc.byte_len;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-07-17 14:09:34 +03:00
|
|
|
/* Wait for activity on the completion channel.
|
|
|
|
* Returns 0 on success, non-zero on error.
|
|
|
|
*/
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in about a hundred of my runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
/*
 * Wait for activity on @comp_channel while staying responsive to errors
 * and to a migration 'cancel'.
 *
 * When rdma->migration_started_on_destination is set and the incoming
 * migration state is MIGRATION_STATUS_ACTIVE, the wait is done with
 * yield_until_fd_readable().  Otherwise @comp_channel's fd is polled
 * together with the rdma->channel fd in 0.1s slices.
 *
 * Returns 0 as soon as the polled completion channel fd reports events.
 * Returns -1 when polling fails, when a CM event cannot be fetched, when
 * the CM event is a disconnect or device removal, when the outgoing
 * migration state is MIGRATION_STATUS_CANCELLING, or when
 * rdma->received_error is set.  In every other case the result is
 * -rdma->errored.
 */
static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
                                       struct ibv_comp_channel *comp_channel)
{
    struct rdma_cm_event *cm_event;
    int event;

    /*
     * Coroutine doesn't start until migration_fd_process_incoming()
     * so don't yield unless we know we're running inside of a coroutine.
     */
    if (rdma->migration_started_on_destination &&
        migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
        yield_until_fd_readable(comp_channel->fd);
    } else {
        /*
         * This is the source side, we're in a separate thread
         * or destination prior to migration_fd_process_incoming()
         * after postcopy, the destination also in a separate thread.
         * we can't yield; so we have to poll the fd.
         * But we need to be able to handle 'cancel' or an error
         * without hanging forever.
         */
        while (!rdma->errored && !rdma->received_error) {
            GPollFD pfds[2];

            /* pfds[0]: the completion channel the caller is waiting on */
            pfds[0].fd = comp_channel->fd;
            pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[0].revents = 0;

            /* pfds[1]: the CM event channel, watched for connection events */
            pfds[1].fd = rdma->channel->fd;
            pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
            pfds[1].revents = 0;

            /* 0.1s timeout, should be fine for a 'cancel' */
            switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
            case 2:
            case 1: /* fd active */
                if (pfds[0].revents) {
                    return 0;
                }

                if (pfds[1].revents) {
                    if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
                        error_report("failed to get cm event while wait "
                                     "completion channel");
                        return -1;
                    }

                    /*
                     * Remember the event type so the event is acknowledged
                     * exactly once, before deciding whether it is fatal.
                     */
                    event = cm_event->event;
                    error_report("receive cm event while wait comp channel, "
                                 "cm event is %d", event);
                    rdma_ack_cm_event(cm_event);

                    if (event == RDMA_CM_EVENT_DISCONNECTED ||
                        event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
                        return -1;
                    }
                }
                break;

            case 0: /* Timeout, go around again */
                break;

            default: /*
                      * Error of some type -
                      * I don't trust errno from qemu_poll_ns
                      */
                error_report("%s: poll failed", __func__);
                return -1;
            }

            if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
                /* Bail out and let the cancellation happen */
                return -1;
            }
        }
    }

    if (rdma->received_error) {
        return -1;
    }
    return -rdma->errored;
}
|
|
|
|
|
2023-09-28 16:19:31 +03:00
|
|
|
/*
 * Select the completion channel that serves work request id @wrid:
 * ids below RDMA_WRID_RECV_CONTROL use rdma->send_comp_channel, all
 * other ids use rdma->recv_comp_channel.
 */
static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid)
{
    if (wrid < RDMA_WRID_RECV_CONTROL) {
        return rdma->send_comp_channel;
    }

    return rdma->recv_comp_channel;
}
|
|
|
|
|
2023-09-28 16:19:31 +03:00
|
|
|
/*
 * Select the completion queue that serves work request id @wrid:
 * ids below RDMA_WRID_RECV_CONTROL use rdma->send_cq, all other ids
 * use rdma->recv_cq.
 */
static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid)
{
    if (wrid < RDMA_WRID_RECV_CONTROL) {
        return rdma->send_cq;
    }

    return rdma->recv_cq;
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
|
|
|
|
* Block until the next work request has completed.
|
|
|
|
*
|
|
|
|
* First poll to see if a work request has already completed,
|
|
|
|
* otherwise block.
|
|
|
|
*
|
|
|
|
* If we encounter completed work requests for IDs other than
|
|
|
|
* the one we're interested in, then that's generally an error.
|
|
|
|
*
|
|
|
|
* The only exception is actual RDMA Write completions. These
|
|
|
|
* completions only need to be recorded, but do not actually
|
|
|
|
* need further processing.
|
|
|
|
*/
|
2023-09-28 16:19:31 +03:00
|
|
|
static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
|
|
|
|
uint64_t wrid_requested,
|
2013-08-10 00:05:42 +04:00
|
|
|
uint32_t *byte_len)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
2023-09-28 16:19:54 +03:00
|
|
|
int num_cq_events = 0, ret;
|
2013-07-22 18:01:54 +04:00
|
|
|
struct ibv_cq *cq;
|
|
|
|
void *cq_ctx;
|
|
|
|
uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases can occur on both the source side and the destination side. Blocking
forever happens only when SEND and RECV are out of order; OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in a hundred of my runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
|
|
|
|
struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in my hundred or so runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
if (ibv_req_notify_cq(poll_cq, 0)) {
|
2013-07-22 18:01:54 +04:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
/* poll cq first */
|
|
|
|
while (wr_id != wrid_requested) {
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in my hundred or so runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
|
|
|
|
|
|
|
|
if (wr_id == RDMA_WRID_NONE) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (wr_id != wrid_requested) {
|
2023-09-28 16:19:30 +03:00
|
|
|
trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (wr_id == wrid_requested) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (1) {
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in my hundred or so runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
ret = qemu_rdma_wait_comp_channel(rdma, ch);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2017-07-17 14:09:34 +03:00
|
|
|
goto err_block_for_wrid;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in my hundred or so runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
migration/rdma: Fix or document problematic uses of errno
We use errno after calling Libibverbs functions that are not
documented to set errno (manual page does not mention errno), or where
the documentation is unclear ("returns [...] the value of errno on
failure"). While this could be read as "sets errno and returns it",
a glance at the source code[*] kills that hope:
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
return qp->context->ops.post_send(qp, wr, bad_wr);
}
The callback can be
static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad)
{
/* This version of driver supports RAW QP only.
* Posting WR is done directly in the application.
*/
return EOPNOTSUPP;
}
Neither of them touches errno.
One of these errno uses is easy to fix, so do that now. Several more
will go away later in the series; add temporary FIXME comments.
Three will remain; add TODO comments. TODO, not FIXME, because the
bug might be in Libibverbs documentation.
[*] https://github.com/linux-rdma/rdma-core.git
commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20230928132019.2544702-17-armbru@redhat.com>
2023-09-28 16:19:42 +03:00
|
|
|
/*
|
|
|
|
* FIXME perror() is problematic, because ibv_get_cq_event() is
|
|
|
|
* not documented to set errno. Will go away later in
|
|
|
|
* this series.
|
|
|
|
*/
|
2013-07-22 18:01:54 +04:00
|
|
|
perror("ibv_get_cq_event");
|
|
|
|
goto err_block_for_wrid;
|
|
|
|
}
|
|
|
|
|
|
|
|
num_cq_events++;
|
|
|
|
|
2023-09-28 16:19:54 +03:00
|
|
|
if (ibv_req_notify_cq(cq, 0)) {
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_block_for_wrid;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (wr_id != wrid_requested) {
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in my hundred or so runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
|
|
|
goto err_block_for_wrid;
|
|
|
|
}
|
|
|
|
|
|
|
|
wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
|
|
|
|
|
|
|
|
if (wr_id == RDMA_WRID_NONE) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (wr_id != wrid_requested) {
|
2023-09-28 16:19:30 +03:00
|
|
|
trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (wr_id == wrid_requested) {
|
|
|
|
goto success_block_for_wrid;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
success_block_for_wrid:
|
|
|
|
if (num_cq_events) {
|
|
|
|
ibv_ack_cq_events(cq, num_cq_events);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_block_for_wrid:
|
|
|
|
if (num_cq_events) {
|
|
|
|
ibv_ack_cq_events(cq, num_cq_events);
|
|
|
|
}
|
2017-07-17 14:09:33 +03:00
|
|
|
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Post a SEND message work request for the control channel
|
|
|
|
* containing some data and block until the post completes.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
|
|
|
|
RDMAControlHeader *head)
|
|
|
|
{
|
2023-09-28 16:19:54 +03:00
|
|
|
int ret;
|
2013-08-04 06:54:52 +04:00
|
|
|
RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
|
2013-07-22 18:01:54 +04:00
|
|
|
struct ibv_send_wr *bad_wr;
|
|
|
|
struct ibv_sge sge = {
|
2015-02-28 21:09:43 +03:00
|
|
|
.addr = (uintptr_t)(wr->control),
|
2013-07-22 18:01:54 +04:00
|
|
|
.length = head->len + sizeof(RDMAControlHeader),
|
|
|
|
.lkey = wr->control_mr->lkey,
|
|
|
|
};
|
|
|
|
struct ibv_send_wr send_wr = {
|
|
|
|
.wr_id = RDMA_WRID_SEND_CONTROL,
|
|
|
|
.opcode = IBV_WR_SEND,
|
|
|
|
.send_flags = IBV_SEND_SIGNALED,
|
|
|
|
.sg_list = &sge,
|
|
|
|
.num_sge = 1,
|
|
|
|
};
|
|
|
|
|
2017-07-17 14:09:35 +03:00
|
|
|
trace_qemu_rdma_post_send_control(control_desc(head->type));
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't actually need to do a memcpy() in here if we used
|
|
|
|
* the "sge" properly, but since we're only sending control messages
|
|
|
|
* (not RAM in a performance-critical path), then its OK for now.
|
|
|
|
*
|
|
|
|
* The copy makes the RDMAControlHeader simpler to manipulate
|
|
|
|
* for the time being.
|
|
|
|
*/
|
2013-08-10 00:05:41 +04:00
|
|
|
assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
|
2013-07-22 18:01:54 +04:00
|
|
|
memcpy(wr->control, head, sizeof(RDMAControlHeader));
|
|
|
|
control_to_network((void *) wr->control);
|
|
|
|
|
|
|
|
if (buf) {
|
|
|
|
memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-02-18 06:34:06 +04:00
|
|
|
ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2014-02-18 06:34:06 +04:00
|
|
|
if (ret > 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Failed to use post IB SEND for control");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2013-08-10 00:05:42 +04:00
|
|
|
ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: send polling control error");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:52 +03:00
|
|
|
return 0;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Post a RECV work request in anticipation of some future receipt
|
|
|
|
* of data on the control channel.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
|
|
|
|
{
|
|
|
|
struct ibv_recv_wr *bad_wr;
|
|
|
|
struct ibv_sge sge = {
|
2015-02-28 21:09:43 +03:00
|
|
|
.addr = (uintptr_t)(rdma->wr_data[idx].control),
|
2013-07-22 18:01:54 +04:00
|
|
|
.length = RDMA_CONTROL_MAX_BUFFER,
|
|
|
|
.lkey = rdma->wr_data[idx].control_mr->lkey,
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ibv_recv_wr recv_wr = {
|
|
|
|
.wr_id = RDMA_WRID_RECV_CONTROL + idx,
|
|
|
|
.sg_list = &sge,
|
|
|
|
.num_sge = 1,
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Block and wait for a RECV control channel message to arrive.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
|
2023-09-28 16:19:33 +03:00
|
|
|
RDMAControlHeader *head, uint32_t expecting, int idx)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
2013-08-10 00:05:42 +04:00
|
|
|
uint32_t byte_len;
|
|
|
|
int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
|
|
|
|
&byte_len);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: recv polling control error!");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
network_to_control((void *) rdma->wr_data[idx].control);
|
|
|
|
memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
|
|
|
|
|
2017-07-17 14:09:35 +03:00
|
|
|
trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (expecting == RDMA_CONTROL_NONE) {
|
2017-07-17 14:09:35 +03:00
|
|
|
trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
|
2015-02-02 22:53:33 +03:00
|
|
|
head->type);
|
2013-07-22 18:01:54 +04:00
|
|
|
} else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Was expecting a %s (%d) control message"
|
|
|
|
", but got: %s (%d), length: %d",
|
2017-07-17 14:09:35 +03:00
|
|
|
control_desc(expecting), expecting,
|
|
|
|
control_desc(head->type), head->type, head->len);
|
2016-09-23 22:14:04 +03:00
|
|
|
if (head->type == RDMA_CONTROL_ERROR) {
|
|
|
|
rdma->received_error = true;
|
|
|
|
}
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
2013-08-10 00:05:41 +04:00
|
|
|
if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
|
2015-02-25 07:22:31 +03:00
|
|
|
error_report("too long length: %d", head->len);
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-08-10 00:05:41 +04:00
|
|
|
}
|
2013-08-10 00:05:42 +04:00
|
|
|
if (sizeof(*head) + head->len != byte_len) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Malformed length: %d byte_len %d", head->len, byte_len);
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-08-10 00:05:42 +04:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When a RECV work request has completed, the work request's
|
|
|
|
* buffer is pointed at the header.
|
|
|
|
*
|
|
|
|
* This will advance the pointer to the data portion
|
|
|
|
* of the control message of the work request's buffer that
|
|
|
|
* was populated after the work request finished.
|
|
|
|
*/
|
|
|
|
static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
|
|
|
|
RDMAControlHeader *head)
|
|
|
|
{
|
|
|
|
rdma->wr_data[idx].control_len = head->len;
|
|
|
|
rdma->wr_data[idx].control_curr =
|
|
|
|
rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is an 'atomic' high-level operation to deliver a single, unified
|
|
|
|
* control-channel message.
|
|
|
|
*
|
|
|
|
* Additionally, if the user is expecting some kind of reply to this message,
|
|
|
|
* they can request a 'resp' response message be filled in by posting an
|
|
|
|
* additional work request on behalf of the user and waiting for an additional
|
|
|
|
* completion.
|
|
|
|
*
|
|
|
|
* The extra (optional) response is used during registration to avoid having
|
|
|
|
* to perform an *additional* exchange of message just to provide a response by
|
|
|
|
* instead piggy-backing on the acknowledgement.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
|
|
|
|
uint8_t *data, RDMAControlHeader *resp,
|
|
|
|
int *resp_idx,
|
|
|
|
int (*callback)(RDMAContext *rdma))
|
|
|
|
{
|
2023-09-28 16:19:54 +03:00
|
|
|
int ret;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Wait until the dest is ready before attempting to deliver the message
|
|
|
|
* by waiting for a READY message.
|
|
|
|
*/
|
|
|
|
if (rdma->control_ready_expected) {
|
2023-09-21 15:13:07 +03:00
|
|
|
RDMAControlHeader resp_ignored;
|
|
|
|
|
|
|
|
ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored,
|
|
|
|
RDMA_CONTROL_READY,
|
|
|
|
RDMA_WRID_READY);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the user is expecting a response, post a WR in anticipation of it.
|
|
|
|
*/
|
|
|
|
if (resp) {
|
|
|
|
ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: error posting"
|
2013-07-22 18:01:54 +04:00
|
|
|
" extra control recv for anticipated result!");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Post a WR to replace the one we just consumed for the READY message.
|
|
|
|
*/
|
|
|
|
ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: error posting first control recv!");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Deliver the control message that was requested.
|
|
|
|
*/
|
|
|
|
ret = qemu_rdma_post_send_control(rdma, data, head);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Failed to send control buffer!");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we're expecting a response, block and wait for it.
|
|
|
|
*/
|
|
|
|
if (resp) {
|
|
|
|
if (callback) {
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_exchange_send_issue_callback();
|
2013-07-22 18:01:54 +04:00
|
|
|
ret = callback(rdma);
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-07-17 14:09:35 +03:00
|
|
|
trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
|
2013-07-22 18:01:54 +04:00
|
|
|
ret = qemu_rdma_exchange_get_response(rdma, resp,
|
|
|
|
resp->type, RDMA_WRID_DATA);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
|
|
|
|
if (resp_idx) {
|
|
|
|
*resp_idx = RDMA_WRID_DATA;
|
|
|
|
}
|
2017-07-17 14:09:35 +03:00
|
|
|
trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
rdma->control_ready_expected = 1;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is an 'atomic' high-level operation to receive a single, unified
|
|
|
|
* control-channel message.
|
|
|
|
*/
|
|
|
|
static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
|
2023-09-28 16:19:33 +03:00
|
|
|
uint32_t expecting)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
RDMAControlHeader ready = {
|
|
|
|
.len = 0,
|
|
|
|
.type = RDMA_CONTROL_READY,
|
|
|
|
.repeat = 1,
|
|
|
|
};
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inform the source that we're ready to receive a message.
|
|
|
|
*/
|
|
|
|
ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Failed to send control buffer!");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Block and wait for the message.
|
|
|
|
*/
|
|
|
|
ret = qemu_rdma_exchange_get_response(rdma, head,
|
|
|
|
expecting, RDMA_WRID_READY);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Post a new RECV work request to replace the one we just consumed.
|
|
|
|
*/
|
|
|
|
ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: error posting second control recv!");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Write an actual chunk of memory using RDMA.
|
|
|
|
*
|
|
|
|
* If we're using dynamic registration on the dest-side, we have to
|
|
|
|
* send a registration command first.
|
|
|
|
*/
|
2023-05-15 22:57:05 +03:00
|
|
|
static int qemu_rdma_write_one(RDMAContext *rdma,
|
2013-07-22 18:01:54 +04:00
|
|
|
int current_index, uint64_t current_addr,
|
|
|
|
uint64_t length)
|
|
|
|
{
|
|
|
|
struct ibv_sge sge;
|
|
|
|
struct ibv_send_wr send_wr = { 0 };
|
|
|
|
struct ibv_send_wr *bad_wr;
|
|
|
|
int reg_result_idx, ret, count = 0;
|
|
|
|
uint64_t chunk, chunks;
|
|
|
|
uint8_t *chunk_start, *chunk_end;
|
|
|
|
RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
|
|
|
|
RDMARegister reg;
|
|
|
|
RDMARegisterResult *reg_result;
|
|
|
|
RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
|
|
|
|
RDMAControlHeader head = { .len = sizeof(RDMARegister),
|
|
|
|
.type = RDMA_CONTROL_REGISTER_REQUEST,
|
|
|
|
.repeat = 1,
|
|
|
|
};
|
|
|
|
|
|
|
|
retry:
|
2015-02-28 21:09:43 +03:00
|
|
|
sge.addr = (uintptr_t)(block->local_host_addr +
|
2013-07-22 18:01:54 +04:00
|
|
|
(current_addr - block->offset));
|
|
|
|
sge.length = length;
|
|
|
|
|
2015-02-28 21:09:43 +03:00
|
|
|
chunk = ram_chunk_index(block->local_host_addr,
|
|
|
|
(uint8_t *)(uintptr_t)sge.addr);
|
2013-07-22 18:01:54 +04:00
|
|
|
chunk_start = ram_chunk_start(block, chunk);
|
|
|
|
|
|
|
|
if (block->is_ram_block) {
|
|
|
|
chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
|
|
|
|
|
|
|
|
if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
|
|
|
|
chunks--;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
|
|
|
|
|
|
|
|
if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
|
|
|
|
chunks--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_one_top(chunks + 1,
|
|
|
|
(chunks + 1) *
|
|
|
|
(1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
chunk_end = ram_chunk_end(block, chunk + chunks);
|
|
|
|
|
|
|
|
|
|
|
|
while (test_bit(chunk, block->transit_bitmap)) {
|
|
|
|
(void)count;
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_one_block(count++, current_index, chunk,
|
2013-07-22 18:01:54 +04:00
|
|
|
sge.addr, length, rdma->nb_sent, block->nb_chunks);
|
|
|
|
|
2013-08-10 00:05:42 +04:00
|
|
|
ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Failed to Wait for previous write to complete "
|
2013-07-22 18:01:54 +04:00
|
|
|
"block %d chunk %" PRIu64
|
2015-02-02 22:53:33 +03:00
|
|
|
" current %" PRIu64 " len %" PRIu64 " %d",
|
2013-07-22 18:01:54 +04:00
|
|
|
current_index, chunk, sge.addr, length, rdma->nb_sent);
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!rdma->pin_all || !block->is_ram_block) {
|
|
|
|
if (!block->remote_keys[chunk]) {
|
|
|
|
/*
|
|
|
|
* This chunk has not yet been registered, so first check to see
|
|
|
|
* if the entire chunk is zero. If so, tell the other size to
|
|
|
|
* memset() + madvise() the entire chunk without RDMA.
|
|
|
|
*/
|
|
|
|
|
2016-08-29 21:46:14 +03:00
|
|
|
if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
|
2013-07-22 18:01:54 +04:00
|
|
|
RDMACompress comp = {
|
|
|
|
.offset = current_addr,
|
|
|
|
.value = 0,
|
|
|
|
.block_idx = current_index,
|
|
|
|
.length = length,
|
|
|
|
};
|
|
|
|
|
|
|
|
head.len = sizeof(comp);
|
|
|
|
head.type = RDMA_CONTROL_COMPRESS;
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_one_zero(chunk, sge.length,
|
|
|
|
current_index, current_addr);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-06-11 20:17:22 +03:00
|
|
|
compress_to_network(rdma, &comp);
|
2013-07-22 18:01:54 +04:00
|
|
|
ret = qemu_rdma_exchange_send(rdma, &head,
|
|
|
|
(uint8_t *) &comp, NULL, NULL, NULL);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2023-05-15 22:57:03 +03:00
|
|
|
/*
|
|
|
|
* TODO: Here we are sending something, but we are not
|
|
|
|
* accounting for anything transferred. The following is wrong:
|
|
|
|
*
|
|
|
|
* stat64_add(&mig_stats.rdma_bytes, sge.length);
|
|
|
|
*
|
|
|
|
* because we are using some kind of compression. I
|
|
|
|
* would think that head.len would be the more similar
|
|
|
|
* thing to a correct value.
|
|
|
|
*/
|
2023-04-27 11:35:26 +03:00
|
|
|
stat64_add(&mig_stats.zero_pages,
|
|
|
|
sge.length / qemu_target_page_size());
|
2013-07-22 18:01:54 +04:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise, tell other side to register.
|
|
|
|
*/
|
|
|
|
reg.current_index = current_index;
|
|
|
|
if (block->is_ram_block) {
|
|
|
|
reg.key.current_addr = current_addr;
|
|
|
|
} else {
|
|
|
|
reg.key.chunk = chunk;
|
|
|
|
}
|
|
|
|
reg.chunks = chunks;
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
|
|
|
|
current_addr);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-06-11 20:17:22 +03:00
|
|
|
register_to_network(rdma, ®);
|
2013-07-22 18:01:54 +04:00
|
|
|
ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®,
|
|
|
|
&resp, ®_result_idx, NULL);
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* try to overlap this single registration with the one we sent. */
|
2015-02-28 21:09:42 +03:00
|
|
|
if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
|
2013-07-22 18:01:54 +04:00
|
|
|
&sge.lkey, NULL, chunk,
|
|
|
|
chunk_start, chunk_end)) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("cannot get lkey");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
reg_result = (RDMARegisterResult *)
|
|
|
|
rdma->wr_data[reg_result_idx].control_curr;
|
|
|
|
|
|
|
|
network_to_result(reg_result);
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
|
|
|
|
reg_result->rkey, chunk);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
block->remote_keys[chunk] = reg_result->rkey;
|
|
|
|
block->remote_host_addr = reg_result->host_addr;
|
|
|
|
} else {
|
|
|
|
/* already registered before */
|
2015-02-28 21:09:42 +03:00
|
|
|
if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
|
2013-07-22 18:01:54 +04:00
|
|
|
&sge.lkey, NULL, chunk,
|
|
|
|
chunk_start, chunk_end)) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("cannot get lkey!");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
send_wr.wr.rdma.rkey = block->remote_keys[chunk];
|
|
|
|
} else {
|
|
|
|
send_wr.wr.rdma.rkey = block->remote_rkey;
|
|
|
|
|
2015-02-28 21:09:42 +03:00
|
|
|
if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
|
2013-07-22 18:01:54 +04:00
|
|
|
&sge.lkey, NULL, chunk,
|
|
|
|
chunk_start, chunk_end)) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("cannot get lkey!");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Encode the ram block index and chunk within this wrid.
|
|
|
|
* We will use this information at the time of completion
|
|
|
|
* to figure out which bitmap to check against and then which
|
|
|
|
* chunk in the bitmap to look for.
|
|
|
|
*/
|
|
|
|
send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
|
|
|
|
current_index, chunk);
|
|
|
|
|
|
|
|
send_wr.opcode = IBV_WR_RDMA_WRITE;
|
|
|
|
send_wr.send_flags = IBV_SEND_SIGNALED;
|
|
|
|
send_wr.sg_list = &sge;
|
|
|
|
send_wr.num_sge = 1;
|
|
|
|
send_wr.wr.rdma.remote_addr = block->remote_host_addr +
|
|
|
|
(current_addr - block->offset);
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
|
|
|
|
sge.length);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* ibv_post_send() does not return negative error numbers,
|
|
|
|
* per the specification they are positive - no idea why.
|
|
|
|
*/
|
|
|
|
ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
|
|
|
|
|
|
|
|
if (ret == ENOMEM) {
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_one_queue_full();
|
2013-08-10 00:05:42 +04:00
|
|
|
ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: failed to make "
|
2023-09-28 16:19:43 +03:00
|
|
|
"room in full send queue!");
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
goto retry;
|
|
|
|
|
|
|
|
} else if (ret > 0) {
|
migration/rdma: Fix or document problematic uses of errno
We use errno after calling Libibverbs functions that are not
documented to set errno (manual page does not mention errno), or where
the documentation is unclear ("returns [...] the value of errno on
failure"). While this could be read as "sets errno and returns it",
a glance at the source code[*] kills that hope:
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
return qp->context->ops.post_send(qp, wr, bad_wr);
}
The callback can be
static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad)
{
/* This version of driver supports RAW QP only.
* Posting WR is done directly in the application.
*/
return EOPNOTSUPP;
}
Neither of them touches errno.
One of these errno uses is easy to fix, so do that now. Several more
will go away later in the series; add temporary FIXME commments.
Three will remain; add TODO comments. TODO, not FIXME, because the
bug might be in Libibverbs documentation.
[*] https://github.com/linux-rdma/rdma-core.git
commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20230928132019.2544702-17-armbru@redhat.com>
2023-09-28 16:19:42 +03:00
|
|
|
/*
|
|
|
|
* FIXME perror() is problematic, because whether
|
|
|
|
* ibv_post_send() sets errno is unclear. Will go away later
|
|
|
|
* in this series.
|
|
|
|
*/
|
2013-07-22 18:01:54 +04:00
|
|
|
perror("rdma migration: post rdma write failed");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
set_bit(chunk, block->transit_bitmap);
|
2023-04-27 18:12:03 +03:00
|
|
|
stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size());
|
2023-05-15 22:57:03 +03:00
|
|
|
/*
|
|
|
|
* We are adding to transferred the amount of data written, but no
|
|
|
|
* overhead at all. I will asume that RDMA is magicaly and don't
|
|
|
|
* need to transfer (at least) the addresses where it wants to
|
|
|
|
* write the pages. Here it looks like it should be something
|
|
|
|
* like:
|
|
|
|
* sizeof(send_wr) + sge.length
|
|
|
|
* but this being RDMA, who knows.
|
|
|
|
*/
|
|
|
|
stat64_add(&mig_stats.rdma_bytes, sge.length);
|
2023-04-27 18:12:03 +03:00
|
|
|
ram_transferred_add(sge.length);
|
2013-07-22 18:01:54 +04:00
|
|
|
rdma->total_writes++;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Push out any unwritten RDMA operations.
|
|
|
|
*
|
|
|
|
* We support sending out multiple chunks at the same time.
|
|
|
|
* Not all of them need to get signaled in the completion queue.
|
|
|
|
*/
|
2023-05-15 22:57:05 +03:00
|
|
|
static int qemu_rdma_write_flush(RDMAContext *rdma)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!rdma->current_length) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-05-15 22:57:05 +03:00
|
|
|
ret = qemu_rdma_write_one(rdma,
|
2013-07-22 18:01:54 +04:00
|
|
|
rdma->current_index, rdma->current_addr, rdma->current_length);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 0) {
|
|
|
|
rdma->nb_sent++;
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_write_flush(rdma->nb_sent);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
rdma->current_length = 0;
|
|
|
|
rdma->current_addr = 0;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:40 +03:00
|
|
|
/*
 * Decide whether the buffer [@offset, @offset + @len) can be appended to
 * the write that is currently being accumulated in @rdma.
 *
 * Merging is possible only when a current block and chunk are selected,
 * the pending write is non-empty, the new buffer starts exactly where
 * the pending write ends, it lies inside the current block, and it does
 * not run past the end of the current chunk.
 */
static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma,
                                              uint64_t offset, uint64_t len)
{
    RDMALocalBlock *blk;
    uint8_t *buf_start;
    uint8_t *chunk_limit;

    /* No block or chunk selected yet. */
    if (rdma->current_index < 0 || rdma->current_chunk < 0) {
        return false;
    }

    blk = &rdma->local_ram_blocks.block[rdma->current_index];
    buf_start = blk->local_host_addr + (offset - blk->offset);
    chunk_limit = ram_chunk_end(blk, rdma->current_chunk);

    /* Nothing pending, or not sequential with the pending write. */
    if (rdma->current_length == 0 ||
        offset != (rdma->current_addr + rdma->current_length)) {
        return false;
    }

    /* The buffer must lie entirely within the current block. */
    if (offset < blk->offset ||
        (offset + len) > (blk->offset + blk->length)) {
        return false;
    }

    /* ... and must not cross the end of the current chunk. */
    return (buf_start + len) <= chunk_limit;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're not actually writing here, but doing three things:
|
|
|
|
*
|
|
|
|
* 1. Identify the chunk the buffer belongs to.
|
|
|
|
* 2. If the chunk is full or the buffer doesn't belong to the current
|
|
|
|
* chunk, then start a new chunk and flush() the old chunk.
|
|
|
|
* 3. To keep the hardware busy, we also group chunks into batches
|
|
|
|
* and only require that a batch gets acknowledged in the completion
|
2020-09-17 10:50:21 +03:00
|
|
|
* queue instead of each individual chunk.
|
2013-07-22 18:01:54 +04:00
|
|
|
*/
|
2023-05-15 22:57:05 +03:00
|
|
|
static int qemu_rdma_write(RDMAContext *rdma,
|
2013-07-22 18:01:54 +04:00
|
|
|
uint64_t block_offset, uint64_t offset,
|
|
|
|
uint64_t len)
|
|
|
|
{
|
|
|
|
uint64_t current_addr = block_offset + offset;
|
|
|
|
uint64_t index = rdma->current_index;
|
|
|
|
uint64_t chunk = rdma->current_chunk;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* If we cannot merge it, we flush the current buffer first. */
|
2023-09-28 16:19:40 +03:00
|
|
|
if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) {
|
2023-05-15 22:57:05 +03:00
|
|
|
ret = qemu_rdma_write_flush(rdma);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:52 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
rdma->current_length = 0;
|
|
|
|
rdma->current_addr = current_addr;
|
|
|
|
|
2023-09-28 16:19:39 +03:00
|
|
|
qemu_rdma_search_ram_block(rdma, block_offset,
|
|
|
|
offset, len, &index, &chunk);
|
2013-07-22 18:01:54 +04:00
|
|
|
rdma->current_index = index;
|
|
|
|
rdma->current_chunk = chunk;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* merge it */
|
|
|
|
rdma->current_length += len;
|
|
|
|
|
|
|
|
/* flush it if buffer is too large */
|
|
|
|
if (rdma->current_length >= RDMA_MERGE_MAX) {
|
2023-05-15 22:57:05 +03:00
|
|
|
return qemu_rdma_write_flush(rdma);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void qemu_rdma_cleanup(RDMAContext *rdma)
|
|
|
|
{
|
migration: not wait RDMA_CM_EVENT_DISCONNECTED event after rdma_disconnect
When cancel migration during RDMA precopy, the source qemu main thread hangs sometime.
The backtrace is:
(gdb) bt
#0 0x00007f249eabd43d in write () from /lib64/libpthread.so.0
#1 0x00007f24a1ce98e4 in rdma_get_cm_event (channel=0x4675d10, event=0x7ffe2f643dd0) at src/cma.c:2189
#2 0x00000000007b6166 in qemu_rdma_cleanup (rdma=0x6784000) at migration/rdma.c:2296
#3 0x00000000007b7cae in qio_channel_rdma_close (ioc=0x3bfcc30, errp=0x0) at migration/rdma.c:2999
#4 0x00000000008db60e in qio_channel_close (ioc=0x3bfcc30, errp=0x0) at io/channel.c:273
#5 0x00000000007a8765 in channel_close (opaque=0x3bfcc30) at migration/qemu-file-channel.c:98
#6 0x00000000007a71f9 in qemu_fclose (f=0x527c000) at migration/qemu-file.c:334
#7 0x0000000000795b96 in migrate_fd_cleanup (opaque=0x3b46280) at migration/migration.c:1162
#8 0x000000000093a71b in aio_bh_call (bh=0x3db7a20) at util/async.c:90
#9 0x000000000093a7b2 in aio_bh_poll (ctx=0x3b121c0) at util/async.c:118
#10 0x000000000093f2ad in aio_dispatch (ctx=0x3b121c0) at util/aio-posix.c:436
#11 0x000000000093ab41 in aio_ctx_dispatch (source=0x3b121c0, callback=0x0, user_data=0x0)
at util/async.c:261
#12 0x00007f249f73c7aa in g_main_context_dispatch () from /lib64/libglib-2.0.so.0
#13 0x000000000093dc5e in glib_pollfds_poll () at util/main-loop.c:215
#14 0x000000000093dd4e in os_host_main_loop_wait (timeout=28000000) at util/main-loop.c:263
#15 0x000000000093de05 in main_loop_wait (nonblocking=0) at util/main-loop.c:522
#16 0x00000000005bc6a5 in main_loop () at vl.c:1944
#17 0x00000000005c39b5 in main (argc=56, argv=0x7ffe2f6443f8, envp=0x3ad0030) at vl.c:4752
It does not get the RDMA_CM_EVENT_DISCONNECTED event after rdma_disconnect sometime.
According to IB Spec once active side send DREQ message, it should wait for DREP message
and only once it arrived it should trigger a DISCONNECT event. DREP message can be dropped
due to network issues.
For that case the spec defines a DREP_timeout state in the CM state machine, if the DREP is
dropped we should get a timeout and a TIMEWAIT_EXIT event will be trigger.
Unfortunately the current kernel CM implementation doesn't include the DREP_timeout state
and in above scenario we will not get DISCONNECT or TIMEWAIT_EXIT events.
So it should not invoke rdma_get_cm_event which may hang forever, and the event channel
is also destroyed in qemu_rdma_cleanup.
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-05-30 12:43:31 +03:00
|
|
|
int idx;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2013-08-13 06:12:43 +04:00
|
|
|
if (rdma->cm_id && rdma->connected) {
|
2023-09-28 16:19:53 +03:00
|
|
|
if ((rdma->errored ||
|
2017-07-17 14:09:36 +03:00
|
|
|
migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
|
|
|
|
!rdma->received_error) {
|
2013-07-22 18:01:54 +04:00
|
|
|
RDMAControlHeader head = { .len = 0,
|
|
|
|
.type = RDMA_CONTROL_ERROR,
|
|
|
|
.repeat = 1,
|
|
|
|
};
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Early error. Sending error.");
|
2013-07-22 18:01:54 +04:00
|
|
|
qemu_rdma_post_send_control(rdma, NULL, &head);
|
|
|
|
}
|
|
|
|
|
migration: not wait RDMA_CM_EVENT_DISCONNECTED event after rdma_disconnect
When cancel migration during RDMA precopy, the source qemu main thread hangs sometime.
The backtrace is:
(gdb) bt
#0 0x00007f249eabd43d in write () from /lib64/libpthread.so.0
#1 0x00007f24a1ce98e4 in rdma_get_cm_event (channel=0x4675d10, event=0x7ffe2f643dd0) at src/cma.c:2189
#2 0x00000000007b6166 in qemu_rdma_cleanup (rdma=0x6784000) at migration/rdma.c:2296
#3 0x00000000007b7cae in qio_channel_rdma_close (ioc=0x3bfcc30, errp=0x0) at migration/rdma.c:2999
#4 0x00000000008db60e in qio_channel_close (ioc=0x3bfcc30, errp=0x0) at io/channel.c:273
#5 0x00000000007a8765 in channel_close (opaque=0x3bfcc30) at migration/qemu-file-channel.c:98
#6 0x00000000007a71f9 in qemu_fclose (f=0x527c000) at migration/qemu-file.c:334
#7 0x0000000000795b96 in migrate_fd_cleanup (opaque=0x3b46280) at migration/migration.c:1162
#8 0x000000000093a71b in aio_bh_call (bh=0x3db7a20) at util/async.c:90
#9 0x000000000093a7b2 in aio_bh_poll (ctx=0x3b121c0) at util/async.c:118
#10 0x000000000093f2ad in aio_dispatch (ctx=0x3b121c0) at util/aio-posix.c:436
#11 0x000000000093ab41 in aio_ctx_dispatch (source=0x3b121c0, callback=0x0, user_data=0x0)
at util/async.c:261
#12 0x00007f249f73c7aa in g_main_context_dispatch () from /lib64/libglib-2.0.so.0
#13 0x000000000093dc5e in glib_pollfds_poll () at util/main-loop.c:215
#14 0x000000000093dd4e in os_host_main_loop_wait (timeout=28000000) at util/main-loop.c:263
#15 0x000000000093de05 in main_loop_wait (nonblocking=0) at util/main-loop.c:522
#16 0x00000000005bc6a5 in main_loop () at vl.c:1944
#17 0x00000000005c39b5 in main (argc=56, argv=0x7ffe2f6443f8, envp=0x3ad0030) at vl.c:4752
It does not get the RDMA_CM_EVENT_DISCONNECTED event after rdma_disconnect sometime.
According to IB Spec once active side send DREQ message, it should wait for DREP message
and only once it arrived it should trigger a DISCONNECT event. DREP message can be dropped
due to network issues.
For that case the spec defines a DREP_timeout state in the CM state machine, if the DREP is
dropped we should get a timeout and a TIMEWAIT_EXIT event will be trigger.
Unfortunately the current kernel CM implementation doesn't include the DREP_timeout state
and in above scenario we will not get DISCONNECT or TIMEWAIT_EXIT events.
So it should not invoke rdma_get_cm_event which may hang forever, and the event channel
is also destroyed in qemu_rdma_cleanup.
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-05-30 12:43:31 +03:00
|
|
|
rdma_disconnect(rdma->cm_id);
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_cleanup_disconnect();
|
2013-08-13 06:12:43 +04:00
|
|
|
rdma->connected = false;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2019-02-14 21:53:51 +03:00
|
|
|
if (rdma->channel) {
|
|
|
|
qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
|
|
|
|
}
|
2015-04-20 18:57:16 +03:00
|
|
|
g_free(rdma->dest_blocks);
|
|
|
|
rdma->dest_blocks = NULL;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2013-08-04 06:54:52 +04:00
|
|
|
for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
|
2013-07-22 18:01:54 +04:00
|
|
|
if (rdma->wr_data[idx].control_mr) {
|
|
|
|
rdma->total_registrations--;
|
|
|
|
ibv_dereg_mr(rdma->wr_data[idx].control_mr);
|
|
|
|
}
|
|
|
|
rdma->wr_data[idx].control_mr = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rdma->local_ram_blocks.block) {
|
|
|
|
while (rdma->local_ram_blocks.nb_blocks) {
|
2015-06-11 20:17:24 +03:00
|
|
|
rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-26 05:08:04 +03:00
|
|
|
if (rdma->qp) {
|
|
|
|
rdma_destroy_qp(rdma->cm_id);
|
|
|
|
rdma->qp = NULL;
|
|
|
|
}
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in about a hundred of my runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
if (rdma->recv_cq) {
|
|
|
|
ibv_destroy_cq(rdma->recv_cq);
|
|
|
|
rdma->recv_cq = NULL;
|
|
|
|
}
|
|
|
|
if (rdma->send_cq) {
|
|
|
|
ibv_destroy_cq(rdma->send_cq);
|
|
|
|
rdma->send_cq = NULL;
|
|
|
|
}
|
|
|
|
if (rdma->recv_comp_channel) {
|
|
|
|
ibv_destroy_comp_channel(rdma->recv_comp_channel);
|
|
|
|
rdma->recv_comp_channel = NULL;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although hardware IB worked fine in a hundred of my runs, the IB specification
doesn't guarantee the CQ order in such a case.
Here we introduce an independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
if (rdma->send_comp_channel) {
|
|
|
|
ibv_destroy_comp_channel(rdma->send_comp_channel);
|
|
|
|
rdma->send_comp_channel = NULL;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
if (rdma->pd) {
|
|
|
|
ibv_dealloc_pd(rdma->pd);
|
|
|
|
rdma->pd = NULL;
|
|
|
|
}
|
|
|
|
if (rdma->cm_id) {
|
|
|
|
rdma_destroy_id(rdma->cm_id);
|
|
|
|
rdma->cm_id = NULL;
|
|
|
|
}
|
2018-08-06 16:29:28 +03:00
|
|
|
|
|
|
|
/* the destination side, listen_id and channel is shared */
|
2015-03-26 05:08:04 +03:00
|
|
|
if (rdma->listen_id) {
|
2018-08-06 16:29:28 +03:00
|
|
|
if (!rdma->is_return_path) {
|
|
|
|
rdma_destroy_id(rdma->listen_id);
|
|
|
|
}
|
2015-03-26 05:08:04 +03:00
|
|
|
rdma->listen_id = NULL;
|
2018-08-06 16:29:28 +03:00
|
|
|
|
|
|
|
if (rdma->channel) {
|
|
|
|
if (!rdma->is_return_path) {
|
|
|
|
rdma_destroy_event_channel(rdma->channel);
|
|
|
|
}
|
|
|
|
rdma->channel = NULL;
|
|
|
|
}
|
2015-03-26 05:08:04 +03:00
|
|
|
}
|
2018-08-06 16:29:28 +03:00
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
if (rdma->channel) {
|
|
|
|
rdma_destroy_event_channel(rdma->channel);
|
|
|
|
rdma->channel = NULL;
|
|
|
|
}
|
2013-08-04 06:54:54 +04:00
|
|
|
g_free(rdma->host);
|
migration/rdma: destination: create the return path after the first accept
destination side:
$ build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.1.10:8888
(qemu) migrate_set_capability postcopy-ram on
(qemu)
dest_init RDMA Device opened: kernel name rocep1s0f0 uverbs device name uverbs0, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs0, infiniband class device path /sys/class/infiniband/rocep1s0f0, transport: (2) Ethernet
Segmentation fault (core dumped)
(gdb) bt
#0 qemu_rdma_accept (rdma=0x0) at ../migration/rdma.c:3272
#1 rdma_accept_incoming_migration (opaque=0x0) at ../migration/rdma.c:3986
#2 0x0000563c9e51f02a in aio_dispatch_handler
(ctx=ctx@entry=0x563ca0606010, node=0x563ca12b2150) at ../util/aio-posix.c:329
#3 0x0000563c9e51f752 in aio_dispatch_handlers (ctx=0x563ca0606010) at ../util/aio-posix.c:372
#4 aio_dispatch (ctx=0x563ca0606010) at ../util/aio-posix.c:382
#5 0x0000563c9e4f4d9e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at ../util/async.c:306
#6 0x00007fe96ef3fa9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
#7 0x0000563c9e4ffeb8 in glib_pollfds_poll () at ../util/main-loop.c:231
#8 os_host_main_loop_wait (timeout=12188789) at ../util/main-loop.c:254
#9 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:530
#10 0x0000563c9e3c7211 in qemu_main_loop () at ../softmmu/runstate.c:725
#11 0x0000563c9dfd46fe in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at ../softmmu/main.c:50
The rdma return path will not be created when qemu incoming is starting
since migrate_copy() is false at that moment, then a NULL return path
rdma was referenced if the user enabled postcopy later.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Message-Id: <20210525080552.28259-3-lizhijian@cn.fujitsu.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2021-05-25 11:05:51 +03:00
|
|
|
g_free(rdma->host_port);
|
2013-08-04 06:54:54 +04:00
|
|
|
rdma->host = NULL;
|
migration/rdma: destination: create the return path after the first accept
destination side:
$ build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.1.10:8888
(qemu) migrate_set_capability postcopy-ram on
(qemu)
dest_init RDMA Device opened: kernel name rocep1s0f0 uverbs device name uverbs0, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs0, infiniband class device path /sys/class/infiniband/rocep1s0f0, transport: (2) Ethernet
Segmentation fault (core dumped)
(gdb) bt
#0 qemu_rdma_accept (rdma=0x0) at ../migration/rdma.c:3272
#1 rdma_accept_incoming_migration (opaque=0x0) at ../migration/rdma.c:3986
#2 0x0000563c9e51f02a in aio_dispatch_handler
(ctx=ctx@entry=0x563ca0606010, node=0x563ca12b2150) at ../util/aio-posix.c:329
#3 0x0000563c9e51f752 in aio_dispatch_handlers (ctx=0x563ca0606010) at ../util/aio-posix.c:372
#4 aio_dispatch (ctx=0x563ca0606010) at ../util/aio-posix.c:382
#5 0x0000563c9e4f4d9e in aio_ctx_dispatch (source=<optimized out>, callback=<optimized out>, user_data=<optimized out>) at ../util/async.c:306
#6 0x00007fe96ef3fa9f in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
#7 0x0000563c9e4ffeb8 in glib_pollfds_poll () at ../util/main-loop.c:231
#8 os_host_main_loop_wait (timeout=12188789) at ../util/main-loop.c:254
#9 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:530
#10 0x0000563c9e3c7211 in qemu_main_loop () at ../softmmu/runstate.c:725
#11 0x0000563c9dfd46fe in main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>) at ../softmmu/main.c:50
The rdma return path will not be created when qemu incoming is starting
since migrate_copy() is false at that moment, then a NULL return path
rdma was referenced if the user enabled postcopy later.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Message-Id: <20210525080552.28259-3-lizhijian@cn.fujitsu.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2021-05-25 11:05:51 +03:00
|
|
|
rdma->host_port = NULL;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-04-21 15:27:08 +03:00
|
|
|
/*
 * Prepare the source side of an RDMA migration: resolve the destination
 * host, allocate the protection domain / completion queues and the queue
 * pair, set up the local RAM block table plus its offset lookup hash, and
 * register every control buffer.
 *
 * @rdma:    context being initialised
 * @pin_all: whether pinning of all memory is requested
 * @errp:    receives a description of the failure, if any
 *
 * Returns 0 on success.  On any failure the context is torn down with
 * qemu_rdma_cleanup() and -1 is returned.
 */
static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
{
    int i;

    /*
     * Only a request at this point: it is checked against what the
     * destination really supports once connect() has completed.
     */
    rdma->pin_all = pin_all;

    /* qemu_rdma_resolve_host() is handed errp and reports for itself */
    if (qemu_rdma_resolve_host(rdma, errp) < 0) {
        goto err_rdma_source_init;
    }

    if (qemu_rdma_alloc_pd_cq(rdma) < 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: "
                       "rdma migration: error allocating pd and cq! Your mlock()"
                       " limits may be too low. Please check $ ulimit -a # and "
                       "search for 'ulimit -l' in the output");
        }
        goto err_rdma_source_init;
    }

    if (qemu_rdma_alloc_qp(rdma) < 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: rdma migration: error allocating qp!");
        }
        goto err_rdma_source_init;
    }

    qemu_rdma_init_ram_blocks(rdma);

    /* Index every local RAM block by its offset for direct lookup */
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    for (i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) {
        g_hash_table_insert(rdma->blockmap,
                (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset,
                &rdma->local_ram_blocks.block[i]);
    }

    /* One control buffer registration per work request id */
    for (i = 0; i < RDMA_WRID_MAX; i++) {
        if (qemu_rdma_reg_control(rdma, i) < 0) {
            if (errp && !*errp) {
                error_setg(errp,
                    "RDMA ERROR: rdma migration: error registering %d control!",
                    i);
            }
            goto err_rdma_source_init;
        }
    }

    return 0;

err_rdma_source_init:
    qemu_rdma_cleanup(rdma);
    return -1;
}
|
|
|
|
|
2021-05-25 11:05:52 +03:00
|
|
|
/*
 * Wait at most @msec milliseconds for a connection manager event on
 * rdma->channel and fetch it into @cm_event.
 *
 * Returns 0 when an event was retrieved.  Returns -1 on timeout, on a
 * poll() failure, when poll() reports something other than POLLIN, or
 * when rdma_get_cm_event() fails; @errp is filled in unless it already
 * holds an error.
 */
static int qemu_get_cm_event_timeout(RDMAContext *rdma,
                                     struct rdma_cm_event **cm_event,
                                     long msec, Error **errp)
{
    struct pollfd pfd = {
        .fd = rdma->channel->fd,
        .events = POLLIN,
        .revents = 0
    };
    int nready;

    /* A signal interrupting the wait is not an error: just poll again */
    for (;;) {
        nready = poll(&pfd, 1, msec);
        if (nready >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nready == 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: poll cm event timeout");
        }
        return -1;
    }

    if (nready < 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: failed to poll cm event, errno=%i",
                       errno);
        }
        return -1;
    }

    if (!(pfd.revents & POLLIN)) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: no POLLIN event, revent=%x",
                       pfd.revents);
        }
        return -1;
    }

    if (rdma_get_cm_event(rdma->channel, cm_event) < 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: failed to get cm event");
        }
        return -1;
    }

    return 0;
}
|
|
|
|
|
2023-09-28 16:19:36 +03:00
|
|
|
static int qemu_rdma_connect(RDMAContext *rdma, bool return_path,
|
|
|
|
Error **errp)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
RDMACapabilities cap = {
|
|
|
|
.version = RDMA_CONTROL_VERSION_CURRENT,
|
|
|
|
.flags = 0,
|
|
|
|
};
|
|
|
|
struct rdma_conn_param conn_param = { .initiator_depth = 2,
|
|
|
|
.retry_count = 5,
|
|
|
|
.private_data = &cap,
|
|
|
|
.private_data_len = sizeof(cap),
|
|
|
|
};
|
|
|
|
struct rdma_cm_event *cm_event;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only negotiate the capability with destination if the user
|
|
|
|
* on the source first requested the capability.
|
|
|
|
*/
|
|
|
|
if (rdma->pin_all) {
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_connect_pin_all_requested();
|
2013-07-22 18:01:54 +04:00
|
|
|
cap.flags |= RDMA_CAPABILITY_PIN_ALL;
|
|
|
|
}
|
|
|
|
|
|
|
|
caps_to_network(&cap);
|
|
|
|
|
2017-07-17 14:09:31 +03:00
|
|
|
ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: posting second control recv");
|
|
|
|
}
|
2017-07-17 14:09:31 +03:00
|
|
|
goto err_rdma_source_connect;
|
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
ret = rdma_connect(rdma->cm_id, &conn_param);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2013-07-22 18:01:54 +04:00
|
|
|
perror("rdma_connect");
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: connecting to destination!");
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err_rdma_source_connect;
|
|
|
|
}
|
|
|
|
|
2021-05-25 11:05:52 +03:00
|
|
|
if (return_path) {
|
|
|
|
ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
|
|
|
|
} else {
|
|
|
|
ret = rdma_get_cm_event(rdma->channel, &cm_event);
|
2023-09-28 16:19:47 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: failed to get cm event");
|
|
|
|
}
|
2023-09-28 16:19:47 +03:00
|
|
|
}
|
2021-05-25 11:05:52 +03:00
|
|
|
}
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
migration/rdma: Fix or document problematic uses of errno
We use errno after calling Libibverbs functions that are not
documented to set errno (manual page does not mention errno), or where
the documentation is unclear ("returns [...] the value of errno on
failure"). While this could be read as "sets errno and returns it",
a glance at the source code[*] kills that hope:
static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad_wr)
{
return qp->context->ops.post_send(qp, wr, bad_wr);
}
The callback can be
static int mana_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
struct ibv_send_wr **bad)
{
/* This version of driver supports RAW QP only.
* Posting WR is done directly in the application.
*/
return EOPNOTSUPP;
}
Neither of them touches errno.
One of these errno uses is easy to fix, so do that now. Several more
will go away later in the series; add temporary FIXME commments.
Three will remain; add TODO comments. TODO, not FIXME, because the
bug might be in Libibverbs documentation.
[*] https://github.com/linux-rdma/rdma-core.git
commit 55fa316b4b18f258d8ac1ceb4aa5a7a35b094dcf
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20230928132019.2544702-17-armbru@redhat.com>
2023-09-28 16:19:42 +03:00
|
|
|
/*
|
|
|
|
* FIXME perror() is wrong, because
|
|
|
|
* qemu_get_cm_event_timeout() can fail without setting errno.
|
|
|
|
* Will go away later in this series.
|
|
|
|
*/
|
2013-07-22 18:01:54 +04:00
|
|
|
perror("rdma_get_cm_event after rdma_connect");
|
|
|
|
goto err_rdma_source_connect;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
|
2021-06-28 10:19:59 +03:00
|
|
|
error_report("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
|
2023-09-28 16:19:58 +03:00
|
|
|
if (errp && !*errp) {
|
|
|
|
error_setg(errp, "RDMA ERROR: connecting to destination!");
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
rdma_ack_cm_event(cm_event);
|
|
|
|
goto err_rdma_source_connect;
|
|
|
|
}
|
2013-08-13 06:12:43 +04:00
|
|
|
rdma->connected = true;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
|
|
|
|
network_to_caps(&cap);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Verify that the *requested* capabilities are supported by the destination
|
|
|
|
* and disable them otherwise.
|
|
|
|
*/
|
|
|
|
if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
|
2023-09-28 16:19:56 +03:00
|
|
|
warn_report("RDMA: Server cannot support pinning all memory. "
|
|
|
|
"Will register memory dynamically.");
|
2013-07-22 18:01:54 +04:00
|
|
|
rdma->pin_all = false;
|
|
|
|
}
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
rdma_ack_cm_event(cm_event);
|
|
|
|
|
|
|
|
rdma->control_ready_expected = 1;
|
|
|
|
rdma->nb_sent = 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err_rdma_source_connect:
|
|
|
|
qemu_rdma_cleanup(rdma);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Set up the listening side of an RDMA migration: reset the control
 * buffer bookkeeping, create the CM event channel and a listening CM id,
 * enable address reuse on it and bind it to the first usable address
 * that rdma->host / rdma->port resolve to.
 *
 * @rdma: context being initialised; rdma->host must be a non-empty string
 * @errp: receives a description of the failure, if any
 *
 * Returns 0 on success with rdma->channel and rdma->listen_id set.
 * Returns -1 on failure with rdma->errored set; a channel or id created
 * here is destroyed again and rdma->channel is reset to NULL.
 */
static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
{
    int ret, idx;
    struct rdma_cm_id *listen_id;
    char ip[40] = "unknown";
    struct rdma_addrinfo *res, *e;
    char port_str[16];
    int reuse = 1;
    bool bound;

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        rdma->wr_data[idx].control_len = 0;
        rdma->wr_data[idx].control_curr = NULL;
    }

    if (!rdma->host || !rdma->host[0]) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: RDMA host is not set!");
        }
        rdma->errored = true;
        return -1;
    }
    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: could not create rdma event channel");
        }
        rdma->errored = true;
        return -1;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
    if (ret < 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: could not create cm_id!");
        }
        goto err_dest_init_create_listen_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
                       rdma->host);
        }
        goto err_dest_init_bind_addr;
    }

    ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
                          &reuse, sizeof reuse);
    if (ret < 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: Error: could not set REUSEADDR option");
        }
        /* The address list is already allocated here: do not leak it */
        rdma_freeaddrinfo(res);
        goto err_dest_init_bind_addr;
    }
    for (e = res; e != NULL; e = e->ai_next) {
        const void *src;

        /*
         * The textual address is only used for tracing.  IPv6 entries
         * keep their address in sin6_addr, not at sin_addr's offset.
         */
        if (e->ai_family == AF_INET6) {
            src = &((struct sockaddr_in6 *) e->ai_dst_addr)->sin6_addr;
        } else {
            src = &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr;
        }
        inet_ntop(e->ai_family, src, ip, sizeof ip);
        trace_qemu_rdma_dest_init_trying(rdma->host, ip);
        ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
        if (ret < 0) {
            continue;
        }
        if (e->ai_family == AF_INET6) {
            ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs, errp);
            if (ret < 0) {
                continue;
            }
        }
        break;
    }

    /* Record the outcome before the list that e points into is freed */
    bound = e != NULL;
    rdma_freeaddrinfo(res);
    if (!bound) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
        }
        goto err_dest_init_bind_addr;
    }

    rdma->listen_id = listen_id;
    qemu_rdma_dump_gid("dest_init", listen_id);
    return 0;

err_dest_init_bind_addr:
    rdma_destroy_id(listen_id);
err_dest_init_create_listen_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    rdma->errored = true;
    return -1;

}
|
|
|
|
|
2018-08-06 16:29:28 +03:00
|
|
|
/*
 * Initialise the destination's return-path context from the primary one:
 * clear its control buffer bookkeeping, let it share the primary's CM
 * channel and listening id, and cross-link the two contexts.
 */
static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
                                            RDMAContext *rdma)
{
    int i;

    for (i = 0; i < RDMA_WRID_MAX; i++) {
        rdma_return_path->wr_data[i].control_curr = NULL;
        rdma_return_path->wr_data[i].control_len = 0;
    }

    /* Both contexts use the same CM channel and the same CM id */
    rdma_return_path->listen_id = rdma->listen_id;
    rdma_return_path->channel = rdma->channel;

    /* Each context points at its peer; only this one is the return path */
    rdma_return_path->is_return_path = true;
    rdma_return_path->return_path = rdma;
    rdma->return_path = rdma_return_path;
}
|
|
|
|
|
2023-09-28 16:19:28 +03:00
|
|
|
/*
 * Allocate a zeroed RDMAContext and fill in host, port and host_port
 * from @host_port.
 *
 * Returns the new context, or NULL with @errp set (unless it already
 * holds an error) when @host_port cannot be parsed by inet_parse().
 */
static RDMAContext *qemu_rdma_data_init(const char *host_port, Error **errp)
{
    RDMAContext *ctx = g_new0(RDMAContext, 1);
    InetSocketAddress *parsed;

    ctx->current_index = -1;
    ctx->current_chunk = -1;

    parsed = g_new(InetSocketAddress, 1);
    if (inet_parse(parsed, host_port, NULL)) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: bad RDMA migration address '%s'",
                       host_port);
        }
        g_free(ctx);
        qapi_free_InetSocketAddress(parsed);
        return NULL;
    }

    /* The context keeps its own copies; the parsed address is released */
    ctx->port = atoi(parsed->port);
    ctx->host = g_strdup(parsed->host);
    ctx->host_port = g_strdup(host_port);

    qapi_free_InetSocketAddress(parsed);
    return ctx;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* QEMUFile interface to the control channel.
|
|
|
|
* SEND messages for control only.
|
2014-09-12 10:03:14 +04:00
|
|
|
* VM's ram is handled with regular RDMA messages.
|
2013-07-22 18:01:54 +04:00
|
|
|
*/
|
2016-04-27 13:05:07 +03:00
|
|
|
static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
|
|
|
|
const struct iovec *iov,
|
|
|
|
size_t niov,
|
|
|
|
int *fds,
|
|
|
|
size_t nfds,
|
2022-05-13 09:28:31 +03:00
|
|
|
int flags,
|
2016-04-27 13:05:07 +03:00
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2013-07-22 18:01:54 +04:00
|
|
|
int ret;
|
2016-04-27 13:05:07 +03:00
|
|
|
ssize_t done = 0;
|
2023-09-28 16:19:33 +03:00
|
|
|
size_t i, len;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rioc->rdmaout);
|
2018-08-06 16:29:29 +03:00
|
|
|
|
|
|
|
if (!rdma) {
|
2022-12-09 16:15:24 +03:00
|
|
|
error_setg(errp, "RDMA control channel output is not set");
|
|
|
|
return -1;
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:53 +03:00
|
|
|
if (rdma->errored) {
|
2023-09-28 16:19:44 +03:00
|
|
|
error_setg(errp,
|
|
|
|
"RDMA is in an error state waiting migration to abort!");
|
|
|
|
return -1;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Push out any writes that
|
2014-09-12 10:03:14 +04:00
|
|
|
* we're queued up for VM's ram.
|
2013-07-22 18:01:54 +04:00
|
|
|
*/
|
2023-05-15 22:57:05 +03:00
|
|
|
ret = qemu_rdma_write_flush(rdma);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:43 +03:00
|
|
|
error_setg(errp, "qemu_rdma_write_flush failed");
|
2022-12-09 16:15:24 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
for (i = 0; i < niov; i++) {
|
|
|
|
size_t remaining = iov[i].iov_len;
|
|
|
|
uint8_t * data = (void *)iov[i].iov_base;
|
|
|
|
while (remaining) {
|
2023-09-26 13:01:03 +03:00
|
|
|
RDMAControlHeader head = {};
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2018-05-30 12:43:27 +03:00
|
|
|
len = MIN(remaining, RDMA_SEND_INCREMENT);
|
|
|
|
remaining -= len;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2018-05-30 12:43:27 +03:00
|
|
|
head.len = len;
|
2016-04-27 13:05:07 +03:00
|
|
|
head.type = RDMA_CONTROL_QEMU_FILE;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:43 +03:00
|
|
|
error_setg(errp, "qemu_rdma_exchange_send failed");
|
2022-12-09 16:15:24 +03:00
|
|
|
return -1;
|
2016-04-27 13:05:07 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2018-05-30 12:43:27 +03:00
|
|
|
data += len;
|
|
|
|
done += len;
|
2016-04-27 13:05:07 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
return done;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Copy up to @size bytes still pending in control buffer @idx into @buf
 * and advance that buffer's cursor by the amount copied.
 *
 * Returns the number of bytes copied, which is 0 when nothing is pending.
 */
static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             size_t size, int idx)
{
    size_t copied;

    if (!rdma->wr_data[idx].control_len) {
        return 0;
    }

    trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);

    copied = MIN(size, rdma->wr_data[idx].control_len);
    memcpy(buf, rdma->wr_data[idx].control_curr, copied);
    rdma->wr_data[idx].control_curr += copied;
    rdma->wr_data[idx].control_len -= copied;

    return copied;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* QEMUFile interface to the control channel.
|
|
|
|
* RDMA links don't use bytestreams, so we have to
|
|
|
|
* return bytes to QEMUFile opportunistically.
|
|
|
|
*/
|
2016-04-27 13:05:07 +03:00
|
|
|
static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
|
|
|
|
const struct iovec *iov,
|
|
|
|
size_t niov,
|
|
|
|
int **fds,
|
|
|
|
size_t *nfds,
|
2022-12-20 21:44:17 +03:00
|
|
|
int flags,
|
2016-04-27 13:05:07 +03:00
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2013-07-22 18:01:54 +04:00
|
|
|
RDMAControlHeader head;
|
2023-09-28 16:19:54 +03:00
|
|
|
int ret;
|
2023-09-28 16:19:33 +03:00
|
|
|
ssize_t done = 0;
|
|
|
|
size_t i, len;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rioc->rdmain);
|
2018-08-06 16:29:29 +03:00
|
|
|
|
|
|
|
if (!rdma) {
|
2022-12-09 16:15:24 +03:00
|
|
|
error_setg(errp, "RDMA control channel input is not set");
|
|
|
|
return -1;
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:53 +03:00
|
|
|
if (rdma->errored) {
|
2023-09-28 16:19:44 +03:00
|
|
|
error_setg(errp,
|
|
|
|
"RDMA is in an error state waiting migration to abort!");
|
|
|
|
return -1;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
for (i = 0; i < niov; i++) {
|
|
|
|
size_t want = iov[i].iov_len;
|
|
|
|
uint8_t *data = (void *)iov[i].iov_base;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
/*
|
|
|
|
* First, we hold on to the last SEND message we
|
|
|
|
* were given and dish out the bytes until we run
|
|
|
|
* out of bytes.
|
|
|
|
*/
|
2023-09-28 16:19:32 +03:00
|
|
|
len = qemu_rdma_fill(rdma, data, want, 0);
|
|
|
|
done += len;
|
|
|
|
want -= len;
|
2016-04-27 13:05:07 +03:00
|
|
|
/* Got what we needed, so go to next iovec */
|
|
|
|
if (want == 0) {
|
|
|
|
continue;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
/* If we got any data so far, then don't wait
|
|
|
|
* for more, just return what we have */
|
|
|
|
if (done > 0) {
|
|
|
|
break;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
|
|
|
|
/* We've got nothing at all, so lets wait for
|
|
|
|
* more to arrive
|
|
|
|
*/
|
|
|
|
ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:43 +03:00
|
|
|
error_setg(errp, "qemu_rdma_exchange_recv failed");
|
2022-12-09 16:15:24 +03:00
|
|
|
return -1;
|
2016-04-27 13:05:07 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* SEND was received with new bytes, now try again.
|
|
|
|
*/
|
2023-09-28 16:19:32 +03:00
|
|
|
len = qemu_rdma_fill(rdma, data, want, 0);
|
|
|
|
done += len;
|
|
|
|
want -= len;
|
2016-04-27 13:05:07 +03:00
|
|
|
|
|
|
|
/* Still didn't get enough, so lets just return */
|
|
|
|
if (want) {
|
|
|
|
if (done == 0) {
|
|
|
|
return QIO_CHANNEL_ERR_BLOCK;
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2018-05-30 12:43:27 +03:00
|
|
|
return done;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Block until all the outstanding chunks have been delivered by the hardware.
|
|
|
|
*/
|
2023-05-15 22:57:05 +03:00
|
|
|
static int qemu_rdma_drain_cq(RDMAContext *rdma)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2023-05-15 22:57:05 +03:00
|
|
|
if (qemu_rdma_write_flush(rdma) < 0) {
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
while (rdma->nb_sent) {
|
2013-08-10 00:05:42 +04:00
|
|
|
ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: complete polling error!");
|
2023-09-28 16:19:51 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
qemu_rdma_unregister_waiting(rdma);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
|
|
|
|
static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
|
|
|
|
bool blocking,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
|
|
|
/* XXX we should make readv/writev actually honour this :-) */
|
|
|
|
rioc->blocking = blocking;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct QIOChannelRDMASource QIOChannelRDMASource;
|
|
|
|
struct QIOChannelRDMASource {
|
|
|
|
GSource parent;
|
|
|
|
QIOChannelRDMA *rioc;
|
|
|
|
GIOCondition condition;
|
|
|
|
};
|
|
|
|
|
|
|
|
static gboolean
|
|
|
|
qio_channel_rdma_source_prepare(GSource *source,
|
|
|
|
gint *timeout)
|
|
|
|
{
|
|
|
|
QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2016-04-27 13:05:07 +03:00
|
|
|
GIOCondition cond = 0;
|
|
|
|
*timeout = -1;
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2018-08-06 16:29:29 +03:00
|
|
|
if (rsource->condition == G_IO_IN) {
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
|
2018-08-06 16:29:29 +03:00
|
|
|
} else {
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!rdma) {
|
|
|
|
error_report("RDMAContext is NULL when prepare Gsource");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
if (rdma->wr_data[0].control_len) {
|
|
|
|
cond |= G_IO_IN;
|
|
|
|
}
|
|
|
|
cond |= G_IO_OUT;
|
|
|
|
|
|
|
|
return cond & rsource->condition;
|
|
|
|
}
|
|
|
|
|
|
|
|
static gboolean
|
|
|
|
qio_channel_rdma_source_check(GSource *source)
|
|
|
|
{
|
|
|
|
QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2016-04-27 13:05:07 +03:00
|
|
|
GIOCondition cond = 0;
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2018-08-06 16:29:29 +03:00
|
|
|
if (rsource->condition == G_IO_IN) {
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
|
2018-08-06 16:29:29 +03:00
|
|
|
} else {
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!rdma) {
|
|
|
|
error_report("RDMAContext is NULL when check Gsource");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
if (rdma->wr_data[0].control_len) {
|
|
|
|
cond |= G_IO_IN;
|
|
|
|
}
|
|
|
|
cond |= G_IO_OUT;
|
|
|
|
|
|
|
|
return cond & rsource->condition;
|
|
|
|
}
|
|
|
|
|
|
|
|
static gboolean
|
|
|
|
qio_channel_rdma_source_dispatch(GSource *source,
|
|
|
|
GSourceFunc callback,
|
|
|
|
gpointer user_data)
|
|
|
|
{
|
|
|
|
QIOChannelFunc func = (QIOChannelFunc)callback;
|
|
|
|
QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2016-04-27 13:05:07 +03:00
|
|
|
GIOCondition cond = 0;
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2018-08-06 16:29:29 +03:00
|
|
|
if (rsource->condition == G_IO_IN) {
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
|
2018-08-06 16:29:29 +03:00
|
|
|
} else {
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!rdma) {
|
|
|
|
error_report("RDMAContext is NULL when dispatch Gsource");
|
|
|
|
return FALSE;
|
|
|
|
}
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
if (rdma->wr_data[0].control_len) {
|
|
|
|
cond |= G_IO_IN;
|
|
|
|
}
|
|
|
|
cond |= G_IO_OUT;
|
|
|
|
|
|
|
|
return (*func)(QIO_CHANNEL(rsource->rioc),
|
|
|
|
(cond & rsource->condition),
|
|
|
|
user_data);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
qio_channel_rdma_source_finalize(GSource *source)
|
|
|
|
{
|
|
|
|
QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
|
|
|
|
|
|
|
|
object_unref(OBJECT(ssource->rioc));
|
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:34 +03:00
|
|
|
static GSourceFuncs qio_channel_rdma_source_funcs = {
|
2016-04-27 13:05:07 +03:00
|
|
|
qio_channel_rdma_source_prepare,
|
|
|
|
qio_channel_rdma_source_check,
|
|
|
|
qio_channel_rdma_source_dispatch,
|
|
|
|
qio_channel_rdma_source_finalize
|
|
|
|
};
|
|
|
|
|
|
|
|
static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
|
|
|
|
GIOCondition condition)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
2016-04-27 13:05:07 +03:00
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
|
|
|
QIOChannelRDMASource *ssource;
|
|
|
|
GSource *source;
|
|
|
|
|
|
|
|
source = g_source_new(&qio_channel_rdma_source_funcs,
|
|
|
|
sizeof(QIOChannelRDMASource));
|
|
|
|
ssource = (QIOChannelRDMASource *)source;
|
|
|
|
|
|
|
|
ssource->rioc = rioc;
|
|
|
|
object_ref(OBJECT(rioc));
|
|
|
|
|
|
|
|
ssource->condition = condition;
|
|
|
|
|
|
|
|
return source;
|
|
|
|
}
|
|
|
|
|
migration: implement io_set_aio_fd_handler function for RDMA QIOChannel
if qio_channel_rdma_readv return QIO_CHANNEL_ERR_BLOCK, the destination qemu
crash.
The backtrace is:
(gdb) bt
#0 0x0000000000000000 in ?? ()
#1 0x00000000008db50e in qio_channel_set_aio_fd_handler (ioc=0x38111e0, ctx=0x3726080,
io_read=0x8db841 <qio_channel_restart_read>, io_write=0x0, opaque=0x38111e0) at io/channel.c:
#2 0x00000000008db952 in qio_channel_set_aio_fd_handlers (ioc=0x38111e0) at io/channel.c:438
#3 0x00000000008dbab4 in qio_channel_yield (ioc=0x38111e0, condition=G_IO_IN) at io/channel.c:47
#4 0x00000000007a870b in channel_get_buffer (opaque=0x38111e0, buf=0x440c038 "", pos=0, size=327
at migration/qemu-file-channel.c:83
#5 0x00000000007a70f6 in qemu_fill_buffer (f=0x440c000) at migration/qemu-file.c:299
#6 0x00000000007a79d0 in qemu_peek_byte (f=0x440c000, offset=0) at migration/qemu-file.c:562
#7 0x00000000007a7a22 in qemu_get_byte (f=0x440c000) at migration/qemu-file.c:575
#8 0x00000000007a7c78 in qemu_get_be32 (f=0x440c000) at migration/qemu-file.c:655
#9 0x00000000007a0508 in qemu_loadvm_state (f=0x440c000) at migration/savevm.c:2126
#10 0x0000000000794141 in process_incoming_migration_co (opaque=0x0) at migration/migration.c:366
#11 0x000000000095c598 in coroutine_trampoline (i0=84033984, i1=0) at util/coroutine-ucontext.c:1
#12 0x00007f9c0db56d40 in ?? () from /lib64/libc.so.6
#13 0x00007f96fe858760 in ?? ()
#14 0x0000000000000000 in ?? ()
RDMA QIOChannel not implement io_set_aio_fd_handler. so
qio_channel_set_aio_fd_handler will access NULL pointer.
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:31 +03:00
|
|
|
static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
|
2023-08-31 01:48:02 +03:00
|
|
|
AioContext *read_ctx,
|
|
|
|
IOHandler *io_read,
|
|
|
|
AioContext *write_ctx,
|
|
|
|
IOHandler *io_write,
|
|
|
|
void *opaque)
|
migration: implement io_set_aio_fd_handler function for RDMA QIOChannel
if qio_channel_rdma_readv return QIO_CHANNEL_ERR_BLOCK, the destination qemu
crash.
The backtrace is:
(gdb) bt
#0 0x0000000000000000 in ?? ()
#1 0x00000000008db50e in qio_channel_set_aio_fd_handler (ioc=0x38111e0, ctx=0x3726080,
io_read=0x8db841 <qio_channel_restart_read>, io_write=0x0, opaque=0x38111e0) at io/channel.c:
#2 0x00000000008db952 in qio_channel_set_aio_fd_handlers (ioc=0x38111e0) at io/channel.c:438
#3 0x00000000008dbab4 in qio_channel_yield (ioc=0x38111e0, condition=G_IO_IN) at io/channel.c:47
#4 0x00000000007a870b in channel_get_buffer (opaque=0x38111e0, buf=0x440c038 "", pos=0, size=327
at migration/qemu-file-channel.c:83
#5 0x00000000007a70f6 in qemu_fill_buffer (f=0x440c000) at migration/qemu-file.c:299
#6 0x00000000007a79d0 in qemu_peek_byte (f=0x440c000, offset=0) at migration/qemu-file.c:562
#7 0x00000000007a7a22 in qemu_get_byte (f=0x440c000) at migration/qemu-file.c:575
#8 0x00000000007a7c78 in qemu_get_be32 (f=0x440c000) at migration/qemu-file.c:655
#9 0x00000000007a0508 in qemu_loadvm_state (f=0x440c000) at migration/savevm.c:2126
#10 0x0000000000794141 in process_incoming_migration_co (opaque=0x0) at migration/migration.c:366
#11 0x000000000095c598 in coroutine_trampoline (i0=84033984, i1=0) at util/coroutine-ucontext.c:1
#12 0x00007f9c0db56d40 in ?? () from /lib64/libc.so.6
#13 0x00007f96fe858760 in ?? ()
#14 0x0000000000000000 in ?? ()
RDMA QIOChannel not implement io_set_aio_fd_handler. so
qio_channel_set_aio_fd_handler will access NULL pointer.
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:31 +03:00
|
|
|
{
|
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
|
|
|
if (io_read) {
|
2023-08-31 01:48:02 +03:00
|
|
|
aio_set_fd_handler(read_ctx, rioc->rdmain->recv_comp_channel->fd,
|
|
|
|
io_read, io_write, NULL, NULL, opaque);
|
|
|
|
aio_set_fd_handler(read_ctx, rioc->rdmain->send_comp_channel->fd,
|
|
|
|
io_read, io_write, NULL, NULL, opaque);
|
migration: implement io_set_aio_fd_handler function for RDMA QIOChannel
if qio_channel_rdma_readv return QIO_CHANNEL_ERR_BLOCK, the destination qemu
crash.
The backtrace is:
(gdb) bt
#0 0x0000000000000000 in ?? ()
#1 0x00000000008db50e in qio_channel_set_aio_fd_handler (ioc=0x38111e0, ctx=0x3726080,
io_read=0x8db841 <qio_channel_restart_read>, io_write=0x0, opaque=0x38111e0) at io/channel.c:
#2 0x00000000008db952 in qio_channel_set_aio_fd_handlers (ioc=0x38111e0) at io/channel.c:438
#3 0x00000000008dbab4 in qio_channel_yield (ioc=0x38111e0, condition=G_IO_IN) at io/channel.c:47
#4 0x00000000007a870b in channel_get_buffer (opaque=0x38111e0, buf=0x440c038 "", pos=0, size=327
at migration/qemu-file-channel.c:83
#5 0x00000000007a70f6 in qemu_fill_buffer (f=0x440c000) at migration/qemu-file.c:299
#6 0x00000000007a79d0 in qemu_peek_byte (f=0x440c000, offset=0) at migration/qemu-file.c:562
#7 0x00000000007a7a22 in qemu_get_byte (f=0x440c000) at migration/qemu-file.c:575
#8 0x00000000007a7c78 in qemu_get_be32 (f=0x440c000) at migration/qemu-file.c:655
#9 0x00000000007a0508 in qemu_loadvm_state (f=0x440c000) at migration/savevm.c:2126
#10 0x0000000000794141 in process_incoming_migration_co (opaque=0x0) at migration/migration.c:366
#11 0x000000000095c598 in coroutine_trampoline (i0=84033984, i1=0) at util/coroutine-ucontext.c:1
#12 0x00007f9c0db56d40 in ?? () from /lib64/libc.so.6
#13 0x00007f96fe858760 in ?? ()
#14 0x0000000000000000 in ?? ()
RDMA QIOChannel not implement io_set_aio_fd_handler. so
qio_channel_set_aio_fd_handler will access NULL pointer.
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:31 +03:00
|
|
|
} else {
|
2023-08-31 01:48:02 +03:00
|
|
|
aio_set_fd_handler(write_ctx, rioc->rdmaout->recv_comp_channel->fd,
|
|
|
|
io_read, io_write, NULL, NULL, opaque);
|
|
|
|
aio_set_fd_handler(write_ctx, rioc->rdmaout->send_comp_channel->fd,
|
|
|
|
io_read, io_write, NULL, NULL, opaque);
|
migration: implement io_set_aio_fd_handler function for RDMA QIOChannel
if qio_channel_rdma_readv return QIO_CHANNEL_ERR_BLOCK, the destination qemu
crash.
The backtrace is:
(gdb) bt
#0 0x0000000000000000 in ?? ()
#1 0x00000000008db50e in qio_channel_set_aio_fd_handler (ioc=0x38111e0, ctx=0x3726080,
io_read=0x8db841 <qio_channel_restart_read>, io_write=0x0, opaque=0x38111e0) at io/channel.c:
#2 0x00000000008db952 in qio_channel_set_aio_fd_handlers (ioc=0x38111e0) at io/channel.c:438
#3 0x00000000008dbab4 in qio_channel_yield (ioc=0x38111e0, condition=G_IO_IN) at io/channel.c:47
#4 0x00000000007a870b in channel_get_buffer (opaque=0x38111e0, buf=0x440c038 "", pos=0, size=327
at migration/qemu-file-channel.c:83
#5 0x00000000007a70f6 in qemu_fill_buffer (f=0x440c000) at migration/qemu-file.c:299
#6 0x00000000007a79d0 in qemu_peek_byte (f=0x440c000, offset=0) at migration/qemu-file.c:562
#7 0x00000000007a7a22 in qemu_get_byte (f=0x440c000) at migration/qemu-file.c:575
#8 0x00000000007a7c78 in qemu_get_be32 (f=0x440c000) at migration/qemu-file.c:655
#9 0x00000000007a0508 in qemu_loadvm_state (f=0x440c000) at migration/savevm.c:2126
#10 0x0000000000794141 in process_incoming_migration_co (opaque=0x0) at migration/migration.c:366
#11 0x000000000095c598 in coroutine_trampoline (i0=84033984, i1=0) at util/coroutine-ucontext.c:1
#12 0x00007f9c0db56d40 in ?? () from /lib64/libc.so.6
#13 0x00007f96fe858760 in ?? ()
#14 0x0000000000000000 in ?? ()
RDMA QIOChannel not implement io_set_aio_fd_handler. so
qio_channel_set_aio_fd_handler will access NULL pointer.
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:31 +03:00
|
|
|
}
|
|
|
|
}
|
2016-04-27 13:05:07 +03:00
|
|
|
|
2019-09-13 19:35:07 +03:00
|
|
|
struct rdma_close_rcu {
|
|
|
|
struct rcu_head rcu;
|
|
|
|
RDMAContext *rdmain;
|
|
|
|
RDMAContext *rdmaout;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* callback from qio_channel_rdma_close via call_rcu */
|
|
|
|
static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
|
|
|
|
{
|
|
|
|
if (rcu->rdmain) {
|
|
|
|
qemu_rdma_cleanup(rcu->rdmain);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rcu->rdmaout) {
|
|
|
|
qemu_rdma_cleanup(rcu->rdmaout);
|
|
|
|
}
|
|
|
|
|
|
|
|
g_free(rcu->rdmain);
|
|
|
|
g_free(rcu->rdmaout);
|
|
|
|
g_free(rcu);
|
|
|
|
}
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
static int qio_channel_rdma_close(QIOChannel *ioc,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdmain, *rdmaout;
|
2019-09-13 19:35:07 +03:00
|
|
|
struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_close();
|
2018-08-06 16:29:29 +03:00
|
|
|
|
|
|
|
rdmain = rioc->rdmain;
|
|
|
|
if (rdmain) {
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_rcu_set(&rioc->rdmain, NULL);
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
rdmaout = rioc->rdmaout;
|
|
|
|
if (rdmaout) {
|
2020-09-23 13:56:46 +03:00
|
|
|
qatomic_rcu_set(&rioc->rdmaout, NULL);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
2018-08-06 16:29:29 +03:00
|
|
|
|
2019-09-13 19:35:07 +03:00
|
|
|
rcu->rdmain = rdmain;
|
|
|
|
rcu->rdmaout = rdmaout;
|
|
|
|
call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
|
2018-08-06 16:29:29 +03:00
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
migration: implement the shutdown for RDMA QIOChannel
Because RDMA QIOChannel not implement shutdown function,
If the to_dst_file was set error, the return path thread
will wait forever. and the migration thread will wait
return path thread exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
static int
|
|
|
|
qio_channel_rdma_shutdown(QIOChannel *ioc,
|
|
|
|
QIOChannelShutdown how,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
|
|
|
RDMAContext *rdmain, *rdmaout;
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
migration: implement the shutdown for RDMA QIOChannel
Because RDMA QIOChannel not implement shutdown function,
If the to_dst_file was set error, the return path thread
will wait forever. and the migration thread will wait
return path thread exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
|
2020-09-23 13:56:46 +03:00
|
|
|
rdmain = qatomic_rcu_read(&rioc->rdmain);
|
|
|
|
rdmaout = qatomic_rcu_read(&rioc->rdmain);
|
migration: implement the shutdown for RDMA QIOChannel
Because RDMA QIOChannel not implement shutdown function,
If the to_dst_file was set error, the return path thread
will wait forever. and the migration thread will wait
return path thread exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
|
|
|
|
switch (how) {
|
|
|
|
case QIO_CHANNEL_SHUTDOWN_READ:
|
|
|
|
if (rdmain) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdmain->errored = true;
|
migration: implement the shutdown for RDMA QIOChannel
Because RDMA QIOChannel not implement shutdown function,
If the to_dst_file was set error, the return path thread
will wait forever. and the migration thread will wait
return path thread exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case QIO_CHANNEL_SHUTDOWN_WRITE:
|
|
|
|
if (rdmaout) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdmaout->errored = true;
|
migration: implement the shutdown for RDMA QIOChannel
Because RDMA QIOChannel not implement shutdown function,
If the to_dst_file was set error, the return path thread
will wait forever. and the migration thread will wait
return path thread exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case QIO_CHANNEL_SHUTDOWN_BOTH:
|
|
|
|
default:
|
|
|
|
if (rdmain) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdmain->errored = true;
|
migration: implement the shutdown for RDMA QIOChannel
Because RDMA QIOChannel not implement shutdown function,
If the to_dst_file was set error, the return path thread
will wait forever. and the migration thread will wait
return path thread exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
}
|
|
|
|
if (rdmaout) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdmaout->errored = true;
|
migration: implement the shutdown for RDMA QIOChannel
Because the RDMA QIOChannel does not implement the shutdown function,
if to_dst_file is put into an error state, the return path thread
will wait forever, and the migration thread will in turn wait for the
return path thread to exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
|
|
|
|
* Parameters:
|
|
|
|
* @offset == 0 :
|
|
|
|
* This means that 'block_offset' is a full virtual address that does not
|
|
|
|
* belong to a RAMBlock of the virtual machine and instead
|
|
|
|
* represents a private malloc'd memory area that the caller wishes to
|
|
|
|
* transfer.
|
|
|
|
*
|
|
|
|
* @offset != 0 :
|
|
|
|
* Offset is an offset to be added to block_offset and used
|
|
|
|
* to also lookup the corresponding RAMBlock.
|
|
|
|
*
|
2022-06-20 14:01:47 +03:00
|
|
|
* @size : Number of bytes to transfer
|
2013-07-22 18:01:54 +04:00
|
|
|
*
|
2023-05-15 22:57:08 +03:00
|
|
|
* @pages_sent : User-specificed pointer to indicate how many pages were
|
2013-07-22 18:01:54 +04:00
|
|
|
* sent. Usually, this will not be more than a few bytes of
|
|
|
|
* the protocol because most transfers are sent asynchronously.
|
|
|
|
*/
|
2023-05-15 22:57:08 +03:00
|
|
|
static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset,
|
|
|
|
ram_addr_t offset, size_t size)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
2022-06-20 14:01:55 +03:00
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2013-07-22 18:01:54 +04:00
|
|
|
int ret;
|
|
|
|
|
2023-05-04 14:44:43 +03:00
|
|
|
if (migration_in_postcopy()) {
|
|
|
|
return RAM_SAVE_CONTROL_NOT_SUPP;
|
|
|
|
}
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rioc->rdmaout);
|
2018-08-06 16:29:29 +03:00
|
|
|
|
|
|
|
if (!rdma) {
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:53 +03:00
|
|
|
if (rdma_errored(rdma)) {
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2023-09-28 16:19:45 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
qemu_fflush(f);
|
|
|
|
|
2022-06-20 14:01:47 +03:00
|
|
|
/*
|
|
|
|
* Add this page to the current 'chunk'. If the chunk
|
|
|
|
* is full, or the page doesn't belong to the current chunk,
|
|
|
|
* an actual RDMA write will occur and a new chunk will be formed.
|
|
|
|
*/
|
2023-05-15 22:57:05 +03:00
|
|
|
ret = qemu_rdma_write(rdma, block_offset, offset, size);
|
2022-06-20 14:01:47 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:43 +03:00
|
|
|
error_report("rdma migration: write error");
|
2022-06-20 14:01:47 +03:00
|
|
|
goto err;
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Drain the Completion Queue if possible, but do not block,
|
|
|
|
* just poll.
|
|
|
|
*
|
|
|
|
* If nothing to poll, the end of the iteration will do this
|
|
|
|
* again to make sure we don't overflow the request queue.
|
|
|
|
*/
|
|
|
|
while (1) {
|
|
|
|
uint64_t wr_id, wr_id_in;
|
2023-09-21 15:13:06 +03:00
|
|
|
ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
|
|
|
|
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:43 +03:00
|
|
|
error_report("rdma migration: polling error");
|
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888
qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)
source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S
qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu)
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)
NOTE: we use soft RoCE as the rdma device.
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0
This migration could not be completed when out of order(OOO) CQ event occurs.
The send queue and receive queue shared a same completion queue, and
qemu_rdma_block_for_wrid() will drop the CQs it's not interested in. But
the dropped CQs by qemu_rdma_block_for_wrid() could be later CQs it wants.
So in this case, qemu_rdma_block_for_wrid() will block forever.
OOO cases will occur in both source side and destination side. And a
forever blocking happens on only SEND and RECV are out of order. OOO between
'WRITE RDMA' and 'RECV' doesn't matter.
below the OOO sequence:
source destination
rdma_write_one() qemu_rdma_registration_handle()
1. S1: post_recv X D1: post_recv Y
2. wait for recv CQ event X
3. D2: post_send X ---------------+
4. wait for send CQ send event X (D2) |
5. recv CQ event X reaches (D2) |
6. +-S2: post_send Y |
7. | wait for send CQ event Y |
8. | recv CQ event Y (S2) (drop it) |
9. +-send CQ event Y reaches (S2) |
10. send CQ event X reaches (D2) -----+
11. wait recv CQ event Y (dropped by (8))
Although a hardware IB works fine in my a hundred of runs, the IB specification
doesn't guaratee the CQ order in such case.
Here we introduce a independent send completion queue to distinguish
ibv_post_send completion queue from the original mixed completion queue.
It helps us to poll the specific CQE we are really interested in.
Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2021-10-29 05:14:47 +03:00
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
|
|
|
|
|
|
|
|
if (wr_id == RDMA_WRID_NONE) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
uint64_t wr_id, wr_id_in;
|
2023-09-21 15:13:06 +03:00
|
|
|
ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
if (ret < 0) {
|
2023-09-28 16:19:43 +03:00
|
|
|
error_report("rdma migration: polling error");
|
2013-07-22 18:01:54 +04:00
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
|
|
|
|
|
|
|
|
if (wr_id == RDMA_WRID_NONE) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return RAM_SAVE_CONTROL_DELAYED;
|
2023-09-28 16:19:49 +03:00
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
err:
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2018-08-06 16:29:28 +03:00
|
|
|
static void rdma_accept_incoming_migration(void *opaque);
|
|
|
|
|
2018-08-06 16:29:35 +03:00
|
|
|
static void rdma_cm_poll_handler(void *opaque)
|
|
|
|
{
|
|
|
|
RDMAContext *rdma = opaque;
|
|
|
|
int ret;
|
|
|
|
struct rdma_cm_event *cm_event;
|
|
|
|
MigrationIncomingState *mis = migration_incoming_get_current();
|
|
|
|
|
|
|
|
ret = rdma_get_cm_event(rdma->channel, &cm_event);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2018-08-06 16:29:35 +03:00
|
|
|
error_report("get_cm_event failed %d", errno);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
|
|
|
|
cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
|
2023-09-28 16:19:53 +03:00
|
|
|
if (!rdma->errored &&
|
2019-09-13 19:35:06 +03:00
|
|
|
migration_incoming_get_current()->state !=
|
|
|
|
MIGRATION_STATUS_COMPLETED) {
|
|
|
|
error_report("receive cm event, cm event is %d", cm_event->event);
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2019-09-13 19:35:06 +03:00
|
|
|
if (rdma->return_path) {
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->return_path->errored = true;
|
2019-09-13 19:35:06 +03:00
|
|
|
}
|
2018-08-06 16:29:35 +03:00
|
|
|
}
|
2021-06-02 05:35:06 +03:00
|
|
|
rdma_ack_cm_event(cm_event);
|
2023-05-15 16:06:39 +03:00
|
|
|
if (mis->loadvm_co) {
|
|
|
|
qemu_coroutine_enter(mis->loadvm_co);
|
2018-08-06 16:29:35 +03:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2021-06-02 05:35:06 +03:00
|
|
|
rdma_ack_cm_event(cm_event);
|
2018-08-06 16:29:35 +03:00
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
 * Accept one incoming RDMA connection request on @rdma's CM channel.
 *
 * The function waits for a CONNECT_REQUEST event, negotiates the
 * capabilities carried in the request's private data, allocates the
 * protection domain, completion queues and queue pair, registers the
 * control buffers, accepts the connection and waits for it to be
 * ESTABLISHED, then posts the first control receive.
 *
 * When postcopy or the return path is enabled and @rdma is not itself a
 * return path, a second RDMAContext is created for the return path and
 * the channel's fd handler is switched to accept that second connection.
 *
 * Returns 0 on success.  On any failure the context is marked as errored,
 * cleaned up with qemu_rdma_cleanup(), the return-path context created
 * here (if any) is released, and -1 is returned.
 */
static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
        .responder_resources = 2,
        .private_data = &cap,
        .private_data_len = sizeof(cap),
    };
    RDMAContext *rdma_return_path = NULL;
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret < 0) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * initialize the RDMAContext for return path for postcopy after first
     * connection request reached.
     */
    if ((migrate_postcopy() || migrate_return_path())
        && !rdma->is_return_path) {
        rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
        if (rdma_return_path == NULL) {
            rdma_ack_cm_event(cm_event);
            goto err_rdma_dest_wait;
        }

        qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
    }

    /*
     * The capabilities come from the peer: make sure the request really
     * carries enough private data before copying sizeof(cap) bytes of it.
     */
    if (!cm_event->param.conn.private_data ||
        cm_event->param.conn.private_data_len < sizeof(cap)) {
        error_report("rdma migration: connect request carries no or "
                     "truncated capability data");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        error_report("Unknown source RDMA version: %d, bailing...",
                     cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    /* Grab what we need from the event before acknowledging it. */
    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    trace_qemu_rdma_accept_pin_state(rdma->pin_all);

    /* cap is sent back to the peer through conn_param.private_data. */
    caps_to_network(&cap);

    trace_qemu_rdma_accept_pin_verbsc(verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        error_report("ibv context not matching %p, %p!", rdma->verbs,
                     verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret < 0) {
        error_report("rdma migration: error allocating pd and cq!");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret < 0) {
        error_report("rdma migration: error allocating qp!");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_init_ram_blocks(rdma);

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret < 0) {
            error_report("rdma: error registering %d control", idx);
            goto err_rdma_dest_wait;
        }
    }

    /* Accept the second connection request for return path */
    if ((migrate_postcopy() || migrate_return_path())
        && !rdma->is_return_path) {
        qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                            NULL, rdma->return_path);
    } else {
        qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
                            NULL, rdma);
    }

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret < 0) {
        error_report("rdma_accept failed");
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret < 0) {
        error_report("rdma_accept get_cm_event failed");
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        error_report("rdma_accept not event established");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret < 0) {
        error_report("rdma migration: error posting second control recv");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->errored = true;
    qemu_rdma_cleanup(rdma);
    /*
     * The return-path context created above is about to be freed; do not
     * leave @rdma pointing at it.
     */
    if (rdma_return_path && rdma->return_path == rdma_return_path) {
        rdma->return_path = NULL;
    }
    g_free(rdma_return_path);
    return -1;
}
|
|
|
|
|
2015-06-11 20:17:26 +03:00
|
|
|
static int dest_ram_sort_func(const void *a, const void *b)
|
|
|
|
{
|
|
|
|
unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
|
|
|
|
unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
|
|
|
|
|
|
|
|
return (a_index < b_index) ? -1 : (a_index != b_index);
|
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
/*
|
|
|
|
* During each iteration of the migration, we listen for instructions
|
|
|
|
* by the source VM to perform dynamic page registrations before they
|
|
|
|
* can perform RDMA operations.
|
|
|
|
*
|
|
|
|
* We respond with the 'rkey'.
|
|
|
|
*
|
|
|
|
* Keep doing this until the source tells us to stop.
|
|
|
|
*/
|
2023-05-03 16:18:36 +03:00
|
|
|
static int qemu_rdma_registration_handle(QEMUFile *f)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
|
|
|
RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
|
|
|
|
.type = RDMA_CONTROL_REGISTER_RESULT,
|
|
|
|
.repeat = 0,
|
|
|
|
};
|
|
|
|
RDMAControlHeader unreg_resp = { .len = 0,
|
|
|
|
.type = RDMA_CONTROL_UNREGISTER_FINISHED,
|
|
|
|
.repeat = 0,
|
|
|
|
};
|
|
|
|
RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
|
|
|
|
.repeat = 1 };
|
2023-05-03 16:18:36 +03:00
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
|
|
|
RDMALocalBlocks *local;
|
2013-07-22 18:01:54 +04:00
|
|
|
RDMAControlHeader head;
|
|
|
|
RDMARegister *reg, *registers;
|
|
|
|
RDMACompress *comp;
|
|
|
|
RDMARegisterResult *reg_result;
|
|
|
|
static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
|
|
|
|
RDMALocalBlock *block;
|
|
|
|
void *host_addr;
|
2023-09-28 16:19:54 +03:00
|
|
|
int ret;
|
2013-07-22 18:01:54 +04:00
|
|
|
int idx = 0;
|
|
|
|
int count = 0;
|
|
|
|
int i = 0;
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rioc->rdmain);
|
2018-08-06 16:29:29 +03:00
|
|
|
|
|
|
|
if (!rdma) {
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:53 +03:00
|
|
|
if (rdma_errored(rdma)) {
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2023-09-28 16:19:45 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2018-08-06 16:29:29 +03:00
|
|
|
local = &rdma->local_ram_blocks;
|
2013-07-22 18:01:54 +04:00
|
|
|
do {
|
2015-06-11 20:17:23 +03:00
|
|
|
trace_qemu_rdma_registration_handle_wait();
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma: Too many requests in this message (%d)."
|
|
|
|
"Bailing.", head.repeat);
|
2013-07-22 18:01:54 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (head.type) {
|
|
|
|
case RDMA_CONTROL_COMPRESS:
|
|
|
|
comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
|
|
|
|
network_to_compress(comp);
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_compress(comp->length,
|
|
|
|
comp->block_idx,
|
|
|
|
comp->offset);
|
2015-06-11 20:17:27 +03:00
|
|
|
if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
|
|
|
|
error_report("rdma: 'compress' bad block index %u (vs %d)",
|
|
|
|
(unsigned int)comp->block_idx,
|
|
|
|
rdma->local_ram_blocks.nb_blocks);
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2015-06-11 20:17:27 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
block = &(rdma->local_ram_blocks.block[comp->block_idx]);
|
|
|
|
|
|
|
|
host_addr = block->local_host_addr +
|
|
|
|
(comp->offset - block->offset);
|
|
|
|
|
|
|
|
ram_handle_compressed(host_addr, comp->value, comp->length);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case RDMA_CONTROL_REGISTER_FINISHED:
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_finished();
|
2023-09-28 16:19:49 +03:00
|
|
|
return 0;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_ram_blocks();
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-06-11 20:17:26 +03:00
|
|
|
/* Sort our local RAM Block list so it's the same as the source,
|
|
|
|
* we can do this since we've filled in a src_index in the list
|
|
|
|
* as we received the RAMBlock list earlier.
|
|
|
|
*/
|
|
|
|
qsort(rdma->local_ram_blocks.block,
|
|
|
|
rdma->local_ram_blocks.nb_blocks,
|
|
|
|
sizeof(RDMALocalBlock), dest_ram_sort_func);
|
2018-05-06 17:54:58 +03:00
|
|
|
for (i = 0; i < local->nb_blocks; i++) {
|
|
|
|
local->block[i].index = i;
|
|
|
|
}
|
|
|
|
|
2013-07-22 18:01:54 +04:00
|
|
|
if (rdma->pin_all) {
|
|
|
|
ret = qemu_rdma_reg_whole_ram_blocks(rdma);
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: error dest "
|
|
|
|
"registering ram blocks");
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Dest uses this to prepare to transmit the RAMBlock descriptions
|
|
|
|
* to the source VM after connection setup.
|
|
|
|
* Both sides use the "remote" structure to communicate and update
|
|
|
|
* their "local" descriptions with what was sent.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < local->nb_blocks; i++) {
|
2015-04-20 18:57:16 +03:00
|
|
|
rdma->dest_blocks[i].remote_host_addr =
|
2015-02-28 21:09:43 +03:00
|
|
|
(uintptr_t)(local->block[i].local_host_addr);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (rdma->pin_all) {
|
2015-04-20 18:57:16 +03:00
|
|
|
rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2015-04-20 18:57:16 +03:00
|
|
|
rdma->dest_blocks[i].offset = local->block[i].offset;
|
|
|
|
rdma->dest_blocks[i].length = local->block[i].length;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-04-20 18:57:16 +03:00
|
|
|
dest_block_to_network(&rdma->dest_blocks[i]);
|
2015-06-11 20:17:26 +03:00
|
|
|
trace_qemu_rdma_registration_handle_ram_blocks_loop(
|
|
|
|
local->block[i].block_name,
|
|
|
|
local->block[i].offset,
|
|
|
|
local->block[i].length,
|
|
|
|
local->block[i].local_host_addr,
|
|
|
|
local->block[i].src_index);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
blocks.len = rdma->local_ram_blocks.nb_blocks
|
2015-04-20 18:57:16 +03:00
|
|
|
* sizeof(RDMADestBlock);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
|
|
|
|
ret = qemu_rdma_post_send_control(rdma,
|
2015-04-20 18:57:16 +03:00
|
|
|
(uint8_t *) rdma->dest_blocks, &blocks);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("rdma migration: error sending remote info");
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
case RDMA_CONTROL_REGISTER_REQUEST:
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_register(head.repeat);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
reg_resp.repeat = head.repeat;
|
|
|
|
registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
|
|
|
|
|
|
|
|
for (count = 0; count < head.repeat; count++) {
|
|
|
|
uint64_t chunk;
|
|
|
|
uint8_t *chunk_start, *chunk_end;
|
|
|
|
|
|
|
|
reg = ®isters[count];
|
|
|
|
network_to_register(reg);
|
|
|
|
|
|
|
|
reg_result = &results[count];
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_register_loop(count,
|
2013-07-22 18:01:54 +04:00
|
|
|
reg->current_index, reg->key.current_addr, reg->chunks);
|
|
|
|
|
2015-06-11 20:17:27 +03:00
|
|
|
if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
|
|
|
|
error_report("rdma: 'register' bad block index %u (vs %d)",
|
|
|
|
(unsigned int)reg->current_index,
|
|
|
|
rdma->local_ram_blocks.nb_blocks);
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2015-06-11 20:17:27 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
block = &(rdma->local_ram_blocks.block[reg->current_index]);
|
|
|
|
if (block->is_ram_block) {
|
2015-06-11 20:17:27 +03:00
|
|
|
if (block->offset > reg->key.current_addr) {
|
|
|
|
error_report("rdma: bad register address for block %s"
|
|
|
|
" offset: %" PRIx64 " current_addr: %" PRIx64,
|
|
|
|
block->block_name, block->offset,
|
|
|
|
reg->key.current_addr);
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2015-06-11 20:17:27 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
host_addr = (block->local_host_addr +
|
|
|
|
(reg->key.current_addr - block->offset));
|
|
|
|
chunk = ram_chunk_index(block->local_host_addr,
|
|
|
|
(uint8_t *) host_addr);
|
|
|
|
} else {
|
|
|
|
chunk = reg->key.chunk;
|
|
|
|
host_addr = block->local_host_addr +
|
|
|
|
(reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
|
2015-06-11 20:17:27 +03:00
|
|
|
/* Check for particularly bad chunk value */
|
|
|
|
if (host_addr < (void *)block->local_host_addr) {
|
|
|
|
error_report("rdma: bad chunk for block %s"
|
|
|
|
" chunk: %" PRIx64,
|
|
|
|
block->block_name, reg->key.chunk);
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2015-06-11 20:17:27 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
chunk_start = ram_chunk_start(block, chunk);
|
|
|
|
chunk_end = ram_chunk_end(block, chunk + reg->chunks);
|
2019-03-04 21:49:23 +03:00
|
|
|
/* avoid "-Waddress-of-packed-member" warning */
|
|
|
|
uint32_t tmp_rkey = 0;
|
2013-07-22 18:01:54 +04:00
|
|
|
if (qemu_rdma_register_and_get_keys(rdma, block,
|
2019-03-04 21:49:23 +03:00
|
|
|
(uintptr_t)host_addr, NULL, &tmp_rkey,
|
2013-07-22 18:01:54 +04:00
|
|
|
chunk, chunk_start, chunk_end)) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("cannot get rkey");
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
2019-03-04 21:49:23 +03:00
|
|
|
reg_result->rkey = tmp_rkey;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-02-28 21:09:43 +03:00
|
|
|
reg_result->host_addr = (uintptr_t)block->local_host_addr;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_register_rkey(
|
|
|
|
reg_result->rkey);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
result_to_network(reg_result);
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = qemu_rdma_post_send_control(rdma,
|
|
|
|
(uint8_t *) results, ®_resp);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Failed to send control buffer");
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case RDMA_CONTROL_UNREGISTER_REQUEST:
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_unregister(head.repeat);
|
2013-07-22 18:01:54 +04:00
|
|
|
unreg_resp.repeat = head.repeat;
|
|
|
|
registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
|
|
|
|
|
|
|
|
for (count = 0; count < head.repeat; count++) {
|
|
|
|
reg = ®isters[count];
|
|
|
|
network_to_register(reg);
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_unregister_loop(count,
|
|
|
|
reg->current_index, reg->key.chunk);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
block = &(rdma->local_ram_blocks.block[reg->current_index]);
|
|
|
|
|
|
|
|
ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
|
|
|
|
block->pmr[reg->key.chunk] = NULL;
|
|
|
|
|
|
|
|
if (ret != 0) {
|
|
|
|
perror("rdma unregistration chunk failed");
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
rdma->total_registrations--;
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_handle_unregister_success(
|
|
|
|
reg->key.chunk);
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Failed to send control buffer");
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
case RDMA_CONTROL_REGISTER_RESULT:
|
2015-02-02 22:53:33 +03:00
|
|
|
error_report("Invalid RESULT message at dest.");
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
default:
|
2017-07-17 14:09:35 +03:00
|
|
|
error_report("Unknown control message %s", control_desc(head.type));
|
2023-09-28 16:19:49 +03:00
|
|
|
goto err;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
} while (1);
|
2023-09-28 16:19:49 +03:00
|
|
|
|
|
|
|
err:
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2015-06-11 20:17:26 +03:00
|
|
|
/* Destination:
|
|
|
|
* Called via a ram_control_load_hook during the initial RAM load section which
|
|
|
|
* lists the RAMBlocks by name. This lets us know the order of the RAMBlocks
|
|
|
|
* on the source.
|
|
|
|
* We've already built our local RAMBlock list, but not yet sent the list to
|
|
|
|
* the source.
|
|
|
|
*/
|
2016-04-27 13:05:07 +03:00
|
|
|
static int
|
2023-05-04 14:44:41 +03:00
|
|
|
rdma_block_notification_handle(QEMUFile *f, const char *name)
|
2015-06-11 20:17:26 +03:00
|
|
|
{
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2023-05-04 14:44:41 +03:00
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
|
2015-06-11 20:17:26 +03:00
|
|
|
int curr;
|
|
|
|
int found = -1;
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rioc->rdmain);
|
2018-08-06 16:29:29 +03:00
|
|
|
|
|
|
|
if (!rdma) {
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
2015-06-11 20:17:26 +03:00
|
|
|
/* Find the matching RAMBlock in our local list */
|
|
|
|
for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
|
|
|
|
if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
|
|
|
|
found = curr;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (found == -1) {
|
|
|
|
error_report("RAMBlock '%s' not found on destination", name);
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2015-06-11 20:17:26 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
|
|
|
|
trace_rdma_block_notification_handle(name, rdma->next_src_index);
|
|
|
|
rdma->next_src_index++;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-06-20 14:01:55 +03:00
|
|
|
/*
 * RAM load hook: dispatches on @flags.
 *
 * RAM_CONTROL_BLOCK_REG forwards @data (used as the block name) to
 * rdma_block_notification_handle(); RAM_CONTROL_HOOK runs
 * qemu_rdma_registration_handle().  Any other value aborts.
 *
 * Returns whatever the selected handler returns.
 */
static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data)
{
    if (flags == RAM_CONTROL_BLOCK_REG) {
        return rdma_block_notification_handle(f, data);
    }

    if (flags == RAM_CONTROL_HOOK) {
        return qemu_rdma_registration_handle(f);
    }

    /* Shouldn't be called with any other values */
    abort();
}
|
|
|
|
|
2022-06-20 14:01:55 +03:00
|
|
|
/*
 * Hook run before a RAM iteration: writes RAM_SAVE_FLAG_HOOK into the
 * stream and flushes it.
 *
 * Does nothing and returns 0 while migration is in postcopy.
 * Returns -1 if the outgoing RDMA context is missing or already errored,
 * 0 otherwise.  @data is unused.
 */
static int qemu_rdma_registration_start(QEMUFile *f,
                                        uint64_t flags, void *data)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
    RDMAContext *rdma;

    if (migration_in_postcopy()) {
        return 0;
    }

    RCU_READ_LOCK_GUARD();
    rdma = qatomic_rcu_read(&rioc->rdmaout);

    /* No context, or one that already failed: nothing can be sent */
    if (rdma == NULL || rdma_errored(rdma)) {
        return -1;
    }

    trace_qemu_rdma_registration_start(flags);

    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    qemu_fflush(f);

    return 0;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inform dest that dynamic registrations are done for now.
|
|
|
|
* First, flush writes, if any.
|
|
|
|
*/
|
2022-06-20 14:01:55 +03:00
|
|
|
static int qemu_rdma_registration_stop(QEMUFile *f,
|
2015-06-11 20:17:23 +03:00
|
|
|
uint64_t flags, void *data)
|
2013-07-22 18:01:54 +04:00
|
|
|
{
|
2022-06-20 14:01:55 +03:00
|
|
|
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
|
2018-08-06 16:29:29 +03:00
|
|
|
RDMAContext *rdma;
|
2013-07-22 18:01:54 +04:00
|
|
|
RDMAControlHeader head = { .len = 0, .repeat = 1 };
|
2023-09-28 16:19:54 +03:00
|
|
|
int ret;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2023-05-04 14:44:43 +03:00
|
|
|
if (migration_in_postcopy()) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-10-07 17:36:40 +03:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2020-09-23 13:56:46 +03:00
|
|
|
rdma = qatomic_rcu_read(&rioc->rdmaout);
|
2018-08-06 16:29:29 +03:00
|
|
|
if (!rdma) {
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2018-08-06 16:29:29 +03:00
|
|
|
}
|
|
|
|
|
2023-09-28 16:19:53 +03:00
|
|
|
if (rdma_errored(rdma)) {
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2023-09-28 16:19:45 +03:00
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
qemu_fflush(f);
|
2023-05-15 22:57:05 +03:00
|
|
|
ret = qemu_rdma_drain_cq(rdma);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
if (ret < 0) {
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (flags == RAM_CONTROL_SETUP) {
|
|
|
|
RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
|
|
|
|
RDMALocalBlocks *local = &rdma->local_ram_blocks;
|
2015-06-11 20:17:26 +03:00
|
|
|
int reg_result_idx, i, nb_dest_blocks;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_stop_ram();
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure that we parallelize the pinning on both sides.
|
|
|
|
* For very large guests, doing this serially takes a really
|
|
|
|
* long time, so we have to 'interleave' the pinning locally
|
|
|
|
* with the control messages by performing the pinning on this
|
|
|
|
* side before we receive the control response from the other
|
|
|
|
* side that the pinning has completed.
|
|
|
|
*/
|
|
|
|
ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
|
|
|
|
®_result_idx, rdma->pin_all ?
|
|
|
|
qemu_rdma_reg_whole_ram_blocks : NULL);
|
|
|
|
if (ret < 0) {
|
migration/rdma: Plug memory leaks in qemu_rdma_registration_stop()
qemu_rdma_registration_stop() uses the ERROR() macro to create, report
to stderr, and store an Error object. The stored Error object is
never used, and its memory is leaked.
Even where ERROR() doesn't leak, it is ill-advised. The whole point
of passing an Error to the caller is letting the caller handle the
error. Error handling may report to stderr, to somewhere else, or not
at all. Also reporting in the callee mixes up concerns that should be
kept separate. Since I don't know what reporting to stderr is
supposed to accomplish, I'm not touching it.
Commit 2a1bc8bde7 "migration/rdma: rdma_accept_incoming_migration fix
error handling" plugged the same leak in
rdma_accept_incoming_migration().
Plug the memory leak the same way: keep the report part, delete the
store part.
The report part uses fprintf(). If it's truly an error, it should use
error_report() instead. But I don't know, so I leave it alone, just
like commit 2a1bc8bde7 did.
Fixes: 2da776db4846eadcb808598a5d3484d149773c05
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Juan Quintela <quintela@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20200630090351.1247703-27-armbru@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2020-06-30 12:03:51 +03:00
|
|
|
fprintf(stderr, "receiving remote info!");
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2015-04-20 18:57:16 +03:00
|
|
|
nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The protocol uses two different sets of rkeys (mutually exclusive):
|
|
|
|
* 1. One key to represent the virtual address of the entire ram block.
|
|
|
|
* (dynamic chunk registration disabled - pin everything with one rkey.)
|
|
|
|
* 2. One to represent individual chunks within a ram block.
|
|
|
|
* (dynamic chunk registration enabled - pin individual chunks.)
|
|
|
|
*
|
|
|
|
* Once the capability is successfully negotiated, the destination transmits
|
|
|
|
* the keys to use (or sends them later) including the virtual addresses
|
|
|
|
* and then propagates the remote ram block descriptions to his local copy.
|
|
|
|
*/
|
|
|
|
|
2015-04-20 18:57:16 +03:00
|
|
|
if (local->nb_blocks != nb_dest_blocks) {
|
migration/rdma: Plug memory leaks in qemu_rdma_registration_stop()
qemu_rdma_registration_stop() uses the ERROR() macro to create, report
to stderr, and store an Error object. The stored Error object is
never used, and its memory is leaked.
Even where ERROR() doesn't leak, it is ill-advised. The whole point
of passing an Error to the caller is letting the caller handle the
error. Error handling may report to stderr, to somewhere else, or not
at all. Also reporting in the callee mixes up concerns that should be
kept separate. Since I don't know what reporting to stderr is
supposed to accomplish, I'm not touching it.
Commit 2a1bc8bde7 "migration/rdma: rdma_accept_incoming_migration fix
error handling" plugged the same leak in
rdma_accept_incoming_migration().
Plug the memory leak the same way: keep the report part, delete the
store part.
The report part uses fprintf(). If it's truly an error, it should use
error_report() instead. But I don't know, so I leave it alone, just
like commit 2a1bc8bde7 did.
Fixes: 2da776db4846eadcb808598a5d3484d149773c05
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Juan Quintela <quintela@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20200630090351.1247703-27-armbru@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2020-06-30 12:03:51 +03:00
|
|
|
fprintf(stderr, "ram blocks mismatch (Number of blocks %d vs %d) "
|
|
|
|
"Your QEMU command line parameters are probably "
|
|
|
|
"not identical on both the source and destination.",
|
|
|
|
local->nb_blocks, nb_dest_blocks);
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2013-08-10 00:05:40 +04:00
|
|
|
qemu_rdma_move_header(rdma, reg_result_idx, &resp);
|
2015-04-20 18:57:16 +03:00
|
|
|
memcpy(rdma->dest_blocks,
|
2013-08-10 00:05:40 +04:00
|
|
|
rdma->wr_data[reg_result_idx].control_curr, resp.len);
|
2015-04-20 18:57:16 +03:00
|
|
|
for (i = 0; i < nb_dest_blocks; i++) {
|
|
|
|
network_to_dest_block(&rdma->dest_blocks[i]);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-06-11 20:17:26 +03:00
|
|
|
/* We require that the blocks are in the same order */
|
|
|
|
if (rdma->dest_blocks[i].length != local->block[i].length) {
|
migration/rdma: Plug memory leaks in qemu_rdma_registration_stop()
qemu_rdma_registration_stop() uses the ERROR() macro to create, report
to stderr, and store an Error object. The stored Error object is
never used, and its memory is leaked.
Even where ERROR() doesn't leak, it is ill-advised. The whole point
of passing an Error to the caller is letting the caller handle the
error. Error handling may report to stderr, to somewhere else, or not
at all. Also reporting in the callee mixes up concerns that should be
kept separate. Since I don't know what reporting to stderr is
supposed to accomplish, I'm not touching it.
Commit 2a1bc8bde7 "migration/rdma: rdma_accept_incoming_migration fix
error handling" plugged the same leak in
rdma_accept_incoming_migration().
Plug the memory leak the same way: keep the report part, delete the
store part.
The report part uses fprintf(). If it's truly an error, it should use
error_report() instead. But I don't know, so I leave it alone, just
like commit 2a1bc8bde7 did.
Fixes: 2da776db4846eadcb808598a5d3484d149773c05
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Juan Quintela <quintela@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20200630090351.1247703-27-armbru@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
2020-06-30 12:03:51 +03:00
|
|
|
fprintf(stderr, "Block %s/%d has a different length %" PRIu64
|
|
|
|
"vs %" PRIu64, local->block[i].block_name, i,
|
|
|
|
local->block[i].length,
|
|
|
|
rdma->dest_blocks[i].length);
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
2015-06-11 20:17:26 +03:00
|
|
|
local->block[i].remote_host_addr =
|
|
|
|
rdma->dest_blocks[i].remote_host_addr;
|
|
|
|
local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-02 22:53:33 +03:00
|
|
|
trace_qemu_rdma_registration_stop(flags);
|
2013-07-22 18:01:54 +04:00
|
|
|
|
|
|
|
head.type = RDMA_CONTROL_REGISTER_FINISHED;
|
|
|
|
ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
|
|
|
|
|
|
|
|
if (ret < 0) {
|
|
|
|
goto err;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
err:
|
2023-09-28 16:19:53 +03:00
|
|
|
rdma->errored = true;
|
2023-09-28 16:19:49 +03:00
|
|
|
return -1;
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
2016-04-27 13:04:55 +03:00
|
|
|
static const QEMUFileHooks rdma_read_hooks = {
|
2015-06-11 20:17:23 +03:00
|
|
|
.hook_ram_load = rdma_load_hook,
|
2013-07-22 18:01:54 +04:00
|
|
|
};
|
|
|
|
|
2016-04-27 13:04:55 +03:00
|
|
|
static const QEMUFileHooks rdma_write_hooks = {
|
2013-07-22 18:01:54 +04:00
|
|
|
.before_ram_iterate = qemu_rdma_registration_start,
|
|
|
|
.after_ram_iterate = qemu_rdma_registration_stop,
|
|
|
|
.save_page = qemu_rdma_save_page,
|
|
|
|
};
|
|
|
|
|
2016-04-27 13:05:07 +03:00
|
|
|
|
|
|
|
/*
 * Instance finalizer: cleans up and frees whichever of the channel's
 * two RDMA contexts (rdmain, rdmaout) are still set, then clears the
 * corresponding pointer.
 */
static void qio_channel_rdma_finalize(Object *obj)
{
    QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
    RDMAContext *ctx;

    ctx = rioc->rdmain;
    if (ctx != NULL) {
        qemu_rdma_cleanup(ctx);
        g_free(ctx);
        rioc->rdmain = NULL;
    }

    ctx = rioc->rdmaout;
    if (ctx != NULL) {
        qemu_rdma_cleanup(ctx);
        g_free(ctx);
        rioc->rdmaout = NULL;
    }
}
|
|
|
|
|
|
|
|
static void qio_channel_rdma_class_init(ObjectClass *klass,
|
|
|
|
void *class_data G_GNUC_UNUSED)
|
|
|
|
{
|
|
|
|
QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
|
|
|
|
|
|
|
|
ioc_klass->io_writev = qio_channel_rdma_writev;
|
|
|
|
ioc_klass->io_readv = qio_channel_rdma_readv;
|
|
|
|
ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
|
|
|
|
ioc_klass->io_close = qio_channel_rdma_close;
|
|
|
|
ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
|
migration: implement io_set_aio_fd_handler function for RDMA QIOChannel
if qio_channel_rdma_readv return QIO_CHANNEL_ERR_BLOCK, the destination qemu
crash.
The backtrace is:
(gdb) bt
#0 0x0000000000000000 in ?? ()
#1 0x00000000008db50e in qio_channel_set_aio_fd_handler (ioc=0x38111e0, ctx=0x3726080,
io_read=0x8db841 <qio_channel_restart_read>, io_write=0x0, opaque=0x38111e0) at io/channel.c:
#2 0x00000000008db952 in qio_channel_set_aio_fd_handlers (ioc=0x38111e0) at io/channel.c:438
#3 0x00000000008dbab4 in qio_channel_yield (ioc=0x38111e0, condition=G_IO_IN) at io/channel.c:47
#4 0x00000000007a870b in channel_get_buffer (opaque=0x38111e0, buf=0x440c038 "", pos=0, size=327
at migration/qemu-file-channel.c:83
#5 0x00000000007a70f6 in qemu_fill_buffer (f=0x440c000) at migration/qemu-file.c:299
#6 0x00000000007a79d0 in qemu_peek_byte (f=0x440c000, offset=0) at migration/qemu-file.c:562
#7 0x00000000007a7a22 in qemu_get_byte (f=0x440c000) at migration/qemu-file.c:575
#8 0x00000000007a7c78 in qemu_get_be32 (f=0x440c000) at migration/qemu-file.c:655
#9 0x00000000007a0508 in qemu_loadvm_state (f=0x440c000) at migration/savevm.c:2126
#10 0x0000000000794141 in process_incoming_migration_co (opaque=0x0) at migration/migration.c:366
#11 0x000000000095c598 in coroutine_trampoline (i0=84033984, i1=0) at util/coroutine-ucontext.c:1
#12 0x00007f9c0db56d40 in ?? () from /lib64/libc.so.6
#13 0x00007f96fe858760 in ?? ()
#14 0x0000000000000000 in ?? ()
RDMA QIOChannel not implement io_set_aio_fd_handler. so
qio_channel_set_aio_fd_handler will access NULL pointer.
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:31 +03:00
|
|
|
ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
|
migration: implement the shutdown for RDMA QIOChannel
Because RDMA QIOChannel not implement shutdown function,
If the to_dst_file was set error, the return path thread
will wait forever. and the migration thread will wait
return path thread exit.
the backtrace of return path thread is:
(gdb) bt
#0 0x00007f372a76bb0f in ppoll () from /lib64/libc.so.6
#1 0x000000000071dc24 in qemu_poll_ns (fds=0x7ef7091d0580, nfds=2, timeout=100000000)
at qemu-timer.c:325
#2 0x00000000006b2fba in qemu_rdma_wait_comp_channel (rdma=0xd424000)
at migration/rdma.c:1501
#3 0x00000000006b3191 in qemu_rdma_block_for_wrid (rdma=0xd424000, wrid_requested=4000,
byte_len=0x7ef7091d0640) at migration/rdma.c:1580
#4 0x00000000006b3638 in qemu_rdma_exchange_get_response (rdma=0xd424000,
head=0x7ef7091d0720, expecting=3, idx=0) at migration/rdma.c:1726
#5 0x00000000006b3ad6 in qemu_rdma_exchange_recv (rdma=0xd424000, head=0x7ef7091d0720,
expecting=3) at migration/rdma.c:1903
#6 0x00000000006b5d03 in qemu_rdma_get_buffer (opaque=0x6a57dc0, buf=0x5c80030 "", pos=8,
size=32768) at migration/rdma.c:2714
#7 0x00000000006a9635 in qemu_fill_buffer (f=0x5c80000) at migration/qemu-file.c:232
#8 0x00000000006a9ecd in qemu_peek_byte (f=0x5c80000, offset=0)
at migration/qemu-file.c:502
#9 0x00000000006a9f1f in qemu_get_byte (f=0x5c80000) at migration/qemu-file.c:515
#10 0x00000000006aa162 in qemu_get_be16 (f=0x5c80000) at migration/qemu-file.c:591
#11 0x00000000006a46d3 in source_return_path_thread (
opaque=0xd826a0 <current_migration.37100>) at migration/migration.c:1331
#12 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#13 0x00007f372a77635d in clone () from /lib64/libc.so.6
the backtrace of migration thread is:
(gdb) bt
#0 0x00007f372aa4af57 in pthread_join () from /lib64/libpthread.so.0
#1 0x00000000007d5711 in qemu_thread_join (thread=0xd826f8 <current_migration.37100+88>)
at util/qemu-thread-posix.c:504
#2 0x00000000006a4bc5 in await_return_path_close_on_source (
ms=0xd826a0 <current_migration.37100>) at migration/migration.c:1460
#3 0x00000000006a53e4 in migration_completion (s=0xd826a0 <current_migration.37100>,
current_active_state=4, old_vm_running=0x7ef7089cf976, start_time=0x7ef7089cf980)
at migration/migration.c:1695
#4 0x00000000006a5c54 in migration_thread (opaque=0xd826a0 <current_migration.37100>)
at migration/migration.c:1837
#5 0x00007f372aa49e25 in start_thread () from /lib64/libpthread.so.0
#6 0x00007f372a77635d in clone () from /lib64/libc.so.6
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
2018-08-06 16:29:34 +03:00
|
|
|
ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
|
2016-04-27 13:05:07 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static const TypeInfo qio_channel_rdma_info = {
|
|
|
|
.parent = TYPE_QIO_CHANNEL,
|
|
|
|
.name = TYPE_QIO_CHANNEL_RDMA,
|
|
|
|
.instance_size = sizeof(QIOChannelRDMA),
|
|
|
|
.instance_finalize = qio_channel_rdma_finalize,
|
|
|
|
.class_init = qio_channel_rdma_class_init,
|
|
|
|
};
|
|
|
|
|
|
|
|
static void qio_channel_rdma_register_types(void)
|
|
|
|
{
|
|
|
|
type_register_static(&qio_channel_rdma_info);
|
|
|
|
}
|
|
|
|
|
|
|
|
type_init(qio_channel_rdma_register_types);
|
|
|
|
|
2023-05-30 21:39:37 +03:00
|
|
|
/*
 * Create an input QEMUFile backed by a freshly allocated RDMA channel.
 *
 * @rdma becomes the channel's inbound context and @rdma->return_path its
 * outbound context.  The read hooks are installed on the new file, which is
 * also recorded in the channel and returned to the caller.
 */
static QEMUFile *rdma_new_input(RDMAContext *rdma)
{
    QIOChannelRDMA *chan = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
    QEMUFile *in_file = qemu_file_new_input(QIO_CHANNEL(chan));

    chan->file = in_file;
    chan->rdmain = rdma;
    chan->rdmaout = rdma->return_path;
    qemu_file_set_hooks(in_file, &rdma_read_hooks);

    return in_file;
}
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2023-05-30 21:39:37 +03:00
|
|
|
/*
 * Create an output QEMUFile backed by a freshly allocated RDMA channel.
 *
 * @rdma becomes the channel's outbound context and @rdma->return_path its
 * inbound context.  The write hooks are installed on the new file, which is
 * also recorded in the channel and returned to the caller.
 */
static QEMUFile *rdma_new_output(RDMAContext *rdma)
{
    QIOChannelRDMA *chan = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
    QEMUFile *out_file = qemu_file_new_output(QIO_CHANNEL(chan));

    chan->file = out_file;
    chan->rdmaout = rdma;
    chan->rdmain = rdma->return_path;
    qemu_file_set_hooks(out_file, &rdma_write_hooks);

    return out_file;
}
|
|
|
|
|
|
|
|
static void rdma_accept_incoming_migration(void *opaque)
|
|
|
|
{
|
|
|
|
RDMAContext *rdma = opaque;
|
|
|
|
int ret;
|
|
|
|
QEMUFile *f;
|
2020-02-10 22:44:59 +03:00
|
|
|
Error *local_err = NULL;
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2015-06-11 20:17:20 +03:00
|
|
|
trace_qemu_rdma_accept_incoming_migration();
|
2013-07-22 18:01:54 +04:00
|
|
|
ret = qemu_rdma_accept(rdma);
|
|
|
|
|
2023-09-28 16:19:55 +03:00
|
|
|
if (ret < 0) {
|
2020-02-10 22:44:59 +03:00
|
|
|
fprintf(stderr, "RDMA ERROR: Migration initialization failed\n");
|
2013-07-22 18:01:54 +04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2015-06-11 20:17:20 +03:00
|
|
|
trace_qemu_rdma_accept_incoming_migration_accepted();
|
2013-07-22 18:01:54 +04:00
|
|
|
|
2018-08-06 16:29:28 +03:00
|
|
|
if (rdma->is_return_path) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-05-30 21:39:37 +03:00
|
|
|
f = rdma_new_input(rdma);
|
2013-07-22 18:01:54 +04:00
|
|
|
if (f == NULL) {
|
2023-05-30 21:39:37 +03:00
|
|
|
fprintf(stderr, "RDMA ERROR: could not open RDMA for input\n");
|
2013-07-22 18:01:54 +04:00
|
|
|
qemu_rdma_cleanup(rdma);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
rdma->migration_started_on_destination = 1;
|
2020-02-10 22:44:59 +03:00
|
|
|
migration_fd_process_incoming(f, &local_err);
|
|
|
|
if (local_err) {
|
|
|
|
error_reportf_err(local_err, "RDMA ERROR:");
|
|
|
|
}
|
2013-07-22 18:01:54 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Set up the destination side of an RDMA migration listening on @host_port.
 *
 * On success rdma_accept_incoming_migration() is registered as a handler on
 * the RDMA event channel fd, with the new RDMAContext as its opaque value.
 * On failure the context is released again; @errp is set by the failing
 * step, and the listen failure sets it only when it is not already set.
 */
void rdma_start_incoming_migration(const char *host_port, Error **errp)
{
    RDMAContext *rdma;

    trace_rdma_start_incoming_migration();

    /* Avoid ram_block_discard_disable(), cannot change during migration. */
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, errp);
    if (!rdma) {
        /* Nothing has been allocated, so there is nothing to release. */
        return;
    }

    if (qemu_rdma_dest_init(rdma, errp) < 0) {
        goto err;
    }

    trace_rdma_start_incoming_migration_after_dest_init();

    if (rdma_listen(rdma->listen_id, 5) < 0) {
        if (errp && !*errp) {
            error_setg(errp, "RDMA ERROR: listening on socket!");
        }
        goto cleanup_rdma;
    }

    trace_rdma_start_incoming_migration_after_rdma_listen();

    qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
                        NULL, rdma);
    return;

cleanup_rdma:
    qemu_rdma_cleanup(rdma);
err:
    g_free(rdma->host);
    g_free(rdma->host_port);
    g_free(rdma);
}
|
|
|
|
|
|
|
|
/*
 * Set up the source side of an RDMA migration towards @host_port.
 *
 * @opaque is the MigrationState of the migration being started.
 *
 * A primary RDMAContext is created, initialised and connected.  When
 * postcopy or the return path is enabled, a second context is brought up
 * the same way and the two are cross-linked through their return_path
 * fields.  On success the output QEMUFile is stored in s->to_dst_file and
 * migrate_fd_connect() is called.  On failure @errp is left as set by the
 * failing step and the contexts allocated here are freed.
 */
void rdma_start_outgoing_migration(void *opaque,
                            const char *host_port, Error **errp)
{
    MigrationState *s = opaque;
    RDMAContext *rdma;
    RDMAContext *ret_path = NULL;

    /* Avoid ram_block_discard_disable(), cannot change during migration. */
    if (ram_block_discard_is_required()) {
        error_setg(errp, "RDMA: cannot disable RAM discard");
        return;
    }

    rdma = qemu_rdma_data_init(host_port, errp);
    if (!rdma) {
        /* Neither context exists yet, so there is nothing to free. */
        return;
    }

    if (qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp) < 0) {
        goto err;
    }

    trace_rdma_start_outgoing_migration_after_rdma_source_init();

    if (qemu_rdma_connect(rdma, false, errp) < 0) {
        goto err;
    }

    /* RDMA postcopy need a separate queue pair for return path */
    if (migrate_postcopy() || migrate_return_path()) {
        ret_path = qemu_rdma_data_init(host_port, errp);
        if (!ret_path) {
            goto return_path_err;
        }

        if (qemu_rdma_source_init(ret_path,
                                  migrate_rdma_pin_all(), errp) < 0) {
            goto return_path_err;
        }

        if (qemu_rdma_connect(ret_path, true, errp) < 0) {
            goto return_path_err;
        }

        /* Link both directions so each context can reach its peer. */
        rdma->return_path = ret_path;
        ret_path->return_path = rdma;
        ret_path->is_return_path = true;
    }

    trace_rdma_start_outgoing_migration_after_rdma_connect();

    s->to_dst_file = rdma_new_output(rdma);
    migrate_fd_connect(s, NULL);
    return;

return_path_err:
    /* The primary context was already connected; tear it down first. */
    qemu_rdma_cleanup(rdma);
err:
    g_free(rdma);
    g_free(ret_path);
}
|