qemu/hw/virtio/vhost-user.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

3040 lines
93 KiB
C
Raw Normal View History

/*
* vhost-user
*
* Copyright (c) 2013 Virtual Open Systems Sarl.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
2016-03-14 11:01:28 +03:00
#include "qapi/error.h"
#include "hw/virtio/virtio-dmabuf.h"
#include "hw/virtio/vhost.h"
#include "hw/virtio/virtio-crypto.h"
#include "hw/virtio/vhost-user.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-net.h"
#include "chardev/char-fe.h"
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
#include "io/channel-socket.h"
#include "sysemu/kvm.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/uuid.h"
#include "qemu/sockets.h"
#include "sysemu/runstate.h"
#include "sysemu/cryptodev.h"
#include "migration/migration.h"
#include "migration/postcopy-ram.h"
#include "trace.h"
#include "exec/ramblock.h"
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "standard-headers/linux/vhost_types.h"
#ifdef CONFIG_LINUX
#include <linux/userfaultfd.h>
#endif
#define VHOST_MEMORY_BASELINE_NREGIONS 8
#define VHOST_USER_F_PROTOCOL_FEATURES 30
#define VHOST_USER_BACKEND_MAX_FDS 8
#if defined(TARGET_PPC) || defined(TARGET_PPC64)
#include "hw/ppc/spapr.h"
#define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS
#else
#define VHOST_USER_MAX_RAM_SLOTS 512
#endif
/*
* Maximum size of virtio device config space
*/
#define VHOST_USER_MAX_CONFIG_SIZE 256
#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
typedef enum VhostUserRequest {
VHOST_USER_NONE = 0,
VHOST_USER_GET_FEATURES = 1,
VHOST_USER_SET_FEATURES = 2,
VHOST_USER_SET_OWNER = 3,
VHOST_USER_RESET_OWNER = 4,
VHOST_USER_SET_MEM_TABLE = 5,
VHOST_USER_SET_LOG_BASE = 6,
VHOST_USER_SET_LOG_FD = 7,
VHOST_USER_SET_VRING_NUM = 8,
VHOST_USER_SET_VRING_ADDR = 9,
VHOST_USER_SET_VRING_BASE = 10,
VHOST_USER_GET_VRING_BASE = 11,
VHOST_USER_SET_VRING_KICK = 12,
VHOST_USER_SET_VRING_CALL = 13,
VHOST_USER_SET_VRING_ERR = 14,
VHOST_USER_GET_PROTOCOL_FEATURES = 15,
VHOST_USER_SET_PROTOCOL_FEATURES = 16,
VHOST_USER_GET_QUEUE_NUM = 17,
VHOST_USER_SET_VRING_ENABLE = 18,
VHOST_USER_SEND_RARP = 19,
VHOST_USER_NET_SET_MTU = 20,
VHOST_USER_SET_BACKEND_REQ_FD = 21,
VHOST_USER_IOTLB_MSG = 22,
VHOST_USER_SET_VRING_ENDIAN = 23,
VHOST_USER_GET_CONFIG = 24,
VHOST_USER_SET_CONFIG = 25,
VHOST_USER_CREATE_CRYPTO_SESSION = 26,
VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
VHOST_USER_POSTCOPY_ADVISE = 28,
VHOST_USER_POSTCOPY_LISTEN = 29,
VHOST_USER_POSTCOPY_END = 30,
VHOST_USER_GET_INFLIGHT_FD = 31,
VHOST_USER_SET_INFLIGHT_FD = 32,
VHOST_USER_GPU_SET_SOCKET = 33,
VHOST_USER_RESET_DEVICE = 34,
/* Message number 35 reserved for VHOST_USER_VRING_KICK. */
VHOST_USER_GET_MAX_MEM_SLOTS = 36,
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
VHOST_USER_ADD_MEM_REG = 37,
VHOST_USER_REM_MEM_REG = 38,
VHOST_USER_SET_STATUS = 39,
VHOST_USER_GET_STATUS = 40,
VHOST_USER_GET_SHARED_OBJECT = 41,
VHOST_USER_SET_DEVICE_STATE_FD = 42,
VHOST_USER_CHECK_DEVICE_STATE = 43,
VHOST_USER_MAX
} VhostUserRequest;
typedef enum VhostUserBackendRequest {
VHOST_USER_BACKEND_NONE = 0,
VHOST_USER_BACKEND_IOTLB_MSG = 1,
VHOST_USER_BACKEND_CONFIG_CHANGE_MSG = 2,
VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG = 3,
VHOST_USER_BACKEND_SHARED_OBJECT_ADD = 6,
VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE = 7,
VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP = 8,
VHOST_USER_BACKEND_MAX
} VhostUserBackendRequest;
typedef struct VhostUserMemoryRegion {
uint64_t guest_phys_addr;
uint64_t memory_size;
uint64_t userspace_addr;
uint64_t mmap_offset;
} VhostUserMemoryRegion;
typedef struct VhostUserMemory {
uint32_t nregions;
uint32_t padding;
VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS];
} VhostUserMemory;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
typedef struct VhostUserMemRegMsg {
vhost-user: fix VHOST_USER_ADD/REM_MEM_REG truncation QEMU currently truncates the mmap_offset field when sending VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages. The struct layout looks like this: typedef struct VhostUserMemoryRegion { uint64_t guest_phys_addr; uint64_t memory_size; uint64_t userspace_addr; uint64_t mmap_offset; } VhostUserMemoryRegion; typedef struct VhostUserMemRegMsg { uint32_t padding; /* WARNING: there is a 32-bit hole here! */ VhostUserMemoryRegion region; } VhostUserMemRegMsg; The payload size is calculated as follows when sending the message in hw/virtio/vhost-user.c: msg->hdr.size = sizeof(msg->payload.mem_reg.padding) + sizeof(VhostUserMemoryRegion); This calculation produces an incorrect result of only 36 bytes. sizeof(VhostUserMemRegMsg) is actually 40 bytes. The consequence of this is that the final field, mmap_offset, is truncated. This breaks x86_64 TCG guests on s390 hosts. Other guest/host combinations may get lucky if either of the following holds: 1. The guest memory layout does not need mmap_offset != 0. 2. The host is little-endian and mmap_offset <= 0xffffffff so the truncation has no effect. Fix this by extending the existing 32-bit padding field to 64-bit. Now the padding reflects the actual compiler padding. This can be verified using pahole(1). Also document the layout properly in the vhost-user specification. The vhost-user spec did not document the exact layout. It would be impossible to implement the spec without looking at the QEMU source code. Existing vhost-user frontends and device backends continue to work after this fix has been applied. The only change in the wire protocol is that QEMU now sets hdr.size to 40 instead of 36. If a vhost-user implementation has a hardcoded size check for 36 bytes, then it will fail with new QEMUs. Both QEMU and DPDK/SPDK don't check the exact payload size, so they continue to work. Fixes: f1aeb14b0809e313c74244d838645ed25e85ea63 ("Transmit vhost-user memory regions individually") Cc: Raphael Norwitz <raphael.norwitz@nutanix.com> Cc: Cornelia Huck <cohuck@redhat.com> Cc: Michael S. Tsirkin <mst@redhat.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Message-Id: <20201109174355.1069147-1-stefanha@redhat.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Fixes: f1aeb14b0809 ("Transmit vhost-user memory regions individually") Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
2020-11-09 20:43:55 +03:00
uint64_t padding;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
VhostUserMemoryRegion region;
} VhostUserMemRegMsg;
typedef struct VhostUserLog {
uint64_t mmap_size;
uint64_t mmap_offset;
} VhostUserLog;
typedef struct VhostUserConfig {
uint32_t offset;
uint32_t size;
uint32_t flags;
uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
} VhostUserConfig;
#define VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN 512
#define VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN 64
#define VHOST_CRYPTO_ASYM_MAX_KEY_LEN 1024
typedef struct VhostUserCryptoSession {
uint64_t op_code;
union {
struct {
CryptoDevBackendSymSessionInfo session_setup_data;
uint8_t key[VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN];
uint8_t auth_key[VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN];
} sym;
struct {
CryptoDevBackendAsymSessionInfo session_setup_data;
uint8_t key[VHOST_CRYPTO_ASYM_MAX_KEY_LEN];
} asym;
} u;
/* session id for success, -1 on errors */
int64_t session_id;
} VhostUserCryptoSession;
static VhostUserConfig c __attribute__ ((unused));
#define VHOST_USER_CONFIG_HDR_SIZE (sizeof(c.offset) \
+ sizeof(c.size) \
+ sizeof(c.flags))
typedef struct VhostUserVringArea {
uint64_t u64;
uint64_t size;
uint64_t offset;
} VhostUserVringArea;
typedef struct VhostUserInflight {
uint64_t mmap_size;
uint64_t mmap_offset;
uint16_t num_queues;
uint16_t queue_size;
} VhostUserInflight;
typedef struct VhostUserShared {
unsigned char uuid[16];
} VhostUserShared;
typedef struct {
VhostUserRequest request;
#define VHOST_USER_VERSION_MASK (0x3)
#define VHOST_USER_REPLY_MASK (0x1 << 2)
#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3)
uint32_t flags;
uint32_t size; /* the following payload size */
} QEMU_PACKED VhostUserHeader;
/* Request payload of VHOST_USER_SET_DEVICE_STATE_FD */
typedef struct VhostUserTransferDeviceState {
uint32_t direction;
uint32_t phase;
} VhostUserTransferDeviceState;
typedef union {
#define VHOST_USER_VRING_IDX_MASK (0xff)
#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8)
uint64_t u64;
struct vhost_vring_state state;
struct vhost_vring_addr addr;
VhostUserMemory memory;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
VhostUserMemRegMsg mem_reg;
VhostUserLog log;
struct vhost_iotlb_msg iotlb;
VhostUserConfig config;
VhostUserCryptoSession session;
VhostUserVringArea area;
VhostUserInflight inflight;
VhostUserShared object;
VhostUserTransferDeviceState transfer_state;
} VhostUserPayload;
typedef struct VhostUserMsg {
VhostUserHeader hdr;
VhostUserPayload payload;
} QEMU_PACKED VhostUserMsg;
static VhostUserMsg m __attribute__ ((unused));
#define VHOST_USER_HDR_SIZE (sizeof(VhostUserHeader))
#define VHOST_USER_PAYLOAD_SIZE (sizeof(VhostUserPayload))
/* The version of the protocol we support */
#define VHOST_USER_VERSION (0x1)
struct vhost_user {
struct vhost_dev *dev;
/* Shared between vhost devs of the same virtio device */
VhostUserState *user;
QIOChannel *backend_ioc;
GSource *backend_src;
NotifierWithReturn postcopy_notifier;
struct PostCopyFD postcopy_fd;
uint64_t postcopy_client_bases[VHOST_USER_MAX_RAM_SLOTS];
/* Length of the region_rb and region_rb_offset arrays */
size_t region_rb_len;
/* RAMBlock associated with a given region */
RAMBlock **region_rb;
/*
* The offset from the start of the RAMBlock to the start of the
* vhost region.
*/
ram_addr_t *region_rb_offset;
/* True once we've entered postcopy_listen */
bool postcopy_listen;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
/* Our current regions */
int num_shadow_regions;
struct vhost_memory_region shadow_regions[VHOST_USER_MAX_RAM_SLOTS];
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
};
struct scrub_regions {
struct vhost_memory_region *region;
int reg_idx;
int fd_idx;
};
static int vhost_user_read_header(struct vhost_dev *dev, VhostUserMsg *msg)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
uint8_t *p = (uint8_t *) msg;
int r, size = VHOST_USER_HDR_SIZE;
r = qemu_chr_fe_read_all(chr, p, size);
if (r != size) {
int saved_errno = errno;
error_report("Failed to read msg header. Read %d instead of %d."
" Original request %d.", r, size, msg->hdr.request);
return r < 0 ? -saved_errno : -EIO;
}
/* validate received flags */
if (msg->hdr.flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) {
error_report("Failed to read msg header."
" Flags 0x%x instead of 0x%x.", msg->hdr.flags,
VHOST_USER_REPLY_MASK | VHOST_USER_VERSION);
return -EPROTO;
}
trace_vhost_user_read(msg->hdr.request, msg->hdr.flags);
return 0;
}
static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
uint8_t *p = (uint8_t *) msg;
int r, size;
r = vhost_user_read_header(dev, msg);
if (r < 0) {
return r;
}
/* validate message size is sane */
if (msg->hdr.size > VHOST_USER_PAYLOAD_SIZE) {
error_report("Failed to read msg header."
" Size %d exceeds the maximum %zu.", msg->hdr.size,
VHOST_USER_PAYLOAD_SIZE);
return -EPROTO;
}
if (msg->hdr.size) {
p += VHOST_USER_HDR_SIZE;
size = msg->hdr.size;
r = qemu_chr_fe_read_all(chr, p, size);
if (r != size) {
int saved_errno = errno;
error_report("Failed to read msg payload."
" Read %d instead of %d.", r, msg->hdr.size);
return r < 0 ? -saved_errno : -EIO;
}
}
return 0;
}
static int process_message_reply(struct vhost_dev *dev,
const VhostUserMsg *msg)
{
int ret;
VhostUserMsg msg_reply;
if ((msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
return 0;
}
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
}
if (msg_reply.hdr.request != msg->hdr.request) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
msg->hdr.request, msg_reply.hdr.request);
return -EPROTO;
}
return msg_reply.payload.u64 ? -EIO : 0;
}
static bool vhost_user_per_device_request(VhostUserRequest request)
{
switch (request) {
case VHOST_USER_SET_OWNER:
case VHOST_USER_RESET_OWNER:
case VHOST_USER_SET_MEM_TABLE:
case VHOST_USER_GET_QUEUE_NUM:
case VHOST_USER_NET_SET_MTU:
case VHOST_USER_RESET_DEVICE:
case VHOST_USER_ADD_MEM_REG:
case VHOST_USER_REM_MEM_REG:
return true;
default:
return false;
}
}
/* most non-init callers ignore the error */
static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg,
int *fds, int fd_num)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
int ret, size = VHOST_USER_HDR_SIZE + msg->hdr.size;
/*
* Some devices, like virtio-scsi, are implemented as a single vhost_dev,
* while others, like virtio-net, contain multiple vhost_devs. For
* operations such as configuring device memory mappings or issuing device
* resets, which affect the whole device instead of individual VQs,
* vhost-user messages should only be sent once.
*
* Devices with multiple vhost_devs are given an associated dev->vq_index
* so per_device requests are only sent if vq_index is 0.
*/
if (vhost_user_per_device_request(msg->hdr.request)
&& dev->vq_index != 0) {
msg->hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
return 0;
}
if (qemu_chr_fe_set_msgfds(chr, fds, fd_num) < 0) {
error_report("Failed to set msg fds.");
return -EINVAL;
}
ret = qemu_chr_fe_write_all(chr, (const uint8_t *) msg, size);
if (ret != size) {
int saved_errno = errno;
error_report("Failed to write msg."
" Wrote %d instead of %d.", ret, size);
return ret < 0 ? -saved_errno : -EIO;
}
trace_vhost_user_write(msg->hdr.request, msg->hdr.flags);
return 0;
}
int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GPU_SET_SOCKET,
.hdr.flags = VHOST_USER_VERSION,
};
return vhost_user_write(dev, &msg, &fd, 1);
}
static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
struct vhost_log *log)
vhost-user: add multiple queue support This patch is initially based a patch from Nikolay Nikolaev. This patch adds vhost-user multiple queue support, by creating a nc and vhost_net pair for each queue. Qemu exits if find that the backend can't support the number of requested queues (by providing queues=# option). The max number is queried by a new message, VHOST_USER_GET_QUEUE_NUM, and is sent only when protocol feature VHOST_USER_PROTOCOL_F_MQ is present first. The max queue check is done at vhost-user initiation stage. We initiate one queue first, which, in the meantime, also gets the max_queues the backend supports. In older version, it was reported that some messages are sent more times than necessary. Here we came an agreement with Michael that we could categorize vhost user messages to 2 types: non-vring specific messages, which should be sent only once, and vring specific messages, which should be sent per queue. Here I introduced a helper function vhost_user_one_time_request(), which lists following messages as non-vring specific messages: VHOST_USER_SET_OWNER VHOST_USER_RESET_DEVICE VHOST_USER_SET_MEM_TABLE VHOST_USER_GET_QUEUE_NUM For above messages, we simply ignore them when they are not sent the first time. Signed-off-by: Nikolay Nikolaev <n.nikolaev@virtualopensystems.com> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Jason Wang <jasowang@redhat.com> Tested-by: Marcel Apfelbaum <marcel@redhat.com>
2015-09-23 07:20:00 +03:00
{
int fds[VHOST_USER_MAX_RAM_SLOTS];
size_t fd_num = 0;
bool shmfd = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_LOG_SHMFD);
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_LOG_BASE,
.hdr.flags = VHOST_USER_VERSION,
.payload.log.mmap_size = log->size * sizeof(*(log->log)),
.payload.log.mmap_offset = 0,
.hdr.size = sizeof(msg.payload.log),
};
/* Send only once with first queue pair */
if (dev->vq_index != 0) {
return 0;
}
if (shmfd && log->fd != -1) {
fds[fd_num++] = log->fd;
}
ret = vhost_user_write(dev, &msg, fds, fd_num);
if (ret < 0) {
return ret;
}
if (shmfd) {
msg.hdr.size = 0;
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_SET_LOG_BASE) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
VHOST_USER_SET_LOG_BASE, msg.hdr.request);
return -EPROTO;
}
vhost-user: add multiple queue support This patch is initially based a patch from Nikolay Nikolaev. This patch adds vhost-user multiple queue support, by creating a nc and vhost_net pair for each queue. Qemu exits if find that the backend can't support the number of requested queues (by providing queues=# option). The max number is queried by a new message, VHOST_USER_GET_QUEUE_NUM, and is sent only when protocol feature VHOST_USER_PROTOCOL_F_MQ is present first. The max queue check is done at vhost-user initiation stage. We initiate one queue first, which, in the meantime, also gets the max_queues the backend supports. In older version, it was reported that some messages are sent more times than necessary. Here we came an agreement with Michael that we could categorize vhost user messages to 2 types: non-vring specific messages, which should be sent only once, and vring specific messages, which should be sent per queue. Here I introduced a helper function vhost_user_one_time_request(), which lists following messages as non-vring specific messages: VHOST_USER_SET_OWNER VHOST_USER_RESET_DEVICE VHOST_USER_SET_MEM_TABLE VHOST_USER_GET_QUEUE_NUM For above messages, we simply ignore them when they are not sent the first time. Signed-off-by: Nikolay Nikolaev <n.nikolaev@virtualopensystems.com> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Jason Wang <jasowang@redhat.com> Tested-by: Marcel Apfelbaum <marcel@redhat.com>
2015-09-23 07:20:00 +03:00
}
return 0;
vhost-user: add multiple queue support This patch is initially based a patch from Nikolay Nikolaev. This patch adds vhost-user multiple queue support, by creating a nc and vhost_net pair for each queue. Qemu exits if find that the backend can't support the number of requested queues (by providing queues=# option). The max number is queried by a new message, VHOST_USER_GET_QUEUE_NUM, and is sent only when protocol feature VHOST_USER_PROTOCOL_F_MQ is present first. The max queue check is done at vhost-user initiation stage. We initiate one queue first, which, in the meantime, also gets the max_queues the backend supports. In older version, it was reported that some messages are sent more times than necessary. Here we came an agreement with Michael that we could categorize vhost user messages to 2 types: non-vring specific messages, which should be sent only once, and vring specific messages, which should be sent per queue. Here I introduced a helper function vhost_user_one_time_request(), which lists following messages as non-vring specific messages: VHOST_USER_SET_OWNER VHOST_USER_RESET_DEVICE VHOST_USER_SET_MEM_TABLE VHOST_USER_GET_QUEUE_NUM For above messages, we simply ignore them when they are not sent the first time. Signed-off-by: Nikolay Nikolaev <n.nikolaev@virtualopensystems.com> Signed-off-by: Changchun Ouyang <changchun.ouyang@intel.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com> Reviewed-by: Jason Wang <jasowang@redhat.com> Tested-by: Marcel Apfelbaum <marcel@redhat.com>
2015-09-23 07:20:00 +03:00
}
static MemoryRegion *vhost_user_get_mr_data(uint64_t addr, ram_addr_t *offset,
int *fd)
{
MemoryRegion *mr;
assert((uintptr_t)addr == addr);
mr = memory_region_from_host((void *)(uintptr_t)addr, offset);
*fd = memory_region_get_fd(mr);
*offset += mr->ram_block->fd_offset;
return mr;
}
static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst,
struct vhost_memory_region *src,
uint64_t mmap_offset)
{
assert(src != NULL && dst != NULL);
dst->userspace_addr = src->userspace_addr;
dst->memory_size = src->memory_size;
dst->guest_phys_addr = src->guest_phys_addr;
dst->mmap_offset = mmap_offset;
}
static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
struct vhost_dev *dev,
VhostUserMsg *msg,
int *fds, size_t *fd_num,
bool track_ramblocks)
{
int i, fd;
ram_addr_t offset;
MemoryRegion *mr;
struct vhost_memory_region *reg;
VhostUserMemoryRegion region_buffer;
msg->hdr.request = VHOST_USER_SET_MEM_TABLE;
for (i = 0; i < dev->mem->nregions; ++i) {
reg = dev->mem->regions + i;
mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
if (fd > 0) {
if (track_ramblocks) {
assert(*fd_num < VHOST_MEMORY_BASELINE_NREGIONS);
trace_vhost_user_set_mem_table_withfd(*fd_num, mr->name,
reg->memory_size,
reg->guest_phys_addr,
reg->userspace_addr,
offset);
u->region_rb_offset[i] = offset;
u->region_rb[i] = mr->ram_block;
} else if (*fd_num == VHOST_MEMORY_BASELINE_NREGIONS) {
error_report("Failed preparing vhost-user memory table msg");
return -ENOBUFS;
}
vhost_user_fill_msg_region(&region_buffer, reg, offset);
msg->payload.memory.regions[*fd_num] = region_buffer;
fds[(*fd_num)++] = fd;
} else if (track_ramblocks) {
u->region_rb_offset[i] = 0;
u->region_rb[i] = NULL;
}
}
msg->payload.memory.nregions = *fd_num;
if (!*fd_num) {
error_report("Failed initializing vhost-user memory map, "
"consider using -object memory-backend-file share=on");
return -EINVAL;
}
msg->hdr.size = sizeof(msg->payload.memory.nregions);
msg->hdr.size += sizeof(msg->payload.memory.padding);
msg->hdr.size += *fd_num * sizeof(VhostUserMemoryRegion);
return 0;
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
static inline bool reg_equal(struct vhost_memory_region *shadow_reg,
struct vhost_memory_region *vdev_reg)
{
return shadow_reg->guest_phys_addr == vdev_reg->guest_phys_addr &&
shadow_reg->userspace_addr == vdev_reg->userspace_addr &&
shadow_reg->memory_size == vdev_reg->memory_size;
}
static void scrub_shadow_regions(struct vhost_dev *dev,
struct scrub_regions *add_reg,
int *nr_add_reg,
struct scrub_regions *rem_reg,
int *nr_rem_reg, uint64_t *shadow_pcb,
bool track_ramblocks)
{
struct vhost_user *u = dev->opaque;
bool found[VHOST_USER_MAX_RAM_SLOTS] = {};
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
struct vhost_memory_region *reg, *shadow_reg;
int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0;
ram_addr_t offset;
MemoryRegion *mr;
bool matching;
/*
* Find memory regions present in our shadow state which are not in
* the device's current memory state.
*
* Mark regions in both the shadow and device state as "found".
*/
for (i = 0; i < u->num_shadow_regions; i++) {
shadow_reg = &u->shadow_regions[i];
matching = false;
for (j = 0; j < dev->mem->nregions; j++) {
reg = &dev->mem->regions[j];
mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
if (reg_equal(shadow_reg, reg)) {
matching = true;
found[j] = true;
if (track_ramblocks) {
/*
* Reset postcopy client bases, region_rb, and
* region_rb_offset in case regions are removed.
*/
if (fd > 0) {
u->region_rb_offset[j] = offset;
u->region_rb[j] = mr->ram_block;
shadow_pcb[j] = u->postcopy_client_bases[i];
} else {
u->region_rb_offset[j] = 0;
u->region_rb[j] = NULL;
}
}
break;
}
}
/*
* If the region was not found in the current device memory state
* create an entry for it in the removed list.
*/
if (!matching) {
rem_reg[rm_idx].region = shadow_reg;
rem_reg[rm_idx++].reg_idx = i;
}
}
/*
* For regions not marked "found", create entries in the added list.
*
* Note their indexes in the device memory state and the indexes of their
* file descriptors.
*/
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
if (fd > 0) {
++fd_num;
}
/*
* If the region was in both the shadow and device state we don't
* need to send a VHOST_USER_ADD_MEM_REG message for it.
*/
if (found[i]) {
continue;
}
add_reg[add_idx].region = reg;
add_reg[add_idx].reg_idx = i;
add_reg[add_idx++].fd_idx = fd_num;
}
*nr_rem_reg = rm_idx;
*nr_add_reg = add_idx;
return;
}
static int send_remove_regions(struct vhost_dev *dev,
struct scrub_regions *remove_reg,
int nr_rem_reg, VhostUserMsg *msg,
bool reply_supported)
{
struct vhost_user *u = dev->opaque;
struct vhost_memory_region *shadow_reg;
int i, fd, shadow_reg_idx, ret;
ram_addr_t offset;
VhostUserMemoryRegion region_buffer;
/*
* The regions in remove_reg appear in the same order they do in the
* shadow table. Therefore we can minimize memory copies by iterating
* through remove_reg backwards.
*/
for (i = nr_rem_reg - 1; i >= 0; i--) {
shadow_reg = remove_reg[i].region;
shadow_reg_idx = remove_reg[i].reg_idx;
vhost_user_get_mr_data(shadow_reg->userspace_addr, &offset, &fd);
if (fd > 0) {
msg->hdr.request = VHOST_USER_REM_MEM_REG;
vhost_user_fill_msg_region(&region_buffer, shadow_reg, 0);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
msg->payload.mem_reg.region = region_buffer;
ret = vhost_user_write(dev, msg, NULL, 0);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
if (reply_supported) {
ret = process_message_reply(dev, msg);
if (ret) {
return ret;
}
}
}
/*
* At this point we know the backend has unmapped the region. It is now
* safe to remove it from the shadow table.
*/
memmove(&u->shadow_regions[shadow_reg_idx],
&u->shadow_regions[shadow_reg_idx + 1],
sizeof(struct vhost_memory_region) *
(u->num_shadow_regions - shadow_reg_idx - 1));
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
u->num_shadow_regions--;
}
return 0;
}
static int send_add_regions(struct vhost_dev *dev,
struct scrub_regions *add_reg, int nr_add_reg,
VhostUserMsg *msg, uint64_t *shadow_pcb,
bool reply_supported, bool track_ramblocks)
{
struct vhost_user *u = dev->opaque;
int i, fd, ret, reg_idx, reg_fd_idx;
struct vhost_memory_region *reg;
MemoryRegion *mr;
ram_addr_t offset;
VhostUserMsg msg_reply;
VhostUserMemoryRegion region_buffer;
for (i = 0; i < nr_add_reg; i++) {
reg = add_reg[i].region;
reg_idx = add_reg[i].reg_idx;
reg_fd_idx = add_reg[i].fd_idx;
mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
if (fd > 0) {
if (track_ramblocks) {
trace_vhost_user_set_mem_table_withfd(reg_fd_idx, mr->name,
reg->memory_size,
reg->guest_phys_addr,
reg->userspace_addr,
offset);
u->region_rb_offset[reg_idx] = offset;
u->region_rb[reg_idx] = mr->ram_block;
}
msg->hdr.request = VHOST_USER_ADD_MEM_REG;
vhost_user_fill_msg_region(&region_buffer, reg, offset);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
msg->payload.mem_reg.region = region_buffer;
ret = vhost_user_write(dev, msg, &fd, 1);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
if (track_ramblocks) {
uint64_t reply_gpa;
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
reply_gpa = msg_reply.payload.mem_reg.region.guest_phys_addr;
if (msg_reply.hdr.request != VHOST_USER_ADD_MEM_REG) {
error_report("%s: Received unexpected msg type."
"Expected %d received %d", __func__,
VHOST_USER_ADD_MEM_REG,
msg_reply.hdr.request);
return -EPROTO;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
/*
* We're using the same structure, just reusing one of the
* fields, so it should be the same size.
*/
if (msg_reply.hdr.size != msg->hdr.size) {
error_report("%s: Unexpected size for postcopy reply "
"%d vs %d", __func__, msg_reply.hdr.size,
msg->hdr.size);
return -EPROTO;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
/* Get the postcopy client base from the backend's reply. */
if (reply_gpa == dev->mem->regions[reg_idx].guest_phys_addr) {
shadow_pcb[reg_idx] =
msg_reply.payload.mem_reg.region.userspace_addr;
trace_vhost_user_set_mem_table_postcopy(
msg_reply.payload.mem_reg.region.userspace_addr,
msg->payload.mem_reg.region.userspace_addr,
reg_fd_idx, reg_idx);
} else {
error_report("%s: invalid postcopy reply for region. "
"Got guest physical address %" PRIX64 ", expected "
"%" PRIX64, __func__, reply_gpa,
dev->mem->regions[reg_idx].guest_phys_addr);
return -EPROTO;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
} else if (reply_supported) {
ret = process_message_reply(dev, msg);
if (ret) {
return ret;
}
}
} else if (track_ramblocks) {
u->region_rb_offset[reg_idx] = 0;
u->region_rb[reg_idx] = NULL;
}
/*
* At this point, we know the backend has mapped in the new
* region, if the region has a valid file descriptor.
*
* The region should now be added to the shadow table.
*/
u->shadow_regions[u->num_shadow_regions].guest_phys_addr =
reg->guest_phys_addr;
u->shadow_regions[u->num_shadow_regions].userspace_addr =
reg->userspace_addr;
u->shadow_regions[u->num_shadow_regions].memory_size =
reg->memory_size;
u->num_shadow_regions++;
}
return 0;
}
static int vhost_user_add_remove_regions(struct vhost_dev *dev,
VhostUserMsg *msg,
bool reply_supported,
bool track_ramblocks)
{
struct vhost_user *u = dev->opaque;
struct scrub_regions add_reg[VHOST_USER_MAX_RAM_SLOTS];
struct scrub_regions rem_reg[VHOST_USER_MAX_RAM_SLOTS];
uint64_t shadow_pcb[VHOST_USER_MAX_RAM_SLOTS] = {};
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
int nr_add_reg, nr_rem_reg;
int ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
vhost-user: fix VHOST_USER_ADD/REM_MEM_REG truncation QEMU currently truncates the mmap_offset field when sending VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages. The struct layout looks like this: typedef struct VhostUserMemoryRegion { uint64_t guest_phys_addr; uint64_t memory_size; uint64_t userspace_addr; uint64_t mmap_offset; } VhostUserMemoryRegion; typedef struct VhostUserMemRegMsg { uint32_t padding; /* WARNING: there is a 32-bit hole here! */ VhostUserMemoryRegion region; } VhostUserMemRegMsg; The payload size is calculated as follows when sending the message in hw/virtio/vhost-user.c: msg->hdr.size = sizeof(msg->payload.mem_reg.padding) + sizeof(VhostUserMemoryRegion); This calculation produces an incorrect result of only 36 bytes. sizeof(VhostUserMemRegMsg) is actually 40 bytes. The consequence of this is that the final field, mmap_offset, is truncated. This breaks x86_64 TCG guests on s390 hosts. Other guest/host combinations may get lucky if either of the following holds: 1. The guest memory layout does not need mmap_offset != 0. 2. The host is little-endian and mmap_offset <= 0xffffffff so the truncation has no effect. Fix this by extending the existing 32-bit padding field to 64-bit. Now the padding reflects the actual compiler padding. This can be verified using pahole(1). Also document the layout properly in the vhost-user specification. The vhost-user spec did not document the exact layout. It would be impossible to implement the spec without looking at the QEMU source code. Existing vhost-user frontends and device backends continue to work after this fix has been applied. The only change in the wire protocol is that QEMU now sets hdr.size to 40 instead of 36. If a vhost-user implementation has a hardcoded size check for 36 bytes, then it will fail with new QEMUs. Both QEMU and DPDK/SPDK don't check the exact payload size, so they continue to work. Fixes: f1aeb14b0809e313c74244d838645ed25e85ea63 ("Transmit vhost-user memory regions individually") Cc: Raphael Norwitz <raphael.norwitz@nutanix.com> Cc: Cornelia Huck <cohuck@redhat.com> Cc: Michael S. Tsirkin <mst@redhat.com> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Message-Id: <20201109174355.1069147-1-stefanha@redhat.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Fixes: f1aeb14b0809 ("Transmit vhost-user memory regions individually") Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>
2020-11-09 20:43:55 +03:00
msg->hdr.size = sizeof(msg->payload.mem_reg);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
/* Find the regions which need to be removed or added. */
scrub_shadow_regions(dev, add_reg, &nr_add_reg, rem_reg, &nr_rem_reg,
shadow_pcb, track_ramblocks);
if (nr_rem_reg) {
ret = send_remove_regions(dev, rem_reg, nr_rem_reg, msg,
reply_supported);
if (ret < 0) {
goto err;
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
if (nr_add_reg) {
ret = send_add_regions(dev, add_reg, nr_add_reg, msg, shadow_pcb,
reply_supported, track_ramblocks);
if (ret < 0) {
goto err;
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
if (track_ramblocks) {
memcpy(u->postcopy_client_bases, shadow_pcb,
sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
/*
* Now we've registered this with the postcopy code, we ack to the
* client, because now we're in the position to be able to deal with
* any faults it generates.
*/
/* TODO: Use this for failure cases as well with a bad value. */
msg->hdr.size = sizeof(msg->payload.u64);
msg->payload.u64 = 0; /* OK */
ret = vhost_user_write(dev, msg, NULL, 0);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
}
return 0;
err:
if (track_ramblocks) {
memcpy(u->postcopy_client_bases, shadow_pcb,
sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
struct vhost_memory *mem,
bool reply_supported,
bool config_mem_slots)
{
struct vhost_user *u = dev->opaque;
int fds[VHOST_MEMORY_BASELINE_NREGIONS];
size_t fd_num = 0;
VhostUserMsg msg_reply;
int region_i, msg_i;
int ret;
VhostUserMsg msg = {
.hdr.flags = VHOST_USER_VERSION,
};
if (u->region_rb_len < dev->mem->nregions) {
u->region_rb = g_renew(RAMBlock*, u->region_rb, dev->mem->nregions);
u->region_rb_offset = g_renew(ram_addr_t, u->region_rb_offset,
dev->mem->nregions);
memset(&(u->region_rb[u->region_rb_len]), '\0',
sizeof(RAMBlock *) * (dev->mem->nregions - u->region_rb_len));
memset(&(u->region_rb_offset[u->region_rb_len]), '\0',
sizeof(ram_addr_t) * (dev->mem->nregions - u->region_rb_len));
u->region_rb_len = dev->mem->nregions;
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
if (config_mem_slots) {
ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, true);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
} else {
ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
true);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
ret = vhost_user_write(dev, &msg, fds, fd_num);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) {
error_report("%s: Received unexpected msg type."
"Expected %d received %d", __func__,
VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request);
return -EPROTO;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
/*
* We're using the same structure, just reusing one of the
* fields, so it should be the same size.
*/
if (msg_reply.hdr.size != msg.hdr.size) {
error_report("%s: Unexpected size for postcopy reply "
"%d vs %d", __func__, msg_reply.hdr.size,
msg.hdr.size);
return -EPROTO;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
memset(u->postcopy_client_bases, 0,
sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
/*
* They're in the same order as the regions that were sent
* but some of the regions were skipped (above) if they
* didn't have fd's
*/
for (msg_i = 0, region_i = 0;
region_i < dev->mem->nregions;
region_i++) {
if (msg_i < fd_num &&
msg_reply.payload.memory.regions[msg_i].guest_phys_addr ==
dev->mem->regions[region_i].guest_phys_addr) {
u->postcopy_client_bases[region_i] =
msg_reply.payload.memory.regions[msg_i].userspace_addr;
trace_vhost_user_set_mem_table_postcopy(
msg_reply.payload.memory.regions[msg_i].userspace_addr,
msg.payload.memory.regions[msg_i].userspace_addr,
msg_i, region_i);
msg_i++;
}
}
if (msg_i != fd_num) {
error_report("%s: postcopy reply not fully consumed "
"%d vs %zd",
__func__, msg_i, fd_num);
return -EIO;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
/*
* Now we've registered this with the postcopy code, we ack to the
* client, because now we're in the position to be able to deal
* with any faults it generates.
*/
/* TODO: Use this for failure cases as well with a bad value. */
msg.hdr.size = sizeof(msg.payload.u64);
msg.payload.u64 = 0; /* OK */
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
}
return 0;
}
static int vhost_user_set_mem_table(struct vhost_dev *dev,
struct vhost_memory *mem)
{
struct vhost_user *u = dev->opaque;
int fds[VHOST_MEMORY_BASELINE_NREGIONS];
size_t fd_num = 0;
bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
bool config_mem_slots =
virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS);
int ret;
if (do_postcopy) {
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
/*
* Postcopy has enough differences that it's best done in it's own
* version
*/
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
return vhost_user_set_mem_table_postcopy(dev, mem, reply_supported,
config_mem_slots);
}
VhostUserMsg msg = {
.hdr.flags = VHOST_USER_VERSION,
};
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
if (config_mem_slots) {
ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, false);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
} else {
ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
false);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
ret = vhost_user_write(dev, &msg, fds, fd_num);
if (ret < 0) {
return ret;
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
}
Transmit vhost-user memory regions individually With this change, when the VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS protocol feature has been negotiated, Qemu no longer sends the backend all the memory regions in a single message. Rather, when the memory tables are set or updated, a series of VHOST_USER_ADD_MEM_REG and VHOST_USER_REM_MEM_REG messages are sent to transmit the regions to map and/or unmap instead of sending send all the regions in one fixed size VHOST_USER_SET_MEM_TABLE message. The vhost_user struct maintains a shadow state of the VM’s memory regions. When the memory tables are modified, the vhost_user_set_mem_table() function compares the new device memory state to the shadow state and only sends regions which need to be unmapped or mapped in. The regions which must be unmapped are sent first, followed by the new regions to be mapped in. After all the messages have been sent, the shadow state is set to the current virtual device state. Existing backends which do not support VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS are unaffected. Signed-off-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Signed-off-by: Swapnil Ingle <swapnil.ingle@nutanix.com> Signed-off-by: Peter Turschmid <peter.turschm@nutanix.com> Suggested-by: Mike Cui <cui@nutanix.com> Message-Id: <1588533678-23450-5-git-send-email-raphael.norwitz@nutanix.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Marc-André Lureau <marcandre.lureau@redhat.com>
2020-05-21 08:00:35 +03:00
if (reply_supported) {
return process_message_reply(dev, &msg);
}
}
return 0;
}
static int vhost_user_set_vring_endian(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
bool cross_endian = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CROSS_ENDIAN);
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_VRING_ENDIAN,
.hdr.flags = VHOST_USER_VERSION,
.payload.state = *ring,
.hdr.size = sizeof(msg.payload.state),
};
if (!cross_endian) {
error_report("vhost-user trying to send unhandled ioctl");
return -ENOTSUP;
}
return vhost_user_write(dev, &msg, NULL, 0);
}
static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64)
{
int ret;
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
};
if (vhost_user_per_device_request(request) && dev->vq_index != 0) {
return 0;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != request) {
error_report("Received unexpected msg type. Expected %d received %d",
request, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.u64)) {
error_report("Received bad msg size.");
return -EPROTO;
}
*u64 = msg.payload.u64;
return 0;
}
static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features)
{
if (vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features) < 0) {
return -EPROTO;
}
return 0;
}
/* Note: "msg->hdr.flags" may be modified. */
static int vhost_user_write_sync(struct vhost_dev *dev, VhostUserMsg *msg,
bool wait_for_reply)
{
int ret;
if (wait_for_reply) {
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
if (reply_supported) {
msg->hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
}
ret = vhost_user_write(dev, msg, NULL, 0);
if (ret < 0) {
return ret;
}
if (wait_for_reply) {
uint64_t dummy;
if (msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
return process_message_reply(dev, msg);
}
/*
* We need to wait for a reply but the backend does not
* support replies for the command we just sent.
* Send VHOST_USER_GET_FEATURES which makes all backends
* send a reply.
*/
return vhost_user_get_features(dev, &dummy);
}
return 0;
}
static int vhost_set_vring(struct vhost_dev *dev,
unsigned long int request,
2023-10-02 23:32:20 +03:00
struct vhost_vring_state *ring,
bool wait_for_reply)
{
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
.payload.state = *ring,
.hdr.size = sizeof(msg.payload.state),
};
2023-10-02 23:32:20 +03:00
return vhost_user_write_sync(dev, &msg, wait_for_reply);
}
static int vhost_user_set_vring_num(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
2023-10-02 23:32:20 +03:00
return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring, false);
}
static void vhost_user_host_notifier_free(VhostUserHostNotifier *n)
{
assert(n && n->unmap_addr);
munmap(n->unmap_addr, qemu_real_host_page_size());
n->unmap_addr = NULL;
}
/*
* clean-up function for notifier, will finally free the structure
* under rcu.
*/
static void vhost_user_host_notifier_remove(VhostUserHostNotifier *n,
VirtIODevice *vdev)
{
if (n->addr) {
if (vdev) {
virtio_queue_set_host_notifier_mr(vdev, n->idx, &n->mr, false);
}
assert(!n->unmap_addr);
n->unmap_addr = n->addr;
n->addr = NULL;
call_rcu(n, vhost_user_host_notifier_free, rcu);
}
}
static int vhost_user_set_vring_base(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
2023-10-02 23:32:20 +03:00
return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring, false);
}
static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable)
{
int i;
if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) {
return -EINVAL;
}
for (i = 0; i < dev->nvqs; ++i) {
int ret;
struct vhost_vring_state state = {
.index = dev->vq_index + i,
.num = enable,
};
vhost-user: call VHOST_USER_SET_VRING_ENABLE synchronously (1) The virtio-1.2 specification <http://docs.oasis-open.org/virtio/virtio/v1.2/virtio-v1.2.html> writes: > 3 General Initialization And Device Operation > 3.1 Device Initialization > 3.1.1 Driver Requirements: Device Initialization > > [...] > > 7. Perform device-specific setup, including discovery of virtqueues for > the device, optional per-bus setup, reading and possibly writing the > device’s virtio configuration space, and population of virtqueues. > > 8. Set the DRIVER_OK status bit. At this point the device is “live”. and > 4 Virtio Transport Options > 4.1 Virtio Over PCI Bus > 4.1.4 Virtio Structure PCI Capabilities > 4.1.4.3 Common configuration structure layout > 4.1.4.3.2 Driver Requirements: Common configuration structure layout > > [...] > > The driver MUST configure the other virtqueue fields before enabling the > virtqueue with queue_enable. > > [...] (The same statements are present in virtio-1.0 identically, at <http://docs.oasis-open.org/virtio/virtio/v1.0/virtio-v1.0.html>.) These together mean that the following sub-sequence of steps is valid for a virtio-1.0 guest driver: (1.1) set "queue_enable" for the needed queues as the final part of device initialization step (7), (1.2) set DRIVER_OK in step (8), (1.3) immediately start sending virtio requests to the device. (2) When vhost-user is enabled, and the VHOST_USER_F_PROTOCOL_FEATURES special virtio feature is negotiated, then virtio rings start in disabled state, according to <https://qemu-project.gitlab.io/qemu/interop/vhost-user.html#ring-states>. In this case, explicit VHOST_USER_SET_VRING_ENABLE messages are needed for enabling vrings. Therefore setting "queue_enable" from the guest (1.1) -- which is technically "buffered" on the QEMU side until the guest sets DRIVER_OK (1.2) -- is a *control plane* operation, which -- after (1.2) -- travels from the guest through QEMU to the vhost-user backend, using a unix domain socket. Whereas sending a virtio request (1.3) is a *data plane* operation, which evades QEMU -- it travels from guest to the vhost-user backend via eventfd. This means that operations ((1.1) + (1.2)) and (1.3) travel through different channels, and their relative order can be reversed, as perceived by the vhost-user backend. That's exactly what happens when OVMF's virtiofs driver (VirtioFsDxe) runs against the Rust-language virtiofsd version 1.7.2. (Which uses version 0.10.1 of the vhost-user-backend crate, and version 0.8.1 of the vhost crate.) Namely, when VirtioFsDxe binds a virtiofs device, it goes through the device initialization steps (i.e., control plane operations), and immediately sends a FUSE_INIT request too (i.e., performs a data plane operation). In the Rust-language virtiofsd, this creates a race between two components that run *concurrently*, i.e., in different threads or processes: - Control plane, handling vhost-user protocol messages: The "VhostUserSlaveReqHandlerMut::set_vring_enable" method [crates/vhost-user-backend/src/handler.rs] handles VHOST_USER_SET_VRING_ENABLE messages, and updates each vring's "enabled" flag according to the message processed. - Data plane, handling virtio / FUSE requests: The "VringEpollHandler::handle_event" method [crates/vhost-user-backend/src/event_loop.rs] handles the incoming virtio / FUSE request, consuming the virtio kick at the same time. If the vring's "enabled" flag is set, the virtio / FUSE request is processed genuinely. If the vring's "enabled" flag is clear, then the virtio / FUSE request is discarded. Note that OVMF enables the queue *first*, and sends FUSE_INIT *second*. However, if the data plane processor in virtiofsd wins the race, then it sees the FUSE_INIT *before* the control plane processor took notice of VHOST_USER_SET_VRING_ENABLE and green-lit the queue for the data plane processor. Therefore the latter drops FUSE_INIT on the floor, and goes back to waiting for further virtio / FUSE requests with epoll_wait. Meanwhile OVMF is stuck waiting for the FUSET_INIT response -- a deadlock. The deadlock is not deterministic. OVMF hangs infrequently during first boot. However, OVMF hangs almost certainly during reboots from the UEFI shell. The race can be "reliably masked" by inserting a very small delay -- a single debug message -- at the top of "VringEpollHandler::handle_event", i.e., just before the data plane processor checks the "enabled" field of the vring. That delay suffices for the control plane processor to act upon VHOST_USER_SET_VRING_ENABLE. We can deterministically prevent the race in QEMU, by blocking OVMF inside step (1.2) -- i.e., in the write to the device status register that "unleashes" queue enablement -- until VHOST_USER_SET_VRING_ENABLE actually *completes*. That way OVMF's VCPU cannot advance to the FUSE_INIT submission before virtiofsd's control plane processor takes notice of the queue being enabled. Wait for VHOST_USER_SET_VRING_ENABLE completion by: - setting the NEED_REPLY flag on VHOST_USER_SET_VRING_ENABLE, and waiting for the reply, if the VHOST_USER_PROTOCOL_F_REPLY_ACK vhost-user feature has been negotiated, or - performing a separate VHOST_USER_GET_FEATURES *exchange*, which requires a backend response regardless of VHOST_USER_PROTOCOL_F_REPLY_ACK. Cc: "Michael S. Tsirkin" <mst@redhat.com> (supporter:vhost) Cc: Eugenio Perez Martin <eperezma@redhat.com> Cc: German Maglione <gmaglione@redhat.com> Cc: Liu Jiang <gerry@linux.alibaba.com> Cc: Sergio Lopez Pascual <slp@redhat.com> Cc: Stefano Garzarella <sgarzare@redhat.com> Signed-off-by: Laszlo Ersek <lersek@redhat.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> Tested-by: Albert Esteve <aesteve@redhat.com> [lersek@redhat.com: work Eugenio's explanation into the commit message, about QEMU containing step (1.1) until step (1.2)] Reviewed-by: Eugenio Pérez <eperezma@redhat.com> Message-Id: <20231002203221.17241-8-lersek@redhat.com> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2023-10-02 23:32:21 +03:00
/*
* SET_VRING_ENABLE travels from guest to QEMU to vhost-user backend /
* control plane thread via unix domain socket. Virtio requests travel
* from guest to vhost-user backend / data plane thread via eventfd.
* Even if the guest enables the ring first, and pushes its first virtio
* request second (conforming to the virtio spec), the data plane thread
* in the backend may see the virtio request before the control plane
* thread sees the queue enablement. This causes (in fact, requires) the
* data plane thread to discard the virtio request (it arrived on a
* seemingly disabled queue). To prevent this out-of-order delivery,
* don't let the guest proceed to pushing the virtio request until the
* backend control plane acknowledges enabling the queue -- IOW, pass
* wait_for_reply=true below.
*/
ret = vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state, true);
if (ret < 0) {
/*
* Restoring the previous state is likely infeasible, as well as
* proceeding regardless the error, so just bail out and hope for
* the device-level recovery.
*/
return ret;
}
}
return 0;
}
static VhostUserHostNotifier *fetch_notifier(VhostUserState *u,
int idx)
{
if (idx >= u->notifiers->len) {
return NULL;
}
return g_ptr_array_index(u->notifiers, idx);
}
static int vhost_user_get_vring_base(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_VRING_BASE,
.hdr.flags = VHOST_USER_VERSION,
.payload.state = *ring,
.hdr.size = sizeof(msg.payload.state),
};
struct vhost_user *u = dev->opaque;
VhostUserHostNotifier *n = fetch_notifier(u->user, ring->index);
if (n) {
vhost_user_host_notifier_remove(n, dev->vdev);
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_VRING_BASE) {
error_report("Received unexpected msg type. Expected %d received %d",
VHOST_USER_GET_VRING_BASE, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.state)) {
error_report("Received bad msg size.");
return -EPROTO;
}
*ring = msg.payload.state;
return 0;
}
static int vhost_set_vring_file(struct vhost_dev *dev,
VhostUserRequest request,
struct vhost_vring_file *file)
{
int fds[VHOST_USER_MAX_RAM_SLOTS];
size_t fd_num = 0;
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
.hdr.size = sizeof(msg.payload.u64),
};
if (file->fd > 0) {
fds[fd_num++] = file->fd;
} else {
msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
}
return vhost_user_write(dev, &msg, fds, fd_num);
}
static int vhost_user_set_vring_kick(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_KICK, file);
}
static int vhost_user_set_vring_call(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file);
}
static int vhost_user_set_vring_err(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_ERR, file);
}
static int vhost_user_set_vring_addr(struct vhost_dev *dev,
struct vhost_vring_addr *addr)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_VRING_ADDR,
.hdr.flags = VHOST_USER_VERSION,
.payload.addr = *addr,
.hdr.size = sizeof(msg.payload.addr),
};
/*
* wait for a reply if logging is enabled to make sure
* backend is actually logging changes
*/
bool wait_for_reply = addr->flags & (1 << VHOST_VRING_F_LOG);
return vhost_user_write_sync(dev, &msg, wait_for_reply);
}
vhost: make SET_VRING_ADDR, SET_FEATURES send replies On vhost-user-blk migration, qemu normally sends a number of commands to enable logging if VHOST_USER_PROTOCOL_F_LOG_SHMFD is negotiated. Qemu sends VHOST_USER_SET_FEATURES to enable buffers logging and VHOST_USER_SET_VRING_ADDR per each started ring to enable "used ring" data logging. The issue is that qemu doesn't wait for reply from the vhost daemon for these commands which may result in races between qemu expectation of logging starting and actual login starting in vhost daemon. The race can appear as follows: on migration setup, qemu enables dirty page logging by sending VHOST_USER_SET_FEATURES. The command doesn't arrive to a vhost-user-blk daemon immediately and the daemon needs some time to turn the logging on internally. If qemu doesn't wait for reply, after sending the command, qemu may start migrateing memory pages to a destination. At this time, the logging may not be actually turned on in the daemon but some guest pages, which the daemon is about to write to, may have already been transferred without logging to the destination. Since the logging wasn't turned on, those pages won't be transferred again as dirty. So we may end up with corrupted data on the destination. The same scenario is applicable for "used ring" data logging, which is turned on with VHOST_USER_SET_VRING_ADDR command. To resolve this issue, this patch makes qemu wait for the command result explicitly if VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated and logging enabled. Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru> Message-Id: <20210809104824.78830-1-den-plotnikov@yandex-team.ru> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2021-08-09 13:48:24 +03:00
static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64,
bool wait_for_reply)
{
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
.payload.u64 = u64,
.hdr.size = sizeof(msg.payload.u64),
};
return vhost_user_write_sync(dev, &msg, wait_for_reply);
vhost: make SET_VRING_ADDR, SET_FEATURES send replies On vhost-user-blk migration, qemu normally sends a number of commands to enable logging if VHOST_USER_PROTOCOL_F_LOG_SHMFD is negotiated. Qemu sends VHOST_USER_SET_FEATURES to enable buffers logging and VHOST_USER_SET_VRING_ADDR per each started ring to enable "used ring" data logging. The issue is that qemu doesn't wait for reply from the vhost daemon for these commands which may result in races between qemu expectation of logging starting and actual login starting in vhost daemon. The race can appear as follows: on migration setup, qemu enables dirty page logging by sending VHOST_USER_SET_FEATURES. The command doesn't arrive to a vhost-user-blk daemon immediately and the daemon needs some time to turn the logging on internally. If qemu doesn't wait for reply, after sending the command, qemu may start migrateing memory pages to a destination. At this time, the logging may not be actually turned on in the daemon but some guest pages, which the daemon is about to write to, may have already been transferred without logging to the destination. Since the logging wasn't turned on, those pages won't be transferred again as dirty. So we may end up with corrupted data on the destination. The same scenario is applicable for "used ring" data logging, which is turned on with VHOST_USER_SET_VRING_ADDR command. To resolve this issue, this patch makes qemu wait for the command result explicitly if VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated and logging enabled. Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru> Message-Id: <20210809104824.78830-1-den-plotnikov@yandex-team.ru> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2021-08-09 13:48:24 +03:00
}
static int vhost_user_set_status(struct vhost_dev *dev, uint8_t status)
{
return vhost_user_set_u64(dev, VHOST_USER_SET_STATUS, status, false);
}
static int vhost_user_get_status(struct vhost_dev *dev, uint8_t *status)
{
uint64_t value;
int ret;
ret = vhost_user_get_u64(dev, VHOST_USER_GET_STATUS, &value);
if (ret < 0) {
return ret;
}
*status = value;
return 0;
}
static int vhost_user_add_status(struct vhost_dev *dev, uint8_t status)
{
uint8_t s;
int ret;
ret = vhost_user_get_status(dev, &s);
if (ret < 0) {
return ret;
}
if ((s & status) == status) {
return 0;
}
s |= status;
return vhost_user_set_status(dev, s);
}
vhost: make SET_VRING_ADDR, SET_FEATURES send replies On vhost-user-blk migration, qemu normally sends a number of commands to enable logging if VHOST_USER_PROTOCOL_F_LOG_SHMFD is negotiated. Qemu sends VHOST_USER_SET_FEATURES to enable buffers logging and VHOST_USER_SET_VRING_ADDR per each started ring to enable "used ring" data logging. The issue is that qemu doesn't wait for reply from the vhost daemon for these commands which may result in races between qemu expectation of logging starting and actual login starting in vhost daemon. The race can appear as follows: on migration setup, qemu enables dirty page logging by sending VHOST_USER_SET_FEATURES. The command doesn't arrive to a vhost-user-blk daemon immediately and the daemon needs some time to turn the logging on internally. If qemu doesn't wait for reply, after sending the command, qemu may start migrateing memory pages to a destination. At this time, the logging may not be actually turned on in the daemon but some guest pages, which the daemon is about to write to, may have already been transferred without logging to the destination. Since the logging wasn't turned on, those pages won't be transferred again as dirty. So we may end up with corrupted data on the destination. The same scenario is applicable for "used ring" data logging, which is turned on with VHOST_USER_SET_VRING_ADDR command. To resolve this issue, this patch makes qemu wait for the command result explicitly if VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated and logging enabled. Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru> Message-Id: <20210809104824.78830-1-den-plotnikov@yandex-team.ru> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2021-08-09 13:48:24 +03:00
static int vhost_user_set_features(struct vhost_dev *dev,
uint64_t features)
{
/*
* wait for a reply if logging is enabled to make sure
* backend is actually logging changes
*/
bool log_enabled = features & (0x1ULL << VHOST_F_LOG_ALL);
int ret;
vhost: make SET_VRING_ADDR, SET_FEATURES send replies On vhost-user-blk migration, qemu normally sends a number of commands to enable logging if VHOST_USER_PROTOCOL_F_LOG_SHMFD is negotiated. Qemu sends VHOST_USER_SET_FEATURES to enable buffers logging and VHOST_USER_SET_VRING_ADDR per each started ring to enable "used ring" data logging. The issue is that qemu doesn't wait for reply from the vhost daemon for these commands which may result in races between qemu expectation of logging starting and actual login starting in vhost daemon. The race can appear as follows: on migration setup, qemu enables dirty page logging by sending VHOST_USER_SET_FEATURES. The command doesn't arrive to a vhost-user-blk daemon immediately and the daemon needs some time to turn the logging on internally. If qemu doesn't wait for reply, after sending the command, qemu may start migrateing memory pages to a destination. At this time, the logging may not be actually turned on in the daemon but some guest pages, which the daemon is about to write to, may have already been transferred without logging to the destination. Since the logging wasn't turned on, those pages won't be transferred again as dirty. So we may end up with corrupted data on the destination. The same scenario is applicable for "used ring" data logging, which is turned on with VHOST_USER_SET_VRING_ADDR command. To resolve this issue, this patch makes qemu wait for the command result explicitly if VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated and logging enabled. Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru> Message-Id: <20210809104824.78830-1-den-plotnikov@yandex-team.ru> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2021-08-09 13:48:24 +03:00
/*
* We need to include any extra backend only feature bits that
* might be needed by our device. Currently this includes the
* VHOST_USER_F_PROTOCOL_FEATURES bit for enabling protocol
* features.
*/
ret = vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES,
features | dev->backend_features,
vhost: make SET_VRING_ADDR, SET_FEATURES send replies On vhost-user-blk migration, qemu normally sends a number of commands to enable logging if VHOST_USER_PROTOCOL_F_LOG_SHMFD is negotiated. Qemu sends VHOST_USER_SET_FEATURES to enable buffers logging and VHOST_USER_SET_VRING_ADDR per each started ring to enable "used ring" data logging. The issue is that qemu doesn't wait for reply from the vhost daemon for these commands which may result in races between qemu expectation of logging starting and actual login starting in vhost daemon. The race can appear as follows: on migration setup, qemu enables dirty page logging by sending VHOST_USER_SET_FEATURES. The command doesn't arrive to a vhost-user-blk daemon immediately and the daemon needs some time to turn the logging on internally. If qemu doesn't wait for reply, after sending the command, qemu may start migrateing memory pages to a destination. At this time, the logging may not be actually turned on in the daemon but some guest pages, which the daemon is about to write to, may have already been transferred without logging to the destination. Since the logging wasn't turned on, those pages won't be transferred again as dirty. So we may end up with corrupted data on the destination. The same scenario is applicable for "used ring" data logging, which is turned on with VHOST_USER_SET_VRING_ADDR command. To resolve this issue, this patch makes qemu wait for the command result explicitly if VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated and logging enabled. Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru> Message-Id: <20210809104824.78830-1-den-plotnikov@yandex-team.ru> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2021-08-09 13:48:24 +03:00
log_enabled);
if (virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_STATUS)) {
if (!ret) {
return vhost_user_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}
}
return ret;
vhost: make SET_VRING_ADDR, SET_FEATURES send replies On vhost-user-blk migration, qemu normally sends a number of commands to enable logging if VHOST_USER_PROTOCOL_F_LOG_SHMFD is negotiated. Qemu sends VHOST_USER_SET_FEATURES to enable buffers logging and VHOST_USER_SET_VRING_ADDR per each started ring to enable "used ring" data logging. The issue is that qemu doesn't wait for reply from the vhost daemon for these commands which may result in races between qemu expectation of logging starting and actual login starting in vhost daemon. The race can appear as follows: on migration setup, qemu enables dirty page logging by sending VHOST_USER_SET_FEATURES. The command doesn't arrive to a vhost-user-blk daemon immediately and the daemon needs some time to turn the logging on internally. If qemu doesn't wait for reply, after sending the command, qemu may start migrateing memory pages to a destination. At this time, the logging may not be actually turned on in the daemon but some guest pages, which the daemon is about to write to, may have already been transferred without logging to the destination. Since the logging wasn't turned on, those pages won't be transferred again as dirty. So we may end up with corrupted data on the destination. The same scenario is applicable for "used ring" data logging, which is turned on with VHOST_USER_SET_VRING_ADDR command. To resolve this issue, this patch makes qemu wait for the command result explicitly if VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated and logging enabled. Signed-off-by: Denis Plotnikov <den-plotnikov@yandex-team.ru> Message-Id: <20210809104824.78830-1-den-plotnikov@yandex-team.ru> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2021-08-09 13:48:24 +03:00
}
static int vhost_user_set_protocol_features(struct vhost_dev *dev,
uint64_t features)
{
return vhost_user_set_u64(dev, VHOST_USER_SET_PROTOCOL_FEATURES, features,
false);
}
static int vhost_user_set_owner(struct vhost_dev *dev)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_OWNER,
.hdr.flags = VHOST_USER_VERSION,
};
return vhost_user_write(dev, &msg, NULL, 0);
}
static int vhost_user_get_max_memslots(struct vhost_dev *dev,
uint64_t *max_memslots)
{
uint64_t backend_max_memslots;
int err;
err = vhost_user_get_u64(dev, VHOST_USER_GET_MAX_MEM_SLOTS,
&backend_max_memslots);
if (err < 0) {
return err;
}
*max_memslots = backend_max_memslots;
return 0;
}
static int vhost_user_reset_device(struct vhost_dev *dev)
{
VhostUserMsg msg = {
.hdr.flags = VHOST_USER_VERSION,
.hdr.request = VHOST_USER_RESET_DEVICE,
};
/*
* Historically, reset was not implemented so only reset devices
* that are expecting it.
*/
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_RESET_DEVICE)) {
return -ENOSYS;
}
return vhost_user_write(dev, &msg, NULL, 0);
}
static int vhost_user_backend_handle_config_change(struct vhost_dev *dev)
{
if (!dev->config_ops || !dev->config_ops->vhost_dev_config_notifier) {
return -ENOSYS;
}
return dev->config_ops->vhost_dev_config_notifier(dev);
}
/*
* Fetch or create the notifier for a given idx. Newly created
* notifiers are added to the pointer array that tracks them.
*/
static VhostUserHostNotifier *fetch_or_create_notifier(VhostUserState *u,
int idx)
{
VhostUserHostNotifier *n = NULL;
if (idx >= u->notifiers->len) {
g_ptr_array_set_size(u->notifiers, idx + 1);
}
n = g_ptr_array_index(u->notifiers, idx);
if (!n) {
/*
* In case notification arrive out-of-order,
* make room for current index.
*/
g_ptr_array_remove_index(u->notifiers, idx);
n = g_new0(VhostUserHostNotifier, 1);
n->idx = idx;
g_ptr_array_insert(u->notifiers, idx, n);
trace_vhost_user_create_notifier(idx, n);
}
return n;
}
static int vhost_user_backend_handle_vring_host_notifier(struct vhost_dev *dev,
VhostUserVringArea *area,
int fd)
{
int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
size_t page_size = qemu_real_host_page_size();
struct vhost_user *u = dev->opaque;
VhostUserState *user = u->user;
VirtIODevice *vdev = dev->vdev;
VhostUserHostNotifier *n;
void *addr;
char *name;
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
return -EINVAL;
}
/*
* Fetch notifier and invalidate any old data before setting up
* new mapped address.
*/
n = fetch_or_create_notifier(user, queue_idx);
vhost_user_host_notifier_remove(n, vdev);
if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
return 0;
}
/* Sanity check. */
if (area->size != page_size) {
return -EINVAL;
}
addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
fd, area->offset);
if (addr == MAP_FAILED) {
return -EFAULT;
}
name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
user, queue_idx);
if (!n->mr.ram) { /* Don't init again after suspend. */
memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
page_size, addr);
} else {
n->mr.ram_block->host = addr;
}
g_free(name);
if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
object_unparent(OBJECT(&n->mr));
munmap(addr, page_size);
return -ENXIO;
}
n->addr = addr;
return 0;
}
static int
vhost_user_backend_handle_shared_object_add(struct vhost_dev *dev,
VhostUserShared *object)
{
QemuUUID uuid;
memcpy(uuid.data, object->uuid, sizeof(object->uuid));
return virtio_add_vhost_device(&uuid, dev);
}
static int
vhost_user_backend_handle_shared_object_remove(VhostUserShared *object)
{
QemuUUID uuid;
memcpy(uuid.data, object->uuid, sizeof(object->uuid));
return virtio_remove_resource(&uuid);
}
static bool vhost_user_send_resp(QIOChannel *ioc, VhostUserHeader *hdr,
VhostUserPayload *payload, Error **errp)
{
struct iovec iov[] = {
{ .iov_base = hdr, .iov_len = VHOST_USER_HDR_SIZE },
{ .iov_base = payload, .iov_len = hdr->size },
};
hdr->flags &= ~VHOST_USER_NEED_REPLY_MASK;
hdr->flags |= VHOST_USER_REPLY_MASK;
return !qio_channel_writev_all(ioc, iov, ARRAY_SIZE(iov), errp);
}
static bool
vhost_user_backend_send_dmabuf_fd(QIOChannel *ioc, VhostUserHeader *hdr,
VhostUserPayload *payload, Error **errp)
{
hdr->size = sizeof(payload->u64);
return vhost_user_send_resp(ioc, hdr, payload, errp);
}
int vhost_user_get_shared_object(struct vhost_dev *dev, unsigned char *uuid,
int *dmabuf_fd)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_SHARED_OBJECT,
.hdr.flags = VHOST_USER_VERSION,
};
memcpy(msg.payload.object.uuid, uuid, sizeof(msg.payload.object.uuid));
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_SHARED_OBJECT) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
VHOST_USER_GET_SHARED_OBJECT, msg.hdr.request);
return -EPROTO;
}
*dmabuf_fd = qemu_chr_fe_get_msgfd(chr);
if (*dmabuf_fd < 0) {
error_report("Failed to get dmabuf fd");
return -EIO;
}
return 0;
}
static int
vhost_user_backend_handle_shared_object_lookup(struct vhost_user *u,
QIOChannel *ioc,
VhostUserHeader *hdr,
VhostUserPayload *payload)
{
QemuUUID uuid;
CharBackend *chr = u->user->chr;
Error *local_err = NULL;
int dmabuf_fd = -1;
int fd_num = 0;
memcpy(uuid.data, payload->object.uuid, sizeof(payload->object.uuid));
payload->u64 = 0;
switch (virtio_object_type(&uuid)) {
case TYPE_DMABUF:
dmabuf_fd = virtio_lookup_dmabuf(&uuid);
break;
case TYPE_VHOST_DEV:
{
struct vhost_dev *dev = virtio_lookup_vhost_device(&uuid);
if (dev == NULL) {
payload->u64 = -EINVAL;
break;
}
int ret = vhost_user_get_shared_object(dev, uuid.data, &dmabuf_fd);
if (ret < 0) {
payload->u64 = ret;
}
break;
}
case TYPE_INVALID:
payload->u64 = -EINVAL;
break;
}
if (dmabuf_fd != -1) {
fd_num++;
}
if (qemu_chr_fe_set_msgfds(chr, &dmabuf_fd, fd_num) < 0) {
error_report("Failed to set msg fds.");
payload->u64 = -EINVAL;
}
if (!vhost_user_backend_send_dmabuf_fd(ioc, hdr, payload, &local_err)) {
error_report_err(local_err);
return -EINVAL;
}
return 0;
}
static void close_backend_channel(struct vhost_user *u)
{
g_source_destroy(u->backend_src);
g_source_unref(u->backend_src);
u->backend_src = NULL;
object_unref(OBJECT(u->backend_ioc));
u->backend_ioc = NULL;
}
static gboolean backend_read(QIOChannel *ioc, GIOCondition condition,
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
gpointer opaque)
{
struct vhost_dev *dev = opaque;
struct vhost_user *u = dev->opaque;
VhostUserHeader hdr = { 0, };
VhostUserPayload payload = { 0, };
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
Error *local_err = NULL;
gboolean rc = G_SOURCE_CONTINUE;
int ret = 0;
struct iovec iov;
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
g_autofree int *fd = NULL;
size_t fdsize = 0;
int i;
/* Read header */
iov.iov_base = &hdr;
iov.iov_len = VHOST_USER_HDR_SIZE;
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
if (qio_channel_readv_full_all(ioc, &iov, 1, &fd, &fdsize, &local_err)) {
error_report_err(local_err);
goto err;
}
if (hdr.size > VHOST_USER_PAYLOAD_SIZE) {
error_report("Failed to read msg header."
" Size %d exceeds the maximum %zu.", hdr.size,
VHOST_USER_PAYLOAD_SIZE);
goto err;
}
/* Read payload */
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
if (qio_channel_read_all(ioc, (char *) &payload, hdr.size, &local_err)) {
error_report_err(local_err);
goto err;
}
switch (hdr.request) {
case VHOST_USER_BACKEND_IOTLB_MSG:
ret = vhost_backend_handle_iotlb_msg(dev, &payload.iotlb);
break;
case VHOST_USER_BACKEND_CONFIG_CHANGE_MSG:
ret = vhost_user_backend_handle_config_change(dev);
break;
case VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG:
ret = vhost_user_backend_handle_vring_host_notifier(dev, &payload.area,
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
fd ? fd[0] : -1);
break;
case VHOST_USER_BACKEND_SHARED_OBJECT_ADD:
ret = vhost_user_backend_handle_shared_object_add(dev, &payload.object);
break;
case VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE:
ret = vhost_user_backend_handle_shared_object_remove(&payload.object);
break;
case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP:
ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc,
&hdr, &payload);
break;
default:
error_report("Received unexpected msg type: %d.", hdr.request);
ret = -EINVAL;
}
/*
* REPLY_ACK feature handling. Other reply types has to be managed
* directly in their request handlers.
*/
if (hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
payload.u64 = !!ret;
hdr.size = sizeof(payload.u64);
if (!vhost_user_send_resp(ioc, &hdr, &payload, &local_err)) {
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
error_report_err(local_err);
goto err;
}
}
goto fdcleanup;
err:
close_backend_channel(u);
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
rc = G_SOURCE_REMOVE;
fdcleanup:
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
if (fd) {
for (i = 0; i < fdsize; i++) {
close(fd[i]);
}
}
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
return rc;
}
static int vhost_setup_backend_channel(struct vhost_dev *dev)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_BACKEND_REQ_FD,
.hdr.flags = VHOST_USER_VERSION,
};
struct vhost_user *u = dev->opaque;
int sv[2], ret = 0;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
Error *local_err = NULL;
QIOChannel *ioc;
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_BACKEND_REQ)) {
return 0;
}
if (qemu_socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
int saved_errno = errno;
error_report("socketpair() failed");
return -saved_errno;
}
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
ioc = QIO_CHANNEL(qio_channel_socket_new_fd(sv[0], &local_err));
if (!ioc) {
error_report_err(local_err);
return -ECONNREFUSED;
vhost-user: Convert slave channel to QIOChannelSocket The slave channel is implemented with socketpair() : QEMU creates the pair, passes one of the socket to virtiofsd and monitors the other one with the main event loop using qemu_set_fd_handler(). In order to fix a potential deadlock between QEMU and a vhost-user external process (e.g. virtiofsd with DAX), we want to be able to monitor and service the slave channel while handling vhost-user requests. Prepare ground for this by converting the slave channel to be a QIOChannelSocket. This will make monitoring of the slave channel as simple as calling qio_channel_add_watch_source(). Since the connection is already established between the two sockets, only incoming I/O (G_IO_IN) and disconnect (G_IO_HUP) need to be serviced. This also allows to get rid of the ancillary data parsing since QIOChannelSocket can do this for us. Note that the MSG_CTRUNC check is dropped on the way because QIOChannelSocket ignores this case. This isn't a problem since slave_read() provisions space for 8 file descriptors, but affected vhost-user slave protocol messages generally only convey one. If for some reason a buggy implementation passes more file descriptors, no need to break the connection, just like we don't break it if some other type of ancillary data is received : this isn't explicitely violating the protocol per-se so it seems better to ignore it. The current code errors out on short reads and writes. Use the qio_channel_*_all() variants to address this on the way. Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20210312092212.782255-5-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
2021-03-12 12:22:09 +03:00
}
u->backend_ioc = ioc;
u->backend_src = qio_channel_add_watch_source(u->backend_ioc,
Revert "vhost-user: Monitor slave channel in vhost_user_read()" This reverts commit db8a3772e300c1a656331a92da0785d81667dc81. Motivation : this is breaking vhost-user with DPDK as reported in [0]. Received unexpected msg type. Expected 22 received 40 Fail to update device iotlb Received unexpected msg type. Expected 40 received 22 Received unexpected msg type. Expected 22 received 11 Fail to update device iotlb Received unexpected msg type. Expected 11 received 22 vhost VQ 1 ring restore failed: -71: Protocol error (71) Received unexpected msg type. Expected 22 received 11 Fail to update device iotlb Received unexpected msg type. Expected 11 received 22 vhost VQ 0 ring restore failed: -71: Protocol error (71) unable to start vhost net: 71: falling back on userspace virtio The failing sequence that leads to the first error is : - QEMU sends a VHOST_USER_GET_STATUS (40) request to DPDK on the master socket - QEMU starts a nested event loop in order to wait for the VHOST_USER_GET_STATUS response and to be able to process messages from the slave channel - DPDK sends a couple of legitimate IOTLB miss messages on the slave channel - QEMU processes each IOTLB request and sends VHOST_USER_IOTLB_MSG (22) updates on the master socket - QEMU assumes to receive a response for the latest VHOST_USER_IOTLB_MSG but it gets the response for the VHOST_USER_GET_STATUS instead The subsequent errors have the same root cause : the nested event loop breaks the order by design. It lures QEMU to expect responses to the latest message sent on the master socket to arrive first. Since this was only needed for DAX enablement which is still not merged upstream, just drop the code for now. A working solution will have to be merged later on. Likely protect the master socket with a mutex and service the slave channel with a separate thread, as discussed with Maxime in the mail thread below. [0] https://lore.kernel.org/qemu-devel/43145ede-89dc-280e-b953-6a2b436de395@redhat.com/ Reported-by: Yanghang Liu <yanghliu@redhat.com> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2155173 Signed-off-by: Greg Kurz <groug@kaod.org> Message-Id: <20230119172424.478268-2-groug@kaod.org> Reviewed-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Stefan Hajnoczi <stefanha@redhat.com> Acked-by: Maxime Coquelin <maxime.coquelin@redhat.com>
2023-01-19 20:24:23 +03:00
G_IO_IN | G_IO_HUP,
backend_read, dev, NULL, NULL);
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
ret = vhost_user_write(dev, &msg, &sv[1], 1);
if (ret) {
goto out;
}
if (reply_supported) {
ret = process_message_reply(dev, &msg);
}
out:
close(sv[1]);
if (ret) {
close_backend_channel(u);
}
return ret;
}
#ifdef CONFIG_LINUX
/*
* Called back from the postcopy fault thread when a fault is received on our
* ufd.
* TODO: This is Linux specific
*/
static int vhost_user_postcopy_fault_handler(struct PostCopyFD *pcfd,
void *ufd)
{
struct vhost_dev *dev = pcfd->data;
struct vhost_user *u = dev->opaque;
struct uffd_msg *msg = ufd;
uint64_t faultaddr = msg->arg.pagefault.address;
RAMBlock *rb = NULL;
uint64_t rb_offset;
int i;
trace_vhost_user_postcopy_fault_handler(pcfd->idstr, faultaddr,
dev->mem->nregions);
for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
trace_vhost_user_postcopy_fault_handler_loop(i,
u->postcopy_client_bases[i], dev->mem->regions[i].memory_size);
if (faultaddr >= u->postcopy_client_bases[i]) {
/* Ofset of the fault address in the vhost region */
uint64_t region_offset = faultaddr - u->postcopy_client_bases[i];
if (region_offset < dev->mem->regions[i].memory_size) {
rb_offset = region_offset + u->region_rb_offset[i];
trace_vhost_user_postcopy_fault_handler_found(i,
region_offset, rb_offset);
rb = u->region_rb[i];
return postcopy_request_shared_page(pcfd, rb, faultaddr,
rb_offset);
}
}
}
error_report("%s: Failed to find region for fault %" PRIx64,
__func__, faultaddr);
return -1;
}
static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb,
uint64_t offset)
{
struct vhost_dev *dev = pcfd->data;
struct vhost_user *u = dev->opaque;
int i;
trace_vhost_user_postcopy_waker(qemu_ram_get_idstr(rb), offset);
if (!u) {
return 0;
}
/* Translate the offset into an address in the clients address space */
for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
if (u->region_rb[i] == rb &&
offset >= u->region_rb_offset[i] &&
offset < (u->region_rb_offset[i] +
dev->mem->regions[i].memory_size)) {
uint64_t client_addr = (offset - u->region_rb_offset[i]) +
u->postcopy_client_bases[i];
trace_vhost_user_postcopy_waker_found(client_addr);
return postcopy_wake_shared(pcfd, client_addr, rb);
}
}
trace_vhost_user_postcopy_waker_nomatch(qemu_ram_get_idstr(rb), offset);
return 0;
}
#endif
/*
* Called at the start of an inbound postcopy on reception of the
* 'advise' command.
*/
static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp)
{
#ifdef CONFIG_LINUX
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
int ufd;
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_POSTCOPY_ADVISE,
.hdr.flags = VHOST_USER_VERSION,
};
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg(errp, "Failed to send postcopy_advise to vhost");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg(errp, "Failed to get postcopy_advise reply from vhost");
return ret;
}
if (msg.hdr.request != VHOST_USER_POSTCOPY_ADVISE) {
error_setg(errp, "Unexpected msg type. Expected %d received %d",
VHOST_USER_POSTCOPY_ADVISE, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size) {
error_setg(errp, "Received bad msg size.");
return -EPROTO;
}
ufd = qemu_chr_fe_get_msgfd(chr);
if (ufd < 0) {
error_setg(errp, "%s: Failed to get ufd", __func__);
return -EIO;
}
qemu_socket_set_nonblock(ufd);
/* register ufd with userfault thread */
u->postcopy_fd.fd = ufd;
u->postcopy_fd.data = dev;
u->postcopy_fd.handler = vhost_user_postcopy_fault_handler;
u->postcopy_fd.waker = vhost_user_postcopy_waker;
u->postcopy_fd.idstr = "vhost-user"; /* Need to find unique name */
postcopy_register_shared_ufd(&u->postcopy_fd);
return 0;
#else
error_setg(errp, "Postcopy not supported on non-Linux systems");
return -ENOSYS;
#endif
}
/*
* Called at the switch to postcopy on reception of the 'listen' command.
*/
static int vhost_user_postcopy_listen(struct vhost_dev *dev, Error **errp)
{
struct vhost_user *u = dev->opaque;
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_POSTCOPY_LISTEN,
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
};
u->postcopy_listen = true;
trace_vhost_user_postcopy_listen();
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg(errp, "Failed to send postcopy_listen to vhost");
return ret;
}
ret = process_message_reply(dev, &msg);
if (ret) {
error_setg(errp, "Failed to receive reply to postcopy_listen");
return ret;
}
return 0;
}
/*
* Called at the end of postcopy
*/
static int vhost_user_postcopy_end(struct vhost_dev *dev, Error **errp)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_POSTCOPY_END,
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
};
int ret;
struct vhost_user *u = dev->opaque;
trace_vhost_user_postcopy_end_entry();
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg(errp, "Failed to send postcopy_end to vhost");
return ret;
}
ret = process_message_reply(dev, &msg);
if (ret) {
error_setg(errp, "Failed to receive reply to postcopy_end");
return ret;
}
postcopy_unregister_shared_ufd(&u->postcopy_fd);
close(u->postcopy_fd.fd);
u->postcopy_fd.handler = NULL;
trace_vhost_user_postcopy_end_exit();
return 0;
}
static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
void *opaque)
{
struct PostcopyNotifyData *pnd = opaque;
struct vhost_user *u = container_of(notifier, struct vhost_user,
postcopy_notifier);
struct vhost_dev *dev = u->dev;
switch (pnd->reason) {
case POSTCOPY_NOTIFY_PROBE:
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_PAGEFAULT)) {
/* TODO: Get the device name into this error somehow */
error_setg(pnd->errp,
"vhost-user backend not capable of postcopy");
return -ENOENT;
}
break;
case POSTCOPY_NOTIFY_INBOUND_ADVISE:
return vhost_user_postcopy_advise(dev, pnd->errp);
case POSTCOPY_NOTIFY_INBOUND_LISTEN:
return vhost_user_postcopy_listen(dev, pnd->errp);
case POSTCOPY_NOTIFY_INBOUND_END:
return vhost_user_postcopy_end(dev, pnd->errp);
default:
/* We ignore notifications we don't know */
break;
}
return 0;
}
static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque,
Error **errp)
{
uint64_t features, ram_slots;
struct vhost_user *u;
VhostUserState *vus = (VhostUserState *) opaque;
int err;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
u = g_new0(struct vhost_user, 1);
u->user = vus;
u->dev = dev;
dev->opaque = u;
err = vhost_user_get_features(dev, &features);
if (err < 0) {
error_setg_errno(errp, -err, "vhost_backend_init failed");
return err;
}
if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
bool supports_f_config = vus->supports_config ||
(dev->config_ops && dev->config_ops->vhost_dev_config_notifier);
uint64_t protocol_features;
dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES,
&protocol_features);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
/*
* We will use all the protocol features we support - although
* we suppress F_CONFIG if we know QEMUs internal code can not support
* it.
*/
protocol_features &= VHOST_USER_PROTOCOL_FEATURE_MASK;
if (supports_f_config) {
if (!virtio_has_feature(protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
error_setg(errp, "vhost-user device expecting "
"VHOST_USER_PROTOCOL_F_CONFIG but the vhost-user backend does "
"not support it.");
return -EPROTO;
}
} else {
if (virtio_has_feature(protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
warn_report("vhost-user backend supports "
"VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not.");
protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG);
}
}
/* final set of protocol features */
dev->protocol_features = protocol_features;
err = vhost_user_set_protocol_features(dev, dev->protocol_features);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
/* query the max queues we support if backend supports Multiple Queue */
if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) {
err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM,
&dev->max_queues);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
} else {
dev->max_queues = 1;
}
if (dev->num_queues && dev->max_queues < dev->num_queues) {
error_setg(errp, "The maximum number of queues supported by the "
"backend is %" PRIu64, dev->max_queues);
return -EINVAL;
}
if (virtio_has_feature(features, VIRTIO_F_IOMMU_PLATFORM) &&
!(virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_BACKEND_REQ) &&
virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
error_setg(errp, "IOMMU support requires reply-ack and "
"backend-req protocol features.");
return -EINVAL;
}
/* get max memory regions if backend supports configurable RAM slots */
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) {
u->user->memory_slots = VHOST_MEMORY_BASELINE_NREGIONS;
} else {
err = vhost_user_get_max_memslots(dev, &ram_slots);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
if (ram_slots < u->user->memory_slots) {
error_setg(errp, "The backend specified a max ram slots limit "
"of %" PRIu64", when the prior validated limit was "
"%d. This limit should never decrease.", ram_slots,
u->user->memory_slots);
return -EINVAL;
}
u->user->memory_slots = MIN(ram_slots, VHOST_USER_MAX_RAM_SLOTS);
}
}
if (dev->migration_blocker == NULL &&
!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_LOG_SHMFD)) {
error_setg(&dev->migration_blocker,
"Migration disabled: vhost-user backend lacks "
"VHOST_USER_PROTOCOL_F_LOG_SHMFD feature.");
}
if (dev->vq_index == 0) {
err = vhost_setup_backend_channel(dev);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
}
u->postcopy_notifier.notify = vhost_user_postcopy_notifier;
postcopy_add_notifier(&u->postcopy_notifier);
return 0;
}
static int vhost_user_backend_cleanup(struct vhost_dev *dev)
{
struct vhost_user *u;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
u = dev->opaque;
if (u->postcopy_notifier.notify) {
postcopy_remove_notifier(&u->postcopy_notifier);
u->postcopy_notifier.notify = NULL;
}
u->postcopy_listen = false;
if (u->postcopy_fd.handler) {
postcopy_unregister_shared_ufd(&u->postcopy_fd);
close(u->postcopy_fd.fd);
u->postcopy_fd.handler = NULL;
}
if (u->backend_ioc) {
close_backend_channel(u);
}
g_free(u->region_rb);
u->region_rb = NULL;
g_free(u->region_rb_offset);
u->region_rb_offset = NULL;
u->region_rb_len = 0;
g_free(u);
dev->opaque = 0;
return 0;
}
static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx)
{
assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
return idx;
}
static int vhost_user_memslots_limit(struct vhost_dev *dev)
{
struct vhost_user *u = dev->opaque;
return u->user->memory_slots;
}
static bool vhost_user_requires_shm_log(struct vhost_dev *dev)
{
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
return virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_LOG_SHMFD);
}
static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
{
VhostUserMsg msg = { };
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
/* If guest supports GUEST_ANNOUNCE do nothing */
if (virtio_has_feature(dev->acked_features, VIRTIO_NET_F_GUEST_ANNOUNCE)) {
return 0;
}
/* if backend supports VHOST_USER_PROTOCOL_F_RARP ask it to send the RARP */
if (virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_RARP)) {
msg.hdr.request = VHOST_USER_SEND_RARP;
msg.hdr.flags = VHOST_USER_VERSION;
memcpy((char *)&msg.payload.u64, mac_addr, 6);
msg.hdr.size = sizeof(msg.payload.u64);
return vhost_user_write(dev, &msg, NULL, 0);
}
return -ENOTSUP;
}
static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu)
{
VhostUserMsg msg;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
int ret;
if (!(dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))) {
return 0;
}
msg.hdr.request = VHOST_USER_NET_SET_MTU;
msg.payload.u64 = mtu;
msg.hdr.size = sizeof(msg.payload.u64);
msg.hdr.flags = VHOST_USER_VERSION;
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
/* If reply_ack supported, backend has to ack specified MTU is valid */
if (reply_supported) {
return process_message_reply(dev, &msg);
}
return 0;
}
static int vhost_user_send_device_iotlb_msg(struct vhost_dev *dev,
struct vhost_iotlb_msg *imsg)
{
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_IOTLB_MSG,
.hdr.size = sizeof(msg.payload.iotlb),
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
.payload.iotlb = *imsg,
};
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
return process_message_reply(dev, &msg);
}
static void vhost_user_set_iotlb_callback(struct vhost_dev *dev, int enabled)
{
/* No-op as the receive channel is not dedicated to IOTLB messages. */
}
static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config,
uint32_t config_len, Error **errp)
{
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_CONFIG,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = VHOST_USER_CONFIG_HDR_SIZE + config_len,
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
error_setg(errp, "VHOST_USER_PROTOCOL_F_CONFIG not supported");
return -EINVAL;
}
assert(config_len <= VHOST_USER_MAX_CONFIG_SIZE);
msg.payload.config.offset = 0;
msg.payload.config.size = config_len;
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg_errno(errp, -ret, "vhost_get_config failed");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret, "vhost_get_config failed");
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_CONFIG) {
error_setg(errp,
"Received unexpected msg type. Expected %d received %d",
VHOST_USER_GET_CONFIG, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != VHOST_USER_CONFIG_HDR_SIZE + config_len) {
error_setg(errp, "Received bad msg size.");
return -EPROTO;
}
memcpy(config, msg.payload.config.region, config_len);
return 0;
}
static int vhost_user_set_config(struct vhost_dev *dev, const uint8_t *data,
uint32_t offset, uint32_t size, uint32_t flags)
{
int ret;
uint8_t *p;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_CONFIG,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = VHOST_USER_CONFIG_HDR_SIZE + size,
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
return -ENOTSUP;
}
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
if (size > VHOST_USER_MAX_CONFIG_SIZE) {
return -EINVAL;
}
msg.payload.config.offset = offset,
msg.payload.config.size = size,
msg.payload.config.flags = flags,
p = msg.payload.config.region;
memcpy(p, data, size);
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
if (reply_supported) {
return process_message_reply(dev, &msg);
}
return 0;
}
static int vhost_user_crypto_create_session(struct vhost_dev *dev,
void *session_info,
uint64_t *session_id)
{
int ret;
bool crypto_session = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CRYPTO_SESSION);
CryptoDevBackendSessionInfo *backend_info = session_info;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_CREATE_CRYPTO_SESSION,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = sizeof(msg.payload.session),
};
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
if (!crypto_session) {
error_report("vhost-user trying to send unhandled ioctl");
return -ENOTSUP;
}
if (backend_info->op_code == VIRTIO_CRYPTO_AKCIPHER_CREATE_SESSION) {
CryptoDevBackendAsymSessionInfo *sess = &backend_info->u.asym_sess_info;
size_t keylen;
memcpy(&msg.payload.session.u.asym.session_setup_data, sess,
sizeof(CryptoDevBackendAsymSessionInfo));
if (sess->keylen) {
keylen = sizeof(msg.payload.session.u.asym.key);
if (sess->keylen > keylen) {
error_report("Unsupported asymmetric key size");
return -ENOTSUP;
}
memcpy(&msg.payload.session.u.asym.key, sess->key,
sess->keylen);
}
} else {
CryptoDevBackendSymSessionInfo *sess = &backend_info->u.sym_sess_info;
size_t keylen;
memcpy(&msg.payload.session.u.sym.session_setup_data, sess,
sizeof(CryptoDevBackendSymSessionInfo));
if (sess->key_len) {
keylen = sizeof(msg.payload.session.u.sym.key);
if (sess->key_len > keylen) {
error_report("Unsupported cipher key size");
return -ENOTSUP;
}
memcpy(&msg.payload.session.u.sym.key, sess->cipher_key,
sess->key_len);
}
if (sess->auth_key_len > 0) {
keylen = sizeof(msg.payload.session.u.sym.auth_key);
if (sess->auth_key_len > keylen) {
error_report("Unsupported auth key size");
return -ENOTSUP;
}
memcpy(&msg.payload.session.u.sym.auth_key, sess->auth_key,
sess->auth_key_len);
}
}
msg.payload.session.op_code = backend_info->op_code;
msg.payload.session.session_id = backend_info->session_id;
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_report("vhost_user_write() return %d, create session failed",
ret);
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_report("vhost_user_read() return %d, create session failed",
ret);
return ret;
}
if (msg.hdr.request != VHOST_USER_CREATE_CRYPTO_SESSION) {
error_report("Received unexpected msg type. Expected %d received %d",
VHOST_USER_CREATE_CRYPTO_SESSION, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.session)) {
error_report("Received bad msg size.");
return -EPROTO;
}
if (msg.payload.session.session_id < 0) {
error_report("Bad session id: %" PRId64 "",
msg.payload.session.session_id);
return -EINVAL;
}
*session_id = msg.payload.session.session_id;
return 0;
}
static int
vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id)
{
int ret;
bool crypto_session = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CRYPTO_SESSION);
VhostUserMsg msg = {
.hdr.request = VHOST_USER_CLOSE_CRYPTO_SESSION,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = sizeof(msg.payload.u64),
};
msg.payload.u64 = session_id;
if (!crypto_session) {
error_report("vhost-user trying to send unhandled ioctl");
return -ENOTSUP;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_report("vhost_user_write() return %d, close session failed",
ret);
return ret;
}
return 0;
}
static bool vhost_user_no_private_memslots(struct vhost_dev *dev)
{
return true;
}
static int vhost_user_get_inflight_fd(struct vhost_dev *dev,
uint16_t queue_size,
struct vhost_inflight *inflight)
{
void *addr;
int fd;
int ret;
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_INFLIGHT_FD,
.hdr.flags = VHOST_USER_VERSION,
.payload.inflight.num_queues = dev->nvqs,
.payload.inflight.queue_size = queue_size,
.hdr.size = sizeof(msg.payload.inflight),
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
return 0;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_INFLIGHT_FD) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
VHOST_USER_GET_INFLIGHT_FD, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.inflight)) {
error_report("Received bad msg size.");
return -EPROTO;
}
if (!msg.payload.inflight.mmap_size) {
return 0;
}
fd = qemu_chr_fe_get_msgfd(chr);
if (fd < 0) {
error_report("Failed to get mem fd");
return -EIO;
}
addr = mmap(0, msg.payload.inflight.mmap_size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, msg.payload.inflight.mmap_offset);
if (addr == MAP_FAILED) {
error_report("Failed to mmap mem fd");
close(fd);
return -EFAULT;
}
inflight->addr = addr;
inflight->fd = fd;
inflight->size = msg.payload.inflight.mmap_size;
inflight->offset = msg.payload.inflight.mmap_offset;
inflight->queue_size = queue_size;
return 0;
}
static int vhost_user_set_inflight_fd(struct vhost_dev *dev,
struct vhost_inflight *inflight)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_INFLIGHT_FD,
.hdr.flags = VHOST_USER_VERSION,
.payload.inflight.mmap_size = inflight->size,
.payload.inflight.mmap_offset = inflight->offset,
.payload.inflight.num_queues = dev->nvqs,
.payload.inflight.queue_size = inflight->queue_size,
.hdr.size = sizeof(msg.payload.inflight),
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
return 0;
}
return vhost_user_write(dev, &msg, &inflight->fd, 1);
}
static void vhost_user_state_destroy(gpointer data)
{
VhostUserHostNotifier *n = (VhostUserHostNotifier *) data;
if (n) {
vhost_user_host_notifier_remove(n, NULL);
object_unparent(OBJECT(&n->mr));
/*
* We can't free until vhost_user_host_notifier_remove has
* done it's thing so schedule the free with RCU.
*/
g_free_rcu(n, rcu);
}
}
bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
{
if (user->chr) {
error_setg(errp, "Cannot initialize vhost-user state");
return false;
}
user->chr = chr;
user->memory_slots = 0;
user->notifiers = g_ptr_array_new_full(VIRTIO_QUEUE_MAX / 4,
&vhost_user_state_destroy);
return true;
}
void vhost_user_cleanup(VhostUserState *user)
{
if (!user->chr) {
return;
}
memory_region_transaction_begin();
user->notifiers = (GPtrArray *) g_ptr_array_free(user->notifiers, true);
memory_region_transaction_commit();
user->chr = NULL;
}
typedef struct {
vu_async_close_fn cb;
DeviceState *dev;
CharBackend *cd;
struct vhost_dev *vhost;
IOEventHandler *event_cb;
} VhostAsyncCallback;
static void vhost_user_async_close_bh(void *opaque)
{
VhostAsyncCallback *data = opaque;
struct vhost_dev *vhost = data->vhost;
/*
* If the vhost_dev has been cleared in the meantime there is
* nothing left to do as some other path has completed the
* cleanup.
*/
if (vhost->vdev) {
data->cb(data->dev);
} else if (data->event_cb) {
qemu_chr_fe_set_handlers(data->cd, NULL, NULL, data->event_cb,
NULL, data->dev, NULL, true);
}
g_free(data);
}
/*
* We only schedule the work if the machine is running. If suspended
* we want to keep all the in-flight data as is for migration
* purposes.
*/
void vhost_user_async_close(DeviceState *d,
CharBackend *chardev, struct vhost_dev *vhost,
vu_async_close_fn cb,
IOEventHandler *event_cb)
{
if (!runstate_check(RUN_STATE_SHUTDOWN)) {
/*
* A close event may happen during a read/write, but vhost
* code assumes the vhost_dev remains setup, so delay the
* stop & clear.
*/
AioContext *ctx = qemu_get_current_aio_context();
VhostAsyncCallback *data = g_new0(VhostAsyncCallback, 1);
/* Save data for the callback */
data->cb = cb;
data->dev = d;
data->cd = chardev;
data->vhost = vhost;
data->event_cb = event_cb;
/* Disable any further notifications on the chardev */
qemu_chr_fe_set_handlers(chardev,
NULL, NULL, NULL, NULL, NULL, NULL,
false);
aio_bh_schedule_oneshot(ctx, vhost_user_async_close_bh, data);
/*
* Move vhost device to the stopped state. The vhost-user device
* will be clean up and disconnected in BH. This can be useful in
* the vhost migration code. If disconnect was caught there is an
* option for the general vhost code to get the dev state without
* knowing its type (in this case vhost-user).
*
* Note if the vhost device is fully cleared by the time we
* execute the bottom half we won't continue with the cleanup.
*/
vhost->started = false;
}
}
static int vhost_user_dev_start(struct vhost_dev *dev, bool started)
{
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_STATUS)) {
return 0;
}
/* Set device status only for last queue pair */
if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
return 0;
}
if (started) {
return vhost_user_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
VIRTIO_CONFIG_S_DRIVER |
VIRTIO_CONFIG_S_DRIVER_OK);
} else {
return 0;
}
}
static void vhost_user_reset_status(struct vhost_dev *dev)
{
/* Set device status only for last queue pair */
if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
return;
}
if (virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_STATUS)) {
vhost_user_set_status(dev, 0);
}
}
static bool vhost_user_supports_device_state(struct vhost_dev *dev)
{
return virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_DEVICE_STATE);
}
static int vhost_user_set_device_state_fd(struct vhost_dev *dev,
VhostDeviceStateDirection direction,
VhostDeviceStatePhase phase,
int fd,
int *reply_fd,
Error **errp)
{
int ret;
struct vhost_user *vu = dev->opaque;
VhostUserMsg msg = {
.hdr = {
.request = VHOST_USER_SET_DEVICE_STATE_FD,
.flags = VHOST_USER_VERSION,
.size = sizeof(msg.payload.transfer_state),
},
.payload.transfer_state = {
.direction = direction,
.phase = phase,
},
};
*reply_fd = -1;
if (!vhost_user_supports_device_state(dev)) {
close(fd);
error_setg(errp, "Back-end does not support migration state transfer");
return -ENOTSUP;
}
ret = vhost_user_write(dev, &msg, &fd, 1);
close(fd);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to send SET_DEVICE_STATE_FD message");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to receive SET_DEVICE_STATE_FD reply");
return ret;
}
if (msg.hdr.request != VHOST_USER_SET_DEVICE_STATE_FD) {
error_setg(errp,
"Received unexpected message type, expected %d, received %d",
VHOST_USER_SET_DEVICE_STATE_FD, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.u64)) {
error_setg(errp,
"Received bad message size, expected %zu, received %" PRIu32,
sizeof(msg.payload.u64), msg.hdr.size);
return -EPROTO;
}
if ((msg.payload.u64 & 0xff) != 0) {
error_setg(errp, "Back-end did not accept migration state transfer");
return -EIO;
}
if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
*reply_fd = qemu_chr_fe_get_msgfd(vu->user->chr);
if (*reply_fd < 0) {
error_setg(errp,
"Failed to get back-end-provided transfer pipe FD");
*reply_fd = -1;
return -EIO;
}
}
return 0;
}
static int vhost_user_check_device_state(struct vhost_dev *dev, Error **errp)
{
int ret;
VhostUserMsg msg = {
.hdr = {
.request = VHOST_USER_CHECK_DEVICE_STATE,
.flags = VHOST_USER_VERSION,
.size = 0,
},
};
if (!vhost_user_supports_device_state(dev)) {
error_setg(errp, "Back-end does not support migration state transfer");
return -ENOTSUP;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to send CHECK_DEVICE_STATE message");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to receive CHECK_DEVICE_STATE reply");
return ret;
}
if (msg.hdr.request != VHOST_USER_CHECK_DEVICE_STATE) {
error_setg(errp,
"Received unexpected message type, expected %d, received %d",
VHOST_USER_CHECK_DEVICE_STATE, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.u64)) {
error_setg(errp,
"Received bad message size, expected %zu, received %" PRIu32,
sizeof(msg.payload.u64), msg.hdr.size);
return -EPROTO;
}
if (msg.payload.u64 != 0) {
error_setg(errp, "Back-end failed to process its internal state");
return -EIO;
}
return 0;
}
const VhostOps user_ops = {
.backend_type = VHOST_BACKEND_TYPE_USER,
.vhost_backend_init = vhost_user_backend_init,
.vhost_backend_cleanup = vhost_user_backend_cleanup,
.vhost_backend_memslots_limit = vhost_user_memslots_limit,
.vhost_backend_no_private_memslots = vhost_user_no_private_memslots,
.vhost_set_log_base = vhost_user_set_log_base,
.vhost_set_mem_table = vhost_user_set_mem_table,
.vhost_set_vring_addr = vhost_user_set_vring_addr,
.vhost_set_vring_endian = vhost_user_set_vring_endian,
.vhost_set_vring_num = vhost_user_set_vring_num,
.vhost_set_vring_base = vhost_user_set_vring_base,
.vhost_get_vring_base = vhost_user_get_vring_base,
.vhost_set_vring_kick = vhost_user_set_vring_kick,
.vhost_set_vring_call = vhost_user_set_vring_call,
.vhost_set_vring_err = vhost_user_set_vring_err,
.vhost_set_features = vhost_user_set_features,
.vhost_get_features = vhost_user_get_features,
.vhost_set_owner = vhost_user_set_owner,
.vhost_reset_device = vhost_user_reset_device,
.vhost_get_vq_index = vhost_user_get_vq_index,
.vhost_set_vring_enable = vhost_user_set_vring_enable,
.vhost_requires_shm_log = vhost_user_requires_shm_log,
.vhost_migration_done = vhost_user_migration_done,
.vhost_net_set_mtu = vhost_user_net_set_mtu,
.vhost_set_iotlb_callback = vhost_user_set_iotlb_callback,
.vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg,
.vhost_get_config = vhost_user_get_config,
.vhost_set_config = vhost_user_set_config,
.vhost_crypto_create_session = vhost_user_crypto_create_session,
.vhost_crypto_close_session = vhost_user_crypto_close_session,
.vhost_get_inflight_fd = vhost_user_get_inflight_fd,
.vhost_set_inflight_fd = vhost_user_set_inflight_fd,
.vhost_dev_start = vhost_user_dev_start,
.vhost_reset_status = vhost_user_reset_status,
.vhost_supports_device_state = vhost_user_supports_device_state,
.vhost_set_device_state_fd = vhost_user_set_device_state_fd,
.vhost_check_device_state = vhost_user_check_device_state,
};