qemu/hw/virtio/vhost-user.c
Steve Sistare be19d836cd notify: pass error to notifier with return
Pass an error object as the third parameter to "notifier with return"
notifiers, so clients no longer need to bundle an error object in the
opaque data.  The new parameter is used in a later patch.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/1708622920-68779-2-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2024-02-28 11:31:28 +08:00

3040 lines
93 KiB
C

/*
* vhost-user
*
* Copyright (c) 2013 Virtual Open Systems Sarl.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/virtio/virtio-dmabuf.h"
#include "hw/virtio/vhost.h"
#include "hw/virtio/virtio-crypto.h"
#include "hw/virtio/vhost-user.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-net.h"
#include "chardev/char-fe.h"
#include "io/channel-socket.h"
#include "sysemu/kvm.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/uuid.h"
#include "qemu/sockets.h"
#include "sysemu/runstate.h"
#include "sysemu/cryptodev.h"
#include "migration/migration.h"
#include "migration/postcopy-ram.h"
#include "trace.h"
#include "exec/ramblock.h"
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "standard-headers/linux/vhost_types.h"
#ifdef CONFIG_LINUX
#include <linux/userfaultfd.h>
#endif
#define VHOST_MEMORY_BASELINE_NREGIONS 8
#define VHOST_USER_F_PROTOCOL_FEATURES 30
#define VHOST_USER_BACKEND_MAX_FDS 8
#if defined(TARGET_PPC) || defined(TARGET_PPC64)
#include "hw/ppc/spapr.h"
#define VHOST_USER_MAX_RAM_SLOTS SPAPR_MAX_RAM_SLOTS
#else
#define VHOST_USER_MAX_RAM_SLOTS 512
#endif
/*
* Maximum size of virtio device config space
*/
#define VHOST_USER_MAX_CONFIG_SIZE 256
#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
typedef enum VhostUserRequest {
VHOST_USER_NONE = 0,
VHOST_USER_GET_FEATURES = 1,
VHOST_USER_SET_FEATURES = 2,
VHOST_USER_SET_OWNER = 3,
VHOST_USER_RESET_OWNER = 4,
VHOST_USER_SET_MEM_TABLE = 5,
VHOST_USER_SET_LOG_BASE = 6,
VHOST_USER_SET_LOG_FD = 7,
VHOST_USER_SET_VRING_NUM = 8,
VHOST_USER_SET_VRING_ADDR = 9,
VHOST_USER_SET_VRING_BASE = 10,
VHOST_USER_GET_VRING_BASE = 11,
VHOST_USER_SET_VRING_KICK = 12,
VHOST_USER_SET_VRING_CALL = 13,
VHOST_USER_SET_VRING_ERR = 14,
VHOST_USER_GET_PROTOCOL_FEATURES = 15,
VHOST_USER_SET_PROTOCOL_FEATURES = 16,
VHOST_USER_GET_QUEUE_NUM = 17,
VHOST_USER_SET_VRING_ENABLE = 18,
VHOST_USER_SEND_RARP = 19,
VHOST_USER_NET_SET_MTU = 20,
VHOST_USER_SET_BACKEND_REQ_FD = 21,
VHOST_USER_IOTLB_MSG = 22,
VHOST_USER_SET_VRING_ENDIAN = 23,
VHOST_USER_GET_CONFIG = 24,
VHOST_USER_SET_CONFIG = 25,
VHOST_USER_CREATE_CRYPTO_SESSION = 26,
VHOST_USER_CLOSE_CRYPTO_SESSION = 27,
VHOST_USER_POSTCOPY_ADVISE = 28,
VHOST_USER_POSTCOPY_LISTEN = 29,
VHOST_USER_POSTCOPY_END = 30,
VHOST_USER_GET_INFLIGHT_FD = 31,
VHOST_USER_SET_INFLIGHT_FD = 32,
VHOST_USER_GPU_SET_SOCKET = 33,
VHOST_USER_RESET_DEVICE = 34,
/* Message number 35 reserved for VHOST_USER_VRING_KICK. */
VHOST_USER_GET_MAX_MEM_SLOTS = 36,
VHOST_USER_ADD_MEM_REG = 37,
VHOST_USER_REM_MEM_REG = 38,
VHOST_USER_SET_STATUS = 39,
VHOST_USER_GET_STATUS = 40,
VHOST_USER_GET_SHARED_OBJECT = 41,
VHOST_USER_SET_DEVICE_STATE_FD = 42,
VHOST_USER_CHECK_DEVICE_STATE = 43,
VHOST_USER_MAX
} VhostUserRequest;
typedef enum VhostUserBackendRequest {
VHOST_USER_BACKEND_NONE = 0,
VHOST_USER_BACKEND_IOTLB_MSG = 1,
VHOST_USER_BACKEND_CONFIG_CHANGE_MSG = 2,
VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG = 3,
VHOST_USER_BACKEND_SHARED_OBJECT_ADD = 6,
VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE = 7,
VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP = 8,
VHOST_USER_BACKEND_MAX
} VhostUserBackendRequest;
typedef struct VhostUserMemoryRegion {
uint64_t guest_phys_addr;
uint64_t memory_size;
uint64_t userspace_addr;
uint64_t mmap_offset;
} VhostUserMemoryRegion;
typedef struct VhostUserMemory {
uint32_t nregions;
uint32_t padding;
VhostUserMemoryRegion regions[VHOST_MEMORY_BASELINE_NREGIONS];
} VhostUserMemory;
typedef struct VhostUserMemRegMsg {
uint64_t padding;
VhostUserMemoryRegion region;
} VhostUserMemRegMsg;
typedef struct VhostUserLog {
uint64_t mmap_size;
uint64_t mmap_offset;
} VhostUserLog;
typedef struct VhostUserConfig {
uint32_t offset;
uint32_t size;
uint32_t flags;
uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
} VhostUserConfig;
#define VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN 512
#define VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN 64
#define VHOST_CRYPTO_ASYM_MAX_KEY_LEN 1024
typedef struct VhostUserCryptoSession {
uint64_t op_code;
union {
struct {
CryptoDevBackendSymSessionInfo session_setup_data;
uint8_t key[VHOST_CRYPTO_SYM_CIPHER_MAX_KEY_LEN];
uint8_t auth_key[VHOST_CRYPTO_SYM_HMAC_MAX_KEY_LEN];
} sym;
struct {
CryptoDevBackendAsymSessionInfo session_setup_data;
uint8_t key[VHOST_CRYPTO_ASYM_MAX_KEY_LEN];
} asym;
} u;
/* session id for success, -1 on errors */
int64_t session_id;
} VhostUserCryptoSession;
static VhostUserConfig c __attribute__ ((unused));
#define VHOST_USER_CONFIG_HDR_SIZE (sizeof(c.offset) \
+ sizeof(c.size) \
+ sizeof(c.flags))
typedef struct VhostUserVringArea {
uint64_t u64;
uint64_t size;
uint64_t offset;
} VhostUserVringArea;
typedef struct VhostUserInflight {
uint64_t mmap_size;
uint64_t mmap_offset;
uint16_t num_queues;
uint16_t queue_size;
} VhostUserInflight;
typedef struct VhostUserShared {
unsigned char uuid[16];
} VhostUserShared;
typedef struct {
VhostUserRequest request;
#define VHOST_USER_VERSION_MASK (0x3)
#define VHOST_USER_REPLY_MASK (0x1 << 2)
#define VHOST_USER_NEED_REPLY_MASK (0x1 << 3)
uint32_t flags;
uint32_t size; /* the following payload size */
} QEMU_PACKED VhostUserHeader;
/* Request payload of VHOST_USER_SET_DEVICE_STATE_FD */
typedef struct VhostUserTransferDeviceState {
uint32_t direction;
uint32_t phase;
} VhostUserTransferDeviceState;
typedef union {
#define VHOST_USER_VRING_IDX_MASK (0xff)
#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8)
uint64_t u64;
struct vhost_vring_state state;
struct vhost_vring_addr addr;
VhostUserMemory memory;
VhostUserMemRegMsg mem_reg;
VhostUserLog log;
struct vhost_iotlb_msg iotlb;
VhostUserConfig config;
VhostUserCryptoSession session;
VhostUserVringArea area;
VhostUserInflight inflight;
VhostUserShared object;
VhostUserTransferDeviceState transfer_state;
} VhostUserPayload;
typedef struct VhostUserMsg {
VhostUserHeader hdr;
VhostUserPayload payload;
} QEMU_PACKED VhostUserMsg;
static VhostUserMsg m __attribute__ ((unused));
#define VHOST_USER_HDR_SIZE (sizeof(VhostUserHeader))
#define VHOST_USER_PAYLOAD_SIZE (sizeof(VhostUserPayload))
/* The version of the protocol we support */
#define VHOST_USER_VERSION (0x1)
struct vhost_user {
struct vhost_dev *dev;
/* Shared between vhost devs of the same virtio device */
VhostUserState *user;
QIOChannel *backend_ioc;
GSource *backend_src;
NotifierWithReturn postcopy_notifier;
struct PostCopyFD postcopy_fd;
uint64_t postcopy_client_bases[VHOST_USER_MAX_RAM_SLOTS];
/* Length of the region_rb and region_rb_offset arrays */
size_t region_rb_len;
/* RAMBlock associated with a given region */
RAMBlock **region_rb;
/*
* The offset from the start of the RAMBlock to the start of the
* vhost region.
*/
ram_addr_t *region_rb_offset;
/* True once we've entered postcopy_listen */
bool postcopy_listen;
/* Our current regions */
int num_shadow_regions;
struct vhost_memory_region shadow_regions[VHOST_USER_MAX_RAM_SLOTS];
};
struct scrub_regions {
struct vhost_memory_region *region;
int reg_idx;
int fd_idx;
};
static int vhost_user_read_header(struct vhost_dev *dev, VhostUserMsg *msg)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
uint8_t *p = (uint8_t *) msg;
int r, size = VHOST_USER_HDR_SIZE;
r = qemu_chr_fe_read_all(chr, p, size);
if (r != size) {
int saved_errno = errno;
error_report("Failed to read msg header. Read %d instead of %d."
" Original request %d.", r, size, msg->hdr.request);
return r < 0 ? -saved_errno : -EIO;
}
/* validate received flags */
if (msg->hdr.flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) {
error_report("Failed to read msg header."
" Flags 0x%x instead of 0x%x.", msg->hdr.flags,
VHOST_USER_REPLY_MASK | VHOST_USER_VERSION);
return -EPROTO;
}
trace_vhost_user_read(msg->hdr.request, msg->hdr.flags);
return 0;
}
static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
uint8_t *p = (uint8_t *) msg;
int r, size;
r = vhost_user_read_header(dev, msg);
if (r < 0) {
return r;
}
/* validate message size is sane */
if (msg->hdr.size > VHOST_USER_PAYLOAD_SIZE) {
error_report("Failed to read msg header."
" Size %d exceeds the maximum %zu.", msg->hdr.size,
VHOST_USER_PAYLOAD_SIZE);
return -EPROTO;
}
if (msg->hdr.size) {
p += VHOST_USER_HDR_SIZE;
size = msg->hdr.size;
r = qemu_chr_fe_read_all(chr, p, size);
if (r != size) {
int saved_errno = errno;
error_report("Failed to read msg payload."
" Read %d instead of %d.", r, msg->hdr.size);
return r < 0 ? -saved_errno : -EIO;
}
}
return 0;
}
static int process_message_reply(struct vhost_dev *dev,
const VhostUserMsg *msg)
{
int ret;
VhostUserMsg msg_reply;
if ((msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
return 0;
}
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
}
if (msg_reply.hdr.request != msg->hdr.request) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
msg->hdr.request, msg_reply.hdr.request);
return -EPROTO;
}
return msg_reply.payload.u64 ? -EIO : 0;
}
static bool vhost_user_per_device_request(VhostUserRequest request)
{
switch (request) {
case VHOST_USER_SET_OWNER:
case VHOST_USER_RESET_OWNER:
case VHOST_USER_SET_MEM_TABLE:
case VHOST_USER_GET_QUEUE_NUM:
case VHOST_USER_NET_SET_MTU:
case VHOST_USER_RESET_DEVICE:
case VHOST_USER_ADD_MEM_REG:
case VHOST_USER_REM_MEM_REG:
return true;
default:
return false;
}
}
/* most non-init callers ignore the error */
static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg,
int *fds, int fd_num)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
int ret, size = VHOST_USER_HDR_SIZE + msg->hdr.size;
/*
* Some devices, like virtio-scsi, are implemented as a single vhost_dev,
* while others, like virtio-net, contain multiple vhost_devs. For
* operations such as configuring device memory mappings or issuing device
* resets, which affect the whole device instead of individual VQs,
* vhost-user messages should only be sent once.
*
* Devices with multiple vhost_devs are given an associated dev->vq_index
* so per_device requests are only sent if vq_index is 0.
*/
if (vhost_user_per_device_request(msg->hdr.request)
&& dev->vq_index != 0) {
msg->hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
return 0;
}
if (qemu_chr_fe_set_msgfds(chr, fds, fd_num) < 0) {
error_report("Failed to set msg fds.");
return -EINVAL;
}
ret = qemu_chr_fe_write_all(chr, (const uint8_t *) msg, size);
if (ret != size) {
int saved_errno = errno;
error_report("Failed to write msg."
" Wrote %d instead of %d.", ret, size);
return ret < 0 ? -saved_errno : -EIO;
}
trace_vhost_user_write(msg->hdr.request, msg->hdr.flags);
return 0;
}
int vhost_user_gpu_set_socket(struct vhost_dev *dev, int fd)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GPU_SET_SOCKET,
.hdr.flags = VHOST_USER_VERSION,
};
return vhost_user_write(dev, &msg, &fd, 1);
}
static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
struct vhost_log *log)
{
int fds[VHOST_USER_MAX_RAM_SLOTS];
size_t fd_num = 0;
bool shmfd = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_LOG_SHMFD);
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_LOG_BASE,
.hdr.flags = VHOST_USER_VERSION,
.payload.log.mmap_size = log->size * sizeof(*(log->log)),
.payload.log.mmap_offset = 0,
.hdr.size = sizeof(msg.payload.log),
};
/* Send only once with first queue pair */
if (dev->vq_index != 0) {
return 0;
}
if (shmfd && log->fd != -1) {
fds[fd_num++] = log->fd;
}
ret = vhost_user_write(dev, &msg, fds, fd_num);
if (ret < 0) {
return ret;
}
if (shmfd) {
msg.hdr.size = 0;
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_SET_LOG_BASE) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
VHOST_USER_SET_LOG_BASE, msg.hdr.request);
return -EPROTO;
}
}
return 0;
}
static MemoryRegion *vhost_user_get_mr_data(uint64_t addr, ram_addr_t *offset,
int *fd)
{
MemoryRegion *mr;
assert((uintptr_t)addr == addr);
mr = memory_region_from_host((void *)(uintptr_t)addr, offset);
*fd = memory_region_get_fd(mr);
*offset += mr->ram_block->fd_offset;
return mr;
}
static void vhost_user_fill_msg_region(VhostUserMemoryRegion *dst,
struct vhost_memory_region *src,
uint64_t mmap_offset)
{
assert(src != NULL && dst != NULL);
dst->userspace_addr = src->userspace_addr;
dst->memory_size = src->memory_size;
dst->guest_phys_addr = src->guest_phys_addr;
dst->mmap_offset = mmap_offset;
}
static int vhost_user_fill_set_mem_table_msg(struct vhost_user *u,
struct vhost_dev *dev,
VhostUserMsg *msg,
int *fds, size_t *fd_num,
bool track_ramblocks)
{
int i, fd;
ram_addr_t offset;
MemoryRegion *mr;
struct vhost_memory_region *reg;
VhostUserMemoryRegion region_buffer;
msg->hdr.request = VHOST_USER_SET_MEM_TABLE;
for (i = 0; i < dev->mem->nregions; ++i) {
reg = dev->mem->regions + i;
mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
if (fd > 0) {
if (track_ramblocks) {
assert(*fd_num < VHOST_MEMORY_BASELINE_NREGIONS);
trace_vhost_user_set_mem_table_withfd(*fd_num, mr->name,
reg->memory_size,
reg->guest_phys_addr,
reg->userspace_addr,
offset);
u->region_rb_offset[i] = offset;
u->region_rb[i] = mr->ram_block;
} else if (*fd_num == VHOST_MEMORY_BASELINE_NREGIONS) {
error_report("Failed preparing vhost-user memory table msg");
return -ENOBUFS;
}
vhost_user_fill_msg_region(&region_buffer, reg, offset);
msg->payload.memory.regions[*fd_num] = region_buffer;
fds[(*fd_num)++] = fd;
} else if (track_ramblocks) {
u->region_rb_offset[i] = 0;
u->region_rb[i] = NULL;
}
}
msg->payload.memory.nregions = *fd_num;
if (!*fd_num) {
error_report("Failed initializing vhost-user memory map, "
"consider using -object memory-backend-file share=on");
return -EINVAL;
}
msg->hdr.size = sizeof(msg->payload.memory.nregions);
msg->hdr.size += sizeof(msg->payload.memory.padding);
msg->hdr.size += *fd_num * sizeof(VhostUserMemoryRegion);
return 0;
}
static inline bool reg_equal(struct vhost_memory_region *shadow_reg,
struct vhost_memory_region *vdev_reg)
{
return shadow_reg->guest_phys_addr == vdev_reg->guest_phys_addr &&
shadow_reg->userspace_addr == vdev_reg->userspace_addr &&
shadow_reg->memory_size == vdev_reg->memory_size;
}
static void scrub_shadow_regions(struct vhost_dev *dev,
struct scrub_regions *add_reg,
int *nr_add_reg,
struct scrub_regions *rem_reg,
int *nr_rem_reg, uint64_t *shadow_pcb,
bool track_ramblocks)
{
struct vhost_user *u = dev->opaque;
bool found[VHOST_USER_MAX_RAM_SLOTS] = {};
struct vhost_memory_region *reg, *shadow_reg;
int i, j, fd, add_idx = 0, rm_idx = 0, fd_num = 0;
ram_addr_t offset;
MemoryRegion *mr;
bool matching;
/*
* Find memory regions present in our shadow state which are not in
* the device's current memory state.
*
* Mark regions in both the shadow and device state as "found".
*/
for (i = 0; i < u->num_shadow_regions; i++) {
shadow_reg = &u->shadow_regions[i];
matching = false;
for (j = 0; j < dev->mem->nregions; j++) {
reg = &dev->mem->regions[j];
mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
if (reg_equal(shadow_reg, reg)) {
matching = true;
found[j] = true;
if (track_ramblocks) {
/*
* Reset postcopy client bases, region_rb, and
* region_rb_offset in case regions are removed.
*/
if (fd > 0) {
u->region_rb_offset[j] = offset;
u->region_rb[j] = mr->ram_block;
shadow_pcb[j] = u->postcopy_client_bases[i];
} else {
u->region_rb_offset[j] = 0;
u->region_rb[j] = NULL;
}
}
break;
}
}
/*
* If the region was not found in the current device memory state
* create an entry for it in the removed list.
*/
if (!matching) {
rem_reg[rm_idx].region = shadow_reg;
rem_reg[rm_idx++].reg_idx = i;
}
}
/*
* For regions not marked "found", create entries in the added list.
*
* Note their indexes in the device memory state and the indexes of their
* file descriptors.
*/
for (i = 0; i < dev->mem->nregions; i++) {
reg = &dev->mem->regions[i];
vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
if (fd > 0) {
++fd_num;
}
/*
* If the region was in both the shadow and device state we don't
* need to send a VHOST_USER_ADD_MEM_REG message for it.
*/
if (found[i]) {
continue;
}
add_reg[add_idx].region = reg;
add_reg[add_idx].reg_idx = i;
add_reg[add_idx++].fd_idx = fd_num;
}
*nr_rem_reg = rm_idx;
*nr_add_reg = add_idx;
return;
}
static int send_remove_regions(struct vhost_dev *dev,
struct scrub_regions *remove_reg,
int nr_rem_reg, VhostUserMsg *msg,
bool reply_supported)
{
struct vhost_user *u = dev->opaque;
struct vhost_memory_region *shadow_reg;
int i, fd, shadow_reg_idx, ret;
ram_addr_t offset;
VhostUserMemoryRegion region_buffer;
/*
* The regions in remove_reg appear in the same order they do in the
* shadow table. Therefore we can minimize memory copies by iterating
* through remove_reg backwards.
*/
for (i = nr_rem_reg - 1; i >= 0; i--) {
shadow_reg = remove_reg[i].region;
shadow_reg_idx = remove_reg[i].reg_idx;
vhost_user_get_mr_data(shadow_reg->userspace_addr, &offset, &fd);
if (fd > 0) {
msg->hdr.request = VHOST_USER_REM_MEM_REG;
vhost_user_fill_msg_region(&region_buffer, shadow_reg, 0);
msg->payload.mem_reg.region = region_buffer;
ret = vhost_user_write(dev, msg, NULL, 0);
if (ret < 0) {
return ret;
}
if (reply_supported) {
ret = process_message_reply(dev, msg);
if (ret) {
return ret;
}
}
}
/*
* At this point we know the backend has unmapped the region. It is now
* safe to remove it from the shadow table.
*/
memmove(&u->shadow_regions[shadow_reg_idx],
&u->shadow_regions[shadow_reg_idx + 1],
sizeof(struct vhost_memory_region) *
(u->num_shadow_regions - shadow_reg_idx - 1));
u->num_shadow_regions--;
}
return 0;
}
static int send_add_regions(struct vhost_dev *dev,
struct scrub_regions *add_reg, int nr_add_reg,
VhostUserMsg *msg, uint64_t *shadow_pcb,
bool reply_supported, bool track_ramblocks)
{
struct vhost_user *u = dev->opaque;
int i, fd, ret, reg_idx, reg_fd_idx;
struct vhost_memory_region *reg;
MemoryRegion *mr;
ram_addr_t offset;
VhostUserMsg msg_reply;
VhostUserMemoryRegion region_buffer;
for (i = 0; i < nr_add_reg; i++) {
reg = add_reg[i].region;
reg_idx = add_reg[i].reg_idx;
reg_fd_idx = add_reg[i].fd_idx;
mr = vhost_user_get_mr_data(reg->userspace_addr, &offset, &fd);
if (fd > 0) {
if (track_ramblocks) {
trace_vhost_user_set_mem_table_withfd(reg_fd_idx, mr->name,
reg->memory_size,
reg->guest_phys_addr,
reg->userspace_addr,
offset);
u->region_rb_offset[reg_idx] = offset;
u->region_rb[reg_idx] = mr->ram_block;
}
msg->hdr.request = VHOST_USER_ADD_MEM_REG;
vhost_user_fill_msg_region(&region_buffer, reg, offset);
msg->payload.mem_reg.region = region_buffer;
ret = vhost_user_write(dev, msg, &fd, 1);
if (ret < 0) {
return ret;
}
if (track_ramblocks) {
uint64_t reply_gpa;
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
}
reply_gpa = msg_reply.payload.mem_reg.region.guest_phys_addr;
if (msg_reply.hdr.request != VHOST_USER_ADD_MEM_REG) {
error_report("%s: Received unexpected msg type."
"Expected %d received %d", __func__,
VHOST_USER_ADD_MEM_REG,
msg_reply.hdr.request);
return -EPROTO;
}
/*
* We're using the same structure, just reusing one of the
* fields, so it should be the same size.
*/
if (msg_reply.hdr.size != msg->hdr.size) {
error_report("%s: Unexpected size for postcopy reply "
"%d vs %d", __func__, msg_reply.hdr.size,
msg->hdr.size);
return -EPROTO;
}
/* Get the postcopy client base from the backend's reply. */
if (reply_gpa == dev->mem->regions[reg_idx].guest_phys_addr) {
shadow_pcb[reg_idx] =
msg_reply.payload.mem_reg.region.userspace_addr;
trace_vhost_user_set_mem_table_postcopy(
msg_reply.payload.mem_reg.region.userspace_addr,
msg->payload.mem_reg.region.userspace_addr,
reg_fd_idx, reg_idx);
} else {
error_report("%s: invalid postcopy reply for region. "
"Got guest physical address %" PRIX64 ", expected "
"%" PRIX64, __func__, reply_gpa,
dev->mem->regions[reg_idx].guest_phys_addr);
return -EPROTO;
}
} else if (reply_supported) {
ret = process_message_reply(dev, msg);
if (ret) {
return ret;
}
}
} else if (track_ramblocks) {
u->region_rb_offset[reg_idx] = 0;
u->region_rb[reg_idx] = NULL;
}
/*
* At this point, we know the backend has mapped in the new
* region, if the region has a valid file descriptor.
*
* The region should now be added to the shadow table.
*/
u->shadow_regions[u->num_shadow_regions].guest_phys_addr =
reg->guest_phys_addr;
u->shadow_regions[u->num_shadow_regions].userspace_addr =
reg->userspace_addr;
u->shadow_regions[u->num_shadow_regions].memory_size =
reg->memory_size;
u->num_shadow_regions++;
}
return 0;
}
static int vhost_user_add_remove_regions(struct vhost_dev *dev,
VhostUserMsg *msg,
bool reply_supported,
bool track_ramblocks)
{
struct vhost_user *u = dev->opaque;
struct scrub_regions add_reg[VHOST_USER_MAX_RAM_SLOTS];
struct scrub_regions rem_reg[VHOST_USER_MAX_RAM_SLOTS];
uint64_t shadow_pcb[VHOST_USER_MAX_RAM_SLOTS] = {};
int nr_add_reg, nr_rem_reg;
int ret;
msg->hdr.size = sizeof(msg->payload.mem_reg);
/* Find the regions which need to be removed or added. */
scrub_shadow_regions(dev, add_reg, &nr_add_reg, rem_reg, &nr_rem_reg,
shadow_pcb, track_ramblocks);
if (nr_rem_reg) {
ret = send_remove_regions(dev, rem_reg, nr_rem_reg, msg,
reply_supported);
if (ret < 0) {
goto err;
}
}
if (nr_add_reg) {
ret = send_add_regions(dev, add_reg, nr_add_reg, msg, shadow_pcb,
reply_supported, track_ramblocks);
if (ret < 0) {
goto err;
}
}
if (track_ramblocks) {
memcpy(u->postcopy_client_bases, shadow_pcb,
sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
/*
* Now we've registered this with the postcopy code, we ack to the
* client, because now we're in the position to be able to deal with
* any faults it generates.
*/
/* TODO: Use this for failure cases as well with a bad value. */
msg->hdr.size = sizeof(msg->payload.u64);
msg->payload.u64 = 0; /* OK */
ret = vhost_user_write(dev, msg, NULL, 0);
if (ret < 0) {
return ret;
}
}
return 0;
err:
if (track_ramblocks) {
memcpy(u->postcopy_client_bases, shadow_pcb,
sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
}
return ret;
}
static int vhost_user_set_mem_table_postcopy(struct vhost_dev *dev,
struct vhost_memory *mem,
bool reply_supported,
bool config_mem_slots)
{
struct vhost_user *u = dev->opaque;
int fds[VHOST_MEMORY_BASELINE_NREGIONS];
size_t fd_num = 0;
VhostUserMsg msg_reply;
int region_i, msg_i;
int ret;
VhostUserMsg msg = {
.hdr.flags = VHOST_USER_VERSION,
};
if (u->region_rb_len < dev->mem->nregions) {
u->region_rb = g_renew(RAMBlock*, u->region_rb, dev->mem->nregions);
u->region_rb_offset = g_renew(ram_addr_t, u->region_rb_offset,
dev->mem->nregions);
memset(&(u->region_rb[u->region_rb_len]), '\0',
sizeof(RAMBlock *) * (dev->mem->nregions - u->region_rb_len));
memset(&(u->region_rb_offset[u->region_rb_len]), '\0',
sizeof(ram_addr_t) * (dev->mem->nregions - u->region_rb_len));
u->region_rb_len = dev->mem->nregions;
}
if (config_mem_slots) {
ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, true);
if (ret < 0) {
return ret;
}
} else {
ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
true);
if (ret < 0) {
return ret;
}
ret = vhost_user_write(dev, &msg, fds, fd_num);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg_reply);
if (ret < 0) {
return ret;
}
if (msg_reply.hdr.request != VHOST_USER_SET_MEM_TABLE) {
error_report("%s: Received unexpected msg type."
"Expected %d received %d", __func__,
VHOST_USER_SET_MEM_TABLE, msg_reply.hdr.request);
return -EPROTO;
}
/*
* We're using the same structure, just reusing one of the
* fields, so it should be the same size.
*/
if (msg_reply.hdr.size != msg.hdr.size) {
error_report("%s: Unexpected size for postcopy reply "
"%d vs %d", __func__, msg_reply.hdr.size,
msg.hdr.size);
return -EPROTO;
}
memset(u->postcopy_client_bases, 0,
sizeof(uint64_t) * VHOST_USER_MAX_RAM_SLOTS);
/*
* They're in the same order as the regions that were sent
* but some of the regions were skipped (above) if they
* didn't have fd's
*/
for (msg_i = 0, region_i = 0;
region_i < dev->mem->nregions;
region_i++) {
if (msg_i < fd_num &&
msg_reply.payload.memory.regions[msg_i].guest_phys_addr ==
dev->mem->regions[region_i].guest_phys_addr) {
u->postcopy_client_bases[region_i] =
msg_reply.payload.memory.regions[msg_i].userspace_addr;
trace_vhost_user_set_mem_table_postcopy(
msg_reply.payload.memory.regions[msg_i].userspace_addr,
msg.payload.memory.regions[msg_i].userspace_addr,
msg_i, region_i);
msg_i++;
}
}
if (msg_i != fd_num) {
error_report("%s: postcopy reply not fully consumed "
"%d vs %zd",
__func__, msg_i, fd_num);
return -EIO;
}
/*
* Now we've registered this with the postcopy code, we ack to the
* client, because now we're in the position to be able to deal
* with any faults it generates.
*/
/* TODO: Use this for failure cases as well with a bad value. */
msg.hdr.size = sizeof(msg.payload.u64);
msg.payload.u64 = 0; /* OK */
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
}
return 0;
}
static int vhost_user_set_mem_table(struct vhost_dev *dev,
struct vhost_memory *mem)
{
struct vhost_user *u = dev->opaque;
int fds[VHOST_MEMORY_BASELINE_NREGIONS];
size_t fd_num = 0;
bool do_postcopy = u->postcopy_listen && u->postcopy_fd.handler;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
bool config_mem_slots =
virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS);
int ret;
if (do_postcopy) {
/*
* Postcopy has enough differences that it's best done in it's own
* version
*/
return vhost_user_set_mem_table_postcopy(dev, mem, reply_supported,
config_mem_slots);
}
VhostUserMsg msg = {
.hdr.flags = VHOST_USER_VERSION,
};
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
if (config_mem_slots) {
ret = vhost_user_add_remove_regions(dev, &msg, reply_supported, false);
if (ret < 0) {
return ret;
}
} else {
ret = vhost_user_fill_set_mem_table_msg(u, dev, &msg, fds, &fd_num,
false);
if (ret < 0) {
return ret;
}
ret = vhost_user_write(dev, &msg, fds, fd_num);
if (ret < 0) {
return ret;
}
if (reply_supported) {
return process_message_reply(dev, &msg);
}
}
return 0;
}
static int vhost_user_set_vring_endian(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
bool cross_endian = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CROSS_ENDIAN);
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_VRING_ENDIAN,
.hdr.flags = VHOST_USER_VERSION,
.payload.state = *ring,
.hdr.size = sizeof(msg.payload.state),
};
if (!cross_endian) {
error_report("vhost-user trying to send unhandled ioctl");
return -ENOTSUP;
}
return vhost_user_write(dev, &msg, NULL, 0);
}
static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64)
{
int ret;
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
};
if (vhost_user_per_device_request(request) && dev->vq_index != 0) {
return 0;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != request) {
error_report("Received unexpected msg type. Expected %d received %d",
request, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.u64)) {
error_report("Received bad msg size.");
return -EPROTO;
}
*u64 = msg.payload.u64;
return 0;
}
static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features)
{
if (vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features) < 0) {
return -EPROTO;
}
return 0;
}
/* Note: "msg->hdr.flags" may be modified. */
static int vhost_user_write_sync(struct vhost_dev *dev, VhostUserMsg *msg,
bool wait_for_reply)
{
int ret;
if (wait_for_reply) {
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
if (reply_supported) {
msg->hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
}
ret = vhost_user_write(dev, msg, NULL, 0);
if (ret < 0) {
return ret;
}
if (wait_for_reply) {
uint64_t dummy;
if (msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
return process_message_reply(dev, msg);
}
/*
* We need to wait for a reply but the backend does not
* support replies for the command we just sent.
* Send VHOST_USER_GET_FEATURES which makes all backends
* send a reply.
*/
return vhost_user_get_features(dev, &dummy);
}
return 0;
}
static int vhost_set_vring(struct vhost_dev *dev,
unsigned long int request,
struct vhost_vring_state *ring,
bool wait_for_reply)
{
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
.payload.state = *ring,
.hdr.size = sizeof(msg.payload.state),
};
return vhost_user_write_sync(dev, &msg, wait_for_reply);
}
static int vhost_user_set_vring_num(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring, false);
}
static void vhost_user_host_notifier_free(VhostUserHostNotifier *n)
{
assert(n && n->unmap_addr);
munmap(n->unmap_addr, qemu_real_host_page_size());
n->unmap_addr = NULL;
}
/*
* clean-up function for notifier, will finally free the structure
* under rcu.
*/
static void vhost_user_host_notifier_remove(VhostUserHostNotifier *n,
VirtIODevice *vdev)
{
if (n->addr) {
if (vdev) {
virtio_queue_set_host_notifier_mr(vdev, n->idx, &n->mr, false);
}
assert(!n->unmap_addr);
n->unmap_addr = n->addr;
n->addr = NULL;
call_rcu(n, vhost_user_host_notifier_free, rcu);
}
}
static int vhost_user_set_vring_base(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring, false);
}
static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable)
{
int i;
if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) {
return -EINVAL;
}
for (i = 0; i < dev->nvqs; ++i) {
int ret;
struct vhost_vring_state state = {
.index = dev->vq_index + i,
.num = enable,
};
/*
* SET_VRING_ENABLE travels from guest to QEMU to vhost-user backend /
* control plane thread via unix domain socket. Virtio requests travel
* from guest to vhost-user backend / data plane thread via eventfd.
* Even if the guest enables the ring first, and pushes its first virtio
* request second (conforming to the virtio spec), the data plane thread
* in the backend may see the virtio request before the control plane
* thread sees the queue enablement. This causes (in fact, requires) the
* data plane thread to discard the virtio request (it arrived on a
* seemingly disabled queue). To prevent this out-of-order delivery,
* don't let the guest proceed to pushing the virtio request until the
* backend control plane acknowledges enabling the queue -- IOW, pass
* wait_for_reply=true below.
*/
ret = vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state, true);
if (ret < 0) {
/*
* Restoring the previous state is likely infeasible, as well as
* proceeding regardless the error, so just bail out and hope for
* the device-level recovery.
*/
return ret;
}
}
return 0;
}
static VhostUserHostNotifier *fetch_notifier(VhostUserState *u,
int idx)
{
if (idx >= u->notifiers->len) {
return NULL;
}
return g_ptr_array_index(u->notifiers, idx);
}
static int vhost_user_get_vring_base(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_VRING_BASE,
.hdr.flags = VHOST_USER_VERSION,
.payload.state = *ring,
.hdr.size = sizeof(msg.payload.state),
};
struct vhost_user *u = dev->opaque;
VhostUserHostNotifier *n = fetch_notifier(u->user, ring->index);
if (n) {
vhost_user_host_notifier_remove(n, dev->vdev);
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_VRING_BASE) {
error_report("Received unexpected msg type. Expected %d received %d",
VHOST_USER_GET_VRING_BASE, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.state)) {
error_report("Received bad msg size.");
return -EPROTO;
}
*ring = msg.payload.state;
return 0;
}
static int vhost_set_vring_file(struct vhost_dev *dev,
VhostUserRequest request,
struct vhost_vring_file *file)
{
int fds[VHOST_USER_MAX_RAM_SLOTS];
size_t fd_num = 0;
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
.hdr.size = sizeof(msg.payload.u64),
};
if (file->fd > 0) {
fds[fd_num++] = file->fd;
} else {
msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
}
return vhost_user_write(dev, &msg, fds, fd_num);
}
static int vhost_user_set_vring_kick(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_KICK, file);
}
static int vhost_user_set_vring_call(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file);
}
static int vhost_user_set_vring_err(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_ERR, file);
}
static int vhost_user_set_vring_addr(struct vhost_dev *dev,
struct vhost_vring_addr *addr)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_VRING_ADDR,
.hdr.flags = VHOST_USER_VERSION,
.payload.addr = *addr,
.hdr.size = sizeof(msg.payload.addr),
};
/*
* wait for a reply if logging is enabled to make sure
* backend is actually logging changes
*/
bool wait_for_reply = addr->flags & (1 << VHOST_VRING_F_LOG);
return vhost_user_write_sync(dev, &msg, wait_for_reply);
}
static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64,
bool wait_for_reply)
{
VhostUserMsg msg = {
.hdr.request = request,
.hdr.flags = VHOST_USER_VERSION,
.payload.u64 = u64,
.hdr.size = sizeof(msg.payload.u64),
};
return vhost_user_write_sync(dev, &msg, wait_for_reply);
}
static int vhost_user_set_status(struct vhost_dev *dev, uint8_t status)
{
return vhost_user_set_u64(dev, VHOST_USER_SET_STATUS, status, false);
}
static int vhost_user_get_status(struct vhost_dev *dev, uint8_t *status)
{
uint64_t value;
int ret;
ret = vhost_user_get_u64(dev, VHOST_USER_GET_STATUS, &value);
if (ret < 0) {
return ret;
}
*status = value;
return 0;
}
static int vhost_user_add_status(struct vhost_dev *dev, uint8_t status)
{
uint8_t s;
int ret;
ret = vhost_user_get_status(dev, &s);
if (ret < 0) {
return ret;
}
if ((s & status) == status) {
return 0;
}
s |= status;
return vhost_user_set_status(dev, s);
}
static int vhost_user_set_features(struct vhost_dev *dev,
uint64_t features)
{
/*
* wait for a reply if logging is enabled to make sure
* backend is actually logging changes
*/
bool log_enabled = features & (0x1ULL << VHOST_F_LOG_ALL);
int ret;
/*
* We need to include any extra backend only feature bits that
* might be needed by our device. Currently this includes the
* VHOST_USER_F_PROTOCOL_FEATURES bit for enabling protocol
* features.
*/
ret = vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES,
features | dev->backend_features,
log_enabled);
if (virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_STATUS)) {
if (!ret) {
return vhost_user_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
}
}
return ret;
}
static int vhost_user_set_protocol_features(struct vhost_dev *dev,
uint64_t features)
{
return vhost_user_set_u64(dev, VHOST_USER_SET_PROTOCOL_FEATURES, features,
false);
}
static int vhost_user_set_owner(struct vhost_dev *dev)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_OWNER,
.hdr.flags = VHOST_USER_VERSION,
};
return vhost_user_write(dev, &msg, NULL, 0);
}
static int vhost_user_get_max_memslots(struct vhost_dev *dev,
uint64_t *max_memslots)
{
uint64_t backend_max_memslots;
int err;
err = vhost_user_get_u64(dev, VHOST_USER_GET_MAX_MEM_SLOTS,
&backend_max_memslots);
if (err < 0) {
return err;
}
*max_memslots = backend_max_memslots;
return 0;
}
static int vhost_user_reset_device(struct vhost_dev *dev)
{
VhostUserMsg msg = {
.hdr.flags = VHOST_USER_VERSION,
.hdr.request = VHOST_USER_RESET_DEVICE,
};
/*
* Historically, reset was not implemented so only reset devices
* that are expecting it.
*/
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_RESET_DEVICE)) {
return -ENOSYS;
}
return vhost_user_write(dev, &msg, NULL, 0);
}
static int vhost_user_backend_handle_config_change(struct vhost_dev *dev)
{
if (!dev->config_ops || !dev->config_ops->vhost_dev_config_notifier) {
return -ENOSYS;
}
return dev->config_ops->vhost_dev_config_notifier(dev);
}
/*
* Fetch or create the notifier for a given idx. Newly created
* notifiers are added to the pointer array that tracks them.
*/
static VhostUserHostNotifier *fetch_or_create_notifier(VhostUserState *u,
int idx)
{
VhostUserHostNotifier *n = NULL;
if (idx >= u->notifiers->len) {
g_ptr_array_set_size(u->notifiers, idx + 1);
}
n = g_ptr_array_index(u->notifiers, idx);
if (!n) {
/*
* In case notification arrive out-of-order,
* make room for current index.
*/
g_ptr_array_remove_index(u->notifiers, idx);
n = g_new0(VhostUserHostNotifier, 1);
n->idx = idx;
g_ptr_array_insert(u->notifiers, idx, n);
trace_vhost_user_create_notifier(idx, n);
}
return n;
}
static int vhost_user_backend_handle_vring_host_notifier(struct vhost_dev *dev,
VhostUserVringArea *area,
int fd)
{
int queue_idx = area->u64 & VHOST_USER_VRING_IDX_MASK;
size_t page_size = qemu_real_host_page_size();
struct vhost_user *u = dev->opaque;
VhostUserState *user = u->user;
VirtIODevice *vdev = dev->vdev;
VhostUserHostNotifier *n;
void *addr;
char *name;
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) ||
vdev == NULL || queue_idx >= virtio_get_num_queues(vdev)) {
return -EINVAL;
}
/*
* Fetch notifier and invalidate any old data before setting up
* new mapped address.
*/
n = fetch_or_create_notifier(user, queue_idx);
vhost_user_host_notifier_remove(n, vdev);
if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
return 0;
}
/* Sanity check. */
if (area->size != page_size) {
return -EINVAL;
}
addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
fd, area->offset);
if (addr == MAP_FAILED) {
return -EFAULT;
}
name = g_strdup_printf("vhost-user/host-notifier@%p mmaps[%d]",
user, queue_idx);
if (!n->mr.ram) { /* Don't init again after suspend. */
memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
page_size, addr);
} else {
n->mr.ram_block->host = addr;
}
g_free(name);
if (virtio_queue_set_host_notifier_mr(vdev, queue_idx, &n->mr, true)) {
object_unparent(OBJECT(&n->mr));
munmap(addr, page_size);
return -ENXIO;
}
n->addr = addr;
return 0;
}
static int
vhost_user_backend_handle_shared_object_add(struct vhost_dev *dev,
VhostUserShared *object)
{
QemuUUID uuid;
memcpy(uuid.data, object->uuid, sizeof(object->uuid));
return virtio_add_vhost_device(&uuid, dev);
}
static int
vhost_user_backend_handle_shared_object_remove(VhostUserShared *object)
{
QemuUUID uuid;
memcpy(uuid.data, object->uuid, sizeof(object->uuid));
return virtio_remove_resource(&uuid);
}
static bool vhost_user_send_resp(QIOChannel *ioc, VhostUserHeader *hdr,
VhostUserPayload *payload, Error **errp)
{
struct iovec iov[] = {
{ .iov_base = hdr, .iov_len = VHOST_USER_HDR_SIZE },
{ .iov_base = payload, .iov_len = hdr->size },
};
hdr->flags &= ~VHOST_USER_NEED_REPLY_MASK;
hdr->flags |= VHOST_USER_REPLY_MASK;
return !qio_channel_writev_all(ioc, iov, ARRAY_SIZE(iov), errp);
}
static bool
vhost_user_backend_send_dmabuf_fd(QIOChannel *ioc, VhostUserHeader *hdr,
VhostUserPayload *payload, Error **errp)
{
hdr->size = sizeof(payload->u64);
return vhost_user_send_resp(ioc, hdr, payload, errp);
}
int vhost_user_get_shared_object(struct vhost_dev *dev, unsigned char *uuid,
int *dmabuf_fd)
{
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_SHARED_OBJECT,
.hdr.flags = VHOST_USER_VERSION,
};
memcpy(msg.payload.object.uuid, uuid, sizeof(msg.payload.object.uuid));
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_SHARED_OBJECT) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
VHOST_USER_GET_SHARED_OBJECT, msg.hdr.request);
return -EPROTO;
}
*dmabuf_fd = qemu_chr_fe_get_msgfd(chr);
if (*dmabuf_fd < 0) {
error_report("Failed to get dmabuf fd");
return -EIO;
}
return 0;
}
static int
vhost_user_backend_handle_shared_object_lookup(struct vhost_user *u,
QIOChannel *ioc,
VhostUserHeader *hdr,
VhostUserPayload *payload)
{
QemuUUID uuid;
CharBackend *chr = u->user->chr;
Error *local_err = NULL;
int dmabuf_fd = -1;
int fd_num = 0;
memcpy(uuid.data, payload->object.uuid, sizeof(payload->object.uuid));
payload->u64 = 0;
switch (virtio_object_type(&uuid)) {
case TYPE_DMABUF:
dmabuf_fd = virtio_lookup_dmabuf(&uuid);
break;
case TYPE_VHOST_DEV:
{
struct vhost_dev *dev = virtio_lookup_vhost_device(&uuid);
if (dev == NULL) {
payload->u64 = -EINVAL;
break;
}
int ret = vhost_user_get_shared_object(dev, uuid.data, &dmabuf_fd);
if (ret < 0) {
payload->u64 = ret;
}
break;
}
case TYPE_INVALID:
payload->u64 = -EINVAL;
break;
}
if (dmabuf_fd != -1) {
fd_num++;
}
if (qemu_chr_fe_set_msgfds(chr, &dmabuf_fd, fd_num) < 0) {
error_report("Failed to set msg fds.");
payload->u64 = -EINVAL;
}
if (!vhost_user_backend_send_dmabuf_fd(ioc, hdr, payload, &local_err)) {
error_report_err(local_err);
return -EINVAL;
}
return 0;
}
static void close_backend_channel(struct vhost_user *u)
{
g_source_destroy(u->backend_src);
g_source_unref(u->backend_src);
u->backend_src = NULL;
object_unref(OBJECT(u->backend_ioc));
u->backend_ioc = NULL;
}
static gboolean backend_read(QIOChannel *ioc, GIOCondition condition,
gpointer opaque)
{
struct vhost_dev *dev = opaque;
struct vhost_user *u = dev->opaque;
VhostUserHeader hdr = { 0, };
VhostUserPayload payload = { 0, };
Error *local_err = NULL;
gboolean rc = G_SOURCE_CONTINUE;
int ret = 0;
struct iovec iov;
g_autofree int *fd = NULL;
size_t fdsize = 0;
int i;
/* Read header */
iov.iov_base = &hdr;
iov.iov_len = VHOST_USER_HDR_SIZE;
if (qio_channel_readv_full_all(ioc, &iov, 1, &fd, &fdsize, &local_err)) {
error_report_err(local_err);
goto err;
}
if (hdr.size > VHOST_USER_PAYLOAD_SIZE) {
error_report("Failed to read msg header."
" Size %d exceeds the maximum %zu.", hdr.size,
VHOST_USER_PAYLOAD_SIZE);
goto err;
}
/* Read payload */
if (qio_channel_read_all(ioc, (char *) &payload, hdr.size, &local_err)) {
error_report_err(local_err);
goto err;
}
switch (hdr.request) {
case VHOST_USER_BACKEND_IOTLB_MSG:
ret = vhost_backend_handle_iotlb_msg(dev, &payload.iotlb);
break;
case VHOST_USER_BACKEND_CONFIG_CHANGE_MSG:
ret = vhost_user_backend_handle_config_change(dev);
break;
case VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG:
ret = vhost_user_backend_handle_vring_host_notifier(dev, &payload.area,
fd ? fd[0] : -1);
break;
case VHOST_USER_BACKEND_SHARED_OBJECT_ADD:
ret = vhost_user_backend_handle_shared_object_add(dev, &payload.object);
break;
case VHOST_USER_BACKEND_SHARED_OBJECT_REMOVE:
ret = vhost_user_backend_handle_shared_object_remove(&payload.object);
break;
case VHOST_USER_BACKEND_SHARED_OBJECT_LOOKUP:
ret = vhost_user_backend_handle_shared_object_lookup(dev->opaque, ioc,
&hdr, &payload);
break;
default:
error_report("Received unexpected msg type: %d.", hdr.request);
ret = -EINVAL;
}
/*
* REPLY_ACK feature handling. Other reply types has to be managed
* directly in their request handlers.
*/
if (hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
payload.u64 = !!ret;
hdr.size = sizeof(payload.u64);
if (!vhost_user_send_resp(ioc, &hdr, &payload, &local_err)) {
error_report_err(local_err);
goto err;
}
}
goto fdcleanup;
err:
close_backend_channel(u);
rc = G_SOURCE_REMOVE;
fdcleanup:
if (fd) {
for (i = 0; i < fdsize; i++) {
close(fd[i]);
}
}
return rc;
}
static int vhost_setup_backend_channel(struct vhost_dev *dev)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_BACKEND_REQ_FD,
.hdr.flags = VHOST_USER_VERSION,
};
struct vhost_user *u = dev->opaque;
int sv[2], ret = 0;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
Error *local_err = NULL;
QIOChannel *ioc;
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_BACKEND_REQ)) {
return 0;
}
if (qemu_socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
int saved_errno = errno;
error_report("socketpair() failed");
return -saved_errno;
}
ioc = QIO_CHANNEL(qio_channel_socket_new_fd(sv[0], &local_err));
if (!ioc) {
error_report_err(local_err);
return -ECONNREFUSED;
}
u->backend_ioc = ioc;
u->backend_src = qio_channel_add_watch_source(u->backend_ioc,
G_IO_IN | G_IO_HUP,
backend_read, dev, NULL, NULL);
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
ret = vhost_user_write(dev, &msg, &sv[1], 1);
if (ret) {
goto out;
}
if (reply_supported) {
ret = process_message_reply(dev, &msg);
}
out:
close(sv[1]);
if (ret) {
close_backend_channel(u);
}
return ret;
}
#ifdef CONFIG_LINUX
/*
* Called back from the postcopy fault thread when a fault is received on our
* ufd.
* TODO: This is Linux specific
*/
static int vhost_user_postcopy_fault_handler(struct PostCopyFD *pcfd,
void *ufd)
{
struct vhost_dev *dev = pcfd->data;
struct vhost_user *u = dev->opaque;
struct uffd_msg *msg = ufd;
uint64_t faultaddr = msg->arg.pagefault.address;
RAMBlock *rb = NULL;
uint64_t rb_offset;
int i;
trace_vhost_user_postcopy_fault_handler(pcfd->idstr, faultaddr,
dev->mem->nregions);
for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
trace_vhost_user_postcopy_fault_handler_loop(i,
u->postcopy_client_bases[i], dev->mem->regions[i].memory_size);
if (faultaddr >= u->postcopy_client_bases[i]) {
/* Ofset of the fault address in the vhost region */
uint64_t region_offset = faultaddr - u->postcopy_client_bases[i];
if (region_offset < dev->mem->regions[i].memory_size) {
rb_offset = region_offset + u->region_rb_offset[i];
trace_vhost_user_postcopy_fault_handler_found(i,
region_offset, rb_offset);
rb = u->region_rb[i];
return postcopy_request_shared_page(pcfd, rb, faultaddr,
rb_offset);
}
}
}
error_report("%s: Failed to find region for fault %" PRIx64,
__func__, faultaddr);
return -1;
}
static int vhost_user_postcopy_waker(struct PostCopyFD *pcfd, RAMBlock *rb,
uint64_t offset)
{
struct vhost_dev *dev = pcfd->data;
struct vhost_user *u = dev->opaque;
int i;
trace_vhost_user_postcopy_waker(qemu_ram_get_idstr(rb), offset);
if (!u) {
return 0;
}
/* Translate the offset into an address in the clients address space */
for (i = 0; i < MIN(dev->mem->nregions, u->region_rb_len); i++) {
if (u->region_rb[i] == rb &&
offset >= u->region_rb_offset[i] &&
offset < (u->region_rb_offset[i] +
dev->mem->regions[i].memory_size)) {
uint64_t client_addr = (offset - u->region_rb_offset[i]) +
u->postcopy_client_bases[i];
trace_vhost_user_postcopy_waker_found(client_addr);
return postcopy_wake_shared(pcfd, client_addr, rb);
}
}
trace_vhost_user_postcopy_waker_nomatch(qemu_ram_get_idstr(rb), offset);
return 0;
}
#endif
/*
* Called at the start of an inbound postcopy on reception of the
* 'advise' command.
*/
static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp)
{
#ifdef CONFIG_LINUX
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
int ufd;
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_POSTCOPY_ADVISE,
.hdr.flags = VHOST_USER_VERSION,
};
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg(errp, "Failed to send postcopy_advise to vhost");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg(errp, "Failed to get postcopy_advise reply from vhost");
return ret;
}
if (msg.hdr.request != VHOST_USER_POSTCOPY_ADVISE) {
error_setg(errp, "Unexpected msg type. Expected %d received %d",
VHOST_USER_POSTCOPY_ADVISE, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size) {
error_setg(errp, "Received bad msg size.");
return -EPROTO;
}
ufd = qemu_chr_fe_get_msgfd(chr);
if (ufd < 0) {
error_setg(errp, "%s: Failed to get ufd", __func__);
return -EIO;
}
qemu_socket_set_nonblock(ufd);
/* register ufd with userfault thread */
u->postcopy_fd.fd = ufd;
u->postcopy_fd.data = dev;
u->postcopy_fd.handler = vhost_user_postcopy_fault_handler;
u->postcopy_fd.waker = vhost_user_postcopy_waker;
u->postcopy_fd.idstr = "vhost-user"; /* Need to find unique name */
postcopy_register_shared_ufd(&u->postcopy_fd);
return 0;
#else
error_setg(errp, "Postcopy not supported on non-Linux systems");
return -ENOSYS;
#endif
}
/*
* Called at the switch to postcopy on reception of the 'listen' command.
*/
static int vhost_user_postcopy_listen(struct vhost_dev *dev, Error **errp)
{
struct vhost_user *u = dev->opaque;
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_POSTCOPY_LISTEN,
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
};
u->postcopy_listen = true;
trace_vhost_user_postcopy_listen();
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg(errp, "Failed to send postcopy_listen to vhost");
return ret;
}
ret = process_message_reply(dev, &msg);
if (ret) {
error_setg(errp, "Failed to receive reply to postcopy_listen");
return ret;
}
return 0;
}
/*
* Called at the end of postcopy
*/
static int vhost_user_postcopy_end(struct vhost_dev *dev, Error **errp)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_POSTCOPY_END,
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
};
int ret;
struct vhost_user *u = dev->opaque;
trace_vhost_user_postcopy_end_entry();
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg(errp, "Failed to send postcopy_end to vhost");
return ret;
}
ret = process_message_reply(dev, &msg);
if (ret) {
error_setg(errp, "Failed to receive reply to postcopy_end");
return ret;
}
postcopy_unregister_shared_ufd(&u->postcopy_fd);
close(u->postcopy_fd.fd);
u->postcopy_fd.handler = NULL;
trace_vhost_user_postcopy_end_exit();
return 0;
}
static int vhost_user_postcopy_notifier(NotifierWithReturn *notifier,
void *opaque, Error **errp)
{
struct PostcopyNotifyData *pnd = opaque;
struct vhost_user *u = container_of(notifier, struct vhost_user,
postcopy_notifier);
struct vhost_dev *dev = u->dev;
switch (pnd->reason) {
case POSTCOPY_NOTIFY_PROBE:
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_PAGEFAULT)) {
/* TODO: Get the device name into this error somehow */
error_setg(pnd->errp,
"vhost-user backend not capable of postcopy");
return -ENOENT;
}
break;
case POSTCOPY_NOTIFY_INBOUND_ADVISE:
return vhost_user_postcopy_advise(dev, pnd->errp);
case POSTCOPY_NOTIFY_INBOUND_LISTEN:
return vhost_user_postcopy_listen(dev, pnd->errp);
case POSTCOPY_NOTIFY_INBOUND_END:
return vhost_user_postcopy_end(dev, pnd->errp);
default:
/* We ignore notifications we don't know */
break;
}
return 0;
}
static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque,
Error **errp)
{
uint64_t features, ram_slots;
struct vhost_user *u;
VhostUserState *vus = (VhostUserState *) opaque;
int err;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
u = g_new0(struct vhost_user, 1);
u->user = vus;
u->dev = dev;
dev->opaque = u;
err = vhost_user_get_features(dev, &features);
if (err < 0) {
error_setg_errno(errp, -err, "vhost_backend_init failed");
return err;
}
if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
bool supports_f_config = vus->supports_config ||
(dev->config_ops && dev->config_ops->vhost_dev_config_notifier);
uint64_t protocol_features;
dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES,
&protocol_features);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
/*
* We will use all the protocol features we support - although
* we suppress F_CONFIG if we know QEMUs internal code can not support
* it.
*/
protocol_features &= VHOST_USER_PROTOCOL_FEATURE_MASK;
if (supports_f_config) {
if (!virtio_has_feature(protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
error_setg(errp, "vhost-user device expecting "
"VHOST_USER_PROTOCOL_F_CONFIG but the vhost-user backend does "
"not support it.");
return -EPROTO;
}
} else {
if (virtio_has_feature(protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
warn_report("vhost-user backend supports "
"VHOST_USER_PROTOCOL_F_CONFIG but QEMU does not.");
protocol_features &= ~(1ULL << VHOST_USER_PROTOCOL_F_CONFIG);
}
}
/* final set of protocol features */
dev->protocol_features = protocol_features;
err = vhost_user_set_protocol_features(dev, dev->protocol_features);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
/* query the max queues we support if backend supports Multiple Queue */
if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) {
err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM,
&dev->max_queues);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
} else {
dev->max_queues = 1;
}
if (dev->num_queues && dev->max_queues < dev->num_queues) {
error_setg(errp, "The maximum number of queues supported by the "
"backend is %" PRIu64, dev->max_queues);
return -EINVAL;
}
if (virtio_has_feature(features, VIRTIO_F_IOMMU_PLATFORM) &&
!(virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_BACKEND_REQ) &&
virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK))) {
error_setg(errp, "IOMMU support requires reply-ack and "
"backend-req protocol features.");
return -EINVAL;
}
/* get max memory regions if backend supports configurable RAM slots */
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS)) {
u->user->memory_slots = VHOST_MEMORY_BASELINE_NREGIONS;
} else {
err = vhost_user_get_max_memslots(dev, &ram_slots);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
if (ram_slots < u->user->memory_slots) {
error_setg(errp, "The backend specified a max ram slots limit "
"of %" PRIu64", when the prior validated limit was "
"%d. This limit should never decrease.", ram_slots,
u->user->memory_slots);
return -EINVAL;
}
u->user->memory_slots = MIN(ram_slots, VHOST_USER_MAX_RAM_SLOTS);
}
}
if (dev->migration_blocker == NULL &&
!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_LOG_SHMFD)) {
error_setg(&dev->migration_blocker,
"Migration disabled: vhost-user backend lacks "
"VHOST_USER_PROTOCOL_F_LOG_SHMFD feature.");
}
if (dev->vq_index == 0) {
err = vhost_setup_backend_channel(dev);
if (err < 0) {
error_setg_errno(errp, EPROTO, "vhost_backend_init failed");
return -EPROTO;
}
}
u->postcopy_notifier.notify = vhost_user_postcopy_notifier;
postcopy_add_notifier(&u->postcopy_notifier);
return 0;
}
static int vhost_user_backend_cleanup(struct vhost_dev *dev)
{
struct vhost_user *u;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
u = dev->opaque;
if (u->postcopy_notifier.notify) {
postcopy_remove_notifier(&u->postcopy_notifier);
u->postcopy_notifier.notify = NULL;
}
u->postcopy_listen = false;
if (u->postcopy_fd.handler) {
postcopy_unregister_shared_ufd(&u->postcopy_fd);
close(u->postcopy_fd.fd);
u->postcopy_fd.handler = NULL;
}
if (u->backend_ioc) {
close_backend_channel(u);
}
g_free(u->region_rb);
u->region_rb = NULL;
g_free(u->region_rb_offset);
u->region_rb_offset = NULL;
u->region_rb_len = 0;
g_free(u);
dev->opaque = 0;
return 0;
}
static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx)
{
assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
return idx;
}
static int vhost_user_memslots_limit(struct vhost_dev *dev)
{
struct vhost_user *u = dev->opaque;
return u->user->memory_slots;
}
static bool vhost_user_requires_shm_log(struct vhost_dev *dev)
{
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
return virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_LOG_SHMFD);
}
static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
{
VhostUserMsg msg = { };
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
/* If guest supports GUEST_ANNOUNCE do nothing */
if (virtio_has_feature(dev->acked_features, VIRTIO_NET_F_GUEST_ANNOUNCE)) {
return 0;
}
/* if backend supports VHOST_USER_PROTOCOL_F_RARP ask it to send the RARP */
if (virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_RARP)) {
msg.hdr.request = VHOST_USER_SEND_RARP;
msg.hdr.flags = VHOST_USER_VERSION;
memcpy((char *)&msg.payload.u64, mac_addr, 6);
msg.hdr.size = sizeof(msg.payload.u64);
return vhost_user_write(dev, &msg, NULL, 0);
}
return -ENOTSUP;
}
static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu)
{
VhostUserMsg msg;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
int ret;
if (!(dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))) {
return 0;
}
msg.hdr.request = VHOST_USER_NET_SET_MTU;
msg.payload.u64 = mtu;
msg.hdr.size = sizeof(msg.payload.u64);
msg.hdr.flags = VHOST_USER_VERSION;
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
/* If reply_ack supported, backend has to ack specified MTU is valid */
if (reply_supported) {
return process_message_reply(dev, &msg);
}
return 0;
}
static int vhost_user_send_device_iotlb_msg(struct vhost_dev *dev,
struct vhost_iotlb_msg *imsg)
{
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_IOTLB_MSG,
.hdr.size = sizeof(msg.payload.iotlb),
.hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
.payload.iotlb = *imsg,
};
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
return process_message_reply(dev, &msg);
}
static void vhost_user_set_iotlb_callback(struct vhost_dev *dev, int enabled)
{
/* No-op as the receive channel is not dedicated to IOTLB messages. */
}
static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config,
uint32_t config_len, Error **errp)
{
int ret;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_CONFIG,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = VHOST_USER_CONFIG_HDR_SIZE + config_len,
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
error_setg(errp, "VHOST_USER_PROTOCOL_F_CONFIG not supported");
return -EINVAL;
}
assert(config_len <= VHOST_USER_MAX_CONFIG_SIZE);
msg.payload.config.offset = 0;
msg.payload.config.size = config_len;
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg_errno(errp, -ret, "vhost_get_config failed");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret, "vhost_get_config failed");
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_CONFIG) {
error_setg(errp,
"Received unexpected msg type. Expected %d received %d",
VHOST_USER_GET_CONFIG, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != VHOST_USER_CONFIG_HDR_SIZE + config_len) {
error_setg(errp, "Received bad msg size.");
return -EPROTO;
}
memcpy(config, msg.payload.config.region, config_len);
return 0;
}
static int vhost_user_set_config(struct vhost_dev *dev, const uint8_t *data,
uint32_t offset, uint32_t size, uint32_t flags)
{
int ret;
uint8_t *p;
bool reply_supported = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_REPLY_ACK);
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_CONFIG,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = VHOST_USER_CONFIG_HDR_SIZE + size,
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CONFIG)) {
return -ENOTSUP;
}
if (reply_supported) {
msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
}
if (size > VHOST_USER_MAX_CONFIG_SIZE) {
return -EINVAL;
}
msg.payload.config.offset = offset,
msg.payload.config.size = size,
msg.payload.config.flags = flags,
p = msg.payload.config.region;
memcpy(p, data, size);
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
if (reply_supported) {
return process_message_reply(dev, &msg);
}
return 0;
}
static int vhost_user_crypto_create_session(struct vhost_dev *dev,
void *session_info,
uint64_t *session_id)
{
int ret;
bool crypto_session = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CRYPTO_SESSION);
CryptoDevBackendSessionInfo *backend_info = session_info;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_CREATE_CRYPTO_SESSION,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = sizeof(msg.payload.session),
};
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
if (!crypto_session) {
error_report("vhost-user trying to send unhandled ioctl");
return -ENOTSUP;
}
if (backend_info->op_code == VIRTIO_CRYPTO_AKCIPHER_CREATE_SESSION) {
CryptoDevBackendAsymSessionInfo *sess = &backend_info->u.asym_sess_info;
size_t keylen;
memcpy(&msg.payload.session.u.asym.session_setup_data, sess,
sizeof(CryptoDevBackendAsymSessionInfo));
if (sess->keylen) {
keylen = sizeof(msg.payload.session.u.asym.key);
if (sess->keylen > keylen) {
error_report("Unsupported asymmetric key size");
return -ENOTSUP;
}
memcpy(&msg.payload.session.u.asym.key, sess->key,
sess->keylen);
}
} else {
CryptoDevBackendSymSessionInfo *sess = &backend_info->u.sym_sess_info;
size_t keylen;
memcpy(&msg.payload.session.u.sym.session_setup_data, sess,
sizeof(CryptoDevBackendSymSessionInfo));
if (sess->key_len) {
keylen = sizeof(msg.payload.session.u.sym.key);
if (sess->key_len > keylen) {
error_report("Unsupported cipher key size");
return -ENOTSUP;
}
memcpy(&msg.payload.session.u.sym.key, sess->cipher_key,
sess->key_len);
}
if (sess->auth_key_len > 0) {
keylen = sizeof(msg.payload.session.u.sym.auth_key);
if (sess->auth_key_len > keylen) {
error_report("Unsupported auth key size");
return -ENOTSUP;
}
memcpy(&msg.payload.session.u.sym.auth_key, sess->auth_key,
sess->auth_key_len);
}
}
msg.payload.session.op_code = backend_info->op_code;
msg.payload.session.session_id = backend_info->session_id;
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_report("vhost_user_write() return %d, create session failed",
ret);
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_report("vhost_user_read() return %d, create session failed",
ret);
return ret;
}
if (msg.hdr.request != VHOST_USER_CREATE_CRYPTO_SESSION) {
error_report("Received unexpected msg type. Expected %d received %d",
VHOST_USER_CREATE_CRYPTO_SESSION, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.session)) {
error_report("Received bad msg size.");
return -EPROTO;
}
if (msg.payload.session.session_id < 0) {
error_report("Bad session id: %" PRId64 "",
msg.payload.session.session_id);
return -EINVAL;
}
*session_id = msg.payload.session.session_id;
return 0;
}
static int
vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id)
{
int ret;
bool crypto_session = virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_CRYPTO_SESSION);
VhostUserMsg msg = {
.hdr.request = VHOST_USER_CLOSE_CRYPTO_SESSION,
.hdr.flags = VHOST_USER_VERSION,
.hdr.size = sizeof(msg.payload.u64),
};
msg.payload.u64 = session_id;
if (!crypto_session) {
error_report("vhost-user trying to send unhandled ioctl");
return -ENOTSUP;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_report("vhost_user_write() return %d, close session failed",
ret);
return ret;
}
return 0;
}
static bool vhost_user_no_private_memslots(struct vhost_dev *dev)
{
return true;
}
static int vhost_user_get_inflight_fd(struct vhost_dev *dev,
uint16_t queue_size,
struct vhost_inflight *inflight)
{
void *addr;
int fd;
int ret;
struct vhost_user *u = dev->opaque;
CharBackend *chr = u->user->chr;
VhostUserMsg msg = {
.hdr.request = VHOST_USER_GET_INFLIGHT_FD,
.hdr.flags = VHOST_USER_VERSION,
.payload.inflight.num_queues = dev->nvqs,
.payload.inflight.queue_size = queue_size,
.hdr.size = sizeof(msg.payload.inflight),
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
return 0;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
return ret;
}
if (msg.hdr.request != VHOST_USER_GET_INFLIGHT_FD) {
error_report("Received unexpected msg type. "
"Expected %d received %d",
VHOST_USER_GET_INFLIGHT_FD, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.inflight)) {
error_report("Received bad msg size.");
return -EPROTO;
}
if (!msg.payload.inflight.mmap_size) {
return 0;
}
fd = qemu_chr_fe_get_msgfd(chr);
if (fd < 0) {
error_report("Failed to get mem fd");
return -EIO;
}
addr = mmap(0, msg.payload.inflight.mmap_size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, msg.payload.inflight.mmap_offset);
if (addr == MAP_FAILED) {
error_report("Failed to mmap mem fd");
close(fd);
return -EFAULT;
}
inflight->addr = addr;
inflight->fd = fd;
inflight->size = msg.payload.inflight.mmap_size;
inflight->offset = msg.payload.inflight.mmap_offset;
inflight->queue_size = queue_size;
return 0;
}
static int vhost_user_set_inflight_fd(struct vhost_dev *dev,
struct vhost_inflight *inflight)
{
VhostUserMsg msg = {
.hdr.request = VHOST_USER_SET_INFLIGHT_FD,
.hdr.flags = VHOST_USER_VERSION,
.payload.inflight.mmap_size = inflight->size,
.payload.inflight.mmap_offset = inflight->offset,
.payload.inflight.num_queues = dev->nvqs,
.payload.inflight.queue_size = inflight->queue_size,
.hdr.size = sizeof(msg.payload.inflight),
};
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)) {
return 0;
}
return vhost_user_write(dev, &msg, &inflight->fd, 1);
}
static void vhost_user_state_destroy(gpointer data)
{
VhostUserHostNotifier *n = (VhostUserHostNotifier *) data;
if (n) {
vhost_user_host_notifier_remove(n, NULL);
object_unparent(OBJECT(&n->mr));
/*
* We can't free until vhost_user_host_notifier_remove has
* done it's thing so schedule the free with RCU.
*/
g_free_rcu(n, rcu);
}
}
bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
{
if (user->chr) {
error_setg(errp, "Cannot initialize vhost-user state");
return false;
}
user->chr = chr;
user->memory_slots = 0;
user->notifiers = g_ptr_array_new_full(VIRTIO_QUEUE_MAX / 4,
&vhost_user_state_destroy);
return true;
}
void vhost_user_cleanup(VhostUserState *user)
{
if (!user->chr) {
return;
}
memory_region_transaction_begin();
user->notifiers = (GPtrArray *) g_ptr_array_free(user->notifiers, true);
memory_region_transaction_commit();
user->chr = NULL;
}
typedef struct {
vu_async_close_fn cb;
DeviceState *dev;
CharBackend *cd;
struct vhost_dev *vhost;
IOEventHandler *event_cb;
} VhostAsyncCallback;
static void vhost_user_async_close_bh(void *opaque)
{
VhostAsyncCallback *data = opaque;
struct vhost_dev *vhost = data->vhost;
/*
* If the vhost_dev has been cleared in the meantime there is
* nothing left to do as some other path has completed the
* cleanup.
*/
if (vhost->vdev) {
data->cb(data->dev);
} else if (data->event_cb) {
qemu_chr_fe_set_handlers(data->cd, NULL, NULL, data->event_cb,
NULL, data->dev, NULL, true);
}
g_free(data);
}
/*
* We only schedule the work if the machine is running. If suspended
* we want to keep all the in-flight data as is for migration
* purposes.
*/
void vhost_user_async_close(DeviceState *d,
CharBackend *chardev, struct vhost_dev *vhost,
vu_async_close_fn cb,
IOEventHandler *event_cb)
{
if (!runstate_check(RUN_STATE_SHUTDOWN)) {
/*
* A close event may happen during a read/write, but vhost
* code assumes the vhost_dev remains setup, so delay the
* stop & clear.
*/
AioContext *ctx = qemu_get_current_aio_context();
VhostAsyncCallback *data = g_new0(VhostAsyncCallback, 1);
/* Save data for the callback */
data->cb = cb;
data->dev = d;
data->cd = chardev;
data->vhost = vhost;
data->event_cb = event_cb;
/* Disable any further notifications on the chardev */
qemu_chr_fe_set_handlers(chardev,
NULL, NULL, NULL, NULL, NULL, NULL,
false);
aio_bh_schedule_oneshot(ctx, vhost_user_async_close_bh, data);
/*
* Move vhost device to the stopped state. The vhost-user device
* will be clean up and disconnected in BH. This can be useful in
* the vhost migration code. If disconnect was caught there is an
* option for the general vhost code to get the dev state without
* knowing its type (in this case vhost-user).
*
* Note if the vhost device is fully cleared by the time we
* execute the bottom half we won't continue with the cleanup.
*/
vhost->started = false;
}
}
static int vhost_user_dev_start(struct vhost_dev *dev, bool started)
{
if (!virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_STATUS)) {
return 0;
}
/* Set device status only for last queue pair */
if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
return 0;
}
if (started) {
return vhost_user_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
VIRTIO_CONFIG_S_DRIVER |
VIRTIO_CONFIG_S_DRIVER_OK);
} else {
return 0;
}
}
static void vhost_user_reset_status(struct vhost_dev *dev)
{
/* Set device status only for last queue pair */
if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
return;
}
if (virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_STATUS)) {
vhost_user_set_status(dev, 0);
}
}
static bool vhost_user_supports_device_state(struct vhost_dev *dev)
{
return virtio_has_feature(dev->protocol_features,
VHOST_USER_PROTOCOL_F_DEVICE_STATE);
}
static int vhost_user_set_device_state_fd(struct vhost_dev *dev,
VhostDeviceStateDirection direction,
VhostDeviceStatePhase phase,
int fd,
int *reply_fd,
Error **errp)
{
int ret;
struct vhost_user *vu = dev->opaque;
VhostUserMsg msg = {
.hdr = {
.request = VHOST_USER_SET_DEVICE_STATE_FD,
.flags = VHOST_USER_VERSION,
.size = sizeof(msg.payload.transfer_state),
},
.payload.transfer_state = {
.direction = direction,
.phase = phase,
},
};
*reply_fd = -1;
if (!vhost_user_supports_device_state(dev)) {
close(fd);
error_setg(errp, "Back-end does not support migration state transfer");
return -ENOTSUP;
}
ret = vhost_user_write(dev, &msg, &fd, 1);
close(fd);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to send SET_DEVICE_STATE_FD message");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to receive SET_DEVICE_STATE_FD reply");
return ret;
}
if (msg.hdr.request != VHOST_USER_SET_DEVICE_STATE_FD) {
error_setg(errp,
"Received unexpected message type, expected %d, received %d",
VHOST_USER_SET_DEVICE_STATE_FD, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.u64)) {
error_setg(errp,
"Received bad message size, expected %zu, received %" PRIu32,
sizeof(msg.payload.u64), msg.hdr.size);
return -EPROTO;
}
if ((msg.payload.u64 & 0xff) != 0) {
error_setg(errp, "Back-end did not accept migration state transfer");
return -EIO;
}
if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) {
*reply_fd = qemu_chr_fe_get_msgfd(vu->user->chr);
if (*reply_fd < 0) {
error_setg(errp,
"Failed to get back-end-provided transfer pipe FD");
*reply_fd = -1;
return -EIO;
}
}
return 0;
}
static int vhost_user_check_device_state(struct vhost_dev *dev, Error **errp)
{
int ret;
VhostUserMsg msg = {
.hdr = {
.request = VHOST_USER_CHECK_DEVICE_STATE,
.flags = VHOST_USER_VERSION,
.size = 0,
},
};
if (!vhost_user_supports_device_state(dev)) {
error_setg(errp, "Back-end does not support migration state transfer");
return -ENOTSUP;
}
ret = vhost_user_write(dev, &msg, NULL, 0);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to send CHECK_DEVICE_STATE message");
return ret;
}
ret = vhost_user_read(dev, &msg);
if (ret < 0) {
error_setg_errno(errp, -ret,
"Failed to receive CHECK_DEVICE_STATE reply");
return ret;
}
if (msg.hdr.request != VHOST_USER_CHECK_DEVICE_STATE) {
error_setg(errp,
"Received unexpected message type, expected %d, received %d",
VHOST_USER_CHECK_DEVICE_STATE, msg.hdr.request);
return -EPROTO;
}
if (msg.hdr.size != sizeof(msg.payload.u64)) {
error_setg(errp,
"Received bad message size, expected %zu, received %" PRIu32,
sizeof(msg.payload.u64), msg.hdr.size);
return -EPROTO;
}
if (msg.payload.u64 != 0) {
error_setg(errp, "Back-end failed to process its internal state");
return -EIO;
}
return 0;
}
const VhostOps user_ops = {
.backend_type = VHOST_BACKEND_TYPE_USER,
.vhost_backend_init = vhost_user_backend_init,
.vhost_backend_cleanup = vhost_user_backend_cleanup,
.vhost_backend_memslots_limit = vhost_user_memslots_limit,
.vhost_backend_no_private_memslots = vhost_user_no_private_memslots,
.vhost_set_log_base = vhost_user_set_log_base,
.vhost_set_mem_table = vhost_user_set_mem_table,
.vhost_set_vring_addr = vhost_user_set_vring_addr,
.vhost_set_vring_endian = vhost_user_set_vring_endian,
.vhost_set_vring_num = vhost_user_set_vring_num,
.vhost_set_vring_base = vhost_user_set_vring_base,
.vhost_get_vring_base = vhost_user_get_vring_base,
.vhost_set_vring_kick = vhost_user_set_vring_kick,
.vhost_set_vring_call = vhost_user_set_vring_call,
.vhost_set_vring_err = vhost_user_set_vring_err,
.vhost_set_features = vhost_user_set_features,
.vhost_get_features = vhost_user_get_features,
.vhost_set_owner = vhost_user_set_owner,
.vhost_reset_device = vhost_user_reset_device,
.vhost_get_vq_index = vhost_user_get_vq_index,
.vhost_set_vring_enable = vhost_user_set_vring_enable,
.vhost_requires_shm_log = vhost_user_requires_shm_log,
.vhost_migration_done = vhost_user_migration_done,
.vhost_net_set_mtu = vhost_user_net_set_mtu,
.vhost_set_iotlb_callback = vhost_user_set_iotlb_callback,
.vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg,
.vhost_get_config = vhost_user_get_config,
.vhost_set_config = vhost_user_set_config,
.vhost_crypto_create_session = vhost_user_crypto_create_session,
.vhost_crypto_close_session = vhost_user_crypto_close_session,
.vhost_get_inflight_fd = vhost_user_get_inflight_fd,
.vhost_set_inflight_fd = vhost_user_set_inflight_fd,
.vhost_dev_start = vhost_user_dev_start,
.vhost_reset_status = vhost_user_reset_status,
.vhost_supports_device_state = vhost_user_supports_device_state,
.vhost_set_device_state_fd = vhost_user_set_device_state_fd,
.vhost_check_device_state = vhost_user_check_device_state,
};