From 3d1ad18c4257471cd5fda8397a46457735027b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 18 Oct 2016 12:24:00 +0300 Subject: [PATCH 1/9] tests/vhost-user-bridge: remove false comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dispatcher_remove() is in use. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-bridge.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 775e031069..e91279b91e 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -101,8 +101,6 @@ dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb) return 0; } -/* dispatcher_remove() is not currently in use but may be useful - * in the future. */ static int dispatcher_remove(Dispatcher *dispr, int sock) { From 4e4212d0566e4daa0b8eff9029cafb4a6ac207c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 18 Oct 2016 12:24:01 +0300 Subject: [PATCH 2/9] tests/vhost-user-bridge: remove unnecessary dispatcher_remove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The call fd is not watched Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- tests/vhost-user-bridge.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index e91279b91e..19b0e94258 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -979,7 +979,6 @@ vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) if (dev->vq[index].call_fd != -1) { close(dev->vq[index].call_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd); dev->vq[index].call_fd = -1; } if (dev->vq[index].kick_fd != -1) { @@ -1043,7 +1042,6 @@ vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg) if (dev->vq[index].call_fd != -1) { close(dev->vq[index].call_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd); } dev->vq[index].call_fd = vmsg->fds[0]; DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); From 9652f5785e394f099dbc9188ae88860727df44ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 18 Oct 2016 12:24:02 +0300 Subject: [PATCH 3/9] tests/vhost-user-bridge: indicate peer disconnected MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- tests/vhost-user-bridge.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 19b0e94258..97e45d8be8 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -385,7 +385,6 @@ vubr_message_read(int conn_fd, VhostUserMsg *vmsg) rc = recvmsg(conn_fd, &msg, 0); if (rc == 0) { - vubr_die("recvmsg"); fprintf(stderr, "Peer disconnected.\n"); exit(1); } From 98206d4e6b719f650b0f2d23bcd4bab83c624341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 18 Oct 2016 12:24:03 +0300 Subject: [PATCH 4/9] tests/vhost-user-bridge: do not accept more than one connection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-bridge.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 97e45d8be8..5b618f670a 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -1200,6 +1200,7 @@ vubr_accept_cb(int sock, void *ctx) } DPRINT("Got connection from remote peer on sock %d\n", conn_fd); dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb); + dispatcher_remove(&dev->dispatcher, sock); } static VubrDev * From 7b2e5c65f42fff7a418e2b8216e92f5e0a0c3e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 18 Oct 2016 12:24:04 +0300 Subject: [PATCH 5/9] contrib: add libvhost-user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a library to help implementing vhost-user backend (or slave). Dealing with vhost-user as an application developer isn't so easy: you have all the trouble with any protocol: validation, unix ancillary data, shared memory, eventfd, logging, and on top of that you need to deal with virtio queues, if possible efficiently. 
qemu test has a nice vhost-user testing application vhost-user-bridge, which implements most of vhost-user, and virtio.c which implements virtqueues manipulation. Based on these two, I tried to make a simple library, reusable for tests or development of new vhost-user scenarios. Signed-off-by: Marc-André Lureau [Felipe: set used_idx copy on SET_VRING_ADDR and update shadow avail idx on SET_VRING_BASE] Signed-off-by: Felipe Franciosi Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- Makefile | 1 + Makefile.objs | 2 +- contrib/libvhost-user/Makefile.objs | 1 + contrib/libvhost-user/libvhost-user.c | 1499 +++++++++++++++++++++++++ contrib/libvhost-user/libvhost-user.h | 435 +++++++ 5 files changed, 1937 insertions(+), 1 deletion(-) create mode 100644 contrib/libvhost-user/Makefile.objs create mode 100644 contrib/libvhost-user/libvhost-user.c create mode 100644 contrib/libvhost-user/libvhost-user.h diff --git a/Makefile b/Makefile index 474cc5e66a..68accb9252 100644 --- a/Makefile +++ b/Makefile @@ -149,6 +149,7 @@ dummy := $(call unnest-vars,, \ qga-obj-y \ ivshmem-client-obj-y \ ivshmem-server-obj-y \ + libvhost-user-obj-y \ qga-vss-dll-obj-y \ block-obj-y \ block-obj-m \ diff --git a/Makefile.objs b/Makefile.objs index 06f74b8b99..ecd6576a77 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -116,7 +116,7 @@ qga-vss-dll-obj-y = qga/ # contrib ivshmem-client-obj-y = contrib/ivshmem-client/ ivshmem-server-obj-y = contrib/ivshmem-server/ - +libvhost-user-obj-y = contrib/libvhost-user/ ###################################################################### trace-events-y = trace-events diff --git a/contrib/libvhost-user/Makefile.objs b/contrib/libvhost-user/Makefile.objs new file mode 100644 index 0000000000..cef1ad6e31 --- /dev/null +++ b/contrib/libvhost-user/Makefile.objs @@ -0,0 +1 @@ +libvhost-user-obj-y = libvhost-user.o diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c new file mode 100644 index 
0000000000..af4faad60b --- /dev/null +++ b/contrib/libvhost-user/libvhost-user.c @@ -0,0 +1,1499 @@ +/* + * Vhost User library + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2016 Red Hat, Inc. + * + * Authors: + * Anthony Liguori + * Marc-André Lureau + * Victor Kaplansky + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include +#include +#include + +#include "qemu/atomic.h" + +#include "libvhost-user.h" + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 1 +#define LIBVHOST_USER_DEBUG 0 + +#define DPRINT(...) \ + do { \ + if (LIBVHOST_USER_DEBUG) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) + +static const char * +vu_request_to_string(int req) +{ +#define REQ(req) [req] = #req + static const char *vu_request_str[] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_INPUT_GET_CONFIG), + REQ(VHOST_USER_MAX), + }; +#undef REQ + + if (req < VHOST_USER_MAX) { + return vu_request_str[req]; + } else { + return "unknown"; + } +} + +static void +vu_panic(VuDev *dev, const char *msg, ...) 
+{ + char *buf = NULL; + va_list ap; + + va_start(ap, msg); + (void)vasprintf(&buf, msg, ap); + va_end(ap); + + dev->broken = true; + dev->panic(dev, buf); + free(buf); + + /* FIXME: find a way to call virtio_error? */ +} + +/* Translate guest physical address to our virtual address. */ +void * +vu_gpa_to_va(VuDev *dev, uint64_t guest_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { + return (void *)(uintptr_t) + guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; + } + } + + return NULL; +} + +/* Translate qemu virtual address to our virtual address. */ +static void * +qva_to_va(VuDev *dev, uint64_t qemu_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + return (void *)(uintptr_t) + qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; + } + } + + return NULL; +} + +static void +vmsg_close_fds(VhostUserMsg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) { + close(vmsg->fds[i]); + } +} + +static bool +vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + int rc; + + do { + rc = recvmsg(conn_fd, &msg, 0); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (rc <= 0) { + vu_panic(dev, "Error while recvmsg: %s", strerror(errno)); + return false; + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); + cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) + { + if (cmsg->cmsg_level == SOL_SOCKET && 
cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + if (vmsg->size > sizeof(vmsg->payload)) { + vu_panic(dev, + "Error: too big message request: %d, size: vmsg->size: %u, " + "while sizeof(vmsg->payload) = %zu\n", + vmsg->request, vmsg->size, sizeof(vmsg->payload)); + goto fail; + } + + if (vmsg->size) { + do { + rc = read(conn_fd, &vmsg->payload, vmsg->size); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (rc <= 0) { + vu_panic(dev, "Error while reading: %s", strerror(errno)); + goto fail; + } + + assert(rc == vmsg->size); + } + + return true; + +fail: + vmsg_close_fds(vmsg); + + return false; +} + +static bool +vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) +{ + int rc; + uint8_t *p = (uint8_t *)vmsg; + + /* Set the version in the flags when sending the reply */ + vmsg->flags &= ~VHOST_USER_VERSION_MASK; + vmsg->flags |= VHOST_USER_VERSION; + vmsg->flags |= VHOST_USER_REPLY_MASK; + + do { + rc = write(conn_fd, p, VHOST_USER_HDR_SIZE); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + do { + if (vmsg->data) { + rc = write(conn_fd, vmsg->data, vmsg->size); + } else { + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); + } + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (rc <= 0) { + vu_panic(dev, "Error while writing: %s", strerror(errno)); + return false; + } + + return true; +} + +/* Kick the log_call_fd if required. 
*/ +static void +vu_log_kick(VuDev *dev) +{ + if (dev->log_call_fd != -1) { + DPRINT("Kicking the QEMU's log...\n"); + if (eventfd_write(dev->log_call_fd, 1) < 0) { + vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); + } + } +} + +static void +vu_log_page(uint8_t *log_table, uint64_t page) +{ + DPRINT("Logged dirty guest page: %"PRId64"\n", page); + atomic_or(&log_table[page / 8], 1 << (page % 8)); +} + +static void +vu_log_write(VuDev *dev, uint64_t address, uint64_t length) +{ + uint64_t page; + + if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) || + !dev->log_table || !length) { + return; + } + + assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8)); + + page = address / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < address + length) { + vu_log_page(dev->log_table, page); + page += VHOST_LOG_PAGE; + } + + vu_log_kick(dev); +} + +static void +vu_kick_cb(VuDev *dev, int condition, void *data) +{ + int index = (intptr_t)data; + VuVirtq *vq = &dev->vq[index]; + int sock = vq->kick_fd; + eventfd_t kick_data; + ssize_t rc; + + rc = eventfd_read(sock, &kick_data); + if (rc == -1) { + vu_panic(dev, "kick eventfd_read(): %s", strerror(errno)); + dev->remove_watch(dev, dev->vq[index].kick_fd); + } else { + DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n", + kick_data, vq->handler, index); + if (vq->handler) { + vq->handler(dev, index); + } + } +} + +static bool +vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + vmsg->payload.u64 = + 1ULL << VHOST_F_LOG_ALL | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + if (dev->iface->get_features) { + vmsg->payload.u64 |= dev->iface->get_features(dev); + } + + vmsg->size = sizeof(vmsg->payload.u64); + + DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + return true; +} + +static void +vu_set_enable_all_rings(VuDev *dev, bool enabled) +{ + int i; + + for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) { + dev->vq[i].enable = enabled; + } +} + +static bool 
+vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + dev->features = vmsg->payload.u64; + + if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) { + vu_set_enable_all_rings(dev, true); + } + + if (dev->iface->set_features) { + dev->iface->set_features(dev, dev->features); + } + + return false; +} + +static bool +vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + return false; +} + +static void +vu_close_log(VuDev *dev) +{ + if (dev->log_table) { + if (munmap(dev->log_table, dev->log_size) != 0) { + perror("close log munmap() error"); + } + + dev->log_table = NULL; + } + if (dev->log_call_fd != -1) { + close(dev->log_call_fd); + dev->log_call_fd = -1; + } +} + +static bool +vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + vu_set_enable_all_rings(dev, false); + + return false; +} + +static bool +vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int i; + VhostUserMemory *memory = &vmsg->payload.memory; + dev->nregions = memory->nregions; + + DPRINT("Nregions: %d\n", memory->nregions); + for (i = 0; i < dev->nregions; i++) { + void *mmap_addr; + VhostUserMemoryRegion *msg_region = &memory->regions[i]; + VuDevRegion *dev_region = &dev->regions[i]; + + DPRINT("Region %d\n", i); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. 
*/ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED, + vmsg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + dev_region->mmap_addr); + } + + close(vmsg->fds[i]); + } + + return false; +} + +static bool +vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int fd; + uint64_t log_mmap_size, log_mmap_offset; + void *rc; + + if (vmsg->fd_num != 1 || + vmsg->size != sizeof(vmsg->payload.log)) { + vu_panic(dev, "Invalid log_base message"); + return true; + } + + fd = vmsg->fds[0]; + log_mmap_offset = vmsg->payload.log.mmap_offset; + log_mmap_size = vmsg->payload.log.mmap_size; + DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); + DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); + + rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, + log_mmap_offset); + if (rc == MAP_FAILED) { + perror("log mmap error"); + } + dev->log_table = rc; + dev->log_size = log_mmap_size; + + vmsg->size = sizeof(vmsg->payload.u64); + + return true; +} + +static bool +vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + if (vmsg->fd_num != 1) { + vu_panic(dev, "Invalid log_fd message"); + return false; + } + + if (dev->log_call_fd != -1) { + close(dev->log_call_fd); + } + dev->log_call_fd = vmsg->fds[0]; + DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); + + return false; +} + +static bool +vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].vring.num = num; + + return false; +} + +static bool +vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + struct vhost_vring_addr *vra = &vmsg->payload.addr; + unsigned int index = vra->index; + VuVirtq *vq = 
&dev->vq[index]; + + DPRINT("vhost_vring_addr:\n"); + DPRINT(" index: %d\n", vra->index); + DPRINT(" flags: %d\n", vra->flags); + DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr); + DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr); + DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr); + DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr); + + vq->vring.flags = vra->flags; + vq->vring.desc = qva_to_va(dev, vra->desc_user_addr); + vq->vring.used = qva_to_va(dev, vra->used_user_addr); + vq->vring.avail = qva_to_va(dev, vra->avail_user_addr); + vq->vring.log_guest_addr = vra->log_guest_addr; + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->vring.desc); + DPRINT(" vring_used at %p\n", vq->vring.used); + DPRINT(" vring_avail at %p\n", vq->vring.avail); + + if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) { + vu_panic(dev, "Invalid vring_addr message"); + return false; + } + + vq->used_idx = vq->vring.used->idx; + + return false; +} + +static bool +vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num; + + return false; +} + +static bool +vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + + DPRINT("State.index: %d\n", index); + vmsg->payload.state.num = dev->vq[index].last_avail_idx; + vmsg->size = sizeof(vmsg->payload.state); + + dev->vq[index].started = false; + if (dev->iface->queue_set_started) { + dev->iface->queue_set_started(dev, index, false); + } + + if (dev->vq[index].call_fd != -1) { + close(dev->vq[index].call_fd); + dev->vq[index].call_fd = -1; + } + if (dev->vq[index].kick_fd != -1) { + dev->remove_watch(dev, dev->vq[index].kick_fd); + close(dev->vq[index].kick_fd); + 
dev->vq[index].kick_fd = -1; + } + + return true; +} + +static bool +vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + if (index >= VHOST_MAX_NR_VIRTQUEUE) { + vmsg_close_fds(vmsg); + vu_panic(dev, "Invalid queue index: %u", index); + return false; + } + + if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK || + vmsg->fd_num != 1) { + vmsg_close_fds(vmsg); + vu_panic(dev, "Invalid fds in request: %d", vmsg->request); + return false; + } + + return true; +} + +static bool +vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + if (!vu_check_queue_msg_file(dev, vmsg)) { + return false; + } + + if (dev->vq[index].kick_fd != -1) { + dev->remove_watch(dev, dev->vq[index].kick_fd); + close(dev->vq[index].kick_fd); + dev->vq[index].kick_fd = -1; + } + + if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) { + dev->vq[index].kick_fd = vmsg->fds[0]; + DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); + } + + dev->vq[index].started = true; + if (dev->iface->queue_set_started) { + dev->iface->queue_set_started(dev, index, true); + } + + if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) { + dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN, + vu_kick_cb, (void *)(long)index); + + DPRINT("Waiting for kicks on fd: %d for vq: %d\n", + dev->vq[index].kick_fd, index); + } + + return false; +} + +void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, + vu_queue_handler_cb handler) +{ + int qidx = vq - dev->vq; + + vq->handler = handler; + if (vq->kick_fd >= 0) { + if (handler) { + dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN, + vu_kick_cb, (void *)(long)qidx); + } else { + dev->remove_watch(dev, vq->kick_fd); + } + } +} + +static bool +vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + 
DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + if (!vu_check_queue_msg_file(dev, vmsg)) { + return false; + } + + if (dev->vq[index].call_fd != -1) { + close(dev->vq[index].call_fd); + dev->vq[index].call_fd = -1; + } + + if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) { + dev->vq[index].call_fd = vmsg->fds[0]; + } + + DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); + + return false; +} + +static bool +vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + if (!vu_check_queue_msg_file(dev, vmsg)) { + return false; + } + + if (dev->vq[index].err_fd != -1) { + close(dev->vq[index].err_fd); + dev->vq[index].err_fd = -1; + } + + if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) { + dev->vq[index].err_fd = vmsg->fds[0]; + } + + return false; +} + +static bool +vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; + + if (dev->iface->get_protocol_features) { + features |= dev->iface->get_protocol_features(dev); + } + + vmsg->payload.u64 = features; + vmsg->size = sizeof(vmsg->payload.u64); + + return true; +} + +static bool +vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + uint64_t features = vmsg->payload.u64; + + DPRINT("u64: 0x%016"PRIx64"\n", features); + + dev->protocol_features = vmsg->payload.u64; + + if (dev->iface->set_protocol_features) { + dev->iface->set_protocol_features(dev, features); + } + + return false; +} + +static bool +vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return false; +} + +static bool +vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int enable = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.enable: %d\n", enable); + + 
if (index >= VHOST_MAX_NR_VIRTQUEUE) { + vu_panic(dev, "Invalid vring_enable index: %u", index); + return false; + } + + dev->vq[index].enable = enable; + return false; +} + +static bool +vu_process_message(VuDev *dev, VhostUserMsg *vmsg) +{ + int do_reply = 0; + + /* Print out generic part of the request. */ + DPRINT("================ Vhost user message ================\n"); + DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request), + vmsg->request); + DPRINT("Flags: 0x%x\n", vmsg->flags); + DPRINT("Size: %d\n", vmsg->size); + + if (vmsg->fd_num) { + int i; + DPRINT("Fds:"); + for (i = 0; i < vmsg->fd_num; i++) { + DPRINT(" %d", vmsg->fds[i]); + } + DPRINT("\n"); + } + + if (dev->iface->process_msg && + dev->iface->process_msg(dev, vmsg, &do_reply)) { + return do_reply; + } + + switch (vmsg->request) { + case VHOST_USER_GET_FEATURES: + return vu_get_features_exec(dev, vmsg); + case VHOST_USER_SET_FEATURES: + return vu_set_features_exec(dev, vmsg); + case VHOST_USER_GET_PROTOCOL_FEATURES: + return vu_get_protocol_features_exec(dev, vmsg); + case VHOST_USER_SET_PROTOCOL_FEATURES: + return vu_set_protocol_features_exec(dev, vmsg); + case VHOST_USER_SET_OWNER: + return vu_set_owner_exec(dev, vmsg); + case VHOST_USER_RESET_OWNER: + return vu_reset_device_exec(dev, vmsg); + case VHOST_USER_SET_MEM_TABLE: + return vu_set_mem_table_exec(dev, vmsg); + case VHOST_USER_SET_LOG_BASE: + return vu_set_log_base_exec(dev, vmsg); + case VHOST_USER_SET_LOG_FD: + return vu_set_log_fd_exec(dev, vmsg); + case VHOST_USER_SET_VRING_NUM: + return vu_set_vring_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ADDR: + return vu_set_vring_addr_exec(dev, vmsg); + case VHOST_USER_SET_VRING_BASE: + return vu_set_vring_base_exec(dev, vmsg); + case VHOST_USER_GET_VRING_BASE: + return vu_get_vring_base_exec(dev, vmsg); + case VHOST_USER_SET_VRING_KICK: + return vu_set_vring_kick_exec(dev, vmsg); + case VHOST_USER_SET_VRING_CALL: + return vu_set_vring_call_exec(dev, vmsg); + case 
VHOST_USER_SET_VRING_ERR: + return vu_set_vring_err_exec(dev, vmsg); + case VHOST_USER_GET_QUEUE_NUM: + return vu_get_queue_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ENABLE: + return vu_set_vring_enable_exec(dev, vmsg); + default: + vmsg_close_fds(vmsg); + vu_panic(dev, "Unhandled request: %d", vmsg->request); + } + + return false; +} + +bool +vu_dispatch(VuDev *dev) +{ + VhostUserMsg vmsg = { 0, }; + int reply_requested; + bool success = false; + + if (!vu_message_read(dev, dev->sock, &vmsg)) { + goto end; + } + + reply_requested = vu_process_message(dev, &vmsg); + if (!reply_requested) { + success = true; + goto end; + } + + if (!vu_message_write(dev, dev->sock, &vmsg)) { + goto end; + } + + success = true; + +end: + g_free(vmsg.data); + return success; +} + +void +vu_deinit(VuDev *dev) +{ + int i; + + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + void *m = (void *) (uintptr_t) r->mmap_addr; + if (m != MAP_FAILED) { + munmap(m, r->size + r->mmap_offset); + } + } + dev->nregions = 0; + + for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) { + VuVirtq *vq = &dev->vq[i]; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + + if (vq->kick_fd != -1) { + close(vq->kick_fd); + vq->kick_fd = -1; + } + + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + } + + + vu_close_log(dev); + + if (dev->sock != -1) { + close(dev->sock); + } +} + +void +vu_init(VuDev *dev, + int socket, + vu_panic_cb panic, + vu_set_watch_cb set_watch, + vu_remove_watch_cb remove_watch, + const VuDevIface *iface) +{ + int i; + + assert(socket >= 0); + assert(set_watch); + assert(remove_watch); + assert(iface); + assert(panic); + + memset(dev, 0, sizeof(*dev)); + + dev->sock = socket; + dev->panic = panic; + dev->set_watch = set_watch; + dev->remove_watch = remove_watch; + dev->iface = iface; + dev->log_call_fd = -1; + for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) { + dev->vq[i] = (VuVirtq) { + .call_fd = -1, .kick_fd = -1, 
.err_fd = -1, + .notification = true, + }; + } +} + +VuVirtq * +vu_get_queue(VuDev *dev, int qidx) +{ + assert(qidx < VHOST_MAX_NR_VIRTQUEUE); + return &dev->vq[qidx]; +} + +bool +vu_queue_enabled(VuDev *dev, VuVirtq *vq) +{ + return vq->enable; +} + +static inline uint16_t +vring_avail_flags(VuVirtq *vq) +{ + return vq->vring.avail->flags; +} + +static inline uint16_t +vring_avail_idx(VuVirtq *vq) +{ + vq->shadow_avail_idx = vq->vring.avail->idx; + + return vq->shadow_avail_idx; +} + +static inline uint16_t +vring_avail_ring(VuVirtq *vq, int i) +{ + return vq->vring.avail->ring[i]; +} + +static inline uint16_t +vring_get_used_event(VuVirtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +static int +virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx) +{ + uint16_t num_heads = vring_avail_idx(vq) - idx; + + /* Check it isn't doing very strange things with descriptor numbers. */ + if (num_heads > vq->vring.num) { + vu_panic(dev, "Guest moved used index from %u to %u", + idx, vq->shadow_avail_idx); + return -1; + } + if (num_heads) { + /* On success, callers read a descriptor at vq->last_avail_idx. + * Make sure descriptor read does not bypass avail index read. */ + smp_rmb(); + } + + return num_heads; +} + +static bool +virtqueue_get_head(VuDev *dev, VuVirtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. 
*/ + if (*head >= vq->vring.num) { + vu_panic(dev, "Guest says index %u is available", *head); + return false; + } + + return true; +} + +enum { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +static int +virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc[i].flags & VRING_DESC_F_NEXT)) { + return VIRTQUEUE_READ_DESC_DONE; + } + + /* Check they're not leading us off end of descriptors. */ + *next = desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) { + vu_panic(dev, "Desc next is %u", *next); + return VIRTQUEUE_READ_DESC_ERROR; + } + + return VIRTQUEUE_READ_DESC_MORE; +} + +void +vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + unsigned int *out_bytes, + unsigned max_in_bytes, unsigned max_out_bytes) +{ + unsigned int idx; + unsigned int total_bufs, in_total, out_total; + int rc; + + idx = vq->last_avail_idx; + + total_bufs = in_total = out_total = 0; + while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) { + unsigned int max, num_bufs, indirect = 0; + struct vring_desc *desc; + unsigned int i; + + max = vq->vring.num; + num_bufs = total_bufs; + if (!virtqueue_get_head(dev, vq, idx++, &i)) { + goto err; + } + desc = vq->vring.desc; + + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + goto err; + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if (num_bufs >= max) { + vu_panic(dev, "Looped descriptor"); + goto err; + } + + /* loop over the indirect descriptor table */ + indirect = 1; + max = desc[i].len / sizeof(struct vring_desc); + desc = vu_gpa_to_va(dev, desc[i].addr); + num_bufs = i = 0; + } + + do { + /* If we've got too many, that implies a descriptor loop. */ + if (++num_bufs > max) { + vu_panic(dev, "Looped descriptor"); + goto err; + } + + if (desc[i].flags & VRING_DESC_F_WRITE) { + in_total += desc[i].len; + } else { + out_total += desc[i].len; + } + if (in_total >= max_in_bytes && out_total >= max_out_bytes) { + goto done; + } + rc = virtqueue_read_next_desc(dev, desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + goto err; + } + + if (!indirect) { + total_bufs = num_bufs; + } else { + total_bufs++; + } + } + if (rc < 0) { + goto err; + } +done: + if (in_bytes) { + *in_bytes = in_total; + } + if (out_bytes) { + *out_bytes = out_total; + } + return; + +err: + in_total = out_total = 0; + goto done; +} + +bool +vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, + unsigned int out_bytes) +{ + unsigned int in_total, out_total; + + vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total, + in_bytes, out_bytes); + + return in_bytes <= in_total && out_bytes <= out_total; +} + +/* Fetch avail_idx from VQ memory only when we really need to know if + * guest has added some buffers. 
*/ +int +vu_queue_empty(VuDev *dev, VuVirtq *vq) +{ + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +static inline +bool has_feature(uint64_t features, unsigned int fbit) +{ + assert(fbit < 64); + return !!(features & (1ULL << fbit)); +} + +static inline +bool vu_has_feature(VuDev *dev, + unsigned int fbit) +{ + return has_feature(dev->features, fbit); +} + +static bool +vring_notify(VuDev *dev, VuVirtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. */ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(dev, vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + } + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +void +vu_queue_notify(VuDev *dev, VuVirtq *vq) +{ + if (unlikely(dev->broken)) { + return; + } + + if (!vring_notify(dev, vq)) { + DPRINT("skipped notify...\n"); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) { + vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); + } +} + +static inline void +vring_used_flags_set_bit(VuVirtq *vq, int mask) +{ + uint16_t *flags; + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); + *flags |= mask; +} + +static inline void +vring_used_flags_unset_bit(VuVirtq *vq, int mask) +{ + uint16_t *flags; + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); + *flags &= ~mask; +} + +static inline void +vring_set_avail_event(VuVirtq *vq, uint16_t val) +{ + if (!vq->notification) { + return; + } + + *((uint16_t *) 
&vq->vring.used->ring[vq->vring.num]) = val; +} + +void +vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable) +{ + vq->notification = enable; + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + vring_set_avail_event(vq, vring_avail_idx(vq)); + } else if (enable) { + vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); + } else { + vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY); + } + if (enable) { + /* Expose avail event/used flags before caller checks the avail idx. */ + smp_mb(); + } +} + +static void +virtqueue_map_desc(VuDev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, bool is_write, + uint64_t pa, size_t sz) +{ + unsigned num_sg = *p_num_sg; + + assert(num_sg <= max_num_sg); + + if (!sz) { + vu_panic(dev, "virtio: zero sized buffers are not allowed"); + return; + } + + iov[num_sg].iov_base = vu_gpa_to_va(dev, pa); + iov[num_sg].iov_len = sz; + num_sg++; + + *p_num_sg = num_sg; +} + +/* Round number down to multiple */ +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) + +/* Round number up to multiple */ +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + +static void * +virtqueue_alloc_element(size_t sz, + unsigned out_num, unsigned in_num) +{ + VuVirtqElement *elem; + size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0])); + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); + + assert(sz >= sizeof(VuVirtqElement)); + elem = malloc(out_sg_end); + elem->out_num = out_num; + elem->in_num = in_num; + elem->in_sg = (void *)elem + in_sg_ofs; + elem->out_sg = (void *)elem + out_sg_ofs; + return elem; +} + +void * +vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) +{ + unsigned int i, head, max; + VuVirtqElement *elem; + unsigned out_num, in_num; + struct iovec iov[VIRTQUEUE_MAX_SIZE]; + struct vring_desc *desc; + int rc; + + if (unlikely(dev->broken)) { + return NULL; + } + + if (vu_queue_empty(dev, vq)) { + return NULL; + 
} + /* Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). */ + smp_rmb(); + + /* When we start there are none of either input nor output. */ + out_num = in_num = 0; + + max = vq->vring.num; + if (vq->inuse >= vq->vring.num) { + vu_panic(dev, "Virtqueue size exceeded"); + return NULL; + } + + if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) { + return NULL; + } + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + vring_set_avail_event(vq, vq->last_avail_idx); + } + + i = head; + desc = vq->vring.desc; + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ + max = desc[i].len / sizeof(struct vring_desc); + desc = vu_gpa_to_va(dev, desc[i].addr); + i = 0; + } + + /* Collect all the descriptors */ + do { + if (desc[i].flags & VRING_DESC_F_WRITE) { + virtqueue_map_desc(dev, &in_num, iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, + desc[i].addr, desc[i].len); + } else { + if (in_num) { + vu_panic(dev, "Incorrect order for descriptors"); + return NULL; + } + virtqueue_map_desc(dev, &out_num, iov, + VIRTQUEUE_MAX_SIZE, false, + desc[i].addr, desc[i].len); + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if ((in_num + out_num) > max) { + vu_panic(dev, "Looped descriptor"); + } + rc = virtqueue_read_next_desc(dev, desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + return NULL; + } + + /* Now copy what we have collected and mapped */ + elem = virtqueue_alloc_element(sz, out_num, in_num); + elem->index = head; + for (i = 0; i < out_num; i++) { + elem->out_sg[i] = iov[i]; + } + for (i = 0; i < in_num; i++) { + elem->in_sg[i] = iov[out_num + i]; + } + + vq->inuse++; + + return elem; +} + +bool +vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num) +{ + if (num > vq->inuse) { + return false; + } + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +static inline +void vring_used_write(VuDev *dev, VuVirtq *vq, + struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; + vu_log_write(dev, vq->vring.log_guest_addr + + offsetof(struct vring_used, ring[i]), + sizeof(used->ring[i])); +} + + +static void +vu_log_queue_fill(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, + unsigned int len) +{ + struct vring_desc *desc = vq->vring.desc; + unsigned int i, max, min; + unsigned num_bufs = 0; + + max = vq->vring.num; + i = elem->index; + + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ + max = desc[i].len / sizeof(struct vring_desc); + desc = vu_gpa_to_va(dev, desc[i].addr); + i = 0; + } + + do { + if (++num_bufs > max) { + vu_panic(dev, "Looped descriptor"); + return; + } + + if (desc[i].flags & VRING_DESC_F_WRITE) { + min = MIN(desc[i].len, len); + vu_log_write(dev, desc[i].addr, min); + len -= min; + } + + } while (len > 0 && + (virtqueue_read_next_desc(dev, desc, i, max, &i) + == VIRTQUEUE_READ_DESC_MORE)); +} + +void +vu_queue_fill(VuDev *dev, VuVirtq *vq, + const 
VuVirtqElement *elem, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (unlikely(dev->broken)) { + return; + } + + vu_log_queue_fill(dev, vq, elem, len); + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = elem->index; + uelem.len = len; + vring_used_write(dev, vq, &uelem, idx); +} + +static inline +void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val) +{ + vq->vring.used->idx = val; + vu_log_write(dev, + vq->vring.log_guest_addr + offsetof(struct vring_used, idx), + sizeof(vq->vring.used->idx)); + + vq->used_idx = val; +} + +void +vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (unlikely(dev->broken)) { + return; + } + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(dev, vq, new); + vq->inuse -= count; + if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) { + vq->signalled_used_valid = false; + } +} + +void +vu_queue_push(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, unsigned int len) +{ + vu_queue_fill(dev, vq, elem, len, 0); + vu_queue_flush(dev, vq, 1); +} diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h new file mode 100644 index 0000000000..156b50e989 --- /dev/null +++ b/contrib/libvhost-user/libvhost-user.h @@ -0,0 +1,435 @@ +/* + * Vhost User library + * + * Copyright (c) 2016 Red Hat, Inc. + * + * Authors: + * Victor Kaplansky + * Marc-André Lureau + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ */ + +#ifndef LIBVHOST_USER_H +#define LIBVHOST_USER_H + +#include +#include +#include +#include +#include "standard-headers/linux/virtio_ring.h" + +/* Based on qemu/hw/virtio/vhost-user.c */ +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#define VHOST_LOG_PAGE 4096 + +#define VHOST_MAX_NR_VIRTQUEUE 8 +#define VIRTQUEUE_MAX_SIZE 1024 + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + + VHOST_USER_PROTOCOL_F_MAX +}; + +#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_INPUT_GET_CONFIG = 20, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +#if defined(_WIN32) +# define VU_PACKED __attribute__((gcc_struct, packed)) +#else +# define VU_PACKED __attribute__((packed)) +#endif + +typedef struct VhostUserMsg { + 
VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + + union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + } payload; + + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fd_num; + uint8_t *data; +} VU_PACKED VhostUserMsg; + +typedef struct VuDevRegion { + /* Guest Physical address. */ + uint64_t gpa; + /* Memory region size. */ + uint64_t size; + /* QEMU virtual address (userspace). */ + uint64_t qva; + /* Starting offset in our mmaped space. */ + uint64_t mmap_offset; + /* Start address of mmaped space. */ + uint64_t mmap_addr; +} VuDevRegion; + +typedef struct VuDev VuDev; + +typedef uint64_t (*vu_get_features_cb) (VuDev *dev); +typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features); +typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg, + int *do_reply); +typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started); + +typedef struct VuDevIface { + /* called by VHOST_USER_GET_FEATURES to get the features bitmask */ + vu_get_features_cb get_features; + /* enable vhost implementation features */ + vu_set_features_cb set_features; + /* get the protocol feature bitmask from the underlying vhost + * implementation */ + vu_get_features_cb get_protocol_features; + /* enable protocol features in the underlying vhost implementation. 
*/ + vu_set_features_cb set_protocol_features; + /* process_msg is called for each vhost-user message received */ + /* skip libvhost-user processing if return value != 0 */ + vu_process_msg_cb process_msg; + /* tells when queues can be processed */ + vu_queue_set_started_cb queue_set_started; +} VuDevIface; + +typedef void (*vu_queue_handler_cb) (VuDev *dev, int qidx); + +typedef struct VuRing { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +} VuRing; + +typedef struct VuVirtq { + VuRing vring; + + /* Next head to pop */ + uint16_t last_avail_idx; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + + uint16_t used_idx; + + /* Last used index value we have signalled on */ + uint16_t signalled_used; + + /* Whether signalled_used holds a valid value */ + bool signalled_used_valid; + + /* Notification enabled? */ + bool notification; + + int inuse; + + vu_queue_handler_cb handler; + + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; +} VuVirtq; + +enum VuWatchCondtion { + VU_WATCH_IN = 1 << 0, + VU_WATCH_OUT = 1 << 1, + VU_WATCH_PRI = 1 << 2, + VU_WATCH_ERR = 1 << 3, + VU_WATCH_HUP = 1 << 4, +}; + +typedef void (*vu_panic_cb) (VuDev *dev, const char *err); +typedef void (*vu_watch_cb) (VuDev *dev, int condition, void *data); +typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition, + vu_watch_cb cb, void *data); +typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd); + +struct VuDev { + int sock; + uint32_t nregions; + VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; + VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE]; + int log_call_fd; + uint64_t log_size; + uint8_t *log_table; + uint64_t features; + uint64_t protocol_features; + bool broken; + + /* @set_watch: add or update the given fd to the watch set, + * call cb when condition is met */ + vu_set_watch_cb set_watch; + + /* @remove_watch: remove the given fd from the
watch set */ + vu_remove_watch_cb remove_watch; + + /* @panic: encountered an unrecoverable error, you may try to + * re-initialize */ + vu_panic_cb panic; + const VuDevIface *iface; +}; + +typedef struct VuVirtqElement { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +} VuVirtqElement; + +/** + * vu_init: + * @dev: a VuDev context + * @socket: the socket connected to vhost-user master + * @panic: a panic callback + * @set_watch: a set_watch callback + * @remove_watch: a remove_watch callback + * @iface: a VuDevIface structure with vhost-user device callbacks + * + * Initializes a VuDev vhost-user context. + **/ +void vu_init(VuDev *dev, + int socket, + vu_panic_cb panic, + vu_set_watch_cb set_watch, + vu_remove_watch_cb remove_watch, + const VuDevIface *iface); + + +/** + * vu_deinit: + * @dev: a VuDev context + * + * Cleans up the VuDev context + */ +void vu_deinit(VuDev *dev); + +/** + * vu_dispatch: + * @dev: a VuDev context + * + * Process one vhost-user message. + * + * Returns: TRUE on success, FALSE on failure. + */ +bool vu_dispatch(VuDev *dev); + +/** + * vu_gpa_to_va: + * @dev: a VuDev context + * @guest_addr: guest address + * + * Translate a guest address to a pointer. Returns NULL on failure. + */ +void *vu_gpa_to_va(VuDev *dev, uint64_t guest_addr); + +/** + * vu_get_queue: + * @dev: a VuDev context + * @qidx: queue index + * + * Returns the queue number @qidx. + */ +VuVirtq *vu_get_queue(VuDev *dev, int qidx); + +/** + * vu_set_queue_handler: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @handler: the queue handler callback + * + * Set the queue handler. This function may be called several times + * for the same queue. If called with NULL @handler, the handler is + * removed.
+ */ +void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, + vu_queue_handler_cb handler); + + +/** + * vu_queue_set_notification: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @enable: state + * + * Set whether the queue notifies (via event index or interrupt) + */ +void vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable); + +/** + * vu_queue_enabled: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * + * Returns: whether the queue is enabled. + */ +bool vu_queue_enabled(VuDev *dev, VuVirtq *vq); + +/** + * vu_queue_empty: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * + * Returns: whether the queue is empty. + */ +int vu_queue_empty(VuDev *dev, VuVirtq *vq); + +/** + * vu_queue_notify: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * + * Request to notify the queue via callfd (skipped if unnecessary) + */ +void vu_queue_notify(VuDev *dev, VuVirtq *vq); + +/** + * vu_queue_pop: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @sz: the size of struct to return (must be >= VuVirtqElement) + * + * Returns: a VuVirtqElement filled from the queue or NULL. + */ +void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz); + +/** + * vu_queue_rewind: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @num: number of elements to push back + * + * Pretend that elements weren't popped from the virtqueue. The next + * vu_queue_pop() will refetch the oldest element. + * + * Returns: true on success, false if @num is greater than the number of in use + * elements. + */ +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num); + +/** + * vu_queue_fill: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @elem: a VuVirtqElement + * @len: length in bytes to write + * @idx: optional offset for the used ring index (0 in general) + * + * Fill the used ring with @elem element.
+ */ +void vu_queue_fill(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, + unsigned int len, unsigned int idx); + +/** + * vu_queue_push: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @elem: a VuVirtqElement + * @len: length in bytes to write + * + * Helper that combines vu_queue_fill() with a vu_queue_flush(). + */ +void vu_queue_push(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, unsigned int len); + +/** + * vu_queue_flush: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @num: number of elements to flush + * + * Mark the last number of elements as done (used.idx is updated by + * num elements). +*/ +void vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int num); + +/** + * vu_queue_get_avail_bytes: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @in_bytes: in bytes + * @out_bytes: out bytes + * @max_in_bytes: stop counting after max_in_bytes + * @max_out_bytes: stop counting after max_out_bytes + * + * Count the number of available bytes, up to max_in_bytes/max_out_bytes. + */ +void vu_queue_get_avail_bytes(VuDev *vdev, VuVirtq *vq, unsigned int *in_bytes, + unsigned int *out_bytes, + unsigned max_in_bytes, unsigned max_out_bytes); + +/** + * vu_queue_avail_bytes: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @in_bytes: expected in bytes + * @out_bytes: expected out bytes + * + * Returns: true if in_bytes <= in_total && out_bytes <= out_total + */ +bool vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, + unsigned int out_bytes); + +#endif /* LIBVHOST_USER_H */ From e10e798c85c2331dab338b6a01835ebde81136e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Tue, 18 Oct 2016 12:24:05 +0300 Subject: [PATCH 6/9] tests/vhost-user-bridge: use contrib/libvhost-user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the libvhost-user library. 
This ended up being a rather large patch that cannot be easily splitted, due to massive code move and API changes. Signed-off-by: Marc-André Lureau Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/Makefile.include | 2 +- tests/vhost-user-bridge.c | 1311 +++++++++---------------------------- 2 files changed, 296 insertions(+), 1017 deletions(-) diff --git a/tests/Makefile.include b/tests/Makefile.include index e98d3b6bb3..6554ef877b 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -687,7 +687,7 @@ tests/test-filter-mirror$(EXESUF): tests/test-filter-mirror.o $(qtest-obj-y) tests/test-filter-redirector$(EXESUF): tests/test-filter-redirector.o $(qtest-obj-y) tests/test-x86-cpuid-compat$(EXESUF): tests/test-x86-cpuid-compat.o $(qtest-obj-y) tests/ivshmem-test$(EXESUF): tests/ivshmem-test.o contrib/ivshmem-server/ivshmem-server.o $(libqos-pc-obj-y) -tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o +tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o contrib/libvhost-user/libvhost-user.o $(test-util-obj-y) tests/test-uuid$(EXESUF): tests/test-uuid.o $(test-util-obj-y) tests/test-arm-mptimer$(EXESUF): tests/test-arm-mptimer.o diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 5b618f670a..8618c20d53 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -30,17 +30,9 @@ #define _FILE_OFFSET_BITS 64 #include "qemu/osdep.h" -#include -#include -#include -#include -#include -#include -#include - -#include "qemu/atomic.h" +#include "qemu/iov.h" #include "standard-headers/linux/virtio_net.h" -#include "standard-headers/linux/virtio_ring.h" +#include "contrib/libvhost-user/libvhost-user.h" #define VHOST_USER_BRIDGE_DEBUG 1 @@ -64,6 +56,17 @@ typedef struct Dispatcher { Event events[FD_SETSIZE]; } Dispatcher; +typedef struct VubrDev { + VuDev vudev; + Dispatcher dispatcher; + int backend_udp_sock; + struct sockaddr_in backend_udp_dest; + int hdrlen; + int sock; + int ready; + 
int quit; +} VubrDev; + static void vubr_die(const char *s) { @@ -155,1037 +158,314 @@ dispatcher_wait(Dispatcher *dispr, uint32_t timeout) return 0; } -typedef struct VubrVirtq { - int call_fd; - int kick_fd; - uint32_t size; - uint16_t last_avail_index; - uint16_t last_used_index; - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; - uint64_t log_guest_addr; - int enable; -} VubrVirtq; - -/* Based on qemu/hw/virtio/vhost-user.c */ - -#define VHOST_MEMORY_MAX_NREGIONS 8 -#define VHOST_USER_F_PROTOCOL_FEATURES 30 -/* v1.0 compliant. */ -#define VIRTIO_F_VERSION_1 32 - -#define VHOST_LOG_PAGE 4096 - -enum VhostUserProtocolFeature { - VHOST_USER_PROTOCOL_F_MQ = 0, - VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, - VHOST_USER_PROTOCOL_F_RARP = 2, - - VHOST_USER_PROTOCOL_F_MAX -}; - -#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) - -typedef enum VhostUserRequest { - VHOST_USER_NONE = 0, - VHOST_USER_GET_FEATURES = 1, - VHOST_USER_SET_FEATURES = 2, - VHOST_USER_SET_OWNER = 3, - VHOST_USER_RESET_OWNER = 4, - VHOST_USER_SET_MEM_TABLE = 5, - VHOST_USER_SET_LOG_BASE = 6, - VHOST_USER_SET_LOG_FD = 7, - VHOST_USER_SET_VRING_NUM = 8, - VHOST_USER_SET_VRING_ADDR = 9, - VHOST_USER_SET_VRING_BASE = 10, - VHOST_USER_GET_VRING_BASE = 11, - VHOST_USER_SET_VRING_KICK = 12, - VHOST_USER_SET_VRING_CALL = 13, - VHOST_USER_SET_VRING_ERR = 14, - VHOST_USER_GET_PROTOCOL_FEATURES = 15, - VHOST_USER_SET_PROTOCOL_FEATURES = 16, - VHOST_USER_GET_QUEUE_NUM = 17, - VHOST_USER_SET_VRING_ENABLE = 18, - VHOST_USER_SEND_RARP = 19, - VHOST_USER_MAX -} VhostUserRequest; - -typedef struct VhostUserMemoryRegion { - uint64_t guest_phys_addr; - uint64_t memory_size; - uint64_t userspace_addr; - uint64_t mmap_offset; -} VhostUserMemoryRegion; - -typedef struct VhostUserMemory { - uint32_t nregions; - uint32_t padding; - VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; -} VhostUserMemory; - -typedef struct VhostUserLog { - uint64_t mmap_size; - 
uint64_t mmap_offset; -} VhostUserLog; - -typedef struct VhostUserMsg { - VhostUserRequest request; - -#define VHOST_USER_VERSION_MASK (0x3) -#define VHOST_USER_REPLY_MASK (0x1<<2) - uint32_t flags; - uint32_t size; /* the following payload size */ - union { -#define VHOST_USER_VRING_IDX_MASK (0xff) -#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) - uint64_t u64; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - VhostUserMemory memory; - VhostUserLog log; - } payload; - int fds[VHOST_MEMORY_MAX_NREGIONS]; - int fd_num; -} QEMU_PACKED VhostUserMsg; - -#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) - -/* The version of the protocol we support */ -#define VHOST_USER_VERSION (0x1) - -#define MAX_NR_VIRTQUEUE (8) - -typedef struct VubrDevRegion { - /* Guest Physical address. */ - uint64_t gpa; - /* Memory region size. */ - uint64_t size; - /* QEMU virtual address (userspace). */ - uint64_t qva; - /* Starting offset in our mmaped space. */ - uint64_t mmap_offset; - /* Start address of mmaped space. 
*/ - uint64_t mmap_addr; -} VubrDevRegion; - -typedef struct VubrDev { - int sock; - Dispatcher dispatcher; - uint32_t nregions; - VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; - VubrVirtq vq[MAX_NR_VIRTQUEUE]; - int log_call_fd; - uint64_t log_size; - uint8_t *log_table; - int backend_udp_sock; - struct sockaddr_in backend_udp_dest; - int ready; - uint64_t features; - int hdrlen; -} VubrDev; - -static const char *vubr_request_str[] = { - [VHOST_USER_NONE] = "VHOST_USER_NONE", - [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", - [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", - [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", - [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", - [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", - [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", - [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", - [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", - [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", - [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", - [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", - [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", - [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", - [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", - [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", - [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", - [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", - [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", - [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", - [VHOST_USER_MAX] = "VHOST_USER_MAX", -}; - static void -print_buffer(uint8_t *buf, size_t len) +vubr_handle_tx(VuDev *dev, int qidx) { - int i; - printf("Raw buffer:\n"); - for (i = 0; i < len; i++) { - if (i % 16 == 0) { - printf("\n"); - } - if (i % 4 == 0) { - printf(" "); - } - printf("%02x ", buf[i]); - } - 
printf("\n............................................................\n"); -} + VuVirtq *vq = vu_get_queue(dev, qidx); + VubrDev *vubr = container_of(dev, VubrDev, vudev); + int hdrlen = vubr->hdrlen; + VuVirtqElement *elem = NULL; -/* Translate guest physical address to our virtual address. */ -static uint64_t -gpa_to_va(VubrDev *dev, uint64_t guest_addr) -{ - int i; + assert(qidx % 2); - /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - VubrDevRegion *r = &dev->regions[i]; + for (;;) { + ssize_t ret; + unsigned int out_num; + struct iovec sg[VIRTQUEUE_MAX_SIZE], *out_sg; - if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { - return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; - } - } - - assert(!"address not found in regions"); - return 0; -} - -/* Translate qemu virtual address to our virtual address. */ -static uint64_t -qva_to_va(VubrDev *dev, uint64_t qemu_addr) -{ - int i; - - /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - VubrDevRegion *r = &dev->regions[i]; - - if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { - return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; - } - } - - assert(!"address not found in regions"); - return 0; -} - -static void -vubr_message_read(int conn_fd, VhostUserMsg *vmsg) -{ - char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; - struct iovec iov = { - .iov_base = (char *)vmsg, - .iov_len = VHOST_USER_HDR_SIZE, - }; - struct msghdr msg = { - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = control, - .msg_controllen = sizeof(control), - }; - size_t fd_size; - struct cmsghdr *cmsg; - int rc; - - rc = recvmsg(conn_fd, &msg, 0); - - if (rc == 0) { - fprintf(stderr, "Peer disconnected.\n"); - exit(1); - } - if (rc < 0) { - vubr_die("recvmsg"); - } - - vmsg->fd_num = 0; - for (cmsg = CMSG_FIRSTHDR(&msg); - cmsg != NULL; - cmsg = CMSG_NXTHDR(&msg, cmsg)) - { - if (cmsg->cmsg_level == SOL_SOCKET && 
cmsg->cmsg_type == SCM_RIGHTS) { - fd_size = cmsg->cmsg_len - CMSG_LEN(0); - vmsg->fd_num = fd_size / sizeof(int); - memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); - break; - } - } - - if (vmsg->size > sizeof(vmsg->payload)) { - fprintf(stderr, - "Error: too big message request: %d, size: vmsg->size: %u, " - "while sizeof(vmsg->payload) = %zu\n", - vmsg->request, vmsg->size, sizeof(vmsg->payload)); - exit(1); - } - - if (vmsg->size) { - rc = read(conn_fd, &vmsg->payload, vmsg->size); - if (rc == 0) { - vubr_die("recvmsg"); - fprintf(stderr, "Peer disconnected.\n"); - exit(1); - } - if (rc < 0) { - vubr_die("recvmsg"); - } - - assert(rc == vmsg->size); - } -} - -static void -vubr_message_write(int conn_fd, VhostUserMsg *vmsg) -{ - int rc; - - do { - rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size); - } while (rc < 0 && errno == EINTR); - - if (rc < 0) { - vubr_die("write"); - } -} - -static void -vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len) -{ - int slen = sizeof(struct sockaddr_in); - - if (sendto(dev->backend_udp_sock, buf, len, 0, - (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) { - vubr_die("sendto()"); - } -} - -static int -vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen) -{ - int slen = sizeof(struct sockaddr_in); - int rc; - - rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0, - (struct sockaddr *) &dev->backend_udp_dest, - (socklen_t *)&slen); - if (rc == -1) { - vubr_die("recvfrom()"); - } - - return rc; -} - -static void -vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len) -{ - int hdrlen = dev->hdrlen; - DPRINT(" hdrlen = %d\n", dev->hdrlen); - - if (VHOST_USER_BRIDGE_DEBUG) { - print_buffer(buf, len); - } - vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen); -} - -/* Kick the log_call_fd if required. 
*/ -static void -vubr_log_kick(VubrDev *dev) -{ - if (dev->log_call_fd != -1) { - DPRINT("Kicking the QEMU's log...\n"); - eventfd_write(dev->log_call_fd, 1); - } -} - -/* Kick the guest if necessary. */ -static void -vubr_virtqueue_kick(VubrVirtq *vq) -{ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { - DPRINT("Kicking the guest...\n"); - eventfd_write(vq->call_fd, 1); - } -} - -static void -vubr_log_page(uint8_t *log_table, uint64_t page) -{ - DPRINT("Logged dirty guest page: %"PRId64"\n", page); - atomic_or(&log_table[page / 8], 1 << (page % 8)); -} - -static void -vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length) -{ - uint64_t page; - - if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) || - !dev->log_table || !length) { - return; - } - - assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8)); - - page = address / VHOST_LOG_PAGE; - while (page * VHOST_LOG_PAGE < address + length) { - vubr_log_page(dev->log_table, page); - page += VHOST_LOG_PAGE; - } - vubr_log_kick(dev); -} - -static void -vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) -{ - struct vring_desc *desc = vq->desc; - struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; - uint64_t log_guest_addr = vq->log_guest_addr; - int32_t remaining_len = len; - - unsigned int size = vq->size; - - uint16_t avail_index = atomic_mb_read(&avail->idx); - - /* We check the available descriptors before posting the - * buffer, so here we assume that enough available - * descriptors. 
*/ - assert(vq->last_avail_index != avail_index); - uint16_t a_index = vq->last_avail_index % size; - uint16_t u_index = vq->last_used_index % size; - uint16_t d_index = avail->ring[a_index]; - - int i = d_index; - uint32_t written_len = 0; - - do { - DPRINT("Post packet to guest on vq:\n"); - DPRINT(" size = %d\n", vq->size); - DPRINT(" last_avail_index = %d\n", vq->last_avail_index); - DPRINT(" last_used_index = %d\n", vq->last_used_index); - DPRINT(" a_index = %d\n", a_index); - DPRINT(" u_index = %d\n", u_index); - DPRINT(" d_index = %d\n", d_index); - DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr); - DPRINT(" desc[%d].len = %d\n", i, desc[i].len); - DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags); - DPRINT(" avail->idx = %d\n", avail_index); - DPRINT(" used->idx = %d\n", used->idx); - - if (!(desc[i].flags & VRING_DESC_F_WRITE)) { - /* FIXME: we should find writable descriptor. */ - fprintf(stderr, "Error: descriptor is not writable. Exiting.\n"); - exit(1); - } - - void *chunk_start = (void *)(uintptr_t)gpa_to_va(dev, desc[i].addr); - uint32_t chunk_len = desc[i].len; - uint32_t chunk_write_len = MIN(remaining_len, chunk_len); - - memcpy(chunk_start, buf + written_len, chunk_write_len); - vubr_log_write(dev, desc[i].addr, chunk_write_len); - remaining_len -= chunk_write_len; - written_len += chunk_write_len; - - if ((remaining_len == 0) || !(desc[i].flags & VRING_DESC_F_NEXT)) { + elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); + if (!elem) { break; } - i = desc[i].next; - } while (1); + out_num = elem->out_num; + out_sg = elem->out_sg; + if (out_num < 1) { + fprintf(stderr, "virtio-net header not in first element\n"); + break; + } + if (VHOST_USER_BRIDGE_DEBUG) { + iov_hexdump(out_sg, out_num, stderr, "TX:", 1024); + } - if (remaining_len > 0) { - fprintf(stderr, - "Too long packet for RX, remaining_len = %d, Dropping...\n", - remaining_len); + if (hdrlen) { + unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), + out_sg, out_num, + hdrlen, 
-1); + out_num = sg_num; + out_sg = sg; + } + + struct msghdr msg = { + .msg_name = (struct sockaddr *) &vubr->backend_udp_dest, + .msg_namelen = sizeof(struct sockaddr_in), + .msg_iov = out_sg, + .msg_iovlen = out_num, + }; + do { + ret = sendmsg(vubr->backend_udp_sock, &msg, 0); + } while (ret == -1 && (errno == EAGAIN || errno == EINTR)); + + if (ret == -1) { + vubr_die("sendmsg()"); + } + + vu_queue_push(dev, vq, elem, 0); + vu_queue_notify(dev, vq); + + free(elem); + elem = NULL; + } + + free(elem); +} + +static void +iov_restore_front(struct iovec *front, struct iovec *iov, size_t bytes) +{ + struct iovec *cur; + + for (cur = front; front != iov; cur++) { + bytes -= cur->iov_len; + } + + cur->iov_base -= bytes; + cur->iov_len += bytes; +} + +static void +iov_truncate(struct iovec *iov, unsigned iovc, size_t bytes) +{ + unsigned i; + + for (i = 0; i < iovc; i++, iov++) { + if (bytes < iov->iov_len) { + iov->iov_len = bytes; return; - } - - /* Add descriptor to the used ring. */ - used->ring[u_index].id = d_index; - used->ring[u_index].len = len; - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, ring[u_index]), - sizeof(used->ring[u_index])); - - vq->last_avail_index++; - vq->last_used_index++; - - atomic_mb_set(&used->idx, vq->last_used_index); - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, idx), - sizeof(used->idx)); - - /* Kick the guest if necessary. 
*/ - vubr_virtqueue_kick(vq); -} - -static int -vubr_process_desc(VubrDev *dev, VubrVirtq *vq) -{ - struct vring_desc *desc = vq->desc; - struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; - uint64_t log_guest_addr = vq->log_guest_addr; - - unsigned int size = vq->size; - - uint16_t a_index = vq->last_avail_index % size; - uint16_t u_index = vq->last_used_index % size; - uint16_t d_index = avail->ring[a_index]; - - uint32_t i, len = 0; - size_t buf_size = 4096; - uint8_t buf[4096]; - - DPRINT("Chunks: "); - i = d_index; - do { - void *chunk_start = (void *)(uintptr_t)gpa_to_va(dev, desc[i].addr); - uint32_t chunk_len = desc[i].len; - - assert(!(desc[i].flags & VRING_DESC_F_WRITE)); - - if (len + chunk_len < buf_size) { - memcpy(buf + len, chunk_start, chunk_len); - DPRINT("%d ", chunk_len); - } else { - fprintf(stderr, "Error: too long packet. Dropping...\n"); - break; } - len += chunk_len; - - if (!(desc[i].flags & VRING_DESC_F_NEXT)) { - break; - } - - i = desc[i].next; - } while (1); - DPRINT("\n"); - - if (!len) { - return -1; + bytes -= iov->iov_len; } - /* Add descriptor to the used ring. 
*/ - used->ring[u_index].id = d_index; - used->ring[u_index].len = len; - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, ring[u_index]), - sizeof(used->ring[u_index])); - - vubr_consume_raw_packet(dev, buf, len); - - return 0; -} - -static void -vubr_process_avail(VubrDev *dev, VubrVirtq *vq) -{ - struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; - uint64_t log_guest_addr = vq->log_guest_addr; - - while (vq->last_avail_index != atomic_mb_read(&avail->idx)) { - vubr_process_desc(dev, vq); - vq->last_avail_index++; - vq->last_used_index++; - } - - atomic_mb_set(&used->idx, vq->last_used_index); - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, idx), - sizeof(used->idx)); + assert(!"couldn't truncate iov"); } static void vubr_backend_recv_cb(int sock, void *ctx) { - VubrDev *dev = (VubrDev *) ctx; - VubrVirtq *rx_vq = &dev->vq[0]; - uint8_t buf[4096]; - struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf; - int hdrlen = dev->hdrlen; - int buflen = sizeof(buf); - int len; - - if (!dev->ready) { - return; - } + VubrDev *vubr = (VubrDev *) ctx; + VuDev *dev = &vubr->vudev; + VuVirtq *vq = vu_get_queue(dev, 0); + VuVirtqElement *elem = NULL; + struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; + struct virtio_net_hdr_mrg_rxbuf mhdr; + unsigned mhdr_cnt = 0; + int hdrlen = vubr->hdrlen; + int i = 0; + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); DPRINT(" hdrlen = %d\n", hdrlen); - uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx); - - /* If there is no available descriptors, just do nothing. - * The buffer will be handled by next arrived UDP packet, - * or next kick on receive virtq. 
*/ - if (rx_vq->last_avail_index == avail_index) { + if (!vu_queue_enabled(dev, vq) || + !vu_queue_avail_bytes(dev, vq, hdrlen, 0)) { DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n"); return; } - memset(buf, 0, hdrlen); - /* TODO: support mergeable buffers. */ - if (hdrlen == 12) - hdr->num_buffers = 1; - len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen); + do { + struct iovec *sg; + ssize_t ret, total = 0; + unsigned int num; - vubr_post_buffer(dev, rx_vq, buf, len + hdrlen); -} - -static void -vubr_kick_cb(int sock, void *ctx) -{ - VubrDev *dev = (VubrDev *) ctx; - eventfd_t kick_data; - ssize_t rc; - - rc = eventfd_read(sock, &kick_data); - if (rc == -1) { - vubr_die("eventfd_read()"); - } else { - DPRINT("Got kick_data: %016"PRIx64"\n", kick_data); - vubr_process_avail(dev, &dev->vq[1]); - } -} - -static int -vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; -} - -static int -vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - vmsg->payload.u64 = - ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VHOST_F_LOG_ALL) | - (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | - (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)); - - vmsg->size = sizeof(vmsg->payload.u64); - - DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - - /* Reply */ - return 1; -} - -static int -vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - - dev->features = vmsg->payload.u64; - if ((dev->features & (1ULL << VIRTIO_F_VERSION_1)) || - (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))) { - dev->hdrlen = 12; - } else { - dev->hdrlen = 10; - } - - return 0; -} - -static int -vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - return 0; -} - -static void -vubr_close_log(VubrDev *dev) -{ - if (dev->log_table) { - if (munmap(dev->log_table, dev->log_size) != 0) { - vubr_die("munmap()"); + elem = 
vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); + if (!elem) { + break; } - dev->log_table = 0; - } - if (dev->log_call_fd != -1) { - close(dev->log_call_fd); - dev->log_call_fd = -1; - } -} - -static int -vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - vubr_close_log(dev); - dev->ready = 0; - dev->features = 0; - return 0; -} - -static int -vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - int i; - VhostUserMemory *memory = &vmsg->payload.memory; - dev->nregions = memory->nregions; - - DPRINT("Nregions: %d\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VubrDevRegion *dev_region = &dev->regions[i]; - - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. 
*/ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED, - vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vubr_die("mmap"); + if (elem->in_num < 1) { + fprintf(stderr, "virtio-net contains no in buffers\n"); + break; } - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr); - close(vmsg->fds[i]); - } - - return 0; -} - -static int -vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - int fd; - uint64_t log_mmap_size, log_mmap_offset; - void *rc; - - assert(vmsg->fd_num == 1); - fd = vmsg->fds[0]; - - assert(vmsg->size == sizeof(vmsg->payload.log)); - log_mmap_offset = vmsg->payload.log.mmap_offset; - log_mmap_size = vmsg->payload.log.mmap_size; - DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); - DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); - - rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, - log_mmap_offset); - if (rc == MAP_FAILED) { - vubr_die("mmap"); - } - dev->log_table = rc; - dev->log_size = log_mmap_size; - - vmsg->size = sizeof(vmsg->payload.u64); - /* Reply */ - return 1; -} - -static int -vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - assert(vmsg->fd_num == 1); - dev->log_call_fd = vmsg->fds[0]; - DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); - return 0; -} - -static int -vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - unsigned int index = vmsg->payload.state.index; - unsigned int num = vmsg->payload.state.num; - - DPRINT("State.index: %d\n", index); - DPRINT("State.num: %d\n", num); - dev->vq[index].size = num; - return 0; -} - -static int -vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - struct vhost_vring_addr *vra = &vmsg->payload.addr; - unsigned int index = vra->index; - VubrVirtq *vq = &dev->vq[index]; - - DPRINT("vhost_vring_addr:\n"); - DPRINT(" index: %d\n", vra->index); - DPRINT(" flags: %d\n", vra->flags); - DPRINT(" 
desc_user_addr: 0x%016llx\n", vra->desc_user_addr); - DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr); - DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr); - DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr); - - vq->desc = (struct vring_desc *)(uintptr_t)qva_to_va(dev, vra->desc_user_addr); - vq->used = (struct vring_used *)(uintptr_t)qva_to_va(dev, vra->used_user_addr); - vq->avail = (struct vring_avail *)(uintptr_t)qva_to_va(dev, vra->avail_user_addr); - vq->log_guest_addr = vra->log_guest_addr; - - DPRINT("Setting virtq addresses:\n"); - DPRINT(" vring_desc at %p\n", vq->desc); - DPRINT(" vring_used at %p\n", vq->used); - DPRINT(" vring_avail at %p\n", vq->avail); - - vq->last_used_index = vq->used->idx; - - if (vq->last_avail_index != vq->used->idx) { - DPRINT("Last avail index != used index: %d != %d, resuming", - vq->last_avail_index, vq->used->idx); - vq->last_avail_index = vq->used->idx; - } - - return 0; -} - -static int -vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - unsigned int index = vmsg->payload.state.index; - unsigned int num = vmsg->payload.state.num; - - DPRINT("State.index: %d\n", index); - DPRINT("State.num: %d\n", num); - dev->vq[index].last_avail_index = num; - - return 0; -} - -static int -vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - unsigned int index = vmsg->payload.state.index; - - DPRINT("State.index: %d\n", index); - vmsg->payload.state.num = dev->vq[index].last_avail_index; - vmsg->size = sizeof(vmsg->payload.state); - /* FIXME: this is a work-around for a bug in QEMU enabling - * too early vrings. When protocol features are enabled, - * we have to respect * VHOST_USER_SET_VRING_ENABLE request. 
*/ - dev->ready = 0; - - if (dev->vq[index].call_fd != -1) { - close(dev->vq[index].call_fd); - dev->vq[index].call_fd = -1; - } - if (dev->vq[index].kick_fd != -1) { - close(dev->vq[index].kick_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd); - dev->vq[index].kick_fd = -1; - } - - /* Reply */ - return 1; -} - -static int -vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - uint64_t u64_arg = vmsg->payload.u64; - int index = u64_arg & VHOST_USER_VRING_IDX_MASK; - - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - - assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); - assert(vmsg->fd_num == 1); - - if (dev->vq[index].kick_fd != -1) { - close(dev->vq[index].kick_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd); - } - dev->vq[index].kick_fd = vmsg->fds[0]; - DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); - - if (index % 2 == 1) { - /* TX queue. */ - dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd, - dev, vubr_kick_cb); - - DPRINT("Waiting for kicks on fd: %d for vq: %d\n", - dev->vq[index].kick_fd, index); - } - /* We temporarily use this hack to determine that both TX and RX - * queues are set up and ready for processing. - * FIXME: we need to rely in VHOST_USER_SET_VRING_ENABLE and - * actual kicks. 
*/ - if (dev->vq[0].kick_fd != -1 && - dev->vq[1].kick_fd != -1) { - dev->ready = 1; - DPRINT("vhost-user-bridge is ready for processing queues.\n"); - } - return 0; - -} - -static int -vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - uint64_t u64_arg = vmsg->payload.u64; - int index = u64_arg & VHOST_USER_VRING_IDX_MASK; - - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); - assert(vmsg->fd_num == 1); - - if (dev->vq[index].call_fd != -1) { - close(dev->vq[index].call_fd); - } - dev->vq[index].call_fd = vmsg->fds[0]; - DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); - - return 0; -} - -static int -vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - return 0; -} - -static int -vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - vmsg->size = sizeof(vmsg->payload.u64); - - /* Reply */ - return 1; -} - -static int -vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - /* FIXME: unimplented */ - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - return 0; -} - -static int -vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; -} - -static int -vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - unsigned int index = vmsg->payload.state.index; - unsigned int enable = vmsg->payload.state.num; - - DPRINT("State.index: %d\n", index); - DPRINT("State.enable: %d\n", enable); - dev->vq[index].enable = enable; - return 0; -} - -static int -vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; -} - -static int -vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg) -{ - /* Print out generic part of the request. 
*/ - DPRINT( - "================== Vhost user message from QEMU ==================\n"); - DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request], - vmsg->request); - DPRINT("Flags: 0x%x\n", vmsg->flags); - DPRINT("Size: %d\n", vmsg->size); - - if (vmsg->fd_num) { - int i; - DPRINT("Fds:"); - for (i = 0; i < vmsg->fd_num; i++) { - DPRINT(" %d", vmsg->fds[i]); + sg = elem->in_sg; + num = elem->in_num; + if (i == 0) { + if (hdrlen == 12) { + mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), + sg, elem->in_num, + offsetof(typeof(mhdr), num_buffers), + sizeof(mhdr.num_buffers)); + } + iov_from_buf(sg, elem->in_num, 0, &hdr, sizeof hdr); + total += hdrlen; + assert(iov_discard_front(&sg, &num, hdrlen) == hdrlen); } - DPRINT("\n"); + + struct msghdr msg = { + .msg_name = (struct sockaddr *) &vubr->backend_udp_dest, + .msg_namelen = sizeof(struct sockaddr_in), + .msg_iov = sg, + .msg_iovlen = elem->in_num, + .msg_flags = MSG_DONTWAIT, + }; + do { + ret = recvmsg(vubr->backend_udp_sock, &msg, 0); + } while (ret == -1 && (errno == EINTR)); + + if (i == 0) { + iov_restore_front(elem->in_sg, sg, hdrlen); + } + + if (ret == -1) { + if (errno == EWOULDBLOCK) { + vu_queue_rewind(dev, vq, 1); + break; + } + + vubr_die("recvmsg()"); + } + + total += ret; + iov_truncate(elem->in_sg, elem->in_num, total); + vu_queue_fill(dev, vq, elem, total, i++); + + free(elem); + elem = NULL; + } while (false); /* could loop if DONTWAIT worked? 
*/ + + if (mhdr_cnt) { + mhdr.num_buffers = i; + iov_from_buf(mhdr_sg, mhdr_cnt, + 0, + &mhdr.num_buffers, sizeof mhdr.num_buffers); } - switch (vmsg->request) { - case VHOST_USER_NONE: - return vubr_none_exec(dev, vmsg); - case VHOST_USER_GET_FEATURES: - return vubr_get_features_exec(dev, vmsg); - case VHOST_USER_SET_FEATURES: - return vubr_set_features_exec(dev, vmsg); - case VHOST_USER_SET_OWNER: - return vubr_set_owner_exec(dev, vmsg); - case VHOST_USER_RESET_OWNER: - return vubr_reset_device_exec(dev, vmsg); - case VHOST_USER_SET_MEM_TABLE: - return vubr_set_mem_table_exec(dev, vmsg); - case VHOST_USER_SET_LOG_BASE: - return vubr_set_log_base_exec(dev, vmsg); - case VHOST_USER_SET_LOG_FD: - return vubr_set_log_fd_exec(dev, vmsg); - case VHOST_USER_SET_VRING_NUM: - return vubr_set_vring_num_exec(dev, vmsg); - case VHOST_USER_SET_VRING_ADDR: - return vubr_set_vring_addr_exec(dev, vmsg); - case VHOST_USER_SET_VRING_BASE: - return vubr_set_vring_base_exec(dev, vmsg); - case VHOST_USER_GET_VRING_BASE: - return vubr_get_vring_base_exec(dev, vmsg); - case VHOST_USER_SET_VRING_KICK: - return vubr_set_vring_kick_exec(dev, vmsg); - case VHOST_USER_SET_VRING_CALL: - return vubr_set_vring_call_exec(dev, vmsg); - case VHOST_USER_SET_VRING_ERR: - return vubr_set_vring_err_exec(dev, vmsg); - case VHOST_USER_GET_PROTOCOL_FEATURES: - return vubr_get_protocol_features_exec(dev, vmsg); - case VHOST_USER_SET_PROTOCOL_FEATURES: - return vubr_set_protocol_features_exec(dev, vmsg); - case VHOST_USER_GET_QUEUE_NUM: - return vubr_get_queue_num_exec(dev, vmsg); - case VHOST_USER_SET_VRING_ENABLE: - return vubr_set_vring_enable_exec(dev, vmsg); - case VHOST_USER_SEND_RARP: - return vubr_send_rarp_exec(dev, vmsg); + vu_queue_flush(dev, vq, i); + vu_queue_notify(dev, vq); - case VHOST_USER_MAX: - assert(vmsg->request != VHOST_USER_MAX); - } - return 0; + free(elem); } static void vubr_receive_cb(int sock, void *ctx) { - VubrDev *dev = (VubrDev *) ctx; - VhostUserMsg vmsg; - int 
reply_requested; + VubrDev *vubr = (VubrDev *)ctx; - vubr_message_read(sock, &vmsg); - reply_requested = vubr_execute_request(dev, &vmsg); - if (reply_requested) { - /* Set the version in the flags when sending the reply */ - vmsg.flags &= ~VHOST_USER_VERSION_MASK; - vmsg.flags |= VHOST_USER_VERSION; - vmsg.flags |= VHOST_USER_REPLY_MASK; - vubr_message_write(sock, &vmsg); + if (!vu_dispatch(&vubr->vudev)) { + fprintf(stderr, "Error while dispatching\n"); } } +typedef struct WatchData { + VuDev *dev; + vu_watch_cb cb; + void *data; +} WatchData; + +static void +watch_cb(int sock, void *ctx) +{ + struct WatchData *wd = ctx; + + wd->cb(wd->dev, VU_WATCH_IN, wd->data); +} + +static void +vubr_set_watch(VuDev *dev, int fd, int condition, + vu_watch_cb cb, void *data) +{ + VubrDev *vubr = container_of(dev, VubrDev, vudev); + static WatchData watches[FD_SETSIZE]; + struct WatchData *wd = &watches[fd]; + + wd->cb = cb; + wd->data = data; + wd->dev = dev; + dispatcher_add(&vubr->dispatcher, fd, wd, watch_cb); +} + +static void +vubr_remove_watch(VuDev *dev, int fd) +{ + VubrDev *vubr = container_of(dev, VubrDev, vudev); + + dispatcher_remove(&vubr->dispatcher, fd); +} + +static int +vubr_send_rarp_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply) +{ + switch (vmsg->request) { + case VHOST_USER_SEND_RARP: + *do_reply = vubr_send_rarp_exec(dev, vmsg); + return 1; + default: + /* let the library handle the rest */ + return 0; + } + + return 0; +} + +static void +vubr_set_features(VuDev *dev, uint64_t features) +{ + VubrDev *vubr = container_of(dev, VubrDev, vudev); + + if ((features & (1ULL << VIRTIO_F_VERSION_1)) || + (features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))) { + vubr->hdrlen = 12; + } else { + vubr->hdrlen = 10; + } +} + +static uint64_t +vubr_get_features(VuDev *dev) +{ + return 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE | + 1ULL << 
VIRTIO_NET_F_MRG_RXBUF; +} + +static void +vubr_queue_set_started(VuDev *dev, int qidx, bool started) +{ + VuVirtq *vq = vu_get_queue(dev, qidx); + + if (qidx % 2 == 1) { + vu_set_queue_handler(dev, vq, started ? vubr_handle_tx : NULL); + } +} + +static void +vubr_panic(VuDev *dev, const char *msg) +{ + VubrDev *vubr = container_of(dev, VubrDev, vudev); + + fprintf(stderr, "PANIC: %s\n", msg); + + dispatcher_remove(&vubr->dispatcher, dev->sock); + vubr->quit = 1; +} + +static const VuDevIface vuiface = { + .get_features = vubr_get_features, + .set_features = vubr_set_features, + .process_msg = vubr_process_msg, + .queue_set_started = vubr_queue_set_started, +}; + static void vubr_accept_cb(int sock, void *ctx) { @@ -1199,6 +479,14 @@ vubr_accept_cb(int sock, void *ctx) vubr_die("accept()"); } DPRINT("Got connection from remote peer on sock %d\n", conn_fd); + + vu_init(&dev->vudev, + conn_fd, + vubr_panic, + vubr_set_watch, + vubr_remove_watch, + &vuiface); + dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb); dispatcher_remove(&dev->dispatcher, sock); } @@ -1207,29 +495,10 @@ static VubrDev * vubr_new(const char *path, bool client) { VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev)); - dev->nregions = 0; - int i; struct sockaddr_un un; CallbackFunc cb; size_t len; - for (i = 0; i < MAX_NR_VIRTQUEUE; i++) { - dev->vq[i] = (VubrVirtq) { - .call_fd = -1, .kick_fd = -1, - .size = 0, - .last_avail_index = 0, .last_used_index = 0, - .desc = 0, .avail = 0, .used = 0, - .enable = 0, - }; - } - - /* Init log */ - dev->log_call_fd = -1; - dev->log_size = 0; - dev->log_table = 0; - dev->ready = 0; - dev->features = 0; - /* Get a UNIX socket. 
*/ dev->sock = socket(AF_UNIX, SOCK_STREAM, 0); if (dev->sock == -1) { @@ -1257,10 +526,17 @@ vubr_new(const char *path, bool client) if (connect(dev->sock, (struct sockaddr *)&un, len) == -1) { vubr_die("connect"); } + vu_init(&dev->vudev, + dev->sock, + vubr_panic, + vubr_set_watch, + vubr_remove_watch, + &vuiface); cb = vubr_receive_cb; } dispatcher_init(&dev->dispatcher); + dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev, cb); return dev; @@ -1341,7 +617,7 @@ vubr_backend_udp_setup(VubrDev *dev, static void vubr_run(VubrDev *dev) { - while (1) { + while (!dev->quit) { /* timeout 200ms */ dispatcher_wait(&dev->dispatcher, 200000); /* Here one can try polling strategy. */ @@ -1417,6 +693,9 @@ main(int argc, char *argv[]) vubr_backend_udp_setup(dev, lhost, lport, rhost, rport); vubr_run(dev); + + vu_deinit(&dev->vudev); + return 0; out: From d9429b84af2302b6e28bec3c52710cf67eda3cee Mon Sep 17 00:00:00 2001 From: Prasad J Pandit Date: Fri, 2 Dec 2016 16:46:26 +0530 Subject: [PATCH 7/9] i386: amd_iommu: fix MMIO register count and access IOMMU MMIO registers are divided in two groups by their offsets. Low offsets(<0x2000) registers are grouped into 'amdvi_mmio_low' table and higher offsets(>=0x2000) registers are grouped into 'amdvi_mmio_high' table. No of registers in each table is given by macro 'AMDVI_MMIO_REGS_LOW' and 'AMDVI_MMIO_REGS_HIGH' resp. Values of these two macros were swapped, resulting in an OOB access when reading 'amdvi_mmio_high' table. Correct these two macros. Also read from 'amdvi_mmio_low' table for lower address. Reported-by: Azureyang Signed-off-by: Prasad J Pandit Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. 
Tsirkin --- hw/i386/amd_iommu.c | 2 +- hw/i386/amd_iommu.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 47b79d9112..e0732ccaf1 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -562,7 +562,7 @@ static void amdvi_mmio_trace(hwaddr addr, unsigned size) trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); } else { index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index; - trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); + trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07); } } diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 884926e9e7..0d3dc6a9f2 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -49,8 +49,8 @@ #define AMDVI_CAPAB_INIT_TYPE (3 << 16) /* No. of used MMIO registers */ -#define AMDVI_MMIO_REGS_HIGH 8 -#define AMDVI_MMIO_REGS_LOW 7 +#define AMDVI_MMIO_REGS_HIGH 7 +#define AMDVI_MMIO_REGS_LOW 8 /* MMIO registers */ #define AMDVI_MMIO_DEVICE_TABLE 0x0000 From d93ddfb1f8fb72a7c175a8cf1028c639f769d105 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 12 Dec 2016 20:42:03 +0200 Subject: [PATCH 8/9] pci: fix error message for express slots PCI Express downstream slot has a single PCI slot behind it, using PCI_DEVFN(PCI_SLOT(devfn), 0) does not give you function 0 in cases such as ARI as well as some error cases. This is exactly what we are hitting: $ qemu-system-x86_64 -machine q35 -readconfig docs/q35-chipset.cfg -monitor stdio (qemu) device_add e1000e,bus=ich9-pcie-port-4,addr=00 (qemu) device_add e1000e,bus=ich9-pcie-port-4,addr=08 Segmentation fault (core dumped) The fix is to use the pci_get_function_0 API. Cc: qemu-stable@nongnu.org Signed-off-by: Michael S. 
Tsirkin Reported-by: Eduardo Habkost Tested-by: Cao jin Tested-by: Eduardo Habkost Reviewed-by: Eduardo Habkost --- hw/pci/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 24fae1689d..637d54549e 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -982,8 +982,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, pci_get_function_0(pci_dev)) { error_setg(errp, "PCI: slot %d function 0 already ocuppied by %s," " new func %s cannot be exposed to guest.", - PCI_SLOT(devfn), - bus->devices[PCI_DEVFN(PCI_SLOT(devfn), 0)]->name, + PCI_SLOT(pci_get_function_0(pci_dev)->devfn), + pci_get_function_0(pci_dev)->name, name); return NULL; From 2858bc68701e282c404ed04d65d4f065e4b40e52 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 15 Dec 2016 12:23:24 -0600 Subject: [PATCH 9/9] virtio: avoid using guest_notifier_mask in vhost-user mode Because guest mask notifier cannot be used in vhost-user mode, a boolean flag "use_guest_notifier_mask" was added in commit 5669655aafd to disable the use of guest mask notifier under virtio-pci. However this flag wasn't checked in other virtio devices, such as virtio-mmio. In our tests, it caused assertion error under "vhost-user + virtio-mmio". This patch addresses this problem by adding a check before guest_notifier_mask is called. Signed-off-by: Wei Huang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/s390x/virtio-ccw.c | 4 ++-- hw/virtio/virtio-mmio.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index f5c1d98192..07650683f7 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1098,7 +1098,7 @@ static int virtio_ccw_set_guest_notifier(VirtioCcwDevice *dev, int n, * We do not support individual masking for channel devices, so we * need to manually trigger any guest masking callbacks here. 
*/ - if (k->guest_notifier_mask) { + if (k->guest_notifier_mask && vdev->use_guest_notifier_mask) { k->guest_notifier_mask(vdev, n, false); } /* get lost events and re-inject */ @@ -1107,7 +1107,7 @@ static int virtio_ccw_set_guest_notifier(VirtioCcwDevice *dev, int n, event_notifier_set(notifier); } } else { - if (k->guest_notifier_mask) { + if (k->guest_notifier_mask && vdev->use_guest_notifier_mask) { k->guest_notifier_mask(vdev, n, true); } if (with_irqfd) { diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 17412cb7b5..60654dc19d 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -402,7 +402,7 @@ static int virtio_mmio_set_guest_notifier(DeviceState *d, int n, bool assign, event_notifier_cleanup(notifier); } - if (vdc->guest_notifier_mask) { + if (vdc->guest_notifier_mask && vdev->use_guest_notifier_mask) { vdc->guest_notifier_mask(vdev, n, !assign); }