qemu/hw/virtio/vhost-shadow-virtqueue.c
Eugenio Pérez e966c0b781 vhost: Add svq avail_handler callback
This allows external handlers to be aware of new buffers that the guest
places in the virtqueue.

When this callback is defined the ownership of the guest's virtqueue
element is transferred to the callback. This means that if the user
wants to forward the descriptor it needs to manually inject it. The
callback is also free to process the command by itself and use the
element with svq_push.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
2022-07-20 16:58:08 +08:00

748 lines
22 KiB
C

/*
* vhost shadow virtqueue
*
* SPDX-FileCopyrightText: Red Hat, Inc. 2021
* SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com>
*
* SPDX-License-Identifier: GPL-2.0-or-later
*/
#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"
/**
* Validate the transport device features that both guests can use with the SVQ
* and SVQs can use with the device.
*
* @dev_features: The features
* @errp: Error pointer
*/
bool vhost_svq_valid_features(uint64_t features, Error **errp)
{
bool ok = true;
uint64_t svq_features = features;
for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END;
++b) {
switch (b) {
case VIRTIO_F_ANY_LAYOUT:
continue;
case VIRTIO_F_ACCESS_PLATFORM:
/* SVQ trust in the host's IOMMU to translate addresses */
case VIRTIO_F_VERSION_1:
/* SVQ trust that the guest vring is little endian */
if (!(svq_features & BIT_ULL(b))) {
svq_features |= BIT_ULL(b);
ok = false;
}
continue;
default:
if (svq_features & BIT_ULL(b)) {
svq_features &= ~BIT_ULL(b);
ok = false;
}
}
}
if (!ok) {
error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64
", ok: 0x%"PRIx64, features, svq_features);
}
return ok;
}
/**
* Number of descriptors that the SVQ can make available from the guest.
*
* @svq: The svq
*/
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
}
/**
* Translate addresses between the qemu's virtual address and the SVQ IOVA
*
* @svq: Shadow VirtQueue
* @vaddr: Translated IOVA addresses
* @iovec: Source qemu's VA addresses
* @num: Length of iovec and minimum length of vaddr
*/
static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
hwaddr *addrs, const struct iovec *iovec,
size_t num)
{
if (num == 0) {
return true;
}
for (size_t i = 0; i < num; ++i) {
DMAMap needle = {
.translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base,
.size = iovec[i].iov_len,
};
Int128 needle_last, map_last;
size_t off;
const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
/*
* Map cannot be NULL since iova map contains all guest space and
* qemu already has a physical address mapped
*/
if (unlikely(!map)) {
qemu_log_mask(LOG_GUEST_ERROR,
"Invalid address 0x%"HWADDR_PRIx" given by guest",
needle.translated_addr);
return false;
}
off = needle.translated_addr - map->translated_addr;
addrs[i] = map->iova + off;
needle_last = int128_add(int128_make64(needle.translated_addr),
int128_make64(iovec[i].iov_len));
map_last = int128_make64(map->translated_addr + map->size);
if (unlikely(int128_gt(needle_last, map_last))) {
qemu_log_mask(LOG_GUEST_ERROR,
"Guest buffer expands over iova range");
return false;
}
}
return true;
}
/**
* Write descriptors to SVQ vring
*
* @svq: The shadow virtqueue
* @sg: Cache for hwaddr
* @iovec: The iovec from the guest
* @num: iovec length
* @more_descs: True if more descriptors come in the chain
* @write: True if they are writeable descriptors
*
* Return true if success, false otherwise and print error.
*/
static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
const struct iovec *iovec, size_t num,
bool more_descs, bool write)
{
uint16_t i = svq->free_head, last = svq->free_head;
unsigned n;
uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
vring_desc_t *descs = svq->vring.desc;
bool ok;
if (num == 0) {
return true;
}
ok = vhost_svq_translate_addr(svq, sg, iovec, num);
if (unlikely(!ok)) {
return false;
}
for (n = 0; n < num; n++) {
if (more_descs || (n + 1 < num)) {
descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
descs[i].next = cpu_to_le16(svq->desc_next[i]);
} else {
descs[i].flags = flags;
}
descs[i].addr = cpu_to_le64(sg[n]);
descs[i].len = cpu_to_le32(iovec[n].iov_len);
last = i;
i = cpu_to_le16(svq->desc_next[i]);
}
svq->free_head = le16_to_cpu(svq->desc_next[last]);
return true;
}
static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
const struct iovec *out_sg, size_t out_num,
const struct iovec *in_sg, size_t in_num,
unsigned *head)
{
unsigned avail_idx;
vring_avail_t *avail = svq->vring.avail;
bool ok;
g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
*head = svq->free_head;
/* We need some descriptors here */
if (unlikely(!out_num && !in_num)) {
qemu_log_mask(LOG_GUEST_ERROR,
"Guest provided element with no descriptors");
return false;
}
ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, in_num > 0,
false);
if (unlikely(!ok)) {
return false;
}
ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, false, true);
if (unlikely(!ok)) {
return false;
}
/*
* Put the entry in the available array (but don't update avail->idx until
* they do sync).
*/
avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1);
avail->ring[avail_idx] = cpu_to_le16(*head);
svq->shadow_avail_idx++;
/* Update the avail index after write the descriptor */
smp_wmb();
avail->idx = cpu_to_le16(svq->shadow_avail_idx);
return true;
}
static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
/*
* We need to expose the available array entries before checking the used
* flags
*/
smp_mb();
if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) {
return;
}
event_notifier_set(&svq->hdev_kick);
}
/**
* Add an element to a SVQ.
*
* The caller must check that there is enough slots for the new element. It
* takes ownership of the element: In case of failure not ENOSPC, it is free.
*
* Return -EINVAL if element is invalid, -ENOSPC if dev queue is full
*/
int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
size_t out_num, const struct iovec *in_sg, size_t in_num,
VirtQueueElement *elem)
{
unsigned qemu_head;
unsigned ndescs = in_num + out_num;
bool ok;
if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
return -ENOSPC;
}
ok = vhost_svq_add_split(svq, out_sg, out_num, in_sg, in_num, &qemu_head);
if (unlikely(!ok)) {
g_free(elem);
return -EINVAL;
}
svq->desc_state[qemu_head].elem = elem;
svq->desc_state[qemu_head].ndescs = ndescs;
vhost_svq_kick(svq);
return 0;
}
/* Convenience wrapper to add a guest's element to SVQ */
static int vhost_svq_add_element(VhostShadowVirtqueue *svq,
VirtQueueElement *elem)
{
return vhost_svq_add(svq, elem->out_sg, elem->out_num, elem->in_sg,
elem->in_num, elem);
}
/**
* Forward available buffers.
*
* @svq: Shadow VirtQueue
*
* Note that this function does not guarantee that all guest's available
* buffers are available to the device in SVQ avail ring. The guest may have
* exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
* qemu vaddr.
*
* If that happens, guest's kick notifications will be disabled until the
* device uses some buffers.
*/
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
/* Clear event notifier */
event_notifier_test_and_clear(&svq->svq_kick);
/* Forward to the device as many available buffers as possible */
do {
virtio_queue_set_notification(svq->vq, false);
while (true) {
VirtQueueElement *elem;
int r;
if (svq->next_guest_avail_elem) {
elem = g_steal_pointer(&svq->next_guest_avail_elem);
} else {
elem = virtqueue_pop(svq->vq, sizeof(*elem));
}
if (!elem) {
break;
}
if (svq->ops) {
r = svq->ops->avail_handler(svq, elem, svq->ops_opaque);
} else {
r = vhost_svq_add_element(svq, elem);
}
if (unlikely(r != 0)) {
if (r == -ENOSPC) {
/*
* This condition is possible since a contiguous buffer in
* GPA does not imply a contiguous buffer in qemu's VA
* scatter-gather segments. If that happens, the buffer
* exposed to the device needs to be a chain of descriptors
* at this moment.
*
* SVQ cannot hold more available buffers if we are here:
* queue the current guest descriptor and ignore kicks
* until some elements are used.
*/
svq->next_guest_avail_elem = elem;
}
/* VQ is full or broken, just return and ignore kicks */
return;
}
}
virtio_queue_set_notification(svq->vq, true);
} while (!virtio_queue_empty(svq->vq));
}
/**
* Handle guest's kick.
*
* @n: guest kick event notifier, the one that guest set to notify svq.
*/
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
event_notifier_test_and_clear(n);
vhost_handle_guest_kick(svq);
}
static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
uint16_t *used_idx = &svq->vring.used->idx;
if (svq->last_used_idx != svq->shadow_used_idx) {
return true;
}
svq->shadow_used_idx = cpu_to_le16(*(volatile uint16_t *)used_idx);
return svq->last_used_idx != svq->shadow_used_idx;
}
/**
* Enable vhost device calls after disable them.
*
* @svq: The svq
*
* It returns false if there are pending used buffers from the vhost device,
* avoiding the possible races between SVQ checking for more work and enabling
* callbacks. True if SVQ used vring has no more pending buffers.
*/
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
/* Make sure the flag is written before the read of used_idx */
smp_mb();
return !vhost_svq_more_used(svq);
}
static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
}
static uint16_t vhost_svq_last_desc_of_chain(const VhostShadowVirtqueue *svq,
uint16_t num, uint16_t i)
{
for (uint16_t j = 0; j < (num - 1); ++j) {
i = le16_to_cpu(svq->desc_next[i]);
}
return i;
}
static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
uint32_t *len)
{
const vring_used_t *used = svq->vring.used;
vring_used_elem_t used_elem;
uint16_t last_used, last_used_chain, num;
if (!vhost_svq_more_used(svq)) {
return NULL;
}
/* Only get used array entries after they have been exposed by dev */
smp_rmb();
last_used = svq->last_used_idx & (svq->vring.num - 1);
used_elem.id = le32_to_cpu(used->ring[last_used].id);
used_elem.len = le32_to_cpu(used->ring[last_used].len);
svq->last_used_idx++;
if (unlikely(used_elem.id >= svq->vring.num)) {
qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
svq->vdev->name, used_elem.id);
return NULL;
}
if (unlikely(!svq->desc_state[used_elem.id].elem)) {
qemu_log_mask(LOG_GUEST_ERROR,
"Device %s says index %u is used, but it was not available",
svq->vdev->name, used_elem.id);
return NULL;
}
num = svq->desc_state[used_elem.id].ndescs;
last_used_chain = vhost_svq_last_desc_of_chain(svq, num, used_elem.id);
svq->desc_next[last_used_chain] = svq->free_head;
svq->free_head = used_elem.id;
*len = used_elem.len;
return g_steal_pointer(&svq->desc_state[used_elem.id].elem);
}
/**
* Push an element to SVQ, returning it to the guest.
*/
void vhost_svq_push_elem(VhostShadowVirtqueue *svq,
const VirtQueueElement *elem, uint32_t len)
{
virtqueue_push(svq->vq, elem, len);
if (svq->next_guest_avail_elem) {
/*
* Avail ring was full when vhost_svq_flush was called, so it's a
* good moment to make more descriptors available if possible.
*/
vhost_handle_guest_kick(svq);
}
}
static void vhost_svq_flush(VhostShadowVirtqueue *svq,
bool check_for_avail_queue)
{
VirtQueue *vq = svq->vq;
/* Forward as many used buffers as possible. */
do {
unsigned i = 0;
vhost_svq_disable_notification(svq);
while (true) {
uint32_t len;
g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
if (!elem) {
break;
}
if (unlikely(i >= svq->vring.num)) {
qemu_log_mask(LOG_GUEST_ERROR,
"More than %u used buffers obtained in a %u size SVQ",
i, svq->vring.num);
virtqueue_fill(vq, elem, len, i);
virtqueue_flush(vq, i);
return;
}
virtqueue_fill(vq, elem, len, i++);
}
virtqueue_flush(vq, i);
event_notifier_set(&svq->svq_call);
if (check_for_avail_queue && svq->next_guest_avail_elem) {
/*
* Avail ring was full when vhost_svq_flush was called, so it's a
* good moment to make more descriptors available if possible.
*/
vhost_handle_guest_kick(svq);
}
} while (!vhost_svq_enable_notification(svq));
}
/**
* Poll the SVQ for one device used buffer.
*
* This function race with main event loop SVQ polling, so extra
* synchronization is needed.
*
* Return the length written by the device.
*/
size_t vhost_svq_poll(VhostShadowVirtqueue *svq)
{
int64_t start_us = g_get_monotonic_time();
do {
uint32_t len;
VirtQueueElement *elem = vhost_svq_get_buf(svq, &len);
if (elem) {
return len;
}
if (unlikely(g_get_monotonic_time() - start_us > 10e6)) {
return 0;
}
/* Make sure we read new used_idx */
smp_rmb();
} while (true);
}
/**
* Forward used buffers.
*
* @n: hdev call event notifier, the one that device set to notify svq.
*
* Note that we are not making any buffers available in the loop, there is no
* way that it runs more than virtqueue size times.
*/
static void vhost_svq_handle_call(EventNotifier *n)
{
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
hdev_call);
event_notifier_test_and_clear(n);
vhost_svq_flush(svq, true);
}
/**
* Set the call notifier for the SVQ to call the guest
*
* @svq: Shadow virtqueue
* @call_fd: call notifier
*
* Called on BQL context.
*/
void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd)
{
if (call_fd == VHOST_FILE_UNBIND) {
/*
* Fail event_notifier_set if called handling device call.
*
* SVQ still needs device notifications, since it needs to keep
* forwarding used buffers even with the unbind.
*/
memset(&svq->svq_call, 0, sizeof(svq->svq_call));
} else {
event_notifier_init_fd(&svq->svq_call, call_fd);
}
}
/**
* Get the shadow vq vring address.
* @svq: Shadow virtqueue
* @addr: Destination to store address
*/
void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
struct vhost_vring_addr *addr)
{
addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc;
addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail;
addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used;
}
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq)
{
size_t desc_size = sizeof(vring_desc_t) * svq->vring.num;
size_t avail_size = offsetof(vring_avail_t, ring) +
sizeof(uint16_t) * svq->vring.num;
return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size());
}
size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq)
{
size_t used_size = offsetof(vring_used_t, ring) +
sizeof(vring_used_elem_t) * svq->vring.num;
return ROUND_UP(used_size, qemu_real_host_page_size());
}
/**
* Set a new file descriptor for the guest to kick the SVQ and notify for avail
*
* @svq: The svq
* @svq_kick_fd: The svq kick fd
*
* Note that the SVQ will never close the old file descriptor.
*/
void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
{
EventNotifier *svq_kick = &svq->svq_kick;
bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick);
bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND;
if (poll_stop) {
event_notifier_set_handler(svq_kick, NULL);
}
/*
* event_notifier_set_handler already checks for guest's notifications if
* they arrive at the new file descriptor in the switch, so there is no
* need to explicitly check for them.
*/
if (poll_start) {
event_notifier_init_fd(svq_kick, svq_kick_fd);
event_notifier_set(svq_kick);
event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
}
}
/**
* Start the shadow virtqueue operation.
*
* @svq: Shadow Virtqueue
* @vdev: VirtIO device
* @vq: Virtqueue to shadow
*/
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
VirtQueue *vq)
{
size_t desc_size, driver_size, device_size;
svq->next_guest_avail_elem = NULL;
svq->shadow_avail_idx = 0;
svq->shadow_used_idx = 0;
svq->last_used_idx = 0;
svq->vdev = vdev;
svq->vq = vq;
svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
driver_size = vhost_svq_driver_area_size(svq);
device_size = vhost_svq_device_area_size(svq);
svq->vring.desc = qemu_memalign(qemu_real_host_page_size(), driver_size);
desc_size = sizeof(vring_desc_t) * svq->vring.num;
svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
memset(svq->vring.desc, 0, driver_size);
svq->vring.used = qemu_memalign(qemu_real_host_page_size(), device_size);
memset(svq->vring.used, 0, device_size);
svq->desc_state = g_new0(SVQDescState, svq->vring.num);
svq->desc_next = g_new0(uint16_t, svq->vring.num);
for (unsigned i = 0; i < svq->vring.num - 1; i++) {
svq->desc_next[i] = cpu_to_le16(i + 1);
}
}
/**
* Stop the shadow virtqueue operation.
* @svq: Shadow Virtqueue
*/
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
event_notifier_set_handler(&svq->svq_kick, NULL);
g_autofree VirtQueueElement *next_avail_elem = NULL;
if (!svq->vq) {
return;
}
/* Send all pending used descriptors to guest */
vhost_svq_flush(svq, false);
for (unsigned i = 0; i < svq->vring.num; ++i) {
g_autofree VirtQueueElement *elem = NULL;
elem = g_steal_pointer(&svq->desc_state[i].elem);
if (elem) {
virtqueue_detach_element(svq->vq, elem, 0);
}
}
next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem);
if (next_avail_elem) {
virtqueue_detach_element(svq->vq, next_avail_elem, 0);
}
svq->vq = NULL;
g_free(svq->desc_next);
g_free(svq->desc_state);
qemu_vfree(svq->vring.desc);
qemu_vfree(svq->vring.used);
}
/**
* Creates vhost shadow virtqueue, and instructs the vhost device to use the
* shadow methods and file descriptors.
*
* @iova_tree: Tree to perform descriptors translations
* @ops: SVQ owner callbacks
* @ops_opaque: ops opaque pointer
*
* Returns the new virtqueue or NULL.
*
* In case of error, reason is reported through error_report.
*/
VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree,
const VhostShadowVirtqueueOps *ops,
void *ops_opaque)
{
g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1);
int r;
r = event_notifier_init(&svq->hdev_kick, 0);
if (r != 0) {
error_report("Couldn't create kick event notifier: %s (%d)",
g_strerror(errno), errno);
goto err_init_hdev_kick;
}
r = event_notifier_init(&svq->hdev_call, 0);
if (r != 0) {
error_report("Couldn't create call event notifier: %s (%d)",
g_strerror(errno), errno);
goto err_init_hdev_call;
}
event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
svq->iova_tree = iova_tree;
svq->ops = ops;
svq->ops_opaque = ops_opaque;
return g_steal_pointer(&svq);
err_init_hdev_call:
event_notifier_cleanup(&svq->hdev_kick);
err_init_hdev_kick:
return NULL;
}
/**
* Free the resources of the shadow virtqueue.
*
* @pvq: gpointer to SVQ so it can be used by autofree functions.
*/
void vhost_svq_free(gpointer pvq)
{
VhostShadowVirtqueue *vq = pvq;
vhost_svq_stop(vq);
event_notifier_cleanup(&vq->hdev_kick);
event_notifier_set_handler(&vq->hdev_call, NULL);
event_notifier_cleanup(&vq->hdev_call);
g_free(vq);
}