/*
 * qemu/hw/virtio/virtio-iommu.c
 *
 * Blame-view page header (not part of the source file):
 * "Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and
 * see the normal blame view."
 *
 * 1608 lines, 51 KiB, C -- Raw / Normal View / History
 */

/*
* virtio-iommu device
*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2 or later, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/iov.h"
#include "qemu/range.h"
#include "qemu/reserved-region.h"
#include "exec/target_page.h"
#include "hw/qdev-properties.h"
#include "hw/virtio/virtio.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/sysemu.h"
#include "qemu/reserved-region.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "standard-headers/linux/virtio_ids.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-iommu.h"
#include "hw/pci/pci_bus.h"
#include "hw/pci/pci.h"
/* Max size */
#define VIOMMU_DEFAULT_QUEUE_SIZE 256
#define VIOMMU_PROBE_SIZE 512
/*
 * A translation domain: a set of IOVA mappings shared by every endpoint
 * attached to it.
 */
typedef struct VirtIOIOMMUDomain {
/* Domain identifier; also the key of this domain in the domains tree. */
uint32_t id;
/* Set at creation from the ATTACH request; such a domain rejects MAP/UNMAP. */
bool bypass;
/* Tree keyed by VirtIOIOMMUInterval, holding VirtIOIOMMUMapping values. */
GTree *mappings;
/* Endpoints currently attached to this domain. */
QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list;
} VirtIOIOMMUDomain;
/* An endpoint (a device behind the IOMMU), keyed by its endpoint ID. */
typedef struct VirtIOIOMMUEndpoint {
/* Endpoint identifier; also the key of this endpoint in the endpoints tree. */
uint32_t id;
/* Domain the endpoint is attached to, or NULL when detached. */
VirtIOIOMMUDomain *domain;
/* IOMMU memory region of the device, as returned by virtio_iommu_mr(). */
IOMMUMemoryRegion *iommu_mr;
/* Linkage in the owning domain's endpoint_list. */
QLIST_ENTRY(VirtIOIOMMUEndpoint) next;
} VirtIOIOMMUEndpoint;
/*
 * IOVA range used as key in a domain's mapping tree. Both bounds are
 * inclusive (they carry the virt_start/virt_end of MAP/UNMAP requests).
 */
typedef struct VirtIOIOMMUInterval {
/* First address of the range. */
uint64_t low;
/* Last address of the range (inclusive). */
uint64_t high;
} VirtIOIOMMUInterval;
/* Value stored in a domain's mapping tree for one VirtIOIOMMUInterval. */
typedef struct VirtIOIOMMUMapping {
/* Address that the interval's low bound translates to. */
uint64_t phys_addr;
/* VIRTIO_IOMMU_MAP_F_* flags taken from the MAP request. */
uint32_t flags;
} VirtIOIOMMUMapping;
/*
 * Compose the endpoint identifier of @dev from its bus number and devfn
 * with PCI_BUILD_BDF().
 */
static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
{
    int bus_num = pci_bus_num(dev->bus);

    return PCI_BUILD_BDF(bus_num, dev->devfn);
}
/*
 * Tell whether DMA issued by @sdev currently bypasses translation.
 *
 * The device-wide config.bypass setting applies while the endpoint tree
 * does not exist, when the endpoint is unknown, or when it is not attached
 * to a domain. Otherwise the bypass flag of the attached domain is used.
 * The lookup is done with the device mutex held.
 */
static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev)
{
    VirtIOIOMMU *viommu = sdev->viommu;
    uint32_t sid = virtio_iommu_get_bdf(sdev);
    VirtIOIOMMUEndpoint *ep = NULL;
    bool bypassed;

    qemu_rec_mutex_lock(&viommu->mutex);

    /* need to check bypass before system reset */
    if (viommu->endpoints) {
        ep = g_tree_lookup(viommu->endpoints, GUINT_TO_POINTER(sid));
    }

    if (ep && ep->domain) {
        bypassed = ep->domain->bypass;
    } else {
        bypassed = viommu->config.bypass;
    }

    qemu_rec_mutex_unlock(&viommu->mutex);

    return bypassed;
}
/*
 * Enable either the IOMMU region or the bypass alias of @sdev, depending on
 * virtio_iommu_device_bypassed(), and disable the other one.
 *
 * Return whether the device is using IOMMU translation.
 */
static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev)
{
    MemoryRegion *to_disable, *to_enable;
    bool use_remapping;

    assert(sdev);

    use_remapping = !virtio_iommu_device_bypassed(sdev);

    trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus),
                                            PCI_SLOT(sdev->devfn),
                                            PCI_FUNC(sdev->devfn),
                                            use_remapping);

    if (use_remapping) {
        to_disable = &sdev->bypass_mr;
        to_enable = MEMORY_REGION(&sdev->iommu_mr);
    } else {
        to_disable = MEMORY_REGION(&sdev->iommu_mr);
        to_enable = &sdev->bypass_mr;
    }

    /* Turn off first then on the other */
    memory_region_set_enabled(to_disable, false);
    memory_region_set_enabled(to_enable, true);

    return use_remapping;
}
/*
 * Re-evaluate the bypass/translation choice for every device registered in
 * the as_by_busptr hash table.
 */
static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s)
{
    GHashTableIter bus_iter;
    IOMMUPciBus *pci_bus;
    int devfn;

    g_hash_table_iter_init(&bus_iter, s->as_by_busptr);
    while (g_hash_table_iter_next(&bus_iter, NULL, (void **)&pci_bus)) {
        for (devfn = 0; devfn < PCI_DEVFN_MAX; devfn++) {
            IOMMUDevice *sdev = pci_bus->pbdev[devfn];

            /* Slots are only populated for devices seen by find_add_as */
            if (sdev) {
                virtio_iommu_switch_address_space(sdev);
            }
        }
    }
}
/**
 * The bus number is used for lookup when SID based operations occur.
 * In that case we lazily populate the IOMMUPciBus array from the bus hash
 * table. At the time the IOMMUPciBus is created (iommu_find_add_as), the bus
 * numbers may not be always initialized yet.
 *
 * Returns the IOMMUPciBus whose bus currently has number @bus_num, or NULL
 * if no registered bus matches.
 */
static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num)
{
    IOMMUPciBus *candidate = s->iommu_pcibus_by_bus_num[bus_num];
    GHashTableIter iter;

    /* Fast path: already cached by a previous lookup */
    if (candidate) {
        return candidate;
    }

    g_hash_table_iter_init(&iter, s->as_by_busptr);
    while (g_hash_table_iter_next(&iter, NULL, (void **)&candidate)) {
        if (pci_bus_num(candidate->bus) == bus_num) {
            s->iommu_pcibus_by_bus_num[bus_num] = candidate;
            return candidate;
        }
    }

    return NULL;
}
/*
 * Return the IOMMU memory region of the device identified by @sid, or NULL
 * when either its bus or the device itself is not registered.
 */
static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid)
{
    uint8_t bus_n = PCI_BUS_NUM(sid);
    uint8_t devfn = sid & (PCI_DEVFN_MAX - 1);
    IOMMUPciBus *iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n);
    IOMMUDevice *dev;

    if (!iommu_pci_bus) {
        return NULL;
    }

    dev = iommu_pci_bus->pbdev[devfn];

    return dev ? &dev->iommu_mr : NULL;
}
/*
 * Ordering function of the mapping trees. An interval that lies entirely
 * below the other sorts first; intervals that overlap compare equal.
 * @user_data is unused.
 */
static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
    const VirtIOIOMMUInterval *lhs = a;
    const VirtIOIOMMUInterval *rhs = b;

    if (lhs->high < rhs->low) {
        return -1;
    }
    if (rhs->high < lhs->low) {
        return 1;
    }

    /* The two ranges share at least one address */
    return 0;
}
/*
 * Send @event to the notifiers of @mr for the inclusive range
 * [@virt_start, @virt_end].
 *
 * A range spanning the whole 64-bit space is sent as a single event.
 * Any other range is cut into chunks sized by dma_aligned_pow2_mask(),
 * one event per chunk. For events carrying a permission (map events) the
 * translated address advances together with the IOVA.
 */
static void virtio_iommu_notify_map_unmap(IOMMUMemoryRegion *mr,
                                          IOMMUTLBEvent *event,
                                          hwaddr virt_start, hwaddr virt_end)
{
    uint64_t span = virt_end - virt_start;
    hwaddr iova;

    event->entry.iova = virt_start;
    event->entry.addr_mask = span;

    if (span == UINT64_MAX) {
        memory_region_notify_iommu(mr, 0, *event);
    }

    /* With a full-space range virt_end + 1 wraps to virt_start: no loop */
    for (iova = virt_start; iova != virt_end + 1; ) {
        uint64_t chunk_mask = dma_aligned_pow2_mask(iova, virt_end, 64);

        event->entry.addr_mask = chunk_mask;
        event->entry.iova = iova;
        memory_region_notify_iommu(mr, 0, *event);

        iova += chunk_mask + 1;
        if (event->entry.perm != IOMMU_NONE) {
            event->entry.translated_addr += chunk_mask + 1;
        }
    }
}
/*
 * Notify a new mapping [@virt_start, @virt_end] -> @paddr on @mr.
 *
 * Nothing is sent when @mr has no MAP notifier registered, when the mapping
 * carries VIRTIO_IOMMU_MAP_F_MMIO, or when @flags grant neither read nor
 * write access.
 */
static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start,
                                    hwaddr virt_end, hwaddr paddr,
                                    uint32_t flags)
{
    IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ,
                                              flags & VIRTIO_IOMMU_MAP_F_WRITE);
    IOMMUTLBEvent event;

    if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_MAP)) {
        return;
    }
    if (flags & VIRTIO_IOMMU_MAP_F_MMIO) {
        return;
    }
    if (!perm) {
        return;
    }

    trace_virtio_iommu_notify_map(mr->parent_obj.name, virt_start, virt_end,
                                  paddr, perm);

    event.type = IOMMU_NOTIFIER_MAP;
    event.entry.target_as = &address_space_memory;
    event.entry.translated_addr = paddr;
    event.entry.perm = perm;

    virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
}
/*
 * Notify the invalidation of [@virt_start, @virt_end] on @mr. Nothing is
 * sent when @mr has no UNMAP notifier registered.
 */
static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start,
                                      hwaddr virt_end)
{
    IOMMUTLBEvent event;

    if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) {
        return;
    }

    trace_virtio_iommu_notify_unmap(mr->parent_obj.name, virt_start, virt_end);

    event.type = IOMMU_NOTIFIER_UNMAP;
    event.entry.target_as = &address_space_memory;
    event.entry.translated_addr = 0;
    event.entry.perm = IOMMU_NONE;

    virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
}
/*
 * g_tree_foreach() callback: send an unmap notification for one mapping.
 * @key is the VirtIOIOMMUInterval, @data the IOMMUMemoryRegion to notify;
 * @value is unused. Returns false so that the traversal continues.
 */
static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value,
                                             gpointer data)
{
    IOMMUMemoryRegion *target_mr = data;
    VirtIOIOMMUInterval *range = key;

    virtio_iommu_notify_unmap(target_mr, range->low, range->high);

    return false;
}
/*
 * g_tree_foreach() callback: send a map notification for one mapping.
 * @key is the VirtIOIOMMUInterval, @value the VirtIOIOMMUMapping and @data
 * the IOMMUMemoryRegion to notify. Returns false so that the traversal
 * continues.
 */
static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value,
                                           gpointer data)
{
    IOMMUMemoryRegion *target_mr = data;
    VirtIOIOMMUInterval *range = key;
    VirtIOIOMMUMapping *map = value;

    virtio_iommu_notify_map(target_mr, range->low, range->high,
                            map->phys_addr, map->flags);

    return false;
}
/*
 * Detach @ep from its domain, if it has one: notify the unmap of every
 * mapping of the domain on the endpoint's memory region, unlink the
 * endpoint from the domain and re-evaluate its address space. The domain
 * itself is not released here.
 */
static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
{
    VirtIOIOMMUDomain *old_domain = ep->domain;
    IOMMUDevice *sdev;

    if (!old_domain) {
        return;
    }

    g_tree_foreach(old_domain->mappings, virtio_iommu_notify_unmap_cb,
                   ep->iommu_mr);
    QLIST_REMOVE(ep, next);
    ep->domain = NULL;

    /* The endpoint is domain-less now; its bypass state may have changed */
    sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
    virtio_iommu_switch_address_space(sdev);
}
/*
 * Return the endpoint registered under @ep_id, creating and registering it
 * on first use. Returns NULL when no IOMMU memory region exists for @ep_id.
 */
static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
                                                      uint32_t ep_id)
{
    VirtIOIOMMUEndpoint *ep = g_tree_lookup(s->endpoints,
                                            GUINT_TO_POINTER(ep_id));
    IOMMUMemoryRegion *mr;

    if (ep) {
        return ep;
    }

    /* Unknown so far: only devices owning an IOMMU MR can be endpoints */
    mr = virtio_iommu_mr(s, ep_id);
    if (!mr) {
        return NULL;
    }

    ep = g_malloc0(sizeof(*ep));
    ep->iommu_mr = mr;
    ep->id = ep_id;
    trace_virtio_iommu_get_endpoint(ep_id);
    g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep);

    return ep;
}
/*
 * Release an endpoint: detach it from its domain when attached, then free
 * it. @data is the VirtIOIOMMUEndpoint; the gpointer signature matches a
 * GDestroyNotify.
 */
static void virtio_iommu_put_endpoint(gpointer data)
{
    VirtIOIOMMUEndpoint *endpoint = data;

    if (endpoint->domain) {
        virtio_iommu_detach_endpoint_from_domain(endpoint);
    }

    trace_virtio_iommu_put_endpoint(endpoint->id);
    g_free(endpoint);
}
/*
 * Return the domain registered under @domain_id, creating it on first use.
 *
 * An existing domain is only returned when its bypass mode equals @bypass;
 * NULL is returned on mismatch. A new domain gets an empty mapping tree
 * (which owns and frees its keys and values) and an empty endpoint list.
 */
static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s,
                                                  uint32_t domain_id,
                                                  bool bypass)
{
    VirtIOIOMMUDomain *domain = g_tree_lookup(s->domains,
                                              GUINT_TO_POINTER(domain_id));

    if (domain) {
        return domain->bypass == bypass ? domain : NULL;
    }

    domain = g_malloc0(sizeof(*domain));
    domain->id = domain_id;
    domain->bypass = bypass;
    domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
                                       NULL, (GDestroyNotify)g_free,
                                       (GDestroyNotify)g_free);
    QLIST_INIT(&domain->endpoint_list);

    g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain);
    trace_virtio_iommu_get_domain(domain_id);

    return domain;
}
/*
 * Release a domain: detach every endpoint still attached to it, destroy
 * its mapping tree and free it. @data is the VirtIOIOMMUDomain; the
 * gpointer signature matches a GDestroyNotify.
 */
static void virtio_iommu_put_domain(gpointer data)
{
    VirtIOIOMMUDomain *domain = data;
    VirtIOIOMMUEndpoint *ep, *next_ep;

    /* _SAFE variant: detaching unlinks the current element from the list */
    QLIST_FOREACH_SAFE(ep, &domain->endpoint_list, next, next_ep) {
        virtio_iommu_detach_endpoint_from_domain(ep);
    }

    g_tree_destroy(domain->mappings);
    trace_virtio_iommu_put_domain(domain->id);
    g_free(domain);
}
/*
 * Insert a private copy of each reserved region configured on the
 * virtio-iommu device (prop_resv_regions) into the reserved region list of
 * @sdev.
 */
static void add_prop_resv_regions(IOMMUDevice *sdev)
{
    VirtIOIOMMU *viommu = sdev->viommu;
    int idx;

    for (idx = 0; idx < viommu->nr_prop_resv_regions; idx++) {
        ReservedRegion *copy = g_new0(ReservedRegion, 1);

        *copy = viommu->prop_resv_regions[idx];
        sdev->resv_regions = resv_region_list_insert(sdev->resv_regions,
                                                     copy);
    }
}
/*
 * Return the DMA address space of device @devfn on @bus, creating the
 * per-bus and per-device state on first use. @opaque is the VirtIOIOMMU.
 *
 * A new device gets a root container holding two overlapping sub-regions,
 * an IOMMU region and an alias of system memory used for bypass, of which
 * exactly one is enabled by virtio_iommu_switch_address_space().
 */
static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
                                              int devfn)
{
    /* Counter making the memory region names unique */
    static uint32_t mr_index;
    VirtIOIOMMU *s = opaque;
    IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
    IOMMUDevice *sdev;
    char *name;

    if (!sbus) {
        sbus = g_malloc0(sizeof(IOMMUPciBus) +
                         sizeof(IOMMUDevice *) * PCI_DEVFN_MAX);
        sbus->bus = bus;
        g_hash_table_insert(s->as_by_busptr, bus, sbus);
    }

    sdev = sbus->pbdev[devfn];
    if (sdev) {
        return &sdev->as;
    }

    name = g_strdup_printf("%s-%d-%d",
                           TYPE_VIRTIO_IOMMU_MEMORY_REGION,
                           mr_index++, devfn);
    sdev = g_new0(IOMMUDevice, 1);
    sbus->pbdev[devfn] = sdev;

    sdev->viommu = s;
    sdev->bus = bus;
    sdev->devfn = devfn;

    trace_virtio_iommu_init_iommu_mr(name);

    memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
    address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
    add_prop_resv_regions(sdev);

    /*
     * Build the IOMMU disabled container with aliases to the
     * shared MRs.  Note that aliasing to a shared memory region
     * could help the memory API to detect same FlatViews so we
     * can have devices to share the same FlatView when in bypass
     * mode. (either by not configuring virtio-iommu driver or with
     * "iommu=pt").  It will greatly reduce the total number of
     * FlatViews of the system hence VM runs faster.
     */
    memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
                             "system", get_system_memory(), 0,
                             memory_region_size(get_system_memory()));

    memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr),
                             TYPE_VIRTIO_IOMMU_MEMORY_REGION,
                             OBJECT(s), name,
                             UINT64_MAX);

    /*
     * Hook both the containers under the root container, we
     * switch between iommu & bypass MRs by enable/disable
     * corresponding sub-containers
     */
    memory_region_add_subregion_overlap(&sdev->root, 0,
                                        MEMORY_REGION(&sdev->iommu_mr),
                                        0);
    memory_region_add_subregion_overlap(&sdev->root, 0,
                                        &sdev->bypass_mr, 0);

    virtio_iommu_switch_address_space(sdev);
    g_free(name);

    return &sdev->as;
}
hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps This patch modifies pci_setup_iommu() to set PCIIOMMUOps instead of setting PCIIOMMUFunc. PCIIOMMUFunc is used to get an address space for a PCI device in vendor specific way. The PCIIOMMUOps still offers this functionality. But using PCIIOMMUOps leaves space to add more iommu related vendor specific operations. Cc: Kevin Tian <kevin.tian@intel.com> Cc: Jacob Pan <jacob.jun.pan@linux.intel.com> Cc: Peter Xu <peterx@redhat.com> Cc: Eric Auger <eric.auger@redhat.com> Cc: Yi Sun <yi.y.sun@linux.intel.com> Cc: David Gibson <david@gibson.dropbear.id.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Eric Auger <eric.auger@redhat.com> Cc: Peter Maydell <peter.maydell@linaro.org> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Peter Xu <peterx@redhat.com> Cc: Jason Wang <jasowang@redhat.com> Cc: Andrey Smirnov <andrew.smirnov@gmail.com> Cc: Helge Deller <deller@gmx.de> Cc: Hervé Poussineau <hpoussin@reactos.org> Cc: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> Cc: BALATON Zoltan <balaton@eik.bme.hu> Cc: Elena Ufimtseva <elena.ufimtseva@oracle.com> Cc: Jagannathan Raman <jag.raman@oracle.com> Cc: Matthew Rosato <mjrosato@linux.ibm.com> Cc: Eric Farman <farman@linux.ibm.com> Cc: Halil Pasic <pasic@linux.ibm.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Thomas Huth <thuth@redhat.com> Cc: Helge Deller <deller@gmx.de> Reviewed-by: David Gibson <david@gibson.dropbear.id.au> Reviewed-by: Peter Xu <peterx@redhat.com> Signed-off-by: Yi Liu <yi.l.liu@intel.com> Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> Reviewed-by: Eric Auger <eric.auger@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> [ clg: - refreshed on latest QEMU - included hw/remote/iommu.c - documentation update - asserts in pci_setup_iommu() - removed checks on iommu_bus->iommu_ops->get_address_space - included Elroy PCI host (PA-RISC) ] Signed-off-by: Cédric Le Goater <clg@redhat.com>
2023-10-17 19:14:04 +03:00
static const PCIIOMMUOps virtio_iommu_ops = {
.get_address_space = virtio_iommu_find_add_as,
};
/*
 * Handle an ATTACH request: attach the endpoint to the requested domain,
 * creating endpoint and domain as needed.
 *
 * Returns VIRTIO_IOMMU_S_INVAL on unknown flags or when the domain exists
 * with a different bypass mode, VIRTIO_IOMMU_S_NOENT when the endpoint does
 * not exist, VIRTIO_IOMMU_S_OK otherwise. An endpoint that was attached to
 * another domain is detached from it before the new domain is looked up,
 * so it stays detached if the request then fails.
 */
static int virtio_iommu_attach(VirtIOIOMMU *s,
                               struct virtio_iommu_req_attach *req)
{
    uint32_t domain_id = le32_to_cpu(req->domain);
    uint32_t ep_id = le32_to_cpu(req->endpoint);
    uint32_t flags = le32_to_cpu(req->flags);
    VirtIOIOMMUDomain *old_domain, *new_domain;
    VirtIOIOMMUEndpoint *ep;
    IOMMUDevice *sdev;

    trace_virtio_iommu_attach(domain_id, ep_id);

    if (flags & ~VIRTIO_IOMMU_ATTACH_F_BYPASS) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    ep = virtio_iommu_get_endpoint(s, ep_id);
    if (!ep) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    old_domain = ep->domain;
    if (old_domain) {
        /*
         * the device is already attached to a domain,
         * detach it first
         */
        virtio_iommu_detach_endpoint_from_domain(ep);
        /* Drop the previous domain once its last endpoint is gone */
        if (QLIST_EMPTY(&old_domain->endpoint_list)) {
            g_tree_remove(s->domains, GUINT_TO_POINTER(old_domain->id));
        }
    }

    new_domain = virtio_iommu_get_domain(s, domain_id,
                                         flags & VIRTIO_IOMMU_ATTACH_F_BYPASS);
    if (!new_domain) {
        /* Incompatible bypass flag */
        return VIRTIO_IOMMU_S_INVAL;
    }

    QLIST_INSERT_HEAD(&new_domain->endpoint_list, ep, next);
    ep->domain = new_domain;

    sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
    virtio_iommu_switch_address_space(sdev);

    /* Replay domain mappings on the associated memory region */
    g_tree_foreach(new_domain->mappings, virtio_iommu_notify_map_cb,
                   ep->iommu_mr);

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Handle a DETACH request.
 *
 * Returns VIRTIO_IOMMU_S_NOENT when the endpoint is unknown,
 * VIRTIO_IOMMU_S_INVAL when it is not attached to the requested domain,
 * VIRTIO_IOMMU_S_OK otherwise. The domain is removed from the domains tree
 * when its last endpoint goes away.
 */
static int virtio_iommu_detach(VirtIOIOMMU *s,
                               struct virtio_iommu_req_detach *req)
{
    uint32_t domain_id = le32_to_cpu(req->domain);
    uint32_t ep_id = le32_to_cpu(req->endpoint);
    VirtIOIOMMUEndpoint *ep;
    VirtIOIOMMUDomain *attached;

    trace_virtio_iommu_detach(domain_id, ep_id);

    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
    if (!ep) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    attached = ep->domain;
    if (!attached || attached->id != domain_id) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    virtio_iommu_detach_endpoint_from_domain(ep);

    if (QLIST_EMPTY(&attached->endpoint_list)) {
        g_tree_remove(s->domains, GUINT_TO_POINTER(attached->id));
    }

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Handle a MAP request: record [virt_start, virt_end] -> phys_start in the
 * domain and notify every endpoint attached to it.
 *
 * Returns VIRTIO_IOMMU_S_INVAL on unknown flags, on a bypass domain or when
 * the range overlaps an existing mapping, VIRTIO_IOMMU_S_NOENT when the
 * domain does not exist, VIRTIO_IOMMU_S_OK otherwise.
 */
static int virtio_iommu_map(VirtIOIOMMU *s,
                            struct virtio_iommu_req_map *req)
{
    uint32_t domain_id = le32_to_cpu(req->domain);
    uint64_t phys_start = le64_to_cpu(req->phys_start);
    uint64_t virt_start = le64_to_cpu(req->virt_start);
    uint64_t virt_end = le64_to_cpu(req->virt_end);
    uint32_t flags = le32_to_cpu(req->flags);
    VirtIOIOMMUInterval probe = { .low = virt_start, .high = virt_end };
    VirtIOIOMMUInterval *interval;
    VirtIOIOMMUMapping *mapping;
    VirtIOIOMMUDomain *domain;
    VirtIOIOMMUEndpoint *ep;

    if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
    if (!domain) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    if (domain->bypass) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    /* The tree comparator reports any overlapping interval as a match */
    if (g_tree_lookup(domain->mappings, &probe)) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);

    /* Key and value are owned (and later freed) by the mapping tree */
    interval = g_malloc0(sizeof(*interval));
    interval->low = virt_start;
    interval->high = virt_end;

    mapping = g_malloc0(sizeof(*mapping));
    mapping->phys_addr = phys_start;
    mapping->flags = flags;

    g_tree_insert(domain->mappings, interval, mapping);

    QLIST_FOREACH(ep, &domain->endpoint_list, next) {
        virtio_iommu_notify_map(ep->iommu_mr, virt_start, virt_end, phys_start,
                                flags);
    }

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Handle an UNMAP request: remove every mapping fully contained in
 * [virt_start, virt_end] and notify the attached endpoints.
 *
 * Returns VIRTIO_IOMMU_S_NOENT when the domain does not exist,
 * VIRTIO_IOMMU_S_INVAL on a bypass domain, and VIRTIO_IOMMU_S_RANGE as
 * soon as a mapping is found that only partially overlaps the range
 * (mappings removed before that point stay removed).
 */
static int virtio_iommu_unmap(VirtIOIOMMU *s,
                              struct virtio_iommu_req_unmap *req)
{
    uint32_t domain_id = le32_to_cpu(req->domain);
    uint64_t virt_start = le64_to_cpu(req->virt_start);
    uint64_t virt_end = le64_to_cpu(req->virt_end);
    VirtIOIOMMUInterval range = { .low = virt_start, .high = virt_end };
    VirtIOIOMMUInterval *hit_key;
    VirtIOIOMMUMapping *hit_val;
    VirtIOIOMMUDomain *domain;
    VirtIOIOMMUEndpoint *ep;

    trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);

    domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
    if (!domain) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    if (domain->bypass) {
        return VIRTIO_IOMMU_S_INVAL;
    }

    /* Each lookup returns one mapping overlapping the requested range */
    while (g_tree_lookup_extended(domain->mappings, &range,
                                  (void **)&hit_key, (void **)&hit_val)) {
        uint64_t mapped_low = hit_key->low;
        uint64_t mapped_high = hit_key->high;

        /* A mapping can only be removed as a whole */
        if (range.low > mapped_low || range.high < mapped_high) {
            return VIRTIO_IOMMU_S_RANGE;
        }

        QLIST_FOREACH(ep, &domain->endpoint_list, next) {
            virtio_iommu_notify_unmap(ep->iommu_mr, mapped_low, mapped_high);
        }
        g_tree_remove(domain->mappings, hit_key);
        trace_virtio_iommu_unmap_done(domain_id, mapped_low, mapped_high);
    }

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Write one RESV_MEM probe property per reserved region of @sdev into @buf.
 *
 * @ep is only used for tracing and @free is the room available in @buf.
 * Returns the number of bytes written, or -ENOSPC (with nothing written)
 * when the properties do not fit.
 */
static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep,
                                               uint8_t *buf, size_t free)
{
    struct virtio_iommu_probe_resv_mem prop = {};
    const size_t prop_size = sizeof(prop);
    /* The length field does not count the property header itself */
    const size_t payload_len = prop_size - sizeof(prop.head);
    size_t total = prop_size * g_list_length(sdev->resv_regions);
    uint8_t *cursor = buf;
    GList *node;

    if (total > free) {
        return -ENOSPC;
    }

    for (node = sdev->resv_regions; node; node = node->next) {
        ReservedRegion *reg = node->data;
        Range *range = &reg->range;
        unsigned subtype = reg->type;

        assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED ||
               subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI);

        prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM);
        prop.head.length = cpu_to_le16(payload_len);
        prop.subtype = subtype;
        prop.start = cpu_to_le64(range_lob(range));
        prop.end = cpu_to_le64(range_upb(range));

        memcpy(cursor, &prop, prop_size);
        cursor += prop_size;

        trace_virtio_iommu_fill_resv_property(ep, prop.subtype,
                                              prop.start, prop.end);
    }

    return total;
}
/**
 * virtio_iommu_probe - Fill the probe request buffer with
 * the properties the device is able to return
 *
 * Returns VIRTIO_IOMMU_S_NOENT when the endpoint has no IOMMU memory
 * region, VIRTIO_IOMMU_S_INVAL when the properties do not fit in
 * VIOMMU_PROBE_SIZE bytes, VIRTIO_IOMMU_S_OK otherwise. On success the
 * device is flagged as probed (probe_done).
 */
static int virtio_iommu_probe(VirtIOIOMMU *s,
                              struct virtio_iommu_req_probe *req,
                              uint8_t *buf)
{
    uint32_t ep_id = le32_to_cpu(req->endpoint);
    IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id);
    size_t remaining = VIOMMU_PROBE_SIZE;
    IOMMUDevice *sdev;
    ssize_t written;

    if (!iommu_mr) {
        return VIRTIO_IOMMU_S_NOENT;
    }

    sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr);

    written = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, remaining);
    if (written < 0) {
        return VIRTIO_IOMMU_S_INVAL;
    }
    /* Keep the cursor and the remaining room in sync with what was written */
    buf += written;
    remaining -= written;

    sdev->probe_done = true;

    return VIRTIO_IOMMU_S_OK;
}
/*
 * Copy the first @payload_sz bytes of the @iov scatter list into @req.
 * Returns 0 on success, VIRTIO_IOMMU_S_INVAL when fewer bytes than
 * requested could be copied.
 */
static int virtio_iommu_iov_to_req(struct iovec *iov,
                                   unsigned int iov_cnt,
                                   void *req, size_t payload_sz)
{
    size_t copied = iov_to_buf(iov, iov_cnt, 0, req, payload_sz);

    return unlikely(copied != payload_sz) ? VIRTIO_IOMMU_S_INVAL : 0;
}
/*
 * Generate virtio_iommu_handle_<req>(): copy the request out of the iovec
 * (everything except the trailing struct virtio_iommu_req_tail, which the
 * device writes back) and pass it to virtio_iommu_<req>().
 *
 * The generated function returns VIRTIO_IOMMU_S_INVAL when the iovec is
 * shorter than the request payload, else the handler's status.
 */
#define virtio_iommu_handle_req(__req) \
static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \
struct iovec *iov, \
unsigned int iov_cnt) \
{ \
struct virtio_iommu_req_ ## __req req; \
int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, \
sizeof(req) - sizeof(struct virtio_iommu_req_tail));\
\
return ret ? ret : virtio_iommu_ ## __req(s, &req); \
}
/* Instantiate the wrappers for the requests that share this layout. */
virtio_iommu_handle_req(attach)
virtio_iommu_handle_req(detach)
virtio_iommu_handle_req(map)
virtio_iommu_handle_req(unmap)
/*
 * Copy a PROBE request out of @iov and run it, with the properties written
 * to @buf. Unlike the other request wrappers the whole struct
 * virtio_iommu_req_probe is read from the iovec. Returns
 * VIRTIO_IOMMU_S_INVAL when the iovec is too short, else the status of
 * virtio_iommu_probe().
 */
static int virtio_iommu_handle_probe(VirtIOIOMMU *s,
                                     struct iovec *iov,
                                     unsigned int iov_cnt,
                                     uint8_t *buf)
{
    struct virtio_iommu_req_probe req;
    int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req));

    if (ret) {
        return ret;
    }

    return virtio_iommu_probe(s, &req, buf);
}
/*
 * Request queue handler: pop and process every available request.
 *
 * Each element must provide at least a request head in its device-readable
 * part and a request tail in its device-writable part; otherwise the device
 * is marked broken with virtio_error() and processing stops. Requests are
 * dispatched on head.type with the device mutex held. The reply is the
 * tail alone, except for PROBE where it is a probe_size property buffer
 * followed by the tail.
 */
static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
    struct virtio_iommu_req_head head;
    struct virtio_iommu_req_tail tail = {};
    VirtQueueElement *elem;
    unsigned int iov_cnt;
    struct iovec *iov;
    void *buf = NULL;
    size_t sz;

    for (;;) {
        size_t output_size = sizeof(tail);

        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            return;
        }

        if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) ||
            iov_size(elem->out_sg, elem->out_num) < sizeof(head)) {
            virtio_error(vdev, "virtio-iommu bad head/tail size");
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            break;
        }

        iov_cnt = elem->out_num;
        iov = elem->out_sg;
        sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head));
        if (unlikely(sz != sizeof(head))) {
            tail.status = VIRTIO_IOMMU_S_DEVERR;
            goto out;
        }

        qemu_rec_mutex_lock(&s->mutex);
        switch (head.type) {
        case VIRTIO_IOMMU_T_ATTACH:
            tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_DETACH:
            tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_MAP:
            tail.status = virtio_iommu_handle_map(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_UNMAP:
            tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt);
            break;
        case VIRTIO_IOMMU_T_PROBE:
        {
            struct virtio_iommu_req_tail *ptail;

            /* The reply is the property buffer followed by the tail */
            output_size = s->config.probe_size + sizeof(tail);
            buf = g_malloc0(output_size);

            ptail = buf + s->config.probe_size;
            ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf);
            break;
        }
        default:
            tail.status = VIRTIO_IOMMU_S_UNSUPP;
        }
        qemu_rec_mutex_unlock(&s->mutex);

out:
        /*
         * NOTE(review): only sizeof(tail) bytes of device-writable space
         * are checked above, while a PROBE reply is
         * probe_size + sizeof(tail) bytes long. If iov_from_buf() copies
         * less than output_size the assert below fires -- confirm whether
         * a short PROBE buffer should be reported as a guest error instead.
         */
        sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
                          buf ? buf : &tail, output_size);
        assert(sz == output_size);

        virtqueue_push(vq, elem, sz);
        virtio_notify(vdev, vq);
        g_free(elem);
        g_free(buf);
        /*
         * buf is tested at "out:" and a later iteration may jump there
         * before allocating anything: never leave it dangling.
         */
        buf = NULL;
    }
}
/*
 * Report a translation fault to the guest through the event queue.
 *
 * The fault is dropped (with a one-time error message) when the guest
 * provided no event buffer; a buffer smaller than struct virtio_iommu_fault
 * marks the device broken with virtio_error().
 */
static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
                                      int flags, uint32_t endpoint,
                                      uint64_t address)
{
    VirtIODevice *vdev = &viommu->parent_obj;
    VirtQueue *event_vq = viommu->event_vq;
    struct virtio_iommu_fault fault;
    VirtQueueElement *elem;
    size_t written;

    elem = virtqueue_pop(event_vq, sizeof(VirtQueueElement));
    if (!elem) {
        error_report_once(
            "no buffer available in event queue to report event");
        return;
    }

    if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) {
        virtio_error(vdev, "error buffer of wrong size");
        virtqueue_detach_element(event_vq, elem, 0);
        g_free(elem);
        return;
    }

    /* Zero everything first so that the fields not set below read as 0 */
    memset(&fault, 0, sizeof(fault));
    fault.reason = reason;
    fault.flags = cpu_to_le32(flags);
    fault.endpoint = cpu_to_le32(endpoint);
    fault.address = cpu_to_le64(address);

    written = iov_from_buf(elem->in_sg, elem->in_num, 0,
                           &fault, sizeof(fault));
    assert(written == sizeof(fault));

    trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
    virtqueue_push(event_vq, elem, written);
    virtio_notify(vdev, event_vq);
    g_free(elem);
}
/*
 * Translate callback of the virtio-iommu memory region: resolve @addr for
 * the endpoint owning @mr.
 *
 * The returned entry starts as an identity mapping of one granule (lowest
 * set bit of config.page_size_mask) with IOMMU_NONE permission. Its perm
 * is set to @flag only when the access is allowed: bypass, an MSI reserved
 * region, or a mapping granting the requested access (in which case
 * translated_addr is updated too). Every rejected access is reported to
 * the guest with virtio_iommu_report_fault(). @iommu_idx is unused.
 */
static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
                                            IOMMUAccessFlags flag,
                                            int iommu_idx)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    VirtIOIOMMUInterval interval, *mapping_key;
    VirtIOIOMMUMapping *mapping_value;
    VirtIOIOMMU *s = sdev->viommu;
    bool read_fault, write_fault;
    VirtIOIOMMUEndpoint *ep;
    uint32_t sid, flags;
    bool bypass_allowed;
    int granule;
    bool found;
    GList *l;

    /*
     * Look up exactly @addr. Mapping bounds are inclusive and
     * interval_cmp() reports any overlap as a match, so a wider key such
     * as [addr, addr + 1] could also hit the mapping that starts at
     * addr + 1 (and would wrap around for the highest address).
     */
    interval.low = addr;
    interval.high = addr;

    granule = ctz64(s->config.page_size_mask);

    IOMMUTLBEntry entry = {
        .target_as = &address_space_memory,
        .iova = addr,
        .translated_addr = addr,
        .addr_mask = BIT_ULL(granule) - 1,
        .perm = IOMMU_NONE,
    };

    bypass_allowed = s->config.bypass;

    sid = virtio_iommu_get_bdf(sdev);

    trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
    qemu_rec_mutex_lock(&s->mutex);

    ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));

    /*
     * With global bypass enabled, translation is only expected for
     * endpoints attached to a non-bypass domain.
     */
    if (bypass_allowed) {
        assert(ep && ep->domain && !ep->domain->bypass);
    }

    if (!ep) {
        if (!bypass_allowed) {
            error_report_once("%s sid=%d is not known!!", __func__, sid);
            virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
                                      VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                      sid, addr);
        } else {
            entry.perm = flag;
        }
        goto unlock;
    }

    /* Reserved regions take precedence over the domain's mappings */
    for (l = sdev->resv_regions; l; l = l->next) {
        ReservedRegion *reg = l->data;

        if (range_contains(&reg->range, addr)) {
            switch (reg->type) {
            case VIRTIO_IOMMU_RESV_MEM_T_MSI:
                entry.perm = flag;
                break;
            case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
            default:
                virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
                                          VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                          sid, addr);
                break;
            }
            goto unlock;
        }
    }

    if (!ep->domain) {
        if (!bypass_allowed) {
            error_report_once("%s %02x:%02x.%01x not attached to any domain",
                              __func__, PCI_BUS_NUM(sid),
                              PCI_SLOT(sid), PCI_FUNC(sid));
            virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
                                      VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                      sid, addr);
        } else {
            entry.perm = flag;
        }
        goto unlock;
    } else if (ep->domain->bypass) {
        entry.perm = flag;
        goto unlock;
    }

    found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
                                   (void **)&mapping_key,
                                   (void **)&mapping_value);
    if (!found) {
        error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
                          __func__, addr, sid);
        virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
                                  VIRTIO_IOMMU_FAULT_F_ADDRESS,
                                  sid, addr);
        goto unlock;
    }

    /* Check the requested access against what the mapping grants */
    read_fault = (flag & IOMMU_RO) &&
                    !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ);
    write_fault = (flag & IOMMU_WO) &&
                    !(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE);

    flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0;
    flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0;
    if (flags) {
        error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d",
                          __func__, addr, flag, mapping_value->flags);
        flags |= VIRTIO_IOMMU_FAULT_F_ADDRESS;
        virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
                                  flags, sid, addr);
        goto unlock;
    }

    /* Same offset within the mapping on the translated side */
    entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr;
    entry.perm = flag;
    trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);

unlock:
    qemu_rec_mutex_unlock(&s->mutex);
    return entry;
}
/*
 * Copy the device configuration into @config_data, converting the
 * multi-byte fields to little endian.
 */
static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
    VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
    struct virtio_iommu_config *src = &dev->config;
    struct virtio_iommu_config *dst = (void *)config_data;

    dst->page_size_mask = cpu_to_le64(src->page_size_mask);
    dst->input_range.start = cpu_to_le64(src->input_range.start);
    dst->input_range.end = cpu_to_le64(src->input_range.end);
    dst->domain_range.start = cpu_to_le32(src->domain_range.start);
    dst->domain_range.end = cpu_to_le32(src->domain_range.end);
    dst->probe_size = cpu_to_le32(src->probe_size);
    /* Single byte: no conversion needed */
    dst->bypass = src->bypass;

    trace_virtio_iommu_get_config(src->page_size_mask,
                                  src->input_range.start,
                                  src->input_range.end,
                                  src->domain_range.start,
                                  src->domain_range.end,
                                  src->probe_size,
                                  src->bypass);
}
/*
 * Apply a configuration write. Only the bypass field is looked at.
 *
 * Changing it requires VIRTIO_IOMMU_F_BYPASS_CONFIG and a value of 0 or 1;
 * otherwise the device is marked broken with virtio_error(). An accepted
 * change re-evaluates the address space of every device.
 */
static void virtio_iommu_set_config(VirtIODevice *vdev,
                                    const uint8_t *config_data)
{
    VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
    struct virtio_iommu_config *current = &dev->config;
    const struct virtio_iommu_config *requested = (void *)config_data;
    bool bypass_changed = requested->bypass != current->bypass;

    if (bypass_changed) {
        if (!virtio_vdev_has_feature(vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
            virtio_error(vdev, "cannot set config.bypass");
            return;
        }
        if (requested->bypass != 0 && requested->bypass != 1) {
            virtio_error(vdev, "invalid config.bypass value '%u'",
                         requested->bypass);
            return;
        }

        current->bypass = requested->bypass;
        virtio_iommu_switch_address_space_all(dev);
    }

    trace_virtio_iommu_set_config(requested->bypass);
}
/*
 * Return @f with the feature bits of the device (dev->features) added.
 * @errp is unused.
 */
static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f,
                                          Error **errp)
{
    VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
    uint64_t offered = f | dev->features;

    trace_virtio_iommu_get_features(offered);

    return offered;
}
/*
 * Three-way comparison of two unsigned integers stored in pointers
 * (GUINT_TO_POINTER keys). Returns -1, 0 or 1; @user_data is unused.
 */
static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
    guint lhs = GPOINTER_TO_UINT(a);
    guint rhs = GPOINTER_TO_UINT(b);

    if (lhs < rhs) {
        return -1;
    }

    return lhs > rhs;
}
/*
 * g_tree_foreach() callback used on replay: trace and re-send the map
 * notification of one mapping. @key is the VirtIOIOMMUInterval, @value the
 * VirtIOIOMMUMapping and @data the IOMMUMemoryRegion to notify. Returns
 * false so that the traversal continues.
 */
static gboolean virtio_iommu_remap(gpointer key, gpointer value, gpointer data)
{
    IOMMUMemoryRegion *target_mr = data;
    VirtIOIOMMUInterval *range = key;
    VirtIOIOMMUMapping *map = value;

    trace_virtio_iommu_remap(target_mr->parent_obj.name, range->low,
                             range->high, map->phys_addr);
    virtio_iommu_notify_map(target_mr, range->low, range->high,
                            map->phys_addr, map->flags);

    return false;
}
/*
 * Replay callback: re-send a map notification for every mapping of the
 * domain the device owning @mr is attached to. Nothing is done while the
 * endpoint tree does not exist, or when the endpoint is unknown or
 * detached. @n is unused.
 */
static void virtio_iommu_replay(IOMMUMemoryRegion *mr, IOMMUNotifier *n)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    VirtIOIOMMU *viommu = sdev->viommu;
    uint32_t sid = virtio_iommu_get_bdf(sdev);
    VirtIOIOMMUEndpoint *ep;

    qemu_rec_mutex_lock(&viommu->mutex);

    if (viommu->endpoints) {
        ep = g_tree_lookup(viommu->endpoints, GUINT_TO_POINTER(sid));
        if (ep && ep->domain) {
            g_tree_foreach(ep->domain->mappings, virtio_iommu_remap, mr);
        }
    }

    qemu_rec_mutex_unlock(&viommu->mutex);
}
/*
 * Called when the set of notifier flags registered on @iommu_mr changes.
 *
 * Registration of a dev-iotlb UNMAP notifier is refused: @errp is set and
 * -EINVAL is returned. Otherwise the transition is only traced (first
 * notifier added / last notifier removed) and 0 is returned.
 *
 * @iommu_mr: the IOMMU memory region whose notifier flags changed
 * @old: flags before the change
 * @new: flags after the change
 * @errp: error handle
 */
static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
                                            IOMMUNotifierFlag old,
                                            IOMMUNotifierFlag new,
                                            Error **errp)
{
    if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
        error_setg(errp, "Virtio-iommu does not support dev-iotlb yet");
        return -EINVAL;
    }

    if (old == IOMMU_NOTIFIER_NONE) {
        trace_virtio_iommu_notify_flag_add(iommu_mr->parent_obj.name);
    } else if (new == IOMMU_NOTIFIER_NONE) {
        trace_virtio_iommu_notify_flag_del(iommu_mr->parent_obj.name);
    }

    return 0;
}
/*
 * Restrict the device page size mask with the mask reported for @mr.
 *
 * The default mask depends on the "granule" property. For example, with
 * 4k granule, it is -(4 * KiB). When an assigned device has page size
 * restrictions due to the hardware IOMMU configuration, apply this restriction
 * to the mask.
 *
 * Returns 0 on success, -1 with @errp set when @new_mask is incompatible
 * with the current mask or with the already frozen granule.
 */
static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
                                           uint64_t new_mask,
                                           Error **errp)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    VirtIOIOMMU *s = sdev->viommu;
    uint64_t cur_mask = s->config.page_size_mask;

    trace_virtio_iommu_set_page_size_mask(mr->parent_obj.name, cur_mask,
                                          new_mask);

    /* No page size in common at all */
    if (!(cur_mask & new_mask)) {
        error_setg(errp, "virtio-iommu %s reports a page size mask 0x%"PRIx64
                   " incompatible with currently supported mask 0x%"PRIx64,
                   mr->parent_obj.name, new_mask, cur_mask);
        return -1;
    }

    if (!s->granule_frozen) {
        s->config.page_size_mask &= new_mask;
        return 0;
    }

    /*
     * Once the granule is frozen we can't change the mask anymore. If by
     * chance the hotplugged device supports the same granule, we can still
     * accept it.
     */
    {
        int frozen_shift = ctz64(cur_mask);

        if (BIT_ULL(frozen_shift) & new_mask) {
            return 0;
        }

        error_setg(errp, "virtio-iommu %s does not support frozen granule 0x%llx",
                   mr->parent_obj.name, BIT_ULL(frozen_shift));
        return -1;
    }
}
/**
 * rebuild_resv_regions: rebuild resv regions with both the
 * info of host resv ranges and property set resv ranges
 *
 * The current sdev->resv_regions list is freed, then repopulated from
 * sdev->host_resv_ranges (each tagged RESERVED) and finally from the
 * property-defined regions. Always returns 0.
 */
static int rebuild_resv_regions(IOMMUDevice *sdev)
{
    GList *node;
    int index;

    /* Drop the previous list; it is rebuilt from scratch below */
    g_list_free_full(sdev->resv_regions, g_free);
    sdev->resv_regions = NULL;

    /* Host reserved regions go in first, all tagged as RESERVED */
    for (node = sdev->host_resv_ranges, index = 0; node;
         node = node->next, index++) {
        Range *host_range = node->data;
        ReservedRegion *region = g_new0(ReservedRegion, 1);

        region->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED;
        range_set_bounds(&region->range,
                         range_lob(host_range), range_upb(host_range));
        sdev->resv_regions = resv_region_list_insert(sdev->resv_regions,
                                                     region);
        trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name,
                                             index,
                                             range_lob(&region->range),
                                             range_upb(&region->range));
    }

    /*
     * Regions set by the machine through properties are added last because
     * they have higher priority.
     */
    add_prop_resv_regions(sdev);

    return 0;
}
/**
 * virtio_iommu_set_iova_ranges: Conveys the usable IOVA ranges
 *
 * The function turns those into reserved ranges. Once some
 * reserved ranges have been set, new reserved regions cannot be
 * added outside of the original ones.
 *
 * @mr: IOMMU MR
 * @iova_ranges: list of usable IOVA ranges
 * @errp: error handle
 *
 * Returns 0 on success, -EINVAL (with @errp set) when a new reserved range
 * is not contained in one of the already recorded host reserved ranges.
 */
static int virtio_iommu_set_iova_ranges(IOMMUMemoryRegion *mr,
                                        GList *iova_ranges,
                                        Error **errp)
{
    IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
    GList *known_ranges = sdev->host_resv_ranges;
    GList *candidates = NULL;
    GList *cand, *known;
    int ret = 0;

    if (!known_ranges) {
        /* First notification: record the host reserved ranges */
        if (sdev->probe_done) {
            warn_report("%s: Notified about new host reserved regions after probe",
                        mr->parent_obj.name);
        }

        range_inverse_array(iova_ranges,
                            &sdev->host_resv_ranges,
                            0, UINT64_MAX);
        rebuild_resv_regions(sdev);

        return 0;
    }

    /* check that each new resv region is included in an existing one */
    range_inverse_array(iova_ranges,
                        &candidates,
                        0, UINT64_MAX);

    for (cand = candidates; cand; cand = cand->next) {
        Range *new_range = cand->data;

        for (known = known_ranges; known; known = known->next) {
            if (range_contains_range((Range *)known->data, new_range)) {
                break;
            }
        }

        /* Inner loop ran to completion: nothing contains this range */
        if (!known) {
            error_setg(errp, "IOMMU mr=%s Conflicting host reserved ranges set!",
                       mr->parent_obj.name);
            ret = -EINVAL;
            break;
        }
    }

    g_list_free_full(candidates, g_free);

    return ret;
}
/*
 * System reset handler; @opaque is the VirtIOIOMMU instance.
 *
 * Clears the bus-number lookup table, puts config.bypass back to the
 * 'boot_bypass' value and re-evaluates all address spaces.
 */
static void virtio_iommu_system_reset(void *opaque)
{
    VirtIOIOMMU *viommu = opaque;

    trace_virtio_iommu_system_reset();

    memset(viommu->iommu_pcibus_by_bus_num, 0,
           sizeof(viommu->iommu_pcibus_by_bus_num));

    /*
     * config.bypass is sticky across device reset, but should be restored on
     * system reset
     */
    viommu->config.bypass = viommu->boot_bypass;
    virtio_iommu_switch_address_space_all(viommu);
}
/*
 * Machine-init-done notifier: freeze the granule.
 *
 * After this point 'granule_frozen' is set, which makes
 * virtio_iommu_set_page_size_mask() refuse any mask that does not contain
 * the current granule. @data is not used.
 */
static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
{
    VirtIOIOMMU *viommu = container_of(notifier, VirtIOIOMMU, machine_done);
    int shift;

    if (likely(viommu->config.bypass)) {
        /*
         * Transient IOMMU MR enable to collect page_size_mask requirements
         * through memory_region_iommu_set_page_size_mask() called by
         * VFIO region_add() callback
         */
        viommu->config.bypass = false;
        virtio_iommu_switch_address_space_all(viommu);

        /* restore default */
        viommu->config.bypass = true;
        virtio_iommu_switch_address_space_all(viommu);
    }

    viommu->granule_frozen = true;

    /* The granule is the lowest bit set in the page size mask */
    shift = ctz64(viommu->config.page_size_mask);
    trace_virtio_iommu_freeze_granule(BIT_ULL(shift));
}
/*
 * Realize the virtio-iommu device.
 *
 * All user-settable properties (aw-bits, granule, primary-bus) are validated
 * before any resource is set up, and each failure returns immediately with
 * @errp set. This guarantees that error_setg() is called at most once on
 * @errp and that a failed realize leaves no virtqueue, notifier or reset
 * handler registered.
 *
 * @dev: the device being realized
 * @errp: error handle
 */
static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
    uint64_t page_size_mask;

    if (s->aw_bits < 32 || s->aw_bits > 64) {
        error_setg(errp, "aw-bits must be within [32,64]");
        return;
    }

    switch (s->granule_mode) {
    case GRANULE_MODE_4K:
        page_size_mask = -(4 * KiB);
        break;
    case GRANULE_MODE_8K:
        page_size_mask = -(8 * KiB);
        break;
    case GRANULE_MODE_16K:
        page_size_mask = -(16 * KiB);
        break;
    case GRANULE_MODE_64K:
        page_size_mask = -(64 * KiB);
        break;
    case GRANULE_MODE_HOST:
        page_size_mask = qemu_real_host_page_mask();
        break;
    default:
        error_setg(errp, "Unsupported granule mode");
        return;
    }

    if (!s->primary_bus) {
        error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
        return;
    }

    /* Nothing below can fail */
    virtio_init(vdev, VIRTIO_ID_IOMMU, sizeof(struct virtio_iommu_config));

    s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE,
                                 virtio_iommu_handle_command);
    s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);

    /*
     * config.bypass is needed to get initial address space early, such as
     * in vfio realize
     */
    s->config.bypass = s->boot_bypass;

    /* A 64-bit shift would overflow, hence the special case for 64 */
    s->config.input_range.end =
        s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1;
    s->config.page_size_mask = page_size_mask;
    s->config.domain_range.end = UINT32_MAX;
    s->config.probe_size = VIOMMU_PROBE_SIZE;

    virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX);
    virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC);
    virtio_add_feature(&s->features, VIRTIO_F_VERSION_1);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_PROBE);
    virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS_CONFIG);

    qemu_rec_mutex_init(&s->mutex);

    s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);

    pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s);

    s->machine_done.notify = virtio_iommu_freeze_granule;
    qemu_add_machine_init_done_notifier(&s->machine_done);

    qemu_register_reset(virtio_iommu_system_reset, s);
}
qdev: Unrealize must not fail Devices may have component devices and buses. Device realization may fail. Realization is recursive: a device's realize() method realizes its components, and device_set_realized() realizes its buses (which should in turn realize the devices on that bus, except bus_set_realized() doesn't implement that, yet). When realization of a component or bus fails, we need to roll back: unrealize everything we realized so far. If any of these unrealizes failed, the device would be left in an inconsistent state. Must not happen. device_set_realized() lets it happen: it ignores errors in the roll back code starting at label child_realize_fail. Since realization is recursive, unrealization must be recursive, too. But how could a partly failed unrealize be rolled back? We'd have to re-realize, which can fail. This design is fundamentally broken. device_set_realized() does not roll back at all. Instead, it keeps unrealizing, ignoring further errors. It can screw up even for a device with no buses: if the lone dc->unrealize() fails, it still unregisters vmstate, and calls listeners' unrealize() callback. bus_set_realized() does not roll back either. Instead, it stops unrealizing. Fortunately, no unrealize method can fail, as we'll see below. To fix the design error, drop parameter @errp from all the unrealize methods. Any unrealize method that uses @errp now needs an update. This leads us to unrealize() methods that can fail. Merely passing it to another unrealize method cannot cause failure, though. Here are the ones that do other things with @errp: * virtio_serial_device_unrealize() Fails when qbus_set_hotplug_handler() fails, but still does all the other work. On failure, the device would stay realized with its resources completely gone. Oops. Can't happen, because qbus_set_hotplug_handler() can't actually fail here. Pass &error_abort to qbus_set_hotplug_handler() instead. 
* hw/ppc/spapr_drc.c's unrealize() Fails when object_property_del() fails, but all the other work is already done. On failure, the device would stay realized with its vmstate registration gone. Oops. Can't happen, because object_property_del() can't actually fail here. Pass &error_abort to object_property_del() instead. * spapr_phb_unrealize() Fails and bails out when remove_drcs() fails, but other work is already done. On failure, the device would stay realized with some of its resources gone. Oops. remove_drcs() fails only when chassis_from_bus()'s object_property_get_uint() fails, and it can't here. Pass &error_abort to remove_drcs() instead. Therefore, no unrealize method can fail before this patch. device_set_realized()'s recursive unrealization via bus uses object_property_set_bool(). Can't drop @errp there, so pass &error_abort. We similarly unrealize with object_property_set_bool() elsewhere, always ignoring errors. Pass &error_abort instead. Several unrealize methods no longer handle errors from other unrealize methods: virtio_9p_device_unrealize(), virtio_input_device_unrealize(), scsi_qdev_unrealize(), ... Much of the deleted error handling looks wrong anyway. One unrealize methods no longer ignore such errors: usb_ehci_pci_exit(). Several realize methods no longer ignore errors when rolling back: v9fs_device_realize_common(), pci_qdev_unrealize(), spapr_phb_realize(), usb_qdev_realize(), vfio_ccw_realize(), virtio_device_realize(). Signed-off-by: Markus Armbruster <armbru@redhat.com> Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> Message-Id: <20200505152926.18877-17-armbru@redhat.com>
2020-05-05 18:29:24 +03:00
/*
 * Unrealize the virtio-iommu device: unregister the reset handler and the
 * machine-init-done notifier, release the lookup structures and the mutex,
 * then delete both virtqueues and clean up the virtio device.
 */
static void virtio_iommu_device_unrealize(DeviceState *dev)
{
    VirtIOIOMMU *viommu = VIRTIO_IOMMU(dev);
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);

    /* Stop being called back before tearing the state down */
    qemu_unregister_reset(virtio_iommu_system_reset, viommu);
    qemu_remove_machine_init_done_notifier(&viommu->machine_done);

    g_hash_table_destroy(viommu->as_by_busptr);

    /* The trees are only present once the device has been reset */
    if (viommu->domains) {
        g_tree_destroy(viommu->domains);
    }
    if (viommu->endpoints) {
        g_tree_destroy(viommu->endpoints);
    }

    qemu_rec_mutex_destroy(&viommu->mutex);

    virtio_delete_queue(viommu->req_vq);
    virtio_delete_queue(viommu->event_vq);
    virtio_cleanup(vdev);
}
/*
 * Device reset: throw away the domain and endpoint trees, if any, and
 * start again with two empty ones. Both trees are keyed by integers
 * compared with int_cmp(); their values are released through
 * virtio_iommu_put_domain() and virtio_iommu_put_endpoint() respectively.
 */
static void virtio_iommu_device_reset(VirtIODevice *vdev)
{
    VirtIOIOMMU *viommu = VIRTIO_IOMMU(vdev);
    GCompareDataFunc compare_ids = (GCompareDataFunc)int_cmp;

    trace_virtio_iommu_device_reset();

    if (viommu->domains) {
        g_tree_destroy(viommu->domains);
    }
    if (viommu->endpoints) {
        g_tree_destroy(viommu->endpoints);
    }

    viommu->domains = g_tree_new_full(compare_ids, NULL, NULL,
                                      virtio_iommu_put_domain);
    viommu->endpoints = g_tree_new_full(compare_ids, NULL, NULL,
                                        virtio_iommu_put_endpoint);
}
/* Status change hook: the new @status is traced, nothing else is done. */
static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status)
{
    trace_virtio_iommu_device_status(status);
}
/* Instance initializer: intentionally empty, no per-instance setup here. */
static void virtio_iommu_instance_init(Object *obj)
{
}
/*
 * VMStateDescription initializer for a VirtIOIOMMUInterval: the 'low' and
 * 'high' bounds are both migrated as 64-bit unsigned values.
 */
#define VMSTATE_INTERVAL \
{ \
.name = "interval", \
.version_id = 1, \
.minimum_version_id = 1, \
.fields = (const VMStateField[]) { \
VMSTATE_UINT64(low, VirtIOIOMMUInterval), \
VMSTATE_UINT64(high, VirtIOIOMMUInterval), \
VMSTATE_END_OF_LIST() \
} \
}
/*
 * VMStateDescription initializer for a VirtIOIOMMUMapping: 'phys_addr' is
 * migrated as a 64-bit and 'flags' as a 32-bit unsigned value.
 */
#define VMSTATE_MAPPING \
{ \
.name = "mapping", \
.version_id = 1, \
.minimum_version_id = 1, \
.fields = (const VMStateField[]) { \
VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\
VMSTATE_UINT32(flags, VirtIOIOMMUMapping), \
VMSTATE_END_OF_LIST() \
}, \
}
/*
 * Pair of descriptions used to migrate the per-domain mapping tree:
 * element 0 describes the tree value (VirtIOIOMMUMapping), element 1 the
 * tree key (VirtIOIOMMUInterval).
 */
static const VMStateDescription vmstate_interval_mapping[2] = {
VMSTATE_MAPPING, /* value */
VMSTATE_INTERVAL /* key */
};
static int domain_preload(void *opaque)
{
VirtIOIOMMUDomain *domain = opaque;
domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
NULL, g_free, g_free);
return 0;
}
/*
 * Migration description of a VirtIOIOMMUEndpoint. Only the endpoint 'id' is
 * transferred; the 'domain' and 'iommu_mr' pointers are not part of the
 * stream (they are filled in by reconstruct_endpoints() after load).
 */
static const VMStateDescription vmstate_endpoint = {
.name = "endpoint",
.version_id = 1,
.minimum_version_id = 1,
.fields = (const VMStateField[]) {
VMSTATE_UINT32(id, VirtIOIOMMUEndpoint),
VMSTATE_END_OF_LIST()
}
};
/*
 * Migration description of a VirtIOIOMMUDomain (version 2).
 *
 * domain_preload() runs before the fields are loaded and creates the empty
 * 'mappings' tree. The stream carries the domain id, the mapping tree
 * (interval key / mapping value), the list of attached endpoints and, as a
 * version 2 field, the 'bypass' flag.
 */
static const VMStateDescription vmstate_domain = {
.name = "domain",
.version_id = 2,
.minimum_version_id = 2,
.pre_load = domain_preload,
.fields = (const VMStateField[]) {
VMSTATE_UINT32(id, VirtIOIOMMUDomain),
VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1,
vmstate_interval_mapping,
VirtIOIOMMUInterval, VirtIOIOMMUMapping),
VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
vmstate_endpoint, VirtIOIOMMUEndpoint, next),
VMSTATE_BOOL_V(bypass, VirtIOIOMMUDomain, 2),
VMSTATE_END_OF_LIST()
}
};
/*
 * g_tree_foreach() callback run over the domain tree after migration.
 *
 * For every endpoint on the domain's endpoint list, restore the pointers
 * that were not migrated (owning domain and IOMMU memory region) and insert
 * the endpoint into the device-wide endpoint tree, keyed by its id.
 *
 * @key: domain key (unused)
 * @value: the VirtIOIOMMUDomain
 * @data: the VirtIOIOMMU device
 */
static gboolean reconstruct_endpoints(gpointer key, gpointer value,
                                      gpointer data)
{
    VirtIOIOMMUDomain *domain = value;
    VirtIOIOMMU *viommu = data;
    VirtIOIOMMUEndpoint *ep;

    QLIST_FOREACH(ep, &domain->endpoint_list, next) {
        IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(viommu, ep->id);

        /* A migrated endpoint without a memory region is not expected */
        assert(iommu_mr);

        ep->domain = domain;
        ep->iommu_mr = iommu_mr;
        g_tree_insert(viommu->endpoints, GUINT_TO_POINTER(ep->id), ep);
    }

    return false; /* continue the domain traversal */
}
/*
 * post_load hook of the device state; @opaque is the VirtIOIOMMU.
 * Rebuilds the endpoint tree from the loaded domains and then refreshes the
 * address spaces. @version_id is not used. Always returns 0.
 */
static int iommu_post_load(void *opaque, int version_id)
{
    VirtIOIOMMU *viommu = opaque;

    g_tree_foreach(viommu->domains, reconstruct_endpoints, viommu);

    /*
     * Memory regions are dynamically turned on/off depending on
     * 'config.bypass' and attached domain type if there is. After
     * migration, we need to make sure the memory regions are
     * still correct.
     */
    virtio_iommu_switch_address_space_all(viommu);

    return 0;
}
/*
 * Device-specific migration state (version 2): the tree of domains, keyed
 * directly by value rather than by a separately described key, followed by
 * config.bypass. iommu_post_load() runs once the fields have been loaded.
 */
static const VMStateDescription vmstate_virtio_iommu_device = {
.name = "virtio-iommu-device",
.minimum_version_id = 2,
.version_id = 2,
.post_load = iommu_post_load,
.fields = (const VMStateField[]) {
VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 2,
&vmstate_domain, VirtIOIOMMUDomain),
VMSTATE_UINT8_V(config.bypass, VirtIOIOMMU, 2),
VMSTATE_END_OF_LIST()
},
};
/*
 * Top-level migration description installed as the DeviceClass vmsd. It
 * contains only the generic virtio device section and is given the
 * MIG_PRI_IOMMU migration priority.
 */
static const VMStateDescription vmstate_virtio_iommu = {
.name = "virtio-iommu",
.minimum_version_id = 2,
.priority = MIG_PRI_IOMMU,
.version_id = 2,
.fields = (const VMStateField[]) {
VMSTATE_VIRTIO_DEVICE,
VMSTATE_END_OF_LIST()
},
};
/*
 * User-visible device properties:
 *  - "primary-bus": link to the PCI bus the IOMMU is attached to
 *  - "boot-bypass": initial value of config.bypass, true by default
 *  - "granule":     granule mode, defaults to the host page size
 *  - "aw-bits":     input address width, 64 by default
 */
static Property virtio_iommu_properties[] = {
    DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
                     TYPE_PCI_BUS, PCIBus *),
    DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
    DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode,
                             GRANULE_MODE_HOST),
    DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64),
    DEFINE_PROP_END_OF_LIST(),
};
/*
 * Class initializer of the virtio-iommu device type: installs the
 * properties, the migration descriptions and the VirtioDeviceClass
 * callbacks. @data is not used.
 */
static void virtio_iommu_class_init(ObjectClass *klass, void *data)
{
    VirtioDeviceClass *virtio_class = VIRTIO_DEVICE_CLASS(klass);
    DeviceClass *device_class = DEVICE_CLASS(klass);

    /* Generic device level */
    device_class_set_props(device_class, virtio_iommu_properties);
    device_class->vmsd = &vmstate_virtio_iommu;
    set_bit(DEVICE_CATEGORY_MISC, device_class->categories);

    /* Virtio device level: lifecycle */
    virtio_class->realize = virtio_iommu_device_realize;
    virtio_class->unrealize = virtio_iommu_device_unrealize;
    virtio_class->reset = virtio_iommu_device_reset;

    /* Virtio device level: config space, features and status */
    virtio_class->get_config = virtio_iommu_get_config;
    virtio_class->set_config = virtio_iommu_set_config;
    virtio_class->get_features = virtio_iommu_get_features;
    virtio_class->set_status = virtio_iommu_set_status;

    virtio_class->vmsd = &vmstate_virtio_iommu_device;
}
/*
 * Class initializer of the virtio-iommu IOMMU memory region type: hooks up
 * the translate, replay, notifier-flag, page-size-mask and IOVA-range
 * callbacks. @data is not used.
 */
static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
                                                  void *data)
{
    IOMMUMemoryRegionClass *mr_class = IOMMU_MEMORY_REGION_CLASS(klass);

    mr_class->translate = virtio_iommu_translate;
    mr_class->replay = virtio_iommu_replay;
    mr_class->notify_flag_changed = virtio_iommu_notify_flag_changed;
    mr_class->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask;
    mr_class->iommu_set_iova_ranges = virtio_iommu_set_iova_ranges;
}
/* Type description of the virtio-iommu device, derived from the virtio device type. */
static const TypeInfo virtio_iommu_info = {
    .parent = TYPE_VIRTIO_DEVICE,
    .name = TYPE_VIRTIO_IOMMU,
    .instance_size = sizeof(VirtIOIOMMU),
    .class_init = virtio_iommu_class_init,
    .instance_init = virtio_iommu_instance_init,
};
/* Type description of the IOMMU memory region used by the virtio-iommu. */
static const TypeInfo virtio_iommu_memory_region_info = {
    .name = TYPE_VIRTIO_IOMMU_MEMORY_REGION,
    .parent = TYPE_IOMMU_MEMORY_REGION,
    .class_init = virtio_iommu_memory_region_class_init,
};
/* Register the device type and its IOMMU memory region type. */
static void virtio_register_types(void)
{
    type_register_static(&virtio_iommu_info);
    type_register_static(&virtio_iommu_memory_region_info);
}

type_init(virtio_register_types)