qemu/hw/virtio/virtio-iommu.c
Cédric Le Goater 1b889d6e39 virtio-iommu: Clear IOMMUDevice when VFIO device is unplugged
When a VFIO device is hoplugged in a VM using virtio-iommu, IOMMUPciBus
and IOMMUDevice cache entries are created in the .get_address_space()
handler of the machine IOMMU device. However, these entries are never
destroyed, not even when the VFIO device is detached from the machine.
This can lead to an assert if the device is reattached again.

When reattached, the .get_address_space() handler reuses an
IOMMUDevice entry allocated when the VFIO device was first attached.
virtio_iommu_set_host_iova_ranges() is called later on from the
.set_iommu_device() handler an fails with an assert on 'probe_done'
because the device appears to have been already probed when this is
not the case.

The IOMMUDevice entry is allocated in pci_device_iommu_address_space()
called from under vfio_realize(), the VFIO PCI realize handler. Since
pci_device_unset_iommu_device() is called from vfio_exitfn(), a sub
function of the PCIDevice unrealize() handler, it seems that the
.unset_iommu_device() handler is the best place to release resources
allocated at realize time. Clear the IOMMUDevice cache entry there to
fix hotplug.

Fixes: 817ef10da2 ("virtio-iommu: Implement set|unset]_iommu_device() callbacks")
Signed-off-by: Cédric Le Goater <clg@redhat.com>
Message-Id: <20240701101453.203985-1-clg@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2024-07-03 18:14:07 -04:00

1745 lines
54 KiB
C

/*
* virtio-iommu device
*
* Copyright (c) 2020 Red Hat, Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2 or later, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/iov.h"
#include "qemu/range.h"
#include "qemu/reserved-region.h"
#include "exec/target_page.h"
#include "hw/qdev-properties.h"
#include "hw/virtio/virtio.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "sysemu/sysemu.h"
#include "qemu/reserved-region.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "standard-headers/linux/virtio_ids.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-iommu.h"
#include "hw/pci/pci_bus.h"
#include "hw/pci/pci.h"
/* Max size */
#define VIOMMU_DEFAULT_QUEUE_SIZE 256
#define VIOMMU_PROBE_SIZE 512
typedef struct VirtIOIOMMUDomain {
uint32_t id;
bool bypass;
GTree *mappings;
QLIST_HEAD(, VirtIOIOMMUEndpoint) endpoint_list;
} VirtIOIOMMUDomain;
typedef struct VirtIOIOMMUEndpoint {
uint32_t id;
VirtIOIOMMUDomain *domain;
IOMMUMemoryRegion *iommu_mr;
QLIST_ENTRY(VirtIOIOMMUEndpoint) next;
} VirtIOIOMMUEndpoint;
typedef struct VirtIOIOMMUInterval {
uint64_t low;
uint64_t high;
} VirtIOIOMMUInterval;
typedef struct VirtIOIOMMUMapping {
uint64_t phys_addr;
uint32_t flags;
} VirtIOIOMMUMapping;
struct hiod_key {
PCIBus *bus;
uint8_t devfn;
};
static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev)
{
return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn);
}
static bool virtio_iommu_device_bypassed(IOMMUDevice *sdev)
{
uint32_t sid;
bool bypassed;
VirtIOIOMMU *s = sdev->viommu;
VirtIOIOMMUEndpoint *ep;
sid = virtio_iommu_get_bdf(sdev);
qemu_rec_mutex_lock(&s->mutex);
/* need to check bypass before system reset */
if (!s->endpoints) {
bypassed = s->config.bypass;
goto unlock;
}
ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
if (!ep || !ep->domain) {
bypassed = s->config.bypass;
} else {
bypassed = ep->domain->bypass;
}
unlock:
qemu_rec_mutex_unlock(&s->mutex);
return bypassed;
}
/* Return whether the device is using IOMMU translation. */
static bool virtio_iommu_switch_address_space(IOMMUDevice *sdev)
{
bool use_remapping;
assert(sdev);
use_remapping = !virtio_iommu_device_bypassed(sdev);
trace_virtio_iommu_switch_address_space(pci_bus_num(sdev->bus),
PCI_SLOT(sdev->devfn),
PCI_FUNC(sdev->devfn),
use_remapping);
/* Turn off first then on the other */
if (use_remapping) {
memory_region_set_enabled(&sdev->bypass_mr, false);
memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), true);
} else {
memory_region_set_enabled(MEMORY_REGION(&sdev->iommu_mr), false);
memory_region_set_enabled(&sdev->bypass_mr, true);
}
return use_remapping;
}
static void virtio_iommu_switch_address_space_all(VirtIOIOMMU *s)
{
GHashTableIter iter;
IOMMUPciBus *iommu_pci_bus;
int i;
g_hash_table_iter_init(&iter, s->as_by_busptr);
while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
for (i = 0; i < PCI_DEVFN_MAX; i++) {
if (!iommu_pci_bus->pbdev[i]) {
continue;
}
virtio_iommu_switch_address_space(iommu_pci_bus->pbdev[i]);
}
}
}
/**
* The bus number is used for lookup when SID based operations occur.
* In that case we lazily populate the IOMMUPciBus array from the bus hash
* table. At the time the IOMMUPciBus is created (iommu_find_add_as), the bus
* numbers may not be always initialized yet.
*/
static IOMMUPciBus *iommu_find_iommu_pcibus(VirtIOIOMMU *s, uint8_t bus_num)
{
IOMMUPciBus *iommu_pci_bus = s->iommu_pcibus_by_bus_num[bus_num];
if (!iommu_pci_bus) {
GHashTableIter iter;
g_hash_table_iter_init(&iter, s->as_by_busptr);
while (g_hash_table_iter_next(&iter, NULL, (void **)&iommu_pci_bus)) {
if (pci_bus_num(iommu_pci_bus->bus) == bus_num) {
s->iommu_pcibus_by_bus_num[bus_num] = iommu_pci_bus;
return iommu_pci_bus;
}
}
return NULL;
}
return iommu_pci_bus;
}
static IOMMUMemoryRegion *virtio_iommu_mr(VirtIOIOMMU *s, uint32_t sid)
{
uint8_t bus_n, devfn;
IOMMUPciBus *iommu_pci_bus;
IOMMUDevice *dev;
bus_n = PCI_BUS_NUM(sid);
iommu_pci_bus = iommu_find_iommu_pcibus(s, bus_n);
if (iommu_pci_bus) {
devfn = sid & (PCI_DEVFN_MAX - 1);
dev = iommu_pci_bus->pbdev[devfn];
if (dev) {
return &dev->iommu_mr;
}
}
return NULL;
}
static gint interval_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
VirtIOIOMMUInterval *inta = (VirtIOIOMMUInterval *)a;
VirtIOIOMMUInterval *intb = (VirtIOIOMMUInterval *)b;
if (inta->high < intb->low) {
return -1;
} else if (intb->high < inta->low) {
return 1;
} else {
return 0;
}
}
static void virtio_iommu_notify_map_unmap(IOMMUMemoryRegion *mr,
IOMMUTLBEvent *event,
hwaddr virt_start, hwaddr virt_end)
{
uint64_t delta = virt_end - virt_start;
event->entry.iova = virt_start;
event->entry.addr_mask = delta;
if (delta == UINT64_MAX) {
memory_region_notify_iommu(mr, 0, *event);
}
while (virt_start != virt_end + 1) {
uint64_t mask = dma_aligned_pow2_mask(virt_start, virt_end, 64);
event->entry.addr_mask = mask;
event->entry.iova = virt_start;
memory_region_notify_iommu(mr, 0, *event);
virt_start += mask + 1;
if (event->entry.perm != IOMMU_NONE) {
event->entry.translated_addr += mask + 1;
}
}
}
static void virtio_iommu_notify_map(IOMMUMemoryRegion *mr, hwaddr virt_start,
hwaddr virt_end, hwaddr paddr,
uint32_t flags)
{
IOMMUTLBEvent event;
IOMMUAccessFlags perm = IOMMU_ACCESS_FLAG(flags & VIRTIO_IOMMU_MAP_F_READ,
flags & VIRTIO_IOMMU_MAP_F_WRITE);
if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_MAP) ||
(flags & VIRTIO_IOMMU_MAP_F_MMIO) || !perm) {
return;
}
trace_virtio_iommu_notify_map(mr->parent_obj.name, virt_start, virt_end,
paddr, perm);
event.type = IOMMU_NOTIFIER_MAP;
event.entry.target_as = &address_space_memory;
event.entry.perm = perm;
event.entry.translated_addr = paddr;
virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
}
static void virtio_iommu_notify_unmap(IOMMUMemoryRegion *mr, hwaddr virt_start,
hwaddr virt_end)
{
IOMMUTLBEvent event;
if (!(mr->iommu_notify_flags & IOMMU_NOTIFIER_UNMAP)) {
return;
}
trace_virtio_iommu_notify_unmap(mr->parent_obj.name, virt_start, virt_end);
event.type = IOMMU_NOTIFIER_UNMAP;
event.entry.target_as = &address_space_memory;
event.entry.perm = IOMMU_NONE;
event.entry.translated_addr = 0;
virtio_iommu_notify_map_unmap(mr, &event, virt_start, virt_end);
}
static gboolean virtio_iommu_notify_unmap_cb(gpointer key, gpointer value,
gpointer data)
{
VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
virtio_iommu_notify_unmap(mr, interval->low, interval->high);
return false;
}
static gboolean virtio_iommu_notify_map_cb(gpointer key, gpointer value,
gpointer data)
{
VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value;
VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
virtio_iommu_notify_map(mr, interval->low, interval->high,
mapping->phys_addr, mapping->flags);
return false;
}
static void virtio_iommu_detach_endpoint_from_domain(VirtIOIOMMUEndpoint *ep)
{
VirtIOIOMMUDomain *domain = ep->domain;
IOMMUDevice *sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
if (!ep->domain) {
return;
}
g_tree_foreach(domain->mappings, virtio_iommu_notify_unmap_cb,
ep->iommu_mr);
QLIST_REMOVE(ep, next);
ep->domain = NULL;
virtio_iommu_switch_address_space(sdev);
}
static VirtIOIOMMUEndpoint *virtio_iommu_get_endpoint(VirtIOIOMMU *s,
uint32_t ep_id)
{
VirtIOIOMMUEndpoint *ep;
IOMMUMemoryRegion *mr;
ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
if (ep) {
return ep;
}
mr = virtio_iommu_mr(s, ep_id);
if (!mr) {
return NULL;
}
ep = g_malloc0(sizeof(*ep));
ep->id = ep_id;
ep->iommu_mr = mr;
trace_virtio_iommu_get_endpoint(ep_id);
g_tree_insert(s->endpoints, GUINT_TO_POINTER(ep_id), ep);
return ep;
}
static void virtio_iommu_put_endpoint(gpointer data)
{
VirtIOIOMMUEndpoint *ep = (VirtIOIOMMUEndpoint *)data;
if (ep->domain) {
virtio_iommu_detach_endpoint_from_domain(ep);
}
trace_virtio_iommu_put_endpoint(ep->id);
g_free(ep);
}
static VirtIOIOMMUDomain *virtio_iommu_get_domain(VirtIOIOMMU *s,
uint32_t domain_id,
bool bypass)
{
VirtIOIOMMUDomain *domain;
domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
if (domain) {
if (domain->bypass != bypass) {
return NULL;
}
return domain;
}
domain = g_malloc0(sizeof(*domain));
domain->id = domain_id;
domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
NULL, (GDestroyNotify)g_free,
(GDestroyNotify)g_free);
domain->bypass = bypass;
g_tree_insert(s->domains, GUINT_TO_POINTER(domain_id), domain);
QLIST_INIT(&domain->endpoint_list);
trace_virtio_iommu_get_domain(domain_id);
return domain;
}
static void virtio_iommu_put_domain(gpointer data)
{
VirtIOIOMMUDomain *domain = (VirtIOIOMMUDomain *)data;
VirtIOIOMMUEndpoint *iter, *tmp;
QLIST_FOREACH_SAFE(iter, &domain->endpoint_list, next, tmp) {
virtio_iommu_detach_endpoint_from_domain(iter);
}
g_tree_destroy(domain->mappings);
trace_virtio_iommu_put_domain(domain->id);
g_free(domain);
}
static void add_prop_resv_regions(IOMMUDevice *sdev)
{
VirtIOIOMMU *s = sdev->viommu;
int i;
for (i = 0; i < s->nr_prop_resv_regions; i++) {
ReservedRegion *reg = g_new0(ReservedRegion, 1);
*reg = s->prop_resv_regions[i];
sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
}
}
static AddressSpace *virtio_iommu_find_add_as(PCIBus *bus, void *opaque,
int devfn)
{
VirtIOIOMMU *s = opaque;
IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
static uint32_t mr_index;
IOMMUDevice *sdev;
if (!sbus) {
sbus = g_malloc0(sizeof(IOMMUPciBus) +
sizeof(IOMMUDevice *) * PCI_DEVFN_MAX);
sbus->bus = bus;
g_hash_table_insert(s->as_by_busptr, bus, sbus);
}
sdev = sbus->pbdev[devfn];
if (!sdev) {
char *name = g_strdup_printf("%s-%d-%d",
TYPE_VIRTIO_IOMMU_MEMORY_REGION,
mr_index++, devfn);
sdev = sbus->pbdev[devfn] = g_new0(IOMMUDevice, 1);
sdev->viommu = s;
sdev->bus = bus;
sdev->devfn = devfn;
trace_virtio_iommu_init_iommu_mr(name);
memory_region_init(&sdev->root, OBJECT(s), name, UINT64_MAX);
address_space_init(&sdev->as, &sdev->root, TYPE_VIRTIO_IOMMU);
add_prop_resv_regions(sdev);
/*
* Build the IOMMU disabled container with aliases to the
* shared MRs. Note that aliasing to a shared memory region
* could help the memory API to detect same FlatViews so we
* can have devices to share the same FlatView when in bypass
* mode. (either by not configuring virtio-iommu driver or with
* "iommu=pt"). It will greatly reduce the total number of
* FlatViews of the system hence VM runs faster.
*/
memory_region_init_alias(&sdev->bypass_mr, OBJECT(s),
"system", get_system_memory(), 0,
memory_region_size(get_system_memory()));
memory_region_init_iommu(&sdev->iommu_mr, sizeof(sdev->iommu_mr),
TYPE_VIRTIO_IOMMU_MEMORY_REGION,
OBJECT(s), name,
UINT64_MAX);
/*
* Hook both the containers under the root container, we
* switch between iommu & bypass MRs by enable/disable
* corresponding sub-containers
*/
memory_region_add_subregion_overlap(&sdev->root, 0,
MEMORY_REGION(&sdev->iommu_mr),
0);
memory_region_add_subregion_overlap(&sdev->root, 0,
&sdev->bypass_mr, 0);
virtio_iommu_switch_address_space(sdev);
g_free(name);
}
return &sdev->as;
}
static void virtio_iommu_device_clear(VirtIOIOMMU *s, PCIBus *bus, int devfn)
{
IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
IOMMUDevice *sdev;
if (!sbus) {
return;
}
sdev = sbus->pbdev[devfn];
if (!sdev) {
return;
}
g_list_free_full(sdev->resv_regions, g_free);
sdev->resv_regions = NULL;
g_free(sdev);
sbus->pbdev[devfn] = NULL;
}
static gboolean hiod_equal(gconstpointer v1, gconstpointer v2)
{
const struct hiod_key *key1 = v1;
const struct hiod_key *key2 = v2;
return (key1->bus == key2->bus) && (key1->devfn == key2->devfn);
}
static guint hiod_hash(gconstpointer v)
{
const struct hiod_key *key = v;
guint value = (guint)(uintptr_t)key->bus;
return (guint)(value << 8 | key->devfn);
}
static void hiod_destroy(gpointer v)
{
object_unref(v);
}
static HostIOMMUDevice *
get_host_iommu_device(VirtIOIOMMU *viommu, PCIBus *bus, int devfn) {
struct hiod_key key = {
.bus = bus,
.devfn = devfn,
};
return g_hash_table_lookup(viommu->host_iommu_devices, &key);
}
/**
* rebuild_resv_regions: rebuild resv regions with both the
* info of host resv ranges and property set resv ranges
*/
static int rebuild_resv_regions(IOMMUDevice *sdev)
{
GList *l;
int i = 0;
/* free the existing list and rebuild it from scratch */
g_list_free_full(sdev->resv_regions, g_free);
sdev->resv_regions = NULL;
/* First add host reserved regions if any, all tagged as RESERVED */
for (l = sdev->host_resv_ranges; l; l = l->next) {
ReservedRegion *reg = g_new0(ReservedRegion, 1);
Range *r = (Range *)l->data;
reg->type = VIRTIO_IOMMU_RESV_MEM_T_RESERVED;
range_set_bounds(&reg->range, range_lob(r), range_upb(r));
sdev->resv_regions = resv_region_list_insert(sdev->resv_regions, reg);
trace_virtio_iommu_host_resv_regions(sdev->iommu_mr.parent_obj.name, i,
range_lob(&reg->range),
range_upb(&reg->range));
i++;
}
/*
* then add higher priority reserved regions set by the machine
* through properties
*/
add_prop_resv_regions(sdev);
return 0;
}
static int virtio_iommu_set_host_iova_ranges(VirtIOIOMMU *s, PCIBus *bus,
int devfn, GList *iova_ranges,
Error **errp)
{
IOMMUPciBus *sbus = g_hash_table_lookup(s->as_by_busptr, bus);
IOMMUDevice *sdev;
GList *current_ranges;
GList *l, *tmp, *new_ranges = NULL;
int ret = -EINVAL;
if (!sbus) {
error_report("%s no sbus", __func__);
}
sdev = sbus->pbdev[devfn];
current_ranges = sdev->host_resv_ranges;
g_assert(!sdev->probe_done);
/* check that each new resv region is included in an existing one */
if (sdev->host_resv_ranges) {
range_inverse_array(iova_ranges,
&new_ranges,
0, UINT64_MAX);
for (tmp = new_ranges; tmp; tmp = tmp->next) {
Range *newr = (Range *)tmp->data;
bool included = false;
for (l = current_ranges; l; l = l->next) {
Range * r = (Range *)l->data;
if (range_contains_range(r, newr)) {
included = true;
break;
}
}
if (!included) {
goto error;
}
}
/* all new reserved ranges are included in existing ones */
ret = 0;
goto out;
}
range_inverse_array(iova_ranges,
&sdev->host_resv_ranges,
0, UINT64_MAX);
rebuild_resv_regions(sdev);
return 0;
error:
error_setg(errp, "%s Conflicting host reserved ranges set!",
__func__);
out:
g_list_free_full(new_ranges, g_free);
return ret;
}
static bool virtio_iommu_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
HostIOMMUDevice *hiod, Error **errp)
{
VirtIOIOMMU *viommu = opaque;
HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_GET_CLASS(hiod);
struct hiod_key *new_key;
GList *host_iova_ranges = NULL;
assert(hiod);
if (get_host_iommu_device(viommu, bus, devfn)) {
error_setg(errp, "Host IOMMU device already exists");
return false;
}
if (hiodc->get_iova_ranges) {
int ret;
host_iova_ranges = hiodc->get_iova_ranges(hiod, errp);
if (!host_iova_ranges) {
return true; /* some old kernels may not support that capability */
}
ret = virtio_iommu_set_host_iova_ranges(viommu, hiod->aliased_bus,
hiod->aliased_devfn,
host_iova_ranges, errp);
if (ret) {
g_list_free_full(host_iova_ranges, g_free);
return false;
}
}
new_key = g_malloc(sizeof(*new_key));
new_key->bus = bus;
new_key->devfn = devfn;
object_ref(hiod);
g_hash_table_insert(viommu->host_iommu_devices, new_key, hiod);
g_list_free_full(host_iova_ranges, g_free);
return true;
}
static void
virtio_iommu_unset_iommu_device(PCIBus *bus, void *opaque, int devfn)
{
VirtIOIOMMU *viommu = opaque;
HostIOMMUDevice *hiod;
struct hiod_key key = {
.bus = bus,
.devfn = devfn,
};
hiod = g_hash_table_lookup(viommu->host_iommu_devices, &key);
if (!hiod) {
return;
}
g_hash_table_remove(viommu->host_iommu_devices, &key);
virtio_iommu_device_clear(viommu, bus, devfn);
}
static const PCIIOMMUOps virtio_iommu_ops = {
.get_address_space = virtio_iommu_find_add_as,
.set_iommu_device = virtio_iommu_set_iommu_device,
.unset_iommu_device = virtio_iommu_unset_iommu_device,
};
static int virtio_iommu_attach(VirtIOIOMMU *s,
struct virtio_iommu_req_attach *req)
{
uint32_t domain_id = le32_to_cpu(req->domain);
uint32_t ep_id = le32_to_cpu(req->endpoint);
uint32_t flags = le32_to_cpu(req->flags);
VirtIOIOMMUDomain *domain;
VirtIOIOMMUEndpoint *ep;
IOMMUDevice *sdev;
trace_virtio_iommu_attach(domain_id, ep_id);
if (flags & ~VIRTIO_IOMMU_ATTACH_F_BYPASS) {
return VIRTIO_IOMMU_S_INVAL;
}
ep = virtio_iommu_get_endpoint(s, ep_id);
if (!ep) {
return VIRTIO_IOMMU_S_NOENT;
}
if (ep->domain) {
VirtIOIOMMUDomain *previous_domain = ep->domain;
/*
* the device is already attached to a domain,
* detach it first
*/
virtio_iommu_detach_endpoint_from_domain(ep);
if (QLIST_EMPTY(&previous_domain->endpoint_list)) {
g_tree_remove(s->domains, GUINT_TO_POINTER(previous_domain->id));
}
}
domain = virtio_iommu_get_domain(s, domain_id,
flags & VIRTIO_IOMMU_ATTACH_F_BYPASS);
if (!domain) {
/* Incompatible bypass flag */
return VIRTIO_IOMMU_S_INVAL;
}
QLIST_INSERT_HEAD(&domain->endpoint_list, ep, next);
ep->domain = domain;
sdev = container_of(ep->iommu_mr, IOMMUDevice, iommu_mr);
virtio_iommu_switch_address_space(sdev);
/* Replay domain mappings on the associated memory region */
g_tree_foreach(domain->mappings, virtio_iommu_notify_map_cb,
ep->iommu_mr);
return VIRTIO_IOMMU_S_OK;
}
static int virtio_iommu_detach(VirtIOIOMMU *s,
struct virtio_iommu_req_detach *req)
{
uint32_t domain_id = le32_to_cpu(req->domain);
uint32_t ep_id = le32_to_cpu(req->endpoint);
VirtIOIOMMUDomain *domain;
VirtIOIOMMUEndpoint *ep;
trace_virtio_iommu_detach(domain_id, ep_id);
ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(ep_id));
if (!ep) {
return VIRTIO_IOMMU_S_NOENT;
}
domain = ep->domain;
if (!domain || domain->id != domain_id) {
return VIRTIO_IOMMU_S_INVAL;
}
virtio_iommu_detach_endpoint_from_domain(ep);
if (QLIST_EMPTY(&domain->endpoint_list)) {
g_tree_remove(s->domains, GUINT_TO_POINTER(domain->id));
}
return VIRTIO_IOMMU_S_OK;
}
static int virtio_iommu_map(VirtIOIOMMU *s,
struct virtio_iommu_req_map *req)
{
uint32_t domain_id = le32_to_cpu(req->domain);
uint64_t phys_start = le64_to_cpu(req->phys_start);
uint64_t virt_start = le64_to_cpu(req->virt_start);
uint64_t virt_end = le64_to_cpu(req->virt_end);
uint32_t flags = le32_to_cpu(req->flags);
VirtIOIOMMUDomain *domain;
VirtIOIOMMUInterval *interval;
VirtIOIOMMUMapping *mapping;
VirtIOIOMMUEndpoint *ep;
if (flags & ~VIRTIO_IOMMU_MAP_F_MASK) {
return VIRTIO_IOMMU_S_INVAL;
}
domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
if (!domain) {
return VIRTIO_IOMMU_S_NOENT;
}
if (domain->bypass) {
return VIRTIO_IOMMU_S_INVAL;
}
interval = g_malloc0(sizeof(*interval));
interval->low = virt_start;
interval->high = virt_end;
mapping = g_tree_lookup(domain->mappings, (gpointer)interval);
if (mapping) {
g_free(interval);
return VIRTIO_IOMMU_S_INVAL;
}
trace_virtio_iommu_map(domain_id, virt_start, virt_end, phys_start, flags);
mapping = g_malloc0(sizeof(*mapping));
mapping->phys_addr = phys_start;
mapping->flags = flags;
g_tree_insert(domain->mappings, interval, mapping);
QLIST_FOREACH(ep, &domain->endpoint_list, next) {
virtio_iommu_notify_map(ep->iommu_mr, virt_start, virt_end, phys_start,
flags);
}
return VIRTIO_IOMMU_S_OK;
}
static int virtio_iommu_unmap(VirtIOIOMMU *s,
struct virtio_iommu_req_unmap *req)
{
uint32_t domain_id = le32_to_cpu(req->domain);
uint64_t virt_start = le64_to_cpu(req->virt_start);
uint64_t virt_end = le64_to_cpu(req->virt_end);
VirtIOIOMMUMapping *iter_val;
VirtIOIOMMUInterval interval, *iter_key;
VirtIOIOMMUDomain *domain;
VirtIOIOMMUEndpoint *ep;
int ret = VIRTIO_IOMMU_S_OK;
trace_virtio_iommu_unmap(domain_id, virt_start, virt_end);
domain = g_tree_lookup(s->domains, GUINT_TO_POINTER(domain_id));
if (!domain) {
return VIRTIO_IOMMU_S_NOENT;
}
if (domain->bypass) {
return VIRTIO_IOMMU_S_INVAL;
}
interval.low = virt_start;
interval.high = virt_end;
while (g_tree_lookup_extended(domain->mappings, &interval,
(void **)&iter_key, (void**)&iter_val)) {
uint64_t current_low = iter_key->low;
uint64_t current_high = iter_key->high;
if (interval.low <= current_low && interval.high >= current_high) {
QLIST_FOREACH(ep, &domain->endpoint_list, next) {
virtio_iommu_notify_unmap(ep->iommu_mr, current_low,
current_high);
}
g_tree_remove(domain->mappings, iter_key);
trace_virtio_iommu_unmap_done(domain_id, current_low, current_high);
} else {
ret = VIRTIO_IOMMU_S_RANGE;
break;
}
}
return ret;
}
static ssize_t virtio_iommu_fill_resv_mem_prop(IOMMUDevice *sdev, uint32_t ep,
uint8_t *buf, size_t free)
{
struct virtio_iommu_probe_resv_mem prop = {};
size_t size = sizeof(prop), length = size - sizeof(prop.head), total;
GList *l;
total = size * g_list_length(sdev->resv_regions);
if (total > free) {
return -ENOSPC;
}
for (l = sdev->resv_regions; l; l = l->next) {
ReservedRegion *reg = l->data;
unsigned subtype = reg->type;
Range *range = &reg->range;
assert(subtype == VIRTIO_IOMMU_RESV_MEM_T_RESERVED ||
subtype == VIRTIO_IOMMU_RESV_MEM_T_MSI);
prop.head.type = cpu_to_le16(VIRTIO_IOMMU_PROBE_T_RESV_MEM);
prop.head.length = cpu_to_le16(length);
prop.subtype = subtype;
prop.start = cpu_to_le64(range_lob(range));
prop.end = cpu_to_le64(range_upb(range));
memcpy(buf, &prop, size);
trace_virtio_iommu_fill_resv_property(ep, prop.subtype,
prop.start, prop.end);
buf += size;
}
return total;
}
/**
* virtio_iommu_probe - Fill the probe request buffer with
* the properties the device is able to return
*/
static int virtio_iommu_probe(VirtIOIOMMU *s,
struct virtio_iommu_req_probe *req,
uint8_t *buf)
{
uint32_t ep_id = le32_to_cpu(req->endpoint);
IOMMUMemoryRegion *iommu_mr = virtio_iommu_mr(s, ep_id);
size_t free = VIOMMU_PROBE_SIZE;
IOMMUDevice *sdev;
ssize_t count;
if (!iommu_mr) {
return VIRTIO_IOMMU_S_NOENT;
}
sdev = container_of(iommu_mr, IOMMUDevice, iommu_mr);
count = virtio_iommu_fill_resv_mem_prop(sdev, ep_id, buf, free);
if (count < 0) {
return VIRTIO_IOMMU_S_INVAL;
}
buf += count;
free -= count;
sdev->probe_done = true;
return VIRTIO_IOMMU_S_OK;
}
static int virtio_iommu_iov_to_req(struct iovec *iov,
unsigned int iov_cnt,
void *req, size_t payload_sz)
{
size_t sz = iov_to_buf(iov, iov_cnt, 0, req, payload_sz);
if (unlikely(sz != payload_sz)) {
return VIRTIO_IOMMU_S_INVAL;
}
return 0;
}
#define virtio_iommu_handle_req(__req) \
static int virtio_iommu_handle_ ## __req(VirtIOIOMMU *s, \
struct iovec *iov, \
unsigned int iov_cnt) \
{ \
struct virtio_iommu_req_ ## __req req; \
int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, \
sizeof(req) - sizeof(struct virtio_iommu_req_tail));\
\
return ret ? ret : virtio_iommu_ ## __req(s, &req); \
}
virtio_iommu_handle_req(attach)
virtio_iommu_handle_req(detach)
virtio_iommu_handle_req(map)
virtio_iommu_handle_req(unmap)
static int virtio_iommu_handle_probe(VirtIOIOMMU *s,
struct iovec *iov,
unsigned int iov_cnt,
uint8_t *buf)
{
struct virtio_iommu_req_probe req;
int ret = virtio_iommu_iov_to_req(iov, iov_cnt, &req, sizeof(req));
return ret ? ret : virtio_iommu_probe(s, &req, buf);
}
static void virtio_iommu_handle_command(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
struct virtio_iommu_req_head head;
struct virtio_iommu_req_tail tail = {};
VirtQueueElement *elem;
unsigned int iov_cnt;
struct iovec *iov;
void *buf = NULL;
size_t sz;
for (;;) {
size_t output_size = sizeof(tail);
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
return;
}
if (iov_size(elem->in_sg, elem->in_num) < sizeof(tail) ||
iov_size(elem->out_sg, elem->out_num) < sizeof(head)) {
virtio_error(vdev, "virtio-iommu bad head/tail size");
virtqueue_detach_element(vq, elem, 0);
g_free(elem);
break;
}
iov_cnt = elem->out_num;
iov = elem->out_sg;
sz = iov_to_buf(iov, iov_cnt, 0, &head, sizeof(head));
if (unlikely(sz != sizeof(head))) {
qemu_log_mask(LOG_GUEST_ERROR,
"%s: read %zu bytes from command head"
"but expected %zu\n", __func__, sz, sizeof(head));
tail.status = VIRTIO_IOMMU_S_DEVERR;
goto out;
}
qemu_rec_mutex_lock(&s->mutex);
switch (head.type) {
case VIRTIO_IOMMU_T_ATTACH:
tail.status = virtio_iommu_handle_attach(s, iov, iov_cnt);
break;
case VIRTIO_IOMMU_T_DETACH:
tail.status = virtio_iommu_handle_detach(s, iov, iov_cnt);
break;
case VIRTIO_IOMMU_T_MAP:
tail.status = virtio_iommu_handle_map(s, iov, iov_cnt);
break;
case VIRTIO_IOMMU_T_UNMAP:
tail.status = virtio_iommu_handle_unmap(s, iov, iov_cnt);
break;
case VIRTIO_IOMMU_T_PROBE:
{
struct virtio_iommu_req_tail *ptail;
output_size = s->config.probe_size + sizeof(tail);
buf = g_malloc0(output_size);
ptail = buf + s->config.probe_size;
ptail->status = virtio_iommu_handle_probe(s, iov, iov_cnt, buf);
break;
}
default:
tail.status = VIRTIO_IOMMU_S_UNSUPP;
}
qemu_rec_mutex_unlock(&s->mutex);
out:
sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
buf ? buf : &tail, output_size);
if (unlikely(sz != output_size)) {
qemu_log_mask(LOG_GUEST_ERROR,
"%s: wrote %zu bytes to command response"
"but response size is %zu\n",
__func__, sz, output_size);
tail.status = VIRTIO_IOMMU_S_DEVERR;
/*
* We checked that sizeof(tail) can fit to elem->in_sg at the
* beginning of the loop
*/
output_size = sizeof(tail);
g_free(buf);
buf = NULL;
sz = iov_from_buf(elem->in_sg,
elem->in_num,
0,
&tail,
output_size);
}
assert(sz == output_size);
virtqueue_push(vq, elem, sz);
virtio_notify(vdev, vq);
g_free(elem);
g_free(buf);
buf = NULL;
}
}
static void virtio_iommu_report_fault(VirtIOIOMMU *viommu, uint8_t reason,
int flags, uint32_t endpoint,
uint64_t address)
{
VirtIODevice *vdev = &viommu->parent_obj;
VirtQueue *vq = viommu->event_vq;
struct virtio_iommu_fault fault;
VirtQueueElement *elem;
size_t sz;
memset(&fault, 0, sizeof(fault));
fault.reason = reason;
fault.flags = cpu_to_le32(flags);
fault.endpoint = cpu_to_le32(endpoint);
fault.address = cpu_to_le64(address);
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
error_report_once(
"no buffer available in event queue to report event");
return;
}
if (iov_size(elem->in_sg, elem->in_num) < sizeof(fault)) {
virtio_error(vdev, "error buffer of wrong size");
virtqueue_detach_element(vq, elem, 0);
g_free(elem);
return;
}
sz = iov_from_buf(elem->in_sg, elem->in_num, 0,
&fault, sizeof(fault));
assert(sz == sizeof(fault));
trace_virtio_iommu_report_fault(reason, flags, endpoint, address);
virtqueue_push(vq, elem, sz);
virtio_notify(vdev, vq);
g_free(elem);
}
static IOMMUTLBEntry virtio_iommu_translate(IOMMUMemoryRegion *mr, hwaddr addr,
IOMMUAccessFlags flag,
int iommu_idx)
{
IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
VirtIOIOMMUInterval interval, *mapping_key;
VirtIOIOMMUMapping *mapping_value;
VirtIOIOMMU *s = sdev->viommu;
bool read_fault, write_fault;
VirtIOIOMMUEndpoint *ep;
uint32_t sid, flags;
bool bypass_allowed;
int granule;
bool found;
GList *l;
interval.low = addr;
interval.high = addr + 1;
granule = ctz64(s->config.page_size_mask);
IOMMUTLBEntry entry = {
.target_as = &address_space_memory,
.iova = addr,
.translated_addr = addr,
.addr_mask = BIT_ULL(granule) - 1,
.perm = IOMMU_NONE,
};
bypass_allowed = s->config.bypass;
sid = virtio_iommu_get_bdf(sdev);
trace_virtio_iommu_translate(mr->parent_obj.name, sid, addr, flag);
qemu_rec_mutex_lock(&s->mutex);
ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
if (bypass_allowed)
assert(ep && ep->domain && !ep->domain->bypass);
if (!ep) {
if (!bypass_allowed) {
error_report_once("%s sid=%d is not known!!", __func__, sid);
virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_UNKNOWN,
VIRTIO_IOMMU_FAULT_F_ADDRESS,
sid, addr);
} else {
entry.perm = flag;
}
goto unlock;
}
for (l = sdev->resv_regions; l; l = l->next) {
ReservedRegion *reg = l->data;
if (range_contains(&reg->range, addr)) {
switch (reg->type) {
case VIRTIO_IOMMU_RESV_MEM_T_MSI:
entry.perm = flag;
break;
case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
default:
virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
VIRTIO_IOMMU_FAULT_F_ADDRESS,
sid, addr);
break;
}
goto unlock;
}
}
if (!ep->domain) {
if (!bypass_allowed) {
error_report_once("%s %02x:%02x.%01x not attached to any domain",
__func__, PCI_BUS_NUM(sid),
PCI_SLOT(sid), PCI_FUNC(sid));
virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_DOMAIN,
VIRTIO_IOMMU_FAULT_F_ADDRESS,
sid, addr);
} else {
entry.perm = flag;
}
goto unlock;
} else if (ep->domain->bypass) {
entry.perm = flag;
goto unlock;
}
found = g_tree_lookup_extended(ep->domain->mappings, (gpointer)(&interval),
(void **)&mapping_key,
(void **)&mapping_value);
if (!found) {
error_report_once("%s no mapping for 0x%"PRIx64" for sid=%d",
__func__, addr, sid);
virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
VIRTIO_IOMMU_FAULT_F_ADDRESS,
sid, addr);
goto unlock;
}
read_fault = (flag & IOMMU_RO) &&
!(mapping_value->flags & VIRTIO_IOMMU_MAP_F_READ);
write_fault = (flag & IOMMU_WO) &&
!(mapping_value->flags & VIRTIO_IOMMU_MAP_F_WRITE);
flags = read_fault ? VIRTIO_IOMMU_FAULT_F_READ : 0;
flags |= write_fault ? VIRTIO_IOMMU_FAULT_F_WRITE : 0;
if (flags) {
error_report_once("%s permission error on 0x%"PRIx64"(%d): allowed=%d",
__func__, addr, flag, mapping_value->flags);
flags |= VIRTIO_IOMMU_FAULT_F_ADDRESS;
virtio_iommu_report_fault(s, VIRTIO_IOMMU_FAULT_R_MAPPING,
flags | VIRTIO_IOMMU_FAULT_F_ADDRESS,
sid, addr);
goto unlock;
}
entry.translated_addr = addr - mapping_key->low + mapping_value->phys_addr;
entry.perm = flag;
trace_virtio_iommu_translate_out(addr, entry.translated_addr, sid);
unlock:
qemu_rec_mutex_unlock(&s->mutex);
return entry;
}
static void virtio_iommu_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
struct virtio_iommu_config *dev_config = &dev->config;
struct virtio_iommu_config *out_config = (void *)config_data;
out_config->page_size_mask = cpu_to_le64(dev_config->page_size_mask);
out_config->input_range.start = cpu_to_le64(dev_config->input_range.start);
out_config->input_range.end = cpu_to_le64(dev_config->input_range.end);
out_config->domain_range.start = cpu_to_le32(dev_config->domain_range.start);
out_config->domain_range.end = cpu_to_le32(dev_config->domain_range.end);
out_config->probe_size = cpu_to_le32(dev_config->probe_size);
out_config->bypass = dev_config->bypass;
trace_virtio_iommu_get_config(dev_config->page_size_mask,
dev_config->input_range.start,
dev_config->input_range.end,
dev_config->domain_range.start,
dev_config->domain_range.end,
dev_config->probe_size,
dev_config->bypass);
}
static void virtio_iommu_set_config(VirtIODevice *vdev,
const uint8_t *config_data)
{
VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
struct virtio_iommu_config *dev_config = &dev->config;
const struct virtio_iommu_config *in_config = (void *)config_data;
if (in_config->bypass != dev_config->bypass) {
if (!virtio_vdev_has_feature(vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
virtio_error(vdev, "cannot set config.bypass");
return;
} else if (in_config->bypass != 0 && in_config->bypass != 1) {
virtio_error(vdev, "invalid config.bypass value '%u'",
in_config->bypass);
return;
}
dev_config->bypass = in_config->bypass;
virtio_iommu_switch_address_space_all(dev);
}
trace_virtio_iommu_set_config(in_config->bypass);
}
static uint64_t virtio_iommu_get_features(VirtIODevice *vdev, uint64_t f,
Error **errp)
{
VirtIOIOMMU *dev = VIRTIO_IOMMU(vdev);
f |= dev->features;
trace_virtio_iommu_get_features(f);
return f;
}
static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
{
guint ua = GPOINTER_TO_UINT(a);
guint ub = GPOINTER_TO_UINT(b);
return (ua > ub) - (ua < ub);
}
static gboolean virtio_iommu_remap(gpointer key, gpointer value, gpointer data)
{
VirtIOIOMMUMapping *mapping = (VirtIOIOMMUMapping *) value;
VirtIOIOMMUInterval *interval = (VirtIOIOMMUInterval *) key;
IOMMUMemoryRegion *mr = (IOMMUMemoryRegion *) data;
trace_virtio_iommu_remap(mr->parent_obj.name, interval->low, interval->high,
mapping->phys_addr);
virtio_iommu_notify_map(mr, interval->low, interval->high,
mapping->phys_addr, mapping->flags);
return false;
}
static void virtio_iommu_replay(IOMMUMemoryRegion *mr, IOMMUNotifier *n)
{
IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
VirtIOIOMMU *s = sdev->viommu;
uint32_t sid;
VirtIOIOMMUEndpoint *ep;
sid = virtio_iommu_get_bdf(sdev);
qemu_rec_mutex_lock(&s->mutex);
if (!s->endpoints) {
goto unlock;
}
ep = g_tree_lookup(s->endpoints, GUINT_TO_POINTER(sid));
if (!ep || !ep->domain) {
goto unlock;
}
g_tree_foreach(ep->domain->mappings, virtio_iommu_remap, mr);
unlock:
qemu_rec_mutex_unlock(&s->mutex);
}
static int virtio_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu_mr,
IOMMUNotifierFlag old,
IOMMUNotifierFlag new,
Error **errp)
{
if (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP) {
error_setg(errp, "Virtio-iommu does not support dev-iotlb yet");
return -EINVAL;
}
if (old == IOMMU_NOTIFIER_NONE) {
trace_virtio_iommu_notify_flag_add(iommu_mr->parent_obj.name);
} else if (new == IOMMU_NOTIFIER_NONE) {
trace_virtio_iommu_notify_flag_del(iommu_mr->parent_obj.name);
}
return 0;
}
/*
* The default mask depends on the "granule" property. For example, with
* 4k granule, it is -(4 * KiB). When an assigned device has page size
* restrictions due to the hardware IOMMU configuration, apply this restriction
* to the mask.
*/
static int virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
uint64_t new_mask,
Error **errp)
{
IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr);
VirtIOIOMMU *s = sdev->viommu;
uint64_t cur_mask = s->config.page_size_mask;
trace_virtio_iommu_set_page_size_mask(mr->parent_obj.name, cur_mask,
new_mask);
if ((cur_mask & new_mask) == 0) {
error_setg(errp, "virtio-iommu %s reports a page size mask 0x%"PRIx64
" incompatible with currently supported mask 0x%"PRIx64,
mr->parent_obj.name, new_mask, cur_mask);
return -1;
}
/*
* Once the granule is frozen we can't change the mask anymore. If by
* chance the hotplugged device supports the same granule, we can still
* accept it.
*/
if (s->granule_frozen) {
int cur_granule = ctz64(cur_mask);
if (!(BIT_ULL(cur_granule) & new_mask)) {
error_setg(errp, "virtio-iommu %s does not support frozen granule 0x%llx",
mr->parent_obj.name, BIT_ULL(cur_granule));
return -1;
}
return 0;
}
s->config.page_size_mask &= new_mask;
return 0;
}
static void virtio_iommu_system_reset(void *opaque)
{
VirtIOIOMMU *s = opaque;
trace_virtio_iommu_system_reset();
memset(s->iommu_pcibus_by_bus_num, 0, sizeof(s->iommu_pcibus_by_bus_num));
/*
* config.bypass is sticky across device reset, but should be restored on
* system reset
*/
s->config.bypass = s->boot_bypass;
virtio_iommu_switch_address_space_all(s);
}
static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
{
VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
int granule;
if (likely(s->config.bypass)) {
/*
* Transient IOMMU MR enable to collect page_size_mask requirements
* through memory_region_iommu_set_page_size_mask() called by
* VFIO region_add() callback
*/
s->config.bypass = false;
virtio_iommu_switch_address_space_all(s);
/* restore default */
s->config.bypass = true;
virtio_iommu_switch_address_space_all(s);
}
s->granule_frozen = true;
granule = ctz64(s->config.page_size_mask);
trace_virtio_iommu_freeze_granule(BIT_ULL(granule));
}
static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
virtio_init(vdev, VIRTIO_ID_IOMMU, sizeof(struct virtio_iommu_config));
s->req_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE,
virtio_iommu_handle_command);
s->event_vq = virtio_add_queue(vdev, VIOMMU_DEFAULT_QUEUE_SIZE, NULL);
/*
* config.bypass is needed to get initial address space early, such as
* in vfio realize
*/
s->config.bypass = s->boot_bypass;
if (s->aw_bits < 32 || s->aw_bits > 64) {
error_setg(errp, "aw-bits must be within [32,64]");
return;
}
s->config.input_range.end =
s->aw_bits == 64 ? UINT64_MAX : BIT_ULL(s->aw_bits) - 1;
switch (s->granule_mode) {
case GRANULE_MODE_4K:
s->config.page_size_mask = -(4 * KiB);
break;
case GRANULE_MODE_8K:
s->config.page_size_mask = -(8 * KiB);
break;
case GRANULE_MODE_16K:
s->config.page_size_mask = -(16 * KiB);
break;
case GRANULE_MODE_64K:
s->config.page_size_mask = -(64 * KiB);
break;
case GRANULE_MODE_HOST:
s->config.page_size_mask = qemu_real_host_page_mask();
break;
default:
error_setg(errp, "Unsupported granule mode");
}
s->config.domain_range.end = UINT32_MAX;
s->config.probe_size = VIOMMU_PROBE_SIZE;
virtio_add_feature(&s->features, VIRTIO_RING_F_EVENT_IDX);
virtio_add_feature(&s->features, VIRTIO_RING_F_INDIRECT_DESC);
virtio_add_feature(&s->features, VIRTIO_F_VERSION_1);
virtio_add_feature(&s->features, VIRTIO_IOMMU_F_INPUT_RANGE);
virtio_add_feature(&s->features, VIRTIO_IOMMU_F_DOMAIN_RANGE);
virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MAP_UNMAP);
virtio_add_feature(&s->features, VIRTIO_IOMMU_F_MMIO);
virtio_add_feature(&s->features, VIRTIO_IOMMU_F_PROBE);
virtio_add_feature(&s->features, VIRTIO_IOMMU_F_BYPASS_CONFIG);
qemu_rec_mutex_init(&s->mutex);
s->as_by_busptr = g_hash_table_new_full(NULL, NULL, NULL, g_free);
s->host_iommu_devices = g_hash_table_new_full(hiod_hash, hiod_equal,
g_free, hiod_destroy);
if (s->primary_bus) {
pci_setup_iommu(s->primary_bus, &virtio_iommu_ops, s);
} else {
error_setg(errp, "VIRTIO-IOMMU is not attached to any PCI bus!");
}
s->machine_done.notify = virtio_iommu_freeze_granule;
qemu_add_machine_init_done_notifier(&s->machine_done);
qemu_register_reset(virtio_iommu_system_reset, s);
}
static void virtio_iommu_device_unrealize(DeviceState *dev)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOIOMMU *s = VIRTIO_IOMMU(dev);
qemu_unregister_reset(virtio_iommu_system_reset, s);
qemu_remove_machine_init_done_notifier(&s->machine_done);
g_hash_table_destroy(s->as_by_busptr);
if (s->domains) {
g_tree_destroy(s->domains);
}
if (s->endpoints) {
g_tree_destroy(s->endpoints);
}
qemu_rec_mutex_destroy(&s->mutex);
virtio_delete_queue(s->req_vq);
virtio_delete_queue(s->event_vq);
virtio_cleanup(vdev);
}
static void virtio_iommu_device_reset(VirtIODevice *vdev)
{
VirtIOIOMMU *s = VIRTIO_IOMMU(vdev);
trace_virtio_iommu_device_reset();
if (s->domains) {
g_tree_destroy(s->domains);
}
if (s->endpoints) {
g_tree_destroy(s->endpoints);
}
s->domains = g_tree_new_full((GCompareDataFunc)int_cmp,
NULL, NULL, virtio_iommu_put_domain);
s->endpoints = g_tree_new_full((GCompareDataFunc)int_cmp,
NULL, NULL, virtio_iommu_put_endpoint);
}
static void virtio_iommu_set_status(VirtIODevice *vdev, uint8_t status)
{
trace_virtio_iommu_device_status(status);
}
static void virtio_iommu_instance_init(Object *obj)
{
}
#define VMSTATE_INTERVAL \
{ \
.name = "interval", \
.version_id = 1, \
.minimum_version_id = 1, \
.fields = (const VMStateField[]) { \
VMSTATE_UINT64(low, VirtIOIOMMUInterval), \
VMSTATE_UINT64(high, VirtIOIOMMUInterval), \
VMSTATE_END_OF_LIST() \
} \
}
#define VMSTATE_MAPPING \
{ \
.name = "mapping", \
.version_id = 1, \
.minimum_version_id = 1, \
.fields = (const VMStateField[]) { \
VMSTATE_UINT64(phys_addr, VirtIOIOMMUMapping),\
VMSTATE_UINT32(flags, VirtIOIOMMUMapping), \
VMSTATE_END_OF_LIST() \
}, \
}
static const VMStateDescription vmstate_interval_mapping[2] = {
VMSTATE_MAPPING, /* value */
VMSTATE_INTERVAL /* key */
};
static int domain_preload(void *opaque)
{
VirtIOIOMMUDomain *domain = opaque;
domain->mappings = g_tree_new_full((GCompareDataFunc)interval_cmp,
NULL, g_free, g_free);
return 0;
}
static const VMStateDescription vmstate_endpoint = {
.name = "endpoint",
.version_id = 1,
.minimum_version_id = 1,
.fields = (const VMStateField[]) {
VMSTATE_UINT32(id, VirtIOIOMMUEndpoint),
VMSTATE_END_OF_LIST()
}
};
static const VMStateDescription vmstate_domain = {
.name = "domain",
.version_id = 2,
.minimum_version_id = 2,
.pre_load = domain_preload,
.fields = (const VMStateField[]) {
VMSTATE_UINT32(id, VirtIOIOMMUDomain),
VMSTATE_GTREE_V(mappings, VirtIOIOMMUDomain, 1,
vmstate_interval_mapping,
VirtIOIOMMUInterval, VirtIOIOMMUMapping),
VMSTATE_QLIST_V(endpoint_list, VirtIOIOMMUDomain, 1,
vmstate_endpoint, VirtIOIOMMUEndpoint, next),
VMSTATE_BOOL_V(bypass, VirtIOIOMMUDomain, 2),
VMSTATE_END_OF_LIST()
}
};
static gboolean reconstruct_endpoints(gpointer key, gpointer value,
gpointer data)
{
VirtIOIOMMU *s = (VirtIOIOMMU *)data;
VirtIOIOMMUDomain *d = (VirtIOIOMMUDomain *)value;
VirtIOIOMMUEndpoint *iter;
IOMMUMemoryRegion *mr;
QLIST_FOREACH(iter, &d->endpoint_list, next) {
mr = virtio_iommu_mr(s, iter->id);
assert(mr);
iter->domain = d;
iter->iommu_mr = mr;
g_tree_insert(s->endpoints, GUINT_TO_POINTER(iter->id), iter);
}
return false; /* continue the domain traversal */
}
static int iommu_post_load(void *opaque, int version_id)
{
VirtIOIOMMU *s = opaque;
g_tree_foreach(s->domains, reconstruct_endpoints, s);
/*
* Memory regions are dynamically turned on/off depending on
* 'config.bypass' and attached domain type if there is. After
* migration, we need to make sure the memory regions are
* still correct.
*/
virtio_iommu_switch_address_space_all(s);
return 0;
}
static const VMStateDescription vmstate_virtio_iommu_device = {
.name = "virtio-iommu-device",
.minimum_version_id = 2,
.version_id = 2,
.post_load = iommu_post_load,
.fields = (const VMStateField[]) {
VMSTATE_GTREE_DIRECT_KEY_V(domains, VirtIOIOMMU, 2,
&vmstate_domain, VirtIOIOMMUDomain),
VMSTATE_UINT8_V(config.bypass, VirtIOIOMMU, 2),
VMSTATE_END_OF_LIST()
},
};
static const VMStateDescription vmstate_virtio_iommu = {
.name = "virtio-iommu",
.minimum_version_id = 2,
.priority = MIG_PRI_IOMMU,
.version_id = 2,
.fields = (const VMStateField[]) {
VMSTATE_VIRTIO_DEVICE,
VMSTATE_END_OF_LIST()
},
};
static Property virtio_iommu_properties[] = {
DEFINE_PROP_LINK("primary-bus", VirtIOIOMMU, primary_bus,
TYPE_PCI_BUS, PCIBus *),
DEFINE_PROP_BOOL("boot-bypass", VirtIOIOMMU, boot_bypass, true),
DEFINE_PROP_GRANULE_MODE("granule", VirtIOIOMMU, granule_mode,
GRANULE_MODE_HOST),
DEFINE_PROP_UINT8("aw-bits", VirtIOIOMMU, aw_bits, 64),
DEFINE_PROP_END_OF_LIST(),
};
static void virtio_iommu_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
device_class_set_props(dc, virtio_iommu_properties);
dc->vmsd = &vmstate_virtio_iommu;
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
vdc->realize = virtio_iommu_device_realize;
vdc->unrealize = virtio_iommu_device_unrealize;
vdc->reset = virtio_iommu_device_reset;
vdc->get_config = virtio_iommu_get_config;
vdc->set_config = virtio_iommu_set_config;
vdc->get_features = virtio_iommu_get_features;
vdc->set_status = virtio_iommu_set_status;
vdc->vmsd = &vmstate_virtio_iommu_device;
}
static void virtio_iommu_memory_region_class_init(ObjectClass *klass,
void *data)
{
IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
imrc->translate = virtio_iommu_translate;
imrc->replay = virtio_iommu_replay;
imrc->notify_flag_changed = virtio_iommu_notify_flag_changed;
imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask;
}
static const TypeInfo virtio_iommu_info = {
.name = TYPE_VIRTIO_IOMMU,
.parent = TYPE_VIRTIO_DEVICE,
.instance_size = sizeof(VirtIOIOMMU),
.instance_init = virtio_iommu_instance_init,
.class_init = virtio_iommu_class_init,
};
static const TypeInfo virtio_iommu_memory_region_info = {
.parent = TYPE_IOMMU_MEMORY_REGION,
.name = TYPE_VIRTIO_IOMMU_MEMORY_REGION,
.class_init = virtio_iommu_memory_region_class_init,
};
static void virtio_register_types(void)
{
type_register_static(&virtio_iommu_info);
type_register_static(&virtio_iommu_memory_region_info);
}
type_init(virtio_register_types)