2014-12-22 19:54:51 +03:00
|
|
|
/*
|
|
|
|
* generic functions used by VFIO devices
|
|
|
|
*
|
|
|
|
* Copyright Red Hat, Inc. 2012
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Alex Williamson <alex.williamson@redhat.com>
|
|
|
|
*
|
|
|
|
* This work is licensed under the terms of the GNU GPL, version 2. See
|
|
|
|
* the COPYING file in the top-level directory.
|
|
|
|
*
|
|
|
|
* Based on qemu-kvm device-assignment:
|
|
|
|
* Adapted for KVM by Qumranet.
|
|
|
|
* Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
|
|
|
|
* Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
|
|
|
|
* Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
|
|
|
|
* Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
|
|
|
|
* Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
|
|
|
|
*/
|
|
|
|
|
2016-01-26 21:17:14 +03:00
|
|
|
#include "qemu/osdep.h"
|
2014-12-22 19:54:51 +03:00
|
|
|
#include <sys/ioctl.h>
|
2016-06-22 20:11:19 +03:00
|
|
|
#ifdef CONFIG_KVM
|
|
|
|
#include <linux/kvm.h>
|
|
|
|
#endif
|
2014-12-22 19:54:51 +03:00
|
|
|
#include <linux/vfio.h>
|
|
|
|
|
|
|
|
#include "hw/vfio/vfio-common.h"
|
|
|
|
#include "hw/vfio/vfio.h"
|
|
|
|
#include "exec/address-spaces.h"
|
|
|
|
#include "exec/memory.h"
|
2020-10-26 12:36:23 +03:00
|
|
|
#include "exec/ram_addr.h"
|
2014-12-22 19:54:51 +03:00
|
|
|
#include "hw/hw.h"
|
|
|
|
#include "qemu/error-report.h"
|
Include qemu/main-loop.h less
In my "build everything" tree, changing qemu/main-loop.h triggers a
recompile of some 5600 out of 6600 objects (not counting tests and
objects that don't depend on qemu/osdep.h). It includes block/aio.h,
which in turn includes qemu/event_notifier.h, qemu/notify.h,
qemu/processor.h, qemu/qsp.h, qemu/queue.h, qemu/thread-posix.h,
qemu/thread.h, qemu/timer.h, and a few more.
Include qemu/main-loop.h only where it's needed. Touching it now
recompiles only some 1700 objects. For block/aio.h and
qemu/event_notifier.h, these numbers drop from 5600 to 2800. For the
others, they shrink only slightly.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <20190812052359.30071-21-armbru@redhat.com>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Tested-by: Philippe Mathieu-Daudé <philmd@redhat.com>
2019-08-12 08:23:50 +03:00
|
|
|
#include "qemu/main-loop.h"
|
2016-07-04 06:33:05 +03:00
|
|
|
#include "qemu/range.h"
|
2014-12-22 19:54:51 +03:00
|
|
|
#include "sysemu/kvm.h"
|
2019-08-12 08:23:38 +03:00
|
|
|
#include "sysemu/reset.h"
|
2021-04-13 12:55:27 +03:00
|
|
|
#include "sysemu/runstate.h"
|
2014-12-22 19:54:51 +03:00
|
|
|
#include "trace.h"
|
2016-10-17 19:57:59 +03:00
|
|
|
#include "qapi/error.h"
|
2020-10-26 12:36:23 +03:00
|
|
|
#include "migration/migration.h"
|
2023-02-16 17:36:23 +03:00
|
|
|
#include "migration/misc.h"
|
2023-02-16 17:36:24 +03:00
|
|
|
#include "migration/blocker.h"
|
2023-03-07 15:54:38 +03:00
|
|
|
#include "migration/qemu-file.h"
|
2022-05-06 16:25:10 +03:00
|
|
|
#include "sysemu/tpm.h"
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2018-12-06 13:56:15 +03:00
|
|
|
VFIOGroupList vfio_group_list =
|
2015-02-04 21:45:32 +03:00
|
|
|
QLIST_HEAD_INITIALIZER(vfio_group_list);
|
2018-12-10 19:58:54 +03:00
|
|
|
static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces =
|
2014-12-22 19:54:51 +03:00
|
|
|
QLIST_HEAD_INITIALIZER(vfio_address_spaces);
|
|
|
|
|
|
|
|
#ifdef CONFIG_KVM
/*
 * There is exactly one VFIO pseudo device per KVM VM, and once it has been
 * created it exists for as long as the VM does.  Closing the file
 * descriptor merely drops our reference to it and the device's reference
 * to kvm.  Consequently, after initialization this descriptor is released
 * only when QEMU exits, and it is re-used if another vfio device is
 * attached before then.
 *
 * -1 means the pseudo device has not been set up yet.
 */
static int vfio_kvm_device_fd = -1;
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Common VFIO interrupt disable
|
|
|
|
*/
|
|
|
|
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
|
|
|
|
{
|
|
|
|
struct vfio_irq_set irq_set = {
|
|
|
|
.argsz = sizeof(irq_set),
|
|
|
|
.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
|
|
|
|
.index = index,
|
|
|
|
.start = 0,
|
|
|
|
.count = 0,
|
|
|
|
};
|
|
|
|
|
|
|
|
ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Unmask sub-index 0 of the given IRQ index: VFIO_DEVICE_SET_IRQS with
 * DATA_NONE | ACTION_UNMASK, start 0, count 1.  The ioctl result is not
 * checked.
 */
void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
    /* start is left at zero by the initializer. */
    struct vfio_irq_set unmask_req = { .argsz = sizeof(unmask_req) };

    unmask_req.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
    unmask_req.index = index;
    unmask_req.count = 1;

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &unmask_req);
}
|
|
|
|
|
|
|
|
/*
 * Mask sub-index 0 of the given IRQ index: VFIO_DEVICE_SET_IRQS with
 * DATA_NONE | ACTION_MASK, start 0, count 1.  The ioctl result is not
 * checked.
 */
void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
    /* start is left at zero by the initializer. */
    struct vfio_irq_set mask_req = { .argsz = sizeof(mask_req) };

    mask_req.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
    mask_req.index = index;
    mask_req.count = 1;

    ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &mask_req);
}
|
|
|
|
|
2019-06-13 18:57:37 +03:00
|
|
|
static inline const char *action_to_str(int action)
|
|
|
|
{
|
|
|
|
switch (action) {
|
|
|
|
case VFIO_IRQ_SET_ACTION_MASK:
|
|
|
|
return "MASK";
|
|
|
|
case VFIO_IRQ_SET_ACTION_UNMASK:
|
|
|
|
return "UNMASK";
|
|
|
|
case VFIO_IRQ_SET_ACTION_TRIGGER:
|
|
|
|
return "TRIGGER";
|
|
|
|
default:
|
|
|
|
return "UNKNOWN ACTION";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return a short name for a PCI IRQ index ("INTX", "MSI", "MSIX", "ERR",
 * "REQ").  Returns NULL when the device is not a PCI device or the index
 * is not one of those five.
 */
static const char *index_to_str(VFIODevice *vbasedev, int index)
{
    static const char *const pci_irq_names[] = {
        [VFIO_PCI_INTX_IRQ_INDEX] = "INTX",
        [VFIO_PCI_MSI_IRQ_INDEX]  = "MSI",
        [VFIO_PCI_MSIX_IRQ_INDEX] = "MSIX",
        [VFIO_PCI_ERR_IRQ_INDEX]  = "ERR",
        [VFIO_PCI_REQ_IRQ_INDEX]  = "REQ",
    };

    if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
        return NULL;
    }

    if (index < 0 || index >= (int)ARRAY_SIZE(pci_irq_names)) {
        return NULL;
    }

    /* Slots without an explicit initializer are NULL. */
    return pci_irq_names[index];
}
|
|
|
|
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
/*
 * Enable or disable (per @state) RAM block discarding on behalf of
 * @container, choosing the disable flavour by IOMMU type.  Returns whatever
 * the selected ram_block_*_discard_disable() helper returns.
 */
static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state)
{
    const bool is_type1 = container->iommu_type == VFIO_TYPE1v2_IOMMU ||
                          container->iommu_type == VFIO_TYPE1_IOMMU;

    if (is_type1) {
        /*
         * We support coordinated discarding of RAM via the RamDiscardManager.
         */
        return ram_block_uncoordinated_discard_disable(state);
    }

    /*
     * VFIO_SPAPR_TCE_IOMMU most probably works just fine with
     * RamDiscardManager, however, it is completely untested.
     *
     * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does
     * completely the opposite of managing mapping/pinning dynamically as
     * required by RamDiscardManager. We would have to special-case sections
     * with a RamDiscardManager.
     */
    return ram_block_discard_disable(state);
}
|
|
|
|
|
2019-06-13 18:57:37 +03:00
|
|
|
/*
 * Attach (or, with a negative @fd, tear down) eventfd signaling for one
 * interrupt of @vbasedev.
 *
 * @index/@subindex select the IRQ, @action is a VFIO_IRQ_SET_ACTION_* flag
 * and @fd is the eventfd passed as the single data element.
 *
 * Returns 0 on success.  On failure returns the negative errno of the
 * VFIO_DEVICE_SET_IRQS ioctl and sets @errp to a message naming the
 * interrupt and the action.
 */
int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
                           int action, int fd, Error **errp)
{
    const int req_len = sizeof(struct vfio_irq_set) + sizeof(int32_t);
    struct vfio_irq_set *req = g_malloc0(req_len);
    const char *irq_name;
    int rc = 0;

    req->argsz = req_len;
    req->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
    req->index = index;
    req->start = subindex;
    req->count = 1;
    /* The one eventfd is carried in the trailing data area. */
    *(int32_t *)&req->data = fd;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, req)) {
        /* Capture errno before g_free() gets a chance to touch it. */
        rc = -errno;
    }
    g_free(req);

    if (rc == 0) {
        return 0;
    }

    error_setg_errno(errp, -rc, "VFIO_DEVICE_SET_IRQS failure");

    /* Prefixes are prepended innermost-first: IRQ identity, then summary. */
    irq_name = index_to_str(vbasedev, index);
    if (irq_name != NULL) {
        error_prepend(errp, "%s-%d: ", irq_name, subindex);
    } else {
        error_prepend(errp, "index %d-%d: ", index, subindex);
    }
    error_prepend(errp,
                  "Failed to %s %s eventfd signaling for interrupt ",
                  fd < 0 ? "tear down" : "set up", action_to_str(action));
    return rc;
}
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
/*
|
|
|
|
* IO Port/MMIO - Beware of the endians, VFIO is always little endian
|
|
|
|
*/
|
|
|
|
/*
 * MemoryRegionOps write callback: store @size bytes of @data at offset
 * @addr of the VFIORegion passed as @opaque.
 *
 * The value is converted to little endian and written with pwrite() on the
 * device fd at region->fd_offset + addr.  A short or failed write is only
 * reported; an unsupported @size goes to hw_error().
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } le_val;
    ssize_t written;

    if (size == 1) {
        le_val.byte = data;
    } else if (size == 2) {
        le_val.word = cpu_to_le16(data);
    } else if (size == 4) {
        le_val.dword = cpu_to_le32(data);
    } else if (size == 8) {
        le_val.qword = cpu_to_le64(data);
    } else {
        hw_error("vfio: unsupported write size, %u bytes", size);
    }

    written = pwrite(vbasedev->fd, &le_val, size, region->fd_offset + addr);
    if (written != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI.  This will
     * do nothing if not pending (including not in INTx mode).  We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt.  Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}
|
|
|
|
|
|
|
|
uint64_t vfio_region_read(void *opaque,
|
|
|
|
hwaddr addr, unsigned size)
|
|
|
|
{
|
|
|
|
VFIORegion *region = opaque;
|
|
|
|
VFIODevice *vbasedev = region->vbasedev;
|
|
|
|
union {
|
|
|
|
uint8_t byte;
|
|
|
|
uint16_t word;
|
|
|
|
uint32_t dword;
|
|
|
|
uint64_t qword;
|
|
|
|
} buf;
|
|
|
|
uint64_t data = 0;
|
|
|
|
|
|
|
|
if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
|
|
|
|
error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
|
|
|
|
__func__, vbasedev->name, region->nr,
|
|
|
|
addr, size);
|
|
|
|
return (uint64_t)-1;
|
|
|
|
}
|
|
|
|
switch (size) {
|
|
|
|
case 1:
|
|
|
|
data = buf.byte;
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
data = le16_to_cpu(buf.word);
|
|
|
|
break;
|
|
|
|
case 4:
|
|
|
|
data = le32_to_cpu(buf.dword);
|
|
|
|
break;
|
2017-05-03 23:52:34 +03:00
|
|
|
case 8:
|
|
|
|
data = le64_to_cpu(buf.qword);
|
|
|
|
break;
|
2014-12-22 19:54:51 +03:00
|
|
|
default:
|
2020-10-19 17:23:46 +03:00
|
|
|
hw_error("vfio: unsupported read size, %u bytes", size);
|
2014-12-22 19:54:51 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
|
|
|
|
|
|
|
|
/* Same as write above */
|
|
|
|
vbasedev->ops->vfio_eoi(vbasedev);
|
|
|
|
|
|
|
|
return data;
|
|
|
|
}
|
|
|
|
|
|
|
|
const MemoryRegionOps vfio_region_ops = {
|
|
|
|
.read = vfio_region_read,
|
|
|
|
.write = vfio_region_write,
|
|
|
|
.endianness = DEVICE_LITTLE_ENDIAN,
|
vfio: Set MemoryRegionOps:max_access_size and min_access_size
Sets valid.max_access_size and valid.min_access_size to ensure safe
8-byte accesses to vfio. Today, 8-byte accesses are broken into pairs
of 4-byte calls that goes unprotected:
qemu_mutex_lock locked mutex 0x10905ad8
vfio_region_write (0001:03:00.0:region1+0xc0, 0x2020c, 4)
qemu_mutex_unlock unlocked mutex 0x10905ad8
qemu_mutex_lock locked mutex 0x10905ad8
vfio_region_write (0001:03:00.0:region1+0xc4, 0xa0000, 4)
qemu_mutex_unlock unlocked mutex 0x10905ad8
which occasionally leads to:
qemu_mutex_lock locked mutex 0x10905ad8
vfio_region_write (0001:03:00.0:region1+0xc0, 0x2030c, 4)
qemu_mutex_unlock unlocked mutex 0x10905ad8
qemu_mutex_lock locked mutex 0x10905ad8
vfio_region_write (0001:03:00.0:region1+0xc0, 0x1000c, 4)
qemu_mutex_unlock unlocked mutex 0x10905ad8
qemu_mutex_lock locked mutex 0x10905ad8
vfio_region_write (0001:03:00.0:region1+0xc4, 0xb0000, 4)
qemu_mutex_unlock unlocked mutex 0x10905ad8
qemu_mutex_lock locked mutex 0x10905ad8
vfio_region_write (0001:03:00.0:region1+0xc4, 0xa0000, 4)
qemu_mutex_unlock unlocked mutex 0x10905ad8
causing strange errors in guest OS. With this patch, such accesses
are protected by the same lock guard:
qemu_mutex_lock locked mutex 0x10905ad8
vfio_region_write (0001:03:00.0:region1+0xc0, 0x2000c, 4)
vfio_region_write (0001:03:00.0:region1+0xc4, 0xb0000, 4)
qemu_mutex_unlock unlocked mutex 0x10905ad8
This happens because the 8-byte write should be broken into 4-byte
writes by memory.c:access_with_adjusted_size() in order to be under
the same lock. Today, it's done in exec.c:address_space_write_continue()
which was able to handle only 4 bytes due to a zero'ed
valid.max_access_size (see exec.c:memory_access_size()).
Signed-off-by: Jose Ricardo Ziviani <joserz@linux.vnet.ibm.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2017-05-03 23:52:34 +03:00
|
|
|
.valid = {
|
|
|
|
.min_access_size = 1,
|
|
|
|
.max_access_size = 8,
|
|
|
|
},
|
2017-05-03 23:52:34 +03:00
|
|
|
.impl = {
|
|
|
|
.min_access_size = 1,
|
|
|
|
.max_access_size = 8,
|
|
|
|
},
|
2014-12-22 19:54:51 +03:00
|
|
|
};
|
|
|
|
|
2020-10-26 12:36:23 +03:00
|
|
|
/*
|
|
|
|
* Device state interfaces
|
|
|
|
*/
|
|
|
|
|
2023-03-07 15:54:39 +03:00
|
|
|
typedef struct {
|
|
|
|
unsigned long *bitmap;
|
|
|
|
hwaddr size;
|
|
|
|
hwaddr pages;
|
|
|
|
} VFIOBitmap;
|
|
|
|
|
|
|
|
/*
 * Fill in @vbmap for a range of @size bytes: one bit per real host page,
 * with the bit count rounded up to a multiple of 64 before converting to
 * bytes.  The buffer is zero-filled.
 *
 * Returns 0 on success, -ENOMEM if the buffer cannot be allocated.  The
 * caller releases vbmap->bitmap with g_free().
 */
static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
{
    hwaddr npages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
    hwaddr nbytes = ROUND_UP(npages, sizeof(__u64) * BITS_PER_BYTE) /
                    BITS_PER_BYTE;

    vbmap->pages = npages;
    vbmap->size = nbytes;
    vbmap->bitmap = g_try_malloc0(nbytes);

    return vbmap->bitmap ? 0 : -ENOMEM;
}
|
|
|
|
|
2023-03-07 15:54:47 +03:00
|
|
|
/* Forward declaration: vfio_dma_unmap() calls this before its definition. */
static int vfio_get_dirty_bitmap(VFIOContainer *container,
                                 uint64_t iova, uint64_t size,
                                 ram_addr_t ram_addr);
|
|
|
|
|
2020-10-26 12:36:27 +03:00
|
|
|
bool vfio_mig_active(void)
|
|
|
|
{
|
|
|
|
VFIOGroup *group;
|
|
|
|
VFIODevice *vbasedev;
|
|
|
|
|
|
|
|
if (QLIST_EMPTY(&vfio_group_list)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
QLIST_FOREACH(group, &vfio_group_list, next) {
|
|
|
|
QLIST_FOREACH(vbasedev, &group->device_list, next) {
|
|
|
|
if (vbasedev->migration_blocker) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-02-16 17:36:24 +03:00
|
|
|
static Error *multiple_devices_migration_blocker;
|
2023-03-07 15:54:48 +03:00
|
|
|
static Error *giommu_migration_blocker;
|
2023-02-16 17:36:24 +03:00
|
|
|
|
|
|
|
static unsigned int vfio_migratable_device_num(void)
|
|
|
|
{
|
|
|
|
VFIOGroup *group;
|
|
|
|
VFIODevice *vbasedev;
|
|
|
|
unsigned int device_num = 0;
|
|
|
|
|
|
|
|
QLIST_FOREACH(group, &vfio_group_list, next) {
|
|
|
|
QLIST_FOREACH(vbasedev, &group->device_list, next) {
|
|
|
|
if (vbasedev->migration) {
|
|
|
|
device_num++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return device_num;
|
|
|
|
}
|
|
|
|
|
2023-06-28 10:31:12 +03:00
|
|
|
/*
 * Register a migration blocker once more than one migratable VFIO device
 * exists.
 *
 * Does nothing (returns 0) if the blocker is already registered or there
 * is at most one migratable device.  If @vbasedev has enable_migration set
 * to ON, no blocker is added; @errp is set and -EINVAL returned instead.
 * Otherwise returns the result of migrate_add_blocker().
 */
int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
{
    int rc;

    if (multiple_devices_migration_blocker) {
        return 0;
    }
    if (vfio_migratable_device_num() <= 1) {
        return 0;
    }

    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_setg(errp, "Migration is currently not supported with multiple "
                         "VFIO devices");
        return -EINVAL;
    }

    error_setg(&multiple_devices_migration_blocker,
               "Migration is currently not supported with multiple "
               "VFIO devices");
    rc = migrate_add_blocker(multiple_devices_migration_blocker, errp);
    if (rc >= 0) {
        return rc;
    }

    /* Registration failed: drop the Error so a later call can retry. */
    error_free(multiple_devices_migration_blocker);
    multiple_devices_migration_blocker = NULL;
    return rc;
}
|
|
|
|
|
|
|
|
void vfio_unblock_multiple_devices_migration(void)
|
|
|
|
{
|
|
|
|
if (!multiple_devices_migration_blocker ||
|
|
|
|
vfio_migratable_device_num() > 1) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
migrate_del_blocker(multiple_devices_migration_blocker);
|
|
|
|
error_free(multiple_devices_migration_blocker);
|
|
|
|
multiple_devices_migration_blocker = NULL;
|
2023-03-07 15:54:48 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool vfio_viommu_preset(void)
|
|
|
|
{
|
|
|
|
VFIOAddressSpace *space;
|
|
|
|
|
|
|
|
QLIST_FOREACH(space, &vfio_address_spaces, list) {
|
|
|
|
if (space->as != &address_space_memory) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-06-28 10:31:12 +03:00
|
|
|
/*
 * Register a migration blocker when vfio_viommu_preset() reports a vIOMMU.
 *
 * Does nothing (returns 0) if the blocker is already registered or no
 * vIOMMU is present.  If @vbasedev has enable_migration set to ON, no
 * blocker is added; @errp is set and -EINVAL returned instead.  Otherwise
 * returns the result of migrate_add_blocker().
 */
int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
{
    int rc;

    if (giommu_migration_blocker) {
        return 0;
    }
    if (!vfio_viommu_preset()) {
        return 0;
    }

    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
        error_setg(errp,
                   "Migration is currently not supported with vIOMMU enabled");
        return -EINVAL;
    }

    error_setg(&giommu_migration_blocker,
               "Migration is currently not supported with vIOMMU enabled");
    rc = migrate_add_blocker(giommu_migration_blocker, errp);
    if (rc >= 0) {
        return rc;
    }

    /* Registration failed: drop the Error so a later call can retry. */
    error_free(giommu_migration_blocker);
    giommu_migration_blocker = NULL;
    return rc;
}
|
|
|
|
|
2023-03-07 19:53:46 +03:00
|
|
|
void vfio_migration_finalize(void)
|
2023-03-07 15:54:48 +03:00
|
|
|
{
|
|
|
|
if (!giommu_migration_blocker ||
|
|
|
|
vfio_viommu_preset()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
migrate_del_blocker(giommu_migration_blocker);
|
|
|
|
error_free(giommu_migration_blocker);
|
|
|
|
giommu_migration_blocker = NULL;
|
2023-02-16 17:36:24 +03:00
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:38 +03:00
|
|
|
static void vfio_set_migration_error(int err)
|
|
|
|
{
|
|
|
|
MigrationState *ms = migrate_get_current();
|
|
|
|
|
|
|
|
if (migration_is_setup_or_active(ms->state)) {
|
|
|
|
WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
|
|
|
|
if (ms->to_dst_file) {
|
|
|
|
qemu_file_set_error(ms->to_dst_file, err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
vfio/migrate: Move switch of dirty tracking into vfio_memory_listener
For now the switch of vfio dirty page tracking is integrated into
@vfio_save_handler. The reason is that some PCI vendor driver may
start to track dirty base on _SAVING state of device, so if dirty
tracking is started before setting device state, vfio will report
full-dirty to QEMU.
However, the dirty bmap of all ramblocks is fully set when setting up
ram saving, so it does not matter whether the device is in the _SAVING
state when vfio dirty tracking starts.
Moreover, this logic causes some problems [1]. The object of dirty
tracking is guest memory, but the object of @vfio_save_handler is
device state, which produces unnecessary coupling and conflicts:
1. Coupling: Their saving granule is different (perVM vs perDevice).
vfio will enable dirty_page_tracking for each devices, actually
once is enough.
2. Conflicts: The ram_save_setup() traverses all memory_listeners
to execute their log_start() and log_sync() hooks to get the
first round dirty bitmap, which is used by the bulk stage of
ram saving. However, as vfio dirty tracking is not yet started,
it can't get dirty bitmap from vfio. Then we give up the chance
to handle vfio dirty page at bulk stage.
Moving the switch of vfio dirty_page_tracking into vfio_memory_listener
solves the above problems. Besides, do not require devices to be in the
SAVING state for vfio_sync_dirty_bitmap().
[1] https://www.spinics.net/lists/kvm/msg229967.html
Reported-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-09 06:19:13 +03:00
|
|
|
/*
 * Returns true only if migration is in the ACTIVE or DEVICE state and
 * every device in @container has migration support and is not excluded
 * from pre-copy dirty tracking.
 *
 * A device is excluded when pre_copy_dirty_page_tracking is OFF while its
 * device state is RUNNING or PRE_COPY.
 */
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
    MigrationState *mig_state = migrate_get_current();
    VFIOGroup *grp;
    VFIODevice *dev;

    if (mig_state->state != MIGRATION_STATUS_ACTIVE &&
        mig_state->state != MIGRATION_STATUS_DEVICE) {
        return false;
    }

    QLIST_FOREACH(grp, &container->group_list, container_next) {
        QLIST_FOREACH(dev, &grp->device_list, next) {
            VFIOMigration *mig = dev->migration;

            if (mig == NULL) {
                return false;
            }

            if (dev->pre_copy_dirty_page_tracking != ON_OFF_AUTO_OFF) {
                continue;
            }

            if (mig->device_state == VFIO_DEVICE_STATE_RUNNING ||
                mig->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
                return false;
            }
        }
    }

    return true;
}
|
|
|
|
|
2023-03-07 15:54:45 +03:00
|
|
|
/*
 * Returns true if every device in every group of @container has
 * dirty_pages_supported set (vacuously true with no devices).
 */
static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
{
    VFIOGroup *grp;
    VFIODevice *dev;
    bool all_supported = true;

    QLIST_FOREACH(grp, &container->group_list, container_next) {
        QLIST_FOREACH(dev, &grp->device_list, next) {
            if (!dev->dirty_pages_supported) {
                all_supported = false;
                goto done;
            }
        }
    }

done:
    return all_supported;
}
|
|
|
|
|
2023-02-16 17:36:23 +03:00
|
|
|
/*
|
|
|
|
* Check if all VFIO devices are running and migration is active, which is
|
|
|
|
* essentially equivalent to the migration being in pre-copy phase.
|
|
|
|
*/
|
|
|
|
/*
 * Returns true when migration is active and every device in @container
 * has migration support and is in the RUNNING or PRE_COPY device state.
 */
static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
{
    VFIOGroup *grp;
    VFIODevice *dev;

    if (!migration_is_active(migrate_get_current())) {
        return false;
    }

    QLIST_FOREACH(grp, &container->group_list, container_next) {
        QLIST_FOREACH(dev, &grp->device_list, next) {
            VFIOMigration *mig = dev->migration;

            if (mig == NULL) {
                return false;
            }

            /* Any state other than RUNNING/PRE_COPY disqualifies the lot. */
            if (mig->device_state != VFIO_DEVICE_STATE_RUNNING &&
                mig->device_state != VFIO_DEVICE_STATE_PRE_COPY) {
                return false;
            }
        }
    }

    return true;
}
|
|
|
|
|
|
|
|
static int vfio_dma_unmap_bitmap(VFIOContainer *container,
|
|
|
|
hwaddr iova, ram_addr_t size,
|
|
|
|
IOMMUTLBEntry *iotlb)
|
|
|
|
{
|
|
|
|
struct vfio_iommu_type1_dma_unmap *unmap;
|
|
|
|
struct vfio_bitmap *bitmap;
|
2023-03-07 15:54:39 +03:00
|
|
|
VFIOBitmap vbmap;
|
2020-10-26 12:36:25 +03:00
|
|
|
int ret;
|
|
|
|
|
2023-03-07 15:54:39 +03:00
|
|
|
ret = vfio_bitmap_alloc(&vbmap, size);
|
|
|
|
if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-10-26 12:36:25 +03:00
|
|
|
unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
|
|
|
|
|
|
|
|
unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
|
|
|
|
unmap->iova = iova;
|
|
|
|
unmap->size = size;
|
|
|
|
unmap->flags |= VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP;
|
|
|
|
bitmap = (struct vfio_bitmap *)&unmap->data;
|
|
|
|
|
|
|
|
/*
|
2021-03-04 16:34:46 +03:00
|
|
|
* cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
|
|
|
|
* qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
|
|
|
|
* to qemu_real_host_page_size.
|
2020-10-26 12:36:25 +03:00
|
|
|
*/
|
2022-03-23 18:57:22 +03:00
|
|
|
bitmap->pgsize = qemu_real_host_page_size();
|
2023-03-07 15:54:39 +03:00
|
|
|
bitmap->size = vbmap.size;
|
|
|
|
bitmap->data = (__u64 *)vbmap.bitmap;
|
2020-10-26 12:36:25 +03:00
|
|
|
|
2023-03-07 15:54:39 +03:00
|
|
|
if (vbmap.size > container->max_dirty_bitmap_size) {
|
|
|
|
error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
|
2020-10-26 12:36:25 +03:00
|
|
|
ret = -E2BIG;
|
|
|
|
goto unmap_exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
|
|
|
|
if (!ret) {
|
2023-03-07 15:54:39 +03:00
|
|
|
cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
|
|
|
|
iotlb->translated_addr, vbmap.pages);
|
2020-10-26 12:36:25 +03:00
|
|
|
} else {
|
|
|
|
error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
|
|
|
|
}
|
|
|
|
|
|
|
|
unmap_exit:
|
|
|
|
g_free(unmap);
|
2023-03-07 15:54:39 +03:00
|
|
|
g_free(vbmap.bitmap);
|
|
|
|
|
2020-10-26 12:36:25 +03:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
/*
|
|
|
|
* DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
|
|
|
|
*/
|
|
|
|
static int vfio_dma_unmap(VFIOContainer *container,
|
2020-10-26 12:36:25 +03:00
|
|
|
hwaddr iova, ram_addr_t size,
|
|
|
|
IOMMUTLBEntry *iotlb)
|
2014-12-22 19:54:51 +03:00
|
|
|
{
|
|
|
|
struct vfio_iommu_type1_dma_unmap unmap = {
|
|
|
|
.argsz = sizeof(unmap),
|
|
|
|
.flags = 0,
|
|
|
|
.iova = iova,
|
|
|
|
.size = size,
|
|
|
|
};
|
2023-03-07 15:54:47 +03:00
|
|
|
bool need_dirty_sync = false;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
|
|
|
|
if (!vfio_devices_all_device_dirty_tracking(container) &&
|
|
|
|
container->dirty_pages_supported) {
|
|
|
|
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
|
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2023-03-07 15:54:47 +03:00
|
|
|
need_dirty_sync = true;
|
2020-10-26 12:36:25 +03:00
|
|
|
}
|
|
|
|
|
2019-02-22 07:07:03 +03:00
|
|
|
while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
|
|
|
|
/*
|
|
|
|
* The type1 backend has an off-by-one bug in the kernel (71a7d3d78e3c
|
|
|
|
* v4.15) where an overflow in its wrap-around check prevents us from
|
|
|
|
* unmapping the last page of the address space. Test for the error
|
|
|
|
* condition and re-try the unmap excluding the last page. The
|
|
|
|
* expectation is that we've never mapped the last page anyway and this
|
|
|
|
* unmap request comes via vIOMMU support which also makes it unlikely
|
|
|
|
* that this page is used. This bug was introduced well after type1 v2
|
|
|
|
* support was introduced, so we shouldn't need to test for v1. A fix
|
|
|
|
* is queued for kernel v5.0 so this workaround can be removed once
|
|
|
|
* affected kernels are sufficiently deprecated.
|
|
|
|
*/
|
|
|
|
if (errno == EINVAL && unmap.size && !(unmap.iova + unmap.size) &&
|
|
|
|
container->iommu_type == VFIO_TYPE1v2_IOMMU) {
|
|
|
|
trace_vfio_dma_unmap_overflow_workaround();
|
|
|
|
unmap.size -= 1ULL << ctz64(container->pgsizes);
|
|
|
|
continue;
|
|
|
|
}
|
2020-02-14 12:55:19 +03:00
|
|
|
error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
|
2014-12-22 19:54:51 +03:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:47 +03:00
|
|
|
if (need_dirty_sync) {
|
|
|
|
ret = vfio_get_dirty_bitmap(container, iova, size,
|
|
|
|
iotlb->translated_addr);
|
|
|
|
if (ret) {
|
|
|
|
return ret;
|
|
|
|
}
|
2023-02-16 17:36:22 +03:00
|
|
|
}
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vfio_dma_map(VFIOContainer *container, hwaddr iova,
|
|
|
|
ram_addr_t size, void *vaddr, bool readonly)
|
|
|
|
{
|
|
|
|
struct vfio_iommu_type1_dma_map map = {
|
|
|
|
.argsz = sizeof(map),
|
|
|
|
.flags = VFIO_DMA_MAP_FLAG_READ,
|
|
|
|
.vaddr = (__u64)(uintptr_t)vaddr,
|
|
|
|
.iova = iova,
|
|
|
|
.size = size,
|
|
|
|
};
|
|
|
|
|
|
|
|
if (!readonly) {
|
|
|
|
map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Try the mapping, if it fails with EBUSY, unmap the region and try
|
|
|
|
* again. This shouldn't be necessary, but we sometimes see it in
|
2015-08-26 14:17:13 +03:00
|
|
|
* the VGA ROM space.
|
2014-12-22 19:54:51 +03:00
|
|
|
*/
|
|
|
|
if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 ||
|
2020-10-26 12:36:25 +03:00
|
|
|
(errno == EBUSY && vfio_dma_unmap(container, iova, size, NULL) == 0 &&
|
2014-12-22 19:54:51 +03:00
|
|
|
ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-02-14 12:55:19 +03:00
|
|
|
error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
|
2014-12-22 19:54:51 +03:00
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
2016-07-04 06:33:05 +03:00
|
|
|
/*
 * Record a host DMA window [@min_iova, @max_iova] with page-size mask
 * @iova_pgsizes at the head of the container's hostwin_list.  Calls
 * hw_error() if the new window overlaps one that is already recorded.
 */
static void vfio_host_win_add(VFIOContainer *container,
                              hwaddr min_iova, hwaddr max_iova,
                              uint64_t iova_pgsizes)
{
    VFIOHostDMAWindow *win;

    QLIST_FOREACH(win, &container->hostwin_list, hostwin_next) {
        /* Both bounds are inclusive, hence the + 1 for each length. */
        if (ranges_overlap(win->min_iova,
                           win->max_iova - win->min_iova + 1,
                           min_iova,
                           max_iova - min_iova + 1)) {
            hw_error("%s: Overlapped IOMMU are not enabled", __func__);
        }
    }

    win = g_new0(VFIOHostDMAWindow, 1);
    win->min_iova = min_iova;
    win->max_iova = max_iova;
    win->iova_pgsizes = iova_pgsizes;

    QLIST_INSERT_HEAD(&container->hostwin_list, win, hostwin_next);
}
|
|
|
|
|
2016-07-04 06:33:06 +03:00
|
|
|
/*
 * Remove and free the host DMA window whose bounds are exactly
 * [min_iova, max_iova].
 *
 * Returns 0 if such a window was found and removed, -1 otherwise.
 */
static int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
                             hwaddr max_iova)
{
    VFIOHostDMAWindow *win;

    QLIST_FOREACH(win, &container->hostwin_list, hostwin_next) {
        if (win->min_iova != min_iova || win->max_iova != max_iova) {
            continue;
        }

        /* Exact match: unlink, release, and stop iterating immediately. */
        QLIST_REMOVE(win, hostwin_next);
        g_free(win);
        return 0;
    }

    return -1;
}
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
/*
 * Decide whether the memory listener should ignore @section.
 *
 * A section is skipped when its region is neither RAM nor an IOMMU region,
 * when the region is protected, or when the section starts in the upper
 * half of the 64-bit address space.
 */
static bool vfio_listener_skipped_section(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (!memory_region_is_ram(mr) && !memory_region_is_iommu(mr)) {
        return true;
    }

    if (memory_region_is_protected(mr)) {
        return true;
    }

    /*
     * Sizing an enabled 64-bit BAR can cause spurious mappings to
     * addresses in the upper part of the 64-bit address space.  These
     * are never accessed by the CPU and beyond the address width of
     * some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
     */
    return (section->offset_within_address_space & (1ULL << 63)) != 0;
}
|
|
|
|
|
2017-02-07 11:28:04 +03:00
|
|
|
/* Called with rcu_read_lock held. */
|
2020-10-26 12:36:24 +03:00
|
|
|
static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
|
|
|
|
ram_addr_t *ram_addr, bool *read_only)
|
2014-12-22 19:54:51 +03:00
|
|
|
{
|
2022-10-31 06:10:19 +03:00
|
|
|
bool ret, mr_has_discard_manager;
|
2021-04-13 12:55:27 +03:00
|
|
|
|
2022-10-31 06:10:19 +03:00
|
|
|
ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
|
|
|
|
&mr_has_discard_manager);
|
|
|
|
if (ret && mr_has_discard_manager) {
|
2021-04-13 12:55:27 +03:00
|
|
|
/*
|
|
|
|
* Malicious VMs might trigger discarding of IOMMU-mapped memory. The
|
|
|
|
* pages will remain pinned inside vfio until unmapped, resulting in a
|
|
|
|
* higher memory consumption than expected. If memory would get
|
|
|
|
* populated again later, there would be an inconsistency between pages
|
|
|
|
* pinned by vfio and pages seen by QEMU. This is the case until
|
|
|
|
* unmapped from the IOMMU (e.g., during device reset).
|
|
|
|
*
|
|
|
|
* With malicious guests, we really only care about pinning more memory
|
|
|
|
* than expected. RLIMIT_MEMLOCK set for the user/process can never be
|
|
|
|
* exceeded and can be used to mitigate this problem.
|
|
|
|
*/
|
|
|
|
warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
|
|
|
|
" RAM (e.g., virtio-mem) works, however, malicious"
|
|
|
|
" guests can trigger pinning of more memory than"
|
|
|
|
" intended via an IOMMU. It's possible to mitigate "
|
|
|
|
" by setting/adjusting RLIMIT_MEMLOCK.");
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
2022-10-31 06:10:19 +03:00
|
|
|
return ret;
|
2017-02-07 11:28:04 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * IOMMU notifier callback: mirror a guest IOMMU map or unmap event into
 * the VFIO container that owns the notifier.
 *
 * Entries with read and/or write permission are translated and mapped;
 * entries without either permission are unmapped.  Events whose target
 * address space is not system memory are rejected and flagged as a
 * migration error, as are unmap failures.
 */
static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n);
    VFIOContainer *container = giommu->container;
    const hwaddr iova = iotlb->iova + giommu->iommu_offset;
    const hwaddr len = iotlb->addr_mask + 1;
    int rc;

    trace_vfio_iommu_map_notify(iotlb->perm == IOMMU_NONE ? "UNMAP" : "MAP",
                                iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
        vfio_set_migration_error(-EINVAL);
        return;
    }

    rcu_read_lock();

    if ((iotlb->perm & IOMMU_RW) == IOMMU_NONE) {
        /* No access permission left: tear the mapping down. */
        rc = vfio_dma_unmap(container, iova, len, iotlb);
        if (rc) {
            error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
                         "0x%"HWADDR_PRIx") = %d (%s)",
                         container, iova, len, rc, strerror(-rc));
            vfio_set_migration_error(rc);
        }
    } else {
        void *vaddr;
        bool read_only;

        /* A failed translation is silently dropped; only unlock follows. */
        if (vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only)) {
            /*
             * vaddr is only valid until rcu_read_unlock(). But after
             * vfio_dma_map has set up the mapping the pages will be
             * pinned by the kernel. This makes sure that the RAM backend
             * of vaddr will always be there, even if the memory object is
             * destroyed and its backing memory munmap-ed.
             */
            rc = vfio_dma_map(container, iova, len, vaddr, read_only);
            if (rc) {
                error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx", %p) = %d (%s)",
                             container, iova, len, vaddr, rc, strerror(-rc));
            }
        }
    }

    rcu_read_unlock();
}
|
|
|
|
|
2021-04-13 12:55:24 +03:00
|
|
|
static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
|
|
|
|
listener);
|
|
|
|
const hwaddr size = int128_get64(section->size);
|
|
|
|
const hwaddr iova = section->offset_within_address_space;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/* Unmap with a single call. */
|
|
|
|
ret = vfio_dma_unmap(vrdl->container, iova, size , NULL);
|
|
|
|
if (ret) {
|
|
|
|
error_report("%s: vfio_dma_unmap() failed: %s", __func__,
|
|
|
|
strerror(-ret));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
|
|
|
|
listener);
|
|
|
|
const hwaddr end = section->offset_within_region +
|
|
|
|
int128_get64(section->size);
|
|
|
|
hwaddr start, next, iova;
|
|
|
|
void *vaddr;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Map in (aligned within memory region) minimum granularity, so we can
|
|
|
|
* unmap in minimum granularity later.
|
|
|
|
*/
|
|
|
|
for (start = section->offset_within_region; start < end; start = next) {
|
|
|
|
next = ROUND_UP(start + 1, vrdl->granularity);
|
|
|
|
next = MIN(next, end);
|
|
|
|
|
|
|
|
iova = start - section->offset_within_region +
|
|
|
|
section->offset_within_address_space;
|
|
|
|
vaddr = memory_region_get_ram_ptr(section->mr) + start;
|
|
|
|
|
|
|
|
ret = vfio_dma_map(vrdl->container, iova, next - start,
|
|
|
|
vaddr, section->readonly);
|
|
|
|
if (ret) {
|
|
|
|
/* Rollback */
|
|
|
|
vfio_ram_discard_notify_discard(rdl, section);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vfio_register_ram_discard_listener(VFIOContainer *container,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
|
|
|
|
VFIORamDiscardListener *vrdl;
|
|
|
|
|
|
|
|
/* Ignore some corner cases not relevant in practice. */
|
|
|
|
g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE));
|
|
|
|
g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space,
|
|
|
|
TARGET_PAGE_SIZE));
|
|
|
|
g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE));
|
|
|
|
|
|
|
|
vrdl = g_new0(VFIORamDiscardListener, 1);
|
|
|
|
vrdl->container = container;
|
|
|
|
vrdl->mr = section->mr;
|
|
|
|
vrdl->offset_within_address_space = section->offset_within_address_space;
|
|
|
|
vrdl->size = int128_get64(section->size);
|
|
|
|
vrdl->granularity = ram_discard_manager_get_min_granularity(rdm,
|
|
|
|
section->mr);
|
|
|
|
|
|
|
|
g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity));
|
2021-07-12 11:31:35 +03:00
|
|
|
g_assert(container->pgsizes &&
|
|
|
|
vrdl->granularity >= 1ULL << ctz64(container->pgsizes));
|
2021-04-13 12:55:24 +03:00
|
|
|
|
|
|
|
ram_discard_listener_init(&vrdl->listener,
|
|
|
|
vfio_ram_discard_notify_populate,
|
|
|
|
vfio_ram_discard_notify_discard, true);
|
|
|
|
ram_discard_manager_register_listener(rdm, &vrdl->listener, section);
|
|
|
|
QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next);
|
2021-04-13 12:55:26 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Sanity-check if we have a theoretically problematic setup where we could
|
|
|
|
* exceed the maximum number of possible DMA mappings over time. We assume
|
|
|
|
* that each mapped section in the same address space as a RamDiscardManager
|
|
|
|
* section consumes exactly one DMA mapping, with the exception of
|
|
|
|
* RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections
|
|
|
|
* in the same address space as RamDiscardManager sections.
|
|
|
|
*
|
|
|
|
* We assume that each section in the address space consumes one memslot.
|
|
|
|
* We take the number of KVM memory slots as a best guess for the maximum
|
|
|
|
* number of sections in the address space we could have over time,
|
|
|
|
* also consuming DMA mappings.
|
|
|
|
*/
|
|
|
|
if (container->dma_max_mappings) {
|
|
|
|
unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512;
|
|
|
|
|
|
|
|
#ifdef CONFIG_KVM
|
|
|
|
if (kvm_enabled()) {
|
|
|
|
max_memslots = kvm_get_max_memslots();
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
|
|
|
|
hwaddr start, end;
|
|
|
|
|
|
|
|
start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space,
|
|
|
|
vrdl->granularity);
|
|
|
|
end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size,
|
|
|
|
vrdl->granularity);
|
|
|
|
vrdl_mappings += (end - start) / vrdl->granularity;
|
|
|
|
vrdl_count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (vrdl_mappings + max_memslots - vrdl_count >
|
|
|
|
container->dma_max_mappings) {
|
|
|
|
warn_report("%s: possibly running out of DMA mappings. E.g., try"
|
|
|
|
" increasing the 'block-size' of virtio-mem devies."
|
|
|
|
" Maximum possible DMA mappings: %d, Maximum possible"
|
|
|
|
" memslots: %d", __func__, container->dma_max_mappings,
|
|
|
|
max_memslots);
|
|
|
|
}
|
|
|
|
}
|
2021-04-13 12:55:24 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
|
|
|
|
VFIORamDiscardListener *vrdl = NULL;
|
|
|
|
|
|
|
|
QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
|
|
|
|
if (vrdl->mr == section->mr &&
|
|
|
|
vrdl->offset_within_address_space ==
|
|
|
|
section->offset_within_address_space) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!vrdl) {
|
|
|
|
hw_error("vfio: Trying to unregister missing RAM discard listener");
|
|
|
|
}
|
|
|
|
|
|
|
|
ram_discard_manager_unregister_listener(rdm, &vrdl->listener);
|
|
|
|
QLIST_REMOVE(vrdl, next);
|
|
|
|
g_free(vrdl);
|
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:40 +03:00
|
|
|
/*
 * Find a host DMA window on @container that fully contains the inclusive
 * range [iova, end].
 *
 * Returns the first such window on the list, or NULL if none contains
 * the range.
 */
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
                                            hwaddr iova, hwaddr end)
{
    VFIOHostDMAWindow *win;

    QLIST_FOREACH(win, &container->hostwin_list, hostwin_next) {
        if (iova >= win->min_iova && win->max_iova >= end) {
            return win;
        }
    }

    return NULL;
}
|
|
|
|
|
2022-05-06 16:25:10 +03:00
|
|
|
/*
 * Tell whether a misaligned @section belongs to a device for which the
 * misalignment is known to be harmless (currently: the owner of the
 * region is a TPM CRB device).  Such sections are traced for debugging.
 */
static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;
    bool owner_is_crb = TPM_IS_CRB(mr->owner);

    if (owner_is_crb) {
        /* this is a known safe misaligned region, just trace for debug purpose */
        trace_vfio_known_safe_misalignment(memory_region_name(mr),
                                           section->offset_within_address_space,
                                           section->offset_within_region,
                                           qemu_real_host_page_size());
    }

    return owner_is_crb;
}
|
|
|
|
|
2023-03-07 15:54:42 +03:00
|
|
|
/*
 * Common admission check for the region_add/region_del listener hooks.
 *
 * @name is only used to label the "skip" trace event.
 *
 * Returns false when the section is one the listener skips, or when its
 * address-space offset and region offset differ in their sub-host-page
 * bits.  The latter is reported as an error unless the misalignment is
 * known to be safe.  Returns true otherwise.
 */
static bool vfio_listener_valid_section(MemoryRegionSection *section,
                                        const char *name)
{
    hwaddr as_subpage, mr_subpage;

    if (vfio_listener_skipped_section(section)) {
        trace_vfio_listener_region_skip(name,
                section->offset_within_address_space,
                section->offset_within_address_space +
                int128_get64(int128_sub(section->size, int128_one())));
        return false;
    }

    /* Offsets within a host page, in the address space and in the region. */
    as_subpage = section->offset_within_address_space &
                 ~qemu_real_host_page_mask();
    mr_subpage = section->offset_within_region & ~qemu_real_host_page_mask();

    if (as_subpage == mr_subpage) {
        return true;
    }

    if (!vfio_known_safe_misalignment(section)) {
        error_report("%s received unaligned region %s iova=0x%"PRIx64
                     " offset_within_region=0x%"PRIx64
                     " qemu_real_host_page_size=0x%"PRIxPTR,
                     __func__, memory_region_name(section->mr),
                     section->offset_within_address_space,
                     section->offset_within_region,
                     qemu_real_host_page_size());
    }

    return false;
}
|
|
|
|
|
2023-03-07 15:54:43 +03:00
|
|
|
static bool vfio_get_section_iova_range(VFIOContainer *container,
|
|
|
|
MemoryRegionSection *section,
|
|
|
|
hwaddr *out_iova, hwaddr *out_end,
|
|
|
|
Int128 *out_llend)
|
|
|
|
{
|
|
|
|
Int128 llend;
|
|
|
|
hwaddr iova;
|
|
|
|
|
|
|
|
iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
|
|
|
|
llend = int128_make64(section->offset_within_address_space);
|
|
|
|
llend = int128_add(llend, section->size);
|
|
|
|
llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
|
|
|
|
|
|
|
|
if (int128_ge(int128_make64(iova), llend)) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
*out_iova = iova;
|
|
|
|
*out_end = int128_get64(int128_sub(llend, int128_one()));
|
|
|
|
if (out_llend) {
|
|
|
|
*out_llend = llend;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:42 +03:00
|
|
|
static void vfio_listener_region_add(MemoryListener *listener,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
|
|
|
|
hwaddr iova, end;
|
|
|
|
Int128 llend, llsize;
|
|
|
|
void *vaddr;
|
|
|
|
int ret;
|
|
|
|
VFIOHostDMAWindow *hostwin;
|
|
|
|
Error *err = NULL;
|
|
|
|
|
|
|
|
if (!vfio_listener_valid_section(section, "region_add")) {
|
2014-12-22 19:54:51 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:43 +03:00
|
|
|
if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
|
2021-10-27 12:04:06 +03:00
|
|
|
if (memory_region_is_ram_device(section->mr)) {
|
|
|
|
trace_vfio_listener_region_add_no_dma_map(
|
|
|
|
memory_region_name(section->mr),
|
|
|
|
section->offset_within_address_space,
|
|
|
|
int128_getlo(section->size),
|
2022-03-23 18:57:22 +03:00
|
|
|
qemu_real_host_page_size());
|
2021-10-27 12:04:06 +03:00
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
return;
|
|
|
|
}
|
vfio: Check guest IOVA ranges against host IOMMU capabilities
The current vfio core code assumes that the host IOMMU is capable of
mapping any IOVA the guest wants to use to where we need. However, real
IOMMUs generally only support translating a certain range of IOVAs (the
"DMA window") not a full 64-bit address space.
The common x86 IOMMUs support a wide enough range that guests are very
unlikely to go beyond it in practice, however the IOMMU used on IBM Power
machines - in the default configuration - supports only a much more limited
IOVA range, usually 0..2GiB.
If the guest attempts to set up an IOVA range that the host IOMMU can't
map, qemu won't report an error until it actually attempts to map a bad
IOVA. If guest RAM is being mapped directly into the IOMMU (i.e. no guest
visible IOMMU) then this will show up very quickly. If there is a guest
visible IOMMU, however, the problem might not show up until much later when
the guest actually attempt to DMA with an IOVA the host can't handle.
This patch adds a test so that we will detect earlier if the guest is
attempting to use IOVA ranges that the host IOMMU won't be able to deal
with.
For now, we assume that "Type1" (x86) IOMMUs can support any IOVA, this is
incorrect, but no worse than what we have already. We can't do better for
now because the Type1 kernel interface doesn't tell us what IOVA range the
IOMMU actually supports.
For the Power "sPAPR TCE" IOMMU, however, we can retrieve the supported
IOVA range and validate guest IOVA ranges against it, and this patch does
so.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2015-09-30 05:13:53 +03:00
|
|
|
|
2016-07-04 06:33:06 +03:00
|
|
|
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
|
|
|
|
hwaddr pgsize = 0;
|
|
|
|
|
|
|
|
/* For now intersections are not allowed, we may relax this later */
|
|
|
|
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
|
|
|
|
if (ranges_overlap(hostwin->min_iova,
|
|
|
|
hostwin->max_iova - hostwin->min_iova + 1,
|
|
|
|
section->offset_within_address_space,
|
|
|
|
int128_get64(section->size))) {
|
2019-09-24 11:25:16 +03:00
|
|
|
error_setg(&err,
|
|
|
|
"region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
|
|
|
|
"host DMA window [0x%"PRIx64",0x%"PRIx64"]",
|
|
|
|
section->offset_within_address_space,
|
|
|
|
section->offset_within_address_space +
|
|
|
|
int128_get64(section->size) - 1,
|
|
|
|
hostwin->min_iova, hostwin->max_iova);
|
2016-07-04 06:33:06 +03:00
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = vfio_spapr_create_window(container, section, &pgsize);
|
|
|
|
if (ret) {
|
2019-09-24 11:25:16 +03:00
|
|
|
error_setg_errno(&err, -ret, "Failed to create SPAPR window");
|
2016-07-04 06:33:06 +03:00
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
|
|
|
vfio_host_win_add(container, section->offset_within_address_space,
|
|
|
|
section->offset_within_address_space +
|
|
|
|
int128_get64(section->size) - 1, pgsize);
|
2018-02-06 21:08:24 +03:00
|
|
|
#ifdef CONFIG_KVM
|
|
|
|
if (kvm_enabled()) {
|
|
|
|
VFIOGroup *group;
|
|
|
|
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
|
|
|
|
struct kvm_vfio_spapr_tce param;
|
|
|
|
struct kvm_device_attr attr = {
|
|
|
|
.group = KVM_DEV_VFIO_GROUP,
|
|
|
|
.attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
|
|
|
|
.addr = (uint64_t)(unsigned long)¶m,
|
|
|
|
};
|
|
|
|
|
|
|
|
if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
|
|
|
|
¶m.tablefd)) {
|
|
|
|
QLIST_FOREACH(group, &container->group_list, container_next) {
|
|
|
|
param.groupfd = group->fd;
|
|
|
|
if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
|
|
|
|
error_report("vfio: failed to setup fd %d "
|
|
|
|
"for a group with fd %d: %s",
|
|
|
|
param.tablefd, param.groupfd,
|
|
|
|
strerror(errno));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
2016-07-04 06:33:06 +03:00
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:40 +03:00
|
|
|
hostwin = vfio_find_hostwin(container, iova, end);
|
|
|
|
if (!hostwin) {
|
2019-09-24 11:25:16 +03:00
|
|
|
error_setg(&err, "Container %p can't map guest IOVA region"
|
|
|
|
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
|
vfio: Check guest IOVA ranges against host IOMMU capabilities
The current vfio core code assumes that the host IOMMU is capable of
mapping any IOVA the guest wants to use to where we need. However, real
IOMMUs generally only support translating a certain range of IOVAs (the
"DMA window") not a full 64-bit address space.
The common x86 IOMMUs support a wide enough range that guests are very
unlikely to go beyond it in practice, however the IOMMU used on IBM Power
machines - in the default configuration - supports only a much more limited
IOVA range, usually 0..2GiB.
If the guest attempts to set up an IOVA range that the host IOMMU can't
map, qemu won't report an error until it actually attempts to map a bad
IOVA. If guest RAM is being mapped directly into the IOMMU (i.e. no guest
visible IOMMU) then this will show up very quickly. If there is a guest
visible IOMMU, however, the problem might not show up until much later when
the guest actually attempt to DMA with an IOVA the host can't handle.
This patch adds a test so that we will detect earlier if the guest is
attempting to use IOVA ranges that the host IOMMU won't be able to deal
with.
For now, we assume that "Type1" (x86) IOMMUs can support any IOVA, this is
incorrect, but no worse than what we have already. We can't do better for
now because the Type1 kernel interface doesn't tell us what IOVA range the
IOMMU actually supports.
For the Power "sPAPR TCE" IOMMU, however, we can retrieve the supported
IOVA range and validate guest IOVA ranges against it, and this patch does
so.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2015-09-30 05:13:53 +03:00
|
|
|
goto fail;
|
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
|
|
|
|
memory_region_ref(section->mr);
|
|
|
|
|
|
|
|
if (memory_region_is_iommu(section->mr)) {
|
|
|
|
VFIOGuestIOMMU *giommu;
|
2017-07-11 06:56:19 +03:00
|
|
|
IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
|
2018-06-15 16:57:16 +03:00
|
|
|
int iommu_idx;
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2016-03-24 03:37:25 +03:00
|
|
|
trace_vfio_listener_region_add_iommu(iova, end);
|
2014-12-22 19:54:51 +03:00
|
|
|
/*
|
|
|
|
* FIXME: For VFIO iommu types which have KVM acceleration to
|
|
|
|
* avoid bouncing all map/unmaps through qemu this way, this
|
|
|
|
* would be the right place to wire that up (tell the KVM
|
|
|
|
* device emulation the VFIO iommu handles to use).
|
|
|
|
*/
|
|
|
|
giommu = g_malloc0(sizeof(*giommu));
|
2022-05-02 12:42:23 +03:00
|
|
|
giommu->iommu_mr = iommu_mr;
|
2016-05-26 18:43:23 +03:00
|
|
|
giommu->iommu_offset = section->offset_within_address_space -
|
|
|
|
section->offset_within_region;
|
2014-12-22 19:54:51 +03:00
|
|
|
giommu->container = container;
|
memory: add section range info for IOMMU notifier
In this patch, IOMMUNotifier.{start|end} are introduced to store section
information for a specific notifier. When notification occurs, we not
only check the notification type (MAP|UNMAP), but also check whether the
notified iova range overlaps with the range of specific IOMMU notifier,
and skip those notifiers if not in the listened range.
When removing an region, we need to make sure we removed the correct
VFIOGuestIOMMU by checking the IOMMUNotifier.start address as well.
This patch is solving the problem that vfio-pci devices receive
duplicated UNMAP notification on x86 platform when vIOMMU is there. The
issue is that x86 IOMMU has a (0, 2^64-1) IOMMU region, which is
splitted by the (0xfee00000, 0xfeefffff) IRQ region. AFAIK
this (splitted IOMMU region) is only happening on x86.
This patch also helps vhost to leverage the new interface as well, so
that vhost won't get duplicated cache flushes. In that sense, it's an
slight performance improvement.
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1491562755-23867-2-git-send-email-peterx@redhat.com>
[ehabkost: included extra vhost_iommu_region_del() change from Peter Xu]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-04-07 13:59:07 +03:00
|
|
|
llend = int128_add(int128_make64(section->offset_within_region),
|
|
|
|
section->size);
|
|
|
|
llend = int128_sub(llend, int128_one());
|
2018-06-15 16:57:16 +03:00
|
|
|
iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
|
|
|
|
MEMTXATTRS_UNSPECIFIED);
|
memory: add section range info for IOMMU notifier
In this patch, IOMMUNotifier.{start|end} are introduced to store section
information for a specific notifier. When notification occurs, we not
only check the notification type (MAP|UNMAP), but also check whether the
notified iova range overlaps with the range of specific IOMMU notifier,
and skip those notifiers if not in the listened range.
When removing an region, we need to make sure we removed the correct
VFIOGuestIOMMU by checking the IOMMUNotifier.start address as well.
This patch is solving the problem that vfio-pci devices receive
duplicated UNMAP notification on x86 platform when vIOMMU is there. The
issue is that x86 IOMMU has a (0, 2^64-1) IOMMU region, which is
splitted by the (0xfee00000, 0xfeefffff) IRQ region. AFAIK
this (splitted IOMMU region) is only happening on x86.
This patch also helps vhost to leverage the new interface as well, so
that vhost won't get duplicated cache flushes. In that sense, it's an
slight performance improvement.
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1491562755-23867-2-git-send-email-peterx@redhat.com>
[ehabkost: included extra vhost_iommu_region_del() change from Peter Xu]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-04-07 13:59:07 +03:00
|
|
|
iommu_notifier_init(&giommu->n, vfio_iommu_map_notify,
|
2021-02-10 00:32:32 +03:00
|
|
|
IOMMU_NOTIFIER_IOTLB_EVENTS,
|
memory: add section range info for IOMMU notifier
In this patch, IOMMUNotifier.{start|end} are introduced to store section
information for a specific notifier. When notification occurs, we not
only check the notification type (MAP|UNMAP), but also check whether the
notified iova range overlaps with the range of specific IOMMU notifier,
and skip those notifiers if not in the listened range.
When removing an region, we need to make sure we removed the correct
VFIOGuestIOMMU by checking the IOMMUNotifier.start address as well.
This patch is solving the problem that vfio-pci devices receive
duplicated UNMAP notification on x86 platform when vIOMMU is there. The
issue is that x86 IOMMU has a (0, 2^64-1) IOMMU region, which is
splitted by the (0xfee00000, 0xfeefffff) IRQ region. AFAIK
this (splitted IOMMU region) is only happening on x86.
This patch also helps vhost to leverage the new interface as well, so
that vhost won't get duplicated cache flushes. In that sense, it's an
slight performance improvement.
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1491562755-23867-2-git-send-email-peterx@redhat.com>
[ehabkost: included extra vhost_iommu_region_del() change from Peter Xu]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-04-07 13:59:07 +03:00
|
|
|
section->offset_within_region,
|
2018-06-15 16:57:16 +03:00
|
|
|
int128_get64(llend),
|
|
|
|
iommu_idx);
|
2015-09-30 05:13:56 +03:00
|
|
|
|
2022-05-02 12:42:23 +03:00
|
|
|
ret = memory_region_iommu_set_page_size_mask(giommu->iommu_mr,
|
2020-10-30 21:05:08 +03:00
|
|
|
container->pgsizes,
|
|
|
|
&err);
|
|
|
|
if (ret) {
|
|
|
|
g_free(giommu);
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
2019-09-24 11:25:17 +03:00
|
|
|
ret = memory_region_register_iommu_notifier(section->mr, &giommu->n,
|
|
|
|
&err);
|
|
|
|
if (ret) {
|
|
|
|
g_free(giommu);
|
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next);
|
2022-05-02 12:42:23 +03:00
|
|
|
memory_region_iommu_replay(giommu->iommu_mr, &giommu->n);
|
2014-12-22 19:54:51 +03:00
|
|
|
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Here we assume that memory_region_is_ram(section->mr)==true */
|
|
|
|
|
2021-04-13 12:55:24 +03:00
|
|
|
/*
|
|
|
|
* For RAM memory regions with a RamDiscardManager, we only want to map the
|
|
|
|
* actually populated parts - and update the mapping whenever we're notified
|
|
|
|
* about changes.
|
|
|
|
*/
|
|
|
|
if (memory_region_has_ram_discard_manager(section->mr)) {
|
|
|
|
vfio_register_ram_discard_listener(container, section);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
vaddr = memory_region_get_ram_ptr(section->mr) +
|
|
|
|
section->offset_within_region +
|
|
|
|
(iova - section->offset_within_address_space);
|
|
|
|
|
2016-03-24 03:37:25 +03:00
|
|
|
trace_vfio_listener_region_add_ram(iova, end, vaddr);
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2016-03-24 03:37:25 +03:00
|
|
|
llsize = int128_sub(llend, int128_make64(iova));
|
|
|
|
|
2018-03-13 20:17:30 +03:00
|
|
|
if (memory_region_is_ram_device(section->mr)) {
|
|
|
|
hwaddr pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
|
|
|
|
|
|
|
|
if ((iova & pgmask) || (int128_get64(llsize) & pgmask)) {
|
2018-04-04 23:30:50 +03:00
|
|
|
trace_vfio_listener_region_add_no_dma_map(
|
|
|
|
memory_region_name(section->mr),
|
|
|
|
section->offset_within_address_space,
|
|
|
|
int128_getlo(section->size),
|
|
|
|
pgmask + 1);
|
2018-03-13 20:17:30 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-03-24 03:37:25 +03:00
|
|
|
ret = vfio_dma_map(container, iova, int128_get64(llsize),
|
|
|
|
vaddr, section->readonly);
|
2014-12-22 19:54:51 +03:00
|
|
|
if (ret) {
|
2019-09-24 11:25:16 +03:00
|
|
|
error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
|
2023-03-07 15:54:37 +03:00
|
|
|
"0x%"HWADDR_PRIx", %p) = %d (%s)",
|
|
|
|
container, iova, int128_get64(llsize), vaddr, ret,
|
|
|
|
strerror(-ret));
|
2018-03-13 20:17:30 +03:00
|
|
|
if (memory_region_is_ram_device(section->mr)) {
|
|
|
|
/* Allow unexpected mappings not to be fatal for RAM devices */
|
2019-09-24 11:25:16 +03:00
|
|
|
error_report_err(err);
|
2018-03-13 20:17:30 +03:00
|
|
|
return;
|
|
|
|
}
|
2015-09-30 05:13:52 +03:00
|
|
|
goto fail;
|
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2015-09-30 05:13:52 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
fail:
|
2018-03-13 20:17:30 +03:00
|
|
|
if (memory_region_is_ram_device(section->mr)) {
|
|
|
|
error_report("failed to vfio_dma_map. pci p2p may not work");
|
|
|
|
return;
|
|
|
|
}
|
2015-09-30 05:13:52 +03:00
|
|
|
/*
|
|
|
|
* On the initfn path, store the first error in the container so we
|
|
|
|
* can gracefully fail. Runtime, there's not much we can do other
|
|
|
|
* than throw a hardware error.
|
|
|
|
*/
|
|
|
|
if (!container->initialized) {
|
|
|
|
if (!container->error) {
|
2019-09-24 11:25:16 +03:00
|
|
|
error_propagate_prepend(&container->error, err,
|
|
|
|
"Region %s: ",
|
|
|
|
memory_region_name(section->mr));
|
|
|
|
} else {
|
|
|
|
error_free(err);
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
2015-09-30 05:13:52 +03:00
|
|
|
} else {
|
2019-09-24 11:25:16 +03:00
|
|
|
error_report_err(err);
|
2015-09-30 05:13:52 +03:00
|
|
|
hw_error("vfio: DMA mapping failed, unable to continue");
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vfio_listener_region_del(MemoryListener *listener,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
2015-09-30 05:13:51 +03:00
|
|
|
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
|
2014-12-22 19:54:51 +03:00
|
|
|
hwaddr iova, end;
|
2016-05-26 18:43:22 +03:00
|
|
|
Int128 llend, llsize;
|
2014-12-22 19:54:51 +03:00
|
|
|
int ret;
|
2018-03-13 20:17:30 +03:00
|
|
|
bool try_unmap = true;
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2023-03-07 15:54:42 +03:00
|
|
|
if (!vfio_listener_valid_section(section, "region_del")) {
|
2014-12-22 19:54:51 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (memory_region_is_iommu(section->mr)) {
|
|
|
|
VFIOGuestIOMMU *giommu;
|
|
|
|
|
|
|
|
QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
|
2022-05-02 12:42:23 +03:00
|
|
|
if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
|
memory: add section range info for IOMMU notifier
In this patch, IOMMUNotifier.{start|end} are introduced to store section
information for a specific notifier. When notification occurs, we not
only check the notification type (MAP|UNMAP), but also check whether the
notified iova range overlaps with the range of specific IOMMU notifier,
and skip those notifiers if not in the listened range.
When removing an region, we need to make sure we removed the correct
VFIOGuestIOMMU by checking the IOMMUNotifier.start address as well.
This patch is solving the problem that vfio-pci devices receive
duplicated UNMAP notification on x86 platform when vIOMMU is there. The
issue is that x86 IOMMU has a (0, 2^64-1) IOMMU region, which is
splitted by the (0xfee00000, 0xfeefffff) IRQ region. AFAIK
this (splitted IOMMU region) is only happening on x86.
This patch also helps vhost to leverage the new interface as well, so
that vhost won't get duplicated cache flushes. In that sense, it's an
slight performance improvement.
Suggested-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Eric Auger <eric.auger@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Message-Id: <1491562755-23867-2-git-send-email-peterx@redhat.com>
[ehabkost: included extra vhost_iommu_region_del() change from Peter Xu]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-04-07 13:59:07 +03:00
|
|
|
giommu->n.start == section->offset_within_region) {
|
2017-07-11 06:56:19 +03:00
|
|
|
memory_region_unregister_iommu_notifier(section->mr,
|
2016-06-30 22:00:23 +03:00
|
|
|
&giommu->n);
|
2014-12-22 19:54:51 +03:00
|
|
|
QLIST_REMOVE(giommu, giommu_next);
|
|
|
|
g_free(giommu);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* FIXME: We assume the one big unmap below is adequate to
|
|
|
|
* remove any individual page mappings in the IOMMU which
|
|
|
|
* might have been copied into VFIO. This works for a page table
|
|
|
|
* based IOMMU where a big unmap flattens a large range of IO-PTEs.
|
|
|
|
* That may not be true for all IOMMU types.
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:43 +03:00
|
|
|
if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
|
2014-12-22 19:54:51 +03:00
|
|
|
return;
|
|
|
|
}
|
2016-05-26 18:43:22 +03:00
|
|
|
|
|
|
|
llsize = int128_sub(llend, int128_make64(iova));
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2016-05-26 18:43:22 +03:00
|
|
|
trace_vfio_listener_region_del(iova, end);
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2018-03-13 20:17:30 +03:00
|
|
|
if (memory_region_is_ram_device(section->mr)) {
|
|
|
|
hwaddr pgmask;
|
|
|
|
VFIOHostDMAWindow *hostwin;
|
|
|
|
|
2023-03-07 15:54:40 +03:00
|
|
|
hostwin = vfio_find_hostwin(container, iova, end);
|
|
|
|
assert(hostwin); /* or region_add() would have failed */
|
2018-03-13 20:17:30 +03:00
|
|
|
|
|
|
|
pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
|
|
|
|
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
|
2021-04-13 12:55:24 +03:00
|
|
|
} else if (memory_region_has_ram_discard_manager(section->mr)) {
|
|
|
|
vfio_unregister_ram_discard_listener(container, section);
|
|
|
|
/* Unregistering will trigger an unmap. */
|
|
|
|
try_unmap = false;
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
2016-07-04 06:33:06 +03:00
|
|
|
|
2018-03-13 20:17:30 +03:00
|
|
|
if (try_unmap) {
|
2020-10-30 21:05:10 +03:00
|
|
|
if (int128_eq(llsize, int128_2_64())) {
|
|
|
|
/* The unmap ioctl doesn't accept a full 64-bit span. */
|
|
|
|
llsize = int128_rshift(llsize, 1);
|
|
|
|
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
|
|
|
|
if (ret) {
|
|
|
|
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
|
2023-03-07 15:54:37 +03:00
|
|
|
"0x%"HWADDR_PRIx") = %d (%s)",
|
|
|
|
container, iova, int128_get64(llsize), ret,
|
|
|
|
strerror(-ret));
|
2020-10-30 21:05:10 +03:00
|
|
|
}
|
|
|
|
iova += int128_get64(llsize);
|
|
|
|
}
|
2020-10-26 12:36:25 +03:00
|
|
|
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
|
2018-03-13 20:17:30 +03:00
|
|
|
if (ret) {
|
|
|
|
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
|
2023-03-07 15:54:37 +03:00
|
|
|
"0x%"HWADDR_PRIx") = %d (%s)",
|
|
|
|
container, iova, int128_get64(llsize), ret,
|
|
|
|
strerror(-ret));
|
2018-03-13 20:17:30 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memory_region_unref(section->mr);
|
|
|
|
|
2016-07-04 06:33:06 +03:00
|
|
|
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
|
|
|
|
vfio_spapr_remove_window(container,
|
|
|
|
section->offset_within_address_space);
|
|
|
|
if (vfio_host_win_del(container,
|
|
|
|
section->offset_within_address_space,
|
|
|
|
section->offset_within_address_space +
|
|
|
|
int128_get64(section->size) - 1) < 0) {
|
|
|
|
hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
|
|
|
|
__func__, section->offset_within_address_space);
|
|
|
|
}
|
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:38 +03:00
|
|
|
/*
 * Start (@start == true) or stop (@start == false) dirty page tracking on
 * @container through the VFIO_IOMMU_DIRTY_PAGES ioctl.
 *
 * Returns 0 on success, or immediately when the container does not advertise
 * dirty page support (no ioctl is issued in that case). On ioctl failure the
 * error is reported and -errno is returned.
 */
static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
    struct vfio_iommu_type1_dirty_bitmap dirty = {
        .argsz = sizeof(dirty),
    };
    int ret;

    if (!container->dirty_pages_supported) {
        return 0;
    }

    dirty.flags = start ? VFIO_IOMMU_DIRTY_PAGES_FLAG_START
                        : VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP;

    if (!ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty)) {
        return 0;
    }

    /* Capture errno before anything else can overwrite it. */
    ret = -errno;
    error_report("Failed to set dirty tracking flag 0x%x errno: %d",
                 dirty.flags, errno);

    return ret;
}
|
|
|
|
|
2023-03-07 15:54:44 +03:00
|
|
|
typedef struct VFIODirtyRanges {
|
|
|
|
hwaddr min32;
|
|
|
|
hwaddr max32;
|
|
|
|
hwaddr min64;
|
|
|
|
hwaddr max64;
|
|
|
|
} VFIODirtyRanges;
|
|
|
|
|
|
|
|
typedef struct VFIODirtyRangesListener {
|
|
|
|
VFIOContainer *container;
|
|
|
|
VFIODirtyRanges ranges;
|
|
|
|
MemoryListener listener;
|
|
|
|
} VFIODirtyRangesListener;
|
|
|
|
|
|
|
|
static void vfio_dirty_tracking_update(MemoryListener *listener,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
VFIODirtyRangesListener *dirty = container_of(listener,
|
|
|
|
VFIODirtyRangesListener,
|
|
|
|
listener);
|
|
|
|
VFIODirtyRanges *range = &dirty->ranges;
|
|
|
|
hwaddr iova, end, *min, *max;
|
|
|
|
|
|
|
|
if (!vfio_listener_valid_section(section, "tracking_update") ||
|
|
|
|
!vfio_get_section_iova_range(dirty->container, section,
|
|
|
|
&iova, &end, NULL)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The address space passed to the dirty tracker is reduced to two ranges:
|
|
|
|
* one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
|
|
|
|
* The underlying reports of dirty will query a sub-interval of each of
|
|
|
|
* these ranges.
|
|
|
|
*
|
|
|
|
* The purpose of the dual range handling is to handle known cases of big
|
|
|
|
* holes in the address space, like the x86 AMD 1T hole. The alternative
|
|
|
|
* would be an IOVATree but that has a much bigger runtime overhead and
|
|
|
|
* unnecessary complexity.
|
|
|
|
*/
|
|
|
|
min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
|
|
|
|
max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
|
|
|
|
|
|
|
|
if (*min > iova) {
|
|
|
|
*min = iova;
|
|
|
|
}
|
|
|
|
if (*max < end) {
|
|
|
|
*max = end;
|
|
|
|
}
|
|
|
|
|
|
|
|
trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
static const MemoryListener vfio_dirty_tracking_listener = {
|
|
|
|
.name = "vfio-tracking",
|
|
|
|
.region_add = vfio_dirty_tracking_update,
|
|
|
|
};
|
|
|
|
|
|
|
|
static void vfio_dirty_tracking_init(VFIOContainer *container,
|
|
|
|
VFIODirtyRanges *ranges)
|
|
|
|
{
|
|
|
|
VFIODirtyRangesListener dirty;
|
|
|
|
|
|
|
|
memset(&dirty, 0, sizeof(dirty));
|
|
|
|
dirty.ranges.min32 = UINT32_MAX;
|
|
|
|
dirty.ranges.min64 = UINT64_MAX;
|
|
|
|
dirty.listener = vfio_dirty_tracking_listener;
|
|
|
|
dirty.container = container;
|
|
|
|
|
|
|
|
memory_listener_register(&dirty.listener,
|
|
|
|
container->space->as);
|
|
|
|
|
|
|
|
*ranges = dirty.ranges;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The memory listener is synchronous, and used to calculate the range
|
|
|
|
* to dirty tracking. Unregister it after we are done as we are not
|
|
|
|
* interested in any follow-up updates.
|
|
|
|
*/
|
|
|
|
memory_listener_unregister(&dirty.listener);
|
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:45 +03:00
|
|
|
/*
 * Stop device DMA logging on every device of @container that currently has
 * dirty tracking enabled.
 *
 * A failing ioctl only produces a warning; the device's dirty_tracking flag
 * is cleared regardless of the ioctl outcome.
 */
static void vfio_devices_dma_logging_stop(VFIOContainer *container)
{
    /* uint64_t backing store sized to hold one feature header. */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
                              sizeof(uint64_t))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    VFIOGroup *group;
    VFIODevice *vbasedev;

    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
    feature->argsz = sizeof(buf);

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->dirty_tracking) {
                if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
                    warn_report("%s: Failed to stop DMA logging, err %d (%s)",
                                vbasedev->name, -errno, strerror(errno));
                }
                vbasedev->dirty_tracking = false;
            }
        }
    }
}
|
|
|
|
|
|
|
|
static struct vfio_device_feature *
|
|
|
|
vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
|
|
|
|
VFIODirtyRanges *tracking)
|
|
|
|
{
|
|
|
|
struct vfio_device_feature *feature;
|
|
|
|
size_t feature_size;
|
|
|
|
struct vfio_device_feature_dma_logging_control *control;
|
|
|
|
struct vfio_device_feature_dma_logging_range *ranges;
|
|
|
|
|
|
|
|
feature_size = sizeof(struct vfio_device_feature) +
|
|
|
|
sizeof(struct vfio_device_feature_dma_logging_control);
|
|
|
|
feature = g_try_malloc0(feature_size);
|
|
|
|
if (!feature) {
|
|
|
|
errno = ENOMEM;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
feature->argsz = feature_size;
|
|
|
|
feature->flags = VFIO_DEVICE_FEATURE_SET |
|
|
|
|
VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
|
|
|
|
|
|
|
|
control = (struct vfio_device_feature_dma_logging_control *)feature->data;
|
|
|
|
control->page_size = qemu_real_host_page_size();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* DMA logging uAPI guarantees to support at least a number of ranges that
|
|
|
|
* fits into a single host kernel base page.
|
|
|
|
*/
|
|
|
|
control->num_ranges = !!tracking->max32 + !!tracking->max64;
|
|
|
|
ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
|
|
|
|
control->num_ranges);
|
|
|
|
if (!ranges) {
|
|
|
|
g_free(feature);
|
|
|
|
errno = ENOMEM;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
control->ranges = (__u64)(uintptr_t)ranges;
|
|
|
|
if (tracking->max32) {
|
|
|
|
ranges->iova = tracking->min32;
|
|
|
|
ranges->length = (tracking->max32 - tracking->min32) + 1;
|
|
|
|
ranges++;
|
|
|
|
}
|
|
|
|
if (tracking->max64) {
|
|
|
|
ranges->iova = tracking->min64;
|
|
|
|
ranges->length = (tracking->max64 - tracking->min64) + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
trace_vfio_device_dirty_tracking_start(control->num_ranges,
|
|
|
|
tracking->min32, tracking->max32,
|
|
|
|
tracking->min64, tracking->max64);
|
|
|
|
|
|
|
|
return feature;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void vfio_device_feature_dma_logging_start_destroy(
|
|
|
|
struct vfio_device_feature *feature)
|
|
|
|
{
|
|
|
|
struct vfio_device_feature_dma_logging_control *control =
|
|
|
|
(struct vfio_device_feature_dma_logging_control *)feature->data;
|
|
|
|
struct vfio_device_feature_dma_logging_range *ranges =
|
|
|
|
(struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
|
|
|
|
|
|
|
|
g_free(ranges);
|
|
|
|
g_free(feature);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Start device DMA logging on every device of @container that is not
 * already tracking.
 *
 * The tracked ranges are computed from the container's address space. If any
 * device fails to start, logging is stopped again on the whole container.
 *
 * Returns 0 on success, or a negative errno value when the request cannot be
 * built or a device ioctl fails.
 */
static int vfio_devices_dma_logging_start(VFIOContainer *container)
{
    struct vfio_device_feature *feature;
    VFIODirtyRanges ranges;
    VFIOGroup *group;
    VFIODevice *vbasedev;
    int ret = 0;

    vfio_dirty_tracking_init(container, &ranges);

    feature = vfio_device_feature_dma_logging_start_create(container,
                                                           &ranges);
    if (!feature) {
        return -errno;
    }

    QLIST_FOREACH(group, &container->group_list, container_next) {
        QLIST_FOREACH(vbasedev, &group->device_list, next) {
            if (vbasedev->dirty_tracking) {
                continue;
            }

            if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
                ret = -errno;
                error_report("%s: Failed to start DMA logging, err %d (%s)",
                             vbasedev->name, ret, strerror(errno));
                goto out;
            }

            vbasedev->dirty_tracking = true;
        }
    }

out:
    if (ret) {
        /* Roll back the devices that were already switched on. */
        vfio_devices_dma_logging_stop(container);
    }

    vfio_device_feature_dma_logging_start_destroy(feature);

    return ret;
}
|
|
|
|
|
vfio/migrate: Move switch of dirty tracking into vfio_memory_listener
For now the switch of vfio dirty page tracking is integrated into
@vfio_save_handler. The reason is that some PCI vendor driver may
start to track dirty base on _SAVING state of device, so if dirty
tracking is started before setting device state, vfio will report
full-dirty to QEMU.
However, the dirty bmap of all ramblocks are fully set when setup
ram saving, so it's not matter whether the device is in _SAVING
state when start vfio dirty tracking.
Moreover, this logic causes some problems [1]. The object of dirty
tracking is guest memory, but the object of @vfio_save_handler is
device state, which produces unnecessary coupling and conflicts:
1. Coupling: Their saving granule is different (perVM vs perDevice).
vfio will enable dirty_page_tracking for each devices, actually
once is enough.
2. Conflicts: The ram_save_setup() traverses all memory_listeners
to execute their log_start() and log_sync() hooks to get the
first round dirty bitmap, which is used by the bulk stage of
ram saving. However, as vfio dirty tracking is not yet started,
it can't get dirty bitmap from vfio. Then we give up the chance
to handle vfio dirty page at bulk stage.
Move the switch of vfio dirty_page_tracking into vfio_memory_listener
can solve above problems. Besides, Do not require devices in SAVING
state for vfio_sync_dirty_bitmap().
[1] https://www.spinics.net/lists/kvm/msg229967.html
Reported-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-09 06:19:13 +03:00
|
|
|
/*
 * Enable dirty page tracking for the container owning @listener.
 *
 * Device DMA logging is used when
 * vfio_devices_all_device_dirty_tracking() says so; otherwise container
 * level tracking is switched on. A failure is reported and recorded with
 * vfio_set_migration_error().
 */
static void vfio_listener_log_global_start(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    int ret = vfio_devices_all_device_dirty_tracking(container)
              ? vfio_devices_dma_logging_start(container)
              : vfio_set_dirty_page_tracking(container, true);

    if (!ret) {
        return;
    }

    error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
                 ret, strerror(-ret));
    vfio_set_migration_error(ret);
}
|
|
|
|
|
|
|
|
/*
 * Disable dirty page tracking for the container owning @listener.
 *
 * Device DMA logging is stopped when
 * vfio_devices_all_device_dirty_tracking() says it is in use; that path
 * cannot report an error. Otherwise container level tracking is switched
 * off, and a failure is reported and recorded with
 * vfio_set_migration_error().
 */
static void vfio_listener_log_global_stop(MemoryListener *listener)
{
    VFIOContainer *container = container_of(listener, VFIOContainer, listener);
    int ret;

    if (vfio_devices_all_device_dirty_tracking(container)) {
        vfio_devices_dma_logging_stop(container);
        return;
    }

    ret = vfio_set_dirty_page_tracking(container, false);
    if (ret) {
        error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
                     ret, strerror(-ret));
        vfio_set_migration_error(ret);
    }
}
|
|
|
|
|
2023-03-07 15:54:47 +03:00
|
|
|
/*
 * Ask @vbasedev for its DMA logging report covering @size bytes starting at
 * @iova. @bitmap is passed to the kernel as the report's bitmap address and
 * the real host page size as its page size.
 *
 * Returns 0 on success or -errno when the ioctl fails.
 */
static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
                                          hwaddr size, void *bitmap)
{
    /* uint64_t backing store sized for the feature header plus the report. */
    uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
                              sizeof(struct vfio_device_feature_dma_logging_report),
                              sizeof(__u64))] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_dma_logging_report *report =
        (struct vfio_device_feature_dma_logging_report *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_GET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;

    report->iova = iova;
    report->length = size;
    report->page_size = qemu_real_host_page_size();
    report->bitmap = (__u64)(uintptr_t)bitmap;

    return ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature) ? -errno : 0;
}
|
|
|
|
|
|
|
|
static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
|
|
|
|
VFIOBitmap *vbmap, hwaddr iova,
|
|
|
|
hwaddr size)
|
|
|
|
{
|
|
|
|
VFIODevice *vbasedev;
|
|
|
|
VFIOGroup *group;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
QLIST_FOREACH(group, &container->group_list, container_next) {
|
|
|
|
QLIST_FOREACH(vbasedev, &group->device_list, next) {
|
|
|
|
ret = vfio_device_dma_logging_report(vbasedev, iova, size,
|
|
|
|
vbmap->bitmap);
|
|
|
|
if (ret) {
|
|
|
|
error_report("%s: Failed to get DMA logging report, iova: "
|
|
|
|
"0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
|
|
|
|
", err: %d (%s)",
|
|
|
|
vbasedev->name, iova, size, ret, strerror(-ret));
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-03-07 15:54:46 +03:00
|
|
|
/*
 * Fetch the dirty bitmap for @size bytes at @iova through the container's
 * VFIO_IOMMU_DIRTY_PAGES ioctl, writing the result into vbmap->bitmap.
 *
 * Returns 0 on success or -errno (after reporting the error) on failure.
 */
static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
                                   hwaddr iova, hwaddr size)
{
    struct vfio_iommu_type1_dirty_bitmap_get *get;
    struct vfio_iommu_type1_dirty_bitmap *req;
    int ret = 0;

    /* One allocation: the request header followed by the GET_BITMAP payload */
    req = g_malloc0(sizeof(*req) + sizeof(*get));
    req->argsz = sizeof(*req) + sizeof(*get);
    req->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;

    get = (struct vfio_iommu_type1_dirty_bitmap_get *)&req->data;
    get->iova = iova;
    get->size = size;

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty. Hence set bitmap's pgsize
     * to qemu_real_host_page_size.
     */
    get->bitmap.pgsize = qemu_real_host_page_size();
    get->bitmap.size = vbmap->size;
    get->bitmap.data = (__u64 *)vbmap->bitmap;

    if (ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, req)) {
        ret = -errno;
        error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
                " size: 0x%"PRIx64" err: %d", (uint64_t)get->iova,
                (uint64_t)get->size, errno);
    }

    g_free(req);

    return ret;
}
|
|
|
|
|
|
|
|
/*
 * Synchronise the dirty state of @size bytes at @iova into QEMU's dirty
 * memory tracking, starting at @ram_addr.
 *
 * When neither the container nor all of its devices can report dirty pages,
 * the whole range is marked dirty. Otherwise a temporary bitmap is filled
 * from the devices (if all of them support device dirty tracking) or from
 * the container, and then applied with
 * cpu_physical_memory_set_dirty_lebitmap().
 *
 * Returns 0 on success or a negative error code.
 */
static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
                                 uint64_t size, ram_addr_t ram_addr)
{
    bool use_devices = vfio_devices_all_device_dirty_tracking(container);
    uint64_t nr_dirty;
    VFIOBitmap vbmap;
    int ret;

    if (!container->dirty_pages_supported && !use_devices) {
        /* No tracking available at all: conservatively dirty everything */
        cpu_physical_memory_set_dirty_range(ram_addr, size,
                                            tcg_enabled() ? DIRTY_CLIENTS_ALL :
                                            DIRTY_CLIENTS_NOCODE);
        return 0;
    }

    ret = vfio_bitmap_alloc(&vbmap, size);
    if (ret) {
        return ret;
    }

    ret = use_devices ?
          vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size) :
          vfio_query_dirty_bitmap(container, &vbmap, iova, size);

    if (!ret) {
        nr_dirty = cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
                                                          ram_addr,
                                                          vbmap.pages);

        trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
                                    ram_addr, nr_dirty);
    }

    /* The temporary bitmap is released on success and failure alike */
    g_free(vbmap.bitmap);

    return ret;
}
|
|
|
|
|
2020-10-26 12:36:24 +03:00
|
|
|
typedef struct {
|
|
|
|
IOMMUNotifier n;
|
|
|
|
VFIOGuestIOMMU *giommu;
|
|
|
|
} vfio_giommu_dirty_notifier;
|
|
|
|
|
|
|
|
/*
 * IOMMU notifier callback: sync the dirty bitmap of one guest IOMMU mapping.
 *
 * The mapping must target system memory. Its IOVA is translated to a RAM
 * address under the RCU read lock and handed to vfio_get_dirty_bitmap().
 * Any failure (wrong address space, failed translation, failed bitmap
 * query) is recorded with vfio_set_migration_error().
 */
static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    vfio_giommu_dirty_notifier *wrapper = container_of(n,
                                                vfio_giommu_dirty_notifier, n);
    VFIOGuestIOMMU *giommu = wrapper->giommu;
    VFIOContainer *container = giommu->container;
    hwaddr iova = iotlb->iova + giommu->iommu_offset;
    ram_addr_t ram_addr;
    /* Stays -EINVAL unless the bitmap query itself is reached */
    int rc = -EINVAL;

    trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);

    if (iotlb->target_as != &address_space_memory) {
        error_report("Wrong target AS \"%s\", only system memory is allowed",
                     iotlb->target_as->name ? iotlb->target_as->name : "none");
    } else {
        rcu_read_lock();
        if (vfio_get_xlat_addr(iotlb, NULL, &ram_addr, NULL)) {
            rc = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
                                       ram_addr);
            if (rc) {
                error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
                             "0x%"HWADDR_PRIx") = %d (%s)",
                             container, iova, iotlb->addr_mask + 1, rc,
                             strerror(-rc));
            }
        }
        rcu_read_unlock();
    }

    if (rc) {
        vfio_set_migration_error(rc);
    }
}
|
|
|
|
|
2021-04-13 12:55:24 +03:00
|
|
|
/*
 * Replay callback: sync the dirty bitmap for one populated @section.
 *
 * @opaque is the VFIORamDiscardListener whose container is queried.
 * Returns the result of vfio_get_dirty_bitmap().
 */
static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
                                             void *opaque)
{
    VFIORamDiscardListener *listener = opaque;
    const hwaddr len = int128_get64(section->size);
    const hwaddr start = section->offset_within_address_space;
    const ram_addr_t ram_start = memory_region_get_ram_addr(section->mr) +
                                 section->offset_within_region;

    /*
     * Sync the whole mapped region (spanning multiple individual mappings)
     * in one go.
     */
    return vfio_get_dirty_bitmap(listener->container, start, len, ram_start);
}
|
|
|
|
|
|
|
|
static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr);
|
|
|
|
VFIORamDiscardListener *vrdl = NULL;
|
|
|
|
|
|
|
|
QLIST_FOREACH(vrdl, &container->vrdl_list, next) {
|
|
|
|
if (vrdl->mr == section->mr &&
|
|
|
|
vrdl->offset_within_address_space ==
|
|
|
|
section->offset_within_address_space) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!vrdl) {
|
|
|
|
hw_error("vfio: Trying to sync missing RAM discard listener");
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We only want/can synchronize the bitmap for actually mapped parts -
|
|
|
|
* which correspond to populated parts. Replay all populated parts.
|
|
|
|
*/
|
|
|
|
return ram_discard_manager_replay_populated(rdm, section,
|
|
|
|
vfio_ram_discard_get_dirty_bitmap,
|
|
|
|
&vrdl);
|
|
|
|
}
|
|
|
|
|
2020-10-26 12:36:23 +03:00
|
|
|
static int vfio_sync_dirty_bitmap(VFIOContainer *container,
|
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
ram_addr_t ram_addr;
|
|
|
|
|
2020-10-26 12:36:24 +03:00
|
|
|
if (memory_region_is_iommu(section->mr)) {
|
|
|
|
VFIOGuestIOMMU *giommu;
|
|
|
|
|
|
|
|
QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) {
|
2022-05-02 12:42:23 +03:00
|
|
|
if (MEMORY_REGION(giommu->iommu_mr) == section->mr &&
|
2020-10-26 12:36:24 +03:00
|
|
|
giommu->n.start == section->offset_within_region) {
|
|
|
|
Int128 llend;
|
|
|
|
vfio_giommu_dirty_notifier gdn = { .giommu = giommu };
|
2022-05-02 12:42:23 +03:00
|
|
|
int idx = memory_region_iommu_attrs_to_index(giommu->iommu_mr,
|
2020-10-26 12:36:24 +03:00
|
|
|
MEMTXATTRS_UNSPECIFIED);
|
|
|
|
|
|
|
|
llend = int128_add(int128_make64(section->offset_within_region),
|
|
|
|
section->size);
|
|
|
|
llend = int128_sub(llend, int128_one());
|
|
|
|
|
|
|
|
iommu_notifier_init(&gdn.n,
|
|
|
|
vfio_iommu_map_dirty_notify,
|
|
|
|
IOMMU_NOTIFIER_MAP,
|
|
|
|
section->offset_within_region,
|
|
|
|
int128_get64(llend),
|
|
|
|
idx);
|
2022-05-02 12:42:23 +03:00
|
|
|
memory_region_iommu_replay(giommu->iommu_mr, &gdn.n);
|
2020-10-26 12:36:24 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
2021-04-13 12:55:24 +03:00
|
|
|
} else if (memory_region_has_ram_discard_manager(section->mr)) {
|
|
|
|
return vfio_sync_ram_discard_listener_dirty_bitmap(container, section);
|
2020-10-26 12:36:24 +03:00
|
|
|
}
|
|
|
|
|
2020-10-26 12:36:23 +03:00
|
|
|
ram_addr = memory_region_get_ram_addr(section->mr) +
|
|
|
|
section->offset_within_region;
|
|
|
|
|
|
|
|
return vfio_get_dirty_bitmap(container,
|
2021-03-04 16:34:46 +03:00
|
|
|
REAL_HOST_PAGE_ALIGN(section->offset_within_address_space),
|
|
|
|
int128_get64(section->size), ram_addr);
|
2020-10-26 12:36:23 +03:00
|
|
|
}
|
|
|
|
|
2020-12-04 04:42:40 +03:00
|
|
|
static void vfio_listener_log_sync(MemoryListener *listener,
|
2020-10-26 12:36:23 +03:00
|
|
|
MemoryRegionSection *section)
|
|
|
|
{
|
|
|
|
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
|
2023-03-07 15:54:38 +03:00
|
|
|
int ret;
|
2020-10-26 12:36:23 +03:00
|
|
|
|
2023-02-16 17:36:22 +03:00
|
|
|
if (vfio_listener_skipped_section(section)) {
|
2020-10-26 12:36:23 +03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
vfio/migrate: Move switch of dirty tracking into vfio_memory_listener
For now the switch of vfio dirty page tracking is integrated into
@vfio_save_handler. The reason is that some PCI vendor driver may
start to track dirty base on _SAVING state of device, so if dirty
tracking is started before setting device state, vfio will report
full-dirty to QEMU.
However, the dirty bmap of all ramblocks are fully set when setup
ram saving, so it's not matter whether the device is in _SAVING
state when start vfio dirty tracking.
Moreover, this logic causes some problems [1]. The object of dirty
tracking is guest memory, but the object of @vfio_save_handler is
device state, which produces unnecessary coupling and conflicts:
1. Coupling: Their saving granule is different (perVM vs perDevice).
vfio will enable dirty_page_tracking for each devices, actually
once is enough.
2. Conflicts: The ram_save_setup() traverses all memory_listeners
to execute their log_start() and log_sync() hooks to get the
first round dirty bitmap, which is used by the bulk stage of
ram saving. However, as vfio dirty tracking is not yet started,
it can't get dirty bitmap from vfio. Then we give up the chance
to handle vfio dirty page at bulk stage.
Move the switch of vfio dirty_page_tracking into vfio_memory_listener
can solve above problems. Besides, Do not require devices in SAVING
state for vfio_sync_dirty_bitmap().
[1] https://www.spinics.net/lists/kvm/msg229967.html
Reported-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-09 06:19:13 +03:00
|
|
|
if (vfio_devices_all_dirty_tracking(container)) {
|
2023-03-07 15:54:38 +03:00
|
|
|
ret = vfio_sync_dirty_bitmap(container, section);
|
|
|
|
if (ret) {
|
|
|
|
error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
|
|
|
|
strerror(-ret));
|
|
|
|
vfio_set_migration_error(ret);
|
|
|
|
}
|
2020-10-26 12:36:23 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-02 21:38:55 +03:00
|
|
|
static const MemoryListener vfio_memory_listener = {
|
2021-08-17 04:35:52 +03:00
|
|
|
.name = "vfio",
|
2014-12-22 19:54:51 +03:00
|
|
|
.region_add = vfio_listener_region_add,
|
|
|
|
.region_del = vfio_listener_region_del,
|
vfio/migrate: Move switch of dirty tracking into vfio_memory_listener
For now the switch of vfio dirty page tracking is integrated into
@vfio_save_handler. The reason is that some PCI vendor driver may
start to track dirty base on _SAVING state of device, so if dirty
tracking is started before setting device state, vfio will report
full-dirty to QEMU.
However, the dirty bmap of all ramblocks are fully set when setup
ram saving, so it's not matter whether the device is in _SAVING
state when start vfio dirty tracking.
Moreover, this logic causes some problems [1]. The object of dirty
tracking is guest memory, but the object of @vfio_save_handler is
device state, which produces unnecessary coupling and conflicts:
1. Coupling: Their saving granule is different (perVM vs perDevice).
vfio will enable dirty_page_tracking for each devices, actually
once is enough.
2. Conflicts: The ram_save_setup() traverses all memory_listeners
to execute their log_start() and log_sync() hooks to get the
first round dirty bitmap, which is used by the bulk stage of
ram saving. However, as vfio dirty tracking is not yet started,
it can't get dirty bitmap from vfio. Then we give up the chance
to handle vfio dirty page at bulk stage.
Move the switch of vfio dirty_page_tracking into vfio_memory_listener
can solve above problems. Besides, Do not require devices in SAVING
state for vfio_sync_dirty_bitmap().
[1] https://www.spinics.net/lists/kvm/msg229967.html
Reported-by: Zenghui Yu <yuzenghui@huawei.com>
Signed-off-by: Keqian Zhu <zhukeqian1@huawei.com>
Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Message-Id: <20210309031913.11508-1-zhukeqian1@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-09 06:19:13 +03:00
|
|
|
.log_global_start = vfio_listener_log_global_start,
|
|
|
|
.log_global_stop = vfio_listener_log_global_stop,
|
2020-12-04 04:42:40 +03:00
|
|
|
.log_sync = vfio_listener_log_sync,
|
2014-12-22 19:54:51 +03:00
|
|
|
};
|
|
|
|
|
2015-03-02 21:38:55 +03:00
|
|
|
/*
 * Unregister the container's memory listener and, for sPAPR TCE v2
 * containers, its prereg listener as well.
 */
static void vfio_listener_release(VFIOContainer *container)
{
    memory_listener_unregister(&container->listener);

    if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
        return;
    }

    memory_listener_unregister(&container->prereg_listener);
}
|
|
|
|
|
2020-10-26 18:34:32 +03:00
|
|
|
static struct vfio_info_cap_header *
|
|
|
|
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
|
2016-05-26 18:43:20 +03:00
|
|
|
{
|
|
|
|
struct vfio_info_cap_header *hdr;
|
|
|
|
|
2020-10-26 18:34:32 +03:00
|
|
|
for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
|
2016-05-26 18:43:20 +03:00
|
|
|
if (hdr->id == id) {
|
|
|
|
return hdr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-10-26 18:34:32 +03:00
|
|
|
struct vfio_info_cap_header *
|
|
|
|
vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
|
|
|
|
{
|
|
|
|
if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return vfio_get_cap((void *)info, info->cap_offset, id);
|
|
|
|
}
|
|
|
|
|
2020-10-26 18:34:33 +03:00
|
|
|
static struct vfio_info_cap_header *
|
|
|
|
vfio_get_iommu_type1_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
|
|
|
|
{
|
|
|
|
if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return vfio_get_cap((void *)info, info->cap_offset, id);
|
|
|
|
}
|
|
|
|
|
2020-10-26 18:34:40 +03:00
|
|
|
struct vfio_info_cap_header *
|
|
|
|
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
|
|
|
|
{
|
|
|
|
if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
return vfio_get_cap((void *)info, info->cap_offset, id);
|
|
|
|
}
|
|
|
|
|
2020-10-26 18:34:33 +03:00
|
|
|
bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
|
|
|
|
unsigned int *avail)
|
|
|
|
{
|
|
|
|
struct vfio_info_cap_header *hdr;
|
|
|
|
struct vfio_iommu_type1_info_dma_avail *cap;
|
|
|
|
|
|
|
|
/* If the capability cannot be found, assume no DMA limiting */
|
|
|
|
hdr = vfio_get_iommu_type1_info_cap(info,
|
|
|
|
VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL);
|
|
|
|
if (hdr == NULL) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (avail != NULL) {
|
|
|
|
cap = (void *) hdr;
|
|
|
|
*avail = cap->avail;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-10-31 18:53:03 +03:00
|
|
|
/*
 * Populate region->mmaps from the region's sparse mmap capability.
 *
 * Areas with a size of zero are dropped; the array is then shrunk to the
 * number of areas actually kept and region->nr_mmaps is set to match.
 *
 * Returns 0 on success, -ENODEV if the region has no sparse mmap capability.
 */
static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_region_info_cap_sparse_mmap *sparse;
    struct vfio_info_cap_header *hdr;
    int area, kept = 0;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    /* Allocate for the worst case; trimmed below */
    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (area = 0; area < sparse->nr_areas; area++) {
        if (!sparse->areas[area].size) {
            continue;
        }

        trace_vfio_region_sparse_mmap_entry(area, sparse->areas[area].offset,
                                        sparse->areas[area].offset +
                                        sparse->areas[area].size - 1);
        region->mmaps[kept].offset = sparse->areas[area].offset;
        region->mmaps[kept].size = sparse->areas[area].size;
        kept++;
    }

    region->nr_mmaps = kept;
    region->mmaps = g_realloc(region->mmaps, kept * sizeof(VFIOMmap));

    return 0;
}
|
|
|
|
|
2016-03-10 19:39:07 +03:00
|
|
|
/*
 * Initialise @region from region @index of @vbasedev.
 *
 * Copies flags, size and fd offset from the kernel's region info. For a
 * non-empty region an I/O MemoryRegion named @name and owned by @obj is
 * created. If mmap is allowed (device not flagged no_mmap and the region
 * advertises VFIO_REGION_INFO_FLAG_MMAP), the mmap layout is taken from the
 * sparse mmap capability, falling back to one mapping that covers the whole
 * region.
 *
 * Returns 0 on success or the error from vfio_get_region_info().
 */
int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    struct vfio_region_info *info;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->nr = index;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;

    if (region->size) {
        bool can_mmap;

        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        can_mmap = !vbasedev->no_mmap &&
                   (region->flags & VFIO_REGION_INFO_FLAG_MMAP);

        if (can_mmap && vfio_setup_region_sparse_mmaps(region, info)) {
            /* No usable sparse layout: map the whole region in one piece */
            region->nr_mmaps = 1;
            region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
            region->mmaps[0].offset = 0;
            region->mmaps[0].size = region->size;
        }
    }

    g_free(info);

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2020-10-26 12:36:11 +03:00
|
|
|
/*
 * Tear down mmap number @index of @region: remove its subregion from the
 * region's container MemoryRegion, unmap the host mapping, unparent the
 * subregion object and clear the mmap pointer so the slot reads as unmapped.
 */
static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}
|
|
|
|
|
2016-03-10 19:39:07 +03:00
|
|
|
/*
 * mmap every entry of region->mmaps and expose each mapping as a RAM-device
 * subregion of region->mem.
 *
 * The protection flags follow the region's READ/WRITE flags. If any mmap
 * fails, all mappings established so far are torn down again and the
 * negative errno of the failing mmap is returned.
 *
 * Returns 0 on success (including when the region has no MemoryRegion).
 */
int vfio_region_mmap(VFIORegion *region)
{
    int i, prot = 0;
    char *name;

    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
                                     MAP_SHARED, region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            /* Capture errno before anything else can overwrite it */
            int ret = -errno;

            trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                         region->fd_offset +
                                         region->mmaps[i].offset,
                                         region->fd_offset +
                                         region->mmaps[i].offset +
                                         region->mmaps[i].size - 1, ret);

            region->mmaps[i].mmap = NULL;

            /* Roll back the mappings that did succeed */
            for (i--; i >= 0; i--) {
                vfio_subregion_unmap(region, i);
            }

            return ret;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;
}
|
|
|
|
|
2020-10-26 12:36:11 +03:00
|
|
|
/*
 * Unmap every currently mapped entry of region->mmaps. Does nothing when the
 * region has no MemoryRegion.
 */
void vfio_region_unmap(VFIORegion *region)
{
    int idx;

    if (!region->mem) {
        return;
    }

    for (idx = 0; idx < region->nr_mmaps; idx++) {
        if (!region->mmaps[idx].mmap) {
            continue;
        }
        vfio_subregion_unmap(region, idx);
    }
}
|
|
|
|
|
2016-03-10 19:39:07 +03:00
|
|
|
/*
 * Detach all mapped mmap subregions from region->mem. The host mappings and
 * the region's memory are left in place here. Does nothing when the region
 * has no MemoryRegion.
 */
void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}
|
|
|
|
|
|
|
|
/*
 * Release everything vfio_region_setup()/vfio_region_mmap() created for
 * @region: host mappings, subregion objects, the container MemoryRegion and
 * the mmaps array. Afterwards the bookkeeping fields are reset so the region
 * reads as empty. Does nothing when the region has no MemoryRegion.
 */
void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    /* Reset so that a later call sees !region->mem and returns early */
    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}
|
|
|
|
|
|
|
|
/*
 * Enable or disable every mapped mmap subregion of @region. Does nothing
 * when the region has no MemoryRegion.
 */
void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}
|
|
|
|
|
|
|
|
void vfio_reset_handler(void *opaque)
|
|
|
|
{
|
|
|
|
VFIOGroup *group;
|
|
|
|
VFIODevice *vbasedev;
|
|
|
|
|
|
|
|
QLIST_FOREACH(group, &vfio_group_list, next) {
|
|
|
|
QLIST_FOREACH(vbasedev, &group->device_list, next) {
|
2017-07-10 19:39:43 +03:00
|
|
|
if (vbasedev->dev->realized) {
|
|
|
|
vbasedev->ops->vfio_compute_needs_reset(vbasedev);
|
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
QLIST_FOREACH(group, &vfio_group_list, next) {
|
|
|
|
QLIST_FOREACH(vbasedev, &group->device_list, next) {
|
2017-07-10 19:39:43 +03:00
|
|
|
if (vbasedev->dev->realized && vbasedev->needs_reset) {
|
2014-12-22 19:54:51 +03:00
|
|
|
vbasedev->ops->vfio_hot_reset_multi(vbasedev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Register @group with the KVM VFIO pseudo device.
 *
 * Compiled to a no-op without CONFIG_KVM and does nothing when KVM is not
 * enabled. The KVM VFIO device is created on first use and its fd cached in
 * vfio_kvm_device_fd. Failures are reported but not propagated.
 */
static void vfio_kvm_device_add_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr add_req = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_ADD,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (!kvm_enabled()) {
        return;
    }

    if (vfio_kvm_device_fd < 0) {
        /* First group: create the KVM VFIO device */
        struct kvm_create_device create = {
            .type = KVM_DEV_TYPE_VFIO,
        };

        if (kvm_vm_ioctl(kvm_state, KVM_CREATE_DEVICE, &create)) {
            error_report("Failed to create KVM VFIO device: %m");
            return;
        }

        vfio_kvm_device_fd = create.fd;
    }

    if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &add_req)) {
        error_report("Failed to add group %d to KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
|
|
|
|
|
|
|
|
/*
 * Remove @group from the KVM VFIO pseudo device.
 *
 * Compiled to a no-op without CONFIG_KVM and does nothing if the KVM VFIO
 * device was never created. A failure is reported but not propagated.
 */
static void vfio_kvm_device_del_group(VFIOGroup *group)
{
#ifdef CONFIG_KVM
    struct kvm_device_attr del_req = {
        .group = KVM_DEV_VFIO_GROUP,
        .attr = KVM_DEV_VFIO_GROUP_DEL,
        .addr = (uint64_t)(unsigned long)&group->fd,
    };

    if (vfio_kvm_device_fd >= 0 &&
        ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &del_req)) {
        error_report("Failed to remove group %d from KVM VFIO device: %m",
                     group->groupid);
    }
#endif
}
|
|
|
|
|
|
|
|
/*
 * Return the VFIOAddressSpace tracking @as, creating one and inserting it
 * at the head of vfio_address_spaces if none exists yet.
 */
static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as)
{
    VFIOAddressSpace *entry;

    QLIST_FOREACH(entry, &vfio_address_spaces, list) {
        if (entry->as == as) {
            return entry;
        }
    }

    /* No suitable VFIOAddressSpace, create a new one */
    entry = g_new0(VFIOAddressSpace, 1);
    entry->as = as;
    QLIST_INIT(&entry->containers);

    QLIST_INSERT_HEAD(&vfio_address_spaces, entry, list);

    return entry;
}
|
|
|
|
|
|
|
|
/*
 * Drop @space once it no longer has any containers: unlink it from the
 * global list and free it. A space that still has containers is kept.
 */
static void vfio_put_address_space(VFIOAddressSpace *space)
{
    if (!QLIST_EMPTY(&space->containers)) {
        return;
    }

    QLIST_REMOVE(space, list);
    g_free(space);
}
|
|
|
|
|
2019-02-22 07:07:03 +03:00
|
|
|
/*
|
|
|
|
* vfio_get_iommu_type - selects the richest iommu_type (v2 first)
|
|
|
|
*/
|
|
|
|
static int vfio_get_iommu_type(VFIOContainer *container,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
|
|
|
|
VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
|
|
|
|
if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
|
|
|
|
return iommu_types[i];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
error_setg(errp, "No available IOMMU models");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vfio_init_container(VFIOContainer *container, int group_fd,
|
|
|
|
Error **errp)
|
|
|
|
{
|
|
|
|
int iommu_type, ret;
|
|
|
|
|
|
|
|
iommu_type = vfio_get_iommu_type(container, errp);
|
|
|
|
if (iommu_type < 0) {
|
|
|
|
return iommu_type;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
|
|
|
|
if (ret) {
|
|
|
|
error_setg_errno(errp, errno, "Failed to set group container");
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
|
|
|
|
if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
|
|
|
|
/*
|
|
|
|
* On sPAPR, despite the IOMMU subdriver always advertises v1 and
|
|
|
|
* v2, the running platform may not support v2 and there is no
|
|
|
|
* way to guess it until an IOMMU group gets added to the container.
|
|
|
|
* So in case it fails with v2, try v1 as a fallback.
|
|
|
|
*/
|
|
|
|
iommu_type = VFIO_SPAPR_TCE_IOMMU;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
error_setg_errno(errp, errno, "Failed to set iommu for container");
|
|
|
|
return -errno;
|
|
|
|
}
|
|
|
|
|
|
|
|
container->iommu_type = iommu_type;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2020-10-26 12:36:21 +03:00
|
|
|
/*
 * Retrieve the container's type1 IOMMU info, growing the buffer until the
 * kernel's reported argsz fits.
 *
 * On success returns 0 and *@info points to a heap buffer allocated with
 * g_new0()/g_realloc() (presumably released by the caller with g_free()).
 * On failure the buffer is freed, *@info is set to NULL and -errno from the
 * failing ioctl is returned.
 */
static int vfio_get_iommu_info(VFIOContainer *container,
                               struct vfio_iommu_type1_info **info)
{
    size_t argsz = sizeof(struct vfio_iommu_type1_info);

    *info = g_new0(struct vfio_iommu_type1_info, 1);

    for (;;) {
        (*info)->argsz = argsz;

        if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
            /* Save errno first: freeing the buffer may overwrite it */
            int ret = -errno;

            g_free(*info);
            *info = NULL;
            return ret;
        }

        if ((*info)->argsz <= argsz) {
            return 0;
        }

        /* The kernel needs more room (e.g. for capabilities): retry */
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);
    }
}
|
|
|
|
|
|
|
|
/*
 * Find capability @id in a type1 IOMMU info structure.
 *
 * Thin wrapper around vfio_get_iommu_type1_info_cap(), which performs the
 * same VFIO_IOMMU_INFO_CAPS check and capability chain walk this function
 * used to duplicate. Returns NULL when the capability is not present.
 */
static struct vfio_info_cap_header *
vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
{
    return vfio_get_iommu_type1_info_cap(info, id);
}
|
|
|
|
|
|
|
|
/*
 * Record the container's dirty page tracking capabilities from the
 * VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION capability in @info.
 *
 * Leaves @container untouched when the capability is absent or when its
 * page size bitmap does not include the real host page size.
 */
static void vfio_get_iommu_info_migration(VFIOContainer *container,
                                          struct vfio_iommu_type1_info *info)
{
    struct vfio_iommu_type1_info_cap_migration *mig;
    struct vfio_info_cap_header *cap_hdr;

    cap_hdr = vfio_get_iommu_info_cap(info,
                                      VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION);
    if (!cap_hdr) {
        return;
    }

    mig = container_of(cap_hdr, struct vfio_iommu_type1_info_cap_migration,
                       header);

    /*
     * cpu_physical_memory_set_dirty_lebitmap() supports pages in bitmap of
     * qemu_real_host_page_size to mark those dirty.
     */
    if (!(mig->pgsize_bitmap & qemu_real_host_page_size())) {
        return;
    }

    container->dirty_pages_supported = true;
    container->max_dirty_bitmap_size = mig->max_dirty_bitmap_size;
    container->dirty_pgsizes = mig->pgsize_bitmap;
}
|
|
|
|
|
2016-10-17 19:57:59 +03:00
|
|
|
static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
|
|
|
|
Error **errp)
|
2014-12-22 19:54:51 +03:00
|
|
|
{
|
|
|
|
VFIOContainer *container;
|
|
|
|
int ret, fd;
|
|
|
|
VFIOAddressSpace *space;
|
|
|
|
|
|
|
|
space = vfio_get_address_space(as);
|
|
|
|
|
2018-08-17 18:27:16 +03:00
|
|
|
/*
|
2020-06-26 10:22:30 +03:00
|
|
|
* VFIO is currently incompatible with discarding of RAM insofar as the
|
2018-08-17 18:27:16 +03:00
|
|
|
* madvise to purge (zap) the page from QEMU's address space does not
|
|
|
|
* interact with the memory API and therefore leaves stale virtual to
|
|
|
|
* physical mappings in the IOMMU if the page was previously pinned. We
|
2020-06-26 10:22:30 +03:00
|
|
|
* therefore set discarding broken for each group added to a container,
|
2018-08-17 18:27:16 +03:00
|
|
|
* whether the container is used individually or shared. This provides
|
|
|
|
* us with options to allow devices within a group to opt-in and allow
|
2020-06-26 10:22:30 +03:00
|
|
|
* discarding, so long as it is done consistently for a group (for instance
|
2018-08-17 18:27:16 +03:00
|
|
|
* if the device is an mdev device where it is known that the host vendor
|
|
|
|
* driver will never pin pages outside of the working set of the guest
|
2020-06-26 10:22:30 +03:00
|
|
|
* driver, which would thus not be discarding candidates).
|
2018-08-17 18:27:16 +03:00
|
|
|
*
|
|
|
|
* The first opportunity to induce pinning occurs here where we attempt to
|
|
|
|
* attach the group to existing containers within the AddressSpace. If any
|
2020-06-26 10:22:30 +03:00
|
|
|
* pages are already zapped from the virtual address space, such as from
|
|
|
|
* previous discards, new pinning will cause valid mappings to be
|
2018-08-17 18:27:16 +03:00
|
|
|
* re-established. Likewise, when the overall MemoryListener for a new
|
|
|
|
* container is registered, a replay of mappings within the AddressSpace
|
|
|
|
* will occur, re-establishing any previously zapped pages as well.
|
|
|
|
*
|
2020-06-26 10:22:30 +03:00
|
|
|
* Especially virtio-balloon is currently only prevented from discarding
|
|
|
|
* new memory, it will not yet set ram_block_discard_set_required() and
|
|
|
|
* therefore, neither stops us here or deals with the sudden memory
|
|
|
|
* consumption of inflated memory.
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
*
|
|
|
|
* We do support discarding of memory coordinated via the RamDiscardManager
|
|
|
|
* with some IOMMU types. vfio_ram_block_discard_disable() handles the
|
|
|
|
* details once we know which type of IOMMU we are using.
|
2018-08-17 18:27:16 +03:00
|
|
|
*/
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
QLIST_FOREACH(container, &space->containers, next) {
|
|
|
|
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
ret = vfio_ram_block_discard_disable(container, true);
|
|
|
|
if (ret) {
|
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"Cannot set discarding of RAM broken");
|
|
|
|
if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
|
|
|
|
&container->fd)) {
|
|
|
|
error_report("vfio: error disconnecting group %d from"
|
|
|
|
" container", group->groupid);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
group->container = container;
|
|
|
|
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
|
2017-12-13 20:19:32 +03:00
|
|
|
vfio_kvm_device_add_group(group);
|
2014-12-22 19:54:51 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-07-21 15:25:21 +03:00
|
|
|
fd = qemu_open_old("/dev/vfio/vfio", O_RDWR);
|
2014-12-22 19:54:51 +03:00
|
|
|
if (fd < 0) {
|
2016-10-17 19:57:59 +03:00
|
|
|
error_setg_errno(errp, errno, "failed to open /dev/vfio/vfio");
|
2014-12-22 19:54:51 +03:00
|
|
|
ret = -errno;
|
|
|
|
goto put_space_exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ioctl(fd, VFIO_GET_API_VERSION);
|
|
|
|
if (ret != VFIO_API_VERSION) {
|
2016-10-17 19:57:59 +03:00
|
|
|
error_setg(errp, "supported vfio version: %d, "
|
|
|
|
"reported version: %d", VFIO_API_VERSION, ret);
|
2014-12-22 19:54:51 +03:00
|
|
|
ret = -EINVAL;
|
|
|
|
goto close_fd_exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
container = g_malloc0(sizeof(*container));
|
|
|
|
container->space = space;
|
|
|
|
container->fd = fd;
|
2019-09-24 11:25:16 +03:00
|
|
|
container->error = NULL;
|
2020-10-26 12:36:21 +03:00
|
|
|
container->dirty_pages_supported = false;
|
2021-04-13 12:55:25 +03:00
|
|
|
container->dma_max_mappings = 0;
|
2017-12-13 20:19:33 +03:00
|
|
|
QLIST_INIT(&container->giommu_list);
|
|
|
|
QLIST_INIT(&container->hostwin_list);
|
2021-04-13 12:55:24 +03:00
|
|
|
QLIST_INIT(&container->vrdl_list);
|
2015-02-10 20:25:44 +03:00
|
|
|
|
2019-02-22 07:07:03 +03:00
|
|
|
ret = vfio_init_container(container, group->fd, errp);
|
|
|
|
if (ret) {
|
|
|
|
goto free_container_exit;
|
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
ret = vfio_ram_block_discard_disable(container, true);
|
|
|
|
if (ret) {
|
|
|
|
error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
|
|
|
|
goto free_container_exit;
|
|
|
|
}
|
|
|
|
|
2019-02-22 07:07:03 +03:00
|
|
|
switch (container->iommu_type) {
|
|
|
|
case VFIO_TYPE1v2_IOMMU:
|
|
|
|
case VFIO_TYPE1_IOMMU:
|
|
|
|
{
|
2020-10-26 12:36:21 +03:00
|
|
|
struct vfio_iommu_type1_info *info;
|
vfio: Check guest IOVA ranges against host IOMMU capabilities
The current vfio core code assumes that the host IOMMU is capable of
mapping any IOVA the guest wants to use to where we need. However, real
IOMMUs generally only support translating a certain range of IOVAs (the
"DMA window") not a full 64-bit address space.
The common x86 IOMMUs support a wide enough range that guests are very
unlikely to go beyond it in practice, however the IOMMU used on IBM Power
machines - in the default configuration - supports only a much more limited
IOVA range, usually 0..2GiB.
If the guest attempts to set up an IOVA range that the host IOMMU can't
map, qemu won't report an error until it actually attempts to map a bad
IOVA. If guest RAM is being mapped directly into the IOMMU (i.e. no guest
visible IOMMU) then this will show up very quickly. If there is a guest
visible IOMMU, however, the problem might not show up until much later when
the guest actually attempt to DMA with an IOVA the host can't handle.
This patch adds a test so that we will detect earlier if the guest is
attempting to use IOVA ranges that the host IOMMU won't be able to deal
with.
For now, we assume that "Type1" (x86) IOMMUs can support any IOVA, this is
incorrect, but no worse than what we have already. We can't do better for
now because the Type1 kernel interface doesn't tell us what IOVA range the
IOMMU actually supports.
For the Power "sPAPR TCE" IOMMU, however, we can retrieve the supported
IOVA range and validate guest IOVA ranges against it, and this patch does
so.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2015-09-30 05:13:53 +03:00
|
|
|
|
2020-10-26 12:36:21 +03:00
|
|
|
ret = vfio_get_iommu_info(container, &info);
|
2022-09-15 20:18:27 +03:00
|
|
|
if (ret) {
|
|
|
|
error_setg_errno(errp, -ret, "Failed to get VFIO IOMMU info");
|
|
|
|
goto enable_discards_exit;
|
|
|
|
}
|
2020-10-26 12:36:21 +03:00
|
|
|
|
2022-09-15 20:18:27 +03:00
|
|
|
if (info->flags & VFIO_IOMMU_INFO_PGSIZES) {
|
|
|
|
container->pgsizes = info->iova_pgsizes;
|
|
|
|
} else {
|
|
|
|
container->pgsizes = qemu_real_host_page_size();
|
2020-10-26 12:36:21 +03:00
|
|
|
}
|
2022-09-15 20:18:27 +03:00
|
|
|
|
|
|
|
if (!vfio_get_info_dma_avail(info, &container->dma_max_mappings)) {
|
|
|
|
container->dma_max_mappings = 65535;
|
2015-09-30 05:13:54 +03:00
|
|
|
}
|
2022-09-15 20:18:27 +03:00
|
|
|
vfio_get_iommu_info_migration(container, info);
|
2020-10-26 12:36:21 +03:00
|
|
|
g_free(info);
|
2022-09-15 20:18:27 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
|
|
|
|
* information to get the actual window extent rather than assume
|
|
|
|
* a 64-bit IOVA address space.
|
|
|
|
*/
|
|
|
|
vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
|
|
|
|
|
2019-02-22 07:07:03 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case VFIO_SPAPR_TCE_v2_IOMMU:
|
|
|
|
case VFIO_SPAPR_TCE_IOMMU:
|
|
|
|
{
|
vfio: Check guest IOVA ranges against host IOMMU capabilities
The current vfio core code assumes that the host IOMMU is capable of
mapping any IOVA the guest wants to use to where we need. However, real
IOMMUs generally only support translating a certain range of IOVAs (the
"DMA window") not a full 64-bit address space.
The common x86 IOMMUs support a wide enough range that guests are very
unlikely to go beyond it in practice, however the IOMMU used on IBM Power
machines - in the default configuration - supports only a much more limited
IOVA range, usually 0..2GiB.
If the guest attempts to set up an IOVA range that the host IOMMU can't
map, qemu won't report an error until it actually attempts to map a bad
IOVA. If guest RAM is being mapped directly into the IOMMU (i.e. no guest
visible IOMMU) then this will show up very quickly. If there is a guest
visible IOMMU, however, the problem might not show up until much later when
the guest actually attempt to DMA with an IOVA the host can't handle.
This patch adds a test so that we will detect earlier if the guest is
attempting to use IOVA ranges that the host IOMMU won't be able to deal
with.
For now, we assume that "Type1" (x86) IOMMUs can support any IOVA, this is
incorrect, but no worse than what we have already. We can't do better for
now because the Type1 kernel interface doesn't tell us what IOVA range the
IOMMU actually supports.
For the Power "sPAPR TCE" IOMMU, however, we can retrieve the supported
IOVA range and validate guest IOVA ranges against it, and this patch does
so.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2015-09-30 05:13:53 +03:00
|
|
|
struct vfio_iommu_spapr_tce_info info;
|
2019-02-22 07:07:03 +03:00
|
|
|
bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
|
2014-12-22 19:54:51 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The host kernel code implementing VFIO_IOMMU_DISABLE is called
|
|
|
|
* when container fd is closed so we do not call it explicitly
|
|
|
|
* in this file.
|
|
|
|
*/
|
2016-07-04 06:33:04 +03:00
|
|
|
if (!v2) {
|
|
|
|
ret = ioctl(fd, VFIO_IOMMU_ENABLE);
|
|
|
|
if (ret) {
|
2016-10-17 19:57:59 +03:00
|
|
|
error_setg_errno(errp, errno, "failed to enable container");
|
2016-07-04 06:33:04 +03:00
|
|
|
ret = -errno;
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
goto enable_discards_exit;
|
2016-07-04 06:33:04 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
container->prereg_listener = vfio_prereg_listener;
|
|
|
|
|
|
|
|
memory_listener_register(&container->prereg_listener,
|
|
|
|
&address_space_memory);
|
|
|
|
if (container->error) {
|
|
|
|
memory_listener_unregister(&container->prereg_listener);
|
2019-09-24 11:25:16 +03:00
|
|
|
ret = -1;
|
|
|
|
error_propagate_prepend(errp, container->error,
|
|
|
|
"RAM memory listener initialization failed: ");
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
goto enable_discards_exit;
|
2016-07-04 06:33:04 +03:00
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
vfio: Check guest IOVA ranges against host IOMMU capabilities
The current vfio core code assumes that the host IOMMU is capable of
mapping any IOVA the guest wants to use to where we need. However, real
IOMMUs generally only support translating a certain range of IOVAs (the
"DMA window") not a full 64-bit address space.
The common x86 IOMMUs support a wide enough range that guests are very
unlikely to go beyond it in practice, however the IOMMU used on IBM Power
machines - in the default configuration - supports only a much more limited
IOVA range, usually 0..2GiB.
If the guest attempts to set up an IOVA range that the host IOMMU can't
map, qemu won't report an error until it actually attempts to map a bad
IOVA. If guest RAM is being mapped directly into the IOMMU (i.e. no guest
visible IOMMU) then this will show up very quickly. If there is a guest
visible IOMMU, however, the problem might not show up until much later when
the guest actually attempt to DMA with an IOVA the host can't handle.
This patch adds a test so that we will detect earlier if the guest is
attempting to use IOVA ranges that the host IOMMU won't be able to deal
with.
For now, we assume that "Type1" (x86) IOMMUs can support any IOVA, this is
incorrect, but no worse than what we have already. We can't do better for
now because the Type1 kernel interface doesn't tell us what IOVA range the
IOMMU actually supports.
For the Power "sPAPR TCE" IOMMU, however, we can retrieve the supported
IOVA range and validate guest IOVA ranges against it, and this patch does
so.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2015-09-30 05:13:53 +03:00
|
|
|
|
|
|
|
info.argsz = sizeof(info);
|
|
|
|
ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
|
|
|
|
if (ret) {
|
2016-10-17 19:57:59 +03:00
|
|
|
error_setg_errno(errp, errno,
|
|
|
|
"VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
|
vfio: Check guest IOVA ranges against host IOMMU capabilities
The current vfio core code assumes that the host IOMMU is capable of
mapping any IOVA the guest wants to use to where we need. However, real
IOMMUs generally only support translating a certain range of IOVAs (the
"DMA window") not a full 64-bit address space.
The common x86 IOMMUs support a wide enough range that guests are very
unlikely to go beyond it in practice, however the IOMMU used on IBM Power
machines - in the default configuration - supports only a much more limited
IOVA range, usually 0..2GiB.
If the guest attempts to set up an IOVA range that the host IOMMU can't
map, qemu won't report an error until it actually attempts to map a bad
IOVA. If guest RAM is being mapped directly into the IOMMU (i.e. no guest
visible IOMMU) then this will show up very quickly. If there is a guest
visible IOMMU, however, the problem might not show up until much later when
the guest actually attempt to DMA with an IOVA the host can't handle.
This patch adds a test so that we will detect earlier if the guest is
attempting to use IOVA ranges that the host IOMMU won't be able to deal
with.
For now, we assume that "Type1" (x86) IOMMUs can support any IOVA, this is
incorrect, but no worse than what we have already. We can't do better for
now because the Type1 kernel interface doesn't tell us what IOVA range the
IOMMU actually supports.
For the Power "sPAPR TCE" IOMMU, however, we can retrieve the supported
IOVA range and validate guest IOVA ranges against it, and this patch does
so.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2015-09-30 05:13:53 +03:00
|
|
|
ret = -errno;
|
2016-07-04 06:33:04 +03:00
|
|
|
if (v2) {
|
|
|
|
memory_listener_unregister(&container->prereg_listener);
|
|
|
|
}
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
goto enable_discards_exit;
|
vfio: Check guest IOVA ranges against host IOMMU capabilities
The current vfio core code assumes that the host IOMMU is capable of
mapping any IOVA the guest wants to use to where we need. However, real
IOMMUs generally only support translating a certain range of IOVAs (the
"DMA window") not a full 64-bit address space.
The common x86 IOMMUs support a wide enough range that guests are very
unlikely to go beyond it in practice, however the IOMMU used on IBM Power
machines - in the default configuration - supports only a much more limited
IOVA range, usually 0..2GiB.
If the guest attempts to set up an IOVA range that the host IOMMU can't
map, qemu won't report an error until it actually attempts to map a bad
IOVA. If guest RAM is being mapped directly into the IOMMU (i.e. no guest
visible IOMMU) then this will show up very quickly. If there is a guest
visible IOMMU, however, the problem might not show up until much later when
the guest actually attempt to DMA with an IOVA the host can't handle.
This patch adds a test so that we will detect earlier if the guest is
attempting to use IOVA ranges that the host IOMMU won't be able to deal
with.
For now, we assume that "Type1" (x86) IOMMUs can support any IOVA, this is
incorrect, but no worse than what we have already. We can't do better for
now because the Type1 kernel interface doesn't tell us what IOVA range the
IOMMU actually supports.
For the Power "sPAPR TCE" IOMMU, however, we can retrieve the supported
IOVA range and validate guest IOVA ranges against it, and this patch does
so.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2015-09-30 05:13:53 +03:00
|
|
|
}
|
2015-09-30 05:13:54 +03:00
|
|
|
|
2016-07-04 06:33:06 +03:00
|
|
|
if (v2) {
|
2018-06-20 12:10:12 +03:00
|
|
|
container->pgsizes = info.ddw.pgsizes;
|
2016-07-04 06:33:06 +03:00
|
|
|
/*
|
|
|
|
* There is a default window in just created container.
|
|
|
|
* To make region_add/del simpler, we better remove this
|
|
|
|
* window now and let those iommu_listener callbacks
|
|
|
|
* create/remove them when needed.
|
|
|
|
*/
|
|
|
|
ret = vfio_spapr_remove_window(container, info.dma32_window_start);
|
|
|
|
if (ret) {
|
2016-10-17 19:57:59 +03:00
|
|
|
error_setg_errno(errp, -ret,
|
|
|
|
"failed to remove existing window");
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
goto enable_discards_exit;
|
2016-07-04 06:33:06 +03:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* The default table uses 4K pages */
|
2018-06-20 12:10:12 +03:00
|
|
|
container->pgsizes = 0x1000;
|
2016-07-04 06:33:06 +03:00
|
|
|
vfio_host_win_add(container, info.dma32_window_start,
|
|
|
|
info.dma32_window_start +
|
|
|
|
info.dma32_window_size - 1,
|
|
|
|
0x1000);
|
|
|
|
}
|
2019-02-22 07:07:03 +03:00
|
|
|
}
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
|
|
|
|
2017-07-17 21:39:09 +03:00
|
|
|
vfio_kvm_device_add_group(group);
|
|
|
|
|
|
|
|
QLIST_INIT(&container->group_list);
|
|
|
|
QLIST_INSERT_HEAD(&space->containers, container, next);
|
|
|
|
|
|
|
|
group->container = container;
|
|
|
|
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
|
|
|
|
|
2015-09-30 05:13:51 +03:00
|
|
|
container->listener = vfio_memory_listener;
|
|
|
|
|
|
|
|
memory_listener_register(&container->listener, container->space->as);
|
|
|
|
|
|
|
|
if (container->error) {
|
2019-09-24 11:25:16 +03:00
|
|
|
ret = -1;
|
|
|
|
error_propagate_prepend(errp, container->error,
|
|
|
|
"memory listener initialization failed: ");
|
2015-09-30 05:13:51 +03:00
|
|
|
goto listener_release_exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
container->initialized = true;
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
return 0;
|
|
|
|
listener_release_exit:
|
2017-07-17 21:39:09 +03:00
|
|
|
QLIST_REMOVE(group, container_next);
|
|
|
|
QLIST_REMOVE(container, next);
|
|
|
|
vfio_kvm_device_del_group(group);
|
2014-12-22 19:54:51 +03:00
|
|
|
vfio_listener_release(container);
|
|
|
|
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
enable_discards_exit:
|
|
|
|
vfio_ram_block_discard_disable(container, false);
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
free_container_exit:
|
|
|
|
g_free(container);
|
|
|
|
|
|
|
|
close_fd_exit:
|
|
|
|
close(fd);
|
|
|
|
|
|
|
|
put_space_exit:
|
|
|
|
vfio_put_address_space(space);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Detach @group from its container.
 *
 * The group is unlinked from the container's group list and told (via
 * VFIO_GROUP_UNSET_CONTAINER) to drop the container.  If that leaves the
 * container without any group, the container itself is torn down: its
 * guest IOMMU notifiers and host DMA windows are freed, its fd is closed,
 * and the reference on its address space is dropped.
 */
static void vfio_disconnect_container(VFIOGroup *group)
{
    VFIOContainer *container = group->container;
    VFIOAddressSpace *space;
    VFIOGuestIOMMU *giommu, *giommu_tmp;
    VFIOHostDMAWindow *hostwin, *hostwin_tmp;

    QLIST_REMOVE(group, container_next);
    group->container = NULL;

    /*
     * The listener has to go before the container is unset: unsetting
     * may destroy the backend container when this was its last group.
     */
    if (QLIST_EMPTY(&container->group_list)) {
        vfio_listener_release(container);
    }

    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
        error_report("vfio: error disconnecting group %d from container",
                     group->groupid);
    }

    /* Other groups still use this container: nothing more to do. */
    if (!QLIST_EMPTY(&container->group_list)) {
        return;
    }

    /* Remember the address space; the container is freed before the put. */
    space = container->space;

    QLIST_REMOVE(container, next);

    QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next,
                       giommu_tmp) {
        memory_region_unregister_iommu_notifier(
                MEMORY_REGION(giommu->iommu_mr), &giommu->n);
        QLIST_REMOVE(giommu, giommu_next);
        g_free(giommu);
    }

    QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
                       hostwin_tmp) {
        QLIST_REMOVE(hostwin, hostwin_next);
        g_free(hostwin);
    }

    trace_vfio_disconnect_container(container->fd);
    close(container->fd);
    g_free(container);

    vfio_put_address_space(space);
}
|
|
|
|
|
2016-10-17 19:57:59 +03:00
|
|
|
/*
 * Look up or create the VFIOGroup for @groupid in address space @as.
 *
 * An already-open group is returned as is when it lives in @as; it is an
 * error for the same group to be requested for a different address space.
 * Otherwise /dev/vfio/<groupid> is opened, checked for viability and
 * connected to a container.  The VFIO reset handler is registered when the
 * first group is added to the global list.
 *
 * Returns the group, or NULL with @errp set on failure.
 */
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
{
    struct vfio_group_status status = { .argsz = sizeof(status) };
    char path[32];
    VFIOGroup *group;

    QLIST_FOREACH(group, &vfio_group_list, next) {
        if (group->groupid != groupid) {
            continue;
        }

        /* Known group: only usable if it is in the requested context. */
        if (group->container->space->as != as) {
            error_setg(errp, "group %d used in multiple address spaces",
                       group->groupid);
            return NULL;
        }
        return group;
    }

    group = g_malloc0(sizeof(*group));

    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
    group->fd = qemu_open_old(path, O_RDWR);
    if (group->fd < 0) {
        error_setg_errno(errp, errno, "failed to open %s", path);
        goto free_group_exit;
    }

    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
        error_setg_errno(errp, errno, "failed to get group %d status", groupid);
        goto close_fd_exit;
    }

    if ((status.flags & VFIO_GROUP_FLAGS_VIABLE) == 0) {
        error_setg(errp, "group %d is not viable", groupid);
        error_append_hint(errp,
                          "Please ensure all devices within the iommu_group "
                          "are bound to their vfio bus driver.\n");
        goto close_fd_exit;
    }

    group->groupid = groupid;
    QLIST_INIT(&group->device_list);

    if (vfio_connect_container(group, as, errp)) {
        error_prepend(errp, "failed to setup container for group %d: ",
                      groupid);
        goto close_fd_exit;
    }

    /* The first group brings the reset handler with it. */
    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_register_reset(vfio_reset_handler, NULL);
    }

    QLIST_INSERT_HEAD(&vfio_group_list, group, next);

    return group;

close_fd_exit:
    close(group->fd);

free_group_exit:
    g_free(group);

    return NULL;
}
|
|
|
|
|
|
|
|
/*
 * Release @group once no device uses it any more.
 *
 * A NULL group or a group that still has devices attached is left alone.
 * Otherwise the group is removed from the KVM VFIO device, disconnected
 * from its container, unlinked from the global list, closed and freed.
 * The reset handler is unregistered when the global list becomes empty.
 */
void vfio_put_group(VFIOGroup *group)
{
    if (!group) {
        return;
    }

    if (!QLIST_EMPTY(&group->device_list)) {
        return;
    }

    /*
     * NOTE(review): presumably balances a discard-disable request taken
     * while the group was set up -- confirm against the connect path.
     */
    if (!group->ram_block_discard_allowed) {
        vfio_ram_block_discard_disable(group->container, false);
    }

    vfio_kvm_device_del_group(group);
    vfio_disconnect_container(group);
    QLIST_REMOVE(group, next);

    trace_vfio_put_group(group->fd);
    close(group->fd);
    g_free(group);

    /* The last group takes the reset handler with it. */
    if (QLIST_EMPTY(&vfio_group_list)) {
        qemu_unregister_reset(vfio_reset_handler, NULL);
    }
}
|
|
|
|
|
2023-06-01 17:45:06 +03:00
|
|
|
struct vfio_device_info *vfio_get_device_info(int fd)
|
|
|
|
{
|
|
|
|
struct vfio_device_info *info;
|
|
|
|
uint32_t argsz = sizeof(*info);
|
|
|
|
|
|
|
|
info = g_malloc0(argsz);
|
|
|
|
|
|
|
|
retry:
|
|
|
|
info->argsz = argsz;
|
|
|
|
|
|
|
|
if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
|
|
|
|
g_free(info);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (info->argsz > argsz) {
|
|
|
|
argsz = info->argsz;
|
|
|
|
info = g_realloc(info, argsz);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
|
|
|
|
return info;
|
|
|
|
}
|
|
|
|
|
2014-12-22 19:54:51 +03:00
|
|
|
int vfio_get_device(VFIOGroup *group, const char *name,
|
2016-10-17 19:58:00 +03:00
|
|
|
VFIODevice *vbasedev, Error **errp)
|
2014-12-22 19:54:51 +03:00
|
|
|
{
|
2023-06-01 17:45:06 +03:00
|
|
|
g_autofree struct vfio_device_info *info = NULL;
|
|
|
|
int fd;
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2015-02-10 20:25:44 +03:00
|
|
|
fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
|
|
|
|
if (fd < 0) {
|
2016-10-17 19:58:00 +03:00
|
|
|
error_setg_errno(errp, errno, "error getting device from group %d",
|
|
|
|
group->groupid);
|
|
|
|
error_append_hint(errp,
|
|
|
|
"Verify all devices in group %d are bound to vfio-<bus> "
|
|
|
|
"or pci-stub and not already in use\n", group->groupid);
|
2015-02-10 20:25:44 +03:00
|
|
|
return fd;
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
|
|
|
|
2023-06-01 17:45:06 +03:00
|
|
|
info = vfio_get_device_info(fd);
|
|
|
|
if (!info) {
|
2016-10-17 19:58:00 +03:00
|
|
|
error_setg_errno(errp, errno, "error getting device info");
|
2015-02-10 20:25:44 +03:00
|
|
|
close(fd);
|
2023-06-01 17:45:06 +03:00
|
|
|
return -1;
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
|
|
|
|
2018-08-17 18:27:16 +03:00
|
|
|
/*
|
2020-06-26 10:22:30 +03:00
|
|
|
* Set discarding of RAM as not broken for this group if the driver knows
|
|
|
|
* the device operates compatibly with discarding. Setting must be
|
|
|
|
* consistent per group, but since compatibility is really only possible
|
|
|
|
* with mdev currently, we expect singleton groups.
|
2018-08-17 18:27:16 +03:00
|
|
|
*/
|
2020-06-26 10:22:30 +03:00
|
|
|
if (vbasedev->ram_block_discard_allowed !=
|
|
|
|
group->ram_block_discard_allowed) {
|
2018-08-17 18:27:16 +03:00
|
|
|
if (!QLIST_EMPTY(&group->device_list)) {
|
2020-06-26 10:22:30 +03:00
|
|
|
error_setg(errp, "Inconsistent setting of support for discarding "
|
|
|
|
"RAM (e.g., balloon) within group");
|
2018-08-23 19:45:58 +03:00
|
|
|
close(fd);
|
2018-08-17 18:27:16 +03:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2020-06-26 10:22:30 +03:00
|
|
|
if (!group->ram_block_discard_allowed) {
|
|
|
|
group->ram_block_discard_allowed = true;
|
vfio: Disable only uncoordinated discards for VFIO_TYPE1 iommus
We support coordinated discarding of RAM using the RamDiscardManager for
the VFIO_TYPE1 iommus. Let's unlock support for coordinated discards,
keeping uncoordinated discards (e.g., via virtio-balloon) disabled if
possible.
This unlocks virtio-mem + vfio on x86-64. Note that vfio used via "nvme://"
by the block layer has to be implemented/unlocked separately. For now,
virtio-mem only supports x86-64; we don't restrict RamDiscardManager to
x86-64, though: arm64 and s390x are supposed to work as well, and we'll
test once unlocking virtio-mem support. The spapr IOMMUs will need special
care, to be tackled later, e.g.., once supporting virtio-mem.
Note: The block size of a virtio-mem device has to be set to sane sizes,
depending on the maximum hotplug size - to not run out of vfio mappings.
The default virtio-mem block size is usually in the range of a couple of
MBs. The maximum number of mapping is 64k, shared with other users.
Assume you want to hotplug 256GB using virtio-mem - the block size would
have to be set to at least 8 MiB (resulting in 32768 separate mappings).
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Pankaj Gupta <pankaj.gupta.linux@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Auger Eric <eric.auger@redhat.com>
Cc: Wei Yang <richard.weiyang@linux.alibaba.com>
Cc: teawater <teawaterz@linux.alibaba.com>
Cc: Marek Kedzierski <mkedzier@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210413095531.25603-14-david@redhat.com>
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2021-04-13 12:55:31 +03:00
|
|
|
vfio_ram_block_discard_disable(group->container, false);
|
2018-08-17 18:27:16 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-02-10 20:25:44 +03:00
|
|
|
vbasedev->fd = fd;
|
|
|
|
vbasedev->group = group;
|
|
|
|
QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
|
|
|
|
|
2023-06-01 17:45:06 +03:00
|
|
|
vbasedev->num_irqs = info->num_irqs;
|
|
|
|
vbasedev->num_regions = info->num_regions;
|
|
|
|
vbasedev->flags = info->flags;
|
|
|
|
|
|
|
|
trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2023-06-01 17:45:06 +03:00
|
|
|
vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
|
2014-12-22 19:54:51 +03:00
|
|
|
|
2015-02-10 20:25:44 +03:00
|
|
|
return 0;
|
2014-12-22 19:54:51 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Undo vfio_get_device() for @vbasedev: unlink it from its group's device
 * list, clear the group back-pointer and close the device fd.  A device
 * that has no group is left untouched.
 */
void vfio_put_base_device(VFIODevice *vbasedev)
{
    if (vbasedev->group) {
        QLIST_REMOVE(vbasedev, next);
        vbasedev->group = NULL;
        trace_vfio_put_base_device(vbasedev->fd);
        close(vbasedev->fd);
    }
}
|
|
|
|
|
2016-03-10 19:39:07 +03:00
|
|
|
/*
 * Query VFIO_DEVICE_GET_REGION_INFO for region @index of @vbasedev.
 *
 * The ioctl is first issued with a buffer the size of the base structure;
 * if the kernel reports a larger argsz, the buffer is grown to that size
 * and the query is repeated.
 *
 * On success 0 is returned and *@info points to a heap-allocated structure
 * that the caller must release with g_free().  On failure *@info is set to
 * NULL and the negated errno of the failing ioctl is returned.
 */
int vfio_get_region_info(VFIODevice *vbasedev, int index,
                         struct vfio_region_info **info)
{
    size_t argsz = sizeof(struct vfio_region_info);

    *info = g_malloc0(argsz);

    (*info)->index = index;
retry:
    (*info)->argsz = argsz;

    if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
        /* Capture errno before g_free() gets a chance to disturb it. */
        int ret = -errno;

        g_free(*info);
        *info = NULL;
        return ret;
    }

    if ((*info)->argsz > argsz) {
        /* The kernel has more to report than fits: grow and ask again. */
        argsz = (*info)->argsz;
        *info = g_realloc(*info, argsz);

        goto retry;
    }

    return 0;
}
|
|
|
|
|
2016-05-26 18:43:20 +03:00
|
|
|
/*
 * Find the first region of @vbasedev whose VFIO_REGION_INFO_CAP_TYPE
 * capability matches @type and @subtype.
 *
 * Regions whose info cannot be fetched or that carry no type capability
 * are skipped.  On a match 0 is returned and *@info holds that region's
 * info, to be released by the caller with g_free().  If no region matches,
 * *@info is set to NULL and -ENODEV is returned.
 */
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
                             uint32_t subtype, struct vfio_region_info **info)
{
    int i;

    for (i = 0; i < vbasedev->num_regions; i++) {
        struct vfio_info_cap_header *hdr;

        if (vfio_get_region_info(vbasedev, i, info)) {
            continue;
        }

        hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
        if (hdr) {
            struct vfio_region_info_cap_type *cap_type =
                container_of(hdr, struct vfio_region_info_cap_type, header);

            trace_vfio_get_dev_region(vbasedev->name, i,
                                      cap_type->type, cap_type->subtype);

            if (cap_type->type == type && cap_type->subtype == subtype) {
                return 0;
            }
        }

        /* Not the region we are after: drop its info and move on. */
        g_free(*info);
    }

    *info = NULL;
    return -ENODEV;
}
|
|
|
|
|
2018-03-13 20:17:31 +03:00
|
|
|
/*
 * Report whether region @region of @vbasedev advertises capability
 * @cap_type.  Returns false as well when the region info cannot be
 * obtained.
 */
bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
    struct vfio_region_info *info = NULL;
    bool found;

    if (vfio_get_region_info(vbasedev, region, &info)) {
        return false;
    }

    found = vfio_get_region_info_cap(info, cap_type) != NULL;
    g_free(info);

    return found;
}
|
|
|
|
|
2016-03-09 03:56:06 +03:00
|
|
|
/*
|
|
|
|
* Interfaces for IBM EEH (Enhanced Error Handling)
|
|
|
|
*/
|
|
|
|
/*
 * Tell whether EEH operations may be issued on @container, i.e. whether
 * it holds exactly one group.
 */
static bool vfio_eeh_container_ok(VFIOContainer *container)
{
    /*
     * As of 2016-03-04 (linux-4.5) the host kernel EEH/VFIO
     * implementation is broken if there are multiple groups in a
     * container.  The hardware works in units of Partitionable
     * Endpoints (== IOMMU groups) and the EEH operations naively
     * iterate across all groups in the container, without any logic
     * to make sure the groups have their state synchronized.  For
     * certain operations (ENABLE) that might be ok, until an error
     * occurs, but for others (GET_STATE) it's clearly broken.
     */

    /*
     * XXX Once fixed kernels exist, test for them here
     */

    if (QLIST_EMPTY(&container->group_list)) {
        return false;
    }

    /* Acceptable only if nothing follows the first group. */
    return !QLIST_NEXT(QLIST_FIRST(&container->group_list), container_next);
}
|
|
|
|
|
|
|
|
/*
 * Issue EEH PE operation @op on @container via VFIO_EEH_PE_OP.
 *
 * Returns -EPERM (after reporting an error) if the container does not
 * hold exactly one group, the negated errno of the ioctl if it fails, and
 * the ioctl's return value otherwise.
 */
static int vfio_eeh_container_op(VFIOContainer *container, uint32_t op)
{
    struct vfio_eeh_pe_op pe_op = {
        .argsz = sizeof(pe_op),
        .op = op,
    };
    int ret;

    if (!vfio_eeh_container_ok(container)) {
        error_report("vfio/eeh: EEH_PE_OP 0x%x: "
                     "kernel requires a container with exactly one group", op);
        return -EPERM;
    }

    ret = ioctl(container->fd, VFIO_EEH_PE_OP, &pe_op);
    if (ret < 0) {
        /*
         * Save errno first: error_report() may change it, and the value
         * returned must be the ioctl's error.  %m still sees the original
         * errno because nothing runs between the ioctl and the report.
         */
        int saved_errno = errno;

        error_report("vfio/eeh: EEH_PE_OP 0x%x failed: %m", op);
        return -saved_errno;
    }

    return ret;
}
|
|
|
|
|
|
|
|
/*
 * Return the container EEH operations for address space @as should act
 * on, or NULL if @as has no container or more than one.
 *
 * The address-space reference taken for the lookup is dropped again
 * before returning.
 */
static VFIOContainer *vfio_eeh_as_container(AddressSpace *as)
{
    VFIOAddressSpace *space = vfio_get_address_space(as);
    VFIOContainer *container = NULL;

    /* With no containers there is nothing to act on. */
    if (!QLIST_EMPTY(&space->containers)) {
        container = QLIST_FIRST(&space->containers);

        if (QLIST_NEXT(container, next)) {
            /* We don't yet have logic to synchronize EEH state across
             * multiple containers */
            container = NULL;
        }
    }

    vfio_put_address_space(space);
    return container;
}
|
|
|
|
|
|
|
|
/*
 * Tell whether EEH operations can be performed on address space @as:
 * it must map to a single container that itself passes
 * vfio_eeh_container_ok().
 */
bool vfio_eeh_as_ok(AddressSpace *as)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    if (!container) {
        return false;
    }

    return vfio_eeh_container_ok(container);
}
|
|
|
|
|
|
|
|
/*
 * Run EEH PE operation @op on the container backing address space @as.
 * Returns -ENODEV when no suitable container exists, otherwise the result
 * of vfio_eeh_container_op().
 */
int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
{
    VFIOContainer *container = vfio_eeh_as_container(as);

    return container ? vfio_eeh_container_op(container, op) : -ENODEV;
}
|