Merge tag 'pull-vfio-20231018' of https://github.com/legoater/qemu into staging

vfio queue:

* Support for VFIODisplay migration with ramfb
* Preliminary work for IOMMUFD support

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmUvlEYACgkQUaNDx8/7
# 7KFlaw//X2053de2eTdo38/UMSzi5ACWWn2j1iGQZf/3+J2LcdlixZarZr/2DN56
# 4axmwF6+GKozt5+EnvWtgodDn6U9iyMNaAB3CGBHFHsH8uqKeZd/Ii754q4Rcmy9
# ZufBOPWm9Ff7s2MMFiAZvso75jP2wuwVEe1YPRjeJnsNSNIJ6WZfemh3Sl96yRBb
# r38uqzqetKwl7HziMMWP3yb8v+dU8A9bqI1hf1FZGttfFz3XA+pmjXKA6XxdfiZF
# AAotu5x9w86a08sAlr/qVsZFLR37oQykkXM0D840DafJDyr5fbJiq8cwfOjMw9+D
# w6+udRm5KoBWPsvb/T3dR88GRMO22PChjH9Vjl51TstMNhdTxuKJTKhhSoUFZbXV
# 8CMjwfALk5ggIOyCk1LRd04ed+9qkqgcbw1Guy5pYnyPnY/X6XurxxaxS6Gemgtn
# UvgRYhSjio+LgHLO77IVkWJMooTEPzUTty2Zxa7ldbbE+utPUtsmac9+1m2pnpqk
# 5VQmB074QnsJuvf+7HPU6vYCzQWoXHsH1UY/A0fF7MPedNUAbVYzKrdGPyqEMqHy
# xbilAIaS3oO0pMT6kUpRv5c5vjbwkx94Nf/ii8fQVjWzPfCcaF3yEfaam62jMUku
# stySaRpavKIx2oYLlucBqeKaBGaUofk13gGTQlsFs8pKCOAV7r4=
# =s0fN
# -----END PGP SIGNATURE-----
# gpg: Signature made Wed 18 Oct 2023 04:16:06 EDT
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@redhat.com>" [unknown]
# gpg:                 aka "Cédric Le Goater <clg@kaod.org>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B  0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20231018' of https://github.com/legoater/qemu: (22 commits)
  hw/vfio: add ramfb migration support
  ramfb-standalone: add migration support
  ramfb: add migration support
  vfio/pci: Remove vfio_detach_device from vfio_realize error path
  vfio/ccw: Remove redundant definition of TYPE_VFIO_CCW
  vfio/ap: Remove pointless apdev variable
  vfio/pci: Fix a potential memory leak in vfio_listener_region_add
  vfio/common: Move legacy VFIO backend code into separate container.c
  vfio/common: Introduce a global VFIODevice list
  vfio/common: Store the parent container in VFIODevice
  vfio/common: Introduce a per container device list
  vfio/common: Move VFIO reset handler registration to a group agnostic function
  vfio/ccw: Use vfio_[attach/detach]_device
  vfio/ap: Use vfio_[attach/detach]_device
  vfio/platform: Use vfio_[attach/detach]_device
  vfio/pci: Introduce vfio_[attach/detach]_device
  vfio/common: Extract out vfio_kvm_device_[add/del]_fd
  vfio/common: Introduce vfio_container_add|del_section_window()
  vfio/common: Propagate KVM_SET_DEVICE_ATTR error if any
  vfio/common: Move IOMMU agnostic helpers to a separate file
  ...

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Stefan Hajnoczi 2023-10-18 06:21:15 -04:00
commit deaca3fd30
20 changed files with 2576 additions and 1976 deletions

View File

@@ -34,6 +34,8 @@
GlobalProperty hw_compat_8_1[] = {
{ TYPE_PCI_BRIDGE, "x-pci-express-writeable-slt-bug", "true" },
{ "ramfb", "x-migrate", "off" },
{ "vfio-pci-nohotplug", "x-ramfb-migrate", "off" }
};
const size_t hw_compat_8_1_len = G_N_ELEMENTS(hw_compat_8_1);

View File

@@ -1,4 +1,5 @@
#include "qemu/osdep.h"
#include "migration/vmstate.h"
#include "qapi/error.h"
#include "qemu/module.h"
#include "hw/loader.h"
@@ -15,6 +16,7 @@ struct RAMFBStandaloneState {
SysBusDevice parent_obj;
QemuConsole *con;
RAMFBState *state;
bool migrate;
};
static void display_update_wrapper(void *dev)
@@ -40,14 +42,39 @@ static void ramfb_realizefn(DeviceState *dev, Error **errp)
ramfb->state = ramfb_setup(errp);
}
static bool migrate_needed(void *opaque)
{
RAMFBStandaloneState *ramfb = RAMFB(opaque);
return ramfb->migrate;
}
static const VMStateDescription ramfb_dev_vmstate = {
.name = "ramfb-dev",
.version_id = 1,
.minimum_version_id = 1,
.needed = migrate_needed,
.fields = (VMStateField[]) {
VMSTATE_STRUCT_POINTER(state, RAMFBStandaloneState, ramfb_vmstate, RAMFBState),
VMSTATE_END_OF_LIST()
}
};
static Property ramfb_properties[] = {
DEFINE_PROP_BOOL("x-migrate", RAMFBStandaloneState, migrate, true),
DEFINE_PROP_END_OF_LIST(),
};
static void ramfb_class_initfn(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
dc->vmsd = &ramfb_dev_vmstate;
dc->realize = ramfb_realizefn;
dc->desc = "ram framebuffer standalone device";
dc->user_creatable = true;
device_class_set_props(dc, ramfb_properties);
}
static const TypeInfo ramfb_info = {

View File

@@ -28,6 +28,8 @@ struct QEMU_PACKED RAMFBCfg {
uint32_t stride;
};
typedef struct RAMFBCfg RAMFBCfg;
struct RAMFBState {
DisplaySurface *ds;
uint32_t width, height;
@@ -116,6 +118,23 @@ void ramfb_display_update(QemuConsole *con, RAMFBState *s)
dpy_gfx_update_full(con);
}
static int ramfb_post_load(void *opaque, int version_id)
{
ramfb_fw_cfg_write(opaque, 0, 0);
return 0;
}
const VMStateDescription ramfb_vmstate = {
.name = "ramfb",
.version_id = 1,
.minimum_version_id = 1,
.post_load = ramfb_post_load,
.fields = (VMStateField[]) {
VMSTATE_BUFFER_UNSAFE(cfg, RAMFBState, 0, sizeof(RAMFBCfg)),
VMSTATE_END_OF_LIST()
}
};
RAMFBState *ramfb_setup(Error **errp)
{
FWCfgState *fw_cfg = fw_cfg_find();

View File

@@ -53,40 +53,6 @@ struct VFIODeviceOps vfio_ap_ops = {
.vfio_compute_needs_reset = vfio_ap_compute_needs_reset,
};
static void vfio_ap_put_device(VFIOAPDevice *vapdev)
{
g_free(vapdev->vdev.name);
vfio_put_base_device(&vapdev->vdev);
}
static VFIOGroup *vfio_ap_get_group(VFIOAPDevice *vapdev, Error **errp)
{
GError *gerror = NULL;
char *symlink, *group_path;
int groupid;
symlink = g_strdup_printf("%s/iommu_group", vapdev->vdev.sysfsdev);
group_path = g_file_read_link(symlink, &gerror);
g_free(symlink);
if (!group_path) {
error_setg(errp, "%s: no iommu_group found for %s: %s",
TYPE_VFIO_AP_DEVICE, vapdev->vdev.sysfsdev, gerror->message);
g_error_free(gerror);
return NULL;
}
if (sscanf(basename(group_path), "%d", &groupid) != 1) {
error_setg(errp, "vfio: failed to read %s", group_path);
g_free(group_path);
return NULL;
}
g_free(group_path);
return vfio_get_group(groupid, &address_space_memory, errp);
}
static void vfio_ap_req_notifier_handler(void *opaque)
{
VFIOAPDevice *vapdev = opaque;
@@ -189,22 +155,14 @@ static void vfio_ap_unregister_irq_notifier(VFIOAPDevice *vapdev,
static void vfio_ap_realize(DeviceState *dev, Error **errp)
{
int ret;
char *mdevid;
Error *err = NULL;
VFIOGroup *vfio_group;
APDevice *apdev = AP_DEVICE(dev);
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(apdev);
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
VFIODevice *vbasedev = &vapdev->vdev;
vfio_group = vfio_ap_get_group(vapdev, errp);
if (!vfio_group) {
return;
}
vapdev->vdev.ops = &vfio_ap_ops;
vapdev->vdev.type = VFIO_DEVICE_TYPE_AP;
mdevid = basename(vapdev->vdev.sysfsdev);
vapdev->vdev.name = g_strdup_printf("%s", mdevid);
vapdev->vdev.dev = dev;
vbasedev->name = g_path_get_basename(vbasedev->sysfsdev);
vbasedev->ops = &vfio_ap_ops;
vbasedev->type = VFIO_DEVICE_TYPE_AP;
vbasedev->dev = dev;
/*
* vfio-ap devices operate in a way compatible with discarding of
@@ -214,9 +172,10 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
*/
vapdev->vdev.ram_block_discard_allowed = true;
ret = vfio_get_device(vfio_group, mdevid, &vapdev->vdev, errp);
ret = vfio_attach_device(vbasedev->name, vbasedev,
&address_space_memory, errp);
if (ret) {
goto out_get_dev_err;
goto error;
}
vfio_ap_register_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX, &err);
@@ -230,20 +189,18 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
return;
out_get_dev_err:
vfio_ap_put_device(vapdev);
vfio_put_group(vfio_group);
error:
error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
g_free(vbasedev->name);
}
static void vfio_ap_unrealize(DeviceState *dev)
{
APDevice *apdev = AP_DEVICE(dev);
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(apdev);
VFIOGroup *group = vapdev->vdev.group;
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
vfio_ap_unregister_irq_notifier(vapdev, VFIO_AP_REQ_IRQ_INDEX);
vfio_ap_put_device(vapdev);
vfio_put_group(group);
vfio_detach_device(&vapdev->vdev);
g_free(vapdev->vdev.name);
}
static Property vfio_ap_properties[] = {
@@ -254,8 +211,7 @@ static Property vfio_ap_properties[] = {
static void vfio_ap_reset(DeviceState *dev)
{
int ret;
APDevice *apdev = AP_DEVICE(dev);
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(apdev);
VFIOAPDevice *vapdev = VFIO_AP_DEVICE(dev);
ret = ioctl(vapdev->vdev.fd, VFIO_DEVICE_RESET);
if (ret) {

View File

@@ -572,88 +572,14 @@ static void vfio_ccw_put_region(VFIOCCWDevice *vcdev)
g_free(vcdev->io_region);
}
static void vfio_ccw_put_device(VFIOCCWDevice *vcdev)
{
g_free(vcdev->vdev.name);
vfio_put_base_device(&vcdev->vdev);
}
static void vfio_ccw_get_device(VFIOGroup *group, VFIOCCWDevice *vcdev,
Error **errp)
{
S390CCWDevice *cdev = S390_CCW_DEVICE(vcdev);
char *name = g_strdup_printf("%x.%x.%04x", cdev->hostid.cssid,
cdev->hostid.ssid,
cdev->hostid.devid);
VFIODevice *vbasedev;
QLIST_FOREACH(vbasedev, &group->device_list, next) {
if (strcmp(vbasedev->name, name) == 0) {
error_setg(errp, "vfio: subchannel %s has already been attached",
name);
goto out_err;
}
}
/*
* All vfio-ccw devices are believed to operate in a way compatible with
* discarding of memory in RAM blocks, ie. pages pinned in the host are
* in the current working set of the guest driver and therefore never
* overlap e.g., with pages available to the guest balloon driver. This
* needs to be set before vfio_get_device() for vfio common to handle
* ram_block_discard_disable().
*/
vcdev->vdev.ram_block_discard_allowed = true;
if (vfio_get_device(group, cdev->mdevid, &vcdev->vdev, errp)) {
goto out_err;
}
vcdev->vdev.ops = &vfio_ccw_ops;
vcdev->vdev.type = VFIO_DEVICE_TYPE_CCW;
vcdev->vdev.name = name;
vcdev->vdev.dev = DEVICE(vcdev);
return;
out_err:
g_free(name);
}
static VFIOGroup *vfio_ccw_get_group(S390CCWDevice *cdev, Error **errp)
{
char *tmp, group_path[PATH_MAX];
ssize_t len;
int groupid;
tmp = g_strdup_printf("/sys/bus/css/devices/%x.%x.%04x/%s/iommu_group",
cdev->hostid.cssid, cdev->hostid.ssid,
cdev->hostid.devid, cdev->mdevid);
len = readlink(tmp, group_path, sizeof(group_path));
g_free(tmp);
if (len <= 0 || len >= sizeof(group_path)) {
error_setg(errp, "vfio: no iommu_group found");
return NULL;
}
group_path[len] = 0;
if (sscanf(basename(group_path), "%d", &groupid) != 1) {
error_setg(errp, "vfio: failed to read %s", group_path);
return NULL;
}
return vfio_get_group(groupid, &address_space_memory, errp);
}
static void vfio_ccw_realize(DeviceState *dev, Error **errp)
{
VFIOGroup *group;
S390CCWDevice *cdev = S390_CCW_DEVICE(dev);
VFIOCCWDevice *vcdev = VFIO_CCW(cdev);
S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
VFIODevice *vbasedev = &vcdev->vdev;
Error *err = NULL;
int ret;
/* Call the class init function for subchannel. */
if (cdc->realize) {
@@ -663,14 +589,27 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
}
}
group = vfio_ccw_get_group(cdev, &err);
if (!group) {
goto out_group_err;
}
vbasedev->ops = &vfio_ccw_ops;
vbasedev->type = VFIO_DEVICE_TYPE_CCW;
vbasedev->name = g_strdup_printf("%x.%x.%04x", vcdev->cdev.hostid.cssid,
vcdev->cdev.hostid.ssid,
vcdev->cdev.hostid.devid);
vbasedev->dev = dev;
vfio_ccw_get_device(group, vcdev, &err);
if (err) {
goto out_device_err;
/*
* All vfio-ccw devices are believed to operate in a way compatible with
* discarding of memory in RAM blocks, ie. pages pinned in the host are
* in the current working set of the guest driver and therefore never
* overlap e.g., with pages available to the guest balloon driver. This
* needs to be set before vfio_get_device() for vfio common to handle
* ram_block_discard_disable().
*/
vbasedev->ram_block_discard_allowed = true;
ret = vfio_attach_device(cdev->mdevid, vbasedev,
&address_space_memory, errp);
if (ret) {
goto out_attach_dev_err;
}
vfio_ccw_get_region(vcdev, &err);
@@ -708,10 +647,9 @@ out_irq_notifier_err:
out_io_notifier_err:
vfio_ccw_put_region(vcdev);
out_region_err:
vfio_ccw_put_device(vcdev);
out_device_err:
vfio_put_group(group);
out_group_err:
vfio_detach_device(vbasedev);
out_attach_dev_err:
g_free(vbasedev->name);
if (cdc->unrealize) {
cdc->unrealize(cdev);
}
@@ -724,14 +662,13 @@ static void vfio_ccw_unrealize(DeviceState *dev)
S390CCWDevice *cdev = S390_CCW_DEVICE(dev);
VFIOCCWDevice *vcdev = VFIO_CCW(cdev);
S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev);
VFIOGroup *group = vcdev->vdev.group;
vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_REQ_IRQ_INDEX);
vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX);
vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX);
vfio_ccw_put_region(vcdev);
vfio_ccw_put_device(vcdev);
vfio_put_group(group);
vfio_detach_device(&vcdev->vdev);
g_free(vcdev->vdev.name);
if (cdc->unrealize) {
cdc->unrealize(cdev);

File diff suppressed because it is too large

hw/vfio/container.c (new file, 1161 lines)

File diff suppressed because it is too large

View File

@@ -544,3 +544,24 @@ void vfio_display_finalize(VFIOPCIDevice *vdev)
vfio_display_edid_exit(vdev->dpy);
g_free(vdev->dpy);
}
static bool migrate_needed(void *opaque)
{
VFIODisplay *dpy = opaque;
bool ramfb_exists = dpy->ramfb != NULL;
/* see vfio_display_migration_needed() */
assert(ramfb_exists);
return ramfb_exists;
}
const VMStateDescription vfio_display_vmstate = {
.name = "VFIODisplay",
.version_id = 1,
.minimum_version_id = 1,
.needed = migrate_needed,
.fields = (VMStateField[]) {
VMSTATE_STRUCT_POINTER(ramfb, VFIODisplay, ramfb_vmstate, RAMFBState),
VMSTATE_END_OF_LIST(),
}
};

hw/vfio/helpers.c (new file, 612 lines)
View File

@@ -0,0 +1,612 @@
/*
* low level and IOMMU backend agnostic helpers used by VFIO devices,
* related to regions, interrupts, capabilities
*
* Copyright Red Hat, Inc. 2012
*
* Authors:
* Alex Williamson <alex.williamson@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
* Based on qemu-kvm device-assignment:
* Adapted for KVM by Qumranet.
* Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
* Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
* Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
* Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
* Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
*/
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include "hw/vfio/vfio-common.h"
#include "hw/vfio/vfio.h"
#include "hw/hw.h"
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
/*
* Common VFIO interrupt disable
*/
void vfio_disable_irqindex(VFIODevice *vbasedev, int index)
{
struct vfio_irq_set irq_set = {
.argsz = sizeof(irq_set),
.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
.index = index,
.start = 0,
.count = 0,
};
ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index)
{
struct vfio_irq_set irq_set = {
.argsz = sizeof(irq_set),
.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
.index = index,
.start = 0,
.count = 1,
};
ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index)
{
struct vfio_irq_set irq_set = {
.argsz = sizeof(irq_set),
.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK,
.index = index,
.start = 0,
.count = 1,
};
ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
}
static inline const char *action_to_str(int action)
{
switch (action) {
case VFIO_IRQ_SET_ACTION_MASK:
return "MASK";
case VFIO_IRQ_SET_ACTION_UNMASK:
return "UNMASK";
case VFIO_IRQ_SET_ACTION_TRIGGER:
return "TRIGGER";
default:
return "UNKNOWN ACTION";
}
}
static const char *index_to_str(VFIODevice *vbasedev, int index)
{
if (vbasedev->type != VFIO_DEVICE_TYPE_PCI) {
return NULL;
}
switch (index) {
case VFIO_PCI_INTX_IRQ_INDEX:
return "INTX";
case VFIO_PCI_MSI_IRQ_INDEX:
return "MSI";
case VFIO_PCI_MSIX_IRQ_INDEX:
return "MSIX";
case VFIO_PCI_ERR_IRQ_INDEX:
return "ERR";
case VFIO_PCI_REQ_IRQ_INDEX:
return "REQ";
default:
return NULL;
}
}
int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex,
int action, int fd, Error **errp)
{
struct vfio_irq_set *irq_set;
int argsz, ret = 0;
const char *name;
int32_t *pfd;
argsz = sizeof(*irq_set) + sizeof(*pfd);
irq_set = g_malloc0(argsz);
irq_set->argsz = argsz;
irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | action;
irq_set->index = index;
irq_set->start = subindex;
irq_set->count = 1;
pfd = (int32_t *)&irq_set->data;
*pfd = fd;
if (ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) {
ret = -errno;
}
g_free(irq_set);
if (!ret) {
return 0;
}
error_setg_errno(errp, -ret, "VFIO_DEVICE_SET_IRQS failure");
name = index_to_str(vbasedev, index);
if (name) {
error_prepend(errp, "%s-%d: ", name, subindex);
} else {
error_prepend(errp, "index %d-%d: ", index, subindex);
}
error_prepend(errp,
"Failed to %s %s eventfd signaling for interrupt ",
fd < 0 ? "tear down" : "set up", action_to_str(action));
return ret;
}
/*
* IO Port/MMIO - Beware of the endians, VFIO is always little endian
*/
void vfio_region_write(void *opaque, hwaddr addr,
uint64_t data, unsigned size)
{
VFIORegion *region = opaque;
VFIODevice *vbasedev = region->vbasedev;
union {
uint8_t byte;
uint16_t word;
uint32_t dword;
uint64_t qword;
} buf;
switch (size) {
case 1:
buf.byte = data;
break;
case 2:
buf.word = cpu_to_le16(data);
break;
case 4:
buf.dword = cpu_to_le32(data);
break;
case 8:
buf.qword = cpu_to_le64(data);
break;
default:
hw_error("vfio: unsupported write size, %u bytes", size);
break;
}
if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
",%d) failed: %m",
__func__, vbasedev->name, region->nr,
addr, data, size);
}
trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);
/*
* A read or write to a BAR always signals an INTx EOI. This will
* do nothing if not pending (including not in INTx mode). We assume
* that a BAR access is in response to an interrupt and that BAR
* accesses will service the interrupt. Unfortunately, we don't know
* which access will service the interrupt, so we're potentially
* getting quite a few host interrupts per guest interrupt.
*/
vbasedev->ops->vfio_eoi(vbasedev);
}
uint64_t vfio_region_read(void *opaque,
hwaddr addr, unsigned size)
{
VFIORegion *region = opaque;
VFIODevice *vbasedev = region->vbasedev;
union {
uint8_t byte;
uint16_t word;
uint32_t dword;
uint64_t qword;
} buf;
uint64_t data = 0;
if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
__func__, vbasedev->name, region->nr,
addr, size);
return (uint64_t)-1;
}
switch (size) {
case 1:
data = buf.byte;
break;
case 2:
data = le16_to_cpu(buf.word);
break;
case 4:
data = le32_to_cpu(buf.dword);
break;
case 8:
data = le64_to_cpu(buf.qword);
break;
default:
hw_error("vfio: unsupported read size, %u bytes", size);
break;
}
trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);
/* Same as write above */
vbasedev->ops->vfio_eoi(vbasedev);
return data;
}
const MemoryRegionOps vfio_region_ops = {
.read = vfio_region_read,
.write = vfio_region_write,
.endianness = DEVICE_LITTLE_ENDIAN,
.valid = {
.min_access_size = 1,
.max_access_size = 8,
},
.impl = {
.min_access_size = 1,
.max_access_size = 8,
},
};
int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
{
vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
BITS_PER_BYTE;
vbmap->bitmap = g_try_malloc0(vbmap->size);
if (!vbmap->bitmap) {
return -ENOMEM;
}
return 0;
}
struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id)
{
struct vfio_info_cap_header *hdr;
for (hdr = ptr + cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
if (hdr->id == id) {
return hdr;
}
}
return NULL;
}
struct vfio_info_cap_header *
vfio_get_region_info_cap(struct vfio_region_info *info, uint16_t id)
{
if (!(info->flags & VFIO_REGION_INFO_FLAG_CAPS)) {
return NULL;
}
return vfio_get_cap((void *)info, info->cap_offset, id);
}
struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id)
{
if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS)) {
return NULL;
}
return vfio_get_cap((void *)info, info->cap_offset, id);
}
static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
struct vfio_region_info *info)
{
struct vfio_info_cap_header *hdr;
struct vfio_region_info_cap_sparse_mmap *sparse;
int i, j;
hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
if (!hdr) {
return -ENODEV;
}
sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);
trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
region->nr, sparse->nr_areas);
region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);
for (i = 0, j = 0; i < sparse->nr_areas; i++) {
if (sparse->areas[i].size) {
trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
sparse->areas[i].offset +
sparse->areas[i].size - 1);
region->mmaps[j].offset = sparse->areas[i].offset;
region->mmaps[j].size = sparse->areas[i].size;
j++;
}
}
region->nr_mmaps = j;
region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));
return 0;
}
int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
int index, const char *name)
{
struct vfio_region_info *info;
int ret;
ret = vfio_get_region_info(vbasedev, index, &info);
if (ret) {
return ret;
}
region->vbasedev = vbasedev;
region->flags = info->flags;
region->size = info->size;
region->fd_offset = info->offset;
region->nr = index;
if (region->size) {
region->mem = g_new0(MemoryRegion, 1);
memory_region_init_io(region->mem, obj, &vfio_region_ops,
region, name, region->size);
if (!vbasedev->no_mmap &&
region->flags & VFIO_REGION_INFO_FLAG_MMAP) {
ret = vfio_setup_region_sparse_mmaps(region, info);
if (ret) {
region->nr_mmaps = 1;
region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
region->mmaps[0].offset = 0;
region->mmaps[0].size = region->size;
}
}
}
g_free(info);
trace_vfio_region_setup(vbasedev->name, index, name,
region->flags, region->fd_offset, region->size);
return 0;
}
static void vfio_subregion_unmap(VFIORegion *region, int index)
{
trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
region->mmaps[index].offset,
region->mmaps[index].offset +
region->mmaps[index].size - 1);
memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
munmap(region->mmaps[index].mmap, region->mmaps[index].size);
object_unparent(OBJECT(&region->mmaps[index].mem));
region->mmaps[index].mmap = NULL;
}
int vfio_region_mmap(VFIORegion *region)
{
int i, prot = 0;
char *name;
if (!region->mem) {
return 0;
}
prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;
for (i = 0; i < region->nr_mmaps; i++) {
region->mmaps[i].mmap = mmap(NULL, region->mmaps[i].size, prot,
MAP_SHARED, region->vbasedev->fd,
region->fd_offset +
region->mmaps[i].offset);
if (region->mmaps[i].mmap == MAP_FAILED) {
int ret = -errno;
trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
region->fd_offset +
region->mmaps[i].offset,
region->fd_offset +
region->mmaps[i].offset +
region->mmaps[i].size - 1, ret);
region->mmaps[i].mmap = NULL;
for (i--; i >= 0; i--) {
vfio_subregion_unmap(region, i);
}
return ret;
}
name = g_strdup_printf("%s mmaps[%d]",
memory_region_name(region->mem), i);
memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
memory_region_owner(region->mem),
name, region->mmaps[i].size,
region->mmaps[i].mmap);
g_free(name);
memory_region_add_subregion(region->mem, region->mmaps[i].offset,
&region->mmaps[i].mem);
trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
region->mmaps[i].offset,
region->mmaps[i].offset +
region->mmaps[i].size - 1);
}
return 0;
}
void vfio_region_unmap(VFIORegion *region)
{
int i;
if (!region->mem) {
return;
}
for (i = 0; i < region->nr_mmaps; i++) {
if (region->mmaps[i].mmap) {
vfio_subregion_unmap(region, i);
}
}
}
void vfio_region_exit(VFIORegion *region)
{
int i;
if (!region->mem) {
return;
}
for (i = 0; i < region->nr_mmaps; i++) {
if (region->mmaps[i].mmap) {
memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
}
}
trace_vfio_region_exit(region->vbasedev->name, region->nr);
}
void vfio_region_finalize(VFIORegion *region)
{
int i;
if (!region->mem) {
return;
}
for (i = 0; i < region->nr_mmaps; i++) {
if (region->mmaps[i].mmap) {
munmap(region->mmaps[i].mmap, region->mmaps[i].size);
object_unparent(OBJECT(&region->mmaps[i].mem));
}
}
object_unparent(OBJECT(region->mem));
g_free(region->mem);
g_free(region->mmaps);
trace_vfio_region_finalize(region->vbasedev->name, region->nr);
region->mem = NULL;
region->mmaps = NULL;
region->nr_mmaps = 0;
region->size = 0;
region->flags = 0;
region->nr = 0;
}
void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
int i;
if (!region->mem) {
return;
}
for (i = 0; i < region->nr_mmaps; i++) {
if (region->mmaps[i].mmap) {
memory_region_set_enabled(&region->mmaps[i].mem, enabled);
}
}
trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
enabled);
}
int vfio_get_region_info(VFIODevice *vbasedev, int index,
struct vfio_region_info **info)
{
size_t argsz = sizeof(struct vfio_region_info);
*info = g_malloc0(argsz);
(*info)->index = index;
retry:
(*info)->argsz = argsz;
if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) {
g_free(*info);
*info = NULL;
return -errno;
}
if ((*info)->argsz > argsz) {
argsz = (*info)->argsz;
*info = g_realloc(*info, argsz);
goto retry;
}
return 0;
}
int vfio_get_dev_region_info(VFIODevice *vbasedev, uint32_t type,
uint32_t subtype, struct vfio_region_info **info)
{
int i;
for (i = 0; i < vbasedev->num_regions; i++) {
struct vfio_info_cap_header *hdr;
struct vfio_region_info_cap_type *cap_type;
if (vfio_get_region_info(vbasedev, i, info)) {
continue;
}
hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE);
if (!hdr) {
g_free(*info);
continue;
}
cap_type = container_of(hdr, struct vfio_region_info_cap_type, header);
trace_vfio_get_dev_region(vbasedev->name, i,
cap_type->type, cap_type->subtype);
if (cap_type->type == type && cap_type->subtype == subtype) {
return 0;
}
g_free(*info);
}
*info = NULL;
return -ENODEV;
}
bool vfio_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type)
{
struct vfio_region_info *info = NULL;
bool ret = false;
if (!vfio_get_region_info(vbasedev, region, &info)) {
if (vfio_get_region_info_cap(info, cap_type)) {
ret = true;
}
g_free(info);
}
return ret;
}

View File

@@ -1,6 +1,8 @@
vfio_ss = ss.source_set()
vfio_ss.add(files(
'helpers.c',
'common.c',
'container.c',
'spapr.c',
'migration.c',
))

View File

@@ -2675,6 +2675,33 @@ static bool vfio_msix_present(void *opaque, int version_id)
return msix_present(pdev);
}
static bool vfio_display_migration_needed(void *opaque)
{
VFIOPCIDevice *vdev = opaque;
/*
* We need to migrate the VFIODisplay object if ramfb *migration* was
* explicitly requested (in which case we enforced both ramfb=on and
* display=on), or ramfb migration was left at the default "auto"
* setting, and *ramfb* was explicitly requested (in which case we
* enforced display=on).
*/
return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
(vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
}
const VMStateDescription vmstate_vfio_display = {
.name = "VFIOPCIDevice/VFIODisplay",
.version_id = 1,
.minimum_version_id = 1,
.needed = vfio_display_migration_needed,
.fields = (VMStateField[]){
VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
VFIODisplay),
VMSTATE_END_OF_LIST()
}
};
const VMStateDescription vmstate_vfio_pci_config = {
.name = "VFIOPCIDevice",
.version_id = 1,
@@ -2683,6 +2710,10 @@ const VMStateDescription vmstate_vfio_pci_config = {
VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
VMSTATE_END_OF_LIST()
},
.subsections = (const VMStateDescription * []) {
&vmstate_vfio_display,
NULL
}
};
@@ -2895,10 +2926,10 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
static void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
vfio_detach_device(&vdev->vbasedev);
g_free(vdev->vbasedev.name);
g_free(vdev->msix);
vfio_put_base_device(&vdev->vbasedev);
}
static void vfio_err_notifier_handler(void *opaque)
@@ -3045,13 +3076,9 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
{
VFIOPCIDevice *vdev = VFIO_PCI(pdev);
VFIODevice *vbasedev = &vdev->vbasedev;
VFIODevice *vbasedev_iter;
VFIOGroup *group;
char *tmp, *subsys, group_path[PATH_MAX], *group_name;
char *tmp, *subsys;
Error *err = NULL;
ssize_t len;
struct stat st;
int groupid;
int i, ret;
bool is_mdev;
char uuid[UUID_FMT_LEN];
@@ -3082,39 +3109,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
vbasedev->type = VFIO_DEVICE_TYPE_PCI;
vbasedev->dev = DEVICE(vdev);
tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
len = readlink(tmp, group_path, sizeof(group_path));
g_free(tmp);
if (len <= 0 || len >= sizeof(group_path)) {
error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG,
"no iommu_group found");
goto error;
}
group_path[len] = 0;
group_name = basename(group_path);
if (sscanf(group_name, "%d", &groupid) != 1) {
error_setg_errno(errp, errno, "failed to read %s", group_path);
goto error;
}
trace_vfio_realize(vbasedev->name, groupid);
group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
if (!group) {
goto error;
}
QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
error_setg(errp, "device is already attached");
vfio_put_group(group);
goto error;
}
}
/*
* Mediated devices *might* operate compatibly with discarding of RAM, but
* we cannot know for certain, it depends on whether the mdev vendor driver
@@ -3132,7 +3126,6 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
if (vbasedev->ram_block_discard_allowed && !is_mdev) {
error_setg(errp, "x-balloon-allowed only potentially compatible "
"with mdev devices");
vfio_put_group(group);
goto error;
}
@@ -3143,10 +3136,10 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
name = g_strdup(vbasedev->name);
}
ret = vfio_get_device(group, name, vbasedev, errp);
ret = vfio_attach_device(name, vbasedev,
pci_device_iommu_address_space(pdev), errp);
g_free(name);
if (ret) {
vfio_put_group(group);
goto error;
}
@@ -3338,6 +3331,20 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
}
if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
warn_report("x-ramfb-migrate=on but ramfb=off. "
"Forcing x-ramfb-migrate to off.");
vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
}
if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
} else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
error_setg(errp, "x-ramfb-migrate requires enable-migration");
goto out_deregister;
}
}
if (!pdev->failover_pair_id) {
if (!vfio_migration_realize(vbasedev, errp)) {
goto out_deregister;
@@ -3371,7 +3378,6 @@ error:
static void vfio_instance_finalize(Object *obj)
{
VFIOPCIDevice *vdev = VFIO_PCI(obj);
VFIOGroup *group = vdev->vbasedev.group;
vfio_display_finalize(vdev);
vfio_bars_finalize(vdev);
@@ -3385,7 +3391,6 @@ static void vfio_instance_finalize(Object *obj)
* g_free(vdev->igd_opregion);
*/
vfio_pci_put_device(vdev);
vfio_put_group(group);
}
static void vfio_exitfn(PCIDevice *pdev)
@@ -3551,6 +3556,8 @@ static const TypeInfo vfio_pci_dev_info = {
static Property vfio_pci_dev_nohotplug_properties[] = {
DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
ON_OFF_AUTO_AUTO),
DEFINE_PROP_END_OF_LIST(),
};

View File

@@ -174,6 +174,7 @@ struct VFIOPCIDevice {
bool no_kvm_ioeventfd;
bool no_vfio_ioeventfd;
bool enable_ramfb;
OnOffAuto ramfb_migrate;
bool defer_kvm_irq_routing;
bool clear_parent_atomics_on_exit;
VFIODisplay *dpy;
@@ -227,4 +228,6 @@ void vfio_display_reset(VFIOPCIDevice *vdev);
int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
void vfio_display_finalize(VFIOPCIDevice *vdev);
extern const VMStateDescription vfio_display_vmstate;
#endif /* HW_VFIO_VFIO_PCI_H */

View File

@@ -529,12 +529,7 @@ static VFIODeviceOps vfio_platform_ops = {
*/
static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
{
VFIOGroup *group;
VFIODevice *vbasedev_iter;
char *tmp, group_path[PATH_MAX], *group_name;
ssize_t len;
struct stat st;
int groupid;
int ret;
/* @sysfsdev takes precedence over @host */
@@ -557,47 +552,15 @@ static int vfio_base_device_init(VFIODevice *vbasedev, Error **errp)
return -errno;
}
tmp = g_strdup_printf("%s/iommu_group", vbasedev->sysfsdev);
len = readlink(tmp, group_path, sizeof(group_path));
g_free(tmp);
if (len < 0 || len >= sizeof(group_path)) {
ret = len < 0 ? -errno : -ENAMETOOLONG;
error_setg_errno(errp, -ret, "no iommu_group found");
return ret;
}
group_path[len] = 0;
group_name = basename(group_path);
if (sscanf(group_name, "%d", &groupid) != 1) {
error_setg_errno(errp, errno, "failed to read %s", group_path);
return -errno;
}
trace_vfio_platform_base_device_init(vbasedev->name, groupid);
group = vfio_get_group(groupid, &address_space_memory, errp);
if (!group) {
return -ENOENT;
}
QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
error_setg(errp, "device is already attached");
vfio_put_group(group);
return -EBUSY;
}
}
ret = vfio_get_device(group, vbasedev->name, vbasedev, errp);
ret = vfio_attach_device(vbasedev->name, vbasedev,
&address_space_memory, errp);
if (ret) {
vfio_put_group(group);
return ret;
}
ret = vfio_populate_device(vbasedev, errp);
if (ret) {
vfio_put_group(group);
vfio_detach_device(vbasedev);
}
return ret;

View File

@@ -37,7 +37,8 @@ vfio_pci_hot_reset_dep_devices(int domain, int bus, int slot, int function, int
vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: %s"
vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx"
vfio_populate_device_get_irq_info_failure(const char *errstr) "VFIO_DEVICE_GET_IRQ_INFO failure: %s"
vfio_realize(const char *name, int group_id) " (%s) group %d"
vfio_attach_device(const char *name, int group_id) " (%s) group %d"
vfio_detach_device(const char *name, int group_id) " (%s) group %d"
vfio_mdev(const char *name, bool is_mdev) " (%s) is_mdev %d"
vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s 0x%x@0x%x"
vfio_pci_reset(const char *name) " (%s)"
@@ -120,7 +121,6 @@ vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64
# platform.c
vfio_platform_base_device_init(char *name, int groupid) "%s belongs to group #%d"
vfio_platform_realize(char *name, char *compat) "vfio device %s, compat = %s"
vfio_platform_eoi(int pin, int fd) "EOI IRQ pin %d (fd=%d)"
vfio_platform_intp_mmap_enable(int pin) "IRQ #%d still active, stay in slow path"

View File

@@ -1,11 +1,15 @@
#ifndef RAMFB_H
#define RAMFB_H
#include "migration/vmstate.h"
/* ramfb.c */
typedef struct RAMFBState RAMFBState;
void ramfb_display_update(QemuConsole *con, RAMFBState *s);
RAMFBState *ramfb_setup(Error **errp);
extern const VMStateDescription ramfb_vmstate;
/* ramfb-standalone.c */
#define TYPE_RAMFB_DEVICE "ramfb"

View File

@@ -22,6 +22,4 @@
#define TYPE_VFIO_CCW "vfio-ccw"
OBJECT_DECLARE_SIMPLE_TYPE(VFIOCCWDevice, VFIO_CCW)
#define TYPE_VFIO_CCW "vfio-ccw"
#endif

View File

@@ -98,6 +98,7 @@ typedef struct VFIOContainer {
QLIST_HEAD(, VFIOGroup) group_list;
QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
QLIST_ENTRY(VFIOContainer) next;
QLIST_HEAD(, VFIODevice) device_list;
} VFIOContainer;
typedef struct VFIOGuestIOMMU {
@@ -129,7 +130,10 @@ typedef struct VFIODeviceOps VFIODeviceOps;
typedef struct VFIODevice {
QLIST_ENTRY(VFIODevice) next;
QLIST_ENTRY(VFIODevice) container_next;
QLIST_ENTRY(VFIODevice) global_next;
struct VFIOGroup *group;
VFIOContainer *container;
char *sysfsdev;
char *name;
DeviceState *dev;
@@ -196,7 +200,36 @@ typedef struct VFIODisplay {
} dmabuf;
} VFIODisplay;
void vfio_put_base_device(VFIODevice *vbasedev);
typedef struct {
unsigned long *bitmap;
hwaddr size;
hwaddr pages;
} VFIOBitmap;
void vfio_host_win_add(VFIOContainer *container,
hwaddr min_iova, hwaddr max_iova,
uint64_t iova_pgsizes);
int vfio_host_win_del(VFIOContainer *container, hwaddr min_iova,
hwaddr max_iova);
VFIOAddressSpace *vfio_get_address_space(AddressSpace *as);
void vfio_put_address_space(VFIOAddressSpace *space);
bool vfio_devices_all_running_and_saving(VFIOContainer *container);
/* container->fd */
int vfio_dma_unmap(VFIOContainer *container, hwaddr iova,
ram_addr_t size, IOMMUTLBEntry *iotlb);
int vfio_dma_map(VFIOContainer *container, hwaddr iova,
ram_addr_t size, void *vaddr, bool readonly);
int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start);
int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size);
int vfio_container_add_section_window(VFIOContainer *container,
MemoryRegionSection *section,
Error **errp);
void vfio_container_del_section_window(VFIOContainer *container,
MemoryRegionSection *section);
void vfio_disable_irqindex(VFIODevice *vbasedev, int index);
void vfio_unmask_single_irqindex(VFIODevice *vbasedev, int index);
void vfio_mask_single_irqindex(VFIODevice *vbasedev, int index);
@@ -214,15 +247,22 @@ void vfio_region_unmap(VFIORegion *region);
void vfio_region_exit(VFIORegion *region);
void vfio_region_finalize(VFIORegion *region);
void vfio_reset_handler(void *opaque);
VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
void vfio_put_group(VFIOGroup *group);
struct vfio_device_info *vfio_get_device_info(int fd);
int vfio_get_device(VFIOGroup *group, const char *name,
VFIODevice *vbasedev, Error **errp);
int vfio_attach_device(char *name, VFIODevice *vbasedev,
AddressSpace *as, Error **errp);
void vfio_detach_device(VFIODevice *vbasedev);
int vfio_kvm_device_add_fd(int fd, Error **errp);
int vfio_kvm_device_del_fd(int fd, Error **errp);
extern const MemoryRegionOps vfio_region_ops;
typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList;
extern VFIOGroupList vfio_group_list;
extern VFIODeviceList vfio_device_list;
extern const MemoryListener vfio_memory_listener;
extern int vfio_kvm_device_fd;
bool vfio_mig_active(void);
int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
@@ -245,6 +285,8 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info *info,
unsigned int *avail);
struct vfio_info_cap_header *
vfio_get_device_info_cap(struct vfio_device_info *info, uint16_t id);
struct vfio_info_cap_header *
vfio_get_cap(void *ptr, uint32_t cap_offset, uint16_t id);
#endif
extern const MemoryListener vfio_prereg_listener;
@@ -257,4 +299,12 @@ int vfio_spapr_remove_window(VFIOContainer *container,
bool vfio_migration_realize(VFIODevice *vbasedev, Error **errp);
void vfio_migration_exit(VFIODevice *vbasedev);
int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size);
bool vfio_devices_all_running_and_mig_active(VFIOContainer *container);
bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container);
int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size);
int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
uint64_t size, ram_addr_t ram_addr);
#endif /* HW_VFIO_VFIO_COMMON_H */

View File

@@ -0,0 +1,444 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
*/
#ifndef _IOMMUFD_H
#define _IOMMUFD_H
#include <linux/types.h>
#include <linux/ioctl.h>
#define IOMMUFD_TYPE (';')
/**
* DOC: General ioctl format
*
* The ioctl interface follows a general format to allow for extensibility. Each
* ioctl is passed in a structure pointer as the argument providing the size of
* the structure in the first u32. The kernel checks that any structure space
* beyond what it understands is 0. This allows userspace to use the backward
* compatible portion while consistently using the newer, larger, structures.
*
* ioctls use a standard meaning for common errnos:
*
* - ENOTTY: The IOCTL number itself is not supported at all
* - E2BIG: The IOCTL number is supported, but the provided structure has
* non-zero in a part the kernel does not understand.
* - EOPNOTSUPP: The IOCTL number is supported, and the structure is
* understood, however a known field has a value the kernel does not
* understand or support.
* - EINVAL: Everything about the IOCTL was understood, but a field is not
* correct.
* - ENOENT: An ID or IOVA provided does not exist.
* - ENOMEM: Out of memory.
* - EOVERFLOW: Mathematics overflowed.
*
* As well as additional errnos, within specific ioctls.
*/
enum {
IOMMUFD_CMD_BASE = 0x80,
IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE,
IOMMUFD_CMD_IOAS_ALLOC,
IOMMUFD_CMD_IOAS_ALLOW_IOVAS,
IOMMUFD_CMD_IOAS_COPY,
IOMMUFD_CMD_IOAS_IOVA_RANGES,
IOMMUFD_CMD_IOAS_MAP,
IOMMUFD_CMD_IOAS_UNMAP,
IOMMUFD_CMD_OPTION,
IOMMUFD_CMD_VFIO_IOAS,
IOMMUFD_CMD_HWPT_ALLOC,
IOMMUFD_CMD_GET_HW_INFO,
};
/**
* struct iommu_destroy - ioctl(IOMMU_DESTROY)
* @size: sizeof(struct iommu_destroy)
* @id: iommufd object ID to destroy. Can be any destroyable object type.
*
* Destroy any object held within iommufd.
*/
struct iommu_destroy {
__u32 size;
__u32 id;
};
#define IOMMU_DESTROY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_DESTROY)
/**
* struct iommu_ioas_alloc - ioctl(IOMMU_IOAS_ALLOC)
* @size: sizeof(struct iommu_ioas_alloc)
* @flags: Must be 0
* @out_ioas_id: Output IOAS ID for the allocated object
*
* Allocate an IO Address Space (IOAS) which holds an IO Virtual Address (IOVA)
* to memory mapping.
*/
struct iommu_ioas_alloc {
__u32 size;
__u32 flags;
__u32 out_ioas_id;
};
#define IOMMU_IOAS_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOC)
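
Illustration only, a minimal userspace sketch (not part of this header or of the patches above) of the size-prefixed ioctl convention described in the DOC comment, using IOMMU_IOAS_ALLOC. The /dev/iommu device path and the surrounding error handling are assumptions of this sketch.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int ioas_alloc_sketch(void)
{
    /* First u32 carries the structure size, per the general ioctl format. */
    struct iommu_ioas_alloc cmd = {
        .size = sizeof(cmd),
        .flags = 0,                        /* must be 0 */
    };
    int fd = open("/dev/iommu", O_RDWR);   /* assumed iommufd device node */

    if (fd < 0 || ioctl(fd, IOMMU_IOAS_ALLOC, &cmd)) {
        perror("IOMMU_IOAS_ALLOC");
        return -1;
    }
    printf("allocated IOAS id %u\n", cmd.out_ioas_id);
    return fd;                             /* caller keeps the iommufd open */
}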
/**
* struct iommu_iova_range - ioctl(IOMMU_IOVA_RANGE)
* @start: First IOVA
* @last: Inclusive last IOVA
*
* An interval in IOVA space.
*/
struct iommu_iova_range {
__aligned_u64 start;
__aligned_u64 last;
};
/**
* struct iommu_ioas_iova_ranges - ioctl(IOMMU_IOAS_IOVA_RANGES)
* @size: sizeof(struct iommu_ioas_iova_ranges)
* @ioas_id: IOAS ID to read ranges from
* @num_iovas: Input/Output total number of ranges in the IOAS
* @__reserved: Must be 0
* @allowed_iovas: Pointer to the output array of struct iommu_iova_range
* @out_iova_alignment: Minimum alignment required for mapping IOVA
*
* Query an IOAS for ranges of allowed IOVAs. Mapping IOVA outside these ranges
* is not allowed. num_iovas will be set to the total number of iovas and
* the allowed_iovas[] will be filled in as space permits.
*
* The allowed ranges are dependent on the HW path the DMA operation takes, and
* can change during the lifetime of the IOAS. A fresh empty IOAS will have a
* full range, and each attached device will narrow the ranges based on that
* device's HW restrictions. Detaching a device can widen the ranges. Userspace
* should query ranges after every attach/detach to know what IOVAs are valid
* for mapping.
*
* On input num_iovas is the length of the allowed_iovas array. On output it is
* the total number of iovas filled in. The ioctl will return -EMSGSIZE and set
* num_iovas to the required value if num_iovas is too small. In this case the
* caller should allocate a larger output array and re-issue the ioctl.
*
* out_iova_alignment returns the minimum IOVA alignment that can be given
* to IOMMU_IOAS_MAP/COPY. IOVA's must satisfy::
*
* starting_iova % out_iova_alignment == 0
* (starting_iova + length) % out_iova_alignment == 0
*
* out_iova_alignment can be 1 indicating any IOVA is allowed. It cannot
* be higher than the system PAGE_SIZE.
*/
struct iommu_ioas_iova_ranges {
__u32 size;
__u32 ioas_id;
__u32 num_iovas;
__u32 __reserved;
__aligned_u64 allowed_iovas;
__aligned_u64 out_iova_alignment;
};
#define IOMMU_IOAS_IOVA_RANGES _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_IOVA_RANGES)
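
Another illustrative sketch (not part of the header): the num_iovas handshake described above, growing the output array and retrying while the kernel reports EMSGSIZE. The iommufd and ioas_id values are assumed to come from an earlier IOMMU_IOAS_ALLOC, as in the previous sketch.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int query_iova_ranges_sketch(int iommufd, __u32 ioas_id)
{
    struct iommu_iova_range *ranges = NULL;
    struct iommu_ioas_iova_ranges cmd = {
        .size = sizeof(cmd),
        .ioas_id = ioas_id,
        .num_iovas = 0,                    /* start with an empty array */
    };

    while (ioctl(iommufd, IOMMU_IOAS_IOVA_RANGES, &cmd)) {
        if (errno != EMSGSIZE) {
            free(ranges);
            return -errno;
        }
        /* Kernel wrote the required count into num_iovas; grow and retry. */
        ranges = realloc(ranges, cmd.num_iovas * sizeof(*ranges));
        if (!ranges) {
            return -ENOMEM;
        }
        cmd.allowed_iovas = (uintptr_t)ranges;
    }

    for (__u32 i = 0; i < cmd.num_iovas; i++) {
        printf("allowed IOVA range 0x%llx - 0x%llx\n",
               (unsigned long long)ranges[i].start,
               (unsigned long long)ranges[i].last);
    }
    free(ranges);
    return 0;
}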
/**
* struct iommu_ioas_allow_iovas - ioctl(IOMMU_IOAS_ALLOW_IOVAS)
* @size: sizeof(struct iommu_ioas_allow_iovas)
* @ioas_id: IOAS ID to allow IOVAs from
* @num_iovas: Input/Output total number of ranges in the IOAS
* @__reserved: Must be 0
* @allowed_iovas: Pointer to array of struct iommu_iova_range
*
* Ensure a range of IOVAs are always available for allocation. If this call
* succeeds then IOMMU_IOAS_IOVA_RANGES will never return a list of IOVA ranges
* that are narrower than the ranges provided here. This call will fail if
* IOMMU_IOAS_IOVA_RANGES is currently narrower than the given ranges.
*
* When an IOAS is first created the IOVA_RANGES will be maximally sized, and as
* devices are attached the IOVA will narrow based on the device restrictions.
* When an allowed range is specified any narrowing will be refused, ie device
* attachment can fail if the device requires limiting within the allowed range.
*
* Automatic IOVA allocation is also impacted by this call. MAP will only
* allocate within the allowed IOVAs if they are present.
*
* This call replaces the entire allowed list with the given list.
*/
struct iommu_ioas_allow_iovas {
__u32 size;
__u32 ioas_id;
__u32 num_iovas;
__u32 __reserved;
__aligned_u64 allowed_iovas;
};
#define IOMMU_IOAS_ALLOW_IOVAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_ALLOW_IOVAS)
/**
* enum iommufd_ioas_map_flags - Flags for map and copy
* @IOMMU_IOAS_MAP_FIXED_IOVA: If clear the kernel will compute an appropriate
* IOVA to place the mapping at
* @IOMMU_IOAS_MAP_WRITEABLE: DMA is allowed to write to this mapping
* @IOMMU_IOAS_MAP_READABLE: DMA is allowed to read from this mapping
*/
enum iommufd_ioas_map_flags {
IOMMU_IOAS_MAP_FIXED_IOVA = 1 << 0,
IOMMU_IOAS_MAP_WRITEABLE = 1 << 1,
IOMMU_IOAS_MAP_READABLE = 1 << 2,
};
/**
* struct iommu_ioas_map - ioctl(IOMMU_IOAS_MAP)
* @size: sizeof(struct iommu_ioas_map)
* @flags: Combination of enum iommufd_ioas_map_flags
* @ioas_id: IOAS ID to change the mapping of
* @__reserved: Must be 0
* @user_va: Userspace pointer to start mapping from
* @length: Number of bytes to map
* @iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is set
* then this must be provided as input.
*
* Set an IOVA mapping from a user pointer. If FIXED_IOVA is specified then the
* mapping will be established at iova, otherwise a suitable location based on
* the reserved and allowed lists will be automatically selected and returned in
* iova.
*
* If IOMMU_IOAS_MAP_FIXED_IOVA is specified then the iova range must currently
* be unused, existing IOVA cannot be replaced.
*/
struct iommu_ioas_map {
__u32 size;
__u32 flags;
__u32 ioas_id;
__u32 __reserved;
__aligned_u64 user_va;
__aligned_u64 length;
__aligned_u64 iova;
};
#define IOMMU_IOAS_MAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_MAP)
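
One more illustrative sketch (not part of the header): mapping a process buffer into an IOAS and letting the kernel choose the IOVA, i.e. without IOMMU_IOAS_MAP_FIXED_IOVA, as described above. iommufd and ioas_id are assumed as before.

#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int map_buffer_sketch(int iommufd, __u32 ioas_id,
                             void *buf, size_t len, __u64 *out_iova)
{
    struct iommu_ioas_map cmd = {
        .size = sizeof(cmd),
        .flags = IOMMU_IOAS_MAP_READABLE | IOMMU_IOAS_MAP_WRITEABLE,
        .ioas_id = ioas_id,
        .user_va = (uintptr_t)buf,
        .length = len,
        /* .iova left at 0: without FIXED_IOVA the kernel picks a location */
    };

    if (ioctl(iommufd, IOMMU_IOAS_MAP, &cmd)) {
        return -errno;
    }
    *out_iova = cmd.iova;                  /* where the mapping was placed */
    return 0;
}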
/**
* struct iommu_ioas_copy - ioctl(IOMMU_IOAS_COPY)
* @size: sizeof(struct iommu_ioas_copy)
* @flags: Combination of enum iommufd_ioas_map_flags
* @dst_ioas_id: IOAS ID to change the mapping of
* @src_ioas_id: IOAS ID to copy from
* @length: Number of bytes to copy and map
* @dst_iova: IOVA the mapping was placed at. If IOMMU_IOAS_MAP_FIXED_IOVA is
* set then this must be provided as input.
* @src_iova: IOVA to start the copy
*
* Copy an already existing mapping from src_ioas_id and establish it in
* dst_ioas_id. The src iova/length must exactly match a range used with
* IOMMU_IOAS_MAP.
*
* This may be used to efficiently clone a subset of an IOAS to another, or as a
* kind of 'cache' to speed up mapping. Copy has an efficiency advantage over
* establishing equivalent new mappings, as internal resources are shared, and
* the kernel will pin the user memory only once.
*/
struct iommu_ioas_copy {
__u32 size;
__u32 flags;
__u32 dst_ioas_id;
__u32 src_ioas_id;
__aligned_u64 length;
__aligned_u64 dst_iova;
__aligned_u64 src_iova;
};
#define IOMMU_IOAS_COPY _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_COPY)
/**
* struct iommu_ioas_unmap - ioctl(IOMMU_IOAS_UNMAP)
* @size: sizeof(struct iommu_ioas_unmap)
* @ioas_id: IOAS ID to change the mapping of
* @iova: IOVA to start the unmapping at
* @length: Number of bytes to unmap, and return back the bytes unmapped
*
* Unmap an IOVA range. The iova/length must be a superset of a previously
* mapped range used with IOMMU_IOAS_MAP or IOMMU_IOAS_COPY. Splitting or
* truncating ranges is not allowed. The values 0 to U64_MAX will unmap
* everything.
*/
struct iommu_ioas_unmap {
__u32 size;
__u32 ioas_id;
__aligned_u64 iova;
__aligned_u64 length;
};
#define IOMMU_IOAS_UNMAP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_UNMAP)
/**
* enum iommufd_option - ioctl(IOMMU_OPTION_RLIMIT_MODE) and
* ioctl(IOMMU_OPTION_HUGE_PAGES)
* @IOMMU_OPTION_RLIMIT_MODE:
* Change how RLIMIT_MEMLOCK accounting works. The caller must have privilege
* to invoke this. Value 0 (default) is user based accounting, 1 uses process
* based accounting. Global option, object_id must be 0
* @IOMMU_OPTION_HUGE_PAGES:
* Value 1 (default) allows contiguous pages to be combined when generating
* iommu mappings. Value 0 disables combining, everything is mapped to
* PAGE_SIZE. This can be useful for benchmarking. This is a per-IOAS
* option, the object_id must be the IOAS ID.
*/
enum iommufd_option {
IOMMU_OPTION_RLIMIT_MODE = 0,
IOMMU_OPTION_HUGE_PAGES = 1,
};
/**
* enum iommufd_option_ops - ioctl(IOMMU_OPTION_OP_SET) and
* ioctl(IOMMU_OPTION_OP_GET)
* @IOMMU_OPTION_OP_SET: Set the option's value
* @IOMMU_OPTION_OP_GET: Get the option's value
*/
enum iommufd_option_ops {
IOMMU_OPTION_OP_SET = 0,
IOMMU_OPTION_OP_GET = 1,
};
/**
* struct iommu_option - iommu option multiplexer
* @size: sizeof(struct iommu_option)
* @option_id: One of enum iommufd_option
* @op: One of enum iommufd_option_ops
* @__reserved: Must be 0
* @object_id: ID of the object if required
* @val64: Option value to set or value returned on get
*
* Change a simple option value. This multiplexor allows controlling options
* on objects. IOMMU_OPTION_OP_SET will load an option and IOMMU_OPTION_OP_GET
* will return the current value.
*/
struct iommu_option {
__u32 size;
__u32 option_id;
__u16 op;
__u16 __reserved;
__u32 object_id;
__aligned_u64 val64;
};
#define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
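
An illustrative sketch of the option multiplexer (not part of the header): disabling huge page combining on one IOAS with IOMMU_OPTION_OP_SET, per the IOMMU_OPTION_HUGE_PAGES description above.

#include <errno.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int disable_huge_pages_sketch(int iommufd, __u32 ioas_id)
{
    struct iommu_option cmd = {
        .size = sizeof(cmd),
        .option_id = IOMMU_OPTION_HUGE_PAGES,
        .op = IOMMU_OPTION_OP_SET,
        .object_id = ioas_id,              /* per-IOAS option: object_id is the IOAS ID */
        .val64 = 0,                        /* 0 disables combining contiguous pages */
    };

    return ioctl(iommufd, IOMMU_OPTION, &cmd) ? -errno : 0;
}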
/**
* enum iommufd_vfio_ioas_op - IOMMU_VFIO_IOAS_* ioctls
* @IOMMU_VFIO_IOAS_GET: Get the current compatibility IOAS
* @IOMMU_VFIO_IOAS_SET: Change the current compatibility IOAS
* @IOMMU_VFIO_IOAS_CLEAR: Disable VFIO compatibility
*/
enum iommufd_vfio_ioas_op {
IOMMU_VFIO_IOAS_GET = 0,
IOMMU_VFIO_IOAS_SET = 1,
IOMMU_VFIO_IOAS_CLEAR = 2,
};
/**
* struct iommu_vfio_ioas - ioctl(IOMMU_VFIO_IOAS)
* @size: sizeof(struct iommu_vfio_ioas)
* @ioas_id: For IOMMU_VFIO_IOAS_SET the input IOAS ID to set
* For IOMMU_VFIO_IOAS_GET will output the IOAS ID
* @op: One of enum iommufd_vfio_ioas_op
* @__reserved: Must be 0
*
* The VFIO compatibility support uses a single ioas because VFIO APIs do not
* support the ID field. Set or Get the IOAS that VFIO compatibility will use.
* When VFIO_GROUP_SET_CONTAINER is used on an iommufd it will get the
* compatibility ioas, either by taking what is already set, or auto creating
* one. From then on VFIO will continue to use that ioas and is not affected by
* this ioctl. SET or CLEAR does not destroy any auto-created IOAS.
*/
struct iommu_vfio_ioas {
__u32 size;
__u32 ioas_id;
__u16 op;
__u16 __reserved;
};
#define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS)
/**
* struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC)
* @size: sizeof(struct iommu_hwpt_alloc)
* @flags: Must be 0
* @dev_id: The device to allocate this HWPT for
* @pt_id: The IOAS to connect this HWPT to
* @out_hwpt_id: The ID of the new HWPT
* @__reserved: Must be 0
*
* Explicitly allocate a hardware page table object. This is the same object
* type that is returned by iommufd_device_attach() and represents the
* underlying iommu driver's iommu_domain kernel object.
*
* A HWPT will be created with the IOVA mappings from the given IOAS.
*/
struct iommu_hwpt_alloc {
__u32 size;
__u32 flags;
__u32 dev_id;
__u32 pt_id;
__u32 out_hwpt_id;
__u32 __reserved;
};
#define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC)
/**
* struct iommu_hw_info_vtd - Intel VT-d hardware information
*
* @flags: Must be 0
* @__reserved: Must be 0
*
* @cap_reg: Value of Intel VT-d capability register defined in VT-d spec
* section 11.4.2 Capability Register.
* @ecap_reg: Value of Intel VT-d capability register defined in VT-d spec
* section 11.4.3 Extended Capability Register.
*
* User needs to understand the Intel VT-d specification to decode the
* register value.
*/
struct iommu_hw_info_vtd {
__u32 flags;
__u32 __reserved;
__aligned_u64 cap_reg;
__aligned_u64 ecap_reg;
};
/**
* enum iommu_hw_info_type - IOMMU Hardware Info Types
* @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware
* info
* @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type
*/
enum iommu_hw_info_type {
IOMMU_HW_INFO_TYPE_NONE,
IOMMU_HW_INFO_TYPE_INTEL_VTD,
};
/**
* struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO)
* @size: sizeof(struct iommu_hw_info)
* @flags: Must be 0
* @dev_id: The device bound to the iommufd
* @data_len: Input the length of a user buffer in bytes. Output the length of
* data that kernel supports
* @data_uptr: User pointer to a user-space buffer used by the kernel to fill
* the iommu type specific hardware information data
* @out_data_type: Output the iommu hardware info type as defined in the enum
* iommu_hw_info_type.
* @__reserved: Must be 0
*
* Query an iommu type specific hardware information data from an iommu behind
* a given device that has been bound to iommufd. This hardware info data will
* be used to sync capabilities between the virtual iommu and the physical
* iommu, e.g. a nested translation setup needs to check the hardware info, so
* a guest stage-1 page table can be compatible with the physical iommu.
*
* To capture an iommu type specific hardware information data, @data_uptr and
* its length @data_len must be provided. Trailing bytes will be zeroed if the
* user buffer is larger than the data that kernel has. Otherwise, kernel only
* fills the buffer using the given length in @data_len. If the ioctl succeeds,
* @data_len will be updated to the length that kernel actually supports,
* @out_data_type will be filled to decode the data filled in the buffer
* pointed by @data_uptr. Input @data_len == zero is allowed.
*/
struct iommu_hw_info {
__u32 size;
__u32 flags;
__u32 dev_id;
__u32 data_len;
__aligned_u64 data_uptr;
__u32 out_data_type;
__u32 __reserved;
};
#define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO)
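
A final illustrative sketch (not part of the header): querying hardware info for a device following the data_len/data_uptr protocol above. The dev_id is assumed to come from a prior device bind to the iommufd, which is outside the scope of this header.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

static int get_hw_info_sketch(int iommufd, __u32 dev_id)
{
    struct iommu_hw_info_vtd vtd = {};
    struct iommu_hw_info cmd = {
        .size = sizeof(cmd),
        .dev_id = dev_id,
        .data_len = sizeof(vtd),           /* length of the user buffer */
        .data_uptr = (uintptr_t)&vtd,
    };

    if (ioctl(iommufd, IOMMU_GET_HW_INFO, &cmd)) {
        return -errno;
    }
    if (cmd.out_data_type == IOMMU_HW_INFO_TYPE_INTEL_VTD) {
        printf("VT-d cap 0x%llx ecap 0x%llx\n",
               (unsigned long long)vtd.cap_reg,
               (unsigned long long)vtd.ecap_reg);
    }
    return 0;
}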
#endif

View File

@@ -161,7 +161,8 @@ done
rm -rf "$output/linux-headers/linux"
mkdir -p "$output/linux-headers/linux"
for header in const.h stddef.h kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \
psci.h psp-sev.h userfaultfd.h memfd.h mman.h nvme_ioctl.h vduse.h; do
psci.h psp-sev.h userfaultfd.h memfd.h mman.h nvme_ioctl.h \
vduse.h iommufd.h; do
cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
done

View File

@@ -2,6 +2,8 @@
#include "qapi/error.h"
#include "hw/display/ramfb.h"
const VMStateDescription ramfb_vmstate = {};
void ramfb_display_update(QemuConsole *con, RAMFBState *s)
{
}