From c7679d450ee021eb0826be65e4e018884443643a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 2 Oct 2013 12:52:38 -0600 Subject: [PATCH 1/8] vfio-pci: Add support for MSI affinity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When MSI is accelerated through KVM the vectors are only programmed when the guest first enables MSI support.  Subsequent writes to the vector address or data fields are ignored.  Unfortunately that means we're ignore updates done to adjust SMP affinity of the vectors. MSI SMP affinity already works in non-KVM mode because the address and data fields are read from their backing store on each interrupt. This patch stores the MSIMessage programmed into KVM so that we can determine when changes are made and update the routes. Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index a1c08fb74e..75a53e2a09 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -119,6 +119,7 @@ typedef struct VFIOINTx { typedef struct VFIOMSIVector { EventNotifier interrupt; /* eventfd triggered on interrupt */ struct VFIODevice *vdev; /* back pointer to device */ + MSIMessage msg; /* cache the MSI message so we know when it changes */ int virq; /* KVM irqchip route for QEMU bypass */ bool use; } VFIOMSIVector; @@ -795,7 +796,6 @@ retry: vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector)); for (i = 0; i < vdev->nr_vectors; i++) { - MSIMessage msg; VFIOMSIVector *vector = &vdev->msi_vectors[i]; vector->vdev = vdev; @@ -805,13 +805,13 @@ retry: error_report("vfio: Error: event_notifier_init failed"); } - msg = msi_get_message(&vdev->pdev, i); + vector->msg = msi_get_message(&vdev->pdev, i); /* * Attempt to enable route through KVM irqchip, * default to userspace handling if unavailable. */ - vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg); + vector->virq = kvm_irqchip_add_msi_route(kvm_state, vector->msg); if (vector->virq < 0 || kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt, NULL, vector->virq) < 0) { @@ -917,6 +917,33 @@ static void vfio_disable_msi(VFIODevice *vdev) vdev->host.bus, vdev->host.slot, vdev->host.function); } +static void vfio_update_msi(VFIODevice *vdev) +{ + int i; + + for (i = 0; i < vdev->nr_vectors; i++) { + VFIOMSIVector *vector = &vdev->msi_vectors[i]; + MSIMessage msg; + + if (!vector->use || vector->virq < 0) { + continue; + } + + msg = msi_get_message(&vdev->pdev, i); + + if (msg.address != vector->msg.address || + msg.data != vector->msg.data) { + + DPRINTF("%s(%04x:%02x:%02x.%x) MSI vector %d changed\n", + __func__, vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function, i); + + kvm_irqchip_update_msi_route(kvm_state, vector->virq, msg); + vector->msg = msg; + } + } +} + /* * IO Port/MMIO - Beware of the endians, VFIO is always little endian */ @@ -1834,10 +1861,16 @@ static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, is_enabled = msi_enabled(pdev); - if (!was_enabled && is_enabled) { - vfio_enable_msi(vdev); - } else if (was_enabled && !is_enabled) { - vfio_disable_msi(vdev); + if (!was_enabled) { + if (is_enabled) { + vfio_enable_msi(vdev); + } + } else { + if (!is_enabled) { + vfio_disable_msi(vdev); + } else { + vfio_update_msi(vdev); + } } } else if (pdev->cap_present & QEMU_PCI_CAP_MSIX && ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) { From befe5176ef7a0004ba23517c97c804e292273635 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 2 Oct 2013 12:52:38 -0600 Subject: [PATCH 2/8] vfio-pci: Test device reset capabilities Not all resets are created equal. PM reset is not very reliable, especially for GPUs, so we might want to opt for a bus reset if a standard reset will only do a D3hot->D0 transition. We can also use this to tell if the standard reset will do a bus reset (if neither has_pm_reset or has_flr is probed, but the device still supports reset). Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 75a53e2a09..ede026d111 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -185,6 +185,8 @@ typedef struct VFIODevice { bool reset_works; bool has_vga; bool pci_aer; + bool has_flr; + bool has_pm_reset; } VFIODevice; typedef struct VFIOGroup { @@ -2513,6 +2515,42 @@ static int vfio_setup_pcie_cap(VFIODevice *vdev, int pos, uint8_t size) return pos; } +static void vfio_check_pcie_flr(VFIODevice *vdev, uint8_t pos) +{ + uint32_t cap = pci_get_long(vdev->pdev.config + pos + PCI_EXP_DEVCAP); + + if (cap & PCI_EXP_DEVCAP_FLR) { + DPRINTF("%04x:%02x:%02x.%x Supports FLR via PCIe cap\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + vdev->has_flr = true; + } +} + +static void vfio_check_pm_reset(VFIODevice *vdev, uint8_t pos) +{ + uint16_t csr = pci_get_word(vdev->pdev.config + pos + PCI_PM_CTRL); + + if (!(csr & PCI_PM_CTRL_NO_SOFT_RESET)) { + DPRINTF("%04x:%02x:%02x.%x Supports PM reset\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + vdev->has_pm_reset = true; + } +} + +static void vfio_check_af_flr(VFIODevice *vdev, uint8_t pos) +{ + uint8_t cap = pci_get_byte(vdev->pdev.config + pos + PCI_AF_CAP); + + if ((cap & PCI_AF_CAP_TP) && (cap & PCI_AF_CAP_FLR)) { + DPRINTF("%04x:%02x:%02x.%x Supports FLR via AF cap\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + vdev->has_flr = true; + } +} + static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos) { PCIDevice *pdev = &vdev->pdev; @@ -2557,13 +2595,21 @@ static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos) ret = vfio_setup_msi(vdev, pos); break; case PCI_CAP_ID_EXP: + vfio_check_pcie_flr(vdev, pos); ret = vfio_setup_pcie_cap(vdev, pos, size); break; case PCI_CAP_ID_MSIX: ret = vfio_setup_msix(vdev, pos); break; case PCI_CAP_ID_PM: + vfio_check_pm_reset(vdev, pos); vdev->pm_cap = pos; + ret = pci_add_capability(pdev, cap_id, pos, size); + break; + case PCI_CAP_ID_AF: + vfio_check_af_flr(vdev, pos); + ret = pci_add_capability(pdev, cap_id, pos, size); + break; default: ret = pci_add_capability(pdev, cap_id, pos, size); break; From 6f864e6ec8812d5a5525a7861ca599c6bcabdebe Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 2 Oct 2013 12:52:38 -0600 Subject: [PATCH 3/8] vfio-pci: Lazy PCI option ROM loading During vfio-pci initfn, the device is not always in a state where the option ROM can be read. In the case of graphics cards, there's often no per function reset, which means we have host driver state affecting whether the option ROM is usable. Ideally we want to move reading the option ROM past any co-assigned device resets to the point where the guest first tries to read the ROM itself. To accomplish this, we switch the memory region for the option rom to an I/O region rather than a memory mapped region. This has the side benefit that we don't waste KVM memory slots for a BAR where we don't care about performance. This also allows us to delay loading the ROM from the device until the first read by the guest. We then use the PCI config space size of the ROM BAR when setting up the BAR through QEMU PCI. Another benefit of this approach is that previously when a user set the ROM to a file using the romfile= option, we still probed VFIO for the parameters of the ROM, which can result in dmesg errors about an invalid ROM. We now only probe VFIO to get the ROM contents if the guest actually tries to read the ROM. Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 184 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 122 insertions(+), 62 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index ede026d111..730dec511c 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -166,6 +166,7 @@ typedef struct VFIODevice { off_t config_offset; /* Offset of config space region within device fd */ unsigned int rom_size; off_t rom_offset; /* Offset of ROM region within device fd */ + void *rom; int msi_cap_size; VFIOMSIVector *msi_vectors; VFIOMSIXInfo *msix; @@ -1058,6 +1059,125 @@ static const MemoryRegionOps vfio_bar_ops = { .endianness = DEVICE_LITTLE_ENDIAN, }; +static void vfio_pci_load_rom(VFIODevice *vdev) +{ + struct vfio_region_info reg_info = { + .argsz = sizeof(reg_info), + .index = VFIO_PCI_ROM_REGION_INDEX + }; + uint64_t size; + off_t off = 0; + size_t bytes; + + if (ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info)) { + error_report("vfio: Error getting ROM info: %m"); + return; + } + + DPRINTF("Device %04x:%02x:%02x.%x ROM:\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", + (unsigned long)reg_info.size, (unsigned long)reg_info.offset, + (unsigned long)reg_info.flags); + + vdev->rom_size = size = reg_info.size; + vdev->rom_offset = reg_info.offset; + + if (!vdev->rom_size) { + return; + } + + vdev->rom = g_malloc(size); + memset(vdev->rom, 0xff, size); + + while (size) { + bytes = pread(vdev->fd, vdev->rom + off, size, vdev->rom_offset + off); + if (bytes == 0) { + break; + } else if (bytes > 0) { + off += bytes; + size -= bytes; + } else { + if (errno == EINTR || errno == EAGAIN) { + continue; + } + error_report("vfio: Error reading device ROM: %m"); + break; + } + } +} + +static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) +{ + VFIODevice *vdev = opaque; + uint64_t val = ((uint64_t)1 << (size * 8)) - 1; + + /* Load the ROM lazily when the guest tries to read it */ + if (unlikely(!vdev->rom)) { + vfio_pci_load_rom(vdev); + } + + memcpy(&val, vdev->rom + addr, + (addr < vdev->rom_size) ? MIN(size, vdev->rom_size - addr) : 0); + + DPRINTF("%s(%04x:%02x:%02x.%x, 0x%"HWADDR_PRIx", 0x%x) = 0x%"PRIx64"\n", + __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, addr, size, val); + + return val; +} + +static const MemoryRegionOps vfio_rom_ops = { + .read = vfio_rom_read, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_pci_size_rom(VFIODevice *vdev) +{ + uint32_t orig, size = (uint32_t)PCI_ROM_ADDRESS_MASK; + off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; + char name[32]; + + if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { + return; + } + + /* + * Use the same size ROM BAR as the physical device. The contents + * will get filled in later when the guest tries to read it. + */ + if (pread(vdev->fd, &orig, 4, offset) != 4 || + pwrite(vdev->fd, &size, 4, offset) != 4 || + pread(vdev->fd, &size, 4, offset) != 4 || + pwrite(vdev->fd, &orig, 4, offset) != 4) { + error_report("%s(%04x:%02x:%02x.%x) failed: %m", + __func__, vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function); + return; + } + + size = ~(size & PCI_ROM_ADDRESS_MASK) + 1; + + if (!size) { + return; + } + + DPRINTF("%04x:%02x:%02x.%x ROM size 0x%x\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function, size); + + snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + + memory_region_init_io(&vdev->pdev.rom, OBJECT(vdev), + &vfio_rom_ops, vdev, name, size); + + pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, + PCI_BASE_ADDRESS_SPACE_MEMORY, &vdev->pdev.rom); + + vdev->pdev.has_rom = true; +} + static void vfio_vga_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { @@ -2638,51 +2758,6 @@ static int vfio_add_capabilities(VFIODevice *vdev) return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); } -static int vfio_load_rom(VFIODevice *vdev) -{ - uint64_t size = vdev->rom_size; - char name[32]; - off_t off = 0, voff = vdev->rom_offset; - ssize_t bytes; - void *ptr; - - /* If loading ROM from file, pci handles it */ - if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) { - return 0; - } - - DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, - vdev->host.bus, vdev->host.slot, vdev->host.function); - - snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom", - vdev->host.domain, vdev->host.bus, vdev->host.slot, - vdev->host.function); - memory_region_init_ram(&vdev->pdev.rom, OBJECT(vdev), name, size); - ptr = memory_region_get_ram_ptr(&vdev->pdev.rom); - memset(ptr, 0xff, size); - - while (size) { - bytes = pread(vdev->fd, ptr + off, size, voff + off); - if (bytes == 0) { - break; /* expect that we could get back less than the ROM BAR */ - } else if (bytes > 0) { - off += bytes; - size -= bytes; - } else { - if (errno == EINTR || errno == EAGAIN) { - continue; - } - error_report("vfio: Error reading device ROM: %m"); - memory_region_destroy(&vdev->pdev.rom); - return -errno; - } - } - - pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom); - vdev->pdev.has_rom = true; - return 0; -} - static int vfio_connect_container(VFIOGroup *group) { VFIOContainer *container; @@ -2916,22 +2991,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev) QLIST_INIT(&vdev->bars[i].quirks); } - reg_info.index = VFIO_PCI_ROM_REGION_INDEX; - - ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); - if (ret) { - error_report("vfio: Error getting ROM info: %m"); - goto error; - } - - DPRINTF("Device %s ROM:\n", name); - DPRINTF(" size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n", - (unsigned long)reg_info.size, (unsigned long)reg_info.offset, - (unsigned long)reg_info.flags); - - vdev->rom_size = reg_info.size; - vdev->rom_offset = reg_info.offset; - reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX; ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); @@ -3229,7 +3288,7 @@ static int vfio_initfn(PCIDevice *pdev) memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24); memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4); - vfio_load_rom(vdev); + vfio_pci_size_rom(vdev); ret = vfio_early_setup_msix(vdev); if (ret) { @@ -3294,6 +3353,7 @@ static void vfio_exitfn(PCIDevice *pdev) vfio_teardown_msi(vdev); vfio_unmap_bars(vdev); g_free(vdev->emulated_config_bits); + g_free(vdev->rom); vfio_put_device(vdev); vfio_put_group(group); } From 8fbf47c3a8a7f37a11268a07290d20a325ba4cb6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 2 Oct 2013 12:52:38 -0600 Subject: [PATCH 4/8] vfio-pci: Cleanup error_reports Remove carriage returns and tweak formatting for error_reports. Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 730dec511c..a73e7f568c 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -3055,13 +3055,15 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev) ret = ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure ret=%d\n", ret); + DPRINTF("VFIO_DEVICE_GET_IRQ_INFO failure: %m\n"); ret = 0; } else if (irq_info.count == 1) { vdev->pci_aer = true; } else { - error_report("vfio: Warning: " - "Could not enable error recovery for the device\n"); + error_report("vfio: %04x:%02x:%02x.%x " + "Could not enable error recovery for the device", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); } error: @@ -3102,11 +3104,10 @@ static void vfio_err_notifier_handler(void *opaque) * guest to contain the error. */ - error_report("%s (%04x:%02x:%02x.%x)" - "Unrecoverable error detected...\n" - "Please collect any data possible and then kill the guest", - __func__, vdev->host.domain, vdev->host.bus, - vdev->host.slot, vdev->host.function); + error_report("%s(%04x:%02x:%02x.%x) Unrecoverable error detected. " + "Please collect any data possible and then kill the guest", + __func__, vdev->host.domain, vdev->host.bus, + vdev->host.slot, vdev->host.function); vm_stop(RUN_STATE_IO_ERROR); } @@ -3129,8 +3130,7 @@ static void vfio_register_err_notifier(VFIODevice *vdev) } if (event_notifier_init(&vdev->err_notifier, 0)) { - error_report("vfio: Warning: " - "Unable to init event notifier for error detection\n"); + error_report("vfio: Unable to init event notifier for error detection"); vdev->pci_aer = false; return; } @@ -3151,7 +3151,7 @@ static void vfio_register_err_notifier(VFIODevice *vdev) ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set); if (ret) { - error_report("vfio: Failed to set up error notification\n"); + error_report("vfio: Failed to set up error notification"); qemu_set_fd_handler(*pfd, NULL, NULL, vdev); event_notifier_cleanup(&vdev->err_notifier); vdev->pci_aer = false; @@ -3184,7 +3184,7 @@ static void vfio_unregister_err_notifier(VFIODevice *vdev) ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set); if (ret) { - error_report("vfio: Failed to de-assign error fd: %d\n", ret); + error_report("vfio: Failed to de-assign error fd: %m"); } g_free(irq_set); qemu_set_fd_handler(event_notifier_get_fd(&vdev->err_notifier), From f16f39c3fc973c5d7cbc2224eefb4ef5eb1e64ff Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 2 Oct 2013 13:51:00 -0600 Subject: [PATCH 5/8] vfio-pci: Implement PCI hot reset Now that VFIO has a PCI hot reset interface, take advantage of it. There are two modes that we need to consider. The first is when only one device within the set of devices affected is actually assigned to the guest. In this case the other devices are are just held by VFIO for isolation and we can pretend they're not there, doing an entire bus reset whenever the device reset callback is triggered. Supporting this case separately allows us to do the best reset we can do of the device even if the device is hotplugged. The second mode is when multiple affected devices are all exposed to the guest. In this case we can only do a hot reset when the entire system is being reset. However, this also allows us to track which individual devices are affected by a reset and only do them once. We split our reset function into pre- and post-reset helper functions prioritize the types of device resets available to us, and create separate _one vs _multi reset interfaces to handle the distinct cases above. Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 340 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 301 insertions(+), 39 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index a73e7f568c..0c9bb95815 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -188,6 +188,7 @@ typedef struct VFIODevice { bool pci_aer; bool has_flr; bool has_pm_reset; + bool needs_reset; } VFIODevice; typedef struct VFIOGroup { @@ -2758,6 +2759,279 @@ static int vfio_add_capabilities(VFIODevice *vdev) return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); } +static void vfio_pci_pre_reset(VFIODevice *vdev) +{ + PCIDevice *pdev = &vdev->pdev; + uint16_t cmd; + + vfio_disable_interrupts(vdev); + + /* Make sure the device is in D0 */ + if (vdev->pm_cap) { + uint16_t pmcsr; + uint8_t state; + + pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + state = pmcsr & PCI_PM_CTRL_STATE_MASK; + if (state) { + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); + /* vfio handles the necessary delay here */ + pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); + state = pmcsr & PCI_PM_CTRL_STATE_MASK; + if (state) { + error_report("vfio: Unable to power on device, stuck in D%d\n", + state); + } + } + } + + /* + * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master. + * Also put INTx Disable in known state. + */ + cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); + cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); +} + +static void vfio_pci_post_reset(VFIODevice *vdev) +{ + vfio_enable_intx(vdev); +} + +static bool vfio_pci_host_match(PCIHostDeviceAddress *host1, + PCIHostDeviceAddress *host2) +{ + return (host1->domain == host2->domain && host1->bus == host2->bus && + host1->slot == host2->slot && host1->function == host2->function); +} + +static int vfio_pci_hot_reset(VFIODevice *vdev, bool single) +{ + VFIOGroup *group; + struct vfio_pci_hot_reset_info *info; + struct vfio_pci_dependent_device *devices; + struct vfio_pci_hot_reset *reset; + int32_t *fds; + int ret, i, count; + bool multi = false; + + DPRINTF("%s(%04x:%02x:%02x.%x) %s\n", __func__, vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function, + single ? "one" : "multi"); + + vfio_pci_pre_reset(vdev); + vdev->needs_reset = false; + + info = g_malloc0(sizeof(*info)); + info->argsz = sizeof(*info); + + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); + if (ret && errno != ENOSPC) { + ret = -errno; + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, " + "no available reset mechanism.", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + } + goto out_single; + } + + count = info->count; + info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); + info->argsz = sizeof(*info) + (count * sizeof(*devices)); + devices = &info->devices[0]; + + ret = ioctl(vdev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); + if (ret) { + ret = -errno; + error_report("vfio: hot reset info failed: %m"); + goto out_single; + } + + DPRINTF("%04x:%02x:%02x.%x: hot reset dependent devices:\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); + + /* Verify that we have all the groups required */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIODevice *tmp; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + DPRINTF("\t%04x:%02x:%02x.%x group %d\n", host.domain, + host.bus, host.slot, host.function, devices[i].group_id); + + if (vfio_pci_host_match(&host, &vdev->host)) { + continue; + } + + QLIST_FOREACH(group, &group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + if (!vdev->has_pm_reset) { + error_report("vfio: Cannot reset device %04x:%02x:%02x.%x, " + "depends on group %d which is not owned.", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function, devices[i].group_id); + } + ret = -EPERM; + goto out; + } + + /* Prep dependent devices for reset and clear our marker. */ + QLIST_FOREACH(tmp, &group->device_list, next) { + if (vfio_pci_host_match(&host, &tmp->host)) { + if (single) { + DPRINTF("vfio: found another in-use device " + "%04x:%02x:%02x.%x\n", host.domain, host.bus, + host.slot, host.function); + ret = -EINVAL; + goto out_single; + } + vfio_pci_pre_reset(tmp); + tmp->needs_reset = false; + multi = true; + break; + } + } + } + + if (!single && !multi) { + DPRINTF("vfio: No other in-use devices for multi hot reset\n"); + ret = -EINVAL; + goto out_single; + } + + /* Determine how many group fds need to be passed */ + count = 0; + QLIST_FOREACH(group, &group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + count++; + break; + } + } + } + + reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); + reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); + fds = &reset->group_fds[0]; + + /* Fill in group fds */ + QLIST_FOREACH(group, &group_list, next) { + for (i = 0; i < info->count; i++) { + if (group->groupid == devices[i].group_id) { + fds[reset->count++] = group->fd; + break; + } + } + } + + /* Bus reset! */ + ret = ioctl(vdev->fd, VFIO_DEVICE_PCI_HOT_RESET, reset); + g_free(reset); + + DPRINTF("%04x:%02x:%02x.%x hot reset: %s\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function, + ret ? "%m" : "Success"); + +out: + /* Re-enable INTx on affected devices */ + for (i = 0; i < info->count; i++) { + PCIHostDeviceAddress host; + VFIODevice *tmp; + + host.domain = devices[i].segment; + host.bus = devices[i].bus; + host.slot = PCI_SLOT(devices[i].devfn); + host.function = PCI_FUNC(devices[i].devfn); + + if (vfio_pci_host_match(&host, &vdev->host)) { + continue; + } + + QLIST_FOREACH(group, &group_list, next) { + if (group->groupid == devices[i].group_id) { + break; + } + } + + if (!group) { + break; + } + + QLIST_FOREACH(tmp, &group->device_list, next) { + if (vfio_pci_host_match(&host, &tmp->host)) { + vfio_pci_post_reset(tmp); + break; + } + } + } +out_single: + vfio_pci_post_reset(vdev); + g_free(info); + + return ret; +} + +/* + * We want to differentiate hot reset of mulitple in-use devices vs hot reset + * of a single in-use device. VFIO_DEVICE_RESET will already handle the case + * of doing hot resets when there is only a single device per bus. The in-use + * here refers to how many VFIODevices are affected. A hot reset that affects + * multiple devices, but only a single in-use device, means that we can call + * it from our bus ->reset() callback since the extent is effectively a single + * device. This allows us to make use of it in the hotplug path. When there + * are multiple in-use devices, we can only trigger the hot reset during a + * system reset and thus from our reset handler. We separate _one vs _multi + * here so that we don't overlap and do a double reset on the system reset + * path where both our reset handler and ->reset() callback are used. Calling + * _one() will only do a hot reset for the one in-use devices case, calling + * _multi() will do nothing if a _one() would have been sufficient. + */ +static int vfio_pci_hot_reset_one(VFIODevice *vdev) +{ + return vfio_pci_hot_reset(vdev, true); +} + +static int vfio_pci_hot_reset_multi(VFIODevice *vdev) +{ + return vfio_pci_hot_reset(vdev, false); +} + +static void vfio_pci_reset_handler(void *opaque) +{ + VFIOGroup *group; + VFIODevice *vdev; + + QLIST_FOREACH(group, &group_list, next) { + QLIST_FOREACH(vdev, &group->device_list, next) { + if (!vdev->reset_works || (!vdev->has_flr && vdev->has_pm_reset)) { + vdev->needs_reset = true; + } + } + } + + QLIST_FOREACH(group, &group_list, next) { + QLIST_FOREACH(vdev, &group->device_list, next) { + if (vdev->needs_reset) { + vfio_pci_hot_reset_multi(vdev); + } + } + } +} + static int vfio_connect_container(VFIOGroup *group) { VFIOContainer *container; @@ -2900,6 +3174,10 @@ static VFIOGroup *vfio_get_group(int groupid) return NULL; } + if (QLIST_EMPTY(&group_list)) { + qemu_register_reset(vfio_pci_reset_handler, NULL); + } + QLIST_INSERT_HEAD(&group_list, group, next); return group; @@ -2916,6 +3194,10 @@ static void vfio_put_group(VFIOGroup *group) DPRINTF("vfio_put_group: close group->fd\n"); close(group->fd); g_free(group); + + if (QLIST_EMPTY(&group_list)) { + qemu_unregister_reset(vfio_pci_reset_handler, NULL); + } } static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev) @@ -2954,9 +3236,6 @@ static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev) } vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); - if (!vdev->reset_works) { - error_report("Warning, device %s does not support reset", name); - } if (dev_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX + 1) { error_report("vfio: unexpected number of io regions %u", @@ -3362,51 +3641,34 @@ static void vfio_pci_reset(DeviceState *dev) { PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev); VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev); - uint16_t cmd; DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function); - vfio_disable_interrupts(vdev); + vfio_pci_pre_reset(vdev); - /* Make sure the device is in D0 */ - if (vdev->pm_cap) { - uint16_t pmcsr; - uint8_t state; - - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); - state = pmcsr & PCI_PM_CTRL_STATE_MASK; - if (state) { - pmcsr &= ~PCI_PM_CTRL_STATE_MASK; - vfio_pci_write_config(pdev, vdev->pm_cap + PCI_PM_CTRL, pmcsr, 2); - /* vfio handles the necessary delay here */ - pmcsr = vfio_pci_read_config(pdev, vdev->pm_cap + PCI_PM_CTRL, 2); - state = pmcsr & PCI_PM_CTRL_STATE_MASK; - if (state) { - error_report("vfio: Unable to power on device, stuck in D%d\n", - state); - } - } + if (vdev->reset_works && (vdev->has_flr || !vdev->has_pm_reset) && + !ioctl(vdev->fd, VFIO_DEVICE_RESET)) { + DPRINTF("%04x:%02x:%02x.%x FLR/VFIO_DEVICE_RESET\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + goto post_reset; } - /* - * Stop any ongoing DMA by disconecting I/O, MMIO, and bus master. - * Also put INTx Disable in known state. - */ - cmd = vfio_pci_read_config(pdev, PCI_COMMAND, 2); - cmd &= ~(PCI_COMMAND_IO | PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER | - PCI_COMMAND_INTX_DISABLE); - vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); - - if (vdev->reset_works) { - if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) { - error_report("vfio: Error unable to reset physical device " - "(%04x:%02x:%02x.%x): %m", vdev->host.domain, - vdev->host.bus, vdev->host.slot, vdev->host.function); - } + /* See if we can do our own bus reset */ + if (!vfio_pci_hot_reset_one(vdev)) { + goto post_reset; } - vfio_enable_intx(vdev); + /* If nothing else works and the device supports PM reset, use it */ + if (vdev->reset_works && vdev->has_pm_reset && + !ioctl(vdev->fd, VFIO_DEVICE_RESET)) { + DPRINTF("%04x:%02x:%02x.%x PCI PM Reset\n", vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + goto post_reset; + } + +post_reset: + vfio_pci_post_reset(vdev); } static Property vfio_pci_dev_properties[] = { From 1d5bf692e55ae22b59083741d521e27db704846d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 3 Oct 2013 09:10:09 -0600 Subject: [PATCH 6/8] vfio: Fix debug output for int128 values Memory regions can easily be 2^64 byte long and therefore overflow for just a bit but that is enough for int128_get64() to assert. This takes care of debug printing of huge section sizes. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/misc/vfio.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 0c9bb95815..68e25bd8c3 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -2084,7 +2084,8 @@ static void vfio_listener_region_add(MemoryListener *listener, if (vfio_listener_skipped_section(section)) { DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n", section->offset_within_address_space, - section->offset_within_address_space + section->size - 1); + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); return; } @@ -2129,7 +2130,8 @@ static void vfio_listener_region_del(MemoryListener *listener, if (vfio_listener_skipped_section(section)) { DPRINTF("SKIPPING region_del %"HWADDR_PRIx" - %"PRIx64"\n", section->offset_within_address_space, - section->offset_within_address_space + section->size - 1); + section->offset_within_address_space + + int128_get64(int128_sub(section->size, int128_one()))); return; } From 64fa25a0efcadda6e8197e8ea578f6117d01bb4b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 4 Oct 2013 08:51:36 -0600 Subject: [PATCH 7/8] vfio-pci: Add dummy PCI ROM write accessor Just to be sure we don't jump off any NULL pointer cliffs. Signed-off-by: Alex Williamson Reported-by: Paolo Bonzini --- hw/misc/vfio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 68e25bd8c3..1fbc40b960 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -1128,8 +1128,14 @@ static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) return val; } +static void vfio_rom_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ +} + static const MemoryRegionOps vfio_rom_ops = { .read = vfio_rom_read, + .write = vfio_rom_write, .endianness = DEVICE_LITTLE_ENDIAN, }; From b1c50c5f248805be747e96e89efbe784ee99f764 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 4 Oct 2013 12:50:51 -0600 Subject: [PATCH 8/8] vfio-pci: Fix endian issues in vfio_pci_size_rom() VFIO is always little endian so do byte swapping of our mask on the way in and byte swapping of the size on the way out. Signed-off-by: Alex Williamson Reported-by: Alexey Kardashevskiy --- hw/misc/vfio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 1fbc40b960..a2d5283ff9 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -1141,7 +1141,7 @@ static const MemoryRegionOps vfio_rom_ops = { static void vfio_pci_size_rom(VFIODevice *vdev) { - uint32_t orig, size = (uint32_t)PCI_ROM_ADDRESS_MASK; + uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; char name[32]; @@ -1163,7 +1163,7 @@ static void vfio_pci_size_rom(VFIODevice *vdev) return; } - size = ~(size & PCI_ROM_ADDRESS_MASK) + 1; + size = ~(le32_to_cpu(size) & PCI_ROM_ADDRESS_MASK) + 1; if (!size) { return;