From 6e48e8f9e0f5b6b15c41f6f8a68c9bf330147d45 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Feb 2015 10:25:44 -0700 Subject: [PATCH 1/6] memory: unregister AddressSpace MemoryListener within BQL address_space_destroy_dispatch is called from an RCU callback and hence outside the iothread mutex (BQL). However, after address_space_destroy no new accesses can hit the destroyed AddressSpace so it is not necessary to observe changes to the memory map. Move the memory_listener_unregister call earlier, to make it thread-safe again. Reported-by: Alex Williamson Fixes: 374f2981d1f10bc4307f250f24b2a7ddb9b14be0 Signed-off-by: Paolo Bonzini Signed-off-by: Alex Williamson --- exec.c | 6 +++++- include/exec/memory-internal.h | 1 + memory.c | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/exec.c b/exec.c index 6b79ad1d11..6dff7bc43a 100644 --- a/exec.c +++ b/exec.c @@ -2059,11 +2059,15 @@ void address_space_init_dispatch(AddressSpace *as) memory_listener_register(&as->dispatch_listener, as); } +void address_space_unregister(AddressSpace *as) +{ + memory_listener_unregister(&as->dispatch_listener); +} + void address_space_destroy_dispatch(AddressSpace *as) { AddressSpaceDispatch *d = as->dispatch; - memory_listener_unregister(&as->dispatch_listener); g_free(d); as->dispatch = NULL; } diff --git a/include/exec/memory-internal.h b/include/exec/memory-internal.h index 25c43c06e9..fb467acdba 100644 --- a/include/exec/memory-internal.h +++ b/include/exec/memory-internal.h @@ -23,6 +23,7 @@ typedef struct AddressSpaceDispatch AddressSpaceDispatch; void address_space_init_dispatch(AddressSpace *as); +void address_space_unregister(AddressSpace *as); void address_space_destroy_dispatch(AddressSpace *as); extern const MemoryRegionOps unassigned_mem_ops; diff --git a/memory.c b/memory.c index 9b91243978..130152cf1d 100644 --- a/memory.c +++ b/memory.c @@ -1978,6 +1978,7 @@ void address_space_destroy(AddressSpace *as) as->root = NULL; memory_region_transaction_commit(); QTAILQ_REMOVE(&address_spaces, as, address_spaces_link); + address_space_unregister(as); /* At this point, as->dispatch and as->current_map are dummy * entries that the guest should never use. Wait for the old From 217e9fdcadb1dc7462f4d92866314f626426fa82 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Feb 2015 10:25:44 -0700 Subject: [PATCH 2/6] vfio: cleanup vfio_get_device error path, remove vfio_populate_device callback Now that vfio_put_base_device is called unconditionally at instance_finalize time, it can be called twice if vfio_populate_device fails. This works but it is slightly harder to follow. Change vfio_get_device to not touch the vbasedev struct until it will definitely succeed, moving the vfio_populate_device call back to vfio-pci. This way, vfio_put_base_device will only be called once. Signed-off-by: Paolo Bonzini Signed-off-by: Alex Williamson --- hw/vfio/common.c | 30 ++++++++++++------------------ hw/vfio/pci.c | 11 +++++++---- include/hw/vfio/vfio-common.h | 1 - 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index e71385e4fe..6e299b3dfd 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -867,27 +867,28 @@ int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vbasedev) { struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) }; - int ret; + int ret, fd; - ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); - if (ret < 0) { + fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name); + if (fd < 0) { error_report("vfio: error getting device %s from group %d: %m", name, group->groupid); error_printf("Verify all devices in group %d are bound to vfio- " "or pci-stub and not already in use\n", group->groupid); + return fd; + } + + ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info); + if (ret) { + error_report("vfio: error getting device info: %m"); + close(fd); return ret; } - vbasedev->fd = ret; + vbasedev->fd = fd; vbasedev->group = group; QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_INFO, &dev_info); - if (ret) { - error_report("vfio: error getting device info: %m"); - goto error; - } - vbasedev->num_irqs = dev_info.num_irqs; vbasedev->num_regions = dev_info.num_regions; vbasedev->flags = dev_info.flags; @@ -896,14 +897,7 @@ int vfio_get_device(VFIOGroup *group, const char *name, dev_info.num_irqs); vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); - - ret = vbasedev->ops->vfio_populate_device(vbasedev); - -error: - if (ret) { - vfio_put_base_device(vbasedev); - } - return ret; + return 0; } void vfio_put_base_device(VFIODevice *vbasedev) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 29caabc149..23fe9fa030 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -195,7 +195,6 @@ static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); -static int vfio_populate_device(VFIODevice *vbasedev); /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -2934,12 +2933,11 @@ static VFIODeviceOps vfio_pci_ops = { .vfio_compute_needs_reset = vfio_pci_compute_needs_reset, .vfio_hot_reset_multi = vfio_pci_hot_reset_multi, .vfio_eoi = vfio_eoi, - .vfio_populate_device = vfio_populate_device, }; -static int vfio_populate_device(VFIODevice *vbasedev) +static int vfio_populate_device(VFIOPCIDevice *vdev) { - VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); + VFIODevice *vbasedev = &vdev->vbasedev; struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; int i, ret = -1; @@ -3248,6 +3246,11 @@ static int vfio_initfn(PCIDevice *pdev) return ret; } + ret = vfio_populate_device(vdev); + if (ret) { + goto out_put; + } + /* Get a copy of config space */ ret = pread(vdev->vbasedev.fd, vdev->pdev.config, MIN(pci_config_size(&vdev->pdev), vdev->config_size), diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h index 1d5bfe8fcb..5f3679b7b2 100644 --- a/include/hw/vfio/vfio-common.h +++ b/include/hw/vfio/vfio-common.h @@ -112,7 +112,6 @@ struct VFIODeviceOps { void (*vfio_compute_needs_reset)(VFIODevice *vdev); int (*vfio_hot_reset_multi)(VFIODevice *vdev); void (*vfio_eoi)(VFIODevice *vdev); - int (*vfio_populate_device)(VFIODevice *vdev); }; typedef struct VFIOGroup { From 77a10d04d033484a913a5ee76eed31a9acc57bae Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Feb 2015 10:25:44 -0700 Subject: [PATCH 3/6] vfio: free dynamically-allocated data in instance_finalize In order to enable out-of-BQL address space lookup, destruction of devices needs to be split in two phases. Unrealize is the first phase; once it complete no new accesses will be started, but there may still be pending memory accesses can still be completed. The second part is freeing the device, which only happens once all memory accesses are complete. At this point the reference count has dropped to zero, an RCU grace period must have completed (because the RCU-protected FlatViews hold a reference to the device via memory_region_ref). This is when instance_finalize is called. Freeing data belongs in an instance_finalize callback, because the dynamically allocated memory can still be used after unrealize by the pending memory accesses. This starts the process by creating an instance_finalize callback and freeing most of the dynamically-allocated data in instance_finalize. Because instance_finalize is also called on error paths or also when the device is actually not realized, the common code needs some changes to be ready for this. The error path in vfio_initfn can be simplified too. Signed-off-by: Paolo Bonzini Signed-off-by: Alex Williamson --- hw/vfio/common.c | 5 ++++- hw/vfio/pci.c | 24 ++++++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 6e299b3dfd..b230ef102b 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -847,7 +847,7 @@ free_group_exit: void vfio_put_group(VFIOGroup *group) { - if (!QLIST_EMPTY(&group->device_list)) { + if (!group || !QLIST_EMPTY(&group->device_list)) { return; } @@ -902,6 +902,9 @@ int vfio_get_device(VFIOGroup *group, const char *name, void vfio_put_base_device(VFIODevice *vbasedev) { + if (!vbasedev->group) { + return; + } QLIST_REMOVE(vbasedev, next); vbasedev->group = NULL; trace_vfio_put_base_device(vbasedev->fd); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 23fe9fa030..0271c801b6 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3248,7 +3248,7 @@ static int vfio_initfn(PCIDevice *pdev) ret = vfio_populate_device(vdev); if (ret) { - goto out_put; + return ret; } /* Get a copy of config space */ @@ -3258,7 +3258,7 @@ static int vfio_initfn(PCIDevice *pdev) if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { ret = ret < 0 ? -errno : -EFAULT; error_report("vfio: Failed to read device config space"); - goto out_put; + return ret; } /* vfio emulates a lot for us, but some bits need extra love */ @@ -3290,7 +3290,7 @@ static int vfio_initfn(PCIDevice *pdev) ret = vfio_early_setup_msix(vdev); if (ret) { - goto out_put; + return ret; } vfio_map_bars(vdev); @@ -3329,17 +3329,24 @@ out_teardown: pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); vfio_teardown_msi(vdev); vfio_unmap_bars(vdev); -out_put: + return ret; +} + +static void vfio_instance_finalize(Object *obj) +{ + PCIDevice *pci_dev = PCI_DEVICE(obj); + VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev); + VFIOGroup *group = vdev->vbasedev.group; + g_free(vdev->emulated_config_bits); + g_free(vdev->rom); vfio_put_device(vdev); vfio_put_group(group); - return ret; } static void vfio_exitfn(PCIDevice *pdev) { VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); - VFIOGroup *group = vdev->vbasedev.group; vfio_unregister_err_notifier(vdev); pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); @@ -3349,10 +3356,6 @@ static void vfio_exitfn(PCIDevice *pdev) } vfio_teardown_msi(vdev); vfio_unmap_bars(vdev); - g_free(vdev->emulated_config_bits); - g_free(vdev->rom); - vfio_put_device(vdev); - vfio_put_group(group); } static void vfio_pci_reset(DeviceState *dev) @@ -3440,6 +3443,7 @@ static const TypeInfo vfio_pci_dev_info = { .instance_size = sizeof(VFIOPCIDevice), .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, + .instance_finalize = vfio_instance_finalize, }; static void register_vfio_pci_dev_type(void) From ba5e6bfa1aee29a8f72c5538c565dfb9889cf273 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Feb 2015 10:25:44 -0700 Subject: [PATCH 4/6] vfio: unmap and free BAR data in instance_finalize In the case of VFIO, the unrealize callback is too early to munmap the BARs. The munmap must be delayed until memory accesses are complete. To do this, split vfio_unmap_bars in two. The removal step, now called vfio_unregister_bars, remains in vfio_exitfn. The reclamation step is vfio_unmap_bars and is moved to the instance_finalize callback. Similarly, quirk MemoryRegions have to be removed during vfio_unregister_bars, but freeing the data structure must be delayed to vfio_unmap_bars. Signed-off-by: Paolo Bonzini Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 65 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 8 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 0271c801b6..fa6a5e9268 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1995,13 +1995,24 @@ static void vfio_vga_quirk_setup(VFIOPCIDevice *vdev) } static void vfio_vga_quirk_teardown(VFIOPCIDevice *vdev) +{ + VFIOQuirk *quirk; + int i; + + for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { + QLIST_FOREACH(quirk, &vdev->vga.region[i].quirks, next) { + memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem); + } + } +} + +static void vfio_vga_quirk_free(VFIOPCIDevice *vdev) { int i; for (i = 0; i < ARRAY_SIZE(vdev->vga.region); i++) { while (!QLIST_EMPTY(&vdev->vga.region[i].quirks)) { VFIOQuirk *quirk = QLIST_FIRST(&vdev->vga.region[i].quirks); - memory_region_del_subregion(&vdev->vga.region[i].mem, &quirk->mem); object_unparent(OBJECT(&quirk->mem)); QLIST_REMOVE(quirk, next); g_free(quirk); @@ -2020,12 +2031,21 @@ static void vfio_bar_quirk_setup(VFIOPCIDevice *vdev, int nr) } static void vfio_bar_quirk_teardown(VFIOPCIDevice *vdev, int nr) +{ + VFIOBAR *bar = &vdev->bars[nr]; + VFIOQuirk *quirk; + + QLIST_FOREACH(quirk, &bar->quirks, next) { + memory_region_del_subregion(&bar->region.mem, &quirk->mem); + } +} + +static void vfio_bar_quirk_free(VFIOPCIDevice *vdev, int nr) { VFIOBAR *bar = &vdev->bars[nr]; while (!QLIST_EMPTY(&bar->quirks)) { VFIOQuirk *quirk = QLIST_FIRST(&bar->quirks); - memory_region_del_subregion(&bar->region.mem, &quirk->mem); object_unparent(OBJECT(&quirk->mem)); QLIST_REMOVE(quirk, next); g_free(quirk); @@ -2281,7 +2301,7 @@ static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled) } } -static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr) +static void vfio_unregister_bar(VFIOPCIDevice *vdev, int nr) { VFIOBAR *bar = &vdev->bars[nr]; @@ -2292,10 +2312,25 @@ static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr) vfio_bar_quirk_teardown(vdev, nr); memory_region_del_subregion(&bar->region.mem, &bar->region.mmap_mem); - munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem)); if (vdev->msix && vdev->msix->table_bar == nr) { memory_region_del_subregion(&bar->region.mem, &vdev->msix->mmap_mem); + } +} + +static void vfio_unmap_bar(VFIOPCIDevice *vdev, int nr) +{ + VFIOBAR *bar = &vdev->bars[nr]; + + if (!bar->region.size) { + return; + } + + vfio_bar_quirk_free(vdev, nr); + + munmap(bar->region.mmap, memory_region_size(&bar->region.mmap_mem)); + + if (vdev->msix && vdev->msix->table_bar == nr) { munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem)); } } @@ -2403,6 +2438,20 @@ static void vfio_map_bars(VFIOPCIDevice *vdev) } } +static void vfio_unregister_bars(VFIOPCIDevice *vdev) +{ + int i; + + for (i = 0; i < PCI_ROM_SLOT; i++) { + vfio_unregister_bar(vdev, i); + } + + if (vdev->has_vga) { + vfio_vga_quirk_teardown(vdev); + pci_unregister_vga(&vdev->pdev); + } +} + static void vfio_unmap_bars(VFIOPCIDevice *vdev) { int i; @@ -2412,8 +2461,7 @@ static void vfio_unmap_bars(VFIOPCIDevice *vdev) } if (vdev->has_vga) { - vfio_vga_quirk_teardown(vdev); - pci_unregister_vga(&vdev->pdev); + vfio_vga_quirk_free(vdev); } } @@ -3328,7 +3376,7 @@ static int vfio_initfn(PCIDevice *pdev) out_teardown: pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); vfio_teardown_msi(vdev); - vfio_unmap_bars(vdev); + vfio_unregister_bars(vdev); return ret; } @@ -3338,6 +3386,7 @@ static void vfio_instance_finalize(Object *obj) VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pci_dev); VFIOGroup *group = vdev->vbasedev.group; + vfio_unmap_bars(vdev); g_free(vdev->emulated_config_bits); g_free(vdev->rom); vfio_put_device(vdev); @@ -3355,7 +3404,7 @@ static void vfio_exitfn(PCIDevice *pdev) timer_free(vdev->intx.mmap_timer); } vfio_teardown_msi(vdev); - vfio_unmap_bars(vdev); + vfio_unregister_bars(vdev); } static void vfio_pci_reset(DeviceState *dev) From 2e6e697e166568fdd09ceaa8c7c8c8c53a5e345b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 10 Feb 2015 10:25:44 -0700 Subject: [PATCH 5/6] vfio: Use vfio type1 v2 IOMMU interface The difference between v1 and v2 is fairly subtle, simply more deterministic behavior for unmaps. The v1 interface allows the user to attempt to unmap sub-regions of previous mappings, returning success with zero size if unable to comply. This was a reflection of the underlying IOMMU API. The v2 interface requires that the user may only unmap fully contained mappings, ie. an unmap cannot intersect or bisect a previous mapping, but may cover multiple mappings. QEMU never made use of the sub-region v1 support anyway, so we can support either v1 or v2. We'll favor v2 since it's newer. Signed-off-by: Alex Williamson --- hw/vfio/common.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b230ef102b..c5d15510dd 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -662,7 +662,10 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) container = g_malloc0(sizeof(*container)); container->space = space; container->fd = fd; - if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU) || + ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) { + bool v2 = !!ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU); + ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { error_report("vfio: failed to set group container: %m"); @@ -670,7 +673,8 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) goto free_container_exit; } - ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); + ret = ioctl(fd, VFIO_SET_IOMMU, + v2 ? VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU); if (ret) { error_report("vfio: failed to set iommu for container: %m"); ret = -errno; From bc5baffa3554e4c0d20c1dbe879aec931866bd69 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 10 Feb 2015 10:25:44 -0700 Subject: [PATCH 6/6] vfio: Fix debug message compile error This fixes a compiler error which occurs if DEBUG_VFIO is defined. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index fa6a5e9268..84e9d995aa 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -516,7 +516,7 @@ static void vfio_msi_interrupt(void *opaque) abort(); } - trace_vfio_msi_interrupt(vbasedev->name, nr, msg.address, msg.data); + trace_vfio_msi_interrupt(vdev->vbasedev.name, nr, msg.address, msg.data); #endif if (vdev->interrupt == VFIO_INT_MSIX) {