diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c index 9cf5b84045..7437c2e3c3 100644 --- a/hw/misc/vfio.c +++ b/hw/misc/vfio.c @@ -133,6 +133,15 @@ enum { VFIO_INT_MSIX = 3, }; +typedef struct VFIOAddressSpace { + AddressSpace *as; + QLIST_HEAD(, VFIOContainer) containers; + QLIST_ENTRY(VFIOAddressSpace) list; +} VFIOAddressSpace; + +static QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces = + QLIST_HEAD_INITIALIZER(vfio_address_spaces); + struct VFIOGroup; typedef struct VFIOType1 { @@ -142,6 +151,7 @@ typedef struct VFIOType1 { } VFIOType1; typedef struct VFIOContainer { + VFIOAddressSpace *space; int fd; /* /dev/vfio/vfio, empowered by the attached groups */ struct { /* enable abstraction to support various iommu backends */ @@ -150,10 +160,18 @@ typedef struct VFIOContainer { }; void (*release)(struct VFIOContainer *); } iommu_data; + QLIST_HEAD(, VFIOGuestIOMMU) giommu_list; QLIST_HEAD(, VFIOGroup) group_list; QLIST_ENTRY(VFIOContainer) next; } VFIOContainer; +typedef struct VFIOGuestIOMMU { + VFIOContainer *container; + MemoryRegion *iommu; + Notifier n; + QLIST_ENTRY(VFIOGuestIOMMU) giommu_next; +} VFIOGuestIOMMU; + /* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */ typedef struct VFIOMSIXInfo { uint8_t table_bar; @@ -234,9 +252,6 @@ static const VFIORomBlacklistEntry romblacklist[] = { #define MSIX_CAP_LENGTH 12 -static QLIST_HEAD(, VFIOContainer) - container_list = QLIST_HEAD_INITIALIZER(container_list); - static QLIST_HEAD(, VFIOGroup) group_list = QLIST_HEAD_INITIALIZER(group_list); @@ -1668,6 +1683,149 @@ static void vfio_probe_ati_bar4_window_quirk(VFIODevice *vdev, int nr) vdev->host.function); } +#define PCI_VENDOR_ID_REALTEK 0x10ec + +/* + * RTL8168 devices have a backdoor that can access the MSI-X table. At BAR2 + * offset 0x70 there is a dword data register, offset 0x74 is a dword address + * register. According to the Linux r8169 driver, the MSI-X table is addressed + * when the "type" portion of the address register is set to 0x1. This appears + * to be bits 16:30. Bit 31 is both a write indicator and some sort of + * "address latched" indicator. Bits 12:15 are a mask field, which we can + * ignore because the MSI-X table should always be accessed as a dword (full + * mask). Bits 0:11 is offset within the type. + * + * Example trace: + * + * Read from MSI-X table offset 0 + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x1f000, 4) // store read addr + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x8001f000 // latch + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x70, 4) = 0xfee00398 // read data + * + * Write 0xfee00000 to MSI-X table offset 0 + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x70, 0xfee00000, 4) // write data + * vfio: vfio_bar_write(0000:05:00.0:BAR2+0x74, 0x8001f000, 4) // do write + * vfio: vfio_bar_read(0000:05:00.0:BAR2+0x74, 4) = 0x1f000 // complete + */ + +static uint64_t vfio_rtl8168_window_quirk_read(void *opaque, + hwaddr addr, unsigned size) +{ + VFIOQuirk *quirk = opaque; + VFIODevice *vdev = quirk->vdev; + + switch (addr) { + case 4: /* address */ + if (quirk->data.flags) { + DPRINTF("%s fake read(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + return quirk->data.address_match ^ 0x10000000U; + } + break; + case 0: /* data */ + if (quirk->data.flags) { + uint64_t val; + + DPRINTF("%s MSI-X table read(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + if (!(vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX)) { + return 0; + } + + io_mem_read(&vdev->pdev.msix_table_mmio, + (hwaddr)(quirk->data.address_match & 0xfff), + &val, size); + return val; + } + } + + DPRINTF("%s direct read(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + return vfio_bar_read(&vdev->bars[quirk->data.bar], addr + 0x70, size); +} + +static void vfio_rtl8168_window_quirk_write(void *opaque, hwaddr addr, + uint64_t data, unsigned size) +{ + VFIOQuirk *quirk = opaque; + VFIODevice *vdev = quirk->vdev; + + switch (addr) { + case 4: /* address */ + if ((data & 0x7fff0000) == 0x10000) { + if (data & 0x10000000U && + vdev->pdev.cap_present & QEMU_PCI_CAP_MSIX) { + + DPRINTF("%s MSI-X table write(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + io_mem_write(&vdev->pdev.msix_table_mmio, + (hwaddr)(quirk->data.address_match & 0xfff), + data, size); + } + + quirk->data.flags = 1; + quirk->data.address_match = data; + + return; + } + quirk->data.flags = 0; + break; + case 0: /* data */ + quirk->data.address_mask = data; + break; + } + + DPRINTF("%s direct write(%04x:%02x:%02x.%d)\n", + memory_region_name(&quirk->mem), vdev->host.domain, + vdev->host.bus, vdev->host.slot, vdev->host.function); + + vfio_bar_write(&vdev->bars[quirk->data.bar], addr + 0x70, data, size); +} + +static const MemoryRegionOps vfio_rtl8168_window_quirk = { + .read = vfio_rtl8168_window_quirk_read, + .write = vfio_rtl8168_window_quirk_write, + .valid = { + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void vfio_probe_rtl8168_bar2_window_quirk(VFIODevice *vdev, int nr) +{ + PCIDevice *pdev = &vdev->pdev; + VFIOQuirk *quirk; + + if (pci_get_word(pdev->config + PCI_VENDOR_ID) != PCI_VENDOR_ID_REALTEK || + pci_get_word(pdev->config + PCI_DEVICE_ID) != 0x8168 || nr != 2) { + return; + } + + quirk = g_malloc0(sizeof(*quirk)); + quirk->vdev = vdev; + quirk->data.bar = nr; + + memory_region_init_io(&quirk->mem, OBJECT(vdev), &vfio_rtl8168_window_quirk, + quirk, "vfio-rtl8168-window-quirk", 8); + memory_region_add_subregion_overlap(&vdev->bars[nr].mem, + 0x70, &quirk->mem, 1); + + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); + + DPRINTF("Enabled RTL8168 BAR2 window quirk for device %04x:%02x:%02x.%x\n", + vdev->host.domain, vdev->host.bus, vdev->host.slot, + vdev->host.function); +} /* * Trap the BAR2 MMIO window to config space as well. */ @@ -2071,6 +2229,7 @@ static void vfio_bar_quirk_setup(VFIODevice *vdev, int nr) vfio_probe_nvidia_bar5_window_quirk(vdev, nr); vfio_probe_nvidia_bar0_88000_quirk(vdev, nr); vfio_probe_nvidia_bar0_1800_quirk(vdev, nr); + vfio_probe_rtl8168_bar2_window_quirk(vdev, nr); } static void vfio_bar_quirk_teardown(VFIODevice *vdev, int nr) @@ -2232,7 +2391,8 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr iova, static bool vfio_listener_skipped_section(MemoryRegionSection *section) { - return !memory_region_is_ram(section->mr) || + return (!memory_region_is_ram(section->mr) && + !memory_region_is_iommu(section->mr)) || /* * Sizing an enabled 64-bit BAR can cause spurious mappings to * addresses in the upper part of the 64-bit address space. These @@ -2242,17 +2402,75 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section) section->offset_within_address_space & (1ULL << 63); } +static void vfio_iommu_map_notify(Notifier *n, void *data) +{ + VFIOGuestIOMMU *giommu = container_of(n, VFIOGuestIOMMU, n); + VFIOContainer *container = giommu->container; + IOMMUTLBEntry *iotlb = data; + MemoryRegion *mr; + hwaddr xlat; + hwaddr len = iotlb->addr_mask + 1; + void *vaddr; + int ret; + + DPRINTF("iommu map @ %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", + iotlb->iova, iotlb->iova + iotlb->addr_mask); + + /* + * The IOMMU TLB entry we have just covers translation through + * this IOMMU to its immediate target. We need to translate + * it the rest of the way through to memory. + */ + mr = address_space_translate(&address_space_memory, + iotlb->translated_addr, + &xlat, &len, iotlb->perm & IOMMU_WO); + if (!memory_region_is_ram(mr)) { + DPRINTF("iommu map to non memory area %"HWADDR_PRIx"\n", + xlat); + return; + } + /* + * Translation truncates length to the IOMMU page size, + * check that it did not truncate too much. + */ + if (len & iotlb->addr_mask) { + DPRINTF("iommu has granularity incompatible with target AS\n"); + return; + } + + if (iotlb->perm != IOMMU_NONE) { + vaddr = memory_region_get_ram_ptr(mr) + xlat; + + ret = vfio_dma_map(container, iotlb->iova, + iotlb->addr_mask + 1, vaddr, + !(iotlb->perm & IOMMU_WO) || mr->readonly); + if (ret) { + error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx", %p) = %d (%m)", + container, iotlb->iova, + iotlb->addr_mask + 1, vaddr, ret); + } + } else { + ret = vfio_dma_unmap(container, iotlb->iova, iotlb->addr_mask + 1); + if (ret) { + error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", " + "0x%"HWADDR_PRIx") = %d (%m)", + container, iotlb->iova, + iotlb->addr_mask + 1, ret); + } + } +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { VFIOContainer *container = container_of(listener, VFIOContainer, iommu_data.type1.listener); hwaddr iova, end; + Int128 llend; void *vaddr; int ret; - assert(!memory_region_is_iommu(section->mr)); - if (vfio_listener_skipped_section(section)) { DPRINTF("SKIPPING region_add %"HWADDR_PRIx" - %"PRIx64"\n", section->offset_within_address_space, @@ -2268,21 +2486,65 @@ static void vfio_listener_region_add(MemoryListener *listener, } iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); - end = (section->offset_within_address_space + int128_get64(section->size)) & - TARGET_PAGE_MASK; + llend = int128_make64(section->offset_within_address_space); + llend = int128_add(llend, section->size); + llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK)); - if (iova >= end) { + if (int128_ge(int128_make64(iova), llend)) { return; } + memory_region_ref(section->mr); + + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + DPRINTF("region_add [iommu] %"HWADDR_PRIx" - %"HWADDR_PRIx"\n", + iova, int128_get64(int128_sub(llend, int128_one()))); + /* + * FIXME: We should do some checking to see if the + * capabilities of the host VFIO IOMMU are adequate to model + * the guest IOMMU + * + * FIXME: For VFIO iommu types which have KVM acceleration to + * avoid bouncing all map/unmaps through qemu this way, this + * would be the right place to wire that up (tell the KVM + * device emulation the VFIO iommu handles to use). + */ + /* + * This assumes that the guest IOMMU is empty of + * mappings at this point. + * + * One way of doing this is: + * 1. Avoid sharing IOMMUs between emulated devices or different + * IOMMU groups. + * 2. Implement VFIO_IOMMU_ENABLE in the host kernel to fail if + * there are some mappings in IOMMU. + * + * VFIO on SPAPR does that. Other IOMMU models may do that different, + * they must make sure there are no existing mappings or + * loop through existing mappings to map them into VFIO. + */ + giommu = g_malloc0(sizeof(*giommu)); + giommu->iommu = section->mr; + giommu->container = container; + giommu->n.notify = vfio_iommu_map_notify; + QLIST_INSERT_HEAD(&container->giommu_list, giommu, giommu_next); + memory_region_register_iommu_notifier(giommu->iommu, &giommu->n); + + return; + } + + /* Here we assume that memory_region_is_ram(section->mr)==true */ + + end = int128_get64(llend); vaddr = memory_region_get_ram_ptr(section->mr) + section->offset_within_region + (iova - section->offset_within_address_space); - DPRINTF("region_add %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n", + DPRINTF("region_add [ram] %"HWADDR_PRIx" - %"HWADDR_PRIx" [%p]\n", iova, end - 1, vaddr); - memory_region_ref(section->mr); ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly); if (ret) { error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", " @@ -2326,6 +2588,27 @@ static void vfio_listener_region_del(MemoryListener *listener, return; } + if (memory_region_is_iommu(section->mr)) { + VFIOGuestIOMMU *giommu; + + QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { + if (giommu->iommu == section->mr) { + memory_region_unregister_iommu_notifier(&giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + break; + } + } + + /* + * FIXME: We assume the one big unmap below is adequate to + * remove any individual page mappings in the IOMMU which + * might have been copied into VFIO. This works for a page table + * based IOMMU where a big unmap flattens a large range of IO-PTEs. + * That may not be true for all IOMMU types. + */ + } + iova = TARGET_PAGE_ALIGN(section->offset_within_address_space); end = (section->offset_within_address_space + int128_get64(section->size)) & TARGET_PAGE_MASK; @@ -3274,16 +3557,43 @@ static void vfio_kvm_device_del_group(VFIOGroup *group) #endif } -static int vfio_connect_container(VFIOGroup *group) +static VFIOAddressSpace *vfio_get_address_space(AddressSpace *as) +{ + VFIOAddressSpace *space; + + QLIST_FOREACH(space, &vfio_address_spaces, list) { + if (space->as == as) { + return space; + } + } + + /* No suitable VFIOAddressSpace, create a new one */ + space = g_malloc0(sizeof(*space)); + space->as = as; + QLIST_INIT(&space->containers); + + QLIST_INSERT_HEAD(&vfio_address_spaces, space, list); + + return space; +} + +static void vfio_put_address_space(VFIOAddressSpace *space) +{ + if (QLIST_EMPTY(&space->containers)) { + QLIST_REMOVE(space, list); + g_free(space); + } +} + +static int vfio_connect_container(VFIOGroup *group, AddressSpace *as) { VFIOContainer *container; int ret, fd; + VFIOAddressSpace *space; - if (group->container) { - return 0; - } + space = vfio_get_address_space(as); - QLIST_FOREACH(container, &container_list, next) { + QLIST_FOREACH(container, &space->containers, next) { if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); @@ -3294,35 +3604,35 @@ static int vfio_connect_container(VFIOGroup *group) fd = qemu_open("/dev/vfio/vfio", O_RDWR); if (fd < 0) { error_report("vfio: failed to open /dev/vfio/vfio: %m"); - return -errno; + ret = -errno; + goto put_space_exit; } ret = ioctl(fd, VFIO_GET_API_VERSION); if (ret != VFIO_API_VERSION) { error_report("vfio: supported vfio version: %d, " "reported version: %d", VFIO_API_VERSION, ret); - close(fd); - return -EINVAL; + ret = -EINVAL; + goto close_fd_exit; } container = g_malloc0(sizeof(*container)); + container->space = space; container->fd = fd; if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd); if (ret) { error_report("vfio: failed to set group container: %m"); - g_free(container); - close(fd); - return -errno; + ret = -errno; + goto free_container_exit; } ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU); if (ret) { error_report("vfio: failed to set iommu for container: %m"); - g_free(container); - close(fd); - return -errno; + ret = -errno; + goto free_container_exit; } container->iommu_data.type1.listener = vfio_memory_listener; @@ -3333,29 +3643,39 @@ static int vfio_connect_container(VFIOGroup *group) if (container->iommu_data.type1.error) { ret = container->iommu_data.type1.error; - vfio_listener_release(container); - g_free(container); - close(fd); error_report("vfio: memory listener initialization failed for container"); - return ret; + goto listener_release_exit; } container->iommu_data.type1.initialized = true; } else { error_report("vfio: No available IOMMU models"); - g_free(container); - close(fd); - return -EINVAL; + ret = -EINVAL; + goto free_container_exit; } QLIST_INIT(&container->group_list); - QLIST_INSERT_HEAD(&container_list, container, next); + QLIST_INSERT_HEAD(&space->containers, container, next); group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); return 0; + +listener_release_exit: + vfio_listener_release(container); + +free_container_exit: + g_free(container); + +close_fd_exit: + close(fd); + +put_space_exit: + vfio_put_address_space(space); + + return ret; } static void vfio_disconnect_container(VFIOGroup *group) @@ -3371,6 +3691,8 @@ static void vfio_disconnect_container(VFIOGroup *group) group->container = NULL; if (QLIST_EMPTY(&container->group_list)) { + VFIOAddressSpace *space = container->space; + if (container->iommu_data.release) { container->iommu_data.release(container); } @@ -3378,10 +3700,12 @@ static void vfio_disconnect_container(VFIOGroup *group) DPRINTF("vfio_disconnect_container: close container->fd\n"); close(container->fd); g_free(container); + + vfio_put_address_space(space); } } -static VFIOGroup *vfio_get_group(int groupid) +static VFIOGroup *vfio_get_group(int groupid, AddressSpace *as) { VFIOGroup *group; char path[32]; @@ -3389,7 +3713,14 @@ static VFIOGroup *vfio_get_group(int groupid) QLIST_FOREACH(group, &group_list, next) { if (group->groupid == groupid) { - return group; + /* Found it. Now is it already in the right context? */ + if (group->container->space->as == as) { + return group; + } else { + error_report("vfio: group %d used in multiple address spaces", + group->groupid); + return NULL; + } } } @@ -3399,34 +3730,27 @@ static VFIOGroup *vfio_get_group(int groupid) group->fd = qemu_open(path, O_RDWR); if (group->fd < 0) { error_report("vfio: error opening %s: %m", path); - g_free(group); - return NULL; + goto free_group_exit; } if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) { error_report("vfio: error getting group status: %m"); - close(group->fd); - g_free(group); - return NULL; + goto close_fd_exit; } if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) { error_report("vfio: error, group %d is not viable, please ensure " "all devices within the iommu_group are bound to their " "vfio bus driver.", groupid); - close(group->fd); - g_free(group); - return NULL; + goto close_fd_exit; } group->groupid = groupid; QLIST_INIT(&group->device_list); - if (vfio_connect_container(group)) { + if (vfio_connect_container(group, as)) { error_report("vfio: failed to setup container for group %d", groupid); - close(group->fd); - g_free(group); - return NULL; + goto close_fd_exit; } if (QLIST_EMPTY(&group_list)) { @@ -3438,6 +3762,14 @@ static VFIOGroup *vfio_get_group(int groupid) vfio_kvm_device_add_group(group); return group; + +close_fd_exit: + close(group->fd); + +free_group_exit: + g_free(group); + + return NULL; } static void vfio_put_group(VFIOGroup *group) @@ -3768,7 +4100,7 @@ static int vfio_initfn(PCIDevice *pdev) DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function, groupid); - group = vfio_get_group(groupid); + group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev)); if (!group) { error_report("vfio: failed to get group %d", groupid); return -ENOENT; diff --git a/include/qemu/int128.h b/include/qemu/int128.h index f59703143a..fb782aaddd 100644 --- a/include/qemu/int128.h +++ b/include/qemu/int128.h @@ -38,6 +38,11 @@ static inline Int128 int128_2_64(void) return (Int128) { 0, 1 }; } +static inline Int128 int128_exts64(int64_t a) +{ + return (Int128) { .lo = a, .hi = (a < 0) ? -1 : 0 }; +} + static inline Int128 int128_and(Int128 a, Int128 b) { return (Int128) { a.lo & b.lo, a.hi & b.hi }; diff --git a/memory.c b/memory.c index 3f1df238e2..678661e2bb 100644 --- a/memory.c +++ b/memory.c @@ -1722,12 +1722,19 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name) void address_space_destroy(AddressSpace *as) { + MemoryListener *listener; + /* Flush out anything from MemoryListeners listening in on this */ memory_region_transaction_begin(); as->root = NULL; memory_region_transaction_commit(); QTAILQ_REMOVE(&address_spaces, as, address_spaces_link); address_space_destroy_dispatch(as); + + QTAILQ_FOREACH(listener, &memory_listeners, link) { + assert(listener->address_space_filter != as); + } + flatview_unref(as->current_map); g_free(as->name); g_free(as->ioeventfds);