/*
 * Virtio MEM PCI device
 *
 * Copyright (C) 2020 Red Hat, Inc.
 *
 * Authors:
 *  David Hildenbrand <david@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "virtio-mem-pci.h"
#include "hw/mem/memory-device.h"
#include "qapi/error.h"
#include "qapi/qapi-events-machine.h"
#include "qapi/qapi-events-misc.h"
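
/*
 * Illustrative usage sketch (exact properties depend on the QEMU version):
 *
 *   -object memory-backend-ram,id=mem0,size=8G \
 *   -device virtio-mem-pci,id=vmem0,memdev=mem0,requested-size=1G
 *
 * The backend's "size" caps the device, while "requested-size" can later be
 * changed (e.g. via qom-set) to plug or unplug memory.
 */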

static void virtio_mem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
{
    VirtIOMEMPCI *mem_pci = VIRTIO_MEM_PCI(vpci_dev);
    DeviceState *vdev = DEVICE(&mem_pci->vdev);

    virtio_pci_force_virtio_1(vpci_dev);
    qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
}
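
/*
 * The MemoryDeviceClass address hooks simply forward to the proxied
 * virtio-mem device's VIRTIO_MEM_ADDR_PROP QOM property.
 */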
static void virtio_mem_pci_set_addr(MemoryDeviceState *md, uint64_t addr,
                                    Error **errp)
{
    object_property_set_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP, addr, errp);
}

static uint64_t virtio_mem_pci_get_addr(const MemoryDeviceState *md)
{
    return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP,
                                    &error_abort);
}

static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md,
                                                      Error **errp)
{
    VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
    VirtIOMEM *vmem = &pci_mem->vdev;
    VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);

    return vmc->get_memory_region(vmem, errp);
}
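
/*
 * Memslot hooks: let the virtio-mem device decide how many memslots to use
 * for exposing its memory region, given the limit computed by the memory
 * device core, and report that decision back to the core.
 */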
static void virtio_mem_pci_decide_memslots(MemoryDeviceState *md,
                                           unsigned int limit)
{
    VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
    VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
    VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);

    vmc->decide_memslots(vmem, limit);
}

static unsigned int virtio_mem_pci_get_memslots(MemoryDeviceState *md)
{
    VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
    VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
    VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);

    return vmc->get_memslots(vmem);
}

static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md,
                                                Error **errp)
{
    return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP,
                                    errp);
}

static void virtio_mem_pci_fill_device_info(const MemoryDeviceState *md,
                                            MemoryDeviceInfo *info)
{
    VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1);
    VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
    VirtIOMEM *vmem = &pci_mem->vdev;
    VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
    DeviceState *dev = DEVICE(md);

    if (dev->id) {
        vi->id = g_strdup(dev->id);
    }

    /* let the real device handle everything else */
    vpc->fill_device_info(vmem, vi);

    info->u.virtio_mem.data = vi;
    info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM;
}

static uint64_t virtio_mem_pci_get_min_alignment(const MemoryDeviceState *md)
{
    return object_property_get_uint(OBJECT(md), VIRTIO_MEM_BLOCK_SIZE_PROP,
                                    &error_abort);
}
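
/*
 * Called whenever the plugged size of the proxied virtio-mem device changes;
 * emits the MEMORY_DEVICE_SIZE_CHANGE QAPI event with the device id and QOM
 * path.
 */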
static void virtio_mem_pci_size_change_notify(Notifier *notifier, void *data)
{
    VirtIOMEMPCI *pci_mem = container_of(notifier, VirtIOMEMPCI,
                                         size_change_notifier);
    DeviceState *dev = DEVICE(pci_mem);
    char *qom_path = object_get_canonical_path(OBJECT(dev));
    const uint64_t * const size_p = data;

    qapi_event_send_memory_device_size_change(dev->id, *size_p, qom_path);
    g_free(qom_path);
}

static void virtio_mem_pci_unplug_request_check(VirtIOMDPCI *vmd, Error **errp)
{
    VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(vmd);
    VirtIOMEM *vmem = &pci_mem->vdev;
    VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);

    vpc->unplug_request_check(vmem, errp);
}
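
/*
 * The "requested-size" property is not a plain alias: reads are forwarded to
 * the virtio-mem device, but writes are rejected once the device is in the
 * process of being unplugged (see virtio_mem_pci_unplug_request_check()).
 */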
static void virtio_mem_pci_get_requested_size(Object *obj, Visitor *v,
                                              const char *name, void *opaque,
                                              Error **errp)
{
    VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(obj);

    object_property_get(OBJECT(&pci_mem->vdev), name, v, errp);
}

static void virtio_mem_pci_set_requested_size(Object *obj, Visitor *v,
                                              const char *name, void *opaque,
                                              Error **errp)
{
    VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(obj);
    DeviceState *dev = DEVICE(obj);

    /*
     * If we passed virtio_mem_pci_unplug_request_check(), making sure that
     * the requested size is 0, don't allow modifying the requested size
     * anymore, otherwise the VM might end up hotplugging memory before
     * handling the unplug request.
     */
    if (dev->pending_deleted_event) {
        error_setg(errp, "'%s' cannot be changed if the device is in the"
                   " process of unplug", name);
        return;
    }

    object_property_set(OBJECT(&pci_mem->vdev), name, v, errp);
}
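
/*
 * Wire up the virtio-pci realize hook, the memory device callbacks and the
 * virtio-md-pci unplug check for this proxy type.
 */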
static void virtio_mem_pci_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
    MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
    VirtIOMDPCIClass *vmdc = VIRTIO_MD_PCI_CLASS(klass);

    k->realize = virtio_mem_pci_realize;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
    pcidev_k->class_id = PCI_CLASS_OTHERS;

    mdc->get_addr = virtio_mem_pci_get_addr;
    mdc->set_addr = virtio_mem_pci_set_addr;
    mdc->get_plugged_size = virtio_mem_pci_get_plugged_size;
    mdc->get_memory_region = virtio_mem_pci_get_memory_region;
    mdc->decide_memslots = virtio_mem_pci_decide_memslots;
    mdc->get_memslots = virtio_mem_pci_get_memslots;
    mdc->fill_device_info = virtio_mem_pci_fill_device_info;
    mdc->get_min_alignment = virtio_mem_pci_get_min_alignment;

    vmdc->unplug_request_check = virtio_mem_pci_unplug_request_check;
}

static void virtio_mem_pci_instance_init(Object *obj)
{
    VirtIOMEMPCI *dev = VIRTIO_MEM_PCI(obj);
    VirtIOMEMClass *vmc;
    VirtIOMEM *vmem;

    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
                                TYPE_VIRTIO_MEM);

    dev->size_change_notifier.notify = virtio_mem_pci_size_change_notify;
    vmem = &dev->vdev;
    vmc = VIRTIO_MEM_GET_CLASS(vmem);
    /*
     * We never remove the notifier again, as we expect both devices to
     * disappear at the same time.
     */
    vmc->add_size_change_notifier(vmem, &dev->size_change_notifier);

    object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP,
                              OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP);
    object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev),
                              VIRTIO_MEM_SIZE_PROP);
    object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
                        virtio_mem_pci_get_requested_size,
                        virtio_mem_pci_set_requested_size, NULL, NULL);
}

static const VirtioPCIDeviceTypeInfo virtio_mem_pci_info = {
    .base_name = TYPE_VIRTIO_MEM_PCI,
    .parent = TYPE_VIRTIO_MD_PCI,
    .generic_name = "virtio-mem-pci",
    .instance_size = sizeof(VirtIOMEMPCI),
    .instance_init = virtio_mem_pci_instance_init,
    .class_init = virtio_mem_pci_class_init,
};

static void virtio_mem_pci_register_types(void)
{
    virtio_pci_types_register(&virtio_mem_pci_info);
}
type_init(virtio_mem_pci_register_types)