Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging

virtio,acpi: features, fixes, cleanups.

vdpa support
virtio-mem support
a handy script for disassembling acpi tables
misc fixes and cleanups

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# gpg: Signature made Tue 07 Jul 2020 13:00:35 BST
# gpg:                using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469
# gpg:                issuer "mst@redhat.com"
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg:                 aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17  0970 C350 3912 AFBE 8E67
#      Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA  8A0D 281F 0DB8 D28D 5469

* remotes/mst/tags/for_upstream: (41 commits)
  vhost-vdpa: introduce vhost-vdpa net client
  vhost-vdpa: introduce vhost-vdpa backend
  vhost_net: introduce set_config & get_config
  vhost: implement vhost_force_iommu method
  vhost: introduce new VhostOps vhost_force_iommu
  vhost: implement vhost_vq_get_addr method
  vhost: introduce new VhostOps vhost_vq_get_addr
  vhost: implement vhost_dev_start method
  vhost: introduce new VhostOps vhost_dev_start
  vhost: check the existence of vhost_set_iotlb_callback
  virtio-pci: implement queue_enabled method
  virtio-bus: introduce queue_enabled method
  vhost_net: use the function qemu_get_peer
  net: introduce qemu_get_peer
  MAINTAINERS: add VT-d entry
  docs: vhost-user: add Virtio status protocol feature
  tests/acpi: remove stale allowed tables
  numa: Auto-enable NUMA when any memory devices are possible
  virtio-mem: Exclude unplugged memory during migration
  virtio-mem: Add trace events
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

# Conflicts:
#	hw/arm/virt.c
#	hw/virtio/trace-events

MAINTAINERS

@ -1792,6 +1792,15 @@ F: hw/virtio/virtio-crypto.c
F: hw/virtio/virtio-crypto-pci.c
F: include/hw/virtio/virtio-crypto.h
virtio-mem
M: David Hildenbrand <david@redhat.com>
S: Supported
W: https://virtio-mem.gitlab.io/
F: hw/virtio/virtio-mem.c
F: hw/virtio/virtio-mem-pci.h
F: hw/virtio/virtio-mem-pci.c
F: include/hw/virtio/virtio-mem.h
nvme
M: Keith Busch <kbusch@kernel.org>
L: qemu-block@nongnu.org
@ -2617,6 +2626,15 @@ F: tests/uefi-test-tools/
F: .gitlab-ci.d/edk2.yml
F: .gitlab-ci.d/edk2/
VT-d Emulation
M: Michael S. Tsirkin <mst@redhat.com>
M: Peter Xu <peterx@redhat.com>
R: Jason Wang <jasowang@redhat.com>
S: Supported
F: hw/i386/intel_iommu.c
F: hw/i386/intel_iommu_internal.h
F: include/hw/i386/intel_iommu.h
Usermode Emulation
------------------
Overall usermode emulation

accel/kvm/kvm-all.c

@ -40,7 +40,6 @@
#include "trace.h"
#include "hw/irq.h"
#include "sysemu/sev.h"
#include "sysemu/balloon.h"
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
@ -2229,7 +2228,8 @@ static int kvm_init(MachineState *ms)
s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
if (!s->sync_mmu) {
qemu_balloon_inhibit(true);
ret = ram_block_discard_disable(true);
assert(!ret);
}
return 0;

balloon.c

@ -36,23 +36,6 @@
static QEMUBalloonEvent *balloon_event_fn;
static QEMUBalloonStatus *balloon_stat_fn;
static void *balloon_opaque;
static int balloon_inhibit_count;
bool qemu_balloon_is_inhibited(void)
{
return atomic_read(&balloon_inhibit_count) > 0;
}
void qemu_balloon_inhibit(bool state)
{
if (state) {
atomic_inc(&balloon_inhibit_count);
} else {
atomic_dec(&balloon_inhibit_count);
}
assert(atomic_read(&balloon_inhibit_count) >= 0);
}
static bool have_balloon(Error **errp)
{

configure

@ -1575,6 +1575,10 @@ for opt do
;;
--enable-vhost-user) vhost_user="yes"
;;
--disable-vhost-vdpa) vhost_vdpa="no"
;;
--enable-vhost-vdpa) vhost_vdpa="yes"
;;
--disable-vhost-kernel) vhost_kernel="no"
;;
--enable-vhost-kernel) vhost_kernel="yes"
@ -1883,6 +1887,7 @@ disabled with --disable-FEATURE, default is enabled if available:
vhost-crypto vhost-user-crypto backend support
vhost-kernel vhost kernel backend support
vhost-user vhost-user backend support
vhost-vdpa vhost-vdpa kernel backend support
spice spice
rbd rados block device (rbd)
libiscsi iscsi support
@ -2394,6 +2399,10 @@ test "$vhost_user" = "" && vhost_user=yes
if test "$vhost_user" = "yes" && test "$mingw32" = "yes"; then
error_exit "vhost-user isn't available on win32"
fi
test "$vhost_vdpa" = "" && vhost_vdpa=$linux
if test "$vhost_vdpa" = "yes" && test "$linux" != "yes"; then
error_exit "vhost-vdpa is only available on Linux"
fi
test "$vhost_kernel" = "" && vhost_kernel=$linux
if test "$vhost_kernel" = "yes" && test "$linux" != "yes"; then
error_exit "vhost-kernel is only available on Linux"
@ -2422,6 +2431,11 @@ test "$vhost_user_fs" = "" && vhost_user_fs=$vhost_user
if test "$vhost_user_fs" = "yes" && test "$vhost_user" = "no"; then
error_exit "--enable-vhost-user-fs requires --enable-vhost-user"
fi
#vhost-vdpa backends
test "$vhost_net_vdpa" = "" && vhost_net_vdpa=$vhost_vdpa
if test "$vhost_net_vdpa" = "yes" && test "$vhost_vdpa" = "no"; then
error_exit "--enable-vhost-net-vdpa requires --enable-vhost-vdpa"
fi
# OR the vhost-kernel and vhost-user values for simplicity
if test "$vhost_net" = ""; then
@ -6947,6 +6961,7 @@ echo "vhost-scsi support $vhost_scsi"
echo "vhost-vsock support $vhost_vsock"
echo "vhost-user support $vhost_user"
echo "vhost-user-fs support $vhost_user_fs"
echo "vhost-vdpa support $vhost_vdpa"
echo "Trace backends $trace_backends"
if have_backend "simple"; then
echo "Trace output file $trace_file-<pid>"
@ -7454,6 +7469,9 @@ fi
if test "$vhost_net_user" = "yes" ; then
echo "CONFIG_VHOST_NET_USER=y" >> $config_host_mak
fi
if test "$vhost_net_vdpa" = "yes" ; then
echo "CONFIG_VHOST_NET_VDPA=y" >> $config_host_mak
fi
if test "$vhost_crypto" = "yes" ; then
echo "CONFIG_VHOST_CRYPTO=y" >> $config_host_mak
fi
@ -7469,6 +7487,9 @@ fi
if test "$vhost_user" = "yes" ; then
echo "CONFIG_VHOST_USER=y" >> $config_host_mak
fi
if test "$vhost_vdpa" = "yes" ; then
echo "CONFIG_VHOST_VDPA=y" >> $config_host_mak
fi
if test "$vhost_user_fs" = "yes" ; then
echo "CONFIG_VHOST_USER_FS=y" >> $config_host_mak
fi
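As a usage sketch of the switches added above (assuming a standard Linux build host): vDPA support can be toggled explicitly with ``./configure --enable-vhost-vdpa`` or ``--disable-vhost-vdpa``; on Linux it defaults to enabled, and vhost-net-vdpa follows the vhost-vdpa setting unless overridden with ``--enable-vhost-net-vdpa``/``--disable-vhost-net-vdpa``.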

docs/interop/index.rst

@ -20,3 +20,4 @@ Contents:
qemu-ga
vhost-user
vhost-user-gpu
vhost-vdpa

docs/interop/vhost-user.rst

@ -816,6 +816,7 @@ Protocol features
#define VHOST_USER_PROTOCOL_F_RESET_DEVICE 13
#define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14
#define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15
#define VHOST_USER_PROTOCOL_F_STATUS 16
Master message types
--------------------
@ -1307,6 +1308,29 @@ Master message types
``VHOST_USER_ADD_MEM_REG`` message, this message is used to set and
update the memory tables of the slave device.
``VHOST_USER_SET_STATUS``
:id: 39
:equivalent ioctl: VHOST_VDPA_SET_STATUS
:slave payload: N/A
:master payload: ``u64``
When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been
successfully negotiated, this message is submitted by the master to
notify the backend of the updated device status as defined in the Virtio
specification.
``VHOST_USER_GET_STATUS``
:id: 40
:equivalent ioctl: VHOST_VDPA_GET_STATUS
:slave payload: ``u64``
:master payload: N/A
When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been
successfully negotiated, this message is submitted by the master to
query the backend for its device status as defined in the Virtio
specification.
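For illustration, here is a minimal C sketch of how a master might build the
``u64`` payload for ``VHOST_USER_SET_STATUS``. The helper name is
hypothetical; the status bits are the standard virtio ones from
``standard-headers/linux/virtio_config.h``:

    #include <stdint.h>

    /* Standard virtio device status bits. */
    #define VIRTIO_CONFIG_S_ACKNOWLEDGE 1
    #define VIRTIO_CONFIG_S_DRIVER      2
    #define VIRTIO_CONFIG_S_DRIVER_OK   4
    #define VIRTIO_CONFIG_S_FEATURES_OK 8

    /* Hypothetical helper: status after successful feature negotiation. */
    static uint64_t vhost_user_status_payload(void)
    {
        uint8_t status = VIRTIO_CONFIG_S_ACKNOWLEDGE |
                         VIRTIO_CONFIG_S_DRIVER |
                         VIRTIO_CONFIG_S_FEATURES_OK |
                         VIRTIO_CONFIG_S_DRIVER_OK;

        return (uint64_t)status; /* the upper 56 bits are currently unused */
    }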
Slave message types
-------------------

docs/interop/vhost-vdpa.rst (new file)

@ -0,0 +1,17 @@
=====================
Vhost-vdpa Protocol
=====================
Introduction
=============
vDPA (virtual data path acceleration) devices have a datapath that complies
with the virtio specification, combined with a vendor-specific control path.
vDPA devices can either be physically located on the hardware or emulated by
software.

This document describes the vDPA support in QEMU.

The corresponding kernel support was merged in commit:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4c8cf31885f69e86be0b5b9e6677a26797365e1d

TODO: more information will be added later.
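
As a usage sketch (assuming the host kernel exposes a vDPA device node such
as /dev/vhost-vdpa-0; the option names follow the net client introduced by
this series), a guest can be wired to a vDPA device with
``-netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0
-device virtio-net-pci,netdev=vhost-vdpa0``.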

exec.c

@ -4115,4 +4115,56 @@ void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
}
}
/*
* If positive, discarding RAM is disabled. If negative, discarding RAM is
* required to work and cannot be disabled.
*/
static int ram_block_discard_disabled;
int ram_block_discard_disable(bool state)
{
int old;
if (!state) {
atomic_dec(&ram_block_discard_disabled);
return 0;
}
do {
old = atomic_read(&ram_block_discard_disabled);
if (old < 0) {
return -EBUSY;
}
} while (atomic_cmpxchg(&ram_block_discard_disabled, old, old + 1) != old);
return 0;
}
int ram_block_discard_require(bool state)
{
int old;
if (!state) {
atomic_inc(&ram_block_discard_disabled);
return 0;
}
do {
old = atomic_read(&ram_block_discard_disabled);
if (old > 0) {
return -EBUSY;
}
} while (atomic_cmpxchg(&ram_block_discard_disabled, old, old - 1) != old);
return 0;
}
bool ram_block_discard_is_disabled(void)
{
return atomic_read(&ram_block_discard_disabled) > 0;
}
bool ram_block_discard_is_required(void)
{
return atomic_read(&ram_block_discard_disabled) < 0;
}
#endif
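
To make the counter semantics concrete, here is a minimal usage sketch with
hypothetical callers (only the two functions added above are real): a device
that relies on discarding RAM takes a "required" reference, after which an
incompatible user such as VFIO can no longer disable discards, and vice
versa:

    #include <errno.h>
    #include <stdbool.h>

    /* The two functions added above. */
    int ram_block_discard_disable(bool state);
    int ram_block_discard_require(bool state);

    /* A virtio-mem-like device: discarding RAM must keep working. */
    static int hypothetical_device_realize(void)
    {
        if (ram_block_discard_require(true)) {
            return -EBUSY; /* someone (e.g., VFIO) disabled discards */
        }
        return 0;
    }

    /* A VFIO-like user: discarding RAM would break pinned mappings. */
    static int hypothetical_vfio_attach(void)
    {
        if (ram_block_discard_disable(true)) {
            return -EBUSY; /* a device already requires discards to work */
        }
        return 0;
    }

    /* Both drop their reference again by calling the same function
     * with false. */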

hw/arm/virt.c

@ -2401,6 +2401,7 @@ static void virt_machine_class_init(ObjectClass *oc, void *data)
hc->unplug = virt_machine_device_unplug_cb;
mc->nvdimm_supported = true;
mc->auto_enable_numa_with_memhp = true;
mc->auto_enable_numa_with_memdev = true;
mc->default_ram_id = "mach-virt.ram";
object_class_property_add(oc, "acpi", "OnOffAuto",
@ -2516,6 +2517,7 @@ static void virt_machine_5_0_options(MachineClass *mc)
compat_props_add(mc->compat_props, hw_compat_5_0, hw_compat_5_0_len);
mc->numa_mem_supported = true;
vmc->acpi_expose_flash = true;
mc->auto_enable_numa_with_memdev = false;
}
DEFINE_VIRT_MACHINE(5, 0)

hw/core/numa.c

@ -688,8 +688,9 @@ void numa_complete_configuration(MachineState *ms)
NodeInfo *numa_info = ms->numa_state->nodes;
/*
* If memory hotplug is enabled (slots > 0) but without '-numa'
* options explicitly on CLI, guestes will break.
* If memory hotplug is enabled (slot > 0) or memory devices are enabled
* (ms->maxram_size > ram_size) but without '-numa' options explicitly on
* CLI, guests will break.
*
* Windows: won't enable memory hotplug without SRAT table at all
*
@ -704,9 +705,9 @@ void numa_complete_configuration(MachineState *ms)
* assume there is just one node with whole RAM.
*/
if (ms->numa_state->num_nodes == 0 &&
((ms->ram_slots > 0 &&
mc->auto_enable_numa_with_memhp) ||
mc->auto_enable_numa)) {
((ms->ram_slots && mc->auto_enable_numa_with_memhp) ||
(ms->maxram_size > ms->ram_size && mc->auto_enable_numa_with_memdev) ||
mc->auto_enable_numa)) {
NumaNodeOptions node = { };
parse_numa_node(ms, &node, &error_abort);
numa_info[0].node_mem = ram_size;
@ -824,6 +825,7 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
MemoryDeviceInfoList *info;
PCDIMMDeviceInfo *pcdimm_info;
VirtioPMEMDeviceInfo *vpi;
VirtioMEMDeviceInfo *vmi;
for (info = info_list; info; info = info->next) {
MemoryDeviceInfo *value = info->value;
@ -844,6 +846,11 @@ static void numa_stat_memory_devices(NumaNodeMem node_mem[])
node_mem[0].node_mem += vpi->size;
node_mem[0].node_plugged_mem += vpi->size;
break;
case MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM:
vmi = value->u.virtio_mem.data;
node_mem[vmi->node].node_mem += vmi->size;
node_mem[vmi->node].node_plugged_mem += vmi->size;
break;
default:
g_assert_not_reached();
}

hw/i386/Kconfig

@ -35,6 +35,7 @@ config PC
select ACPI_PCI
select ACPI_VMGENID
select VIRTIO_PMEM_SUPPORTED
select VIRTIO_MEM_SUPPORTED
config PC_PCI
bool

hw/i386/microvm.c

@ -464,6 +464,7 @@ static void microvm_class_init(ObjectClass *oc, void *data)
mc->max_cpus = 288;
mc->has_hotpluggable_cpus = false;
mc->auto_enable_numa_with_memhp = false;
mc->auto_enable_numa_with_memdev = false;
mc->default_cpu_type = TARGET_DEFAULT_CPU_TYPE;
mc->nvdimm_supported = false;
mc->default_ram_id = "microvm.ram";

hw/i386/pc.c

@ -88,6 +88,7 @@
#include "hw/net/ne2000-isa.h"
#include "standard-headers/asm-x86/bootparam.h"
#include "hw/virtio/virtio-pmem-pci.h"
#include "hw/virtio/virtio-mem-pci.h"
#include "hw/mem/memory-device.h"
#include "sysemu/replay.h"
#include "qapi/qmp/qerror.h"
@ -1637,19 +1638,20 @@ static void pc_cpu_pre_plug(HotplugHandler *hotplug_dev,
numa_cpu_pre_plug(cpu_slot, dev, errp);
}
static void pc_virtio_pmem_pci_pre_plug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
static void pc_virtio_md_pci_pre_plug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
Error *local_err = NULL;
if (!hotplug_dev2) {
if (!hotplug_dev2 && dev->hotplugged) {
/*
* Without a bus hotplug handler, we cannot control the plug/unplug
* order. This should never be the case on x86, however better add
* a safety net.
* order. We should never reach this point when hotplugging on x86,
* however, better add a safety net.
*/
error_setg(errp, "virtio-pmem-pci not supported on this bus.");
error_setg(errp, "hotplug of virtio based memory devices not supported"
" on this bus.");
return;
}
/*
@ -1658,14 +1660,14 @@ static void pc_virtio_pmem_pci_pre_plug(HotplugHandler *hotplug_dev,
*/
memory_device_pre_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev), NULL,
&local_err);
if (!local_err) {
if (!local_err && hotplug_dev2) {
hotplug_handler_pre_plug(hotplug_dev2, dev, &local_err);
}
error_propagate(errp, local_err);
}
static void pc_virtio_pmem_pci_plug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
static void pc_virtio_md_pci_plug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
HotplugHandler *hotplug_dev2 = qdev_get_bus_hotplug_handler(dev);
Error *local_err = NULL;
@ -1676,24 +1678,26 @@ static void pc_virtio_pmem_pci_plug(HotplugHandler *hotplug_dev,
* device bits.
*/
memory_device_plug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
hotplug_handler_plug(hotplug_dev2, dev, &local_err);
if (local_err) {
memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
if (hotplug_dev2) {
hotplug_handler_plug(hotplug_dev2, dev, &local_err);
if (local_err) {
memory_device_unplug(MEMORY_DEVICE(dev), MACHINE(hotplug_dev));
}
}
error_propagate(errp, local_err);
}
static void pc_virtio_pmem_pci_unplug_request(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
static void pc_virtio_md_pci_unplug_request(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
/* We don't support virtio pmem hot unplug */
error_setg(errp, "virtio pmem device unplug not supported.");
/* We don't support hot unplug of virtio based memory devices */
error_setg(errp, "virtio based memory devices cannot be unplugged.");
}
static void pc_virtio_pmem_pci_unplug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
static void pc_virtio_md_pci_unplug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
/* We don't support virtio pmem hot unplug */
/* We don't support hot unplug of virtio based memory devices */
}
static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
@ -1703,8 +1707,9 @@ static void pc_machine_device_pre_plug_cb(HotplugHandler *hotplug_dev,
pc_memory_pre_plug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
pc_cpu_pre_plug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
pc_virtio_pmem_pci_pre_plug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
pc_virtio_md_pci_pre_plug(hotplug_dev, dev, errp);
}
}
@ -1715,8 +1720,9 @@ static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev,
pc_memory_plug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
pc_cpu_plug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
pc_virtio_pmem_pci_plug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
pc_virtio_md_pci_plug(hotplug_dev, dev, errp);
}
}
@ -1727,8 +1733,9 @@ static void pc_machine_device_unplug_request_cb(HotplugHandler *hotplug_dev,
pc_memory_unplug_request(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
pc_cpu_unplug_request_cb(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
pc_virtio_pmem_pci_unplug_request(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
pc_virtio_md_pci_unplug_request(hotplug_dev, dev, errp);
} else {
error_setg(errp, "acpi: device unplug request for not supported device"
" type: %s", object_get_typename(OBJECT(dev)));
@ -1742,8 +1749,9 @@ static void pc_machine_device_unplug_cb(HotplugHandler *hotplug_dev,
pc_memory_unplug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_CPU)) {
pc_cpu_unplug_cb(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
pc_virtio_pmem_pci_unplug(hotplug_dev, dev, errp);
} else if (object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
pc_virtio_md_pci_unplug(hotplug_dev, dev, errp);
} else {
error_setg(errp, "acpi: device unplug for not supported device"
" type: %s", object_get_typename(OBJECT(dev)));
@ -1755,7 +1763,8 @@ static HotplugHandler *pc_get_hotplug_handler(MachineState *machine,
{
if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM) ||
object_dynamic_cast(OBJECT(dev), TYPE_CPU) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI)) {
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_PMEM_PCI) ||
object_dynamic_cast(OBJECT(dev), TYPE_VIRTIO_MEM_PCI)) {
return HOTPLUG_HANDLER(machine);
}
@ -1966,6 +1975,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
mc->get_default_cpu_node_id = x86_get_default_cpu_node_id;
mc->possible_cpu_arch_ids = x86_possible_cpu_arch_ids;
mc->auto_enable_numa_with_memhp = true;
mc->auto_enable_numa_with_memdev = true;
mc->has_hotpluggable_cpus = true;
mc->default_boot_order = "cad";
mc->hot_add_cpu = pc_hot_add_cpu;

hw/i386/pc_piix.c

@ -444,6 +444,7 @@ static void pc_i440fx_5_0_machine_options(MachineClass *m)
m->numa_mem_supported = true;
compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len);
compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len);
m->auto_enable_numa_with_memdev = false;
}
DEFINE_I440FX_MACHINE(v5_0, "pc-i440fx-5.0", NULL,

hw/i386/pc_q35.c

@ -372,6 +372,7 @@ static void pc_q35_5_0_machine_options(MachineClass *m)
m->numa_mem_supported = true;
compat_props_add(m->compat_props, hw_compat_5_0, hw_compat_5_0_len);
compat_props_add(m->compat_props, pc_compat_5_0, pc_compat_5_0_len);
m->auto_enable_numa_with_memhp = false;
}
DEFINE_Q35_MACHINE(v5_0, "pc-q35-5.0", NULL,

hw/net/vhost_net-stub.c

@ -52,6 +52,17 @@ uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features)
return features;
}
int vhost_net_get_config(struct vhost_net *net, uint8_t *config,
uint32_t config_len)
{
return 0;
}
int vhost_net_set_config(struct vhost_net *net, const uint8_t *data,
uint32_t offset, uint32_t size, uint32_t flags)
{
return 0;
}
void vhost_net_ack_features(struct vhost_net *net, uint64_t features)
{
}

hw/net/vhost_net.c

@ -17,6 +17,7 @@
#include "net/net.h"
#include "net/tap.h"
#include "net/vhost-user.h"
#include "net/vhost-vdpa.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-net.h"
@ -33,12 +34,6 @@
#include "hw/virtio/vhost.h"
#include "hw/virtio/virtio-bus.h"
struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[2];
int backend;
NetClientState *nc;
};
/* Features supported by host kernel. */
static const int kernel_feature_bits[] = {
@ -96,6 +91,11 @@ static const int *vhost_net_get_feature_bits(struct vhost_net *net)
case NET_CLIENT_DRIVER_VHOST_USER:
feature_bits = user_feature_bits;
break;
#ifdef CONFIG_VHOST_NET_VDPA
case NET_CLIENT_DRIVER_VHOST_VDPA:
feature_bits = vdpa_feature_bits;
break;
#endif
default:
error_report("Feature bits not defined for this type: %d",
net->nc->info->type);
@ -110,6 +110,16 @@ uint64_t vhost_net_get_features(struct vhost_net *net, uint64_t features)
return vhost_get_features(&net->dev, vhost_net_get_feature_bits(net),
features);
}
int vhost_net_get_config(struct vhost_net *net, uint8_t *config,
uint32_t config_len)
{
return vhost_dev_get_config(&net->dev, config, config_len);
}
int vhost_net_set_config(struct vhost_net *net, const uint8_t *data,
uint32_t offset, uint32_t size, uint32_t flags)
{
return vhost_dev_set_config(&net->dev, data, offset, size, flags);
}
void vhost_net_ack_features(struct vhost_net *net, uint64_t features)
{
@ -306,7 +316,9 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
VirtioBusState *vbus = VIRTIO_BUS(qbus);
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
struct vhost_net *net;
int r, e, i;
NetClientState *peer;
if (!k->set_guest_notifiers) {
error_report("binding does not support guest notifiers");
@ -314,9 +326,9 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
}
for (i = 0; i < total_queues; i++) {
struct vhost_net *net;
net = get_vhost_net(ncs[i].peer);
peer = qemu_get_peer(ncs, i);
net = get_vhost_net(peer);
vhost_net_set_vq_index(net, i * 2);
/* Suppress the masking guest notifiers on vhost user
@ -335,15 +347,16 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
}
for (i = 0; i < total_queues; i++) {
r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev);
peer = qemu_get_peer(ncs, i);
r = vhost_net_start_one(get_vhost_net(peer), dev);
if (r < 0) {
goto err_start;
}
if (ncs[i].peer->vring_enable) {
if (peer->vring_enable) {
/* restore vring enable state */
r = vhost_set_vring_enable(ncs[i].peer, ncs[i].peer->vring_enable);
r = vhost_set_vring_enable(peer, peer->vring_enable);
if (r < 0) {
goto err_start;
@ -355,7 +368,8 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
err_start:
while (--i >= 0) {
vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
peer = qemu_get_peer(ncs , i);
vhost_net_stop_one(get_vhost_net(peer), dev);
}
e = k->set_guest_notifiers(qbus->parent, total_queues * 2, false);
if (e < 0) {
@ -429,6 +443,12 @@ VHostNetState *get_vhost_net(NetClientState *nc)
vhost_net = vhost_user_get_vhost_net(nc);
assert(vhost_net);
break;
#endif
#ifdef CONFIG_VHOST_NET_VDPA
case NET_CLIENT_DRIVER_VHOST_VDPA:
vhost_net = vhost_vdpa_get_vhost_net(nc);
assert(vhost_net);
break;
#endif
default:
break;

hw/net/virtio-net.c

@ -43,6 +43,7 @@
#include "monitor/qdev.h"
#include "hw/pci/pci.h"
#include "net_rx_pkt.h"
#include "hw/virtio/vhost.h"
#define VIRTIO_NET_VM_VERSION 11
@ -125,6 +126,8 @@ static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
VirtIONet *n = VIRTIO_NET(vdev);
struct virtio_net_config netcfg;
int ret = 0;
memset(&netcfg, 0 , sizeof(struct virtio_net_config));
virtio_stw_p(vdev, &netcfg.status, n->status);
virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queues);
virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
@ -138,6 +141,15 @@ static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
virtio_stl_p(vdev, &netcfg.supported_hash_types,
VIRTIO_NET_RSS_SUPPORTED_HASHES);
memcpy(config, &netcfg, n->config_size);
NetClientState *nc = qemu_get_queue(n->nic);
if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
ret = vhost_net_get_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg,
n->config_size);
if (ret != -1) {
memcpy(config, &netcfg, n->config_size);
}
}
}
static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
@ -153,6 +165,13 @@ static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
memcpy(n->mac, netcfg.mac, ETH_ALEN);
qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
}
NetClientState *nc = qemu_get_queue(n->nic);
if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
vhost_net_set_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg,
0, n->config_size,
VHOST_SET_CONFIG_TYPE_MASTER);
}
}
static bool virtio_net_started(VirtIONet *n, uint8_t status)

hw/s390x/s390-virtio-ccw.c

@ -43,7 +43,6 @@
#include "hw/qdev-properties.h"
#include "hw/s390x/tod.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "hw/s390x/pv.h"
#include "migration/blocker.h"
@ -329,7 +328,7 @@ static void s390_machine_unprotect(S390CcwMachineState *ms)
ms->pv = false;
migrate_del_blocker(pv_mig_blocker);
error_free_or_abort(&pv_mig_blocker);
qemu_balloon_inhibit(false);
ram_block_discard_disable(false);
}
static int s390_machine_protect(S390CcwMachineState *ms)
@ -338,17 +337,22 @@ static int s390_machine_protect(S390CcwMachineState *ms)
int rc;
/*
* Ballooning on protected VMs needs support in the guest for
* sharing and unsharing balloon pages. Block ballooning for
* now, until we have a solution to make at least Linux guests
* either support it or fail gracefully.
* Discarding of memory in RAM blocks does not work as expected with
* protected VMs. Sharing and unsharing pages would be required. Disable
* it for now, until we have a solution to make at least Linux
* guests either support it (e.g., virtio-balloon) or fail gracefully.
*/
qemu_balloon_inhibit(true);
rc = ram_block_discard_disable(true);
if (rc) {
error_report("protected VMs: cannot disable RAM discard");
return rc;
}
error_setg(&pv_mig_blocker,
"protected VMs are currently not migrateable.");
rc = migrate_add_blocker(pv_mig_blocker, &local_err);
if (rc) {
qemu_balloon_inhibit(false);
ram_block_discard_disable(false);
error_report_err(local_err);
error_free_or_abort(&pv_mig_blocker);
return rc;
@ -357,7 +361,7 @@ static int s390_machine_protect(S390CcwMachineState *ms)
/* Create SE VM */
rc = s390_pv_vm_enable();
if (rc) {
qemu_balloon_inhibit(false);
ram_block_discard_disable(false);
migrate_del_blocker(pv_mig_blocker);
error_free_or_abort(&pv_mig_blocker);
return rc;

hw/vfio/ap.c

@ -105,12 +105,12 @@ static void vfio_ap_realize(DeviceState *dev, Error **errp)
vapdev->vdev.dev = dev;
/*
* vfio-ap devices operate in a way compatible with
* memory ballooning, as no pages are pinned in the host.
* vfio-ap devices operate in a way compatible with discarding of
* memory in RAM blocks, as no pages are pinned in the host.
* This needs to be set before vfio_get_device() for vfio common to
* handle the balloon inhibitor.
* handle ram_block_discard_disable().
*/
vapdev->vdev.balloon_allowed = true;
vapdev->vdev.ram_block_discard_allowed = true;
ret = vfio_get_device(vfio_group, mdevid, &vapdev->vdev, errp);
if (ret) {

hw/vfio/ccw.c

@ -574,12 +574,13 @@ static void vfio_ccw_get_device(VFIOGroup *group, VFIOCCWDevice *vcdev,
/*
* All vfio-ccw devices are believed to operate in a way compatible with
* memory ballooning, ie. pages pinned in the host are in the current
* working set of the guest driver and therefore never overlap with pages
* available to the guest balloon driver. This needs to be set before
* vfio_get_device() for vfio common to handle the balloon inhibitor.
* discarding of memory in RAM blocks, ie. pages pinned in the host are
* in the current working set of the guest driver and therefore never
* overlap e.g., with pages available to the guest balloon driver. This
* needs to be set before vfio_get_device() for vfio common to handle
* ram_block_discard_disable().
*/
vcdev->vdev.balloon_allowed = true;
vcdev->vdev.ram_block_discard_allowed = true;
if (vfio_get_device(group, vcdev->cdev.mdevid, &vcdev->vdev, errp)) {
goto out_err;

hw/vfio/common.c

@ -33,7 +33,6 @@
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/range.h"
#include "sysemu/balloon.h"
#include "sysemu/kvm.h"
#include "sysemu/reset.h"
#include "trace.h"
@ -1215,31 +1214,36 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
space = vfio_get_address_space(as);
/*
* VFIO is currently incompatible with memory ballooning insofar as the
* VFIO is currently incompatible with discarding of RAM insofar as the
* madvise to purge (zap) the page from QEMU's address space does not
* interact with the memory API and therefore leaves stale virtual to
* physical mappings in the IOMMU if the page was previously pinned. We
* therefore add a balloon inhibit for each group added to a container,
* therefore set discarding broken for each group added to a container,
* whether the container is used individually or shared. This provides
* us with options to allow devices within a group to opt-in and allow
* ballooning, so long as it is done consistently for a group (for instance
* discarding, so long as it is done consistently for a group (for instance
* if the device is an mdev device where it is known that the host vendor
* driver will never pin pages outside of the working set of the guest
* driver, which would thus not be ballooning candidates).
* driver, which would thus not be discarding candidates).
*
* The first opportunity to induce pinning occurs here where we attempt to
* attach the group to existing containers within the AddressSpace. If any
* pages are already zapped from the virtual address space, such as from a
* previous ballooning opt-in, new pinning will cause valid mappings to be
* pages are already zapped from the virtual address space, such as from
* previous discards, new pinning will cause valid mappings to be
* re-established. Likewise, when the overall MemoryListener for a new
* container is registered, a replay of mappings within the AddressSpace
* will occur, re-establishing any previously zapped pages as well.
*
* NB. Balloon inhibiting does not currently block operation of the
* balloon driver or revoke previously pinned pages, it only prevents
* calling madvise to modify the virtual mapping of ballooned pages.
* In particular, virtio-balloon is currently only prevented from discarding
* new memory; it will not yet set ram_block_discard_set_required() and
* therefore neither stops us here nor deals with the sudden memory
* consumption of inflated memory.
*/
qemu_balloon_inhibit(true);
ret = ram_block_discard_disable(true);
if (ret) {
error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken");
return ret;
}
QLIST_FOREACH(container, &space->containers, next) {
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
@ -1405,7 +1409,7 @@ close_fd_exit:
close(fd);
put_space_exit:
qemu_balloon_inhibit(false);
ram_block_discard_disable(false);
vfio_put_address_space(space);
return ret;
@ -1526,8 +1530,8 @@ void vfio_put_group(VFIOGroup *group)
return;
}
if (!group->balloon_allowed) {
qemu_balloon_inhibit(false);
if (!group->ram_block_discard_allowed) {
ram_block_discard_disable(false);
}
vfio_kvm_device_del_group(group);
vfio_disconnect_container(group);
@ -1565,22 +1569,23 @@ int vfio_get_device(VFIOGroup *group, const char *name,
}
/*
* Clear the balloon inhibitor for this group if the driver knows the
* device operates compatibly with ballooning. Setting must be consistent
* per group, but since compatibility is really only possible with mdev
* currently, we expect singleton groups.
* Set discarding of RAM as not broken for this group if the driver knows
* the device operates compatibly with discarding. Setting must be
* consistent per group, but since compatibility is really only possible
* with mdev currently, we expect singleton groups.
*/
if (vbasedev->balloon_allowed != group->balloon_allowed) {
if (vbasedev->ram_block_discard_allowed !=
group->ram_block_discard_allowed) {
if (!QLIST_EMPTY(&group->device_list)) {
error_setg(errp,
"Inconsistent device balloon setting within group");
error_setg(errp, "Inconsistent setting of support for discarding "
"RAM (e.g., balloon) within group");
close(fd);
return -1;
}
if (!group->balloon_allowed) {
group->balloon_allowed = true;
qemu_balloon_inhibit(false);
if (!group->ram_block_discard_allowed) {
group->ram_block_discard_allowed = true;
ram_block_discard_disable(false);
}
}

hw/vfio/pci.c

@ -2789,7 +2789,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
/*
* Mediated devices *might* operate compatibly with memory ballooning, but
* Mediated devices *might* operate compatibly with discarding of RAM, but
* we cannot know for certain, it depends on whether the mdev vendor driver
* stays in sync with the active working set of the guest driver. Prevent
* the x-balloon-allowed option unless this is minimally an mdev device.
@ -2802,7 +2802,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
trace_vfio_mdev(vdev->vbasedev.name, is_mdev);
if (vdev->vbasedev.balloon_allowed && !is_mdev) {
if (vdev->vbasedev.ram_block_discard_allowed && !is_mdev) {
error_setg(errp, "x-balloon-allowed only potentially compatible "
"with mdev devices");
vfio_put_group(group);
@ -3156,7 +3156,7 @@ static Property vfio_pci_dev_properties[] = {
VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
vbasedev.balloon_allowed, false),
vbasedev.ram_block_discard_allowed, false),
DEFINE_PROP_BOOL("x-no-kvm-intx", VFIOPCIDevice, no_kvm_intx, false),
DEFINE_PROP_BOOL("x-no-kvm-msi", VFIOPCIDevice, no_kvm_msi, false),
DEFINE_PROP_BOOL("x-no-kvm-msix", VFIOPCIDevice, no_kvm_msix, false),

hw/virtio/Kconfig

@ -47,3 +47,14 @@ config VIRTIO_PMEM
depends on VIRTIO
depends on VIRTIO_PMEM_SUPPORTED
select MEM_DEVICE
config VIRTIO_MEM_SUPPORTED
bool
config VIRTIO_MEM
bool
default y
depends on VIRTIO
depends on LINUX
depends on VIRTIO_MEM_SUPPORTED
select MEM_DEVICE

hw/virtio/Makefile.objs

@ -5,6 +5,7 @@ obj-y += virtio.o
obj-$(CONFIG_VHOST) += vhost.o vhost-backend.o
common-obj-$(call lnot,$(CONFIG_VHOST)) += vhost-stub.o
obj-$(CONFIG_VHOST_USER) += vhost-user.o
obj-$(CONFIG_VHOST_VDPA) += vhost-vdpa.o
common-obj-$(CONFIG_VIRTIO_RNG) += virtio-rng.o
common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
@ -19,6 +20,8 @@ obj-$(call land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-p
obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-common.o vhost-vsock.o
obj-$(CONFIG_VHOST_USER_VSOCK) += vhost-vsock-common.o vhost-user-vsock.o
obj-$(CONFIG_VIRTIO_MEM) += virtio-mem.o
common-obj-$(call land,$(CONFIG_VIRTIO_MEM),$(CONFIG_VIRTIO_PCI)) += virtio-mem-pci.o
ifeq ($(CONFIG_VIRTIO_PCI),y)
obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o

hw/virtio/trace-events

@ -75,3 +75,13 @@ virtio_iommu_put_domain(uint32_t domain_id) "Free domain=%d"
virtio_iommu_translate_out(uint64_t virt_addr, uint64_t phys_addr, uint32_t sid) "0x%"PRIx64" -> 0x%"PRIx64 " for sid=%d"
virtio_iommu_report_fault(uint8_t reason, uint32_t flags, uint32_t endpoint, uint64_t addr) "FAULT reason=%d flags=%d endpoint=%d address =0x%"PRIx64
virtio_iommu_fill_resv_property(uint32_t devid, uint8_t subtype, uint64_t start, uint64_t end) "dev= %d, type=%d start=0x%"PRIx64" end=0x%"PRIx64
# virtio-mem.c
virtio_mem_send_response(uint16_t type) "type=%" PRIu16
virtio_mem_plug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16
virtio_mem_unplug_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16
virtio_mem_unplugged_all(void) ""
virtio_mem_unplug_all_request(void) ""
virtio_mem_resized_usable_region(uint64_t old_size, uint64_t new_size) "old_size=0x%" PRIx64 " new_size=0x%" PRIx64
virtio_mem_state_request(uint64_t addr, uint16_t nb_blocks) "addr=0x%" PRIx64 " nb_blocks=%" PRIu16
virtio_mem_state_response(uint16_t state) "state=%" PRIu16

hw/virtio/vhost-backend.c

@ -15,6 +15,7 @@
#include "qemu/main-loop.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/vhost-vdpa.h"
#ifdef CONFIG_VHOST_KERNEL
#include <linux/vhost.h>
#include <sys/ioctl.h>
@ -285,6 +286,11 @@ int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type)
case VHOST_BACKEND_TYPE_USER:
dev->vhost_ops = &user_ops;
break;
#endif
#ifdef CONFIG_VHOST_VDPA
case VHOST_BACKEND_TYPE_VDPA:
dev->vhost_ops = &vdpa_ops;
break;
#endif
default:
error_report("Unknown vhost backend type");

hw/virtio/vhost-vdpa.c (new file)

@ -0,0 +1,475 @@
/*
* vhost-vdpa
*
* Copyright(c) 2017-2018 Intel Corporation.
* Copyright(c) 2020 Red Hat, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include <linux/vhost.h>
#include <linux/vfio.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "hw/virtio/virtio-net.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/main-loop.h"
#include <linux/kvm.h>
#include "sysemu/kvm.h"
static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section)
{
return (!memory_region_is_ram(section->mr) &&
!memory_region_is_iommu(section->mr)) ||
/*
* Sizing an enabled 64-bit BAR can cause spurious mappings to
* addresses in the upper part of the 64-bit address space. These
* are never accessed by the CPU and beyond the address width of
* some IOMMU hardware. TODO: VDPA should tell us the IOMMU width.
*/
section->offset_within_address_space & (1ULL << 63);
}
static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
void *vaddr, bool readonly)
{
struct vhost_msg_v2 msg;
int fd = v->device_fd;
int ret = 0;
msg.type = v->msg_type;
msg.iotlb.iova = iova;
msg.iotlb.size = size;
msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
msg.iotlb.type = VHOST_IOTLB_UPDATE;
if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
error_report("failed to write, fd=%d, errno=%d (%s)",
fd, errno, strerror(errno));
return -EIO ;
}
return ret;
}
static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
hwaddr size)
{
struct vhost_msg_v2 msg;
int fd = v->device_fd;
int ret = 0;
msg.type = v->msg_type;
msg.iotlb.iova = iova;
msg.iotlb.size = size;
msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
error_report("failed to write, fd=%d, errno=%d (%s)",
fd, errno, strerror(errno));
return -EIO ;
}
return ret;
}
static void vhost_vdpa_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
hwaddr iova;
Int128 llend, llsize;
void *vaddr;
int ret;
if (vhost_vdpa_listener_skipped_section(section)) {
return;
}
if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
(section->offset_within_region & ~TARGET_PAGE_MASK))) {
error_report("%s received unaligned region", __func__);
return;
}
iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
llend = int128_make64(section->offset_within_address_space);
llend = int128_add(llend, section->size);
llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
if (int128_ge(int128_make64(iova), llend)) {
return;
}
memory_region_ref(section->mr);
/* Here we assume that memory_region_is_ram(section->mr)==true */
vaddr = memory_region_get_ram_ptr(section->mr) +
section->offset_within_region +
(iova - section->offset_within_address_space);
llsize = int128_sub(llend, int128_make64(iova));
ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
vaddr, section->readonly);
if (ret) {
error_report("vhost vdpa map fail!");
if (memory_region_is_ram_device(section->mr)) {
/* Allow unexpected mappings not to be fatal for RAM devices */
error_report("map ram fail!");
return ;
}
goto fail;
}
return;
fail:
if (memory_region_is_ram_device(section->mr)) {
error_report("failed to vdpa_dma_map. pci p2p may not work");
return;
}
/*
* On the initfn path, store the first error in the container so we
* can gracefully fail. Runtime, there's not much we can do other
* than throw a hardware error.
*/
error_report("vhost-vdpa: DMA mapping failed, unable to continue");
return;
}
static void vhost_vdpa_listener_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
hwaddr iova;
Int128 llend, llsize;
int ret;
bool try_unmap = true;
if (vhost_vdpa_listener_skipped_section(section)) {
return;
}
if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
(section->offset_within_region & ~TARGET_PAGE_MASK))) {
error_report("%s received unaligned region", __func__);
return;
}
iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
llend = int128_make64(section->offset_within_address_space);
llend = int128_add(llend, section->size);
llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
if (int128_ge(int128_make64(iova), llend)) {
return;
}
llsize = int128_sub(llend, int128_make64(iova));
if (try_unmap) {
ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
if (ret) {
error_report("vhost_vdpa dma unmap error!");
}
}
memory_region_unref(section->mr);
}
/*
 * The IOTLB API is used by vhost-vdpa, which requires incremental updating
 * of the mapping. So we cannot use the generic vhost memory listener, which
 * depends on the addnop().
 */
static const MemoryListener vhost_vdpa_memory_listener = {
.region_add = vhost_vdpa_listener_region_add,
.region_del = vhost_vdpa_listener_region_del,
};
static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
void *arg)
{
struct vhost_vdpa *v = dev->opaque;
int fd = v->device_fd;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
return ioctl(fd, request, arg);
}
static void vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
{
uint8_t s;
if (vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s)) {
return;
}
s |= status;
vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
}
static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque)
{
struct vhost_vdpa *v;
uint64_t features;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
v = opaque;
dev->opaque = opaque ;
vhost_vdpa_call(dev, VHOST_GET_FEATURES, &features);
dev->backend_features = features;
v->listener = vhost_vdpa_memory_listener;
v->msg_type = VHOST_IOTLB_MSG_V2;
vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
VIRTIO_CONFIG_S_DRIVER);
return 0;
}
static int vhost_vdpa_cleanup(struct vhost_dev *dev)
{
struct vhost_vdpa *v;
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
v = dev->opaque;
memory_listener_unregister(&v->listener);
dev->opaque = NULL;
return 0;
}
static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
return INT_MAX;
}
static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
struct vhost_memory *mem)
{
if (mem->padding) {
return -1;
}
return 0;
}
static int vhost_vdpa_set_features(struct vhost_dev *dev,
uint64_t features)
{
int ret;
ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
uint8_t status = 0;
if (ret) {
return ret;
}
vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &status);
return !(status & VIRTIO_CONFIG_S_FEATURES_OK);
}
int vhost_vdpa_get_device_id(struct vhost_dev *dev,
uint32_t *device_id)
{
return vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
}
static int vhost_vdpa_reset_device(struct vhost_dev *dev)
{
uint8_t status = 0;
return vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
}
static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
{
assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
return idx - dev->vq_index;
}
static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
{
int i;
for (i = 0; i < dev->nvqs; ++i) {
struct vhost_vring_state state = {
.index = dev->vq_index + i,
.num = 1,
};
vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
}
return 0;
}
static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
uint32_t offset, uint32_t size,
uint32_t flags)
{
struct vhost_vdpa_config *config;
int ret;
unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
config = g_malloc(size + config_size);
if (config == NULL) {
return -1;
}
config->off = offset;
config->len = size;
memcpy(config->buf, data, size);
ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
g_free(config);
return ret;
}
static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
uint32_t config_len)
{
struct vhost_vdpa_config *v_config;
unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
int ret;
v_config = g_malloc(config_len + config_size);
if (v_config == NULL) {
return -1;
}
v_config->len = config_len;
v_config->off = 0;
ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
memcpy(config, v_config->buf, config_len);
g_free(v_config);
return ret;
}
static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
{
struct vhost_vdpa *v = dev->opaque;
if (started) {
uint8_t status = 0;
memory_listener_register(&v->listener, &address_space_memory);
vhost_vdpa_set_vring_ready(dev);
vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &status);
return !(status & VIRTIO_CONFIG_S_DRIVER_OK);
} else {
vhost_vdpa_reset_device(dev);
vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
VIRTIO_CONFIG_S_DRIVER);
memory_listener_unregister(&v->listener);
return 0;
}
}
static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
struct vhost_log *log)
{
return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
}
static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
struct vhost_vring_addr *addr)
{
return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
}
static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
}
static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
}
static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
struct vhost_vring_state *ring)
{
return vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
}
static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
}
static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
struct vhost_vring_file *file)
{
return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
}
static int vhost_vdpa_get_features(struct vhost_dev *dev,
uint64_t *features)
{
return vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
}
static int vhost_vdpa_set_owner(struct vhost_dev *dev)
{
return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
}
static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
{
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
return 0;
}
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
return true;
}
const VhostOps vdpa_ops = {
.backend_type = VHOST_BACKEND_TYPE_VDPA,
.vhost_backend_init = vhost_vdpa_init,
.vhost_backend_cleanup = vhost_vdpa_cleanup,
.vhost_set_log_base = vhost_vdpa_set_log_base,
.vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
.vhost_set_vring_num = vhost_vdpa_set_vring_num,
.vhost_set_vring_base = vhost_vdpa_set_vring_base,
.vhost_get_vring_base = vhost_vdpa_get_vring_base,
.vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
.vhost_set_vring_call = vhost_vdpa_set_vring_call,
.vhost_get_features = vhost_vdpa_get_features,
.vhost_set_owner = vhost_vdpa_set_owner,
.vhost_set_vring_endian = NULL,
.vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
.vhost_set_mem_table = vhost_vdpa_set_mem_table,
.vhost_set_features = vhost_vdpa_set_features,
.vhost_reset_device = vhost_vdpa_reset_device,
.vhost_get_vq_index = vhost_vdpa_get_vq_index,
.vhost_get_config = vhost_vdpa_get_config,
.vhost_set_config = vhost_vdpa_set_config,
.vhost_requires_shm_log = NULL,
.vhost_migration_done = NULL,
.vhost_backend_can_merge = NULL,
.vhost_net_set_mtu = NULL,
.vhost_set_iotlb_callback = NULL,
.vhost_send_device_iotlb_msg = NULL,
.vhost_dev_start = vhost_vdpa_dev_start,
.vhost_get_device_id = vhost_vdpa_get_device_id,
.vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
.vhost_force_iommu = vhost_vdpa_force_iommu,
};

hw/virtio/vhost.c

@ -773,15 +773,25 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
unsigned idx, bool enable_log)
{
struct vhost_vring_addr addr = {
.index = idx,
.desc_user_addr = (uint64_t)(unsigned long)vq->desc,
.avail_user_addr = (uint64_t)(unsigned long)vq->avail,
.used_user_addr = (uint64_t)(unsigned long)vq->used,
.log_guest_addr = vq->used_phys,
.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
};
int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
struct vhost_vring_addr addr;
int r;
memset(&addr, 0, sizeof(struct vhost_vring_addr));
if (dev->vhost_ops->vhost_vq_get_addr) {
r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
if (r < 0) {
VHOST_OPS_DEBUG("vhost_vq_get_addr failed");
return -errno;
}
} else {
addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
}
addr.index = idx;
addr.log_guest_addr = vq->used_phys;
addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
if (r < 0) {
VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
return -errno;
@ -800,6 +810,11 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
if (!vhost_dev_has_iommu(dev)) {
features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
}
if (dev->vhost_ops->vhost_force_iommu) {
if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
}
}
r = dev->vhost_ops->vhost_set_features(dev, features);
if (r < 0) {
VHOST_OPS_DEBUG("vhost_set_features failed");
@ -1685,9 +1700,15 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
goto fail_log;
}
}
if (vhost_dev_has_iommu(hdev)) {
hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
if (hdev->vhost_ops->vhost_dev_start) {
r = hdev->vhost_ops->vhost_dev_start(hdev, true);
if (r) {
goto fail_log;
}
}
if (vhost_dev_has_iommu(hdev) &&
hdev->vhost_ops->vhost_set_iotlb_callback) {
hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
/* Update used ring information for IOTLB to work correctly,
* vhost-kernel code requires for this.*/
@ -1722,6 +1743,9 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
/* should only be called after backend is connected */
assert(hdev->vhost_ops);
if (hdev->vhost_ops->vhost_dev_start) {
hdev->vhost_ops->vhost_dev_start(hdev, false);
}
for (i = 0; i < hdev->nvqs; ++i) {
vhost_virtqueue_stop(hdev,
vdev,
@ -1730,7 +1754,9 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
}
if (vhost_dev_has_iommu(hdev)) {
hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
if (hdev->vhost_ops->vhost_set_iotlb_callback) {
hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
}
memory_listener_unregister(&hdev->iommu_listener);
}
vhost_log_put(hdev, true);

hw/virtio/virtio-balloon.c

@ -63,6 +63,12 @@ static bool virtio_balloon_pbp_matches(PartiallyBalloonedPage *pbp,
return pbp->base_gpa == base_gpa;
}
static bool virtio_balloon_inhibited(void)
{
/* Postcopy cannot deal with concurrent discards, so it's special. */
return ram_block_discard_is_disabled() || migration_in_incoming_postcopy();
}
static void balloon_inflate_page(VirtIOBalloon *balloon,
MemoryRegion *mr, hwaddr mr_offset,
PartiallyBalloonedPage *pbp)
@ -336,7 +342,7 @@ static void virtio_balloon_handle_report(VirtIODevice *vdev, VirtQueue *vq)
* accessible by another device or process, or if the guest is
* expecting it to retain a non-zero value.
*/
if (qemu_balloon_is_inhibited() || dev->poison_val) {
if (virtio_balloon_inhibited() || dev->poison_val) {
goto skip_element;
}
@ -421,7 +427,7 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
trace_virtio_balloon_handle_output(memory_region_name(section.mr),
pa);
if (!qemu_balloon_is_inhibited()) {
if (!virtio_balloon_inhibited()) {
if (vq == s->ivq) {
balloon_inflate_page(s, section.mr,
section.offset_within_region, &pbp);
@ -628,8 +634,13 @@ static void virtio_balloon_free_page_done(VirtIOBalloon *s)
{
VirtIODevice *vdev = VIRTIO_DEVICE(s);
s->free_page_report_status = FREE_PAGE_REPORT_S_DONE;
virtio_notify_config(vdev);
if (s->free_page_report_status != FREE_PAGE_REPORT_S_DONE) {
/* See virtio_balloon_free_page_stop() */
qemu_mutex_lock(&s->free_page_lock);
s->free_page_report_status = FREE_PAGE_REPORT_S_DONE;
qemu_mutex_unlock(&s->free_page_lock);
virtio_notify_config(vdev);
}
}
static int
@ -653,17 +664,26 @@ virtio_balloon_free_page_report_notify(NotifierWithReturn *n, void *data)
case PRECOPY_NOTIFY_SETUP:
precopy_enable_free_page_optimization();
break;
case PRECOPY_NOTIFY_COMPLETE:
case PRECOPY_NOTIFY_CLEANUP:
case PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC:
virtio_balloon_free_page_stop(dev);
break;
case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC:
if (vdev->vm_running) {
virtio_balloon_free_page_start(dev);
} else {
virtio_balloon_free_page_done(dev);
break;
}
/*
* Set S_DONE before migrating the vmstate, so the guest will reuse
* all hinted pages once running on the destination. Fall through.
*/
case PRECOPY_NOTIFY_CLEANUP:
/*
* Especially, if something goes wrong during precopy or if migration
* is canceled, we have to properly communicate S_DONE to the VM.
*/
virtio_balloon_free_page_done(dev);
break;
case PRECOPY_NOTIFY_COMPLETE:
break;
default:
virtio_error(vdev, "%s: %d reason unknown", __func__, pnd->reason);

hw/virtio/virtio-mem-pci.c (new file)

@ -0,0 +1,157 @@
/*
* Virtio MEM PCI device
*
* Copyright (C) 2020 Red Hat, Inc.
*
* Authors:
* David Hildenbrand <david@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "virtio-mem-pci.h"
#include "hw/mem/memory-device.h"
#include "qapi/error.h"
#include "qapi/qapi-events-misc.h"
static void virtio_mem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
{
VirtIOMEMPCI *mem_pci = VIRTIO_MEM_PCI(vpci_dev);
DeviceState *vdev = DEVICE(&mem_pci->vdev);
qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
object_property_set_bool(OBJECT(vdev), true, "realized", errp);
}
static void virtio_mem_pci_set_addr(MemoryDeviceState *md, uint64_t addr,
Error **errp)
{
object_property_set_uint(OBJECT(md), addr, VIRTIO_MEM_ADDR_PROP, errp);
}
static uint64_t virtio_mem_pci_get_addr(const MemoryDeviceState *md)
{
return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP,
&error_abort);
}
static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md,
Error **errp)
{
VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
return vmc->get_memory_region(vmem, errp);
}
static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md,
Error **errp)
{
return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP,
errp);
}
static void virtio_mem_pci_fill_device_info(const MemoryDeviceState *md,
MemoryDeviceInfo *info)
{
VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1);
VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
VirtIOMEM *vmem = VIRTIO_MEM(&pci_mem->vdev);
VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
DeviceState *dev = DEVICE(md);
if (dev->id) {
vi->has_id = true;
vi->id = g_strdup(dev->id);
}
/* let the real device handle everything else */
vpc->fill_device_info(vmem, vi);
info->u.virtio_mem.data = vi;
info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM;
}
static void virtio_mem_pci_size_change_notify(Notifier *notifier, void *data)
{
VirtIOMEMPCI *pci_mem = container_of(notifier, VirtIOMEMPCI,
size_change_notifier);
DeviceState *dev = DEVICE(pci_mem);
const uint64_t * const size_p = data;
const char *id = NULL;
if (dev->id) {
id = g_strdup(dev->id);
}
qapi_event_send_memory_device_size_change(!!id, id, *size_p);
}
static void virtio_mem_pci_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
k->realize = virtio_mem_pci_realize;
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_MEM;
pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
pcidev_k->class_id = PCI_CLASS_OTHERS;
mdc->get_addr = virtio_mem_pci_get_addr;
mdc->set_addr = virtio_mem_pci_set_addr;
mdc->get_plugged_size = virtio_mem_pci_get_plugged_size;
mdc->get_memory_region = virtio_mem_pci_get_memory_region;
mdc->fill_device_info = virtio_mem_pci_fill_device_info;
}
static void virtio_mem_pci_instance_init(Object *obj)
{
VirtIOMEMPCI *dev = VIRTIO_MEM_PCI(obj);
VirtIOMEMClass *vmc;
VirtIOMEM *vmem;
virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
TYPE_VIRTIO_MEM);
dev->size_change_notifier.notify = virtio_mem_pci_size_change_notify;
vmem = VIRTIO_MEM(&dev->vdev);
vmc = VIRTIO_MEM_GET_CLASS(vmem);
/*
* We never remove the notifier again, as we expect both devices to
* disappear at the same time.
*/
vmc->add_size_change_notifier(vmem, &dev->size_change_notifier);
object_property_add_alias(obj, VIRTIO_MEM_BLOCK_SIZE_PROP,
OBJECT(&dev->vdev), VIRTIO_MEM_BLOCK_SIZE_PROP);
object_property_add_alias(obj, VIRTIO_MEM_SIZE_PROP, OBJECT(&dev->vdev),
VIRTIO_MEM_SIZE_PROP);
object_property_add_alias(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP,
OBJECT(&dev->vdev),
VIRTIO_MEM_REQUESTED_SIZE_PROP);
}
static const VirtioPCIDeviceTypeInfo virtio_mem_pci_info = {
.base_name = TYPE_VIRTIO_MEM_PCI,
.generic_name = "virtio-mem-pci",
.instance_size = sizeof(VirtIOMEMPCI),
.instance_init = virtio_mem_pci_instance_init,
.class_init = virtio_mem_pci_class_init,
.interfaces = (InterfaceInfo[]) {
{ TYPE_MEMORY_DEVICE },
{ }
},
};
static void virtio_mem_pci_register_types(void)
{
virtio_pci_types_register(&virtio_mem_pci_info);
}
type_init(virtio_mem_pci_register_types)

include/hw/virtio/virtio-mem-pci.h (new file)

@ -0,0 +1,34 @@
/*
* Virtio MEM PCI device
*
* Copyright (C) 2020 Red Hat, Inc.
*
* Authors:
* David Hildenbrand <david@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2.
* See the COPYING file in the top-level directory.
*/
#ifndef QEMU_VIRTIO_MEM_PCI_H
#define QEMU_VIRTIO_MEM_PCI_H
#include "hw/virtio/virtio-pci.h"
#include "hw/virtio/virtio-mem.h"
typedef struct VirtIOMEMPCI VirtIOMEMPCI;
/*
* virtio-mem-pci: This extends VirtioPCIProxy.
*/
#define TYPE_VIRTIO_MEM_PCI "virtio-mem-pci-base"
#define VIRTIO_MEM_PCI(obj) \
OBJECT_CHECK(VirtIOMEMPCI, (obj), TYPE_VIRTIO_MEM_PCI)
struct VirtIOMEMPCI {
VirtIOPCIProxy parent_obj;
VirtIOMEM vdev;
Notifier size_change_notifier;
};
#endif /* QEMU_VIRTIO_MEM_PCI_H */

hw/virtio/virtio-mem.c (new file)

@ -0,0 +1,873 @@
/*
* Virtio MEM device
*
* Copyright (C) 2020 Red Hat, Inc.
*
* Authors:
* David Hildenbrand <david@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/iov.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "sysemu/numa.h"
#include "sysemu/sysemu.h"
#include "sysemu/reset.h"
#include "hw/virtio/virtio.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "hw/virtio/virtio-mem.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "exec/ram_addr.h"
#include "migration/misc.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
#include "config-devices.h"
#include "trace.h"
/*
* Use QEMU_VMALLOC_ALIGN, so no THP will have to be split when unplugging
* memory (e.g., 2MB on x86_64).
*/
#define VIRTIO_MEM_MIN_BLOCK_SIZE QEMU_VMALLOC_ALIGN
/*
* Size the usable region bigger than the requested size if possible. Notably,
* Linux guests will only add (aligned) memory blocks in case they fully
* fit into the usable region, but plug+online only a subset of the pages.
* The memory block size corresponds mostly to the section size.
*
* This allows e.g., to add 20MB with a section size of 128MB on x86_64, and
* a section size of 1GB on arm64 (as long as the start address is properly
* aligned, similar to ordinary DIMMs).
*
* We can change this at any time and maybe even make it configurable if
* necessary (as the section size can change). But it's more likely that the
* section size will rather get smaller and not bigger over time.
*/
#if defined(TARGET_X86_64) || defined(TARGET_I386)
#define VIRTIO_MEM_USABLE_EXTENT (2 * (128 * MiB))
#else
#error VIRTIO_MEM_USABLE_EXTENT not defined
#endif
static bool virtio_mem_is_busy(void)
{
/*
* Postcopy cannot handle concurrent discards and we don't want to migrate
* pages on-demand with stale content when plugging new blocks.
*
* For precopy, we don't want unplugged blocks in our migration stream, and
* when plugging new blocks, the page content might differ between source
* and destination (observable by the guest when not initializing pages
* after plugging them) until we're running on the destination (as we didn't
* migrate these blocks when they were unplugged).
*/
return migration_in_incoming_postcopy() || !migration_is_idle();
}
static bool virtio_mem_test_bitmap(VirtIOMEM *vmem, uint64_t start_gpa,
uint64_t size, bool plugged)
{
const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size;
const unsigned long last_bit = first_bit + (size / vmem->block_size) - 1;
unsigned long found_bit;
/* We fake a shorter bitmap to avoid searching too far. */
if (plugged) {
found_bit = find_next_zero_bit(vmem->bitmap, last_bit + 1, first_bit);
} else {
found_bit = find_next_bit(vmem->bitmap, last_bit + 1, first_bit);
}
return found_bit > last_bit;
}
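To see the truncated-bitmap trick with hypothetical numbers: with block_size = 2 MiB, start_gpa = vmem->addr + 4 MiB and size = 6 MiB, first_bit is 2 and last_bit is 4. The search is bounded at last_bit + 1, so only bits 2..4 are inspected; when no mismatching bit exists in that window, find_next_bit()/find_next_zero_bit() return the bound, which is > last_bit, meaning the whole range is in the requested state.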
static void virtio_mem_set_bitmap(VirtIOMEM *vmem, uint64_t start_gpa,
uint64_t size, bool plugged)
{
const unsigned long bit = (start_gpa - vmem->addr) / vmem->block_size;
const unsigned long nbits = size / vmem->block_size;
if (plugged) {
bitmap_set(vmem->bitmap, bit, nbits);
} else {
bitmap_clear(vmem->bitmap, bit, nbits);
}
}
static void virtio_mem_send_response(VirtIOMEM *vmem, VirtQueueElement *elem,
struct virtio_mem_resp *resp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(vmem);
VirtQueue *vq = vmem->vq;
trace_virtio_mem_send_response(le16_to_cpu(resp->type));
iov_from_buf(elem->in_sg, elem->in_num, 0, resp, sizeof(*resp));
virtqueue_push(vq, elem, sizeof(*resp));
virtio_notify(vdev, vq);
}
static void virtio_mem_send_response_simple(VirtIOMEM *vmem,
VirtQueueElement *elem,
uint16_t type)
{
struct virtio_mem_resp resp = {
.type = cpu_to_le16(type),
};
virtio_mem_send_response(vmem, elem, &resp);
}
static bool virtio_mem_valid_range(VirtIOMEM *vmem, uint64_t gpa, uint64_t size)
{
if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) {
return false;
}
if (gpa + size < gpa || !size) {
return false;
}
if (gpa < vmem->addr || gpa >= vmem->addr + vmem->usable_region_size) {
return false;
}
if (gpa + size > vmem->addr + vmem->usable_region_size) {
return false;
}
return true;
}
static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
uint64_t size, bool plug)
{
const uint64_t offset = start_gpa - vmem->addr;
int ret;
if (virtio_mem_is_busy()) {
return -EBUSY;
}
if (!plug) {
ret = ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size);
if (ret) {
error_report("Unexpected error discarding RAM: %s",
strerror(-ret));
return -EBUSY;
}
}
virtio_mem_set_bitmap(vmem, start_gpa, size, plug);
return 0;
}
static int virtio_mem_state_change_request(VirtIOMEM *vmem, uint64_t gpa,
uint16_t nb_blocks, bool plug)
{
const uint64_t size = nb_blocks * vmem->block_size;
int ret;
if (!virtio_mem_valid_range(vmem, gpa, size)) {
return VIRTIO_MEM_RESP_ERROR;
}
if (plug && (vmem->size + size > vmem->requested_size)) {
return VIRTIO_MEM_RESP_NACK;
}
/* test if really all blocks are in the opposite state */
if (!virtio_mem_test_bitmap(vmem, gpa, size, !plug)) {
return VIRTIO_MEM_RESP_ERROR;
}
ret = virtio_mem_set_block_state(vmem, gpa, size, plug);
if (ret) {
return VIRTIO_MEM_RESP_BUSY;
}
if (plug) {
vmem->size += size;
} else {
vmem->size -= size;
}
notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
return VIRTIO_MEM_RESP_ACK;
}
static void virtio_mem_plug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
struct virtio_mem_req *req)
{
const uint64_t gpa = le64_to_cpu(req->u.plug.addr);
const uint16_t nb_blocks = le16_to_cpu(req->u.plug.nb_blocks);
uint16_t type;
trace_virtio_mem_plug_request(gpa, nb_blocks);
type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, true);
virtio_mem_send_response_simple(vmem, elem, type);
}
static void virtio_mem_unplug_request(VirtIOMEM *vmem, VirtQueueElement *elem,
struct virtio_mem_req *req)
{
const uint64_t gpa = le64_to_cpu(req->u.unplug.addr);
const uint16_t nb_blocks = le16_to_cpu(req->u.unplug.nb_blocks);
uint16_t type;
trace_virtio_mem_unplug_request(gpa, nb_blocks);
type = virtio_mem_state_change_request(vmem, gpa, nb_blocks, false);
virtio_mem_send_response_simple(vmem, elem, type);
}
static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
uint64_t requested_size,
bool can_shrink)
{
uint64_t newsize = MIN(memory_region_size(&vmem->memdev->mr),
requested_size + VIRTIO_MEM_USABLE_EXTENT);
if (!requested_size) {
newsize = 0;
}
if (newsize < vmem->usable_region_size && !can_shrink) {
return;
}
trace_virtio_mem_resized_usable_region(vmem->usable_region_size, newsize);
vmem->usable_region_size = newsize;
}
static int virtio_mem_unplug_all(VirtIOMEM *vmem)
{
RAMBlock *rb = vmem->memdev->mr.ram_block;
int ret;
if (virtio_mem_is_busy()) {
return -EBUSY;
}
ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
if (ret) {
error_report("Unexpected error discarding RAM: %s", strerror(-ret));
return -EBUSY;
}
bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
if (vmem->size) {
vmem->size = 0;
notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
}
trace_virtio_mem_unplugged_all();
virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
return 0;
}
static void virtio_mem_unplug_all_request(VirtIOMEM *vmem,
VirtQueueElement *elem)
{
trace_virtio_mem_unplug_all_request();
if (virtio_mem_unplug_all(vmem)) {
virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_BUSY);
} else {
virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ACK);
}
}
static void virtio_mem_state_request(VirtIOMEM *vmem, VirtQueueElement *elem,
struct virtio_mem_req *req)
{
const uint16_t nb_blocks = le16_to_cpu(req->u.state.nb_blocks);
const uint64_t gpa = le64_to_cpu(req->u.state.addr);
const uint64_t size = nb_blocks * vmem->block_size;
struct virtio_mem_resp resp = {
.type = cpu_to_le16(VIRTIO_MEM_RESP_ACK),
};
trace_virtio_mem_state_request(gpa, nb_blocks);
if (!virtio_mem_valid_range(vmem, gpa, size)) {
virtio_mem_send_response_simple(vmem, elem, VIRTIO_MEM_RESP_ERROR);
return;
}
if (virtio_mem_test_bitmap(vmem, gpa, size, true)) {
resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_PLUGGED);
} else if (virtio_mem_test_bitmap(vmem, gpa, size, false)) {
resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_UNPLUGGED);
} else {
resp.u.state.state = cpu_to_le16(VIRTIO_MEM_STATE_MIXED);
}
trace_virtio_mem_state_response(le16_to_cpu(resp.u.state.state));
virtio_mem_send_response(vmem, elem, &resp);
}
static void virtio_mem_handle_request(VirtIODevice *vdev, VirtQueue *vq)
{
const int len = sizeof(struct virtio_mem_req);
VirtIOMEM *vmem = VIRTIO_MEM(vdev);
VirtQueueElement *elem;
struct virtio_mem_req req;
uint16_t type;
while (true) {
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
return;
}
if (iov_to_buf(elem->out_sg, elem->out_num, 0, &req, len) < len) {
virtio_error(vdev, "virtio-mem protocol violation: invalid request"
" size: %d", len);
g_free(elem);
return;
}
if (iov_size(elem->in_sg, elem->in_num) <
sizeof(struct virtio_mem_resp)) {
virtio_error(vdev, "virtio-mem protocol violation: not enough space"
" for response: %zu",
iov_size(elem->in_sg, elem->in_num));
g_free(elem);
return;
}
type = le16_to_cpu(req.type);
switch (type) {
case VIRTIO_MEM_REQ_PLUG:
virtio_mem_plug_request(vmem, elem, &req);
break;
case VIRTIO_MEM_REQ_UNPLUG:
virtio_mem_unplug_request(vmem, elem, &req);
break;
case VIRTIO_MEM_REQ_UNPLUG_ALL:
virtio_mem_unplug_all_request(vmem, elem);
break;
case VIRTIO_MEM_REQ_STATE:
virtio_mem_state_request(vmem, elem, &req);
break;
default:
virtio_error(vdev, "virtio-mem protocol violation: unknown request"
" type: %d", type);
g_free(elem);
return;
}
g_free(elem);
}
}
static void virtio_mem_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
VirtIOMEM *vmem = VIRTIO_MEM(vdev);
struct virtio_mem_config *config = (void *) config_data;
config->block_size = cpu_to_le64(vmem->block_size);
config->node_id = cpu_to_le16(vmem->node);
config->requested_size = cpu_to_le64(vmem->requested_size);
config->plugged_size = cpu_to_le64(vmem->size);
config->addr = cpu_to_le64(vmem->addr);
config->region_size = cpu_to_le64(memory_region_size(&vmem->memdev->mr));
config->usable_region_size = cpu_to_le64(vmem->usable_region_size);
}
static uint64_t virtio_mem_get_features(VirtIODevice *vdev, uint64_t features,
Error **errp)
{
MachineState *ms = MACHINE(qdev_get_machine());
if (ms->numa_state) {
#if defined(CONFIG_ACPI)
virtio_add_feature(&features, VIRTIO_MEM_F_ACPI_PXM);
#endif
}
return features;
}
static void virtio_mem_system_reset(void *opaque)
{
VirtIOMEM *vmem = VIRTIO_MEM(opaque);
/*
* During usual resets, we will unplug all memory and shrink the usable
* region size. This is, however, not possible in all scenarios; in that
* case, the guest has to deal with it manually (VIRTIO_MEM_REQ_UNPLUG_ALL).
*/
virtio_mem_unplug_all(vmem);
}
static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
{
MachineState *ms = MACHINE(qdev_get_machine());
int nb_numa_nodes = ms->numa_state ? ms->numa_state->num_nodes : 0;
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOMEM *vmem = VIRTIO_MEM(dev);
uint64_t page_size;
RAMBlock *rb;
int ret;
if (!vmem->memdev) {
error_setg(errp, "'%s' property is not set", VIRTIO_MEM_MEMDEV_PROP);
return;
} else if (host_memory_backend_is_mapped(vmem->memdev)) {
char *path = object_get_canonical_path_component(OBJECT(vmem->memdev));
error_setg(errp, "'%s' property specifies a busy memdev: %s",
VIRTIO_MEM_MEMDEV_PROP, path);
g_free(path);
return;
} else if (!memory_region_is_ram(&vmem->memdev->mr) ||
memory_region_is_rom(&vmem->memdev->mr) ||
!vmem->memdev->mr.ram_block) {
error_setg(errp, "'%s' property specifies an unsupported memdev",
VIRTIO_MEM_MEMDEV_PROP);
return;
}
if ((nb_numa_nodes && vmem->node >= nb_numa_nodes) ||
(!nb_numa_nodes && vmem->node)) {
error_setg(errp, "'%s' property has value '%" PRIu32 "', which exceeds"
"the number of numa nodes: %d", VIRTIO_MEM_NODE_PROP,
vmem->node, nb_numa_nodes ? nb_numa_nodes : 1);
return;
}
if (enable_mlock) {
error_setg(errp, "Incompatible with mlock");
return;
}
rb = vmem->memdev->mr.ram_block;
page_size = qemu_ram_pagesize(rb);
if (vmem->block_size < page_size) {
error_setg(errp, "'%s' property has to be at least the page size (0x%"
PRIx64 ")", VIRTIO_MEM_BLOCK_SIZE_PROP, page_size);
return;
} else if (!QEMU_IS_ALIGNED(vmem->requested_size, vmem->block_size)) {
error_setg(errp, "'%s' property has to be multiples of '%s' (0x%" PRIx64
")", VIRTIO_MEM_REQUESTED_SIZE_PROP,
VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
return;
} else if (!QEMU_IS_ALIGNED(memory_region_size(&vmem->memdev->mr),
vmem->block_size)) {
error_setg(errp, "'%s' property memdev size has to be multiples of"
"'%s' (0x%" PRIx64 ")", VIRTIO_MEM_MEMDEV_PROP,
VIRTIO_MEM_BLOCK_SIZE_PROP, vmem->block_size);
return;
}
if (ram_block_discard_require(true)) {
error_setg(errp, "Discarding RAM is disabled");
return;
}
ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb));
if (ret) {
error_setg_errno(errp, -ret, "Unexpected error discarding RAM");
ram_block_discard_require(false);
return;
}
virtio_mem_resize_usable_region(vmem, vmem->requested_size, true);
vmem->bitmap_size = memory_region_size(&vmem->memdev->mr) /
vmem->block_size;
vmem->bitmap = bitmap_new(vmem->bitmap_size);
virtio_init(vdev, TYPE_VIRTIO_MEM, VIRTIO_ID_MEM,
sizeof(struct virtio_mem_config));
vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
host_memory_backend_set_mapped(vmem->memdev, true);
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
qemu_register_reset(virtio_mem_system_reset, vmem);
precopy_add_notifier(&vmem->precopy_notifier);
}
static void virtio_mem_device_unrealize(DeviceState *dev)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOMEM *vmem = VIRTIO_MEM(dev);
precopy_remove_notifier(&vmem->precopy_notifier);
qemu_unregister_reset(virtio_mem_system_reset, vmem);
vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
host_memory_backend_set_mapped(vmem->memdev, false);
virtio_del_queue(vdev, 0);
virtio_cleanup(vdev);
g_free(vmem->bitmap);
ram_block_discard_require(false);
}
static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
{
RAMBlock *rb = vmem->memdev->mr.ram_block;
unsigned long first_zero_bit, last_zero_bit;
uint64_t offset, length;
int ret;
/* Find consecutive unplugged blocks and discard the consecutive range. */
first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
while (first_zero_bit < vmem->bitmap_size) {
offset = first_zero_bit * vmem->block_size;
last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
first_zero_bit + 1) - 1;
length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
ret = ram_block_discard_range(rb, offset, length);
if (ret) {
error_report("Unexpected error discarding RAM: %s",
strerror(-ret));
return -EINVAL;
}
first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
last_zero_bit + 2);
}
return 0;
}
static int virtio_mem_post_load(void *opaque, int version_id)
{
if (migration_in_incoming_postcopy()) {
return 0;
}
return virtio_mem_restore_unplugged(VIRTIO_MEM(opaque));
}
typedef struct VirtIOMEMMigSanityChecks {
VirtIOMEM *parent;
uint64_t addr;
uint64_t region_size;
uint64_t block_size;
uint32_t node;
} VirtIOMEMMigSanityChecks;
static int virtio_mem_mig_sanity_checks_pre_save(void *opaque)
{
VirtIOMEMMigSanityChecks *tmp = opaque;
VirtIOMEM *vmem = tmp->parent;
tmp->addr = vmem->addr;
tmp->region_size = memory_region_size(&vmem->memdev->mr);
tmp->block_size = vmem->block_size;
tmp->node = vmem->node;
return 0;
}
static int virtio_mem_mig_sanity_checks_post_load(void *opaque, int version_id)
{
VirtIOMEMMigSanityChecks *tmp = opaque;
VirtIOMEM *vmem = tmp->parent;
const uint64_t new_region_size = memory_region_size(&vmem->memdev->mr);
if (tmp->addr != vmem->addr) {
error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
VIRTIO_MEM_ADDR_PROP, tmp->addr, vmem->addr);
return -EINVAL;
}
/*
* Note: Preparation for resizeable memory regions. The maximum size
* of the memory region must not change during migration.
*/
if (tmp->region_size != new_region_size) {
error_report("Property '%s' size changed from 0x%" PRIx64 " to 0x%"
PRIx64, VIRTIO_MEM_MEMDEV_PROP, tmp->region_size,
new_region_size);
return -EINVAL;
}
if (tmp->block_size != vmem->block_size) {
error_report("Property '%s' changed from 0x%" PRIx64 " to 0x%" PRIx64,
VIRTIO_MEM_BLOCK_SIZE_PROP, tmp->block_size,
vmem->block_size);
return -EINVAL;
}
if (tmp->node != vmem->node) {
error_report("Property '%s' changed from %" PRIu32 " to %" PRIu32,
VIRTIO_MEM_NODE_PROP, tmp->node, vmem->node);
return -EINVAL;
}
return 0;
}
static const VMStateDescription vmstate_virtio_mem_sanity_checks = {
.name = "virtio-mem-device/sanity-checks",
.pre_save = virtio_mem_mig_sanity_checks_pre_save,
.post_load = virtio_mem_mig_sanity_checks_post_load,
.fields = (VMStateField[]) {
VMSTATE_UINT64(addr, VirtIOMEMMigSanityChecks),
VMSTATE_UINT64(region_size, VirtIOMEMMigSanityChecks),
VMSTATE_UINT64(block_size, VirtIOMEMMigSanityChecks),
VMSTATE_UINT32(node, VirtIOMEMMigSanityChecks),
VMSTATE_END_OF_LIST(),
},
};
static const VMStateDescription vmstate_virtio_mem_device = {
.name = "virtio-mem-device",
.minimum_version_id = 1,
.version_id = 1,
.post_load = virtio_mem_post_load,
.fields = (VMStateField[]) {
VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks,
vmstate_virtio_mem_sanity_checks),
VMSTATE_UINT64(usable_region_size, VirtIOMEM),
VMSTATE_UINT64(size, VirtIOMEM),
VMSTATE_UINT64(requested_size, VirtIOMEM),
VMSTATE_BITMAP(bitmap, VirtIOMEM, 0, bitmap_size),
VMSTATE_END_OF_LIST()
},
};
static const VMStateDescription vmstate_virtio_mem = {
.name = "virtio-mem",
.minimum_version_id = 1,
.version_id = 1,
.fields = (VMStateField[]) {
VMSTATE_VIRTIO_DEVICE,
VMSTATE_END_OF_LIST()
},
};
static void virtio_mem_fill_device_info(const VirtIOMEM *vmem,
VirtioMEMDeviceInfo *vi)
{
vi->memaddr = vmem->addr;
vi->node = vmem->node;
vi->requested_size = vmem->requested_size;
vi->size = vmem->size;
vi->max_size = memory_region_size(&vmem->memdev->mr);
vi->block_size = vmem->block_size;
vi->memdev = object_get_canonical_path(OBJECT(vmem->memdev));
}
static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
{
if (!vmem->memdev) {
error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
return NULL;
}
return &vmem->memdev->mr;
}
static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
Notifier *notifier)
{
notifier_list_add(&vmem->size_change_notifiers, notifier);
}
static void virtio_mem_remove_size_change_notifier(VirtIOMEM *vmem,
Notifier *notifier)
{
notifier_remove(notifier);
}
static void virtio_mem_get_size(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
{
const VirtIOMEM *vmem = VIRTIO_MEM(obj);
uint64_t value = vmem->size;
visit_type_size(v, name, &value, errp);
}
static void virtio_mem_get_requested_size(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
const VirtIOMEM *vmem = VIRTIO_MEM(obj);
uint64_t value = vmem->requested_size;
visit_type_size(v, name, &value, errp);
}
static void virtio_mem_set_requested_size(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
VirtIOMEM *vmem = VIRTIO_MEM(obj);
Error *err = NULL;
uint64_t value;
visit_type_size(v, name, &value, &err);
if (err) {
error_propagate(errp, err);
return;
}
/*
* The block size and memory backend are not fixed until the device is
* realized; realize() will verify these properties at that point.
*/
if (DEVICE(obj)->realized) {
if (!QEMU_IS_ALIGNED(value, vmem->block_size)) {
error_setg(errp, "'%s' has to be multiples of '%s' (0x%" PRIx64
")", name, VIRTIO_MEM_BLOCK_SIZE_PROP,
vmem->block_size);
return;
} else if (value > memory_region_size(&vmem->memdev->mr)) {
error_setg(errp, "'%s' cannot exceed the memory backend size"
"(0x%" PRIx64 ")", name,
memory_region_size(&vmem->memdev->mr));
return;
}
if (value != vmem->requested_size) {
virtio_mem_resize_usable_region(vmem, value, false);
vmem->requested_size = value;
}
/*
* Trigger a config update so the guest gets notified. We trigger
* even if the size didn't change (especially helpful for debugging).
*/
virtio_notify_config(VIRTIO_DEVICE(vmem));
} else {
vmem->requested_size = value;
}
}
static void virtio_mem_get_block_size(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
{
const VirtIOMEM *vmem = VIRTIO_MEM(obj);
uint64_t value = vmem->block_size;
visit_type_size(v, name, &value, errp);
}
static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
{
VirtIOMEM *vmem = VIRTIO_MEM(obj);
Error *err = NULL;
uint64_t value;
if (DEVICE(obj)->realized) {
error_setg(errp, "'%s' cannot be changed", name);
return;
}
visit_type_size(v, name, &value, &err);
if (err) {
error_propagate(errp, err);
return;
}
if (value < VIRTIO_MEM_MIN_BLOCK_SIZE) {
error_setg(errp, "'%s' property has to be at least 0x%" PRIx32, name,
VIRTIO_MEM_MIN_BLOCK_SIZE);
return;
} else if (!is_power_of_2(value)) {
error_setg(errp, "'%s' property has to be a power of two", name);
return;
}
vmem->block_size = value;
}
static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem)
{
void * const host = qemu_ram_get_host_addr(vmem->memdev->mr.ram_block);
unsigned long first_zero_bit, last_zero_bit;
uint64_t offset, length;
/*
* Find consecutive unplugged blocks and exclude them from migration.
*
* Note: Blocks cannot get (un)plugged during precopy, no locking needed.
*/
first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size);
while (first_zero_bit < vmem->bitmap_size) {
offset = first_zero_bit * vmem->block_size;
last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
first_zero_bit + 1) - 1;
length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size;
qemu_guest_free_page_hint(host + offset, length);
first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
last_zero_bit + 2);
}
}
static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data)
{
VirtIOMEM *vmem = container_of(n, VirtIOMEM, precopy_notifier);
PrecopyNotifyData *pnd = data;
switch (pnd->reason) {
case PRECOPY_NOTIFY_SETUP:
precopy_enable_free_page_optimization();
break;
case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC:
virtio_mem_precopy_exclude_unplugged(vmem);
break;
default:
break;
}
return 0;
}
static void virtio_mem_instance_init(Object *obj)
{
VirtIOMEM *vmem = VIRTIO_MEM(obj);
vmem->block_size = VIRTIO_MEM_MIN_BLOCK_SIZE;
notifier_list_init(&vmem->size_change_notifiers);
vmem->precopy_notifier.notify = virtio_mem_precopy_notify;
object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
NULL, NULL, NULL);
object_property_add(obj, VIRTIO_MEM_REQUESTED_SIZE_PROP, "size",
virtio_mem_get_requested_size,
virtio_mem_set_requested_size, NULL, NULL);
object_property_add(obj, VIRTIO_MEM_BLOCK_SIZE_PROP, "size",
virtio_mem_get_block_size, virtio_mem_set_block_size,
NULL, NULL);
}
static Property virtio_mem_properties[] = {
DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
DEFINE_PROP_LINK(VIRTIO_MEM_MEMDEV_PROP, VirtIOMEM, memdev,
TYPE_MEMORY_BACKEND, HostMemoryBackend *),
DEFINE_PROP_END_OF_LIST(),
};
static void virtio_mem_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass);
device_class_set_props(dc, virtio_mem_properties);
dc->vmsd = &vmstate_virtio_mem;
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
vdc->realize = virtio_mem_device_realize;
vdc->unrealize = virtio_mem_device_unrealize;
vdc->get_config = virtio_mem_get_config;
vdc->get_features = virtio_mem_get_features;
vdc->vmsd = &vmstate_virtio_mem_device;
vmc->fill_device_info = virtio_mem_fill_device_info;
vmc->get_memory_region = virtio_mem_get_memory_region;
vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
}
static const TypeInfo virtio_mem_info = {
.name = TYPE_VIRTIO_MEM,
.parent = TYPE_VIRTIO_DEVICE,
.instance_size = sizeof(VirtIOMEM),
.instance_init = virtio_mem_instance_init,
.class_init = virtio_mem_class_init,
.class_size = sizeof(VirtIOMEMClass),
};
static void virtio_register_types(void)
{
type_register_static(&virtio_mem_info);
}
type_init(virtio_register_types)
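As a rough usage sketch (IDs and sizes invented for illustration), a virtio-mem-pci device backed by this implementation could be instantiated as:

qemu-system-x86_64 ... \
    -m 4G,maxmem=20G \
    -object memory-backend-ram,id=vmem0,size=16G \
    -device virtio-mem-pci,id=vm0,memdev=vmem0,node=0,requested-size=1G

Since requested-size stays writable after realize (see virtio_mem_set_requested_size() above), it can later be raised or lowered at runtime, e.g. via qom-set, to ask the guest to plug or unplug blocks.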

hw/virtio/virtio-pci.c

@ -1107,6 +1107,18 @@ static AddressSpace *virtio_pci_get_dma_as(DeviceState *d)
return pci_get_address_space(dev);
}
static bool virtio_pci_queue_enabled(DeviceState *d, int n)
{
VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
return proxy->vqs[vdev->queue_sel].enabled;
}
return virtio_queue_enabled(vdev, n);
}
static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
struct virtio_pci_cap *cap)
{
@ -2064,6 +2076,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
k->ioeventfd_enabled = virtio_pci_ioeventfd_enabled;
k->ioeventfd_assign = virtio_pci_ioeventfd_assign;
k->get_dma_as = virtio_pci_get_dma_as;
k->queue_enabled = virtio_pci_queue_enabled;
}
static const TypeInfo virtio_pci_bus_info = {

hw/virtio/virtio.c

@ -3286,6 +3286,12 @@ hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n)
bool virtio_queue_enabled(VirtIODevice *vdev, int n)
{
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
if (k->queue_enabled) {
return k->queue_enabled(qbus->parent, n);
}
return virtio_queue_get_desc_addr(vdev, n) != 0;
}

include/exec/memory.h

@ -2478,6 +2478,47 @@ static inline MemOp devend_memop(enum device_endian end)
}
#endif
/*
* Inhibit technologies that require discarding of pages in RAM blocks, e.g.,
* to manage the actual amount of memory consumed by the VM (then, the memory
* provided by RAM blocks might be bigger than the desired memory consumption).
* This *must* be set if:
* - Discarding parts of a RAM block does not result in the change being
* reflected in the VM and the pages getting freed.
* - All memory in RAM blocks is pinned or duplicated, invalidating any previous
* discards blindly.
* - Discarding parts of a RAM block will result in integrity issues (e.g.,
* encrypted VMs).
* Technologies that only temporarily pin the current working set of a
* driver are fine, because we don't expect such pages to be discarded
* (esp. based on guest action like balloon inflation).
*
* This is *not* to be used to protect from concurrent discards (esp.,
* postcopy).
*
* Returns 0 if successful. Returns -EBUSY if a technology that relies on
* discards to work reliably is active.
*/
int ram_block_discard_disable(bool state);
/*
* Inhibit technologies that disable discarding of pages in RAM blocks.
*
* Returns 0 if successful. Returns -EBUSY if discarding of pages in RAM
* blocks has already been disabled.
*/
int ram_block_discard_require(bool state);
/*
* Test if discarding of memory in ram blocks is disabled.
*/
bool ram_block_discard_is_disabled(void);
/*
* Test if discarding of memory in ram blocks is required to work reliably.
*/
bool ram_block_discard_is_required(void);
#endif
#endif
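A minimal sketch of the intended pairing, with invented callback names (my_device_realize/my_device_unrealize are illustrations, not part of this series): a consumer that depends on discards working, as virtio-mem does, takes the requirement at realize time and drops it again at unrealize, while a technology that pins or duplicates all guest RAM would call ram_block_discard_disable(true) instead.

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "exec/memory.h"
#include "hw/qdev-core.h"

static void my_device_realize(DeviceState *dev, Error **errp)
{
    /* Fail realization if a pinning technology already inhibited discards. */
    if (ram_block_discard_require(true)) {
        error_setg(errp, "Discarding RAM is disabled");
        return;
    }
    /* ... device setup that relies on ram_block_discard_range() ... */
}

static void my_device_unrealize(DeviceState *dev)
{
    /* ... device teardown ... */
    ram_block_discard_require(false);
}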

include/hw/boards.h

@ -207,6 +207,7 @@ struct MachineClass {
const char **valid_cpu_types;
strList *allowed_dynamic_sysbus_devices;
bool auto_enable_numa_with_memhp;
bool auto_enable_numa_with_memdev;
void (*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes,
int nb_nodes, ram_addr_t size);
bool ignore_boot_device_suffixes;

include/hw/pci/pci.h

@ -87,6 +87,7 @@ extern bool pci_available;
#define PCI_DEVICE_ID_VIRTIO_VSOCK 0x1012
#define PCI_DEVICE_ID_VIRTIO_PMEM 0x1013
#define PCI_DEVICE_ID_VIRTIO_IOMMU 0x1014
#define PCI_DEVICE_ID_VIRTIO_MEM 0x1015
#define PCI_VENDOR_ID_REDHAT 0x1b36
#define PCI_DEVICE_ID_REDHAT_BRIDGE 0x0001

include/hw/vfio/vfio-common.h

@ -108,7 +108,7 @@ typedef struct VFIODevice {
bool reset_works;
bool needs_reset;
bool no_mmap;
bool balloon_allowed;
bool ram_block_discard_allowed;
VFIODeviceOps *ops;
unsigned int num_irqs;
unsigned int num_regions;
@ -128,7 +128,7 @@ typedef struct VFIOGroup {
QLIST_HEAD(, VFIODevice) device_list;
QLIST_ENTRY(VFIOGroup) next;
QLIST_ENTRY(VFIOGroup) container_next;
bool balloon_allowed;
bool ram_block_discard_allowed;
} VFIOGroup;
typedef struct VFIODMABuf {

include/hw/virtio/vhost-backend.h

@ -17,7 +17,8 @@ typedef enum VhostBackendType {
VHOST_BACKEND_TYPE_NONE = 0,
VHOST_BACKEND_TYPE_KERNEL = 1,
VHOST_BACKEND_TYPE_USER = 2,
VHOST_BACKEND_TYPE_MAX = 3,
VHOST_BACKEND_TYPE_VDPA = 3,
VHOST_BACKEND_TYPE_MAX = 4,
} VhostBackendType;
typedef enum VhostSetConfigType {
@ -34,6 +35,7 @@ struct vhost_vring_state;
struct vhost_vring_addr;
struct vhost_scsi_target;
struct vhost_iotlb_msg;
struct vhost_virtqueue;
typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque);
typedef int (*vhost_backend_cleanup)(struct vhost_dev *dev);
@ -112,6 +114,16 @@ typedef int (*vhost_get_inflight_fd_op)(struct vhost_dev *dev,
typedef int (*vhost_set_inflight_fd_op)(struct vhost_dev *dev,
struct vhost_inflight *inflight);
typedef int (*vhost_dev_start_op)(struct vhost_dev *dev, bool started);
typedef int (*vhost_vq_get_addr_op)(struct vhost_dev *dev,
struct vhost_vring_addr *addr,
struct vhost_virtqueue *vq);
typedef int (*vhost_get_device_id_op)(struct vhost_dev *dev, uint32_t *dev_id);
typedef bool (*vhost_force_iommu_op)(struct vhost_dev *dev);
typedef struct VhostOps {
VhostBackendType backend_type;
vhost_backend_init vhost_backend_init;
@ -152,9 +164,14 @@ typedef struct VhostOps {
vhost_backend_mem_section_filter_op vhost_backend_mem_section_filter;
vhost_get_inflight_fd_op vhost_get_inflight_fd;
vhost_set_inflight_fd_op vhost_set_inflight_fd;
vhost_dev_start_op vhost_dev_start;
vhost_vq_get_addr_op vhost_vq_get_addr;
vhost_get_device_id_op vhost_get_device_id;
vhost_force_iommu_op vhost_force_iommu;
} VhostOps;
extern const VhostOps user_ops;
extern const VhostOps vdpa_ops;
int vhost_set_backend_type(struct vhost_dev *dev,
VhostBackendType backend_type);

include/hw/virtio/vhost-vdpa.h (new file)

@ -0,0 +1,26 @@
/*
* vhost-vdpa.h
*
* Copyright(c) 2017-2018 Intel Corporation.
* Copyright(c) 2020 Red Hat, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef HW_VIRTIO_VHOST_VDPA_H
#define HW_VIRTIO_VHOST_VDPA_H
#include "hw/virtio/virtio.h"
typedef struct vhost_vdpa {
int device_fd;
uint32_t msg_type;
MemoryListener listener;
} VhostVDPA;
extern AddressSpace address_space_memory;
extern int vhost_vdpa_get_device_id(struct vhost_dev *dev,
uint32_t *device_id);
#endif

include/hw/virtio/vhost.h

@ -92,6 +92,13 @@ struct vhost_dev {
const VhostDevConfigOps *config_ops;
};
struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[2];
int backend;
NetClientState *nc;
};
int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
VhostBackendType backend_type,
uint32_t busyloop_timeout);

include/hw/virtio/virtio-bus.h

@ -83,6 +83,10 @@ typedef struct VirtioBusClass {
*/
int (*ioeventfd_assign)(DeviceState *d, EventNotifier *notifier,
int n, bool assign);
/*
* Whether queue number n is enabled.
*/
bool (*queue_enabled)(DeviceState *d, int n);
/*
* Does the transport have variable vring alignment?
* (ie can it ever call virtio_queue_set_align()?)

include/hw/virtio/virtio-mem.h (new file)

@ -0,0 +1,86 @@
/*
* Virtio MEM device
*
* Copyright (C) 2020 Red Hat, Inc.
*
* Authors:
* David Hildenbrand <david@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2.
* See the COPYING file in the top-level directory.
*/
#ifndef HW_VIRTIO_MEM_H
#define HW_VIRTIO_MEM_H
#include "standard-headers/linux/virtio_mem.h"
#include "hw/virtio/virtio.h"
#include "qapi/qapi-types-misc.h"
#include "sysemu/hostmem.h"
#define TYPE_VIRTIO_MEM "virtio-mem"
#define VIRTIO_MEM(obj) \
OBJECT_CHECK(VirtIOMEM, (obj), TYPE_VIRTIO_MEM)
#define VIRTIO_MEM_CLASS(oc) \
OBJECT_CLASS_CHECK(VirtIOMEMClass, (oc), TYPE_VIRTIO_MEM)
#define VIRTIO_MEM_GET_CLASS(obj) \
OBJECT_GET_CLASS(VirtIOMEMClass, (obj), TYPE_VIRTIO_MEM)
#define VIRTIO_MEM_MEMDEV_PROP "memdev"
#define VIRTIO_MEM_NODE_PROP "node"
#define VIRTIO_MEM_SIZE_PROP "size"
#define VIRTIO_MEM_REQUESTED_SIZE_PROP "requested-size"
#define VIRTIO_MEM_BLOCK_SIZE_PROP "block-size"
#define VIRTIO_MEM_ADDR_PROP "memaddr"
typedef struct VirtIOMEM {
VirtIODevice parent_obj;
/* guest -> host request queue */
VirtQueue *vq;
/* bitmap used to track unplugged memory */
int32_t bitmap_size;
unsigned long *bitmap;
/* assigned memory backend and memory region */
HostMemoryBackend *memdev;
/* NUMA node */
uint32_t node;
/* assigned address of the region in guest physical memory */
uint64_t addr;
/* usable region size (<= region_size) */
uint64_t usable_region_size;
/* actual size (how much the guest plugged) */
uint64_t size;
/* requested size */
uint64_t requested_size;
/* block size and alignment */
uint64_t block_size;
/* notifiers to notify when "size" changes */
NotifierList size_change_notifiers;
/* don't migrate unplugged memory */
NotifierWithReturn precopy_notifier;
} VirtIOMEM;
typedef struct VirtIOMEMClass {
/* private */
VirtIODevice parent;
/* public */
void (*fill_device_info)(const VirtIOMEM *vmem, VirtioMEMDeviceInfo *vi);
MemoryRegion *(*get_memory_region)(VirtIOMEM *vmem, Error **errp);
void (*add_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
void (*remove_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
} VirtIOMEMClass;
#endif

include/migration/colo.h

@ -25,7 +25,7 @@ void migrate_start_colo_process(MigrationState *s);
bool migration_in_colo_state(void);
/* loadvm */
void migration_incoming_enable_colo(void);
int migration_incoming_enable_colo(void);
void migration_incoming_disable_colo(void);
bool migration_incoming_colo_enabled(void);
void *colo_process_incoming_thread(void *opaque);

include/migration/misc.h

@ -69,6 +69,8 @@ bool migration_has_failed(MigrationState *);
/* ...and after the device transmission */
bool migration_in_postcopy_after_devices(MigrationState *);
void migration_global_dump(Monitor *mon);
/* True if incoming migration entered POSTCOPY_INCOMING_DISCARD */
bool migration_in_incoming_postcopy(void);
/* migration/block-dirty-bitmap.c */
void dirty_bitmap_mig_init(void);

include/net/net.h

@ -176,6 +176,7 @@ void hmp_info_network(Monitor *mon, const QDict *qdict);
void net_socket_rs_init(SocketReadState *rs,
SocketReadStateFinalize *finalize,
bool vnet_hdr);
NetClientState *qemu_get_peer(NetClientState *nc, int queue_index);
/* NIC info */

include/net/vhost-vdpa.h (new file)

@ -0,0 +1,22 @@
/*
* vhost-vdpa.h
*
* Copyright(c) 2017-2018 Intel Corporation.
* Copyright(c) 2020 Red Hat, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef VHOST_VDPA_H
#define VHOST_VDPA_H
#define TYPE_VHOST_VDPA "vhost-vdpa"
struct vhost_net *vhost_vdpa_get_vhost_net(NetClientState *nc);
uint64_t vhost_vdpa_get_acked_features(NetClientState *nc);
extern const int vdpa_feature_bits[];
#endif /* VHOST_VDPA_H */

include/net/vhost_net.h

@ -28,6 +28,11 @@ void vhost_net_cleanup(VHostNetState *net);
uint64_t vhost_net_get_features(VHostNetState *net, uint64_t features);
void vhost_net_ack_features(VHostNetState *net, uint64_t features);
int vhost_net_get_config(struct vhost_net *net, uint8_t *config,
uint32_t config_len);
int vhost_net_set_config(struct vhost_net *net, const uint8_t *data,
uint32_t offset, uint32_t size, uint32_t flags);
bool vhost_net_virtqueue_pending(VHostNetState *net, int n);
void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev,
int idx, bool mask);

include/sysemu/balloon.h

@ -23,7 +23,5 @@ typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info);
int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
QEMUBalloonStatus *stat_func, void *opaque);
void qemu_remove_balloon_handler(void *opaque);
bool qemu_balloon_is_inhibited(void);
void qemu_balloon_inhibit(bool state);
#endif

migration/migration.c

@ -338,12 +338,18 @@ bool migration_incoming_colo_enabled(void)
void migration_incoming_disable_colo(void)
{
ram_block_discard_disable(false);
migration_colo_enabled = false;
}
void migration_incoming_enable_colo(void)
int migration_incoming_enable_colo(void)
{
if (ram_block_discard_disable(true)) {
error_report("COLO: cannot disable RAM discard");
return -EBUSY;
}
migration_colo_enabled = true;
return 0;
}
void migrate_add_address(SocketAddress *address)
@ -1772,6 +1778,13 @@ bool migration_in_postcopy_after_devices(MigrationState *s)
return migration_in_postcopy() && s->postcopy_after_devices;
}
bool migration_in_incoming_postcopy(void)
{
PostcopyState ps = postcopy_state_get();
return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
}
bool migration_is_idle(void)
{
MigrationState *s = current_migration;

migration/postcopy-ram.c

@ -27,7 +27,6 @@
#include "qemu/notify.h"
#include "qemu/rcu.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "hw/boards.h"
@ -520,20 +519,6 @@ int postcopy_ram_incoming_init(MigrationIncomingState *mis)
return 0;
}
/*
* Manage a single vote to the QEMU balloon inhibitor for all postcopy usage,
* last caller wins.
*/
static void postcopy_balloon_inhibit(bool state)
{
static bool cur_state = false;
if (state != cur_state) {
qemu_balloon_inhibit(state);
cur_state = state;
}
}
/*
* At the end of a migration where postcopy_ram_incoming_init was called.
*/
@ -565,8 +550,6 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
mis->have_fault_thread = false;
}
postcopy_balloon_inhibit(false);
if (enable_mlock) {
if (os_mlock() < 0) {
error_report("mlock: %s", strerror(errno));
@ -1160,12 +1143,6 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
}
memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
/*
* Ballooning can mark pages as absent while we're postcopying
* that would cause false userfaults.
*/
postcopy_balloon_inhibit(true);
trace_postcopy_ram_enable_notify();
return 0;

migration/rdma.c

@ -29,6 +29,7 @@
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "qemu/coroutine.h"
#include "exec/memory.h"
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
@ -4016,8 +4017,14 @@ void rdma_start_incoming_migration(const char *host_port, Error **errp)
Error *local_err = NULL;
trace_rdma_start_incoming_migration();
rdma = qemu_rdma_data_init(host_port, &local_err);
/* Avoid ram_block_discard_disable(), cannot change during migration. */
if (ram_block_discard_is_required()) {
error_setg(errp, "RDMA: cannot disable RAM discard");
return;
}
rdma = qemu_rdma_data_init(host_port, &local_err);
if (rdma == NULL) {
goto err;
}
@ -4066,10 +4073,17 @@ void rdma_start_outgoing_migration(void *opaque,
const char *host_port, Error **errp)
{
MigrationState *s = opaque;
RDMAContext *rdma = qemu_rdma_data_init(host_port, errp);
RDMAContext *rdma_return_path = NULL;
RDMAContext *rdma;
int ret = 0;
/* Avoid ram_block_discard_disable(), cannot change during migration. */
if (ram_block_discard_is_required()) {
error_setg(errp, "RDMA: cannot disable RAM discard");
return;
}
rdma = qemu_rdma_data_init(host_port, errp);
if (rdma == NULL) {
goto err;
}

migration/savevm.c

@ -2111,8 +2111,15 @@ static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
static int loadvm_process_enable_colo(MigrationIncomingState *mis)
{
migration_incoming_enable_colo();
return colo_init_ram_cache();
int ret = migration_incoming_enable_colo();
if (!ret) {
ret = colo_init_ram_cache();
if (ret) {
migration_incoming_disable_colo();
}
}
return ret;
}
/*

monitor/hmp-cmds.c

@ -1821,6 +1821,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
MemoryDeviceInfoList *info_list = qmp_query_memory_devices(&err);
MemoryDeviceInfoList *info;
VirtioPMEMDeviceInfo *vpi;
VirtioMEMDeviceInfo *vmi;
MemoryDeviceInfo *value;
PCDIMMDeviceInfo *di;
@ -1855,6 +1856,21 @@ void hmp_info_memory_devices(Monitor *mon, const QDict *qdict)
monitor_printf(mon, " size: %" PRIu64 "\n", vpi->size);
monitor_printf(mon, " memdev: %s\n", vpi->memdev);
break;
case MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM:
vmi = value->u.virtio_mem.data;
monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
MemoryDeviceInfoKind_str(value->type),
vmi->id ? vmi->id : "");
monitor_printf(mon, " memaddr: 0x%" PRIx64 "\n", vmi->memaddr);
monitor_printf(mon, " node: %" PRId64 "\n", vmi->node);
monitor_printf(mon, " requested-size: %" PRIu64 "\n",
vmi->requested_size);
monitor_printf(mon, " size: %" PRIu64 "\n", vmi->size);
monitor_printf(mon, " max-size: %" PRIu64 "\n", vmi->max_size);
monitor_printf(mon, " block-size: %" PRIu64 "\n",
vmi->block_size);
monitor_printf(mon, " memdev: %s\n", vmi->memdev);
break;
default:
g_assert_not_reached();
}

monitor/monitor.c

@ -235,6 +235,7 @@ static MonitorQAPIEventConf monitor_qapi_event_conf[QAPI_EVENT__MAX] = {
[QAPI_EVENT_QUORUM_REPORT_BAD] = { 1000 * SCALE_MS },
[QAPI_EVENT_QUORUM_FAILURE] = { 1000 * SCALE_MS },
[QAPI_EVENT_VSERPORT_CHANGE] = { 1000 * SCALE_MS },
[QAPI_EVENT_MEMORY_DEVICE_SIZE_CHANGE] = { 1000 * SCALE_MS },
};
/*

net/Makefile.objs

@ -26,7 +26,7 @@ tap-obj-$(CONFIG_SOLARIS) = tap-solaris.o
tap-obj-y ?= tap-stub.o
common-obj-$(CONFIG_POSIX) += tap.o $(tap-obj-y)
common-obj-$(CONFIG_WIN32) += tap-win32.o
common-obj-$(CONFIG_VHOST_NET_VDPA) += vhost-vdpa.o
vde.o-libs = $(VDE_LIBS)
common-obj-$(CONFIG_CAN_BUS) += can/

net/clients.h

@ -61,4 +61,6 @@ int net_init_netmap(const Netdev *netdev, const char *name,
int net_init_vhost_user(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp);
#endif /* QEMU_NET_CLIENTS_H */

net/net.c

@ -325,6 +325,13 @@ void *qemu_get_nic_opaque(NetClientState *nc)
return nic->opaque;
}
NetClientState *qemu_get_peer(NetClientState *nc, int queue_index)
{
assert(nc != NULL);
NetClientState *ncs = nc + queue_index;
return ncs->peer;
}
static void qemu_cleanup_net_client(NetClientState *nc)
{
QTAILQ_REMOVE(&net_clients, nc, next);
@ -959,6 +966,9 @@ static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
#ifdef CONFIG_VHOST_NET_USER
[NET_CLIENT_DRIVER_VHOST_USER] = net_init_vhost_user,
#endif
#ifdef CONFIG_VHOST_NET_VDPA
[NET_CLIENT_DRIVER_VHOST_VDPA] = net_init_vhost_vdpa,
#endif
#ifdef CONFIG_L2TPV3
[NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
#endif

net/vhost-vdpa.c (new file)

@ -0,0 +1,228 @@
/*
* vhost-vdpa.c
*
* Copyright(c) 2017-2018 Intel Corporation.
* Copyright(c) 2020 Red Hat, Inc.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "clients.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <sys/ioctl.h>
#include <err.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "hw/virtio/vhost.h"
/* TODO: add multiqueue support here */
typedef struct VhostVDPAState {
NetClientState nc;
struct vhost_vdpa vhost_vdpa;
VHostNetState *vhost_net;
uint64_t acked_features;
bool started;
} VhostVDPAState;
const int vdpa_feature_bits[] = {
VIRTIO_F_NOTIFY_ON_EMPTY,
VIRTIO_RING_F_INDIRECT_DESC,
VIRTIO_RING_F_EVENT_IDX,
VIRTIO_F_ANY_LAYOUT,
VIRTIO_F_VERSION_1,
VIRTIO_NET_F_CSUM,
VIRTIO_NET_F_GUEST_CSUM,
VIRTIO_NET_F_GSO,
VIRTIO_NET_F_GUEST_TSO4,
VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_ECN,
VIRTIO_NET_F_GUEST_UFO,
VIRTIO_NET_F_HOST_TSO4,
VIRTIO_NET_F_HOST_TSO6,
VIRTIO_NET_F_HOST_ECN,
VIRTIO_NET_F_HOST_UFO,
VIRTIO_NET_F_MRG_RXBUF,
VIRTIO_NET_F_MTU,
VIRTIO_F_IOMMU_PLATFORM,
VIRTIO_F_RING_PACKED,
VIRTIO_NET_F_GUEST_ANNOUNCE,
VHOST_INVALID_FEATURE_BIT
};
VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
return s->vhost_net;
}
uint64_t vhost_vdpa_get_acked_features(NetClientState *nc)
{
VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
s->acked_features = vhost_net_get_acked_features(s->vhost_net);
return s->acked_features;
}
static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
uint32_t device_id;
int ret;
struct vhost_dev *hdev;
hdev = (struct vhost_dev *)&net->dev;
ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
if (ret) {
return ret;
}
if (device_id != VIRTIO_ID_NET) {
return -ENOTSUP;
}
return 0;
}
static void vhost_vdpa_del(NetClientState *ncs)
{
VhostVDPAState *s;
assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
s = DO_UPCAST(VhostVDPAState, nc, ncs);
if (s->vhost_net) {
vhost_net_cleanup(s->vhost_net);
}
}
static int vhost_vdpa_add(NetClientState *ncs, void *be)
{
VhostNetOptions options;
struct vhost_net *net = NULL;
VhostVDPAState *s;
int ret;
options.backend_type = VHOST_BACKEND_TYPE_VDPA;
assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
s = DO_UPCAST(VhostVDPAState, nc, ncs);
options.net_backend = ncs;
options.opaque = be;
options.busyloop_timeout = 0;
net = vhost_net_init(&options);
if (!net) {
error_report("failed to init vhost_net for queue");
goto err;
}
if (s->vhost_net) {
vhost_net_cleanup(s->vhost_net);
g_free(s->vhost_net);
}
s->vhost_net = net;
ret = vhost_vdpa_net_check_device_id(net);
if (ret) {
goto err;
}
return 0;
err:
if (net) {
vhost_net_cleanup(net);
}
vhost_vdpa_del(ncs);
return -1;
}
static void vhost_vdpa_cleanup(NetClientState *nc)
{
VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
if (s->vhost_net) {
vhost_net_cleanup(s->vhost_net);
g_free(s->vhost_net);
s->vhost_net = NULL;
}
}
static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
return true;
}
static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
uint64_t features = 0;
features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
features = vhost_net_get_features(s->vhost_net, features);
return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}
static NetClientInfo net_vhost_vdpa_info = {
.type = NET_CLIENT_DRIVER_VHOST_VDPA,
.size = sizeof(VhostVDPAState),
.cleanup = vhost_vdpa_cleanup,
.has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
.has_ufo = vhost_vdpa_has_ufo,
};
static int net_vhost_vdpa_init(NetClientState *peer, const char *device,
const char *name, const char *vhostdev)
{
NetClientState *nc = NULL;
VhostVDPAState *s;
int vdpa_device_fd = -1;
int ret = 0;
assert(name);
nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device, name);
snprintf(nc->info_str, sizeof(nc->info_str), TYPE_VHOST_VDPA);
nc->queue_index = 0;
s = DO_UPCAST(VhostVDPAState, nc, nc);
vdpa_device_fd = qemu_open(vhostdev, O_RDWR);
if (vdpa_device_fd == -1) {
return -errno;
}
s->vhost_vdpa.device_fd = vdpa_device_fd;
ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa);
assert(s->vhost_net);
return ret;
}
static int net_vhost_check_net(void *opaque, QemuOpts *opts, Error **errp)
{
const char *name = opaque;
const char *driver, *netdev;
driver = qemu_opt_get(opts, "driver");
netdev = qemu_opt_get(opts, "netdev");
if (!driver || !netdev) {
return 0;
}
if (strcmp(netdev, name) == 0 &&
!g_str_has_prefix(driver, "virtio-net-")) {
error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
return -1;
}
return 0;
}
int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
NetClientState *peer, Error **errp)
{
const NetdevVhostVDPAOptions *opts;
assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
opts = &netdev->u.vhost_vdpa;
/* verify net frontend */
if (qemu_opts_foreach(qemu_find_opts("device"), net_vhost_check_net,
(char *)name, errp)) {
return -1;
}
return net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name, opts->vhostdev);
}

qapi/misc.json

@ -1356,19 +1356,56 @@
}
}
##
# @VirtioMEMDeviceInfo:
#
# VirtioMEMDevice state information
#
# @id: device's ID
#
# @memaddr: physical address in memory, where device is mapped
#
# @requested-size: the user requested size of the device
#
# @size: the (current) size of memory that the device provides
#
# @max-size: the maximum size of memory that the device can provide
#
# @block-size: the block size of memory that the device provides
#
# @node: NUMA node number where device is assigned to
#
# @memdev: memory backend linked with the region
#
# Since: 5.1
##
{ 'struct': 'VirtioMEMDeviceInfo',
'data': { '*id': 'str',
'memaddr': 'size',
'requested-size': 'size',
'size': 'size',
'max-size': 'size',
'block-size': 'size',
'node': 'int',
'memdev': 'str'
}
}
##
# @MemoryDeviceInfo:
#
# Union containing information about a memory device
#
# nvdimm is included since 2.12. virtio-pmem is included since 4.1.
# virtio-mem is included since 5.1.
#
# Since: 2.1
##
{ 'union': 'MemoryDeviceInfo',
'data': { 'dimm': 'PCDIMMDeviceInfo',
'nvdimm': 'PCDIMMDeviceInfo',
'virtio-pmem': 'VirtioPMEMDeviceInfo'
'virtio-pmem': 'VirtioPMEMDeviceInfo',
'virtio-mem': 'VirtioMEMDeviceInfo'
}
}
@ -1397,6 +1434,31 @@
##
{ 'command': 'query-memory-devices', 'returns': ['MemoryDeviceInfo'] }
##
# @MEMORY_DEVICE_SIZE_CHANGE:
#
# Emitted when the size of a memory device changes. Only emitted for memory
# devices that can actually change the size (e.g., virtio-mem due to guest
# action).
#
# @id: device's ID
# @size: the new size of memory that the device provides
#
# Note: this event is rate-limited.
#
# Since: 5.1
#
# Example:
#
# <- { "event": "MEMORY_DEVICE_SIZE_CHANGE",
# "data": { "id": "vm0", "size": 1073741824},
# "timestamp": { "seconds": 1588168529, "microseconds": 201316 } }
#
##
{ 'event': 'MEMORY_DEVICE_SIZE_CHANGE',
'data': { '*id': 'str', 'size': 'size' } }
##
# @MEM_UNPLUG_ERROR:
#

qapi/net.json

@ -428,16 +428,39 @@
'*vhostforce': 'bool',
'*queues': 'int' } }
##
# @NetdevVhostVDPAOptions:
#
# Vhost-vdpa network backend
#
# A vDPA device is a device that uses a datapath which complies with the
# virtio specification, combined with a vendor-specific control path.
#
# @vhostdev: path of vhost-vdpa device
# (default:'/dev/vhost-vdpa-0')
#
# @queues: number of queues to be created for multiqueue vhost-vdpa
# (default: 1)
#
# Since: 5.1
##
{ 'struct': 'NetdevVhostVDPAOptions',
'data': {
'*vhostdev': 'str',
'*queues': 'int' } }
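For illustration (the device path shown is the documented default; whether it exists depends on the host), the same backend could be added at runtime via QMP's netdev_add:

{ "execute": "netdev_add",
  "arguments": { "type": "vhost-vdpa", "id": "net0",
                 "vhostdev": "/dev/vhost-vdpa-0" } }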
##
# @NetClientDriver:
#
# Available netdev drivers.
#
# Since: 2.7
#
# @vhost-vdpa since 5.1
##
{ 'enum': 'NetClientDriver',
'data': [ 'none', 'nic', 'user', 'tap', 'l2tpv3', 'socket', 'vde',
'bridge', 'hubport', 'netmap', 'vhost-user' ] }
'bridge', 'hubport', 'netmap', 'vhost-user', 'vhost-vdpa' ] }
##
# @Netdev:
@ -465,7 +488,8 @@
'bridge': 'NetdevBridgeOptions',
'hubport': 'NetdevHubPortOptions',
'netmap': 'NetdevNetmapOptions',
'vhost-user': 'NetdevVhostUserOptions' } }
'vhost-user': 'NetdevVhostUserOptions',
'vhost-vdpa': 'NetdevVhostVDPAOptions' } }
##
# @NetFilterDirection:

qemu-options.hx

@ -2418,6 +2418,10 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
#ifdef CONFIG_POSIX
"-netdev vhost-user,id=str,chardev=dev[,vhostforce=on|off]\n"
" configure a vhost-user network, backed by a chardev 'dev'\n"
#endif
#ifdef __linux__
"-netdev vhost-vdpa,id=str,vhostdev=/path/to/dev\n"
" configure a vhost-vdpa network,Establish a vhost-vdpa netdev\n"
#endif
"-netdev hubport,id=str,hubid=n[,netdev=nd]\n"
" configure a hub port on the hub with ID 'n'\n", QEMU_ARCH_ALL)
@ -2897,6 +2901,14 @@ SRST
-netdev type=vhost-user,id=net0,chardev=chr0 \
-device virtio-net-pci,netdev=net0
``-netdev vhost-vdpa,vhostdev=/path/to/dev``
Establish a vhost-vdpa netdev.
A vDPA device is a device that uses a datapath which complies with
the virtio specification, combined with a vendor-specific control
path. vDPA devices can either be physically located on hardware or
emulated in software.
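A typical invocation, assuming a host vDPA driver has created the default device node /dev/vhost-vdpa-0, mirrors the vhost-user example above:

-netdev type=vhost-vdpa,id=net0,vhostdev=/dev/vhost-vdpa-0 \
-device virtio-net-pci,netdev=net0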
``-netdev hubport,id=id,hubid=hubid[,netdev=nd]``
Create a hub port on the emulated hub with ID hubid.

target/i386/sev.c

@ -680,6 +680,12 @@ sev_guest_init(const char *id)
uint32_t host_cbitpos;
struct sev_user_data_status status = {};
ret = ram_block_discard_disable(true);
if (ret) {
error_report("%s: cannot disable RAM discard", __func__);
return NULL;
}
sev = lookup_sev_guest_info(id);
if (!sev) {
error_report("%s: '%s' is not a valid '%s' object",
@ -751,6 +757,7 @@ sev_guest_init(const char *id)
return sev;
err:
sev_guest = NULL;
ram_block_discard_disable(false);
return NULL;
}

tests/data/acpi/disassemle-aml.sh (new file)

@ -0,0 +1,52 @@
#!/usr/bin/bash
outdir=
while getopts "o:" arg; do
case ${arg} in
o )
outdir=$OPTARG
;;
\? )
echo "Usage: ./tests/data/acpi/disassemle-aml.sh [-o <output-directory>]"
exit 1
;;
esac
done
for machine in tests/data/acpi/*
do
if [[ ! -d "$machine" ]];
then
continue
fi
if [[ "${outdir}" ]];
then
mkdir -p "${outdir}"/${machine} || exit $?
fi
for aml in $machine/*
do
if [[ "$aml" == $machine/*.dsl ]];
then
continue
fi
if [[ "$aml" == $machine/SSDT*.* ]];
then
dsdt=${aml/SSDT*./DSDT.}
extra="-e ${dsdt}"
elif [[ "$aml" == $machine/SSDT* ]];
then
dsdt=${aml/SSDT*/DSDT};
extra="-e ${dsdt}"
else
extra=""
fi
asl=${aml}.dsl
if [[ "${outdir}" ]];
then
asl="${outdir}"/${machine}/${asl}
fi
iasl -d -p ${asl} ${extra} ${aml}
done
done
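A usage sketch (output directory invented): run the script from the top of the source tree with the iasl disassembler from the ACPICA tools installed; -o keeps the generated .dsl files out of the tree:

./tests/data/acpi/disassemle-aml.sh -o /tmp/acpi-dsl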

tests/data/acpi/rebuild-expected-aml.sh

@ -36,6 +36,7 @@ old_allowed_dif=`grep -v -e 'List of comma-separated changed AML files to ignore
echo '/* List of comma-separated changed AML files to ignore */' > ${SRC_PATH}/tests/qtest/bios-tables-test-allowed-diff.h
echo "The files were rebuilt and can be added to git."
echo "You can use ${SRC_PATH}/tests/data/acpi/disassemle-aml.sh to disassemble them to ASL."
if [ -z "$old_allowed_dif" ]; then
echo "Note! Please do not commit expected files with source changes"

tests/qtest/migration-test.c

@ -1211,7 +1211,7 @@ static void test_migrate_auto_converge(void)
* without throttling.
*/
migrate_set_parameter_int(from, "downtime-limit", 1);
migrate_set_parameter_int(from, "max-bandwidth", 1000000); /* ~1Mb/s */
migrate_set_parameter_int(from, "max-bandwidth", 100000000); /* ~100Mb/s */
/* To check remaining size after precopy */
migrate_set_capability(from, "pause-before-switchover", true);