"Host Memory Backends" and "Memory devices" queue ("mem"):
 - Support memory devices with multiple memslots
 - Support memory devices that dynamically consume memslots
 - Support memory devices that can automatically decide on the number of
   memslots to use
 - virtio-mem support for exposing memory dynamically via multiple
   memslots
 - Some required cleanups/refactorings
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmUn+XMRHGRhdmlkQHJl
 ZGhhdC5jb20ACgkQTd4Q9wD/g1qDHA//T01suTa+uzrcoJHoMWN11S47WnAmbuTo
 vVakucLBPMJAa9xZeCy3OavXaVGpHkw+t6g3OFknof0LfQ5/j9iE3Q1PxURN7g5j
 SJ2WJXCoceM6T4TMhPvVvgEaYjFmESqZB5FZgedMT0QRyhAxMuF9pCkWhk1O3OAV
 JqQKqLFiGcv60AEuBYGZGzgiOUv8EJ5gKwRF4VOdyHIxqZDw1aZXzlcd4TzFZBQ7
 rwW/3ef+sFmUJdmfrSrqcIlQSRrqZ2w95xATDzLTIEEUT3SWqh/E95EZWIz1M0oQ
 NgWgFiLCR1KOj7bWFhLXT7IfyLh0mEysD+P/hY6QwQ4RewWG7EW5UK+JFswssdcZ
 rEj5XpHZzev/wx7hM4bWsoQ+VIvrH7j3uYGyWkcgYRbdDEkWDv2rsT23lwGYNhht
 oBsrdEBELRw6v4C8doq/+sCmHmuxUMqTGwbArCQVnB1XnLxOEkuqlnfq5MORkzNF
 fxbIRx+LRluOllC0HVaDQd8qxRq1+UC5WIpAcDcrouy4HGgi1onWKrXpgjIAbVyH
 M6cENkK7rnRk96gpeXdmrf0h9HqRciAOY8oUsFsvLyKBOCPBWDrLyOQEY5UoSdtD
 m4QpEVgywCy2z1uU/UObeT/UxJy/9EL/Zb+DHoEK06iEhwONoUJjEBYMJD38RMkk
 mwPTB4UAk9g=
 =s69t
 -----END PGP SIGNATURE-----

Merge tag 'mem-2023-10-12' of https://github.com/davidhildenbrand/qemu into staging

Hi,

"Host Memory Backends" and "Memory devices" queue ("mem"):
- Support memory devices with multiple memslots
- Support memory devices that dynamically consume memslots
- Support memory devices that can automatically decide on the number of
  memslots to use
- virtio-mem support for exposing memory dynamically via multiple
  memslots
- Some required cleanups/refactorings

# -----BEGIN PGP SIGNATURE-----
#
# iQJFBAABCAAvFiEEG9nKrXNcTDpGDfzKTd4Q9wD/g1oFAmUn+XMRHGRhdmlkQHJl
# ZGhhdC5jb20ACgkQTd4Q9wD/g1qDHA//T01suTa+uzrcoJHoMWN11S47WnAmbuTo
# vVakucLBPMJAa9xZeCy3OavXaVGpHkw+t6g3OFknof0LfQ5/j9iE3Q1PxURN7g5j
# SJ2WJXCoceM6T4TMhPvVvgEaYjFmESqZB5FZgedMT0QRyhAxMuF9pCkWhk1O3OAV
# JqQKqLFiGcv60AEuBYGZGzgiOUv8EJ5gKwRF4VOdyHIxqZDw1aZXzlcd4TzFZBQ7
# rwW/3ef+sFmUJdmfrSrqcIlQSRrqZ2w95xATDzLTIEEUT3SWqh/E95EZWIz1M0oQ
# NgWgFiLCR1KOj7bWFhLXT7IfyLh0mEysD+P/hY6QwQ4RewWG7EW5UK+JFswssdcZ
# rEj5XpHZzev/wx7hM4bWsoQ+VIvrH7j3uYGyWkcgYRbdDEkWDv2rsT23lwGYNhht
# oBsrdEBELRw6v4C8doq/+sCmHmuxUMqTGwbArCQVnB1XnLxOEkuqlnfq5MORkzNF
# fxbIRx+LRluOllC0HVaDQd8qxRq1+UC5WIpAcDcrouy4HGgi1onWKrXpgjIAbVyH
# M6cENkK7rnRk96gpeXdmrf0h9HqRciAOY8oUsFsvLyKBOCPBWDrLyOQEY5UoSdtD
# m4QpEVgywCy2z1uU/UObeT/UxJy/9EL/Zb+DHoEK06iEhwONoUJjEBYMJD38RMkk
# mwPTB4UAk9g=
# =s69t
# -----END PGP SIGNATURE-----
# gpg: Signature made Thu 12 Oct 2023 09:49:39 EDT
# gpg:                using RSA key 1BD9CAAD735C4C3A460DFCCA4DDE10F700FF835A
# gpg:                issuer "david@redhat.com"
# gpg: Good signature from "David Hildenbrand <david@redhat.com>" [unknown]
# gpg:                 aka "David Hildenbrand <davidhildenbrand@gmail.com>" [full]
# gpg:                 aka "David Hildenbrand <hildenbr@in.tum.de>" [unknown]
# gpg: WARNING: The key's User ID is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: 1BD9 CAAD 735C 4C3A 460D  FCCA 4DDE 10F7 00FF 835A

* tag 'mem-2023-10-12' of https://github.com/davidhildenbrand/qemu:
  virtio-mem: Mark memslot alias memory regions unmergeable
  memory,vhost: Allow for marking memory device memory regions unmergeable
  virtio-mem: Expose device memory dynamically via multiple memslots if enabled
  virtio-mem: Update state to match bitmap as soon as it's been migrated
  virtio-mem: Pass non-const VirtIOMEM via virtio_mem_range_cb
  memory: Clarify mapping requirements for RamDiscardManager
  memory-device,vhost: Support automatic decision on the number of memslots
  vhost: Add vhost_get_max_memslots()
  kvm: Add stub for kvm_get_max_memslots()
  memory-device,vhost: Support memory devices that dynamically consume memslots
  memory-device: Track required and actually used memslots in DeviceMemoryState
  stubs: Rename qmp_memory_device.c to memory_device.c
  memory-device: Support memory devices with multiple memslots
  vhost: Return number of free memslots
  kvm: Return number of free memslots
  softmmu/physmem: Fixup qemu_ram_block_from_host() documentation
  vhost: Remove vhost_backend_can_merge() callback
  vhost: Rework memslot filtering and fix "used_memslot" tracking

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
This commit is contained in:
Stefan Hajnoczi 2023-10-16 12:34:17 -04:00
commit bc2b89b385
23 changed files with 839 additions and 113 deletions

View File

@ -2891,6 +2891,7 @@ F: hw/mem/pc-dimm.c
F: include/hw/mem/memory-device.h
F: include/hw/mem/nvdimm.h
F: include/hw/mem/pc-dimm.h
F: stubs/memory_device.c
F: docs/nvdimm.txt
SPICE

View File

@ -174,13 +174,31 @@ void kvm_resample_fd_notify(int gsi)
}
}
int kvm_get_max_memslots(void)
unsigned int kvm_get_max_memslots(void)
{
KVMState *s = KVM_STATE(current_accel());
return s->nr_slots;
}
/*
 * Return the number of KVM memslots that are still free: the total number of
 * slots minus the highest per-listener usage across all address spaces.
 */
unsigned int kvm_get_free_memslots(void)
{
    KVMState *state = kvm_state;
    unsigned int max_used = 0;
    int as_idx;

    kvm_slots_lock();
    for (as_idx = 0; as_idx < state->nr_as; as_idx++) {
        /* Address space entries without a listener do not use any slots. */
        if (state->as[as_idx].ml) {
            max_used = MAX(max_used, state->as[as_idx].ml->nr_used_slots);
        }
    }
    kvm_slots_unlock();

    return state->nr_slots - max_used;
}
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
@ -196,19 +214,6 @@ static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
return NULL;
}
bool kvm_has_free_slot(MachineState *ms)
{
KVMState *s = KVM_STATE(ms->accelerator);
bool result;
KVMMemoryListener *kml = &s->memory_listener;
kvm_slots_lock();
result = !!kvm_get_free_slot(kml);
kvm_slots_unlock();
return result;
}
/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
@ -1387,6 +1392,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
}
start_addr += slot_size;
size -= slot_size;
kml->nr_used_slots--;
} while (size);
return;
}
@ -1412,6 +1418,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
ram_start_offset += slot_size;
ram += slot_size;
size -= slot_size;
kml->nr_used_slots++;
} while (size);
}

View File

@ -109,9 +109,14 @@ int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
return -ENOSYS;
}
bool kvm_has_free_slot(MachineState *ms)
unsigned int kvm_get_max_memslots(void)
{
return false;
return 0;
}
unsigned int kvm_get_free_memslots(void)
{
return 0;
}
void kvm_init_cpu_signals(CPUState *cpu)

View File

@ -52,19 +52,135 @@ static int memory_device_build_list(Object *obj, void *opaque)
return 0;
}
static void memory_device_check_addable(MachineState *ms, MemoryRegion *mr,
Error **errp)
/*
 * Number of memslots required by a memory device: the value reported by the
 * class's get_memslots() callback, or 1 if the class has no such callback.
 */
static unsigned int memory_device_get_memslots(MemoryDeviceState *md)
{
    const MemoryDeviceClass *klass = MEMORY_DEVICE_GET_CLASS(md);

    if (!klass->get_memslots) {
        return 1;
    }
    return klass->get_memslots(md);
}
/*
 * Memslots that are reserved by memory devices (required but still reported
 * as free from KVM / vhost): required_memslots - used_memslots, or 0 if more
 * memslots are in use than were indicated as required.
 */
static unsigned int get_reserved_memslots(MachineState *ms)
{
    const DeviceMemoryState *dms = ms->device_memory;

    if (dms->used_memslots <= dms->required_memslots) {
        return dms->required_memslots - dms->used_memslots;
    }

    /* This is unexpected, and we warned already in the memory notifier. */
    return 0;
}
/*
 * Reserved memslots of the current machine; 0 if the machine has no
 * device memory region.
 */
unsigned int memory_devices_get_reserved_memslots(void)
{
    return current_machine->device_memory
           ? get_reserved_memslots(current_machine)
           : 0;
}
/*
 * Whether the current machine has device memory and its
 * memslot_auto_decision_active counter is non-zero.
 */
bool memory_devices_memslot_auto_decision_active(void)
{
    return current_machine->device_memory &&
           current_machine->device_memory->memslot_auto_decision_active;
}
/*
 * Compute the upper limit on the number of memslots a memory device with
 * memory region @mr may decide to use.
 *
 * The limit is derived from the vhost (and, if enabled, KVM) memslot maximum
 * and free counts, the memslots already required/reserved by other memory
 * devices, and the share of the remaining device memory space @mr occupies.
 *
 * Returns a value >= 1.
 */
static unsigned int memory_device_memslot_decision_limit(MachineState *ms,
                                                         MemoryRegion *mr)
{
    const unsigned int reserved = get_reserved_memslots(ms);
    const uint64_t size = memory_region_size(mr);
    unsigned int max = vhost_get_max_memslots();
    unsigned int free = vhost_get_free_memslots();
    uint64_t available_space;
    unsigned int memslots;

    if (kvm_enabled()) {
        max = MIN(max, kvm_get_max_memslots());
        free = MIN(free, kvm_get_free_memslots());
    }

    /*
     * If we have fewer overall memslots than what we consider reasonable,
     * just keep it to a minimum.
     */
    if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) {
        return 1;
    }

    /*
     * Consider our soft-limit across all memory devices. We don't really
     * expect to exceed this limit in reasonable configurations.
     */
    if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <=
        ms->device_memory->required_memslots) {
        return 1;
    }
    memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT -
               ms->device_memory->required_memslots;

    /*
     * Consider the actually still free memslots. This is only relevant if
     * other memslot consumers would consume *significantly* more memslots than
     * what we prepared for (> 253). Unlikely, but let's just handle it
     * cleanly.
     *
     * Check before subtracting so the unsigned difference cannot wrap.
     */
    if (free <= reserved) {
        return 1;
    }
    memslots = MIN(memslots, free - reserved);

    /* We cannot have any other memory devices? So give all to this device. */
    if (size == ms->maxram_size - ms->ram_size) {
        return memslots;
    }

    /*
     * Simple heuristic: equally distribute the memslots over the space
     * still available for memory devices.
     */
    available_space = ms->maxram_size - ms->ram_size -
                      ms->device_memory->used_region_size;

    /*
     * If no space is left, or the device needs at least all of it, the
     * proportional share below would divide by zero or exceed the limit
     * computed so far. Cap at that limit instead.
     */
    if (available_space == 0 || size >= available_space) {
        return memslots;
    }

    memslots = (double)memslots * size / available_space;
    return memslots < 1 ? 1 : memslots;
}
static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md,
MemoryRegion *mr, Error **errp)
{
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
const uint64_t used_region_size = ms->device_memory->used_region_size;
const uint64_t size = memory_region_size(mr);
const unsigned int reserved_memslots = get_reserved_memslots(ms);
unsigned int required_memslots, memslot_limit;
/* we will need a new memory slot for kvm and vhost */
if (kvm_enabled() && !kvm_has_free_slot(ms)) {
error_setg(errp, "hypervisor has no free memory slots left");
/*
* Instruct the device to decide how many memslots to use, if applicable,
* before we query the number of required memslots the first time.
*/
if (mdc->decide_memslots) {
memslot_limit = memory_device_memslot_decision_limit(ms, mr);
mdc->decide_memslots(md, memslot_limit);
}
required_memslots = memory_device_get_memslots(md);
/* we will need memory slots for kvm and vhost */
if (kvm_enabled() &&
kvm_get_free_memslots() < required_memslots + reserved_memslots) {
error_setg(errp, "hypervisor has not enough free memory slots left");
return;
}
if (!vhost_has_free_slot()) {
error_setg(errp, "a used vhost backend has no free memory slots left");
if (vhost_get_free_memslots() < required_memslots + reserved_memslots) {
error_setg(errp, "a used vhost backend has not enough free memory slots left");
return;
}
@ -233,7 +349,7 @@ void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
goto out;
}
memory_device_check_addable(ms, mr, &local_err);
memory_device_check_addable(ms, md, mr, &local_err);
if (local_err) {
goto out;
}
@ -264,6 +380,7 @@ out:
void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
{
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
const unsigned int memslots = memory_device_get_memslots(md);
const uint64_t addr = mdc->get_addr(md);
MemoryRegion *mr;
@ -275,6 +392,11 @@ void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
g_assert(ms->device_memory);
ms->device_memory->used_region_size += memory_region_size(mr);
ms->device_memory->required_memslots += memslots;
if (mdc->decide_memslots && memslots > 1) {
ms->device_memory->memslot_auto_decision_active++;
}
memory_region_add_subregion(&ms->device_memory->mr,
addr - ms->device_memory->base, mr);
trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr);
@ -283,6 +405,7 @@ void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
{
const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
const unsigned int memslots = memory_device_get_memslots(md);
MemoryRegion *mr;
/*
@ -293,7 +416,12 @@ void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
g_assert(ms->device_memory);
memory_region_del_subregion(&ms->device_memory->mr, mr);
if (mdc->decide_memslots && memslots > 1) {
ms->device_memory->memslot_auto_decision_active--;
}
ms->device_memory->used_region_size -= memory_region_size(mr);
ms->device_memory->required_memslots -= memslots;
trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "",
mdc->get_addr(md));
}
@ -313,6 +441,50 @@ uint64_t memory_device_get_region_size(const MemoryDeviceState *md,
return memory_region_size(mr);
}
/*
 * Common handler for sections being added to (@add == true) or removed from
 * (@add == false) the device memory address space: adjusts the used_memslots
 * counter of the DeviceMemoryState embedding @listener.
 */
static void memory_devices_region_mod(MemoryListener *listener,
                                      MemoryRegionSection *mrs, bool add)
{
    DeviceMemoryState *state = container_of(listener, DeviceMemoryState,
                                            listener);

    if (!memory_region_is_ram(mrs->mr)) {
        warn_report("Unexpected memory region mapped into device memory region.");
        return;
    }

    /*
     * The expectation is that each distinct RAM memory region section in
     * our region for memory devices consumes exactly one memslot in KVM
     * and in vhost. For vhost, this is true, except:
     * * ROM memory regions don't consume a memslot. These get used very
     *   rarely for memory devices (R/O NVDIMMs).
     * * Memslots without a fd (memory-backend-ram) don't necessarily
     *   consume a memslot. Such setups are quite rare and possibly bogus:
     *   the memory would be inaccessible by such vhost devices.
     *
     * So for vhost, in corner cases we might over-estimate the number of
     * memslots that are currently used or that might still be reserved
     * (required - used).
     */
    if (add) {
        state->used_memslots++;
    } else {
        state->used_memslots--;
    }

    if (state->required_memslots < state->used_memslots) {
        warn_report("Memory devices use more memory slots than indicated as required.");
    }
}
/* MemoryListener region_add callback: account for one more section. */
static void memory_devices_region_add(MemoryListener *listener,
                                      MemoryRegionSection *mrs)
{
    /* No "return <expr>;" here: this function and the callee return void. */
    memory_devices_region_mod(listener, mrs, true);
}
/* MemoryListener region_del callback: account for one section less. */
static void memory_devices_region_del(MemoryListener *listener,
                                      MemoryRegionSection *mrs)
{
    /* No "return <expr>;" here: this function and the callee return void. */
    memory_devices_region_mod(listener, mrs, false);
}
void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
{
g_assert(size);
@ -322,8 +494,16 @@ void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory",
size);
address_space_init(&ms->device_memory->as, &ms->device_memory->mr,
"device-memory");
memory_region_add_subregion(get_system_memory(), ms->device_memory->base,
&ms->device_memory->mr);
/* Track the number of memslots used by memory devices. */
ms->device_memory->listener.region_add = memory_devices_region_add;
ms->device_memory->listener.region_del = memory_devices_region_del;
memory_listener_register(&ms->device_memory->listener,
&ms->device_memory->as);
}
static const TypeInfo memory_device_info = {

View File

@ -2,9 +2,14 @@
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-user.h"
bool vhost_has_free_slot(void)
unsigned int vhost_get_max_memslots(void)
{
return true;
return UINT_MAX;
}
unsigned int vhost_get_free_memslots(void)
{
return UINT_MAX;
}
bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)

View File

@ -2327,19 +2327,6 @@ static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
return -ENOTSUP;
}
static bool vhost_user_can_merge(struct vhost_dev *dev,
uint64_t start1, uint64_t size1,
uint64_t start2, uint64_t size2)
{
ram_addr_t offset;
int mfd, rfd;
(void)vhost_user_get_mr_data(start1, &offset, &mfd);
(void)vhost_user_get_mr_data(start2, &offset, &rfd);
return mfd == rfd;
}
static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu)
{
VhostUserMsg msg;
@ -2622,10 +2609,9 @@ vhost_user_crypto_close_session(struct vhost_dev *dev, uint64_t session_id)
return 0;
}
static bool vhost_user_mem_section_filter(struct vhost_dev *dev,
MemoryRegionSection *section)
static bool vhost_user_no_private_memslots(struct vhost_dev *dev)
{
return memory_region_get_fd(section->mr) >= 0;
return true;
}
static int vhost_user_get_inflight_fd(struct vhost_dev *dev,
@ -2868,6 +2854,7 @@ const VhostOps user_ops = {
.vhost_backend_init = vhost_user_backend_init,
.vhost_backend_cleanup = vhost_user_backend_cleanup,
.vhost_backend_memslots_limit = vhost_user_memslots_limit,
.vhost_backend_no_private_memslots = vhost_user_no_private_memslots,
.vhost_set_log_base = vhost_user_set_log_base,
.vhost_set_mem_table = vhost_user_set_mem_table,
.vhost_set_vring_addr = vhost_user_set_vring_addr,
@ -2886,7 +2873,6 @@ const VhostOps user_ops = {
.vhost_set_vring_enable = vhost_user_set_vring_enable,
.vhost_requires_shm_log = vhost_user_requires_shm_log,
.vhost_migration_done = vhost_user_migration_done,
.vhost_backend_can_merge = vhost_user_can_merge,
.vhost_net_set_mtu = vhost_user_net_set_mtu,
.vhost_set_iotlb_callback = vhost_user_set_iotlb_callback,
.vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg,
@ -2894,7 +2880,6 @@ const VhostOps user_ops = {
.vhost_set_config = vhost_user_set_config,
.vhost_crypto_create_session = vhost_user_crypto_create_session,
.vhost_crypto_close_session = vhost_user_crypto_close_session,
.vhost_backend_mem_section_filter = vhost_user_mem_section_filter,
.vhost_get_inflight_fd = vhost_user_get_inflight_fd,
.vhost_set_inflight_fd = vhost_user_set_inflight_fd,
.vhost_dev_start = vhost_user_dev_start,

View File

@ -1512,7 +1512,6 @@ const VhostOps vdpa_ops = {
.vhost_set_config = vhost_vdpa_set_config,
.vhost_requires_shm_log = NULL,
.vhost_migration_done = NULL,
.vhost_backend_can_merge = NULL,
.vhost_net_set_mtu = NULL,
.vhost_set_iotlb_callback = NULL,
.vhost_send_device_iotlb_msg = NULL,

View File

@ -23,6 +23,7 @@
#include "qemu/log.h"
#include "standard-headers/linux/vhost_types.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/mem/memory-device.h"
#include "migration/blocker.h"
#include "migration/qemu-file-types.h"
#include "sysemu/dma.h"
@ -45,20 +46,44 @@
static struct vhost_log *vhost_log;
static struct vhost_log *vhost_log_shm;
/* Memslots used by backends that support private memslots (without an fd). */
static unsigned int used_memslots;
/* Memslots used by backends that only support shared memslots (with an fd). */
static unsigned int used_shared_memslots;
static QLIST_HEAD(, vhost_dev) vhost_devices =
QLIST_HEAD_INITIALIZER(vhost_devices);
bool vhost_has_free_slot(void)
unsigned int vhost_get_max_memslots(void)
{
unsigned int slots_limit = ~0U;
unsigned int max = UINT_MAX;
struct vhost_dev *hdev;
QLIST_FOREACH(hdev, &vhost_devices, entry) {
max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
}
return max;
}
unsigned int vhost_get_free_memslots(void)
{
unsigned int free = UINT_MAX;
struct vhost_dev *hdev;
QLIST_FOREACH(hdev, &vhost_devices, entry) {
unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
slots_limit = MIN(slots_limit, r);
unsigned int cur_free;
if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
cur_free = r - used_shared_memslots;
} else {
cur_free = r - used_memslots;
}
free = MIN(free, cur_free);
}
return slots_limit > used_memslots;
return free;
}
static void vhost_dev_sync_region(struct vhost_dev *dev,
@ -474,8 +499,7 @@ static int vhost_verify_ring_mappings(struct vhost_dev *dev,
* vhost_section: identify sections needed for vhost access
*
* We only care about RAM sections here (where virtqueue and guest
* internals accessed by virtio might live). If we find one we still
* allow the backend to potentially filter it out of our list.
* internals accessed by virtio might live).
*/
static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
{
@ -502,8 +526,16 @@ static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
return false;
}
if (dev->vhost_ops->vhost_backend_mem_section_filter &&
!dev->vhost_ops->vhost_backend_mem_section_filter(dev, section)) {
/*
* Some backends (like vhost-user) can only handle memory regions
* that have an fd (can be mapped into a different process). Filter
* the ones without an fd out, if requested.
*
* TODO: we might have to limit to MAP_SHARED as well.
*/
if (memory_region_get_fd(section->mr) < 0 &&
dev->vhost_ops->vhost_backend_no_private_memslots &&
dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
trace_vhost_reject_section(mr->name, 2);
return false;
}
@ -568,7 +600,14 @@ static void vhost_commit(MemoryListener *listener)
dev->n_mem_sections * sizeof dev->mem->regions[0];
dev->mem = g_realloc(dev->mem, regions_size);
dev->mem->nregions = dev->n_mem_sections;
used_memslots = dev->mem->nregions;
if (dev->vhost_ops->vhost_backend_no_private_memslots &&
dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
used_shared_memslots = dev->mem->nregions;
} else {
used_memslots = dev->mem->nregions;
}
for (i = 0; i < dev->n_mem_sections; i++) {
struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
struct MemoryRegionSection *mrs = dev->mem_sections + i;
@ -668,7 +707,7 @@ static void vhost_region_add_section(struct vhost_dev *dev,
mrs_size, mrs_host);
}
if (dev->n_tmp_sections) {
if (dev->n_tmp_sections && !section->unmergeable) {
/* Since we already have at least one section, lets see if
* this extends it; since we're scanning in order, we only
* have to look at the last one, and the FlatView that calls
@ -701,11 +740,7 @@ static void vhost_region_add_section(struct vhost_dev *dev,
size_t offset = mrs_gpa - prev_gpa_start;
if (prev_host_start + offset == mrs_host &&
section->mr == prev_sec->mr &&
(!dev->vhost_ops->vhost_backend_can_merge ||
dev->vhost_ops->vhost_backend_can_merge(dev,
mrs_host, mrs_size,
prev_host_start, prev_size))) {
section->mr == prev_sec->mr && !prev_sec->unmergeable) {
uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
need_add = false;
prev_sec->offset_within_address_space =
@ -1400,6 +1435,7 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
VhostBackendType backend_type, uint32_t busyloop_timeout,
Error **errp)
{
unsigned int used, reserved, limit;
uint64_t features;
int i, r, n_initialized_vqs = 0;
@ -1426,6 +1462,19 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
goto fail;
}
limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
memory_devices_memslot_auto_decision_active()) {
error_setg(errp, "some memory device (like virtio-mem)"
" decided how many memory slots to use based on the overall"
" number of memory slots; this vhost backend would further"
" restricts the overall number of memory slots");
error_append_hint(errp, "Try plugging this vhost backend before"
" plugging such memory devices.\n");
r = -EINVAL;
goto fail;
}
for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
if (r < 0) {
@ -1495,9 +1544,27 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
memory_listener_register(&hdev->memory_listener, &address_space_memory);
QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
error_setg(errp, "vhost backend memory slots limit is less"
" than current number of present memory slots");
/*
* The listener we registered properly updated the corresponding counter.
* So we can trust that these values are accurate.
*/
if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
used = used_shared_memslots;
} else {
used = used_memslots;
}
/*
* We assume that all reserved memslots actually require a real memslot
* in our vhost backend. This might not be true, for example, if the
* memslot would be ROM. If ever relevant, we can optimize for that --
* but we'll need additional information about the reservations.
*/
reserved = memory_devices_get_reserved_memslots();
if (used + reserved > limit) {
error_setg(errp, "vhost backend memory slots limit (%d) is less"
" than current number of used (%d) and reserved (%d)"
" memory slots for memory devices.", limit, used, reserved);
r = -EINVAL;
goto fail_busyloop;
}

View File

@ -48,6 +48,25 @@ static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md,
return vmc->get_memory_region(vmem, errp);
}
/*
 * MemoryDeviceClass::decide_memslots implementation: forward the decision,
 * with the given @limit, to the virtio-mem device behind this PCI proxy.
 */
static void virtio_mem_pci_decide_memslots(MemoryDeviceState *md,
                                           unsigned int limit)
{
    VirtIOMEM *vmem = VIRTIO_MEM(&VIRTIO_MEM_PCI(md)->vdev);

    VIRTIO_MEM_GET_CLASS(vmem)->decide_memslots(vmem, limit);
}
/*
 * MemoryDeviceClass::get_memslots implementation: report the number of
 * memslots of the virtio-mem device behind this PCI proxy.
 */
static unsigned int virtio_mem_pci_get_memslots(MemoryDeviceState *md)
{
    VirtIOMEM *vmem = VIRTIO_MEM(&VIRTIO_MEM_PCI(md)->vdev);

    return VIRTIO_MEM_GET_CLASS(vmem)->get_memslots(vmem);
}
static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md,
Error **errp)
{
@ -150,6 +169,8 @@ static void virtio_mem_pci_class_init(ObjectClass *klass, void *data)
mdc->set_addr = virtio_mem_pci_set_addr;
mdc->get_plugged_size = virtio_mem_pci_get_plugged_size;
mdc->get_memory_region = virtio_mem_pci_get_memory_region;
mdc->decide_memslots = virtio_mem_pci_decide_memslots;
mdc->get_memslots = virtio_mem_pci_get_memslots;
mdc->fill_device_info = virtio_mem_pci_fill_device_info;
mdc->get_min_alignment = virtio_mem_pci_get_min_alignment;

View File

@ -66,6 +66,13 @@ static uint32_t virtio_mem_default_thp_size(void)
return default_thp_size;
}
/*
* The minimum memslot size depends on this setting ("sane default"), the
* device block size, and the memory backend page size. The last (or single)
* memslot might be smaller than this constant.
*/
#define VIRTIO_MEM_MIN_MEMSLOT_SIZE (1 * GiB)
/*
* We want to have a reasonable default block size such that
* 1. We avoid splitting THPs when unplugging memory, which degrades
@ -177,10 +184,10 @@ static bool virtio_mem_is_busy(void)
return migration_in_incoming_postcopy() || !migration_is_idle();
}
typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg,
typedef int (*virtio_mem_range_cb)(VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size);
static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
static int virtio_mem_for_each_unplugged_range(VirtIOMEM *vmem, void *arg,
virtio_mem_range_cb cb)
{
unsigned long first_zero_bit, last_zero_bit;
@ -204,7 +211,7 @@ static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg,
return ret;
}
static int virtio_mem_for_each_plugged_range(const VirtIOMEM *vmem, void *arg,
static int virtio_mem_for_each_plugged_range(VirtIOMEM *vmem, void *arg,
virtio_mem_range_cb cb)
{
unsigned long first_bit, last_bit;
@ -483,6 +490,96 @@ static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa,
return true;
}
/*
 * Map memslot @idx into the device memory region at its offset
 * (idx * memslot_size), unless it is already mapped.
 */
static void virtio_mem_activate_memslot(VirtIOMEM *vmem, unsigned int idx)
{
    const uint64_t offset_in_mr = idx * vmem->memslot_size;
    MemoryRegion *memslot;

    assert(vmem->memslots);
    memslot = &vmem->memslots[idx];

    /*
     * Instead of enabling/disabling memslots, we add/remove them. This should
     * make address space updates faster, because we don't have to loop over
     * many disabled subregions.
     */
    if (!memory_region_is_mapped(memslot)) {
        memory_region_add_subregion(vmem->mr, offset_in_mr, memslot);
    }
}
/* Unmap memslot @idx from the device memory region if it is mapped. */
static void virtio_mem_deactivate_memslot(VirtIOMEM *vmem, unsigned int idx)
{
    MemoryRegion *memslot;

    assert(vmem->memslots);
    memslot = &vmem->memslots[idx];

    if (memory_region_is_mapped(memslot)) {
        memory_region_del_subregion(vmem->mr, memslot);
    }
}
/*
 * Activate every memslot that overlaps the range [@offset, @offset + @size)
 * so that the range can be plugged. Does nothing if dynamic memslots are
 * disabled for this device.
 */
static void virtio_mem_activate_memslots_to_plug(VirtIOMEM *vmem,
                                                 uint64_t offset, uint64_t size)
{
    unsigned int start_idx, end_idx, idx;

    if (!vmem->dynamic_memslots) {
        return;
    }

    /*
     * Only compute the indices after the check above: without dynamic
     * memslots, memslot_size may be 0 (NOTE(review): it is only seen being
     * set on the dynamic path), and dividing by it would be a division by
     * zero.
     */
    start_idx = offset / vmem->memslot_size;
    end_idx = (offset + size + vmem->memslot_size - 1) / vmem->memslot_size;

    /* Activate all involved memslots in a single transaction. */
    memory_region_transaction_begin();
    for (idx = start_idx; idx < end_idx; idx++) {
        virtio_mem_activate_memslot(vmem, idx);
    }
    memory_region_transaction_commit();
}
/*
 * Deactivate the memslots overlapping the range [@offset, @offset + @size)
 * that no longer need to stay mapped: memslots fully covered by the range
 * are deactivated directly; memslots only partially covered are deactivated
 * only if the whole memslot is unplugged. Does nothing if dynamic memslots
 * are disabled for this device.
 */
static void virtio_mem_deactivate_unplugged_memslots(VirtIOMEM *vmem,
                                                     uint64_t offset,
                                                     uint64_t size)
{
    const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
    unsigned int start_idx, end_idx, idx;

    if (!vmem->dynamic_memslots) {
        return;
    }

    /*
     * Only compute the indices after the check above: without dynamic
     * memslots, memslot_size may be 0 (NOTE(review): it is only seen being
     * set on the dynamic path), and dividing by it would be a division by
     * zero.
     */
    start_idx = offset / vmem->memslot_size;
    end_idx = (offset + size + vmem->memslot_size - 1) / vmem->memslot_size;

    /* Deactivate all memslots with unplugged blocks in a single transaction. */
    memory_region_transaction_begin();
    for (idx = start_idx; idx < end_idx; idx++) {
        const uint64_t memslot_offset = idx * vmem->memslot_size;
        uint64_t memslot_size = vmem->memslot_size;

        /* The size of the last memslot might be smaller. */
        if (idx == vmem->nb_memslots - 1) {
            memslot_size = region_size - memslot_offset;
        }

        /*
         * Partially covered memslots might still have some blocks plugged and
         * have to remain active if that's the case.
         */
        if (offset > memslot_offset ||
            offset + size < memslot_offset + memslot_size) {
            const uint64_t gpa = vmem->addr + memslot_offset;

            if (!virtio_mem_is_range_unplugged(vmem, gpa, memslot_size)) {
                continue;
            }
        }

        virtio_mem_deactivate_memslot(vmem, idx);
    }
    memory_region_transaction_commit();
}
static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
uint64_t size, bool plug)
{
@ -500,6 +597,8 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
}
virtio_mem_notify_unplug(vmem, offset, size);
virtio_mem_set_range_unplugged(vmem, start_gpa, size);
/* Deactivate completely unplugged memslots after updating the state. */
virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
return 0;
}
@ -527,7 +626,20 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
}
if (!ret) {
/*
* Activate before notifying and rollback in case of any errors.
*
* When activating a yet inactive memslot, memory notifiers will get
* notified about the added memory region and can register with the
* RamDiscardManager; this will traverse all plugged blocks and skip the
* blocks we are plugging here. The following notification will inform
* registered listeners about the blocks we're plugging.
*/
virtio_mem_activate_memslots_to_plug(vmem, offset, size);
ret = virtio_mem_notify_plug(vmem, offset, size);
if (ret) {
virtio_mem_deactivate_unplugged_memslots(vmem, offset, size);
}
}
if (ret) {
/* Could be preallocation or a notifier populated memory. */
@ -620,6 +732,7 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem,
static int virtio_mem_unplug_all(VirtIOMEM *vmem)
{
const uint64_t region_size = memory_region_size(&vmem->memdev->mr);
RAMBlock *rb = vmem->memdev->mr.ram_block;
if (vmem->size) {
@ -634,6 +747,9 @@ static int virtio_mem_unplug_all(VirtIOMEM *vmem)
bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size);
vmem->size = 0;
notifier_list_notify(&vmem->size_change_notifiers, &vmem->size);
/* Deactivate all memslots after updating the state. */
virtio_mem_deactivate_unplugged_memslots(vmem, 0, region_size);
}
trace_virtio_mem_unplugged_all();
@ -790,6 +906,49 @@ static void virtio_mem_system_reset(void *opaque)
virtio_mem_unplug_all(vmem);
}
/*
 * Allocate and initialize the "virtio-mem" container memory region, sized
 * and aligned like the memory backend's region. Must only be called once
 * and only with dynamic memslots enabled.
 */
static void virtio_mem_prepare_mr(VirtIOMEM *vmem)
{
    MemoryRegion *backend_mr = &vmem->memdev->mr;
    const uint64_t backend_size = memory_region_size(backend_mr);

    assert(!vmem->mr && vmem->dynamic_memslots);

    vmem->mr = g_new0(MemoryRegion, 1);
    memory_region_init(vmem->mr, OBJECT(vmem), "virtio-mem", backend_size);
    vmem->mr->align = memory_region_get_alignment(backend_mr);
}
/*
 * Allocate the memslot array and initialize each memslot as an unmergeable
 * alias into the memory backend's region. The memslots are not mapped here.
 */
static void virtio_mem_prepare_memslots(VirtIOMEM *vmem)
{
    const uint64_t backend_size = memory_region_size(&vmem->memdev->mr);
    unsigned int i;

    g_assert(!vmem->memslots && vmem->nb_memslots && vmem->dynamic_memslots);
    vmem->memslots = g_new0(MemoryRegion, vmem->nb_memslots);

    /* Initialize our memslots, but don't map them yet. */
    for (i = 0; i < vmem->nb_memslots; i++) {
        MemoryRegion *memslot = &vmem->memslots[i];
        const uint64_t start = i * vmem->memslot_size;
        /* The size of the last memslot might be smaller. */
        const uint64_t len = (i == vmem->nb_memslots - 1)
                             ? backend_size - start
                             : vmem->memslot_size;
        char name[20];

        snprintf(name, sizeof(name), "memslot-%u", i);
        memory_region_init_alias(memslot, OBJECT(vmem), name,
                                 &vmem->memdev->mr, start, len);
        /*
         * We want to be able to atomically and efficiently activate/deactivate
         * individual memslots without affecting adjacent memslots in memory
         * notifiers.
         */
        memory_region_set_unmergeable(memslot, true);
    }
}
static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
{
MachineState *ms = MACHINE(qdev_get_machine());
@ -861,6 +1020,14 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
vmem->unplugged_inaccessible = ON_OFF_AUTO_ON;
#endif /* VIRTIO_MEM_HAS_LEGACY_GUESTS */
if (vmem->dynamic_memslots &&
vmem->unplugged_inaccessible != ON_OFF_AUTO_ON) {
error_setg(errp, "'%s' property set to 'on' requires '%s' to be 'on'",
VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP,
VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP);
return;
}
/*
* If the block size wasn't configured by the user, use a sane default. This
* allows using hugetlbfs backends of any page size without manual
@ -930,6 +1097,25 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
virtio_init(vdev, VIRTIO_ID_MEM, sizeof(struct virtio_mem_config));
vmem->vq = virtio_add_queue(vdev, 128, virtio_mem_handle_request);
/*
* With "dynamic-memslots=off" (old behavior) we always map the whole
* RAM memory region directly.
*/
if (vmem->dynamic_memslots) {
if (!vmem->mr) {
virtio_mem_prepare_mr(vmem);
}
if (vmem->nb_memslots <= 1) {
vmem->nb_memslots = 1;
vmem->memslot_size = memory_region_size(&vmem->memdev->mr);
}
if (!vmem->memslots) {
virtio_mem_prepare_memslots(vmem);
}
} else {
assert(!vmem->mr && !vmem->nb_memslots && !vmem->memslots);
}
host_memory_backend_set_mapped(vmem->memdev, true);
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
if (vmem->early_migration) {
@ -969,7 +1155,7 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
ram_block_coordinated_discard_require(false);
}
static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg,
static int virtio_mem_discard_range_cb(VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size)
{
RAMBlock *rb = vmem->memdev->mr.ram_block;
@ -984,12 +1170,31 @@ static int virtio_mem_restore_unplugged(VirtIOMEM *vmem)
virtio_mem_discard_range_cb);
}
static int virtio_mem_post_load(void *opaque, int version_id)
/*
* Range callback (passed to virtio_mem_for_each_plugged_range()): forwards
* the given range to virtio_mem_activate_memslots_to_plug().
*
* @arg is unused. Always returns 0.
*/
static int virtio_mem_activate_memslot_range_cb(VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size)
{
virtio_mem_activate_memslots_to_plug(vmem, offset, size);
return 0;
}
static int virtio_mem_post_load_bitmap(VirtIOMEM *vmem)
{
VirtIOMEM *vmem = VIRTIO_MEM(opaque);
RamDiscardListener *rdl;
int ret;
/*
* We restored the bitmap and updated the requested size; activate all
* memslots (so listeners register) before notifying about plugged blocks.
*/
if (vmem->dynamic_memslots) {
/*
* We don't expect any active memslots at this point to deactivate: no
* memory was plugged on the migration destination.
*/
virtio_mem_for_each_plugged_range(vmem, NULL,
virtio_mem_activate_memslot_range_cb);
}
/*
* We started out with all memory discarded and our memory region is mapped
* into an address space. Replay, now that we updated the bitmap.
@ -1001,6 +1206,20 @@ static int virtio_mem_post_load(void *opaque, int version_id)
return ret;
}
}
return 0;
}
static int virtio_mem_post_load(void *opaque, int version_id)
{
VirtIOMEM *vmem = VIRTIO_MEM(opaque);
int ret;
if (!vmem->early_migration) {
ret = virtio_mem_post_load_bitmap(vmem);
if (ret) {
return ret;
}
}
/*
* If shared RAM is migrated using the file content and not using QEMU,
@ -1021,7 +1240,7 @@ static int virtio_mem_post_load(void *opaque, int version_id)
return virtio_mem_restore_unplugged(vmem);
}
static int virtio_mem_prealloc_range_cb(const VirtIOMEM *vmem, void *arg,
static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
uint64_t offset, uint64_t size)
{
void *area = memory_region_get_ram_ptr(&vmem->memdev->mr) + offset;
@ -1043,7 +1262,7 @@ static int virtio_mem_post_load_early(void *opaque, int version_id)
int ret;
if (!vmem->prealloc) {
return 0;
goto post_load_bitmap;
}
/*
@ -1051,7 +1270,7 @@ static int virtio_mem_post_load_early(void *opaque, int version_id)
* don't mess with preallocation and postcopy.
*/
if (migrate_ram_is_ignored(rb)) {
return 0;
goto post_load_bitmap;
}
/*
@ -1084,7 +1303,10 @@ static int virtio_mem_post_load_early(void *opaque, int version_id)
return -EBUSY;
}
}
return 0;
post_load_bitmap:
/* Finally, update any other state to be consistent with the new bitmap. */
return virtio_mem_post_load_bitmap(vmem);
}
typedef struct VirtIOMEMMigSanityChecks {
@ -1235,11 +1457,79 @@ static MemoryRegion *virtio_mem_get_memory_region(VirtIOMEM *vmem, Error **errp)
if (!vmem->memdev) {
error_setg(errp, "'%s' property must be set", VIRTIO_MEM_MEMDEV_PROP);
return NULL;
} else if (vmem->dynamic_memslots) {
if (!vmem->mr) {
virtio_mem_prepare_mr(vmem);
}
return vmem->mr;
}
return &vmem->memdev->mr;
}
/*
 * Decide how many memslots the device will use, given the memslot @limit,
 * and store the result in vmem->nb_memslots. When more than one memslot is
 * used, vmem->memslot_size is set as well. Does nothing with
 * "dynamic-memslots=off".
 *
 * Side effect: if no block size was configured yet, vmem->block_size is set
 * to the default block size for the backend's RAMBlock.
 */
static void virtio_mem_decide_memslots(VirtIOMEM *vmem, unsigned int limit)
{
    uint64_t backend_size, slot_size, slot_size_floor;
    /* Fallback whenever we bail out early: a single memslot. */
    unsigned int nb_slots = 1;
    RAMBlock *rb;

    if (!vmem->dynamic_memslots) {
        return;
    }

    /* The decision is taken exactly once, before the device is realized. */
    assert(!vmem->nb_memslots);

    /* No choice, or realizing the device will fail anyway. */
    if (limit <= 1 || !vmem->memdev || !vmem->memdev->mr.ram_block) {
        goto out;
    }

    rb = vmem->memdev->mr.ram_block;
    backend_size = memory_region_size(&vmem->memdev->mr);

    /*
     * The minimum memslot size must be at least the device block size, so
     * the default block size has to be determined right now.
     */
    if (!vmem->block_size) {
        vmem->block_size = virtio_mem_default_block_size(rb);
    }

    /* Realizing the device will fail; don't bother with multiple memslots. */
    if (vmem->block_size < qemu_ram_pagesize(rb) ||
        !QEMU_IS_ALIGNED(backend_size, vmem->block_size)) {
        goto out;
    }

    /*
     * All memslots except the last one have a reasonable minimum size, and
     * all memslot sizes are aligned to the device block size.
     */
    slot_size_floor = MAX(vmem->block_size, VIRTIO_MEM_MIN_MEMSLOT_SIZE);
    slot_size = QEMU_ALIGN_UP(backend_size / limit, vmem->block_size);
    slot_size = MAX(slot_size, slot_size_floor);

    nb_slots = QEMU_ALIGN_UP(backend_size, slot_size) / slot_size;
    if (nb_slots != 1) {
        vmem->memslot_size = slot_size;
    }

out:
    vmem->nb_memslots = nb_slots;
}
/*
 * Return the number of memslots used by the device: the previously decided
 * vmem->nb_memslots with "dynamic-memslots=on", 1 otherwise.
 */
static unsigned int virtio_mem_get_memslots(VirtIOMEM *vmem)
{
    if (vmem->dynamic_memslots) {
        /* The decision must have been made before we get asked. */
        g_assert(vmem->nb_memslots);
        return vmem->nb_memslots;
    }

    /* The whole RAM memory region is a single static memslot. */
    return 1;
}
static void virtio_mem_add_size_change_notifier(VirtIOMEM *vmem,
Notifier *notifier)
{
@ -1377,6 +1667,21 @@ static void virtio_mem_instance_init(Object *obj)
NULL, NULL);
}
/*
* Instance finalizer: frees the memslot array and the device memory region
* allocated for "dynamic-memslots=on" and resets both pointers to NULL.
* Either pointer may be NULL here; g_free() accepts NULL.
*/
static void virtio_mem_instance_finalize(Object *obj)
{
VirtIOMEM *vmem = VIRTIO_MEM(obj);
/*
* Note: the core already dropped the references on all memory regions
* (it's passed as the owner to memory_region_init_*()) and finalized
* these objects. We can simply free the memory.
*/
g_free(vmem->memslots);
vmem->memslots = NULL;
g_free(vmem->mr);
vmem->mr = NULL;
}
static Property virtio_mem_properties[] = {
DEFINE_PROP_UINT64(VIRTIO_MEM_ADDR_PROP, VirtIOMEM, addr, 0),
DEFINE_PROP_UINT32(VIRTIO_MEM_NODE_PROP, VirtIOMEM, node, 0),
@ -1389,6 +1694,8 @@ static Property virtio_mem_properties[] = {
#endif
DEFINE_PROP_BOOL(VIRTIO_MEM_EARLY_MIGRATION_PROP, VirtIOMEM,
early_migration, true),
DEFINE_PROP_BOOL(VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP, VirtIOMEM,
dynamic_memslots, false),
DEFINE_PROP_END_OF_LIST(),
};
@ -1556,6 +1863,8 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data)
vmc->fill_device_info = virtio_mem_fill_device_info;
vmc->get_memory_region = virtio_mem_get_memory_region;
vmc->decide_memslots = virtio_mem_decide_memslots;
vmc->get_memslots = virtio_mem_get_memslots;
vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier;
vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier;
vmc->unplug_request_check = virtio_mem_unplug_request_check;
@ -1573,6 +1882,7 @@ static const TypeInfo virtio_mem_info = {
.parent = TYPE_VIRTIO_DEVICE,
.instance_size = sizeof(VirtIOMEM),
.instance_init = virtio_mem_instance_init,
.instance_finalize = virtio_mem_instance_finalize,
.class_init = virtio_mem_class_init,
.class_size = sizeof(VirtIOMEMClass),
.interfaces = (InterfaceInfo[]) {

View File

@ -83,6 +83,21 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
ram_addr_t qemu_ram_addr_from_host(void *ptr);
ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr);
RAMBlock *qemu_ram_block_by_name(const char *name);
/*
* Translates a host ptr back to a RAMBlock and an offset in that RAMBlock.
*
* @ptr: The host pointer to translate.
* @round_offset: Whether to round the result offset down to a target page
* @offset: Will be set to the offset within the returned RAMBlock.
*
* Returns: RAMBlock (or NULL if not found)
*
* By the time this function returns, the returned pointer is not protected
* by RCU anymore. If the caller is not within an RCU critical section and
* does not hold the iothread lock, it must have other means of protecting the
* pointer, such as a reference to the memory region that owns the RAMBlock.
*/
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
ram_addr_t *offset);
ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host);

View File

@ -95,6 +95,7 @@ struct ReservedRegion {
* relative to the region's address space
* @readonly: writes to this section are ignored
* @nonvolatile: this section is non-volatile
* @unmergeable: this section should not get merged with adjacent sections
*/
struct MemoryRegionSection {
Int128 size;
@ -104,6 +105,7 @@ struct MemoryRegionSection {
hwaddr offset_within_address_space;
bool readonly;
bool nonvolatile;
bool unmergeable;
};
typedef struct IOMMUTLBEntry IOMMUTLBEntry;
@ -599,8 +601,9 @@ typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque);
* populated (consuming memory), to be used/accessed by the VM.
*
* A #RamDiscardManager can only be set for a RAM #MemoryRegion while the
* #MemoryRegion isn't mapped yet; it cannot change while the #MemoryRegion is
* mapped.
* #MemoryRegion isn't mapped into an address space yet (either directly
* or via an alias); it cannot change while the #MemoryRegion is
* mapped into an address space.
*
* The #RamDiscardManager is intended to be used by technologies that are
* incompatible with discarding of RAM (e.g., VFIO, which may pin all
@ -772,6 +775,7 @@ struct MemoryRegion {
bool nonvolatile;
bool rom_device;
bool flush_coalesced_mmio;
bool unmergeable;
uint8_t dirty_log_mask;
bool is_iommu;
RAMBlock *ram_block;
@ -2350,6 +2354,25 @@ void memory_region_set_size(MemoryRegion *mr, uint64_t size);
void memory_region_set_alias_offset(MemoryRegion *mr,
hwaddr offset);
/*
* memory_region_set_unmergeable: Set a memory region unmergeable
*
* Mark a memory region unmergeable, resulting in the memory region (or
* everything contained in a memory region container) not getting merged when
* simplifying the address space and notifying memory listeners. Consequently,
* memory listeners will never get notified about ranges that are larger than
* the original memory regions.
*
* This is primarily useful when multiple aliases to a RAM memory region are
* mapped into a memory region container, and updates (e.g., enable/disable or
* map/unmap) of individual memory region aliases are not supposed to affect
* other memory regions in the same container.
*
* @mr: the #MemoryRegion to be updated
* @unmergeable: whether to mark the #MemoryRegion unmergeable
*/
void memory_region_set_unmergeable(MemoryRegion *mr, bool unmergeable);
/**
* memory_region_present: checks if an address relative to a @container
* translates into #MemoryRegion within @container

View File

@ -297,15 +297,27 @@ struct MachineClass {
* DeviceMemoryState:
* @base: address in guest physical address space where the memory
* address space for memory devices starts
* @mr: address space container for memory devices
* @mr: memory region container for memory devices
* @as: address space for memory devices
* @listener: memory listener used to track used memslots in the address space
* @dimm_size: the sum of plugged DIMMs' sizes
* @used_region_size: the part of @mr already used by memory devices
* @required_memslots: the number of memslots required by memory devices
* @used_memslots: the number of memslots currently used by memory devices
* @memslot_auto_decision_active: whether any plugged memory device
* automatically decided to use more than
* one memslot
*/
typedef struct DeviceMemoryState {
/* guest physical address where the memory device area starts */
hwaddr base;
/* memory region container for memory devices */
MemoryRegion mr;
/* address space for memory devices */
AddressSpace as;
/* tracks the memslots used in the address space */
MemoryListener listener;
/* sum of the sizes of all plugged DIMMs */
uint64_t dimm_size;
/* part of @mr already used by memory devices */
uint64_t used_region_size;
/* number of memslots required by memory devices */
unsigned int required_memslots;
/* number of memslots currently used by memory devices */
unsigned int used_memslots;
/*
* whether any plugged memory device automatically decided to use more than
* one memslot
*/
unsigned int memslot_auto_decision_active;
} DeviceMemoryState;
/**

View File

@ -14,6 +14,7 @@
#define MEMORY_DEVICE_H
#include "hw/qdev-core.h"
#include "qemu/typedefs.h"
#include "qapi/qapi-types-machine.h"
#include "qom/object.h"
@ -41,6 +42,17 @@ typedef struct MemoryDeviceState MemoryDeviceState;
* successive memory regions are used, a covering memory region has to
* be provided. Scattered memory regions are not supported for single
* devices.
*
* The device memory region returned via @get_memory_region may either be a
* single RAM memory region or a memory region container with subregions
* that are RAM memory regions or aliases to RAM memory regions. Other
* memory regions or subregions are not supported.
*
* If the device memory region returned via @get_memory_region is a
* memory region container, it's supported to dynamically (un)map subregions
* as long as the number of memslots returned by @get_memslots() won't
* be exceeded and as long as all memory regions are of the same kind (e.g.,
* all RAM or all ROM).
*/
struct MemoryDeviceClass {
/* private */
@ -88,6 +100,28 @@ struct MemoryDeviceClass {
*/
MemoryRegion *(*get_memory_region)(MemoryDeviceState *md, Error **errp);
/*
* Optional: Instruct the memory device to decide how many memory slots
* it requires, not exceeding the given limit.
*
* Called exactly once when pre-plugging the memory device, before
* querying the number of memslots using @get_memslots the first time.
*/
void (*decide_memslots)(MemoryDeviceState *md, unsigned int limit);
/*
* Optional for memory devices that require only a single memslot,
* required for all other memory devices: Return the number of memslots
* (distinct RAM memory regions in the device memory region) that are
* required by the device.
*
* If this function is not implemented, the assumption is "1".
*
* Called when (un)plugging the memory device, to check if the requirements
* can be satisfied, and to do proper accounting.
*/
unsigned int (*get_memslots)(MemoryDeviceState *md);
/*
* Optional: Return the desired minimum alignment of the device in guest
* physical address space. The final alignment is computed based on this
@ -105,8 +139,31 @@ struct MemoryDeviceClass {
MemoryDeviceInfo *info);
};
/*
* Traditionally, KVM/vhost in many setups supported 509 memslots, whereby
* 253 memslots were "reserved" for boot memory and other devices (such
* as PCI BARs, which can get mapped dynamically) and 256 memslots were
* dedicated for DIMMs. These magic numbers worked reliably in the past.
*
* Further, using many memslots can negatively affect performance, so setting
* the soft-limit of memslots used by memory devices to the traditional
* DIMM limit of 256 sounds reasonable.
*
* If we have less than 509 memslots, we will instruct memory devices that
* support automatically deciding how many memslots to use to only use a single
* one.
*
* Hotplugging vhost devices with at least 509 memslots is not expected to
* cause problems, not even when devices automatically decided how many memslots
* to use.
*/
#define MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT 256
#define MEMORY_DEVICES_SAFE_MAX_MEMSLOTS 509
MemoryDeviceInfoList *qmp_memory_device_list(void);
uint64_t get_plugged_memory_size(void);
unsigned int memory_devices_get_reserved_memslots(void);
bool memory_devices_memslot_auto_decision_active(void);
void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
const uint64_t *legacy_align, Error **errp);
void memory_device_plug(MemoryDeviceState *md, MachineState *ms);

View File

@ -86,9 +86,6 @@ typedef int (*vhost_set_vring_enable_op)(struct vhost_dev *dev,
typedef bool (*vhost_requires_shm_log_op)(struct vhost_dev *dev);
typedef int (*vhost_migration_done_op)(struct vhost_dev *dev,
char *mac_addr);
typedef bool (*vhost_backend_can_merge_op)(struct vhost_dev *dev,
uint64_t start1, uint64_t size1,
uint64_t start2, uint64_t size2);
typedef int (*vhost_vsock_set_guest_cid_op)(struct vhost_dev *dev,
uint64_t guest_cid);
typedef int (*vhost_vsock_set_running_op)(struct vhost_dev *dev, int start);
@ -108,8 +105,7 @@ typedef int (*vhost_crypto_create_session_op)(struct vhost_dev *dev,
typedef int (*vhost_crypto_close_session_op)(struct vhost_dev *dev,
uint64_t session_id);
typedef bool (*vhost_backend_mem_section_filter_op)(struct vhost_dev *dev,
MemoryRegionSection *section);
typedef bool (*vhost_backend_no_private_memslots_op)(struct vhost_dev *dev);
typedef int (*vhost_get_inflight_fd_op)(struct vhost_dev *dev,
uint16_t queue_size,
@ -138,6 +134,7 @@ typedef struct VhostOps {
vhost_backend_init vhost_backend_init;
vhost_backend_cleanup vhost_backend_cleanup;
vhost_backend_memslots_limit vhost_backend_memslots_limit;
vhost_backend_no_private_memslots_op vhost_backend_no_private_memslots;
vhost_net_set_backend_op vhost_net_set_backend;
vhost_net_set_mtu_op vhost_net_set_mtu;
vhost_scsi_set_endpoint_op vhost_scsi_set_endpoint;
@ -163,7 +160,6 @@ typedef struct VhostOps {
vhost_set_vring_enable_op vhost_set_vring_enable;
vhost_requires_shm_log_op vhost_requires_shm_log;
vhost_migration_done_op vhost_migration_done;
vhost_backend_can_merge_op vhost_backend_can_merge;
vhost_vsock_set_guest_cid_op vhost_vsock_set_guest_cid;
vhost_vsock_set_running_op vhost_vsock_set_running;
vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
@ -172,7 +168,6 @@ typedef struct VhostOps {
vhost_set_config_op vhost_set_config;
vhost_crypto_create_session_op vhost_crypto_create_session;
vhost_crypto_close_session_op vhost_crypto_close_session;
vhost_backend_mem_section_filter_op vhost_backend_mem_section_filter;
vhost_get_inflight_fd_op vhost_get_inflight_fd;
vhost_set_inflight_fd_op vhost_set_inflight_fd;
vhost_dev_start_op vhost_dev_start;

View File

@ -315,7 +315,8 @@ uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
*/
void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
uint64_t features);
bool vhost_has_free_slot(void);
unsigned int vhost_get_max_memslots(void);
unsigned int vhost_get_free_memslots(void);
int vhost_net_set_backend(struct vhost_dev *hdev,
struct vhost_vring_file *file);

View File

@ -33,6 +33,7 @@ OBJECT_DECLARE_TYPE(VirtIOMEM, VirtIOMEMClass,
#define VIRTIO_MEM_UNPLUGGED_INACCESSIBLE_PROP "unplugged-inaccessible"
#define VIRTIO_MEM_EARLY_MIGRATION_PROP "x-early-migration"
#define VIRTIO_MEM_PREALLOC_PROP "prealloc"
#define VIRTIO_MEM_DYNAMIC_MEMSLOTS_PROP "dynamic-memslots"
struct VirtIOMEM {
VirtIODevice parent_obj;
@ -44,7 +45,28 @@ struct VirtIOMEM {
int32_t bitmap_size;
unsigned long *bitmap;
/* assigned memory backend and memory region */
/*
* With "dynamic-memslots=on": Device memory region in which we dynamically
* map the memslots.
*/
MemoryRegion *mr;
/*
* With "dynamic-memslots=on": The individual memslots (aliases into the
* memory backend).
*/
MemoryRegion *memslots;
/* With "dynamic-memslots=on": The total number of memslots. */
uint16_t nb_memslots;
/*
* With "dynamic-memslots=on": Size of one memslot (the size of the
* last one can differ).
*/
uint64_t memslot_size;
/* Assigned memory backend with the RAM memory region. */
HostMemoryBackend *memdev;
/* NUMA node */
@ -82,6 +104,12 @@ struct VirtIOMEM {
*/
bool early_migration;
/*
* Whether we dynamically map (multiple, if possible) memslots instead of
* statically mapping the whole RAM memory region.
*/
bool dynamic_memslots;
/* notifiers to notify when "size" changes */
NotifierList size_change_notifiers;
@ -96,6 +124,8 @@ struct VirtIOMEMClass {
/* public */
void (*fill_device_info)(const VirtIOMEM *vmen, VirtioMEMDeviceInfo *vi);
MemoryRegion *(*get_memory_region)(VirtIOMEM *vmem, Error **errp);
void (*decide_memslots)(VirtIOMEM *vmem, unsigned int limit);
unsigned int (*get_memslots)(VirtIOMEM *vmem);
void (*add_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
void (*remove_size_change_notifier)(VirtIOMEM *vmem, Notifier *notifier);
void (*unplug_request_check)(VirtIOMEM *vmem, Error **errp);

View File

@ -215,7 +215,8 @@ typedef struct KVMRouteChange {
/* external API */
bool kvm_has_free_slot(MachineState *ms);
unsigned int kvm_get_max_memslots(void);
unsigned int kvm_get_free_memslots(void);
bool kvm_has_sync_mmu(void);
int kvm_has_vcpu_events(void);
int kvm_has_robust_singlestep(void);
@ -552,7 +553,6 @@ int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source);
*/
int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target);
struct ppc_radix_page_info *kvm_get_radix_page_info(void);
int kvm_get_max_memslots(void);
/* Notify resamplefd for EOI of specific interrupts. */
void kvm_resample_fd_notify(int gsi);

View File

@ -40,6 +40,7 @@ typedef struct KVMMemoryUpdate {
typedef struct KVMMemoryListener {
MemoryListener listener;
KVMSlot *slots;
unsigned int nr_used_slots;
int as_id;
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;

View File

@ -10,3 +10,13 @@ uint64_t get_plugged_memory_size(void)
{
return (uint64_t)-1;
}
/*
* Always reports 0 reserved memslots.
* NOTE(review): this looks like the stub variant used when the real memory
* device code is not built in -- confirm against the build files.
*/
unsigned int memory_devices_get_reserved_memslots(void)
{
return 0;
}
/*
* Always reports that no memslot auto-decision is active.
* NOTE(review): this looks like the stub variant used when the real memory
* device code is not built in -- confirm against the build files.
*/
bool memory_devices_memslot_auto_decision_active(void)
{
return false;
}

View File

@ -32,7 +32,7 @@ stub_ss.add(files('monitor.c'))
stub_ss.add(files('monitor-core.c'))
stub_ss.add(files('physmem.c'))
stub_ss.add(files('qemu-timer-notify-cb.c'))
stub_ss.add(files('qmp_memory_device.c'))
stub_ss.add(files('memory_device.c'))
stub_ss.add(files('qmp-command-available.c'))
stub_ss.add(files('qmp-quit.c'))
stub_ss.add(files('qtest.c'))

View File

@ -224,6 +224,7 @@ struct FlatRange {
bool romd_mode;
bool readonly;
bool nonvolatile;
bool unmergeable;
};
#define FOR_EACH_FLAT_RANGE(var, view) \
@ -240,6 +241,7 @@ section_from_flat_range(FlatRange *fr, FlatView *fv)
.offset_within_address_space = int128_get64(fr->addr.start),
.readonly = fr->readonly,
.nonvolatile = fr->nonvolatile,
.unmergeable = fr->unmergeable,
};
}
@ -250,7 +252,8 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b)
&& a->offset_in_region == b->offset_in_region
&& a->romd_mode == b->romd_mode
&& a->readonly == b->readonly
&& a->nonvolatile == b->nonvolatile;
&& a->nonvolatile == b->nonvolatile
&& a->unmergeable == b->unmergeable;
}
static FlatView *flatview_new(MemoryRegion *mr_root)
@ -323,7 +326,8 @@ static bool can_merge(FlatRange *r1, FlatRange *r2)
&& r1->dirty_log_mask == r2->dirty_log_mask
&& r1->romd_mode == r2->romd_mode
&& r1->readonly == r2->readonly
&& r1->nonvolatile == r2->nonvolatile;
&& r1->nonvolatile == r2->nonvolatile
&& !r1->unmergeable && !r2->unmergeable;
}
/* Attempt to simplify a view by merging adjacent ranges */
@ -599,7 +603,8 @@ static void render_memory_region(FlatView *view,
Int128 base,
AddrRange clip,
bool readonly,
bool nonvolatile)
bool nonvolatile,
bool unmergeable)
{
MemoryRegion *subregion;
unsigned i;
@ -616,6 +621,7 @@ static void render_memory_region(FlatView *view,
int128_addto(&base, int128_make64(mr->addr));
readonly |= mr->readonly;
nonvolatile |= mr->nonvolatile;
unmergeable |= mr->unmergeable;
tmp = addrrange_make(base, mr->size);
@ -629,14 +635,14 @@ static void render_memory_region(FlatView *view,
int128_subfrom(&base, int128_make64(mr->alias->addr));
int128_subfrom(&base, int128_make64(mr->alias_offset));
render_memory_region(view, mr->alias, base, clip,
readonly, nonvolatile);
readonly, nonvolatile, unmergeable);
return;
}
/* Render subregions in priority order. */
QTAILQ_FOREACH(subregion, &mr->subregions, subregions_link) {
render_memory_region(view, subregion, base, clip,
readonly, nonvolatile);
readonly, nonvolatile, unmergeable);
}
if (!mr->terminates) {
@ -652,6 +658,7 @@ static void render_memory_region(FlatView *view,
fr.romd_mode = mr->romd_mode;
fr.readonly = readonly;
fr.nonvolatile = nonvolatile;
fr.unmergeable = unmergeable;
/* Render the region itself into any gaps left by the current view. */
for (i = 0; i < view->nr && int128_nz(remain); ++i) {
@ -753,7 +760,7 @@ static FlatView *generate_memory_topology(MemoryRegion *mr)
if (mr) {
render_memory_region(view, mr, int128_zero(),
addrrange_make(int128_zero(), int128_2_64()),
false, false);
false, false, false);
}
flatview_simplify(view);
@ -2085,7 +2092,7 @@ int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr)
RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr)
{
if (!memory_region_is_mapped(mr) || !memory_region_is_ram(mr)) {
if (!memory_region_is_ram(mr)) {
return NULL;
}
return mr->rdm;
@ -2094,7 +2101,7 @@ RamDiscardManager *memory_region_get_ram_discard_manager(MemoryRegion *mr)
void memory_region_set_ram_discard_manager(MemoryRegion *mr,
RamDiscardManager *rdm)
{
g_assert(memory_region_is_ram(mr) && !memory_region_is_mapped(mr));
g_assert(memory_region_is_ram(mr));
g_assert(!rdm || !mr->rdm);
mr->rdm = rdm;
}
@ -2755,6 +2762,18 @@ void memory_region_set_alias_offset(MemoryRegion *mr, hwaddr offset)
memory_region_transaction_commit();
}
/*
 * Update the unmergeable flag of @mr. If the flag actually changes, this is
 * done inside a memory region transaction, and an update is marked pending
 * only when @mr is enabled.
 */
void memory_region_set_unmergeable(MemoryRegion *mr, bool unmergeable)
{
    if (mr->unmergeable != unmergeable) {
        memory_region_transaction_begin();
        mr->unmergeable = unmergeable;
        /* A disabled region is not rendered; no update needed for it. */
        memory_region_update_pending |= mr->enabled;
        memory_region_transaction_commit();
    }
}
uint64_t memory_region_get_alignment(const MemoryRegion *mr)
{
return mr->align;

View File

@ -2221,23 +2221,6 @@ ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
return res;
}
/*
* Translates a host ptr back to a RAMBlock, a ram_addr and an offset
* in that RAMBlock.
*
* ptr: Host pointer to look up
* round_offset: If true round the result offset down to a page boundary
* *ram_addr: set to result ram_addr
* *offset: set to result offset within the RAMBlock
*
* Returns: RAMBlock (or NULL if not found)
*
* By the time this function returns, the returned pointer is not protected
* by RCU anymore. If the caller is not within an RCU critical section and
* does not hold the iothread lock, it must have other means of protecting the
* pointer, such as a reference to the region that includes the incoming
* ram_addr_t.
*/
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
ram_addr_t *offset)
{