Migration Pull request
Hi this includes pending bits of migration patches. - virtio-mem support by David Hildenbrand - dirtyrate improvements by Hyman Huang - fix rdma wrid by Li Zhijian - dump-guest-memory fixes by Peter Xu Pleas apply. Thanks, Juan. -----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEGJn/jt6/WMzuA0uC9IfvGFhy1yMFAmGAZEAACgkQ9IfvGFhy 1yPMlxAAx3HRMTCqlluM6B28TKHpGmg7O87g6F0U5fRZNJEro+8p08zYC1Yo2HNm Po7dd++lZxcGPKrq7q1IKPH+wbQ5Yg/3jCeruXP2GRq3AKo9MyUK4WKd2BKRZbnl q2oioUSLKYmsUqyl6YI/8nlgyDvmdGet8+GHxhmG5fVNGabWnGhwJDlCbOh1LAqb cqACvahXuIVj3X7nMbz3e3Xy4YY/hJqJb3+e0DrQwlPDQRLDhadlQ7zv9vJ75BeY Lt0/jnYI223m5LuiTecjv1S9AQjQpqJZq9N2K9miXmd3jtVkm2iqHdXZDK/Sr5oO TE5OCf8xtFEcZ2KNwxQYMW+gkx2Gj6aoxIobu3HJ5kELErmvVhdnM7rkLmSHf8WB Un/O55xUE/Hyg4G/oZOjAwk6eHS7RM+fIBq5wDGn5MNyYpBXid6JhWxSKv0i/gFX 8JA5i8wyzkUD23c8Ez+Ms6nmIL9LJS7xpVx9jqV2fNBdf+15opHg2ufnB5NnQ9y8 JJkzPjW2xKh5EsznY8iDeTztN7Im9Bn+4VcNl53Okugh5QFlTOtcAE21EjPrhv0K XC6PJmDnSZenhJkhgXeDzUe4wZu9wvAjH/R/yTVrW2jT51Azebw3dtreX8F/Dqap n+T+jupShCrrNFw0tCWsuLu+OZJrSwA83tFo+6DfH/idi0CJoJs= =8B3Y -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/juanquintela/tags/migration-20211031-pull-request' into staging Migration Pull request Hi this includes pending bits of migration patches. - virtio-mem support by David Hildenbrand - dirtyrate improvements by Hyman Huang - fix rdma wrid by Li Zhijian - dump-guest-memory fixes by Peter Xu Pleas apply. Thanks, Juan. 
# gpg: Signature made Mon 01 Nov 2021 06:03:44 PM EDT # gpg: using RSA key 1899FF8EDEBF58CCEE034B82F487EF185872D723 # gpg: Good signature from "Juan Quintela <quintela@redhat.com>" [full] # gpg: aka "Juan Quintela <quintela@trasno.org>" [full] * remotes/juanquintela/tags/migration-20211031-pull-request: migration/dirtyrate: implement dirty-bitmap dirtyrate calculation memory: introduce total_dirty_pages to stat dirty pages migration/ram: Handle RAMBlocks with a RamDiscardManager on background snapshots migration/ram: Factor out populating pages readable in ram_block_populate_pages() migration: Simplify alignment and alignment checks migration/postcopy: Handle RAMBlocks with a RamDiscardManager on the destination virtio-mem: Drop precopy notifier migration/ram: Handle RAMBlocks with a RamDiscardManager on the migration source virtio-mem: Implement replay_discarded RamDiscardManager callback memory: Introduce replay_discarded callback for RamDiscardManager dump-guest-memory: Block live migration migration: Add migrate_add_blocker_internal() migration: Make migration blocker work for snapshots too migration/dirtyrate: implement dirty-ring dirtyrate calculation migration/dirtyrate: move init step of calculation to main thread migration/dirtyrate: adjust order of registering thread migration/dirtyrate: introduce struct and adjust DirtyRateStat memory: make global_dirty_tracking a bitmask KVM: introduce dirty_pages and kvm_dirty_ring_enabled migration/rdma: Fix out of order wrid Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
commit
91e8394415
@ -469,6 +469,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
|
||||
cpu->kvm_fd = ret;
|
||||
cpu->kvm_state = s;
|
||||
cpu->vcpu_dirty = true;
|
||||
cpu->dirty_pages = 0;
|
||||
|
||||
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
|
||||
if (mmap_size < 0) {
|
||||
@ -743,6 +744,7 @@ static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
|
||||
count++;
|
||||
}
|
||||
cpu->kvm_fetch_index = fetch;
|
||||
cpu->dirty_pages += count;
|
||||
|
||||
return count;
|
||||
}
|
||||
@ -2296,6 +2298,11 @@ bool kvm_vcpu_id_is_valid(int vcpu_id)
|
||||
return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
|
||||
}
|
||||
|
||||
bool kvm_dirty_ring_enabled(void)
|
||||
{
|
||||
return kvm_state->kvm_dirty_ring_size ? true : false;
|
||||
}
|
||||
|
||||
static int kvm_init(MachineState *ms)
|
||||
{
|
||||
MachineClass *mc = MACHINE_GET_CLASS(ms);
|
||||
|
@ -147,4 +147,9 @@ bool kvm_arm_supports_user_irq(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Stub variant: always reports the dirty ring as disabled. */
bool kvm_dirty_ring_enabled(void)
{
    return false;
}
|
||||
#endif
|
||||
|
19
dump/dump.c
19
dump/dump.c
@ -29,6 +29,7 @@
|
||||
#include "qemu/error-report.h"
|
||||
#include "qemu/main-loop.h"
|
||||
#include "hw/misc/vmcoreinfo.h"
|
||||
#include "migration/blocker.h"
|
||||
|
||||
#ifdef TARGET_X86_64
|
||||
#include "win_dump.h"
|
||||
@ -47,6 +48,8 @@
|
||||
|
||||
#define MAX_GUEST_NOTE_SIZE (1 << 20) /* 1MB should be enough */
|
||||
|
||||
static Error *dump_migration_blocker;
|
||||
|
||||
#define ELF_NOTE_SIZE(hdr_size, name_size, desc_size) \
|
||||
((DIV_ROUND_UP((hdr_size), 4) + \
|
||||
DIV_ROUND_UP((name_size), 4) + \
|
||||
@ -101,6 +104,7 @@ static int dump_cleanup(DumpState *s)
|
||||
qemu_mutex_unlock_iothread();
|
||||
}
|
||||
}
|
||||
migrate_del_blocker(dump_migration_blocker);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -2005,6 +2009,21 @@ void qmp_dump_guest_memory(bool paging, const char *file,
|
||||
return;
|
||||
}
|
||||
|
||||
if (!dump_migration_blocker) {
|
||||
error_setg(&dump_migration_blocker,
|
||||
"Live migration disabled: dump-guest-memory in progress");
|
||||
}
|
||||
|
||||
/*
|
||||
* Allowed even with -only-migratable, but forbid migration during the
|
||||
* process of dumping guest memory.
|
||||
*/
|
||||
if (migrate_add_blocker_internal(dump_migration_blocker, errp)) {
|
||||
/* Remember to release the fd before passing it over to dump state */
|
||||
close(fd);
|
||||
return;
|
||||
}
|
||||
|
||||
s = &dump_state_global;
|
||||
dump_state_prepare(s);
|
||||
|
||||
|
@ -1737,8 +1737,10 @@ ERST
|
||||
|
||||
{
|
||||
.name = "calc_dirty_rate",
|
||||
.args_type = "second:l,sample_pages_per_GB:l?",
|
||||
.params = "second [sample_pages_per_GB]",
|
||||
.help = "start a round of guest dirty rate measurement",
|
||||
.args_type = "dirty_ring:-r,dirty_bitmap:-b,second:l,sample_pages_per_GB:l?",
|
||||
.params = "[-r] [-b] second [sample_pages_per_GB]",
|
||||
.help = "start a round of guest dirty rate measurement (using -r to"
|
||||
"\n\t\t\t specify dirty ring as the method of calculation and"
|
||||
"\n\t\t\t -b to specify dirty bitmap as method of calculation)",
|
||||
.cmd = hmp_calc_dirty_rate,
|
||||
},
|
||||
|
@ -1613,8 +1613,8 @@ void xen_hvm_modified_memory(ram_addr_t start, ram_addr_t length)
|
||||
void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
|
||||
{
|
||||
if (enable) {
|
||||
memory_global_dirty_log_start();
|
||||
memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
|
||||
} else {
|
||||
memory_global_dirty_log_stop();
|
||||
memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
|
||||
}
|
||||
}
|
||||
|
@ -228,6 +228,38 @@ static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int virtio_mem_for_each_unplugged_section(const VirtIOMEM *vmem,
|
||||
MemoryRegionSection *s,
|
||||
void *arg,
|
||||
virtio_mem_section_cb cb)
|
||||
{
|
||||
unsigned long first_bit, last_bit;
|
||||
uint64_t offset, size;
|
||||
int ret = 0;
|
||||
|
||||
first_bit = s->offset_within_region / vmem->bitmap_size;
|
||||
first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, first_bit);
|
||||
while (first_bit < vmem->bitmap_size) {
|
||||
MemoryRegionSection tmp = *s;
|
||||
|
||||
offset = first_bit * vmem->block_size;
|
||||
last_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size,
|
||||
first_bit + 1) - 1;
|
||||
size = (last_bit - first_bit + 1) * vmem->block_size;
|
||||
|
||||
if (!virito_mem_intersect_memory_section(&tmp, offset, size)) {
|
||||
break;
|
||||
}
|
||||
ret = cb(&tmp, arg);
|
||||
if (ret) {
|
||||
break;
|
||||
}
|
||||
first_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size,
|
||||
last_bit + 2);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg)
|
||||
{
|
||||
RamDiscardListener *rdl = arg;
|
||||
@ -744,7 +776,6 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp)
|
||||
host_memory_backend_set_mapped(vmem->memdev, true);
|
||||
vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem));
|
||||
qemu_register_reset(virtio_mem_system_reset, vmem);
|
||||
precopy_add_notifier(&vmem->precopy_notifier);
|
||||
|
||||
/*
|
||||
* Set ourselves as RamDiscardManager before the plug handler maps the
|
||||
@ -764,7 +795,6 @@ static void virtio_mem_device_unrealize(DeviceState *dev)
|
||||
* found via an address space anymore. Unset ourselves.
|
||||
*/
|
||||
memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL);
|
||||
precopy_remove_notifier(&vmem->precopy_notifier);
|
||||
qemu_unregister_reset(virtio_mem_system_reset, vmem);
|
||||
vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem));
|
||||
host_memory_backend_set_mapped(vmem->memdev, false);
|
||||
@ -1057,43 +1087,11 @@ static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name,
|
||||
vmem->block_size = value;
|
||||
}
|
||||
|
||||
static int virtio_mem_precopy_exclude_range_cb(const VirtIOMEM *vmem, void *arg,
|
||||
uint64_t offset, uint64_t size)
|
||||
{
|
||||
void * const host = qemu_ram_get_host_addr(vmem->memdev->mr.ram_block);
|
||||
|
||||
qemu_guest_free_page_hint(host + offset, size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem)
|
||||
{
|
||||
virtio_mem_for_each_unplugged_range(vmem, NULL,
|
||||
virtio_mem_precopy_exclude_range_cb);
|
||||
}
|
||||
|
||||
static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data)
|
||||
{
|
||||
VirtIOMEM *vmem = container_of(n, VirtIOMEM, precopy_notifier);
|
||||
PrecopyNotifyData *pnd = data;
|
||||
|
||||
switch (pnd->reason) {
|
||||
case PRECOPY_NOTIFY_AFTER_BITMAP_SYNC:
|
||||
virtio_mem_precopy_exclude_unplugged(vmem);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void virtio_mem_instance_init(Object *obj)
|
||||
{
|
||||
VirtIOMEM *vmem = VIRTIO_MEM(obj);
|
||||
|
||||
notifier_list_init(&vmem->size_change_notifiers);
|
||||
vmem->precopy_notifier.notify = virtio_mem_precopy_notify;
|
||||
QLIST_INIT(&vmem->rdl_list);
|
||||
|
||||
object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size,
|
||||
@ -1170,6 +1168,31 @@ static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm,
|
||||
virtio_mem_rdm_replay_populated_cb);
|
||||
}
|
||||
|
||||
/*
 * Section callback adapter: unpack the VirtIOMEMReplayData passed in @arg
 * and forward section @s to the stored ReplayRamDiscard function. The
 * discard callback returns nothing, so this always returns 0.
 */
static int virtio_mem_rdm_replay_discarded_cb(MemoryRegionSection *s,
                                              void *arg)
{
    struct VirtIOMEMReplayData *replay = arg;
    ReplayRamDiscard discard_fn = (ReplayRamDiscard)replay->fn;

    discard_fn(s, replay->opaque);
    return 0;
}
|
||||
|
||||
static void virtio_mem_rdm_replay_discarded(const RamDiscardManager *rdm,
|
||||
MemoryRegionSection *s,
|
||||
ReplayRamDiscard replay_fn,
|
||||
void *opaque)
|
||||
{
|
||||
const VirtIOMEM *vmem = VIRTIO_MEM(rdm);
|
||||
struct VirtIOMEMReplayData data = {
|
||||
.fn = replay_fn,
|
||||
.opaque = opaque,
|
||||
};
|
||||
|
||||
g_assert(s->mr == &vmem->memdev->mr);
|
||||
virtio_mem_for_each_unplugged_section(vmem, s, &data,
|
||||
virtio_mem_rdm_replay_discarded_cb);
|
||||
}
|
||||
|
||||
static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm,
|
||||
RamDiscardListener *rdl,
|
||||
MemoryRegionSection *s)
|
||||
@ -1234,6 +1257,7 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data)
|
||||
rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity;
|
||||
rdmc->is_populated = virtio_mem_rdm_is_populated;
|
||||
rdmc->replay_populated = virtio_mem_rdm_replay_populated;
|
||||
rdmc->replay_discarded = virtio_mem_rdm_replay_discarded;
|
||||
rdmc->register_listener = virtio_mem_rdm_register_listener;
|
||||
rdmc->unregister_listener = virtio_mem_rdm_unregister_listener;
|
||||
}
|
||||
|
@ -61,7 +61,17 @@ static inline void fuzz_dma_read_cb(size_t addr,
|
||||
}
|
||||
#endif
|
||||
|
||||
extern bool global_dirty_log;
|
||||
/* Possible bits for global_dirty_log_{start|stop} */
|
||||
|
||||
/* Dirty tracking enabled because migration is running */
|
||||
#define GLOBAL_DIRTY_MIGRATION (1U << 0)
|
||||
|
||||
/* Dirty tracking enabled because measuring dirty rate */
|
||||
#define GLOBAL_DIRTY_DIRTY_RATE (1U << 1)
|
||||
|
||||
#define GLOBAL_DIRTY_MASK (0x3)
|
||||
|
||||
extern unsigned int global_dirty_tracking;
|
||||
|
||||
typedef struct MemoryRegionOps MemoryRegionOps;
|
||||
|
||||
@ -540,6 +550,7 @@ static inline void ram_discard_listener_init(RamDiscardListener *rdl,
|
||||
}
|
||||
|
||||
typedef int (*ReplayRamPopulate)(MemoryRegionSection *section, void *opaque);
|
||||
typedef void (*ReplayRamDiscard)(MemoryRegionSection *section, void *opaque);
|
||||
|
||||
/*
|
||||
* RamDiscardManagerClass:
|
||||
@ -628,6 +639,21 @@ struct RamDiscardManagerClass {
|
||||
MemoryRegionSection *section,
|
||||
ReplayRamPopulate replay_fn, void *opaque);
|
||||
|
||||
/**
|
||||
* @replay_discarded:
|
||||
*
|
||||
* Call the #ReplayRamDiscard callback for all discarded parts within the
|
||||
* #MemoryRegionSection via the #RamDiscardManager.
|
||||
*
|
||||
* @rdm: the #RamDiscardManager
|
||||
* @section: the #MemoryRegionSection
|
||||
* @replay_fn: the #ReplayRamDiscard callback
|
||||
* @opaque: pointer to forward to the callback
|
||||
*/
|
||||
void (*replay_discarded)(const RamDiscardManager *rdm,
|
||||
MemoryRegionSection *section,
|
||||
ReplayRamDiscard replay_fn, void *opaque);
|
||||
|
||||
/**
|
||||
* @register_listener:
|
||||
*
|
||||
@ -672,6 +698,11 @@ int ram_discard_manager_replay_populated(const RamDiscardManager *rdm,
|
||||
ReplayRamPopulate replay_fn,
|
||||
void *opaque);
|
||||
|
||||
void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm,
|
||||
MemoryRegionSection *section,
|
||||
ReplayRamDiscard replay_fn,
|
||||
void *opaque);
|
||||
|
||||
void ram_discard_manager_register_listener(RamDiscardManager *rdm,
|
||||
RamDiscardListener *rdl,
|
||||
MemoryRegionSection *section);
|
||||
@ -2388,13 +2419,17 @@ void memory_listener_unregister(MemoryListener *listener);
|
||||
|
||||
/**
|
||||
* memory_global_dirty_log_start: begin dirty logging for all regions
|
||||
*
|
||||
* @flags: purpose of starting dirty log, migration or dirty rate
|
||||
*/
|
||||
void memory_global_dirty_log_start(void);
|
||||
void memory_global_dirty_log_start(unsigned int flags);
|
||||
|
||||
/**
|
||||
* memory_global_dirty_log_stop: end dirty logging for all regions
|
||||
*
|
||||
* @flags: purpose of stopping dirty log, migration or dirty rate
|
||||
*/
|
||||
void memory_global_dirty_log_stop(void);
|
||||
void memory_global_dirty_log_stop(unsigned int flags);
|
||||
|
||||
void mtree_info(bool flatview, bool dispatch_tree, bool owner, bool disabled);
|
||||
|
||||
|
@ -26,6 +26,8 @@
|
||||
#include "exec/ramlist.h"
|
||||
#include "exec/ramblock.h"
|
||||
|
||||
extern uint64_t total_dirty_pages;
|
||||
|
||||
/**
|
||||
* clear_bmap_size: calculate clear bitmap size
|
||||
*
|
||||
@ -369,10 +371,14 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
|
||||
|
||||
qatomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
|
||||
|
||||
if (global_dirty_log) {
|
||||
if (global_dirty_tracking) {
|
||||
qatomic_or(
|
||||
&blocks[DIRTY_MEMORY_MIGRATION][idx][offset],
|
||||
temp);
|
||||
if (unlikely(
|
||||
global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
|
||||
total_dirty_pages += ctpopl(temp);
|
||||
}
|
||||
}
|
||||
|
||||
if (tcg_enabled()) {
|
||||
@ -392,7 +398,7 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
|
||||
} else {
|
||||
uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL : DIRTY_CLIENTS_NOCODE;
|
||||
|
||||
if (!global_dirty_log) {
|
||||
if (!global_dirty_tracking) {
|
||||
clients &= ~(1 << DIRTY_MEMORY_MIGRATION);
|
||||
}
|
||||
|
||||
@ -403,6 +409,9 @@ static inline void cpu_physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
|
||||
for (i = 0; i < len; i++) {
|
||||
if (bitmap[i] != 0) {
|
||||
c = leul_to_cpu(bitmap[i]);
|
||||
if (unlikely(global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
|
||||
total_dirty_pages += ctpopl(c);
|
||||
}
|
||||
do {
|
||||
j = ctzl(c);
|
||||
c &= ~(1ul << j);
|
||||
|
@ -381,6 +381,7 @@ struct CPUState {
|
||||
struct kvm_run *kvm_run;
|
||||
struct kvm_dirty_gfn *kvm_dirty_gfns;
|
||||
uint32_t kvm_fetch_index;
|
||||
uint64_t dirty_pages;
|
||||
|
||||
/* Used for events with 'vcpu' and *without* the 'disabled' properties */
|
||||
DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);
|
||||
|
@ -65,9 +65,6 @@ struct VirtIOMEM {
|
||||
/* notifiers to notify when "size" changes */
|
||||
NotifierList size_change_notifiers;
|
||||
|
||||
/* don't migrate unplugged memory */
|
||||
NotifierWithReturn precopy_notifier;
|
||||
|
||||
/* listeners to notify on plug/unplug activity. */
|
||||
QLIST_HEAD(, RamDiscardListener) rdl_list;
|
||||
};
|
||||
|
@ -25,6 +25,22 @@
|
||||
*/
|
||||
int migrate_add_blocker(Error *reason, Error **errp);
|
||||
|
||||
/**
|
||||
* @migrate_add_blocker_internal - prevent migration from proceeding without
|
||||
* -only-migratable implications
|
||||
*
|
||||
* @reason - an error to be returned whenever migration is attempted
|
||||
*
|
||||
* @errp - [out] The reason (if any) we cannot block migration right now.
|
||||
*
|
||||
* @returns - 0 on success, -EBUSY on failure, with errp set.
|
||||
*
|
||||
* Some of the migration blockers can be temporary (e.g., for a few seconds),
|
||||
* so it shouldn't need to conflict with "-only-migratable". For those cases,
|
||||
* we can call this function rather than @migrate_add_blocker().
|
||||
*/
|
||||
int migrate_add_blocker_internal(Error *reason, Error **errp);
|
||||
|
||||
/**
|
||||
* @migrate_del_blocker - remove a blocking error from migration
|
||||
*
|
||||
|
@ -547,4 +547,5 @@ bool kvm_cpu_check_are_resettable(void);
|
||||
|
||||
bool kvm_arch_cpu_check_are_resettable(void);
|
||||
|
||||
bool kvm_dirty_ring_enabled(void);
|
||||
#endif
|
||||
|
@ -15,7 +15,9 @@
|
||||
#include "qapi/error.h"
|
||||
#include "cpu.h"
|
||||
#include "exec/ramblock.h"
|
||||
#include "exec/ram_addr.h"
|
||||
#include "qemu/rcu_queue.h"
|
||||
#include "qemu/main-loop.h"
|
||||
#include "qapi/qapi-commands-migration.h"
|
||||
#include "ram.h"
|
||||
#include "trace.h"
|
||||
@ -23,9 +25,26 @@
|
||||
#include "monitor/hmp.h"
|
||||
#include "monitor/monitor.h"
|
||||
#include "qapi/qmp/qdict.h"
|
||||
#include "sysemu/kvm.h"
|
||||
#include "sysemu/runstate.h"
|
||||
#include "exec/memory.h"
|
||||
|
||||
/*
|
||||
* total_dirty_pages is protected by BQL and is used
|
||||
* to stat dirty pages during the period of two
|
||||
* memory_global_dirty_log_sync
|
||||
*/
|
||||
uint64_t total_dirty_pages;
|
||||
|
||||
typedef struct DirtyPageRecord {
|
||||
uint64_t start_pages;
|
||||
uint64_t end_pages;
|
||||
} DirtyPageRecord;
|
||||
|
||||
static int CalculatingState = DIRTY_RATE_STATUS_UNSTARTED;
|
||||
static struct DirtyRateStat DirtyStat;
|
||||
static DirtyRateMeasureMode dirtyrate_mode =
|
||||
DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
|
||||
|
||||
static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
|
||||
{
|
||||
@ -70,51 +89,94 @@ static int dirtyrate_set_state(int *state, int old_state, int new_state)
|
||||
|
||||
static struct DirtyRateInfo *query_dirty_rate_info(void)
|
||||
{
|
||||
int i;
|
||||
int64_t dirty_rate = DirtyStat.dirty_rate;
|
||||
struct DirtyRateInfo *info = g_malloc0(sizeof(DirtyRateInfo));
|
||||
|
||||
if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) {
|
||||
info->has_dirty_rate = true;
|
||||
info->dirty_rate = dirty_rate;
|
||||
}
|
||||
DirtyRateVcpuList *head = NULL, **tail = &head;
|
||||
|
||||
info->status = CalculatingState;
|
||||
info->start_time = DirtyStat.start_time;
|
||||
info->calc_time = DirtyStat.calc_time;
|
||||
info->sample_pages = DirtyStat.sample_pages;
|
||||
info->mode = dirtyrate_mode;
|
||||
|
||||
if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) {
|
||||
info->has_dirty_rate = true;
|
||||
info->dirty_rate = dirty_rate;
|
||||
|
||||
if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
|
||||
/*
|
||||
* set sample_pages with 0 to indicate page sampling
|
||||
* isn't enabled
|
||||
**/
|
||||
info->sample_pages = 0;
|
||||
info->has_vcpu_dirty_rate = true;
|
||||
for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
|
||||
DirtyRateVcpu *rate = g_malloc0(sizeof(DirtyRateVcpu));
|
||||
rate->id = DirtyStat.dirty_ring.rates[i].id;
|
||||
rate->dirty_rate = DirtyStat.dirty_ring.rates[i].dirty_rate;
|
||||
QAPI_LIST_APPEND(tail, rate);
|
||||
}
|
||||
info->vcpu_dirty_rate = head;
|
||||
}
|
||||
|
||||
if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP) {
|
||||
info->sample_pages = 0;
|
||||
}
|
||||
}
|
||||
|
||||
trace_query_dirty_rate_info(DirtyRateStatus_str(CalculatingState));
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
static void init_dirtyrate_stat(int64_t start_time, int64_t calc_time,
|
||||
uint64_t sample_pages)
|
||||
static void init_dirtyrate_stat(int64_t start_time,
|
||||
struct DirtyRateConfig config)
|
||||
{
|
||||
DirtyStat.total_dirty_samples = 0;
|
||||
DirtyStat.total_sample_count = 0;
|
||||
DirtyStat.total_block_mem_MB = 0;
|
||||
DirtyStat.dirty_rate = -1;
|
||||
DirtyStat.start_time = start_time;
|
||||
DirtyStat.calc_time = calc_time;
|
||||
DirtyStat.sample_pages = sample_pages;
|
||||
DirtyStat.calc_time = config.sample_period_seconds;
|
||||
DirtyStat.sample_pages = config.sample_pages_per_gigabytes;
|
||||
|
||||
switch (config.mode) {
|
||||
case DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING:
|
||||
DirtyStat.page_sampling.total_dirty_samples = 0;
|
||||
DirtyStat.page_sampling.total_sample_count = 0;
|
||||
DirtyStat.page_sampling.total_block_mem_MB = 0;
|
||||
break;
|
||||
case DIRTY_RATE_MEASURE_MODE_DIRTY_RING:
|
||||
DirtyStat.dirty_ring.nvcpu = -1;
|
||||
DirtyStat.dirty_ring.rates = NULL;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static void cleanup_dirtyrate_stat(struct DirtyRateConfig config)
|
||||
{
|
||||
/* last calc-dirty-rate qmp use dirty ring mode */
|
||||
if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
|
||||
free(DirtyStat.dirty_ring.rates);
|
||||
DirtyStat.dirty_ring.rates = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void update_dirtyrate_stat(struct RamblockDirtyInfo *info)
|
||||
{
|
||||
DirtyStat.total_dirty_samples += info->sample_dirty_count;
|
||||
DirtyStat.total_sample_count += info->sample_pages_count;
|
||||
DirtyStat.page_sampling.total_dirty_samples += info->sample_dirty_count;
|
||||
DirtyStat.page_sampling.total_sample_count += info->sample_pages_count;
|
||||
/* size of total pages in MB */
|
||||
DirtyStat.total_block_mem_MB += (info->ramblock_pages *
|
||||
TARGET_PAGE_SIZE) >> 20;
|
||||
DirtyStat.page_sampling.total_block_mem_MB += (info->ramblock_pages *
|
||||
TARGET_PAGE_SIZE) >> 20;
|
||||
}
|
||||
|
||||
static void update_dirtyrate(uint64_t msec)
|
||||
{
|
||||
uint64_t dirtyrate;
|
||||
uint64_t total_dirty_samples = DirtyStat.total_dirty_samples;
|
||||
uint64_t total_sample_count = DirtyStat.total_sample_count;
|
||||
uint64_t total_block_mem_MB = DirtyStat.total_block_mem_MB;
|
||||
uint64_t total_dirty_samples = DirtyStat.page_sampling.total_dirty_samples;
|
||||
uint64_t total_sample_count = DirtyStat.page_sampling.total_sample_count;
|
||||
uint64_t total_block_mem_MB = DirtyStat.page_sampling.total_block_mem_MB;
|
||||
|
||||
dirtyrate = total_dirty_samples * total_block_mem_MB *
|
||||
1000 / (total_sample_count * msec);
|
||||
@ -327,21 +389,183 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
|
||||
update_dirtyrate_stat(block_dinfo);
|
||||
}
|
||||
|
||||
if (DirtyStat.total_sample_count == 0) {
|
||||
if (DirtyStat.page_sampling.total_sample_count == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void calculate_dirtyrate(struct DirtyRateConfig config)
|
||||
static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
|
||||
CPUState *cpu, bool start)
|
||||
{
|
||||
if (start) {
|
||||
dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
|
||||
} else {
|
||||
dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
|
||||
}
|
||||
}
|
||||
|
||||
static void dirtyrate_global_dirty_log_start(void)
|
||||
{
|
||||
qemu_mutex_lock_iothread();
|
||||
memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
|
||||
qemu_mutex_unlock_iothread();
|
||||
}
|
||||
|
||||
static void dirtyrate_global_dirty_log_stop(void)
|
||||
{
|
||||
qemu_mutex_lock_iothread();
|
||||
memory_global_dirty_log_sync();
|
||||
memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
|
||||
qemu_mutex_unlock_iothread();
|
||||
}
|
||||
|
||||
/*
 * Convert the page delta in @dirty_pages (end_pages - start_pages) into a
 * rate: dirtied size in MB divided by DirtyStat.calc_time.
 */
static int64_t do_calculate_dirtyrate_vcpu(DirtyPageRecord dirty_pages)
{
    const uint64_t new_pages = dirty_pages.end_pages - dirty_pages.start_pages;
    /* ">> 20" turns a byte count into MB. */
    const uint64_t dirtied_MB = (new_pages * TARGET_PAGE_SIZE) >> 20;
    const int64_t period_s = DirtyStat.calc_time;

    return dirtied_MB / period_s;
}
|
||||
|
||||
/*
 * Snapshot the global total_dirty_pages counter into @dirty_pages: the
 * start field when @start is true, the end field otherwise.
 */
static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
                                            bool start)
{
    uint64_t *slot = start ? &dirty_pages->start_pages
                           : &dirty_pages->end_pages;

    *slot = total_dirty_pages;
}
|
||||
|
||||
/*
 * Store the dirty rate derived from @dirty_pages as the overall result in
 * DirtyStat.dirty_rate, reusing the per-vCPU rate computation.
 */
static void do_calculate_dirtyrate_bitmap(DirtyPageRecord dirty_pages)
{
    int64_t rate = do_calculate_dirtyrate_vcpu(dirty_pages);

    DirtyStat.dirty_rate = rate;
}
|
||||
|
||||
static inline void dirtyrate_manual_reset_protect(void)
|
||||
{
|
||||
RAMBlock *block = NULL;
|
||||
|
||||
WITH_RCU_READ_LOCK_GUARD() {
|
||||
RAMBLOCK_FOREACH_MIGRATABLE(block) {
|
||||
memory_region_clear_dirty_bitmap(block->mr, 0,
|
||||
block->used_length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
|
||||
{
|
||||
int64_t msec = 0;
|
||||
int64_t start_time;
|
||||
DirtyPageRecord dirty_pages;
|
||||
|
||||
qemu_mutex_lock_iothread();
|
||||
memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
|
||||
|
||||
/*
|
||||
* 1'round of log sync may return all 1 bits with
|
||||
* KVM_DIRTY_LOG_INITIALLY_SET enable
|
||||
* skip it unconditionally and start dirty tracking
|
||||
* from 2'round of log sync
|
||||
*/
|
||||
memory_global_dirty_log_sync();
|
||||
|
||||
/*
|
||||
* reset page protect manually and unconditionally.
|
||||
* this make sure kvm dirty log be cleared if
|
||||
* KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE cap is enabled.
|
||||
*/
|
||||
dirtyrate_manual_reset_protect();
|
||||
qemu_mutex_unlock_iothread();
|
||||
|
||||
record_dirtypages_bitmap(&dirty_pages, true);
|
||||
|
||||
start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
|
||||
DirtyStat.start_time = start_time / 1000;
|
||||
|
||||
msec = config.sample_period_seconds * 1000;
|
||||
msec = set_sample_page_period(msec, start_time);
|
||||
DirtyStat.calc_time = msec / 1000;
|
||||
|
||||
/*
|
||||
* dirtyrate_global_dirty_log_stop do two things.
|
||||
* 1. fetch dirty bitmap from kvm
|
||||
* 2. stop dirty tracking
|
||||
*/
|
||||
dirtyrate_global_dirty_log_stop();
|
||||
|
||||
record_dirtypages_bitmap(&dirty_pages, false);
|
||||
|
||||
do_calculate_dirtyrate_bitmap(dirty_pages);
|
||||
}
|
||||
|
||||
static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
|
||||
{
|
||||
CPUState *cpu;
|
||||
int64_t msec = 0;
|
||||
int64_t start_time;
|
||||
uint64_t dirtyrate = 0;
|
||||
uint64_t dirtyrate_sum = 0;
|
||||
DirtyPageRecord *dirty_pages;
|
||||
int nvcpu = 0;
|
||||
int i = 0;
|
||||
|
||||
CPU_FOREACH(cpu) {
|
||||
nvcpu++;
|
||||
}
|
||||
|
||||
dirty_pages = malloc(sizeof(*dirty_pages) * nvcpu);
|
||||
|
||||
DirtyStat.dirty_ring.nvcpu = nvcpu;
|
||||
DirtyStat.dirty_ring.rates = malloc(sizeof(DirtyRateVcpu) * nvcpu);
|
||||
|
||||
dirtyrate_global_dirty_log_start();
|
||||
|
||||
CPU_FOREACH(cpu) {
|
||||
record_dirtypages(dirty_pages, cpu, true);
|
||||
}
|
||||
|
||||
start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
|
||||
DirtyStat.start_time = start_time / 1000;
|
||||
|
||||
msec = config.sample_period_seconds * 1000;
|
||||
msec = set_sample_page_period(msec, start_time);
|
||||
DirtyStat.calc_time = msec / 1000;
|
||||
|
||||
dirtyrate_global_dirty_log_stop();
|
||||
|
||||
CPU_FOREACH(cpu) {
|
||||
record_dirtypages(dirty_pages, cpu, false);
|
||||
}
|
||||
|
||||
for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
|
||||
dirtyrate = do_calculate_dirtyrate_vcpu(dirty_pages[i]);
|
||||
trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
|
||||
|
||||
DirtyStat.dirty_ring.rates[i].id = i;
|
||||
DirtyStat.dirty_ring.rates[i].dirty_rate = dirtyrate;
|
||||
dirtyrate_sum += dirtyrate;
|
||||
}
|
||||
|
||||
DirtyStat.dirty_rate = dirtyrate_sum;
|
||||
free(dirty_pages);
|
||||
}
|
||||
|
||||
static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
|
||||
{
|
||||
struct RamblockDirtyInfo *block_dinfo = NULL;
|
||||
int block_count = 0;
|
||||
int64_t msec = 0;
|
||||
int64_t initial_time;
|
||||
|
||||
rcu_register_thread();
|
||||
rcu_read_lock();
|
||||
initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
|
||||
if (!record_ramblock_hash_info(&block_dinfo, config, &block_count)) {
|
||||
@ -364,16 +588,26 @@ static void calculate_dirtyrate(struct DirtyRateConfig config)
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
free_ramblock_dirty_info(block_dinfo, block_count);
|
||||
rcu_unregister_thread();
|
||||
}
|
||||
|
||||
static void calculate_dirtyrate(struct DirtyRateConfig config)
|
||||
{
|
||||
if (config.mode == DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP) {
|
||||
calculate_dirtyrate_dirty_bitmap(config);
|
||||
} else if (config.mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
|
||||
calculate_dirtyrate_dirty_ring(config);
|
||||
} else {
|
||||
calculate_dirtyrate_sample_vm(config);
|
||||
}
|
||||
|
||||
trace_dirtyrate_calculate(DirtyStat.dirty_rate);
|
||||
}
|
||||
|
||||
void *get_dirtyrate_thread(void *arg)
|
||||
{
|
||||
struct DirtyRateConfig config = *(struct DirtyRateConfig *)arg;
|
||||
int ret;
|
||||
int64_t start_time;
|
||||
int64_t calc_time;
|
||||
uint64_t sample_pages;
|
||||
rcu_register_thread();
|
||||
|
||||
ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_UNSTARTED,
|
||||
DIRTY_RATE_STATUS_MEASURING);
|
||||
@ -382,11 +616,6 @@ void *get_dirtyrate_thread(void *arg)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
|
||||
calc_time = config.sample_period_seconds;
|
||||
sample_pages = config.sample_pages_per_gigabytes;
|
||||
init_dirtyrate_stat(start_time, calc_time, sample_pages);
|
||||
|
||||
calculate_dirtyrate(config);
|
||||
|
||||
ret = dirtyrate_set_state(&CalculatingState, DIRTY_RATE_STATUS_MEASURING,
|
||||
@ -394,15 +623,22 @@ void *get_dirtyrate_thread(void *arg)
|
||||
if (ret == -1) {
|
||||
error_report("change dirtyrate state failed.");
|
||||
}
|
||||
|
||||
rcu_unregister_thread();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
|
||||
int64_t sample_pages, Error **errp)
|
||||
void qmp_calc_dirty_rate(int64_t calc_time,
|
||||
bool has_sample_pages,
|
||||
int64_t sample_pages,
|
||||
bool has_mode,
|
||||
DirtyRateMeasureMode mode,
|
||||
Error **errp)
|
||||
{
|
||||
static struct DirtyRateConfig config;
|
||||
QemuThread thread;
|
||||
int ret;
|
||||
int64_t start_time;
|
||||
|
||||
/*
|
||||
* If the dirty rate is already being measured, don't attempt to start.
|
||||
@ -419,6 +655,15 @@ void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
|
||||
return;
|
||||
}
|
||||
|
||||
if (!has_mode) {
|
||||
mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
|
||||
}
|
||||
|
||||
if (has_sample_pages && mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) {
|
||||
error_setg(errp, "either sample-pages or dirty-ring can be specified.");
|
||||
return;
|
||||
}
|
||||
|
||||
if (has_sample_pages) {
|
||||
if (!is_sample_pages_valid(sample_pages)) {
|
||||
error_setg(errp, "sample-pages is out of range[%d, %d].",
|
||||
@ -430,6 +675,19 @@ void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
|
||||
sample_pages = DIRTYRATE_DEFAULT_SAMPLE_PAGES;
|
||||
}
|
||||
|
||||
/*
|
||||
* dirty ring mode only works when kvm dirty ring is enabled.
|
||||
* on the contrary, dirty bitmap mode is not.
|
||||
*/
|
||||
if (((mode == DIRTY_RATE_MEASURE_MODE_DIRTY_RING) &&
|
||||
!kvm_dirty_ring_enabled()) ||
|
||||
((mode == DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP) &&
|
||||
kvm_dirty_ring_enabled())) {
|
||||
error_setg(errp, "mode %s is not enabled, use other method instead.",
|
||||
DirtyRateMeasureMode_str(mode));
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Init calculation state as unstarted.
|
||||
*/
|
||||
@ -442,6 +700,19 @@ void qmp_calc_dirty_rate(int64_t calc_time, bool has_sample_pages,
|
||||
|
||||
config.sample_period_seconds = calc_time;
|
||||
config.sample_pages_per_gigabytes = sample_pages;
|
||||
config.mode = mode;
|
||||
|
||||
cleanup_dirtyrate_stat(config);
|
||||
|
||||
/*
|
||||
* update dirty rate mode so that we can figure out what mode has
|
||||
* been used in last calculation
|
||||
**/
|
||||
dirtyrate_mode = mode;
|
||||
|
||||
start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
|
||||
init_dirtyrate_stat(start_time, config);
|
||||
|
||||
qemu_thread_create(&thread, "get_dirtyrate", get_dirtyrate_thread,
|
||||
(void *)&config, QEMU_THREAD_DETACHED);
|
||||
}
|
||||
@ -463,12 +734,24 @@ void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict)
|
||||
info->sample_pages);
|
||||
monitor_printf(mon, "Period: %"PRIi64" (sec)\n",
|
||||
info->calc_time);
|
||||
monitor_printf(mon, "Mode: %s\n",
|
||||
DirtyRateMeasureMode_str(info->mode));
|
||||
monitor_printf(mon, "Dirty rate: ");
|
||||
if (info->has_dirty_rate) {
|
||||
monitor_printf(mon, "%"PRIi64" (MB/s)\n", info->dirty_rate);
|
||||
if (info->has_vcpu_dirty_rate) {
|
||||
DirtyRateVcpuList *rate, *head = info->vcpu_dirty_rate;
|
||||
for (rate = head; rate != NULL; rate = rate->next) {
|
||||
monitor_printf(mon, "vcpu[%"PRIi64"], Dirty rate: %"PRIi64
|
||||
" (MB/s)\n", rate->value->id,
|
||||
rate->value->dirty_rate);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
monitor_printf(mon, "(not ready)\n");
|
||||
}
|
||||
|
||||
qapi_free_DirtyRateVcpuList(info->vcpu_dirty_rate);
|
||||
g_free(info);
|
||||
}
|
||||
|
||||
@ -477,6 +760,9 @@ void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict)
|
||||
int64_t sec = qdict_get_try_int(qdict, "second", 0);
|
||||
int64_t sample_pages = qdict_get_try_int(qdict, "sample_pages_per_GB", -1);
|
||||
bool has_sample_pages = (sample_pages != -1);
|
||||
bool dirty_ring = qdict_get_try_bool(qdict, "dirty_ring", false);
|
||||
bool dirty_bitmap = qdict_get_try_bool(qdict, "dirty_bitmap", false);
|
||||
DirtyRateMeasureMode mode = DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
|
||||
Error *err = NULL;
|
||||
|
||||
if (!sec) {
|
||||
@ -484,7 +770,20 @@ void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict)
|
||||
return;
|
||||
}
|
||||
|
||||
qmp_calc_dirty_rate(sec, has_sample_pages, sample_pages, &err);
|
||||
if (dirty_ring && dirty_bitmap) {
|
||||
monitor_printf(mon, "Either dirty ring or dirty bitmap "
|
||||
"can be specified!\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (dirty_bitmap) {
|
||||
mode = DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP;
|
||||
} else if (dirty_ring) {
|
||||
mode = DIRTY_RATE_MEASURE_MODE_DIRTY_RING;
|
||||
}
|
||||
|
||||
qmp_calc_dirty_rate(sec, has_sample_pages, sample_pages, true,
|
||||
mode, &err);
|
||||
if (err) {
|
||||
hmp_handle_error(mon, err);
|
||||
return;
|
||||
|
@ -43,6 +43,7 @@
|
||||
struct DirtyRateConfig {
|
||||
uint64_t sample_pages_per_gigabytes; /* sample pages per GB */
|
||||
int64_t sample_period_seconds; /* time duration between two sampling */
|
||||
DirtyRateMeasureMode mode; /* mode of dirtyrate measurement */
|
||||
};
|
||||
|
||||
/*
|
||||
@ -58,17 +59,29 @@ struct RamblockDirtyInfo {
|
||||
uint32_t *hash_result; /* array of hash result for sampled pages */
|
||||
};
|
||||
|
||||
/* Statistics kept for the page-sampling measurement. */
typedef struct SampleVMStat {
    /* total number of sampled pages found dirty */
    uint64_t total_dirty_samples;
    /* total number of sampled pages */
    uint64_t total_sample_count;
    /* size of total sampled pages, in MB */
    uint64_t total_block_mem_MB;
} SampleVMStat;
|
||||
|
||||
typedef struct VcpuStat {
|
||||
int nvcpu; /* number of vcpu */
|
||||
DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
|
||||
} VcpuStat;
|
||||
|
||||
/*
|
||||
* Store calculation statistics for each measure.
|
||||
*/
|
||||
struct DirtyRateStat {
|
||||
uint64_t total_dirty_samples; /* total dirty sampled page */
|
||||
uint64_t total_sample_count; /* total sampled pages */
|
||||
uint64_t total_block_mem_MB; /* size of total sampled pages in MB */
|
||||
int64_t dirty_rate; /* dirty rate in MB/s */
|
||||
int64_t start_time; /* calculation start time in units of second */
|
||||
int64_t calc_time; /* time duration of two sampling in units of second */
|
||||
uint64_t sample_pages; /* sample pages per GB */
|
||||
union {
|
||||
SampleVMStat page_sampling;
|
||||
VcpuStat dirty_ring;
|
||||
};
|
||||
};
|
||||
|
||||
void *get_dirtyrate_thread(void *arg);
|
||||
|
@ -391,7 +391,7 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
|
||||
int migrate_send_rp_req_pages(MigrationIncomingState *mis,
|
||||
RAMBlock *rb, ram_addr_t start, uint64_t haddr)
|
||||
{
|
||||
void *aligned = (void *)(uintptr_t)(haddr & (-qemu_ram_pagesize(rb)));
|
||||
void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
|
||||
bool received = false;
|
||||
|
||||
WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
|
||||
@ -2049,6 +2049,20 @@ void migrate_init(MigrationState *s)
|
||||
s->threshold_size = 0;
|
||||
}
|
||||
|
||||
/*
 * Register @reason as a migration blocker.
 *
 * The blocker is refused while a snapshot is being saved
 * (RUN_STATE_SAVE_VM) or while migration is not idle; snapshots are
 * similar to migrations, which is why both are checked.
 *
 * @reason: the blocker; on success it is prepended to migration_blockers,
 *          on failure only a copy of it is propagated to @errp.
 * @errp:   receives the prefixed copy of @reason on failure.
 *
 * Returns 0 on success, -EBUSY if a migration/snapshot is in progress.
 */
int migrate_add_blocker_internal(Error *reason, Error **errp)
{
    if (!runstate_check(RUN_STATE_SAVE_VM) && migration_is_idle()) {
        migration_blockers = g_slist_prepend(migration_blockers, reason);
        return 0;
    }

    error_propagate_prepend(errp, error_copy(reason),
                            "disallowing migration blocker "
                            "(migration/snapshot in progress) for: ");
    return -EBUSY;
}
|
||||
|
||||
int migrate_add_blocker(Error *reason, Error **errp)
|
||||
{
|
||||
if (only_migratable) {
|
||||
@ -2058,15 +2072,7 @@ int migrate_add_blocker(Error *reason, Error **errp)
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
if (migration_is_idle()) {
|
||||
migration_blockers = g_slist_prepend(migration_blockers, reason);
|
||||
return 0;
|
||||
}
|
||||
|
||||
error_propagate_prepend(errp, error_copy(reason),
|
||||
"disallowing migration blocker "
|
||||
"(migration in progress) for: ");
|
||||
return -EBUSY;
|
||||
return migrate_add_blocker_internal(reason, errp);
|
||||
}
|
||||
|
||||
void migrate_del_blocker(Error *reason)
|
||||
@ -2631,8 +2637,8 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
|
||||
* Since we currently insist on matching page sizes, just sanity check
|
||||
* we're being asked for whole host pages.
|
||||
*/
|
||||
if (start & (our_host_ps - 1) ||
|
||||
(len & (our_host_ps - 1))) {
|
||||
if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
|
||||
!QEMU_IS_ALIGNED(len, our_host_ps)) {
|
||||
error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
|
||||
" len: %zd", __func__, start, len);
|
||||
mark_source_rp_bad(ms);
|
||||
|
@ -402,7 +402,7 @@ bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
|
||||
strerror(errno));
|
||||
goto out;
|
||||
}
|
||||
g_assert(((size_t)testarea & (pagesize - 1)) == 0);
|
||||
g_assert(QEMU_PTR_IS_ALIGNED(testarea, pagesize));
|
||||
|
||||
reg_struct.range.start = (uintptr_t)testarea;
|
||||
reg_struct.range.len = pagesize;
|
||||
@ -660,7 +660,7 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
|
||||
struct uffdio_range range;
|
||||
int ret;
|
||||
trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
|
||||
range.start = client_addr & ~(pagesize - 1);
|
||||
range.start = ROUND_DOWN(client_addr, pagesize);
|
||||
range.len = pagesize;
|
||||
ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
|
||||
if (ret) {
|
||||
@ -671,6 +671,29 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int postcopy_request_page(MigrationIncomingState *mis, RAMBlock *rb,
|
||||
ram_addr_t start, uint64_t haddr)
|
||||
{
|
||||
void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
|
||||
|
||||
/*
|
||||
* Discarded pages (via RamDiscardManager) are never migrated. On unlikely
|
||||
* access, place a zeropage, which will also set the relevant bits in the
|
||||
* recv_bitmap accordingly, so we won't try placing a zeropage twice.
|
||||
*
|
||||
* Checking a single bit is sufficient to handle pagesize > TPS as either
|
||||
* all relevant bits are set or not.
|
||||
*/
|
||||
assert(QEMU_IS_ALIGNED(start, qemu_ram_pagesize(rb)));
|
||||
if (ramblock_page_is_discarded(rb, start)) {
|
||||
bool received = ramblock_recv_bitmap_test_byte_offset(rb, start);
|
||||
|
||||
return received ? 0 : postcopy_place_page_zero(mis, aligned, rb);
|
||||
}
|
||||
|
||||
return migrate_send_rp_req_pages(mis, rb, start, haddr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Callback from shared fault handlers to ask for a page,
|
||||
* the page must be specified by a RAMBlock and an offset in that rb
|
||||
@ -679,8 +702,7 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd,
|
||||
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
|
||||
uint64_t client_addr, uint64_t rb_offset)
|
||||
{
|
||||
size_t pagesize = qemu_ram_pagesize(rb);
|
||||
uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
|
||||
uint64_t aligned_rbo = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
|
||||
MigrationIncomingState *mis = migration_incoming_get_current();
|
||||
|
||||
trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
|
||||
@ -690,7 +712,7 @@ int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
|
||||
qemu_ram_get_idstr(rb), rb_offset);
|
||||
return postcopy_wake_shared(pcfd, client_addr, rb);
|
||||
}
|
||||
migrate_send_rp_req_pages(mis, rb, aligned_rbo, client_addr);
|
||||
postcopy_request_page(mis, rb, aligned_rbo, client_addr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -970,7 +992,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
||||
break;
|
||||
}
|
||||
|
||||
rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
|
||||
rb_offset = ROUND_DOWN(rb_offset, qemu_ram_pagesize(rb));
|
||||
trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
|
||||
qemu_ram_get_idstr(rb),
|
||||
rb_offset,
|
||||
@ -984,8 +1006,8 @@ retry:
|
||||
* Send the request to the source - we want to request one
|
||||
* of our host page sizes (which is >= TPS)
|
||||
*/
|
||||
ret = migrate_send_rp_req_pages(mis, rb, rb_offset,
|
||||
msg.arg.pagefault.address);
|
||||
ret = postcopy_request_page(mis, rb, rb_offset,
|
||||
msg.arg.pagefault.address);
|
||||
if (ret) {
|
||||
/* May be network failure, try to wait for recovery */
|
||||
if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
|
||||
@ -993,7 +1015,7 @@ retry:
|
||||
goto retry;
|
||||
} else {
|
||||
/* This is a unavoidable fault */
|
||||
error_report("%s: migrate_send_rp_req_pages() get %d",
|
||||
error_report("%s: postcopy_request_page() get %d",
|
||||
__func__, ret);
|
||||
break;
|
||||
}
|
||||
|
180
migration/ram.c
180
migration/ram.c
@ -811,7 +811,7 @@ static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
|
||||
assert(shift >= 6);
|
||||
|
||||
size = 1ULL << (TARGET_PAGE_BITS + shift);
|
||||
start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
|
||||
start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
|
||||
trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
|
||||
memory_region_clear_dirty_bitmap(rb->mr, start, size);
|
||||
}
|
||||
@ -858,6 +858,81 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Clear the dirty bits that cover @section in the RAMBlock backing
 * section->mr and add the number of bits that were set to the counter
 * passed via @opaque.
 *
 * @section: range whose dirty bits get cleared; offset and size are
 *           converted to target pages by shifting with TARGET_PAGE_BITS.
 * @opaque:  pointer to a uint64_t accumulating the cleared bits.
 */
static void dirty_bitmap_clear_section(MemoryRegionSection *section,
                                       void *opaque)
{
    RAMBlock *block = section->mr->ram_block;
    uint64_t *total_cleared = opaque;
    const unsigned long first_page =
        section->offset_within_region >> TARGET_PAGE_BITS;
    const unsigned long page_count =
        int128_get64(section->size) >> TARGET_PAGE_BITS;

    /*
     * ram_state->bitmap_mutex is not taken here: this is expected to run
     * only when starting migration or during postcopy recovery, where there
     * is no concurrent access.
     */
    if (!(migration_in_postcopy() || migrate_background_snapshot())) {
        migration_clear_memory_region_dirty_bitmap_range(block, first_page,
                                                         page_count);
    }

    /* Count before clearing, otherwise there is nothing left to count. */
    *total_cleared += bitmap_count_one_with_offset(block->bmap, first_page,
                                                   page_count);
    bitmap_clear(block->bmap, first_page, page_count);
}
|
||||
|
||||
/*
|
||||
* Exclude all dirty pages from migration that fall into a discarded range as
|
||||
* managed by a RamDiscardManager responsible for the mapped memory region of
|
||||
* the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
|
||||
*
|
||||
* Discarded pages ("logically unplugged") have undefined content and must
|
||||
* not get migrated, because even reading these pages for migration might
|
||||
* result in undesired behavior.
|
||||
*
|
||||
* Returns the number of cleared bits in the RAMBlock dirty bitmap.
|
||||
*
|
||||
* Note: The result is only stable while migrating (precopy/postcopy).
|
||||
*/
|
||||
static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
|
||||
{
|
||||
uint64_t cleared_bits = 0;
|
||||
|
||||
if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
|
||||
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
|
||||
MemoryRegionSection section = {
|
||||
.mr = rb->mr,
|
||||
.offset_within_region = 0,
|
||||
.size = int128_make64(qemu_ram_get_used_length(rb)),
|
||||
};
|
||||
|
||||
ram_discard_manager_replay_discarded(rdm, §ion,
|
||||
dirty_bitmap_clear_section,
|
||||
&cleared_bits);
|
||||
}
|
||||
return cleared_bits;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if a host-page aligned page falls into a discarded range as managed by
|
||||
* a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
|
||||
*
|
||||
* Note: The result is only stable while migrating (precopy/postcopy).
|
||||
*/
|
||||
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
|
||||
{
|
||||
if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
|
||||
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
|
||||
MemoryRegionSection section = {
|
||||
.mr = rb->mr,
|
||||
.offset_within_region = start,
|
||||
.size = int128_make64(qemu_ram_pagesize(rb)),
|
||||
};
|
||||
|
||||
return !ram_discard_manager_is_populated(rdm, §ion);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Called with RCU critical section */
|
||||
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
|
||||
{
|
||||
@ -1564,25 +1639,68 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Populate the range [@offset, @offset + @size) of @block for reading.
 *
 * We read one byte of each page; this will preallocate page tables if
 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
 * where no page was populated yet. This might require adaption when
 * supporting other mappings, like shmem.
 *
 * @block:  RAM block to read from (block->host is the base address,
 *          block->page_size the stride).
 * @offset: byte offset into the block at which to start.
 * @size:   length of the range in bytes.
 */
static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
                                       ram_addr_t size)
{
    /*
     * @size is a length, not an end offset: bound the walk by offset + size
     * so that ranges not starting at 0 are populated completely.
     */
    const ram_addr_t end = offset + size;

    for (; offset < end; offset += block->page_size) {
        char tmp = *((char *)block->host + offset);

        /* Don't optimize the read out */
        asm volatile("" : "+r" (tmp));
    }
}
|
||||
|
||||
/*
 * Section callback: hand the RAMBlock, offset and size described by
 * @section to populate_read_range().
 *
 * @section: the range to populate, within section->mr.
 * @opaque:  unused.
 *
 * Always returns 0.
 */
static inline int populate_read_section(MemoryRegionSection *section,
                                        void *opaque)
{
    populate_read_range(section->mr->ram_block,
                        section->offset_within_region,
                        int128_get64(section->size));
    return 0;
}
|
||||
|
||||
/*
|
||||
* ram_block_populate_pages: populate memory in the RAM block by reading
|
||||
* an integer from the beginning of each page.
|
||||
* ram_block_populate_read: preallocate page tables and populate pages in the
|
||||
* RAM block by reading a byte of each page.
|
||||
*
|
||||
* Since it's solely used for userfault_fd WP feature, here we just
|
||||
* hardcode page size to qemu_real_host_page_size.
|
||||
*
|
||||
* @block: RAM block to populate
|
||||
*/
|
||||
static void ram_block_populate_pages(RAMBlock *block)
|
||||
static void ram_block_populate_read(RAMBlock *rb)
|
||||
{
|
||||
char *ptr = (char *) block->host;
|
||||
/*
|
||||
* Skip populating all pages that fall into a discarded range as managed by
|
||||
* a RamDiscardManager responsible for the mapped memory region of the
|
||||
* RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
|
||||
* must not get populated automatically. We don't have to track
|
||||
* modifications via userfaultfd WP reliably, because these pages will
|
||||
* not be part of the migration stream either way -- see
|
||||
* ramblock_dirty_bitmap_exclude_discarded_pages().
|
||||
*
|
||||
* Note: The result is only stable while migrating (precopy/postcopy).
|
||||
*/
|
||||
if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
|
||||
RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
|
||||
MemoryRegionSection section = {
|
||||
.mr = rb->mr,
|
||||
.offset_within_region = 0,
|
||||
.size = rb->mr->size,
|
||||
};
|
||||
|
||||
for (ram_addr_t offset = 0; offset < block->used_length;
|
||||
offset += qemu_real_host_page_size) {
|
||||
char tmp = *(ptr + offset);
|
||||
|
||||
/* Don't optimize the read out */
|
||||
asm volatile("" : "+r" (tmp));
|
||||
ram_discard_manager_replay_populated(rdm, §ion,
|
||||
populate_read_section, NULL);
|
||||
} else {
|
||||
populate_read_range(rb, 0, rb->used_length);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1609,7 +1727,7 @@ void ram_write_tracking_prepare(void)
|
||||
* UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
|
||||
* pages with pte_none() entries in page table.
|
||||
*/
|
||||
ram_block_populate_pages(block);
|
||||
ram_block_populate_read(block);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2216,7 +2334,14 @@ static void ram_save_cleanup(void *opaque)
|
||||
/* caller have hold iothread lock or is in a bh, so there is
|
||||
* no writing race against the migration bitmap
|
||||
*/
|
||||
memory_global_dirty_log_stop();
|
||||
if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
|
||||
/*
|
||||
* do not stop dirty log without starting it, since
|
||||
* memory_global_dirty_log_stop will assert that
|
||||
* memory_global_dirty_log_start/stop used in pairs
|
||||
*/
|
||||
memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
|
||||
}
|
||||
}
|
||||
|
||||
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
|
||||
@ -2668,6 +2793,19 @@ static void ram_list_init_bitmaps(void)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * For every RAMBlock visited by RAMBLOCK_FOREACH_NOT_IGNORED(), clear the
 * dirty bits of its discarded pages and subtract the number of cleared bits
 * from rs->migration_dirty_pages.
 *
 * @rs: RAM state whose dirty page counter gets adjusted.
 */
static void migration_bitmap_clear_discarded_pages(RAMState *rs)
{
    RAMBlock *block;

    /* The RAMBlock list is walked under the RCU read lock guard. */
    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long cleared =
            ramblock_dirty_bitmap_clear_discarded_pages(block);

        rs->migration_dirty_pages -= cleared;
    }
}
|
||||
|
||||
static void ram_init_bitmaps(RAMState *rs)
|
||||
{
|
||||
/* For memory_global_dirty_log_start below. */
|
||||
@ -2678,12 +2816,18 @@ static void ram_init_bitmaps(RAMState *rs)
|
||||
ram_list_init_bitmaps();
|
||||
/* We don't use dirty log with background snapshots */
|
||||
if (!migrate_background_snapshot()) {
|
||||
memory_global_dirty_log_start();
|
||||
memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
|
||||
migration_bitmap_sync_precopy(rs);
|
||||
}
|
||||
}
|
||||
qemu_mutex_unlock_ramlist();
|
||||
qemu_mutex_unlock_iothread();
|
||||
|
||||
/*
|
||||
* After an eventual first bitmap sync, fixup the initial bitmap
|
||||
* containing all 1s to exclude any discarded pages from migration.
|
||||
*/
|
||||
migration_bitmap_clear_discarded_pages(rs);
|
||||
}
|
||||
|
||||
static int ram_init_all(RAMState **rsp)
|
||||
@ -3434,7 +3578,7 @@ void colo_incoming_start_dirty_log(void)
|
||||
/* Discard this dirty bitmap record */
|
||||
bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
|
||||
}
|
||||
memory_global_dirty_log_start();
|
||||
memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
|
||||
}
|
||||
ram_state->migration_dirty_pages = 0;
|
||||
qemu_mutex_unlock_ramlist();
|
||||
@ -3446,7 +3590,7 @@ void colo_release_ram_cache(void)
|
||||
{
|
||||
RAMBlock *block;
|
||||
|
||||
memory_global_dirty_log_stop();
|
||||
memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
|
||||
RAMBLOCK_FOREACH_NOT_IGNORED(block) {
|
||||
g_free(block->bmap);
|
||||
block->bmap = NULL;
|
||||
@ -4112,6 +4256,10 @@ int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
|
||||
*/
|
||||
bitmap_complement(block->bmap, block->bmap, nbits);
|
||||
|
||||
/* Clear dirty bits of discarded ranges that we don't want to migrate. */
|
||||
ramblock_dirty_bitmap_clear_discarded_pages(block);
|
||||
|
||||
/* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
|
||||
trace_ram_dirty_bitmap_reload_complete(block->idstr);
|
||||
|
||||
/*
|
||||
|
@ -72,6 +72,7 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr);
|
||||
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
|
||||
const char *block_name);
|
||||
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb);
|
||||
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start);
|
||||
|
||||
/* ram cache */
|
||||
int colo_init_ram_cache(void);
|
||||
|
138
migration/rdma.c
138
migration/rdma.c
@ -358,9 +358,11 @@ typedef struct RDMAContext {
|
||||
struct ibv_context *verbs;
|
||||
struct rdma_event_channel *channel;
|
||||
struct ibv_qp *qp; /* queue pair */
|
||||
struct ibv_comp_channel *comp_channel; /* completion channel */
|
||||
struct ibv_comp_channel *recv_comp_channel; /* recv completion channel */
|
||||
struct ibv_comp_channel *send_comp_channel; /* send completion channel */
|
||||
struct ibv_pd *pd; /* protection domain */
|
||||
struct ibv_cq *cq; /* completion queue */
|
||||
struct ibv_cq *recv_cq; /* recvieve completion queue */
|
||||
struct ibv_cq *send_cq; /* send completion queue */
|
||||
|
||||
/*
|
||||
* If a previous write failed (perhaps because of a failed
|
||||
@ -1059,21 +1061,34 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* create completion channel */
|
||||
rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
|
||||
if (!rdma->comp_channel) {
|
||||
error_report("failed to allocate completion channel");
|
||||
/* create receive completion channel */
|
||||
rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
|
||||
if (!rdma->recv_comp_channel) {
|
||||
error_report("failed to allocate receive completion channel");
|
||||
goto err_alloc_pd_cq;
|
||||
}
|
||||
|
||||
/*
|
||||
* Completion queue can be filled by both read and write work requests,
|
||||
* so must reflect the sum of both possible queue sizes.
|
||||
* Completion queue can be filled by read work requests.
|
||||
*/
|
||||
rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
|
||||
NULL, rdma->comp_channel, 0);
|
||||
if (!rdma->cq) {
|
||||
error_report("failed to allocate completion queue");
|
||||
rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
|
||||
NULL, rdma->recv_comp_channel, 0);
|
||||
if (!rdma->recv_cq) {
|
||||
error_report("failed to allocate receive completion queue");
|
||||
goto err_alloc_pd_cq;
|
||||
}
|
||||
|
||||
/* create send completion channel */
|
||||
rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
|
||||
if (!rdma->send_comp_channel) {
|
||||
error_report("failed to allocate send completion channel");
|
||||
goto err_alloc_pd_cq;
|
||||
}
|
||||
|
||||
rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
|
||||
NULL, rdma->send_comp_channel, 0);
|
||||
if (!rdma->send_cq) {
|
||||
error_report("failed to allocate send completion queue");
|
||||
goto err_alloc_pd_cq;
|
||||
}
|
||||
|
||||
@ -1083,11 +1098,19 @@ err_alloc_pd_cq:
|
||||
if (rdma->pd) {
|
||||
ibv_dealloc_pd(rdma->pd);
|
||||
}
|
||||
if (rdma->comp_channel) {
|
||||
ibv_destroy_comp_channel(rdma->comp_channel);
|
||||
if (rdma->recv_comp_channel) {
|
||||
ibv_destroy_comp_channel(rdma->recv_comp_channel);
|
||||
}
|
||||
if (rdma->send_comp_channel) {
|
||||
ibv_destroy_comp_channel(rdma->send_comp_channel);
|
||||
}
|
||||
if (rdma->recv_cq) {
|
||||
ibv_destroy_cq(rdma->recv_cq);
|
||||
rdma->recv_cq = NULL;
|
||||
}
|
||||
rdma->pd = NULL;
|
||||
rdma->comp_channel = NULL;
|
||||
rdma->recv_comp_channel = NULL;
|
||||
rdma->send_comp_channel = NULL;
|
||||
return -1;
|
||||
|
||||
}
|
||||
@ -1104,8 +1127,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
|
||||
attr.cap.max_recv_wr = 3;
|
||||
attr.cap.max_send_sge = 1;
|
||||
attr.cap.max_recv_sge = 1;
|
||||
attr.send_cq = rdma->cq;
|
||||
attr.recv_cq = rdma->cq;
|
||||
attr.send_cq = rdma->send_cq;
|
||||
attr.recv_cq = rdma->recv_cq;
|
||||
attr.qp_type = IBV_QPT_RC;
|
||||
|
||||
ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
|
||||
@ -1496,14 +1519,14 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
|
||||
* (of any kind) has completed.
|
||||
* Return the work request ID that completed.
|
||||
*/
|
||||
static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
|
||||
uint32_t *byte_len)
|
||||
static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
|
||||
uint64_t *wr_id_out, uint32_t *byte_len)
|
||||
{
|
||||
int ret;
|
||||
struct ibv_wc wc;
|
||||
uint64_t wr_id;
|
||||
|
||||
ret = ibv_poll_cq(rdma->cq, 1, &wc);
|
||||
ret = ibv_poll_cq(cq, 1, &wc);
|
||||
|
||||
if (!ret) {
|
||||
*wr_id_out = RDMA_WRID_NONE;
|
||||
@ -1575,7 +1598,8 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
|
||||
/* Wait for activity on the completion channel.
|
||||
* Returns 0 on success, none-0 on error.
|
||||
*/
|
||||
static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
|
||||
static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
|
||||
struct ibv_comp_channel *comp_channel)
|
||||
{
|
||||
struct rdma_cm_event *cm_event;
|
||||
int ret = -1;
|
||||
@ -1586,7 +1610,7 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
|
||||
*/
|
||||
if (rdma->migration_started_on_destination &&
|
||||
migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
|
||||
yield_until_fd_readable(rdma->comp_channel->fd);
|
||||
yield_until_fd_readable(comp_channel->fd);
|
||||
} else {
|
||||
/* This is the source side, we're in a separate thread
|
||||
* or destination prior to migration_fd_process_incoming()
|
||||
@ -1597,7 +1621,7 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
|
||||
*/
|
||||
while (!rdma->error_state && !rdma->received_error) {
|
||||
GPollFD pfds[2];
|
||||
pfds[0].fd = rdma->comp_channel->fd;
|
||||
pfds[0].fd = comp_channel->fd;
|
||||
pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
|
||||
pfds[0].revents = 0;
|
||||
|
||||
@ -1655,6 +1679,17 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
|
||||
return rdma->error_state;
|
||||
}
|
||||
|
||||
/*
 * Select the completion channel belonging to work request id @wrid:
 * ids below RDMA_WRID_RECV_CONTROL map to the send completion channel,
 * all other ids to the receive completion channel.
 */
static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
{
    if (wrid < RDMA_WRID_RECV_CONTROL) {
        return rdma->send_comp_channel;
    }
    return rdma->recv_comp_channel;
}
|
||||
|
||||
/*
 * Select the completion queue belonging to work request id @wrid:
 * ids below RDMA_WRID_RECV_CONTROL map to the send completion queue,
 * all other ids to the receive completion queue.
 */
static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
{
    if (wrid < RDMA_WRID_RECV_CONTROL) {
        return rdma->send_cq;
    }
    return rdma->recv_cq;
}
|
||||
|
||||
/*
|
||||
* Block until the next work request has completed.
|
||||
*
|
||||
@ -1675,13 +1710,15 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
|
||||
struct ibv_cq *cq;
|
||||
void *cq_ctx;
|
||||
uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
|
||||
struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
|
||||
struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
|
||||
|
||||
if (ibv_req_notify_cq(rdma->cq, 0)) {
|
||||
if (ibv_req_notify_cq(poll_cq, 0)) {
|
||||
return -1;
|
||||
}
|
||||
/* poll cq first */
|
||||
while (wr_id != wrid_requested) {
|
||||
ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
|
||||
ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
@ -1702,12 +1739,12 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
|
||||
}
|
||||
|
||||
while (1) {
|
||||
ret = qemu_rdma_wait_comp_channel(rdma);
|
||||
ret = qemu_rdma_wait_comp_channel(rdma, ch);
|
||||
if (ret) {
|
||||
goto err_block_for_wrid;
|
||||
}
|
||||
|
||||
ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
|
||||
ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
|
||||
if (ret) {
|
||||
perror("ibv_get_cq_event");
|
||||
goto err_block_for_wrid;
|
||||
@ -1721,7 +1758,7 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
|
||||
}
|
||||
|
||||
while (wr_id != wrid_requested) {
|
||||
ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
|
||||
ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
|
||||
if (ret < 0) {
|
||||
goto err_block_for_wrid;
|
||||
}
|
||||
@ -2437,13 +2474,21 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
|
||||
rdma_destroy_qp(rdma->cm_id);
|
||||
rdma->qp = NULL;
|
||||
}
|
||||
if (rdma->cq) {
|
||||
ibv_destroy_cq(rdma->cq);
|
||||
rdma->cq = NULL;
|
||||
if (rdma->recv_cq) {
|
||||
ibv_destroy_cq(rdma->recv_cq);
|
||||
rdma->recv_cq = NULL;
|
||||
}
|
||||
if (rdma->comp_channel) {
|
||||
ibv_destroy_comp_channel(rdma->comp_channel);
|
||||
rdma->comp_channel = NULL;
|
||||
if (rdma->send_cq) {
|
||||
ibv_destroy_cq(rdma->send_cq);
|
||||
rdma->send_cq = NULL;
|
||||
}
|
||||
if (rdma->recv_comp_channel) {
|
||||
ibv_destroy_comp_channel(rdma->recv_comp_channel);
|
||||
rdma->recv_comp_channel = NULL;
|
||||
}
|
||||
if (rdma->send_comp_channel) {
|
||||
ibv_destroy_comp_channel(rdma->send_comp_channel);
|
||||
rdma->send_comp_channel = NULL;
|
||||
}
|
||||
if (rdma->pd) {
|
||||
ibv_dealloc_pd(rdma->pd);
|
||||
@ -3115,10 +3160,14 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
|
||||
{
|
||||
QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
|
||||
if (io_read) {
|
||||
aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
|
||||
aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
|
||||
false, io_read, io_write, NULL, opaque);
|
||||
aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd,
|
||||
false, io_read, io_write, NULL, opaque);
|
||||
} else {
|
||||
aio_set_fd_handler(ctx, rioc->rdmaout->comp_channel->fd,
|
||||
aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd,
|
||||
false, io_read, io_write, NULL, opaque);
|
||||
aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd,
|
||||
false, io_read, io_write, NULL, opaque);
|
||||
}
|
||||
}
|
||||
@ -3332,7 +3381,22 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
|
||||
*/
|
||||
while (1) {
|
||||
uint64_t wr_id, wr_id_in;
|
||||
int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
|
||||
int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
|
||||
if (ret < 0) {
|
||||
error_report("rdma migration: polling error! %d", ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
|
||||
|
||||
if (wr_id == RDMA_WRID_NONE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while (1) {
|
||||
uint64_t wr_id, wr_id_in;
|
||||
int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
|
||||
if (ret < 0) {
|
||||
error_report("rdma migration: polling error! %d", ret);
|
||||
goto err;
|
||||
|
@ -333,6 +333,8 @@ get_ramblock_vfn_hash(const char *idstr, uint64_t vfn, uint32_t crc) "ramblock n
|
||||
calc_page_dirty_rate(const char *idstr, uint32_t new_crc, uint32_t old_crc) "ramblock name: %s, new crc: %" PRIu32 ", old crc: %" PRIu32
|
||||
skip_sample_ramblock(const char *idstr, uint64_t ramblock_size) "ramblock name: %s, ramblock size: %" PRIu64
|
||||
find_page_matched(const char *idstr) "ramblock %s addr or size changed"
|
||||
dirtyrate_calculate(int64_t dirtyrate) "dirty rate: %" PRIi64 " MB/s"
|
||||
dirtyrate_do_calculate_vcpu(int idx, uint64_t rate) "vcpu[%d]: %"PRIu64 " MB/s"
|
||||
|
||||
# block.c
|
||||
migration_block_init_shared(const char *blk_device_name) "Start migration for %s with shared base image"
|
||||
|
@ -1731,6 +1731,21 @@
|
||||
{ 'event': 'UNPLUG_PRIMARY',
|
||||
'data': { 'device-id': 'str' } }
|
||||
|
||||
##
|
||||
# @DirtyRateVcpu:
|
||||
#
|
||||
# Dirty rate of vcpu.
|
||||
#
|
||||
# @id: vcpu index.
|
||||
#
|
||||
# @dirty-rate: dirty rate.
|
||||
#
|
||||
# Since: 6.1
|
||||
#
|
||||
##
|
||||
{ 'struct': 'DirtyRateVcpu',
|
||||
'data': { 'id': 'int', 'dirty-rate': 'int64' } }
|
||||
|
||||
##
|
||||
# @DirtyRateStatus:
|
||||
#
|
||||
@ -1748,6 +1763,23 @@
|
||||
{ 'enum': 'DirtyRateStatus',
|
||||
'data': [ 'unstarted', 'measuring', 'measured'] }
|
||||
|
||||
##
|
||||
# @DirtyRateMeasureMode:
|
||||
#
|
||||
# An enumeration of mode of measuring dirtyrate.
|
||||
#
|
||||
# @page-sampling: calculate dirtyrate by sampling pages.
|
||||
#
|
||||
# @dirty-ring: calculate dirtyrate by dirty ring.
|
||||
#
|
||||
# @dirty-bitmap: calculate dirtyrate by dirty bitmap.
|
||||
#
|
||||
# Since: 6.1
|
||||
#
|
||||
##
|
||||
{ 'enum': 'DirtyRateMeasureMode',
|
||||
'data': ['page-sampling', 'dirty-ring', 'dirty-bitmap'] }
|
||||
|
||||
##
|
||||
# @DirtyRateInfo:
|
||||
#
|
||||
@ -1766,6 +1798,12 @@
|
||||
# @sample-pages: page count per GB for sample dirty pages
|
||||
# the default value is 512 (since 6.1)
|
||||
#
|
||||
# @mode: mode containing method of calculate dirtyrate includes
|
||||
# 'page-sampling' and 'dirty-ring' (Since 6.1)
|
||||
#
|
||||
# @vcpu-dirty-rate: dirtyrate for each vcpu if dirty-ring
|
||||
# mode specified (Since 6.1)
|
||||
#
|
||||
# Since: 5.2
|
||||
#
|
||||
##
|
||||
@ -1774,7 +1812,9 @@
|
||||
'status': 'DirtyRateStatus',
|
||||
'start-time': 'int64',
|
||||
'calc-time': 'int64',
|
||||
'sample-pages': 'uint64'} }
|
||||
'sample-pages': 'uint64',
|
||||
'mode': 'DirtyRateMeasureMode',
|
||||
'*vcpu-dirty-rate': [ 'DirtyRateVcpu' ] } }
|
||||
|
||||
##
|
||||
# @calc-dirty-rate:
|
||||
@ -1786,6 +1826,9 @@
|
||||
# @sample-pages: page count per GB for sample dirty pages
|
||||
# the default value is 512 (since 6.1)
|
||||
#
|
||||
# @mode: mechanism of calculating dirtyrate includes
|
||||
# 'page-sampling' and 'dirty-ring' (Since 6.1)
|
||||
#
|
||||
# Since: 5.2
|
||||
#
|
||||
# Example:
|
||||
@ -1794,7 +1837,8 @@
|
||||
#
|
||||
##
|
||||
{ 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64',
|
||||
'*sample-pages': 'int'} }
|
||||
'*sample-pages': 'int',
|
||||
'*mode': 'DirtyRateMeasureMode'} }
|
||||
|
||||
##
|
||||
# @query-dirty-rate:
|
||||
|
@ -39,7 +39,7 @@
|
||||
static unsigned memory_region_transaction_depth;
|
||||
static bool memory_region_update_pending;
|
||||
static bool ioeventfd_update_pending;
|
||||
bool global_dirty_log;
|
||||
unsigned int global_dirty_tracking;
|
||||
|
||||
static QTAILQ_HEAD(, MemoryListener) memory_listeners
|
||||
= QTAILQ_HEAD_INITIALIZER(memory_listeners);
|
||||
@ -1821,7 +1821,7 @@ uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr)
|
||||
uint8_t mask = mr->dirty_log_mask;
|
||||
RAMBlock *rb = mr->ram_block;
|
||||
|
||||
if (global_dirty_log && ((rb && qemu_ram_is_migratable(rb)) ||
|
||||
if (global_dirty_tracking && ((rb && qemu_ram_is_migratable(rb)) ||
|
||||
memory_region_is_iommu(mr))) {
|
||||
mask |= (1 << DIRTY_MEMORY_MIGRATION);
|
||||
}
|
||||
@ -2081,6 +2081,17 @@ int ram_discard_manager_replay_populated(const RamDiscardManager *rdm,
|
||||
return rdmc->replay_populated(rdm, section, replay_fn, opaque);
|
||||
}
|
||||
|
||||
void ram_discard_manager_replay_discarded(const RamDiscardManager *rdm,
|
||||
MemoryRegionSection *section,
|
||||
ReplayRamDiscard replay_fn,
|
||||
void *opaque)
|
||||
{
|
||||
RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_GET_CLASS(rdm);
|
||||
|
||||
g_assert(rdmc->replay_discarded);
|
||||
rdmc->replay_discarded(rdm, section, replay_fn, opaque);
|
||||
}
|
||||
|
||||
void ram_discard_manager_register_listener(RamDiscardManager *rdm,
|
||||
RamDiscardListener *rdl,
|
||||
MemoryRegionSection *section)
|
||||
@ -2760,14 +2771,18 @@ void memory_global_after_dirty_log_sync(void)
|
||||
|
||||
static VMChangeStateEntry *vmstate_change;
|
||||
|
||||
void memory_global_dirty_log_start(void)
|
||||
void memory_global_dirty_log_start(unsigned int flags)
|
||||
{
|
||||
if (vmstate_change) {
|
||||
qemu_del_vm_change_state_handler(vmstate_change);
|
||||
vmstate_change = NULL;
|
||||
}
|
||||
|
||||
global_dirty_log = true;
|
||||
assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
|
||||
assert(!(global_dirty_tracking & flags));
|
||||
global_dirty_tracking |= flags;
|
||||
|
||||
trace_global_dirty_changed(global_dirty_tracking);
|
||||
|
||||
MEMORY_LISTENER_CALL_GLOBAL(log_global_start, Forward);
|
||||
|
||||
@ -2777,9 +2792,13 @@ void memory_global_dirty_log_start(void)
|
||||
memory_region_transaction_commit();
|
||||
}
|
||||
|
||||
static void memory_global_dirty_log_do_stop(void)
|
||||
static void memory_global_dirty_log_do_stop(unsigned int flags)
|
||||
{
|
||||
global_dirty_log = false;
|
||||
assert(flags && !(flags & (~GLOBAL_DIRTY_MASK)));
|
||||
assert((global_dirty_tracking & flags) == flags);
|
||||
global_dirty_tracking &= ~flags;
|
||||
|
||||
trace_global_dirty_changed(global_dirty_tracking);
|
||||
|
||||
/* Refresh DIRTY_MEMORY_MIGRATION bit. */
|
||||
memory_region_transaction_begin();
|
||||
@ -2792,8 +2811,9 @@ static void memory_global_dirty_log_do_stop(void)
|
||||
static void memory_vm_change_state_handler(void *opaque, bool running,
|
||||
RunState state)
|
||||
{
|
||||
unsigned int flags = (unsigned int)(uintptr_t)opaque;
|
||||
if (running) {
|
||||
memory_global_dirty_log_do_stop();
|
||||
memory_global_dirty_log_do_stop(flags);
|
||||
|
||||
if (vmstate_change) {
|
||||
qemu_del_vm_change_state_handler(vmstate_change);
|
||||
@ -2802,18 +2822,19 @@ static void memory_vm_change_state_handler(void *opaque, bool running,
|
||||
}
|
||||
}
|
||||
|
||||
void memory_global_dirty_log_stop(void)
|
||||
void memory_global_dirty_log_stop(unsigned int flags)
|
||||
{
|
||||
if (!runstate_is_running()) {
|
||||
if (vmstate_change) {
|
||||
return;
|
||||
}
|
||||
vmstate_change = qemu_add_vm_change_state_handler(
|
||||
memory_vm_change_state_handler, NULL);
|
||||
memory_vm_change_state_handler,
|
||||
(void *)(uintptr_t)flags);
|
||||
return;
|
||||
}
|
||||
|
||||
memory_global_dirty_log_do_stop();
|
||||
memory_global_dirty_log_do_stop(flags);
|
||||
}
|
||||
|
||||
static void listener_add_address_space(MemoryListener *listener,
|
||||
@ -2825,7 +2846,7 @@ static void listener_add_address_space(MemoryListener *listener,
|
||||
if (listener->begin) {
|
||||
listener->begin(listener);
|
||||
}
|
||||
if (global_dirty_log) {
|
||||
if (global_dirty_tracking) {
|
||||
if (listener->log_global_start) {
|
||||
listener->log_global_start(listener);
|
||||
}
|
||||
|
@ -19,6 +19,7 @@ memory_region_sync_dirty(const char *mr, const char *listener, int global) "mr '
|
||||
flatview_new(void *view, void *root) "%p (root %p)"
|
||||
flatview_destroy(void *view, void *root) "%p (root %p)"
|
||||
flatview_destroy_rcu(void *view, void *root) "%p (root %p)"
|
||||
global_dirty_changed(unsigned int bitmask) "bitmask 0x%"PRIx32
|
||||
|
||||
# softmmu.c
|
||||
vm_stop_flush_all(int ret) "ret %d"
|
||||
|
Loading…
Reference in New Issue
Block a user