qemu/system/dirtylimit.c

678 lines
17 KiB
C
Raw Normal View History

/*
* Dirty page rate limit implementation code
*
* Copyright (c) 2022 CHINA TELECOM CO.,LTD.
*
* Authors:
* Hyman Huang() <huangy81@chinatelecom.cn>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/qdict.h"
#include "qapi/error.h"
#include "sysemu/dirtyrate.h"
#include "sysemu/dirtylimit.h"
#include "monitor/hmp.h"
#include "monitor/monitor.h"
#include "exec/memory.h"
#include "exec/target_page.h"
#include "hw/boards.h"
#include "sysemu/kvm.h"
#include "trace.h"
#include "migration/misc.h"
/*
 * Throttle adjustment stops for a vCPU once the absolute difference
 * between its quota and its current dirty page rate is no larger than
 * DIRTYLIMIT_TOLERANCE_RANGE.
 */
#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */
/*
 * If the dirty page rate error, expressed as a percentage of the larger
 * of quota and current rate, exceeds DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT,
 * the vCPU sleep time is increased or decreased proportionally to the
 * error. Otherwise it is increased or decreased by a fixed step.
 */
#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50
/*
 * Max vCPU sleep time percentage during a cycle composed of dirty ring
 * full time and sleep time. It is applied as a cap on the sleep time of
 * DIRTYLIMIT_THROTTLE_PCT_MAX times the estimated dirty ring full time.
 */
#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
/*
 * Dirty page rate statistics shared between the "dirtyrate-stat" sampling
 * thread and the readers in this file. Allocated by
 * vcpu_dirty_rate_stat_initialize() and released by
 * vcpu_dirty_rate_stat_finalize().
 */
struct {
/* Last published per-vCPU rates; rates[] is sized for max_cpus entries */
VcpuStat stat;
/* Non-zero while the sampling thread should keep looping */
bool running;
/* Sampling thread handle, joined by vcpu_dirty_rate_stat_stop() */
QemuThread thread;
} *vcpu_dirty_rate_stat;
/* Dirty page rate limit bookkeeping for a single vCPU. */
typedef struct VcpuDirtyLimitState {
/* Index of the vCPU this entry describes (equals its slot in the array) */
int cpu_index;
/* True while a dirty page rate limit is in effect for this vCPU */
bool enabled;
/*
 * Quota dirty page rate, unit is MB/s;
 * zero if the limit is not enabled.
 */
uint64_t quota;
} VcpuDirtyLimitState;
/*
 * Global dirty limit state. A non-NULL pointer means the dirty limit
 * service is active (see dirtylimit_in_service()).
 */
struct {
/* One entry per possible vCPU, max_cpus entries in total */
VcpuDirtyLimitState *states;
/* Max cpus number configured by user */
int max_cpus;
/* Number of vCPUs that currently have a dirty limit enabled */
int limited_nvcpu;
} *dirtylimit_state;
/* Protects dirtylimit_state; taken via dirtylimit_state_lock()/unlock() */
static QemuMutex dirtylimit_mutex;
/*
 * While true, dirtylimit_process() returns without adjusting any vCPU
 * throttle. Toggled through dirtylimit_change().
 */
static bool dirtylimit_quit;
/*
 * Sample the dirty page rate of every vCPU once and publish the result
 * into vcpu_dirty_rate_stat.
 *
 * The sampling window defaults to DIRTYLIMIT_CALC_TIME_MS. While a
 * migration is active with migrate_dirty_limit() set, the period returned
 * by migrate_vcpu_dirty_limit_period() is used instead.
 */
static void vcpu_dirty_rate_stat_collect(void)
{
    int64_t sample_ms = DIRTYLIMIT_CALC_TIME_MS;
    VcpuStat sampled;
    int idx;

    if (migrate_dirty_limit() && migration_is_active()) {
        sample_ms = migrate_vcpu_dirty_limit_period();
    }

    /* calculate vcpu dirtyrate */
    vcpu_calculate_dirtyrate(sample_ms, &sampled, GLOBAL_DIRTY_LIMIT, false);

    /* copy the fresh sample into the shared statistics, slot by slot */
    for (idx = 0; idx < sampled.nvcpu; idx++) {
        vcpu_dirty_rate_stat->stat.rates[idx].id = idx;
        vcpu_dirty_rate_stat->stat.rates[idx].dirty_rate =
            sampled.rates[idx].dirty_rate;
    }

    /* the sampled array is no longer needed once it has been copied */
    g_free(sampled.rates);
}
/*
 * Body of the "dirtyrate-stat" thread.
 *
 * Enables GLOBAL_DIRTY_LIMIT dirty logging, then repeatedly samples the
 * per-vCPU dirty rates and, while the dirty limit service is active,
 * adjusts the vCPU throttle. The loop ends once the running flag of
 * vcpu_dirty_rate_stat is cleared; dirty logging is then switched off.
 *
 * @opaque: unused
 *
 * Returns NULL.
 */
static void *vcpu_dirty_rate_stat_thread(void *opaque)
{
    rcu_register_thread();

    /* start log sync */
    global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);

    for (;;) {
        if (!qatomic_read(&vcpu_dirty_rate_stat->running)) {
            break;
        }

        vcpu_dirty_rate_stat_collect();

        if (dirtylimit_in_service()) {
            dirtylimit_process();
        }
    }

    /* stop log sync */
    global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);

    rcu_unregister_thread();
    return NULL;
}
/*
 * Return the most recently published dirty page rate of a vCPU.
 *
 * @cpu_index: slot in the shared rates array; not range-checked here
 */
int64_t vcpu_dirty_rate_get(int cpu_index)
{
    return qatomic_read_i64(
        &vcpu_dirty_rate_stat->stat.rates[cpu_index].dirty_rate);
}
/*
 * Start the "dirtyrate-stat" sampling thread.
 *
 * Does nothing if the running flag is already set. Otherwise the flag is
 * set and a joinable thread running vcpu_dirty_rate_stat_thread() is
 * created.
 */
void vcpu_dirty_rate_stat_start(void)
{
    if (!qatomic_read(&vcpu_dirty_rate_stat->running)) {
        qatomic_set(&vcpu_dirty_rate_stat->running, 1);
        qemu_thread_create(&vcpu_dirty_rate_stat->thread,
                           "dirtyrate-stat",
                           vcpu_dirty_rate_stat_thread,
                           NULL,
                           QEMU_THREAD_JOINABLE);
    }
}
/*
 * Stop the "dirtyrate-stat" sampling thread and wait for it to exit.
 *
 * The function releases the dirtylimit state lock and the BQL before
 * joining, and re-acquires both (in reverse order) afterwards, so it
 * expects to be entered with both held. The sampling thread takes the
 * dirtylimit state lock in dirtylimit_process(), so holding it across
 * the join could block the thread from finishing.
 * NOTE(review): the BQL is presumably dropped for the same reason
 * (work done by the thread while it winds down) - confirm.
 */
void vcpu_dirty_rate_stat_stop(void)
{
    /* ask the sampling loop to terminate */
    qatomic_set(&vcpu_dirty_rate_stat->running, 0);

    dirtylimit_state_unlock();
    bql_unlock();

    qemu_thread_join(&vcpu_dirty_rate_stat->thread);

    bql_lock();
    dirtylimit_state_lock();
}
/*
 * Allocate the shared dirty rate statistics.
 *
 * The rates array gets one zero-initialized slot per possible vCPU
 * (the machine's smp.max_cpus); the sampling thread is marked as not
 * running.
 */
void vcpu_dirty_rate_stat_initialize(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;

    vcpu_dirty_rate_stat = g_malloc0(sizeof(*vcpu_dirty_rate_stat));

    vcpu_dirty_rate_stat->running = false;
    vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
    vcpu_dirty_rate_stat->stat.rates = g_new0(DirtyRateVcpu, max_cpus);
}
/*
 * Release the shared dirty rate statistics: first the rates array, then
 * the container itself. Both pointers are reset to NULL after freeing.
 */
void vcpu_dirty_rate_stat_finalize(void)
{
    /* per-vCPU rates array */
    g_free(vcpu_dirty_rate_stat->stat.rates);
    vcpu_dirty_rate_stat->stat.rates = NULL;

    /* the statistics container */
    g_free(vcpu_dirty_rate_stat);
    vcpu_dirty_rate_stat = NULL;
}
/* Acquire the mutex that protects dirtylimit_state. */
void dirtylimit_state_lock(void)
{
    qemu_mutex_lock(&dirtylimit_mutex);
}
/* Release the mutex that protects dirtylimit_state. */
void dirtylimit_state_unlock(void)
{
    qemu_mutex_unlock(&dirtylimit_mutex);
}
/*
 * Initialize dirtylimit_mutex. Marked as a constructor so the mutex is
 * ready without an explicit initialization call.
 */
static void __attribute__((__constructor__))
dirtylimit_mutex_init(void)
{
    qemu_mutex_init(&dirtylimit_mutex);
}
/*
 * Return the dirty limit state entry of a vCPU.
 *
 * @cpu_index: slot in dirtylimit_state->states; not range-checked here
 */
static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
{
    return dirtylimit_state->states + cpu_index;
}
/*
 * Allocate the global dirty limit state with one zero-initialized entry
 * per possible vCPU (the machine's smp.max_cpus) and record each entry's
 * own index in it.
 */
void dirtylimit_state_initialize(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;
    int slot = 0;

    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
    dirtylimit_state->states = g_new0(VcpuDirtyLimitState, max_cpus);

    while (slot < max_cpus) {
        dirtylimit_state->states[slot].cpu_index = slot;
        slot++;
    }

    dirtylimit_state->max_cpus = max_cpus;

    trace_dirtylimit_state_initialize(max_cpus);
}
/*
 * Release the global dirty limit state: first the per-vCPU array, then
 * the container. dirtylimit_state is NULL afterwards, which is what
 * dirtylimit_in_service() tests.
 */
void dirtylimit_state_finalize(void)
{
    /* per-vCPU entries */
    g_free(dirtylimit_state->states);
    dirtylimit_state->states = NULL;

    /* the state container */
    g_free(dirtylimit_state);
    dirtylimit_state = NULL;

    trace_dirtylimit_state_finalize();
}
/* Return true while the global dirty limit state is allocated. */
bool dirtylimit_in_service(void)
{
    return dirtylimit_state != NULL;
}
/*
 * Check that @cpu_index lies in [0, smp.max_cpus) of the current machine.
 *
 * Returns true if the index is in range, false otherwise.
 */
bool dirtylimit_vcpu_index_valid(int cpu_index)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    return cpu_index >= 0 && cpu_index < ms->smp.max_cpus;
}
/*
 * Estimate how long it takes to fill the dirty ring, in microseconds.
 *
 * The estimate divides the dirty ring size (in MiB) by the highest dirty
 * page rate this function has been called with so far; that high-water
 * mark is kept in a function-local static and is shared by all callers.
 *
 * @dirtyrate: dirty page rate, in MB/s
 *
 * Returns the estimated ring-full time in microseconds, or 0 if no
 * non-zero dirty rate has been seen yet (avoids a division by zero,
 * e.g. when an averaged rate truncates to 0).
 */
static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
{
    static uint64_t max_dirtyrate;
    uint64_t dirty_ring_size_MiB;

    dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size());

    if (max_dirtyrate < dirtyrate) {
        max_dirtyrate = dirtyrate;
    }

    if (max_dirtyrate == 0) {
        return 0;
    }

    /* MiB * 1000000 / (MB/s) -> microseconds */
    return dirty_ring_size_MiB * 1000000 / max_dirtyrate;
}
static inline bool dirtylimit_done(uint64_t quota,
uint64_t current)
{
uint64_t min, max;
min = MIN(quota, current);
max = MAX(quota, current);
return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
}
/*
 * Decide whether the throttle should be adjusted proportionally.
 *
 * @quota: target dirty page rate
 * @current: measured dirty page rate
 *
 * Returns true if the difference between the two, as a percentage of the
 * larger one, exceeds DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT. The larger value
 * is used as a divisor, so at least one argument must be non-zero.
 */
static inline bool
dirtylimit_need_linear_adjustment(uint64_t quota,
                                  uint64_t current)
{
    uint64_t larger = quota;
    uint64_t smaller = current;

    if (current > quota) {
        larger = current;
        smaller = quota;
    }

    return ((larger - smaller) * 100 / larger) >
           DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
}
/*
 * Recompute the per-ring-full sleep time of a vCPU.
 *
 * @cpu: vCPU whose throttle_us_per_full is updated
 * @quota: target dirty page rate
 * @current: measured dirty page rate
 *
 * A zero @current clears the throttle. Otherwise the sleep time is
 * raised when the vCPU is above its quota and lowered when it is not:
 * proportionally to the error when dirtylimit_need_linear_adjustment()
 * says so, else by a fixed step of one tenth of the ring-full time.
 * The result is clamped to
 * [0, ring-full time * DIRTYLIMIT_THROTTLE_PCT_MAX].
 */
static void dirtylimit_set_throttle(CPUState *cpu,
                                    uint64_t quota,
                                    uint64_t current)
{
    const bool over_quota = quota < current;
    int64_t ring_full_time_us = 0;
    uint64_t sleep_pct = 0;
    uint64_t throttle_us = 0;

    if (current == 0) {
        cpu->throttle_us_per_full = 0;
        return;
    }

    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);

    if (!dirtylimit_need_linear_adjustment(quota, current)) {
        /* small error: move by a fixed step */
        if (over_quota) {
            cpu->throttle_us_per_full += ring_full_time_us / 10;
        } else {
            cpu->throttle_us_per_full -= ring_full_time_us / 10;
        }
    } else {
        /* large error: step size follows the error percentage */
        if (over_quota) {
            sleep_pct = (current - quota) * 100 / current;
        } else {
            sleep_pct = (quota - current) * 100 / quota;
        }

        throttle_us =
            ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);

        if (over_quota) {
            cpu->throttle_us_per_full += throttle_us;
        } else {
            cpu->throttle_us_per_full -= throttle_us;
        }

        trace_dirtylimit_throttle_pct(cpu->cpu_index,
                                      sleep_pct,
                                      throttle_us);
    }

    /*
     * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
     *       current dirty page rate may never reach the quota, we should stop
     *       increasing sleep time?
     */
    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);

    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
}
/*
 * Adjust the throttle of @cpu unless its measured dirty page rate is
 * already within tolerance of its quota (see dirtylimit_done()).
 */
static void dirtylimit_adjust_throttle(CPUState *cpu)
{
    int cpu_index = cpu->cpu_index;
    uint64_t quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
    uint64_t current = vcpu_dirty_rate_get(cpu_index);

    if (dirtylimit_done(quota, current)) {
        return;
    }

    dirtylimit_set_throttle(cpu, quota, current);
}
/*
 * Run one throttle adjustment pass over all vCPUs that have a dirty
 * limit enabled.
 *
 * Returns immediately while dirtylimit_quit is set. The pass itself runs
 * under the dirtylimit state lock and is skipped if the service has been
 * torn down in the meantime.
 */
void dirtylimit_process(void)
{
    CPUState *cpu;

    if (qatomic_read(&dirtylimit_quit)) {
        return;
    }

    dirtylimit_state_lock();

    if (dirtylimit_in_service()) {
        CPU_FOREACH(cpu) {
            if (dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
                dirtylimit_adjust_throttle(cpu);
            }
        }
    }

    dirtylimit_state_unlock();
}
/*
 * Enable or disable throttle processing.
 *
 * @start: true clears dirtylimit_quit, false sets it
 */
void dirtylimit_change(bool start)
{
    qatomic_set(&dirtylimit_quit, start ? 0 : 1);
}
/*
 * Enable or disable the dirty page rate limit of one vCPU.
 *
 * @cpu_index: vCPU to update; not range-checked here
 * @quota: dirty page rate quota, only stored when @enable is true
 * @enable: true to enable the limit, false to disable it
 *
 * Disabling stores a quota of zero. limited_nvcpu is adjusted only when
 * the enabled state of the vCPU actually changes.
 */
void dirtylimit_set_vcpu(int cpu_index,
                         uint64_t quota,
                         bool enable)
{
    bool was_enabled;

    trace_dirtylimit_set_vcpu(cpu_index, quota);

    was_enabled = dirtylimit_vcpu_get_state(cpu_index)->enabled;

    dirtylimit_state->states[cpu_index].quota = enable ? quota : 0;

    if (enable && !was_enabled) {
        dirtylimit_state->limited_nvcpu++;
    } else if (!enable && was_enabled) {
        dirtylimit_state->limited_nvcpu--;
    }

    dirtylimit_state->states[cpu_index].enabled = enable;
}
/*
 * Apply dirtylimit_set_vcpu() with the same @quota and @enable to every
 * possible vCPU index of the machine (0 .. smp.max_cpus - 1).
 */
void dirtylimit_set_all(uint64_t quota,
                        bool enable)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;
    int cpu_index = 0;

    while (cpu_index < max_cpus) {
        dirtylimit_set_vcpu(cpu_index, quota, enable);
        cpu_index++;
    }
}
/*
 * Put @cpu to sleep for its current throttle time, if any.
 *
 * Nothing happens when throttle_us_per_full is zero. Otherwise the
 * dirtylimit state lock is taken just long enough to check that the
 * service is active and the limit is enabled for this vCPU; the sleep
 * itself happens with the lock released.
 */
void dirtylimit_vcpu_execute(CPUState *cpu)
{
    bool throttled;

    if (!cpu->throttle_us_per_full) {
        return;
    }

    dirtylimit_state_lock();
    throttled = dirtylimit_in_service() &&
                dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled;
    dirtylimit_state_unlock();

    if (throttled) {
        trace_dirtylimit_vcpu_execute(cpu->cpu_index,
                                      cpu->throttle_us_per_full);
        g_usleep(cpu->throttle_us_per_full);
    }
}
/*
 * Bring the dirty limit service up: allocate the limit state, allow
 * throttle processing, allocate the rate statistics and finally start
 * the sampling thread that uses all of the above.
 */
static void dirtylimit_init(void)
{
    /* limit state first, then let dirtylimit_process() do work */
    dirtylimit_state_initialize();
    dirtylimit_change(true);

    /* statistics must exist before the sampling thread is started */
    vcpu_dirty_rate_stat_initialize();
    vcpu_dirty_rate_stat_start();
}
/*
 * Tear the dirty limit service down: stop and join the sampling thread,
 * free the rate statistics, disable throttle processing and free the
 * limit state.
 */
static void dirtylimit_cleanup(void)
{
    /* the sampling thread must be gone before its data is freed */
    vcpu_dirty_rate_stat_stop();
    vcpu_dirty_rate_stat_finalize();

    dirtylimit_change(false);
    dirtylimit_state_finalize();
}
/*
 * Changing the dirty page rate limit is refused while a migration with
 * the dirty-limit capability enabled is running and the dirty limit
 * service is active, unless the caller is the migration thread itself.
 *
 * Returns true if the limit may be changed, false otherwise.
 */
static bool dirtylimit_is_allowed(void)
{
    return !(migration_is_running() &&
             !migration_thread_is_self() &&
             migrate_dirty_limit() &&
             dirtylimit_in_service());
}
/*
 * QMP handler: cancel the dirty page rate limit of one vCPU or of all
 * vCPUs.
 *
 * @has_cpu_index: true if @cpu_index was supplied
 * @cpu_index: vCPU to cancel the limit for; all vCPUs if not supplied
 * @errp: set on an invalid index or when the change is not allowed
 *
 * Silently does nothing without KVM dirty ring support or when the dirty
 * limit service is not active. When no limited vCPU remains, the whole
 * service is torn down.
 */
void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
                                 int64_t cpu_index,
                                 Error **errp)
{
    if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
        return;
    }

    if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
        error_setg(errp, "incorrect cpu index specified");
        return;
    }

    if (!dirtylimit_is_allowed()) {
        error_setg(errp, "can't cancel dirty page rate limit while"
                   " migration is running");
        return;
    }

    if (!dirtylimit_in_service()) {
        return;
    }

    dirtylimit_state_lock();

    if (!has_cpu_index) {
        dirtylimit_set_all(0, false);
    } else {
        dirtylimit_set_vcpu(cpu_index, 0, false);
    }

    /* last limited vCPU gone: shut the service down */
    if (dirtylimit_state->limited_nvcpu == 0) {
        dirtylimit_cleanup();
    }

    dirtylimit_state_unlock();
}
/*
 * HMP wrapper around qmp_cancel_vcpu_dirty_limit().
 *
 * The optional "cpu_index" argument defaults to -1, which is treated as
 * "not supplied". On success a hint on how to query the limits is
 * printed; on failure the error is reported through the monitor.
 */
void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    Error *err = NULL;
    int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
    bool has_cpu_index = cpu_index != -1;

    qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, &err);

    if (err) {
        hmp_handle_error(mon, err);
    } else {
        monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
                       "dirty limit for virtual CPU]\n");
    }
}
/*
 * QMP handler: set the dirty page rate limit of one vCPU or of all vCPUs.
 *
 * @has_cpu_index: true if @cpu_index was supplied
 * @cpu_index: vCPU to limit; all vCPUs if not supplied
 * @dirty_rate: quota to apply; zero cancels the limit instead
 * @errp: set when KVM dirty ring support is missing, the index is
 *        invalid, or the change is not allowed
 *
 * The dirty limit service is started on demand.
 */
void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
                              int64_t cpu_index,
                              uint64_t dirty_rate,
                              Error **errp)
{
    if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
        error_setg(errp, "dirty page limit feature requires KVM with"
                   " accelerator property 'dirty-ring-size' set'");
        return;
    }

    if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
        error_setg(errp, "incorrect cpu index specified");
        return;
    }

    if (!dirtylimit_is_allowed()) {
        error_setg(errp, "can't set dirty page rate limit while"
                   " migration is running");
        return;
    }

    /* a zero rate means "remove the limit" */
    if (dirty_rate == 0) {
        qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp);
        return;
    }

    dirtylimit_state_lock();

    if (!dirtylimit_in_service()) {
        dirtylimit_init();
    }

    if (!has_cpu_index) {
        dirtylimit_set_all(dirty_rate, true);
    } else {
        dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
    }

    dirtylimit_state_unlock();
}
/*
 * HMP wrapper around qmp_set_vcpu_dirty_limit().
 *
 * Reads "dirty_rate" and the optional "cpu_index" (-1 means "not
 * supplied") from @qdict. A negative rate is rejected before the QMP
 * handler is called. hmp_handle_error() is always invoked with the
 * resulting error pointer, whether or not an error was set.
 */
void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate");
    int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
    Error *err = NULL;

    if (dirty_rate < 0) {
        error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate);
    } else {
        qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index,
                                 dirty_rate, &err);
    }

    hmp_handle_error(mon, err);
}
/*
 * Return the largest throttle_us_per_full found among all virtual CPUs,
 * or 0 if none of them has a positive value.
 */
uint64_t dirtylimit_throttle_time_per_round(void)
{
    CPUState *cpu;
    int64_t longest_us = 0;

    CPU_FOREACH(cpu) {
        if (cpu->throttle_us_per_full > longest_us) {
            longest_us = cpu->throttle_us_per_full;
        }
    }

    return longest_us;
}
/*
 * Estimate the average dirty ring full time of each virtual CPU.
 *
 * The dirty page rates of all running vCPUs are averaged and passed to
 * dirtylimit_dirty_ring_full_time().
 *
 * Return 0 if the guest doesn't dirty memory or no vCPU is running.
 */
uint64_t dirtylimit_ring_full_time(void)
{
    CPUState *cpu;
    uint64_t rate_sum = 0;
    int running_vcpus = 0;

    CPU_FOREACH(cpu) {
        if (!cpu->running) {
            continue;
        }
        running_vcpus++;
        rate_sum += vcpu_dirty_rate_get(cpu->cpu_index);
    }

    if (rate_sum == 0 || running_vcpus == 0) {
        return 0;
    }

    return dirtylimit_dirty_ring_full_time(rate_sum / running_vcpus);
}
/*
 * Build a DirtyLimitInfo describing one vCPU: its index, its quota and
 * its most recently published dirty page rate.
 *
 * @cpu_index: vCPU to describe; not range-checked here
 *
 * Returns a newly allocated DirtyLimitInfo.
 */
static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
{
    DirtyLimitInfo *info = g_malloc0(sizeof(*info));

    info->cpu_index = cpu_index;
    info->current_rate = vcpu_dirty_rate_get(cpu_index);
    info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota;

    return info;
}
/*
 * Collect a DirtyLimitInfo for every vCPU that has a dirty limit enabled.
 *
 * The dirtylimit state lock is held while the list is built.
 *
 * Returns the head of the list, or NULL if the service is not active or
 * no vCPU is limited.
 */
static struct DirtyLimitInfoList *dirtylimit_query_all(void)
{
    DirtyLimitInfoList *head = NULL, **tail = &head;
    DirtyLimitInfo *info = NULL;
    int slot, cpu_index;

    dirtylimit_state_lock();

    if (dirtylimit_in_service()) {
        for (slot = 0; slot < dirtylimit_state->max_cpus; slot++) {
            cpu_index = dirtylimit_state->states[slot].cpu_index;
            if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
                continue;
            }
            info = dirtylimit_query_vcpu(cpu_index);
            QAPI_LIST_APPEND(tail, info);
        }
    }

    dirtylimit_state_unlock();

    return head;
}
/*
 * QMP handler: return the dirty limit information of all limited vCPUs.
 *
 * @errp: never set by this function
 *
 * Returns the list built by dirtylimit_query_all(), which may be NULL.
 */
struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp)
{
    DirtyLimitInfoList *limits = dirtylimit_query_all();

    return limits;
}
/*
 * HMP "info" handler: print limit rate and current rate of every vCPU
 * that has a dirty limit enabled.
 *
 * Prints a notice instead when the dirty limit service is not active.
 * The queried list is released automatically when the function returns.
 */
void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    g_autoptr(DirtyLimitInfoList) head = NULL;
    DirtyLimitInfoList *entry;
    Error *err = NULL;

    if (!dirtylimit_in_service()) {
        monitor_printf(mon, "Dirty page limit not enabled!\n");
        return;
    }

    head = qmp_query_vcpu_dirty_limit(&err);
    if (err) {
        hmp_handle_error(mon, err);
        return;
    }

    for (entry = head; entry != NULL; entry = entry->next) {
        monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s),"
                            " current rate %"PRIi64 " (MB/s)\n",
                            entry->value->cpu_index,
                            entry->value->limit_rate,
                            entry->value->current_rate);
    }
}