qemu/target/i386/nvmm/nvmm-all.c
Steve Sistare c8a7fc5179 migration: simplify blockers
Modify migrate_add_blocker and migrate_del_blocker to take an Error **
reason.  This allows migration to own the Error object, so that if
an error occurs in migrate_add_blocker, migration code can free the Error
and clear the client handle, simplifying client code.  It also simplifies
the migrate_del_blocker call site.

In addition, this is a pre-requisite for a proposed future patch that would
add a mode argument to migration requests to support live update, and
maintain a list of blockers for each mode.  A blocker may apply to a single
mode or to multiple modes, and passing Error** will allow one Error object
to be registered for multiple modes.

No functional change.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Tested-by: Michael Galaxy <mgalaxy@akamai.com>
Reviewed-by: Michael Galaxy <mgalaxy@akamai.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <1697634216-84215-1-git-send-email-steven.sistare@oracle.com>
2023-10-20 08:51:41 +02:00

1225 lines
34 KiB
C

/*
* Copyright (c) 2018-2019 Maxime Villard, All rights reserved.
*
* NetBSD Virtual Machine Monitor (NVMM) accelerator for QEMU.
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "qemu/accel.h"
#include "sysemu/nvmm.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/queue.h"
#include "migration/blocker.h"
#include "strings.h"
#include "nvmm-accel-ops.h"
#include <nvmm.h>
/*
 * Per-VCPU accelerator state, reachable through CPUState::accel.
 */
struct AccelCPUState {
    /* libnvmm handle of this virtual CPU. */
    struct nvmm_vcpu vcpu;
    /* Last TPR value seen; compared against the APIC and the guest CR8. */
    uint8_t tpr;
    /* When set, the inner VCPU loop exits with EXCP_INTERRUPT. */
    bool stop;

    /* Window-exiting for INTs/NMIs. */
    bool int_window_exit;
    bool nmi_window_exit;

    /* The guest is in an interrupt shadow (POP SS, etc). */
    bool int_shadow;
};
/*
 * Global NVMM state: host capabilities plus the machine handle.
 */
struct qemu_machine {
    /* Filled in by nvmm_capability() during accelerator init. */
    struct nvmm_capability cap;
    /* Created by nvmm_machine_create() during accelerator init. */
    struct nvmm_machine mach;
};
/* -------------------------------------------------------------------------- */
/* Flag registered as AccelClass::allowed and returned by nvmm_enabled(). */
static bool nvmm_allowed;
/* The NVMM machine (and its capabilities) used by every VCPU in this file. */
static struct qemu_machine qemu_mach;
static struct nvmm_machine *
get_nvmm_mach(void)
{
return &qemu_mach.mach;
}
/* -------------------------------------------------------------------------- */
/*
 * Fill an NVMM segment descriptor from a QEMU segment cache entry.
 * The packed QEMU descriptor flags are split into NVMM's attribute fields.
 */
static void
nvmm_set_segment(struct nvmm_x64_state_seg *nseg, const SegmentCache *qseg)
{
    const uint32_t flags = qseg->flags;

    /* Attribute fields, each extracted with its DESC_*_MASK. */
    nseg->attrib.g = __SHIFTOUT(flags, DESC_G_MASK);
    nseg->attrib.def = __SHIFTOUT(flags, DESC_B_MASK);
    nseg->attrib.l = __SHIFTOUT(flags, DESC_L_MASK);
    nseg->attrib.avl = __SHIFTOUT(flags, DESC_AVL_MASK);
    nseg->attrib.p = __SHIFTOUT(flags, DESC_P_MASK);
    nseg->attrib.dpl = __SHIFTOUT(flags, DESC_DPL_MASK);
    nseg->attrib.s = __SHIFTOUT(flags, DESC_S_MASK);
    nseg->attrib.type = __SHIFTOUT(flags, DESC_TYPE_MASK);

    /* Plain fields are copied verbatim. */
    nseg->base = qseg->base;
    nseg->limit = qseg->limit;
    nseg->selector = qseg->selector;
}
/*
 * Copy the QEMU CPU state (env) into the NVMM VCPU state buffer and commit
 * it with nvmm_vcpu_setstate(). A failure to commit is reported but not
 * propagated.
 *
 * The caller must be the VCPU thread itself, or the VCPU must be stopped.
 */
static void
nvmm_set_registers(CPUState *cpu)
{
    const uint64_t bitmap =
        NVMM_X64_STATE_SEGS |
        NVMM_X64_STATE_GPRS |
        NVMM_X64_STATE_CRS  |
        NVMM_X64_STATE_DRS  |
        NVMM_X64_STATE_MSRS |
        NVMM_X64_STATE_FPU;
    CPUX86State *env = cpu_env(cpu);
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    struct nvmm_x64_state *state = vcpu->state;
    unsigned int tag_word = 0;
    size_t n;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* General purpose registers. */
    state->gprs[NVMM_X64_GPR_RAX] = env->regs[R_EAX];
    state->gprs[NVMM_X64_GPR_RCX] = env->regs[R_ECX];
    state->gprs[NVMM_X64_GPR_RDX] = env->regs[R_EDX];
    state->gprs[NVMM_X64_GPR_RBX] = env->regs[R_EBX];
    state->gprs[NVMM_X64_GPR_RSP] = env->regs[R_ESP];
    state->gprs[NVMM_X64_GPR_RBP] = env->regs[R_EBP];
    state->gprs[NVMM_X64_GPR_RSI] = env->regs[R_ESI];
    state->gprs[NVMM_X64_GPR_RDI] = env->regs[R_EDI];
#ifdef TARGET_X86_64
    state->gprs[NVMM_X64_GPR_R8]  = env->regs[R_R8];
    state->gprs[NVMM_X64_GPR_R9]  = env->regs[R_R9];
    state->gprs[NVMM_X64_GPR_R10] = env->regs[R_R10];
    state->gprs[NVMM_X64_GPR_R11] = env->regs[R_R11];
    state->gprs[NVMM_X64_GPR_R12] = env->regs[R_R12];
    state->gprs[NVMM_X64_GPR_R13] = env->regs[R_R13];
    state->gprs[NVMM_X64_GPR_R14] = env->regs[R_R14];
    state->gprs[NVMM_X64_GPR_R15] = env->regs[R_R15];
#endif

    /* Instruction pointer and flags live in the GPR array as well. */
    state->gprs[NVMM_X64_GPR_RFLAGS] = env->eflags;
    state->gprs[NVMM_X64_GPR_RIP] = env->eip;

    /* Regular segments. */
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_ES], &env->segs[R_ES]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_CS], &env->segs[R_CS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_SS], &env->segs[R_SS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_DS], &env->segs[R_DS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_FS], &env->segs[R_FS]);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_GS], &env->segs[R_GS]);

    /* Descriptor tables and task register. */
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_GDT], &env->gdt);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_IDT], &env->idt);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_LDT], &env->ldt);
    nvmm_set_segment(&state->segs[NVMM_X64_SEG_TR], &env->tr);

    /* Control registers; CR8 carries the cached TPR. */
    state->crs[NVMM_X64_CR_CR0] = env->cr[0];
    state->crs[NVMM_X64_CR_CR2] = env->cr[2];
    state->crs[NVMM_X64_CR_CR3] = env->cr[3];
    state->crs[NVMM_X64_CR_CR4] = env->cr[4];
    state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;
    state->crs[NVMM_X64_CR_XCR0] = env->xcr0;

    /* Debug registers. */
    state->drs[NVMM_X64_DR_DR0] = env->dr[0];
    state->drs[NVMM_X64_DR_DR1] = env->dr[1];
    state->drs[NVMM_X64_DR_DR2] = env->dr[2];
    state->drs[NVMM_X64_DR_DR3] = env->dr[3];
    state->drs[NVMM_X64_DR_DR6] = env->dr[6];
    state->drs[NVMM_X64_DR_DR7] = env->dr[7];

    /* Model-specific registers. */
    state->msrs[NVMM_X64_MSR_EFER] = env->efer;
    state->msrs[NVMM_X64_MSR_STAR] = env->star;
#ifdef TARGET_X86_64
    state->msrs[NVMM_X64_MSR_LSTAR] = env->lstar;
    state->msrs[NVMM_X64_MSR_CSTAR] = env->cstar;
    state->msrs[NVMM_X64_MSR_SFMASK] = env->fmask;
    state->msrs[NVMM_X64_MSR_KERNELGSBASE] = env->kernelgsbase;
#endif
    state->msrs[NVMM_X64_MSR_SYSENTER_CS]  = env->sysenter_cs;
    state->msrs[NVMM_X64_MSR_SYSENTER_ESP] = env->sysenter_esp;
    state->msrs[NVMM_X64_MSR_SYSENTER_EIP] = env->sysenter_eip;
    state->msrs[NVMM_X64_MSR_PAT] = env->pat;
    state->msrs[NVMM_X64_MSR_TSC] = env->tsc;

    /* x87/SSE state. */
    state->fpu.fx_cw = env->fpuc;
    /* The top-of-stack index is stored in bits 11..13 of the status word. */
    state->fpu.fx_sw = (env->fpus & ~0x3800) | ((env->fpstt & 0x7) << 11);
    /* One tag bit per register: set when the QEMU tag for it is zero. */
    for (n = 0; n < 8; n++) {
        tag_word |= (!env->fptags[n]) << n;
    }
    state->fpu.fx_tw = tag_word;
    state->fpu.fx_opcode = env->fpop;
    state->fpu.fx_ip.fa_64 = env->fpip;
    state->fpu.fx_dp.fa_64 = env->fpdp;
    state->fpu.fx_mxcsr = env->mxcsr;
    state->fpu.fx_mxcsr_mask = 0x0000FFFF;
    assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
    memcpy(state->fpu.fx_87_ac, env->fpregs, sizeof(env->fpregs));
    /* Each XMM register is copied as two 8-byte halves. */
    for (n = 0; n < CPU_NB_REGS; n++) {
        memcpy(&state->fpu.fx_xmm[n].xmm_bytes[0],
            &env->xmm_regs[n].ZMM_Q(0), 8);
        memcpy(&state->fpu.fx_xmm[n].xmm_bytes[8],
            &env->xmm_regs[n].ZMM_Q(1), 8);
    }

    if (nvmm_vcpu_setstate(mach, vcpu, bitmap) == -1) {
        error_report("NVMM: Failed to set virtual processor context,"
            " error=%d", errno);
    }
}
/*
 * Fill a QEMU segment cache entry from an NVMM segment descriptor.
 * NVMM's attribute fields are packed back into QEMU descriptor flags.
 */
static void
nvmm_get_segment(SegmentCache *qseg, const struct nvmm_x64_state_seg *nseg)
{
    uint32_t flags = 0;

    flags |= __SHIFTIN((uint32_t)nseg->attrib.type, DESC_TYPE_MASK);
    flags |= __SHIFTIN((uint32_t)nseg->attrib.s, DESC_S_MASK);
    flags |= __SHIFTIN((uint32_t)nseg->attrib.dpl, DESC_DPL_MASK);
    flags |= __SHIFTIN((uint32_t)nseg->attrib.p, DESC_P_MASK);
    flags |= __SHIFTIN((uint32_t)nseg->attrib.avl, DESC_AVL_MASK);
    flags |= __SHIFTIN((uint32_t)nseg->attrib.l, DESC_L_MASK);
    flags |= __SHIFTIN((uint32_t)nseg->attrib.def, DESC_B_MASK);
    flags |= __SHIFTIN((uint32_t)nseg->attrib.g, DESC_G_MASK);

    qseg->flags = flags;
    qseg->base = nseg->base;
    qseg->limit = nseg->limit;
    qseg->selector = nseg->selector;
}
/*
 * Fetch the NVMM VCPU state with nvmm_vcpu_getstate() and copy it into the
 * QEMU CPU state (env). A failure to fetch is reported, and the copy still
 * proceeds from whatever the state buffer holds.
 *
 * If CR8 differs from the cached TPR, the cache and the APIC are updated.
 * The caller must be the VCPU thread itself, or the VCPU must be stopped.
 */
static void
nvmm_get_registers(CPUState *cpu)
{
    const uint64_t bitmap =
        NVMM_X64_STATE_SEGS |
        NVMM_X64_STATE_GPRS |
        NVMM_X64_STATE_CRS  |
        NVMM_X64_STATE_DRS  |
        NVMM_X64_STATE_MSRS |
        NVMM_X64_STATE_FPU;
    CPUX86State *env = cpu_env(cpu);
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    uint64_t cr8;
    size_t n;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (nvmm_vcpu_getstate(mach, vcpu, bitmap) == -1) {
        error_report("NVMM: Failed to get virtual processor context,"
            " error=%d", errno);
    }

    /* General purpose registers. */
    env->regs[R_EAX] = state->gprs[NVMM_X64_GPR_RAX];
    env->regs[R_ECX] = state->gprs[NVMM_X64_GPR_RCX];
    env->regs[R_EDX] = state->gprs[NVMM_X64_GPR_RDX];
    env->regs[R_EBX] = state->gprs[NVMM_X64_GPR_RBX];
    env->regs[R_ESP] = state->gprs[NVMM_X64_GPR_RSP];
    env->regs[R_EBP] = state->gprs[NVMM_X64_GPR_RBP];
    env->regs[R_ESI] = state->gprs[NVMM_X64_GPR_RSI];
    env->regs[R_EDI] = state->gprs[NVMM_X64_GPR_RDI];
#ifdef TARGET_X86_64
    env->regs[R_R8]  = state->gprs[NVMM_X64_GPR_R8];
    env->regs[R_R9]  = state->gprs[NVMM_X64_GPR_R9];
    env->regs[R_R10] = state->gprs[NVMM_X64_GPR_R10];
    env->regs[R_R11] = state->gprs[NVMM_X64_GPR_R11];
    env->regs[R_R12] = state->gprs[NVMM_X64_GPR_R12];
    env->regs[R_R13] = state->gprs[NVMM_X64_GPR_R13];
    env->regs[R_R14] = state->gprs[NVMM_X64_GPR_R14];
    env->regs[R_R15] = state->gprs[NVMM_X64_GPR_R15];
#endif

    /* Instruction pointer and flags. */
    env->eflags = state->gprs[NVMM_X64_GPR_RFLAGS];
    env->eip = state->gprs[NVMM_X64_GPR_RIP];

    /* Regular segments. */
    nvmm_get_segment(&env->segs[R_CS], &state->segs[NVMM_X64_SEG_CS]);
    nvmm_get_segment(&env->segs[R_DS], &state->segs[NVMM_X64_SEG_DS]);
    nvmm_get_segment(&env->segs[R_ES], &state->segs[NVMM_X64_SEG_ES]);
    nvmm_get_segment(&env->segs[R_FS], &state->segs[NVMM_X64_SEG_FS]);
    nvmm_get_segment(&env->segs[R_GS], &state->segs[NVMM_X64_SEG_GS]);
    nvmm_get_segment(&env->segs[R_SS], &state->segs[NVMM_X64_SEG_SS]);

    /* Descriptor tables and task register. */
    nvmm_get_segment(&env->gdt, &state->segs[NVMM_X64_SEG_GDT]);
    nvmm_get_segment(&env->idt, &state->segs[NVMM_X64_SEG_IDT]);
    nvmm_get_segment(&env->ldt, &state->segs[NVMM_X64_SEG_LDT]);
    nvmm_get_segment(&env->tr, &state->segs[NVMM_X64_SEG_TR]);

    /* Control registers. */
    env->cr[0] = state->crs[NVMM_X64_CR_CR0];
    env->cr[2] = state->crs[NVMM_X64_CR_CR2];
    env->cr[3] = state->crs[NVMM_X64_CR_CR3];
    env->cr[4] = state->crs[NVMM_X64_CR_CR4];
    /* CR8 mirrors the TPR: forward a change to the cache and the APIC. */
    cr8 = state->crs[NVMM_X64_CR_CR8];
    if (cr8 != qcpu->tpr) {
        qcpu->tpr = cr8;
        cpu_set_apic_tpr(x86_cpu->apic_state, cr8);
    }
    env->xcr0 = state->crs[NVMM_X64_CR_XCR0];

    /* Debug registers. */
    env->dr[0] = state->drs[NVMM_X64_DR_DR0];
    env->dr[1] = state->drs[NVMM_X64_DR_DR1];
    env->dr[2] = state->drs[NVMM_X64_DR_DR2];
    env->dr[3] = state->drs[NVMM_X64_DR_DR3];
    env->dr[6] = state->drs[NVMM_X64_DR_DR6];
    env->dr[7] = state->drs[NVMM_X64_DR_DR7];

    /* x87/SSE state. */
    env->fpuc = state->fpu.fx_cw;
    /* Bits 11..13 of the status word hold the top-of-stack index. */
    env->fpus = state->fpu.fx_sw & ~0x3800;
    env->fpstt = (state->fpu.fx_sw >> 11) & 0x7;
    /* A QEMU tag is the inverse of the corresponding tag-word bit. */
    for (n = 0; n < 8; n++) {
        env->fptags[n] = !((state->fpu.fx_tw >> n) & 1);
    }
    env->fpop = state->fpu.fx_opcode;
    env->fpip = state->fpu.fx_ip.fa_64;
    env->fpdp = state->fpu.fx_dp.fa_64;
    env->mxcsr = state->fpu.fx_mxcsr;
    assert(sizeof(state->fpu.fx_87_ac) == sizeof(env->fpregs));
    memcpy(env->fpregs, state->fpu.fx_87_ac, sizeof(env->fpregs));
    /* Each XMM register is copied as two 8-byte halves. */
    for (n = 0; n < CPU_NB_REGS; n++) {
        memcpy(&env->xmm_regs[n].ZMM_Q(0),
            &state->fpu.fx_xmm[n].xmm_bytes[0], 8);
        memcpy(&env->xmm_regs[n].ZMM_Q(1),
            &state->fpu.fx_xmm[n].xmm_bytes[8], 8);
    }

    /* Model-specific registers. */
    env->efer = state->msrs[NVMM_X64_MSR_EFER];
    env->star = state->msrs[NVMM_X64_MSR_STAR];
#ifdef TARGET_X86_64
    env->lstar = state->msrs[NVMM_X64_MSR_LSTAR];
    env->cstar = state->msrs[NVMM_X64_MSR_CSTAR];
    env->fmask = state->msrs[NVMM_X64_MSR_SFMASK];
    env->kernelgsbase = state->msrs[NVMM_X64_MSR_KERNELGSBASE];
#endif
    env->sysenter_cs  = state->msrs[NVMM_X64_MSR_SYSENTER_CS];
    env->sysenter_esp = state->msrs[NVMM_X64_MSR_SYSENTER_ESP];
    env->sysenter_eip = state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
    env->pat = state->msrs[NVMM_X64_MSR_PAT];
    env->tsc = state->msrs[NVMM_X64_MSR_TSC];

    /* Must run last: called once all of env has been refreshed. */
    x86_update_hflags(env);
}
/*
 * Tell whether a hardware interrupt can be injected right now.
 *
 * Returns false when an interrupt-window exit is already pending. Also
 * returns false when the guest is in an interrupt shadow or has IF clear;
 * in that case an interrupt-window exit is requested from NVMM so that the
 * interrupt can be delivered later. Returns true otherwise.
 *
 * Failures of the NVMM state calls are reported. If the interrupt state
 * cannot be read, nothing is written back, so that stale state is not
 * pushed to the VCPU.
 */
static bool
nvmm_can_take_int(CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    struct nvmm_machine *mach = get_nvmm_mach();

    if (qcpu->int_window_exit) {
        return false;
    }

    if (qcpu->int_shadow || !(env->eflags & IF_MASK)) {
        struct nvmm_x64_state *state = vcpu->state;

        /* Exit on interrupt window. */
        if (nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_INTR) == -1) {
            error_report("NVMM: Failed to get interrupt state,"
                " error=%d", errno);
            return false;
        }
        state->intr.int_window_exiting = 1;
        if (nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_INTR) == -1) {
            error_report("NVMM: Failed to set interrupt state,"
                " error=%d", errno);
        }

        return false;
    }

    return true;
}
/*
 * Tell whether an NMI can be injected right now.
 *
 * Unlike INTs, a completed NMI always schedules an exit. So while
 * NMI-window exiting is still enabled, NMIs are blocked.
 */
static bool
nvmm_can_take_nmi(CPUState *cpu)
{
    const AccelCPUState *qcpu = cpu->accel;

    return !qcpu->nmi_window_exit;
}
/*
 * Called before the VCPU is run. We inject events generated by the I/O
 * thread, and synchronize the guest TPR.
 *
 * The iothread lock is taken on entry and released on exit. At most one
 * event is injected per call; an NMI is considered before a hardware
 * interrupt. Failures of the NVMM calls are reported but not propagated.
 */
static void
nvmm_vcpu_pre_run(CPUState *cpu)
{
    CPUX86State *env = cpu_env(cpu);
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    struct nvmm_vcpu_event *event = vcpu->event;
    bool has_event = false;
    bool sync_tpr = false;
    uint8_t tpr;
    int ret;

    qemu_mutex_lock_iothread();

    /* Pick up a TPR change made on the APIC side since the last run. */
    tpr = cpu_get_apic_tpr(x86_cpu->apic_state);
    if (tpr != qcpu->tpr) {
        qcpu->tpr = tpr;
        sync_tpr = true;
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests
     * or commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        cpu->exit_request = 1;
    }

    /* NMI: prepared as an interrupt event with vector 2. */
    if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        if (nvmm_can_take_nmi(cpu)) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            event->type = NVMM_VCPU_EVENT_INTR;
            event->vector = 2;
            has_event = true;
        }
    }

    /* Hardware interrupt: only considered if no NMI was queued above. */
    if (!has_event && (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        if (nvmm_can_take_int(cpu)) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
            event->type = NVMM_VCPU_EVENT_INTR;
            event->vector = cpu_get_pic_interrupt(env);
            has_event = true;
        }
    }

    /* Don't want SMIs. */
    if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
    }

    /* Write the new TPR into the guest CR8 (read-modify-write of the CRs). */
    if (sync_tpr) {
        ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_CRS);
        if (ret == -1) {
            error_report("NVMM: Failed to get CPU state,"
                " error=%d", errno);
        }

        state->crs[NVMM_X64_CR_CR8] = qcpu->tpr;

        ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_CRS);
        if (ret == -1) {
            error_report("NVMM: Failed to set CPU state,"
                " error=%d", errno);
        }
    }

    /* Hand the event prepared above to NVMM. */
    if (has_event) {
        ret = nvmm_vcpu_inject(mach, vcpu);
        if (ret == -1) {
            error_report("NVMM: Failed to inject event,"
                " error=%d", errno);
        }
    }

    qemu_mutex_unlock_iothread();
}
/*
 * Called after the VCPU ran. Refresh the host-side copies of RFLAGS, the
 * interrupt shadow and the window-exiting flags from the exit state, and
 * forward a TPR change (guest CR8) to the APIC under the iothread lock.
 */
static void
nvmm_vcpu_post_run(CPUState *cpu, struct nvmm_vcpu_exit *exit)
{
    AccelCPUState *qcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    const uint64_t cr8 = exit->exitstate.cr8;

    env->eflags = exit->exitstate.rflags;
    qcpu->nmi_window_exit = exit->exitstate.nmi_window_exiting;
    qcpu->int_window_exit = exit->exitstate.int_window_exiting;
    qcpu->int_shadow = exit->exitstate.int_shadow;

    if (qcpu->tpr == cr8) {
        return;
    }

    qcpu->tpr = cr8;
    qemu_mutex_lock_iothread();
    cpu_set_apic_tpr(x86_cpu->apic_state, qcpu->tpr);
    qemu_mutex_unlock_iothread();
}
/* -------------------------------------------------------------------------- */
/*
 * I/O assist callback, registered through nvmm_callbacks: perform the port
 * access described by 'io' on QEMU's I/O address space. A failed
 * transaction is reported but not propagated.
 */
static void
nvmm_io_callback(struct nvmm_io *io)
{
    MemTxAttrs attrs = { 0 };
    const bool is_write = !io->in;

    if (address_space_rw(&address_space_io, io->port, attrs, io->data,
            io->size, is_write) != MEMTX_OK) {
        error_report("NVMM: I/O Transaction Failed "
            "[%s, port=%u, size=%zu]", (io->in ? "in" : "out"),
            io->port, io->size);
    }

    /* Needed, otherwise infinite loop. */
    current_cpu->vcpu_dirty = false;
}
/*
 * Memory assist callback, registered through nvmm_callbacks: perform the
 * guest-physical access described by 'mem' on QEMU's physical memory.
 */
static void
nvmm_mem_callback(struct nvmm_mem *mem)
{
    cpu_physical_memory_rw(mem->gpa, mem->data, mem->size, mem->write);

    /* Needed, otherwise infinite loop. */
    current_cpu->vcpu_dirty = false;
}
/* Assist callbacks installed on every VCPU via NVMM_VCPU_CONF_CALLBACKS. */
static struct nvmm_assist_callbacks nvmm_callbacks = {
    .io = nvmm_io_callback,
    .mem = nvmm_mem_callback
};
/* -------------------------------------------------------------------------- */
/*
 * Handle a memory exit by delegating to nvmm_assist_mem().
 * Returns the assist's result; a result of -1 is also reported.
 */
static int
nvmm_handle_mem(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
{
    const int rc = nvmm_assist_mem(mach, vcpu);

    if (rc != -1) {
        return rc;
    }

    error_report("NVMM: Mem Assist Failed [gpa=%p]",
        (void *)vcpu->exit->u.mem.gpa);
    return rc;
}
/*
 * Handle an I/O exit by delegating to nvmm_assist_io().
 * Returns the assist's result; a result of -1 is also reported.
 */
static int
nvmm_handle_io(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
{
    const int rc = nvmm_assist_io(mach, vcpu);

    if (rc != -1) {
        return rc;
    }

    error_report("NVMM: I/O Assist Failed [port=%d]",
        (int)vcpu->exit->u.io.port);
    return rc;
}
/*
 * Handle a RDMSR exit.
 *
 * APICBASE is read from the APIC. The MTRR/MCG MSRs listed below read as
 * zero. Any other MSR is reported and also reads as zero. The value is
 * placed in RAX (low 32 bits) and RDX (high 32 bits), and RIP is advanced
 * to the next instruction.
 *
 * Returns 0 on success, -1 if the GPRs could not be fetched or committed.
 */
static int
nvmm_handle_rdmsr(struct nvmm_machine *mach, CPUState *cpu,
    struct nvmm_vcpu_exit *exit)
{
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    uint64_t val = 0;

    switch (exit->u.rdmsr.msr) {
    case MSR_IA32_APICBASE:
        val = cpu_get_apic_base(x86_cpu->apic_state);
        break;
    case MSR_MTRRcap:
    case MSR_MTRRdefType:
    case MSR_MCG_CAP:
    case MSR_MCG_STATUS:
        /* Reads as zero. */
        break;
    default: /* More MSRs to add? */
        error_report("NVMM: Unexpected RDMSR 0x%x, ignored",
            exit->u.rdmsr.msr);
        break;
    }

    if (nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS) == -1) {
        return -1;
    }

    state->gprs[NVMM_X64_GPR_RIP] = exit->u.rdmsr.npc;
    state->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
    state->gprs[NVMM_X64_GPR_RAX] = (val & 0xFFFFFFFF);

    if (nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS) == -1) {
        return -1;
    }

    return 0;
}
/*
 * Handle a WRMSR exit.
 *
 * A write to APICBASE is forwarded to the APIC. Writes to MTRRdefType and
 * MCG_STATUS are silently dropped. Any other MSR write is reported and
 * dropped. In all cases RIP is advanced to the next instruction.
 *
 * Returns 0 on success, -1 if the GPRs could not be fetched or committed.
 */
static int
nvmm_handle_wrmsr(struct nvmm_machine *mach, CPUState *cpu,
    struct nvmm_vcpu_exit *exit)
{
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct nvmm_x64_state *state = vcpu->state;
    uint64_t val;
    int ret;

    val = exit->u.wrmsr.val;

    switch (exit->u.wrmsr.msr) {
    case MSR_IA32_APICBASE:
        cpu_set_apic_base(x86_cpu->apic_state, val);
        break;
    case MSR_MTRRdefType:
    case MSR_MCG_STATUS:
        break;
    default: /* More MSRs to add? */
        /* val is a uint64_t: print it with PRIx64 rather than %lx. */
        error_report("NVMM: Unexpected WRMSR 0x%x [val=0x%" PRIx64 "], ignored",
            exit->u.wrmsr.msr, val);
        break;
    }

    ret = nvmm_vcpu_getstate(mach, vcpu, NVMM_X64_STATE_GPRS);
    if (ret == -1) {
        return -1;
    }

    /* Skip the WRMSR instruction. */
    state->gprs[NVMM_X64_GPR_RIP] = exit->u.wrmsr.npc;

    ret = nvmm_vcpu_setstate(mach, vcpu, NVMM_X64_STATE_GPRS);
    if (ret == -1) {
        return -1;
    }

    return 0;
}
/*
 * Handle a HLT exit.
 *
 * Under the iothread lock, check whether something can wake the VCPU right
 * away: a pending hardware interrupt with IF set, or a pending NMI. If not,
 * mark the CPU halted and return 1 so the inner loop exits. Otherwise
 * return 0 and keep running.
 */
static int
nvmm_handle_halted(struct nvmm_machine *mach, CPUState *cpu,
    struct nvmm_vcpu_exit *exit)
{
    CPUX86State *env = cpu_env(cpu);
    int halted = 0;
    bool wakeup;

    qemu_mutex_lock_iothread();

    wakeup = ((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
              (env->eflags & IF_MASK)) ||
             (cpu->interrupt_request & CPU_INTERRUPT_NMI);
    if (!wakeup) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        halted = 1;
    }

    qemu_mutex_unlock_iothread();

    return halted;
}
static int
nvmm_inject_ud(struct nvmm_machine *mach, struct nvmm_vcpu *vcpu)
{
struct nvmm_vcpu_event *event = vcpu->event;
event->type = NVMM_VCPU_EVENT_EXCP;
event->vector = 6;
event->u.excp.error = 0;
return nvmm_vcpu_inject(mach, vcpu);
}
/*
 * Run the VCPU until an event requires returning to the caller.
 *
 * First handles the asynchronous requests that must be processed outside
 * of the inner loop (INIT, POLL, SIPI, TPR access, wake-up from HLT). Then
 * drops the iothread lock and runs the VCPU repeatedly, dispatching on the
 * exit reason, for as long as the handlers return 0. The iothread lock is
 * re-acquired before returning.
 *
 * Returns non-zero if a fatal error occurred (a handler or nvmm_vcpu_run()
 * returned a negative value), zero otherwise.
 */
static int
nvmm_vcpu_loop(CPUState *cpu)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    AccelCPUState *qcpu = cpu->accel;
    struct nvmm_vcpu *vcpu = &qcpu->vcpu;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct nvmm_vcpu_exit *exit = vcpu->exit;
    int ret;

    /*
     * Some asynchronous events must be handled outside of the inner
     * VCPU loop. They are handled here.
     */
    if (cpu->interrupt_request & CPU_INTERRUPT_INIT) {
        nvmm_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        /* set int/nmi windows back to the reset state */
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }
    /* A deliverable interrupt or an NMI wakes a halted CPU. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        nvmm_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }
    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        nvmm_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
            env->tpr_access_type);
    }

    /* Still halted: nothing to run, report EXCP_HLT to the caller. */
    if (cpu->halted) {
        cpu->exception_index = EXCP_HLT;
        qatomic_set(&cpu->exit_request, false);
        return 0;
    }

    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);

    /*
     * Inner VCPU loop.
     */
    do {
        /* Push the QEMU-side register state if it was modified. */
        if (cpu->vcpu_dirty) {
            nvmm_set_registers(cpu);
            cpu->vcpu_dirty = false;
        }

        if (qcpu->stop) {
            cpu->exception_index = EXCP_INTERRUPT;
            qcpu->stop = false;
            ret = 1;
            break;
        }

        nvmm_vcpu_pre_run(cpu);

        if (qatomic_read(&cpu->exit_request)) {
#if NVMM_USER_VERSION >= 2
            nvmm_vcpu_stop(vcpu);
#else
            qemu_cpu_kick_self();
#endif
        }

        /* Read exit_request before the kernel reads the immediate exit flag */
        smp_rmb();
        ret = nvmm_vcpu_run(mach, vcpu);
        if (ret == -1) {
            error_report("NVMM: Failed to exec a virtual processor,"
                " error=%d", errno);
            break;
        }

        nvmm_vcpu_post_run(cpu, exit);

        switch (exit->reason) {
        case NVMM_VCPU_EXIT_NONE:
            break;
#if NVMM_USER_VERSION >= 2
        case NVMM_VCPU_EXIT_STOPPED:
            /*
             * The kernel cleared the immediate exit flag; cpu->exit_request
             * must be cleared after
             */
            smp_wmb();
            qcpu->stop = true;
            break;
#endif
        case NVMM_VCPU_EXIT_MEMORY:
            ret = nvmm_handle_mem(mach, vcpu);
            break;
        case NVMM_VCPU_EXIT_IO:
            ret = nvmm_handle_io(mach, vcpu);
            break;
        case NVMM_VCPU_EXIT_INT_READY:
        case NVMM_VCPU_EXIT_NMI_READY:
        case NVMM_VCPU_EXIT_TPR_CHANGED:
            /* Nothing to do here; the next pre-run picks these up. */
            break;
        case NVMM_VCPU_EXIT_HALTED:
            ret = nvmm_handle_halted(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_SHUTDOWN:
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            cpu->exception_index = EXCP_INTERRUPT;
            ret = 1;
            break;
        case NVMM_VCPU_EXIT_RDMSR:
            ret = nvmm_handle_rdmsr(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_WRMSR:
            ret = nvmm_handle_wrmsr(mach, cpu, exit);
            break;
        case NVMM_VCPU_EXIT_MONITOR:
        case NVMM_VCPU_EXIT_MWAIT:
            /* MONITOR/MWAIT are answered with an injected exception. */
            ret = nvmm_inject_ud(mach, vcpu);
            break;
        default:
            /* 64-bit fields: print them with PRIx64 rather than %lx. */
            error_report("NVMM: Unexpected VM exit code 0x%" PRIx64
                " [hw=0x%" PRIx64 "]",
                exit->reason, exit->u.inv.hwcode);
            nvmm_get_registers(cpu);
            qemu_mutex_lock_iothread();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            qemu_mutex_unlock_iothread();
            ret = -1;
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();

    qatomic_set(&cpu->exit_request, false);

    return ret < 0;
}
/* -------------------------------------------------------------------------- */
/*
 * run_on_cpu() worker: pull the VCPU state from NVMM into QEMU and mark it
 * dirty, so that it is pushed back before the VCPU runs again.
 */
static void
do_nvmm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_get_registers(cpu);
    cpu->vcpu_dirty = true;
}
/*
 * run_on_cpu() worker: push the QEMU register state to NVMM and clear the
 * dirty flag.
 */
static void
do_nvmm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_set_registers(cpu);
    cpu->vcpu_dirty = false;
}
/*
 * run_on_cpu() worker: push the QEMU register state to NVMM and clear the
 * dirty flag.
 */
static void
do_nvmm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
    nvmm_set_registers(cpu);
    cpu->vcpu_dirty = false;
}
/*
 * run_on_cpu() worker: mark the QEMU register state dirty without reading
 * anything back from NVMM.
 */
static void
do_nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
/*
 * Make the QEMU register state current for 'cpu'. Nothing is done if it is
 * already marked dirty; otherwise the registers are fetched on that CPU.
 */
void nvmm_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    run_on_cpu(cpu, do_nvmm_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
/* Push the QEMU register state to NVMM, running on the target CPU. */
void nvmm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
/* Push the QEMU register state to NVMM, running on the target CPU. */
void nvmm_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
/* Mark the QEMU register state dirty, running on the target CPU. */
void nvmm_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_nvmm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
/* -------------------------------------------------------------------------- */
/* Migration blocker, created and registered by the first nvmm_init_vcpu(). */
static Error *nvmm_migration_blocker;
/*
 * SIG_IPI handler.
 *
 * The nvmm_vcpu_stop() mechanism breaks races between entering the VMM
 * and another thread signaling the vCPU thread to exit. When it is not
 * available (NVMM_USER_VERSION < 2), the 'stop' flag is raised instead.
 * Does nothing if the signal lands on a thread without a current CPU.
 */
static void
nvmm_ipi_signal(int sigcpu)
{
    AccelCPUState *qcpu;

    if (!current_cpu) {
        return;
    }

    qcpu = current_cpu->accel;
#if NVMM_USER_VERSION >= 2
    nvmm_vcpu_stop(&qcpu->vcpu);
#else
    qcpu->stop = true;
#endif
}
static void
nvmm_init_cpu_signals(void)
{
struct sigaction sigact;
sigset_t set;
/* Install the IPI handler. */
memset(&sigact, 0, sizeof(sigact));
sigact.sa_handler = nvmm_ipi_signal;
sigaction(SIG_IPI, &sigact, NULL);
/* Allow IPIs on the current thread. */
sigprocmask(SIG_BLOCK, NULL, &set);
sigdelset(&set, SIG_IPI);
pthread_sigmask(SIG_SETMASK, &set, NULL);
}
/*
 * Create and configure the NVMM VCPU backing 'cpu'.
 *
 * Sets up IPI signal handling for the calling thread, registers the
 * migration blocker on first use, creates the VCPU and configures its
 * CPUID mask, assist callbacks and (when the host supports it) TPR exits.
 *
 * Returns 0 on success. Returns -EINVAL if the migration blocker could not
 * be added, or a negated errno if an NVMM call failed. On a configuration
 * failure the already-created VCPU is destroyed, so that it is not leaked.
 */
int
nvmm_init_vcpu(CPUState *cpu)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    struct nvmm_vcpu_conf_cpuid cpuid;
    struct nvmm_vcpu_conf_tpr tpr;
    Error *local_error = NULL;
    AccelCPUState *qcpu;
    int ret, err;

    nvmm_init_cpu_signals();

    if (nvmm_migration_blocker == NULL) {
        error_setg(&nvmm_migration_blocker,
            "NVMM: Migration not supported");

        if (migrate_add_blocker(&nvmm_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            return -EINVAL;
        }
    }

    qcpu = g_new0(AccelCPUState, 1);

    ret = nvmm_vcpu_create(mach, cpu->cpu_index, &qcpu->vcpu);
    if (ret == -1) {
        err = errno;
        error_report("NVMM: Failed to create a virtual processor,"
            " error=%d", err);
        g_free(qcpu);
        return -err;
    }

    /* Set the MCE, MCA and MTRR bits in CPUID leaf 1, EDX. */
    memset(&cpuid, 0, sizeof(cpuid));
    cpuid.mask = 1;
    cpuid.leaf = 0x00000001;
    cpuid.u.mask.set.edx = CPUID_MCE | CPUID_MCA | CPUID_MTRR;
    ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CPUID,
        &cpuid);
    if (ret == -1) {
        goto fail_configure;
    }

    ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_CALLBACKS,
        &nvmm_callbacks);
    if (ret == -1) {
        goto fail_configure;
    }

    /* Ask for an exit on TPR changes, if the host can do it. */
    if (qemu_mach.cap.arch.vcpu_conf_support & NVMM_CAP_ARCH_VCPU_CONF_TPR) {
        memset(&tpr, 0, sizeof(tpr));
        tpr.exit_changed = 1;
        ret = nvmm_vcpu_configure(mach, &qcpu->vcpu, NVMM_VCPU_CONF_TPR, &tpr);
        if (ret == -1) {
            goto fail_configure;
        }
    }

    cpu->vcpu_dirty = true;
    cpu->accel = qcpu;

    return 0;

fail_configure:
    /* Save errno first: the calls below may overwrite it. */
    err = errno;
    error_report("NVMM: Failed to configure a virtual processor,"
        " error=%d", err);
    nvmm_vcpu_destroy(mach, &qcpu->vcpu);
    g_free(qcpu);
    return -err;
}
/*
 * Run 'cpu' until an exception index of at least EXCP_INTERRUPT is pending.
 * That index is consumed (reset to -1) and returned. A fatal error from the
 * VCPU loop aborts QEMU.
 */
int
nvmm_vcpu_exec(CPUState *cpu)
{
    for (;;) {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            int excp = cpu->exception_index;

            cpu->exception_index = -1;
            return excp;
        }

        if (nvmm_vcpu_loop(cpu)) {
            error_report("NVMM: Failed to execute a VCPU.");
            abort();
        }
    }
}
/* Destroy the NVMM VCPU backing 'cpu' and free its accelerator state. */
void
nvmm_destroy_vcpu(CPUState *cpu)
{
    AccelCPUState *qcpu = cpu->accel;

    nvmm_vcpu_destroy(get_nvmm_mach(), &qcpu->vcpu);
    g_free(qcpu);
}
/* -------------------------------------------------------------------------- */
/*
 * Map or unmap a guest-physical range in the NVMM machine.
 *
 * When adding, the range is mapped readable and executable, and also
 * writable unless 'rom' is set. A failure is reported but not propagated.
 */
static void
nvmm_update_mapping(hwaddr start_pa, ram_addr_t size, uintptr_t hva,
    bool add, bool rom, const char *name)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    int rc;

    if (!add) {
        rc = nvmm_gpa_unmap(mach, hva, start_pa, size);
    } else {
        int prot = PROT_READ | PROT_EXEC;

        if (!rom) {
            prot |= PROT_WRITE;
        }
        rc = nvmm_gpa_map(mach, hva, start_pa, size, prot);
    }

    if (rc != -1) {
        return;
    }

    error_report("NVMM: Failed to %s GPA range '%s' PA:%p, "
        "Size:%p bytes, HostVA:%p, error=%d",
        (add ? "map" : "unmap"), name, (void *)(uintptr_t)start_pa,
        (void *)size, (void *)hva, errno);
}
/*
 * Map (add != 0) or unmap (add == 0) the RAM behind a memory region
 * section. Non-RAM sections are ignored. The range is shrunk to whole host
 * pages first, and nothing is done if no full page remains.
 */
static void
nvmm_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr gpa = section->offset_within_address_space;
    ram_addr_t len = int128_get64(section->size);
    unsigned int skip;
    uintptr_t hva;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Bytes to skip so that the start lands on a host page boundary. */
    skip = qemu_real_host_page_size() - (gpa & ~qemu_real_host_page_mask());
    skip &= ~qemu_real_host_page_mask();
    if (skip > len) {
        return;
    }

    /* Advance the start and truncate the length to whole pages. */
    gpa += skip;
    len = (len - skip) & qemu_real_host_page_mask();
    if (!len || (gpa & ~qemu_real_host_page_mask())) {
        return;
    }

    hva = (uintptr_t)memory_region_get_ram_ptr(mr) +
        section->offset_within_region + skip;

    nvmm_update_mapping(gpa, len, hva, add,
        memory_region_is_rom(mr), mr->name);
}
/*
 * MemoryListener region_add hook: take a reference on the region, then map
 * the section into the NVMM machine.
 */
static void
nvmm_region_add(MemoryListener *listener, MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    nvmm_process_section(section, 1);
}
/*
 * MemoryListener region_del hook: unmap the section from the NVMM machine,
 * then drop the reference taken in nvmm_region_add().
 */
static void
nvmm_region_del(MemoryListener *listener, MemoryRegionSection *section)
{
    nvmm_process_section(section, 0);
    memory_region_unref(section->mr);
}
/* MemoryListener begin hook: intentionally empty. */
static void
nvmm_transaction_begin(MemoryListener *listener)
{
    /* nothing */
}
/* MemoryListener commit hook: intentionally empty. */
static void
nvmm_transaction_commit(MemoryListener *listener)
{
    /* nothing */
}
/*
 * MemoryListener log_sync hook: for a RAM section, mark section->size bytes
 * of the region dirty, starting at region offset 0. Non-RAM sections are
 * ignored.
 */
static void
nvmm_log_sync(MemoryListener *listener, MemoryRegionSection *section)
{
    MemoryRegion *region = section->mr;

    if (memory_region_is_ram(region)) {
        memory_region_set_dirty(region, 0, int128_get64(section->size));
    }
}
/* Memory listener registered on address_space_memory by nvmm_accel_init(). */
static MemoryListener nvmm_memory_listener = {
    .name = "nvmm",
    .begin = nvmm_transaction_begin,
    .commit = nvmm_transaction_commit,
    .region_add = nvmm_region_add,
    .region_del = nvmm_region_del,
    .log_sync = nvmm_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};
/*
 * RAMBlockNotifier hook: register a newly added RAM block's host virtual
 * range with the NVMM machine. The whole 'max_size' of the block is
 * registered, and that is also the size reported on failure. A failure is
 * reported but not propagated.
 */
static void
nvmm_ram_block_added(RAMBlockNotifier *n, void *host, size_t size,
                     size_t max_size)
{
    struct nvmm_machine *mach = get_nvmm_mach();
    uintptr_t hva = (uintptr_t)host;
    int ret;

    ret = nvmm_hva_map(mach, hva, max_size);

    if (ret == -1) {
        /* Report the size that was actually passed to nvmm_hva_map(). */
        error_report("NVMM: Failed to map HVA, HostVA:%p "
            "Size:%p bytes, error=%d",
            (void *)hva, (void *)max_size, errno);
    }
}
/* RAM block notifier added by nvmm_accel_init(). */
static struct RAMBlockNotifier nvmm_ram_notifier = {
    .ram_block_added = nvmm_ram_block_added
};
/* -------------------------------------------------------------------------- */
/*
 * Accelerator init hook (AccelClass::init_machine).
 *
 * Initializes libnvmm, fetches and validates the host capabilities, creates
 * the NVMM machine, then registers the memory listener and the RAM block
 * notifier.
 *
 * Returns 0 on success, a negated errno if an NVMM call failed, or
 * -EPROGMISMATCH if the host version or state size does not match.
 */
static int
nvmm_accel_init(MachineState *ms)
{
    int err;

    if (nvmm_init() == -1) {
        err = errno;
        error_report("NVMM: Initialization failed, error=%d", err);
        return -err;
    }

    if (nvmm_capability(&qemu_mach.cap) == -1) {
        err = errno;
        error_report("NVMM: Unable to fetch capability, error=%d", err);
        return -err;
    }

    /* The host must be at least as recent as the headers we built with. */
    if (qemu_mach.cap.version < NVMM_KERN_VERSION) {
        error_report("NVMM: Unsupported version %u", qemu_mach.cap.version);
        return -EPROGMISMATCH;
    }
    /* The state layout must be exactly the one this file was built for. */
    if (qemu_mach.cap.state_size != sizeof(struct nvmm_x64_state)) {
        error_report("NVMM: Wrong state size %u", qemu_mach.cap.state_size);
        return -EPROGMISMATCH;
    }

    if (nvmm_machine_create(&qemu_mach.mach) == -1) {
        err = errno;
        error_report("NVMM: Machine creation failed, error=%d", err);
        return -err;
    }

    memory_listener_register(&nvmm_memory_listener, &address_space_memory);
    ram_block_notifier_add(&nvmm_ram_notifier);

    printf("NetBSD Virtual Machine Monitor accelerator is operational\n");
    return 0;
}
/* Return the value of the nvmm_allowed flag. */
int
nvmm_enabled(void)
{
    return nvmm_allowed;
}
/* QOM class initializer: fill in the AccelClass fields for NVMM. */
static void
nvmm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *accel = ACCEL_CLASS(oc);

    accel->allowed = &nvmm_allowed;
    accel->init_machine = nvmm_accel_init;
    accel->name = "NVMM";
}
/* QOM type description of the NVMM accelerator, a child of TYPE_ACCEL. */
static const TypeInfo nvmm_accel_type = {
    .name = ACCEL_CLASS_NAME("nvmm"),
    .parent = TYPE_ACCEL,
    .class_init = nvmm_accel_class_init,
};
/* Register the NVMM accelerator type with QOM. */
static void
nvmm_type_init(void)
{
    type_register_static(&nvmm_accel_type);
}

/* Hook nvmm_type_init() into QEMU's type initialization. */
type_init(nvmm_type_init);