qemu/target/i386/whpx/whpx-all.c
Alex Bennée 4ea5fe997d gdbstub: move register helpers into standalone include
These inline helpers are all used by target specific code so move them
out of the general header so we don't needlessly pollute the rest of
the API with target specific stuff.

Note we have to include cpu.h in semihosting as it was relying on a
side effect before.

Reviewed-by: Taylor Simpson <tsimpson@quicinc.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

Message-Id: <20230302190846.2593720-21-alex.bennee@linaro.org>
Message-Id: <20230303025805.625589-21-richard.henderson@linaro.org>
2023-03-07 20:44:08 +00:00

2800 lines
85 KiB
C

/*
* QEMU Windows Hypervisor Platform accelerator (WHPX)
*
* Copyright Microsoft Corp. 2017
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/address-spaces.h"
#include "exec/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "sysemu/whpx.h"
#include "sysemu/cpus.h"
#include "sysemu/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>
#include "whpx-internal.h"
#include "whpx-accel-ops.h"
#include <WinHvPlatform.h>
#include <WinHvEmulation.h>
#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)
/*
 * Registers that whpx_set_registers() and whpx_get_registers() transfer
 * between QEMU and the hypervisor in a single bulk call.
 *
 * The order of the entries is significant: both functions walk this table
 * with a running index and assert() that each slot holds the register they
 * are about to encode/decode, so entries must not be added, removed or
 * reordered without updating both functions. Names that appear only inside
 * comments are registers those functions skip.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {
/* X64 General purpose registers */
WHvX64RegisterRax,
WHvX64RegisterRcx,
WHvX64RegisterRdx,
WHvX64RegisterRbx,
WHvX64RegisterRsp,
WHvX64RegisterRbp,
WHvX64RegisterRsi,
WHvX64RegisterRdi,
WHvX64RegisterR8,
WHvX64RegisterR9,
WHvX64RegisterR10,
WHvX64RegisterR11,
WHvX64RegisterR12,
WHvX64RegisterR13,
WHvX64RegisterR14,
WHvX64RegisterR15,
WHvX64RegisterRip,
WHvX64RegisterRflags,
/* X64 Segment registers */
WHvX64RegisterEs,
WHvX64RegisterCs,
WHvX64RegisterSs,
WHvX64RegisterDs,
WHvX64RegisterFs,
WHvX64RegisterGs,
WHvX64RegisterLdtr,
WHvX64RegisterTr,
/* X64 Table registers */
WHvX64RegisterIdtr,
WHvX64RegisterGdtr,
/* X64 Control Registers */
WHvX64RegisterCr0,
WHvX64RegisterCr2,
WHvX64RegisterCr3,
WHvX64RegisterCr4,
WHvX64RegisterCr8,
/* X64 Debug Registers */
/*
* WHvX64RegisterDr0,
* WHvX64RegisterDr1,
* WHvX64RegisterDr2,
* WHvX64RegisterDr3,
* WHvX64RegisterDr6,
* WHvX64RegisterDr7,
*/
/* X64 Floating Point and Vector Registers */
WHvX64RegisterXmm0,
WHvX64RegisterXmm1,
WHvX64RegisterXmm2,
WHvX64RegisterXmm3,
WHvX64RegisterXmm4,
WHvX64RegisterXmm5,
WHvX64RegisterXmm6,
WHvX64RegisterXmm7,
WHvX64RegisterXmm8,
WHvX64RegisterXmm9,
WHvX64RegisterXmm10,
WHvX64RegisterXmm11,
WHvX64RegisterXmm12,
WHvX64RegisterXmm13,
WHvX64RegisterXmm14,
WHvX64RegisterXmm15,
WHvX64RegisterFpMmx0,
WHvX64RegisterFpMmx1,
WHvX64RegisterFpMmx2,
WHvX64RegisterFpMmx3,
WHvX64RegisterFpMmx4,
WHvX64RegisterFpMmx5,
WHvX64RegisterFpMmx6,
WHvX64RegisterFpMmx7,
WHvX64RegisterFpControlStatus,
WHvX64RegisterXmmControlStatus,
/* X64 MSRs */
WHvX64RegisterEfer,
/* The 64-bit-only MSRs are present only when building for TARGET_X86_64 */
#ifdef TARGET_X86_64
WHvX64RegisterKernelGsBase,
#endif
WHvX64RegisterApicBase,
/* WHvX64RegisterPat, */
WHvX64RegisterSysenterCs,
WHvX64RegisterSysenterEip,
WHvX64RegisterSysenterEsp,
WHvX64RegisterStar,
#ifdef TARGET_X86_64
WHvX64RegisterLstar,
WHvX64RegisterCstar,
WHvX64RegisterSfmask,
#endif
/* Interrupt / Event Registers */
/*
* WHvRegisterPendingInterruption,
* WHvRegisterInterruptState,
* WHvRegisterPendingEvent0,
* WHvRegisterPendingEvent1
* WHvX64RegisterDeliverabilityNotifications,
*/
};
/*
 * Value buffer used together with whpx_register_names: values[i] carries the
 * value of the register named by whpx_register_names[i].
 */
struct whpx_register_set {
WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
/*
* The current implementation of instruction stepping sets the TF flag
* in RFLAGS, causing the CPU to raise an INT1 after each instruction.
* This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
*
* This approach has a few limitations:
* 1. Stepping over a PUSHF/SAHF instruction will save the TF flag
* along with the other flags, possibly restoring it later. It would
* result in another INT1 when the flags are restored, triggering
* a stop in gdb that could be cleared by doing another step.
*
* Stepping over a POPF/LAHF instruction will let it overwrite the
* TF flags, ending the stepping mode.
*
* 2. Stepping over an instruction raising an exception (e.g. INT, DIV,
* or anything that could result in a page fault) will save the flags
* to the stack, clear the TF flag, and let the guest execute the
* handler. Normally, the guest will restore the original flags,
* that will continue single-stepping.
*
* 3. Debuggers running on the guest may wish to set TF to do instruction
* stepping. INT1 events generated by it would be intercepted by us,
* as long as the gdb is connected to QEMU.
*
* In practice this means that:
* 1. Stepping through flags-modifying instructions may cause gdb to
* continue or stop in unexpected places. This will be fully recoverable
* and will not crash the target.
*
* 2. Stepping over an instruction that triggers an exception will step
* over the exception handler, not into it.
*
* 3. Debugging the guest via gdb, while running debugger on the guest
* at the same time may lead to unexpected effects. Removing all
* breakpoints set via QEMU will prevent any further interference
* with the guest-level debuggers.
*
* The limitations can be addressed as shown below:
* 1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
* stepping through them. The exact semantics of the instructions is
* defined in the "Combined Volume Set of Intel 64 and IA-32
* Architectures Software Developer's Manuals", however it involves a
* fair amount of corner cases due to compatibility with real mode,
* virtual 8086 mode, and differences between 64-bit and 32-bit modes.
*
* 2. We could step into the guest's exception handlers using the following
* sequence:
* a. Temporarily enable catching of all exception types via
* whpx_set_exception_exit_bitmap().
* b. Once an exception is intercepted, read the IDT/GDT and locate
* the original handler.
* c. Patch the original handler, injecting an INT3 at the beginning.
* d. Update the exception exit bitmap to only catch the
* WHvX64ExceptionTypeBreakpointTrap exception.
* e. Let the affected CPU run in the exclusive mode.
* f. Restore the original handler and the exception exit bitmap.
* Note that handling all corner cases related to IDT/GDT is harder
* than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
* rough idea.
*
* 3. In order to properly support guest-level debugging in parallel with
* the QEMU-level debugging, we would need to be able to pass some INT1
* events to the guest. This could be done via the following methods:
* a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
* it seems to only work for interrupts and not software
* exceptions.
* b. Locating and patching the original handler by parsing IDT/GDT.
* This involves relatively complex logic outlined in the previous
* paragraph.
* c. Emulating the exception invocation (i.e. manually updating RIP,
* RFLAGS, and pushing the old values to stack). This is even more
* complicated than the previous option, since it involves checking
* CPL, gate attributes, and doing various adjustments depending
* on the current CPU mode, whether the CPL is changing, etc.
*/
/*
 * Instruction-stepping mode of a vCPU.
 * NOTE(review): the users of this enum are outside the visible part of the
 * file; the meaning of WHPX_STEP_NONE below is inferred from its name.
 */
typedef enum WhpxStepMode {
/* Presumably: no single-stepping requested - confirm against the run loop */
WHPX_STEP_NONE = 0,
/* Halt other VCPUs */
WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
/*
 * Per-vCPU WHPX accelerator state. get_whpx_vcpu() retrieves it from the
 * hax_vcpu pointer of the CPUState.
 */
struct whpx_vcpu {
/* Emulator handle handed to WHvEmulatorTryMmioEmulation/TryIoEmulation */
WHV_EMULATOR_HANDLE emulator;
/*
 * NOTE(review): the boolean flags in this struct are not referenced in the
 * visible part of the file; presumably interrupt-injection bookkeeping -
 * confirm against the vCPU run loop.
 */
bool window_registered;
bool interruptable;
bool ready_for_pic_interrupt;
/* Task priority in CR8 encoding, as last synced by whpx_{set,get}_registers */
uint64_t tpr;
/* APIC base value, as last synced by whpx_{set,get}_registers */
uint64_t apic_base;
bool interruption_pending;
/* Must be the last field as it may have a tail */
WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
/* NOTE(review): the two flags below are only used outside the visible code */
static bool whpx_allowed;
static bool whp_dispatch_initialized;
/*
 * Presumably the module handles of the WinHvPlatform / WinHvEmulation
 * libraries; they are loaded outside the visible code - confirm.
 */
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capability bits; whpx_has_xsave() reports the XsaveSupport bit */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;
/*
 * Global accelerator state: holds the partition handle, the current
 * exception exit bitmap and the breakpoint list used throughout this file.
 */
struct whpx_state whpx_global;
/* Function table through which every WHv* API call in this file is made */
struct WHPDispatch whp_dispatch;
/* Reports whether the recorded XSAVE capabilities have XsaveSupport set. */
static bool whpx_has_xsave(void)
{
    const bool supported = whpx_xsave_cap.XsaveSupport;

    return supported;
}
/*
* VP support
*/
/*
 * Returns the WHPX per-vCPU state of @cpu. The state is stored in the
 * hax_vcpu pointer of the CPUState and merely reinterpreted here.
 */
static struct whpx_vcpu *get_whpx_vcpu(CPUState *cpu)
{
    struct whpx_vcpu *vcpu = (struct whpx_vcpu *)cpu->hax_vcpu;

    return vcpu;
}
/*
 * Converts a QEMU segment descriptor cache entry into the hypervisor's
 * segment register layout.
 *
 * @qs:  QEMU segment to convert.
 * @v86: non-zero to emit fixed attributes (present, DPL 3, non-system,
 *       type 3) instead of the attributes stored in @qs.
 * @r86: currently has no effect on the result.
 *
 * Returns the converted segment register by value.
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER seg;

    seg.Base = qs->base;
    seg.Limit = qs->limit;
    seg.Selector = qs->selector;

    if (!v86) {
        unsigned desc_flags = qs->flags;

        /* The attribute bits start at DESC_TYPE_SHIFT in QEMU's flags word */
        seg.Attributes = (desc_flags >> DESC_TYPE_SHIFT);
        if (r86) {
            /* seg.Base &= 0xfffff; */
        }
        return seg;
    }

    /* Fixed attributes: start from zero, then set the individual fields */
    seg.Attributes = 0;
    seg.SegmentType = 3;
    seg.Present = 1;
    seg.DescriptorPrivilegeLevel = 3;
    seg.NonSystemSegment = 1;

    return seg;
}
/*
 * Converts a hypervisor segment register into a QEMU segment descriptor
 * cache entry; the inverse of the attribute shift done by whpx_seg_q2h().
 */
static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    const uint32_t attributes = (uint32_t)hs->Attributes;
    SegmentCache seg;

    seg.selector = hs->Selector;
    seg.base = hs->Base;
    seg.limit = hs->Limit;
    /* Move the attribute bits back to their position in QEMU's flags word */
    seg.flags = attributes << DESC_TYPE_SHIFT;

    return seg;
}
/*
 * X64 Extended Control Registers
 *
 * Writes QEMU's cached XCR0 value (env->xcr0) to the virtual processor.
 * Does nothing when whpx_has_xsave() reports no XSAVE support. A failed
 * write is logged and otherwise ignored.
 */
static void whpx_set_xcrs(CPUState *cpu)
{
    WHV_REGISTER_NAME name = WHvX64RegisterXCr0;
    WHV_REGISTER_VALUE value;
    CPUX86State *env;
    HRESULT hr;

    if (!whpx_has_xsave()) {
        return;
    }

    env = cpu->env_ptr;

    /* Only xcr0 is supported by the hypervisor currently */
    value.Reg64 = env->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx_global.partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}
/*
 * Writes QEMU's cached TSC value (env->tsc) to the virtual processor.
 *
 * Returns 0 on success, -1 if the TSC register could not be written.
 * Failing to suspend the partition beforehand only produces a warning.
 */
static int whpx_set_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    CPUX86State *env = cpu->env_ptr;
    WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     *
     * The entry point is optional, hence the NULL check.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            /*
             * Unable to suspend partition while setting TSC is not a fatal
             * error. It just increases the likelihood of TSC variance between
             * vCPUs and some guest OS are able to handle that just fine.
             */
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    value.Reg64 = env->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (!FAILED(hr)) {
        return 0;
    }

    error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
    return -1;
}
/*
* The CR8 register in the CPU is mapped to the TPR register of the APIC,
* however, they use a slightly different encoding. Specifically:
*
* APIC.TPR[bits 7:4] = CR8[bits 3:0]
*
* This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
* and IA-32 Architectures Software Developer's Manual.
*
* The functions below translate the value of CR8 to TPR and vice versa.
*/
/* Translates an APIC TPR value to CR8 encoding: CR8[3:0] = TPR[7:4]. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    /* The priority class lives four bits up in the TPR */
    const unsigned tpr_class_shift = 4;
    uint64_t cr8 = tpr >> tpr_class_shift;

    return cr8;
}
/* Translates a CR8 value to APIC TPR encoding: TPR[7:4] = CR8[3:0]. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    /* The priority class lives four bits up in the TPR */
    const unsigned tpr_class_shift = 4;
    uint64_t tpr = cr8 << tpr_class_shift;

    return tpr;
}
/*
 * Pushes QEMU's cached CPU state into the WHPX virtual processor.
 *
 * @cpu:   vCPU to update; must be stopped, or be the calling thread's vCPU.
 * @level: when at or above WHPX_SET_RESET_STATE the TSC is written as well.
 *
 * The registers listed in whpx_register_names are encoded into a value
 * buffer in table order and written with a single call. XCR0 and the TSC
 * are written through separate calls. Failures are logged and otherwise
 * ignored.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_register_set vcxt;
    /* Number of XMM slots (Xmm0..Xmm15) present in whpx_register_names */
    const int whpx_xmm_slots = 16;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int xmm_count;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    /*
     * Indexes for first 16 registers match between HV and QEMU definitions.
     * QEMU may model fewer than 16 of them, so always skip to slot 16
     * afterwards.
     */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /*
     * 16 XMM registers.
     *
     * env->xmm_regs is not guaranteed to have exactly 16 entries, but the
     * register table only has 16 XMM slots. Never copy more than that:
     * extra iterations would spill into the FP/MSR slots that follow, and
     * the upper halves of the FpMmx slots are not rewritten afterwards.
     */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + whpx_xmm_slots;
    xmm_count = sizeof(env->xmm_regs) / sizeof(ZMMReg);
    if (xmm_count > whpx_xmm_slots) {
        xmm_count = whpx_xmm_slots;
    }
    for (i = 0; i < xmm_count; i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Bits 11..13 of the status word carry the top-of-stack index */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        /* One bit per register, set when the QEMU tag entry is zero */
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every slot of the table must have been visited exactly once */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
/*
 * Reads the virtual processor's TSC into env->tsc.
 *
 * Returns 0 on success, -1 (leaving env->tsc untouched) on failure.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    CPUX86State *env = cpu->env_ptr;
    HRESULT hr;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx_global.partition, cpu->cpu_index, &name, 1, &value);
    if (!FAILED(hr)) {
        env->tsc = value.Reg64;
        return 0;
    }

    error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
    return -1;
}
/*
 * X64 Extended Control Registers
 *
 * Reads XCR0 from the virtual processor into env->xcr0. Does nothing when
 * whpx_has_xsave() reports no XSAVE support; on a failed read the error is
 * logged and env->xcr0 is left untouched.
 */
static void whpx_get_xcrs(CPUState *cpu)
{
    WHV_REGISTER_NAME name = WHvX64RegisterXCr0;
    WHV_REGISTER_VALUE value;
    CPUX86State *env = cpu->env_ptr;
    HRESULT hr;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx_global.partition, cpu->cpu_index, &name, 1, &value);
    if (!FAILED(hr)) {
        env->xcr0 = value.Reg64;
        return;
    }

    error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
}
/*
 * Pulls the WHPX virtual processor state into QEMU's cached CPU state.
 *
 * @cpu: vCPU to read; must be stopped, or be the calling thread's vCPU.
 *
 * The registers listed in whpx_register_names are fetched with a single
 * call and decoded in table order. XCR0 and the TSC are read through
 * separate calls. If the bulk read fails, the error is logged and the
 * cached state is left as it was, because the value buffer then holds no
 * valid data.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_register_set vcxt;
    /* Number of XMM slots (Xmm0..Xmm15) present in whpx_register_names */
    const int whpx_xmm_slots = 16;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int xmm_count;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        /* The cached TSC only stays valid while the VM is not running */
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
        /* vcxt is uninitialized here; do not decode it into the CPU state */
        return;
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    /*
     * Indexes for first 16 registers match between HV and QEMU definitions.
     * QEMU may model fewer than 16 of them, so always skip to slot 16
     * afterwards.
     */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        /* CR8 changed behind QEMU's back: propagate it to the APIC model */
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /*
     * 16 XMM registers.
     *
     * env->xmm_regs is not guaranteed to have exactly 16 entries, but the
     * register table only has 16 XMM slots. Never copy more than that:
     * extra iterations would load the FP/MSR slots that follow into the
     * remaining xmm_regs entries.
     */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + whpx_xmm_slots;
    xmm_count = sizeof(env->xmm_regs) / sizeof(ZMMReg);
    if (xmm_count > whpx_xmm_slots) {
        xmm_count = whpx_xmm_slots;
    }
    for (i = 0; i < xmm_count; i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Bits 11..13 of the status word carry the top-of-stack index */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        /* One bit per register; the QEMU tag entry is its inverse */
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* Every slot of the table must have been visited exactly once */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}
/*
 * Emulator callback for port I/O: forwards the access described by
 * @IoAccess to QEMU's I/O address space. The result of the access is not
 * checked; S_OK is always returned. @ctx is unused.
 */
static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs no_attrs = { 0 };

    (void)ctx;

    address_space_rw(&address_space_io,
                     IoAccess->Port,
                     no_attrs,
                     &IoAccess->Data,
                     IoAccess->AccessSize,
                     IoAccess->Direction);

    return S_OK;
}
/*
 * Emulator callback for MMIO: performs the guest-physical memory access
 * described by @ma. Always returns S_OK. @ctx is unused.
 */
static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    (void)ctx;

    cpu_physical_memory_rw(ma->GpaAddress,
                           ma->Data,
                           ma->AccessSize,
                           ma->Direction);

    return S_OK;
}
/*
 * Emulator callback that reads virtual processor registers.
 *
 * @ctx is the CPUState of the vCPU being emulated. The hypervisor's result
 * code is returned to the emulator; failures are logged as well.
 */
static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    CPUState *cpu = (CPUState *)ctx;
    HRESULT hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx_global.partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}
/*
 * Emulator callback that writes virtual processor registers.
 *
 * @ctx is the CPUState of the vCPU being emulated. The hypervisor's result
 * code is returned to the emulator; failures are logged as well.
 */
static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    CPUState *cpu = (CPUState *)ctx;
    HRESULT hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx_global.partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator has written the register state directly, so clear the
     * dirty flag to avoid a second write when the VP resumes.
     * NOTE(review): the flag is cleared even when the write above failed.
     */
    cpu->vcpu_dirty = false;

    return hr;
}
/*
 * Emulator callback that translates a guest virtual address.
 *
 * @ctx is the CPUState of the vCPU being emulated. On success the result
 * code of the translation is stored in *TranslationResult and the physical
 * address in *Gpa. On failure the error is logged and *TranslationResult is
 * left untouched. Returns the hypervisor's result code.
 */
static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT result;
    HRESULT hr;

    hr = whp_dispatch.WHvTranslateGva(whpx_global.partition, cpu->cpu_index,
                                      Gva, TranslateFlags, &result, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
        return hr;
    }

    *TranslationResult = result.ResultCode;
    return hr;
}
/*
 * Callback table for the WHPX instruction emulator. It wires the emulator's
 * port I/O, memory, register get/set and address translation hooks to the
 * whpx_emu_*_callback() functions in this file.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
.Size = sizeof(WHV_EMULATOR_CALLBACKS),
.WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
.WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
.WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
.WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
.WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
/*
 * Asks the instruction emulator to carry out the memory access that caused
 * the current VM exit.
 *
 * @cpu is passed to the emulator as the callback context; @ctx describes the
 * intercepted access. Returns 0 on success, -1 if the emulator call failed
 * or reported that emulation was not successful.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    WHV_EMULATOR_STATUS status;
    HRESULT hr;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (status.EmulationSuccessful) {
        return 0;
    }

    error_report("WHPX: Failed to emulate MMIO access with"
                 " EmulatorReturnStatus: %u", status.AsUINT32);
    return -1;
}
/*
 * Asks the instruction emulator to carry out the port I/O access that caused
 * the current VM exit.
 *
 * @cpu is passed to the emulator as the callback context; @ctx describes the
 * intercepted access. Returns 0 on success, -1 if the emulator call failed
 * or reported that emulation was not successful.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    WHV_EMULATOR_STATUS status;
    HRESULT hr;

    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (status.EmulationSuccessful) {
        return 0;
    }

    error_report("WHPX: Failed to emulate PortIO access with"
                 " EmulatorReturnStatus: %u", status.AsUINT32);
    return -1;
}
/*
 * Selects which guest exceptions (namely breakpoint/single-step events)
 * cause an exit to QEMU.
 *
 * 'exceptions' is a bitmask with one bit per exception type, e.g:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 *
 * The last successfully applied mask is cached in the global state, so a
 * request for the mask already in effect returns S_OK without calling the
 * hypervisor. Otherwise the hypervisor's result code is returned.
 */
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop = { 0, };
    HRESULT hr;

    if (whpx->exception_exit_bitmap == exceptions) {
        return S_OK;
    }

    prop.ExceptionExitBitmap = exceptions;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        return hr;
    }

    /* Remember the new mask only once the hypervisor has accepted it */
    whpx->exception_exit_bitmap = exceptions;
    return hr;
}
/*
 * Arms (set == true) or disarms (set == false) single-instruction stepping
 * on a vCPU. It is called before/after stepping over a single instruction.
 *
 * If exit_context_rflags is not NULL, it must point to a copy of the vCPU's
 * current RFLAGS (this is asserted); the copy is updated to the new RFLAGS
 * value.
 *
 * Returns S_OK on success, or the failing hypervisor call's result code
 * after logging it.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
                                                   bool set,
                                                   uint64_t *exit_context_rflags)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == value.Reg64);
    }

    if (!set) {
        value.Reg64 &= ~TF_MASK;
    } else {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        value.Reg64 |= TF_MASK;
    }

    if (exit_context_rflags) {
        *exit_context_rflags = value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    /* Suspend delivery of hardware interrupts during single-stepping. */
    name = WHvRegisterInterruptState;
    value.Reg64 = 0;
    value.InterruptState.InterruptShadow = set ? 1 : 0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState,"
                     " hr=%08lx",
                     hr);
        return hr;
    }

    if (set) {
        /* Nothing more to do when arming */
        return S_OK;
    }

    /*
     * We have just finished stepping over a single instruction,
     * and intercepted the INT1 generated by it.
     * We need to now hide the INT1 from the guest,
     * as it would not be expecting it.
     */
    name = WHvX64RegisterPendingDebugException;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get pending debug exceptions,"
                     "hr=%08lx", hr);
        return hr;
    }

    if (!value.PendingDebugException.SingleStep) {
        return S_OK;
    }

    value.PendingDebugException.SingleStep = 0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to clear pending debug exceptions,"
                     "hr=%08lx", hr);
        return hr;
    }

    return S_OK;
}
/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
struct whpx_state *whpx = &whpx_global;
int i;
if (whpx->breakpoints.breakpoints) {
for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
if (address == whpx->breakpoints.breakpoints->data[i].address) {
return &whpx->breakpoints.breakpoints->data[i];
}
}
}
return NULL;
}
/*
* Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
* debugging user-mode applications. Since the WHPX API does not offer
* an easy way to pass the intercepted exception back to the guest, we
* resort to using INT1 instead, and let the guest always handle INT3.
*
* The value below is the single-byte INT1 opcode that is written into guest
* memory at each breakpoint address.
*/
static const uint8_t whpx_breakpoint_instruction = 0xF1;
/*
* The WHPX QEMU backend implements breakpoints by writing the INT1
* instruction into memory (ignoring the DRx registers). This raises a few
* issues that need to be carefully handled:
*
* 1. Although unlikely, other parts of QEMU may set multiple breakpoints
* at the same location, and later remove them in arbitrary order.
* This should not cause memory corruption, and should only remove the
* physical breakpoint instruction when the last QEMU breakpoint is gone.
*
* 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
* physical location. Hence, physically adding/removing a breakpoint can
* theoretically fail at any time. We need to keep track of it.
*
* The function below rebuilds a list of low-level breakpoints (one per
* address, tracking the original instruction and any errors) from the list of
* high-level breakpoints (set via cpu_breakpoint_insert()).
*
* In order to optimize performance, this function stores the list of
* high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
* low-level ones, so that it won't be re-invoked until these breakpoints
* change.
*
* Note that this function decides which breakpoints should be inserted into
* memory, but doesn't actually do it. The memory accessing is done in
* whpx_apply_breakpoints().
*/
static void whpx_translate_cpu_breakpoints(
struct whpx_breakpoints *breakpoints,
CPUState *cpu,
int cpu_breakpoint_count)
{
CPUBreakpoint *bp;
int cpu_bp_index = 0;
breakpoints->original_addresses =
g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);
breakpoints->original_address_count = cpu_breakpoint_count;
int max_breakpoints = cpu_breakpoint_count +
(breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);
struct whpx_breakpoint_collection *new_breakpoints =
g_malloc0(sizeof(struct whpx_breakpoint_collection)
+ max_breakpoints * sizeof(struct whpx_breakpoint));
new_breakpoints->allocated = max_breakpoints;
new_breakpoints->used = 0;
/*
* 1. Preserve all old breakpoints that could not be automatically
* cleared when the CPU got stopped.
*/
if (breakpoints->breakpoints) {
int i;
for (i = 0; i < breakpoints->breakpoints->used; i++) {
if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
new_breakpoints->data[new_breakpoints->used++] =
breakpoints->breakpoints->data[i];
}
}
}
/* 2. Map all CPU breakpoints to WHPX breakpoints */
QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
int i;
bool found = false;
/* This will be used to detect changed CPU breakpoints later. */
breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
for (i = 0; i < new_breakpoints->used; i++) {
/*
* WARNING: This loop has O(N^2) complexity, where N is the
* number of breakpoints. It should not be a bottleneck in
* real-world scenarios, since it only needs to run once after
* the breakpoints have been modified.
* If this ever becomes a concern, it can be optimized by storing
* high-level breakpoint objects in a tree or hash map.
*/
if (new_breakpoints->data[i].address == bp->pc) {
/* There was already a breakpoint at this address. */
if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
new_breakpoints->data[i].state = WHPX_BP_SET;
} else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
}
found = true;
break;
}
}
if (!found && new_breakpoints->used < new_breakpoints->allocated) {
/* No WHPX breakpoint at this address. Create one. */
new_breakpoints->data[new_breakpoints->used].address = bp->pc;
new_breakpoints->data[new_breakpoints->used].state =
WHPX_BP_SET_PENDING;
new_breakpoints->used++;
}
}
/*
* Free the previous breakpoint list. This can be optimized by keeping
* it as shadow buffer for the next computation instead of freeing
* it immediately.
*/
g_free(breakpoints->breakpoints);
breakpoints->breakpoints = new_breakpoints;
}
/*
 * Physically inserts/removes the breakpoints by reading and writing guest
 * memory, keeping track of the attempts that failed.
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 *
 * An entry whose memory access fails is left in the corresponding *_PENDING
 * state, so a later call can retry it. A NULL collection is a no-op.
 */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;

    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        struct whpx_breakpoint *entry = &breakpoints->data[i];
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState next = entry->state;

        if (resuming) {
            if (next == WHPX_BP_CLEARED) {
                next = WHPX_BP_SET_PENDING;
            } else if (next == WHPX_BP_CLEAR_PENDING) {
                next = WHPX_BP_SET;
            }
        } else {
            if (next == WHPX_BP_SET_PENDING) {
                next = WHPX_BP_CLEARED;
            } else if (next == WHPX_BP_SET) {
                next = WHPX_BP_CLEAR_PENDING;
            }
        }

        if (next == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction byte... */
            rc = cpu_memory_rw_debug(cpu,
                                     entry->address,
                                     &entry->original_instruction,
                                     1,
                                     false);
            if (!rc) {
                /* ...and overwrite it with the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                                         entry->address,
                                         (void *)&whpx_breakpoint_instruction,
                                         1,
                                         true);
            }
            if (!rc) {
                next = WHPX_BP_SET;
            }
        } else if (next == WHPX_BP_CLEAR_PENDING) {
            /* Put the original instruction byte back. */
            rc = cpu_memory_rw_debug(cpu,
                                     entry->address,
                                     &entry->original_instruction,
                                     1,
                                     true);
            if (!rc) {
                next = WHPX_BP_CLEARED;
            }
        }

        entry->state = next;
    }
}
/*
 * This function is called when a VCPU is about to start and no other
 * VCPUs have been started so far. Since the VCPU start order could be
 * arbitrary, it doesn't have to be VCPU#0.
 *
 * It is used to commit the breakpoints into memory, and configure WHPX
 * to intercept debug exceptions.
 *
 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
 * more VCPUs are already running, so this is the best place to do it.
 *
 * Must be called with the iothread lock held (asserted below).
 *
 * Returns 0 on success, or 1 if the exception exit bitmap could not be
 * updated.
 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(qemu_mutex_iothread_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        /*
         * Compare the current CPU breakpoints against the addresses
         * recorded by the last translation, position by position.
         */
        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        /* Fewer breakpoints than before also counts as a change. */
        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */
        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        /* Keep the separator space: the two literals are concatenated. */
        error_report("WHPX: Failed to update exception exit mask, "
                     "hr=%08lx.", hr);
        return 1;
    }

    return 0;
}
/*
 * Called once the last running VCPU has stopped. Removes any breakpoint
 * instructions that are currently written into guest memory.
 *
 * Always returns 0.
 */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;

    whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, false);

    return 0;
}
/*
 * Returns the address of the next instruction that is about to be executed.
 *
 * @exit_context_valid: true if vcpu->exit_ctx still reflects the current
 *                      register state of the virtual processor.
 *
 * Returns 0 if the value had to be queried from WHPX and the query failed.
 */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    WHV_REGISTER_NAME rip_name = WHvX64RegisterRip;
    WHV_REGISTER_VALUE rip_value;
    HRESULT hr;

    if (cpu->vcpu_dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        CPUArchState *env = (CPUArchState *)(cpu->env_ptr);

        return env->eip;
    }

    if (exit_context_valid) {
        /*
         * The CPU registers have been modified neither by other parts of
         * QEMU, nor by this port calling WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        return get_whpx_vcpu(cpu)->exit_ctx.VpContext.Rip;
    }

    /*
     * The CPU registers have been modified by a call to
     * WHvSetVirtualProcessorRegisters() and must be re-queried from
     * the target.
     */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx_global.partition,
        cpu->cpu_index,
        &rip_name,
        1,
        &rip_value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get PC, hr=%08lx", hr);
        return 0;
    }

    return rip_value.Reg64;
}
/*
 * Handles a HLT exit from the virtual processor.
 *
 * The VCPU is marked halted (and 1 is returned so the run loop exits)
 * unless there is something to wake it up right away: a hard interrupt
 * with IF set, or an NMI. In that case 0 is returned and nothing changes.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    CPUX86State *env = cpu->env_ptr;
    bool wakeup_pending;
    int ret = 0;

    qemu_mutex_lock_iothread();

    wakeup_pending = ((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
                      (env->eflags & IF_MASK)) ||
                     (cpu->interrupt_request & CPU_INTERRUPT_NMI);
    if (!wakeup_pending) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }

    qemu_mutex_unlock_iothread();

    return ret;
}
/*
 * Prepares the virtual processor for the next WHvRunVirtualProcessor() call.
 *
 * Under the iothread lock, this translates pending QEMU interrupt requests
 * (NMI, hard interrupts), a changed APIC TPR and the need for an interrupt
 * window notification into WHPX register writes, then commits all of them
 * with a single WHvSetVirtualProcessorRegisters() call after the lock has
 * been dropped. It may also set cpu->exit_request to force the VCPU out of
 * its inner loop.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    /*
     * At most three registers are queued below: one pending
     * interruption/event, CR8, and the deliverability notification.
     */
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    qemu_mutex_lock_iothread();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* An SMI request is only acknowledged here; nothing is injected. */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            /* An NMI sets interruptable to false above, so none is queued. */
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /*
         * APIC is emulated by the platform: deliver the PIC interrupt as an
         * external interrupt event once an interrupt window was reported.
         */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    qemu_mutex_unlock_iothread();
    /* The interrupt window, if one was reported, has been consumed. */
    vcpu->ready_for_pic_interrupt = false;

    /* Commit everything queued above in one call; failure is only logged. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }

    return;
}
/*
 * Pulls the state that WHPX reports in the exit context back into QEMU
 * after the virtual processor has stopped running: RFLAGS, the task
 * priority (CR8, forwarded to the APIC when it changed), and the
 * interruption-pending / interrupt-shadow flags.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    uint64_t exit_cr8;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    exit_cr8 = vcpu->exit_ctx.VpContext.Cr8;
    if (exit_cr8 != vcpu->tpr) {
        vcpu->tpr = exit_cr8;

        /* The APIC model is only touched under the iothread lock. */
        qemu_mutex_lock_iothread();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        qemu_mutex_unlock_iothread();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    /* Interrupts can be injected only outside of an interrupt shadow. */
    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
/*
 * Processes the asynchronous requests recorded in cpu->interrupt_request
 * before the VCPU is (re)entered: INIT, APIC poll, wake-up from halt,
 * SIPI and TPR access reports.
 *
 * The checks are deliberately re-evaluated one after another, since the
 * helpers called along the way may themselves change the request bits.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);

    /* INIT is not processed while the CPU is in SMM. */
    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* A deliverable hard interrupt or an NMI wakes up a halted CPU. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        /* env->eip must be up to date for the report below. */
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }

    return;
}
/*
 * Runs the virtual processor until an exit has to be handled by the caller.
 *
 * Must be called with the iothread lock held (asserted below). The lock is
 * dropped while the virtual processor executes and is re-acquired before
 * the function returns from the run loop.
 *
 * If the VCPU is about to execute an instruction that is currently
 * overwritten by an active breakpoint, the breakpoint is temporarily
 * removed and the instruction is single-stepped inside an exclusive
 * section (all other CPUs suspended).
 *
 * The exit loop keeps re-entering the VCPU as long as the handlers leave
 * ret at 0; a positive ret leaves the loop normally, a negative one marks
 * a fatal error.
 *
 * Returns nonzero on a fatal error, 0 otherwise. The reason for a non-fatal
 * return is reported through cpu->exception_index.
 */
static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    /*
     * Must start out as 0 ("keep running"): several exit handlers below
     * (APIC EOI, INIT/SIPI trap, unexpected exit) do not assign it, and the
     * loop condition would otherwise read an indeterminate value.
     */
    int ret = 0;

    g_assert(qemu_mutex_iothread_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            /* Nonzero is reported as fatal; whpx_vcpu_exec() aborts on it. */
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0)
    {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            /*
             * Balance the running_cpus increment above. Without this the
             * counter never drops back to zero, and the first/last VCPU
             * hooks that insert and remove breakpoints stop being invoked.
             */
            if (--whpx->running_cpus == 0) {
                whpx_last_vcpu_stopping(cpu);
            }
            return 0;
        }
    }

    qemu_mutex_unlock_iothread();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            /*
             * NOTE(review): this returns with the exclusive section still
             * open and the iothread lock released. It is reported as fatal
             * and whpx_vcpu_exec() aborts on it -- confirm there is no
             * other caller.
             */
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        /* Push register changes made by QEMU down to the hypervisor. */
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
                break;
            }

            if (delivery_mode == APIC_DM_INIT) {
                ipi.Type = WHvX64InterruptTypeInit;
            } else {
                ipi.Type = WHvX64InterruptTypeSipi;
            }

            ipi.DestinationMode =
                ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
                    WHvX64InterruptDestinationModeLogical :
                    WHvX64InterruptDestinationModePhysical;

            ipi.TriggerMode =
                ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
                    WHvX64InterruptTriggerModeLevel :
                    WHvX64InterruptTriggerModeEdge;

            ipi.Vector = icr & APIC_VECTOR_MASK;
            switch (dest_shorthand) {
            /* no shorthand. Bits 56-63 contain the destination. */
            case 0:
                ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report("WHPX: Failed to request interrupt  hr=%08lx",
                        hr);
                }

                break;

            /* self */
            case 1:
                include_self = true;
                break;

            /* broadcast, including self */
            case 2:
                broadcast = true;
                include_self = true;
                break;

            /* broadcast, excluding self */
            case 3:
                broadcast = true;
                break;
            }

            if (!broadcast && !include_self) {
                break;
            }

            for (i = 0; i <= max_vcpu_index; i++) {
                if (i == cpu->cpu_index && !include_self) {
                    continue;
                }

                /*
                 * Assuming that APIC Ids are identity mapped since
                 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
                 * are not handled yet and the hypervisor doesn't allow the
                 * guest to modify the APIC ID.
                 */
                ipi.Destination = i;
                hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
                        &ipi, sizeof(ipi));
                if (FAILED(hr)) {
                    error_report(
                        "WHPX: Failed to request SIPI for %d,  hr=%08lx",
                        i, hr);
                }
            }

            break;
        }

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping
                 * over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            /* Skip over the MSR instruction. */
            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            /*
             * For all unsupported MSR access we:
             *     ignore writes
             *     return 0 on read.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state "
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonX64Cpuid: {
            WHV_REGISTER_VALUE reg_values[5];
            WHV_REGISTER_NAME reg_names[5];
            UINT32 reg_count = 5;
            UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
            X86CPU *x86_cpu = X86_CPU(cpu);
            CPUX86State *env = &x86_cpu->env;

            memset(reg_values, 0, sizeof(reg_values));

            /* Skip over the CPUID instruction. */
            rip = vcpu->exit_ctx.VpContext.Rip +
                  vcpu->exit_ctx.VpContext.InstructionLength;
            cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;

            /*
             * Ideally, these should be supplied to the hypervisor during VCPU
             * initialization and it should be able to satisfy this request.
             * But, currently, WHPX doesn't support setting CPUID values in the
             * hypervisor once the partition has been setup, which is too late
             * since VCPUs are realized later. For now, use the values from
             * QEMU to satisfy these requests, until WHPX adds support for
             * being able to set these values in the hypervisor at runtime.
             */
            cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
                (UINT32 *)&rcx, (UINT32 *)&rdx);
            switch (cpuid_fn) {
            case 0x40000000:
                /* Expose the vmware cpu frequency cpuid leaf */
                rax = 0x40000010;
                rbx = rcx = rdx = 0;
                break;

            case 0x40000010:
                rax = env->tsc_khz;
                rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
                rcx = rdx = 0;
                break;

            case 0x80000001:
                /* Remove any support of OSVW */
                rcx &= ~CPUID_EXT3_OSVW;
                break;
            }

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRcx;
            reg_names[3] = WHvX64RegisterRdx;
            reg_names[4] = WHvX64RegisterRbx;

            reg_values[0].Reg64 = rip;
            reg_values[1].Reg64 = rax;
            reg_values[2].Reg64 = rcx;
            reg_values[3].Reg64 = rdx;
            reg_values[4].Reg64 = rbx;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition, cpu->cpu_index,
                reg_names,
                reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set CpuidAccess state registers,"
                             " hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * Just finished stepping over a breakpoint, but the
                 * gdb does not expect us to do single-stepping.
                 * Don't do anything special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu);
            qemu_mutex_lock_iothread();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            qemu_mutex_unlock_iothread();
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
            stepped_over_bp->address,
            (void *)&whpx_breakpoint_instruction,
            1,
            true);
    }

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    qemu_mutex_lock_iothread();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    qatomic_set(&cpu->exit_request, false);

    /* Only a negative ret is fatal; positive values just end the loop. */
    return ret < 0;
}
/*
 * run_on_cpu() worker: fetches the register state from WHPX into QEMU
 * unless QEMU's copy is already the authoritative (dirty) one, and marks
 * it dirty afterwards. @arg is unused.
 */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    whpx_get_registers(cpu);
    cpu->vcpu_dirty = true;
}
/*
 * run_on_cpu() worker: writes QEMU's register state to WHPX at the
 * WHPX_SET_RESET_STATE level and marks QEMU's copy as clean.
 * @arg is unused.
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->vcpu_dirty = false;
}
/*
 * run_on_cpu() worker: writes QEMU's register state to WHPX at the
 * WHPX_SET_FULL_STATE level and marks QEMU's copy as clean.
 * @arg is unused.
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->vcpu_dirty = false;
}
/*
 * run_on_cpu() worker: marks QEMU's register copy as dirty, so that it is
 * written to WHPX before the VCPU runs again (see whpx_vcpu_run()).
 * @arg is unused.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
/*
* CPU support.
*/
/*
 * Makes sure QEMU's copy of the VCPU registers is up to date. Nothing is
 * done if that copy is already marked dirty; otherwise the registers are
 * fetched on the VCPU's own thread via run_on_cpu().
 */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
/* Pushes the reset-level register state to WHPX via run_on_cpu(). */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
/* Pushes the full register state to WHPX via run_on_cpu(). */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
/* Marks QEMU's register copy as dirty via run_on_cpu(). */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
/*
 * Records whether a single-step is pending. whpx_first_vcpu_starting()
 * reads this flag to decide whether debug exceptions must be intercepted.
 */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
/*
* Vcpu support.
*/
/* Migration blocker, created and registered by whpx_init_vcpu(). */
static Error *whpx_migration_blocker;
/*
 * VM change state handler (registered in whpx_init_vcpu() with the CPU's
 * env as @opaque): clears env->tsc_valid whenever the VM starts running.
 * @state is unused.
 */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (!running) {
        return;
    }

    env->tsc_valid = false;
}
/*
 * Creates the WHPX side of a VCPU: registers the migration blocker (once),
 * allocates the per-VCPU context, creates the instruction emulator and the
 * virtual processor, and records the TSC / APIC bus frequencies in env.
 *
 * Returns 0 on success or a negative errno value on failure. On failure,
 * everything created by this call for the VCPU is released again.
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    struct whpx_vcpu *vcpu = NULL;
    Error *local_error = NULL;
    CPUX86State *env = cpu->env_ptr;
    X86CPU *x86_cpu = X86_CPU(cpu);
    UINT64 freq = 0;
    /* Track what has to be torn down on the error path. */
    bool emulator_created = false;
    bool vp_created = false;
    int ret;

    /*
     * Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support, "
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            error_free(whpx_migration_blocker);
            /*
             * Don't keep a dangling pointer around: a later call would
             * otherwise see a non-NULL blocker and skip the registration.
             */
            whpx_migration_blocker = NULL;
            ret = -EINVAL;
            goto error;
        }
    }

    /* g_new0() aborts on allocation failure, so no NULL check is needed. */
    vcpu = g_new0(struct whpx_vcpu, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }
    emulator_created = true;

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }
    vp_created = true;

    /*
     * vcpu's TSC frequency is either specified by user, or use the value
     * provided by Hyper-V if the former is not present. In the latter case, we
     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                NULL);
        /* An unknown capability is silently tolerated. */
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                warn_report("WHPX: Failed to query tsc frequency, hr=0x%08lx",
                            hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    /* Default APIC bus frequency, overridden below if WHPX reports one. */
    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to query apic bus frequency hr=0x%08lx",
                        hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding cpuid's.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeCpuidExitList,
                cpuidExitList,
                RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                        hr);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu->interruptable = true;
    cpu->vcpu_dirty = true;
    cpu->hax_vcpu = (struct hax_vcpu_state *)vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, cpu->env_ptr);

    return 0;

error:
    /* Undo in reverse order of creation. */
    if (vp_created) {
        whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition,
                                               cpu->cpu_index);
    }
    if (emulator_created) {
        whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    }
    g_free(vcpu);

    return ret;
}
/*
 * Keeps running the VCPU until an exception index of at least
 * EXCP_INTERRUPT has been recorded, then consumes it (resetting
 * cpu->exception_index to -1) and returns it.
 *
 * A fatal error reported by whpx_vcpu_run() aborts the process.
 */
int whpx_vcpu_exec(CPUState *cpu)
{
    int ret;

    while (cpu->exception_index < EXCP_INTERRUPT) {
        if (whpx_vcpu_run(cpu)) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }

    ret = cpu->exception_index;
    cpu->exception_index = -1;

    return ret;
}
/*
 * Tears down the WHPX side of a VCPU: deletes the virtual processor,
 * destroys its instruction emulator and frees the per-VCPU context.
 */
void whpx_destroy_vcpu(CPUState *cpu)
{
    struct whpx_vcpu *vcpu = get_whpx_vcpu(cpu);

    whp_dispatch.WHvDeleteVirtualProcessor(whpx_global.partition,
                                           cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(cpu->hax_vcpu);
}
/* Asks WHPX to cancel the current (or next) run of the given VCPU. */
void whpx_vcpu_kick(CPUState *cpu)
{
    whp_dispatch.WHvCancelRunVirtualProcessor(whpx_global.partition,
                                              cpu->cpu_index,
                                              0);
}
/*
* Memory support.
*/
/*
 * Maps (add != 0) or unmaps (add == 0) a guest physical address range in
 * the WHPX partition.
 *
 * @start_pa: guest physical start address
 * @size:     length of the range in bytes
 * @host_va:  host virtual address backing the range (used when mapping)
 * @add:      nonzero to map, zero to unmap
 * @rom:      nonzero to map without write permission
 * @name:     region name, used for diagnostics only
 *
 * Failures are reported but not propagated.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    HRESULT hr;

    if (!add) {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx_global.partition,
                                           start_pa,
                                           size);
    } else {
        /* Always readable and executable; writable unless it is ROM. */
        hr = whp_dispatch.WHvMapGpaRange(whpx_global.partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    }

    if (SUCCEEDED(hr)) {
        return;
    }

    error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                 " Host:%p, hr=%08lx",
                 (add ? "MAP" : "UNMAP"), name,
                 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
}
/*
 * Maps or unmaps the RAM backing a memory region section.
 *
 * Only whole host pages can be handed to WHPX, so the section is first
 * shrunk to the host-page-aligned range it fully covers. Sections that
 * are not RAM, or that do not cover a whole host page, are ignored.
 *
 * @add: nonzero to map the section, zero to unmap it.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *region = section->mr;
    hwaddr gpa = section->offset_within_address_space;
    ram_addr_t length = int128_get64(section->size);
    unsigned int head_skip;
    uint64_t host_addr;

    if (!memory_region_is_ram(region)) {
        return;
    }

    /* Bytes to skip so that the start lands on a host page boundary. */
    head_skip = qemu_real_host_page_size() -
                (gpa & ~qemu_real_host_page_mask());
    head_skip &= ~qemu_real_host_page_mask();
    if (head_skip > length) {
        return;
    }

    gpa += head_skip;
    /* Drop the skipped head, then truncate to a whole number of pages. */
    length = (length - head_skip) & qemu_real_host_page_mask();
    if (length == 0 || (gpa & ~qemu_real_host_page_mask()) != 0) {
        return;
    }

    host_addr = (uintptr_t)memory_region_get_ram_ptr(region)
                + section->offset_within_region + head_skip;

    whpx_update_mapping(gpa, length, (void *)(uintptr_t)host_addr, add,
                        memory_region_is_rom(region), region->name);
}
/*
 * MemoryListener callback: takes a reference on the region and maps the
 * section into the partition. @listener is unused.
 */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
/*
 * MemoryListener callback: unmaps the section from the partition and drops
 * the reference taken in whpx_region_add(). @listener is unused.
 */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
/* MemoryListener callback: intentionally a no-op. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
/* MemoryListener callback: intentionally a no-op. */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
/*
 * MemoryListener callback: no fine-grained dirty information is consulted
 * here, so a RAM region is marked dirty from offset 0 for the length of
 * the section. Non-RAM regions are ignored. @listener is unused.
 */
static void whpx_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    MemoryRegion *region = section->mr;

    if (memory_region_is_ram(region)) {
        memory_region_set_dirty(region, 0, int128_get64(section->size));
    }
}
/*
 * Listener that mirrors guest RAM layout changes into the WHPX partition.
 * Registered on address_space_memory by whpx_memory_init().
 */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = 10,
};
/* Registers the WHPX memory listener on the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
/*
 * Load the functions from the given library, using the given handle. If a
 * handle is provided, it is used, otherwise the library is opened. The
 * handle will be updated on return with the opened one.
 *
 * @handle:        in/out library handle; NULL on input means "load it"
 * @function_list: which set of entry points to resolve into whp_dispatch
 *
 * Returns true on success. On failure the library is released, *handle is
 * reset to NULL and false is returned.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"

    /* Resolves one entry point; a missing symbol is left as NULL. */
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name);

    /* Resolves one entry point; a missing symbol fails the whole load. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        }

    /* Opens the library unless the caller already supplied a handle. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
        if (!handle_lib) { \
            handle_lib = LoadLibrary(lib_name); \
            if (!handle_lib) { \
                error_report("Could not load library %s.", lib_name); \
                goto error; \
            } \
        }

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    if (hLib) {
        FreeLibrary(hLib);
        /* The library is gone; don't leave the caller with a stale handle. */
        *handle = NULL;
    }

    return false;
}
/*
 * Property setter for the accelerator's "kernel-irqchip" option.
 *
 * Parses an on/off/split value from the visitor @v and records the result
 * in whpx_global:
 *   on    - the in-platform irqchip is both allowed and required
 *   off   - the in-platform irqchip is neither allowed nor required
 *   split - rejected with an error; the stored settings are left untouched
 *
 * Errors (a parse failure or the unsupported "split" mode) are returned
 * through @errp. @obj and @opaque are unused.
 */
static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    /*
     * Required because error_append_hint() is used below: without the guard
     * the hint would never be attached when errp is &error_fatal.
     */
    ERRP_GUARD();
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    switch (mode) {
    case ON_OFF_SPLIT_ON:
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
        break;

    case ON_OFF_SPLIT_OFF:
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
        break;

    case ON_OFF_SPLIT_SPLIT:
        error_setg(errp, "WHPX: split irqchip currently not supported");
        /* Hints are printed verbatim, so they must end with a newline. */
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off\n");
        break;

    default:
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}
/*
* Partition support
*/
/*
 * Accelerator init hook: create and configure the WHPX partition for @ms.
 *
 * Steps, in order:
 *   1. load the WHP entry points (init_whp_dispatch());
 *   2. check that a hypervisor is present and query its feature set;
 *   3. create the partition and probe its XSAVE capability (non-fatal);
 *   4. set the processor count from ms->smp.cpus;
 *   5. optionally switch the partition to in-platform local APIC emulation,
 *      depending on the kernel-irqchip settings and platform support;
 *   6. enable the MSR, CPUID and exception exits and the CPUID exit list;
 *   7. clear the exception exit bitmap, set up the partition and register
 *      the memory listener.
 *
 * Returns 0 on success or a negative errno value on failure. On failure any
 * partition that was created is deleted again.
 */
static int whpx_accel_init(MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    UINT32 cpuidExitList[] = {1, 0x80000001};
    WHV_CAPABILITY_FEATURES features = {0};

    whpx = &whpx_global;

    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows versions which don't support this property will return with
     * the specific error code.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        printf("WHPX: Partition is not XSAVE capable\n");
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        /* Report the value that was actually sent, not the core count. */
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support apic emulation and user is requiring
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
            "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeXApic;
        printf("WHPX: setting APIC emulation mode in the hypervisor\n");
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            /* Only fatal if the user insisted on the in-platform irqchip. */
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx->apic_in_platform = true;
        }
    }

    /* Register for MSR and CPUID exits */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.X64CpuidExit = 1;
    prop.ExtendedVmExits.ExceptionExit = 1;
    if (whpx_apic_in_platform()) {
        prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExtendedVmExits,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeCpuidExitList,
        cpuidExitList,
        RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                     hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();

    printf("Windows Hypervisor Platform accelerator is operational\n");
    return 0;

error:

    /* Tear down the partition if it was created before the failure. */
    if (NULL != whpx->partition) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}
/* Report the current value of the whpx_allowed flag. */
int whpx_enabled(void)
{
    int enabled = whpx_allowed;

    return enabled;
}
bool whpx_apic_in_platform(void) {
return whpx_global.apic_in_platform;
}
/*
 * Class initializer for the WHPX accelerator type: fills in the AccelClass
 * fields (name, machine init hook, "allowed" flag) and registers the
 * write-only "kernel-irqchip" class property. @data is unused.
 */
static void whpx_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *accel_class = ACCEL_CLASS(oc);

    accel_class->allowed = &whpx_allowed;
    accel_class->init_machine = whpx_accel_init;
    accel_class->name = "WHPX";

    /* Only a setter is supplied; no getter, release hook or opaque data. */
    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
                              NULL, whpx_set_kernel_irqchip, NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
                                          "Configure WHPX in-kernel irqchip");
}
/*
 * Instance initializer: reset the global WHPX state to all zeroes, then
 * allow the kernel irqchip by default. @obj is unused.
 */
static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *state = &whpx_global;

    memset(state, 0, sizeof(*state));

    /* kernel-irqchip defaults to allowed (but not required). */
    state->kernel_irqchip_allowed = true;
}
/* Type description of the "whpx" accelerator, derived from TYPE_ACCEL. */
static const TypeInfo whpx_accel_type = {
    .parent = TYPE_ACCEL,
    .name = ACCEL_CLASS_NAME("whpx"),
    .class_init = whpx_accel_class_init,
    .instance_init = whpx_accel_instance_init,
};
static void whpx_type_init(void)
{
type_register_static(&whpx_accel_type);
}
/*
 * Populate whp_dispatch with the WHP entry points, loading the platform and
 * emulation DLLs on first use.
 *
 * The mandatory platform and emulation functions must all resolve; the
 * supplemental platform functions are optional and are loaded last, reusing
 * the platform module handle.
 *
 * Returns true once the dispatch table is ready (also on repeated calls).
 * Returns false if a mandatory library or function is missing; the module
 * handles are then released and reset so a later call starts from scratch.
 */
bool init_whp_dispatch(void)
{
    bool supplemental_loaded;

    if (whp_dispatch_initialized) {
        return true;
    }

    if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
        goto error;
    }

    if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
        goto error;
    }

    /*
     * Keep the call outside of assert() so the load does not depend on how
     * assertions are compiled. It is not expected to fail: the library is
     * already open and every supplemental function is optional.
     */
    supplemental_loaded = load_whp_dispatch_fns(&hWinHvPlatform,
                                                WINHV_PLATFORM_FNS_SUPPLEMENTAL);
    assert(supplemental_loaded);
    whp_dispatch_initialized = true;

    return true;

error:
    /* Reset each handle after freeing it so it cannot be reused or re-freed. */
    if (hWinHvPlatform) {
        FreeLibrary(hWinHvPlatform);
        hWinHvPlatform = NULL;
    }

    if (hWinHvEmulation) {
        FreeLibrary(hWinHvEmulation);
        hWinHvEmulation = NULL;
    }

    return false;
}
/*
 * Hand whpx_type_init() to type_init() so that the WHPX accelerator type
 * gets registered (presumably during QEMU's module initialization - confirm
 * against the type_init() definition).
 */
type_init(whpx_type_init);