qemu/accel/hvf/hvf-accel-ops.c
Danny Canter d54ffa54fb hvf: arm: Implement and use hvf_get_physical_address_range
This patch's main focus is to use the previously added
hvf_get_physical_address_range to inform VM creation
about the IPA size we need for the VM, so we can extend
the default 36b IPA size and support VMs with 64+GB of
RAM. This is done by freezing the memory map, computing
the highest GPA and then (depending on if the platform
supports an IPA size that large) telling the kernel to
use a size >= for the VM. In pursuit of this a couple of
things related to how we handle the physical address range
we expose to guests were altered, but for an explanation of
what we were doing:

Today, to get the IPA size we were reading id_aa64mmfr0_el1's
PARange field from a newly made vcpu. Unfortunately, HVF just
returns the hosts PARange directly for the initial value and
not the IPA size that will actually back the VM, so we believe
we have much more address space than we actually do today it seems.

Starting in macOS 13.0 some APIs were introduced to be able to
query the maximum IPA size the kernel supports, and to set the IPA
size for a given VM. However, this still has a couple of issues
on < macOS 15. Up until macOS 15 (and if the hardware supported
it) the max IPA size was 39 bits which is not a valid PARange
value, so we can't clamp down what we advertise in the vcpu's
id_aa64mmfr0_el1 to our IPA size. Starting in macOS 15 however,
the maximum IPA size is 40 bits (if it's supported in the hardware
as well) which is also a valid PARange value so we can set our IPA
size to the maximum as well as clamp down the PARange we advertise
to the guest. This allows VMs with 64+ GB of RAM and should fix the
oddness of the PARange situation as well.

Signed-off-by: Danny Canter <danny_canter@apple.com>
Message-id: 20240828111552.93482-4-danny_canter@apple.com
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
2024-09-13 15:31:47 +01:00

610 lines
16 KiB
C

/*
* Copyright 2008 IBM Corporation
* 2008 Red Hat, Inc.
* Copyright 2011 Intel Corporation
* Copyright 2016 Veertu, Inc.
* Copyright 2017 The Android Open Source Project
*
* QEMU Hypervisor.framework support
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
* This file contain code under public domain from the hvdos project:
* https://github.com/mist64/hvdos
*
* Parts Copyright (c) 2011 NetApp, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "exec/address-spaces.h"
#include "exec/exec-all.h"
#include "gdbstub/enums.h"
#include "hw/boards.h"
#include "sysemu/cpus.h"
#include "sysemu/hvf.h"
#include "sysemu/hvf_int.h"
#include "sysemu/runstate.h"
#include "qemu/guest-random.h"
HVFState *hvf_state;
/* Memory slots */
hvf_slot *hvf_find_overlap_slot(uint64_t start, uint64_t size)
{
hvf_slot *slot;
int x;
for (x = 0; x < hvf_state->num_slots; ++x) {
slot = &hvf_state->slots[x];
if (slot->size && start < (slot->start + slot->size) &&
(start + size) > slot->start) {
return slot;
}
}
return NULL;
}
struct mac_slot {
int present;
uint64_t size;
uint64_t gpa_start;
uint64_t gva;
};
struct mac_slot mac_slots[32];
static int do_hvf_set_memory(hvf_slot *slot, hv_memory_flags_t flags)
{
struct mac_slot *macslot;
hv_return_t ret;
macslot = &mac_slots[slot->slot_id];
if (macslot->present) {
if (macslot->size != slot->size) {
macslot->present = 0;
ret = hv_vm_unmap(macslot->gpa_start, macslot->size);
assert_hvf_ok(ret);
}
}
if (!slot->size) {
return 0;
}
macslot->present = 1;
macslot->gpa_start = slot->start;
macslot->size = slot->size;
ret = hv_vm_map(slot->mem, slot->start, slot->size, flags);
assert_hvf_ok(ret);
return 0;
}
static void hvf_set_phys_mem(MemoryRegionSection *section, bool add)
{
hvf_slot *mem;
MemoryRegion *area = section->mr;
bool writable = !area->readonly && !area->rom_device;
hv_memory_flags_t flags;
uint64_t page_size = qemu_real_host_page_size();
if (!memory_region_is_ram(area)) {
if (writable) {
return;
} else if (!memory_region_is_romd(area)) {
/*
* If the memory device is not in romd_mode, then we actually want
* to remove the hvf memory slot so all accesses will trap.
*/
add = false;
}
}
if (!QEMU_IS_ALIGNED(int128_get64(section->size), page_size) ||
!QEMU_IS_ALIGNED(section->offset_within_address_space, page_size)) {
/* Not page aligned, so we can not map as RAM */
add = false;
}
mem = hvf_find_overlap_slot(
section->offset_within_address_space,
int128_get64(section->size));
if (mem && add) {
if (mem->size == int128_get64(section->size) &&
mem->start == section->offset_within_address_space &&
mem->mem == (memory_region_get_ram_ptr(area) +
section->offset_within_region)) {
return; /* Same region was attempted to register, go away. */
}
}
/* Region needs to be reset. set the size to 0 and remap it. */
if (mem) {
mem->size = 0;
if (do_hvf_set_memory(mem, 0)) {
error_report("Failed to reset overlapping slot");
abort();
}
}
if (!add) {
return;
}
if (area->readonly ||
(!memory_region_is_ram(area) && memory_region_is_romd(area))) {
flags = HV_MEMORY_READ | HV_MEMORY_EXEC;
} else {
flags = HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC;
}
/* Now make a new slot. */
int x;
for (x = 0; x < hvf_state->num_slots; ++x) {
mem = &hvf_state->slots[x];
if (!mem->size) {
break;
}
}
if (x == hvf_state->num_slots) {
error_report("No free slots");
abort();
}
mem->size = int128_get64(section->size);
mem->mem = memory_region_get_ram_ptr(area) + section->offset_within_region;
mem->start = section->offset_within_address_space;
mem->region = area;
if (do_hvf_set_memory(mem, flags)) {
error_report("Error registering new memory slot");
abort();
}
}
static void do_hvf_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
if (!cpu->accel->dirty) {
hvf_get_registers(cpu);
cpu->accel->dirty = true;
}
}
static void hvf_cpu_synchronize_state(CPUState *cpu)
{
if (!cpu->accel->dirty) {
run_on_cpu(cpu, do_hvf_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
}
static void do_hvf_cpu_synchronize_set_dirty(CPUState *cpu,
run_on_cpu_data arg)
{
/* QEMU state is the reference, push it to HVF now and on next entry */
cpu->accel->dirty = true;
}
static void hvf_cpu_synchronize_post_reset(CPUState *cpu)
{
run_on_cpu(cpu, do_hvf_cpu_synchronize_set_dirty, RUN_ON_CPU_NULL);
}
static void hvf_cpu_synchronize_post_init(CPUState *cpu)
{
run_on_cpu(cpu, do_hvf_cpu_synchronize_set_dirty, RUN_ON_CPU_NULL);
}
static void hvf_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
run_on_cpu(cpu, do_hvf_cpu_synchronize_set_dirty, RUN_ON_CPU_NULL);
}
static void hvf_set_dirty_tracking(MemoryRegionSection *section, bool on)
{
hvf_slot *slot;
slot = hvf_find_overlap_slot(
section->offset_within_address_space,
int128_get64(section->size));
/* protect region against writes; begin tracking it */
if (on) {
slot->flags |= HVF_SLOT_LOG;
hv_vm_protect((uintptr_t)slot->start, (size_t)slot->size,
HV_MEMORY_READ | HV_MEMORY_EXEC);
/* stop tracking region*/
} else {
slot->flags &= ~HVF_SLOT_LOG;
hv_vm_protect((uintptr_t)slot->start, (size_t)slot->size,
HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC);
}
}
static void hvf_log_start(MemoryListener *listener,
MemoryRegionSection *section, int old, int new)
{
if (old != 0) {
return;
}
hvf_set_dirty_tracking(section, 1);
}
static void hvf_log_stop(MemoryListener *listener,
MemoryRegionSection *section, int old, int new)
{
if (new != 0) {
return;
}
hvf_set_dirty_tracking(section, 0);
}
static void hvf_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
/*
* sync of dirty pages is handled elsewhere; just make sure we keep
* tracking the region.
*/
hvf_set_dirty_tracking(section, 1);
}
static void hvf_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
hvf_set_phys_mem(section, true);
}
static void hvf_region_del(MemoryListener *listener,
MemoryRegionSection *section)
{
hvf_set_phys_mem(section, false);
}
static MemoryListener hvf_memory_listener = {
.name = "hvf",
.priority = MEMORY_LISTENER_PRIORITY_ACCEL,
.region_add = hvf_region_add,
.region_del = hvf_region_del,
.log_start = hvf_log_start,
.log_stop = hvf_log_stop,
.log_sync = hvf_log_sync,
};
static void dummy_signal(int sig)
{
}
bool hvf_allowed;
static int hvf_accel_init(MachineState *ms)
{
int x;
hv_return_t ret;
HVFState *s;
int pa_range = 36;
MachineClass *mc = MACHINE_GET_CLASS(ms);
if (mc->hvf_get_physical_address_range) {
pa_range = mc->hvf_get_physical_address_range(ms);
if (pa_range < 0) {
return -EINVAL;
}
}
ret = hvf_arch_vm_create(ms, (uint32_t)pa_range);
assert_hvf_ok(ret);
s = g_new0(HVFState, 1);
s->num_slots = ARRAY_SIZE(s->slots);
for (x = 0; x < s->num_slots; ++x) {
s->slots[x].size = 0;
s->slots[x].slot_id = x;
}
QTAILQ_INIT(&s->hvf_sw_breakpoints);
hvf_state = s;
memory_listener_register(&hvf_memory_listener, &address_space_memory);
return hvf_arch_init();
}
static inline int hvf_gdbstub_sstep_flags(void)
{
return SSTEP_ENABLE | SSTEP_NOIRQ;
}
static void hvf_accel_class_init(ObjectClass *oc, void *data)
{
AccelClass *ac = ACCEL_CLASS(oc);
ac->name = "HVF";
ac->init_machine = hvf_accel_init;
ac->allowed = &hvf_allowed;
ac->gdbstub_supported_sstep_flags = hvf_gdbstub_sstep_flags;
}
static const TypeInfo hvf_accel_type = {
.name = TYPE_HVF_ACCEL,
.parent = TYPE_ACCEL,
.class_init = hvf_accel_class_init,
};
static void hvf_type_init(void)
{
type_register_static(&hvf_accel_type);
}
type_init(hvf_type_init);
static void hvf_vcpu_destroy(CPUState *cpu)
{
hv_return_t ret = hv_vcpu_destroy(cpu->accel->fd);
assert_hvf_ok(ret);
hvf_arch_vcpu_destroy(cpu);
g_free(cpu->accel);
cpu->accel = NULL;
}
static int hvf_init_vcpu(CPUState *cpu)
{
int r;
cpu->accel = g_new0(AccelCPUState, 1);
/* init cpu signals */
struct sigaction sigact;
memset(&sigact, 0, sizeof(sigact));
sigact.sa_handler = dummy_signal;
sigaction(SIG_IPI, &sigact, NULL);
pthread_sigmask(SIG_BLOCK, NULL, &cpu->accel->unblock_ipi_mask);
sigdelset(&cpu->accel->unblock_ipi_mask, SIG_IPI);
#ifdef __aarch64__
r = hv_vcpu_create(&cpu->accel->fd,
(hv_vcpu_exit_t **)&cpu->accel->exit, NULL);
#else
r = hv_vcpu_create(&cpu->accel->fd, HV_VCPU_DEFAULT);
#endif
cpu->accel->dirty = true;
assert_hvf_ok(r);
cpu->accel->guest_debug_enabled = false;
return hvf_arch_init_vcpu(cpu);
}
/*
* The HVF-specific vCPU thread function. This one should only run when the host
* CPU supports the VMX "unrestricted guest" feature.
*/
static void *hvf_cpu_thread_fn(void *arg)
{
CPUState *cpu = arg;
int r;
assert(hvf_enabled());
rcu_register_thread();
bql_lock();
qemu_thread_get_self(cpu->thread);
cpu->thread_id = qemu_get_thread_id();
current_cpu = cpu;
hvf_init_vcpu(cpu);
/* signal CPU creation */
cpu_thread_signal_created(cpu);
qemu_guest_random_seed_thread_part2(cpu->random_seed);
do {
if (cpu_can_run(cpu)) {
r = hvf_vcpu_exec(cpu);
if (r == EXCP_DEBUG) {
cpu_handle_guest_debug(cpu);
}
}
qemu_wait_io_event(cpu);
} while (!cpu->unplug || cpu_can_run(cpu));
hvf_vcpu_destroy(cpu);
cpu_thread_signal_destroyed(cpu);
bql_unlock();
rcu_unregister_thread();
return NULL;
}
static void hvf_start_vcpu_thread(CPUState *cpu)
{
char thread_name[VCPU_THREAD_NAME_SIZE];
/*
* HVF currently does not support TCG, and only runs in
* unrestricted-guest mode.
*/
assert(hvf_enabled());
snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
cpu->cpu_index);
qemu_thread_create(cpu->thread, thread_name, hvf_cpu_thread_fn,
cpu, QEMU_THREAD_JOINABLE);
}
static int hvf_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
{
struct hvf_sw_breakpoint *bp;
int err;
if (type == GDB_BREAKPOINT_SW) {
bp = hvf_find_sw_breakpoint(cpu, addr);
if (bp) {
bp->use_count++;
return 0;
}
bp = g_new(struct hvf_sw_breakpoint, 1);
bp->pc = addr;
bp->use_count = 1;
err = hvf_arch_insert_sw_breakpoint(cpu, bp);
if (err) {
g_free(bp);
return err;
}
QTAILQ_INSERT_HEAD(&hvf_state->hvf_sw_breakpoints, bp, entry);
} else {
err = hvf_arch_insert_hw_breakpoint(addr, len, type);
if (err) {
return err;
}
}
CPU_FOREACH(cpu) {
err = hvf_update_guest_debug(cpu);
if (err) {
return err;
}
}
return 0;
}
static int hvf_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
{
struct hvf_sw_breakpoint *bp;
int err;
if (type == GDB_BREAKPOINT_SW) {
bp = hvf_find_sw_breakpoint(cpu, addr);
if (!bp) {
return -ENOENT;
}
if (bp->use_count > 1) {
bp->use_count--;
return 0;
}
err = hvf_arch_remove_sw_breakpoint(cpu, bp);
if (err) {
return err;
}
QTAILQ_REMOVE(&hvf_state->hvf_sw_breakpoints, bp, entry);
g_free(bp);
} else {
err = hvf_arch_remove_hw_breakpoint(addr, len, type);
if (err) {
return err;
}
}
CPU_FOREACH(cpu) {
err = hvf_update_guest_debug(cpu);
if (err) {
return err;
}
}
return 0;
}
static void hvf_remove_all_breakpoints(CPUState *cpu)
{
struct hvf_sw_breakpoint *bp, *next;
CPUState *tmpcpu;
QTAILQ_FOREACH_SAFE(bp, &hvf_state->hvf_sw_breakpoints, entry, next) {
if (hvf_arch_remove_sw_breakpoint(cpu, bp) != 0) {
/* Try harder to find a CPU that currently sees the breakpoint. */
CPU_FOREACH(tmpcpu)
{
if (hvf_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
break;
}
}
}
QTAILQ_REMOVE(&hvf_state->hvf_sw_breakpoints, bp, entry);
g_free(bp);
}
hvf_arch_remove_all_hw_breakpoints();
CPU_FOREACH(cpu) {
hvf_update_guest_debug(cpu);
}
}
static void hvf_accel_ops_class_init(ObjectClass *oc, void *data)
{
AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
ops->create_vcpu_thread = hvf_start_vcpu_thread;
ops->kick_vcpu_thread = hvf_kick_vcpu_thread;
ops->synchronize_post_reset = hvf_cpu_synchronize_post_reset;
ops->synchronize_post_init = hvf_cpu_synchronize_post_init;
ops->synchronize_state = hvf_cpu_synchronize_state;
ops->synchronize_pre_loadvm = hvf_cpu_synchronize_pre_loadvm;
ops->insert_breakpoint = hvf_insert_breakpoint;
ops->remove_breakpoint = hvf_remove_breakpoint;
ops->remove_all_breakpoints = hvf_remove_all_breakpoints;
ops->update_guest_debug = hvf_update_guest_debug;
ops->supports_guest_debug = hvf_arch_supports_guest_debug;
};
static const TypeInfo hvf_accel_ops_type = {
.name = ACCEL_OPS_NAME("hvf"),
.parent = TYPE_ACCEL_OPS,
.class_init = hvf_accel_ops_class_init,
.abstract = true,
};
static void hvf_accel_ops_register_types(void)
{
type_register_static(&hvf_accel_ops_type);
}
type_init(hvf_accel_ops_register_types);