0418f90809
Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL interface (Running Average Power Limit) for advertising the accumulated energy consumption of various power domains (e.g. CPU packages, DRAM, etc.). The consumption is reported via MSRs (model specific registers) like MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are 64 bits registers that represent the accumulated energy consumption in micro Joules. They are updated by microcode every ~1ms. For now, KVM always returns 0 when the guest requests the value of these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle these MSRs dynamically in userspace. To limit the amount of system calls for every MSR call, create a new thread in QEMU that updates the "virtual" MSR values asynchronously. Each vCPU has its own vMSR to reflect the independence of vCPUs. The thread updates the vMSR values with the ratio of energy consumed of the whole physical CPU package the vCPU thread runs on and the thread's utime and stime values. All other non-vCPU threads are also taken into account. Their energy consumption is evenly distributed among all vCPUs threads running on the same physical CPU package. To overcome the problem that reading the RAPL MSR requires priviliged access, a socket communication between QEMU and the qemu-vmsr-helper is mandatory. You can specified the socket path in the parameter. This feature is activated with -accel kvm,rapl=true,path=/path/sock.sock Actual limitation: - Works only on Intel host CPU because AMD CPUs are using different MSR adresses. - Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at the moment. Signed-off-by: Anthony Harivel <aharivel@redhat.com> Link: https://lore.kernel.org/r/20240522153453.1230389-4-aharivel@redhat.com Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
178 lines
4.8 KiB
C
178 lines
4.8 KiB
C
/*
|
|
* Internal definitions for a target's KVM support
|
|
*
|
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
|
* See the COPYING file in the top-level directory.
|
|
*
|
|
*/
|
|
|
|
#ifndef QEMU_KVM_INT_H
|
|
#define QEMU_KVM_INT_H
|
|
|
|
#include "exec/memory.h"
|
|
#include "qapi/qapi-types-common.h"
|
|
#include "qemu/accel.h"
|
|
#include "qemu/queue.h"
|
|
#include "sysemu/kvm.h"
|
|
#include "hw/boards.h"
|
|
#include "hw/i386/topology.h"
|
|
#include "io/channel-socket.h"
|
|
|
|
typedef struct KVMSlot
|
|
{
|
|
hwaddr start_addr;
|
|
ram_addr_t memory_size;
|
|
void *ram;
|
|
int slot;
|
|
int flags;
|
|
int old_flags;
|
|
/* Dirty bitmap cache for the slot */
|
|
unsigned long *dirty_bmap;
|
|
unsigned long dirty_bmap_size;
|
|
/* Cache of the address space ID */
|
|
int as_id;
|
|
/* Cache of the offset in ram address space */
|
|
ram_addr_t ram_start_offset;
|
|
int guest_memfd;
|
|
hwaddr guest_memfd_offset;
|
|
} KVMSlot;
|
|
|
|
typedef struct KVMMemoryUpdate {
|
|
QSIMPLEQ_ENTRY(KVMMemoryUpdate) next;
|
|
MemoryRegionSection section;
|
|
} KVMMemoryUpdate;
|
|
|
|
typedef struct KVMMemoryListener {
|
|
MemoryListener listener;
|
|
KVMSlot *slots;
|
|
unsigned int nr_used_slots;
|
|
int as_id;
|
|
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
|
|
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
|
|
} KVMMemoryListener;
|
|
|
|
#define KVM_MSI_HASHTAB_SIZE 256
|
|
|
|
typedef struct KVMHostTopoInfo {
|
|
/* Number of package on the Host */
|
|
unsigned int maxpkgs;
|
|
/* Number of cpus on the Host */
|
|
unsigned int maxcpus;
|
|
/* Number of cpus on each different package */
|
|
unsigned int *pkg_cpu_count;
|
|
/* Each package can have different maxticks */
|
|
unsigned int *maxticks;
|
|
} KVMHostTopoInfo;
|
|
|
|
struct KVMMsrEnergy {
|
|
pid_t pid;
|
|
bool enable;
|
|
char *socket_path;
|
|
QIOChannelSocket *sioc;
|
|
QemuThread msr_thr;
|
|
unsigned int guest_vcpus;
|
|
unsigned int guest_vsockets;
|
|
X86CPUTopoInfo guest_topo_info;
|
|
KVMHostTopoInfo host_topo;
|
|
const CPUArchIdList *guest_cpu_list;
|
|
uint64_t *msr_value;
|
|
uint64_t msr_unit;
|
|
uint64_t msr_limit;
|
|
uint64_t msr_info;
|
|
};
|
|
|
|
enum KVMDirtyRingReaperState {
|
|
KVM_DIRTY_RING_REAPER_NONE = 0,
|
|
/* The reaper is sleeping */
|
|
KVM_DIRTY_RING_REAPER_WAIT,
|
|
/* The reaper is reaping for dirty pages */
|
|
KVM_DIRTY_RING_REAPER_REAPING,
|
|
};
|
|
|
|
/*
|
|
* KVM reaper instance, responsible for collecting the KVM dirty bits
|
|
* via the dirty ring.
|
|
*/
|
|
struct KVMDirtyRingReaper {
|
|
/* The reaper thread */
|
|
QemuThread reaper_thr;
|
|
volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
|
|
volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
|
|
};
|
|
struct KVMState
|
|
{
|
|
AccelState parent_obj;
|
|
|
|
int nr_slots;
|
|
int fd;
|
|
int vmfd;
|
|
int coalesced_mmio;
|
|
int coalesced_pio;
|
|
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
|
|
bool coalesced_flush_in_progress;
|
|
int vcpu_events;
|
|
#ifdef TARGET_KVM_HAVE_GUEST_DEBUG
|
|
QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
|
|
#endif
|
|
int max_nested_state_len;
|
|
int kvm_shadow_mem;
|
|
bool kernel_irqchip_allowed;
|
|
bool kernel_irqchip_required;
|
|
OnOffAuto kernel_irqchip_split;
|
|
bool sync_mmu;
|
|
bool guest_state_protected;
|
|
uint64_t manual_dirty_log_protect;
|
|
/* The man page (and posix) say ioctl numbers are signed int, but
|
|
* they're not. Linux, glibc and *BSD all treat ioctl numbers as
|
|
* unsigned, and treating them as signed here can break things */
|
|
unsigned irq_set_ioctl;
|
|
unsigned int sigmask_len;
|
|
GHashTable *gsimap;
|
|
#ifdef KVM_CAP_IRQ_ROUTING
|
|
struct kvm_irq_routing *irq_routes;
|
|
int nr_allocated_irq_routes;
|
|
unsigned long *used_gsi_bitmap;
|
|
unsigned int gsi_count;
|
|
#endif
|
|
KVMMemoryListener memory_listener;
|
|
QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
|
|
|
|
/* For "info mtree -f" to tell if an MR is registered in KVM */
|
|
int nr_as;
|
|
struct KVMAs {
|
|
KVMMemoryListener *ml;
|
|
AddressSpace *as;
|
|
} *as;
|
|
uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */
|
|
uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */
|
|
bool kvm_dirty_ring_with_bitmap;
|
|
uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */
|
|
struct KVMDirtyRingReaper reaper;
|
|
struct KVMMsrEnergy msr_energy;
|
|
NotifyVmexitOption notify_vmexit;
|
|
uint32_t notify_window;
|
|
uint32_t xen_version;
|
|
uint32_t xen_caps;
|
|
uint16_t xen_gnttab_max_frames;
|
|
uint16_t xen_evtchn_max_pirq;
|
|
char *device;
|
|
};
|
|
|
|
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
|
|
AddressSpace *as, int as_id, const char *name);
|
|
|
|
void kvm_set_max_memslot_size(hwaddr max_slot_size);
|
|
|
|
/**
|
|
* kvm_hwpoison_page_add:
|
|
*
|
|
* Parameters:
|
|
* @ram_addr: the address in the RAM for the poisoned page
|
|
*
|
|
* Add a poisoned page to the list
|
|
*
|
|
* Return: None.
|
|
*/
|
|
void kvm_hwpoison_page_add(ram_addr_t ram_addr);
|
|
#endif
|