From d5747cace73a9002f097a7da41253749c59fb36a Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:24:57 +0200 Subject: [PATCH 001/109] pc: create custom generic PC machine type it will be used for PC specific options/variables Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ hw/i386/pc_piix.c | 36 ++++++++++++++-------------- hw/i386/pc_q35.c | 12 +++++----- include/hw/i386/pc.h | 24 +++++++++++++++++++ 4 files changed, 105 insertions(+), 24 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 3e0ecf140d..77c587b291 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1455,3 +1455,60 @@ void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name) gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i); } } + +static void pc_generic_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + QEMUMachine *qm = data; + + mc->name = qm->name; + mc->alias = qm->alias; + mc->desc = qm->desc; + mc->init = qm->init; + mc->reset = qm->reset; + mc->hot_add_cpu = qm->hot_add_cpu; + mc->kvm_type = qm->kvm_type; + mc->block_default_type = qm->block_default_type; + mc->max_cpus = qm->max_cpus; + mc->no_serial = qm->no_serial; + mc->no_parallel = qm->no_parallel; + mc->use_virtcon = qm->use_virtcon; + mc->use_sclp = qm->use_sclp; + mc->no_floppy = qm->no_floppy; + mc->no_cdrom = qm->no_cdrom; + mc->no_sdcard = qm->no_sdcard; + mc->is_default = qm->is_default; + mc->default_machine_opts = qm->default_machine_opts; + mc->default_boot_order = qm->default_boot_order; + mc->compat_props = qm->compat_props; + mc->hw_version = qm->hw_version; +} + +void qemu_register_pc_machine(QEMUMachine *m) +{ + char *name = g_strconcat(m->name, TYPE_MACHINE_SUFFIX, NULL); + TypeInfo ti = { + .name = name, + .parent = TYPE_PC_MACHINE, + .class_init = pc_generic_machine_class_init, + .class_data = (void *)m, + }; + + type_register(&ti); + g_free(name); +} + +static const TypeInfo pc_machine_info = { + .name = TYPE_PC_MACHINE, + .parent = TYPE_MACHINE, + .abstract = true, + .instance_size = sizeof(PCMachineState), + .class_size = sizeof(PCMachineClass), +}; + +static void pc_machine_register_types(void) +{ + type_register_static(&pc_machine_info); +} + +type_init(pc_machine_register_types) diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index a48e26367d..abb599b7be 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -843,25 +843,25 @@ static QEMUMachine xenfv_machine = { static void pc_machine_init(void) { - qemu_register_machine(&pc_i440fx_machine_v2_1); - qemu_register_machine(&pc_i440fx_machine_v2_0); - qemu_register_machine(&pc_i440fx_machine_v1_7); - qemu_register_machine(&pc_i440fx_machine_v1_6); - qemu_register_machine(&pc_i440fx_machine_v1_5); - qemu_register_machine(&pc_i440fx_machine_v1_4); - qemu_register_machine(&pc_machine_v1_3); - qemu_register_machine(&pc_machine_v1_2); - qemu_register_machine(&pc_machine_v1_1); - qemu_register_machine(&pc_machine_v1_0); - qemu_register_machine(&pc_machine_v0_15); - qemu_register_machine(&pc_machine_v0_14); - qemu_register_machine(&pc_machine_v0_13); - qemu_register_machine(&pc_machine_v0_12); - qemu_register_machine(&pc_machine_v0_11); - qemu_register_machine(&pc_machine_v0_10); - qemu_register_machine(&isapc_machine); + qemu_register_pc_machine(&pc_i440fx_machine_v2_1); + qemu_register_pc_machine(&pc_i440fx_machine_v2_0); + qemu_register_pc_machine(&pc_i440fx_machine_v1_7); + qemu_register_pc_machine(&pc_i440fx_machine_v1_6); + qemu_register_pc_machine(&pc_i440fx_machine_v1_5); + qemu_register_pc_machine(&pc_i440fx_machine_v1_4); + qemu_register_pc_machine(&pc_machine_v1_3); + qemu_register_pc_machine(&pc_machine_v1_2); + qemu_register_pc_machine(&pc_machine_v1_1); + qemu_register_pc_machine(&pc_machine_v1_0); + qemu_register_pc_machine(&pc_machine_v0_15); + qemu_register_pc_machine(&pc_machine_v0_14); + qemu_register_pc_machine(&pc_machine_v0_13); + qemu_register_pc_machine(&pc_machine_v0_12); + qemu_register_pc_machine(&pc_machine_v0_11); + qemu_register_pc_machine(&pc_machine_v0_10); + qemu_register_pc_machine(&isapc_machine); #ifdef CONFIG_XEN - qemu_register_machine(&xenfv_machine); + qemu_register_pc_machine(&xenfv_machine); #endif } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index b3c02c163d..d2113937be 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -384,12 +384,12 @@ static QEMUMachine pc_q35_machine_v1_4 = { static void pc_q35_machine_init(void) { - qemu_register_machine(&pc_q35_machine_v2_1); - qemu_register_machine(&pc_q35_machine_v2_0); - qemu_register_machine(&pc_q35_machine_v1_7); - qemu_register_machine(&pc_q35_machine_v1_6); - qemu_register_machine(&pc_q35_machine_v1_5); - qemu_register_machine(&pc_q35_machine_v1_4); + qemu_register_pc_machine(&pc_q35_machine_v2_1); + qemu_register_pc_machine(&pc_q35_machine_v2_0); + qemu_register_pc_machine(&pc_q35_machine_v1_7); + qemu_register_pc_machine(&pc_q35_machine_v1_6); + qemu_register_pc_machine(&pc_q35_machine_v1_5); + qemu_register_pc_machine(&pc_q35_machine_v1_4); } machine_init(pc_q35_machine_init); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index fa9d99792a..aade1b23bb 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -12,9 +12,33 @@ #include "qemu/bitmap.h" #include "sysemu/sysemu.h" #include "hw/pci/pci.h" +#include "hw/boards.h" #define HPET_INTCAP "hpet-intcap" +struct PCMachineState { + /*< private >*/ + MachineState parent_obj; +}; + +struct PCMachineClass { + /*< private >*/ + MachineClass parent_class; +}; + +typedef struct PCMachineState PCMachineState; +typedef struct PCMachineClass PCMachineClass; + +#define TYPE_PC_MACHINE "generic-pc-machine" +#define PC_MACHINE(obj) \ + OBJECT_CHECK(PCMachineState, (obj), TYPE_PC_MACHINE) +#define PC_MACHINE_GET_CLASS(obj) \ + OBJECT_GET_CLASS(PCMachineClass, (obj), TYPE_PC_MACHINE) +#define PC_MACHINE_CLASS(klass) \ + OBJECT_CLASS_CHECK(PCMachineClass, (klass), TYPE_PC_MACHINE) + +void qemu_register_pc_machine(QEMUMachine *m); + /* PC-style peripherals (also used by other machines). */ typedef struct PcPciInfo { From 04ed3ea892386162b287b3ed704b75b0fc0c4616 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:24:58 +0200 Subject: [PATCH 002/109] pc: ACPI BIOS: use enum for defining memory affinity flags replace magic numbers with enum describing Flags field of memory affinity in SRAT table. MemoryAffinityFlags enum will define flags decribed by: ACPI spec 5.0, "5.2.16.2 Memory Affinity Structure", "Table 5-69 Flags - Memory Affinity Structure" Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 1e0aa09bc8..9a4151083e 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1132,15 +1132,22 @@ build_hpet(GArray *table_data, GArray *linker) (void *)hpet, "HPET", sizeof(*hpet), 1); } +typedef enum { + MEM_AFFINITY_NOFLAGS = 0, + MEM_AFFINITY_ENABLED = (1 << 0), + MEM_AFFINITY_HOTPLUGGABLE = (1 << 1), + MEM_AFFINITY_NON_VOLATILE = (1 << 2), +} MemoryAffinityFlags; + static void -acpi_build_srat_memory(AcpiSratMemoryAffinity *numamem, - uint64_t base, uint64_t len, int node, int enabled) +acpi_build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base, + uint64_t len, int node, MemoryAffinityFlags flags) { numamem->type = ACPI_SRAT_MEMORY; numamem->length = sizeof(*numamem); memset(numamem->proximity, 0, 4); numamem->proximity[0] = node; - numamem->flags = cpu_to_le32(!!enabled); + numamem->flags = cpu_to_le32(flags); numamem->base_addr = cpu_to_le64(base); numamem->range_length = cpu_to_le64(len); } @@ -1188,7 +1195,7 @@ build_srat(GArray *table_data, GArray *linker, numa_start = table_data->len; numamem = acpi_data_push(table_data, sizeof *numamem); - acpi_build_srat_memory(numamem, 0, 640*1024, 0, 1); + acpi_build_srat_memory(numamem, 0, 640*1024, 0, MEM_AFFINITY_ENABLED); next_base = 1024 * 1024; for (i = 1; i < guest_info->numa_nodes + 1; ++i) { mem_base = next_base; @@ -1204,19 +1211,21 @@ build_srat(GArray *table_data, GArray *linker, mem_len -= next_base - guest_info->ram_size_below_4g; if (mem_len > 0) { numamem = acpi_data_push(table_data, sizeof *numamem); - acpi_build_srat_memory(numamem, mem_base, mem_len, i-1, 1); + acpi_build_srat_memory(numamem, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); } mem_base = 1ULL << 32; mem_len = next_base - guest_info->ram_size_below_4g; next_base += (1ULL << 32) - guest_info->ram_size_below_4g; } numamem = acpi_data_push(table_data, sizeof *numamem); - acpi_build_srat_memory(numamem, mem_base, mem_len, i - 1, 1); + acpi_build_srat_memory(numamem, mem_base, mem_len, i - 1, + MEM_AFFINITY_ENABLED); } slots = (table_data->len - numa_start) / sizeof *numamem; for (; slots < guest_info->numa_nodes + 2; slots++) { numamem = acpi_data_push(table_data, sizeof *numamem); - acpi_build_srat_memory(numamem, 0, 0, 0, 0); + acpi_build_srat_memory(numamem, 0, 0, 0, MEM_AFFINITY_NOFLAGS); } build_header(linker, table_data, From a790f4ecc9ac54769f483efc00d8e91f31bca14d Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:24:59 +0200 Subject: [PATCH 003/109] object_add: allow completion handler to get canonical path Add object to /objects before calling user_creatable_complete() handler, so that object might be able to call object_get_canonical_path() in its completion handler. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- qmp.c | 11 ++++++++--- vl.c | 11 ++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/qmp.c b/qmp.c index a7f432b37e..b722dbec9a 100644 --- a/qmp.c +++ b/qmp.c @@ -565,13 +565,18 @@ void object_add(const char *type, const char *id, const QDict *qdict, } } - user_creatable_complete(obj, &local_err); + object_property_add_child(container_get(object_get_root(), "/objects"), + id, obj, &local_err); if (local_err) { goto out; } - object_property_add_child(container_get(object_get_root(), "/objects"), - id, obj, &local_err); + user_creatable_complete(obj, &local_err); + if (local_err) { + object_property_del(container_get(object_get_root(), "/objects"), + id, &error_abort); + goto out; + } out: if (local_err) { error_propagate(errp, local_err); diff --git a/vl.c b/vl.c index be69c7f346..9ce65fc3dd 100644 --- a/vl.c +++ b/vl.c @@ -2935,14 +2935,15 @@ static int object_create(QemuOpts *opts, void *opaque) goto out; } - user_creatable_complete(obj, &local_err); - if (local_err) { - goto out; - } - object_property_add_child(container_get(object_get_root(), "/objects"), id, obj, &local_err); + user_creatable_complete(obj, &local_err); + if (local_err) { + object_property_del(container_get(object_get_root(), "/objects"), + id, &error_abort); + goto out; + } out: object_unref(obj); if (local_err) { From 2d9c2725f7bd08ffaf824b8c823f1423258ec78d Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:00 +0200 Subject: [PATCH 004/109] vl.c: daemonize before guest memory allocation memory allocated for guest before QEMU is daemonized and then mapped later in guest's address space after it is daemonized, leads to EPT violation and QEMU aborts. To avoid this and similar issues switch to daemonized mode early before applying/processing other options. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- vl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vl.c b/vl.c index 9ce65fc3dd..16418ecf8c 100644 --- a/vl.c +++ b/vl.c @@ -3965,6 +3965,8 @@ int main(int argc, char **argv, char **envp) } loc_set_none(); + os_daemonize(); + if (qemu_init_main_loop()) { fprintf(stderr, "qemu_init_main_loop failed\n"); exit(1); @@ -4206,8 +4208,6 @@ int main(int argc, char **argv, char **envp) } #endif - os_daemonize(); - if (pid_file && qemu_create_pidfile(pid_file) != 0) { os_pidfile_error(); exit(1); From 1f07048933d9c21acb40bce18af4f3ad6bae59b6 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 6 Jun 2014 17:54:29 +0200 Subject: [PATCH 005/109] add memdev backend infrastructure Provides framework for splitting host RAM allocation/ policies into a separate backend that could be used by devices. Initially only legacy RAM backend is provided, which uses memory_region_init_ram() allocator and compatible with every CLI option that affects memory_region_init_ram(). Signed-off-by: Igor Mammedov Signed-off-by: Paolo Bonzini Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- backends/Makefile.objs | 2 + backends/hostmem-ram.c | 54 ++++++++++++++++++++++ backends/hostmem.c | 97 ++++++++++++++++++++++++++++++++++++++++ include/sysemu/hostmem.h | 60 +++++++++++++++++++++++++ 4 files changed, 213 insertions(+) create mode 100644 backends/hostmem-ram.c create mode 100644 backends/hostmem.c create mode 100644 include/sysemu/hostmem.h diff --git a/backends/Makefile.objs b/backends/Makefile.objs index 591ddcf6f3..7fb7acdd11 100644 --- a/backends/Makefile.objs +++ b/backends/Makefile.objs @@ -6,3 +6,5 @@ common-obj-$(CONFIG_BRLAPI) += baum.o baum.o-cflags := $(SDL_CFLAGS) common-obj-$(CONFIG_TPM) += tpm.o + +common-obj-y += hostmem.o hostmem-ram.o diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c new file mode 100644 index 0000000000..bba2ebcce8 --- /dev/null +++ b/backends/hostmem-ram.c @@ -0,0 +1,54 @@ +/* + * QEMU Host Memory Backend + * + * Copyright (C) 2013-2014 Red Hat Inc + * + * Authors: + * Igor Mammedov + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "sysemu/hostmem.h" +#include "qom/object_interfaces.h" + +#define TYPE_MEMORY_BACKEND_RAM "memory-backend-ram" + + +static void +ram_backend_memory_init(UserCreatable *uc, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(uc); + char *path; + + if (!backend->size) { + error_setg(errp, "can't create backend with size 0"); + return; + } + + path = object_get_canonical_path_component(OBJECT(backend)); + memory_region_init_ram(&backend->mr, OBJECT(backend), path, + backend->size); + g_free(path); +} + +static void +ram_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->complete = ram_backend_memory_init; +} + +static const TypeInfo ram_backend_info = { + .name = TYPE_MEMORY_BACKEND_RAM, + .parent = TYPE_MEMORY_BACKEND, + .class_init = ram_backend_class_init, +}; + +static void register_types(void) +{ + type_register_static(&ram_backend_info); +} + +type_init(register_types); diff --git a/backends/hostmem.c b/backends/hostmem.c new file mode 100644 index 0000000000..2f578acc81 --- /dev/null +++ b/backends/hostmem.c @@ -0,0 +1,97 @@ +/* + * QEMU Host Memory Backend + * + * Copyright (C) 2013-2014 Red Hat Inc + * + * Authors: + * Igor Mammedov + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "sysemu/hostmem.h" +#include "sysemu/sysemu.h" +#include "qapi/visitor.h" +#include "qapi/qmp/qerror.h" +#include "qemu/config-file.h" +#include "qom/object_interfaces.h" + +static void +hostmemory_backend_get_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + uint64_t value = backend->size; + + visit_type_size(v, &value, name, errp); +} + +static void +hostmemory_backend_set_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + Error *local_err = NULL; + uint64_t value; + + if (memory_region_size(&backend->mr)) { + error_setg(&local_err, "cannot change property value"); + goto out; + } + + visit_type_size(v, &value, name, &local_err); + if (local_err) { + goto out; + } + if (!value) { + error_setg(&local_err, "Property '%s.%s' doesn't take value '%" + PRIu64 "'", object_get_typename(obj), name, value); + goto out; + } + backend->size = value; +out: + error_propagate(errp, local_err); +} + +static void hostmemory_backend_init(Object *obj) +{ + object_property_add(obj, "size", "int", + hostmemory_backend_get_size, + hostmemory_backend_set_size, NULL, NULL, NULL); +} + +static void hostmemory_backend_finalize(Object *obj) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (memory_region_size(&backend->mr)) { + memory_region_destroy(&backend->mr); + } +} + +MemoryRegion * +host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp) +{ + return memory_region_size(&backend->mr) ? &backend->mr : NULL; +} + +static const TypeInfo hostmemory_backend_info = { + .name = TYPE_MEMORY_BACKEND, + .parent = TYPE_OBJECT, + .abstract = true, + .class_size = sizeof(HostMemoryBackendClass), + .instance_size = sizeof(HostMemoryBackend), + .instance_init = hostmemory_backend_init, + .instance_finalize = hostmemory_backend_finalize, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&hostmemory_backend_info); +} + +type_init(register_types); diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h new file mode 100644 index 0000000000..4fc081eb17 --- /dev/null +++ b/include/sysemu/hostmem.h @@ -0,0 +1,60 @@ +/* + * QEMU Host Memory Backend + * + * Copyright (C) 2013-2014 Red Hat Inc + * + * Authors: + * Igor Mammedov + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#ifndef QEMU_RAM_H +#define QEMU_RAM_H + +#include "qom/object.h" +#include "qapi/error.h" +#include "exec/memory.h" +#include "qemu/option.h" + +#define TYPE_MEMORY_BACKEND "memory-backend" +#define MEMORY_BACKEND(obj) \ + OBJECT_CHECK(HostMemoryBackend, (obj), TYPE_MEMORY_BACKEND) +#define MEMORY_BACKEND_GET_CLASS(obj) \ + OBJECT_GET_CLASS(HostMemoryBackendClass, (obj), TYPE_MEMORY_BACKEND) +#define MEMORY_BACKEND_CLASS(klass) \ + OBJECT_CLASS_CHECK(HostMemoryBackendClass, (klass), TYPE_MEMORY_BACKEND) + +typedef struct HostMemoryBackend HostMemoryBackend; +typedef struct HostMemoryBackendClass HostMemoryBackendClass; + +/** + * HostMemoryBackendClass: + * @parent_class: opaque parent class container + */ +struct HostMemoryBackendClass { + ObjectClass parent_class; +}; + +/** + * @HostMemoryBackend + * + * @parent: opaque parent object container + * @size: amount of memory backend provides + * @id: unique identification string in memdev namespace + * @mr: MemoryRegion representing host memory belonging to backend + */ +struct HostMemoryBackend { + /* private */ + Object parent; + + /* protected */ + uint64_t size; + + MemoryRegion mr; +}; + +MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend, + Error **errp); + +#endif From c270fb9eff1290e8a4a49040eba1305fec2ce0ec Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:02 +0200 Subject: [PATCH 006/109] vl.c: extend -m option to support options for memory hotplug Add following parameters: "slots" - total number of hotplug memory slots "maxmem" - maximum possible memory "slots" and "maxmem" should go in pair and "maxmem" should be greater than "mem" for memory hotplug to be enabled. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: fix build on 32 bit --- include/hw/boards.h | 2 ++ qemu-options.hx | 9 +++++--- vl.c | 51 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/include/hw/boards.h b/include/hw/boards.h index 2d2e2bef19..184d2450ee 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -113,6 +113,8 @@ struct MachineState { char *firmware; ram_addr_t ram_size; + ram_addr_t maxram_size; + uint64_t ram_slots; const char *boot_order; char *kernel_filename; char *kernel_cmdline; diff --git a/qemu-options.hx b/qemu-options.hx index d0714c43a6..5a4eff9aa4 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -210,17 +210,20 @@ use is discouraged as it may be removed from future versions. ETEXI DEF("m", HAS_ARG, QEMU_OPTION_m, - "-m [size=]megs\n" + "-m[emory] [size=]megs[,slots=n,maxmem=size]\n" " configure guest RAM\n" " size: initial amount of guest memory (default: " - stringify(DEFAULT_RAM_SIZE) "MiB)\n", + stringify(DEFAULT_RAM_SIZE) "MiB)\n" + " slots: number of hotplug slots (default: none)\n" + " maxmem: maximum amount of guest memory (default: none)\n", QEMU_ARCH_ALL) STEXI @item -m [size=]@var{megs} @findex -m Set virtual RAM size to @var{megs} megabytes. Default is 128 MiB. Optionally, a suffix of ``M'' or ``G'' can be used to signify a value in megabytes or -gigabytes respectively. +gigabytes respectively. Optional pair @var{slots}, @var{maxmem} could be used +to set amount of hotluggable memory slots and possible maximum amount of memory. ETEXI DEF("mem-path", HAS_ARG, QEMU_OPTION_mempath, diff --git a/vl.c b/vl.c index 16418ecf8c..4c6d6df26c 100644 --- a/vl.c +++ b/vl.c @@ -520,6 +520,14 @@ static QemuOptsList qemu_mem_opts = { .name = "size", .type = QEMU_OPT_SIZE, }, + { + .name = "slots", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "maxmem", + .type = QEMU_OPT_SIZE, + }, { /* end of list */ } }, }; @@ -2992,6 +3000,8 @@ int main(int argc, char **argv, char **envp) const char *trace_file = NULL; const ram_addr_t default_ram_size = (ram_addr_t)DEFAULT_RAM_SIZE * 1024 * 1024; + ram_addr_t maxram_size = default_ram_size; + uint64_t ram_slots = 0; atexit(qemu_run_exit_notifiers); error_set_progname(argv[0]); @@ -3327,6 +3337,7 @@ int main(int argc, char **argv, char **envp) case QEMU_OPTION_m: { uint64_t sz; const char *mem_str; + const char *maxmem_str, *slots_str; opts = qemu_opts_parse(qemu_find_opts("memory"), optarg, 1); @@ -3368,6 +3379,44 @@ int main(int argc, char **argv, char **envp) error_report("ram size too large"); exit(EXIT_FAILURE); } + + maxmem_str = qemu_opt_get(opts, "maxmem"); + slots_str = qemu_opt_get(opts, "slots"); + if (maxmem_str && slots_str) { + uint64_t slots; + + sz = qemu_opt_get_size(opts, "maxmem", 0); + if (sz < ram_size) { + fprintf(stderr, "qemu: invalid -m option value: maxmem " + "(%" PRIu64 ") <= initial memory (%" + PRIu64 ")\n", sz, ram_size); + exit(EXIT_FAILURE); + } + + slots = qemu_opt_get_number(opts, "slots", 0); + if ((sz > ram_size) && !slots) { + fprintf(stderr, "qemu: invalid -m option value: maxmem " + "(%" PRIu64 ") more than initial memory (%" + PRIu64 ") but no hotplug slots where " + "specified\n", sz, ram_size); + exit(EXIT_FAILURE); + } + + if ((sz <= ram_size) && slots) { + fprintf(stderr, "qemu: invalid -m option value: %" + PRIu64 " hotplug slots where specified but " + "maxmem (%" PRIu64 ") <= initial memory (%" + PRIu64 ")\n", slots, sz, ram_size); + exit(EXIT_FAILURE); + } + maxram_size = sz; + ram_slots = slots; + } else if ((!maxmem_str && slots_str) || + (maxmem_str && !slots_str)) { + fprintf(stderr, "qemu: invalid -m option value: missing " + "'%s' option\n", slots_str ? "maxmem" : "slots"); + exit(EXIT_FAILURE); + } break; } #ifdef CONFIG_TPM @@ -4436,6 +4485,8 @@ int main(int argc, char **argv, char **envp) qdev_machine_init(); current_machine->ram_size = ram_size; + current_machine->maxram_size = maxram_size; + current_machine->ram_slots = ram_slots; current_machine->boot_order = boot_order; current_machine->cpu_model = cpu_model; From b74545481187945965cf9f4df70b71d2f6cded4a Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:03 +0200 Subject: [PATCH 007/109] qdev: hotplug for bus-less devices Add get_hotplug_handler() method to machine, and make bus-less device use it during hotplug as a means to discover a hotplug handler controller. The returned controller is used to perform hotplug actions. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/core/qdev.c | 13 +++++++++++++ include/hw/boards.h | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index e65a5aa3a8..fded64575d 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -34,6 +34,7 @@ #include "qapi/qmp/qjson.h" #include "monitor/monitor.h" #include "hw/hotplug.h" +#include "hw/boards.h" int qdev_hotplug = 0; static bool qdev_hot_added = false; @@ -813,6 +814,18 @@ static void device_set_realized(Object *obj, bool value, Error **errp) local_err == NULL) { hotplug_handler_plug(dev->parent_bus->hotplug_handler, dev, &local_err); + } else if (local_err == NULL && + object_dynamic_cast(qdev_get_machine(), TYPE_MACHINE)) { + HotplugHandler *hotplug_ctrl; + MachineState *machine = MACHINE(qdev_get_machine()); + MachineClass *mc = MACHINE_GET_CLASS(machine); + + if (mc->get_hotplug_handler) { + hotplug_ctrl = mc->get_hotplug_handler(machine, dev); + if (hotplug_ctrl) { + hotplug_handler_plug(hotplug_ctrl, dev, &local_err); + } + } } if (qdev_get_vmsd(dev) && local_err == NULL) { diff --git a/include/hw/boards.h b/include/hw/boards.h index 184d2450ee..429ac43abc 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -61,6 +61,11 @@ extern MachineState *current_machine; /** * MachineClass: * @qemu_machine: #QEMUMachine + * @get_hotplug_handler: this function is called during bus-less + * device hotplug. If defined it returns pointer to an instance + * of HotplugHandler object, which handles hotplug operation + * for a given @dev. It may return NULL if @dev doesn't require + * any actions to be performed by hotplug handler. */ struct MachineClass { /*< private >*/ @@ -90,6 +95,9 @@ struct MachineClass { const char *default_boot_order; GlobalProperty *compat_props; const char *hw_version; + + HotplugHandler *(*get_hotplug_handler)(MachineState *machine, + DeviceState *dev); }; /** From d012ffc189197962c8ac179611396f6af92c1860 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:04 +0200 Subject: [PATCH 008/109] qdev: expose DeviceState.hotplugged field as a property so that management could detect via QOM interface if device was hotplugged Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/core/qdev.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index fded64575d..3226a71d30 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -878,6 +878,20 @@ static bool device_get_hotpluggable(Object *obj, Error **errp) dev->parent_bus->allow_hotplug); } +static bool device_get_hotplugged(Object *obj, Error **err) +{ + DeviceState *dev = DEVICE(obj); + + return dev->hotplugged; +} + +static void device_set_hotplugged(Object *obj, bool value, Error **err) +{ + DeviceState *dev = DEVICE(obj); + + dev->hotplugged = value; +} + static void device_initfn(Object *obj) { DeviceState *dev = DEVICE(obj); @@ -896,6 +910,9 @@ static void device_initfn(Object *obj) device_get_realized, device_set_realized, NULL); object_property_add_bool(obj, "hotpluggable", device_get_hotpluggable, NULL, NULL); + object_property_add_bool(obj, "hotplugged", + device_get_hotplugged, device_set_hotplugged, + &error_abort); class = object_get_class(OBJECT(dev)); do { From 10b7e74bf2de8587a8f9e15f216b87be43d6a7cf Mon Sep 17 00:00:00 2001 From: Vasilis Liaskovitis Date: Mon, 2 Jun 2014 15:25:05 +0200 Subject: [PATCH 009/109] pc: implement pc-dimm device abstraction Each hotplug-able memory slot is a PCDIMMDevice. A hot-add operation for a memory device: - creates a new PCDIMMDevice and makes hotplug controller to map it into guest address space Hotplug operations are done through normal device_add commands. For migration case, all hotplugged memory devices on source should be specified on target's command line using '-device' option with properties set to the same values as on source. To simplify review, patch introduces only PCDIMMDevice QOM skeleton that will be extended by following patches to implement actual memory hotplug and related functions. Signed-off-by: Vasilis Liaskovitis Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- default-configs/i386-softmmu.mak | 1 + default-configs/x86_64-softmmu.mak | 1 + hw/Makefile.objs | 1 + hw/mem/Makefile.objs | 1 + hw/mem/pc-dimm.c | 99 ++++++++++++++++++++++++++++++ include/hw/mem/pc-dimm.h | 73 ++++++++++++++++++++++ 6 files changed, 176 insertions(+) create mode 100644 hw/mem/Makefile.objs create mode 100644 hw/mem/pc-dimm.c create mode 100644 include/hw/mem/pc-dimm.h diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak index 37ef90f585..8e08841760 100644 --- a/default-configs/i386-softmmu.mak +++ b/default-configs/i386-softmmu.mak @@ -44,3 +44,4 @@ CONFIG_APIC=y CONFIG_IOAPIC=y CONFIG_ICC_BUS=y CONFIG_PVPANIC=y +CONFIG_MEM_HOTPLUG=y diff --git a/default-configs/x86_64-softmmu.mak b/default-configs/x86_64-softmmu.mak index 31bddce4f4..66557ac590 100644 --- a/default-configs/x86_64-softmmu.mak +++ b/default-configs/x86_64-softmmu.mak @@ -44,3 +44,4 @@ CONFIG_APIC=y CONFIG_IOAPIC=y CONFIG_ICC_BUS=y CONFIG_PVPANIC=y +CONFIG_MEM_HOTPLUG=y diff --git a/hw/Makefile.objs b/hw/Makefile.objs index d178b65de4..52a1464051 100644 --- a/hw/Makefile.objs +++ b/hw/Makefile.objs @@ -29,6 +29,7 @@ devices-dirs-$(CONFIG_SOFTMMU) += usb/ devices-dirs-$(CONFIG_VIRTIO) += virtio/ devices-dirs-$(CONFIG_SOFTMMU) += watchdog/ devices-dirs-$(CONFIG_SOFTMMU) += xen/ +devices-dirs-$(CONFIG_MEM_HOTPLUG) += mem/ devices-dirs-y += core/ common-obj-y += $(devices-dirs-y) obj-y += $(devices-dirs-y) diff --git a/hw/mem/Makefile.objs b/hw/mem/Makefile.objs new file mode 100644 index 0000000000..b000fb42bf --- /dev/null +++ b/hw/mem/Makefile.objs @@ -0,0 +1 @@ +common-obj-$(CONFIG_MEM_HOTPLUG) += pc-dimm.o diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c new file mode 100644 index 0000000000..b4937fe547 --- /dev/null +++ b/hw/mem/pc-dimm.c @@ -0,0 +1,99 @@ +/* + * Dimm device for Memory Hotplug + * + * Copyright ProfitBricks GmbH 2012 + * Copyright (C) 2014 Red Hat Inc + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +#include "hw/mem/pc-dimm.h" +#include "qemu/config-file.h" +#include "qapi/visitor.h" + +static Property pc_dimm_properties[] = { + DEFINE_PROP_UINT64(PC_DIMM_ADDR_PROP, PCDIMMDevice, addr, 0), + DEFINE_PROP_UINT32(PC_DIMM_NODE_PROP, PCDIMMDevice, node, 0), + DEFINE_PROP_INT32(PC_DIMM_SLOT_PROP, PCDIMMDevice, slot, + PC_DIMM_UNASSIGNED_SLOT), + DEFINE_PROP_END_OF_LIST(), +}; + +static void pc_dimm_get_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + int64_t value; + MemoryRegion *mr; + PCDIMMDevice *dimm = PC_DIMM(obj); + + mr = host_memory_backend_get_memory(dimm->hostmem, errp); + value = memory_region_size(mr); + + visit_type_int(v, &value, name, errp); +} + +static void pc_dimm_init(Object *obj) +{ + PCDIMMDevice *dimm = PC_DIMM(obj); + + object_property_add(obj, PC_DIMM_SIZE_PROP, "int", pc_dimm_get_size, + NULL, NULL, NULL, &error_abort); + object_property_add_link(obj, PC_DIMM_MEMDEV_PROP, TYPE_MEMORY_BACKEND, + (Object **)&dimm->hostmem, + qdev_prop_allow_set_link_before_realize, + OBJ_PROP_LINK_UNREF_ON_RELEASE, + &error_abort); +} + +static void pc_dimm_realize(DeviceState *dev, Error **errp) +{ + PCDIMMDevice *dimm = PC_DIMM(dev); + + if (!dimm->hostmem) { + error_setg(errp, "'" PC_DIMM_MEMDEV_PROP "' property is not set"); + return; + } +} + +static MemoryRegion *pc_dimm_get_memory_region(PCDIMMDevice *dimm) +{ + return host_memory_backend_get_memory(dimm->hostmem, &error_abort); +} + +static void pc_dimm_class_init(ObjectClass *oc, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(oc); + PCDIMMDeviceClass *ddc = PC_DIMM_CLASS(oc); + + dc->realize = pc_dimm_realize; + dc->props = pc_dimm_properties; + + ddc->get_memory_region = pc_dimm_get_memory_region; +} + +static TypeInfo pc_dimm_info = { + .name = TYPE_PC_DIMM, + .parent = TYPE_DEVICE, + .instance_size = sizeof(PCDIMMDevice), + .instance_init = pc_dimm_init, + .class_init = pc_dimm_class_init, + .class_size = sizeof(PCDIMMDeviceClass), +}; + +static void pc_dimm_register_types(void) +{ + type_register_static(&pc_dimm_info); +} + +type_init(pc_dimm_register_types) diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h new file mode 100644 index 0000000000..42118cb91d --- /dev/null +++ b/include/hw/mem/pc-dimm.h @@ -0,0 +1,73 @@ +/* + * PC DIMM device + * + * Copyright ProfitBricks GmbH 2012 + * Copyright (C) 2013-2014 Red Hat Inc + * + * Authors: + * Vasilis Liaskovitis + * Igor Mammedov + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_PC_DIMM_H +#define QEMU_PC_DIMM_H + +#include "exec/memory.h" +#include "sysemu/hostmem.h" +#include "hw/qdev.h" + +#define DEFAULT_PC_DIMMSIZE (1024*1024*1024) + +#define TYPE_PC_DIMM "pc-dimm" +#define PC_DIMM(obj) \ + OBJECT_CHECK(PCDIMMDevice, (obj), TYPE_PC_DIMM) +#define PC_DIMM_CLASS(oc) \ + OBJECT_CLASS_CHECK(PCDIMMDeviceClass, (oc), TYPE_PC_DIMM) +#define PC_DIMM_GET_CLASS(obj) \ + OBJECT_GET_CLASS(PCDIMMDeviceClass, (obj), TYPE_PC_DIMM) + +#define PC_DIMM_ADDR_PROP "addr" +#define PC_DIMM_SLOT_PROP "slot" +#define PC_DIMM_NODE_PROP "node" +#define PC_DIMM_SIZE_PROP "size" +#define PC_DIMM_MEMDEV_PROP "memdev" + +#define PC_DIMM_UNASSIGNED_SLOT -1 + +/** + * PCDIMMDevice: + * @addr: starting guest physical address, where @PCDIMMDevice is mapped. + * Default value: 0, means that address is auto-allocated. + * @node: numa node to which @PCDIMMDevice is attached. + * @slot: slot number into which @PCDIMMDevice is plugged in. + * Default value: -1, means that slot is auto-allocated. + * @hostmem: host memory backend providing memory for @PCDIMMDevice + */ +typedef struct PCDIMMDevice { + /* private */ + DeviceState parent_obj; + + /* public */ + uint64_t addr; + uint32_t node; + int32_t slot; + HostMemoryBackend *hostmem; +} PCDIMMDevice; + +/** + * PCDIMMDeviceClass: + * @get_memory_region: returns #MemoryRegion associated with @dimm + */ +typedef struct PCDIMMDeviceClass { + /* private */ + DeviceClass parent_class; + + /* public */ + MemoryRegion *(*get_memory_region)(PCDIMMDevice *dimm); +} PCDIMMDeviceClass; + +#endif From eed2bacfd2519e45498b585a147f11b0fd01c3c7 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:06 +0200 Subject: [PATCH 010/109] memory: add memory_region_is_mapped() API which allows to check if MemoryRegion is already mapped. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/exec/memory.h | 8 ++++++++ memory.c | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/exec/memory.h b/include/exec/memory.h index 549ae734e6..f4c8d4933e 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -847,6 +847,14 @@ void memory_region_set_alias_offset(MemoryRegion *mr, */ bool memory_region_present(MemoryRegion *container, hwaddr addr); +/** + * memory_region_is_mapped: returns true if #MemoryRegion is mapped + * into any address space. + * + * @mr: a #MemoryRegion which should be checked if it's mapped + */ +bool memory_region_is_mapped(MemoryRegion *mr); + /** * memory_region_find: translate an address/size relative to a * MemoryRegion into a #MemoryRegionSection. diff --git a/memory.c b/memory.c index 4895e25376..3e8cd5680b 100644 --- a/memory.c +++ b/memory.c @@ -493,7 +493,7 @@ static AddressSpace *memory_region_to_address_space(MemoryRegion *mr) return as; } } - abort(); + return NULL; } /* Render a memory region into the global view. Ranges in @view obscure @@ -1593,6 +1593,11 @@ bool memory_region_present(MemoryRegion *container, hwaddr addr) return true; } +bool memory_region_is_mapped(MemoryRegion *mr) +{ + return mr->container ? true : false; +} + MemoryRegionSection memory_region_find(MemoryRegion *mr, hwaddr addr, uint64_t size) { @@ -1610,6 +1615,9 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr, } as = memory_region_to_address_space(root); + if (!as) { + return ret; + } range = addrrange_make(int128_make64(addr), int128_make64(size)); view = address_space_get_flatview(as); From 7bb5d6ade6d8afbcad72a871f712370ffae457c6 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:07 +0200 Subject: [PATCH 011/109] pc-dimm: do not allow setting an in-use memdev using the same memdev backend more than once will cause assertion at MemoryRegion mapping time because it's already mapped. Prevent it by checking that the associated MemoryRegion is not mapped. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: tweak commit log --- hw/mem/pc-dimm.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index b4937fe547..3cced63de3 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -43,6 +43,21 @@ static void pc_dimm_get_size(Object *obj, Visitor *v, void *opaque, visit_type_int(v, &value, name, errp); } +static void pc_dimm_check_memdev_is_busy(Object *obj, const char *name, + Object *val, Error **errp) +{ + MemoryRegion *mr; + + mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), errp); + if (memory_region_is_mapped(mr)) { + char *path = object_get_canonical_path_component(val); + error_setg(errp, "can't use already busy memdev: %s", path); + g_free(path); + } else { + qdev_prop_allow_set_link_before_realize(obj, name, val, errp); + } +} + static void pc_dimm_init(Object *obj) { PCDIMMDevice *dimm = PC_DIMM(obj); @@ -51,7 +66,7 @@ static void pc_dimm_init(Object *obj) NULL, NULL, NULL, &error_abort); object_property_add_link(obj, PC_DIMM_MEMDEV_PROP, TYPE_MEMORY_BACKEND, (Object **)&dimm->hostmem, - qdev_prop_allow_set_link_before_realize, + pc_dimm_check_memdev_is_busy, OBJ_PROP_LINK_UNREF_ON_RELEASE, &error_abort); } From 619d11e4631000f20318dec90b87f314272bfa4a Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:08 +0200 Subject: [PATCH 012/109] pc: initialize memory hotplug address space initialize and map hotplug memory address space container into guest's RAM address space. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 26 ++++++++++++++++++++++++-- include/hw/i386/pc.h | 10 ++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 77c587b291..6cdcb0048d 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1197,6 +1197,9 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, MemoryRegion *ram, *option_rom_mr; MemoryRegion *ram_below_4g, *ram_above_4g; FWCfgState *fw_cfg; + ram_addr_t ram_size = below_4g_mem_size + above_4g_mem_size; + MachineState *machine = MACHINE(qdev_get_machine()); + PCMachineState *pcms = PC_MACHINE(machine); linux_boot = (kernel_filename != NULL); @@ -1205,8 +1208,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, * with older qemus that used qemu_ram_alloc(). */ ram = g_malloc(sizeof(*ram)); - memory_region_init_ram(ram, NULL, "pc.ram", - below_4g_mem_size + above_4g_mem_size); + memory_region_init_ram(ram, NULL, "pc.ram", ram_size); vmstate_register_ram_global(ram); *ram_memory = ram; ram_below_4g = g_malloc(sizeof(*ram_below_4g)); @@ -1223,6 +1225,26 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM); } + /* initialize hotplug memory address space */ + if (ram_size < machine->maxram_size) { + ram_addr_t hotplug_mem_size = + machine->maxram_size - ram_size; + + pcms->hotplug_memory_base = + ROUND_UP(0x100000000ULL + above_4g_mem_size, 1ULL << 30); + + if ((pcms->hotplug_memory_base + hotplug_mem_size) < + hotplug_mem_size) { + error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT, + machine->maxram_size); + exit(EXIT_FAILURE); + } + + memory_region_init(&pcms->hotplug_memory, OBJECT(pcms), + "hotplug-memory", hotplug_mem_size); + memory_region_add_subregion(system_memory, pcms->hotplug_memory_base, + &pcms->hotplug_memory); + } /* Initialize PC system firmware */ pc_system_firmware_init(rom_memory, guest_info->isapc_ram_fw); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index aade1b23bb..48d4c5e9f0 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -16,9 +16,19 @@ #define HPET_INTCAP "hpet-intcap" +/** + * PCMachineState: + * @hotplug_memory_base: address in guest RAM address space where hotplug memory + * address space begins. + * @hotplug_memory: hotplug memory addess space container + */ struct PCMachineState { /*< private >*/ MachineState parent_obj; + + /* */ + ram_addr_t hotplug_memory_base; + MemoryRegion hotplug_memory; }; struct PCMachineClass { From a0cc8856e8722188583901665c7d9e5ddc752fc0 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:09 +0200 Subject: [PATCH 013/109] pc: exit QEMU if number of slots more than supported 256 ... which is imposed by current naming scheme of ACPI memory devices. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 6 ++++++ include/hw/acpi/acpi.h | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 6cdcb0048d..366e799bb0 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1230,6 +1230,12 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, ram_addr_t hotplug_mem_size = machine->maxram_size - ram_size; + if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { + error_report("unsupported amount of memory slots: %"PRIu64, + machine->ram_slots); + exit(EXIT_FAILURE); + } + pcms->hotplug_memory_base = ROUND_UP(0x100000000ULL + above_4g_mem_size, 1ULL << 30); diff --git a/include/hw/acpi/acpi.h b/include/hw/acpi/acpi.h index a9fae9d5c5..e93de6cab1 100644 --- a/include/hw/acpi/acpi.h +++ b/include/hw/acpi/acpi.h @@ -26,6 +26,12 @@ #include "exec/memory.h" #include "hw/irq.h" +/* + * current device naming scheme supports + * only upto 256 memory devices + */ +#define ACPI_MAX_RAM_SLOTS 256 + /* from linux include/acpi/actype.h */ /* Default ACPI register widths */ From de268e134c03612970d6f2c214df6287c9621cc8 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:10 +0200 Subject: [PATCH 014/109] pc: add 'etc/reserved-memory-end' fw_cfg interface for SeaBIOS 'etc/reserved-memory-end' will allow QEMU to tell BIOS where PCI BARs mapping could safely start in high memory. Allowing BIOS to start mapping 64-bit PCI BARs at address where it wouldn't conflict with other mappings QEMU might place before it. That permits QEMU to reserve extra address space before 64-bit PCI hole for memory hotplug. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 9 ++++++++- hw/i386/pc_piix.c | 3 +++ hw/i386/pc_q35.c | 3 +++ include/hw/i386/pc.h | 1 + 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 366e799bb0..830aeeec91 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1226,7 +1226,8 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, } /* initialize hotplug memory address space */ - if (ram_size < machine->maxram_size) { + if (guest_info->has_reserved_memory && + (ram_size < machine->maxram_size)) { ram_addr_t hotplug_mem_size = machine->maxram_size - ram_size; @@ -1266,6 +1267,12 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, fw_cfg = bochs_bios_init(); rom_set_fw(fw_cfg); + if (guest_info->has_reserved_memory && pcms->hotplug_memory_base) { + uint64_t *val = g_malloc(sizeof(*val)); + *val = cpu_to_le64(ROUND_UP(pcms->hotplug_memory_base, 0x1ULL << 30)); + fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val)); + } + if (linux_boot) { load_linux(fw_cfg, kernel_filename, initrd_filename, kernel_cmdline, below_4g_mem_size); } diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index abb599b7be..e133b6aa59 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -67,6 +67,7 @@ static bool smbios_legacy_mode; * pages in the host. */ static bool gigabyte_align = true; +static bool has_reserved_memory = true; /* PC hardware initialisation */ static void pc_init1(MachineState *machine, @@ -143,6 +144,7 @@ static void pc_init1(MachineState *machine, guest_info->has_pci_info = has_pci_info; guest_info->isapc_ram_fw = !pci_enabled; + guest_info->has_reserved_memory = has_reserved_memory; if (smbios_defaults) { MachineClass *mc = MACHINE_GET_CLASS(machine); @@ -267,6 +269,7 @@ static void pc_init_pci(MachineState *machine) static void pc_compat_2_0(MachineState *machine) { smbios_legacy_mode = true; + has_reserved_memory = false; } static void pc_compat_1_7(MachineState *machine) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index d2113937be..0e774765e9 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -57,6 +57,7 @@ static bool smbios_legacy_mode; * pages in the host. */ static bool gigabyte_align = true; +static bool has_reserved_memory = true; /* PC hardware initialisation */ static void pc_q35_init(MachineState *machine) @@ -130,6 +131,7 @@ static void pc_q35_init(MachineState *machine) guest_info->has_pci_info = has_pci_info; guest_info->isapc_ram_fw = false; guest_info->has_acpi_build = has_acpi_build; + guest_info->has_reserved_memory = has_reserved_memory; if (smbios_defaults) { MachineClass *mc = MACHINE_GET_CLASS(machine); @@ -245,6 +247,7 @@ static void pc_q35_init(MachineState *machine) static void pc_compat_2_0(MachineState *machine) { smbios_legacy_mode = true; + has_reserved_memory = false; } static void pc_compat_1_7(MachineState *machine) diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 48d4c5e9f0..501d9a0773 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -77,6 +77,7 @@ struct PcGuestInfo { uint64_t *node_cpu; FWCfgState *fw_cfg; bool has_acpi_build; + bool has_reserved_memory; }; /* parallel.c */ From ca8336f385d998d2bf25b66176c05a3794ee003a Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:11 +0200 Subject: [PATCH 015/109] pc: exit QEMU if compat machine doesn't support memory hotlpug ... if user attempts to start it with memory hotplug enabled. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 830aeeec91..4fad414632 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1225,6 +1225,16 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM); } + if (!guest_info->has_reserved_memory && + (machine->ram_slots || + (machine->maxram_size > ram_size))) { + MachineClass *mc = MACHINE_GET_CLASS(machine); + + error_report("\"-memory 'slots|maxmem'\" is not supported by: %s", + mc->name); + exit(EXIT_FAILURE); + } + /* initialize hotplug memory address space */ if (guest_info->has_reserved_memory && (ram_size < machine->maxram_size)) { From 95bee274fd1d22dc6d35e52987f8b5d29fe754dd Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:12 +0200 Subject: [PATCH 016/109] pc: add memory hotplug handler to PC_MACHINE that will perform mapping of PC_DIMM device into guest's RAM address space Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 59 ++++++++++++++++++++++++++++++++++++++++++++ include/hw/i386/pc.h | 8 ++++++ 2 files changed, 67 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 4fad414632..70527d8880 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -58,6 +58,7 @@ #include "hw/boards.h" #include "hw/pci/pci_host.h" #include "acpi-build.h" +#include "hw/mem/pc-dimm.h" /* debug PC/ISA interrupts */ //#define DEBUG_IRQ @@ -1543,12 +1544,70 @@ void qemu_register_pc_machine(QEMUMachine *m) g_free(name); } +static void pc_dimm_plug(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + Error *local_err = NULL; + PCMachineState *pcms = PC_MACHINE(hotplug_dev); + PCDIMMDevice *dimm = PC_DIMM(dev); + PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); + MemoryRegion *mr = ddc->get_memory_region(dimm); + uint64_t addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, + &local_err); + if (local_err) { + goto out; + } + + memory_region_add_subregion(&pcms->hotplug_memory, + addr - pcms->hotplug_memory_base, mr); + vmstate_register_ram(mr, dev); +out: + error_propagate(errp, local_err); +} + +static void pc_machine_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + pc_dimm_plug(hotplug_dev, dev, errp); + } +} + +static HotplugHandler *pc_get_hotpug_handler(MachineState *machine, + DeviceState *dev) +{ + PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(machine); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + return HOTPLUG_HANDLER(machine); + } + + return pcmc->get_hotplug_handler ? + pcmc->get_hotplug_handler(machine, dev) : NULL; +} + +static void pc_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + PCMachineClass *pcmc = PC_MACHINE_CLASS(oc); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); + + pcmc->get_hotplug_handler = mc->get_hotplug_handler; + mc->get_hotplug_handler = pc_get_hotpug_handler; + hc->plug = pc_machine_device_plug_cb; +} + static const TypeInfo pc_machine_info = { .name = TYPE_PC_MACHINE, .parent = TYPE_MACHINE, .abstract = true, .instance_size = sizeof(PCMachineState), .class_size = sizeof(PCMachineClass), + .class_init = pc_machine_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + }, }; static void pc_machine_register_types(void) diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 501d9a0773..c3ed8586df 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -31,9 +31,17 @@ struct PCMachineState { MemoryRegion hotplug_memory; }; +/** + * PCMachineClass: + * @get_hotplug_handler: pointer to parent class callback @get_hotplug_handler + */ struct PCMachineClass { /*< private >*/ MachineClass parent_class; + + /*< public >*/ + HotplugHandler *(*get_hotplug_handler)(MachineState *machine, + DeviceState *dev); }; typedef struct PCMachineState PCMachineState; From 0b3125711606374f04e279aebefcb7275480f040 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:13 +0200 Subject: [PATCH 017/109] pc-dimm: add busy address check and address auto-allocation - if 'addr' property is not specified on -device/device_add command, treat the default value as request for assigning PCDIMMDevice to the first free memory region. - if 'addr' is provided with -device/device_add command, attempt to use it or fail command if it's already occupied or falls inside of an existing PCDIMMDevice memory region. Note: GCompareFunc(a, b) used by g_slist_insert_sorted() returns 'gint', however it might be too small to fit difference between 2 addresses. So use 128bit to calculate the difference and normalize result to -1/0/1 return values. Signed-off-by: Igor Mammedov Signed-off-by: Tang Chen Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Tested-by: Andrey Korolyov MST: commit log tweaks --- hw/i386/pc.c | 9 +++++ hw/mem/pc-dimm.c | 82 ++++++++++++++++++++++++++++++++++++++++ include/hw/mem/pc-dimm.h | 4 ++ 3 files changed, 95 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 70527d8880..4dc86d5bc4 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1558,6 +1558,15 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, goto out; } + addr = pc_dimm_get_free_addr(pcms->hotplug_memory_base, + memory_region_size(&pcms->hotplug_memory), + !addr ? NULL : &addr, + memory_region_size(mr), &local_err); + if (local_err) { + goto out; + } + object_property_set_int(OBJECT(dev), addr, PC_DIMM_ADDR_PROP, &local_err); + memory_region_add_subregion(&pcms->hotplug_memory, addr - pcms->hotplug_memory_base, mr); vmstate_register_ram(mr, dev); diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 3cced63de3..40b51c883e 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -21,6 +21,88 @@ #include "hw/mem/pc-dimm.h" #include "qemu/config-file.h" #include "qapi/visitor.h" +#include "qemu/range.h" + +static gint pc_dimm_addr_sort(gconstpointer a, gconstpointer b) +{ + PCDIMMDevice *x = PC_DIMM(a); + PCDIMMDevice *y = PC_DIMM(b); + Int128 diff = int128_sub(int128_make64(x->addr), int128_make64(y->addr)); + + if (int128_lt(diff, int128_zero())) { + return -1; + } else if (int128_gt(diff, int128_zero())) { + return 1; + } + return 0; +} + +static int pc_dimm_built_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + if (dev->realized) { /* only realized DIMMs matter */ + *list = g_slist_insert_sorted(*list, dev, pc_dimm_addr_sort); + } + } + + object_child_foreach(obj, pc_dimm_built_list, opaque); + return 0; +} + +uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, + uint64_t address_space_size, + uint64_t *hint, uint64_t size, + Error **errp) +{ + GSList *list = NULL, *item; + uint64_t new_addr, ret = 0; + uint64_t address_space_end = address_space_start + address_space_size; + + assert(address_space_end > address_space_size); + object_child_foreach(qdev_get_machine(), pc_dimm_built_list, &list); + + if (hint) { + new_addr = *hint; + } else { + new_addr = address_space_start; + } + + /* find address range that will fit new DIMM */ + for (item = list; item; item = g_slist_next(item)) { + PCDIMMDevice *dimm = item->data; + uint64_t dimm_size = object_property_get_int(OBJECT(dimm), + PC_DIMM_SIZE_PROP, + errp); + if (errp && *errp) { + goto out; + } + + if (ranges_overlap(dimm->addr, dimm_size, new_addr, size)) { + if (hint) { + DeviceState *d = DEVICE(dimm); + error_setg(errp, "address range conflicts with '%s'", d->id); + goto out; + } + new_addr = dimm->addr + dimm_size; + } + } + ret = new_addr; + + if (new_addr < address_space_start) { + error_setg(errp, "can't add memory [0x%" PRIx64 ":0x%" PRIx64 + "] at 0x%" PRIx64, new_addr, size, address_space_start); + } else if ((new_addr + size) > address_space_end) { + error_setg(errp, "can't add memory [0x%" PRIx64 ":0x%" PRIx64 + "] beyond 0x%" PRIx64, new_addr, size, address_space_end); + } + +out: + g_slist_free(list); + return ret; +} static Property pc_dimm_properties[] = { DEFINE_PROP_UINT64(PC_DIMM_ADDR_PROP, PCDIMMDevice, addr, 0), diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index 42118cb91d..5a3e7df13c 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -70,4 +70,8 @@ typedef struct PCDIMMDeviceClass { MemoryRegion *(*get_memory_region)(PCDIMMDevice *dimm); } PCDIMMDeviceClass; +uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, + uint64_t address_space_size, + uint64_t *hint, uint64_t size, + Error **errp); #endif From 0cd03d89b98154fcc03d9a8eeea70b9c50cb9457 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:14 +0200 Subject: [PATCH 018/109] pc-dimm: add busy slot check and slot auto-allocation - if slot property is not specified on -device/device_add command, treat default value as request for assigning PCDIMMDevice to the first free slot. - if slot is provided with -device/device_add command, attempt to use it or fail command if it's already occupied. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 21 ++++++++++++++++++ hw/mem/pc-dimm.c | 46 ++++++++++++++++++++++++++++++++++++++++ include/hw/mem/pc-dimm.h | 2 ++ 3 files changed, 69 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 4dc86d5bc4..be5e3bbafd 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1547,8 +1547,10 @@ void qemu_register_pc_machine(QEMUMachine *m) static void pc_dimm_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { + int slot; Error *local_err = NULL; PCMachineState *pcms = PC_MACHINE(hotplug_dev); + MachineState *machine = MACHINE(hotplug_dev); PCDIMMDevice *dimm = PC_DIMM(dev); PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); MemoryRegion *mr = ddc->get_memory_region(dimm); @@ -1565,7 +1567,26 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, if (local_err) { goto out; } + object_property_set_int(OBJECT(dev), addr, PC_DIMM_ADDR_PROP, &local_err); + if (local_err) { + goto out; + } + + slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &local_err); + if (local_err) { + goto out; + } + + slot = pc_dimm_get_free_slot(slot == PC_DIMM_UNASSIGNED_SLOT ? NULL : &slot, + machine->ram_slots, &local_err); + if (local_err) { + goto out; + } + object_property_set_int(OBJECT(dev), slot, PC_DIMM_SLOT_PROP, &local_err); + if (local_err) { + goto out; + } memory_region_add_subregion(&pcms->hotplug_memory, addr - pcms->hotplug_memory_base, mr); diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 40b51c883e..8c2656821a 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -23,6 +23,52 @@ #include "qapi/visitor.h" #include "qemu/range.h" +static int pc_dimm_slot2bitmap(Object *obj, void *opaque) +{ + unsigned long *bitmap = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + if (dev->realized) { /* count only realized DIMMs */ + PCDIMMDevice *d = PC_DIMM(obj); + set_bit(d->slot, bitmap); + } + } + + object_child_foreach(obj, pc_dimm_slot2bitmap, opaque); + return 0; +} + +int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp) +{ + unsigned long *bitmap = bitmap_new(max_slots); + int slot = 0; + + object_child_foreach(qdev_get_machine(), pc_dimm_slot2bitmap, bitmap); + + /* check if requested slot is not occupied */ + if (hint) { + if (*hint >= max_slots) { + error_setg(errp, "invalid slot# %d, should be less than %d", + *hint, max_slots); + } else if (!test_bit(*hint, bitmap)) { + slot = *hint; + } else { + error_setg(errp, "slot %d is busy", *hint); + } + goto out; + } + + /* search for free slot */ + slot = find_first_zero_bit(bitmap, max_slots); + if (slot == max_slots) { + error_setg(errp, "no free slots available"); + } +out: + g_free(bitmap); + return slot; +} + static gint pc_dimm_addr_sort(gconstpointer a, gconstpointer b) { PCDIMMDevice *x = PC_DIMM(a); diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index 5a3e7df13c..0f4a6ba1e4 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -74,4 +74,6 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, uint64_t address_space_size, uint64_t *hint, uint64_t size, Error **errp); + +int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp); #endif From 7e629d1d8d09d9e9294def8bcac9c0f283e9b909 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:15 +0200 Subject: [PATCH 019/109] acpi: rename cpu_hotplug_defs.h to pc-hotplug.h to make it more generic, so it could be used for memory hotplug as well. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-dsdt.dsl | 2 +- hw/i386/q35-acpi-dsdt.dsl | 2 +- include/hw/acpi/cpu_hotplug.h | 2 +- include/hw/acpi/{cpu_hotplug_defs.h => pc-hotplug.h} | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) rename include/hw/acpi/{cpu_hotplug_defs.h => pc-hotplug.h} (88%) diff --git a/hw/i386/acpi-dsdt.dsl b/hw/i386/acpi-dsdt.dsl index 0a1e252d21..8ae933bc38 100644 --- a/hw/i386/acpi-dsdt.dsl +++ b/hw/i386/acpi-dsdt.dsl @@ -306,7 +306,7 @@ DefinitionBlock ( } } -#include "hw/acpi/cpu_hotplug_defs.h" +#include "hw/acpi/pc-hotplug.h" #define CPU_STATUS_BASE PIIX4_CPU_HOTPLUG_IO_BASE #include "acpi-dsdt-cpu-hotplug.dsl" diff --git a/hw/i386/q35-acpi-dsdt.dsl b/hw/i386/q35-acpi-dsdt.dsl index f4d2a2daee..fddc3b27ff 100644 --- a/hw/i386/q35-acpi-dsdt.dsl +++ b/hw/i386/q35-acpi-dsdt.dsl @@ -402,7 +402,7 @@ DefinitionBlock ( define_gsi_link(GSIH, 0, 0x17) } -#include "hw/acpi/cpu_hotplug_defs.h" +#include "hw/acpi/pc-hotplug.h" #define CPU_STATUS_BASE ICH9_CPU_HOTPLUG_IO_BASE #include "acpi-dsdt-cpu-hotplug.dsl" diff --git a/include/hw/acpi/cpu_hotplug.h b/include/hw/acpi/cpu_hotplug.h index 4576400fd7..9e5d30c9df 100644 --- a/include/hw/acpi/cpu_hotplug.h +++ b/include/hw/acpi/cpu_hotplug.h @@ -13,7 +13,7 @@ #define ACPI_HOTPLUG_H #include "hw/acpi/acpi.h" -#include "hw/acpi/cpu_hotplug_defs.h" +#include "hw/acpi/pc-hotplug.h" typedef struct AcpiCpuHotplug { MemoryRegion io; diff --git a/include/hw/acpi/cpu_hotplug_defs.h b/include/hw/acpi/pc-hotplug.h similarity index 88% rename from include/hw/acpi/cpu_hotplug_defs.h rename to include/hw/acpi/pc-hotplug.h index 9f33663511..cee479de63 100644 --- a/include/hw/acpi/cpu_hotplug_defs.h +++ b/include/hw/acpi/pc-hotplug.h @@ -1,7 +1,7 @@ /* * QEMU ACPI hotplug utilities shared defines * - * Copyright (C) 2013 Red Hat Inc + * Copyright (C) 2014 Red Hat Inc * * Authors: * Igor Mammedov @@ -9,8 +9,8 @@ * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. */ -#ifndef ACPI_HOTPLUG_DEFS_H -#define ACPI_HOTPLUG_DEFS_H +#ifndef PC_HOTPLUG_H +#define PC_HOTPLUG_H /* * ONLY DEFINEs are permited in this file since it's shared From 3ef77acab21210b7d0853bf95798f8e0e748e500 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:16 +0200 Subject: [PATCH 020/109] acpi: memory hotplug ACPI hardware implementation - implements QEMU hardware part of memory hotplug protocol described at "docs/specs/acpi_mem_hotplug.txt" - handles only memory add notification event for now Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/specs/acpi_mem_hotplug.txt | 44 +++++++++ hw/acpi/Makefile.objs | 1 + hw/acpi/memory_hotplug.c | 147 +++++++++++++++++++++++++++++++ include/hw/acpi/memory_hotplug.h | 29 ++++++ include/hw/acpi/pc-hotplug.h | 3 + 5 files changed, 224 insertions(+) create mode 100644 docs/specs/acpi_mem_hotplug.txt create mode 100644 hw/acpi/memory_hotplug.c create mode 100644 include/hw/acpi/memory_hotplug.h diff --git a/docs/specs/acpi_mem_hotplug.txt b/docs/specs/acpi_mem_hotplug.txt new file mode 100644 index 0000000000..12909940cc --- /dev/null +++ b/docs/specs/acpi_mem_hotplug.txt @@ -0,0 +1,44 @@ +QEMU<->ACPI BIOS memory hotplug interface +-------------------------------------- + +ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add +events. + +Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access): +--------------------------------------------------------------- +0xa00: + read access: + [0x0-0x3] Lo part of memory device phys address + [0x4-0x7] Hi part of memory device phys address + [0x8-0xb] Lo part of memory device size in bytes + [0xc-0xf] Hi part of memory device size in bytes + [0x10-0x13] Memory device proximity domain + [0x14] Memory device status fields + bits: + 0: Device is enabled and may be used by guest + 1: Device insert event, used to distinguish device for which + no device check event to OSPM was issued. + It's valid only when bit 1 is set. + 2-7: reserved and should be ignored by OSPM + [0x15-0x17] reserved + + write access: + [0x0-0x3] Memory device slot selector, selects active memory device. + All following accesses to other registers in 0xa00-0xa17 + region will read/store data from/to selected memory device. + [0x4-0x7] OST event code reported by OSPM + [0x8-0xb] OST status code reported by OSPM + [0xc-0x13] reserved, writes into it are ignored + [0x14] Memory device control fields + bits: + 0: reserved, OSPM must clear it before writing to register + 1: if set to 1 clears device insert event, set by OSPM + after it has emitted device check event for the + selected memory device + 2-7: reserved, OSPM must clear them before writing to register + +Selecting memory device slot beyond present range has no effect on platform: + - write accesses to memory hot-plug registers not documented above are + ignored + - read accesses to memory hot-plug registers not documented above return + all bits set to 1. diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs index 397d32babd..004e1b2090 100644 --- a/hw/acpi/Makefile.objs +++ b/hw/acpi/Makefile.objs @@ -1 +1,2 @@ common-obj-$(CONFIG_ACPI) += core.o piix4.o ich9.o pcihp.o cpu_hotplug.o +common-obj-$(CONFIG_ACPI) += memory_hotplug.o diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c new file mode 100644 index 0000000000..e3a13ed412 --- /dev/null +++ b/hw/acpi/memory_hotplug.c @@ -0,0 +1,147 @@ +#include "hw/acpi/memory_hotplug.h" +#include "hw/acpi/pc-hotplug.h" +#include "hw/mem/pc-dimm.h" +#include "hw/boards.h" + +static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr, + unsigned int size) +{ + uint32_t val = 0; + MemHotplugState *mem_st = opaque; + MemStatus *mdev; + Object *o; + + if (mem_st->selector >= mem_st->dev_count) { + return 0; + } + + mdev = &mem_st->devs[mem_st->selector]; + o = OBJECT(mdev->dimm); + switch (addr) { + case 0x0: /* Lo part of phys address where DIMM is mapped */ + val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) : 0; + break; + case 0x4: /* Hi part of phys address where DIMM is mapped */ + val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) >> 32 : 0; + break; + case 0x8: /* Lo part of DIMM size */ + val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) : 0; + break; + case 0xc: /* Hi part of DIMM size */ + val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) >> 32 : 0; + break; + case 0x10: /* node proximity for _PXM method */ + val = o ? object_property_get_int(o, PC_DIMM_NODE_PROP, NULL) : 0; + break; + case 0x14: /* pack and return is_* fields */ + val |= mdev->is_enabled ? 1 : 0; + val |= mdev->is_inserting ? 2 : 0; + break; + default: + val = ~0; + break; + } + return val; +} + +static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, + unsigned int size) +{ + MemHotplugState *mem_st = opaque; + MemStatus *mdev; + + if (!mem_st->dev_count) { + return; + } + + if (addr) { + if (mem_st->selector >= mem_st->dev_count) { + return; + } + } + + switch (addr) { + case 0x0: /* DIMM slot selector */ + mem_st->selector = data; + break; + case 0x4: /* _OST event */ + mdev = &mem_st->devs[mem_st->selector]; + if (data == 1) { + /* TODO: handle device insert OST event */ + } else if (data == 3) { + /* TODO: handle device remove OST event */ + } + mdev->ost_event = data; + break; + case 0x8: /* _OST status */ + mdev = &mem_st->devs[mem_st->selector]; + mdev->ost_status = data; + /* TODO: report async error */ + /* TODO: implement memory removal on guest signal */ + break; + case 0x14: + mdev = &mem_st->devs[mem_st->selector]; + if (data & 2) { /* clear insert event */ + mdev->is_inserting = false; + } + break; + } + +} +static const MemoryRegionOps acpi_memory_hotplug_ops = { + .read = acpi_memory_hotplug_read, + .write = acpi_memory_hotplug_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .valid = { + .min_access_size = 1, + .max_access_size = 4, + }, +}; + +void acpi_memory_hotplug_init(MemoryRegion *as, Object *owner, + MemHotplugState *state) +{ + MachineState *machine = MACHINE(qdev_get_machine()); + + state->dev_count = machine->ram_slots; + if (!state->dev_count) { + return; + } + + state->devs = g_malloc0(sizeof(*state->devs) * state->dev_count); + memory_region_init_io(&state->io, owner, &acpi_memory_hotplug_ops, state, + "apci-mem-hotplug", ACPI_MEMORY_HOTPLUG_IO_LEN); + memory_region_add_subregion(as, ACPI_MEMORY_HOTPLUG_BASE, &state->io); +} + +void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, MemHotplugState *mem_st, + DeviceState *dev, Error **errp) +{ + MemStatus *mdev; + Error *local_err = NULL; + int slot = object_property_get_int(OBJECT(dev), "slot", &local_err); + + if (local_err) { + error_propagate(errp, local_err); + return; + } + + if (slot >= mem_st->dev_count) { + char *dev_path = object_get_canonical_path(OBJECT(dev)); + error_setg(errp, "acpi_memory_plug_cb: " + "device [%s] returned invalid memory slot[%d]", + dev_path, slot); + g_free(dev_path); + return; + } + + mdev = &mem_st->devs[slot]; + mdev->dimm = dev; + mdev->is_enabled = true; + mdev->is_inserting = true; + + /* do ACPI magic */ + ar->gpe.sts[0] |= ACPI_MEMORY_HOTPLUG_STATUS; + acpi_update_sci(ar, irq); + return; +} diff --git a/include/hw/acpi/memory_hotplug.h b/include/hw/acpi/memory_hotplug.h new file mode 100644 index 0000000000..8f90f72fdb --- /dev/null +++ b/include/hw/acpi/memory_hotplug.h @@ -0,0 +1,29 @@ +#ifndef QEMU_HW_ACPI_MEMORY_HOTPLUG_H +#define QEMU_HW_ACPI_MEMORY_HOTPLUG_H + +#include "hw/qdev-core.h" +#include "hw/acpi/acpi.h" + +#define ACPI_MEMORY_HOTPLUG_STATUS 8 + +typedef struct MemStatus { + DeviceState *dimm; + bool is_enabled; + bool is_inserting; + uint32_t ost_event; + uint32_t ost_status; +} MemStatus; + +typedef struct MemHotplugState { + MemoryRegion io; + uint32_t selector; + uint32_t dev_count; + MemStatus *devs; +} MemHotplugState; + +void acpi_memory_hotplug_init(MemoryRegion *as, Object *owner, + MemHotplugState *state); + +void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, MemHotplugState *mem_st, + DeviceState *dev, Error **errp); +#endif diff --git a/include/hw/acpi/pc-hotplug.h b/include/hw/acpi/pc-hotplug.h index cee479de63..01d38e2e7e 100644 --- a/include/hw/acpi/pc-hotplug.h +++ b/include/hw/acpi/pc-hotplug.h @@ -29,4 +29,7 @@ #define ICH9_CPU_HOTPLUG_IO_BASE 0x0CD8 #define PIIX4_CPU_HOTPLUG_IO_BASE 0xaf00 +#define ACPI_MEMORY_HOTPLUG_IO_LEN 24 +#define ACPI_MEMORY_HOTPLUG_BASE 0x0a00 + #endif From dfe292ffc44ad3645b8fe473b34a4a29d5256ee6 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:17 +0200 Subject: [PATCH 021/109] trace: add acpi memory hotplug IO region events Add events for tracing accesses to memory hotplug IO ports. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/memory_hotplug.c | 13 +++++++++++++ trace-events | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index e3a13ed412..8944ad358a 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -2,6 +2,7 @@ #include "hw/acpi/pc-hotplug.h" #include "hw/mem/pc-dimm.h" #include "hw/boards.h" +#include "trace.h" static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr, unsigned int size) @@ -12,6 +13,7 @@ static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr, Object *o; if (mem_st->selector >= mem_st->dev_count) { + trace_mhp_acpi_invalid_slot_selected(mem_st->selector); return 0; } @@ -20,22 +22,28 @@ static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr, switch (addr) { case 0x0: /* Lo part of phys address where DIMM is mapped */ val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) : 0; + trace_mhp_acpi_read_addr_lo(mem_st->selector, val); break; case 0x4: /* Hi part of phys address where DIMM is mapped */ val = o ? object_property_get_int(o, PC_DIMM_ADDR_PROP, NULL) >> 32 : 0; + trace_mhp_acpi_read_addr_hi(mem_st->selector, val); break; case 0x8: /* Lo part of DIMM size */ val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) : 0; + trace_mhp_acpi_read_size_lo(mem_st->selector, val); break; case 0xc: /* Hi part of DIMM size */ val = o ? object_property_get_int(o, PC_DIMM_SIZE_PROP, NULL) >> 32 : 0; + trace_mhp_acpi_read_size_hi(mem_st->selector, val); break; case 0x10: /* node proximity for _PXM method */ val = o ? object_property_get_int(o, PC_DIMM_NODE_PROP, NULL) : 0; + trace_mhp_acpi_read_pxm(mem_st->selector, val); break; case 0x14: /* pack and return is_* fields */ val |= mdev->is_enabled ? 1 : 0; val |= mdev->is_inserting ? 2 : 0; + trace_mhp_acpi_read_flags(mem_st->selector, val); break; default: val = ~0; @@ -56,6 +64,7 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, if (addr) { if (mem_st->selector >= mem_st->dev_count) { + trace_mhp_acpi_invalid_slot_selected(mem_st->selector); return; } } @@ -63,6 +72,7 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, switch (addr) { case 0x0: /* DIMM slot selector */ mem_st->selector = data; + trace_mhp_acpi_write_slot(mem_st->selector); break; case 0x4: /* _OST event */ mdev = &mem_st->devs[mem_st->selector]; @@ -72,10 +82,12 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, /* TODO: handle device remove OST event */ } mdev->ost_event = data; + trace_mhp_acpi_write_ost_ev(mem_st->selector, mdev->ost_event); break; case 0x8: /* _OST status */ mdev = &mem_st->devs[mem_st->selector]; mdev->ost_status = data; + trace_mhp_acpi_write_ost_status(mem_st->selector, mdev->ost_status); /* TODO: report async error */ /* TODO: implement memory removal on guest signal */ break; @@ -83,6 +95,7 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, mdev = &mem_st->devs[mem_st->selector]; if (data & 2) { /* clear insert event */ mdev->is_inserting = false; + trace_mhp_acpi_clear_insert_evt(mem_st->selector); } break; } diff --git a/trace-events b/trace-events index f8dff485b2..d7d6a10b56 100644 --- a/trace-events +++ b/trace-events @@ -1272,6 +1272,19 @@ xen_pv_mmio_write(uint64_t addr) "WARNING: write to Xen PV Device MMIO space (ad pci_cfg_read(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x -> 0x%x" pci_cfg_write(const char *dev, unsigned devid, unsigned fnid, unsigned offs, unsigned val) "%s %02u:%u @0x%x <- 0x%x" +#hw/acpi/memory_hotplug.c +mhp_acpi_invalid_slot_selected(uint32_t slot) "0x%"PRIx32 +mhp_acpi_read_addr_lo(uint32_t slot, uint32_t addr) "slot[0x%"PRIx32"] addr lo: 0x%"PRIx32 +mhp_acpi_read_addr_hi(uint32_t slot, uint32_t addr) "slot[0x%"PRIx32"] addr hi: 0x%"PRIx32 +mhp_acpi_read_size_lo(uint32_t slot, uint32_t size) "slot[0x%"PRIx32"] size lo: 0x%"PRIx32 +mhp_acpi_read_size_hi(uint32_t slot, uint32_t size) "slot[0x%"PRIx32"] size hi: 0x%"PRIx32 +mhp_acpi_read_pxm(uint32_t slot, uint32_t pxm) "slot[0x%"PRIx32"] proximity: 0x%"PRIx32 +mhp_acpi_read_flags(uint32_t slot, uint32_t flags) "slot[0x%"PRIx32"] flags: 0x%"PRIx32 +mhp_acpi_write_slot(uint32_t slot) "set active slot: 0x%"PRIx32 +mhp_acpi_write_ost_ev(uint32_t slot, uint32_t ev) "slot[0x%"PRIx32"] OST EVENT: 0x%"PRIx32 +mhp_acpi_write_ost_status(uint32_t slot, uint32_t st) "slot[0x%"PRIx32"] OST STATUS: 0x%"PRIx32 +mhp_acpi_clear_insert_evt(uint32_t slot) "slot[0x%"PRIx32"] clear insert event" + # target-s390x/kvm.c kvm_enable_cmma(int rc) "CMMA: enabling with result code %d" kvm_clear_cmma(int rc) "CMMA: clearing with result code %d" From 2e1ac493f13f5fe9a47023c443d339664cc9e9a2 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:18 +0200 Subject: [PATCH 022/109] trace: pc: add PC_DIMM slot & address allocation Add mhp_pc_dimm_assigned_slot & mhp_pc_dimm_assigned_address events to trace which address and slot where assigned to plugged in PC_DIMM device on target-i386 machine. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 3 +++ trace-events | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index be5e3bbafd..c9d888fac6 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -59,6 +59,7 @@ #include "hw/pci/pci_host.h" #include "acpi-build.h" #include "hw/mem/pc-dimm.h" +#include "trace.h" /* debug PC/ISA interrupts */ //#define DEBUG_IRQ @@ -1572,6 +1573,7 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, if (local_err) { goto out; } + trace_mhp_pc_dimm_assigned_address(addr); slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &local_err); if (local_err) { @@ -1587,6 +1589,7 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, if (local_err) { goto out; } + trace_mhp_pc_dimm_assigned_slot(slot); memory_region_add_subregion(&pcms->hotplug_memory, addr - pcms->hotplug_memory_base, mr); diff --git a/trace-events b/trace-events index d7d6a10b56..ba01ad52cf 100644 --- a/trace-events +++ b/trace-events @@ -1285,6 +1285,10 @@ mhp_acpi_write_ost_ev(uint32_t slot, uint32_t ev) "slot[0x%"PRIx32"] OST EVENT: mhp_acpi_write_ost_status(uint32_t slot, uint32_t st) "slot[0x%"PRIx32"] OST STATUS: 0x%"PRIx32 mhp_acpi_clear_insert_evt(uint32_t slot) "slot[0x%"PRIx32"] clear insert event" +#hw/i386/pc.c +mhp_pc_dimm_assigned_slot(int slot) "0x%d" +mhp_pc_dimm_assigned_address(uint64_t addr) "0x%"PRIx64 + # target-s390x/kvm.c kvm_enable_cmma(int rc) "CMMA: enabling with result code %d" kvm_clear_cmma(int rc) "CMMA: clearing with result code %d" From f1adc360b4ea3ed3bf3a5061681a986b0df85ad7 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:19 +0200 Subject: [PATCH 023/109] acpi:piix4: allow plug/unlug callbacks handle not only PCI devices ... and report error if plugged in device is not supported. Later these callbacks will be used by memory hotplug. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/piix4.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index 252bbf2c77..c9c6b8b619 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -308,19 +308,32 @@ static void piix4_pm_powerdown_req(Notifier *n, void *opaque) acpi_pm1_evt_power_down(&s->ar); } -static void piix4_pci_device_plug_cb(HotplugHandler *hotplug_dev, - DeviceState *dev, Error **errp) +static void piix4_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) { PIIX4PMState *s = PIIX4_PM(hotplug_dev); - acpi_pcihp_device_plug_cb(&s->ar, s->irq, &s->acpi_pci_hotplug, dev, errp); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + acpi_pcihp_device_plug_cb(&s->ar, s->irq, &s->acpi_pci_hotplug, dev, + errp); + } else { + error_setg(errp, "acpi: device plug request for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); + } } -static void piix4_pci_device_unplug_cb(HotplugHandler *hotplug_dev, - DeviceState *dev, Error **errp) +static void piix4_device_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) { PIIX4PMState *s = PIIX4_PM(hotplug_dev); - acpi_pcihp_device_unplug_cb(&s->ar, s->irq, &s->acpi_pci_hotplug, dev, - errp); + + if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + acpi_pcihp_device_unplug_cb(&s->ar, s->irq, &s->acpi_pci_hotplug, dev, + errp); + } else { + error_setg(errp, "acpi: device unplug request for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); + } } static void piix4_update_bus_hotplug(PCIBus *pci_bus, void *opaque) @@ -551,8 +564,8 @@ static void piix4_pm_class_init(ObjectClass *klass, void *data) */ dc->cannot_instantiate_with_device_add_yet = true; dc->hotpluggable = false; - hc->plug = piix4_pci_device_plug_cb; - hc->unplug = piix4_pci_device_unplug_cb; + hc->plug = piix4_device_plug_cb; + hc->unplug = piix4_device_unplug_cb; } static const TypeInfo piix4_pm_info = { From 34774320c3b05287a06775c31578bd1e2cb20b83 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:20 +0200 Subject: [PATCH 024/109] acpi:piix4: add memory hotplug handling Add memory hotplug initialization/handling to PIIX4_PM device and enable it by default for post 2.0 machine types Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: resolve conflict in pc.h --- hw/acpi/piix4.c | 15 ++++++++++++++- include/hw/acpi/memory_hotplug.h | 1 + include/hw/i386/pc.h | 8 +++++++- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index c9c6b8b619..5c542b764b 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -33,6 +33,8 @@ #include "hw/acpi/pcihp.h" #include "hw/acpi/cpu_hotplug.h" #include "hw/hotplug.h" +#include "hw/mem/pc-dimm.h" +#include "hw/acpi/memory_hotplug.h" //#define DEBUG @@ -81,6 +83,8 @@ typedef struct PIIX4PMState { AcpiCpuHotplug gpe_cpu; Notifier cpu_added_notifier; + + MemHotplugState acpi_memory_hotplug; } PIIX4PMState; #define TYPE_PIIX4_PM "PIIX4_PM" @@ -313,7 +317,10 @@ static void piix4_device_plug_cb(HotplugHandler *hotplug_dev, { PIIX4PMState *s = PIIX4_PM(hotplug_dev); - if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { + if (s->acpi_memory_hotplug.is_enabled && + object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + acpi_memory_plug_cb(&s->ar, s->irq, &s->acpi_memory_hotplug, dev, errp); + } else if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) { acpi_pcihp_device_plug_cb(&s->ar, s->irq, &s->acpi_pci_hotplug, dev, errp); } else { @@ -531,6 +538,10 @@ static void piix4_acpi_system_hot_add_init(MemoryRegion *parent, PIIX4_CPU_HOTPLUG_IO_BASE); s->cpu_added_notifier.notify = piix4_cpu_added_req; qemu_register_cpu_added_notifier(&s->cpu_added_notifier); + + if (s->acpi_memory_hotplug.is_enabled) { + acpi_memory_hotplug_init(parent, OBJECT(s), &s->acpi_memory_hotplug); + } } static Property piix4_pm_properties[] = { @@ -540,6 +551,8 @@ static Property piix4_pm_properties[] = { DEFINE_PROP_UINT8(ACPI_PM_PROP_S4_VAL, PIIX4PMState, s4_val, 2), DEFINE_PROP_BOOL("acpi-pci-hotplug-with-bridge-support", PIIX4PMState, use_acpi_pci_hotplug, true), + DEFINE_PROP_BOOL("memory-hotplug-support", PIIX4PMState, + acpi_memory_hotplug.is_enabled, true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/include/hw/acpi/memory_hotplug.h b/include/hw/acpi/memory_hotplug.h index 8f90f72fdb..912c53f3ea 100644 --- a/include/hw/acpi/memory_hotplug.h +++ b/include/hw/acpi/memory_hotplug.h @@ -15,6 +15,7 @@ typedef struct MemStatus { } MemStatus; typedef struct MemHotplugState { + bool is_enabled; /* true if memory hotplug is supported */ MemoryRegion io; uint32_t selector; uint32_t dev_count; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index c3ed8586df..f6d41729c5 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -310,11 +310,17 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); PC_Q35_COMPAT_1_5 #define PC_COMPAT_2_0 \ + {\ + .driver = "PIIX4_PM",\ + .property = "memory-hotplug-support",\ + .value = "off",\ + },\ {\ .driver = "apic",\ .property = "version",\ .value = stringify(0x11),\ - },{\ + },\ + {\ .driver = "nec-usb-xhci",\ .property = "superspeed-ports-first",\ .value = "off",\ From d6b38b661135ceacfc4c3dd9c085f80751465e4f Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:21 +0200 Subject: [PATCH 025/109] pc: ich9 lpc: make it work with global/compat properties Propeties of object should be available after its instances_init() callback is finished and not added in PCIDeviceClass.init which is roughly corresponds to realize() method. Moving properties adding into instances_init will fix missing property error when global/compat property mechanism is used. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/isa/lpc_ich9.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index 97f69d6001..ad43475640 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -563,7 +563,14 @@ static void ich9_lpc_add_properties(ICH9LPCState *lpc) ich9_pm_add_properties(OBJECT(lpc), &lpc->pm, NULL); } -static int ich9_lpc_initfn(PCIDevice *d) +static void ich9_lpc_initfn(Object *obj) +{ + ICH9LPCState *lpc = ICH9_LPC_DEVICE(obj); + + ich9_lpc_add_properties(lpc); +} + +static int ich9_lpc_init(PCIDevice *d) { ICH9LPCState *lpc = ICH9_LPC_DEVICE(d); ISABus *isa_bus; @@ -589,9 +596,6 @@ static int ich9_lpc_initfn(PCIDevice *d) memory_region_add_subregion_overlap(pci_address_space_io(d), ICH9_RST_CNT_IOPORT, &lpc->rst_cnt_mem, 1); - - ich9_lpc_add_properties(lpc); - return 0; } @@ -641,7 +645,7 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); dc->reset = ich9_lpc_reset; - k->init = ich9_lpc_initfn; + k->init = ich9_lpc_init; dc->vmsd = &vmstate_ich9_lpc; k->config_write = ich9_lpc_config_write; dc->desc = "ICH9 LPC bridge"; @@ -660,6 +664,7 @@ static const TypeInfo ich9_lpc_info = { .name = TYPE_ICH9_LPC_DEVICE, .parent = TYPE_PCI_DEVICE, .instance_size = sizeof(struct ICH9LPCState), + .instance_init = ich9_lpc_initfn, .class_init = ich9_lpc_class_init, }; From 1f8621842ebba95002f500935ac84e7dcf71a540 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:22 +0200 Subject: [PATCH 026/109] acpi:ich9: add memory hotplug handling Add memory hotplug initialization/handling to ICH9 LPC device and enable it by default for post 2.0 machine types Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/ich9.c | 38 ++++++++++++++++++++++++++++++++++++++ hw/isa/lpc_ich9.c | 20 ++++++++++++++++++++ include/hw/acpi/ich9.h | 4 ++++ include/hw/i386/pc.h | 7 ++++++- 4 files changed, 68 insertions(+), 1 deletion(-) diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index 407ae8900c..a818bed0fb 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -34,6 +34,7 @@ #include "exec/address-spaces.h" #include "hw/i386/ich9.h" +#include "hw/mem/pc-dimm.h" //#define DEBUG @@ -223,6 +224,11 @@ void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, &pm->gpe_cpu, ICH9_CPU_HOTPLUG_IO_BASE); pm->cpu_added_notifier.notify = ich9_cpu_added_req; qemu_register_cpu_added_notifier(&pm->cpu_added_notifier); + + if (pm->acpi_memory_hotplug.is_enabled) { + acpi_memory_hotplug_init(pci_address_space_io(lpc_pci), OBJECT(lpc_pci), + &pm->acpi_memory_hotplug); + } } static void ich9_pm_get_gpe0_blk(Object *obj, Visitor *v, @@ -235,9 +241,25 @@ static void ich9_pm_get_gpe0_blk(Object *obj, Visitor *v, visit_type_uint32(v, &value, name, errp); } +static bool ich9_pm_get_memory_hotplug_support(Object *obj, Error **errp) +{ + ICH9LPCState *s = ICH9_LPC_DEVICE(obj); + + return s->pm.acpi_memory_hotplug.is_enabled; +} + +static void ich9_pm_set_memory_hotplug_support(Object *obj, bool value, + Error **errp) +{ + ICH9LPCState *s = ICH9_LPC_DEVICE(obj); + + s->pm.acpi_memory_hotplug.is_enabled = value; +} + void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) { static const uint32_t gpe0_len = ICH9_PMIO_GPE0_LEN; + pm->acpi_memory_hotplug.is_enabled = true; object_property_add_uint32_ptr(obj, ACPI_PM_PROP_PM_IO_BASE, &pm->pm_io_base, errp); @@ -246,4 +268,20 @@ void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp) NULL, NULL, pm, NULL); object_property_add_uint32_ptr(obj, ACPI_PM_PROP_GPE0_BLK_LEN, &gpe0_len, errp); + object_property_add_bool(obj, "memory-hotplug-support", + ich9_pm_get_memory_hotplug_support, + ich9_pm_set_memory_hotplug_support, + NULL); +} + +void ich9_pm_device_plug_cb(ICH9LPCPMRegs *pm, DeviceState *dev, Error **errp) +{ + if (pm->acpi_memory_hotplug.is_enabled && + object_dynamic_cast(OBJECT(dev), TYPE_PC_DIMM)) { + acpi_memory_plug_cb(&pm->acpi_regs, pm->irq, &pm->acpi_memory_hotplug, + dev, errp); + } else { + error_setg(errp, "acpi: device plug request for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); + } } diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index ad43475640..fb2b82d13f 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -599,6 +599,19 @@ static int ich9_lpc_init(PCIDevice *d) return 0; } +static void ich9_device_plug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ + ICH9LPCState *lpc = ICH9_LPC_DEVICE(hotplug_dev); + + ich9_pm_device_plug_cb(&lpc->pm, dev, errp); +} + +static void ich9_device_unplug_cb(HotplugHandler *hotplug_dev, + DeviceState *dev, Error **errp) +{ +} + static bool ich9_rst_cnt_needed(void *opaque) { ICH9LPCState *lpc = opaque; @@ -642,6 +655,7 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass); set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); dc->reset = ich9_lpc_reset; @@ -658,6 +672,8 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data) * pc_q35_init() */ dc->cannot_instantiate_with_device_add_yet = true; + hc->plug = ich9_device_plug_cb; + hc->unplug = ich9_device_unplug_cb; } static const TypeInfo ich9_lpc_info = { @@ -666,6 +682,10 @@ static const TypeInfo ich9_lpc_info = { .instance_size = sizeof(struct ICH9LPCState), .instance_init = ich9_lpc_initfn, .class_init = ich9_lpc_class_init, + .interfaces = (InterfaceInfo[]) { + { TYPE_HOTPLUG_HANDLER }, + { } + } }; static void ich9_lpc_register(void) diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h index 104f419852..1977f1b206 100644 --- a/include/hw/acpi/ich9.h +++ b/include/hw/acpi/ich9.h @@ -23,6 +23,7 @@ #include "hw/acpi/acpi.h" #include "hw/acpi/cpu_hotplug.h" +#include "hw/acpi/memory_hotplug.h" typedef struct ICH9LPCPMRegs { /* @@ -46,6 +47,8 @@ typedef struct ICH9LPCPMRegs { AcpiCpuHotplug gpe_cpu; Notifier cpu_added_notifier; + + MemHotplugState acpi_memory_hotplug; } ICH9LPCPMRegs; void ich9_pm_init(PCIDevice *lpc_pci, ICH9LPCPMRegs *pm, @@ -55,4 +58,5 @@ extern const VMStateDescription vmstate_ich9_pm; void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp); +void ich9_pm_device_plug_cb(ICH9LPCPMRegs *pm, DeviceState *dev, Error **errp); #endif /* HW_ACPI_ICH9_H */ diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index f6d41729c5..1635aede0f 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -286,7 +286,12 @@ int e820_get_num_entries(void); bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); #define PC_Q35_COMPAT_2_0 \ - PC_COMPAT_2_0 + PC_COMPAT_2_0, \ + {\ + .driver = "ICH9 LPC",\ + .property = "memory-hotplug-support",\ + .value = "off",\ + } #define PC_Q35_COMPAT_1_7 \ PC_COMPAT_1_7, \ From f816a62daa05ae3b973c76834be7f319b2d02f03 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:23 +0200 Subject: [PATCH 027/109] pc: migrate piix4 & ich9 MemHotplugState Adds an optional subsection that allows to migrate current state of acpi_memory_hotplug of ACPI PM device. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/ich9.c | 24 ++++++++++++++++++++++++ hw/acpi/memory_hotplug.c | 27 +++++++++++++++++++++++++++ hw/acpi/piix4.c | 24 ++++++++++++++++++++++++ include/hw/acpi/memory_hotplug.h | 7 +++++++ 4 files changed, 82 insertions(+) diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index a818bed0fb..0ecc241458 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -140,6 +140,23 @@ static int ich9_pm_post_load(void *opaque, int version_id) .offset = vmstate_offset_pointer(_state, _field, uint8_t), \ } +static bool vmstate_test_use_memhp(void *opaque) +{ + ICH9LPCPMRegs *s = opaque; + return s->acpi_memory_hotplug.is_enabled; +} + +static const VMStateDescription vmstate_memhp_state = { + .name = "ich9_pm/memhp", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField[]) { + VMSTATE_MEMORY_HOTPLUG(acpi_memory_hotplug, ICH9LPCPMRegs), + VMSTATE_END_OF_LIST() + } +}; + const VMStateDescription vmstate_ich9_pm = { .name = "ich9_pm", .version_id = 1, @@ -156,6 +173,13 @@ const VMStateDescription vmstate_ich9_pm = { VMSTATE_UINT32(smi_en, ICH9LPCPMRegs), VMSTATE_UINT32(smi_sts, ICH9LPCPMRegs), VMSTATE_END_OF_LIST() + }, + .subsections = (VMStateSubsection[]) { + { + .vmsd = &vmstate_memhp_state, + .needed = vmstate_test_use_memhp, + }, + VMSTATE_END_OF_LIST() } }; diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index 8944ad358a..71c7a5e83f 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -158,3 +158,30 @@ void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, MemHotplugState *mem_st, acpi_update_sci(ar, irq); return; } + +static const VMStateDescription vmstate_memhp_sts = { + .name = "memory hotplug device state", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField[]) { + VMSTATE_BOOL(is_enabled, MemStatus), + VMSTATE_BOOL(is_inserting, MemStatus), + VMSTATE_UINT32(ost_event, MemStatus), + VMSTATE_UINT32(ost_status, MemStatus), + VMSTATE_END_OF_LIST() + } +}; + +const VMStateDescription vmstate_memory_hotplug = { + .name = "memory hotplug state", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(selector, MemHotplugState), + VMSTATE_STRUCT_VARRAY_POINTER_UINT32(devs, MemHotplugState, dev_count, + vmstate_memhp_sts, MemStatus), + VMSTATE_END_OF_LIST() + } +}; diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index 5c542b764b..17627ee675 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -248,6 +248,23 @@ static bool vmstate_test_no_use_acpi_pci_hotplug(void *opaque, int version_id) return !s->use_acpi_pci_hotplug; } +static bool vmstate_test_use_memhp(void *opaque) +{ + PIIX4PMState *s = opaque; + return s->acpi_memory_hotplug.is_enabled; +} + +static const VMStateDescription vmstate_memhp_state = { + .name = "piix4_pm/memhp", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField[]) { + VMSTATE_MEMORY_HOTPLUG(acpi_memory_hotplug, PIIX4PMState), + VMSTATE_END_OF_LIST() + } +}; + /* qemu-kvm 1.2 uses version 3 but advertised as 2 * To support incoming qemu-kvm 1.2 migration, change version_id * and minimum_version_id to 2 below (which breaks migration from @@ -279,6 +296,13 @@ static const VMStateDescription vmstate_acpi = { VMSTATE_PCI_HOTPLUG(acpi_pci_hotplug, PIIX4PMState, vmstate_test_use_acpi_pci_hotplug), VMSTATE_END_OF_LIST() + }, + .subsections = (VMStateSubsection[]) { + { + .vmsd = &vmstate_memhp_state, + .needed = vmstate_test_use_memhp, + }, + VMSTATE_END_OF_LIST() } }; diff --git a/include/hw/acpi/memory_hotplug.h b/include/hw/acpi/memory_hotplug.h index 912c53f3ea..458845933b 100644 --- a/include/hw/acpi/memory_hotplug.h +++ b/include/hw/acpi/memory_hotplug.h @@ -3,6 +3,7 @@ #include "hw/qdev-core.h" #include "hw/acpi/acpi.h" +#include "migration/vmstate.h" #define ACPI_MEMORY_HOTPLUG_STATUS 8 @@ -27,4 +28,10 @@ void acpi_memory_hotplug_init(MemoryRegion *as, Object *owner, void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, MemHotplugState *mem_st, DeviceState *dev, Error **errp); + +extern const VMStateDescription vmstate_memory_hotplug; +#define VMSTATE_MEMORY_HOTPLUG(memhp, state) \ + VMSTATE_STRUCT(memhp, state, 1, \ + vmstate_memory_hotplug, MemHotplugState) + #endif From 781bbd6bec587606b1b98b78512f2bba64183b0c Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:24 +0200 Subject: [PATCH 028/109] pc: add acpi-device link to PCMachineState the link will used later to access device implementing ACPI functions instead of adhoc lookup in QOM tree. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/piix4.c | 6 +++++- hw/i386/pc_piix.c | 12 +++++++++++- hw/i386/pc_q35.c | 10 ++++++++++ hw/mips/mips_malta.c | 2 +- include/hw/i386/pc.h | 8 +++++++- 5 files changed, 34 insertions(+), 4 deletions(-) diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index 17627ee675..01b3b4cb64 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -483,13 +483,17 @@ Object *piix4_pm_find(void) I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, qemu_irq sci_irq, qemu_irq smi_irq, - int kvm_enabled, FWCfgState *fw_cfg) + int kvm_enabled, FWCfgState *fw_cfg, + DeviceState **piix4_pm) { DeviceState *dev; PIIX4PMState *s; dev = DEVICE(pci_create(bus, devfn, TYPE_PIIX4_PM)); qdev_prop_set_uint32(dev, "smb_io_base", smb_io_base); + if (piix4_pm) { + *piix4_pm = dev; + } s = PIIX4_PM(dev); s->irq = sci_irq; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index e133b6aa59..a13e8d67db 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -74,6 +74,7 @@ static void pc_init1(MachineState *machine, int pci_enabled, int kvmclock_enabled) { + PCMachineState *pc_machine = PC_MACHINE(machine); MemoryRegion *system_memory = get_system_memory(); MemoryRegion *system_io = get_system_io(); int i; @@ -246,14 +247,23 @@ static void pc_init1(MachineState *machine, } if (pci_enabled && acpi_enabled) { + DeviceState *piix4_pm; I2CBus *smbus; smi_irq = qemu_allocate_irqs(pc_acpi_smi_interrupt, first_cpu, 1); /* TODO: Populate SPD eeprom data. */ smbus = piix4_pm_init(pci_bus, piix3_devfn + 3, 0xb100, gsi[9], *smi_irq, - kvm_enabled(), fw_cfg); + kvm_enabled(), fw_cfg, &piix4_pm); smbus_eeprom_init(smbus, 8, NULL, 0); + + object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + TYPE_HOTPLUG_HANDLER, + (Object **)&pc_machine->acpi_dev, + object_property_allow_set_link, + OBJ_PROP_LINK_UNREF_ON_RELEASE, &error_abort); + object_property_set_link(OBJECT(machine), OBJECT(piix4_pm), + PC_MACHINE_ACPI_DEVICE_PROP, &error_abort); } if (pci_enabled) { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 0e774765e9..629eb2d69b 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -62,6 +62,7 @@ static bool has_reserved_memory = true; /* PC hardware initialisation */ static void pc_q35_init(MachineState *machine) { + PCMachineState *pc_machine = PC_MACHINE(machine); ram_addr_t below_4g_mem_size, above_4g_mem_size; Q35PCIHost *q35_host; PCIHostState *phb; @@ -178,6 +179,15 @@ static void pc_q35_init(MachineState *machine) lpc = pci_create_simple_multifunction(host_bus, PCI_DEVFN(ICH9_LPC_DEV, ICH9_LPC_FUNC), true, TYPE_ICH9_LPC_DEVICE); + + object_property_add_link(OBJECT(machine), PC_MACHINE_ACPI_DEVICE_PROP, + TYPE_HOTPLUG_HANDLER, + (Object **)&pc_machine->acpi_dev, + object_property_allow_set_link, + OBJ_PROP_LINK_UNREF_ON_RELEASE, &error_abort); + object_property_set_link(OBJECT(machine), OBJECT(lpc), + PC_MACHINE_ACPI_DEVICE_PROP, &error_abort); + ich9_lpc = ICH9_LPC_DEVICE(lpc); ich9_lpc->pic = gsi; ich9_lpc->ioapic = gsi_state->ioapic_irq; diff --git a/hw/mips/mips_malta.c b/hw/mips/mips_malta.c index f4a7d47129..3c04342236 100644 --- a/hw/mips/mips_malta.c +++ b/hw/mips/mips_malta.c @@ -1104,7 +1104,7 @@ void mips_malta_init(MachineState *machine) pci_piix4_ide_init(pci_bus, hd, piix4_devfn + 1); pci_create_simple(pci_bus, piix4_devfn + 2, "piix4-usb-uhci"); smbus = piix4_pm_init(pci_bus, piix4_devfn + 3, 0x1100, - isa_get_irq(NULL, 9), NULL, 0, NULL); + isa_get_irq(NULL, 9), NULL, 0, NULL, NULL); smbus_eeprom_init(smbus, 8, smbus_eeprom_buf, smbus_eeprom_size); g_free(smbus_eeprom_buf); pit = pit_init(isa_bus, 0x40, 0, NULL); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 1635aede0f..6050115042 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -21,6 +21,7 @@ * @hotplug_memory_base: address in guest RAM address space where hotplug memory * address space begins. * @hotplug_memory: hotplug memory addess space container + * @acpi_dev: link to ACPI PM device that performs ACPI hotplug handling */ struct PCMachineState { /*< private >*/ @@ -29,8 +30,12 @@ struct PCMachineState { /* */ ram_addr_t hotplug_memory_base; MemoryRegion hotplug_memory; + + HotplugHandler *acpi_dev; }; +#define PC_MACHINE_ACPI_DEVICE_PROP "acpi-device" + /** * PCMachineClass: * @get_hotplug_handler: pointer to parent class callback @get_hotplug_handler @@ -210,7 +215,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name); I2CBus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base, qemu_irq sci_irq, qemu_irq smi_irq, - int kvm_enabled, FWCfgState *fw_cfg); + int kvm_enabled, FWCfgState *fw_cfg, + DeviceState **piix4_pm); void piix4_smbus_register_device(SMBusDevice *dev, uint8_t addr); /* hpet.c */ From 3fbcdc27b18b383039b360be1cedf52a1ffb2dff Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:25 +0200 Subject: [PATCH 029/109] pc: propagate memory hotplug event to ACPI device Notify PIIX4_PM/ICH9LPC device about hotplug event, so that it would send SCI to guest notifying about newly added memory. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index c9d888fac6..0bedd4695e 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1549,6 +1549,7 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { int slot; + HotplugHandlerClass *hhc; Error *local_err = NULL; PCMachineState *pcms = PC_MACHINE(hotplug_dev); MachineState *machine = MACHINE(hotplug_dev); @@ -1591,9 +1592,18 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, } trace_mhp_pc_dimm_assigned_slot(slot); + if (!pcms->acpi_dev) { + error_setg(&local_err, + "memory hotplug is not enabled: missing acpi device"); + goto out; + } + memory_region_add_subregion(&pcms->hotplug_memory, addr - pcms->hotplug_memory_base, mr); vmstate_register_ram(mr, dev); + + hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev); + hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &local_err); out: error_propagate(errp, local_err); } From bef3492d1169a54c966cecd0e0b1dd25e9341582 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:26 +0200 Subject: [PATCH 030/109] pc: ACPI BIOS: implement memory hotplug interface - provides static SSDT object for memory hotplug that can handle upto 256 hotplugable memory slots - SSDT template for memory devices and runtime generator of them in SSDT table. Signed-off-by: Vasilis Liaskovitis Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/Makefile.objs | 3 +- hw/i386/acpi-build.c | 35 ++++++++ hw/i386/ssdt-mem.dsl | 77 ++++++++++++++++ hw/i386/ssdt-misc.dsl | 164 +++++++++++++++++++++++++++++++++++ include/hw/acpi/pc-hotplug.h | 21 +++++ 5 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 hw/i386/ssdt-mem.dsl diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs index f66c349508..a65473c253 100644 --- a/hw/i386/Makefile.objs +++ b/hw/i386/Makefile.objs @@ -9,7 +9,8 @@ obj-y += acpi-build.o obj-y += bios-linker-loader.o hw/i386/acpi-build.o: hw/i386/acpi-build.c hw/i386/acpi-dsdt.hex \ hw/i386/ssdt-proc.hex hw/i386/ssdt-pcihp.hex hw/i386/ssdt-misc.hex \ - hw/i386/acpi-dsdt.hex hw/i386/q35-acpi-dsdt.hex + hw/i386/acpi-dsdt.hex hw/i386/q35-acpi-dsdt.hex \ + hw/i386/q35-acpi-dsdt.hex hw/i386/ssdt-mem.hex iasl-option=$(shell if test -z "`$(1) $(2) 2>&1 > /dev/null`" \ ; then echo "$(2)"; else echo "$(3)"; fi ;) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 9a4151083e..6ff7fa3980 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -37,6 +37,7 @@ #include "bios-linker-loader.h" #include "hw/loader.h" #include "hw/isa/isa.h" +#include "hw/acpi/memory_hotplug.h" /* Supported chipsets: */ #include "hw/acpi/piix4.h" @@ -667,6 +668,14 @@ static inline char acpi_get_hex(uint32_t val) #define ACPI_PCIQXL_SIZEOF (*ssdt_pciqxl_end - *ssdt_pciqxl_start) #define ACPI_PCIQXL_AML (ssdp_pcihp_aml + *ssdt_pciqxl_start) +#include "hw/i386/ssdt-mem.hex" + +/* 0x5B 0x82 DeviceOp PkgLength NameString DimmID */ +#define ACPI_MEM_OFFSET_HEX (*ssdt_mem_name - *ssdt_mem_start + 2) +#define ACPI_MEM_OFFSET_ID (*ssdt_mem_id - *ssdt_mem_start + 7) +#define ACPI_MEM_SIZEOF (*ssdt_mem_end - *ssdt_mem_start) +#define ACPI_MEM_AML (ssdm_mem_aml + *ssdt_mem_start) + #define ACPI_SSDT_SIGNATURE 0x54445353 /* SSDT */ #define ACPI_SSDT_HEADER_LENGTH 36 @@ -1003,6 +1012,8 @@ build_ssdt(GArray *table_data, GArray *linker, AcpiCpuInfo *cpu, AcpiPmInfo *pm, AcpiMiscInfo *misc, PcPciInfo *pci, PcGuestInfo *guest_info) { + MachineState *machine = MACHINE(qdev_get_machine()); + uint32_t nr_mem = machine->ram_slots; unsigned acpi_cpus = guest_info->apic_id_limit; int ssdt_start = table_data->len; uint8_t *ssdt_ptr; @@ -1031,6 +1042,9 @@ build_ssdt(GArray *table_data, GArray *linker, ACPI_BUILD_SET_LE(ssdt_ptr, sizeof(ssdp_misc_aml), ssdt_isa_pest[0], 16, misc->pvpanic_port); + ACPI_BUILD_SET_LE(ssdt_ptr, sizeof(ssdp_misc_aml), + ssdt_mctrl_nr_slots[0], 32, nr_mem); + { GArray *sb_scope = build_alloc_array(); uint8_t op = 0x10; /* ScopeOp */ @@ -1084,6 +1098,27 @@ build_ssdt(GArray *table_data, GArray *linker, build_free_array(package); } + if (nr_mem) { + assert(nr_mem <= ACPI_MAX_RAM_SLOTS); + /* build memory devices */ + for (i = 0; i < nr_mem; i++) { + char id[3]; + uint8_t *mem = acpi_data_push(sb_scope, ACPI_MEM_SIZEOF); + + snprintf(id, sizeof(id), "%02X", i); + memcpy(mem, ACPI_MEM_AML, ACPI_MEM_SIZEOF); + memcpy(mem + ACPI_MEM_OFFSET_HEX, id, 2); + memcpy(mem + ACPI_MEM_OFFSET_ID, id, 2); + } + + /* build Method(MEMORY_SLOT_NOTIFY_METHOD, 2) { + * If (LEqual(Arg0, 0x00)) {Notify(MP00, Arg1)} ... + */ + build_append_notify_method(sb_scope, + stringify(MEMORY_SLOT_NOTIFY_METHOD), + "MP%0.02X", nr_mem); + } + { AcpiBuildPciBusHotplugState hotplug_state; Object *pci_host; diff --git a/hw/i386/ssdt-mem.dsl b/hw/i386/ssdt-mem.dsl new file mode 100644 index 0000000000..8e17bd1f97 --- /dev/null +++ b/hw/i386/ssdt-mem.dsl @@ -0,0 +1,77 @@ +/* + * Memory hotplug ACPI DSDT static objects definitions + * + * Copyright ProfitBricks GmbH 2012 + * Copyright (C) 2013-2014 Red Hat Inc + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see + */ + +/* This file is the basis for the ssdt_mem[] variable in src/acpi.c. + * It defines the contents of the memory device object. At + * runtime, a dynamically generated SSDT will contain one copy of this + * AML snippet for every possible memory device in the system. The + * objects will be placed in the \_SB_ namespace. + * + * In addition to the aml code generated from this file, the + * src/acpi.c file creates a MTFY method with an entry for each memdevice: + * Method(MTFY, 2) { + * If (LEqual(Arg0, 0x00)) { Notify(MP00, Arg1) } + * If (LEqual(Arg0, 0x01)) { Notify(MP01, Arg1) } + * ... + * } + */ +#include "hw/acpi/pc-hotplug.h" + +ACPI_EXTRACT_ALL_CODE ssdm_mem_aml + +DefinitionBlock ("ssdt-mem.aml", "SSDT", 0x02, "BXPC", "CSSDT", 0x1) +{ + + External(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_CRS_METHOD, MethodObj) + External(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_STATUS_METHOD, MethodObj) + External(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_OST_METHOD, MethodObj) + External(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_PROXIMITY_METHOD, MethodObj) + + Scope(\_SB) { +/* v------------------ DO NOT EDIT ------------------v */ + ACPI_EXTRACT_DEVICE_START ssdt_mem_start + ACPI_EXTRACT_DEVICE_END ssdt_mem_end + ACPI_EXTRACT_DEVICE_STRING ssdt_mem_name + Device(MPAA) { + ACPI_EXTRACT_NAME_STRING ssdt_mem_id + Name(_UID, "0xAA") +/* ^------------------ DO NOT EDIT ------------------^ + * Don't change the above without also updating the C code. + */ + Name(_HID, EISAID("PNP0C80")) + + Method(_CRS, 0) { + Return(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_CRS_METHOD(_UID)) + } + + Method(_STA, 0) { + Return(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_STATUS_METHOD(_UID)) + } + + Method(_PXM, 0) { + Return(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_PROXIMITY_METHOD(_UID)) + } + + Method(_OST, 3) { + \_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_OST_METHOD(_UID, Arg0, Arg1, Arg2) + } + } + } +} diff --git a/hw/i386/ssdt-misc.dsl b/hw/i386/ssdt-misc.dsl index a4484b8176..d329b8ba57 100644 --- a/hw/i386/ssdt-misc.dsl +++ b/hw/i386/ssdt-misc.dsl @@ -12,6 +12,7 @@ * You should have received a copy of the GNU General Public License along * with this program; if not, see . */ +#include "hw/acpi/pc-hotplug.h" ACPI_EXTRACT_ALL_CODE ssdp_misc_aml @@ -116,4 +117,167 @@ DefinitionBlock ("ssdt-misc.aml", "SSDT", 0x01, "BXPC", "BXSSDTSUSP", 0x1) } } } + + External(MEMORY_SLOT_NOTIFY_METHOD, MethodObj) + Scope(\_SB.PCI0) { + Device(MEMORY_HOPTLUG_DEVICE) { + Name(_HID, "PNP0A06") + Name(_UID, "Memory hotplug resources") + + ACPI_EXTRACT_NAME_DWORD_CONST ssdt_mctrl_nr_slots + Name(MEMORY_SLOTS_NUMBER, 0x12345678) + + /* Memory hotplug IO registers */ + OperationRegion(MEMORY_HOTPLUG_IO_REGION, SystemIO, + ACPI_MEMORY_HOTPLUG_BASE, + ACPI_MEMORY_HOTPLUG_IO_LEN) + + Name(_CRS, ResourceTemplate() { + IO(Decode16, ACPI_MEMORY_HOTPLUG_BASE, ACPI_MEMORY_HOTPLUG_BASE, + 0, ACPI_MEMORY_HOTPLUG_IO_LEN, IO) + }) + + Method(_STA, 0) { + If (LEqual(MEMORY_SLOTS_NUMBER, Zero)) { + Return(0x0) + } + /* present, functioning, decoding, not shown in UI */ + Return(0xB) + } + + Field(MEMORY_HOTPLUG_IO_REGION, DWordAcc, NoLock, Preserve) { + MEMORY_SLOT_ADDR_LOW, 32, // read only + MEMORY_SLOT_ADDR_HIGH, 32, // read only + MEMORY_SLOT_SIZE_LOW, 32, // read only + MEMORY_SLOT_SIZE_HIGH, 32, // read only + MEMORY_SLOT_PROXIMITY, 32, // read only + } + Field(MEMORY_HOTPLUG_IO_REGION, ByteAcc, NoLock, Preserve) { + Offset(20), + MEMORY_SLOT_ENABLED, 1, // 1 if enabled, read only + MEMORY_SLOT_INSERT_EVENT, 1, // (read) 1 if has a insert event. (write) 1 to clear event + } + + Mutex (MEMORY_SLOT_LOCK, 0) + Field (MEMORY_HOTPLUG_IO_REGION, DWordAcc, NoLock, Preserve) { + MEMORY_SLOT_SLECTOR, 32, // DIMM selector, write only + MEMORY_SLOT_OST_EVENT, 32, // _OST event code, write only + MEMORY_SLOT_OST_STATUS, 32, // _OST status code, write only + } + + Method(MEMORY_SLOT_SCAN_METHOD, 0) { + If (LEqual(MEMORY_SLOTS_NUMBER, Zero)) { + Return(Zero) + } + + Store(Zero, Local0) // Mem devs iterrator + Acquire(MEMORY_SLOT_LOCK, 0xFFFF) + while (LLess(Local0, MEMORY_SLOTS_NUMBER)) { + Store(Local0, MEMORY_SLOT_SLECTOR) // select Local0 DIMM + If (LEqual(MEMORY_SLOT_INSERT_EVENT, One)) { // Memory device needs check + MEMORY_SLOT_NOTIFY_METHOD(Local0, 1) + Store(1, MEMORY_SLOT_INSERT_EVENT) + } + // TODO: handle memory eject request + Add(Local0, One, Local0) // goto next DIMM + } + Release(MEMORY_SLOT_LOCK) + Return(One) + } + + Method(MEMORY_SLOT_STATUS_METHOD, 1) { + Store(Zero, Local0) + + Acquire(MEMORY_SLOT_LOCK, 0xFFFF) + Store(ToInteger(Arg0), MEMORY_SLOT_SLECTOR) // select DIMM + + If (LEqual(MEMORY_SLOT_ENABLED, One)) { + Store(0xF, Local0) + } + + Release(MEMORY_SLOT_LOCK) + Return(Local0) + } + + Method(MEMORY_SLOT_CRS_METHOD, 1, Serialized) { + Acquire(MEMORY_SLOT_LOCK, 0xFFFF) + Store(ToInteger(Arg0), MEMORY_SLOT_SLECTOR) // select DIMM + + Name(MR64, ResourceTemplate() { + QWordMemory(ResourceProducer, PosDecode, MinFixed, MaxFixed, + Cacheable, ReadWrite, + 0x0000000000000000, // Address Space Granularity + 0x0000000000000000, // Address Range Minimum + 0xFFFFFFFFFFFFFFFE, // Address Range Maximum + 0x0000000000000000, // Address Translation Offset + 0xFFFFFFFFFFFFFFFF, // Address Length + ,, MW64, AddressRangeMemory, TypeStatic) + }) + + CreateDWordField(MR64, 14, MINL) + CreateDWordField(MR64, 18, MINH) + CreateDWordField(MR64, 38, LENL) + CreateDWordField(MR64, 42, LENH) + CreateDWordField(MR64, 22, MAXL) + CreateDWordField(MR64, 26, MAXH) + + Store(MEMORY_SLOT_ADDR_HIGH, MINH) + Store(MEMORY_SLOT_ADDR_LOW, MINL) + Store(MEMORY_SLOT_SIZE_HIGH, LENH) + Store(MEMORY_SLOT_SIZE_LOW, LENL) + + // 64-bit math: MAX = MIN + LEN - 1 + Add(MINL, LENL, MAXL) + Add(MINH, LENH, MAXH) + If (LLess(MAXL, MINL)) { + Add(MAXH, One, MAXH) + } + If (LLess(MAXL, One)) { + Subtract(MAXH, One, MAXH) + } + Subtract(MAXL, One, MAXL) + + If (LEqual(MAXH, Zero)){ + Name(MR32, ResourceTemplate() { + DWordMemory(ResourceProducer, PosDecode, MinFixed, MaxFixed, + Cacheable, ReadWrite, + 0x00000000, // Address Space Granularity + 0x00000000, // Address Range Minimum + 0xFFFFFFFE, // Address Range Maximum + 0x00000000, // Address Translation Offset + 0xFFFFFFFF, // Address Length + ,, MW32, AddressRangeMemory, TypeStatic) + }) + CreateDWordField(MR32, MW32._MIN, MIN) + CreateDWordField(MR32, MW32._MAX, MAX) + CreateDWordField(MR32, MW32._LEN, LEN) + Store(MINL, MIN) + Store(MAXL, MAX) + Store(LENL, LEN) + + Release(MEMORY_SLOT_LOCK) + Return(MR32) + } + + Release(MEMORY_SLOT_LOCK) + Return(MR64) + } + + Method(MEMORY_SLOT_PROXIMITY_METHOD, 1) { + Acquire(MEMORY_SLOT_LOCK, 0xFFFF) + Store(ToInteger(Arg0), MEMORY_SLOT_SLECTOR) // select DIMM + Store(MEMORY_SLOT_PROXIMITY, Local0) + Release(MEMORY_SLOT_LOCK) + Return(Local0) + } + + Method(MEMORY_SLOT_OST_METHOD, 4) { + Acquire(MEMORY_SLOT_LOCK, 0xFFFF) + Store(ToInteger(Arg0), MEMORY_SLOT_SLECTOR) // select DIMM + Store(Arg1, MEMORY_SLOT_OST_EVENT) + Store(Arg2, MEMORY_SLOT_OST_STATUS) + Release(MEMORY_SLOT_LOCK) + } + } // Device() + } // Scope() } diff --git a/include/hw/acpi/pc-hotplug.h b/include/hw/acpi/pc-hotplug.h index 01d38e2e7e..bf5157d7c3 100644 --- a/include/hw/acpi/pc-hotplug.h +++ b/include/hw/acpi/pc-hotplug.h @@ -32,4 +32,25 @@ #define ACPI_MEMORY_HOTPLUG_IO_LEN 24 #define ACPI_MEMORY_HOTPLUG_BASE 0x0a00 +#define MEMORY_HOPTLUG_DEVICE MHPD +#define MEMORY_SLOTS_NUMBER MDNR +#define MEMORY_HOTPLUG_IO_REGION HPMR +#define MEMORY_SLOT_ADDR_LOW MRBL +#define MEMORY_SLOT_ADDR_HIGH MRBH +#define MEMORY_SLOT_SIZE_LOW MRLL +#define MEMORY_SLOT_SIZE_HIGH MRLH +#define MEMORY_SLOT_PROXIMITY MPX +#define MEMORY_SLOT_ENABLED MES +#define MEMORY_SLOT_INSERT_EVENT MINS +#define MEMORY_SLOT_SLECTOR MSEL +#define MEMORY_SLOT_OST_EVENT MOEV +#define MEMORY_SLOT_OST_STATUS MOSC +#define MEMORY_SLOT_LOCK MLCK +#define MEMORY_SLOT_STATUS_METHOD MRST +#define MEMORY_SLOT_CRS_METHOD MCRS +#define MEMORY_SLOT_OST_METHOD MOST +#define MEMORY_SLOT_PROXIMITY_METHOD MPXM +#define MEMORY_SLOT_NOTIFY_METHOD MTFY +#define MEMORY_SLOT_SCAN_METHOD MSCN + #endif From bf1e89395996c3a4c888faf297433482934ec007 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:27 +0200 Subject: [PATCH 031/109] pc: add "hotplug-memory-region-size" property to PC_MACHINE ... it will be used by acpi-build code and by unit tests Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 19 +++++++++++++++++++ include/hw/i386/pc.h | 1 + 2 files changed, 20 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0bedd4695e..fec1e1380f 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -60,6 +60,7 @@ #include "acpi-build.h" #include "hw/mem/pc-dimm.h" #include "trace.h" +#include "qapi/visitor.h" /* debug PC/ISA interrupts */ //#define DEBUG_IRQ @@ -1629,6 +1630,23 @@ static HotplugHandler *pc_get_hotpug_handler(MachineState *machine, pcmc->get_hotplug_handler(machine, dev) : NULL; } +static void +pc_machine_get_hotplug_memory_region_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + int64_t value = memory_region_size(&pcms->hotplug_memory); + + visit_type_int(v, &value, name, errp); +} + +static void pc_machine_initfn(Object *obj) +{ + object_property_add(obj, PC_MACHINE_MEMHP_REGION_SIZE, "int", + pc_machine_get_hotplug_memory_region_size, + NULL, NULL, NULL, NULL); +} + static void pc_machine_class_init(ObjectClass *oc, void *data) { MachineClass *mc = MACHINE_CLASS(oc); @@ -1645,6 +1663,7 @@ static const TypeInfo pc_machine_info = { .parent = TYPE_MACHINE, .abstract = true, .instance_size = sizeof(PCMachineState), + .instance_init = pc_machine_initfn, .class_size = sizeof(PCMachineClass), .class_init = pc_machine_class_init, .interfaces = (InterfaceInfo[]) { diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 6050115042..a2bf22c6a1 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -35,6 +35,7 @@ struct PCMachineState { }; #define PC_MACHINE_ACPI_DEVICE_PROP "acpi-device" +#define PC_MACHINE_MEMHP_REGION_SIZE "hotplug-memory-region-size" /** * PCMachineClass: From cec65193d41099519f14fb744440eeabbfa6e4e3 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:28 +0200 Subject: [PATCH 032/109] pc: ACPI BIOS: reserve SRAT entry for hotplug mem hole Needed for Windows to use hotplugged memory device, otherwise it complains that server is not configured for memory hotplug. Tests shows that aftewards it uses dynamically provided proximity value from _PXM() method if available. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-build.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index 6ff7fa3980..ebc5f034e3 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -1199,6 +1199,10 @@ build_srat(GArray *table_data, GArray *linker, uint64_t curnode; int srat_start, numa_start, slots; uint64_t mem_len, mem_base, next_base; + PCMachineState *pcms = PC_MACHINE(qdev_get_machine()); + ram_addr_t hotplugabble_address_space_size = + object_property_get_int(OBJECT(pcms), PC_MACHINE_MEMHP_REGION_SIZE, + NULL); srat_start = table_data->len; @@ -1263,6 +1267,19 @@ build_srat(GArray *table_data, GArray *linker, acpi_build_srat_memory(numamem, 0, 0, 0, MEM_AFFINITY_NOFLAGS); } + /* + * Entry is required for Windows to enable memory hotplug in OS. + * Memory devices may override proximity set by this entry, + * providing _PXM method if necessary. + */ + if (hotplugabble_address_space_size) { + numamem = acpi_data_push(table_data, sizeof *numamem); + acpi_build_srat_memory(numamem, pcms->hotplug_memory_base, + hotplugabble_address_space_size, 0, + MEM_AFFINITY_HOTPLUGGABLE | + MEM_AFFINITY_ENABLED); + } + build_header(linker, table_data, (void *)(table_data->data + srat_start), "SRAT", From 4635b16770157791145301ff388e2a85512a3b69 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 2 Jun 2014 15:25:29 +0200 Subject: [PATCH 033/109] pc: ACPI BIOS: make GPE.3 handle memory hotplug event on PIIX and Q35 machines also make handler edge based to avoid losing events, the same as it has been done for PCI and CPU hotplug handlers. Signed-off-by: Igor Mammedov Acked-by: Peter Crosthwaite Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-dsdt.dsl | 5 ++++- hw/i386/q35-acpi-dsdt.dsl | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/hw/i386/acpi-dsdt.dsl b/hw/i386/acpi-dsdt.dsl index 8ae933bc38..3cc0ea0f9a 100644 --- a/hw/i386/acpi-dsdt.dsl +++ b/hw/i386/acpi-dsdt.dsl @@ -314,6 +314,7 @@ DefinitionBlock ( /**************************************************************** * General purpose events ****************************************************************/ + External(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_SCAN_METHOD, MethodObj) Scope(\_GPE) { Name(_HID, "ACPI0006") @@ -330,7 +331,9 @@ DefinitionBlock ( // CPU hotplug event \_SB.PRSC() } - Method(_L03) { + Method(_E03) { + // Memory hotplug event + \_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_SCAN_METHOD() } Method(_L04) { } diff --git a/hw/i386/q35-acpi-dsdt.dsl b/hw/i386/q35-acpi-dsdt.dsl index fddc3b27ff..8c3eae73bf 100644 --- a/hw/i386/q35-acpi-dsdt.dsl +++ b/hw/i386/q35-acpi-dsdt.dsl @@ -410,6 +410,7 @@ DefinitionBlock ( /**************************************************************** * General purpose events ****************************************************************/ + External(\_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_SCAN_METHOD, MethodObj) Scope(\_GPE) { Name(_HID, "ACPI0006") @@ -422,7 +423,9 @@ DefinitionBlock ( // CPU hotplug event \_SB.PRSC() } - Method(_L03) { + Method(_E03) { + // Memory hotplug event + \_SB.PCI0.MEMORY_HOPTLUG_DEVICE.MEMORY_SLOT_SCAN_METHOD() } Method(_L04) { } From 2745d9e77268a89104e2d1dc275d0056eb0df5c2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 8 Jun 2014 15:58:59 +0300 Subject: [PATCH 034/109] acpi: update generated files pdate precompiled ACPI hex files for iasl-less hosts after adding the memory hotplug feature Signed-off-by: Michael S. Tsirkin --- hw/i386/acpi-dsdt.hex.generated | 31 +- hw/i386/q35-acpi-dsdt.hex.generated | 31 +- hw/i386/ssdt-mem.hex.generated | 213 ++++++++ hw/i386/ssdt-misc.hex.generated | 811 +++++++++++++++++++++++++++- hw/i386/ssdt-pcihp.hex.generated | 6 +- hw/i386/ssdt-proc.hex.generated | 6 +- 6 files changed, 1074 insertions(+), 24 deletions(-) create mode 100644 hw/i386/ssdt-mem.hex.generated diff --git a/hw/i386/acpi-dsdt.hex.generated b/hw/i386/acpi-dsdt.hex.generated index e61572a5dd..ee490e89c3 100644 --- a/hw/i386/acpi-dsdt.hex.generated +++ b/hw/i386/acpi-dsdt.hex.generated @@ -3,12 +3,12 @@ static unsigned char AcpiDsdtAmlCode[] = { 0x53, 0x44, 0x54, -0x80, +0x93, 0x11, 0x0, 0x0, 0x1, -0x60, +0xf5, 0x42, 0x58, 0x50, @@ -4285,8 +4285,8 @@ static unsigned char AcpiDsdtAmlCode[] = { 0xa, 0xb, 0x10, -0x42, -0xc, +0x45, +0xd, 0x5f, 0x47, 0x50, @@ -4389,12 +4389,31 @@ static unsigned char AcpiDsdtAmlCode[] = { 0x53, 0x43, 0x14, -0x6, +0x19, 0x5f, -0x4c, +0x45, 0x30, 0x33, 0x0, +0x5c, +0x2f, +0x4, +0x5f, +0x53, +0x42, +0x5f, +0x50, +0x43, +0x49, +0x30, +0x4d, +0x48, +0x50, +0x44, +0x4d, +0x53, +0x43, +0x4e, 0x14, 0x6, 0x5f, diff --git a/hw/i386/q35-acpi-dsdt.hex.generated b/hw/i386/q35-acpi-dsdt.hex.generated index 6b788c9be0..c9eb4ac6ad 100644 --- a/hw/i386/q35-acpi-dsdt.hex.generated +++ b/hw/i386/q35-acpi-dsdt.hex.generated @@ -3,12 +3,12 @@ static unsigned char Q35AcpiDsdtAmlCode[] = { 0x53, 0x44, 0x54, -0xd2, +0xe5, 0x1c, 0x0, 0x0, 0x1, -0x13, +0xb7, 0x42, 0x58, 0x50, @@ -7234,8 +7234,8 @@ static unsigned char Q35AcpiDsdtAmlCode[] = { 0xa, 0xb, 0x10, -0x4f, -0x8, +0x42, +0xa, 0x5f, 0x47, 0x50, @@ -7287,12 +7287,31 @@ static unsigned char Q35AcpiDsdtAmlCode[] = { 0x53, 0x43, 0x14, -0x6, +0x19, 0x5f, -0x4c, +0x45, 0x30, 0x33, 0x0, +0x5c, +0x2f, +0x4, +0x5f, +0x53, +0x42, +0x5f, +0x50, +0x43, +0x49, +0x30, +0x4d, +0x48, +0x50, +0x44, +0x4d, +0x53, +0x43, +0x4e, 0x14, 0x6, 0x5f, diff --git a/hw/i386/ssdt-mem.hex.generated b/hw/i386/ssdt-mem.hex.generated new file mode 100644 index 0000000000..00bd34d269 --- /dev/null +++ b/hw/i386/ssdt-mem.hex.generated @@ -0,0 +1,213 @@ +static unsigned char ssdt_mem_id[] = { +0x35 +}; +static unsigned char ssdm_mem_aml[] = { +0x53, +0x53, +0x44, +0x54, +0xc7, +0x0, +0x0, +0x0, +0x2, +0x71, +0x42, +0x58, +0x50, +0x43, +0x0, +0x0, +0x43, +0x53, +0x53, +0x44, +0x54, +0x0, +0x0, +0x0, +0x1, +0x0, +0x0, +0x0, +0x49, +0x4e, +0x54, +0x4c, +0x15, +0x11, +0x13, +0x20, +0x10, +0x42, +0xa, +0x5c, +0x5f, +0x53, +0x42, +0x5f, +0x5b, +0x82, +0x49, +0x9, +0x4d, +0x50, +0x41, +0x41, +0x8, +0x5f, +0x55, +0x49, +0x44, +0xd, +0x30, +0x78, +0x41, +0x41, +0x0, +0x8, +0x5f, +0x48, +0x49, +0x44, +0xc, +0x41, +0xd0, +0xc, +0x80, +0x14, +0x1e, +0x5f, +0x43, +0x52, +0x53, +0x0, +0xa4, +0x5c, +0x2f, +0x4, +0x5f, +0x53, +0x42, +0x5f, +0x50, +0x43, +0x49, +0x30, +0x4d, +0x48, +0x50, +0x44, +0x4d, +0x43, +0x52, +0x53, +0x5f, +0x55, +0x49, +0x44, +0x14, +0x1e, +0x5f, +0x53, +0x54, +0x41, +0x0, +0xa4, +0x5c, +0x2f, +0x4, +0x5f, +0x53, +0x42, +0x5f, +0x50, +0x43, +0x49, +0x30, +0x4d, +0x48, +0x50, +0x44, +0x4d, +0x52, +0x53, +0x54, +0x5f, +0x55, +0x49, +0x44, +0x14, +0x1e, +0x5f, +0x50, +0x58, +0x4d, +0x0, +0xa4, +0x5c, +0x2f, +0x4, +0x5f, +0x53, +0x42, +0x5f, +0x50, +0x43, +0x49, +0x30, +0x4d, +0x48, +0x50, +0x44, +0x4d, +0x50, +0x58, +0x4d, +0x5f, +0x55, +0x49, +0x44, +0x14, +0x20, +0x5f, +0x4f, +0x53, +0x54, +0x3, +0x5c, +0x2f, +0x4, +0x5f, +0x53, +0x42, +0x5f, +0x50, +0x43, +0x49, +0x30, +0x4d, +0x48, +0x50, +0x44, +0x4d, +0x4f, +0x53, +0x54, +0x5f, +0x55, +0x49, +0x44, +0x68, +0x69, +0x6a +}; +static unsigned char ssdt_mem_start[] = { +0x2c +}; +static unsigned char ssdt_mem_end[] = { +0xc7 +}; +static unsigned char ssdt_mem_name[] = { +0x30 +}; diff --git a/hw/i386/ssdt-misc.hex.generated b/hw/i386/ssdt-misc.hex.generated index 55e3bd2aa6..ba4268a60b 100644 --- a/hw/i386/ssdt-misc.hex.generated +++ b/hw/i386/ssdt-misc.hex.generated @@ -4,6 +4,9 @@ static unsigned char acpi_pci64_length[] = { static unsigned char acpi_s4_pkg[] = { 0x8f }; +static unsigned short ssdt_mctrl_nr_slots[] = { +0x1aa +}; static unsigned char acpi_s3_name[] = { 0x7c }; @@ -18,12 +21,12 @@ static unsigned char ssdp_misc_aml[] = { 0x53, 0x44, 0x54, -0x62, -0x1, +0x7e, +0x4, 0x0, 0x0, 0x1, -0x76, +0x8b, 0x42, 0x58, 0x50, @@ -46,8 +49,8 @@ static unsigned char ssdp_misc_aml[] = { 0x4e, 0x54, 0x4c, -0x23, -0x8, +0x15, +0x11, 0x13, 0x20, 0x10, @@ -367,7 +370,803 @@ static unsigned char ssdp_misc_aml[] = { 0x49, 0x4f, 0x4d, -0x58 +0x58, +0x10, +0x4b, +0x31, +0x5c, +0x2e, +0x5f, +0x53, +0x42, +0x5f, +0x50, +0x43, +0x49, +0x30, +0x5b, +0x82, +0x4d, +0x30, +0x4d, +0x48, +0x50, +0x44, +0x8, +0x5f, +0x48, +0x49, +0x44, +0xd, +0x50, +0x4e, +0x50, +0x30, +0x41, +0x30, +0x36, +0x0, +0x8, +0x5f, +0x55, +0x49, +0x44, +0xd, +0x4d, +0x65, +0x6d, +0x6f, +0x72, +0x79, +0x20, +0x68, +0x6f, +0x74, +0x70, +0x6c, +0x75, +0x67, +0x20, +0x72, +0x65, +0x73, +0x6f, +0x75, +0x72, +0x63, +0x65, +0x73, +0x0, +0x8, +0x4d, +0x44, +0x4e, +0x52, +0xc, +0x78, +0x56, +0x34, +0x12, +0x5b, +0x80, +0x48, +0x50, +0x4d, +0x52, +0x1, +0xb, +0x0, +0xa, +0xa, +0x18, +0x8, +0x5f, +0x43, +0x52, +0x53, +0x11, +0xd, +0xa, +0xa, +0x47, +0x1, +0x0, +0xa, +0x0, +0xa, +0x0, +0x18, +0x79, +0x0, +0x14, +0x13, +0x5f, +0x53, +0x54, +0x41, +0x0, +0xa0, +0x9, +0x93, +0x4d, +0x44, +0x4e, +0x52, +0x0, +0xa4, +0x0, +0xa4, +0xa, +0xb, +0x5b, +0x81, +0x1f, +0x48, +0x50, +0x4d, +0x52, +0x3, +0x4d, +0x52, +0x42, +0x4c, +0x20, +0x4d, +0x52, +0x42, +0x48, +0x20, +0x4d, +0x52, +0x4c, +0x4c, +0x20, +0x4d, +0x52, +0x4c, +0x48, +0x20, +0x4d, +0x50, +0x58, +0x5f, +0x20, +0x5b, +0x81, +0x13, +0x48, +0x50, +0x4d, +0x52, +0x1, +0x0, +0x40, +0xa, +0x4d, +0x45, +0x53, +0x5f, +0x1, +0x4d, +0x49, +0x4e, +0x53, +0x1, +0x5b, +0x1, +0x4d, +0x4c, +0x43, +0x4b, +0x0, +0x5b, +0x81, +0x15, +0x48, +0x50, +0x4d, +0x52, +0x3, +0x4d, +0x53, +0x45, +0x4c, +0x20, +0x4d, +0x4f, +0x45, +0x56, +0x20, +0x4d, +0x4f, +0x53, +0x43, +0x20, +0x14, +0x4a, +0x4, +0x4d, +0x53, +0x43, +0x4e, +0x0, +0xa0, +0x9, +0x93, +0x4d, +0x44, +0x4e, +0x52, +0x0, +0xa4, +0x0, +0x70, +0x0, +0x60, +0x5b, +0x23, +0x4d, +0x4c, +0x43, +0x4b, +0xff, +0xff, +0xa2, +0x25, +0x95, +0x60, +0x4d, +0x44, +0x4e, +0x52, +0x70, +0x60, +0x4d, +0x53, +0x45, +0x4c, +0xa0, +0x13, +0x93, +0x4d, +0x49, +0x4e, +0x53, +0x1, +0x4d, +0x54, +0x46, +0x59, +0x60, +0x1, +0x70, +0x1, +0x4d, +0x49, +0x4e, +0x53, +0x72, +0x60, +0x1, +0x60, +0x5b, +0x27, +0x4d, +0x4c, +0x43, +0x4b, +0xa4, +0x1, +0x14, +0x2d, +0x4d, +0x52, +0x53, +0x54, +0x1, +0x70, +0x0, +0x60, +0x5b, +0x23, +0x4d, +0x4c, +0x43, +0x4b, +0xff, +0xff, +0x70, +0x99, +0x68, +0x0, +0x4d, +0x53, +0x45, +0x4c, +0xa0, +0xb, +0x93, +0x4d, +0x45, +0x53, +0x5f, +0x1, +0x70, +0xa, +0xf, +0x60, +0x5b, +0x27, +0x4d, +0x4c, +0x43, +0x4b, +0xa4, +0x60, +0x14, +0x41, +0x18, +0x4d, +0x43, +0x52, +0x53, +0x9, +0x5b, +0x23, +0x4d, +0x4c, +0x43, +0x4b, +0xff, +0xff, +0x70, +0x99, +0x68, +0x0, +0x4d, +0x53, +0x45, +0x4c, +0x8, +0x4d, +0x52, +0x36, +0x34, +0x11, +0x33, +0xa, +0x30, +0x8a, +0x2b, +0x0, +0x0, +0xc, +0x3, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0xfe, +0xff, +0xff, +0xff, +0xff, +0xff, +0xff, +0xff, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0xff, +0xff, +0xff, +0xff, +0xff, +0xff, +0xff, +0xff, +0x79, +0x0, +0x8a, +0x4d, +0x52, +0x36, +0x34, +0xa, +0xe, +0x4d, +0x49, +0x4e, +0x4c, +0x8a, +0x4d, +0x52, +0x36, +0x34, +0xa, +0x12, +0x4d, +0x49, +0x4e, +0x48, +0x8a, +0x4d, +0x52, +0x36, +0x34, +0xa, +0x26, +0x4c, +0x45, +0x4e, +0x4c, +0x8a, +0x4d, +0x52, +0x36, +0x34, +0xa, +0x2a, +0x4c, +0x45, +0x4e, +0x48, +0x8a, +0x4d, +0x52, +0x36, +0x34, +0xa, +0x16, +0x4d, +0x41, +0x58, +0x4c, +0x8a, +0x4d, +0x52, +0x36, +0x34, +0xa, +0x1a, +0x4d, +0x41, +0x58, +0x48, +0x70, +0x4d, +0x52, +0x42, +0x48, +0x4d, +0x49, +0x4e, +0x48, +0x70, +0x4d, +0x52, +0x42, +0x4c, +0x4d, +0x49, +0x4e, +0x4c, +0x70, +0x4d, +0x52, +0x4c, +0x48, +0x4c, +0x45, +0x4e, +0x48, +0x70, +0x4d, +0x52, +0x4c, +0x4c, +0x4c, +0x45, +0x4e, +0x4c, +0x72, +0x4d, +0x49, +0x4e, +0x4c, +0x4c, +0x45, +0x4e, +0x4c, +0x4d, +0x41, +0x58, +0x4c, +0x72, +0x4d, +0x49, +0x4e, +0x48, +0x4c, +0x45, +0x4e, +0x48, +0x4d, +0x41, +0x58, +0x48, +0xa0, +0x14, +0x95, +0x4d, +0x41, +0x58, +0x4c, +0x4d, +0x49, +0x4e, +0x4c, +0x72, +0x4d, +0x41, +0x58, +0x48, +0x1, +0x4d, +0x41, +0x58, +0x48, +0xa0, +0x11, +0x95, +0x4d, +0x41, +0x58, +0x4c, +0x1, +0x74, +0x4d, +0x41, +0x58, +0x48, +0x1, +0x4d, +0x41, +0x58, +0x48, +0x74, +0x4d, +0x41, +0x58, +0x4c, +0x1, +0x4d, +0x41, +0x58, +0x4c, +0xa0, +0x44, +0x7, +0x93, +0x4d, +0x41, +0x58, +0x48, +0x0, +0x8, +0x4d, +0x52, +0x33, +0x32, +0x11, +0x1f, +0xa, +0x1c, +0x87, +0x17, +0x0, +0x0, +0xc, +0x3, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0x0, +0xfe, +0xff, +0xff, +0xff, +0x0, +0x0, +0x0, +0x0, +0xff, +0xff, +0xff, +0xff, +0x79, +0x0, +0x8a, +0x4d, +0x52, +0x33, +0x32, +0xa, +0xa, +0x4d, +0x49, +0x4e, +0x5f, +0x8a, +0x4d, +0x52, +0x33, +0x32, +0xa, +0xe, +0x4d, +0x41, +0x58, +0x5f, +0x8a, +0x4d, +0x52, +0x33, +0x32, +0xa, +0x16, +0x4c, +0x45, +0x4e, +0x5f, +0x70, +0x4d, +0x49, +0x4e, +0x4c, +0x4d, +0x49, +0x4e, +0x5f, +0x70, +0x4d, +0x41, +0x58, +0x4c, +0x4d, +0x41, +0x58, +0x5f, +0x70, +0x4c, +0x45, +0x4e, +0x4c, +0x4c, +0x45, +0x4e, +0x5f, +0x5b, +0x27, +0x4d, +0x4c, +0x43, +0x4b, +0xa4, +0x4d, +0x52, +0x33, +0x32, +0x5b, +0x27, +0x4d, +0x4c, +0x43, +0x4b, +0xa4, +0x4d, +0x52, +0x36, +0x34, +0x14, +0x24, +0x4d, +0x50, +0x58, +0x4d, +0x1, +0x5b, +0x23, +0x4d, +0x4c, +0x43, +0x4b, +0xff, +0xff, +0x70, +0x99, +0x68, +0x0, +0x4d, +0x53, +0x45, +0x4c, +0x70, +0x4d, +0x50, +0x58, +0x5f, +0x60, +0x5b, +0x27, +0x4d, +0x4c, +0x43, +0x4b, +0xa4, +0x60, +0x14, +0x28, +0x4d, +0x4f, +0x53, +0x54, +0x4, +0x5b, +0x23, +0x4d, +0x4c, +0x43, +0x4b, +0xff, +0xff, +0x70, +0x99, +0x68, +0x0, +0x4d, +0x53, +0x45, +0x4c, +0x70, +0x69, +0x4d, +0x4f, +0x45, +0x56, +0x70, +0x6a, +0x4d, +0x4f, +0x53, +0x43, +0x5b, +0x27, +0x4d, +0x4c, +0x43, +0x4b }; static unsigned char ssdt_isa_pest[] = { 0xd0 diff --git a/hw/i386/ssdt-pcihp.hex.generated b/hw/i386/ssdt-pcihp.hex.generated index b599b4663c..72ffa84800 100644 --- a/hw/i386/ssdt-pcihp.hex.generated +++ b/hw/i386/ssdt-pcihp.hex.generated @@ -32,7 +32,7 @@ static unsigned char ssdp_pcihp_aml[] = { 0x0, 0x0, 0x1, -0x6b, +0x70, 0x42, 0x58, 0x50, @@ -55,8 +55,8 @@ static unsigned char ssdp_pcihp_aml[] = { 0x4e, 0x54, 0x4c, -0x23, -0x8, +0x15, +0x11, 0x13, 0x20, 0x10, diff --git a/hw/i386/ssdt-proc.hex.generated b/hw/i386/ssdt-proc.hex.generated index 97e28d4820..4df0734c79 100644 --- a/hw/i386/ssdt-proc.hex.generated +++ b/hw/i386/ssdt-proc.hex.generated @@ -11,7 +11,7 @@ static unsigned char ssdp_proc_aml[] = { 0x0, 0x0, 0x1, -0x78, +0x7d, 0x42, 0x58, 0x50, @@ -34,8 +34,8 @@ static unsigned char ssdp_proc_aml[] = { 0x4e, 0x54, 0x4c, -0x23, -0x8, +0x15, +0x11, 0x13, 0x20, 0x5b, From 63122df2315564253b14d2abec1d854570488f83 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 8 Jun 2014 15:59:58 +0300 Subject: [PATCH 035/109] acpi-test: update expected tables Update ACPI tables test to match new ACPI tables after adding the memory hotplug feature. Signed-off-by: Michael S. Tsirkin --- tests/acpi-test-data/pc/DSDT | Bin 4480 -> 4499 bytes tests/acpi-test-data/pc/SSDT | Bin 2269 -> 3065 bytes tests/acpi-test-data/q35/DSDT | Bin 7378 -> 7397 bytes tests/acpi-test-data/q35/SSDT | Bin 550 -> 1346 bytes 4 files changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/acpi-test-data/pc/DSDT b/tests/acpi-test-data/pc/DSDT index d0bb3de79d2ff98736e8a54e6e8859ab4efeaf03..7ed03fd37effb1b0b2c19f49b5cbff168a29f9bc 100644 GIT binary patch delta 56 zcmZoro~+E}66_K(S&)H&F=!*#76B$#-pL09(j+D0T@8#GV)R+!gPr07oIMSEJpx>O MgPr{*>j;Jb0HJUW%K!iX delta 37 tcmbQN+@Q?m66_MvAjrVLxOXGh76B$Fp2-IU(%9JIeGH5lCff^!006)s3CjQg diff --git a/tests/acpi-test-data/pc/SSDT b/tests/acpi-test-data/pc/SSDT index c987fb237925aa532afb3d795ac5a787548e333d..eb2d8b698ce6a3a910a05244a3b6cf80bf818fb9 100644 GIT binary patch delta 826 zcmZ8fU2D`p6rG`krn7A&ZfqA38u1gQ61!!e-6pG9JF|vNl$Hn+EQ?i?9n$t;@gc4+ zLVaBN;M*em2gKjw59y2Rz4@q%6PTPicg~r)bMJ5SVuMNPMHe+f=(nHjfM$D-Uq4cE zR|Rg@iN6R(_hy7z>2(K5e z5E$`iKM*0U5yl#ZazkmZGWLuT247=Hth$m>n@pC+s321^WwkgPZK7yFguC2^$%ly} z;(&sEWyhm>?n<68(UFRZZcoyf3hwTacyz50%MM6hbY8(EUE6xTD5Tq)J4uryzJ-$6 z?8lw+1kcxr0M#U`k8?*6Jv~UM&h68LCh&qCvtm?ixttBj87kQt+>saXDnP%0cX_8` z9{?=bbe2Pfcq!_ka3PKDe?$VjEg*u@`-hh9L7^z)0{MOlAc^L zh&aah#9JC84UG{tx6Ms<w;a2;->G#-s%jqVuK zcu@?MMM8&5*xIpfL*oege+@LY)4xjHhgOLeoG5Rd=!A|i3&YlLo9sw@C L!Onh@>tsRzue1*V delta 37 tcmaEAdC8K?CD)bv#%yfyJ_g1NlPAiA007YN3IPBB diff --git a/tests/acpi-test-data/q35/SSDT b/tests/acpi-test-data/q35/SSDT index 91996387571e32deecfb5e8731fe2e88d6bec65a..778b79bf428b5d7602b7b80c9434e38c79718bb2 100644 GIT binary patch literal 1346 zcmZWp-%r#)5Z;vo%iaNZM@3>xxWwNWwZ|d)NUyD4q1#)#1u-SrCWd$z4b1@$ya^#-?)GO$I64iUz^STDQ{L`ahs+M2QgS(;$% zHJ${a^niRvid}P?+|OVR;08o@eK(7au5=r86j!U8Plx$K?gnoNJIh`8!Z3{`sCNx2 z;ai_~+6R@^&R+6q|3js{wY#(5e!sO#G;aF=9zE375ImrH!r1KP%<$M|4~RaUR@fcy zpaRP4v3ztmhnf){)V&J48u0QEd4R-*szis=X;8AvxFZ$it}kgsxo54C=+Tv|T4*eD@!bO$B<{;zwdF>as498}%uP8yJhuinKpNu#%88b=Rsb&u*# zcmNl5NuT^^kyM%Qr1>OlYPdSq%~=kIKR(=RxK`uA(qgH`jFZQN6h`o=|+c zvwuKNP{<}BMK6P8fQ<~^_naX+4{*TFaC2j@_7=cKf@g{%1Re$g;5ATyz{4PIq!FMN zjns|mu6~Lf`rByJlq{tr-8*LxdFb=H_Vo_Z&=f&!p)|*Cf4h0PWGofS#1hl;Y72}( zcm*{9oCNIv&Okyc!JW~aU~qIMm?ydhH-xftxU{I>g{I;3>IEd&N*CAk5ZWZn{kLcv zTPXR6J(Ptk9v29XMs!>swS-}07o@qi!rw|d-=-y8nC}-vpHkcmNzW48#06mEID;+2 zf$gjq)Dq4Lnk~KTL(g67<~uD$zGC!k%@R$&Y{L{6S916}iH@eEVJJT From 9e28840658824f52f03f5a69661fd4b22987feeb Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Fri, 6 Jun 2014 18:43:29 +0200 Subject: [PATCH 036/109] virtio: Drop superfluous conditionals around g_free() Signed-off-by: Markus Armbruster Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Eric Blake --- hw/9pfs/virtio-9p.c | 4 +--- hw/net/virtio-net.c | 24 ++++++++---------------- hw/virtio/vhost.c | 8 ++------ hw/virtio/virtio.c | 12 ++++-------- 4 files changed, 15 insertions(+), 33 deletions(-) diff --git a/hw/9pfs/virtio-9p.c b/hw/9pfs/virtio-9p.c index 9aa6725f09..5861a5b826 100644 --- a/hw/9pfs/virtio-9p.c +++ b/hw/9pfs/virtio-9p.c @@ -299,9 +299,7 @@ static int v9fs_xattr_fid_clunk(V9fsPDU *pdu, V9fsFidState *fidp) free_out: v9fs_string_free(&fidp->fs.xattr.name); free_value: - if (fidp->fs.xattr.value) { - g_free(fidp->fs.xattr.value); - } + g_free(fidp->fs.xattr.value); return retval; } diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 940a7cfe54..dd28a5a0a4 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1509,14 +1509,10 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name, */ assert(type != NULL); - if (n->netclient_name) { - g_free(n->netclient_name); - n->netclient_name = NULL; - } - if (n->netclient_type) { - g_free(n->netclient_type); - n->netclient_type = NULL; - } + g_free(n->netclient_name); + n->netclient_name = NULL; + g_free(n->netclient_type); + n->netclient_type = NULL; if (name != NULL) { n->netclient_name = g_strdup(name); @@ -1616,14 +1612,10 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) unregister_savevm(dev, "virtio-net", n); - if (n->netclient_name) { - g_free(n->netclient_name); - n->netclient_name = NULL; - } - if (n->netclient_type) { - g_free(n->netclient_type); - n->netclient_type = NULL; - } + g_free(n->netclient_name); + n->netclient_name = NULL; + g_free(n->netclient_type); + n->netclient_type = NULL; g_free(n->mac_table.macs); g_free(n->vlans); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index f62cfaf38e..9e6023af36 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -295,9 +295,7 @@ static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size) if (dev->log_size) { vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); } - if (dev->log) { - g_free(dev->log); - } + g_free(dev->log); dev->log = log; dev->log_size = size; } @@ -601,9 +599,7 @@ static int vhost_migration_log(MemoryListener *listener, int enable) if (r < 0) { return r; } - if (dev->log) { - g_free(dev->log); - } + g_free(dev->log); dev->log = NULL; dev->log_size = 0; } else { diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index a07ae8ad91..3b938c8783 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1164,10 +1164,8 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) { - if (vdev->bus_name) { - g_free(vdev->bus_name); - vdev->bus_name = NULL; - } + g_free(vdev->bus_name); + vdev->bus_name = NULL; if (bus_name) { vdev->bus_name = g_strdup(bus_name); @@ -1206,10 +1204,8 @@ static void virtio_device_unrealize(DeviceState *dev, Error **errp) } } - if (vdev->bus_name) { - g_free(vdev->bus_name); - vdev->bus_name = NULL; - } + g_free(vdev->bus_name); + vdev->bus_name = NULL; } static void virtio_device_class_init(ObjectClass *klass, void *data) From 80e0090a44388d8924f548e49f7390c1e1585761 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Fri, 6 Jun 2014 18:43:30 +0200 Subject: [PATCH 037/109] virtio: Drop superfluous conditionals around g_strdup() Signed-off-by: Markus Armbruster Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Eric Blake --- hw/net/virtio-net.c | 7 +------ hw/virtio/virtio.c | 6 +----- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index dd28a5a0a4..0a34a8b908 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -1510,13 +1510,8 @@ void virtio_net_set_netclient_name(VirtIONet *n, const char *name, assert(type != NULL); g_free(n->netclient_name); - n->netclient_name = NULL; g_free(n->netclient_type); - n->netclient_type = NULL; - - if (name != NULL) { - n->netclient_name = g_strdup(name); - } + n->netclient_name = g_strdup(name); n->netclient_type = g_strdup(type); } diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 3b938c8783..a3082d569d 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1165,11 +1165,7 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq) void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name) { g_free(vdev->bus_name); - vdev->bus_name = NULL; - - if (bus_name) { - vdev->bus_name = g_strdup(bus_name); - } + vdev->bus_name = g_strdup(bus_name); } static void virtio_device_realize(DeviceState *dev, Error **errp) From 292b1634d008d289d633ff5a8f2275288f607e56 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 8 Jun 2014 18:45:25 +0300 Subject: [PATCH 038/109] ich: get rid of spaces in type name Names with spaces in them are nasty, let's not go there. Signed-off-by: Michael S. Tsirkin --- include/hw/i386/ich9.h | 2 +- include/hw/i386/pc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hw/i386/ich9.h b/include/hw/i386/ich9.h index e19143555e..59ea25b49a 100644 --- a/include/hw/i386/ich9.h +++ b/include/hw/i386/ich9.h @@ -24,7 +24,7 @@ I2CBus *ich9_smb_init(PCIBus *bus, int devfn, uint32_t smb_io_base); #define ICH9_CC_SIZE (16 * 1024) /* 16KB */ -#define TYPE_ICH9_LPC_DEVICE "ICH9 LPC" +#define TYPE_ICH9_LPC_DEVICE "ICH9-LPC" #define ICH9_LPC_DEVICE(obj) \ OBJECT_CHECK(ICH9LPCState, (obj), TYPE_ICH9_LPC_DEVICE) diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index a2bf22c6a1..2e6ac04aea 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -295,7 +295,7 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); #define PC_Q35_COMPAT_2_0 \ PC_COMPAT_2_0, \ {\ - .driver = "ICH9 LPC",\ + .driver = "ICH9-LPC",\ .property = "memory-hotplug-support",\ .value = "off",\ } From 9d3cae68ac4de8cbae7bf750d6ebe6cb87cd9365 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 9 Jun 2014 19:28:00 +0200 Subject: [PATCH 039/109] pc: q35: acpi: report error to user on unsupported unplug request Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/isa/lpc_ich9.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index fb2b82d13f..3fe23113b0 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -610,6 +610,8 @@ static void ich9_device_plug_cb(HotplugHandler *hotplug_dev, static void ich9_device_unplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { + error_setg(errp, "acpi: device unplug request for not supported device" + " type: %s", object_get_typename(OBJECT(dev))); } static bool ich9_rst_cnt_needed(void *opaque) From 110f463062d65487fdd0daaaa0df53a2ea9d6619 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 20 May 2014 14:01:42 +0800 Subject: [PATCH 040/109] migration: export SELF_ANNOUNCE_ROUNDS Export it for other users. Signed-off-by: Jason Wang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/migration/vmstate.h | 2 ++ savevm.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 7e45048355..6edce982b2 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -752,6 +752,8 @@ extern const VMStateInfo vmstate_info_bitmap; #define VMSTATE_END_OF_LIST() \ {} +#define SELF_ANNOUNCE_ROUNDS 5 + int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, int version_id); void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, diff --git a/savevm.c b/savevm.c index 7b2c410ea1..ebffb56a12 100644 --- a/savevm.c +++ b/savevm.c @@ -42,7 +42,6 @@ #include "block/snapshot.h" #include "block/qapi.h" -#define SELF_ANNOUNCE_ROUNDS 5 #ifndef ETH_P_RARP #define ETH_P_RARP 0x8035 From 508e1180d3e8371a902ba721d2c9e1ee04c33a15 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 20 May 2014 14:01:43 +0800 Subject: [PATCH 041/109] migration: introduce self_announce_delay() This patch introduces self_announce_delay() to calculate the delay for the next announce round. This could be used by other device e.g virtio-net who wants to do announcing by itself. Signed-off-by: Jason Wang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/migration/vmstate.h | 8 ++++++++ savevm.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 6edce982b2..799d2d0f03 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -780,4 +780,12 @@ void vmstate_register_ram(struct MemoryRegion *memory, DeviceState *dev); void vmstate_unregister_ram(struct MemoryRegion *memory, DeviceState *dev); void vmstate_register_ram_global(struct MemoryRegion *memory); +static inline +int64_t self_announce_delay(int round) +{ + assert(round < SELF_ANNOUNCE_ROUNDS && round > 0); + /* delay 50ms, 150ms, 250ms, ... */ + return 50 + (SELF_ANNOUNCE_ROUNDS - round - 1) * 100; +} + #endif diff --git a/savevm.c b/savevm.c index ebffb56a12..6cbdaacacd 100644 --- a/savevm.c +++ b/savevm.c @@ -97,7 +97,7 @@ static void qemu_announce_self_once(void *opaque) if (--count) { /* delay 50ms, 150ms, 250ms, ... */ timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + - 50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100); + self_announce_delay(count)); } else { timer_del(timer); timer_free(timer); From f57fcf706342f8545d77a538c4319b752511f672 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 20 May 2014 14:01:44 +0800 Subject: [PATCH 042/109] virtio-net: announce self by guest It's hard to track all mac addresses and their configurations (e.g vlan or ipv6) in qemu. Without this information, it's impossible to build proper garp packet after migration. The only possible solution to this is let guest (who knows all configurations) to do this. So, this patch introduces a new readonly config status bit of virtio-net, VIRTIO_NET_S_ANNOUNCE which is used to notify guest to announce presence of its link through config update interrupt.When guest has done the announcement, it should ack the notification through VIRTIO_NET_CTRL_ANNOUNCE_ACK cmd. This feature is negotiated by a new feature bit VIRTIO_NET_F_ANNOUNCE (which has already been supported by Linux guest). During load, a counter of announcing rounds is set so that after the vm is running it can trigger rounds of config interrupts to notify the guest to build and send the correct garps. Cc: Liuyongan Cc: Amos Kong Signed-off-by: Jason Wang Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/virtio-net.c | 42 ++++++++++++++++++++++++++++++++++ include/hw/i386/pc.h | 5 ++++ include/hw/virtio/virtio-net.h | 17 ++++++++++++++ 3 files changed, 64 insertions(+) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 0a34a8b908..318b033911 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -99,6 +99,16 @@ static bool virtio_net_started(VirtIONet *n, uint8_t status) (n->status & VIRTIO_NET_S_LINK_UP) && vdev->vm_running; } +static void virtio_net_announce_timer(void *opaque) +{ + VirtIONet *n = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(n); + + n->announce_counter--; + n->status |= VIRTIO_NET_S_ANNOUNCE; + virtio_notify_config(vdev); +} + static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) { VirtIODevice *vdev = VIRTIO_DEVICE(n); @@ -322,6 +332,9 @@ static void virtio_net_reset(VirtIODevice *vdev) n->nobcast = 0; /* multiqueue is disabled by default */ n->curr_queues = 1; + timer_del(n->announce_timer); + n->announce_counter = 0; + n->status &= ~VIRTIO_NET_S_ANNOUNCE; /* Flush any MAC and VLAN filter table state */ n->mac_table.in_use = 0; @@ -731,6 +744,23 @@ static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd, return VIRTIO_NET_OK; } +static int virtio_net_handle_announce(VirtIONet *n, uint8_t cmd, + struct iovec *iov, unsigned int iov_cnt) +{ + if (cmd == VIRTIO_NET_CTRL_ANNOUNCE_ACK && + n->status & VIRTIO_NET_S_ANNOUNCE) { + n->status &= ~VIRTIO_NET_S_ANNOUNCE; + if (n->announce_counter) { + timer_mod(n->announce_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + self_announce_delay(n->announce_counter)); + } + return VIRTIO_NET_OK; + } else { + return VIRTIO_NET_ERR; + } +} + static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd, struct iovec *iov, unsigned int iov_cnt) { @@ -794,6 +824,8 @@ static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq) status = virtio_net_handle_mac(n, ctrl.cmd, iov, iov_cnt); } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) { status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, iov_cnt); + } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) { + status = virtio_net_handle_announce(n, ctrl.cmd, iov, iov_cnt); } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) { status = virtio_net_handle_mq(n, ctrl.cmd, iov, iov_cnt); } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) { @@ -1451,6 +1483,12 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) qemu_get_subqueue(n->nic, i)->link_down = link_down; } + if (vdev->guest_features & (0x1 << VIRTIO_NET_F_GUEST_ANNOUNCE) && + vdev->guest_features & (0x1 << VIRTIO_NET_F_CTRL_VQ)) { + n->announce_counter = SELF_ANNOUNCE_ROUNDS; + timer_mod(n->announce_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL)); + } + return 0; } @@ -1553,6 +1591,8 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) qemu_macaddr_default_if_unset(&n->nic_conf.macaddr); memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac)); n->status = VIRTIO_NET_S_LINK_UP; + n->announce_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, + virtio_net_announce_timer, n); if (n->netclient_type) { /* @@ -1629,6 +1669,8 @@ static void virtio_net_device_unrealize(DeviceState *dev, Error **errp) } } + timer_del(n->announce_timer); + timer_free(n->announce_timer); g_free(n->vqs); qemu_del_nic(n->nic); virtio_cleanup(vdev); diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 2e6ac04aea..a9529f418c 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -351,6 +351,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); .driver = "pci-serial-4x",\ .property = "prog_if",\ .value = stringify(0),\ + },\ + {\ + .driver = "virtio-net-pci",\ + .property = "guest_announce",\ + .value = "off",\ } #define PC_COMPAT_1_7 \ diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index 4b32440837..f7fccc08a4 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -49,12 +49,14 @@ #define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ #define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE 21 /* Guest can announce itself */ #define VIRTIO_NET_F_MQ 22 /* Device supports Receive Flow * Steering */ #define VIRTIO_NET_F_CTRL_MAC_ADDR 23 /* Set MAC address */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ +#define VIRTIO_NET_S_ANNOUNCE 2 /* Announcement is needed */ #define TX_TIMER_INTERVAL 150000 /* 150 us */ @@ -193,6 +195,8 @@ typedef struct VirtIONet { char *netclient_name; char *netclient_type; uint64_t curr_guest_offloads; + QEMUTimer *announce_timer; + int announce_counter; } VirtIONet; #define VIRTIO_NET_CTRL_MAC 1 @@ -212,6 +216,18 @@ typedef struct VirtIONet { #define VIRTIO_NET_CTRL_VLAN_ADD 0 #define VIRTIO_NET_CTRL_VLAN_DEL 1 +/* + * Control link announce acknowledgement + * + * VIRTIO_NET_S_ANNOUNCE bit in the status field requests link announcement from + * guest driver. The driver is notified by config space change interrupt. The + * command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that the driver has + * received the notification. It makes the device clear the bit + * VIRTIO_NET_S_ANNOUNCE in the status field. + */ +#define VIRTIO_NET_CTRL_ANNOUNCE 3 + #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0 + /* * Control Multiqueue * @@ -251,6 +267,7 @@ struct virtio_net_ctrl_mq { DEFINE_PROP_BIT("guest_tso6", _state, _field, VIRTIO_NET_F_GUEST_TSO6, true), \ DEFINE_PROP_BIT("guest_ecn", _state, _field, VIRTIO_NET_F_GUEST_ECN, true), \ DEFINE_PROP_BIT("guest_ufo", _state, _field, VIRTIO_NET_F_GUEST_UFO, true), \ + DEFINE_PROP_BIT("guest_announce", _state, _field, VIRTIO_NET_F_GUEST_ANNOUNCE, true), \ DEFINE_PROP_BIT("host_tso4", _state, _field, VIRTIO_NET_F_HOST_TSO4, true), \ DEFINE_PROP_BIT("host_tso6", _state, _field, VIRTIO_NET_F_HOST_TSO6, true), \ DEFINE_PROP_BIT("host_ecn", _state, _field, VIRTIO_NET_F_HOST_ECN, true), \ From 69e03ae64ba01c4846425fb13e494bce3eedfa6b Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:03:35 +0300 Subject: [PATCH 043/109] Add kvm_eventfds_enabled function Add a function to check if the eventfd capability is present in KVM in the host kernel. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Paolo Bonzini --- include/sysemu/kvm.h | 11 +++++++++++ kvm-all.c | 4 ++++ kvm-stub.c | 1 + 3 files changed, 16 insertions(+) diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index e79e92c50e..c4556ad59e 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -43,6 +43,7 @@ extern bool kvm_allowed; extern bool kvm_kernel_irqchip; extern bool kvm_async_interrupts_allowed; extern bool kvm_halt_in_kernel_allowed; +extern bool kvm_eventfds_allowed; extern bool kvm_irqfds_allowed; extern bool kvm_msi_via_irqfd_allowed; extern bool kvm_gsi_routing_allowed; @@ -82,6 +83,15 @@ extern bool kvm_readonly_mem_allowed; */ #define kvm_halt_in_kernel() (kvm_halt_in_kernel_allowed) +/** + * kvm_eventfds_enabled: + * + * Returns: true if we can use eventfds to receive notifications + * from a KVM CPU (ie the kernel supports eventds and we are running + * with a configuration where it is meaningful to use them). + */ +#define kvm_eventfds_enabled() (kvm_eventfds_allowed) + /** * kvm_irqfds_enabled: * @@ -128,6 +138,7 @@ extern bool kvm_readonly_mem_allowed; #define kvm_irqchip_in_kernel() (false) #define kvm_async_interrupts_enabled() (false) #define kvm_halt_in_kernel() (false) +#define kvm_eventfds_enabled() (false) #define kvm_irqfds_enabled() (false) #define kvm_msi_via_irqfd_enabled() (false) #define kvm_gsi_routing_allowed() (false) diff --git a/kvm-all.c b/kvm-all.c index 4e19eff0ef..92f56d8a31 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -113,6 +113,7 @@ KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; bool kvm_halt_in_kernel_allowed; +bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; @@ -1541,6 +1542,9 @@ int kvm_init(MachineClass *mc) (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); #endif + kvm_eventfds_allowed = + (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); + ret = kvm_arch_init(s); if (ret < 0) { goto err; diff --git a/kvm-stub.c b/kvm-stub.c index ac33d8666d..8e7737caa9 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -22,6 +22,7 @@ KVMState *kvm_state; bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; +bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; From 7b0bfdf52d694c9a3a96505aa42ce3f8d63acd35 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:03:48 +0300 Subject: [PATCH 044/109] Add chardev API qemu_chr_fe_read_all This function will attempt to read data from the chardev trying to fill the buffer up to the given length. Add tcp_chr_disconnect to reuse disconnection code where needed. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/sysemu/char.h | 14 ++++++++ qemu-char.c | 83 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 87 insertions(+), 10 deletions(-) diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 7f5eeb38b0..5281123c9a 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -56,6 +56,8 @@ typedef void IOEventHandler(void *opaque, int event); struct CharDriverState { void (*init)(struct CharDriverState *s); int (*chr_write)(struct CharDriverState *s, const uint8_t *buf, int len); + int (*chr_sync_read)(struct CharDriverState *s, + const uint8_t *buf, int len); GSource *(*chr_add_watch)(struct CharDriverState *s, GIOCondition cond); void (*chr_update_read_handler)(struct CharDriverState *s); int (*chr_ioctl)(struct CharDriverState *s, int cmd, void *arg); @@ -188,6 +190,18 @@ int qemu_chr_fe_write(CharDriverState *s, const uint8_t *buf, int len); */ int qemu_chr_fe_write_all(CharDriverState *s, const uint8_t *buf, int len); +/** + * @qemu_chr_fe_read_all: + * + * Read data to a buffer from the back end. + * + * @buf the data buffer + * @len the number of bytes to read + * + * Returns: the number of bytes read + */ +int qemu_chr_fe_read_all(CharDriverState *s, uint8_t *buf, int len); + /** * @qemu_chr_fe_ioctl: * diff --git a/qemu-char.c b/qemu-char.c index f918f90972..8fb11c8341 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -84,6 +84,7 @@ #include "ui/qemu-spice.h" #define READ_BUF_LEN 4096 +#define READ_RETRIES 10 /***********************************************************/ /* character device */ @@ -145,6 +146,41 @@ int qemu_chr_fe_write_all(CharDriverState *s, const uint8_t *buf, int len) return offset; } +int qemu_chr_fe_read_all(CharDriverState *s, uint8_t *buf, int len) +{ + int offset = 0, counter = 10; + int res; + + if (!s->chr_sync_read) { + return 0; + } + + while (offset < len) { + do { + res = s->chr_sync_read(s, buf + offset, len - offset); + if (res == -1 && errno == EAGAIN) { + g_usleep(100); + } + } while (res == -1 && errno == EAGAIN); + + if (res == 0) { + break; + } + + if (res < 0) { + return res; + } + + offset += res; + + if (!counter--) { + break; + } + } + + return offset; +} + int qemu_chr_fe_ioctl(CharDriverState *s, int cmd, void *arg) { if (!s->chr_ioctl) @@ -2454,6 +2490,23 @@ static GSource *tcp_chr_add_watch(CharDriverState *chr, GIOCondition cond) return g_io_create_watch(s->chan, cond); } +static void tcp_chr_disconnect(CharDriverState *chr) +{ + TCPCharDriver *s = chr->opaque; + + s->connected = 0; + if (s->listen_chan) { + s->listen_tag = g_io_add_watch(s->listen_chan, G_IO_IN, + tcp_chr_accept, chr); + } + remove_fd_in_watch(chr); + g_io_channel_unref(s->chan); + s->chan = NULL; + closesocket(s->fd); + s->fd = -1; + qemu_chr_be_event(chr, CHR_EVENT_CLOSED); +} + static gboolean tcp_chr_read(GIOChannel *chan, GIOCondition cond, void *opaque) { CharDriverState *chr = opaque; @@ -2470,16 +2523,7 @@ static gboolean tcp_chr_read(GIOChannel *chan, GIOCondition cond, void *opaque) size = tcp_chr_recv(chr, (void *)buf, len); if (size == 0) { /* connection closed */ - s->connected = 0; - if (s->listen_chan) { - s->listen_tag = g_io_add_watch(s->listen_chan, G_IO_IN, tcp_chr_accept, chr); - } - remove_fd_in_watch(chr); - g_io_channel_unref(s->chan); - s->chan = NULL; - closesocket(s->fd); - s->fd = -1; - qemu_chr_be_event(chr, CHR_EVENT_CLOSED); + tcp_chr_disconnect(chr); } else if (size > 0) { if (s->do_telnetopt) tcp_chr_process_IAC_bytes(chr, s, buf, &size); @@ -2490,6 +2534,24 @@ static gboolean tcp_chr_read(GIOChannel *chan, GIOCondition cond, void *opaque) return TRUE; } +static int tcp_chr_sync_read(CharDriverState *chr, const uint8_t *buf, int len) +{ + TCPCharDriver *s = chr->opaque; + int size; + + if (!s->connected) { + return 0; + } + + size = tcp_chr_recv(chr, (void *) buf, len); + if (size == 0) { + /* connection closed */ + tcp_chr_disconnect(chr); + } + + return size; +} + #ifndef _WIN32 CharDriverState *qemu_chr_open_eventfd(int eventfd) { @@ -2678,6 +2740,7 @@ static CharDriverState *qemu_chr_open_socket_fd(int fd, bool do_nodelay, chr->opaque = s; chr->chr_write = tcp_chr_write; + chr->chr_sync_read = tcp_chr_sync_read; chr->chr_close = tcp_chr_close; chr->get_msgfd = tcp_get_msgfd; chr->chr_add_client = tcp_chr_add_client; From d39aac7aac2abd78c483f8527c5aa9ae6156bd49 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:04:02 +0300 Subject: [PATCH 045/109] Add chardev API qemu_chr_fe_set_msgfds This will set an array of file descriptors to the internal structures. The next time a message is send the array will be send as ancillary data. This feature works on the UNIX domain socket backend only. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/sysemu/char.h | 14 +++++++ qemu-char.c | 88 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 5281123c9a..1aa080e00a 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -62,6 +62,7 @@ struct CharDriverState { void (*chr_update_read_handler)(struct CharDriverState *s); int (*chr_ioctl)(struct CharDriverState *s, int cmd, void *arg); int (*get_msgfd)(struct CharDriverState *s); + int (*set_msgfds)(struct CharDriverState *s, int *fds, int num); int (*chr_add_client)(struct CharDriverState *chr, int fd); IOEventHandler *chr_event; IOCanReadHandler *chr_can_read; @@ -228,6 +229,19 @@ int qemu_chr_fe_ioctl(CharDriverState *s, int cmd, void *arg); */ int qemu_chr_fe_get_msgfd(CharDriverState *s); +/** + * @qemu_chr_fe_set_msgfds: + * + * For backends capable of fd passing, set an array of fds to be passed with + * the next send operation. + * A subsequent call to this function before calling a write function will + * result in overwriting the fd array with the new value without being send. + * Upon writing the message the fd array is freed. + * + * Returns: -1 if fd passing isn't supported. + */ +int qemu_chr_fe_set_msgfds(CharDriverState *s, int *fds, int num); + /** * @qemu_chr_fe_claim: * diff --git a/qemu-char.c b/qemu-char.c index 8fb11c8341..f9709d9d0d 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -207,6 +207,11 @@ int qemu_chr_fe_get_msgfd(CharDriverState *s) return s->get_msgfd ? s->get_msgfd(s) : -1; } +int qemu_chr_fe_set_msgfds(CharDriverState *s, int *fds, int num) +{ + return s->set_msgfds ? s->set_msgfds(s, fds, num) : -1; +} + int qemu_chr_add_client(CharDriverState *s, int fd) { return s->chr_add_client ? s->chr_add_client(s, fd) : -1; @@ -2333,15 +2338,71 @@ typedef struct { int do_nodelay; int is_unix; int msgfd; + int *write_msgfds; + int write_msgfds_num; } TCPCharDriver; static gboolean tcp_chr_accept(GIOChannel *chan, GIOCondition cond, void *opaque); +#ifndef _WIN32 +static int unix_send_msgfds(CharDriverState *chr, const uint8_t *buf, int len) +{ + TCPCharDriver *s = chr->opaque; + struct msghdr msgh; + struct iovec iov; + int r; + + size_t fd_size = s->write_msgfds_num * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + struct cmsghdr *cmsg; + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + /* set the payload */ + iov.iov_base = (uint8_t *) buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + cmsg = CMSG_FIRSTHDR(&msgh); + + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), s->write_msgfds, fd_size); + + do { + r = sendmsg(s->fd, &msgh, 0); + } while (r < 0 && errno == EINTR); + + /* free the written msgfds, no matter what */ + if (s->write_msgfds_num) { + g_free(s->write_msgfds); + s->write_msgfds = 0; + s->write_msgfds_num = 0; + } + + return r; +} +#endif + static int tcp_chr_write(CharDriverState *chr, const uint8_t *buf, int len) { TCPCharDriver *s = chr->opaque; if (s->connected) { - return io_channel_send(s->chan, buf, len); +#ifndef _WIN32 + if (s->is_unix && s->write_msgfds_num) { + return unix_send_msgfds(chr, buf, len); + } else +#endif + { + return io_channel_send(s->chan, buf, len); + } } else { /* XXX: indicate an error ? */ return len; @@ -2416,6 +2477,25 @@ static int tcp_get_msgfd(CharDriverState *chr) return fd; } +static int tcp_set_msgfds(CharDriverState *chr, int *fds, int num) +{ + TCPCharDriver *s = chr->opaque; + + /* clear old pending fd array */ + if (s->write_msgfds) { + g_free(s->write_msgfds); + } + + if (num) { + s->write_msgfds = g_malloc(num * sizeof(int)); + memcpy(s->write_msgfds, fds, num * sizeof(int)); + } + + s->write_msgfds_num = num; + + return 0; +} + #ifndef _WIN32 static void unix_process_msgfd(CharDriverState *chr, struct msghdr *msg) { @@ -2683,6 +2763,9 @@ static void tcp_chr_close(CharDriverState *chr) } closesocket(s->listen_fd); } + if (s->write_msgfds_num) { + g_free(s->write_msgfds); + } g_free(s); qemu_chr_be_event(chr, CHR_EVENT_CLOSED); } @@ -2712,6 +2795,8 @@ static CharDriverState *qemu_chr_open_socket_fd(int fd, bool do_nodelay, s->fd = -1; s->listen_fd = -1; s->msgfd = -1; + s->write_msgfds = 0; + s->write_msgfds_num = 0; chr->filename = g_malloc(256); switch (ss.ss_family) { @@ -2743,6 +2828,7 @@ static CharDriverState *qemu_chr_open_socket_fd(int fd, bool do_nodelay, chr->chr_sync_read = tcp_chr_sync_read; chr->chr_close = tcp_chr_close; chr->get_msgfd = tcp_get_msgfd; + chr->set_msgfds = tcp_set_msgfds; chr->chr_add_client = tcp_chr_add_client; chr->chr_add_watch = tcp_chr_add_watch; chr->chr_update_read_handler = tcp_chr_update_read_handler; From c76bf6bb8fbbb233a7d3641e09229d23747d5ee3 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:04:15 +0300 Subject: [PATCH 046/109] Add chardev API qemu_chr_fe_get_msgfds This extends the existing qemu_chr_fe_get_msgfd by allowing to read a set of fds. The function for receiving the fds - unix_process_msgfd is extended to allocate the needed array size. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/sysemu/char.h | 15 +++++++- qemu-char.c | 85 ++++++++++++++++++++++++++++++++----------- 2 files changed, 78 insertions(+), 22 deletions(-) diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 1aa080e00a..930aaf1abe 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -61,7 +61,7 @@ struct CharDriverState { GSource *(*chr_add_watch)(struct CharDriverState *s, GIOCondition cond); void (*chr_update_read_handler)(struct CharDriverState *s); int (*chr_ioctl)(struct CharDriverState *s, int cmd, void *arg); - int (*get_msgfd)(struct CharDriverState *s); + int (*get_msgfds)(struct CharDriverState *s, int* fds, int num); int (*set_msgfds)(struct CharDriverState *s, int *fds, int num); int (*chr_add_client)(struct CharDriverState *chr, int fd); IOEventHandler *chr_event; @@ -229,6 +229,19 @@ int qemu_chr_fe_ioctl(CharDriverState *s, int cmd, void *arg); */ int qemu_chr_fe_get_msgfd(CharDriverState *s); +/** + * @qemu_chr_fe_get_msgfds: + * + * For backends capable of fd passing, return the number of file received + * descriptors and fills the fds array up to num elements + * + * Returns: -1 if fd passing isn't supported or there are no pending file + * descriptors. If file descriptors are returned, subsequent calls to + * this function will return -1 until a client sends a new set of file + * descriptors. + */ +int qemu_chr_fe_get_msgfds(CharDriverState *s, int *fds, int num); + /** * @qemu_chr_fe_set_msgfds: * diff --git a/qemu-char.c b/qemu-char.c index f9709d9d0d..b9bef443de 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -204,7 +204,13 @@ void qemu_chr_be_write(CharDriverState *s, uint8_t *buf, int len) int qemu_chr_fe_get_msgfd(CharDriverState *s) { - return s->get_msgfd ? s->get_msgfd(s) : -1; + int fd; + return (qemu_chr_fe_get_msgfds(s, &fd, 1) >= 0) ? fd : -1; +} + +int qemu_chr_fe_get_msgfds(CharDriverState *s, int *fds, int len) +{ + return s->get_msgfds ? s->get_msgfds(s, fds, len) : -1; } int qemu_chr_fe_set_msgfds(CharDriverState *s, int *fds, int num) @@ -2337,7 +2343,8 @@ typedef struct { int do_telnetopt; int do_nodelay; int is_unix; - int msgfd; + int *read_msgfds; + int read_msgfds_num; int *write_msgfds; int write_msgfds_num; } TCPCharDriver; @@ -2469,12 +2476,20 @@ static void tcp_chr_process_IAC_bytes(CharDriverState *chr, *size = j; } -static int tcp_get_msgfd(CharDriverState *chr) +static int tcp_get_msgfds(CharDriverState *chr, int *fds, int num) { TCPCharDriver *s = chr->opaque; - int fd = s->msgfd; - s->msgfd = -1; - return fd; + int to_copy = (s->read_msgfds_num < num) ? s->read_msgfds_num : num; + + if (to_copy) { + memcpy(fds, s->read_msgfds, to_copy * sizeof(int)); + + g_free(s->read_msgfds); + s->read_msgfds = 0; + s->read_msgfds_num = 0; + } + + return to_copy; } static int tcp_set_msgfds(CharDriverState *chr, int *fds, int num) @@ -2503,26 +2518,46 @@ static void unix_process_msgfd(CharDriverState *chr, struct msghdr *msg) struct cmsghdr *cmsg; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { - int fd; + int fd_size, i; - if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)) || + if (cmsg->cmsg_len < CMSG_LEN(sizeof(int)) || cmsg->cmsg_level != SOL_SOCKET || - cmsg->cmsg_type != SCM_RIGHTS) + cmsg->cmsg_type != SCM_RIGHTS) { continue; + } - fd = *((int *)CMSG_DATA(cmsg)); - if (fd < 0) + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + + if (!fd_size) { continue; + } - /* O_NONBLOCK is preserved across SCM_RIGHTS so reset it */ - qemu_set_block(fd); + /* close and clean read_msgfds */ + for (i = 0; i < s->read_msgfds_num; i++) { + close(s->read_msgfds[i]); + } -#ifndef MSG_CMSG_CLOEXEC - qemu_set_cloexec(fd); -#endif - if (s->msgfd != -1) - close(s->msgfd); - s->msgfd = fd; + if (s->read_msgfds_num) { + g_free(s->read_msgfds); + } + + s->read_msgfds_num = fd_size / sizeof(int); + s->read_msgfds = g_malloc(fd_size); + memcpy(s->read_msgfds, CMSG_DATA(cmsg), fd_size); + + for (i = 0; i < s->read_msgfds_num; i++) { + int fd = s->read_msgfds[i]; + if (fd < 0) { + continue; + } + + /* O_NONBLOCK is preserved across SCM_RIGHTS so reset it */ + qemu_set_block(fd); + + #ifndef MSG_CMSG_CLOEXEC + qemu_set_cloexec(fd); + #endif + } } } @@ -2746,6 +2781,7 @@ static gboolean tcp_chr_accept(GIOChannel *channel, GIOCondition cond, void *opa static void tcp_chr_close(CharDriverState *chr) { TCPCharDriver *s = chr->opaque; + int i; if (s->fd >= 0) { remove_fd_in_watch(chr); if (s->chan) { @@ -2763,6 +2799,12 @@ static void tcp_chr_close(CharDriverState *chr) } closesocket(s->listen_fd); } + if (s->read_msgfds_num) { + for (i = 0; i < s->read_msgfds_num; i++) { + close(s->read_msgfds[i]); + } + g_free(s->read_msgfds); + } if (s->write_msgfds_num) { g_free(s->write_msgfds); } @@ -2794,7 +2836,8 @@ static CharDriverState *qemu_chr_open_socket_fd(int fd, bool do_nodelay, s->connected = 0; s->fd = -1; s->listen_fd = -1; - s->msgfd = -1; + s->read_msgfds = 0; + s->read_msgfds_num = 0; s->write_msgfds = 0; s->write_msgfds_num = 0; @@ -2827,7 +2870,7 @@ static CharDriverState *qemu_chr_open_socket_fd(int fd, bool do_nodelay, chr->chr_write = tcp_chr_write; chr->chr_sync_read = tcp_chr_sync_read; chr->chr_close = tcp_chr_close; - chr->get_msgfd = tcp_get_msgfd; + chr->get_msgfds = tcp_get_msgfds; chr->set_msgfds = tcp_set_msgfds; chr->chr_add_client = tcp_chr_add_client; chr->chr_add_watch = tcp_chr_add_watch; From cdaa86a54b232572bba594bf87a7416e527e460c Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:04:28 +0300 Subject: [PATCH 047/109] Add G_IO_HUP handler for socket chardev This is used to detect that the remote end has disconnected. Just call tcp_char_disconnect on receiving this event. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/sysemu/char.h | 1 + qemu-char.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/sysemu/char.h b/include/sysemu/char.h index 930aaf1abe..3b835f6fb3 100644 --- a/include/sysemu/char.h +++ b/include/sysemu/char.h @@ -83,6 +83,7 @@ struct CharDriverState { int avail_connections; int is_mux; guint fd_in_tag; + guint fd_hup_tag; QemuOpts *opts; QTAILQ_ENTRY(CharDriverState) next; }; diff --git a/qemu-char.c b/qemu-char.c index b9bef443de..b3bd3b5af4 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -2680,6 +2680,25 @@ CharDriverState *qemu_chr_open_eventfd(int eventfd) } #endif +static gboolean tcp_chr_chan_close(GIOChannel *channel, GIOCondition cond, + void *opaque) +{ + CharDriverState *chr = opaque; + + if (cond != G_IO_HUP) { + return FALSE; + } + + /* connection closed */ + tcp_chr_disconnect(chr); + if (chr->fd_hup_tag) { + g_source_remove(chr->fd_hup_tag); + chr->fd_hup_tag = 0; + } + + return TRUE; +} + static void tcp_chr_connect(void *opaque) { CharDriverState *chr = opaque; @@ -2689,6 +2708,8 @@ static void tcp_chr_connect(void *opaque) if (s->chan) { chr->fd_in_tag = io_add_watch_poll(s->chan, tcp_chr_read_poll, tcp_chr_read, chr); + chr->fd_hup_tag = g_io_add_watch(s->chan, G_IO_HUP, tcp_chr_chan_close, + chr); } qemu_chr_be_generic_open(chr); } From 2e6d46d77ed328d34a94688da8371bcbe243479b Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:04:42 +0300 Subject: [PATCH 048/109] vhost: add vhost_get_features and vhost_ack_features Generalize the features get/ack to be used for both vhost-net and vhost-scsi. In vhost-net add vhost_net_get_feature_bits to select the feature bit set depending on the NetClient kind. Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 56 ++++++++++++++++++++------------------- hw/scsi/vhost-scsi.c | 25 +++++++---------- hw/virtio/vhost.c | 27 +++++++++++++++++++ include/hw/virtio/vhost.h | 5 ++++ 4 files changed, 71 insertions(+), 42 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index a1de2f43a0..6a9a32f56b 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -46,39 +46,41 @@ struct vhost_net { NetClientState *nc; }; +/* Features supported by host kernel. */ +static const int kernel_feature_bits[] = { + VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + VIRTIO_NET_F_MRG_RXBUF, + VHOST_INVALID_FEATURE_BIT +}; + +static const int *vhost_net_get_feature_bits(struct vhost_net *net) +{ + const int *feature_bits = 0; + + switch (net->nc->info->type) { + case NET_CLIENT_OPTIONS_KIND_TAP: + feature_bits = kernel_feature_bits; + break; + default: + error_report("Feature bits not defined for this type: %d", + net->nc->info->type); + break; + } + + return feature_bits; +} + unsigned vhost_net_get_features(struct vhost_net *net, unsigned features) { - /* Clear features not supported by host kernel. */ - if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))) { - features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY); - } - if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) { - features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); - } - if (!(net->dev.features & (1 << VIRTIO_RING_F_EVENT_IDX))) { - features &= ~(1 << VIRTIO_RING_F_EVENT_IDX); - } - if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF))) { - features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); - } - return features; + return vhost_get_features(&net->dev, vhost_net_get_feature_bits(net), + features); } void vhost_net_ack_features(struct vhost_net *net, unsigned features) { - net->dev.acked_features = net->dev.backend_features; - if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) { - net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY); - } - if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC)) { - net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC); - } - if (features & (1 << VIRTIO_RING_F_EVENT_IDX)) { - net->dev.acked_features |= (1 << VIRTIO_RING_F_EVENT_IDX); - } - if (features & (1 << VIRTIO_NET_F_MRG_RXBUF)) { - net->dev.acked_features |= (1 << VIRTIO_NET_F_MRG_RXBUF); - } + vhost_ack_features(&net->dev, vhost_net_get_feature_bits(net), features); } static int vhost_net_get_fd(NetClientState *backend) diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index 668bafa72a..a12d88846f 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -24,6 +24,15 @@ #include "hw/virtio/virtio-scsi.h" #include "hw/virtio/virtio-bus.h" +/* Features supported by host kernel. */ +static const int kernel_feature_bits[] = { + VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + VIRTIO_SCSI_F_HOTPLUG, + VHOST_INVALID_FEATURE_BIT +}; + static int vhost_scsi_set_endpoint(VHostSCSI *s) { VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); @@ -141,21 +150,7 @@ static uint32_t vhost_scsi_get_features(VirtIODevice *vdev, { VHostSCSI *s = VHOST_SCSI(vdev); - /* Clear features not supported by host kernel. */ - if (!(s->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))) { - features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY); - } - if (!(s->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC))) { - features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC); - } - if (!(s->dev.features & (1 << VIRTIO_RING_F_EVENT_IDX))) { - features &= ~(1 << VIRTIO_RING_F_EVENT_IDX); - } - if (!(s->dev.features & (1 << VIRTIO_SCSI_F_HOTPLUG))) { - features &= ~(1 << VIRTIO_SCSI_F_HOTPLUG); - } - - return features; + return vhost_get_features(&s->dev, kernel_feature_bits, features); } static void vhost_scsi_set_config(VirtIODevice *vdev, diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 9e6023af36..a3e872d2f1 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -990,6 +990,33 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, assert(r >= 0); } +unsigned vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, + unsigned features) +{ + const int *bit = feature_bits; + while (*bit != VHOST_INVALID_FEATURE_BIT) { + unsigned bit_mask = (1 << *bit); + if (!(hdev->features & bit_mask)) { + features &= ~bit_mask; + } + bit++; + } + return features; +} + +void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, + unsigned features) +{ + const int *bit = feature_bits; + while (*bit != VHOST_INVALID_FEATURE_BIT) { + unsigned bit_mask = (1 << *bit); + if (features & bit_mask) { + hdev->acked_features |= bit_mask; + } + bit++; + } +} + /* Host notifiers must be enabled at this point. */ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) { diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index de24746c7e..df1f2149cc 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -25,6 +25,7 @@ typedef unsigned long vhost_log_chunk_t; #define VHOST_LOG_PAGE 0x1000 #define VHOST_LOG_BITS (8 * sizeof(vhost_log_chunk_t)) #define VHOST_LOG_CHUNK (VHOST_LOG_PAGE * VHOST_LOG_BITS) +#define VHOST_INVALID_FEATURE_BIT (0xff) struct vhost_memory; struct vhost_dev { @@ -68,4 +69,8 @@ bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n); */ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, bool mask); +unsigned vhost_get_features(struct vhost_dev *hdev, const int *feature_bits, + unsigned features); +void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits, + unsigned features); #endif From 212d69f25e8bff3e9dc8af6ca30a6556094fc5a5 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:04:55 +0300 Subject: [PATCH 049/109] vhost_net should call the poll callback only when it is set The poll callback needs to be called when bringing up or down the vhost_net instance. As it is not mandatory for an NetClient to implement it, invoke it only when it is set. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 6a9a32f56b..dcec0f730b 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -168,7 +168,10 @@ static int vhost_net_start_one(struct vhost_net *net, goto fail_start; } - net->nc->info->poll(net->nc, false); + if (net->nc->info->poll) { + net->nc->info->poll(net->nc, false); + } + qemu_set_fd_handler(net->backend, NULL, NULL, NULL); file.fd = net->backend; for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { @@ -185,7 +188,9 @@ fail: int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); assert(r >= 0); } - net->nc->info->poll(net->nc, true); + if (net->nc->info->poll) { + net->nc->info->poll(net->nc, true); + } vhost_dev_stop(&net->dev, dev); fail_start: vhost_dev_disable_notifiers(&net->dev, dev); @@ -206,7 +211,9 @@ static void vhost_net_stop_one(struct vhost_net *net, int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); assert(r >= 0); } - net->nc->info->poll(net->nc, true); + if (net->nc->info->poll) { + net->nc->info->poll(net->nc, true); + } vhost_dev_stop(&net->dev, dev); vhost_dev_disable_notifiers(&net->dev, dev); } From ed8b4afe5ff290ae119237e4fa2142e9f1832230 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:05:08 +0300 Subject: [PATCH 050/109] Refactor virtio-net to use generic get_vhost_net This decouples virtio-net from the TAP netdev backend and allows support for other backends to be implemented. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 30 +++++++++++++++++++++++++++--- hw/net/virtio-net.c | 29 ++++++++--------------------- include/net/vhost_net.h | 1 + 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index dcec0f730b..0b45043a72 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -233,7 +233,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, } for (i = 0; i < total_queues; i++) { - r = vhost_net_start_one(tap_get_vhost_net(ncs[i].peer), dev, i * 2); + r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev, i * 2); if (r < 0) { goto err; @@ -250,7 +250,7 @@ int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, err: while (--i >= 0) { - vhost_net_stop_one(tap_get_vhost_net(ncs[i].peer), dev); + vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev); } return r; } @@ -271,7 +271,7 @@ void vhost_net_stop(VirtIODevice *dev, NetClientState *ncs, assert(r >= 0); for (i = 0; i < total_queues; i++) { - vhost_net_stop_one(tap_get_vhost_net(ncs[i].peer), dev); + vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev); } } @@ -291,6 +291,25 @@ void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, { vhost_virtqueue_mask(&net->dev, dev, idx, mask); } + +VHostNetState *get_vhost_net(NetClientState *nc) +{ + VHostNetState *vhost_net = 0; + + if (!nc) { + return 0; + } + + switch (nc->info->type) { + case NET_CLIENT_OPTIONS_KIND_TAP: + vhost_net = tap_get_vhost_net(nc); + break; + default: + break; + } + + return vhost_net; +} #else struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, bool force) @@ -337,4 +356,9 @@ void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, int idx, bool mask) { } + +VHostNetState *get_vhost_net(NetClientState *nc) +{ + return 0; +} #endif diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 318b033911..d8588f3808 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -115,14 +115,7 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) NetClientState *nc = qemu_get_queue(n->nic); int queues = n->multiqueue ? n->max_queues : 1; - if (!nc->peer) { - return; - } - if (nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { - return; - } - - if (!tap_get_vhost_net(nc->peer)) { + if (!get_vhost_net(nc->peer)) { return; } @@ -132,7 +125,7 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) } if (!n->vhost_started) { int r; - if (!vhost_net_query(tap_get_vhost_net(nc->peer), vdev)) { + if (!vhost_net_query(get_vhost_net(nc->peer), vdev)) { return; } n->vhost_started = 1; @@ -465,13 +458,10 @@ static uint32_t virtio_net_get_features(VirtIODevice *vdev, uint32_t features) features &= ~(0x1 << VIRTIO_NET_F_HOST_UFO); } - if (!nc->peer || nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + if (!get_vhost_net(nc->peer)) { return features; } - if (!tap_get_vhost_net(nc->peer)) { - return features; - } - return vhost_net_get_features(tap_get_vhost_net(nc->peer), features); + return vhost_net_get_features(get_vhost_net(nc->peer), features); } static uint32_t virtio_net_bad_features(VirtIODevice *vdev) @@ -535,13 +525,10 @@ static void virtio_net_set_features(VirtIODevice *vdev, uint32_t features) for (i = 0; i < n->max_queues; i++) { NetClientState *nc = qemu_get_subqueue(n->nic, i); - if (!nc->peer || nc->peer->info->type != NET_CLIENT_OPTIONS_KIND_TAP) { + if (!get_vhost_net(nc->peer)) { continue; } - if (!tap_get_vhost_net(nc->peer)) { - continue; - } - vhost_net_ack_features(tap_get_vhost_net(nc->peer), features); + vhost_net_ack_features(get_vhost_net(nc->peer), features); } if ((1 << VIRTIO_NET_F_CTRL_VLAN) & features) { @@ -1514,7 +1501,7 @@ static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx) VirtIONet *n = VIRTIO_NET(vdev); NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); assert(n->vhost_started); - return vhost_net_virtqueue_pending(tap_get_vhost_net(nc->peer), idx); + return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx); } static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx, @@ -1523,7 +1510,7 @@ static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx, VirtIONet *n = VIRTIO_NET(vdev); NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(idx)); assert(n->vhost_started); - vhost_net_virtqueue_mask(tap_get_vhost_net(nc->peer), + vhost_net_virtqueue_mask(get_vhost_net(nc->peer), vdev, idx, mask); } diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h index 2d936bb5f5..e2bd61c784 100644 --- a/include/net/vhost_net.h +++ b/include/net/vhost_net.h @@ -20,4 +20,5 @@ void vhost_net_ack_features(VHostNetState *net, unsigned features); bool vhost_net_virtqueue_pending(VHostNetState *net, int n); void vhost_net_virtqueue_mask(VHostNetState *net, VirtIODevice *dev, int idx, bool mask); +VHostNetState *get_vhost_net(NetClientState *nc); #endif From 81647a655fa4ff99fd5748363a174edd87a40950 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:05:22 +0300 Subject: [PATCH 051/109] vhost_net_init will use VhostNetOptions to get all its arguments vhost_dev_init will replace devfd and devpath with a single opaque argument. This is initialised with a file descriptor. When TAP is used (through vhost_net), open /dev/vhost-net and pass the fd as an opaque parameter in VhostNetOptions. The same applies to vhost-scsi - open /dev/vhost-scsi and pass the fd. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 23 ++++++++++++----------- hw/scsi/vhost-scsi.c | 10 +++++++++- hw/virtio/vhost.c | 12 +++--------- include/hw/virtio/vhost.h | 2 +- include/net/vhost_net.h | 8 +++++++- net/tap.c | 17 +++++++++++++---- 6 files changed, 45 insertions(+), 27 deletions(-) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 0b45043a72..7a5523f36a 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -94,32 +94,34 @@ static int vhost_net_get_fd(NetClientState *backend) } } -struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, - bool force) +struct vhost_net *vhost_net_init(VhostNetOptions *options) { int r; struct vhost_net *net = g_malloc(sizeof *net); - if (!backend) { - fprintf(stderr, "vhost-net requires backend to be setup\n"); + + if (!options->net_backend) { + fprintf(stderr, "vhost-net requires net backend to be setup\n"); goto fail; } - r = vhost_net_get_fd(backend); + + r = vhost_net_get_fd(options->net_backend); if (r < 0) { goto fail; } - net->nc = backend; - net->dev.backend_features = qemu_has_vnet_hdr(backend) ? 0 : + net->nc = options->net_backend; + net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend) ? 0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR); net->backend = r; net->dev.nvqs = 2; net->dev.vqs = net->vqs; - r = vhost_dev_init(&net->dev, devfd, "/dev/vhost-net", force); + r = vhost_dev_init(&net->dev, options->opaque, + options->force); if (r < 0) { goto fail; } - if (!qemu_has_vnet_hdr_len(backend, + if (!qemu_has_vnet_hdr_len(options->net_backend, sizeof(struct virtio_net_hdr_mrg_rxbuf))) { net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); } @@ -311,8 +313,7 @@ VHostNetState *get_vhost_net(NetClientState *nc) return vhost_net; } #else -struct vhost_net *vhost_net_init(NetClientState *backend, int devfd, - bool force) +struct vhost_net *vhost_net_init(VhostNetOptions *options) { error_report("vhost-net support is not compiled in"); return NULL; diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index a12d88846f..e4c98b494d 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -210,6 +210,13 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp) error_setg(errp, "vhost-scsi: unable to parse vhostfd"); return; } + } else { + vhostfd = open("/dev/vhost-scsi", O_RDWR); + if (vhostfd < 0) { + error_setg(errp, "vhost-scsi: open vhost char device failed: %s", + strerror(errno)); + return; + } } virtio_scsi_common_realize(dev, &err); @@ -222,7 +229,8 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp) s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs); s->dev.vq_index = 0; - ret = vhost_dev_init(&s->dev, vhostfd, "/dev/vhost-scsi", true); + ret = vhost_dev_init(&s->dev, (void *)(uintptr_t)vhostfd, + true); if (ret < 0) { error_setg(errp, "vhost-scsi: vhost initialization failed: %s", strerror(-ret)); diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index a3e872d2f1..0d3b7bbd05 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -808,19 +808,13 @@ static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) event_notifier_cleanup(&vq->masked_notifier); } -int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath, +int vhost_dev_init(struct vhost_dev *hdev, void *opaque, bool force) { uint64_t features; int i, r; - if (devfd >= 0) { - hdev->control = devfd; - } else { - hdev->control = open(devpath, O_RDWR); - if (hdev->control < 0) { - return -errno; - } - } + hdev->control = (uintptr_t) opaque;; + r = ioctl(hdev->control, VHOST_SET_OWNER, NULL); if (r < 0) { goto fail; diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index df1f2149cc..8afc6f9f22 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -51,7 +51,7 @@ struct vhost_dev { hwaddr mem_changed_end_addr; }; -int vhost_dev_init(struct vhost_dev *hdev, int devfd, const char *devpath, +int vhost_dev_init(struct vhost_dev *hdev, void *opaque, bool force); void vhost_dev_cleanup(struct vhost_dev *hdev); bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev); diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h index e2bd61c784..2067ee2298 100644 --- a/include/net/vhost_net.h +++ b/include/net/vhost_net.h @@ -6,7 +6,13 @@ struct vhost_net; typedef struct vhost_net VHostNetState; -VHostNetState *vhost_net_init(NetClientState *backend, int devfd, bool force); +typedef struct VhostNetOptions { + NetClientState *net_backend; + void *opaque; + bool force; +} VhostNetOptions; + +struct vhost_net *vhost_net_init(VhostNetOptions *options); bool vhost_net_query(VHostNetState *net, VirtIODevice *dev); int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, int total_queues); diff --git a/net/tap.c b/net/tap.c index fc1b865e08..5789ff8e0c 100644 --- a/net/tap.c +++ b/net/tap.c @@ -594,6 +594,7 @@ static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, int vnet_hdr, int fd) { TAPState *s; + int vhostfd; s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); if (!s) { @@ -624,7 +625,10 @@ static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, if (tap->has_vhost ? tap->vhost : vhostfdname || (tap->has_vhostforce && tap->vhostforce)) { - int vhostfd; + VhostNetOptions options; + + options.net_backend = &s->nc; + options.force = tap->has_vhostforce && tap->vhostforce; if (tap->has_vhostfd || tap->has_vhostfds) { vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname); @@ -632,11 +636,16 @@ static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, return -1; } } else { - vhostfd = -1; + vhostfd = open("/dev/vhost-net", O_RDWR); + if (vhostfd < 0) { + error_report("tap: open vhost char device failed: %s", + strerror(errno)); + return -1; + } } + options.opaque = (void *)(uintptr_t)vhostfd; - s->vhost_net = vhost_net_init(&s->nc, vhostfd, - tap->has_vhostforce && tap->vhostforce); + s->vhost_net = vhost_net_init(&options); if (!s->vhost_net) { error_report("vhost-net requested but could not be initialized"); return -1; From 24d1eb33eb2ccd995a845a4d4b793e2619a9e460 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:05:35 +0300 Subject: [PATCH 052/109] Add vhost_ops to vhost_dev struct and replace all relevant ioctls Decouple vhost from the Linux kernel by introducing vhost_ops. The intention is to provide different backends - a 'kernel' backend based on the ioctl interface, and an 'user' backend based on a UNIX domain socket and shared memory interface. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 10 +++++--- hw/scsi/vhost-scsi.c | 10 +++++--- hw/virtio/vhost.c | 41 ++++++++++++++++--------------- include/hw/virtio/vhost-backend.h | 27 ++++++++++++++++++++ include/hw/virtio/vhost.h | 2 ++ 5 files changed, 63 insertions(+), 27 deletions(-) create mode 100644 include/hw/virtio/vhost-backend.h diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 7a5523f36a..402f37e8cc 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -177,7 +176,8 @@ static int vhost_net_start_one(struct vhost_net *net, qemu_set_fd_handler(net->backend, NULL, NULL, NULL); file.fd = net->backend; for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { - r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + const VhostOps *vhost_ops = net->dev.vhost_ops; + r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file); if (r < 0) { r = -errno; goto fail; @@ -187,7 +187,8 @@ static int vhost_net_start_one(struct vhost_net *net, fail: file.fd = -1; while (file.index-- > 0) { - int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + const VhostOps *vhost_ops = net->dev.vhost_ops; + int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file); assert(r >= 0); } if (net->nc->info->poll) { @@ -210,7 +211,8 @@ static void vhost_net_stop_one(struct vhost_net *net, } for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { - int r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file); + const VhostOps *vhost_ops = net->dev.vhost_ops; + int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file); assert(r >= 0); } if (net->nc->info->poll) { diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index e4c98b494d..ed564bc435 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -36,12 +36,13 @@ static const int kernel_feature_bits[] = { static int vhost_scsi_set_endpoint(VHostSCSI *s) { VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); + const VhostOps *vhost_ops = s->dev.vhost_ops; struct vhost_scsi_target backend; int ret; memset(&backend, 0, sizeof(backend)); pstrcpy(backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->conf.wwpn); - ret = ioctl(s->dev.control, VHOST_SCSI_SET_ENDPOINT, &backend); + ret = vhost_ops->vhost_call(&s->dev, VHOST_SCSI_SET_ENDPOINT, &backend); if (ret < 0) { return -errno; } @@ -52,10 +53,11 @@ static void vhost_scsi_clear_endpoint(VHostSCSI *s) { VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); struct vhost_scsi_target backend; + const VhostOps *vhost_ops = s->dev.vhost_ops; memset(&backend, 0, sizeof(backend)); pstrcpy(backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->conf.wwpn); - ioctl(s->dev.control, VHOST_SCSI_CLEAR_ENDPOINT, &backend); + vhost_ops->vhost_call(&s->dev, VHOST_SCSI_CLEAR_ENDPOINT, &backend); } static int vhost_scsi_start(VHostSCSI *s) @@ -64,13 +66,15 @@ static int vhost_scsi_start(VHostSCSI *s) VirtIODevice *vdev = VIRTIO_DEVICE(s); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + const VhostOps *vhost_ops = s->dev.vhost_ops; if (!k->set_guest_notifiers) { error_report("binding does not support guest notifiers"); return -ENOSYS; } - ret = ioctl(s->dev.control, VHOST_SCSI_GET_ABI_VERSION, &abi_version); + ret = vhost_ops->vhost_call(&s->dev, + VHOST_SCSI_GET_ABI_VERSION, &abi_version); if (ret < 0) { return -errno; } diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 0d3b7bbd05..85841019c0 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -13,7 +13,6 @@ * GNU GPL, version 2 or (at your option) any later version. */ -#include #include "hw/virtio/vhost.h" #include "hw/hw.h" #include "qemu/atomic.h" @@ -289,7 +288,7 @@ static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size) log = g_malloc0(size * sizeof *log); log_base = (uint64_t)(unsigned long)log; - r = ioctl(dev->control, VHOST_SET_LOG_BASE, &log_base); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_LOG_BASE, &log_base); assert(r >= 0); /* Sync only the range covered by the old log */ if (dev->log_size) { @@ -456,7 +455,7 @@ static void vhost_commit(MemoryListener *listener) } if (!dev->log_enabled) { - r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem); assert(r >= 0); dev->memory_changed = false; return; @@ -469,7 +468,7 @@ static void vhost_commit(MemoryListener *listener) if (dev->log_size < log_size) { vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); } - r = ioctl(dev->control, VHOST_SET_MEM_TABLE, dev->mem); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem); assert(r >= 0); /* To log less, can only decrease log size after table update. */ if (dev->log_size > log_size + VHOST_LOG_BUFFER) { @@ -537,7 +536,7 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev, .log_guest_addr = vq->used_phys, .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0, }; - int r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr); + int r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ADDR, &addr); if (r < 0) { return -errno; } @@ -551,7 +550,7 @@ static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) if (enable_log) { features |= 0x1 << VHOST_F_LOG_ALL; } - r = ioctl(dev->control, VHOST_SET_FEATURES, &features); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_FEATURES, &features); return r < 0 ? -errno : 0; } @@ -664,13 +663,13 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); vq->num = state.num = virtio_queue_get_num(vdev, idx); - r = ioctl(dev->control, VHOST_SET_VRING_NUM, &state); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_NUM, &state); if (r) { return -errno; } state.num = virtio_queue_get_last_avail_idx(vdev, idx); - r = ioctl(dev->control, VHOST_SET_VRING_BASE, &state); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_BASE, &state); if (r) { return -errno; } @@ -712,7 +711,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, } file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); - r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file); if (r) { r = -errno; goto fail_kick; @@ -750,7 +749,7 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev, }; int r; assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); - r = ioctl(dev->control, VHOST_GET_VRING_BASE, &state); + r = dev->vhost_ops->vhost_call(dev, VHOST_GET_VRING_BASE, &state); if (r < 0) { fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r); fflush(stderr); @@ -792,7 +791,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, } file.fd = event_notifier_get_fd(&vq->masked_notifier); - r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file); + r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file); if (r) { r = -errno; goto fail_call; @@ -813,14 +812,17 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, { uint64_t features; int i, r; - hdev->control = (uintptr_t) opaque;; - r = ioctl(hdev->control, VHOST_SET_OWNER, NULL); + if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) { + return -errno; + } + + r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_OWNER, NULL); if (r < 0) { goto fail; } - r = ioctl(hdev->control, VHOST_GET_FEATURES, &features); + r = hdev->vhost_ops->vhost_call(hdev, VHOST_GET_FEATURES, &features); if (r < 0) { goto fail; } @@ -865,7 +867,7 @@ fail_vq: } fail: r = -errno; - close(hdev->control); + hdev->vhost_ops->vhost_backend_cleanup(hdev); return r; } @@ -878,7 +880,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) memory_listener_unregister(&hdev->memory_listener); g_free(hdev->mem); g_free(hdev->mem_sections); - close(hdev->control); + hdev->vhost_ops->vhost_backend_cleanup(hdev); } bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev) @@ -980,7 +982,7 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, } else { file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq)); } - r = ioctl(hdev->control, VHOST_SET_VRING_CALL, &file); + r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_VRING_CALL, &file); assert(r >= 0); } @@ -1022,7 +1024,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) if (r < 0) { goto fail_features; } - r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem); + r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem); if (r < 0) { r = -errno; goto fail_mem; @@ -1041,8 +1043,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) hdev->log_size = vhost_get_log_size(hdev); hdev->log = hdev->log_size ? g_malloc0(hdev->log_size * sizeof *hdev->log) : NULL; - r = ioctl(hdev->control, VHOST_SET_LOG_BASE, - (uint64_t)(unsigned long)hdev->log); + r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_LOG_BASE, hdev->log); if (r < 0) { r = -errno; goto fail_log; diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h new file mode 100644 index 0000000000..14e5878318 --- /dev/null +++ b/include/hw/virtio/vhost-backend.h @@ -0,0 +1,27 @@ +/* + * vhost-backend + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef VHOST_BACKEND_H_ +#define VHOST_BACKEND_H_ + +struct vhost_dev; + +typedef int (*vhost_call)(struct vhost_dev *dev, unsigned long int request, + void *arg); +typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque); +typedef int (*vhost_backend_cleanup)(struct vhost_dev *dev); + +typedef struct VhostOps { + vhost_call vhost_call; + vhost_backend_init vhost_backend_init; + vhost_backend_cleanup vhost_backend_cleanup; +} VhostOps; + +#endif /* VHOST_BACKEND_H_ */ diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 8afc6f9f22..4714277f82 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -2,6 +2,7 @@ #define VHOST_H #include "hw/hw.h" +#include "hw/virtio/vhost-backend.h" #include "hw/virtio/virtio.h" #include "exec/memory.h" @@ -49,6 +50,7 @@ struct vhost_dev { bool memory_changed; hwaddr mem_changed_start_addr; hwaddr mem_changed_end_addr; + const VhostOps *vhost_ops; }; int vhost_dev_init(struct vhost_dev *hdev, void *opaque, From 1a1bfac9ee13d76a4b257431cc4193c8a42efa19 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:05:49 +0300 Subject: [PATCH 053/109] Add vhost-backend and VhostBackendType Use vhost_set_backend_type to initialise a proper vhost_ops structure. In vhost_net_init and vhost_net_start_one call conditionally TAP related initialisation depending on the vhost backend type. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 75 +++++++++++++++++++------------ hw/scsi/vhost-scsi.c | 2 +- hw/virtio/Makefile.objs | 2 +- hw/virtio/vhost-backend.c | 66 +++++++++++++++++++++++++++ hw/virtio/vhost.c | 6 ++- include/hw/virtio/vhost-backend.h | 11 +++++ include/hw/virtio/vhost.h | 4 +- include/net/vhost_net.h | 2 + net/tap.c | 1 + 9 files changed, 135 insertions(+), 34 deletions(-) create mode 100644 hw/virtio/vhost-backend.c diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 402f37e8cc..2ec36d351b 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -96,6 +96,7 @@ static int vhost_net_get_fd(NetClientState *backend) struct vhost_net *vhost_net_init(VhostNetOptions *options) { int r; + bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL; struct vhost_net *net = g_malloc(sizeof *net); if (!options->net_backend) { @@ -103,20 +104,25 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) goto fail; } - r = vhost_net_get_fd(options->net_backend); - if (r < 0) { - goto fail; + if (backend_kernel) { + r = vhost_net_get_fd(options->net_backend); + if (r < 0) { + goto fail; + } + net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend) + ? 0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR); + net->backend = r; + } else { + net->dev.backend_features = 0; + net->backend = -1; } net->nc = options->net_backend; - net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend) ? 0 : - (1 << VHOST_NET_F_VIRTIO_NET_HDR); - net->backend = r; net->dev.nvqs = 2; net->dev.vqs = net->vqs; r = vhost_dev_init(&net->dev, options->opaque, - options->force); + options->backend_type, options->force); if (r < 0) { goto fail; } @@ -124,13 +130,15 @@ struct vhost_net *vhost_net_init(VhostNetOptions *options) sizeof(struct virtio_net_hdr_mrg_rxbuf))) { net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF); } - if (~net->dev.features & net->dev.backend_features) { - fprintf(stderr, "vhost lacks feature mask %" PRIu64 " for backend\n", - (uint64_t)(~net->dev.features & net->dev.backend_features)); - vhost_dev_cleanup(&net->dev); - goto fail; + if (backend_kernel) { + if (~net->dev.features & net->dev.backend_features) { + fprintf(stderr, "vhost lacks feature mask %" PRIu64 + " for backend\n", + (uint64_t)(~net->dev.features & net->dev.backend_features)); + vhost_dev_cleanup(&net->dev); + goto fail; + } } - /* Set sane init value. Override when guest acks. */ vhost_net_ack_features(net, 0); return net; @@ -173,23 +181,29 @@ static int vhost_net_start_one(struct vhost_net *net, net->nc->info->poll(net->nc, false); } - qemu_set_fd_handler(net->backend, NULL, NULL, NULL); - file.fd = net->backend; - for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { - const VhostOps *vhost_ops = net->dev.vhost_ops; - r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file); - if (r < 0) { - r = -errno; - goto fail; + if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) { + qemu_set_fd_handler(net->backend, NULL, NULL, NULL); + file.fd = net->backend; + for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { + const VhostOps *vhost_ops = net->dev.vhost_ops; + r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, + &file); + if (r < 0) { + r = -errno; + goto fail; + } } } return 0; fail: file.fd = -1; - while (file.index-- > 0) { - const VhostOps *vhost_ops = net->dev.vhost_ops; - int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file); - assert(r >= 0); + if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) { + while (file.index-- > 0) { + const VhostOps *vhost_ops = net->dev.vhost_ops; + int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, + &file); + assert(r >= 0); + } } if (net->nc->info->poll) { net->nc->info->poll(net->nc, true); @@ -210,10 +224,13 @@ static void vhost_net_stop_one(struct vhost_net *net, return; } - for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { - const VhostOps *vhost_ops = net->dev.vhost_ops; - int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file); - assert(r >= 0); + if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) { + for (file.index = 0; file.index < net->dev.nvqs; ++file.index) { + const VhostOps *vhost_ops = net->dev.vhost_ops; + int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, + &file); + assert(r >= 0); + } } if (net->nc->info->poll) { net->nc->info->poll(net->nc, true); diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c index ed564bc435..9eba5adcfe 100644 --- a/hw/scsi/vhost-scsi.c +++ b/hw/scsi/vhost-scsi.c @@ -234,7 +234,7 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp) s->dev.vq_index = 0; ret = vhost_dev_init(&s->dev, (void *)(uintptr_t)vhostfd, - true); + VHOST_BACKEND_TYPE_KERNEL, true); if (ret < 0) { error_setg(errp, "vhost-scsi: vhost initialization failed: %s", strerror(-ret)); diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index 1ba53d9cc3..51e5bdb341 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -5,4 +5,4 @@ common-obj-y += virtio-mmio.o common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += dataplane/ obj-y += virtio.o virtio-balloon.o -obj-$(CONFIG_LINUX) += vhost.o +obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c new file mode 100644 index 0000000000..509e103408 --- /dev/null +++ b/hw/virtio/vhost-backend.c @@ -0,0 +1,66 @@ +/* + * vhost-backend + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-backend.h" +#include "qemu/error-report.h" + +#include + +static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request, + void *arg) +{ + int fd = (uintptr_t) dev->opaque; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); + + return ioctl(fd, request, arg); +} + +static int vhost_kernel_init(struct vhost_dev *dev, void *opaque) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); + + dev->opaque = opaque; + + return 0; +} + +static int vhost_kernel_cleanup(struct vhost_dev *dev) +{ + int fd = (uintptr_t) dev->opaque; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_KERNEL); + + return close(fd); +} + +static const VhostOps kernel_ops = { + .backend_type = VHOST_BACKEND_TYPE_KERNEL, + .vhost_call = vhost_kernel_call, + .vhost_backend_init = vhost_kernel_init, + .vhost_backend_cleanup = vhost_kernel_cleanup +}; + +int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type) +{ + int r = 0; + + switch (backend_type) { + case VHOST_BACKEND_TYPE_KERNEL: + dev->vhost_ops = &kernel_ops; + break; + default: + error_report("Unknown vhost backend type\n"); + r = -1; + } + + return r; +} diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 85841019c0..c1b1aad6cf 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -808,11 +808,15 @@ static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq) } int vhost_dev_init(struct vhost_dev *hdev, void *opaque, - bool force) + VhostBackendType backend_type, bool force) { uint64_t features; int i, r; + if (vhost_set_backend_type(hdev, backend_type) < 0) { + return -1; + } + if (hdev->vhost_ops->vhost_backend_init(hdev, opaque) < 0) { return -errno; } diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h index 14e5878318..d31768a1d4 100644 --- a/include/hw/virtio/vhost-backend.h +++ b/include/hw/virtio/vhost-backend.h @@ -11,6 +11,13 @@ #ifndef VHOST_BACKEND_H_ #define VHOST_BACKEND_H_ +typedef enum VhostBackendType { + VHOST_BACKEND_TYPE_NONE = 0, + VHOST_BACKEND_TYPE_KERNEL = 1, + VHOST_BACKEND_TYPE_USER = 2, + VHOST_BACKEND_TYPE_MAX = 3, +} VhostBackendType; + struct vhost_dev; typedef int (*vhost_call)(struct vhost_dev *dev, unsigned long int request, @@ -19,9 +26,13 @@ typedef int (*vhost_backend_init)(struct vhost_dev *dev, void *opaque); typedef int (*vhost_backend_cleanup)(struct vhost_dev *dev); typedef struct VhostOps { + VhostBackendType backend_type; vhost_call vhost_call; vhost_backend_init vhost_backend_init; vhost_backend_cleanup vhost_backend_cleanup; } VhostOps; +int vhost_set_backend_type(struct vhost_dev *dev, + VhostBackendType backend_type); + #endif /* VHOST_BACKEND_H_ */ diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h index 4714277f82..33028ec8c2 100644 --- a/include/hw/virtio/vhost.h +++ b/include/hw/virtio/vhost.h @@ -31,7 +31,6 @@ typedef unsigned long vhost_log_chunk_t; struct vhost_memory; struct vhost_dev { MemoryListener memory_listener; - int control; struct vhost_memory *mem; int n_mem_sections; MemoryRegionSection *mem_sections; @@ -51,10 +50,11 @@ struct vhost_dev { hwaddr mem_changed_start_addr; hwaddr mem_changed_end_addr; const VhostOps *vhost_ops; + void *opaque; }; int vhost_dev_init(struct vhost_dev *hdev, void *opaque, - bool force); + VhostBackendType backend_type, bool force); void vhost_dev_cleanup(struct vhost_dev *hdev); bool vhost_dev_query(struct vhost_dev *hdev, VirtIODevice *vdev); int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev); diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h index 2067ee2298..b1c18a3f3b 100644 --- a/include/net/vhost_net.h +++ b/include/net/vhost_net.h @@ -2,11 +2,13 @@ #define VHOST_NET_H #include "net/net.h" +#include "hw/virtio/vhost-backend.h" struct vhost_net; typedef struct vhost_net VHostNetState; typedef struct VhostNetOptions { + VhostBackendType backend_type; NetClientState *net_backend; void *opaque; bool force; diff --git a/net/tap.c b/net/tap.c index 5789ff8e0c..a40f7f023f 100644 --- a/net/tap.c +++ b/net/tap.c @@ -627,6 +627,7 @@ static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, vhostfdname || (tap->has_vhostforce && tap->vhostforce)) { VhostNetOptions options; + options.backend_type = VHOST_BACKEND_TYPE_KERNEL; options.net_backend = &s->nc; options.force = tap->has_vhostforce && tap->vhostforce; From 5f6f6664bf24dc53f4bf98ba812d55ca93684cd5 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:06:02 +0300 Subject: [PATCH 054/109] Add vhost-user as a vhost backend. The initialization takes a chardev backed by a unix domain socket. It should implement qemu_fe_set_msgfds in order to be able to pass file descriptors to the remote process. Each ioctl request of vhost-kernel has a vhost-user message equivalent, which is sent over the control socket. The general approach is to copy the data from the supplied argument pointer to a designated field in the message. If a file descriptor is to be passed it will be placed in the fds array for inclusion in the sendmsg control header. VHOST_SET_MEM_TABLE ignores the supplied vhost_memory structure and scans the global ram_list for ram blocks with a valid fd field set. This would be set when the '-object memory-file' option with share=on property is used. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/virtio/Makefile.objs | 2 +- hw/virtio/vhost-backend.c | 5 + hw/virtio/vhost-user.c | 342 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 hw/virtio/vhost-user.c diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs index 51e5bdb341..ec9e855bc1 100644 --- a/hw/virtio/Makefile.objs +++ b/hw/virtio/Makefile.objs @@ -5,4 +5,4 @@ common-obj-y += virtio-mmio.o common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += dataplane/ obj-y += virtio.o virtio-balloon.o -obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o +obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c index 509e103408..35316c40d9 100644 --- a/hw/virtio/vhost-backend.c +++ b/hw/virtio/vhost-backend.c @@ -14,6 +14,8 @@ #include +extern const VhostOps user_ops; + static int vhost_kernel_call(struct vhost_dev *dev, unsigned long int request, void *arg) { @@ -57,6 +59,9 @@ int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type) case VHOST_BACKEND_TYPE_KERNEL: dev->vhost_ops = &kernel_ops; break; + case VHOST_BACKEND_TYPE_USER: + dev->vhost_ops = &user_ops; + break; default: error_report("Unknown vhost backend type\n"); r = -1; diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c new file mode 100644 index 0000000000..0df6a936a0 --- /dev/null +++ b/hw/virtio/vhost-user.c @@ -0,0 +1,342 @@ +/* + * vhost-user + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "hw/virtio/vhost.h" +#include "hw/virtio/vhost-backend.h" +#include "sysemu/char.h" +#include "sysemu/kvm.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" + +#include +#include +#include +#include +#include +#include + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1<<2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + }; +} QEMU_PACKED VhostUserMsg; + +static VhostUserMsg m __attribute__ ((unused)); +#define VHOST_USER_HDR_SIZE (sizeof(m.request) \ + + sizeof(m.flags) \ + + sizeof(m.size)) + +#define VHOST_USER_PAYLOAD_SIZE (sizeof(m) - VHOST_USER_HDR_SIZE) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION (0x1) + +static bool ioeventfd_enabled(void) +{ + return kvm_enabled() && kvm_eventfds_enabled(); +} + +static unsigned long int ioctl_to_vhost_user_request[VHOST_USER_MAX] = { + -1, /* VHOST_USER_NONE */ + VHOST_GET_FEATURES, /* VHOST_USER_GET_FEATURES */ + VHOST_SET_FEATURES, /* VHOST_USER_SET_FEATURES */ + VHOST_SET_OWNER, /* VHOST_USER_SET_OWNER */ + VHOST_RESET_OWNER, /* VHOST_USER_RESET_OWNER */ + VHOST_SET_MEM_TABLE, /* VHOST_USER_SET_MEM_TABLE */ + VHOST_SET_LOG_BASE, /* VHOST_USER_SET_LOG_BASE */ + VHOST_SET_LOG_FD, /* VHOST_USER_SET_LOG_FD */ + VHOST_SET_VRING_NUM, /* VHOST_USER_SET_VRING_NUM */ + VHOST_SET_VRING_ADDR, /* VHOST_USER_SET_VRING_ADDR */ + VHOST_SET_VRING_BASE, /* VHOST_USER_SET_VRING_BASE */ + VHOST_GET_VRING_BASE, /* VHOST_USER_GET_VRING_BASE */ + VHOST_SET_VRING_KICK, /* VHOST_USER_SET_VRING_KICK */ + VHOST_SET_VRING_CALL, /* VHOST_USER_SET_VRING_CALL */ + VHOST_SET_VRING_ERR /* VHOST_USER_SET_VRING_ERR */ +}; + +static VhostUserRequest vhost_user_request_translate(unsigned long int request) +{ + VhostUserRequest idx; + + for (idx = 0; idx < VHOST_USER_MAX; idx++) { + if (ioctl_to_vhost_user_request[idx] == request) { + break; + } + } + + return (idx == VHOST_USER_MAX) ? VHOST_USER_NONE : idx; +} + +static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg) +{ + CharDriverState *chr = dev->opaque; + uint8_t *p = (uint8_t *) msg; + int r, size = VHOST_USER_HDR_SIZE; + + r = qemu_chr_fe_read_all(chr, p, size); + if (r != size) { + error_report("Failed to read msg header. Read %d instead of %d.\n", r, + size); + goto fail; + } + + /* validate received flags */ + if (msg->flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) { + error_report("Failed to read msg header." + " Flags 0x%x instead of 0x%x.\n", msg->flags, + VHOST_USER_REPLY_MASK | VHOST_USER_VERSION); + goto fail; + } + + /* validate message size is sane */ + if (msg->size > VHOST_USER_PAYLOAD_SIZE) { + error_report("Failed to read msg header." + " Size %d exceeds the maximum %zu.\n", msg->size, + VHOST_USER_PAYLOAD_SIZE); + goto fail; + } + + if (msg->size) { + p += VHOST_USER_HDR_SIZE; + size = msg->size; + r = qemu_chr_fe_read_all(chr, p, size); + if (r != size) { + error_report("Failed to read msg payload." + " Read %d instead of %d.\n", r, msg->size); + goto fail; + } + } + + return 0; + +fail: + return -1; +} + +static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, + int *fds, int fd_num) +{ + CharDriverState *chr = dev->opaque; + int size = VHOST_USER_HDR_SIZE + msg->size; + + if (fd_num) { + qemu_chr_fe_set_msgfds(chr, fds, fd_num); + } + + return qemu_chr_fe_write_all(chr, (const uint8_t *) msg, size) == size ? + 0 : -1; +} + +static int vhost_user_call(struct vhost_dev *dev, unsigned long int request, + void *arg) +{ + VhostUserMsg msg; + VhostUserRequest msg_request; + RAMBlock *block = 0; + struct vhost_vring_file *file = 0; + int need_reply = 0; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + size_t fd_num = 0; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + msg_request = vhost_user_request_translate(request); + msg.request = msg_request; + msg.flags = VHOST_USER_VERSION; + msg.size = 0; + + switch (request) { + case VHOST_GET_FEATURES: + need_reply = 1; + break; + + case VHOST_SET_FEATURES: + case VHOST_SET_LOG_BASE: + msg.u64 = *((__u64 *) arg); + msg.size = sizeof(m.u64); + break; + + case VHOST_SET_OWNER: + case VHOST_RESET_OWNER: + break; + + case VHOST_SET_MEM_TABLE: + QTAILQ_FOREACH(block, &ram_list.blocks, next) + { + if (block->fd > 0) { + msg.memory.regions[fd_num].userspace_addr = + (uintptr_t) block->host; + msg.memory.regions[fd_num].memory_size = block->length; + msg.memory.regions[fd_num].guest_phys_addr = block->offset; + fds[fd_num++] = block->fd; + } + } + + msg.memory.nregions = fd_num; + + if (!fd_num) { + error_report("Failed initializing vhost-user memory map\n" + "consider using -object memory-backend-file share=on\n"); + return -1; + } + + msg.size = sizeof(m.memory.nregions); + msg.size += sizeof(m.memory.padding); + msg.size += fd_num * sizeof(VhostUserMemoryRegion); + + break; + + case VHOST_SET_LOG_FD: + fds[fd_num++] = *((int *) arg); + break; + + case VHOST_SET_VRING_NUM: + case VHOST_SET_VRING_BASE: + memcpy(&msg.state, arg, sizeof(struct vhost_vring_state)); + msg.size = sizeof(m.state); + break; + + case VHOST_GET_VRING_BASE: + memcpy(&msg.state, arg, sizeof(struct vhost_vring_state)); + msg.size = sizeof(m.state); + need_reply = 1; + break; + + case VHOST_SET_VRING_ADDR: + memcpy(&msg.addr, arg, sizeof(struct vhost_vring_addr)); + msg.size = sizeof(m.addr); + break; + + case VHOST_SET_VRING_KICK: + case VHOST_SET_VRING_CALL: + case VHOST_SET_VRING_ERR: + file = arg; + msg.u64 = file->index & VHOST_USER_VRING_IDX_MASK; + msg.size = sizeof(m.u64); + if (ioeventfd_enabled() && file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.u64 |= VHOST_USER_VRING_NOFD_MASK; + } + break; + default: + error_report("vhost-user trying to send unhandled ioctl\n"); + return -1; + break; + } + + if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { + return 0; + } + + if (need_reply) { + if (vhost_user_read(dev, &msg) < 0) { + return 0; + } + + if (msg_request != msg.request) { + error_report("Received unexpected msg type." + " Expected %d received %d\n", msg_request, msg.request); + return -1; + } + + switch (msg_request) { + case VHOST_USER_GET_FEATURES: + if (msg.size != sizeof(m.u64)) { + error_report("Received bad msg size.\n"); + return -1; + } + *((__u64 *) arg) = msg.u64; + break; + case VHOST_USER_GET_VRING_BASE: + if (msg.size != sizeof(m.state)) { + error_report("Received bad msg size.\n"); + return -1; + } + memcpy(arg, &msg.state, sizeof(struct vhost_vring_state)); + break; + default: + error_report("Received unexpected msg type.\n"); + return -1; + break; + } + } + + return 0; +} + +static int vhost_user_init(struct vhost_dev *dev, void *opaque) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + dev->opaque = opaque; + + return 0; +} + +static int vhost_user_cleanup(struct vhost_dev *dev) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + dev->opaque = 0; + + return 0; +} + +const VhostOps user_ops = { + .backend_type = VHOST_BACKEND_TYPE_USER, + .vhost_call = vhost_user_call, + .vhost_backend_init = vhost_user_init, + .vhost_backend_cleanup = vhost_user_cleanup + }; From 5f4c01cab1abcb25d627600a7efb6ac0f804c2dc Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:06:16 +0300 Subject: [PATCH 055/109] vhost-net: vhost-user feature bits support Handle the feature bits negotiation when using vhost-user. Allow the underlying implementation to have a finer control over all the bits except the VIRTIO_NET_F_MAC. Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/net/vhost_net.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 2ec36d351b..5f06736a8c 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -54,6 +54,38 @@ static const int kernel_feature_bits[] = { VHOST_INVALID_FEATURE_BIT }; +/* Features supported by others. */ +const int user_feature_bits[] = { + VIRTIO_F_NOTIFY_ON_EMPTY, + VIRTIO_RING_F_INDIRECT_DESC, + VIRTIO_RING_F_EVENT_IDX, + + VIRTIO_F_ANY_LAYOUT, + VIRTIO_NET_F_CSUM, + VIRTIO_NET_F_GUEST_CSUM, + VIRTIO_NET_F_GSO, + VIRTIO_NET_F_GUEST_TSO4, + VIRTIO_NET_F_GUEST_TSO6, + VIRTIO_NET_F_GUEST_ECN, + VIRTIO_NET_F_GUEST_UFO, + VIRTIO_NET_F_HOST_TSO4, + VIRTIO_NET_F_HOST_TSO6, + VIRTIO_NET_F_HOST_ECN, + VIRTIO_NET_F_HOST_UFO, + VIRTIO_NET_F_MRG_RXBUF, + VIRTIO_NET_F_STATUS, + VIRTIO_NET_F_CTRL_VQ, + VIRTIO_NET_F_CTRL_RX, + VIRTIO_NET_F_CTRL_VLAN, + VIRTIO_NET_F_CTRL_RX_EXTRA, + VIRTIO_NET_F_CTRL_MAC_ADDR, + VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, + + VIRTIO_NET_F_MQ, + + VHOST_INVALID_FEATURE_BIT +}; + static const int *vhost_net_get_feature_bits(struct vhost_net *net) { const int *feature_bits = 0; @@ -62,6 +94,9 @@ static const int *vhost_net_get_feature_bits(struct vhost_net *net) case NET_CLIENT_OPTIONS_KIND_TAP: feature_bits = kernel_feature_bits; break; + case NET_CLIENT_OPTIONS_KIND_VHOST_USER: + feature_bits = user_feature_bits; + break; default: error_report("Feature bits not defined for this type: %d", net->nc->info->type); From d314f586b3c5f8be243efb02c3944e327d4e11a7 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:06:29 +0300 Subject: [PATCH 056/109] Add new vhost-user netdev backend Add a new QEMU netdev backend that is intended to invoke vhost_net with the vhost-user backend. It uses an Unix socket chardev to establish a communication with the 'slave' (client and server mode supported). At runtime the netdev will handle OPEN/CLOSE events from the chardev. Upon disconnection it will set link_down accordingly and notify virtio-net; the virtio-net interface will go down. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/net/vhost-user.h | 17 +++++ net/Makefile.objs | 2 +- net/clients.h | 3 + net/vhost-user.c | 155 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 include/net/vhost-user.h create mode 100644 net/vhost-user.c diff --git a/include/net/vhost-user.h b/include/net/vhost-user.h new file mode 100644 index 0000000000..85109f63aa --- /dev/null +++ b/include/net/vhost-user.h @@ -0,0 +1,17 @@ +/* + * vhost-user.h + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef VHOST_USER_H_ +#define VHOST_USER_H_ + +struct vhost_net; +struct vhost_net *vhost_user_get_vhost_net(NetClientState *nc); + +#endif /* VHOST_USER_H_ */ diff --git a/net/Makefile.objs b/net/Makefile.objs index c25fe6920c..301f6b6b51 100644 --- a/net/Makefile.objs +++ b/net/Makefile.objs @@ -2,7 +2,7 @@ common-obj-y = net.o queue.o checksum.o util.o hub.o common-obj-y += socket.o common-obj-y += dump.o common-obj-y += eth.o -common-obj-$(CONFIG_POSIX) += tap.o +common-obj-$(CONFIG_POSIX) += tap.o vhost-user.o common-obj-$(CONFIG_LINUX) += tap-linux.o common-obj-$(CONFIG_WIN32) += tap-win32.o common-obj-$(CONFIG_BSD) += tap-bsd.o diff --git a/net/clients.h b/net/clients.h index 7322ff5f33..7f3d4ae9f3 100644 --- a/net/clients.h +++ b/net/clients.h @@ -57,4 +57,7 @@ int net_init_netmap(const NetClientOptions *opts, const char *name, NetClientState *peer); #endif +int net_init_vhost_user(const NetClientOptions *opts, const char *name, + NetClientState *peer); + #endif /* QEMU_NET_CLIENTS_H */ diff --git a/net/vhost-user.c b/net/vhost-user.c new file mode 100644 index 0000000000..4bdd19d8e6 --- /dev/null +++ b/net/vhost-user.c @@ -0,0 +1,155 @@ +/* + * vhost-user.c + * + * Copyright (c) 2013 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "clients.h" +#include "net/vhost_net.h" +#include "net/vhost-user.h" +#include "sysemu/char.h" +#include "qemu/error-report.h" + +typedef struct VhostUserState { + NetClientState nc; + CharDriverState *chr; + bool vhostforce; + VHostNetState *vhost_net; +} VhostUserState; + +VHostNetState *vhost_user_get_vhost_net(NetClientState *nc) +{ + VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + return s->vhost_net; +} + +static int vhost_user_running(VhostUserState *s) +{ + return (s->vhost_net) ? 1 : 0; +} + +static int vhost_user_start(VhostUserState *s) +{ + VhostNetOptions options; + + if (vhost_user_running(s)) { + return 0; + } + + options.backend_type = VHOST_BACKEND_TYPE_USER; + options.net_backend = &s->nc; + options.opaque = s->chr; + options.force = s->vhostforce; + + s->vhost_net = vhost_net_init(&options); + + return vhost_user_running(s) ? 0 : -1; +} + +static void vhost_user_stop(VhostUserState *s) +{ + if (vhost_user_running(s)) { + vhost_net_cleanup(s->vhost_net); + } + + s->vhost_net = 0; +} + +static void vhost_user_cleanup(NetClientState *nc) +{ + VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + + vhost_user_stop(s); + qemu_purge_queued_packets(nc); +} + +static bool vhost_user_has_vnet_hdr(NetClientState *nc) +{ + assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_VHOST_USER); + + return true; +} + +static bool vhost_user_has_ufo(NetClientState *nc) +{ + assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_VHOST_USER); + + return true; +} + +static NetClientInfo net_vhost_user_info = { + .type = 0, + .size = sizeof(VhostUserState), + .cleanup = vhost_user_cleanup, + .has_vnet_hdr = vhost_user_has_vnet_hdr, + .has_ufo = vhost_user_has_ufo, +}; + +static void net_vhost_link_down(VhostUserState *s, bool link_down) +{ + s->nc.link_down = link_down; + + if (s->nc.peer) { + s->nc.peer->link_down = link_down; + } + + if (s->nc.info->link_status_changed) { + s->nc.info->link_status_changed(&s->nc); + } + + if (s->nc.peer && s->nc.peer->info->link_status_changed) { + s->nc.peer->info->link_status_changed(s->nc.peer); + } +} + +static void net_vhost_user_event(void *opaque, int event) +{ + VhostUserState *s = opaque; + + switch (event) { + case CHR_EVENT_OPENED: + vhost_user_start(s); + net_vhost_link_down(s, false); + error_report("chardev \"%s\" went up\n", s->chr->label); + break; + case CHR_EVENT_CLOSED: + net_vhost_link_down(s, true); + vhost_user_stop(s); + error_report("chardev \"%s\" went down\n", s->chr->label); + break; + } +} + +static int net_vhost_user_init(NetClientState *peer, const char *device, + const char *name, CharDriverState *chr, + bool vhostforce) +{ + NetClientState *nc; + VhostUserState *s; + + nc = qemu_new_net_client(&net_vhost_user_info, peer, device, name); + + snprintf(nc->info_str, sizeof(nc->info_str), "vhost-user to %s", + chr->label); + + s = DO_UPCAST(VhostUserState, nc, nc); + + /* We don't provide a receive callback */ + s->nc.receive_disabled = 1; + s->chr = chr; + s->vhostforce = vhostforce; + + qemu_chr_add_handlers(s->chr, NULL, NULL, net_vhost_user_event, s); + + return 0; +} + +int net_init_vhost_user(const NetClientOptions *opts, const char *name, + NetClientState *peer) +{ + return net_vhost_user_init(peer, "vhost_user", 0, 0, 0); +} From 03ce574442d2ee82f59a5232a24492ad80858d75 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 10 Jun 2014 13:02:16 +0300 Subject: [PATCH 057/109] Add the vhost-user netdev backend to the command line The supplied chardev id will be inspected for supported options. Only a socket backend, with a set path (i.e. a Unix socket) and optionally the server parameter set, will be allowed. Other options (nowait, telnet) will make the chardev unusable and the netdev will not be initialised. Additional checks for validity: - requires `-numa node,memdev=..` - requires `-device virtio-net-*` The `vhostforce` option is used to force vhost-net when we deal with non-MSIX guests. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Acked-by: Luiz Capitulino Reviewed-by: Eric Blake --- configure | 3 ++ hmp-commands.hx | 4 +- hw/net/vhost_net.c | 4 ++ net/hub.c | 1 + net/net.c | 7 +++ net/vhost-user.c | 111 +++++++++++++++++++++++++++++++++++++++++++-- qapi-schema.json | 19 +++++++- qemu-options.hx | 18 ++++++++ 8 files changed, 160 insertions(+), 7 deletions(-) diff --git a/configure b/configure index 27d84d9fdb..d23ded555d 100755 --- a/configure +++ b/configure @@ -4515,6 +4515,9 @@ fi if test "$vhost_scsi" = "yes" ; then echo "CONFIG_VHOST_SCSI=y" >> $config_host_mak fi +if test "$vhost_net" = "yes" ; then + echo "CONFIG_VHOST_NET_USED=y" >> $config_host_mak +fi if test "$blobs" = "yes" ; then echo "INSTALL_BLOBS=yes" >> $config_host_mak fi diff --git a/hmp-commands.hx b/hmp-commands.hx index 5f1a677b85..d0943b1ff3 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1211,7 +1211,7 @@ ETEXI { .name = "host_net_add", .args_type = "device:s,opts:s?", - .params = "tap|user|socket|vde|netmap|bridge|dump [options]", + .params = "tap|user|socket|vde|netmap|bridge|vhost-user|dump [options]", .help = "add host VLAN client", .mhandler.cmd = net_host_device_add, .command_completion = host_net_add_completion, @@ -1241,7 +1241,7 @@ ETEXI { .name = "netdev_add", .args_type = "netdev:O", - .params = "[user|tap|socket|vde|bridge|hubport|netmap],id=str[,prop=value][,...]", + .params = "[user|tap|socket|vde|bridge|hubport|netmap|vhost-user],id=str[,prop=value][,...]", .help = "add host network device", .mhandler.cmd = hmp_netdev_add, .command_completion = netdev_add_completion, diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c index 5f06736a8c..7ac7c21bdb 100644 --- a/hw/net/vhost_net.c +++ b/hw/net/vhost_net.c @@ -15,6 +15,7 @@ #include "net/net.h" #include "net/tap.h" +#include "net/vhost-user.h" #include "hw/virtio/virtio-net.h" #include "net/vhost_net.h" @@ -360,6 +361,9 @@ VHostNetState *get_vhost_net(NetClientState *nc) case NET_CLIENT_OPTIONS_KIND_TAP: vhost_net = tap_get_vhost_net(nc); break; + case NET_CLIENT_OPTIONS_KIND_VHOST_USER: + vhost_net = vhost_user_get_vhost_net(nc); + break; default: break; } diff --git a/net/hub.c b/net/hub.c index 33a99c99ef..7e0f2d6f0d 100644 --- a/net/hub.c +++ b/net/hub.c @@ -322,6 +322,7 @@ void net_hub_check_clients(void) case NET_CLIENT_OPTIONS_KIND_TAP: case NET_CLIENT_OPTIONS_KIND_SOCKET: case NET_CLIENT_OPTIONS_KIND_VDE: + case NET_CLIENT_OPTIONS_KIND_VHOST_USER: has_host_dev = 1; break; default: diff --git a/net/net.c b/net/net.c index 6344160403..3dac29b844 100644 --- a/net/net.c +++ b/net/net.c @@ -62,6 +62,7 @@ const char *host_net_devices[] = { #ifdef CONFIG_VDE "vde", #endif + "vhost-user", NULL, }; @@ -802,6 +803,9 @@ static int (* const net_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])( [NET_CLIENT_OPTIONS_KIND_BRIDGE] = net_init_bridge, #endif [NET_CLIENT_OPTIONS_KIND_HUBPORT] = net_init_hubport, +#ifdef CONFIG_VHOST_NET_USED + [NET_CLIENT_OPTIONS_KIND_VHOST_USER] = net_init_vhost_user, +#endif }; @@ -835,6 +839,9 @@ static int net_client_init1(const void *object, int is_netdev, Error **errp) case NET_CLIENT_OPTIONS_KIND_BRIDGE: #endif case NET_CLIENT_OPTIONS_KIND_HUBPORT: +#ifdef CONFIG_VHOST_NET_USED + case NET_CLIENT_OPTIONS_KIND_VHOST_USER: +#endif break; default: diff --git a/net/vhost-user.c b/net/vhost-user.c index 4bdd19d8e6..24e050c772 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -12,6 +12,7 @@ #include "net/vhost_net.h" #include "net/vhost-user.h" #include "sysemu/char.h" +#include "qemu/config-file.h" #include "qemu/error-report.h" typedef struct VhostUserState { @@ -21,9 +22,16 @@ typedef struct VhostUserState { VHostNetState *vhost_net; } VhostUserState; +typedef struct VhostUserChardevProps { + bool is_socket; + bool is_unix; + bool is_server; +} VhostUserChardevProps; + VHostNetState *vhost_user_get_vhost_net(NetClientState *nc) { VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc); + assert(nc->info->type == NET_CLIENT_OPTIONS_KIND_VHOST_USER); return s->vhost_net; } @@ -82,7 +90,7 @@ static bool vhost_user_has_ufo(NetClientState *nc) } static NetClientInfo net_vhost_user_info = { - .type = 0, + .type = NET_CLIENT_OPTIONS_KIND_VHOST_USER, .size = sizeof(VhostUserState), .cleanup = vhost_user_cleanup, .has_vnet_hdr = vhost_user_has_vnet_hdr, @@ -148,8 +156,103 @@ static int net_vhost_user_init(NetClientState *peer, const char *device, return 0; } -int net_init_vhost_user(const NetClientOptions *opts, const char *name, - NetClientState *peer) +static int net_vhost_chardev_opts(const char *name, const char *value, + void *opaque) { - return net_vhost_user_init(peer, "vhost_user", 0, 0, 0); + VhostUserChardevProps *props = opaque; + + if (strcmp(name, "backend") == 0 && strcmp(value, "socket") == 0) { + props->is_socket = true; + } else if (strcmp(name, "path") == 0) { + props->is_unix = true; + } else if (strcmp(name, "server") == 0) { + props->is_server = true; + } else { + error_report("vhost-user does not support a chardev" + " with the following option:\n %s = %s", + name, value); + return -1; + } + return 0; +} + +static CharDriverState *net_vhost_parse_chardev(const NetdevVhostUserOptions *opts) +{ + CharDriverState *chr = qemu_chr_find(opts->chardev); + VhostUserChardevProps props; + + if (chr == NULL) { + error_report("chardev \"%s\" not found", opts->chardev); + return NULL; + } + + /* inspect chardev opts */ + memset(&props, 0, sizeof(props)); + if (qemu_opt_foreach(chr->opts, net_vhost_chardev_opts, &props, true) != 0) { + return NULL; + } + + if (!props.is_socket || !props.is_unix) { + error_report("chardev \"%s\" is not a unix socket", + opts->chardev); + return NULL; + } + + qemu_chr_fe_claim_no_fail(chr); + + return chr; +} + +static int net_vhost_check_net(QemuOpts *opts, void *opaque) +{ + const char *name = opaque; + const char *driver, *netdev; + const char virtio_name[] = "virtio-net-"; + + driver = qemu_opt_get(opts, "driver"); + netdev = qemu_opt_get(opts, "netdev"); + + if (!driver || !netdev) { + return 0; + } + + if (strcmp(netdev, name) == 0 && + strncmp(driver, virtio_name, strlen(virtio_name)) != 0) { + error_report("vhost-user requires frontend driver virtio-net-*"); + return -1; + } + + return 0; +} + +int net_init_vhost_user(const NetClientOptions *opts, const char *name, + NetClientState *peer) +{ + const NetdevVhostUserOptions *vhost_user_opts; + CharDriverState *chr; + bool vhostforce; + + assert(opts->kind == NET_CLIENT_OPTIONS_KIND_VHOST_USER); + vhost_user_opts = opts->vhost_user; + + chr = net_vhost_parse_chardev(vhost_user_opts); + if (!chr) { + error_report("No suitable chardev found"); + return -1; + } + + /* verify net frontend */ + if (qemu_opts_foreach(qemu_find_opts("device"), net_vhost_check_net, + (char *)name, true) == -1) { + return -1; + } + + /* vhostforce for non-MSIX */ + if (vhost_user_opts->has_vhostforce) { + vhostforce = vhost_user_opts->vhostforce; + } else { + vhostforce = false; + } + + return net_vhost_user_init(peer, "vhost_user", name, chr, vhostforce); } diff --git a/qapi-schema.json b/qapi-schema.json index dc2abe479e..f5d89b024c 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -2068,6 +2068,22 @@ 'ifname': 'str', '*devname': 'str' } } +## +# @NetdevVhostUserOptions +# +# Vhost-user network backend +# +# @chardev: name of a unix socket chardev +# +# @vhostforce: #optional vhost on for non-MSIX virtio guests (default: false). +# +# Since 2.1 +## +{ 'type': 'NetdevVhostUserOptions', + 'data': { + 'chardev': 'str', + '*vhostforce': 'bool' } } + ## # @NetClientOptions # @@ -2086,7 +2102,8 @@ 'dump': 'NetdevDumpOptions', 'bridge': 'NetdevBridgeOptions', 'hubport': 'NetdevHubPortOptions', - 'netmap': 'NetdevNetmapOptions' } } + 'netmap': 'NetdevNetmapOptions', + 'vhost-user': 'NetdevVhostUserOptions' } } ## # @NetLegacy diff --git a/qemu-options.hx b/qemu-options.hx index 5a4eff9aa4..ab06df1e01 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1460,6 +1460,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev, #ifdef CONFIG_NETMAP "netmap|" #endif + "vhost-user|" "socket|" "hubport],id=str[,option][,option][,...]\n", QEMU_ARCH_ALL) STEXI @@ -1791,6 +1792,23 @@ The hubport netdev lets you connect a NIC to a QEMU "vlan" instead of a single netdev. @code{-net} and @code{-device} with parameter @option{vlan} create the required hub automatically. +@item -netdev vhost-user,chardev=@var{id}[,vhostforce=on|off] + +Establish a vhost-user netdev, backed by a chardev @var{id}. The chardev should +be a unix domain socket backed one. The vhost-user uses a specifically defined +protocol to pass vhost ioctl replacement messages to an application on the other +end of the socket. On non-MSIX guests, the feature can be forced with +@var{vhostforce}. + +Example: +@example +qemu -m 512 -object memory-backend-file,id=mem,size=512M,mem-path=/hugetlbfs,share=on \ + -numa node,memdev=mem \ + -chardev socket,path=/path/to/socket \ + -netdev type=vhost-user,id=net0,chardev=chr0 \ + -device virtio-net-pci,netdev=net0 +@end example + @item -net dump[,vlan=@var{n}][,file=@var{file}][,len=@var{len}] Dump network traffic on VLAN @var{n} to file @var{file} (@file{qemu-vlan0.pcap} by default). At most @var{len} bytes (64k by default) per packet are stored. The file format is From 5fc0e0029162f517357e0d2d4baba7ae1fa48ab1 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 10 Jun 2014 13:02:57 +0300 Subject: [PATCH 058/109] Add vhost-user protocol documentation This document describes the basic message format used by vhost-user for communication over a unix domain socket. The protocol is based on the existing ioctl interface used for the kernel version of vhost. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/specs/vhost-user.txt | 266 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 docs/specs/vhost-user.txt diff --git a/docs/specs/vhost-user.txt b/docs/specs/vhost-user.txt new file mode 100644 index 0000000000..0ea767e4b8 --- /dev/null +++ b/docs/specs/vhost-user.txt @@ -0,0 +1,266 @@ +Vhost-user Protocol +=================== + +Copyright (c) 2014 Virtual Open Systems Sarl. + +This work is licensed under the terms of the GNU GPL, version 2 or later. +See the COPYING file in the top-level directory. +=================== + +This protocol is aiming to complement the ioctl interface used to control the +vhost implementation in the Linux kernel. It implements the control plane needed +to establish virtqueue sharing with a user space process on the same host. It +uses communication over a Unix domain socket to share file descriptors in the +ancillary data of the message. + +The protocol defines 2 sides of the communication, master and slave. Master is +the application that shares its virtqueues, in our case QEMU. Slave is the +consumer of the virtqueues. + +In the current implementation QEMU is the Master, and the Slave is intended to +be a software Ethernet switch running in user space, such as Snabbswitch. + +Master and slave can be either a client (i.e. connecting) or server (listening) +in the socket communication. + +Message Specification +--------------------- + +Note that all numbers are in the machine native byte order. A vhost-user message +consists of 3 header fields and a payload: + +------------------------------------ +| request | flags | size | payload | +------------------------------------ + + * Request: 32-bit type of the request + * Flags: 32-bit bit field: + - Lower 2 bits are the version (currently 0x01) + - Bit 2 is the reply flag - needs to be sent on each reply from the slave + * Size - 32-bit size of the payload + + +Depending on the request type, payload can be: + + * A single 64-bit integer + ------- + | u64 | + ------- + + u64: a 64-bit unsigned integer + + * A vring state description + --------------- + | index | num | + --------------- + + Index: a 32-bit index + Num: a 32-bit number + + * A vring address description + -------------------------------------------------------------- + | index | flags | size | descriptor | used | available | log | + -------------------------------------------------------------- + + Index: a 32-bit vring index + Flags: a 32-bit vring flags + Descriptor: a 64-bit user address of the vring descriptor table + Used: a 64-bit user address of the vring used ring + Available: a 64-bit user address of the vring available ring + Log: a 64-bit guest address for logging + + * Memory regions description + --------------------------------------------------- + | num regions | padding | region0 | ... | region7 | + --------------------------------------------------- + + Num regions: a 32-bit number of regions + Padding: 32-bit + + A region is: + --------------------------------------- + | guest address | size | user address | + --------------------------------------- + + Guest address: a 64-bit guest address of the region + Size: a 64-bit size + User address: a 64-bit user address + + +In QEMU the vhost-user message is implemented with the following struct: + +typedef struct VhostUserMsg { + VhostUserRequest request; + uint32_t flags; + uint32_t size; + union { + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + }; +} QEMU_PACKED VhostUserMsg; + +Communication +------------- + +The protocol for vhost-user is based on the existing implementation of vhost +for the Linux Kernel. Most messages that can be sent via the Unix domain socket +implementing vhost-user have an equivalent ioctl to the kernel implementation. + +The communication consists of master sending message requests and slave sending +message replies. Most of the requests don't require replies. Here is a list of +the ones that do: + + * VHOST_GET_FEATURES + * VHOST_GET_VRING_BASE + +There are several messages that the master sends with file descriptors passed +in the ancillary data: + + * VHOST_SET_MEM_TABLE + * VHOST_SET_LOG_FD + * VHOST_SET_VRING_KICK + * VHOST_SET_VRING_CALL + * VHOST_SET_VRING_ERR + +If Master is unable to send the full message or receives a wrong reply it will +close the connection. An optional reconnection mechanism can be implemented. + +Message types +------------- + + * VHOST_USER_GET_FEATURES + + Id: 2 + Equivalent ioctl: VHOST_GET_FEATURES + Master payload: N/A + Slave payload: u64 + + Get from the underlying vhost implementation the features bitmask. + + * VHOST_USER_SET_FEATURES + + Id: 3 + Ioctl: VHOST_SET_FEATURES + Master payload: u64 + + Enable features in the underlying vhost implementation using a bitmask. + + * VHOST_USER_SET_OWNER + + Id: 4 + Equivalent ioctl: VHOST_SET_OWNER + Master payload: N/A + + Issued when a new connection is established. It sets the current Master + as an owner of the session. This can be used on the Slave as a + "session start" flag. + + * VHOST_USER_RESET_OWNER + + Id: 5 + Equivalent ioctl: VHOST_RESET_OWNER + Master payload: N/A + + Issued when a new connection is about to be closed. The Master will no + longer own this connection (and will usually close it). + + * VHOST_USER_SET_MEM_TABLE + + Id: 6 + Equivalent ioctl: VHOST_SET_MEM_TABLE + Master payload: memory regions description + + Sets the memory map regions on the slave so it can translate the vring + addresses. In the ancillary data there is an array of file descriptors + for each memory mapped region. The size and ordering of the fds matches + the number and ordering of memory regions. + + * VHOST_USER_SET_LOG_BASE + + Id: 7 + Equivalent ioctl: VHOST_SET_LOG_BASE + Master payload: u64 + + Sets the logging base address. + + * VHOST_USER_SET_LOG_FD + + Id: 8 + Equivalent ioctl: VHOST_SET_LOG_FD + Master payload: N/A + + Sets the logging file descriptor, which is passed as ancillary data. + + * VHOST_USER_SET_VRING_NUM + + Id: 9 + Equivalent ioctl: VHOST_SET_VRING_NUM + Master payload: vring state description + + Sets the number of vrings for this owner. + + * VHOST_USER_SET_VRING_ADDR + + Id: 10 + Equivalent ioctl: VHOST_SET_VRING_ADDR + Master payload: vring address description + Slave payload: N/A + + Sets the addresses of the different aspects of the vring. + + * VHOST_USER_SET_VRING_BASE + + Id: 11 + Equivalent ioctl: VHOST_SET_VRING_BASE + Master payload: vring state description + + Sets the base offset in the available vring. + + * VHOST_USER_GET_VRING_BASE + + Id: 12 + Equivalent ioctl: VHOST_USER_GET_VRING_BASE + Master payload: vring state description + Slave payload: vring state description + + Get the available vring base offset. + + * VHOST_USER_SET_VRING_KICK + + Id: 13 + Equivalent ioctl: VHOST_SET_VRING_KICK + Master payload: u64 + + Set the event file descriptor for adding buffers to the vring. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. This signals that polling should be used + instead of waiting for a kick. + + * VHOST_USER_SET_VRING_CALL + + Id: 14 + Equivalent ioctl: VHOST_SET_VRING_CALL + Master payload: u64 + + Set the event file descriptor to signal when buffers are used. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. This signals that polling will be used + instead of waiting for the call. + + * VHOST_USER_SET_VRING_ERR + + Id: 15 + Equivalent ioctl: VHOST_SET_VRING_ERR + Master payload: u64 + + Set the event file descriptor to signal when error occurs. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. From 07a32d6b962d51d1bbd02cdc2ebd9ac56ef52547 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 27 May 2014 15:07:10 +0300 Subject: [PATCH 059/109] libqemustub: add stubs to be able to use qemu-char.c chardev depends on lots of external symbols that are not necessarily needed to be able to use, for example, 'socket chardev'. So add stubs for these functions: - bdrv_commit_all - qemu_chr_open_msmouse - is_daemonized - qemu_add_machine_init_done_notifier - monitor_init - qemu_notify_event - vc_init and this array: - serial_hds Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- stubs/Makefile.objs | 8 ++++++++ stubs/bdrv-commit-all.c | 7 +++++++ stubs/chr-msmouse.c | 7 +++++++ stubs/get-next-serial.c | 3 +++ stubs/is-daemonized.c | 7 +++++++ stubs/machine-init-done.c | 6 ++++++ stubs/monitor-init.c | 6 ++++++ stubs/notify-event.c | 6 ++++++ stubs/vc-init.c | 7 +++++++ 9 files changed, 57 insertions(+) create mode 100644 stubs/bdrv-commit-all.c create mode 100644 stubs/chr-msmouse.c create mode 100644 stubs/get-next-serial.c create mode 100644 stubs/is-daemonized.c create mode 100644 stubs/machine-init-done.c create mode 100644 stubs/monitor-init.c create mode 100644 stubs/notify-event.c create mode 100644 stubs/vc-init.c diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index d99e2b9259..5a0b9174e0 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -1,4 +1,6 @@ stub-obj-y += arch-query-cpu-def.o +stub-obj-y += bdrv-commit-all.o +stub-obj-y += chr-msmouse.o stub-obj-y += clock-warp.o stub-obj-y += cpu-get-clock.o stub-obj-y += cpu-get-icount.o @@ -9,13 +11,18 @@ stub-obj-y += fdset-get-fd.o stub-obj-y += fdset-remove-fd.o stub-obj-y += gdbstub.o stub-obj-y += get-fd.o +stub-obj-y += get-next-serial.o stub-obj-y += get-vm-name.o stub-obj-y += iothread-lock.o +stub-obj-y += is-daemonized.o +stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += mon-is-qmp.o stub-obj-y += mon-printf.o stub-obj-y += mon-protocol-event.o stub-obj-y += mon-set-error.o +stub-obj-y += monitor-init.o +stub-obj-y += notify-event.o stub-obj-y += pci-drive-hot-add.o stub-obj-y += qtest.o stub-obj-y += reset.o @@ -24,6 +31,7 @@ stub-obj-y += set-fd-handler.o stub-obj-y += slirp.o stub-obj-y += sysbus.o stub-obj-y += uuid.o +stub-obj-y += vc-init.o stub-obj-y += vm-stop.o stub-obj-y += vmstate.o stub-obj-$(CONFIG_WIN32) += fd-register.o diff --git a/stubs/bdrv-commit-all.c b/stubs/bdrv-commit-all.c new file mode 100644 index 0000000000..a8e0a95417 --- /dev/null +++ b/stubs/bdrv-commit-all.c @@ -0,0 +1,7 @@ +#include "qemu-common.h" +#include "block/block.h" + +int bdrv_commit_all(void) +{ + return 0; +} diff --git a/stubs/chr-msmouse.c b/stubs/chr-msmouse.c new file mode 100644 index 0000000000..812f8b0abe --- /dev/null +++ b/stubs/chr-msmouse.c @@ -0,0 +1,7 @@ +#include "qemu-common.h" +#include "sysemu/char.h" + +CharDriverState *qemu_chr_open_msmouse(void) +{ + return 0; +} diff --git a/stubs/get-next-serial.c b/stubs/get-next-serial.c new file mode 100644 index 0000000000..40c56d13d7 --- /dev/null +++ b/stubs/get-next-serial.c @@ -0,0 +1,3 @@ +#include "qemu-common.h" + +CharDriverState *serial_hds[0]; diff --git a/stubs/is-daemonized.c b/stubs/is-daemonized.c new file mode 100644 index 0000000000..16ce7c7324 --- /dev/null +++ b/stubs/is-daemonized.c @@ -0,0 +1,7 @@ +#include "qemu-common.h" +#include "sysemu/os-posix.h" + +bool is_daemonized(void) +{ + return true; +} diff --git a/stubs/machine-init-done.c b/stubs/machine-init-done.c new file mode 100644 index 0000000000..28a92555b6 --- /dev/null +++ b/stubs/machine-init-done.c @@ -0,0 +1,6 @@ +#include "qemu-common.h" +#include "sysemu/sysemu.h" + +void qemu_add_machine_init_done_notifier(Notifier *notify) +{ +} diff --git a/stubs/monitor-init.c b/stubs/monitor-init.c new file mode 100644 index 0000000000..563902b412 --- /dev/null +++ b/stubs/monitor-init.c @@ -0,0 +1,6 @@ +#include "qemu-common.h" +#include "monitor/monitor.h" + +void monitor_init(CharDriverState *chr, int flags) +{ +} diff --git a/stubs/notify-event.c b/stubs/notify-event.c new file mode 100644 index 0000000000..32f7289d3a --- /dev/null +++ b/stubs/notify-event.c @@ -0,0 +1,6 @@ +#include "qemu-common.h" +#include "qemu/main-loop.h" + +void qemu_notify_event(void) +{ +} diff --git a/stubs/vc-init.c b/stubs/vc-init.c new file mode 100644 index 0000000000..2af054fe6b --- /dev/null +++ b/stubs/vc-init.c @@ -0,0 +1,7 @@ +#include "qemu-common.h" +#include "ui/console.h" + +CharDriverState *vc_init(ChardevVC *vc) +{ + return 0; +} From a77e6b14b4ffbfb768f96698f0f66e2f00af1323 Mon Sep 17 00:00:00 2001 From: Nikolay Nikolaev Date: Tue, 10 Jun 2014 13:03:23 +0300 Subject: [PATCH 060/109] Add qtest for vhost-user This test creates a 'server' chardev to listen for vhost-user messages. Once VHOST_USER_SET_MEM_TABLE is received it mmaps each received region, and read 1k bytes from it. The read data is compared to data from readl. The test requires hugetlbfs to be already mounted and writable. The mount point defaults to '/hugetlbfs' and can be specified via the environment variable QTEST_HUGETLBFS_PATH. The rom pc-bios/pxe-virtio.rom is used to instantiate a virtio pcicontroller. Signed-off-by: Antonios Motakis Signed-off-by: Nikolay Nikolaev Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: fix up coding style MST: disable vhost test temporarily This test needs a bit more work: issues have been found on legacy systems, disable it for now to avoid false positives for people. Will re-enable after issues are addressed. Reported-by: Igor Mammedov Signed-off-by: Michael S. Tsirkin --- stubs/is-daemonized.c | 6 +- tests/Makefile | 4 + tests/vhost-user-test.c | 312 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 320 insertions(+), 2 deletions(-) create mode 100644 tests/vhost-user-test.c diff --git a/stubs/is-daemonized.c b/stubs/is-daemonized.c index 16ce7c7324..c0ee9171a7 100644 --- a/stubs/is-daemonized.c +++ b/stubs/is-daemonized.c @@ -1,7 +1,9 @@ #include "qemu-common.h" -#include "sysemu/os-posix.h" +/* Win32 has its own inline stub */ +#ifndef _WIN32 bool is_daemonized(void) { - return true; + return false; } +#endif diff --git a/tests/Makefile b/tests/Makefile index 361bb7b6e3..d1c399cf45 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -156,6 +156,7 @@ gcov-files-i386-y += hw/usb/hcd-ehci.c gcov-files-i386-y += hw/usb/hcd-uhci.c gcov-files-i386-y += hw/usb/dev-hid.c gcov-files-i386-y += hw/usb/dev-storage.c +check-qtest-i386-y += tests/vhost-user-test$(EXESUF) check-qtest-x86_64-y = $(check-qtest-i386-y) gcov-files-i386-y += i386-softmmu/hw/timer/mc146818rtc.c gcov-files-x86_64-y = $(subst i386-softmmu/,x86_64-softmmu/,$(gcov-files-i386-y)) @@ -322,9 +323,12 @@ tests/es1370-test$(EXESUF): tests/es1370-test.o tests/intel-hda-test$(EXESUF): tests/intel-hda-test.o tests/ioh3420-test$(EXESUF): tests/ioh3420-test.o tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-pc-obj-y) +tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o libqemuutil.a libqemustub.a tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a +LIBS+= -lutil + # QTest rules TARGETS=$(patsubst %-softmmu,%, $(filter %-softmmu,$(TARGET_DIRS))) diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c new file mode 100644 index 0000000000..7c826b49e5 --- /dev/null +++ b/tests/vhost-user-test.c @@ -0,0 +1,312 @@ +/* + * QTest testcase for the vhost-user + * + * Copyright (c) 2014 Virtual Open Systems Sarl. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "libqtest.h" +#include "qemu/option.h" +#include "sysemu/char.h" +#include "sysemu/sysemu.h" + +#include +#include +#include +#include +#include + +#define QEMU_CMD_ACCEL " -machine accel=tcg" +#define QEMU_CMD_MEM " -m 512 -object memory-backend-file,id=mem,size=512M,"\ + "mem-path=%s,share=on -numa node,memdev=mem" +#define QEMU_CMD_CHR " -chardev socket,id=chr0,path=%s" +#define QEMU_CMD_NETDEV " -netdev vhost-user,id=net0,chardev=chr0,vhostforce" +#define QEMU_CMD_NET " -device virtio-net-pci,netdev=net0 " +#define QEMU_CMD_ROM " -option-rom ../pc-bios/pxe-virtio.rom" + +#define QEMU_CMD QEMU_CMD_ACCEL QEMU_CMD_MEM QEMU_CMD_CHR \ + QEMU_CMD_NETDEV QEMU_CMD_NET QEMU_CMD_ROM + +#define HUGETLBFS_MAGIC 0x958458f6 + +/*********** FROM hw/virtio/vhost-user.c *************************************/ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1<<2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + }; +} QEMU_PACKED VhostUserMsg; + +static VhostUserMsg m __attribute__ ((unused)); +#define VHOST_USER_HDR_SIZE (sizeof(m.request) \ + + sizeof(m.flags) \ + + sizeof(m.size)) + +#define VHOST_USER_PAYLOAD_SIZE (sizeof(m) - VHOST_USER_HDR_SIZE) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION (0x1) +/*****************************************************************************/ + +int fds_num = 0, fds[VHOST_MEMORY_MAX_NREGIONS]; +static VhostUserMemory memory; +static GMutex data_mutex; +static GCond data_cond; + +static void read_guest_mem(void) +{ + uint32_t *guest_mem; + gint64 end_time; + int i, j; + + g_mutex_lock(&data_mutex); + + end_time = g_get_monotonic_time() + 5 * G_TIME_SPAN_SECOND; + while (!fds_num) { + if (!g_cond_wait_until(&data_cond, &data_mutex, end_time)) { + /* timeout has passed */ + g_assert(fds_num); + break; + } + } + + /* check for sanity */ + g_assert_cmpint(fds_num, >, 0); + g_assert_cmpint(fds_num, ==, memory.nregions); + + /* iterate all regions */ + for (i = 0; i < fds_num; i++) { + + /* We'll check only the region statring at 0x0*/ + if (memory.regions[i].guest_phys_addr != 0x0) { + continue; + } + + g_assert_cmpint(memory.regions[i].memory_size, >, 1024); + + guest_mem = mmap(0, memory.regions[i].memory_size, + PROT_READ | PROT_WRITE, MAP_SHARED, fds[i], 0); + + for (j = 0; j < 256; j++) { + uint32_t a = readl(memory.regions[i].guest_phys_addr + j*4); + uint32_t b = guest_mem[j]; + + g_assert_cmpint(a, ==, b); + } + + munmap(guest_mem, memory.regions[i].memory_size); + } + + g_assert_cmpint(1, ==, 1); + g_mutex_unlock(&data_mutex); +} + +static void *thread_function(void *data) +{ + GMainLoop *loop; + loop = g_main_loop_new(NULL, FALSE); + g_main_loop_run(loop); + return NULL; +} + +static int chr_can_read(void *opaque) +{ + return VHOST_USER_HDR_SIZE; +} + +static void chr_read(void *opaque, const uint8_t *buf, int size) +{ + CharDriverState *chr = opaque; + VhostUserMsg msg; + uint8_t *p = (uint8_t *) &msg; + int fd; + + if (size != VHOST_USER_HDR_SIZE) { + g_test_message("Wrong message size received %d\n", size); + return; + } + + memcpy(p, buf, VHOST_USER_HDR_SIZE); + + if (msg.size) { + p += VHOST_USER_HDR_SIZE; + qemu_chr_fe_read_all(chr, p, msg.size); + } + + switch (msg.request) { + case VHOST_USER_GET_FEATURES: + /* send back features to qemu */ + msg.flags |= VHOST_USER_REPLY_MASK; + msg.size = sizeof(m.u64); + msg.u64 = 0; + p = (uint8_t *) &msg; + qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); + break; + + case VHOST_USER_GET_VRING_BASE: + /* send back vring base to qemu */ + msg.flags |= VHOST_USER_REPLY_MASK; + msg.size = sizeof(m.state); + msg.state.num = 0; + p = (uint8_t *) &msg; + qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); + break; + + case VHOST_USER_SET_MEM_TABLE: + /* received the mem table */ + memcpy(&memory, &msg.memory, sizeof(msg.memory)); + fds_num = qemu_chr_fe_get_msgfds(chr, fds, sizeof(fds) / sizeof(int)); + + /* signal the test that it can continue */ + g_cond_signal(&data_cond); + g_mutex_unlock(&data_mutex); + break; + + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + /* consume the fd */ + qemu_chr_fe_get_msgfds(chr, &fd, 1); + /* + * This is a non-blocking eventfd. + * The receive function forces it to be blocking, + * so revert it back to non-blocking. + */ + qemu_set_nonblock(fd); + break; + default: + break; + } +} + +static const char *init_hugepagefs(void) +{ + const char *path; + struct statfs fs; + int ret; + + path = getenv("QTEST_HUGETLBFS_PATH"); + if (!path) { + path = "/hugetlbfs"; + } + + if (access(path, R_OK | W_OK | X_OK)) { + g_test_message("access on path (%s): %s\n", path, strerror(errno)); + return NULL; + } + + do { + ret = statfs(path, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret != 0) { + g_test_message("statfs on path (%s): %s\n", path, strerror(errno)); + return NULL; + } + + if (fs.f_type != HUGETLBFS_MAGIC) { + g_test_message("Warning: path not on HugeTLBFS: %s\n", path); + return NULL; + } + + return path; +} + +int main(int argc, char **argv) +{ + QTestState *s = NULL; + CharDriverState *chr = NULL; + const char *hugefs = 0; + char *socket_path = 0; + char *qemu_cmd = 0; + char *chr_path = 0; + int ret; + + g_test_init(&argc, &argv, NULL); + + module_call_init(MODULE_INIT_QOM); + + hugefs = init_hugepagefs(); + if (!hugefs) { + return 0; + } + + socket_path = g_strdup_printf("/tmp/vhost-%d.sock", getpid()); + + /* create char dev and add read handlers */ + qemu_add_opts(&qemu_chardev_opts); + chr_path = g_strdup_printf("unix:%s,server,nowait", socket_path); + chr = qemu_chr_new("chr0", chr_path, NULL); + g_free(chr_path); + qemu_chr_add_handlers(chr, chr_can_read, chr_read, NULL, chr); + + /* run the main loop thread so the chardev may operate */ + g_mutex_init(&data_mutex); + g_cond_init(&data_cond); + g_mutex_lock(&data_mutex); + g_thread_new(NULL, thread_function, NULL); + + qemu_cmd = g_strdup_printf(QEMU_CMD, hugefs, socket_path); + s = qtest_start(qemu_cmd); + g_free(qemu_cmd); + + qtest_add_func("/vhost-user/read-guest-mem", read_guest_mem); + + ret = g_test_run(); + + if (s) { + qtest_quit(s); + } + + /* cleanup */ + unlink(socket_path); + g_free(socket_path); + g_cond_clear(&data_cond); + g_mutex_clear(&data_mutex); + + return ret; +} From edd8db8705b2fcaff9a4583330584bf35c4b68a2 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 18 Jun 2014 12:23:36 +0300 Subject: [PATCH 061/109] tests: disable vhost test temporarily This test needs a bit more work: issues have been found on legacy systems, disable it for now to avoid false positives for people. Will re-enable after issues are addressed. Reported-by: Igor Mammedov Signed-off-by: Michael S. Tsirkin --- tests/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index d1c399cf45..4caf7deb89 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -156,7 +156,7 @@ gcov-files-i386-y += hw/usb/hcd-ehci.c gcov-files-i386-y += hw/usb/hcd-uhci.c gcov-files-i386-y += hw/usb/dev-hid.c gcov-files-i386-y += hw/usb/dev-storage.c -check-qtest-i386-y += tests/vhost-user-test$(EXESUF) +#check-qtest-i386-y += tests/vhost-user-test$(EXESUF) check-qtest-x86_64-y = $(check-qtest-i386-y) gcov-files-i386-y += i386-softmmu/hw/timer/mc146818rtc.c gcov-files-x86_64-y = $(subst i386-softmmu/,x86_64-softmmu/,$(gcov-files-i386-y)) @@ -327,7 +327,7 @@ tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a -LIBS+= -lutil +#LIBS+= -lutil # QTest rules From 96d0e26c238e8df5d659a9b89f323f2524ec0b74 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 14 May 2014 17:43:05 +0800 Subject: [PATCH 062/109] NUMA: move numa related code to new file numa.c Signed-off-by: Wanlong Gao Reviewed-by: Eduardo Habkost Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Signed-off-by: Blue Swirl Signed-off-by: Andre Przywara Signed-off-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin MST: comment tweaks --- Makefile.target | 2 +- cpus.c | 14 --- include/exec/cpu-all.h | 2 - include/exec/cpu-common.h | 2 + include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 3 + numa.c | 185 ++++++++++++++++++++++++++++++++++++++ vl.c | 139 +--------------------------- 8 files changed, 192 insertions(+), 156 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index 06c1e59bc4..fc5827cd72 100644 --- a/Makefile.target +++ b/Makefile.target @@ -119,7 +119,7 @@ endif #CONFIG_BSD_USER ######################################################### # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/cpus.c b/cpus.c index af06dc0ae6..1ec3a9edd4 100644 --- a/cpus.c +++ b/cpus.c @@ -1312,20 +1312,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ - CPUState *cpu; - int i; - - CPU_FOREACH(cpu) { - for (i = 0; i < nb_numa_nodes; i++) { - if (test_bit(cpu->cpu_index, node_cpumask[i])) { - cpu->numa_node = i; - } - } - } -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index e8363d7248..ed28f1ec63 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -297,8 +297,6 @@ CPUArchState *cpu_copy(CPUArchState *env); /* memory API */ -extern ram_addr_t ram_size; - /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ #define RAM_PREALLOC_MASK (1 << 0) diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 89ec6404cf..e3ec4c8e0c 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -45,6 +45,8 @@ typedef uintptr_t ram_addr_t; # define RAM_ADDR_FMT "%" PRIxPTR #endif +extern ram_addr_t ram_size; + /* memory API */ typedef void CPUWriteMemoryFunc(void *opaque, hwaddr addr, uint32_t value); diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488a05..4f790810bf 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index ba5c7f8093..565c8f60c1 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -144,6 +144,9 @@ extern QEMUClockType rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +void numa_add(const char *optarg); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 0000000000..bd0d2b7b0f --- /dev/null +++ b/numa.c @@ -0,0 +1,185 @@ +/* + * NUMA parameter parsing routines + * + * Copyright (c) 2014 Fujitsu Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "sysemu/sysemu.h" +#include "exec/cpu-common.h" +#include "qemu/bitmap.h" +#include "qom/cpu.h" + +static void numa_node_parse_cpus(int nodenr, const char *cpus) +{ + char *endptr; + unsigned long long value, endvalue; + + /* Empty CPU range strings will be considered valid, they will simply + * not set any bit in the CPU bitmap. + */ + if (!*cpus) { + return; + } + + if (parse_uint(cpus, &value, &endptr, 10) < 0) { + goto error; + } + if (*endptr == '-') { + if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { + goto error; + } + } else if (*endptr == '\0') { + endvalue = value; + } else { + goto error; + } + + if (endvalue >= MAX_CPUMASK_BITS) { + endvalue = MAX_CPUMASK_BITS - 1; + fprintf(stderr, + "qemu: NUMA: A max of %d VCPUs are supported\n", + MAX_CPUMASK_BITS); + } + + if (endvalue < value) { + goto error; + } + + bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); + return; + +error: + fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); + exit(1); +} + +void numa_add(const char *optarg) +{ + char option[128]; + char *endptr; + unsigned long long nodenr; + + optarg = get_opt_name(option, 128, optarg, ','); + if (*optarg == ',') { + optarg++; + } + if (!strcmp(option, "node")) { + + if (nb_numa_nodes >= MAX_NODES) { + fprintf(stderr, "qemu: too many NUMA nodes\n"); + exit(1); + } + + if (get_param_value(option, 128, "nodeid", optarg) == 0) { + nodenr = nb_numa_nodes; + } else { + if (parse_uint_full(option, &nodenr, 10) < 0) { + fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); + exit(1); + } + } + + if (nodenr >= MAX_NODES) { + fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); + exit(1); + } + + if (get_param_value(option, 128, "mem", optarg) == 0) { + node_mem[nodenr] = 0; + } else { + int64_t sval; + sval = strtosz(option, &endptr); + if (sval < 0 || *endptr) { + fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); + exit(1); + } + node_mem[nodenr] = sval; + } + if (get_param_value(option, 128, "cpus", optarg) != 0) { + numa_node_parse_cpus(nodenr, option); + } + nb_numa_nodes++; + } else { + fprintf(stderr, "Invalid -numa option: %s\n", option); + exit(1); + } +} + +void set_numa_nodes(void) +{ + if (nb_numa_nodes > 0) { + int i; + + if (nb_numa_nodes > MAX_NODES) { + nb_numa_nodes = MAX_NODES; + } + + /* If no memory size if given for any node, assume the default case + * and distribute the available memory equally across all nodes + */ + for (i = 0; i < nb_numa_nodes; i++) { + if (node_mem[i] != 0) { + break; + } + } + if (i == nb_numa_nodes) { + uint64_t usedmem = 0; + + /* On Linux, the each node's border has to be 8MB aligned, + * the final node gets the rest. + */ + for (i = 0; i < nb_numa_nodes - 1; i++) { + node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); + usedmem += node_mem[i]; + } + node_mem[i] = ram_size - usedmem; + } + + for (i = 0; i < nb_numa_nodes; i++) { + if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { + break; + } + } + /* assigning the VCPUs round-robin is easier to implement, guest OSes + * must cope with this anyway, because there are BIOSes out there in + * real machines which also use this scheme. + */ + if (i == nb_numa_nodes) { + for (i = 0; i < max_cpus; i++) { + set_bit(i, node_cpumask[i % nb_numa_nodes]); + } + } + } +} + +void set_numa_modes(void) +{ + CPUState *cpu; + int i; + + CPU_FOREACH(cpu) { + for (i = 0; i < nb_numa_nodes; i++) { + if (test_bit(cpu->cpu_index, node_cpumask[i])) { + cpu->numa_node = i; + } + } + } +} diff --git a/vl.c b/vl.c index 4c6d6df26c..c87c7d8807 100644 --- a/vl.c +++ b/vl.c @@ -1275,102 +1275,6 @@ char *get_boot_devices_list(size_t *size, bool ignore_suffixes) return list; } -static void numa_node_parse_cpus(int nodenr, const char *cpus) -{ - char *endptr; - unsigned long long value, endvalue; - - /* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. - */ - if (!*cpus) { - return; - } - - if (parse_uint(cpus, &value, &endptr, 10) < 0) { - goto error; - } - if (*endptr == '-') { - if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { - goto error; - } - } else if (*endptr == '\0') { - endvalue = value; - } else { - goto error; - } - - if (endvalue >= MAX_CPUMASK_BITS) { - endvalue = MAX_CPUMASK_BITS - 1; - fprintf(stderr, - "qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); - } - - if (endvalue < value) { - goto error; - } - - bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); - return; - -error: - fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); - exit(1); -} - -static void numa_add(const char *optarg) -{ - char option[128]; - char *endptr; - unsigned long long nodenr; - - optarg = get_opt_name(option, 128, optarg, ','); - if (*optarg == ',') { - optarg++; - } - if (!strcmp(option, "node")) { - - if (nb_numa_nodes >= MAX_NODES) { - fprintf(stderr, "qemu: too many NUMA nodes\n"); - exit(1); - } - - if (get_param_value(option, 128, "nodeid", optarg) == 0) { - nodenr = nb_numa_nodes; - } else { - if (parse_uint_full(option, &nodenr, 10) < 0) { - fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); - exit(1); - } - } - - if (nodenr >= MAX_NODES) { - fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); - exit(1); - } - - if (get_param_value(option, 128, "mem", optarg) == 0) { - node_mem[nodenr] = 0; - } else { - int64_t sval; - sval = strtosz(option, &endptr); - if (sval < 0 || *endptr) { - fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); - exit(1); - } - node_mem[nodenr] = sval; - } - if (get_param_value(option, 128, "cpus", optarg) != 0) { - numa_node_parse_cpus(nodenr, option); - } - nb_numa_nodes++; - } else { - fprintf(stderr, "Invalid -numa option: %s\n", option); - exit(1); - } -} - static QemuOptsList qemu_smp_opts = { .name = "smp-opts", .implied_opt_name = "cpus", @@ -4400,48 +4304,7 @@ int main(int argc, char **argv, char **envp) default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS); default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS); - if (nb_numa_nodes > 0) { - int i; - - if (nb_numa_nodes > MAX_NODES) { - nb_numa_nodes = MAX_NODES; - } - - /* If no memory size if given for any node, assume the default case - * and distribute the available memory equally across all nodes - */ - for (i = 0; i < nb_numa_nodes; i++) { - if (node_mem[i] != 0) - break; - } - if (i == nb_numa_nodes) { - uint64_t usedmem = 0; - - /* On Linux, the each node's border has to be 8MB aligned, - * the final node gets the rest. - */ - for (i = 0; i < nb_numa_nodes - 1; i++) { - node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); - usedmem += node_mem[i]; - } - node_mem[i] = ram_size - usedmem; - } - - for (i = 0; i < nb_numa_nodes; i++) { - if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { - break; - } - } - /* assigning the VCPUs round-robin is easier to implement, guest OSes - * must cope with this anyway, because there are BIOSes out there in - * real machines which also use this scheme. - */ - if (i == nb_numa_nodes) { - for (i = 0; i < max_cpus; i++) { - set_bit(i, node_cpumask[i % nb_numa_nodes]); - } - } - } + set_numa_nodes(); if (qemu_opts_foreach(qemu_find_opts("mon"), mon_init_func, NULL, 1) != 0) { exit(1); From 2b631ec2557eddfe92f1ef80d7fcaedd5db64e28 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 14 May 2014 17:43:06 +0800 Subject: [PATCH 063/109] NUMA: check if the total numa memory size is equal to ram_size If the total number of the assigned numa nodes memory is not equal to the assigned ram size, it will write the wrong data to ACPI table, then the guest will ignore the wrong ACPI table and recognize all memory to one node. It's buggy, we should check it to ensure that we write the right data to ACPI table. Signed-off-by: Wanlong Gao Reviewed-by: Eduardo Habkost Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Signed-off-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin MST: error message reworded --- numa.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/numa.c b/numa.c index bd0d2b7b0f..e4033994cd 100644 --- a/numa.c +++ b/numa.c @@ -26,6 +26,8 @@ #include "exec/cpu-common.h" #include "qemu/bitmap.h" #include "qom/cpu.h" +#include "qemu/error-report.h" +#include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */ static void numa_node_parse_cpus(int nodenr, const char *cpus) { @@ -126,6 +128,7 @@ void numa_add(const char *optarg) void set_numa_nodes(void) { if (nb_numa_nodes > 0) { + uint64_t numa_total; int i; if (nb_numa_nodes > MAX_NODES) { @@ -153,6 +156,17 @@ void set_numa_nodes(void) node_mem[i] = ram_size - usedmem; } + numa_total = 0; + for (i = 0; i < nb_numa_nodes; i++) { + numa_total += node_mem[i]; + } + if (numa_total != ram_size) { + error_report("total memory for NUMA nodes (%" PRIu64 ")" + " should equal RAM size (" RAM_ADDR_FMT ")", + numa_total, ram_size); + exit(1); + } + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; From 8c85901ed3cf2ede1ed5957e6e047d3719dceb4e Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 14 May 2014 17:43:07 +0800 Subject: [PATCH 064/109] NUMA: Add numa_info structure to contain numa nodes info Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao [Fix hw/ppc/spapr.c - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 12 ++++++++---- hw/ppc/spapr.c | 11 ++++++----- include/sysemu/sysemu.h | 8 ++++++-- monitor.c | 2 +- numa.c | 23 ++++++++++++----------- vl.c | 7 +++---- 6 files changed, 36 insertions(+), 27 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index fec1e1380f..ac7ac774e6 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -704,14 +704,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { - if (test_bit(i, node_cpumask[j])) { + if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { - numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); + numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * @@ -1122,8 +1122,12 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, guest_info->apic_id_limit = pc_apic_id_limit(max_cpus); guest_info->apic_xrupt_override = kvm_allows_irq0_override(); guest_info->numa_nodes = nb_numa_nodes; - guest_info->node_mem = g_memdup(node_mem, guest_info->numa_nodes * + guest_info->node_mem = g_malloc0(guest_info->numa_nodes * sizeof *guest_info->node_mem); + for (i = 0; i < nb_numa_nodes; i++) { + guest_info->node_mem[i] = numa_info[i].node_mem; + } + guest_info->node_cpu = g_malloc0(guest_info->apic_id_limit * sizeof *guest_info->node_cpu); @@ -1131,7 +1135,7 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < guest_info->apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { - if (test_bit(i, node_cpumask[j])) { + if (test_bit(i, numa_info[j].node_cpu)) { guest_info->node_cpu[apic_id] = j; break; } diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index e06321cf15..82f183f173 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -673,8 +673,8 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt) int i, off; /* memory node(s) */ - if (nb_numa_nodes > 1 && node_mem[0] < ram_size) { - node0_size = node_mem[0]; + if (nb_numa_nodes > 1 && numa_info[0].node_mem < ram_size) { + node0_size = numa_info[0].node_mem; } else { node0_size = ram_size; } @@ -712,7 +712,7 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt) if (mem_start >= ram_size) { node_size = 0; } else { - node_size = node_mem[i]; + node_size = numa_info[i].node_mem; if (node_size > ram_size - mem_start) { node_size = ram_size - mem_start; } @@ -857,7 +857,8 @@ static void spapr_reset_htab(sPAPREnvironment *spapr) /* Update the RMA size if necessary */ if (spapr->vrma_adjust) { - hwaddr node0_size = (nb_numa_nodes > 1) ? node_mem[0] : ram_size; + hwaddr node0_size = (nb_numa_nodes > 1) ? + numa_info[0].node_mem : ram_size; spapr->rma_size = kvmppc_rma_size(node0_size, spapr->htab_shift); } } @@ -1289,7 +1290,7 @@ static void ppc_spapr_init(MachineState *machine) MemoryRegion *sysmem = get_system_memory(); MemoryRegion *ram = g_new(MemoryRegion, 1); hwaddr rma_alloc_size; - hwaddr node0_size = (nb_numa_nodes > 1) ? node_mem[0] : ram_size; + hwaddr node0_size = (nb_numa_nodes > 1) ? numa_info[0].node_mem : ram_size; uint32_t initrd_base = 0; long kernel_size = 0, initrd_size = 0; long load_limit, rtas_limit, fw_size; diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 565c8f60c1..3a9308b60d 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -142,8 +143,11 @@ extern QEMUClockType rtc_clock; #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { + uint64_t node_mem; + DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); diff --git a/monitor.c b/monitor.c index 2901187f5f..ca17e420a9 100644 --- a/monitor.c +++ b/monitor.c @@ -2011,7 +2011,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict) } monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, - node_mem[i] >> 20); + numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index e4033994cd..f15c4c41c0 100644 --- a/numa.c +++ b/numa.c @@ -65,7 +65,7 @@ static void numa_node_parse_cpus(int nodenr, const char *cpus) goto error; } - bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); + bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); return; error: @@ -105,7 +105,7 @@ void numa_add(const char *optarg) } if (get_param_value(option, 128, "mem", optarg) == 0) { - node_mem[nodenr] = 0; + numa_info[nodenr].node_mem = 0; } else { int64_t sval; sval = strtosz(option, &endptr); @@ -113,7 +113,7 @@ void numa_add(const char *optarg) fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); exit(1); } - node_mem[nodenr] = sval; + numa_info[nodenr].node_mem = sval; } if (get_param_value(option, 128, "cpus", optarg) != 0) { numa_node_parse_cpus(nodenr, option); @@ -139,7 +139,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */ for (i = 0; i < nb_numa_nodes; i++) { - if (node_mem[i] != 0) { + if (numa_info[i].node_mem != 0) { break; } } @@ -150,15 +150,16 @@ void set_numa_nodes(void) * the final node gets the rest. */ for (i = 0; i < nb_numa_nodes - 1; i++) { - node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); - usedmem += node_mem[i]; + numa_info[i].node_mem = (ram_size / nb_numa_nodes) & + ~((1 << 23UL) - 1); + usedmem += numa_info[i].node_mem; } - node_mem[i] = ram_size - usedmem; + numa_info[i].node_mem = ram_size - usedmem; } numa_total = 0; for (i = 0; i < nb_numa_nodes; i++) { - numa_total += node_mem[i]; + numa_total += numa_info[i].node_mem; } if (numa_total != ram_size) { error_report("total memory for NUMA nodes (%" PRIu64 ")" @@ -168,7 +169,7 @@ void set_numa_nodes(void) } for (i = 0; i < nb_numa_nodes; i++) { - if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { + if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) { break; } } @@ -178,7 +179,7 @@ void set_numa_nodes(void) */ if (i == nb_numa_nodes) { for (i = 0; i < max_cpus; i++) { - set_bit(i, node_cpumask[i % nb_numa_nodes]); + set_bit(i, numa_info[i % nb_numa_nodes].node_cpu); } } } @@ -191,7 +192,7 @@ void set_numa_modes(void) CPU_FOREACH(cpu) { for (i = 0; i < nb_numa_nodes; i++) { - if (test_bit(cpu->cpu_index, node_cpumask[i])) { + if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { cpu->numa_node = i; } } diff --git a/vl.c b/vl.c index c87c7d8807..7af90cd41b 100644 --- a/vl.c +++ b/vl.c @@ -195,8 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); int nb_numa_nodes; -uint64_t node_mem[MAX_NODES]; -unsigned long *node_cpumask[MAX_NODES]; +NodeInfo numa_info[MAX_NODES]; uint8_t qemu_uuid[16]; bool qemu_uuid_set; @@ -2959,8 +2958,8 @@ int main(int argc, char **argv, char **envp) translation = BIOS_ATA_TRANSLATION_AUTO; for (i = 0; i < MAX_NODES; i++) { - node_mem[i] = 0; - node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS); + numa_info[i].node_mem = 0; + bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); } nb_numa_nodes = 0; From 0042109a6ab629aebfb287ff7aee295f24fad40d Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 14 May 2014 17:43:08 +0800 Subject: [PATCH 065/109] NUMA: convert -numa option to use OptsVisitor Signed-off-by: Wanlong Gao Signed-off-by: Igor Mammedov Tested-by: Eduardo Habkost Reviewed-by: Eduardo Habkost Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/sysemu/sysemu.h | 3 +- numa.c | 145 +++++++++++++++++++--------------------- qapi-schema.json | 32 +++++++++ vl.c | 11 ++- 4 files changed, 114 insertions(+), 77 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 3a9308b60d..4102be320f 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -148,9 +148,10 @@ typedef struct node_info { DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index f15c4c41c0..6fb0888d60 100644 --- a/numa.c +++ b/numa.c @@ -28,101 +28,96 @@ #include "qom/cpu.h" #include "qemu/error-report.h" #include "include/exec/cpu-common.h" /* for RAM_ADDR_FMT */ +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +#include "qapi/qmp/qerror.h" -static void numa_node_parse_cpus(int nodenr, const char *cpus) +QemuOptsList qemu_numa_opts = { + .name = "numa", + .implied_opt_name = "type", + .head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), + .desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) { - char *endptr; - unsigned long long value, endvalue; + uint16_t nodenr; + uint16List *cpus = NULL; - /* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. - */ - if (!*cpus) { + if (node->has_nodeid) { + nodenr = node->nodeid; + } else { + nodenr = nb_numa_nodes; + } + + if (nodenr >= MAX_NODES) { + error_setg(errp, "Max number of NUMA nodes reached: %" + PRIu16 "\n", nodenr); return; } - if (parse_uint(cpus, &value, &endptr, 10) < 0) { - goto error; - } - if (*endptr == '-') { - if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { - goto error; + for (cpus = node->cpus; cpus; cpus = cpus->next) { + if (cpus->value > MAX_CPUMASK_BITS) { + error_setg(errp, "CPU number %" PRIu16 " is bigger than %d", + cpus->value, MAX_CPUMASK_BITS); + return; } - } else if (*endptr == '\0') { - endvalue = value; - } else { - goto error; + bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } - if (endvalue >= MAX_CPUMASK_BITS) { - endvalue = MAX_CPUMASK_BITS - 1; - fprintf(stderr, - "qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); + if (node->has_mem) { + uint64_t mem_size = node->mem; + const char *mem_str = qemu_opt_get(opts, "mem"); + /* Fix up legacy suffix-less format */ + if (g_ascii_isdigit(mem_str[strlen(mem_str) - 1])) { + mem_size <<= 20; + } + numa_info[nodenr].node_mem = mem_size; } - - if (endvalue < value) { - goto error; - } - - bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); - return; - -error: - fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); - exit(1); } -void numa_add(const char *optarg) +int numa_init_func(QemuOpts *opts, void *opaque) { - char option[128]; - char *endptr; - unsigned long long nodenr; + NumaOptions *object = NULL; + Error *err = NULL; - optarg = get_opt_name(option, 128, optarg, ','); - if (*optarg == ',') { - optarg++; + { + OptsVisitor *ov = opts_visitor_new(opts); + visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err); + opts_visitor_cleanup(ov); } - if (!strcmp(option, "node")) { - if (nb_numa_nodes >= MAX_NODES) { - fprintf(stderr, "qemu: too many NUMA nodes\n"); - exit(1); - } + if (err) { + goto error; + } - if (get_param_value(option, 128, "nodeid", optarg) == 0) { - nodenr = nb_numa_nodes; - } else { - if (parse_uint_full(option, &nodenr, 10) < 0) { - fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); - exit(1); - } - } - - if (nodenr >= MAX_NODES) { - fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); - exit(1); - } - - if (get_param_value(option, 128, "mem", optarg) == 0) { - numa_info[nodenr].node_mem = 0; - } else { - int64_t sval; - sval = strtosz(option, &endptr); - if (sval < 0 || *endptr) { - fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); - exit(1); - } - numa_info[nodenr].node_mem = sval; - } - if (get_param_value(option, 128, "cpus", optarg) != 0) { - numa_node_parse_cpus(nodenr, option); + switch (object->kind) { + case NUMA_OPTIONS_KIND_NODE: + numa_node_parse(object->node, opts, &err); + if (err) { + goto error; } nb_numa_nodes++; - } else { - fprintf(stderr, "Invalid -numa option: %s\n", option); - exit(1); + break; + default: + abort(); } + + return 0; + +error: + qerror_report_err(err); + error_free(err); + + if (object) { + QapiDeallocVisitor *dv = qapi_dealloc_visitor_new(); + visit_type_NumaOptions(qapi_dealloc_get_visitor(dv), + &object, NULL, NULL); + qapi_dealloc_visitor_cleanup(dv); + } + + return -1; } void set_numa_nodes(void) diff --git a/qapi-schema.json b/qapi-schema.json index f5d89b024c..e65b7b1489 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3097,3 +3097,35 @@ 'btn' : 'InputBtnEvent', 'rel' : 'InputMoveEvent', 'abs' : 'InputMoveEvent' } } + +## +# @NumaOptions +# +# A discriminated record of NUMA options. (for OptsVisitor) +# +# Since 2.1 +## +{ 'union': 'NumaOptions', + 'data': { + 'node': 'NumaNodeOptions' }} + +## +# @NumaNodeOptions +# +# Create a guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID (increase by 1 from 0 if omitted) +# +# @cpus: #optional VCPUs belonging to this node (assign VCPUS round-robin +# if omitted) +# +# @mem: #optional memory size of this node (equally divide total memory among +# nodes if omitted) +# +# Since: 2.1 +## +{ 'type': 'NumaNodeOptions', + 'data': { + '*nodeid': 'uint16', + '*cpus': ['uint16'], + '*mem': 'size' }} diff --git a/vl.c b/vl.c index 7af90cd41b..469aa9c00c 100644 --- a/vl.c +++ b/vl.c @@ -2938,6 +2938,7 @@ int main(int argc, char **argv, char **envp) qemu_add_opts(&qemu_realtime_opts); qemu_add_opts(&qemu_msg_opts); qemu_add_opts(&qemu_name_opts); + qemu_add_opts(&qemu_numa_opts); runstate_init(); @@ -3133,7 +3134,10 @@ int main(int argc, char **argv, char **envp) } break; case QEMU_OPTION_numa: - numa_add(optarg); + opts = qemu_opts_parse(qemu_find_opts("numa"), optarg, 1); + if (!opts) { + exit(1); + } break; case QEMU_OPTION_display: display_type = select_display(optarg); @@ -4303,6 +4307,11 @@ int main(int argc, char **argv, char **envp) default_drive(default_floppy, snapshot, IF_FLOPPY, 0, FD_OPTS); default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS); + if (qemu_opts_foreach(qemu_find_opts("numa"), numa_init_func, + NULL, 1) != 0) { + exit(1); + } + set_numa_nodes(); if (qemu_opts_foreach(qemu_find_opts("mon"), mon_init_func, NULL, 1) != 0) { From 45e30bf3a98e313a22e7565497e50ddd2652eae7 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 14 May 2014 17:43:09 +0800 Subject: [PATCH 066/109] NUMA: expand MAX_NODES from 64 to 128 libnuma choosed 128 for MAX_NODES, so we follow libnuma here. Signed-off-by: Wanlong Gao Reviewed-by: Eduardo Habkost Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/sysemu/sysemu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 4102be320f..423d49efc6 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -133,7 +133,7 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 /* The following shall be true for all CPUs: * cpu->cpu_index < max_cpus <= MAX_CPUMASK_BITS From 4932b8971bb6fa384ce8c1c61db744b22ba83dce Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 14 May 2014 17:43:10 +0800 Subject: [PATCH 067/109] man: improve -numa doc The -numa option documentation in qemu's manpage lacks the command-line options and some information regarding how it relates to options -m and -smp. This commit fills in the missing text. Signed-off-by: Luiz Capitulino Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Signed-off-by: Michael S. Tsirkin Reviewed-by: Eduardo Habkost Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- qemu-options.hx | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index ab06df1e01..06111a67cd 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -97,10 +97,14 @@ ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) STEXI -@item -numa @var{opts} +@item -numa node[,mem=@var{size}][,cpus=@var{cpu[-cpu]}][,nodeid=@var{node}] @findex -numa -Simulate a multi node NUMA system. If mem and cpus are omitted, resources -are split equally. +Simulate a multi node NUMA system. If @samp{mem} +and @samp{cpus} are omitted, resources are split equally. Also, note +that the -@option{numa} option doesn't allocate any of the specified +resources. That is, it just assigns existing resources to NUMA nodes. This +means that one still has to use the @option{-m}, @option{-smp} options +to allocate RAM and VCPUs respectively. ETEXI DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd, From d1169464245bcd4c89cbcc64f8937df61ae6bd4b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:13 +0800 Subject: [PATCH 068/109] qmp: improve error reporting for -object and object-add Use QERR_INVALID_PARAMETER_VALUE for consistency. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- qmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qmp.c b/qmp.c index b722dbec9a..c3c0229cdf 100644 --- a/qmp.c +++ b/qmp.c @@ -540,7 +540,7 @@ void object_add(const char *type, const char *id, const QDict *qdict, klass = object_class_by_name(type); if (!klass) { - error_setg(errp, "invalid class name"); + error_setg(errp, "invalid object type: %s", type); return; } From dfabb8b91655f680eaa1aa05e9f226fbd596a70f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:15 +0800 Subject: [PATCH 069/109] numa: introduce memory_region_allocate_system_memory Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Signed-off-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin Reviewed-by: Eduardo Habkost Signed-off-by: Michael S. Tsirkin MST: resolve conflicts --- hw/i386/pc.c | 3 +-- include/hw/boards.h | 6 +++++- include/sysemu/sysemu.h | 1 + numa.c | 9 +++++++++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index ac7ac774e6..c674c1c796 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1215,8 +1215,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, * with older qemus that used qemu_ram_alloc(). */ ram = g_malloc(sizeof(*ram)); - memory_region_init_ram(ram, NULL, "pc.ram", ram_size); - vmstate_register_ram_global(ram); + memory_region_allocate_system_memory(ram, NULL, "pc.ram", ram_size); *ram_memory = ram; ram_below_4g = g_malloc(sizeof(*ram_below_4g)); memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram, diff --git a/include/hw/boards.h b/include/hw/boards.h index 429ac43abc..605a970934 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -43,9 +43,13 @@ struct QEMUMachine { const char *hw_version; }; -#define TYPE_MACHINE_SUFFIX "-machine" +void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, + const char *name, + uint64_t ram_size); + int qemu_register_machine(QEMUMachine *m); +#define TYPE_MACHINE_SUFFIX "-machine" #define TYPE_MACHINE "machine" #undef MACHINE /* BSD defines it and QEMU does not use it */ #define MACHINE(obj) \ diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 423d49efc6..caf88dd870 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -10,6 +10,7 @@ #include "qemu/notify.h" #include "qemu/main-loop.h" #include "qemu/bitmap.h" +#include "qom/object.h" /* vl.c */ diff --git a/numa.c b/numa.c index 6fb0888d60..8bab784a3e 100644 --- a/numa.c +++ b/numa.c @@ -32,6 +32,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "qapi/qmp/qerror.h" +#include "hw/boards.h" QemuOptsList qemu_numa_opts = { .name = "numa", @@ -193,3 +194,11 @@ void set_numa_modes(void) } } } + +void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, + const char *name, + uint64_t ram_size) +{ + memory_region_init_ram(mr, owner, name, ram_size); + vmstate_register_ram_global(mr); +} From e1c57ab86f3c4ea6532b51cfecf32770b45f5e7a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:18 +0800 Subject: [PATCH 070/109] memory: reorganize file-based allocation Split the internal interface in exec.c to a separate function, and push the check on mem_path up to memory_region_init_ram. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- exec.c | 105 ++++++++++++++++++++++++---------------- include/exec/cpu-all.h | 3 -- include/exec/ram_addr.h | 2 + include/sysemu/sysemu.h | 2 + memory.c | 7 ++- 5 files changed, 73 insertions(+), 46 deletions(-) diff --git a/exec.c b/exec.c index c3fbbb3fb8..02167ca5c9 100644 --- a/exec.c +++ b/exec.c @@ -1262,56 +1262,30 @@ static int memory_try_enable_merging(void *addr, size_t len) return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE); } -ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, - MemoryRegion *mr) +static ram_addr_t ram_block_add(RAMBlock *new_block) { - RAMBlock *block, *new_block; + RAMBlock *block; ram_addr_t old_ram_size, new_ram_size; old_ram_size = last_ram_offset() >> TARGET_PAGE_BITS; - size = TARGET_PAGE_ALIGN(size); - new_block = g_malloc0(sizeof(*new_block)); - new_block->fd = -1; - /* This assumes the iothread lock is taken here too. */ qemu_mutex_lock_ramlist(); - new_block->mr = mr; - new_block->offset = find_ram_offset(size); - if (host) { - new_block->host = host; - new_block->flags |= RAM_PREALLOC_MASK; - } else if (xen_enabled()) { - if (mem_path) { - fprintf(stderr, "-mem-path not supported with Xen\n"); - exit(1); - } - xen_ram_alloc(new_block->offset, size, mr); - } else { - if (mem_path) { - if (phys_mem_alloc != qemu_anon_ram_alloc) { - /* - * file_ram_alloc() needs to allocate just like - * phys_mem_alloc, but we haven't bothered to provide - * a hook there. - */ - fprintf(stderr, - "-mem-path not supported with this accelerator\n"); - exit(1); - } - new_block->host = file_ram_alloc(new_block, size, mem_path); - } - if (!new_block->host) { - new_block->host = phys_mem_alloc(size); + new_block->offset = find_ram_offset(new_block->length); + + if (!new_block->host) { + if (xen_enabled()) { + xen_ram_alloc(new_block->offset, new_block->length, new_block->mr); + } else { + new_block->host = phys_mem_alloc(new_block->length); if (!new_block->host) { fprintf(stderr, "Cannot set up guest memory '%s': %s\n", new_block->mr->name, strerror(errno)); exit(1); } - memory_try_enable_merging(new_block->host, size); + memory_try_enable_merging(new_block->host, new_block->length); } } - new_block->length = size; /* Keep the list sorted from biggest to smallest block. */ QTAILQ_FOREACH(block, &ram_list.blocks, next) { @@ -1339,18 +1313,65 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, old_ram_size, new_ram_size); } } - cpu_physical_memory_set_dirty_range(new_block->offset, size); + cpu_physical_memory_set_dirty_range(new_block->offset, new_block->length); - qemu_ram_setup_dump(new_block->host, size); - qemu_madvise(new_block->host, size, QEMU_MADV_HUGEPAGE); - qemu_madvise(new_block->host, size, QEMU_MADV_DONTFORK); + qemu_ram_setup_dump(new_block->host, new_block->length); + qemu_madvise(new_block->host, new_block->length, QEMU_MADV_HUGEPAGE); + qemu_madvise(new_block->host, new_block->length, QEMU_MADV_DONTFORK); - if (kvm_enabled()) - kvm_setup_guest_memory(new_block->host, size); + if (kvm_enabled()) { + kvm_setup_guest_memory(new_block->host, new_block->length); + } return new_block->offset; } +ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, + const char *mem_path) +{ + RAMBlock *new_block; + + if (xen_enabled()) { + fprintf(stderr, "-mem-path not supported with Xen\n"); + exit(1); + } + + if (phys_mem_alloc != qemu_anon_ram_alloc) { + /* + * file_ram_alloc() needs to allocate just like + * phys_mem_alloc, but we haven't bothered to provide + * a hook there. + */ + fprintf(stderr, + "-mem-path not supported with this accelerator\n"); + exit(1); + } + + size = TARGET_PAGE_ALIGN(size); + new_block = g_malloc0(sizeof(*new_block)); + new_block->mr = mr; + new_block->length = size; + new_block->host = file_ram_alloc(new_block, size, mem_path); + return ram_block_add(new_block); +} + +ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, + MemoryRegion *mr) +{ + RAMBlock *new_block; + + size = TARGET_PAGE_ALIGN(size); + new_block = g_malloc0(sizeof(*new_block)); + new_block->mr = mr; + new_block->length = size; + new_block->fd = -1; + new_block->host = host; + if (host) { + new_block->flags |= RAM_PREALLOC_MASK; + } + return ram_block_add(new_block); +} + ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr) { return qemu_ram_alloc_from_ptr(size, NULL, mr); diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index ed28f1ec63..eaddea6194 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -325,9 +325,6 @@ typedef struct RAMList { } RAMList; extern RAMList ram_list; -extern const char *mem_path; -extern int mem_prealloc; - /* Flags stored in the low bits of the TLB virtual address. These are defined so that fast path ram access is all zeros. */ /* Zero if TLB entry is valid. */ diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index b94de02ea7..9b00638505 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -22,6 +22,8 @@ #ifndef CONFIG_USER_ONLY #include "hw/xen/xen.h" +ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, + const char *mem_path); ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr); ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr); diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index caf88dd870..c4e1bbd9fd 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -133,6 +133,8 @@ extern uint8_t *boot_splash_filedata; extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; +extern const char *mem_path; +extern int mem_prealloc; #define MAX_NODES 128 diff --git a/memory.c b/memory.c index 3e8cd5680b..063effec6b 100644 --- a/memory.c +++ b/memory.c @@ -23,6 +23,7 @@ #include "exec/memory-internal.h" #include "exec/ram_addr.h" +#include "sysemu/sysemu.h" //#define DEBUG_UNASSIGNED @@ -1029,7 +1030,11 @@ void memory_region_init_ram(MemoryRegion *mr, mr->ram = true; mr->terminates = true; mr->destructor = memory_region_destructor_ram; - mr->ram_addr = qemu_ram_alloc(size, mr); + if (mem_path) { + mr->ram_addr = qemu_ram_alloc_from_file(size, mr, mem_path); + } else { + mr->ram_addr = qemu_ram_alloc(size, mr); + } } void memory_region_init_ram_ptr(MemoryRegion *mr, From 38183310be7af6af5ee60e7c9e9647133a7d11c3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:21 +0800 Subject: [PATCH 071/109] memory: move preallocation code out of exec.c So that backends can use it. Since we need the page size for efficiency, move code to compute it out of translate-all.c and into util/oslib-win32.c. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- exec.c | 44 +---------------------- include/qemu/osdep.h | 2 ++ include/sysemu/os-win32.h | 2 ++ translate-all.c | 7 ---- util/oslib-posix.c | 73 +++++++++++++++++++++++++++++++++++++++ util/oslib-win32.c | 19 ++++++++++ 6 files changed, 97 insertions(+), 50 deletions(-) diff --git a/exec.c b/exec.c index 02167ca5c9..dae50a1aa0 100644 --- a/exec.c +++ b/exec.c @@ -1011,13 +1011,6 @@ static long gethugepagesize(const char *path) return fs.f_bsize; } -static sigjmp_buf sigjump; - -static void sigbus_handler(int signal) -{ - siglongjmp(sigjump, 1); -} - static void *file_ram_alloc(RAMBlock *block, ram_addr_t memory, const char *path) @@ -1082,42 +1075,7 @@ static void *file_ram_alloc(RAMBlock *block, } if (mem_prealloc) { - int ret, i; - struct sigaction act, oldact; - sigset_t set, oldset; - - memset(&act, 0, sizeof(act)); - act.sa_handler = &sigbus_handler; - act.sa_flags = 0; - - ret = sigaction(SIGBUS, &act, &oldact); - if (ret) { - perror("file_ram_alloc: failed to install signal handler"); - exit(1); - } - - /* unblock SIGBUS */ - sigemptyset(&set); - sigaddset(&set, SIGBUS); - pthread_sigmask(SIG_UNBLOCK, &set, &oldset); - - if (sigsetjmp(sigjump, 1)) { - fprintf(stderr, "file_ram_alloc: failed to preallocate pages\n"); - exit(1); - } - - /* MAP_POPULATE silently ignores failures */ - for (i = 0; i < (memory/hpagesize); i++) { - memset(area + (hpagesize*i), 0, 1); - } - - ret = sigaction(SIGBUS, &oldact, NULL); - if (ret) { - perror("file_ram_alloc: failed to reinstall signal handler"); - exit(1); - } - - pthread_sigmask(SIG_SETMASK, &oldset, NULL); + os_mem_prealloc(fd, area, memory); } block->fd = fd; diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index ffb296692d..9c1a119703 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -251,4 +251,6 @@ void qemu_init_auxval(char **envp); void qemu_set_tty_echo(int fd, bool echo); +void os_mem_prealloc(int fd, char *area, size_t sz); + #endif diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h index bf8523ada1..af3fbc47d8 100644 --- a/include/sysemu/os-win32.h +++ b/include/sysemu/os-win32.h @@ -89,6 +89,8 @@ static inline void os_setup_post(void) {} void os_set_line_buffering(void); static inline void os_set_proc_name(const char *dummy) {} +size_t getpagesize(void); + #if !defined(EPROTONOSUPPORT) # define EPROTONOSUPPORT EINVAL #endif diff --git a/translate-all.c b/translate-all.c index 6b7b46e761..5425d038d9 100644 --- a/translate-all.c +++ b/translate-all.c @@ -295,14 +295,7 @@ void page_size_init(void) { /* NOTE: we can always suppose that qemu_host_page_size >= TARGET_PAGE_SIZE */ -#ifdef _WIN32 - SYSTEM_INFO system_info; - - GetSystemInfo(&system_info); - qemu_real_host_page_size = system_info.dwPageSize; -#else qemu_real_host_page_size = getpagesize(); -#endif if (qemu_host_page_size == 0) { qemu_host_page_size = qemu_real_host_page_size; } diff --git a/util/oslib-posix.c b/util/oslib-posix.c index 8e9c770d28..1524ead755 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -46,6 +46,7 @@ extern int daemon(int, int); #else # define QEMU_VMALLOC_ALIGN getpagesize() #endif +#define HUGETLBFS_MAGIC 0x958458f6 #include #include @@ -58,9 +59,12 @@ extern int daemon(int, int); #include "qemu/sockets.h" #include #include +#include +#include #ifdef CONFIG_LINUX #include +#include #endif #ifdef __FreeBSD__ @@ -332,3 +336,72 @@ char *qemu_get_exec_dir(void) { return g_strdup(exec_dir); } + +static sigjmp_buf sigjump; + +static void sigbus_handler(int signal) +{ + siglongjmp(sigjump, 1); +} + +static size_t fd_getpagesize(int fd) +{ +#ifdef CONFIG_LINUX + struct statfs fs; + int ret; + + if (fd != -1) { + do { + ret = fstatfs(fd, &fs); + } while (ret != 0 && errno == EINTR); + + if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) { + return fs.f_bsize; + } + } +#endif + + return getpagesize(); +} + +void os_mem_prealloc(int fd, char *area, size_t memory) +{ + int ret, i; + struct sigaction act, oldact; + sigset_t set, oldset; + size_t hpagesize = fd_getpagesize(fd); + + memset(&act, 0, sizeof(act)); + act.sa_handler = &sigbus_handler; + act.sa_flags = 0; + + ret = sigaction(SIGBUS, &act, &oldact); + if (ret) { + perror("os_mem_prealloc: failed to install signal handler"); + exit(1); + } + + /* unblock SIGBUS */ + sigemptyset(&set); + sigaddset(&set, SIGBUS); + pthread_sigmask(SIG_UNBLOCK, &set, &oldset); + + if (sigsetjmp(sigjump, 1)) { + fprintf(stderr, "os_mem_prealloc: failed to preallocate pages\n"); + exit(1); + } + + /* MAP_POPULATE silently ignores failures */ + memory = (memory + hpagesize - 1) & -hpagesize; + for (i = 0; i < (memory/hpagesize); i++) { + memset(area + (hpagesize*i), 0, 1); + } + + ret = sigaction(SIGBUS, &oldact, NULL); + if (ret) { + perror("os_mem_prealloc: failed to reinstall signal handler"); + exit(1); + } + + pthread_sigmask(SIG_SETMASK, &oldset, NULL); +} diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 69552f7ec3..ee6bcf1256 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -350,3 +350,22 @@ gint g_poll_fixed(GPollFD *fds, guint nfds, gint timeout) return num_completed; } + +size_t getpagesize(void) +{ + SYSTEM_INFO system_info; + + GetSystemInfo(&system_info); + return system_info.dwPageSize; +} + +void os_mem_prealloc(int fd, char *area, size_t memory) +{ + int i; + size_t pagesize = getpagesize(); + + memory = (memory + pagesize - 1) & -pagesize; + for (i = 0; i < memory / pagesize; i++) { + memset(area + pagesize * i, 0, 1); + } +} From 7bd4f430a3a226c0016de00818a2a67fd58b2047 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:22 +0800 Subject: [PATCH 072/109] memory: move RAM_PREALLOC_MASK to exec.c, rename Prepare for adding more flags. The "_MASK" suffix is unique, kill it. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- exec.c | 9 ++++++--- include/exec/cpu-all.h | 3 --- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/exec.c b/exec.c index dae50a1aa0..bad16e0613 100644 --- a/exec.c +++ b/exec.c @@ -70,6 +70,9 @@ AddressSpace address_space_memory; MemoryRegion io_mem_rom, io_mem_notdirty; static MemoryRegion io_mem_unassigned; +/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ +#define RAM_PREALLOC (1 << 0) + #endif struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus); @@ -1325,7 +1328,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, new_block->fd = -1; new_block->host = host; if (host) { - new_block->flags |= RAM_PREALLOC_MASK; + new_block->flags |= RAM_PREALLOC; } return ram_block_add(new_block); } @@ -1364,7 +1367,7 @@ void qemu_ram_free(ram_addr_t addr) QTAILQ_REMOVE(&ram_list.blocks, block, next); ram_list.mru_block = NULL; ram_list.version++; - if (block->flags & RAM_PREALLOC_MASK) { + if (block->flags & RAM_PREALLOC) { ; } else if (xen_enabled()) { xen_invalidate_map_cache_entry(block->host); @@ -1396,7 +1399,7 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) offset = addr - block->offset; if (offset < block->length) { vaddr = block->host + offset; - if (block->flags & RAM_PREALLOC_MASK) { + if (block->flags & RAM_PREALLOC) { ; } else if (xen_enabled()) { abort(); diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index eaddea6194..f91581fc65 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -297,9 +297,6 @@ CPUArchState *cpu_copy(CPUArchState *env); /* memory API */ -/* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ -#define RAM_PREALLOC_MASK (1 << 0) - typedef struct RAMBlock { struct MemoryRegion *mr; uint8_t *host; From a99d57bb6a9928ece3e5c05941167487b3acf1b4 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Wed, 14 May 2014 17:43:28 +0800 Subject: [PATCH 073/109] configure: add Linux libnuma detection Add detection of libnuma (mostly contained in the numactl package) to the configure script. Can be enabled or disabled on the command line, default is use if available. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- configure | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/configure b/configure index d23ded555d..fb3c856c78 100755 --- a/configure +++ b/configure @@ -334,6 +334,7 @@ tpm="no" libssh2="" vhdx="" quorum="" +numa="" # parse CC options first for opt do @@ -1118,6 +1119,10 @@ for opt do ;; --enable-quorum) quorum="yes" ;; + --disable-numa) numa="no" + ;; + --enable-numa) numa="yes" + ;; *) echo "ERROR: unknown option $opt" echo "Try '$0 --help' for more information" @@ -1384,6 +1389,8 @@ Advanced options (experts only): --enable-vhdx enable support for the Microsoft VHDX image format --disable-quorum disable quorum block filter support --enable-quorum enable quorum block filter support + --disable-numa disable libnuma support + --enable-numa enable libnuma support NOTE: The object files are built at the place where configure is launched EOF @@ -3167,6 +3174,26 @@ if compile_prog "" "" ; then splice=yes fi +########################################## +# libnuma probe + +if test "$numa" != "no" ; then + cat > $TMPC << EOF +#include +int main(void) { return numa_available(); } +EOF + + if compile_prog "" "-lnuma" ; then + numa=yes + libs_softmmu="-lnuma $libs_softmmu" + else + if test "$numa" = "yes" ; then + feature_not_found "numa" "install numactl devel" + fi + numa=no + fi +fi + ########################################## # signalfd probe signalfd="no" @@ -4241,6 +4268,7 @@ echo "vhdx $vhdx" echo "Quorum $quorum" echo "lzo support $lzo" echo "snappy support $snappy" +echo "NUMA host support $numa" if test "$sdl_too_old" = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -5210,6 +5238,10 @@ if [ "$dtc_internal" = "yes" ]; then echo "config-host.h: subdir-dtc" >> $config_host_mak fi +if test "$numa" = "yes"; then + echo "CONFIG_NUMA=y" >> $config_host_mak +fi + # build tree in object directory in case the source is not in the current directory DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32 tests/libqos tests/qapi-schema tests/tcg/xtensa tests/qemu-iotests" DIRS="$DIRS fsdev" From 7f8f9ef1dac25e76706ee0454bb6cfa097489170 Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Wed, 14 May 2014 17:43:30 +0800 Subject: [PATCH 074/109] Introduce signed range. Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Tested-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: split up patch --- include/qemu/range.h | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/include/qemu/range.h b/include/qemu/range.h index aae9720161..cfa021fd48 100644 --- a/include/qemu/range.h +++ b/include/qemu/range.h @@ -3,6 +3,7 @@ #include #include +#include "qemu/queue.h" /* * Operations on 64 bit address ranges. @@ -60,4 +61,75 @@ static inline int ranges_overlap(uint64_t first1, uint64_t len1, return !(last2 < first1 || last1 < first2); } +/* 0,1 can merge with 1,2 but don't overlap */ +static inline bool ranges_can_merge(Range *range1, Range *range2) +{ + return !(range1->end < range2->begin || range2->end < range1->begin); +} + +static inline int range_merge(Range *range1, Range *range2) +{ + if (ranges_can_merge(range1, range2)) { + if (range1->end < range2->end) { + range1->end = range2->end; + } + if (range1->begin > range2->begin) { + range1->begin = range2->begin; + } + return 0; + } + + return -1; +} + +static inline GList *g_list_insert_sorted_merged(GList *list, + gpointer data, + GCompareFunc func) +{ + GList *l, *next = NULL; + Range *r, *nextr; + + if (!list) { + list = g_list_insert_sorted(list, data, func); + return list; + } + + nextr = data; + l = list; + while (l && l != next && nextr) { + r = l->data; + if (ranges_can_merge(r, nextr)) { + range_merge(r, nextr); + l = g_list_remove_link(l, next); + next = g_list_next(l); + if (next) { + nextr = next->data; + } else { + nextr = NULL; + } + } else { + l = g_list_next(l); + } + } + + if (!l) { + list = g_list_insert_sorted(list, data, func); + } + + return list; +} + +static inline gint range_compare(gconstpointer a, gconstpointer b) +{ + Range *ra = (Range *)a, *rb = (Range *)b; + if (ra->begin == rb->begin && ra->end == rb->end) { + return 0; + } else if (range_get_last(ra->begin, ra->end) < + range_get_last(rb->begin, rb->end)) { + return -1; + } else { + return 1; + } +} + #endif From 1f21772db0b14049b1b4bfef1f039b40827c676f Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Wed, 14 May 2014 17:43:33 +0800 Subject: [PATCH 075/109] qom: introduce object_property_get_enum and object_property_get_uint16List Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- include/qom/object.h | 28 ++++++++++++++++++++++++++++ qom/object.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/include/qom/object.h b/include/qom/object.h index a641dcde10..b882ccc85f 100644 --- a/include/qom/object.h +++ b/include/qom/object.h @@ -916,6 +916,34 @@ void object_property_set_int(Object *obj, int64_t value, int64_t object_property_get_int(Object *obj, const char *name, Error **errp); +/** + * object_property_get_enum: + * @obj: the object + * @name: the name of the property + * @strings: strings corresponding to enums + * @errp: returns an error if this function fails + * + * Returns: the value of the property, converted to an integer, or + * undefined if an error occurs (including when the property value is not + * an enum). + */ +int object_property_get_enum(Object *obj, const char *name, + const char *strings[], Error **errp); + +/** + * object_property_get_uint16List: + * @obj: the object + * @name: the name of the property + * @list: the returned int list + * @errp: returns an error if this function fails + * + * Returns: the value of the property, converted to integers, or + * undefined if an error occurs (including when the property value is not + * an list of integers). + */ +void object_property_get_uint16List(Object *obj, const char *name, + uint16List **list, Error **errp); + /** * object_property_set: * @obj: the object diff --git a/qom/object.c b/qom/object.c index e42b254303..3876618c2e 100644 --- a/qom/object.c +++ b/qom/object.c @@ -13,6 +13,7 @@ #include "qom/object.h" #include "qemu-common.h" #include "qapi/visitor.h" +#include "qapi-visit.h" #include "qapi/string-input-visitor.h" #include "qapi/string-output-visitor.h" #include "qapi/qmp/qerror.h" @@ -938,6 +939,40 @@ int64_t object_property_get_int(Object *obj, const char *name, return retval; } +int object_property_get_enum(Object *obj, const char *name, + const char *strings[], Error **errp) +{ + StringOutputVisitor *sov; + StringInputVisitor *siv; + int ret; + + sov = string_output_visitor_new(false); + object_property_get(obj, string_output_get_visitor(sov), name, errp); + siv = string_input_visitor_new(string_output_get_string(sov)); + string_output_visitor_cleanup(sov); + visit_type_enum(string_input_get_visitor(siv), + &ret, strings, NULL, name, errp); + string_input_visitor_cleanup(siv); + + return ret; +} + +void object_property_get_uint16List(Object *obj, const char *name, + uint16List **list, Error **errp) +{ + StringOutputVisitor *ov; + StringInputVisitor *iv; + + ov = string_output_visitor_new(false); + object_property_get(obj, string_output_get_visitor(ov), + name, errp); + iv = string_input_visitor_new(string_output_get_string(ov)); + visit_type_uint16List(string_input_get_visitor(iv), + list, NULL, errp); + string_output_visitor_cleanup(ov); + string_input_visitor_cleanup(iv); +} + void object_property_parse(Object *obj, const char *string, const char *name, Error **errp) { From 7febe36f9adbb34756a6a6765a36ea49b6e502ac Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:17 +0800 Subject: [PATCH 076/109] numa: add -numa node,memdev= option This option provides the infrastructure for binding guest NUMA nodes to host NUMA nodes. For example: -object memory-ram,size=1024M,policy=bind,host-nodes=0,id=ram-node0 \ -numa node,nodeid=0,cpus=0,memdev=ram-node0 \ -object memory-ram,size=1024M,policy=interleave,host-nodes=1-3,id=ram-node1 \ -numa node,nodeid=1,cpus=1,memdev=ram-node1 The option replaces "-numa node,mem=". Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Signed-off-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin MST: conflict resolution --- include/sysemu/sysemu.h | 1 + numa.c | 67 +++++++++++++++++++++++++++++++++++++++-- qapi-schema.json | 11 +++++-- qemu-options.hx | 12 ++++++-- vl.c | 4 +-- 5 files changed, 84 insertions(+), 11 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index c4e1bbd9fd..277230db49 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -149,6 +149,7 @@ extern int nb_numa_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); + struct HostMemoryBackend *node_memdev; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; void set_numa_nodes(void); diff --git a/numa.c b/numa.c index 8bab784a3e..b24bb9dd9f 100644 --- a/numa.c +++ b/numa.c @@ -33,6 +33,7 @@ #include "qapi/dealloc-visitor.h" #include "qapi/qmp/qerror.h" #include "hw/boards.h" +#include "sysemu/hostmem.h" QemuOptsList qemu_numa_opts = { .name = "numa", @@ -41,6 +42,8 @@ QemuOptsList qemu_numa_opts = { .desc = { { 0 } } /* validated with OptsVisitor */ }; +static int have_memdevs = -1; + static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) { uint16_t nodenr; @@ -67,6 +70,20 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } + if (node->has_mem && node->has_memdev) { + error_setg(errp, "qemu: cannot specify both mem= and memdev=\n"); + return; + } + + if (have_memdevs == -1) { + have_memdevs = node->has_memdev; + } + if (node->has_memdev != have_memdevs) { + error_setg(errp, "qemu: memdev option must be specified for either " + "all or no nodes\n"); + return; + } + if (node->has_mem) { uint64_t mem_size = node->mem; const char *mem_str = qemu_opt_get(opts, "mem"); @@ -76,6 +93,18 @@ static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp) } numa_info[nodenr].node_mem = mem_size; } + if (node->has_memdev) { + Object *o; + o = object_resolve_path_type(node->memdev, TYPE_MEMORY_BACKEND, NULL); + if (!o) { + error_setg(errp, "memdev=%s is ambiguous", node->memdev); + return; + } + + object_ref(o); + numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL); + numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); + } } int numa_init_func(QemuOpts *opts, void *opaque) @@ -195,10 +224,42 @@ void set_numa_modes(void) } } -void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, - const char *name, - uint64_t ram_size) +static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, + const char *name, + uint64_t ram_size) { memory_region_init_ram(mr, owner, name, ram_size); vmstate_register_ram_global(mr); } + +void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, + const char *name, + uint64_t ram_size) +{ + uint64_t addr = 0; + int i; + + if (nb_numa_nodes == 0 || !have_memdevs) { + allocate_system_memory_nonnuma(mr, owner, name, ram_size); + return; + } + + memory_region_init(mr, owner, name, ram_size); + for (i = 0; i < MAX_NODES; i++) { + Error *local_err = NULL; + uint64_t size = numa_info[i].node_mem; + HostMemoryBackend *backend = numa_info[i].node_memdev; + if (!backend) { + continue; + } + MemoryRegion *seg = host_memory_backend_get_memory(backend, &local_err); + if (local_err) { + qerror_report_err(local_err); + exit(1); + } + + memory_region_add_subregion(mr, addr, seg); + vmstate_register_ram_global(seg); + addr += size; + } +} diff --git a/qapi-schema.json b/qapi-schema.json index e65b7b1489..e05f8ba4f9 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3119,8 +3119,12 @@ # @cpus: #optional VCPUs belonging to this node (assign VCPUS round-robin # if omitted) # -# @mem: #optional memory size of this node (equally divide total memory among -# nodes if omitted) +# @mem: #optional memory size of this node; mutually exclusive with @memdev. +# Equally divide total memory among nodes if both @mem and @memdev are +# omitted. +# +# @memdev: #optional memory backend object. If specified for one node, +# it must be specified for all nodes. # # Since: 2.1 ## @@ -3128,4 +3132,5 @@ 'data': { '*nodeid': 'uint16', '*cpus': ['uint16'], - '*mem': 'size' }} + '*mem': 'size', + '*memdev': 'str' }} diff --git a/qemu-options.hx b/qemu-options.hx index 06111a67cd..ca75760b27 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,16 +95,22 @@ specifies the maximum number of hotpluggable CPUs. ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, - "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) + "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n" + "-numa node[,memdev=id][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) STEXI @item -numa node[,mem=@var{size}][,cpus=@var{cpu[-cpu]}][,nodeid=@var{node}] +@item -numa node[,memdev=@var{id}][,cpus=@var{cpu[-cpu]}][,nodeid=@var{node}] @findex -numa -Simulate a multi node NUMA system. If @samp{mem} +Simulate a multi node NUMA system. If @samp{mem}, @samp{memdev} and @samp{cpus} are omitted, resources are split equally. Also, note that the -@option{numa} option doesn't allocate any of the specified resources. That is, it just assigns existing resources to NUMA nodes. This means that one still has to use the @option{-m}, @option{-smp} options -to allocate RAM and VCPUs respectively. +to allocate RAM and VCPUs respectively, and possibly @option{-object} +to specify the memory backend for the @samp{memdev} suboption. + +@samp{mem} and @samp{memdev} are mutually exclusive. Furthermore, if one +node uses @samp{memdev}, all of them have to use it. ETEXI DEF("add-fd", HAS_ARG, QEMU_OPTION_add_fd, diff --git a/vl.c b/vl.c index 469aa9c00c..f78f6da1c1 100644 --- a/vl.c +++ b/vl.c @@ -3952,6 +3952,8 @@ int main(int argc, char **argv, char **envp) exit(1); } + cpu_exec_init_all(); + current_machine = MACHINE(object_new(object_class_get_name( OBJECT_CLASS(machine_class)))); object_property_add_child(object_get_root(), "machine", @@ -4289,8 +4291,6 @@ int main(int argc, char **argv, char **envp) } } - cpu_exec_init_all(); - blk_mig_init(); ram_mig_init(); From 0b183fc871e61f4a586fdef2c0f880b6a856e444 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:19 +0800 Subject: [PATCH 077/109] memory: move mem_path handling to memory_region_allocate_system_memory Like the previous patch did in exec.c, split memory_region_init_ram and memory_region_init_ram_from_file, and push mem_path one step further up. Other RAM regions than system memory will now be backed by regular RAM. Also, boards that do not use memory_region_allocate_system_memory will not support -mem-path anymore. This can be changed before the patches are merged by migrating boards to use the function. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- exec.c | 10 ++-------- include/exec/memory.h | 18 ++++++++++++++++++ memory.c | 21 ++++++++++++++++----- numa.c | 11 ++++++++++- 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/exec.c b/exec.c index bad16e0613..11e0328095 100644 --- a/exec.c +++ b/exec.c @@ -1090,14 +1090,6 @@ error: } return NULL; } -#else -static void *file_ram_alloc(RAMBlock *block, - ram_addr_t memory, - const char *path) -{ - fprintf(stderr, "-mem-path not supported on this host\n"); - exit(1); -} #endif static ram_addr_t find_ram_offset(ram_addr_t size) @@ -1287,6 +1279,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block) return new_block->offset; } +#ifdef __linux__ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, const char *mem_path) { @@ -1315,6 +1308,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, new_block->host = file_ram_alloc(new_block, size, mem_path); return ram_block_add(new_block); } +#endif ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr) diff --git a/include/exec/memory.h b/include/exec/memory.h index f4c8d4933e..65ddb7a2a8 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -311,6 +311,24 @@ void memory_region_init_ram(MemoryRegion *mr, const char *name, uint64_t size); +#ifdef __linux__ +/** + * memory_region_init_ram_from_file: Initialize RAM memory region with a + * mmap-ed backend. + * + * @mr: the #MemoryRegion to be initialized. + * @owner: the object that tracks the region's reference count + * @name: the name of the region. + * @size: size of the region. + * @path: the path in which to allocate the RAM. + */ +void memory_region_init_ram_from_file(MemoryRegion *mr, + struct Object *owner, + const char *name, + uint64_t size, + const char *path); +#endif + /** * memory_region_init_ram_ptr: Initialize RAM memory region from a * user-provided pointer. Accesses into the diff --git a/memory.c b/memory.c index 063effec6b..09f98fcbe6 100644 --- a/memory.c +++ b/memory.c @@ -1030,13 +1030,24 @@ void memory_region_init_ram(MemoryRegion *mr, mr->ram = true; mr->terminates = true; mr->destructor = memory_region_destructor_ram; - if (mem_path) { - mr->ram_addr = qemu_ram_alloc_from_file(size, mr, mem_path); - } else { - mr->ram_addr = qemu_ram_alloc(size, mr); - } + mr->ram_addr = qemu_ram_alloc(size, mr); } +#ifdef __linux__ +void memory_region_init_ram_from_file(MemoryRegion *mr, + struct Object *owner, + const char *name, + uint64_t size, + const char *path) +{ + memory_region_init(mr, owner, name, size); + mr->ram = true; + mr->terminates = true; + mr->destructor = memory_region_destructor_ram; + mr->ram_addr = qemu_ram_alloc_from_file(size, mr, path); +} +#endif + void memory_region_init_ram_ptr(MemoryRegion *mr, Object *owner, const char *name, diff --git a/numa.c b/numa.c index b24bb9dd9f..7c36bb5b60 100644 --- a/numa.c +++ b/numa.c @@ -228,7 +228,16 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, const char *name, uint64_t ram_size) { - memory_region_init_ram(mr, owner, name, ram_size); + if (mem_path) { +#ifdef __linux__ + memory_region_init_ram_from_file(mr, owner, name, ram_size, mem_path); +#else + fprintf(stderr, "-mem-path not supported on this host\n"); + exit(1); +#endif + } else { + memory_region_init_ram(mr, owner, name, ram_size); + } vmstate_register_ram_global(mr); } From 7f56e740a68c9f4ccebf7ad7590e82fbb30ffc87 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 14 May 2014 17:43:20 +0800 Subject: [PATCH 078/109] memory: add error propagation to file-based RAM allocation Right now, -mem-path will fall back to RAM-based allocation in some cases. This should never happen with "-object memory-file", prepare the code by adding correct error propagation. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: drop \n at end of error messages --- exec.c | 36 ++++++++++++++++++++++++------------ include/exec/memory.h | 5 ++++- include/exec/ram_addr.h | 2 +- memory.c | 5 +++-- numa.c | 13 ++++++++++++- 5 files changed, 44 insertions(+), 17 deletions(-) diff --git a/exec.c b/exec.c index 11e0328095..997ef6a5a8 100644 --- a/exec.c +++ b/exec.c @@ -1016,7 +1016,8 @@ static long gethugepagesize(const char *path) static void *file_ram_alloc(RAMBlock *block, ram_addr_t memory, - const char *path) + const char *path, + Error **errp) { char *filename; char *sanitized_name; @@ -1035,7 +1036,8 @@ static void *file_ram_alloc(RAMBlock *block, } if (kvm_enabled() && !kvm_has_sync_mmu()) { - fprintf(stderr, "host lacks kvm mmu notifiers, -mem-path unsupported\n"); + error_setg(errp, + "host lacks kvm mmu notifiers, -mem-path unsupported"); goto error; } @@ -1052,7 +1054,8 @@ static void *file_ram_alloc(RAMBlock *block, fd = mkstemp(filename); if (fd < 0) { - perror("unable to create backing store for hugepages"); + error_setg_errno(errp, errno, + "unable to create backing store for hugepages"); g_free(filename); goto error; } @@ -1067,12 +1070,14 @@ static void *file_ram_alloc(RAMBlock *block, * If anything goes wrong with it under other filesystems, * mmap will fail. */ - if (ftruncate(fd, memory)) + if (ftruncate(fd, memory)) { perror("ftruncate"); + } area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (area == MAP_FAILED) { - perror("file_ram_alloc: can't mmap RAM pages"); + error_setg_errno(errp, errno, + "unable to map backing store for hugepages"); close(fd); goto error; } @@ -1281,13 +1286,14 @@ static ram_addr_t ram_block_add(RAMBlock *new_block) #ifdef __linux__ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, - const char *mem_path) + const char *mem_path, + Error **errp) { RAMBlock *new_block; if (xen_enabled()) { - fprintf(stderr, "-mem-path not supported with Xen\n"); - exit(1); + error_setg(errp, "-mem-path not supported with Xen"); + return -1; } if (phys_mem_alloc != qemu_anon_ram_alloc) { @@ -1296,16 +1302,22 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, * phys_mem_alloc, but we haven't bothered to provide * a hook there. */ - fprintf(stderr, - "-mem-path not supported with this accelerator\n"); - exit(1); + error_setg(errp, + "-mem-path not supported with this accelerator"); + return -1; } size = TARGET_PAGE_ALIGN(size); new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->length = size; - new_block->host = file_ram_alloc(new_block, size, mem_path); + new_block->host = file_ram_alloc(new_block, size, + mem_path, errp); + if (!new_block->host) { + g_free(new_block); + return -1; + } + return ram_block_add(new_block); } #endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 65ddb7a2a8..4c7bacf959 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -31,6 +31,7 @@ #include "qemu/queue.h" #include "qemu/int128.h" #include "qemu/notify.h" +#include "qapi/error.h" #define MAX_PHYS_ADDR_SPACE_BITS 62 #define MAX_PHYS_ADDR (((hwaddr)1 << MAX_PHYS_ADDR_SPACE_BITS) - 1) @@ -321,12 +322,14 @@ void memory_region_init_ram(MemoryRegion *mr, * @name: the name of the region. * @size: size of the region. * @path: the path in which to allocate the RAM. + * @errp: pointer to Error*, to store an error if it happens. */ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, - const char *path); + const char *path, + Error **errp); #endif /** diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 9b00638505..deafcebac0 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -23,7 +23,7 @@ #include "hw/xen/xen.h" ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, - const char *mem_path); + const char *mem_path, Error **errp); ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr); ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr); diff --git a/memory.c b/memory.c index 09f98fcbe6..5ef7167848 100644 --- a/memory.c +++ b/memory.c @@ -1038,13 +1038,14 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, - const char *path) + const char *path, + Error **errp) { memory_region_init(mr, owner, name, size); mr->ram = true; mr->terminates = true; mr->destructor = memory_region_destructor_ram; - mr->ram_addr = qemu_ram_alloc_from_file(size, mr, path); + mr->ram_addr = qemu_ram_alloc_from_file(size, mr, path, errp); } #endif diff --git a/numa.c b/numa.c index 7c36bb5b60..8af9c916ac 100644 --- a/numa.c +++ b/numa.c @@ -230,7 +230,18 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, { if (mem_path) { #ifdef __linux__ - memory_region_init_ram_from_file(mr, owner, name, ram_size, mem_path); + Error *err = NULL; + memory_region_init_ram_from_file(mr, owner, name, ram_size, + mem_path, &err); + + /* Legacy behavior: if allocation failed, fall back to + * regular RAM allocation. + */ + if (!memory_region_size(mr)) { + qerror_report_err(err); + error_free(err); + memory_region_init_ram(mr, owner, name, ram_size); + } #else fprintf(stderr, "-mem-path not supported on this host\n"); exit(1); From c4090f8ef661fc314610883008c6593c720db564 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2014 19:15:15 +0800 Subject: [PATCH 079/109] vl: redo -object parsing Follow the lines of the HMP implementation, using OptsVisitor to parse the options. This gives access to OptsVisitor's rich parsing of integer lists. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- vl.c | 69 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/vl.c b/vl.c index f78f6da1c1..b27c2a6422 100644 --- a/vl.c +++ b/vl.c @@ -116,7 +116,7 @@ int main(int argc, char **argv) #include "ui/qemu-spice.h" #include "qapi/string-input-visitor.h" -#include "qom/object_interfaces.h" +#include "qapi/opts-visitor.h" #define DEFAULT_RAM_SIZE 128 @@ -2822,44 +2822,51 @@ static int object_set_property(const char *name, const char *value, void *opaque static int object_create(QemuOpts *opts, void *opaque) { - const char *type = qemu_opt_get(opts, "qom-type"); - const char *id = qemu_opts_id(opts); - Error *local_err = NULL; - Object *obj; + Error *err = NULL; + char *type = NULL; + char *id = NULL; + void *dummy = NULL; + OptsVisitor *ov; + QDict *pdict; - g_assert(type != NULL); + ov = opts_visitor_new(opts); + pdict = qemu_opts_to_qdict(opts, NULL); - if (id == NULL) { - qerror_report(QERR_MISSING_PARAMETER, "id"); - return -1; - } - - obj = object_new(type); - if (qemu_opt_foreach(opts, object_set_property, obj, 1) < 0) { - object_unref(obj); - return -1; - } - - if (!object_dynamic_cast(obj, TYPE_USER_CREATABLE)) { - error_setg(&local_err, "object '%s' isn't supported by -object", - id); + visit_start_struct(opts_get_visitor(ov), &dummy, NULL, NULL, 0, &err); + if (err) { goto out; } - object_property_add_child(container_get(object_get_root(), "/objects"), - id, obj, &local_err); - - user_creatable_complete(obj, &local_err); - if (local_err) { - object_property_del(container_get(object_get_root(), "/objects"), - id, &error_abort); + qdict_del(pdict, "qom-type"); + visit_type_str(opts_get_visitor(ov), &type, "qom-type", &err); + if (err) { goto out; } + + qdict_del(pdict, "id"); + visit_type_str(opts_get_visitor(ov), &id, "id", &err); + if (err) { + goto out; + } + + object_add(type, id, pdict, opts_get_visitor(ov), &err); + if (err) { + goto out; + } + visit_end_struct(opts_get_visitor(ov), &err); + if (err) { + qmp_object_del(id, NULL); + } + out: - object_unref(obj); - if (local_err) { - qerror_report_err(local_err); - error_free(local_err); + opts_visitor_cleanup(ov); + + QDECREF(pdict); + g_free(id); + g_free(type); + g_free(dummy); + if (err) { + qerror_report_err(err); return -1; } return 0; From 9521d42b546f2f624d4dcd299e13a9ab37eb1d82 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2014 19:15:17 +0800 Subject: [PATCH 080/109] pc: pass MachineState to pc_memory_init Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Reviewed-By: Igor Mammedov Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 24 ++++++++++++------------ hw/i386/pc_piix.c | 8 +++----- hw/i386/pc_q35.c | 4 +--- include/hw/i386/pc.h | 7 +++---- 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index c674c1c796..67eb45089e 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1190,10 +1190,8 @@ void pc_acpi_init(const char *default_dsdt) } } -FWCfgState *pc_memory_init(MemoryRegion *system_memory, - const char *kernel_filename, - const char *kernel_cmdline, - const char *initrd_filename, +FWCfgState *pc_memory_init(MachineState *machine, + MemoryRegion *system_memory, ram_addr_t below_4g_mem_size, ram_addr_t above_4g_mem_size, MemoryRegion *rom_memory, @@ -1204,18 +1202,19 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, MemoryRegion *ram, *option_rom_mr; MemoryRegion *ram_below_4g, *ram_above_4g; FWCfgState *fw_cfg; - ram_addr_t ram_size = below_4g_mem_size + above_4g_mem_size; - MachineState *machine = MACHINE(qdev_get_machine()); PCMachineState *pcms = PC_MACHINE(machine); - linux_boot = (kernel_filename != NULL); + assert(machine->ram_size == below_4g_mem_size + above_4g_mem_size); + + linux_boot = (machine->kernel_filename != NULL); /* Allocate RAM. We allocate it as a single memory region and use * aliases to address portions of it, mostly for backwards compatibility * with older qemus that used qemu_ram_alloc(). */ ram = g_malloc(sizeof(*ram)); - memory_region_allocate_system_memory(ram, NULL, "pc.ram", ram_size); + memory_region_allocate_system_memory(ram, NULL, "pc.ram", + machine->ram_size); *ram_memory = ram; ram_below_4g = g_malloc(sizeof(*ram_below_4g)); memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram, @@ -1233,7 +1232,7 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, if (!guest_info->has_reserved_memory && (machine->ram_slots || - (machine->maxram_size > ram_size))) { + (machine->maxram_size > machine->ram_size))) { MachineClass *mc = MACHINE_GET_CLASS(machine); error_report("\"-memory 'slots|maxmem'\" is not supported by: %s", @@ -1243,9 +1242,9 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, /* initialize hotplug memory address space */ if (guest_info->has_reserved_memory && - (ram_size < machine->maxram_size)) { + (machine->ram_size < machine->maxram_size)) { ram_addr_t hotplug_mem_size = - machine->maxram_size - ram_size; + machine->maxram_size - machine->ram_size; if (machine->ram_slots > ACPI_MAX_RAM_SLOTS) { error_report("unsupported amount of memory slots: %"PRIu64, @@ -1290,7 +1289,8 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, } if (linux_boot) { - load_linux(fw_cfg, kernel_filename, initrd_filename, kernel_cmdline, below_4g_mem_size); + load_linux(fw_cfg, machine->kernel_filename, machine->initrd_filename, + machine->kernel_cmdline, below_4g_mem_size); } for (i = 0; i < nb_option_roms; i++) { diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index a13e8d67db..3e7524b961 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -156,11 +156,9 @@ static void pc_init1(MachineState *machine, /* allocate ram and load rom/bios */ if (!xen_enabled()) { - fw_cfg = pc_memory_init(system_memory, - machine->kernel_filename, machine->kernel_cmdline, - machine->initrd_filename, - below_4g_mem_size, above_4g_mem_size, - rom_memory, &ram_memory, guest_info); + fw_cfg = pc_memory_init(machine, system_memory, + below_4g_mem_size, above_4g_mem_size, + rom_memory, &ram_memory, guest_info); } gsi_state = g_malloc0(sizeof(*gsi_state)); diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 629eb2d69b..aa71332ee1 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -143,9 +143,7 @@ static void pc_q35_init(MachineState *machine) /* allocate ram and load rom/bios */ if (!xen_enabled()) { - pc_memory_init(get_system_memory(), - machine->kernel_filename, machine->kernel_cmdline, - machine->initrd_filename, + pc_memory_init(machine, get_system_memory(), below_4g_mem_size, above_4g_mem_size, rom_memory, &ram_memory, guest_info); } diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index a9529f418c..76d4c6e725 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -3,6 +3,7 @@ #include "qemu-common.h" #include "exec/memory.h" +#include "hw/boards.h" #include "hw/isa/isa.h" #include "hw/block/fdc.h" #include "net/net.h" @@ -183,10 +184,8 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory, MemoryRegion *pci_address_space); -FWCfgState *pc_memory_init(MemoryRegion *system_memory, - const char *kernel_filename, - const char *kernel_cmdline, - const char *initrd_filename, +FWCfgState *pc_memory_init(MachineState *machine, + MemoryRegion *system_memory, ram_addr_t below_4g_mem_size, ram_addr_t above_4g_mem_size, MemoryRegion *rom_memory, From 58f4662c6c98a9fd05519745661177792d7aeede Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Tue, 10 Jun 2014 19:15:18 +0800 Subject: [PATCH 081/109] backend:hostmem: replace hostmemory with host_memory ..to keep names consistant. Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- backends/hostmem.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/backends/hostmem.c b/backends/hostmem.c index 2f578acc81..2d9fd00cb3 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -17,8 +17,8 @@ #include "qom/object_interfaces.h" static void -hostmemory_backend_get_size(Object *obj, Visitor *v, void *opaque, - const char *name, Error **errp) +host_memory_backend_get_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); uint64_t value = backend->size; @@ -27,8 +27,8 @@ hostmemory_backend_get_size(Object *obj, Visitor *v, void *opaque, } static void -hostmemory_backend_set_size(Object *obj, Visitor *v, void *opaque, - const char *name, Error **errp) +host_memory_backend_set_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); Error *local_err = NULL; @@ -53,14 +53,14 @@ out: error_propagate(errp, local_err); } -static void hostmemory_backend_init(Object *obj) +static void host_memory_backend_init(Object *obj) { object_property_add(obj, "size", "int", - hostmemory_backend_get_size, - hostmemory_backend_set_size, NULL, NULL, NULL); + host_memory_backend_get_size, + host_memory_backend_set_size, NULL, NULL, NULL); } -static void hostmemory_backend_finalize(Object *obj) +static void host_memory_backend_finalize(Object *obj) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); @@ -75,14 +75,14 @@ host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp) return memory_region_size(&backend->mr) ? &backend->mr : NULL; } -static const TypeInfo hostmemory_backend_info = { +static const TypeInfo host_memory_backend_info = { .name = TYPE_MEMORY_BACKEND, .parent = TYPE_OBJECT, .abstract = true, .class_size = sizeof(HostMemoryBackendClass), .instance_size = sizeof(HostMemoryBackend), - .instance_init = hostmemory_backend_init, - .instance_finalize = hostmemory_backend_finalize, + .instance_init = host_memory_backend_init, + .instance_finalize = host_memory_backend_finalize, .interfaces = (InterfaceInfo[]) { { TYPE_USER_CREATABLE }, { } @@ -91,7 +91,7 @@ static const TypeInfo hostmemory_backend_info = { static void register_types(void) { - type_register_static(&hostmemory_backend_info); + type_register_static(&host_memory_backend_info); } type_init(register_types); From bd9262d95f9172a5f4897aeea341c219a6c6cf96 Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Tue, 10 Jun 2014 19:15:19 +0800 Subject: [PATCH 082/109] hostmem: separate allocation from UserCreatable complete method This allows the superclass to set various policies on the memory region that the subclass creates. Drops hostmem-ram's complete method accordingly. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- backends/hostmem-ram.c | 7 +++---- backends/hostmem.c | 20 ++++++++++++++++++++ include/sysemu/hostmem.h | 2 ++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c index bba2ebcce8..d9a8290dc9 100644 --- a/backends/hostmem-ram.c +++ b/backends/hostmem-ram.c @@ -16,9 +16,8 @@ static void -ram_backend_memory_init(UserCreatable *uc, Error **errp) +ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) { - HostMemoryBackend *backend = MEMORY_BACKEND(uc); char *path; if (!backend->size) { @@ -35,9 +34,9 @@ ram_backend_memory_init(UserCreatable *uc, Error **errp) static void ram_backend_class_init(ObjectClass *oc, void *data) { - UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc); - ucc->complete = ram_backend_memory_init; + bc->alloc = ram_backend_memory_alloc; } static const TypeInfo ram_backend_info = { diff --git a/backends/hostmem.c b/backends/hostmem.c index 2d9fd00cb3..1738774895 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -75,11 +75,31 @@ host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp) return memory_region_size(&backend->mr) ? &backend->mr : NULL; } +static void +host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(uc); + HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc); + + if (bc->alloc) { + bc->alloc(backend, errp); + } +} + +static void +host_memory_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->complete = host_memory_backend_memory_complete; +} + static const TypeInfo host_memory_backend_info = { .name = TYPE_MEMORY_BACKEND, .parent = TYPE_OBJECT, .abstract = true, .class_size = sizeof(HostMemoryBackendClass), + .class_init = host_memory_backend_class_init, .instance_size = sizeof(HostMemoryBackend), .instance_init = host_memory_backend_init, .instance_finalize = host_memory_backend_finalize, diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h index 4fc081eb17..923f6721d2 100644 --- a/include/sysemu/hostmem.h +++ b/include/sysemu/hostmem.h @@ -34,6 +34,8 @@ typedef struct HostMemoryBackendClass HostMemoryBackendClass; */ struct HostMemoryBackendClass { ObjectClass parent_class; + + void (*alloc)(HostMemoryBackend *backend, Error **errp); }; /** From 52330e1a96bd09e8570c5694585798982da27adf Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2014 19:15:21 +0800 Subject: [PATCH 083/109] hostmem: add file-based HostMemoryBackend Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: comment tweak --- backends/Makefile.objs | 1 + backends/hostmem-file.c | 107 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 backends/hostmem-file.c diff --git a/backends/Makefile.objs b/backends/Makefile.objs index 7fb7acdd11..506a46c33b 100644 --- a/backends/Makefile.objs +++ b/backends/Makefile.objs @@ -8,3 +8,4 @@ baum.o-cflags := $(SDL_CFLAGS) common-obj-$(CONFIG_TPM) += tpm.o common-obj-y += hostmem.o hostmem-ram.o +common-obj-$(CONFIG_LINUX) += hostmem-file.o diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c new file mode 100644 index 0000000000..55d1c62ddd --- /dev/null +++ b/backends/hostmem-file.c @@ -0,0 +1,107 @@ +/* + * QEMU Host Memory Backend for hugetlbfs + * + * Copyright (C) 2013-2014 Red Hat Inc + * + * Authors: + * Paolo Bonzini + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "sysemu/hostmem.h" +#include "qom/object_interfaces.h" + +/* hostmem-file.c */ +/** + * @TYPE_MEMORY_BACKEND_FILE: + * name of backend that uses mmap on a file descriptor + */ +#define TYPE_MEMORY_BACKEND_FILE "memory-backend-file" + +#define MEMORY_BACKEND_FILE(obj) \ + OBJECT_CHECK(HostMemoryBackendFile, (obj), TYPE_MEMORY_BACKEND_FILE) + +typedef struct HostMemoryBackendFile HostMemoryBackendFile; + +struct HostMemoryBackendFile { + HostMemoryBackend parent_obj; + char *mem_path; +}; + +static void +file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(backend); + + if (!backend->size) { + error_setg(errp, "can't create backend with size 0"); + return; + } + if (!fb->mem_path) { + error_setg(errp, "mem_path property not set"); + return; + } +#ifndef CONFIG_LINUX + error_setg(errp, "-mem-path not supported on this host"); +#else + if (!memory_region_size(&backend->mr)) { + memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), + object_get_canonical_path(OBJECT(backend)), + backend->size, + fb->mem_path, errp); + } +#endif +} + +static void +file_backend_class_init(ObjectClass *oc, void *data) +{ + HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc); + + bc->alloc = file_backend_memory_alloc; +} + +static char *get_mem_path(Object *o, Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + return g_strdup(fb->mem_path); +} + +static void set_mem_path(Object *o, const char *str, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(o); + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + if (memory_region_size(&backend->mr)) { + error_setg(errp, "cannot change property value"); + return; + } + if (fb->mem_path) { + g_free(fb->mem_path); + } + fb->mem_path = g_strdup(str); +} + +static void +file_backend_instance_init(Object *o) +{ + object_property_add_str(o, "mem-path", get_mem_path, + set_mem_path, NULL); +} + +static const TypeInfo file_backend_info = { + .name = TYPE_MEMORY_BACKEND_FILE, + .parent = TYPE_MEMORY_BACKEND, + .class_init = file_backend_class_init, + .instance_init = file_backend_instance_init, + .instance_size = sizeof(HostMemoryBackendFile), +}; + +static void register_types(void) +{ + type_register_static(&file_backend_info); +} + +type_init(register_types); From 2925020d339d3b53fbb4fb82fbfb32b71f1b41ff Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 18 Jun 2014 21:48:19 +0300 Subject: [PATCH 084/109] osdep: add merge and dump flags will be used by follow up patch Signed-off-by: Michael S. Tsirkin --- include/qemu/osdep.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 9c1a119703..9438a4fb1a 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -133,6 +133,8 @@ void qemu_anon_ram_free(void *ptr, size_t size); #define QEMU_MADV_DONTNEED POSIX_MADV_DONTNEED #define QEMU_MADV_DONTFORK QEMU_MADV_INVALID #define QEMU_MADV_MERGEABLE QEMU_MADV_INVALID +#define QEMU_MADV_UNMERGEABLE QEMU_MADV_INVALID +#define QEMU_MADV_DODUMP QEMU_MADV_INVALID #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID @@ -142,6 +144,8 @@ void qemu_anon_ram_free(void *ptr, size_t size); #define QEMU_MADV_DONTNEED QEMU_MADV_INVALID #define QEMU_MADV_DONTFORK QEMU_MADV_INVALID #define QEMU_MADV_MERGEABLE QEMU_MADV_INVALID +#define QEMU_MADV_UNMERGEABLE QEMU_MADV_INVALID +#define QEMU_MADV_DODUMP QEMU_MADV_INVALID #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID From 605d0a945d020e3024aa7faf8a0e8b471371d8ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2014 19:15:22 +0800 Subject: [PATCH 085/109] hostmem: add merge and dump properties Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- backends/hostmem.c | 84 +++++++++++++++++++++++++++++++++++++++- include/qemu/osdep.h | 10 +++++ include/sysemu/hostmem.h | 1 + 3 files changed, 94 insertions(+), 1 deletion(-) diff --git a/backends/hostmem.c b/backends/hostmem.c index 1738774895..a2550fe7e3 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -53,8 +53,73 @@ out: error_propagate(errp, local_err); } +static bool host_memory_backend_get_merge(Object *obj, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + return backend->merge; +} + +static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (!memory_region_size(&backend->mr)) { + backend->merge = value; + return; + } + + if (value != backend->merge) { + void *ptr = memory_region_get_ram_ptr(&backend->mr); + uint64_t sz = memory_region_size(&backend->mr); + + qemu_madvise(ptr, sz, + value ? QEMU_MADV_MERGEABLE : QEMU_MADV_UNMERGEABLE); + backend->merge = value; + } +} + +static bool host_memory_backend_get_dump(Object *obj, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + return backend->dump; +} + +static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (!memory_region_size(&backend->mr)) { + backend->dump = value; + return; + } + + if (value != backend->dump) { + void *ptr = memory_region_get_ram_ptr(&backend->mr); + uint64_t sz = memory_region_size(&backend->mr); + + qemu_madvise(ptr, sz, + value ? QEMU_MADV_DODUMP : QEMU_MADV_DONTDUMP); + backend->dump = value; + } +} + static void host_memory_backend_init(Object *obj) { + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + backend->merge = qemu_opt_get_bool(qemu_get_machine_opts(), + "mem-merge", true); + backend->dump = qemu_opt_get_bool(qemu_get_machine_opts(), + "dump-guest-core", true); + + object_property_add_bool(obj, "merge", + host_memory_backend_get_merge, + host_memory_backend_set_merge, NULL); + object_property_add_bool(obj, "dump", + host_memory_backend_get_dump, + host_memory_backend_set_dump, NULL); object_property_add(obj, "size", "int", host_memory_backend_get_size, host_memory_backend_set_size, NULL, NULL, NULL); @@ -80,9 +145,26 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) { HostMemoryBackend *backend = MEMORY_BACKEND(uc); HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc); + Error *local_err = NULL; + void *ptr; + uint64_t sz; if (bc->alloc) { - bc->alloc(backend, errp); + bc->alloc(backend, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + ptr = memory_region_get_ram_ptr(&backend->mr); + sz = memory_region_size(&backend->mr); + + if (backend->merge) { + qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE); + } + if (!backend->dump) { + qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP); + } } } diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 9438a4fb1a..6d35c1bcba 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -116,6 +116,16 @@ void qemu_anon_ram_free(void *ptr, size_t size); #else #define QEMU_MADV_MERGEABLE QEMU_MADV_INVALID #endif +#ifdef MADV_UNMERGEABLE +#define QEMU_MADV_UNMERGEABLE MADV_UNMERGEABLE +#else +#define QEMU_MADV_UNMERGEABLE QEMU_MADV_INVALID +#endif +#ifdef MADV_DODUMP +#define QEMU_MADV_DODUMP MADV_DODUMP +#else +#define QEMU_MADV_DODUMP QEMU_MADV_INVALID +#endif #ifdef MADV_DONTDUMP #define QEMU_MADV_DONTDUMP MADV_DONTDUMP #else diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h index 923f6721d2..ede5ec90c7 100644 --- a/include/sysemu/hostmem.h +++ b/include/sysemu/hostmem.h @@ -52,6 +52,7 @@ struct HostMemoryBackend { /* protected */ uint64_t size; + bool merge, dump; MemoryRegion mr; }; From a35ba7be4b696d4c7b47318fd2022e6c3eca0a63 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2014 19:15:23 +0800 Subject: [PATCH 086/109] hostmem: allow preallocation of any memory region And allow preallocation of file-based memory even without -mem-prealloc. Some care is necessary because -mem-prealloc does not allow disabling preallocation for hostmem-file. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- backends/hostmem-file.c | 3 +++ backends/hostmem.c | 42 ++++++++++++++++++++++++++++++++++++++++ exec.c | 7 +++++++ include/exec/memory.h | 10 ++++++++++ include/exec/ram_addr.h | 1 + include/sysemu/hostmem.h | 1 + memory.c | 11 +++++++++++ 7 files changed, 75 insertions(+) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index 55d1c62ddd..92659c119d 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -9,7 +9,9 @@ * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. */ +#include "qemu-common.h" #include "sysemu/hostmem.h" +#include "sysemu/sysemu.h" #include "qom/object_interfaces.h" /* hostmem-file.c */ @@ -46,6 +48,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) error_setg(errp, "-mem-path not supported on this host"); #else if (!memory_region_size(&backend->mr)) { + backend->force_prealloc = mem_prealloc; memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), object_get_canonical_path(OBJECT(backend)), backend->size, diff --git a/backends/hostmem.c b/backends/hostmem.c index a2550fe7e3..ebef6206a8 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -105,6 +105,41 @@ static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp) } } +static bool host_memory_backend_get_prealloc(Object *obj, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + return backend->prealloc || backend->force_prealloc; +} + +static void host_memory_backend_set_prealloc(Object *obj, bool value, + Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (backend->force_prealloc) { + if (value) { + error_setg(errp, + "remove -mem-prealloc to use the prealloc property"); + return; + } + } + + if (!memory_region_size(&backend->mr)) { + backend->prealloc = value; + return; + } + + if (value && !backend->prealloc) { + int fd = memory_region_get_fd(&backend->mr); + void *ptr = memory_region_get_ram_ptr(&backend->mr); + uint64_t sz = memory_region_size(&backend->mr); + + os_mem_prealloc(fd, ptr, sz); + backend->prealloc = true; + } +} + static void host_memory_backend_init(Object *obj) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); @@ -113,6 +148,7 @@ static void host_memory_backend_init(Object *obj) "mem-merge", true); backend->dump = qemu_opt_get_bool(qemu_get_machine_opts(), "dump-guest-core", true); + backend->prealloc = mem_prealloc; object_property_add_bool(obj, "merge", host_memory_backend_get_merge, @@ -120,6 +156,9 @@ static void host_memory_backend_init(Object *obj) object_property_add_bool(obj, "dump", host_memory_backend_get_dump, host_memory_backend_set_dump, NULL); + object_property_add_bool(obj, "prealloc", + host_memory_backend_get_prealloc, + host_memory_backend_set_prealloc, NULL); object_property_add(obj, "size", "int", host_memory_backend_get_size, host_memory_backend_set_size, NULL, NULL, NULL); @@ -165,6 +204,9 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) if (!backend->dump) { qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP); } + if (backend->prealloc) { + os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz); + } } } diff --git a/exec.c b/exec.c index 997ef6a5a8..a27923a258 100644 --- a/exec.c +++ b/exec.c @@ -1448,6 +1448,13 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) } #endif /* !_WIN32 */ +int qemu_get_ram_fd(ram_addr_t addr) +{ + RAMBlock *block = qemu_get_ram_block(addr); + + return block->fd; +} + /* Return a host pointer to ram allocated with qemu_ram_alloc. With the exception of the softmmu code in this file, this should only be used for local memory (e.g. video ram) that the device owns, diff --git a/include/exec/memory.h b/include/exec/memory.h index 4c7bacf959..1cf5981b2e 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -533,6 +533,16 @@ bool memory_region_is_logging(MemoryRegion *mr); */ bool memory_region_is_rom(MemoryRegion *mr); +/** + * memory_region_get_fd: Get a file descriptor backing a RAM memory region. + * + * Returns a file descriptor backing a file-based RAM memory region, + * or -1 if the region is not a file-based RAM memory region. + * + * @mr: the RAM or alias memory region being queried. + */ +int memory_region_get_fd(MemoryRegion *mr); + /** * memory_region_get_ram_ptr: Get a pointer into a RAM memory region. * diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index deafcebac0..fcc7ef0180 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -27,6 +27,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr); ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr); +int qemu_get_ram_fd(ram_addr_t addr); void *qemu_get_ram_ptr(ram_addr_t addr); void qemu_ram_free(ram_addr_t addr); void qemu_ram_free_from_ptr(ram_addr_t addr); diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h index ede5ec90c7..4cae673c4b 100644 --- a/include/sysemu/hostmem.h +++ b/include/sysemu/hostmem.h @@ -53,6 +53,7 @@ struct HostMemoryBackend { /* protected */ uint64_t size; bool merge, dump; + bool prealloc, force_prealloc; MemoryRegion mr; }; diff --git a/memory.c b/memory.c index 5ef7167848..e7f1160f4e 100644 --- a/memory.c +++ b/memory.c @@ -1271,6 +1271,17 @@ void memory_region_reset_dirty(MemoryRegion *mr, hwaddr addr, cpu_physical_memory_reset_dirty(mr->ram_addr + addr, size, client); } +int memory_region_get_fd(MemoryRegion *mr) +{ + if (mr->alias) { + return memory_region_get_fd(mr->alias); + } + + assert(mr->terminates); + + return qemu_get_ram_fd(mr->ram_addr & TARGET_PAGE_MASK); +} + void *memory_region_get_ram_ptr(MemoryRegion *mr) { if (mr->alias) { From dbcb8981183592be129b2e624b7bcd4245e75fbc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2014 19:15:24 +0800 Subject: [PATCH 087/109] hostmem: add property to map memory with MAP_SHARED A new "share" property can be used with the "memory-file" backend to map memory with MAP_SHARED instead of MAP_PRIVATE. Signed-off-by: Paolo Bonzini Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- backends/hostmem-file.c | 26 +++++++++++++++++++++++++- exec.c | 18 ++++++++++-------- include/exec/memory.h | 2 ++ include/exec/ram_addr.h | 3 ++- memory.c | 3 ++- numa.c | 2 +- 6 files changed, 42 insertions(+), 12 deletions(-) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index 92659c119d..51799943f1 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -28,6 +28,8 @@ typedef struct HostMemoryBackendFile HostMemoryBackendFile; struct HostMemoryBackendFile { HostMemoryBackend parent_obj; + + bool share; char *mem_path; }; @@ -51,7 +53,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) backend->force_prealloc = mem_prealloc; memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), object_get_canonical_path(OBJECT(backend)), - backend->size, + backend->size, fb->share, fb->mem_path, errp); } #endif @@ -87,9 +89,31 @@ static void set_mem_path(Object *o, const char *str, Error **errp) fb->mem_path = g_strdup(str); } +static bool file_memory_backend_get_share(Object *o, Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + return fb->share; +} + +static void file_memory_backend_set_share(Object *o, bool value, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(o); + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + if (memory_region_size(&backend->mr)) { + error_setg(errp, "cannot change property value"); + return; + } + fb->share = value; +} + static void file_backend_instance_init(Object *o) { + object_property_add_bool(o, "share", + file_memory_backend_get_share, + file_memory_backend_set_share, NULL); object_property_add_str(o, "mem-path", get_mem_path, set_mem_path, NULL); } diff --git a/exec.c b/exec.c index a27923a258..1ca7baca0b 100644 --- a/exec.c +++ b/exec.c @@ -73,6 +73,9 @@ static MemoryRegion io_mem_unassigned; /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */ #define RAM_PREALLOC (1 << 0) +/* RAM is mmap-ed with MAP_SHARED */ +#define RAM_SHARED (1 << 1) + #endif struct CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus); @@ -1074,7 +1077,9 @@ static void *file_ram_alloc(RAMBlock *block, perror("ftruncate"); } - area = mmap(0, memory, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + area = mmap(0, memory, PROT_READ | PROT_WRITE, + (block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE), + fd, 0); if (area == MAP_FAILED) { error_setg_errno(errp, errno, "unable to map backing store for hugepages"); @@ -1286,7 +1291,7 @@ static ram_addr_t ram_block_add(RAMBlock *new_block) #ifdef __linux__ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, - const char *mem_path, + bool share, const char *mem_path, Error **errp) { RAMBlock *new_block; @@ -1311,6 +1316,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->length = size; + new_block->flags = share ? RAM_SHARED : 0; new_block->host = file_ram_alloc(new_block, size, mem_path, errp); if (!new_block->host) { @@ -1413,12 +1419,8 @@ void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) flags = MAP_FIXED; munmap(vaddr, length); if (block->fd >= 0) { -#ifdef MAP_POPULATE - flags |= mem_prealloc ? MAP_POPULATE | MAP_SHARED : - MAP_PRIVATE; -#else - flags |= MAP_PRIVATE; -#endif + flags |= (block->flags & RAM_SHARED ? + MAP_SHARED : MAP_PRIVATE); area = mmap(vaddr, length, PROT_READ | PROT_WRITE, flags, block->fd, offset); } else { diff --git a/include/exec/memory.h b/include/exec/memory.h index 1cf5981b2e..3d778d70f0 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -321,6 +321,7 @@ void memory_region_init_ram(MemoryRegion *mr, * @owner: the object that tracks the region's reference count * @name: the name of the region. * @size: size of the region. + * @share: %true if memory must be mmaped with the MAP_SHARED flag * @path: the path in which to allocate the RAM. * @errp: pointer to Error*, to store an error if it happens. */ @@ -328,6 +329,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, + bool share, const char *path, Error **errp); #endif diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index fcc7ef0180..55ca67681f 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -23,7 +23,8 @@ #include "hw/xen/xen.h" ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, - const char *mem_path, Error **errp); + bool share, const char *mem_path, + Error **errp); ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, MemoryRegion *mr); ram_addr_t qemu_ram_alloc(ram_addr_t size, MemoryRegion *mr); diff --git a/memory.c b/memory.c index e7f1160f4e..b91a60a921 100644 --- a/memory.c +++ b/memory.c @@ -1038,6 +1038,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, + bool share, const char *path, Error **errp) { @@ -1045,7 +1046,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, mr->ram = true; mr->terminates = true; mr->destructor = memory_region_destructor_ram; - mr->ram_addr = qemu_ram_alloc_from_file(size, mr, path, errp); + mr->ram_addr = qemu_ram_alloc_from_file(size, mr, share, path, errp); } #endif diff --git a/numa.c b/numa.c index 8af9c916ac..711f6825cb 100644 --- a/numa.c +++ b/numa.c @@ -231,7 +231,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, if (mem_path) { #ifdef __linux__ Error *err = NULL; - memory_region_init_ram_from_file(mr, owner, name, ram_size, + memory_region_init_ram_from_file(mr, owner, name, ram_size, false, mem_path, &err); /* Legacy behavior: if allocation failed, fall back to From 4cf1b76bf1e2cbb91b1123d47505a6586195800c Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Tue, 10 Jun 2014 19:15:25 +0800 Subject: [PATCH 088/109] hostmem: add properties for NUMA memory policy Signed-off-by: Hu Tao [Raise errors on setting properties if !CONFIG_NUMA. Add BUILD_BUG_ON checks. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Marcelo Tosatti Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- backends/hostmem.c | 136 ++++++++++++++++++++++++++++++++++++++- include/sysemu/hostmem.h | 4 ++ qapi-schema.json | 20 ++++++ 3 files changed, 159 insertions(+), 1 deletion(-) diff --git a/backends/hostmem.c b/backends/hostmem.c index ebef6206a8..ca10c51b51 100644 --- a/backends/hostmem.c +++ b/backends/hostmem.c @@ -10,12 +10,21 @@ * See the COPYING file in the top-level directory. */ #include "sysemu/hostmem.h" -#include "sysemu/sysemu.h" #include "qapi/visitor.h" +#include "qapi-types.h" +#include "qapi-visit.h" #include "qapi/qmp/qerror.h" #include "qemu/config-file.h" #include "qom/object_interfaces.h" +#ifdef CONFIG_NUMA +#include +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT); +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED); +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND); +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE); +#endif + static void host_memory_backend_get_size(Object *obj, Visitor *v, void *opaque, const char *name, Error **errp) @@ -53,6 +62,84 @@ out: error_propagate(errp, local_err); } +static void +host_memory_backend_get_host_nodes(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + uint16List *host_nodes = NULL; + uint16List **node = &host_nodes; + unsigned long value; + + value = find_first_bit(backend->host_nodes, MAX_NODES); + if (value == MAX_NODES) { + return; + } + + *node = g_malloc0(sizeof(**node)); + (*node)->value = value; + node = &(*node)->next; + + do { + value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1); + if (value == MAX_NODES) { + break; + } + + *node = g_malloc0(sizeof(**node)); + (*node)->value = value; + node = &(*node)->next; + } while (true); + + visit_type_uint16List(v, &host_nodes, name, errp); +} + +static void +host_memory_backend_set_host_nodes(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ +#ifdef CONFIG_NUMA + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + uint16List *l = NULL; + + visit_type_uint16List(v, &l, name, errp); + + while (l) { + bitmap_set(backend->host_nodes, l->value, 1); + l = l->next; + } +#else + error_setg(errp, "NUMA node binding are not supported by this QEMU"); +#endif +} + +static void +host_memory_backend_get_policy(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + int policy = backend->policy; + + visit_type_enum(v, &policy, HostMemPolicy_lookup, NULL, name, errp); +} + +static void +host_memory_backend_set_policy(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + int policy; + + visit_type_enum(v, &policy, HostMemPolicy_lookup, NULL, name, errp); + backend->policy = policy; + +#ifndef CONFIG_NUMA + if (policy != HOST_MEM_POLICY_DEFAULT) { + error_setg(errp, "NUMA policies are not supported by this QEMU"); + } +#endif +} + static bool host_memory_backend_get_merge(Object *obj, Error **errp) { HostMemoryBackend *backend = MEMORY_BACKEND(obj); @@ -162,6 +249,12 @@ static void host_memory_backend_init(Object *obj) object_property_add(obj, "size", "int", host_memory_backend_get_size, host_memory_backend_set_size, NULL, NULL, NULL); + object_property_add(obj, "host-nodes", "int", + host_memory_backend_get_host_nodes, + host_memory_backend_set_host_nodes, NULL, NULL, NULL); + object_property_add(obj, "policy", "str", + host_memory_backend_get_policy, + host_memory_backend_set_policy, NULL, NULL, NULL); } static void host_memory_backend_finalize(Object *obj) @@ -204,6 +297,47 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp) if (!backend->dump) { qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP); } +#ifdef CONFIG_NUMA + unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES); + /* lastbit == MAX_NODES means maxnode = 0 */ + unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1); + /* ensure policy won't be ignored in case memory is preallocated + * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so + * this doesn't catch hugepage case. */ + unsigned flags = MPOL_MF_STRICT; + + /* check for invalid host-nodes and policies and give more verbose + * error messages than mbind(). */ + if (maxnode && backend->policy == MPOL_DEFAULT) { + error_setg(errp, "host-nodes must be empty for policy default," + " or you should explicitly specify a policy other" + " than default"); + return; + } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) { + error_setg(errp, "host-nodes must be set for policy %s", + HostMemPolicy_lookup[backend->policy]); + return; + } + + /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1 + * as argument to mbind() due to an old Linux bug (feature?) which + * cuts off the last specified node. This means backend->host_nodes + * must have MAX_NODES+1 bits available. + */ + assert(sizeof(backend->host_nodes) >= + BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long)); + assert(maxnode <= MAX_NODES); + if (mbind(ptr, sz, backend->policy, + maxnode ? backend->host_nodes : NULL, maxnode + 1, flags)) { + error_setg_errno(errp, errno, + "cannot bind memory to host NUMA nodes"); + return; + } +#endif + /* Preallocate memory after the NUMA policy has been instantiated. + * This is necessary to guarantee memory is allocated with + * specified NUMA policy in place. + */ if (backend->prealloc) { os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz); } diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h index 4cae673c4b..1ce439415d 100644 --- a/include/sysemu/hostmem.h +++ b/include/sysemu/hostmem.h @@ -12,10 +12,12 @@ #ifndef QEMU_RAM_H #define QEMU_RAM_H +#include "sysemu/sysemu.h" /* for MAX_NODES */ #include "qom/object.h" #include "qapi/error.h" #include "exec/memory.h" #include "qemu/option.h" +#include "qemu/bitmap.h" #define TYPE_MEMORY_BACKEND "memory-backend" #define MEMORY_BACKEND(obj) \ @@ -54,6 +56,8 @@ struct HostMemoryBackend { uint64_t size; bool merge, dump; bool prealloc, force_prealloc; + DECLARE_BITMAP(host_nodes, MAX_NODES + 1); + HostMemPolicy policy; MemoryRegion mr; }; diff --git a/qapi-schema.json b/qapi-schema.json index e05f8ba4f9..10a324a79a 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3134,3 +3134,23 @@ '*cpus': ['uint16'], '*mem': 'size', '*memdev': 'str' }} + +## +# @HostMemPolicy +# +# Host memory policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred host nodes for allocation +# +# @bind: a strict policy that restricts memory allocation to the +# host nodes specified +# +# @interleave: memory allocations are interleaved across the set +# of host nodes specified +# +# Since 2.1 +## +{ 'enum': 'HostMemPolicy', + 'data': [ 'default', 'preferred', 'bind', 'interleave' ] } From 76b5d8507d08db18bc0bd3c75349b357a82ff1a2 Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Mon, 16 Jun 2014 18:05:41 +0800 Subject: [PATCH 089/109] qmp: add query-memdev Add qmp command query-memdev to query for information of memory devices Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- numa.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ qapi-schema.json | 40 +++++++++++++++++++++++ qmp-commands.hx | 38 ++++++++++++++++++++++ 3 files changed, 162 insertions(+) diff --git a/numa.c b/numa.c index 711f6825cb..eef0717785 100644 --- a/numa.c +++ b/numa.c @@ -34,6 +34,7 @@ #include "qapi/qmp/qerror.h" #include "hw/boards.h" #include "sysemu/hostmem.h" +#include "qmp-commands.h" QemuOptsList qemu_numa_opts = { .name = "numa", @@ -283,3 +284,86 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner, addr += size; } } + +static int query_memdev(Object *obj, void *opaque) +{ + MemdevList **list = opaque; + Error *err = NULL; + + if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) { + MemdevList *m = g_malloc0(sizeof(*m)); + + m->value = g_malloc0(sizeof(*m->value)); + + m->value->size = object_property_get_int(obj, "size", + &err); + if (err) { + goto error; + } + + m->value->merge = object_property_get_bool(obj, "merge", + &err); + if (err) { + goto error; + } + + m->value->dump = object_property_get_bool(obj, "dump", + &err); + if (err) { + goto error; + } + + m->value->prealloc = object_property_get_bool(obj, + "prealloc", &err); + if (err) { + goto error; + } + + m->value->policy = object_property_get_enum(obj, + "policy", + HostMemPolicy_lookup, + &err); + if (err) { + goto error; + } + + object_property_get_uint16List(obj, "host-nodes", + &m->value->host_nodes, &err); + if (err) { + goto error; + } + + m->next = *list; + *list = m; + } + + return 0; +error: + return -1; +} + +MemdevList *qmp_query_memdev(Error **errp) +{ + Object *obj; + MemdevList *list = NULL, *m; + + obj = object_resolve_path("/objects", NULL); + if (obj == NULL) { + return NULL; + } + + if (object_child_foreach(obj, query_memdev, &list) != 0) { + goto error; + } + + return list; + +error: + while (list) { + m = list; + list = list->next; + g_free(m->value); + g_free(m); + } + return NULL; +} diff --git a/qapi-schema.json b/qapi-schema.json index 10a324a79a..082bc3ab4e 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3154,3 +3154,43 @@ ## { 'enum': 'HostMemPolicy', 'data': [ 'default', 'preferred', 'bind', 'interleave' ] } + +## +# @Memdev: +# +# Information of memory device +# +# @size: memory device size +# +# @merge: enables or disables memory merge support +# +# @dump: includes memory device's memory in a core dump or not +# +# @prealloc: enables or disables memory preallocation +# +# @host-nodes: host nodes for its memory policy +# +# @policy: memory policy of memory device +# +# Since: 2.1 +## + +{ 'type': 'Memdev', + 'data': { + 'size': 'size', + 'merge': 'bool', + 'dump': 'bool', + 'prealloc': 'bool', + 'host-nodes': ['uint16'], + 'policy': 'HostMemPolicy' }} + +## +# @query-memdev: +# +# Returns information for all memory devices. +# +# Returns: a list of @Memdev. +# +# Since: 2.1 +## +{ 'command': 'query-memdev', 'returns': ['Memdev'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index d6bb0f483f..d99f23591a 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3573,4 +3573,42 @@ Example: } } } ] } +EQMP + + { + .name = "query-memdev", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_input_query_memdev, + }, + +SQMP +query-memdev +------------ + +Show memory devices information. + + +Example (1): + +-> { "execute": "query-memdev" } +<- { "return": [ + { + "size": 536870912, + "merge": false, + "dump": true, + "prealloc": false, + "host-nodes": [0, 1], + "policy": "bind" + }, + { + "size": 536870912, + "merge": false, + "dump": true, + "prealloc": true, + "host-nodes": [2, 3], + "policy": "preferred" + } + ] + } + EQMP From eb1539b2343f2040decab39ccb9dc50dda75001c Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Wed, 14 May 2014 17:43:35 +0800 Subject: [PATCH 090/109] hmp: add info memdev This is the hmp counterpart of qmp query-memdev. Signed-off-by: Hu Tao Signed-off-by: Michael S. Tsirkin Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: fix build on 32 bit --- hmp.c | 36 ++++++++++++++++++++++++++++++++++++ hmp.h | 1 + monitor.c | 7 +++++++ 3 files changed, 44 insertions(+) diff --git a/hmp.c b/hmp.c index ccc35d41a1..41006f5eef 100644 --- a/hmp.c +++ b/hmp.c @@ -22,6 +22,8 @@ #include "qemu/sockets.h" #include "monitor/monitor.h" #include "qapi/opts-visitor.h" +#include "qapi/string-output-visitor.h" +#include "qapi-visit.h" #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" @@ -1676,3 +1678,37 @@ void hmp_object_del(Monitor *mon, const QDict *qdict) qmp_object_del(id, &err); hmp_handle_error(mon, &err); } + +void hmp_info_memdev(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + MemdevList *memdev_list = qmp_query_memdev(&err); + MemdevList *m = memdev_list; + StringOutputVisitor *ov; + int i = 0; + + + while (m) { + ov = string_output_visitor_new(false); + visit_type_uint16List(string_output_get_visitor(ov), + &m->value->host_nodes, NULL, NULL); + monitor_printf(mon, "memory device %d\n", i); + monitor_printf(mon, " size: %" PRId64 "\n", m->value->size); + monitor_printf(mon, " merge: %s\n", + m->value->merge ? "true" : "false"); + monitor_printf(mon, " dump: %s\n", + m->value->dump ? "true" : "false"); + monitor_printf(mon, " prealloc: %s\n", + m->value->prealloc ? "true" : "false"); + monitor_printf(mon, " policy: %s\n", + HostMemPolicy_lookup[m->value->policy]); + monitor_printf(mon, " host nodes: %s\n", + string_output_get_string(ov)); + + string_output_visitor_cleanup(ov); + m = m->next; + i++; + } + + monitor_printf(mon, "\n"); +} diff --git a/hmp.h b/hmp.h index 2d9b0a2b0b..4fd3c4a901 100644 --- a/hmp.h +++ b/hmp.h @@ -93,6 +93,7 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict); void hmp_cpu_add(Monitor *mon, const QDict *qdict); void hmp_object_add(Monitor *mon, const QDict *qdict); void hmp_object_del(Monitor *mon, const QDict *qdict); +void hmp_info_memdev(Monitor *mon, const QDict *qdict); void object_add_completion(ReadLineState *rs, int nb_args, const char *str); void object_del_completion(ReadLineState *rs, int nb_args, const char *str); void device_add_completion(ReadLineState *rs, int nb_args, const char *str); diff --git a/monitor.c b/monitor.c index ca17e420a9..afbb2d7379 100644 --- a/monitor.c +++ b/monitor.c @@ -2963,6 +2963,13 @@ static mon_cmd_t info_cmds[] = { .help = "show the TPM device", .mhandler.cmd = hmp_info_tpm, }, + { + .name = "memdev", + .args_type = "", + .params = "", + .help = "show the memory device", + .mhandler.cmd = hmp_info_memdev, + }, { .name = NULL, }, From cac124d17c5c3c75c588e12a8d7b44c9b057cb9a Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Tue, 10 Jun 2014 19:15:26 +0800 Subject: [PATCH 091/109] tests: fix memory leak in test of string input visitor Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/test-string-input-visitor.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test-string-input-visitor.c b/tests/test-string-input-visitor.c index 877e737714..01a78f04ab 100644 --- a/tests/test-string-input-visitor.c +++ b/tests/test-string-input-visitor.c @@ -193,6 +193,7 @@ static void test_visitor_in_fuzz(TestInputVisitorData *data, v = visitor_input_test_init(data, buf); visit_type_int(v, &ires, NULL, NULL); + visitor_input_teardown(data, NULL); v = visitor_input_test_init(data, buf); visit_type_bool(v, &bres, NULL, NULL); @@ -200,11 +201,13 @@ static void test_visitor_in_fuzz(TestInputVisitorData *data, v = visitor_input_test_init(data, buf); visit_type_number(v, &nres, NULL, NULL); + visitor_input_teardown(data, NULL); v = visitor_input_test_init(data, buf); sres = NULL; visit_type_str(v, &sres, NULL, NULL); g_free(sres); + visitor_input_teardown(data, NULL); v = visitor_input_test_init(data, buf); visit_type_EnumOne(v, &eres, NULL, NULL); From 659268ffbff0ffc823eb30dbe0ce83bd5703933e Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Tue, 10 Jun 2014 19:15:27 +0800 Subject: [PATCH 092/109] qapi: make string input visitor parse int list Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Tested-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: split up patch --- qapi/string-input-visitor.c | 195 ++++++++++++++++++++++++++++-- tests/test-string-input-visitor.c | 36 ++++++ 2 files changed, 223 insertions(+), 8 deletions(-) diff --git a/qapi/string-input-visitor.c b/qapi/string-input-visitor.c index 5780944792..0b2490b3da 100644 --- a/qapi/string-input-visitor.c +++ b/qapi/string-input-visitor.c @@ -15,31 +15,205 @@ #include "qapi/visitor-impl.h" #include "qapi/qmp/qerror.h" #include "qemu/option.h" +#include "qemu/queue.h" +#include "qemu/range.h" + struct StringInputVisitor { Visitor visitor; + + bool head; + + GList *ranges; + GList *cur_range; + int64_t cur; + const char *string; }; +static void parse_str(StringInputVisitor *siv, Error **errp) +{ + char *str = (char *) siv->string; + long long start, end; + Range *cur; + char *endptr; + + if (siv->ranges) { + return; + } + + errno = 0; + do { + start = strtoll(str, &endptr, 0); + if (errno == 0 && endptr > str && INT64_MIN <= start && + start <= INT64_MAX) { + if (*endptr == '\0') { + cur = g_malloc0(sizeof(*cur)); + cur->begin = start; + cur->end = start + 1; + siv->ranges = g_list_insert_sorted_merged(siv->ranges, cur, + range_compare); + cur = NULL; + str = NULL; + } else if (*endptr == '-') { + str = endptr + 1; + end = strtoll(str, &endptr, 0); + if (errno == 0 && endptr > str && + INT64_MIN <= end && end <= INT64_MAX && start <= end && + (start > INT64_MAX - 65536 || + end < start + 65536)) { + if (*endptr == '\0') { + cur = g_malloc0(sizeof(*cur)); + cur->begin = start; + cur->end = end + 1; + siv->ranges = + g_list_insert_sorted_merged(siv->ranges, + cur, + range_compare); + cur = NULL; + str = NULL; + } else if (*endptr == ',') { + str = endptr + 1; + cur = g_malloc0(sizeof(*cur)); + cur->begin = start; + cur->end = end + 1; + siv->ranges = + g_list_insert_sorted_merged(siv->ranges, + cur, + range_compare); + cur = NULL; + } else { + goto error; + } + } else { + goto error; + } + } else if (*endptr == ',') { + str = endptr + 1; + cur = g_malloc0(sizeof(*cur)); + cur->begin = start; + cur->end = start + 1; + siv->ranges = g_list_insert_sorted_merged(siv->ranges, + cur, + range_compare); + cur = NULL; + } else { + goto error; + } + } else { + goto error; + } + } while (str); + + return; +error: + g_list_free_full(siv->ranges, g_free); + assert(siv->ranges == NULL); +} + +static void +start_list(Visitor *v, const char *name, Error **errp) +{ + StringInputVisitor *siv = DO_UPCAST(StringInputVisitor, visitor, v); + + parse_str(siv, errp); + + siv->cur_range = g_list_first(siv->ranges); + if (siv->cur_range) { + Range *r = siv->cur_range->data; + if (r) { + siv->cur = r->begin; + } + } +} + +static GenericList * +next_list(Visitor *v, GenericList **list, Error **errp) +{ + StringInputVisitor *siv = DO_UPCAST(StringInputVisitor, visitor, v); + GenericList **link; + Range *r; + + if (!siv->ranges || !siv->cur_range) { + return NULL; + } + + r = siv->cur_range->data; + if (!r) { + return NULL; + } + + if (siv->cur < r->begin || siv->cur >= r->end) { + siv->cur_range = g_list_next(siv->cur_range); + if (!siv->cur_range) { + return NULL; + } + r = siv->cur_range->data; + if (!r) { + return NULL; + } + siv->cur = r->begin; + } + + if (siv->head) { + link = list; + siv->head = false; + } else { + link = &(*list)->next; + } + + *link = g_malloc0(sizeof **link); + return *link; +} + +static void +end_list(Visitor *v, Error **errp) +{ + StringInputVisitor *siv = DO_UPCAST(StringInputVisitor, visitor, v); + siv->head = true; +} + static void parse_type_int(Visitor *v, int64_t *obj, const char *name, Error **errp) { StringInputVisitor *siv = DO_UPCAST(StringInputVisitor, visitor, v); - char *endp = (char *) siv->string; - long long val; - errno = 0; - if (siv->string) { - val = strtoll(siv->string, &endp, 0); - } - if (!siv->string || errno || endp == siv->string || *endp) { + if (!siv->string) { error_set(errp, QERR_INVALID_PARAMETER_TYPE, name ? name : "null", "integer"); return; } - *obj = val; + parse_str(siv, errp); + + if (!siv->ranges) { + goto error; + } + + if (!siv->cur_range) { + Range *r; + + siv->cur_range = g_list_first(siv->ranges); + if (!siv->cur_range) { + goto error; + } + + r = siv->cur_range->data; + if (!r) { + goto error; + } + + siv->cur = r->begin; + } + + *obj = siv->cur; + siv->cur++; + return; + +error: + error_set(errp, QERR_INVALID_PARAMETER_VALUE, name, + "an int64 value or range"); } static void parse_type_size(Visitor *v, uint64_t *obj, const char *name, @@ -140,6 +314,7 @@ Visitor *string_input_get_visitor(StringInputVisitor *v) void string_input_visitor_cleanup(StringInputVisitor *v) { + g_list_free_full(v->ranges, g_free); g_free(v); } @@ -155,8 +330,12 @@ StringInputVisitor *string_input_visitor_new(const char *str) v->visitor.type_bool = parse_type_bool; v->visitor.type_str = parse_type_str; v->visitor.type_number = parse_type_number; + v->visitor.start_list = start_list; + v->visitor.next_list = next_list; + v->visitor.end_list = end_list; v->visitor.optional = parse_optional; v->string = str; + v->head = true; return v; } diff --git a/tests/test-string-input-visitor.c b/tests/test-string-input-visitor.c index 01a78f04ab..b01e2f2b79 100644 --- a/tests/test-string-input-visitor.c +++ b/tests/test-string-input-visitor.c @@ -64,6 +64,35 @@ static void test_visitor_in_int(TestInputVisitorData *data, g_assert_cmpint(res, ==, value); } +static void test_visitor_in_intList(TestInputVisitorData *data, + const void *unused) +{ + int64_t value[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20}; + int16List *res = NULL, *tmp; + Error *errp = NULL; + Visitor *v; + int i = 0; + + v = visitor_input_test_init(data, "1,2,0,2-4,20,5-9,1-8"); + + visit_type_int16List(v, &res, NULL, &errp); + g_assert(errp == NULL); + tmp = res; + while (i < sizeof(value) / sizeof(value[0])) { + g_assert(tmp); + g_assert_cmpint(tmp->value, ==, value[i++]); + tmp = tmp->next; + } + g_assert(!tmp); + + tmp = res; + while (tmp) { + res = res->next; + g_free(tmp); + tmp = res; + } +} + static void test_visitor_in_bool(TestInputVisitorData *data, const void *unused) { @@ -170,6 +199,7 @@ static void test_visitor_in_fuzz(TestInputVisitorData *data, const void *unused) { int64_t ires; + intList *ilres; bool bres; double nres; char *sres; @@ -195,6 +225,10 @@ static void test_visitor_in_fuzz(TestInputVisitorData *data, visit_type_int(v, &ires, NULL, NULL); visitor_input_teardown(data, NULL); + v = visitor_input_test_init(data, buf); + visit_type_intList(v, &ilres, NULL, NULL); + visitor_input_teardown(data, NULL); + v = visitor_input_test_init(data, buf); visit_type_bool(v, &bres, NULL, NULL); visitor_input_teardown(data, NULL); @@ -231,6 +265,8 @@ int main(int argc, char **argv) input_visitor_test_add("/string-visitor/input/int", &in_visitor_data, test_visitor_in_int); + input_visitor_test_add("/string-visitor/input/intList", + &in_visitor_data, test_visitor_in_intList); input_visitor_test_add("/string-visitor/input/bool", &in_visitor_data, test_visitor_in_bool); input_visitor_test_add("/string-visitor/input/number", From 69e255635d0126dcce60b17454a7ec23e7afc174 Mon Sep 17 00:00:00 2001 From: Hu Tao Date: Tue, 10 Jun 2014 19:15:28 +0800 Subject: [PATCH 093/109] qapi: make string output visitor parse int list Signed-off-by: Hu Tao Acked-by: Michael S. Tsirkin Tested-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin MST: split up patch --- qapi/string-output-visitor.c | 229 +++++++++++++++++++++++++++-- tests/test-string-output-visitor.c | 38 ++++- 2 files changed, 255 insertions(+), 12 deletions(-) diff --git a/qapi/string-output-visitor.c b/qapi/string-output-visitor.c index fb1d2e806d..1c0834abad 100644 --- a/qapi/string-output-visitor.c +++ b/qapi/string-output-visitor.c @@ -16,32 +16,181 @@ #include "qapi/qmp/qerror.h" #include "qemu/host-utils.h" #include +#include "qemu/range.h" + +enum ListMode { + LM_NONE, /* not traversing a list of repeated options */ + LM_STARTED, /* start_list() succeeded */ + + LM_IN_PROGRESS, /* next_list() has been called. + * + * Generating the next list link will consume the most + * recently parsed QemuOpt instance of the repeated + * option. + * + * Parsing a value into the list link will examine the + * next QemuOpt instance of the repeated option, and + * possibly enter LM_SIGNED_INTERVAL or + * LM_UNSIGNED_INTERVAL. + */ + + LM_SIGNED_INTERVAL, /* next_list() has been called. + * + * Generating the next list link will consume the most + * recently stored element from the signed interval, + * parsed from the most recent QemuOpt instance of the + * repeated option. This may consume QemuOpt itself + * and return to LM_IN_PROGRESS. + * + * Parsing a value into the list link will store the + * next element of the signed interval. + */ + + LM_UNSIGNED_INTERVAL,/* Same as above, only for an unsigned interval. */ + + LM_END +}; + +typedef enum ListMode ListMode; struct StringOutputVisitor { Visitor visitor; bool human; - char *string; + GString *string; + bool head; + ListMode list_mode; + union { + int64_t s; + uint64_t u; + } range_start, range_end; + GList *ranges; }; static void string_output_set(StringOutputVisitor *sov, char *string) { - g_free(sov->string); - sov->string = string; + if (sov->string) { + g_string_free(sov->string, true); + } + sov->string = g_string_new(string); + g_free(string); +} + +static void string_output_append(StringOutputVisitor *sov, int64_t a) +{ + Range *r = g_malloc0(sizeof(*r)); + r->begin = a; + r->end = a + 1; + sov->ranges = g_list_insert_sorted_merged(sov->ranges, r, range_compare); +} + +static void string_output_append_range(StringOutputVisitor *sov, + int64_t s, int64_t e) +{ + Range *r = g_malloc0(sizeof(*r)); + r->begin = s; + r->end = e + 1; + sov->ranges = g_list_insert_sorted_merged(sov->ranges, r, range_compare); +} + +static void format_string(StringOutputVisitor *sov, Range *r, bool next, + bool human) +{ + if (r->end - r->begin > 1) { + if (human) { + g_string_append_printf(sov->string, "%" PRIx64 "-%" PRIx64, + r->begin, r->end - 1); + + } else { + g_string_append_printf(sov->string, "%" PRId64 "-%" PRId64, + r->begin, r->end - 1); + } + } else { + if (human) { + g_string_append_printf(sov->string, "%" PRIx64, r->begin); + } else { + g_string_append_printf(sov->string, "%" PRId64, r->begin); + } + } + if (next) { + g_string_append(sov->string, ","); + } } static void print_type_int(Visitor *v, int64_t *obj, const char *name, Error **errp) { StringOutputVisitor *sov = DO_UPCAST(StringOutputVisitor, visitor, v); - char *out; + GList *l; + + switch (sov->list_mode) { + case LM_NONE: + string_output_append(sov, *obj); + break; + + case LM_STARTED: + sov->range_start.s = *obj; + sov->range_end.s = *obj; + sov->list_mode = LM_IN_PROGRESS; + return; + + case LM_IN_PROGRESS: + if (sov->range_end.s + 1 == *obj) { + sov->range_end.s++; + } else { + if (sov->range_start.s == sov->range_end.s) { + string_output_append(sov, sov->range_end.s); + } else { + assert(sov->range_start.s < sov->range_end.s); + string_output_append_range(sov, sov->range_start.s, + sov->range_end.s); + } + + sov->range_start.s = *obj; + sov->range_end.s = *obj; + } + return; + + case LM_END: + if (sov->range_end.s + 1 == *obj) { + sov->range_end.s++; + assert(sov->range_start.s < sov->range_end.s); + string_output_append_range(sov, sov->range_start.s, + sov->range_end.s); + } else { + if (sov->range_start.s == sov->range_end.s) { + string_output_append(sov, sov->range_end.s); + } else { + assert(sov->range_start.s < sov->range_end.s); + + string_output_append_range(sov, sov->range_start.s, + sov->range_end.s); + } + string_output_append(sov, *obj); + } + break; + + default: + abort(); + } + + l = sov->ranges; + while (l) { + Range *r = l->data; + format_string(sov, r, l->next != NULL, false); + l = l->next; + } if (sov->human) { - out = g_strdup_printf("%lld (%#llx)", (long long) *obj, (long long) *obj); - } else { - out = g_strdup_printf("%lld", (long long) *obj); + l = sov->ranges; + g_string_append(sov->string, " ("); + while (l) { + Range *r = l->data; + format_string(sov, r, l->next != NULL, false); + l = l->next; + } + g_string_append(sov->string, ")"); } - string_output_set(sov, out); } static void print_type_size(Visitor *v, uint64_t *obj, const char *name, @@ -103,9 +252,61 @@ static void print_type_number(Visitor *v, double *obj, const char *name, string_output_set(sov, g_strdup_printf("%f", *obj)); } +static void +start_list(Visitor *v, const char *name, Error **errp) +{ + StringOutputVisitor *sov = DO_UPCAST(StringOutputVisitor, visitor, v); + + /* we can't traverse a list in a list */ + assert(sov->list_mode == LM_NONE); + sov->list_mode = LM_STARTED; + sov->head = true; +} + +static GenericList * +next_list(Visitor *v, GenericList **list, Error **errp) +{ + StringOutputVisitor *sov = DO_UPCAST(StringOutputVisitor, visitor, v); + GenericList *ret = NULL; + if (*list) { + if (sov->head) { + ret = *list; + } else { + ret = (*list)->next; + } + + if (sov->head) { + if (ret && ret->next == NULL) { + sov->list_mode = LM_NONE; + } + sov->head = false; + } else { + if (ret && ret->next == NULL) { + sov->list_mode = LM_END; + } + } + } + + return ret; +} + +static void +end_list(Visitor *v, Error **errp) +{ + StringOutputVisitor *sov = DO_UPCAST(StringOutputVisitor, visitor, v); + + assert(sov->list_mode == LM_STARTED || + sov->list_mode == LM_END || + sov->list_mode == LM_NONE || + sov->list_mode == LM_IN_PROGRESS); + sov->list_mode = LM_NONE; + sov->head = true; + +} + char *string_output_get_string(StringOutputVisitor *sov) { - char *string = sov->string; + char *string = g_string_free(sov->string, false); sov->string = NULL; return string; } @@ -117,7 +318,11 @@ Visitor *string_output_get_visitor(StringOutputVisitor *sov) void string_output_visitor_cleanup(StringOutputVisitor *sov) { - g_free(sov->string); + if (sov->string) { + g_string_free(sov->string, true); + } + + g_list_free_full(sov->ranges, g_free); g_free(sov); } @@ -127,6 +332,7 @@ StringOutputVisitor *string_output_visitor_new(bool human) v = g_malloc0(sizeof(*v)); + v->string = g_string_new(NULL); v->human = human; v->visitor.type_enum = output_type_enum; v->visitor.type_int = print_type_int; @@ -134,6 +340,9 @@ StringOutputVisitor *string_output_visitor_new(bool human) v->visitor.type_bool = print_type_bool; v->visitor.type_str = print_type_str; v->visitor.type_number = print_type_number; + v->visitor.start_list = start_list; + v->visitor.next_list = next_list; + v->visitor.end_list = end_list; return v; } diff --git a/tests/test-string-output-visitor.c b/tests/test-string-output-visitor.c index 2af5a21ab5..28e7359a2a 100644 --- a/tests/test-string-output-visitor.c +++ b/tests/test-string-output-visitor.c @@ -44,7 +44,7 @@ static void visitor_output_teardown(TestOutputVisitorData *data, static void test_visitor_out_int(TestOutputVisitorData *data, const void *unused) { - int64_t value = -42; + int64_t value = 42; Error *err = NULL; char *str; @@ -53,10 +53,42 @@ static void test_visitor_out_int(TestOutputVisitorData *data, str = string_output_get_string(data->sov); g_assert(str != NULL); - g_assert_cmpstr(str, ==, "-42"); + g_assert_cmpstr(str, ==, "42"); g_free(str); } +static void test_visitor_out_intList(TestOutputVisitorData *data, + const void *unused) +{ + int64_t value[] = {0, 1, 9, 10, 16, 15, 14, + 3, 4, 5, 6, 11, 12, 13, 21, 22, INT64_MAX - 1, INT64_MAX}; + intList *list = NULL, **tmp = &list; + int i; + Error *errp = NULL; + char *str; + + for (i = 0; i < sizeof(value) / sizeof(value[0]); i++) { + *tmp = g_malloc0(sizeof(**tmp)); + (*tmp)->value = value[i]; + tmp = &(*tmp)->next; + } + + visit_type_intList(data->ov, &list, NULL, &errp); + g_assert(errp == NULL); + + str = string_output_get_string(data->sov); + g_assert(str != NULL); + g_assert_cmpstr(str, ==, + "0-1,3-6,9-16,21-22,9223372036854775806-9223372036854775807"); + g_free(str); + while (list) { + intList *tmp2; + tmp2 = list->next; + g_free(list); + list = tmp2; + } +} + static void test_visitor_out_bool(TestOutputVisitorData *data, const void *unused) { @@ -182,6 +214,8 @@ int main(int argc, char **argv) &out_visitor_data, test_visitor_out_enum); output_visitor_test_add("/string-visitor/output/enum-errors", &out_visitor_data, test_visitor_out_enum_errors); + output_visitor_test_add("/string-visitor/output/intList", + &out_visitor_data, test_visitor_out_intList); g_test_run(); From 0d156683f667ef20fa20abc21e85ee776e6c1472 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Jun 2014 18:07:03 +0300 Subject: [PATCH 094/109] qapi: fix build on glib < 2.28 The following commits: qapi: make string output visitor parse int list qapi: make string input visitor parse int list break with glib < 2.28 since they use the new g_list_free_full function. Open-code that to fix build on old systems. Cc: Hu Tao Signed-off-by: Michael S. Tsirkin --- qapi/string-input-visitor.c | 13 ++++++++++--- qapi/string-output-visitor.c | 8 +++++++- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/qapi/string-input-visitor.c b/qapi/string-input-visitor.c index 0b2490b3da..72722e6571 100644 --- a/qapi/string-input-visitor.c +++ b/qapi/string-input-visitor.c @@ -32,6 +32,11 @@ struct StringInputVisitor const char *string; }; +static void free_range(void *range, void *dummy) +{ + g_free(range); +} + static void parse_str(StringInputVisitor *siv, Error **errp) { char *str = (char *) siv->string; @@ -108,8 +113,9 @@ static void parse_str(StringInputVisitor *siv, Error **errp) return; error: - g_list_free_full(siv->ranges, g_free); - assert(siv->ranges == NULL); + g_list_foreach(siv->ranges, free_range, NULL); + g_list_free(siv->ranges); + siv->ranges = NULL; } static void @@ -314,7 +320,8 @@ Visitor *string_input_get_visitor(StringInputVisitor *v) void string_input_visitor_cleanup(StringInputVisitor *v) { - g_list_free_full(v->ranges, g_free); + g_list_foreach(v->ranges, free_range, NULL); + g_list_free(v->ranges); g_free(v); } diff --git a/qapi/string-output-visitor.c b/qapi/string-output-visitor.c index 1c0834abad..8735b00448 100644 --- a/qapi/string-output-visitor.c +++ b/qapi/string-output-visitor.c @@ -316,13 +316,19 @@ Visitor *string_output_get_visitor(StringOutputVisitor *sov) return &sov->visitor; } +static void free_range(void *range, void *dummy) +{ + g_free(range); +} + void string_output_visitor_cleanup(StringOutputVisitor *sov) { if (sov->string) { g_string_free(sov->string, true); } - g_list_free_full(sov->ranges, g_free); + g_list_foreach(sov->ranges, free_range, NULL); + g_list_free(sov->ranges); g_free(sov); } From b7b34d055d82abaa511b35c9fc24efbb63dca0b1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 11 Jun 2014 14:52:08 +0200 Subject: [PATCH 095/109] qdev: reorganize error reporting in bus_set_realized MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No semantic change. Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Tested-by: Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Andreas Färber --- hw/core/qdev.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 3226a71d30..65aa041285 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -573,27 +573,19 @@ static void bus_set_realized(Object *obj, bool value, Error **errp) if (value && !bus->realized) { if (bc->realize) { bc->realize(bus, &local_err); - - if (local_err != NULL) { - goto error; - } - } } else if (!value && bus->realized) { if (bc->unrealize) { bc->unrealize(bus, &local_err); - - if (local_err != NULL) { - goto error; - } } } - bus->realized = value; - return; + if (local_err != NULL) { + error_propagate(errp, local_err); + return; + } -error: - error_propagate(errp, local_err); + bus->realized = value; } void qbus_create_inplace(void *bus, size_t size, const char *typename, From 5942a19040fed313b316ab7b6e3d2d8e7b1625bb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 11 Jun 2014 14:52:09 +0200 Subject: [PATCH 096/109] qdev: recursively unrealize devices when unrealizing bus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the patch was posted that became 5c21ce7 (qdev: Realize buses on device realization, 2014-03-12), it included recursive realization and unrealization of devices when the bus's "realized" property was toggled. However, due to the same old worries about recursive realization and prerequisites not being realized yet, those hunks were dropped when committing the patch. Unfortunately, this causes a use-after-free bug (easily reproduced by a PCI hot-unplug action). Before the patch, device_unparent behaved as follows: for each child bus unparent bus ----------------------------. | for each child device | | unparent device ---------------. | | | unrealize device | | | | call dc->unparent | | | '------------------------------- | '----------------------------------------' unrealize device After the patch, it behaves as follows instead: unrealize device --------------------. | for each child bus | | unrealize bus (A) | '------------------------------------' for each child bus unparent bus ----------------------. | for each child device | | unrealize device (B) | | call dc->unparent | '----------------------------------' At the step marked (B) the device might use data from the bus that is not available anymore due to step (A). To fix this, we need to unrealize devices before step (A). To sidestep concerns about recursive realization, only do recursive unrealization and leave the "value && !bus->realized" case as it is. The resulting flow is: for each child bus unrealize bus ---------------------. | for each child device | | unrealize device (B) | | call bc->unrealize (A) | '----------------------------------' unrealize device for each child bus unparent bus ----------------------. | for each child device | | unparent device | '----------------------------------' where everything is "powered down" before it is unassembled. Cc: qemu-stable@nongnu.org Signed-off-by: Paolo Bonzini Tested-by: Michael S. Tsirkin Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin Reviewed-by: Andreas Färber --- hw/core/qdev.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 65aa041285..b9cd4fc814 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -568,14 +568,25 @@ static void bus_set_realized(Object *obj, bool value, Error **errp) { BusState *bus = BUS(obj); BusClass *bc = BUS_GET_CLASS(bus); + BusChild *kid; Error *local_err = NULL; if (value && !bus->realized) { if (bc->realize) { bc->realize(bus, &local_err); } + + /* TODO: recursive realization */ } else if (!value && bus->realized) { - if (bc->unrealize) { + QTAILQ_FOREACH(kid, &bus->children, sibling) { + DeviceState *dev = kid->child; + object_property_set_bool(OBJECT(dev), false, "realized", + &local_err); + if (local_err != NULL) { + break; + } + } + if (bc->unrealize && local_err == NULL) { bc->unrealize(bus, &local_err); } } From c059451cc14483e261c92411b829124c660573fa Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Jun 2014 15:20:18 +0300 Subject: [PATCH 097/109] qmp: clean out whitespace Signed-off-by: Michael S. Tsirkin --- qmp-commands.hx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qmp-commands.hx b/qmp-commands.hx index d99f23591a..e47c3ea900 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3571,7 +3571,7 @@ Example: "format":"qcow2", "virtual-size":2048000 } - } } ] } + } } ] } EQMP From dc61b2953121bf3656cd79efd4f2437dacfd6d8a Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 16 Jun 2014 18:33:40 +0200 Subject: [PATCH 098/109] pc: acpi: do not hardcode preprocessor but use one provided by environment, in addition force C style preprocessing so that 'gcc -E' or "clang -E" wouldn't ignore .dsl files. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/i386/Makefile.objs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs index a65473c253..48014abf0a 100644 --- a/hw/i386/Makefile.objs +++ b/hw/i386/Makefile.objs @@ -18,7 +18,7 @@ iasl-option=$(shell if test -z "`$(1) $(2) 2>&1 > /dev/null`" \ ifdef IASL #IASL Present. Generate hex files from .dsl hw/i386/%.hex: $(SRC_PATH)/hw/i386/%.dsl $(SRC_PATH)/scripts/acpi_extract_preprocess.py $(SRC_PATH)/scripts/acpi_extract.py - $(call quiet-command, cpp -P $(QEMU_DGFLAGS) $(QEMU_INCLUDES) $< -o $*.dsl.i.orig, " CPP $(TARGET_DIR)$*.dsl.i.orig") + $(call quiet-command, $(CPP) -x c -P $(QEMU_DGFLAGS) $(QEMU_INCLUDES) $< -o $*.dsl.i.orig, " CPP $(TARGET_DIR)$*.dsl.i.orig") $(call quiet-command, $(PYTHON) $(SRC_PATH)/scripts/acpi_extract_preprocess.py $*.dsl.i.orig > $*.dsl.i, " ACPI_PREPROCESS $(TARGET_DIR)$*.dsl.i") $(call quiet-command, $(IASL) $(call iasl-option,$(IASL),-Pn,) -vs -l -tc -p $* $*.dsl.i $(if $(V), , > /dev/null) 2>&1 ," IASL $(TARGET_DIR)$*.dsl.i") $(call quiet-command, $(PYTHON) $(SRC_PATH)/scripts/acpi_extract.py $*.lst > $*.off, " ACPI_EXTRACT $(TARGET_DIR)$*.off") From c3ba3095070728f9c247f9e89652b9ea3f94319f Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 17 Jun 2014 12:17:05 +0200 Subject: [PATCH 099/109] numa: handle mmaped memory allocation failure correctly when memory_region_init_ram_from_file() fails memory_region_size() will still return size that was provided at region init time. Instead use errp to properly detect error condition. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numa.c b/numa.c index eef0717785..e471afe04a 100644 --- a/numa.c +++ b/numa.c @@ -238,7 +238,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, /* Legacy behavior: if allocation failed, fall back to * regular RAM allocation. */ - if (!memory_region_size(mr)) { + if (err) { qerror_report_err(err); error_free(err); memory_region_init_ram(mr, owner, name, ram_size); From 6f2e27301d8cfea0a40dda7ec8a6dc1b9d0e228e Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 16 Jun 2014 19:12:25 +0200 Subject: [PATCH 100/109] qmp: add query-memory-devices command ... allowing to get state of present memory devices. Currently implemented only for PCDIMMDevice. Signed-off-by: Igor Mammedov Reviewed-by: Eric Blake Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/mem/pc-dimm.c | 39 +++++++++++++++++++++++++ include/hw/mem/pc-dimm.h | 2 ++ qapi-schema.json | 51 +++++++++++++++++++++++++++++++++ qmp-commands.hx | 27 +++++++++++++++++ qmp.c | 11 +++++++ stubs/Makefile.objs | 1 + stubs/qmp_pc_dimm_device_list.c | 7 +++++ 7 files changed, 138 insertions(+) create mode 100644 stubs/qmp_pc_dimm_device_list.c diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 8c2656821a..ad176b700b 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -23,6 +23,45 @@ #include "qapi/visitor.h" #include "qemu/range.h" +int qmp_pc_dimm_device_list(Object *obj, void *opaque) +{ + MemoryDeviceInfoList ***prev = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + + if (dev->realized) { + MemoryDeviceInfoList *elem = g_new0(MemoryDeviceInfoList, 1); + MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1); + PCDIMMDeviceInfo *di = g_new0(PCDIMMDeviceInfo, 1); + DeviceClass *dc = DEVICE_GET_CLASS(obj); + PCDIMMDevice *dimm = PC_DIMM(obj); + + if (dev->id) { + di->has_id = true; + di->id = g_strdup(dev->id); + } + di->hotplugged = dev->hotplugged; + di->hotpluggable = dc->hotpluggable; + di->addr = dimm->addr; + di->slot = dimm->slot; + di->node = dimm->node; + di->size = object_property_get_int(OBJECT(dimm), PC_DIMM_SIZE_PROP, + NULL); + di->memdev = object_get_canonical_path(OBJECT(dimm->hostmem)); + + info->dimm = di; + elem->value = info; + elem->next = NULL; + **prev = elem; + *prev = &elem->next; + } + } + + object_child_foreach(obj, qmp_pc_dimm_device_list, opaque); + return 0; +} + static int pc_dimm_slot2bitmap(Object *obj, void *opaque) { unsigned long *bitmap = opaque; diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index 0f4a6ba1e4..761eeef801 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -76,4 +76,6 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, Error **errp); int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp); + +int qmp_pc_dimm_device_list(Object *obj, void *opaque); #endif diff --git a/qapi-schema.json b/qapi-schema.json index 082bc3ab4e..d51a208c05 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3194,3 +3194,54 @@ # Since: 2.1 ## { 'command': 'query-memdev', 'returns': ['Memdev'] } +# @PCDIMMDeviceInfo: +# +# PCDIMMDevice state information +# +# @id: #optional device's ID +# +# @addr: physical address, where device is mapped +# +# @size: size of memory that the device provides +# +# @slot: slot number at which device is plugged in +# +# @node: NUMA node number where device is plugged in +# +# @memdev: memory backend linked with device +# +# @hotplugged: true if device was hotplugged +# +# @hotpluggable: true if device if could be added/removed while machine is running +# +# Since: 2.1 +## +{ 'type': 'PCDIMMDeviceInfo', + 'data': { '*id': 'str', + 'addr': 'int', + 'size': 'int', + 'slot': 'int', + 'node': 'int', + 'memdev': 'str', + 'hotplugged': 'bool', + 'hotpluggable': 'bool' + } +} + +## +# @MemoryDeviceInfo: +# +# Union containing information about a memory device +# +# Since: 2.1 +## +{ 'union': 'MemoryDeviceInfo', 'data': {'dimm': 'PCDIMMDeviceInfo'} } + +## +# @query-memory-devices +# +# Lists available memory devices and their state +# +# Since: 2.1 +## +{ 'command': 'query-memory-devices', 'returns': ['MemoryDeviceInfo'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index e47c3ea900..81054d0b1b 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3611,4 +3611,31 @@ Example (1): ] } +EQMP + + { + .name = "query-memory-devices", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_input_query_memory_devices, + }, + +SQMP +@query-memory-devices +-------------------- + +Return a list of memory devices. + +Example: +-> { "execute": "query-memory-devices" } +<- { "return": [ { "data": + { "addr": 5368709120, + "hotpluggable": true, + "hotplugged": true, + "id": "d1", + "memdev": "/objects/memX", + "node": 0, + "size": 1073741824, + "slot": 0}, + "type": "dimm" + } ] } EQMP diff --git a/qmp.c b/qmp.c index c3c0229cdf..835fd78851 100644 --- a/qmp.c +++ b/qmp.c @@ -28,6 +28,7 @@ #include "qapi/qmp-input-visitor.h" #include "hw/boards.h" #include "qom/object_interfaces.h" +#include "hw/mem/pc-dimm.h" NameInfo *qmp_query_name(Error **errp) { @@ -628,3 +629,13 @@ void qmp_object_del(const char *id, Error **errp) } object_unparent(obj); } + +MemoryDeviceInfoList *qmp_query_memory_devices(Error **errp) +{ + MemoryDeviceInfoList *head = NULL; + MemoryDeviceInfoList **prev = &head; + + qmp_pc_dimm_device_list(qdev_get_machine(), &prev); + + return head; +} diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 5a0b9174e0..997d68d5b9 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -37,3 +37,4 @@ stub-obj-y += vmstate.o stub-obj-$(CONFIG_WIN32) += fd-register.o stub-obj-y += cpus.o stub-obj-y += kvm.o +stub-obj-y += qmp_pc_dimm_device_list.o diff --git a/stubs/qmp_pc_dimm_device_list.c b/stubs/qmp_pc_dimm_device_list.c new file mode 100644 index 0000000000..5cb220c66c --- /dev/null +++ b/stubs/qmp_pc_dimm_device_list.c @@ -0,0 +1,7 @@ +#include "qom/object.h" +#include "hw/mem/pc-dimm.h" + +int qmp_pc_dimm_device_list(Object *obj, void *opaque) +{ + return 0; +} From 521b3673ac16ec7fa33b1ec2cdfe75ec708f073e Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 16 Jun 2014 19:12:26 +0200 Subject: [PATCH 101/109] acpi: introduce TYPE_ACPI_DEVICE_IF interface ... it will be used to abstract generic ACPI bits from device that implements ACPI interface. ACPIOSTInfo type is used for passing-through raw _OST event/status codes reported by guest OS to a management layer. It lets management tools interpret values as specified by ACPI spec if it is interested in it. QEMU doesn't encode these values as enum, since it doesn't need to handle them and it allows interface to scale well without any changes in QEMU while guest OS and management evolves in time. Signed-off-by: Igor Mammedov Reviewed-by: Eric Blake Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/Makefile.objs | 1 + hw/acpi/acpi_interface.c | 15 ++++++++++ include/hw/acpi/acpi_dev_interface.h | 43 ++++++++++++++++++++++++++++ qapi-schema.json | 31 ++++++++++++++++++++ 4 files changed, 90 insertions(+) create mode 100644 hw/acpi/acpi_interface.c create mode 100644 include/hw/acpi/acpi_dev_interface.h diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs index 004e1b2090..acd2389431 100644 --- a/hw/acpi/Makefile.objs +++ b/hw/acpi/Makefile.objs @@ -1,2 +1,3 @@ common-obj-$(CONFIG_ACPI) += core.o piix4.o ich9.o pcihp.o cpu_hotplug.o common-obj-$(CONFIG_ACPI) += memory_hotplug.o +common-obj-$(CONFIG_ACPI) += acpi_interface.o diff --git a/hw/acpi/acpi_interface.c b/hw/acpi/acpi_interface.c new file mode 100644 index 0000000000..c181bb2262 --- /dev/null +++ b/hw/acpi/acpi_interface.c @@ -0,0 +1,15 @@ +#include "hw/acpi/acpi_dev_interface.h" +#include "qemu/module.h" + +static void register_types(void) +{ + static const TypeInfo acpi_dev_if_info = { + .name = TYPE_ACPI_DEVICE_IF, + .parent = TYPE_INTERFACE, + .class_size = sizeof(AcpiDeviceIfClass), + }; + + type_register_static(&acpi_dev_if_info); +} + +type_init(register_types) diff --git a/include/hw/acpi/acpi_dev_interface.h b/include/hw/acpi/acpi_dev_interface.h new file mode 100644 index 0000000000..f245f8d236 --- /dev/null +++ b/include/hw/acpi/acpi_dev_interface.h @@ -0,0 +1,43 @@ +#ifndef ACPI_DEV_INTERFACE_H +#define ACPI_DEV_INTERFACE_H + +#include "qom/object.h" +#include "qapi-types.h" + +#define TYPE_ACPI_DEVICE_IF "acpi-device-interface" + +#define ACPI_DEVICE_IF_CLASS(klass) \ + OBJECT_CLASS_CHECK(AcpiDeviceIfClass, (klass), \ + TYPE_ACPI_DEVICE_IF) +#define ACPI_DEVICE_IF_GET_CLASS(obj) \ + OBJECT_GET_CLASS(AcpiDeviceIfClass, (obj), \ + TYPE_ACPI_DEVICE_IF) +#define ACPI_DEVICE_IF(obj) \ + INTERFACE_CHECK(AcpiDeviceIf, (obj), \ + TYPE_ACPI_DEVICE_IF) + + +typedef struct AcpiDeviceIf { + /* */ + Object Parent; +} AcpiDeviceIf; + +/** + * AcpiDeviceIfClass: + * + * ospm_status: returns status of ACPI device objects, reported + * via _OST method if device supports it. + * + * Interface is designed for providing unified interface + * to generic ACPI functionality that could be used without + * knowledge about internals of actual device that implements + * ACPI interface. + */ +typedef struct AcpiDeviceIfClass { + /* */ + InterfaceClass parent_class; + + /* */ + void (*ospm_status)(AcpiDeviceIf *adev, ACPIOSTInfoList ***list); +} AcpiDeviceIfClass; +#endif diff --git a/qapi-schema.json b/qapi-schema.json index d51a208c05..ff30ace82b 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3245,3 +3245,34 @@ # Since: 2.1 ## { 'command': 'query-memory-devices', 'returns': ['MemoryDeviceInfo'] } + +## @ACPISlotType +# +# @DIMM: memory slot +# +{ 'enum': 'ACPISlotType', 'data': [ 'DIMM' ] } + +## @ACPIOSTInfo +# +# OSPM Status Indication for a device +# For description of possible values of @source and @status fields +# see "_OST (OSPM Status Indication)" chapter of ACPI5.0 spec. +# +# @device: #optional device ID associated with slot +# +# @slot: slot ID, unique per slot of a given @slot-type +# +# @slot-type: type of the slot +# +# @source: an integer containing the source event +# +# @status: an integer containing the status code +# +# Since: 2.1 +## +{ 'type': 'ACPIOSTInfo', + 'data' : { '*device': 'str', + 'slot': 'str', + 'slot-type': 'ACPISlotType', + 'source': 'int', + 'status': 'int' } } From 43f50410088847376c8b146e817a9042bd2a5f36 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 16 Jun 2014 19:12:27 +0200 Subject: [PATCH 102/109] acpi: implement ospm_status() method for PIIX4/ICH9_LPC devices ... using TYPE_ACPI_DEVICE_IF interface. Which provides status reporting of ACPI declared memory devices Signed-off-by: Igor Mammedov Reviewed-by: Eric Blake Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/ich9.c | 7 +++++++ hw/acpi/memory_hotplug.c | 31 +++++++++++++++++++++++++++++++ hw/acpi/piix4.c | 11 +++++++++++ hw/isa/lpc_ich9.c | 3 +++ include/hw/acpi/ich9.h | 3 +++ include/hw/acpi/memory_hotplug.h | 1 + 6 files changed, 56 insertions(+) diff --git a/hw/acpi/ich9.c b/hw/acpi/ich9.c index 0ecc241458..e7d6c77b34 100644 --- a/hw/acpi/ich9.c +++ b/hw/acpi/ich9.c @@ -309,3 +309,10 @@ void ich9_pm_device_plug_cb(ICH9LPCPMRegs *pm, DeviceState *dev, Error **errp) " type: %s", object_get_typename(OBJECT(dev))); } } + +void ich9_pm_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) +{ + ICH9LPCState *s = ICH9_LPC_DEVICE(adev); + + acpi_memory_ospm_status(&s->pm.acpi_memory_hotplug, list); +} diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index 71c7a5e83f..e7009bc355 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -4,6 +4,37 @@ #include "hw/boards.h" #include "trace.h" +static ACPIOSTInfo *acpi_memory_device_status(int slot, MemStatus *mdev) +{ + ACPIOSTInfo *info = g_new0(ACPIOSTInfo, 1); + + info->slot_type = ACPI_SLOT_TYPE_DIMM; + info->slot = g_strdup_printf("%d", slot); + info->source = mdev->ost_event; + info->status = mdev->ost_status; + if (mdev->dimm) { + DeviceState *dev = DEVICE(mdev->dimm); + if (dev->id) { + info->device = g_strdup(dev->id); + info->has_device = true; + } + } + return info; +} + +void acpi_memory_ospm_status(MemHotplugState *mem_st, ACPIOSTInfoList ***list) +{ + int i; + + for (i = 0; i < mem_st->dev_count; i++) { + ACPIOSTInfoList *elem = g_new0(ACPIOSTInfoList, 1); + elem->value = acpi_memory_device_status(i, &mem_st->devs[i]); + elem->next = NULL; + **list = elem; + *list = &elem->next; + } +} + static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr, unsigned int size) { diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index 01b3b4cb64..b72b34e5c9 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -35,6 +35,7 @@ #include "hw/hotplug.h" #include "hw/mem/pc-dimm.h" #include "hw/acpi/memory_hotplug.h" +#include "hw/acpi/acpi_dev_interface.h" //#define DEBUG @@ -572,6 +573,13 @@ static void piix4_acpi_system_hot_add_init(MemoryRegion *parent, } } +static void piix4_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list) +{ + PIIX4PMState *s = PIIX4_PM(adev); + + acpi_memory_ospm_status(&s->acpi_memory_hotplug, list); +} + static Property piix4_pm_properties[] = { DEFINE_PROP_UINT32("smb_io_base", PIIX4PMState, smb_io_base, 0), DEFINE_PROP_UINT8(ACPI_PM_PROP_S3_DISABLED, PIIX4PMState, disable_s3, 0), @@ -589,6 +597,7 @@ static void piix4_pm_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_CLASS(klass); k->init = piix4_pm_initfn; k->config_write = pm_write_config; @@ -607,6 +616,7 @@ static void piix4_pm_class_init(ObjectClass *klass, void *data) dc->hotpluggable = false; hc->plug = piix4_device_plug_cb; hc->unplug = piix4_device_unplug_cb; + adevc->ospm_status = piix4_ospm_status; } static const TypeInfo piix4_pm_info = { @@ -616,6 +626,7 @@ static const TypeInfo piix4_pm_info = { .class_init = piix4_pm_class_init, .interfaces = (InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, + { TYPE_ACPI_DEVICE_IF }, { } } }; diff --git a/hw/isa/lpc_ich9.c b/hw/isa/lpc_ich9.c index 3fe23113b0..b846d81990 100644 --- a/hw/isa/lpc_ich9.c +++ b/hw/isa/lpc_ich9.c @@ -658,6 +658,7 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data) DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass); + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_CLASS(klass); set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories); dc->reset = ich9_lpc_reset; @@ -676,6 +677,7 @@ static void ich9_lpc_class_init(ObjectClass *klass, void *data) dc->cannot_instantiate_with_device_add_yet = true; hc->plug = ich9_device_plug_cb; hc->unplug = ich9_device_unplug_cb; + adevc->ospm_status = ich9_pm_ospm_status; } static const TypeInfo ich9_lpc_info = { @@ -686,6 +688,7 @@ static const TypeInfo ich9_lpc_info = { .class_init = ich9_lpc_class_init, .interfaces = (InterfaceInfo[]) { { TYPE_HOTPLUG_HANDLER }, + { TYPE_ACPI_DEVICE_IF }, { } } }; diff --git a/include/hw/acpi/ich9.h b/include/hw/acpi/ich9.h index 1977f1b206..7e42448ef9 100644 --- a/include/hw/acpi/ich9.h +++ b/include/hw/acpi/ich9.h @@ -24,6 +24,7 @@ #include "hw/acpi/acpi.h" #include "hw/acpi/cpu_hotplug.h" #include "hw/acpi/memory_hotplug.h" +#include "hw/acpi/acpi_dev_interface.h" typedef struct ICH9LPCPMRegs { /* @@ -59,4 +60,6 @@ extern const VMStateDescription vmstate_ich9_pm; void ich9_pm_add_properties(Object *obj, ICH9LPCPMRegs *pm, Error **errp); void ich9_pm_device_plug_cb(ICH9LPCPMRegs *pm, DeviceState *dev, Error **errp); + +void ich9_pm_ospm_status(AcpiDeviceIf *adev, ACPIOSTInfoList ***list); #endif /* HW_ACPI_ICH9_H */ diff --git a/include/hw/acpi/memory_hotplug.h b/include/hw/acpi/memory_hotplug.h index 458845933b..7bbf8a0064 100644 --- a/include/hw/acpi/memory_hotplug.h +++ b/include/hw/acpi/memory_hotplug.h @@ -34,4 +34,5 @@ extern const VMStateDescription vmstate_memory_hotplug; VMSTATE_STRUCT(memhp, state, 1, \ vmstate_memory_hotplug, MemHotplugState) +void acpi_memory_ospm_status(MemHotplugState *mem_st, ACPIOSTInfoList ***list); #endif From 02419bcb3f896fc42b50b2b04e2938365b8f7350 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 16 Jun 2014 19:12:28 +0200 Subject: [PATCH 103/109] qmp: add query-acpi-ospm-status command ... to get ACPI OSPM status reported by ACPI devices via _OST method. Signed-off-by: Igor Mammedov Reviewed-by: Eric Blake Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- qapi-schema.json | 10 ++++++++++ qmp-commands.hx | 22 ++++++++++++++++++++++ qmp.c | 20 ++++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/qapi-schema.json b/qapi-schema.json index ff30ace82b..98350048f6 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3276,3 +3276,13 @@ 'slot-type': 'ACPISlotType', 'source': 'int', 'status': 'int' } } + +## +# @query-acpi-ospm-status +# +# Lists ACPI OSPM status of ACPI device objects, +# which might be reported via _OST method +# +# Since: 2.1 +## +{ 'command': 'query-acpi-ospm-status', 'returns': ['ACPIOSTInfo'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index 81054d0b1b..e4a1c80434 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3639,3 +3639,25 @@ Example: "type": "dimm" } ] } EQMP + + { + .name = "query-acpi-ospm-status", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_input_query_acpi_ospm_status, + }, + +SQMP +@query-acpi-ospm-status +-------------------- + +Return list of ACPIOSTInfo for devices that support status reporting +via ACPI _OST method. + +Example: +-> { "execute": "query-acpi-ospm-status" } +<- { "return": [ { "device": "d1", "slot": "0", "slot-type": "DIMM", "source": 1, "status": 0}, + { "slot": "1", "slot-type": "DIMM", "source": 0, "status": 0}, + { "slot": "2", "slot-type": "DIMM", "source": 0, "status": 0}, + { "slot": "3", "slot-type": "DIMM", "source": 0, "status": 0} + ]} +EQMP diff --git a/qmp.c b/qmp.c index 835fd78851..dca6efb7b8 100644 --- a/qmp.c +++ b/qmp.c @@ -29,6 +29,7 @@ #include "hw/boards.h" #include "qom/object_interfaces.h" #include "hw/mem/pc-dimm.h" +#include "hw/acpi/acpi_dev_interface.h" NameInfo *qmp_query_name(Error **errp) { @@ -639,3 +640,22 @@ MemoryDeviceInfoList *qmp_query_memory_devices(Error **errp) return head; } + +ACPIOSTInfoList *qmp_query_acpi_ospm_status(Error **errp) +{ + bool ambig; + ACPIOSTInfoList *head = NULL; + ACPIOSTInfoList **prev = &head; + Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, &ambig); + + if (obj) { + AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj); + AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj); + + adevc->ospm_status(adev, &prev); + } else { + error_setg(errp, "command is not supported, missing ACPI device"); + } + + return head; +} From 02edd407f3f1a1c3aaf5247d131735cf71657d8b Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Mon, 16 Jun 2014 19:12:29 +0200 Subject: [PATCH 104/109] qmp: add ACPI_DEVICE_OST event handling emits event when ACPI OSPM evaluates _OST method of ACPI device. Signed-off-by: Igor Mammedov Reviewed-by: Eric Blake Acked-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- docs/qmp/qmp-events.txt | 10 ++++++++++ hw/acpi/memory_hotplug.c | 29 ++++++++++++++++++++++++++++- include/monitor/monitor.h | 1 + monitor.c | 1 + 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt index 145402e078..019db53ec8 100644 --- a/docs/qmp/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -1,6 +1,16 @@ QEMU Machine Protocol Events ============================ +ACPI_DEVICE_OST +--------------- + +Emitted when guest executes ACPI _OST method. + + - data: ACPIOSTInfo type as described in qapi-schema.json + +{ "event": "ACPI_DEVICE_OST", + "data": { "device": "d1", "slot": "0", "slot-type": "DIMM", "source": 1, "status": 0 } } + BALLOON_CHANGE -------------- diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index e7009bc355..de4ddc204f 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -3,6 +3,10 @@ #include "hw/mem/pc-dimm.h" #include "hw/boards.h" #include "trace.h" +#include "qapi-visit.h" +#include "monitor/monitor.h" +#include "qapi/dealloc-visitor.h" +#include "qapi/qmp-output-visitor.h" static ACPIOSTInfo *acpi_memory_device_status(int slot, MemStatus *mdev) { @@ -35,6 +39,29 @@ void acpi_memory_ospm_status(MemHotplugState *mem_st, ACPIOSTInfoList ***list) } } +static void acpi_memory_ost_mon_event(const MemHotplugState *mem_st) +{ + Visitor *v; + QObject *out_info; + QapiDeallocVisitor *md; + QmpOutputVisitor *mo = qmp_output_visitor_new(); + MemStatus *mdev = &mem_st->devs[mem_st->selector]; + ACPIOSTInfo *info = acpi_memory_device_status(mem_st->selector, mdev); + + v = qmp_output_get_visitor(mo); + visit_type_ACPIOSTInfo(v, &info, "unused", NULL); + + out_info = qmp_output_get_qobject(mo); + monitor_protocol_event(QEVENT_ACPI_OST, out_info); + qobject_decref(out_info); + + qmp_output_visitor_cleanup(mo); + md = qapi_dealloc_visitor_new(); + v = qapi_dealloc_get_visitor(md); + visit_type_ACPIOSTInfo(v, &info, "unused", NULL); + qapi_dealloc_visitor_cleanup(md); +} + static uint64_t acpi_memory_hotplug_read(void *opaque, hwaddr addr, unsigned int size) { @@ -119,8 +146,8 @@ static void acpi_memory_hotplug_write(void *opaque, hwaddr addr, uint64_t data, mdev = &mem_st->devs[mem_st->selector]; mdev->ost_status = data; trace_mhp_acpi_write_ost_status(mem_st->selector, mdev->ost_status); - /* TODO: report async error */ /* TODO: implement memory removal on guest signal */ + acpi_memory_ost_mon_event(mem_st); break; case 0x14: mdev = &mem_st->devs[mem_st->selector]; diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index 1c1f56f36b..97696ea693 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -51,6 +51,7 @@ typedef enum MonitorEvent { QEVENT_BLOCK_IMAGE_CORRUPTED, QEVENT_QUORUM_FAILURE, QEVENT_QUORUM_REPORT_BAD, + QEVENT_ACPI_OST, /* Add to 'monitor_event_names' array in monitor.c when * defining new events here */ diff --git a/monitor.c b/monitor.c index afbb2d7379..c7f879713e 100644 --- a/monitor.c +++ b/monitor.c @@ -487,6 +487,7 @@ static const char *monitor_event_names[] = { [QEVENT_BLOCK_IMAGE_CORRUPTED] = "BLOCK_IMAGE_CORRUPTED", [QEVENT_QUORUM_FAILURE] = "QUORUM_FAILURE", [QEVENT_QUORUM_REPORT_BAD] = "QUORUM_REPORT_BAD", + [QEVENT_ACPI_OST] = "ACPI_DEVICE_OST", }; QEMU_BUILD_BUG_ON(ARRAY_SIZE(monitor_event_names) != QEVENT_MAX) From b4acfbcd95ac9a668e2f49dd9e1449ea81263752 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 17 Jun 2014 22:02:00 +0300 Subject: [PATCH 105/109] acpi: rephrase comment "only upto" is not proper English. Say "up to" and drop "only". Reported-by: Eric Blake Signed-off-by: Michael S. Tsirkin --- include/hw/acpi/acpi.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/hw/acpi/acpi.h b/include/hw/acpi/acpi.h index e93de6cab1..1f678b4bf2 100644 --- a/include/hw/acpi/acpi.h +++ b/include/hw/acpi/acpi.h @@ -27,8 +27,7 @@ #include "hw/irq.h" /* - * current device naming scheme supports - * only upto 256 memory devices + * current device naming scheme supports up to 256 memory devices */ #define ACPI_MAX_RAM_SLOTS 256 From c210ee95f22e90733eceeb18b9ca1914ff29e482 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 18 Jun 2014 17:50:07 +0300 Subject: [PATCH 106/109] qapi: fix input visitor bugs Remove dead code. Reset errno to 0 before each strtoull call, as the man page requires. Reported-by: Eric Blake Signed-off-by: Michael S. Tsirkin Reviewed-by: Eric Blake --- qapi/string-input-visitor.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/qapi/string-input-visitor.c b/qapi/string-input-visitor.c index 72722e6571..d8a8db02ed 100644 --- a/qapi/string-input-visitor.c +++ b/qapi/string-input-visitor.c @@ -48,11 +48,10 @@ static void parse_str(StringInputVisitor *siv, Error **errp) return; } - errno = 0; do { + errno = 0; start = strtoll(str, &endptr, 0); - if (errno == 0 && endptr > str && INT64_MIN <= start && - start <= INT64_MAX) { + if (errno == 0 && endptr > str) { if (*endptr == '\0') { cur = g_malloc0(sizeof(*cur)); cur->begin = start; @@ -63,9 +62,9 @@ static void parse_str(StringInputVisitor *siv, Error **errp) str = NULL; } else if (*endptr == '-') { str = endptr + 1; + errno = 0; end = strtoll(str, &endptr, 0); - if (errno == 0 && endptr > str && - INT64_MIN <= end && end <= INT64_MAX && start <= end && + if (errno == 0 && endptr > str && start <= end && (start > INT64_MAX - 65536 || end < start + 65536)) { if (*endptr == '\0') { From 6ffa1695769d0227e28715f7e77cdc49aeab1b8c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 18 Jun 2014 17:52:52 +0300 Subject: [PATCH 107/109] tests: simplify code Use error_abort instead of open-coded assert. Cleaner and shorter. Reported-by: Eric Blake Signed-off-by: Michael S. Tsirkin Reviewed-by: Eric Blake --- tests/test-string-input-visitor.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test-string-input-visitor.c b/tests/test-string-input-visitor.c index b01e2f2b79..8e3433e0c7 100644 --- a/tests/test-string-input-visitor.c +++ b/tests/test-string-input-visitor.c @@ -69,14 +69,12 @@ static void test_visitor_in_intList(TestInputVisitorData *data, { int64_t value[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20}; int16List *res = NULL, *tmp; - Error *errp = NULL; Visitor *v; int i = 0; v = visitor_input_test_init(data, "1,2,0,2-4,20,5-9,1-8"); - visit_type_int16List(v, &res, NULL, &errp); - g_assert(errp == NULL); + visit_type_int16List(v, &res, NULL, &error_abort); tmp = res; while (i < sizeof(value) / sizeof(value[0])) { g_assert(tmp); From 56fdfb6106276b761e360a84321bab0765bfa15d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 18 Jun 2014 17:59:51 +0300 Subject: [PATCH 108/109] qapi/string-output-visitor: fix bugs in human mode, we are creating the string: 16-31 (16-31) instead of 16-17 (10-1f) because we forgot to pass 'true' as the human parameter on one of the two calls to format_string. Also, this is a worsening of quality; previously we would produce 16 (0x10) to make it obvious which number was hex. Fix these issues. Reported-by: Eric Blake Signed-off-by: Michael S. Tsirkin Reviewed-by: Eric Blake --- qapi/string-output-visitor.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qapi/string-output-visitor.c b/qapi/string-output-visitor.c index 8735b00448..e9aca3bfdc 100644 --- a/qapi/string-output-visitor.c +++ b/qapi/string-output-visitor.c @@ -98,7 +98,7 @@ static void format_string(StringOutputVisitor *sov, Range *r, bool next, { if (r->end - r->begin > 1) { if (human) { - g_string_append_printf(sov->string, "%" PRIx64 "-%" PRIx64, + g_string_append_printf(sov->string, "0x%" PRIx64 "-%" PRIx64, r->begin, r->end - 1); } else { @@ -107,7 +107,7 @@ static void format_string(StringOutputVisitor *sov, Range *r, bool next, } } else { if (human) { - g_string_append_printf(sov->string, "%" PRIx64, r->begin); + g_string_append_printf(sov->string, "0x%" PRIx64, r->begin); } else { g_string_append_printf(sov->string, "%" PRId64, r->begin); } @@ -186,7 +186,7 @@ static void print_type_int(Visitor *v, int64_t *obj, const char *name, g_string_append(sov->string, " ("); while (l) { Range *r = l->data; - format_string(sov, r, l->next != NULL, false); + format_string(sov, r, l->next != NULL, true); l = l->next; } g_string_append(sov->string, ")"); From 705456c0d7f24fbd76733c891525b8eeea332e8b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 19 Jun 2014 13:08:35 +0300 Subject: [PATCH 109/109] numa: use RAM_ADDR_FMT with ram_addr_t commit 4407ab055be995e64633322a78e64dfa376dc534 vl.c: extend -m option to support options for memory hotplug prints ram_addr_t with u64 format, this is wrong for some systems, in particular w32. print ram_addr_t with RAM_ADDR_FMT to fix build on w32. Signed-off-by: Michael S. Tsirkin --- vl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vl.c b/vl.c index b27c2a6422..54b46271c2 100644 --- a/vl.c +++ b/vl.c @@ -3302,16 +3302,16 @@ int main(int argc, char **argv, char **envp) sz = qemu_opt_get_size(opts, "maxmem", 0); if (sz < ram_size) { fprintf(stderr, "qemu: invalid -m option value: maxmem " - "(%" PRIu64 ") <= initial memory (%" - PRIu64 ")\n", sz, ram_size); + "(%" PRIu64 ") <= initial memory (" + RAM_ADDR_FMT ")\n", sz, ram_size); exit(EXIT_FAILURE); } slots = qemu_opt_get_number(opts, "slots", 0); if ((sz > ram_size) && !slots) { fprintf(stderr, "qemu: invalid -m option value: maxmem " - "(%" PRIu64 ") more than initial memory (%" - PRIu64 ") but no hotplug slots where " + "(%" PRIu64 ") more than initial memory (" + RAM_ADDR_FMT ") but no hotplug slots where " "specified\n", sz, ram_size); exit(EXIT_FAILURE); } @@ -3319,8 +3319,8 @@ int main(int argc, char **argv, char **envp) if ((sz <= ram_size) && slots) { fprintf(stderr, "qemu: invalid -m option value: %" PRIu64 " hotplug slots where specified but " - "maxmem (%" PRIu64 ") <= initial memory (%" - PRIu64 ")\n", slots, sz, ram_size); + "maxmem (%" PRIu64 ") <= initial memory (" + RAM_ADDR_FMT ")\n", slots, sz, ram_size); exit(EXIT_FAILURE); } maxram_size = sz;