From dcff1035dfdfb4c76634df64a5359ac18749f7ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 23 Oct 2017 15:18:07 +0100 Subject: [PATCH 01/19] memfd: split qemu_memfd_alloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a function to only create a memfd, without mmap. The function is used in the following memory backend. Signed-off-by: Marc-André Lureau Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20171023141815.17709-2-marcandre.lureau@redhat.com> Signed-off-by: Eduardo Habkost --- include/qemu/memfd.h | 1 + util/memfd.c | 61 ++++++++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/include/qemu/memfd.h b/include/qemu/memfd.h index 745a8c501e..41c24d807c 100644 --- a/include/qemu/memfd.h +++ b/include/qemu/memfd.h @@ -16,6 +16,7 @@ #define F_SEAL_WRITE 0x0008 /* prevent writes */ #endif +int qemu_memfd_create(const char *name, size_t size, unsigned int seals); void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals, int *fd); void qemu_memfd_free(void *ptr, size_t size, int fd); diff --git a/util/memfd.c b/util/memfd.c index 412e94a405..3a82505f8d 100644 --- a/util/memfd.c +++ b/util/memfd.c @@ -53,6 +53,38 @@ static int memfd_create(const char *name, unsigned int flags) #define MFD_ALLOW_SEALING 0x0002U #endif +int qemu_memfd_create(const char *name, size_t size, unsigned int seals) +{ + int mfd = -1; + +#ifdef CONFIG_LINUX + unsigned int flags = MFD_CLOEXEC; + + if (seals) { + flags |= MFD_ALLOW_SEALING; + } + + mfd = memfd_create(name, flags); + if (mfd < 0) { + return -1; + } + + if (ftruncate(mfd, size) == -1) { + perror("ftruncate"); + close(mfd); + return -1; + } + + if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) { + perror("fcntl"); + close(mfd); + return -1; + } +#endif + + return mfd; +} + /* * This is a best-effort helper for shared memory allocation, with * optional sealing. 
The helper will do his best to allocate using @@ -63,35 +95,14 @@ void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals, int *fd) { void *ptr; - int mfd = -1; + int mfd = qemu_memfd_create(name, size, seals); - *fd = -1; - -#ifdef CONFIG_LINUX - if (seals) { - mfd = memfd_create(name, MFD_ALLOW_SEALING | MFD_CLOEXEC); + /* some systems have memfd without sealing */ + if (mfd == -1) { + mfd = qemu_memfd_create(name, size, 0); } if (mfd == -1) { - /* some systems have memfd without sealing */ - mfd = memfd_create(name, MFD_CLOEXEC); - seals = 0; - } -#endif - - if (mfd != -1) { - if (ftruncate(mfd, size) == -1) { - perror("ftruncate"); - close(mfd); - return NULL; - } - - if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) { - perror("fcntl"); - close(mfd); - return NULL; - } - } else { const char *tmpdir = g_get_tmp_dir(); gchar *fname; From e3ab04deb036a707fdf1ca0418cb80c4cd9302f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Mon, 23 Oct 2017 15:18:08 +0100 Subject: [PATCH 02/19] memfd: remove needless include MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marc-André Lureau Reviewed-by: Philippe Mathieu-Daudé Message-Id: <20171023141815.17709-3-marcandre.lureau@redhat.com> Signed-off-by: Eduardo Habkost --- util/memfd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/util/memfd.c b/util/memfd.c index 3a82505f8d..dce61f9d21 100644 --- a/util/memfd.c +++ b/util/memfd.c @@ -27,8 +27,6 @@ #include "qemu/osdep.h" -#include - #include "qemu/memfd.h" #if defined CONFIG_LINUX && !defined CONFIG_MEMFD From c7cddce1f7eb4b6d1ae979349f3dacb130a37814 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 28 Nov 2017 16:15:28 +0000 Subject: [PATCH 03/19] qemu-options: document missing memory-backend-file options This patch adds undocumented memory-backend-file options to the documentation. 
Signed-off-by: Stefan Hajnoczi Message-Id: <20171128161529.3025-2-stefanha@redhat.com> Reviewed-by: Eric Blake Reviewed-by: Eduardo Habkost Signed-off-by: Eduardo Habkost --- qemu-options.hx | 48 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/qemu-options.hx b/qemu-options.hx index 678181c599..fe8c04f644 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -3972,18 +3972,24 @@ property must be set. These objects are placed in the @table @option -@item -object memory-backend-file,id=@var{id},size=@var{size},mem-path=@var{dir},share=@var{on|off},discard-data=@var{on|off} +@item -object memory-backend-file,id=@var{id},size=@var{size},mem-path=@var{dir},share=@var{on|off},discard-data=@var{on|off},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave} Creates a memory file backend object, which can be used to back -the guest RAM with huge pages. The @option{id} parameter is a -unique ID that will be used to reference this memory region -when configuring the @option{-numa} argument. The @option{size} -option provides the size of the memory region, and accepts -common suffixes, eg @option{500M}. The @option{mem-path} provides -the path to either a shared memory or huge page filesystem mount. +the guest RAM with huge pages. + +The @option{id} parameter is a unique ID that will be used to reference this +memory region when configuring the @option{-numa} argument. + +The @option{size} option provides the size of the memory region, and accepts +common suffixes, eg @option{500M}. + +The @option{mem-path} provides the path to either a shared memory or huge page +filesystem mount. + The @option{share} boolean option determines whether the memory region is marked as private to QEMU, or shared. The latter allows a co-operating external process to access the QEMU memory region. 
+ Setting the @option{discard-data} boolean option to @var{on} indicates that file contents can be destroyed when QEMU exits, to avoid unnecessarily flushing data to the backing file. Note @@ -3991,6 +3997,34 @@ that @option{discard-data} is only an optimization, and QEMU might not discard file contents if it aborts unexpectedly or is terminated using SIGKILL. +The @option{merge} boolean option enables memory merge, also known as +MADV_MERGEABLE, so that Kernel Samepage Merging will consider the pages for +memory deduplication. + +Setting the @option{dump} boolean option to @var{off} excludes the memory from +core dumps. This feature is also known as MADV_DONTDUMP. + +The @option{prealloc} boolean option enables memory preallocation. + +The @option{host-nodes} option binds the memory range to a list of NUMA host +nodes. + +The @option{policy} option sets the NUMA policy to one of the following values: + +@table @option +@item @var{default} +default host policy + +@item @var{preferred} +prefer the given host node list for allocation + +@item @var{bind} +restrict memory allocation to the given host node list + +@item @var{interleave} +interleave memory allocations across the given host node list +@end table + @item -object rng-random,id=@var{id},filename=@var{/dev/random} Creates a random number generator backend which obtains entropy from From cd19491ac19209d2554d593ea4f3bd9f6e3ed6b7 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 28 Nov 2017 16:15:29 +0000 Subject: [PATCH 04/19] qemu-options: document memory-backend-ram The documentation should mention -object memory-backend-ram. 
Suggested-by: Yumei Huang Signed-off-by: Stefan Hajnoczi Message-Id: <20171128161529.3025-3-stefanha@redhat.com> Reviewed-by: Eric Blake Reviewed-by: Eduardo Habkost Signed-off-by: Eduardo Habkost --- qemu-options.hx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/qemu-options.hx b/qemu-options.hx index fe8c04f644..5b0ee43b18 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -4025,6 +4025,13 @@ restrict memory allocation to the given host node list interleave memory allocations across the given host node list @end table +@item -object memory-backend-ram,id=@var{id},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},size=@var{size},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave} + +Creates a memory backend object, which can be used to back the guest RAM. +Memory backend objects offer more control than the @option{-m} option that is +traditionally used to define guest RAM. Please refer to +@option{memory-backend-file} for a description of the options. + @item -object rng-random,id=@var{id},filename=@var{/dev/random} Creates a random number generator backend which obtains entropy from From 2d19c656612bbd104bb139217b12017a6992a898 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 28 Nov 2017 15:53:58 +0100 Subject: [PATCH 05/19] numa: fix missing '-numa cpu' in '-help' output commit 419fcdec3c (numa: add '-numa cpu,...' option for property based node mapping) added '-numa cpu' option but forgot to update appropriate section for '--help'. 
Add '-numa cpu' description to '-help' output Reported-by: Markus Armbruster Signed-off-by: Igor Mammedov Message-Id: <1511880838-56509-1-git-send-email-imammedo@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Eduardo Habkost --- qemu-options.hx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qemu-options.hx b/qemu-options.hx index 5b0ee43b18..b3e03c5464 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -169,7 +169,9 @@ ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node]\n" - "-numa dist,src=source,dst=destination,val=distance\n", QEMU_ARCH_ALL) + "-numa dist,src=source,dst=destination,val=distance\n" + "-numa cpu,node-id=node[,socket-id=x][,core-id=y][,thread-id=z]\n", + QEMU_ARCH_ALL) STEXI @item -numa node[,mem=@var{size}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] @itemx -numa node[,memdev=@var{id}][,cpus=@var{firstcpu}[-@var{lastcpu}]][,nodeid=@var{node}] From 0bd1909da606a60f50128290fc319db020fa303c Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 25 Nov 2017 13:16:05 -0200 Subject: [PATCH 06/19] machine: Replace has_dynamic_sysbus with list of allowed devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing has_dynamic_sysbus flag makes the machine accept every user-creatable sysbus device type on the command-line. Replace it with a list of allowed device types, so machines can easily accept some sysbus devices while rejecting others. To keep exactly the same behavior as before, the existing has_dynamic_sysbus=true assignments are replaced with a TYPE_SYS_BUS_DEVICE entry on the allowed list. Other patches will replace the TYPE_SYS_BUS_DEVICE entries with more specific lists of devices. Cc: Peter Maydell Cc: Marcel Apfelbaum Cc: "Michael S. 
Tsirkin" Cc: Alexander Graf Cc: David Gibson Cc: Stefano Stabellini Cc: Anthony Perard Cc: qemu-arm@nongnu.org Cc: qemu-ppc@nongnu.org Cc: xen-devel@lists.xenproject.org Signed-off-by: Eduardo Habkost Message-Id: <20171125151610.20547-2-ehabkost@redhat.com> Reviewed-by: Greg Kurz Reviewed-by: David Gibson Reviewed-by: Marc-André Lureau Reviewed-by: Marcel Apfelbaum Signed-off-by: Eduardo Habkost --- hw/arm/virt.c | 3 ++- hw/core/machine.c | 45 +++++++++++++++++++++++++++++--------------- hw/i386/pc_q35.c | 3 ++- hw/ppc/e500plat.c | 4 +++- hw/ppc/spapr.c | 3 ++- hw/xen/xen_backend.c | 7 ++++++- include/hw/boards.h | 5 ++++- 7 files changed, 49 insertions(+), 21 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 543f9bd6cc..7549895fd2 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1591,7 +1591,8 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) * configuration of the particular instance. */ mc->max_cpus = 255; - mc->has_dynamic_sysbus = true; + /*TODO: allow only sysbus devices that really work with this machine */ + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); mc->block_default_type = IF_VIRTIO; mc->no_cdrom = 1; mc->pci_allow_0_address = true; diff --git a/hw/core/machine.c b/hw/core/machine.c index c857f3f934..0320a8efa1 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -334,29 +334,44 @@ static bool machine_get_enforce_config_section(Object *obj, Error **errp) return ms->enforce_config_section; } -static void error_on_sysbus_device(SysBusDevice *sbdev, void *opaque) +void machine_class_allow_dynamic_sysbus_dev(MachineClass *mc, const char *type) { - error_report("Option '-device %s' cannot be handled by this machine", - object_class_get_name(object_get_class(OBJECT(sbdev)))); - exit(1); + strList *item = g_new0(strList, 1); + + item->value = g_strdup(type); + item->next = mc->allowed_dynamic_sysbus_devices; + mc->allowed_dynamic_sysbus_devices = item; +} + +static void 
validate_sysbus_device(SysBusDevice *sbdev, void *opaque) +{ + MachineState *machine = opaque; + MachineClass *mc = MACHINE_GET_CLASS(machine); + bool allowed = false; + strList *wl; + + for (wl = mc->allowed_dynamic_sysbus_devices; + !allowed && wl; + wl = wl->next) { + allowed |= !!object_dynamic_cast(OBJECT(sbdev), wl->value); + } + + if (!allowed) { + error_report("Option '-device %s' cannot be handled by this machine", + object_class_get_name(object_get_class(OBJECT(sbdev)))); + exit(1); + } } static void machine_init_notify(Notifier *notifier, void *data) { - Object *machine = qdev_get_machine(); - ObjectClass *oc = object_get_class(machine); - MachineClass *mc = MACHINE_CLASS(oc); - - if (mc->has_dynamic_sysbus) { - /* Our machine can handle dynamic sysbus devices, we're all good */ - return; - } + MachineState *machine = MACHINE(qdev_get_machine()); /* - * Loop through all dynamically created devices and check whether there - * are sysbus devices among them. If there are, error out. + * Loop through all dynamically created sysbus devices and check if they are + * all allowed. If a device is not allowed, error out. 
*/ - foreach_dynamic_sysbus_device(error_on_sysbus_device, NULL); + foreach_dynamic_sysbus_device(validate_sysbus_device, machine); } HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 5c6c608fcb..0505730a99 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -299,7 +299,8 @@ static void pc_q35_machine_options(MachineClass *m) m->default_machine_opts = "firmware=bios-256k.bin"; m->default_display = "std"; m->no_floppy = 1; - m->has_dynamic_sysbus = true; + /*TODO: allow only sysbus devices that really work with this machine */ + machine_class_allow_dynamic_sysbus_dev(m, TYPE_SYS_BUS_DEVICE); m->max_cpus = 288; } diff --git a/hw/ppc/e500plat.c b/hw/ppc/e500plat.c index e59e80fb9e..438118c29b 100644 --- a/hw/ppc/e500plat.c +++ b/hw/ppc/e500plat.c @@ -15,6 +15,7 @@ #include "hw/boards.h" #include "sysemu/device_tree.h" #include "sysemu/kvm.h" +#include "hw/sysbus.h" #include "hw/pci/pci.h" #include "hw/ppc/openpic.h" #include "kvm_ppc.h" @@ -63,7 +64,8 @@ static void e500plat_machine_init(MachineClass *mc) mc->desc = "generic paravirt e500 platform"; mc->init = e500plat_init; mc->max_cpus = 32; - mc->has_dynamic_sysbus = true; + /*TODO: allow only sysbus devices that really work with this machine */ + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("e500v2_v30"); } diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 499ab647d8..5847175fd9 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3843,7 +3843,8 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) mc->default_boot_order = ""; mc->default_ram_size = 512 * M_BYTE; mc->kvm_type = spapr_kvm_type; - mc->has_dynamic_sysbus = true; + /*TODO: allow only sysbus devices that really work with this machine */ + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); mc->pci_allow_0_address = true; mc->get_hotplug_handler = spapr_get_hotplug_handler; 
hc->pre_plug = spapr_machine_device_pre_plug; diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c index 0f849a26d2..82380ea9ee 100644 --- a/hw/xen/xen_backend.c +++ b/hw/xen/xen_backend.c @@ -564,7 +564,12 @@ static void xen_set_dynamic_sysbus(void) ObjectClass *oc = object_get_class(machine); MachineClass *mc = MACHINE_CLASS(oc); - mc->has_dynamic_sysbus = true; + /* + * Emulate old mc->has_dynamic_sysbus=true assignment + * + *TODO: add only Xen devices to the list + */ + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); } int xen_be_register(const char *type, struct XenDevOps *ops) diff --git a/include/hw/boards.h b/include/hw/boards.h index 156b16f7a6..041bc08971 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -76,6 +76,9 @@ void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, Error **errp); +void machine_class_allow_dynamic_sysbus_dev(MachineClass *mc, const char *type); + + /** * CPUArchId: * @arch_id - architecture-dependent CPU ID of present or possible CPU @@ -179,7 +182,6 @@ struct MachineClass { no_floppy:1, no_cdrom:1, no_sdcard:1, - has_dynamic_sysbus:1, pci_allow_0_address:1, legacy_fw_cfg_order:1; int is_default; @@ -197,6 +199,7 @@ struct MachineClass { bool ignore_memory_transaction_failures; int numa_mem_align_shift; const char **valid_cpu_types; + strList *allowed_dynamic_sysbus_devices; bool auto_enable_numa_with_memhp; void (*numa_auto_assign_ram)(MachineClass *mc, NodeInfo *nodes, int nb_nodes, ram_addr_t size); From 6f2062b9758ebc64dfbbda642b0c407c38131ea3 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 25 Nov 2017 13:16:06 -0200 Subject: [PATCH 07/19] hw/arm/virt: Allow only supported dynamic sysbus devices Replace the TYPE_SYS_BUS_DEVICE entry in the allowed sysbus device list with the two device types that are really supported by the virt machine: vfio-amd-xgbe and vfio-calxeda-xgmac. 
Cc: Peter Maydell Cc: qemu-arm@nongnu.org Signed-off-by: Eduardo Habkost Message-Id: <20171125151610.20547-3-ehabkost@redhat.com> Signed-off-by: Eduardo Habkost --- hw/arm/virt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 7549895fd2..4a6fdcc4f5 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -34,6 +34,8 @@ #include "hw/arm/arm.h" #include "hw/arm/primecell.h" #include "hw/arm/virt.h" +#include "hw/vfio/vfio-calxeda-xgmac.h" +#include "hw/vfio/vfio-amd-xgbe.h" #include "hw/devices.h" #include "net/net.h" #include "sysemu/block-backend.h" @@ -1591,8 +1593,8 @@ static void virt_machine_class_init(ObjectClass *oc, void *data) * configuration of the particular instance. */ mc->max_cpus = 255; - /*TODO: allow only sysbus devices that really work with this machine */ - machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_CALXEDA_XGMAC); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_VFIO_AMD_XGBE); mc->block_default_type = IF_VIRTIO; mc->no_cdrom = 1; mc->pci_allow_0_address = true; From 50d01d240f26a43a1d8c0667078bcab7186f2f96 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 25 Nov 2017 13:16:07 -0200 Subject: [PATCH 08/19] ppc: e500: Allow only supported dynamic sysbus devices platform_bus_create_devtree() already rejects all dynamic sysbus devices except TYPE_ETSEC_COMMON, so register it as the only allowed dynamic sysbus device for the ppce500 machine-type. 
Cc: Alexander Graf Cc: David Gibson Cc: qemu-ppc@nongnu.org Signed-off-by: Eduardo Habkost Message-Id: <20171125151610.20547-4-ehabkost@redhat.com> Acked-by: David Gibson Reviewed-by: Greg Kurz Signed-off-by: Eduardo Habkost --- hw/ppc/e500plat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hw/ppc/e500plat.c b/hw/ppc/e500plat.c index 438118c29b..81d03e1038 100644 --- a/hw/ppc/e500plat.c +++ b/hw/ppc/e500plat.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "qemu-common.h" #include "e500.h" +#include "hw/net/fsl_etsec/etsec.h" #include "hw/boards.h" #include "sysemu/device_tree.h" #include "sysemu/kvm.h" @@ -64,8 +65,7 @@ static void e500plat_machine_init(MachineClass *mc) mc->desc = "generic paravirt e500 platform"; mc->init = e500plat_init; mc->max_cpus = 32; - /*TODO: allow only sysbus devices that really work with this machine */ - machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_ETSEC_COMMON); mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("e500v2_v30"); } From 7da79a167aa11af6fad353bfbc825fbff2db68a2 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 25 Nov 2017 13:16:08 -0200 Subject: [PATCH 09/19] spapr: Allow only supported dynamic sysbus devices TYPE_SPAPR_PCI_HOST_BRIDGE is the only dynamic sysbus device not rejected by ppc_spapr_reset(), so it can be the only entry on the allowed list. 
Cc: David Gibson Cc: Alexander Graf Cc: qemu-ppc@nongnu.org Signed-off-by: Eduardo Habkost Message-Id: <20171125151610.20547-5-ehabkost@redhat.com> Acked-by: David Gibson Reviewed-by: Greg Kurz Signed-off-by: Eduardo Habkost --- hw/ppc/spapr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 5847175fd9..278f9de1e7 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3843,8 +3843,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) mc->default_boot_order = ""; mc->default_ram_size = 512 * M_BYTE; mc->kvm_type = spapr_kvm_type; - /*TODO: allow only sysbus devices that really work with this machine */ - machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SPAPR_PCI_HOST_BRIDGE); mc->pci_allow_0_address = true; mc->get_hotplug_handler = spapr_get_hotplug_handler; hc->pre_plug = spapr_machine_device_pre_plug; From b1b68e1094af0e8c7fcf6faff1a77b7787b3628b Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 25 Nov 2017 13:16:09 -0200 Subject: [PATCH 10/19] xen: Add only xen-sysdev to dynamic sysbus device list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no need to make the machine allow every possible sysbus device. We can now just add xen-sysdev to the allowed list. 
Cc: Stefano Stabellini Cc: Anthony Perard Cc: xen-devel@lists.xenproject.org Cc: Juergen Gross Signed-off-by: Eduardo Habkost Message-Id: <20171125151610.20547-6-ehabkost@redhat.com> Reviewed-by: Marc-André Lureau Acked-by: Anthony PERARD Signed-off-by: Eduardo Habkost --- hw/xen/xen_backend.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/hw/xen/xen_backend.c b/hw/xen/xen_backend.c index 82380ea9ee..7445b506ac 100644 --- a/hw/xen/xen_backend.c +++ b/hw/xen/xen_backend.c @@ -564,12 +564,7 @@ static void xen_set_dynamic_sysbus(void) ObjectClass *oc = object_get_class(machine); MachineClass *mc = MACHINE_CLASS(oc); - /* - * Emulate old mc->has_dynamic_sysbus=true assignment - * - *TODO: add only Xen devices to the list - */ - machine_class_allow_dynamic_sysbus_dev(mc, TYPE_SYS_BUS_DEVICE); + machine_class_allow_dynamic_sysbus_dev(mc, TYPE_XENSYSDEV); } int xen_be_register(const char *type, struct XenDevOps *ops) From ef18310d5495c4ce0e1289ed7fc9833d73fe5ee1 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Sat, 25 Nov 2017 13:16:10 -0200 Subject: [PATCH 11/19] q35: Allow only supported dynamic sysbus devices The only user-creatable sysbus devices in qemu-system-x86_64 are amd-iommu, intel-iommu, and xen-backend. xen-backend is handled by xen_set_dynamic_sysbus(), so we only need to add amd-iommu and intel-iommu. Cc: "Michael S. 
Tsirkin" Cc: Marcel Apfelbaum Signed-off-by: Eduardo Habkost Message-Id: <20171125151610.20547-7-ehabkost@redhat.com> Reviewed-by: Marcel Apfelbaum Signed-off-by: Eduardo Habkost --- hw/i386/pc_q35.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 0505730a99..ed3a0b8ff7 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -42,6 +42,8 @@ #include "exec/address-spaces.h" #include "hw/i386/pc.h" #include "hw/i386/ich9.h" +#include "hw/i386/amd_iommu.h" +#include "hw/i386/intel_iommu.h" #include "hw/smbios/smbios.h" #include "hw/ide/pci.h" #include "hw/ide/ahci.h" @@ -299,8 +301,8 @@ static void pc_q35_machine_options(MachineClass *m) m->default_machine_opts = "firmware=bios-256k.bin"; m->default_display = "std"; m->no_floppy = 1; - /*TODO: allow only sysbus devices that really work with this machine */ - machine_class_allow_dynamic_sysbus_dev(m, TYPE_SYS_BUS_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_AMD_IOMMU_DEVICE); + machine_class_allow_dynamic_sysbus_dev(m, TYPE_INTEL_IOMMU_DEVICE); m->max_cpus = 288; } From 58346214d03ffcd774e86e3ce72b4196769eb710 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 2 Nov 2017 11:10:05 +0100 Subject: [PATCH 12/19] qdev_monitor: Simplify error handling in qdev_device_add() Instead of doing the clean-ups on errors multiple times, introduce a jump label at the end of the function that can be used by all error paths that need this cleanup. 
Suggested-by: Igor Mammedov Signed-off-by: Thomas Huth Message-Id: <1509617407-21191-2-git-send-email-thuth@redhat.com> Reviewed-by: Cornelia Huck Signed-off-by: Eduardo Habkost --- qdev-monitor.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/qdev-monitor.c b/qdev-monitor.c index b4abb4b5ea..2abb80d7e4 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -619,22 +619,22 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) /* set properties */ if (qemu_opt_foreach(opts, set_property, dev, &err)) { - error_propagate(errp, err); - object_unparent(OBJECT(dev)); - object_unref(OBJECT(dev)); - return NULL; + goto err_del_dev; } dev->opts = opts; object_property_set_bool(OBJECT(dev), true, "realized", &err); if (err != NULL) { - error_propagate(errp, err); dev->opts = NULL; - object_unparent(OBJECT(dev)); - object_unref(OBJECT(dev)); - return NULL; + goto err_del_dev; } return dev; + +err_del_dev: + error_propagate(errp, err); + object_unparent(OBJECT(dev)); + object_unref(OBJECT(dev)); + return NULL; } From 03fcbd9dc5084ff4676c153fbe04fb0fcf939d09 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 2 Nov 2017 11:10:06 +0100 Subject: [PATCH 13/19] qdev: Check for the availability of a hotplug controller before adding a device The qdev_unplug() function contains a g_assert(hotplug_ctrl) statement, so QEMU crashes when the user tries to device_add + device_del a device that does not have a corresponding hotplug controller. 
This could be provoked for a couple of devices in the past (see commit 4c93950659487c7ad or 84ebd3e8c7d4fe955 for example), and can currently for example also be triggered like this: $ s390x-softmmu/qemu-system-s390x -M none -nographic QEMU 2.10.50 monitor - type 'help' for more information (qemu) device_add qemu-s390x-cpu,id=x (qemu) device_del x ** ERROR:qemu/qdev-monitor.c:872:qdev_unplug: assertion failed: (hotplug_ctrl) Aborted (core dumped) So devices clearly need a hotplug controller when they should be usable with device_add. The code in qdev_device_add() already checks whether the bus has a proper hotplug controller, but for devices that do not have a corresponding bus, there is no appropriate check available yet. In that case we should check whether the machine itself provides a suitable hotplug controller and refuse to plug the device if none is available. Reviewed-by: Igor Mammedov Signed-off-by: Thomas Huth Message-Id: <1509617407-21191-3-git-send-email-thuth@redhat.com> Reviewed-by: Cornelia Huck Signed-off-by: Eduardo Habkost --- hw/core/qdev.c | 28 ++++++++++++++++++++-------- include/hw/qdev-core.h | 1 + qdev-monitor.c | 5 +++++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/hw/core/qdev.c b/hw/core/qdev.c index 11112951a5..f739753e3a 100644 --- a/hw/core/qdev.c +++ b/hw/core/qdev.c @@ -253,19 +253,31 @@ void qdev_set_legacy_instance_id(DeviceState *dev, int alias_id, dev->alias_required_for_version = required_for_version; } +HotplugHandler *qdev_get_machine_hotplug_handler(DeviceState *dev) +{ + MachineState *machine; + MachineClass *mc; + Object *m_obj = qdev_get_machine(); + + if (object_dynamic_cast(m_obj, TYPE_MACHINE)) { + machine = MACHINE(m_obj); + mc = MACHINE_GET_CLASS(machine); + if (mc->get_hotplug_handler) { + return mc->get_hotplug_handler(machine, dev); + } + } + + return NULL; +} + HotplugHandler *qdev_get_hotplug_handler(DeviceState *dev) { - HotplugHandler *hotplug_ctrl = NULL; + HotplugHandler *hotplug_ctrl; if 
(dev->parent_bus && dev->parent_bus->hotplug_handler) { hotplug_ctrl = dev->parent_bus->hotplug_handler; - } else if (object_dynamic_cast(qdev_get_machine(), TYPE_MACHINE)) { - MachineState *machine = MACHINE(qdev_get_machine()); - MachineClass *mc = MACHINE_GET_CLASS(machine); - - if (mc->get_hotplug_handler) { - hotplug_ctrl = mc->get_hotplug_handler(machine, dev); - } + } else { + hotplug_ctrl = qdev_get_machine_hotplug_handler(dev); } return hotplug_ctrl; } diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 0a71bf83f0..51473eee7b 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -286,6 +286,7 @@ DeviceState *qdev_try_create(BusState *bus, const char *name); void qdev_init_nofail(DeviceState *dev); void qdev_set_legacy_instance_id(DeviceState *dev, int alias_id, int required_for_version); +HotplugHandler *qdev_get_machine_hotplug_handler(DeviceState *dev); HotplugHandler *qdev_get_hotplug_handler(DeviceState *dev); void qdev_unplug(DeviceState *dev, Error **errp); void qdev_simple_device_unplug_cb(HotplugHandler *hotplug_dev, diff --git a/qdev-monitor.c b/qdev-monitor.c index 2abb80d7e4..c436616446 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -613,6 +613,11 @@ DeviceState *qdev_device_add(QemuOpts *opts, Error **errp) if (bus) { qdev_set_parent_bus(dev, bus); + } else if (qdev_hotplug && !qdev_get_machine_hotplug_handler(dev)) { + /* No bus, no machine hotplug handler --> device is not hotpluggable */ + error_setg(&err, "Device '%s' can not be hotplugged on this machine", + driver); + goto err_del_dev; } qdev_set_id(dev, qemu_opts_id(opts)); From 1e2bdd2e20844f6bc343232ea1bb6f64c54a95ce Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Mon, 18 Dec 2017 17:10:38 +0100 Subject: [PATCH 14/19] scripts: Remove fixed entries from the device-crash-test These are crashes / errors which have been fixed already in the past months. We can remove these from the device-crash-test script now. 
Signed-off-by: Thomas Huth Message-Id: <1513613438-11017-1-git-send-email-thuth@redhat.com> Signed-off-by: Eduardo Habkost --- scripts/device-crash-test | 8 -------- 1 file changed, 8 deletions(-) diff --git a/scripts/device-crash-test b/scripts/device-crash-test index 827d8ec2af..7417177ebb 100755 --- a/scripts/device-crash-test +++ b/scripts/device-crash-test @@ -207,11 +207,9 @@ ERROR_WHITELIST = [ # Known crashes will generate error messages, but won't be fatal. # Those entries must be removed once we fix the crashes. {'exitcode':-6, 'log':r"Device 'serial0' is in use", 'loglevel':logging.ERROR}, - {'exitcode':-6, 'log':r"spapr_rtas_register: Assertion .*rtas_table\[token\]\.name.* failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"qemu_net_client_setup: Assertion `!peer->peer' failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r'RAMBlock "[\w.-]+" already registered', 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"find_ram_offset: Assertion `size != 0' failed.", 'loglevel':logging.ERROR}, - {'exitcode':-6, 'log':r"puv3_load_kernel: Assertion `kernel_filename != NULL' failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"add_cpreg_to_hashtable: code should not be reached", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"qemu_alloc_display: Assertion `surface->image != NULL' failed", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"Unexpected error in error_set_from_qdev_prop_error", 'loglevel':logging.ERROR}, @@ -219,16 +217,10 @@ ERROR_WHITELIST = [ {'exitcode':-6, 'log':r"Object .* is not an instance of type generic-pc-machine", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"Object .* is not an instance of type e500-ccsr", 'loglevel':logging.ERROR}, {'exitcode':-6, 'log':r"vmstate_register_with_alias_id: Assertion `!se->compat \|\| se->instance_id == 0' failed", 'loglevel':logging.ERROR}, - {'exitcode':-11, 'device':'stm32f205-soc', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'xlnx,zynqmp', 
'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'mips-cps', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'gus', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'a9mpcore_priv', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'a15mpcore_priv', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'isa-serial', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'sb16', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'device':'cs4231a', 'loglevel':logging.ERROR, 'expected':True}, - {'exitcode':-11, 'device':'arm-gicv3', 'loglevel':logging.ERROR, 'expected':True}, {'exitcode':-11, 'machine':'isapc', 'device':'.*-iommu', 'loglevel':logging.ERROR, 'expected':True}, # everything else (including SIGABRT and SIGSEGV) will be a fatal error: From 983768431676f9ab8599a0b4813e1ca17af70838 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 11 Dec 2017 15:28:04 +0800 Subject: [PATCH 15/19] hostmem-file: add "align" option When mmap(2) the backend files, QEMU uses the host page size (getpagesize(2)) by default as the alignment of mapping address. However, some backends may require alignments different than the page size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned, fails with a kernel message like [617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff) Because there is no common approach to get such alignment requirement, we add the 'align' option to 'memory-backend-file', so that users or management utils, which have enough knowledge about the backend, can specify a proper alignment via this option. Signed-off-by: Haozhong Zhang Message-Id: <20171211072806.2812-2-haozhong.zhang@intel.com> Reviewed-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi [ehabkost: fixed typo, fixed error_setg() format string] Signed-off-by: Eduardo Habkost --- backends/hostmem-file.c | 41 ++++++++++++++++++++++++++++++++++++++++- docs/nvdimm.txt | 16 ++++++++++++++++ exec.c | 8 +++++++- include/exec/memory.h | 3 +++ memory.c | 2 ++ numa.c | 2 +- qemu-options.hx | 9 ++++++++- 7 files changed, 77 insertions(+), 4 deletions(-) diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index e44c319915..e319ec1ad8 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -34,6 +34,7 @@ struct HostMemoryBackendFile { bool share; bool discard_data; char *mem_path; + uint64_t align; }; static void @@ -58,7 +59,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) path = object_get_canonical_path(OBJECT(backend)); memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), path, - backend->size, fb->share, + backend->size, fb->align, fb->share, fb->mem_path, errp); g_free(path); } @@ -115,6 +116,40 @@ static void file_memory_backend_set_discard_data(Object *o, bool value, MEMORY_BACKEND_FILE(o)->discard_data = value; } +static void file_memory_backend_get_align(Object *o, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + uint64_t val = fb->align; + + visit_type_size(v, name, &val, errp); +} + +static void file_memory_backend_set_align(Object *o, Visitor *v, + const char *name, void *opaque, + Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(o); + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + Error *local_err = NULL; + uint64_t val; + + if (host_memory_backend_mr_inited(backend)) { + error_setg(&local_err, "cannot change property value"); + goto out; + } + + visit_type_size(v, name, &val, &local_err); + if (local_err) { + goto out; + } + fb->align = val; + + out: + error_propagate(errp, local_err); +} + static void file_backend_unparent(Object *obj) { HostMemoryBackend 
*backend = MEMORY_BACKEND(obj); @@ -145,6 +180,10 @@ file_backend_class_init(ObjectClass *oc, void *data) object_class_property_add_str(oc, "mem-path", get_mem_path, set_mem_path, &error_abort); + object_class_property_add(oc, "align", "int", + file_memory_backend_get_align, + file_memory_backend_set_align, + NULL, NULL, &error_abort); } static void file_backend_instance_finalize(Object *o) diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt index 2d9f8c0e8c..21249dd062 100644 --- a/docs/nvdimm.txt +++ b/docs/nvdimm.txt @@ -122,3 +122,19 @@ Note: M >= size of RAM devices + size of statically plugged vNVDIMM devices + size of hotplugged vNVDIMM devices + +Alignment +--------- + +QEMU uses mmap(2) to map vNVDIMM backends and aligns the mapping +address to the page size (getpagesize(2)) by default. However, some +types of backends may require an alignment different than the page +size. In that case, QEMU v2.12.0 and later provide the 'align' option to +memory-backend-file to allow users to specify the proper alignment.
+ +For example, device DAX requires 2 MB alignment, so we can use the +following QEMU command line options to use it (/dev/dax0.0) as the +backend of vNVDIMM: + + -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M + -device nvdimm,id=nvdimm1,memdev=mem1 diff --git a/exec.c b/exec.c index d28fc0cd3d..629a508385 100644 --- a/exec.c +++ b/exec.c @@ -1612,7 +1612,13 @@ static void *file_ram_alloc(RAMBlock *block, void *area; block->page_size = qemu_fd_getpagesize(fd); - block->mr->align = block->page_size; + if (block->mr->align % block->page_size) { + error_setg(errp, "alignment 0x%" PRIx64 + " must be multiples of page size 0x%zx", + block->mr->align, block->page_size); + return NULL; + } + block->mr->align = MAX(block->page_size, block->mr->align); #if defined(__s390x__) if (kvm_enabled()) { block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN); diff --git a/include/exec/memory.h b/include/exec/memory.h index a4cabdf44c..07c5d6d597 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -465,6 +465,8 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr, * @name: Region name, becomes part of RAMBlock name used in migration stream * must be unique within any device * @size: size of the region. + * @align: alignment of the region base address; if 0, the default alignment + * (getpagesize()) will be used. * @share: %true if memory must be mmaped with the MAP_SHARED flag * @path: the path in which to allocate the RAM. * @errp: pointer to Error*, to store an error if it happens.
@@ -476,6 +478,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, + uint64_t align, bool share, const char *path, Error **errp); diff --git a/memory.c b/memory.c index 4b41fb837b..449a1429b9 100644 --- a/memory.c +++ b/memory.c @@ -1570,6 +1570,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, struct Object *owner, const char *name, uint64_t size, + uint64_t align, bool share, const char *path, Error **errp) @@ -1578,6 +1579,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, mr->ram = true; mr->terminates = true; mr->destructor = memory_region_destructor_ram; + mr->align = align; mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp); mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0; } diff --git a/numa.c b/numa.c index 7b9c33ad12..83675a03f3 100644 --- a/numa.c +++ b/numa.c @@ -456,7 +456,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner, if (mem_path) { #ifdef __linux__ Error *err = NULL; - memory_region_init_ram_from_file(mr, owner, name, ram_size, false, + memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false, mem_path, &err); if (err) { error_report_err(err); diff --git a/qemu-options.hx b/qemu-options.hx index b3e03c5464..5ff741a4af 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -3974,7 +3974,7 @@ property must be set. 
These objects are placed in the @table @option -@item -object memory-backend-file,id=@var{id},size=@var{size},mem-path=@var{dir},share=@var{on|off},discard-data=@var{on|off},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave} +@item -object memory-backend-file,id=@var{id},size=@var{size},mem-path=@var{dir},share=@var{on|off},discard-data=@var{on|off},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave},align=@var{align} Creates a memory file backend object, which can be used to back the guest RAM with huge pages. @@ -4027,6 +4027,13 @@ restrict memory allocation to the given host node list interleave memory allocations across the given host node list @end table +The @option{align} option specifies the base address alignment when +QEMU mmap(2) @option{mem-path}, and accepts common suffixes, eg +@option{2M}. Some backend store specified by @option{mem-path} +requires an alignment different than the default one used by QEMU, eg +the device DAX /dev/dax0.0 requires 2M alignment rather than 4K. In +such cases, users can specify the required alignment via this option. + @item -object memory-backend-ram,id=@var{id},merge=@var{on|off},dump=@var{on|off},prealloc=@var{on|off},size=@var{size},host-nodes=@var{host-nodes},policy=@var{default|preferred|bind|interleave} Creates a memory backend object, which can be used to back the guest RAM. From da6789c27c2ea71765cfab04bad9a42b5426f0bd Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 11 Dec 2017 15:28:05 +0800 Subject: [PATCH 16/19] nvdimm: add a macro for property "label-size" Signed-off-by: Haozhong Zhang Reviewed-by: Stefan Hajnoczi Message-Id: <20171211072806.2812-3-haozhong.zhang@intel.com> Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Eduardo Habkost --- hw/mem/nvdimm.c | 2 +- include/hw/mem/nvdimm.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 952fce5ec8..618c3d677b 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -66,7 +66,7 @@ out: static void nvdimm_init(Object *obj) { - object_property_add(obj, "label-size", "int", + object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); } diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 03e1ff9558..28e68ddf59 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -47,6 +47,9 @@ #define NVDIMM_CLASS(oc) OBJECT_CLASS_CHECK(NVDIMMClass, (oc), TYPE_NVDIMM) #define NVDIMM_GET_CLASS(obj) OBJECT_GET_CLASS(NVDIMMClass, (obj), \ TYPE_NVDIMM) + +#define NVDIMM_LABLE_SIZE_PROP "label-size" + struct NVDIMMDevice { /* private */ PCDIMMDevice parent_obj; From cb836434cda103fac3c06174e70bf5c9b7083b8e Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 11 Dec 2017 15:28:06 +0800 Subject: [PATCH 17/19] nvdimm: add 'unarmed' option Currently the only vNVDIMM backend can guarantee the guest write persistence is device DAX on Linux, because no host-side kernel cache is involved in the guest access to it. The approach to detect whether the backend is device DAX needs to access sysfs, which may not work with SELinux. Instead, we add the 'unarmed' option to device 'nvdimm', so that users or management utils, which have enough knowledge about the backend, can control the unarmed flag in guest ACPI NFIT via this option. The guest Linux NVDIMM driver, for example, will mark the corresponding vNVDIMM device read-only if the unarmed flag in guest NFIT is set. The default value of 'unarmed' option is 'off' in order to keep the backwards compatibility. Signed-off-by: Haozhong Zhang Message-Id: <20171211072806.2812-4-haozhong.zhang@intel.com> Reviewed-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi Signed-off-by: Eduardo Habkost --- docs/nvdimm.txt | 15 +++++++++++++++ hw/acpi/nvdimm.c | 7 +++++++ hw/mem/nvdimm.c | 26 ++++++++++++++++++++++++++ include/hw/mem/nvdimm.h | 9 +++++++++ 4 files changed, 57 insertions(+) diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt index 21249dd062..e903d8bb09 100644 --- a/docs/nvdimm.txt +++ b/docs/nvdimm.txt @@ -138,3 +138,18 @@ backend of vNVDIMM: -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M -device nvdimm,id=nvdimm1,memdev=mem1 + +Guest Data Persistence +---------------------- + +Though QEMU supports multiple types of vNVDIMM backends on Linux, +currently the only one that can guarantee the guest write persistence +is the device DAX on the real NVDIMM device (e.g., /dev/dax0.0), for +which guest accesses do not involve any host-side kernel cache. + +When using other types of backends, it's suggested to set the 'unarmed' +option of '-device nvdimm' to 'on', which sets the unarmed flag of the +guest NVDIMM region mapping structure. This unarmed flag indicates to +guest software that this vNVDIMM device contains a region that cannot +accept persistent writes. As a result, for example, the guest Linux +NVDIMM driver marks such a vNVDIMM device as read-only.
diff --git a/hw/acpi/nvdimm.c b/hw/acpi/nvdimm.c index 6ceea196e7..59d6e4254c 100644 --- a/hw/acpi/nvdimm.c +++ b/hw/acpi/nvdimm.c @@ -138,6 +138,8 @@ struct NvdimmNfitMemDev { } QEMU_PACKED; typedef struct NvdimmNfitMemDev NvdimmNfitMemDev; +#define ACPI_NFIT_MEM_NOT_ARMED (1 << 3) + /* * NVDIMM Control Region Structure * @@ -284,6 +286,7 @@ static void nvdimm_build_structure_memdev(GArray *structures, DeviceState *dev) { NvdimmNfitMemDev *nfit_memdev; + NVDIMMDevice *nvdimm = NVDIMM(OBJECT(dev)); uint64_t size = object_property_get_uint(OBJECT(dev), PC_DIMM_SIZE_PROP, NULL); int slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, @@ -312,6 +315,10 @@ nvdimm_build_structure_memdev(GArray *structures, DeviceState *dev) /* Only one interleave for PMEM. */ nfit_memdev->interleave_ways = cpu_to_le16(1); + + if (nvdimm->unarmed) { + nfit_memdev->flags |= cpu_to_le16(ACPI_NFIT_MEM_NOT_ARMED); + } } /* diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c index 618c3d677b..61e677f92f 100644 --- a/hw/mem/nvdimm.c +++ b/hw/mem/nvdimm.c @@ -25,6 +25,7 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qapi/visitor.h" +#include "qapi-visit.h" #include "hw/mem/nvdimm.h" static void nvdimm_get_label_size(Object *obj, Visitor *v, const char *name, @@ -64,11 +65,36 @@ out: error_propagate(errp, local_err); } +static bool nvdimm_get_unarmed(Object *obj, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(obj); + + return nvdimm->unarmed; +} + +static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(obj); + Error *local_err = NULL; + + if (memory_region_size(&nvdimm->nvdimm_mr)) { + error_setg(&local_err, "cannot change property value"); + goto out; + } + + nvdimm->unarmed = value; + + out: + error_propagate(errp, local_err); +} + static void nvdimm_init(Object *obj) { object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int", nvdimm_get_label_size, nvdimm_set_label_size, NULL, NULL, NULL); + object_property_add_bool(obj, 
NVDIMM_UNARMED_PROP, + nvdimm_get_unarmed, nvdimm_set_unarmed, NULL); } static MemoryRegion *nvdimm_get_memory_region(PCDIMMDevice *dimm, Error **errp) diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h index 28e68ddf59..7fd87c4e1c 100644 --- a/include/hw/mem/nvdimm.h +++ b/include/hw/mem/nvdimm.h @@ -49,6 +49,7 @@ TYPE_NVDIMM) #define NVDIMM_LABLE_SIZE_PROP "label-size" +#define NVDIMM_UNARMED_PROP "unarmed" struct NVDIMMDevice { /* private */ @@ -74,6 +75,14 @@ struct NVDIMMDevice { * guest via ACPI NFIT and _FIT method if NVDIMM hotplug is supported. */ MemoryRegion nvdimm_mr; + + /* + * The 'on' value results in the unarmed flag set in ACPI NFIT, + * which can be used to notify guest implicitly that the host + * backend (e.g., files on HDD, /dev/pmemX, etc.) cannot guarantee + * the guest write persistence. + */ + bool unarmed; }; typedef struct NVDIMMDevice NVDIMMDevice; From d342eb7662bcfe47c26615b67025ae59a383489d Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Wed, 10 Jan 2018 16:22:50 +0100 Subject: [PATCH 18/19] possible_cpus: add CPUArchId::type field Remove dependency of possible_cpus on 1st CPU instance, which decouples configuration data from CPU instances that are created using that data. Also later it would be used for enabling early cpu to numa node configuration at runtime qmp_query_hotpluggable_cpus() should provide a list of available cpu slots at early stage, before machine_init() is called and the 1st cpu is created, so that mgmt might be able to call it and use output to set numa mapping. Use MachineClass::possible_cpu_arch_ids() callback to set cpu type info, along with the rest of possible cpu properties, to let machine define which cpu type* will be used. * for SPAPR it will be a spapr core type and for ARM/s390x/x86 a respective descendant of CPUClass. Move parse_numa_opts() in vl.c after cpu_model is parsed into cpu_type so that possible_cpu_arch_ids() would know which cpu_type to use during layout initialization. 
Signed-off-by: Igor Mammedov Reviewed-by: David Gibson Message-Id: <1515597770-268979-1-git-send-email-imammedo@redhat.com> Signed-off-by: Eduardo Habkost --- hw/arm/virt.c | 3 ++- hw/core/machine.c | 12 ++++++------ hw/i386/pc.c | 4 +++- hw/ppc/spapr.c | 13 ++++++++----- hw/s390x/s390-virtio-ccw.c | 1 + include/hw/boards.h | 2 ++ vl.c | 3 +-- 7 files changed, 23 insertions(+), 15 deletions(-) diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 4a6fdcc4f5..a4537af400 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -1359,7 +1359,7 @@ static void machvirt_init(MachineState *machine) break; } - cpuobj = object_new(machine->cpu_type); + cpuobj = object_new(possible_cpus->cpus[n].type); object_property_set_int(cpuobj, possible_cpus->cpus[n].arch_id, "mp-affinity", NULL); @@ -1575,6 +1575,7 @@ static const CPUArchIdList *virt_possible_cpu_arch_ids(MachineState *ms) sizeof(CPUArchId) * max_cpus); ms->possible_cpus->len = max_cpus; for (n = 0; n < ms->possible_cpus->len; n++) { + ms->possible_cpus->cpus[n].type = ms->cpu_type; ms->possible_cpus->cpus[n].arch_id = virt_cpu_mp_affinity(vms, n); ms->possible_cpus->cpus[n].props.has_thread_id = true; diff --git a/hw/core/machine.c b/hw/core/machine.c index 0320a8efa1..cdc1163dc6 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -377,18 +377,18 @@ static void machine_init_notify(Notifier *notifier, void *data) HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine) { int i; - Object *cpu; HotpluggableCPUList *head = NULL; - const char *cpu_type; + MachineClass *mc = MACHINE_GET_CLASS(machine); + + /* force board to initialize possible_cpus if it hasn't been done yet */ + mc->possible_cpu_arch_ids(machine); - cpu = machine->possible_cpus->cpus[0].cpu; - assert(cpu); /* Boot cpu is always present */ - cpu_type = object_get_typename(cpu); for (i = 0; i < machine->possible_cpus->len; i++) { + Object *cpu; HotpluggableCPUList *list_item = g_new0(typeof(*list_item), 1); HotpluggableCPU *cpu_item = 
g_new0(typeof(*cpu_item), 1); - cpu_item->type = g_strdup(cpu_type); + cpu_item->type = g_strdup(machine->possible_cpus->cpus[i].type); cpu_item->vcpus_count = machine->possible_cpus->cpus[i].vcpus_count; cpu_item->props = g_memdup(&machine->possible_cpus->cpus[i].props, sizeof(*cpu_item->props)); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 55686bf5d8..ccc50baa85 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1148,7 +1148,8 @@ void pc_cpus_init(PCMachineState *pcms) pcms->apic_id_limit = x86_cpu_apic_id_from_index(max_cpus - 1) + 1; possible_cpus = mc->possible_cpu_arch_ids(ms); for (i = 0; i < smp_cpus; i++) { - pc_new_cpu(ms->cpu_type, possible_cpus->cpus[i].arch_id, &error_fatal); + pc_new_cpu(possible_cpus->cpus[i].type, possible_cpus->cpus[i].arch_id, + &error_fatal); } } @@ -2307,6 +2308,7 @@ static const CPUArchIdList *pc_possible_cpu_arch_ids(MachineState *ms) for (i = 0; i < ms->possible_cpus->len; i++) { X86CPUTopoInfo topo; + ms->possible_cpus->cpus[i].type = ms->cpu_type; ms->possible_cpus->cpus[i].vcpus_count = 1; ms->possible_cpus->cpus[i].arch_id = x86_cpu_apic_id_from_index(i); x86_topo_ids_from_apicid(ms->possible_cpus->cpus[i].arch_id, diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 278f9de1e7..a781dd22e7 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2226,11 +2226,6 @@ static void spapr_init_cpus(sPAPRMachineState *spapr) int boot_cores_nr = smp_cpus / smp_threads; int i; - if (!type) { - error_report("Unable to find sPAPR CPU Core definition"); - exit(1); - } - possible_cpus = mc->possible_cpu_arch_ids(machine); if (mc->has_hotpluggable_cpus) { if (smp_cpus % smp_threads) { @@ -3545,6 +3540,7 @@ static int64_t spapr_get_default_cpu_node_id(const MachineState *ms, int idx) static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine) { int i; + const char *core_type; int spapr_max_cores = max_cpus / smp_threads; MachineClass *mc = MACHINE_GET_CLASS(machine); @@ -3556,12 +3552,19 @@ static const CPUArchIdList 
*spapr_possible_cpu_arch_ids(MachineState *machine) return machine->possible_cpus; } + core_type = spapr_get_cpu_core_type(machine->cpu_type); + if (!core_type) { + error_report("Unable to find sPAPR CPU Core definition"); + exit(1); + } + machine->possible_cpus = g_malloc0(sizeof(CPUArchIdList) + sizeof(CPUArchId) * spapr_max_cores); machine->possible_cpus->len = spapr_max_cores; for (i = 0; i < machine->possible_cpus->len; i++) { int core_id = i * smp_threads; + machine->possible_cpus->cpus[i].type = core_type; machine->possible_cpus->cpus[i].vcpus_count = smp_threads; machine->possible_cpus->cpus[i].arch_id = core_id; machine->possible_cpus->cpus[i].props.has_core_id = true; diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 35df7e19c5..3807dcb097 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -414,6 +414,7 @@ static const CPUArchIdList *s390_possible_cpu_arch_ids(MachineState *ms) sizeof(CPUArchId) * max_cpus); ms->possible_cpus->len = max_cpus; for (i = 0; i < ms->possible_cpus->len; i++) { + ms->possible_cpus->cpus[i].type = ms->cpu_type; ms->possible_cpus->cpus[i].vcpus_count = 1; ms->possible_cpus->cpus[i].arch_id = i; ms->possible_cpus->cpus[i].props.has_core_id = true; diff --git a/include/hw/boards.h b/include/hw/boards.h index 041bc08971..efb0a9edfd 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -83,6 +83,7 @@ void machine_class_allow_dynamic_sysbus_dev(MachineClass *mc, const char *type); * CPUArchId: * @arch_id - architecture-dependent CPU ID of present or possible CPU * @cpu - pointer to corresponding CPU object if it's present on NULL otherwise + * @type - QOM class name of possible @cpu object * @props - CPU object properties, initialized by board * #vcpus_count - number of threads provided by @cpu object */ @@ -91,6 +92,7 @@ typedef struct { int64_t vcpus_count; CpuInstanceProperties props; Object *cpu; + const char *type; } CPUArchId; /** diff --git a/vl.c b/vl.c index 
2586f25952..e725ecbc08 100644 --- a/vl.c +++ b/vl.c @@ -4611,8 +4611,6 @@ int main(int argc, char **argv, char **envp) current_machine->boot_order = boot_order; current_machine->cpu_model = cpu_model; - parse_numa_opts(current_machine); - /* parse features once if machine provides default cpu_type */ if (machine_class->default_cpu_type) { current_machine->cpu_type = machine_class->default_cpu_type; @@ -4621,6 +4619,7 @@ int main(int argc, char **argv, char **envp) cpu_parse_cpu_model(machine_class->default_cpu_type, cpu_model); } } + parse_numa_opts(current_machine); machine_run_board_init(current_machine); From d6b6abc51dda79a97f2c7bd6652c1940c068f1ec Mon Sep 17 00:00:00 2001 From: Marcel Apfelbaum Date: Mon, 8 Jan 2018 23:50:07 +0200 Subject: [PATCH 19/19] fw_cfg: fix memory corruption when all fw_cfg slots are used When all the fw_cfg slots are used, a write is made outside the bounds of the fw_cfg files array as part of the sort algorithm. Fix it by avoiding an unnecessary array element move. Fix also an assert while at it. Signed-off-by: Marcel Apfelbaum Message-Id: <20180108215007.46471-1-marcel@redhat.com> Reviewed-by: Laszlo Ersek Signed-off-by: Eduardo Habkost --- hw/nvram/fw_cfg.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c index 753ac0e4ea..4313484b21 100644 --- a/hw/nvram/fw_cfg.c +++ b/hw/nvram/fw_cfg.c @@ -784,7 +784,7 @@ void fw_cfg_add_file_callback(FWCfgState *s, const char *filename, * index and "i - 1" is the one being copied from, thus the * unusual start and end in the for statement. 
*/ - for (i = count + 1; i > index; i--) { + for (i = count; i > index; i--) { s->files->f[i] = s->files->f[i - 1]; s->files->f[i].select = cpu_to_be16(FW_CFG_FILE_FIRST + i); s->entries[0][FW_CFG_FILE_FIRST + i] = @@ -833,7 +833,6 @@ void *fw_cfg_modify_file(FWCfgState *s, const char *filename, assert(s->files); index = be32_to_cpu(s->files->count); - assert(index < fw_cfg_file_slots(s)); for (i = 0; i < index; i++) { if (strcmp(filename, s->files->f[i].name) == 0) { @@ -843,6 +842,9 @@ void *fw_cfg_modify_file(FWCfgState *s, const char *filename, return ptr; } } + + assert(index < fw_cfg_file_slots(s)); + /* add new one */ fw_cfg_add_file_callback(s, filename, NULL, NULL, NULL, data, len, true); return NULL;