From c10aaaab0fcfaf7ad68ccbc07a9069d7f6310532 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 24 May 2019 15:48:18 +1000 Subject: [PATCH 01/44] tests: Fix up docker cross builds for ppc64 (BE) targets We currently have docker cross building targets for powerpc (32-bit, BE) and ppc64el (64-bit, LE), but not for pcp64 (64-bit, BE). This is an irritating gap in make check-tcg coverage so correct it. Signed-off-by: David Gibson --- tests/docker/Makefile.include | 1 + tests/docker/dockerfiles/debian-ppc64-cross.docker | 11 +++++++++++ tests/tcg/ppc/Makefile.include | 3 +++ 3 files changed, 15 insertions(+) create mode 100644 tests/docker/dockerfiles/debian-ppc64-cross.docker diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include index c0e1bf57a3..aaf5396b85 100644 --- a/tests/docker/Makefile.include +++ b/tests/docker/Makefile.include @@ -107,6 +107,7 @@ docker-image-debian-sparc64-cross: docker-image-debian-sid docker-image-debian-mips64-cross: docker-image-debian-sid docker-image-debian-riscv64-cross: docker-image-debian-sid docker-image-debian-powerpc-cross: docker-image-debian-sid +docker-image-debian-ppc64-cross: docker-image-debian-sid docker-image-travis: NOUSER=1 # Specialist build images, sometimes very limited tools diff --git a/tests/docker/dockerfiles/debian-ppc64-cross.docker b/tests/docker/dockerfiles/debian-ppc64-cross.docker new file mode 100644 index 0000000000..7f239c322d --- /dev/null +++ b/tests/docker/dockerfiles/debian-ppc64-cross.docker @@ -0,0 +1,11 @@ +# +# Docker ppc64 cross-compiler target +# +# This docker target builds on the debian sid base image which +# contains cross compilers for Debian "ports" targets. +FROM qemu:debian-sid + +RUN DEBIAN_FRONTEND=noninteractive eatmydata \ + apt-get install -y --no-install-recommends \ + gcc-powerpc64-linux-gnu \ + libc6-dev-ppc64-cross || { echo "Failed to build - see debian-sid.docker notes"; exit 1; } diff --git a/tests/tcg/ppc/Makefile.include b/tests/tcg/ppc/Makefile.include index b062c30dd3..ae01fb8fad 100644 --- a/tests/tcg/ppc/Makefile.include +++ b/tests/tcg/ppc/Makefile.include @@ -1,6 +1,9 @@ ifeq ($(TARGET_NAME),ppc) DOCKER_IMAGE=debian-powerpc-cross DOCKER_CROSS_COMPILER=powerpc-linux-gnu-gcc +else ifeq ($(TARGET_NAME),ppc64) +DOCKER_IMAGE=debian-ppc64-cross +DOCKER_CROSS_COMPILER=powerpc64-linux-gnu-gcc else ifeq ($(TARGET_NAME),ppc64le) DOCKER_IMAGE=debian-ppc64el-cross DOCKER_CROSS_COMPILER=powerpc64le-linux-gnu-gcc From f8378accda78acdc601f228254e2a41b4531621b Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 1 May 2019 15:38:18 -0700 Subject: [PATCH 02/44] configure: Distinguish ppc64 and ppc64le hosts We cannot use the ppc64le host compiler to build ppc64(be) guest code. Clean up confusion between cross_cc_powerpc and cross_cc_ppc; make use of the cflags variable as well. Signed-off-by: Richard Henderson Message-Id: <20190501223819.8584-2-richard.henderson@linaro.org> [dwg: Dropped hunk relating to ppc64abi32, it doesn't test properly] Signed-off-by: David Gibson --- configure | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/configure b/configure index 528b9ff705..07eb2b3942 100755 --- a/configure +++ b/configure @@ -198,7 +198,7 @@ supported_kvm_target() { i386:i386 | i386:x86_64 | i386:x32 | \ x86_64:i386 | x86_64:x86_64 | x86_64:x32 | \ mips:mips | mipsel:mips | \ - ppc:ppc | ppc64:ppc | ppc:ppc64 | ppc64:ppc64 | \ + ppc:ppc | ppc64:ppc | ppc:ppc64 | ppc64:ppc64 | ppc64:ppc64le | \ s390x:s390x) return 0 ;; @@ -502,8 +502,11 @@ cross_cc_arm="arm-linux-gnueabihf-gcc" cross_cc_cflags_armeb="-mbig-endian" cross_cc_i386="i386-pc-linux-gnu-gcc" cross_cc_cflags_i386="" -cross_cc_powerpc="powerpc-linux-gnu-gcc" -cross_cc_powerpc="powerpc-linux-gnu-gcc" +cross_cc_ppc="powerpc-linux-gnu-gcc" +cross_cc_cflags_ppc="-m32" +cross_cc_ppc64="powerpc-linux-gnu-gcc" +cross_cc_cflags_ppc64="-m64" +cross_cc_ppc64le="powerpc64le-linux-gnu-gcc" enabled_cross_compilers="" @@ -700,7 +703,11 @@ elif check_define __sparc__ ; then fi elif check_define _ARCH_PPC ; then if check_define _ARCH_PPC64 ; then - cpu="ppc64" + if check_define _LITTLE_ENDIAN ; then + cpu="ppc64le" + else + cpu="ppc64" + fi else cpu="ppc" fi @@ -731,10 +738,14 @@ ARCH= # Note that this case should only have supported host CPUs, not guests. case "$cpu" in ppc|ppc64|s390|s390x|sparc64|x32|riscv32|riscv64) - cpu="$cpu" supported_cpu="yes" eval "cross_cc_${cpu}=\$host_cc" ;; + ppc64le) + ARCH="ppc64" + supported_cpu="yes" + cross_cc_ppc64le=$host_cc + ;; i386|i486|i586|i686|i86pc|BePC) cpu="i386" supported_cpu="yes" @@ -1538,8 +1549,8 @@ case "$cpu" in ppc) CPU_CFLAGS="-m32" LDFLAGS="-m32 $LDFLAGS" - cross_cc_powerpc=$cc - cross_cc_cflags_powerpc=$CPU_CFLAGS + cross_cc_ppc=$cc + cross_cc_cflags_ppc="$CPU_CFLAGS" ;; ppc64) CPU_CFLAGS="-m64" @@ -6191,7 +6202,7 @@ if { test "$cpu" = "i386" || test "$cpu" = "x86_64"; } && \ fi done fi -if test "$cpu" = "ppc64" && test "$targetos" != "Darwin" ; then +if test "$ARCH" = "ppc64" && test "$targetos" != "Darwin" ; then roms="$roms spapr-rtas" fi @@ -7378,7 +7389,7 @@ if test "$linux" = "yes" ; then i386|x86_64|x32) linux_arch=x86 ;; - ppc|ppc64) + ppc|ppc64|ppc64le) linux_arch=powerpc ;; s390x) @@ -7539,7 +7550,8 @@ case "$target_name" in ;; ppc) gdb_xml_files="power-core.xml power-fpu.xml power-altivec.xml power-spe.xml" - target_compiler=$cross_cc_powerpc + target_compiler=$cross_cc_ppc + target_compiler_cflags="$cross_cc_cflags_ppc" ;; ppc64) TARGET_BASE_ARCH=ppc @@ -7547,6 +7559,7 @@ case "$target_name" in mttcg=yes gdb_xml_files="power64-core.xml power-fpu.xml power-altivec.xml power-spe.xml power-vsx.xml" target_compiler=$cross_cc_ppc64 + target_compiler_cflags="$cross_cc_cflags_ppc64" ;; ppc64le) TARGET_ARCH=ppc64 From 3ff1075ab614d488c3cc3730a6d0d3c08f1965c4 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 1 May 2019 15:38:19 -0700 Subject: [PATCH 03/44] configure: Use quotes around uses of $CPU_CFLAGS About half of the values to which CPU_CFLAGS is set have multiple space separated arguments. Signed-off-by: Richard Henderson Message-Id: <20190501223819.8584-3-richard.henderson@linaro.org> Signed-off-by: David Gibson --- configure | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/configure b/configure index 07eb2b3942..991fef51ee 100755 --- a/configure +++ b/configure @@ -1556,37 +1556,37 @@ case "$cpu" in CPU_CFLAGS="-m64" LDFLAGS="-m64 $LDFLAGS" cross_cc_ppc64=$cc - cross_cc_cflags_ppc64=$CPU_CFLAGS + cross_cc_cflags_ppc64="$CPU_CFLAGS" ;; sparc) CPU_CFLAGS="-m32 -mv8plus -mcpu=ultrasparc" LDFLAGS="-m32 -mv8plus $LDFLAGS" cross_cc_sparc=$cc - cross_cc_cflags_sparc=$CPU_CFLAGS + cross_cc_cflags_sparc="$CPU_CFLAGS" ;; sparc64) CPU_CFLAGS="-m64 -mcpu=ultrasparc" LDFLAGS="-m64 $LDFLAGS" cross_cc_sparc64=$cc - cross_cc_cflags_sparc64=$CPU_CFLAGS + cross_cc_cflags_sparc64="$CPU_CFLAGS" ;; s390) CPU_CFLAGS="-m31" LDFLAGS="-m31 $LDFLAGS" cross_cc_s390=$cc - cross_cc_cflags_s390=$CPU_CFLAGS + cross_cc_cflags_s390="$CPU_CFLAGS" ;; s390x) CPU_CFLAGS="-m64" LDFLAGS="-m64 $LDFLAGS" cross_cc_s390x=$cc - cross_cc_cflags_s390x=$CPU_CFLAGS + cross_cc_cflags_s390x="$CPU_CFLAGS" ;; i386) CPU_CFLAGS="-m32" LDFLAGS="-m32 $LDFLAGS" cross_cc_i386=$cc - cross_cc_cflags_i386=$CPU_CFLAGS + cross_cc_cflags_i386="$CPU_CFLAGS" ;; x86_64) # ??? Only extremely old AMD cpus do not have cmpxchg16b. @@ -1595,13 +1595,13 @@ case "$cpu" in CPU_CFLAGS="-m64 -mcx16" LDFLAGS="-m64 $LDFLAGS" cross_cc_x86_64=$cc - cross_cc_cflags_x86_64=$CPU_CFLAGS + cross_cc_cflags_x86_64="$CPU_CFLAGS" ;; x32) CPU_CFLAGS="-mx32" LDFLAGS="-mx32 $LDFLAGS" cross_cc_i386=$cc - cross_cc_cflags_i386=$CPU_CFLAGS + cross_cc_cflags_i386="$CPU_CFLAGS" ;; # No special flags required for other host CPUs esac From 228152c27e7085fec1cc8c86259a5e21ee5c7f89 Mon Sep 17 00:00:00 2001 From: Boxuan Li Date: Wed, 1 May 2019 01:28:42 +0800 Subject: [PATCH 04/44] target/ppc/kvm: Fix trace typo Signed-off-by: Boxuan Li Message-Id: <20190430172842.27369-1-liboxuan@connect.hku.hk> Signed-off-by: David Gibson --- target/ppc/kvm.c | 2 +- target/ppc/trace-events | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 02e22e2017..1a9caf8f40 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -1721,7 +1721,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) trace_kvm_handle_dcr_write(); ret = kvmppc_handle_dcr_write(env, run->dcr.dcrn, run->dcr.data); } else { - trace_kvm_handle_drc_read(); + trace_kvm_handle_dcr_read(); ret = kvmppc_handle_dcr_read(env, run->dcr.dcrn, &run->dcr.data); } break; diff --git a/target/ppc/trace-events b/target/ppc/trace-events index 7b3cfe11fd..3dc6740706 100644 --- a/target/ppc/trace-events +++ b/target/ppc/trace-events @@ -22,7 +22,7 @@ kvm_failed_put_vpa(void) "Warning: Unable to set VPA information to KVM" kvm_failed_get_vpa(void) "Warning: Unable to get VPA information from KVM" kvm_injected_interrupt(int irq) "injected interrupt %d" kvm_handle_dcr_write(void) "handle dcr write" -kvm_handle_drc_read(void) "handle dcr read" +kvm_handle_dcr_read(void) "handle dcr read" kvm_handle_halt(void) "handle halt" kvm_handle_papr_hcall(void) "handle PAPR hypercall" kvm_handle_epr(void) "handle epr" From c50be9e1ec705512c622366f80861436eafacffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Sun, 5 May 2019 17:28:37 +0200 Subject: [PATCH 05/44] hw/ppc/prep: use TYPE_MC146818_RTC instead of a hardcoded string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20190505152839.18650-2-philmd@redhat.com> Signed-off-by: David Gibson --- hw/ppc/prep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c index b7f459d475..ebee321148 100644 --- a/hw/ppc/prep.c +++ b/hw/ppc/prep.c @@ -601,7 +601,7 @@ static int prep_set_cmos_checksum(DeviceState *dev, void *opaque) uint16_t checksum = *(uint16_t *)opaque; ISADevice *rtc; - if (object_dynamic_cast(OBJECT(dev), "mc146818rtc")) { + if (object_dynamic_cast(OBJECT(dev), TYPE_MC146818_RTC)) { rtc = ISA_DEVICE(dev); rtc_set_memory(rtc, 0x2e, checksum & 0xff); rtc_set_memory(rtc, 0x3e, checksum & 0xff); From 2e8f85189d477e1009e13235ef73c86834664571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Sun, 5 May 2019 17:28:38 +0200 Subject: [PATCH 06/44] hw/ppc/40p: Move the MC146818 RTC to the board where it belongs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MC146818 RTC was incorrectly added to the i82378 chipset in commit a04ff940974a. In the next commit (506b7ddf8893) the PReP machine use the i82378. Since the MC146818 is specific to the PReP machine, move its use there. Fixes: a04ff940974a Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20190505152839.18650-3-philmd@redhat.com> Signed-off-by: David Gibson --- hw/isa/i82378.c | 4 ---- hw/ppc/prep.c | 3 +++ 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/hw/isa/i82378.c b/hw/isa/i82378.c index a5d67bc6d7..c08970b24a 100644 --- a/hw/isa/i82378.c +++ b/hw/isa/i82378.c @@ -21,7 +21,6 @@ #include "hw/pci/pci.h" #include "hw/i386/pc.h" #include "hw/timer/i8254.h" -#include "hw/timer/mc146818rtc.h" #include "hw/audio/pcspk.h" #define TYPE_I82378 "i82378" @@ -105,9 +104,6 @@ static void i82378_realize(PCIDevice *pci, Error **errp) /* 2 82C37 (dma) */ isa = isa_create_simple(isabus, "i82374"); - - /* timer */ - isa_create_simple(isabus, TYPE_MC146818_RTC); } static void i82378_init(Object *obj) diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c index ebee321148..7a0d311d43 100644 --- a/hw/ppc/prep.c +++ b/hw/ppc/prep.c @@ -675,6 +675,9 @@ static void ibm_40p_init(MachineState *machine) qdev_prop_set_uint32(dev, "ram-size", machine->ram_size); qdev_init_nofail(dev); + /* RTC */ + isa_create_simple(isa_bus, TYPE_MC146818_RTC); + /* initialize CMOS checksums */ cmos_checksum = 0x6aa9; qbus_walk_children(BUS(isa_bus), prep_set_cmos_checksum, NULL, NULL, NULL, From 1dbe3d196d81da45df9f17d6959871c08bdb9dac Mon Sep 17 00:00:00 2001 From: Artyom Tarasenko Date: Sun, 5 May 2019 17:28:39 +0200 Subject: [PATCH 07/44] hw/ppc/40p: use 1900 as a base year MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AIX 5.1 expects the base year to be 1900. Adjust accordingly. Signed-off-by: Artyom Tarasenko Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20190505152839.18650-4-philmd@redhat.com> Signed-off-by: David Gibson --- hw/ppc/prep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c index 7a0d311d43..2a8009e20b 100644 --- a/hw/ppc/prep.c +++ b/hw/ppc/prep.c @@ -676,7 +676,9 @@ static void ibm_40p_init(MachineState *machine) qdev_init_nofail(dev); /* RTC */ - isa_create_simple(isa_bus, TYPE_MC146818_RTC); + dev = DEVICE(isa_create(isa_bus, TYPE_MC146818_RTC)); + qdev_prop_set_int32(dev, "base_year", 1900); + qdev_init_nofail(dev); /* initialize CMOS checksums */ cmos_checksum = 0x6aa9; From 83f192d34d2525f9c0053c2179bdbc69327b9158 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Mon, 6 May 2019 11:48:03 +1000 Subject: [PATCH 08/44] target/ppc: Add ibm,purr and ibm,spurr device-tree properties The ibm,purr and ibm,spurr device tree properties are used to indicate that the processor implements the Processor Utilisation of Resources Register (PURR) and Scaled Processor Utilisation of Resources Registers (SPURR), respectively. Each property has a single value which represents the level of architecture supported. A value of 1 for ibm,purr means support for the version of the PURR defined in book 3 in version 2.02 of the architecture. A value of 1 for ibm,spurr means support for the version of the SPURR defined in version 2.05 of the architecture. Add these properties for all processors for which the PURR and SPURR registers are generated. Fixes: 0da6f3fef9a "spapr: Reorganize CPU dt generation code" Signed-off-by: Suraj Jitindar Singh Message-Id: <20190506014803.21299-1-sjitindarsingh@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 2ef3ce4362..8580a8dc67 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -500,7 +500,10 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset, _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0))); if (env->spr_cb[SPR_PURR].oea_read) { - _FDT((fdt_setprop(fdt, offset, "ibm,purr", NULL, 0))); + _FDT((fdt_setprop_cell(fdt, offset, "ibm,purr", 1))); + } + if (env->spr_cb[SPR_SPURR].oea_read) { + _FDT((fdt_setprop_cell(fdt, offset, "ibm,spurr", 1))); } if (ppc_hash64_has(cpu, PPC_HASH64_1TSEG)) { From cf4e9363f7fd889d8d804c8f78e8927782c2aa48 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 7 May 2019 10:48:03 +1000 Subject: [PATCH 09/44] target/ppc: Fix xvxsigdp Fix a typo in xvxsigdp where we put both results into the lower doubleword. Fixes: dd977e4f45cb ("target/ppc: Optimize x[sv]xsigdp using deposit_i64()") Signed-off-by: Anton Blanchard Message-Id: <20190507004811.29968-1-anton@ozlabs.org> Signed-off-by: David Gibson --- target/ppc/translate/vsx-impl.inc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/translate/vsx-impl.inc.c b/target/ppc/translate/vsx-impl.inc.c index 11d9b75d01..4d8ca7cf32 100644 --- a/target/ppc/translate/vsx-impl.inc.c +++ b/target/ppc/translate/vsx-impl.inc.c @@ -1820,7 +1820,7 @@ static void gen_xvxsigdp(DisasContext *ctx) tcg_gen_movi_i64(t0, 0x0010000000000000); tcg_gen_movcond_i64(TCG_COND_EQ, t0, exp, zr, zr, t0); tcg_gen_movcond_i64(TCG_COND_EQ, t0, exp, nan, zr, t0); - tcg_gen_deposit_i64(xth, t0, xbl, 0, 52); + tcg_gen_deposit_i64(xtl, t0, xbl, 0, 52); set_cpu_vsrl(xT(ctx->opcode), xtl); tcg_temp_free_i64(t0); From d47a751adab7833e9831408376077bc8dba41d5d Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 7 May 2019 10:48:05 +1000 Subject: [PATCH 10/44] target/ppc: Fix xxbrq, xxbrw Fix a typo in xxbrq and xxbrw where we put both results into the lower doubleword. Fixes: 8b3b2d75c7c0 ("introduce get_cpu_vsr{l,h}() and set_cpu_vsr{l,h}() helpers for VSR register access") Signed-off-by: Anton Blanchard Message-Id: <20190507004811.29968-3-anton@ozlabs.org> Signed-off-by: David Gibson --- target/ppc/translate/vsx-impl.inc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/ppc/translate/vsx-impl.inc.c b/target/ppc/translate/vsx-impl.inc.c index 4d8ca7cf32..d29f60e2f9 100644 --- a/target/ppc/translate/vsx-impl.inc.c +++ b/target/ppc/translate/vsx-impl.inc.c @@ -1192,7 +1192,7 @@ static void gen_xxbrq(DisasContext *ctx) tcg_gen_bswap64_i64(xtl, xbh); set_cpu_vsrl(xT(ctx->opcode), xtl); tcg_gen_mov_i64(xth, t0); - set_cpu_vsrl(xT(ctx->opcode), xth); + set_cpu_vsrh(xT(ctx->opcode), xth); tcg_temp_free_i64(t0); tcg_temp_free_i64(xth); @@ -1220,7 +1220,7 @@ static void gen_xxbrw(DisasContext *ctx) get_cpu_vsrl(xbl, xB(ctx->opcode)); gen_bswap32x4(xth, xtl, xbh, xbl); - set_cpu_vsrl(xT(ctx->opcode), xth); + set_cpu_vsrh(xT(ctx->opcode), xth); set_cpu_vsrl(xT(ctx->opcode), xtl); tcg_temp_free_i64(xth); From 63be02fc69d44442dc7eb316d44f1c1fbe49c075 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 7 May 2019 10:48:08 +1000 Subject: [PATCH 11/44] target/ppc: Fix vslv and vsrv vslv and vsrv are broken on little endian, we append 00 to the high byte not the low byte. Fix it by using the VsrB() accessor. Signed-off-by: Anton Blanchard Message-Id: <20190507004811.29968-6-anton@ozlabs.org> Signed-off-by: David Gibson --- target/ppc/int_helper.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 9af779ad38..2bad2d5620 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1815,10 +1815,10 @@ void helper_vslv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) size = ARRAY_SIZE(r->u8); for (i = 0; i < size; i++) { - shift = b->u8[i] & 0x7; /* extract shift value */ - bytes = (a->u8[i] << 8) + /* extract adjacent bytes */ - (((i + 1) < size) ? a->u8[i + 1] : 0); - r->u8[i] = (bytes << shift) >> 8; /* shift and store result */ + shift = b->VsrB(i) & 0x7; /* extract shift value */ + bytes = (a->VsrB(i) << 8) + /* extract adjacent bytes */ + (((i + 1) < size) ? a->VsrB(i + 1) : 0); + r->VsrB(i) = (bytes << shift) >> 8; /* shift and store result */ } } @@ -1833,10 +1833,10 @@ void helper_vsrv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) * order will guarantee that computed result is not fed back. */ for (i = ARRAY_SIZE(r->u8) - 1; i >= 0; i--) { - shift = b->u8[i] & 0x7; /* extract shift value */ - bytes = ((i ? a->u8[i - 1] : 0) << 8) + a->u8[i]; + shift = b->VsrB(i) & 0x7; /* extract shift value */ + bytes = ((i ? a->VsrB(i - 1) : 0) << 8) + a->VsrB(i); /* extract adjacent bytes */ - r->u8[i] = (bytes >> shift) & 0xFF; /* shift and store result */ + r->VsrB(i) = (bytes >> shift) & 0xFF; /* shift and store result */ } } From 7fa0ddc1d63806769d1b6246a62708d3bde39037 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 7 May 2019 10:48:11 +1000 Subject: [PATCH 12/44] target/ppc: Fix vsum2sws A recent cleanup changed the pre zeroing of the result from 64 bit to 32 bit operations: - result.u64[i] = 0; + result.VsrW(i) = 0; This corrupts the result. Fixes: 60594fea298d ("target/ppc: remove various HOST_WORDS_BIGENDIAN hacks in int_helper.c") Signed-off-by: Anton Blanchard Message-Id: <20190507004811.29968-9-anton@ozlabs.org> Signed-off-by: David Gibson --- target/ppc/int_helper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 2bad2d5620..093ef74b59 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -2053,7 +2053,7 @@ void helper_vsum2sws(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) for (i = 0; i < ARRAY_SIZE(r->u64); i++) { int64_t t = (int64_t)b->VsrSW(upper + i * 2); - result.VsrW(i) = 0; + result.VsrD(i) = 0; for (j = 0; j < ARRAY_SIZE(r->u64); j++) { t += a->VsrSW(2 * i + j); } From 4c406ca7345e874f16110734907af970c968d727 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 9 May 2019 06:17:33 +1000 Subject: [PATCH 13/44] target/ppc: Fix xxspltib xxspltib raises a VMX or a VSX exception depending on the register set it is operating on. We had a check, but it was backwards. Fixes: f113283525a4 ("target-ppc: add xxspltib instruction") Signed-off-by: Anton Blanchard Message-Id: <20190509061713.69490488@kryten> Signed-off-by: David Gibson --- target/ppc/translate/vsx-impl.inc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/target/ppc/translate/vsx-impl.inc.c b/target/ppc/translate/vsx-impl.inc.c index d29f60e2f9..4b8f6cefe3 100644 --- a/target/ppc/translate/vsx-impl.inc.c +++ b/target/ppc/translate/vsx-impl.inc.c @@ -1355,13 +1355,13 @@ static void gen_xxspltib(DisasContext *ctx) int rt = xT(ctx->opcode); if (rt < 32) { - if (unlikely(!ctx->altivec_enabled)) { - gen_exception(ctx, POWERPC_EXCP_VPU); + if (unlikely(!ctx->vsx_enabled)) { + gen_exception(ctx, POWERPC_EXCP_VSXU); return; } } else { - if (unlikely(!ctx->vsx_enabled)) { - gen_exception(ctx, POWERPC_EXCP_VSXU); + if (unlikely(!ctx->altivec_enabled)) { + gen_exception(ctx, POWERPC_EXCP_VPU); return; } } From 7f9136f90dd2675c4cee29eed447f9213f94f3e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 8 May 2019 19:19:44 +0200 Subject: [PATCH 14/44] spapr/xive: EQ page should be naturally aligned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the OS configures the EQ page in which to receive event notifications from the XIVE interrupt controller, the page should be naturally aligned. Add this check. Signed-off-by: Cédric Le Goater Message-Id: <20190508171946.657-2-clg@kaod.org> Reviewed-by: Greg Kurz [dwg: Minor change for printf warning on some platforms] Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 097f88d460..33da1a52c6 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -993,6 +993,12 @@ static target_ulong h_int_set_queue_config(PowerPCCPU *cpu, case 16: case 21: case 24: + if (!QEMU_IS_ALIGNED(qpage, 1ul << qsize)) { + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: EQ @0x%" HWADDR_PRIx + " is not naturally aligned with %" HWADDR_PRIx "\n", + qpage, (hwaddr)1 << qsize); + return H_P4; + } end.w2 = cpu_to_be32((qpage >> 32) & 0x0fffffff); end.w3 = cpu_to_be32(qpage & 0xffffffff); end.w0 |= cpu_to_be32(END_W0_ENQUEUE); From 13df93244efbd4bb8b4cf4e26104a26033178674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 8 May 2019 19:19:45 +0200 Subject: [PATCH 15/44] spapr/xive: fix EQ page addresses above 64GB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The high order bits of the address of the OS event queue is stored in bits [4-31] of word2 of the XIVE END internal structures and the low order bits in word3. This structure is using Big Endian ordering and computing the value requires some simple arithmetic which happens to be wrong. The mask removing bits [0-3] of word2 is applied to the wrong value and the resulting address is bogus when above 64GB. Guests with more than 64GB of RAM will allocate pages for the OS event queues which will reside above the 64GB limit. In this case, the XIVE device model will wake up the CPUs in case of a notification, such as IPIs, but the update of the event queue will be written at the wrong place in memory. The result is uncertain as the guest memory is trashed and IPI are not delivered. Introduce a helper xive_end_qaddr() to compute this value correctly in all places where it is used. Signed-off-by: Cédric Le Goater Message-Id: <20190508171946.657-3-clg@kaod.org> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 3 +-- hw/intc/xive.c | 9 +++------ include/hw/ppc/xive_regs.h | 6 ++++++ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 33da1a52c6..a19e998093 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -1150,8 +1150,7 @@ static target_ulong h_int_get_queue_config(PowerPCCPU *cpu, } if (xive_end_is_enqueue(end)) { - args[1] = (uint64_t) be32_to_cpu(end->w2 & 0x0fffffff) << 32 - | be32_to_cpu(end->w3); + args[1] = xive_end_qaddr(end); args[2] = xive_get_field32(END_W0_QSIZE, end->w0) + 12; } else { args[1] = 0; diff --git a/hw/intc/xive.c b/hw/intc/xive.c index a0b87001da..dcf2fcd108 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -1042,8 +1042,7 @@ static const TypeInfo xive_source_info = { void xive_end_queue_pic_print_info(XiveEND *end, uint32_t width, Monitor *mon) { - uint64_t qaddr_base = (uint64_t) be32_to_cpu(end->w2 & 0x0fffffff) << 32 - | be32_to_cpu(end->w3); + uint64_t qaddr_base = xive_end_qaddr(end); uint32_t qsize = xive_get_field32(END_W0_QSIZE, end->w0); uint32_t qindex = xive_get_field32(END_W1_PAGE_OFF, end->w1); uint32_t qentries = 1 << (qsize + 10); @@ -1072,8 +1071,7 @@ void xive_end_queue_pic_print_info(XiveEND *end, uint32_t width, Monitor *mon) void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon) { - uint64_t qaddr_base = (uint64_t) be32_to_cpu(end->w2 & 0x0fffffff) << 32 - | be32_to_cpu(end->w3); + uint64_t qaddr_base = xive_end_qaddr(end); uint32_t qindex = xive_get_field32(END_W1_PAGE_OFF, end->w1); uint32_t qgen = xive_get_field32(END_W1_GENERATION, end->w1); uint32_t qsize = xive_get_field32(END_W0_QSIZE, end->w0); @@ -1101,8 +1099,7 @@ void xive_end_pic_print_info(XiveEND *end, uint32_t end_idx, Monitor *mon) static void xive_end_enqueue(XiveEND *end, uint32_t data) { - uint64_t qaddr_base = (uint64_t) be32_to_cpu(end->w2 & 0x0fffffff) << 32 - | be32_to_cpu(end->w3); + uint64_t qaddr_base = xive_end_qaddr(end); uint32_t qsize = xive_get_field32(END_W0_QSIZE, end->w0); uint32_t qindex = xive_get_field32(END_W1_PAGE_OFF, end->w1); uint32_t qgen = xive_get_field32(END_W1_GENERATION, end->w1); diff --git a/include/hw/ppc/xive_regs.h b/include/hw/ppc/xive_regs.h index bf36678a24..1a8c5b5e64 100644 --- a/include/hw/ppc/xive_regs.h +++ b/include/hw/ppc/xive_regs.h @@ -208,6 +208,12 @@ typedef struct XiveEND { #define xive_end_is_backlog(end) (be32_to_cpu((end)->w0) & END_W0_BACKLOG) #define xive_end_is_escalate(end) (be32_to_cpu((end)->w0) & END_W0_ESCALATE_CTL) +static inline uint64_t xive_end_qaddr(XiveEND *end) +{ + return ((uint64_t) be32_to_cpu(end->w2) & 0x0fffffff) << 32 | + be32_to_cpu(end->w3); +} + /* Notification Virtual Target (NVT) */ typedef struct XiveNVT { uint32_t w0; From fb2e8b5132fa592753e330b2675093e907265cdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 8 May 2019 19:19:46 +0200 Subject: [PATCH 16/44] spapr/xive: print out the EQ page address in the monitor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This proved to be a useful information when debugging issues with OS event queues allocated above 64GB. Signed-off-by: Cédric Le Goater Message-Id: <20190508171946.657-4-clg@kaod.org> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index a19e998093..58cc6e2b50 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -120,6 +120,7 @@ static int spapr_xive_target_to_end(uint32_t target, uint8_t prio, static void spapr_xive_end_pic_print_info(SpaprXive *xive, XiveEND *end, Monitor *mon) { + uint64_t qaddr_base = xive_end_qaddr(end); uint32_t qindex = xive_get_field32(END_W1_PAGE_OFF, end->w1); uint32_t qgen = xive_get_field32(END_W1_GENERATION, end->w1); uint32_t qsize = xive_get_field32(END_W0_QSIZE, end->w0); @@ -127,9 +128,9 @@ static void spapr_xive_end_pic_print_info(SpaprXive *xive, XiveEND *end, uint32_t nvt = xive_get_field32(END_W6_NVT_INDEX, end->w6); uint8_t priority = xive_get_field32(END_W7_F0_PRIORITY, end->w7); - monitor_printf(mon, "%3d/%d % 6d/%5d ^%d", + monitor_printf(mon, "%3d/%d % 6d/%5d @%"PRIx64" ^%d", spapr_xive_nvt_to_target(0, nvt), - priority, qindex, qentries, qgen); + priority, qindex, qentries, qaddr_base, qgen); xive_end_queue_pic_print_info(end, 6, mon); monitor_printf(mon, "]"); From f81d69fcea571af19de01377d2b474a675f573d6 Mon Sep 17 00:00:00 2001 From: Satheesh Rajendran Date: Thu, 9 May 2019 13:37:50 +0530 Subject: [PATCH 17/44] Fix typo on "info pic" monitor cmd output for xive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of LISN i.e "Logical Interrupt Source Number" as per Xive PAPR document "info pic" prints as LSIN, let's fix it. Signed-off-by: Satheesh Rajendran Message-Id: <20190509080750.21999-1-sathnaga@linux.vnet.ibm.com> Reviewed-by: Greg Kurz Reviewed-by: Cédric Le Goater Reviewed-by: Stefano Garzarella Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 58cc6e2b50..62e13ac353 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -141,7 +141,7 @@ void spapr_xive_pic_print_info(SpaprXive *xive, Monitor *mon) XiveSource *xsrc = &xive->source; int i; - monitor_printf(mon, " LSIN PQ EISN CPU/PRIO EQ\n"); + monitor_printf(mon, " LISN PQ EISN CPU/PRIO EQ\n"); for (i = 0; i < xive->nr_irqs; i++) { uint8_t pq = xive_source_esb_get(xsrc, i); From e04c5dd139a319a97d50e29c0a84b5c630e6e740 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 9 May 2019 10:35:45 +1000 Subject: [PATCH 18/44] target/ppc: Optimise VSX_LOAD_SCALAR_DS and VSX_VECTOR_LOAD_STORE A few small optimisations: In VSX_LOAD_SCALAR_DS() we can don't need to read the VSR via get_cpu_vsrh(). Split VSX_VECTOR_LOAD_STORE() into two functions. Loads only need to write the VSRs (set_cpu_vsr*()) and stores only need to read the VSRs (get_cpu_vsr*()) Thanks to Mark Cave-Ayland for the suggestions. Signed-off-by: Anton Blanchard Message-Id: <20190509103545.4a7fa71a@kryten> Reviewed-by: Mark Cave-Ayland Signed-off-by: David Gibson --- target/ppc/translate/vsx-impl.inc.c | 72 ++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/target/ppc/translate/vsx-impl.inc.c b/target/ppc/translate/vsx-impl.inc.c index 4b8f6cefe3..c39829cf33 100644 --- a/target/ppc/translate/vsx-impl.inc.c +++ b/target/ppc/translate/vsx-impl.inc.c @@ -227,7 +227,62 @@ static void gen_lxvb16x(DisasContext *ctx) tcg_temp_free_i64(xtl); } -#define VSX_VECTOR_LOAD_STORE(name, op, indexed) \ +#define VSX_VECTOR_LOAD(name, op, indexed) \ +static void gen_##name(DisasContext *ctx) \ +{ \ + int xt; \ + TCGv EA; \ + TCGv_i64 xth; \ + TCGv_i64 xtl; \ + \ + if (indexed) { \ + xt = xT(ctx->opcode); \ + } else { \ + xt = DQxT(ctx->opcode); \ + } \ + \ + if (xt < 32) { \ + if (unlikely(!ctx->vsx_enabled)) { \ + gen_exception(ctx, POWERPC_EXCP_VSXU); \ + return; \ + } \ + } else { \ + if (unlikely(!ctx->altivec_enabled)) { \ + gen_exception(ctx, POWERPC_EXCP_VPU); \ + return; \ + } \ + } \ + xth = tcg_temp_new_i64(); \ + xtl = tcg_temp_new_i64(); \ + gen_set_access_type(ctx, ACCESS_INT); \ + EA = tcg_temp_new(); \ + if (indexed) { \ + gen_addr_reg_index(ctx, EA); \ + } else { \ + gen_addr_imm_index(ctx, EA, 0x0F); \ + } \ + if (ctx->le_mode) { \ + tcg_gen_qemu_##op(xtl, EA, ctx->mem_idx, MO_LEQ); \ + set_cpu_vsrl(xt, xtl); \ + tcg_gen_addi_tl(EA, EA, 8); \ + tcg_gen_qemu_##op(xth, EA, ctx->mem_idx, MO_LEQ); \ + set_cpu_vsrh(xt, xth); \ + } else { \ + tcg_gen_qemu_##op(xth, EA, ctx->mem_idx, MO_BEQ); \ + set_cpu_vsrh(xt, xth); \ + tcg_gen_addi_tl(EA, EA, 8); \ + tcg_gen_qemu_##op(xtl, EA, ctx->mem_idx, MO_BEQ); \ + set_cpu_vsrl(xt, xtl); \ + } \ + tcg_temp_free(EA); \ + tcg_temp_free_i64(xth); \ + tcg_temp_free_i64(xtl); \ +} + +VSX_VECTOR_LOAD(lxv, ld_i64, 0) +VSX_VECTOR_LOAD(lxvx, ld_i64, 1) + +#define VSX_VECTOR_STORE(name, op, indexed) \ static void gen_##name(DisasContext *ctx) \ { \ int xt; \ @@ -265,26 +320,20 @@ static void gen_##name(DisasContext *ctx) \ } \ if (ctx->le_mode) { \ tcg_gen_qemu_##op(xtl, EA, ctx->mem_idx, MO_LEQ); \ - set_cpu_vsrl(xt, xtl); \ tcg_gen_addi_tl(EA, EA, 8); \ tcg_gen_qemu_##op(xth, EA, ctx->mem_idx, MO_LEQ); \ - set_cpu_vsrh(xt, xth); \ } else { \ tcg_gen_qemu_##op(xth, EA, ctx->mem_idx, MO_BEQ); \ - set_cpu_vsrh(xt, xth); \ tcg_gen_addi_tl(EA, EA, 8); \ tcg_gen_qemu_##op(xtl, EA, ctx->mem_idx, MO_BEQ); \ - set_cpu_vsrl(xt, xtl); \ } \ tcg_temp_free(EA); \ tcg_temp_free_i64(xth); \ tcg_temp_free_i64(xtl); \ } -VSX_VECTOR_LOAD_STORE(lxv, ld_i64, 0) -VSX_VECTOR_LOAD_STORE(stxv, st_i64, 0) -VSX_VECTOR_LOAD_STORE(lxvx, ld_i64, 1) -VSX_VECTOR_LOAD_STORE(stxvx, st_i64, 1) +VSX_VECTOR_STORE(stxv, st_i64, 0) +VSX_VECTOR_STORE(stxvx, st_i64, 1) #ifdef TARGET_PPC64 #define VSX_VECTOR_LOAD_STORE_LENGTH(name) \ @@ -329,7 +378,6 @@ static void gen_##name(DisasContext *ctx) \ return; \ } \ xth = tcg_temp_new_i64(); \ - get_cpu_vsrh(xth, rD(ctx->opcode) + 32); \ gen_set_access_type(ctx, ACCESS_INT); \ EA = tcg_temp_new(); \ gen_addr_imm_index(ctx, EA, 0x03); \ @@ -513,8 +561,8 @@ static void gen_##name(DisasContext *ctx) \ tcg_temp_free_i64(xth); \ } -VSX_LOAD_SCALAR_DS(stxsd, st64_i64) -VSX_LOAD_SCALAR_DS(stxssp, st32fs) +VSX_STORE_SCALAR_DS(stxsd, st64_i64) +VSX_STORE_SCALAR_DS(stxssp, st32fs) static void gen_mfvsrwz(DisasContext *ctx) { From 77bd8937c03dd55e57cc257951ad07c185303c3e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 9 May 2019 10:49:12 +1000 Subject: [PATCH 19/44] target/ppc: Fix xvabs[sd]p, xvnabs[sd]p, xvneg[sd]p, xvcpsgn[sd]p We were using set_cpu_vsr*() when we should have used get_cpu_vsr*(). Fixes: 8b3b2d75c7c0 ("introduce get_cpu_vsr{l,h}() and set_cpu_vsr{l,h}() helpers for VSR register access") Signed-off-by: Anton Blanchard Message-Id: <20190509104912.6b754dff@kryten> Reviewed-by: Mark Cave-Ayland Signed-off-by: David Gibson --- target/ppc/translate/vsx-impl.inc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/target/ppc/translate/vsx-impl.inc.c b/target/ppc/translate/vsx-impl.inc.c index c39829cf33..199d22da97 100644 --- a/target/ppc/translate/vsx-impl.inc.c +++ b/target/ppc/translate/vsx-impl.inc.c @@ -906,8 +906,8 @@ static void glue(gen_, name)(DisasContext *ctx) \ xbh = tcg_temp_new_i64(); \ xbl = tcg_temp_new_i64(); \ sgm = tcg_temp_new_i64(); \ - set_cpu_vsrh(xB(ctx->opcode), xbh); \ - set_cpu_vsrl(xB(ctx->opcode), xbl); \ + get_cpu_vsrh(xbh, xB(ctx->opcode)); \ + get_cpu_vsrl(xbl, xB(ctx->opcode)); \ tcg_gen_movi_i64(sgm, sgn_mask); \ switch (op) { \ case OP_ABS: { \ From e7f78db9fbb18900c724fd8ebfc46b6962203b98 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 15 May 2019 19:04:24 +0200 Subject: [PATCH 20/44] spapr/xive: Sanity checks of OV5 during CAS If a machine is started with ic-mode=xive but the guest only knows about XICS, eg. an RHEL 7.6 guest, the kernel panics. This is expected but a bit unfortunate since the crash doesn't provide much information for the end user to guess what's happening. Detect that during CAS and exit QEMU with a proper error message instead, like it is already done for the MMU. Even if this is less likely to happen, the opposite case of a guest that only knows about XIVE would certainly fail all the same if the machine is started with ic-mode=xics. Also, the only valid values a guest can pass in byte 23 of OV5 during CAS are 0b00 (XIVE legacy mode) and 0b01 (XIVE exploitation mode). Any other value is a bug, at least with the current spec. Again, it does not seem right to let the guest go on without a precise idea of the interrupt mode it asked for. Handle these cases as well. Reported-by: Satheesh Rajendran Signed-off-by: Greg Kurz Message-Id: <155793986451.464434.12887933000007255549.stgit@bahia.lan> Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 6c16d2b120..63a55614b8 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1513,6 +1513,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, bool guest_radix; Error *local_err = NULL; bool raw_mode_supported = false; + bool guest_xive; cas_pvr = cas_check_pvr(spapr, cpu, &addr, &raw_mode_supported, &local_err); if (local_err) { @@ -1545,10 +1546,17 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, error_report("guest requested hash and radix MMU, which is invalid."); exit(EXIT_FAILURE); } + if (spapr_ovec_test(ov5_guest, OV5_XIVE_BOTH)) { + error_report("guest requested an invalid interrupt mode"); + exit(EXIT_FAILURE); + } + /* The radix/hash bit in byte 24 requires special handling: */ guest_radix = spapr_ovec_test(ov5_guest, OV5_MMU_RADIX_300); spapr_ovec_clear(ov5_guest, OV5_MMU_RADIX_300); + guest_xive = spapr_ovec_test(ov5_guest, OV5_XIVE_EXPLOIT); + /* * HPT resizing is a bit of a special case, because when enabled * we assume an HPT guest will support it until it says it @@ -1632,6 +1640,22 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, ov5_updates) != 0); } + /* + * Ensure the guest asks for an interrupt mode we support; otherwise + * terminate the boot. + */ + if (guest_xive) { + if (spapr->irq->ov5 == SPAPR_OV5_XIVE_LEGACY) { + error_report("Guest requested unavailable interrupt mode (XIVE)"); + exit(EXIT_FAILURE); + } + } else { + if (spapr->irq->ov5 == SPAPR_OV5_XIVE_EXPLOIT) { + error_report("Guest requested unavailable interrupt mode (XICS)"); + exit(EXIT_FAILURE); + } + } + /* * Generate a machine reset when we have an update of the * interrupt mode. Only required when the machine supports both From 70de096748dc0243b9c6aec5458b3ff270fe13e6 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Thu, 16 May 2019 10:57:44 +1000 Subject: [PATCH 21/44] target/ppc: Set PSSCR_EC on cpu halt to prevent spurious wakeup The processor stop status and control register (PSSCR) is used to control the power saving facilities of the thread. The exit criterion bit (EC) is used to specify whether the thread should be woken by any interrupt (EC == 0) or only an interrupt enabled in the LPCR to wake the thread (EC == 1). The rtas facilities start-cpu and self-stop are used to transition a vcpu between the stopped and running states. When a vcpu is stopped it may only be started again by the start-cpu rtas call. Currently a vcpu in the stopped state will start again whenever an interrupt comes along due to PSSCR_EC being cleared, and while this is architecturally correct for a hardware thread, a vcpu is expected to only be woken by calling start-cpu. This means when performing a reboot on a tcg machine that the secondary threads will restart while the primary is still in slof, this is unsupported and causes call traces like: SLOF ********************************************************************** QEMU Starting Build Date = Jan 14 2019 18:00:39 FW Version = git-a5b428e1c1eae703 Press "s" to enter Open Firmware. qemu: fatal: Trying to deliver HV exception (MSR) 70 with no HV support NIP 6d61676963313230 LR 000000003dbe0308 CTR 6d61676963313233 XER 0000000000000000 CPU#1 MSR 0000000000000000 HID0 0000000000000000 HF 0000000000000000 iidx 3 didx 3 TB 00000026 115746031956 DECR 18446744073326238463 GPR00 000000003dbe0308 000000003e669fe0 000000003dc10700 0000000000000003 GPR04 000000003dc62198 000000003dc62178 000000003dc0ea48 0000000000000030 GPR08 000000003dc621a8 0000000000000018 000000003e466008 000000003dc50700 GPR12 c00000000093a4e0 c00000003ffff300 c00000003e533f90 0000000000000000 GPR16 0000000000000000 0000000000000000 000000003e466010 000000003dc0b040 GPR20 0000000000008000 000000000000f003 0000000000000006 000000003e66a050 GPR24 000000003dc06400 000000003dc0ae70 0000000000000003 000000000000f001 GPR28 000000003e66a060 ffffffffffffffff 6d61676963313233 0000000000000028 CR 28000222 [ E L - - - E E E ] RES ffffffffffffffff FPR00 0000000000000000 0000000000000000 0000000000000000 0000000000000000 FPR04 0000000000000000 0000000000000000 0000000000000000 0000000000000000 FPR08 0000000000000000 0000000000000000 0000000000000000 00000000311825e0 FPR12 00000000311825e0 0000000000000000 0000000000000000 0000000000000000 FPR16 0000000000000000 0000000000000000 0000000000000000 0000000000000000 FPR20 0000000000000000 0000000000000000 0000000000000000 0000000000000000 FPR24 0000000000000000 0000000000000000 0000000000000000 0000000000000000 FPR28 0000000000000000 0000000000000000 0000000000000000 0000000000000000 FPSCR 0000000000000000 SRR0 000000003dbe06b0 SRR1 0000000000080000 PVR 00000000004e1200 VRSAVE 0000000000000000 SPRG0 000000003dbe0308 SPRG1 000000003e669fe0 SPRG2 00000000000000d8 SPRG3 000000003dbe0308 SPRG4 0000000000000000 SPRG5 0000000000000000 SPRG6 0000000000000000 SPRG7 0000000000000000 HSRR0 6d61676963313230 HSRR1 0000000000000000 CFAR 000000003dbe3e64 LPCR 0000000004020008 PTCR 0000000000000000 DAR 0000000000000000 DSISR 0000000000000000 Aborted (core dumped) To fix this, set the PSSCR_EC bit when a vcpu is stopped to disable it from coming back online until the start-cpu rtas call is made. Fixes: 21c0d66a9c99 ("target/ppc: Fix support for "STOP light" states on POWER9") Signed-off-by: Suraj Jitindar Singh Message-Id: <20190516005744.24366-1-sjitindarsingh@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_cpu_core.c | 2 ++ hw/ppc/spapr_rtas.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index f04e06cdf6..5621fb9a3d 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -58,9 +58,11 @@ static void spapr_cpu_reset(void *opaque) * * Disable Power-saving mode Exit Cause exceptions for the CPU, so * we don't get spurious wakups before an RTAS start-cpu call. + * For the same reason, set PSSCR_EC. */ lpcr &= ~(LPCR_VPM0 | LPCR_VPM1 | LPCR_ISL | LPCR_KBV | pcc->lpcr_pm); lpcr |= LPCR_LPES0 | LPCR_LPES1; + env->spr[SPR_PSSCR] |= PSSCR_EC; /* Set RMLS to the max (ie, 16G) */ lpcr &= ~LPCR_RMLS; diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c index ee24212765..5bc1a93271 100644 --- a/hw/ppc/spapr_rtas.c +++ b/hw/ppc/spapr_rtas.c @@ -177,6 +177,7 @@ static void rtas_start_cpu(PowerPCCPU *callcpu, SpaprMachineState *spapr, } else { lpcr &= ~(LPCR_UPRT | LPCR_GTSE | LPCR_HR); } + env->spr[SPR_PSSCR] &= ~PSSCR_EC; } ppc_store_lpcr(newcpu, lpcr); @@ -205,8 +206,11 @@ static void rtas_stop_self(PowerPCCPU *cpu, SpaprMachineState *spapr, /* Disable Power-saving mode Exit Cause exceptions for the CPU. * This could deliver an interrupt on a dying CPU and crash the - * guest */ + * guest. + * For the same reason, set PSSCR_EC. + */ ppc_store_lpcr(cpu, env->spr[SPR_LPCR] & ~pcc->lpcr_pm); + env->spr[SPR_PSSCR] |= PSSCR_EC; cs->halted = 1; kvmppc_set_reg_ppc_online(cpu, 0); qemu_cpu_kick(cs); From 64d4a53431733fe9d50e94a5a33b15d151f7f8e9 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Fri, 17 May 2019 14:10:44 +1000 Subject: [PATCH 22/44] spapr: Add forgotten capability to migration stream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit spapr machine capabilities are supposed to be sent in the migration stream so that we can sanity check the source and destination have compatible configuration. Unfortunately, when we added the hpt-max-page-size capability, we forgot to add it to the migration state. This means that we can generate spurious warnings when both ends are configured for large pages, or potentially fail to warn if the source is configured for huge pages, but the destination is not. Fixes: 2309832afda "spapr: Maximum (HPT) pagesize property" Signed-off-by: David Gibson Reviewed-by: Cédric Le Goater --- hw/ppc/spapr.c | 1 + hw/ppc/spapr_caps.c | 1 + include/hw/ppc/spapr.h | 1 + 3 files changed, 3 insertions(+) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 8580a8dc67..bcae30ad26 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2125,6 +2125,7 @@ static const VMStateDescription vmstate_spapr = { &vmstate_spapr_cap_cfpc, &vmstate_spapr_cap_sbbc, &vmstate_spapr_cap_ibs, + &vmstate_spapr_cap_hpt_maxpagesize, &vmstate_spapr_irq_map, &vmstate_spapr_cap_nested_kvm_hv, &vmstate_spapr_dtb, diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index 9b1c10baa6..658eb15a14 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -703,6 +703,7 @@ SPAPR_CAP_MIG_STATE(dfp, SPAPR_CAP_DFP); SPAPR_CAP_MIG_STATE(cfpc, SPAPR_CAP_CFPC); SPAPR_CAP_MIG_STATE(sbbc, SPAPR_CAP_SBBC); SPAPR_CAP_MIG_STATE(ibs, SPAPR_CAP_IBS); +SPAPR_CAP_MIG_STATE(hpt_maxpagesize, SPAPR_CAP_HPT_MAXPAGESIZE); SPAPR_CAP_MIG_STATE(nested_kvm_hv, SPAPR_CAP_NESTED_KVM_HV); SPAPR_CAP_MIG_STATE(large_decr, SPAPR_CAP_LARGE_DECREMENTER); SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST); diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 7e32f309c2..9fc91c8f5e 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -849,6 +849,7 @@ extern const VMStateDescription vmstate_spapr_cap_dfp; extern const VMStateDescription vmstate_spapr_cap_cfpc; extern const VMStateDescription vmstate_spapr_cap_sbbc; extern const VMStateDescription vmstate_spapr_cap_ibs; +extern const VMStateDescription vmstate_spapr_cap_hpt_maxpagesize; extern const VMStateDescription vmstate_spapr_cap_nested_kvm_hv; extern const VMStateDescription vmstate_spapr_cap_large_decr; extern const VMStateDescription vmstate_spapr_cap_ccf_assist; From 571fbe6ccd7a159789e5d473e2837d45764197ec Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 18 May 2019 12:14:29 -0700 Subject: [PATCH 23/44] target/ppc: Use vector variable shifts for VSL, VSR, VSRA The gvec expanders take care of masking the shift amount against the element width. Signed-off-by: Richard Henderson Message-Id: <20190518191430.21686-2-richard.henderson@linaro.org> Signed-off-by: David Gibson --- target/ppc/helper.h | 12 ---------- target/ppc/int_helper.c | 37 ----------------------------- target/ppc/translate/vmx-impl.inc.c | 24 +++++++++---------- 3 files changed, 12 insertions(+), 61 deletions(-) diff --git a/target/ppc/helper.h b/target/ppc/helper.h index 638a6e99c4..02b67a333e 100644 --- a/target/ppc/helper.h +++ b/target/ppc/helper.h @@ -180,18 +180,6 @@ DEF_HELPER_3(vmuloub, void, avr, avr, avr) DEF_HELPER_3(vmulouh, void, avr, avr, avr) DEF_HELPER_3(vmulouw, void, avr, avr, avr) DEF_HELPER_3(vmuluwm, void, avr, avr, avr) -DEF_HELPER_3(vsrab, void, avr, avr, avr) -DEF_HELPER_3(vsrah, void, avr, avr, avr) -DEF_HELPER_3(vsraw, void, avr, avr, avr) -DEF_HELPER_3(vsrad, void, avr, avr, avr) -DEF_HELPER_3(vsrb, void, avr, avr, avr) -DEF_HELPER_3(vsrh, void, avr, avr, avr) -DEF_HELPER_3(vsrw, void, avr, avr, avr) -DEF_HELPER_3(vsrd, void, avr, avr, avr) -DEF_HELPER_3(vslb, void, avr, avr, avr) -DEF_HELPER_3(vslh, void, avr, avr, avr) -DEF_HELPER_3(vslw, void, avr, avr, avr) -DEF_HELPER_3(vsld, void, avr, avr, avr) DEF_HELPER_3(vslo, void, avr, avr, avr) DEF_HELPER_3(vsro, void, avr, avr, avr) DEF_HELPER_3(vsrv, void, avr, avr, avr) diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c index 093ef74b59..8ce89f2ad9 100644 --- a/target/ppc/int_helper.c +++ b/target/ppc/int_helper.c @@ -1791,23 +1791,6 @@ VSHIFT(l, 1) VSHIFT(r, 0) #undef VSHIFT -#define VSL(suffix, element, mask) \ - void helper_vsl##suffix(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ - { \ - int i; \ - \ - for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ - unsigned int shift = b->element[i] & mask; \ - \ - r->element[i] = a->element[i] << shift; \ - } \ - } -VSL(b, u8, 0x7) -VSL(h, u16, 0x0F) -VSL(w, u32, 0x1F) -VSL(d, u64, 0x3F) -#undef VSL - void helper_vslv(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int i; @@ -1980,26 +1963,6 @@ VNEG(vnegw, s32) VNEG(vnegd, s64) #undef VNEG -#define VSR(suffix, element, mask) \ - void helper_vsr##suffix(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ - { \ - int i; \ - \ - for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ - unsigned int shift = b->element[i] & mask; \ - r->element[i] = a->element[i] >> shift; \ - } \ - } -VSR(ab, s8, 0x7) -VSR(ah, s16, 0xF) -VSR(aw, s32, 0x1F) -VSR(ad, s64, 0x3F) -VSR(b, u8, 0x7) -VSR(h, u16, 0xF) -VSR(w, u32, 0x1F) -VSR(d, u64, 0x3F) -#undef VSR - void helper_vsro(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) { int sh = (b->VsrB(0xf) >> 3) & 0xf; diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c index 6861f4c5b9..663275b729 100644 --- a/target/ppc/translate/vmx-impl.inc.c +++ b/target/ppc/translate/vmx-impl.inc.c @@ -530,21 +530,21 @@ GEN_VXFORM(vmuleuw, 4, 10); GEN_VXFORM(vmulesb, 4, 12); GEN_VXFORM(vmulesh, 4, 13); GEN_VXFORM(vmulesw, 4, 14); -GEN_VXFORM(vslb, 2, 4); -GEN_VXFORM(vslh, 2, 5); -GEN_VXFORM(vslw, 2, 6); +GEN_VXFORM_V(vslb, MO_8, tcg_gen_gvec_shlv, 2, 4); +GEN_VXFORM_V(vslh, MO_16, tcg_gen_gvec_shlv, 2, 5); +GEN_VXFORM_V(vslw, MO_32, tcg_gen_gvec_shlv, 2, 6); GEN_VXFORM(vrlwnm, 2, 6); GEN_VXFORM_DUAL(vslw, PPC_ALTIVEC, PPC_NONE, \ vrlwnm, PPC_NONE, PPC2_ISA300) -GEN_VXFORM(vsld, 2, 23); -GEN_VXFORM(vsrb, 2, 8); -GEN_VXFORM(vsrh, 2, 9); -GEN_VXFORM(vsrw, 2, 10); -GEN_VXFORM(vsrd, 2, 27); -GEN_VXFORM(vsrab, 2, 12); -GEN_VXFORM(vsrah, 2, 13); -GEN_VXFORM(vsraw, 2, 14); -GEN_VXFORM(vsrad, 2, 15); +GEN_VXFORM_V(vsld, MO_64, tcg_gen_gvec_shlv, 2, 23); +GEN_VXFORM_V(vsrb, MO_8, tcg_gen_gvec_shrv, 2, 8); +GEN_VXFORM_V(vsrh, MO_16, tcg_gen_gvec_shrv, 2, 9); +GEN_VXFORM_V(vsrw, MO_32, tcg_gen_gvec_shrv, 2, 10); +GEN_VXFORM_V(vsrd, MO_64, tcg_gen_gvec_shrv, 2, 27); +GEN_VXFORM_V(vsrab, MO_8, tcg_gen_gvec_sarv, 2, 12); +GEN_VXFORM_V(vsrah, MO_16, tcg_gen_gvec_sarv, 2, 13); +GEN_VXFORM_V(vsraw, MO_32, tcg_gen_gvec_sarv, 2, 14); +GEN_VXFORM_V(vsrad, MO_64, tcg_gen_gvec_sarv, 2, 15); GEN_VXFORM(vsrv, 2, 28); GEN_VXFORM(vslv, 2, 29); GEN_VXFORM(vslo, 6, 16); From eb3cba827294e2c128cd484f423da648fada8f68 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Mon, 20 May 2019 15:38:40 +1000 Subject: [PATCH 24/44] spapr: Fix phb_placement backwards compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we added support for NVLink2 passthrough devices, we changed the phb_placement hook to handle the placement of NVLink2 bridges' specific resources. For compatibility we use a version that doesn't do this allocation for old machine types. However, because of the delay between when the patch was posted and when it was merged, we ended up with that compatibility hook applying for machine versions 3.1 and earlier whereas it should apply for 4.0 and earlier (since the patch was applied early in the 4.1 tree). Fixes: ec132efaa81 "spapr: Support NVIDIA V100 GPU with NVLink2" Reported-by: Laurent Vivier Signed-off-by: David Gibson Reviewed-by: Cédric Le Goater Reviewed-by: Greg Kurz Reviewed-by: Laurent Vivier --- hw/ppc/spapr.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index bcae30ad26..39e698e9b0 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -4411,18 +4411,7 @@ DEFINE_SPAPR_MACHINE(4_1, "4.1", true); /* * pseries-4.0 */ -static void spapr_machine_4_0_class_options(MachineClass *mc) -{ - spapr_machine_4_1_class_options(mc); - compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); -} - -DEFINE_SPAPR_MACHINE(4_0, "4.0", false); - -/* - * pseries-3.1 - */ -static void phb_placement_3_1(SpaprMachineState *spapr, uint32_t index, +static void phb_placement_4_0(SpaprMachineState *spapr, uint32_t index, uint64_t *buid, hwaddr *pio, hwaddr *mmio32, hwaddr *mmio64, unsigned n_dma, uint32_t *liobns, @@ -4434,6 +4423,20 @@ static void phb_placement_3_1(SpaprMachineState *spapr, uint32_t index, *nv2atsd = 0; } +static void spapr_machine_4_0_class_options(MachineClass *mc) +{ + SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); + + spapr_machine_4_1_class_options(mc); + compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); + smc->phb_placement = phb_placement_4_0; +} + +DEFINE_SPAPR_MACHINE(4_0, "4.0", false); + +/* + * pseries-3.1 + */ static void spapr_machine_3_1_class_options(MachineClass *mc) { SpaprMachineClass *smc = SPAPR_MACHINE_CLASS(mc); @@ -4449,7 +4452,6 @@ static void spapr_machine_3_1_class_options(MachineClass *mc) smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN; smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN; smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_OFF; - smc->phb_placement = phb_placement_3_1; } DEFINE_SPAPR_MACHINE(3_1, "3.1", false); From 75de59416df857b429f1aac028855090c69f2ea9 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Thu, 16 May 2019 09:36:57 +0200 Subject: [PATCH 25/44] spapr: Print out extra hints when CAS negotiation of interrupt mode fails Let's suggest to the user how the machine should be configured to allow the guest to boot successfully. Suggested-by: Satheesh Rajendran Signed-off-by: Greg Kurz Message-Id: <155799221739.527449.14907564571096243745.stgit@bahia.lan> Reviewed-by: Satheesh Rajendran Tested-by: Satheesh Rajendran [dwg: Adjusted for style error] Signed-off-by: David Gibson --- hw/ppc/spapr_hcall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 63a55614b8..0a050ad3d8 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1646,12 +1646,14 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, */ if (guest_xive) { if (spapr->irq->ov5 == SPAPR_OV5_XIVE_LEGACY) { - error_report("Guest requested unavailable interrupt mode (XIVE)"); + error_report( +"Guest requested unavailable interrupt mode (XIVE), try the ic-mode=xive or ic-mode=dual machine property"); exit(EXIT_FAILURE); } } else { if (spapr->irq->ov5 == SPAPR_OV5_XIVE_EXPLOIT) { - error_report("Guest requested unavailable interrupt mode (XICS)"); + error_report( +"Guest requested unavailable interrupt mode (XICS), either don't set the ic-mode machine property or try ic-mode=xics or ic-mode=dual"); exit(EXIT_FAILURE); } } From 38afd772f802ff787ea16af73b0c0d24a8c46b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:33 +0200 Subject: [PATCH 26/44] spapr/xive: add KVM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This introduces a set of helpers when KVM is in use, which create the KVM XIVE device, initialize the interrupt sources at a KVM level and connect the interrupt presenters to the vCPU. They also handle the initialization of the TIMA and the source ESB memory regions of the controller. These have a different type under KVM. They are 'ram device' memory mappings, similarly to VFIO, exposed to the guest and the associated VMAs on the host are populated dynamically with the appropriate pages using a fault handler. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-3-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/Makefile.objs | 1 + hw/intc/spapr_xive.c | 48 +++++++- hw/intc/spapr_xive_kvm.c | 237 ++++++++++++++++++++++++++++++++++++ hw/intc/xive.c | 21 +++- hw/ppc/Kconfig | 5 + hw/ppc/spapr_irq.c | 6 +- include/hw/ppc/spapr_xive.h | 10 ++ include/hw/ppc/xive.h | 13 ++ target/ppc/kvm.c | 7 ++ target/ppc/kvm_ppc.h | 6 + 10 files changed, 344 insertions(+), 10 deletions(-) create mode 100644 hw/intc/spapr_xive_kvm.c diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs index df712c3e6c..03019b9a03 100644 --- a/hw/intc/Makefile.objs +++ b/hw/intc/Makefile.objs @@ -39,6 +39,7 @@ obj-$(CONFIG_XICS_SPAPR) += xics_spapr.o obj-$(CONFIG_XICS_KVM) += xics_kvm.o obj-$(CONFIG_XIVE) += xive.o obj-$(CONFIG_XIVE_SPAPR) += spapr_xive.o +obj-$(CONFIG_XIVE_KVM) += spapr_xive_kvm.o obj-$(CONFIG_POWERNV) += xics_pnv.o pnv_xive.o obj-$(CONFIG_ALLWINNER_A10_PIC) += allwinner-a10-pic.o obj-$(CONFIG_S390_FLIC) += s390_flic.o diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 62e13ac353..27632683e6 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -174,7 +174,7 @@ void spapr_xive_pic_print_info(SpaprXive *xive, Monitor *mon) } } -static void spapr_xive_map_mmio(SpaprXive *xive) +void spapr_xive_map_mmio(SpaprXive *xive) { sysbus_mmio_map(SYS_BUS_DEVICE(xive), 0, xive->vc_base); sysbus_mmio_map(SYS_BUS_DEVICE(xive), 1, xive->end_base); @@ -251,6 +251,9 @@ static void spapr_xive_instance_init(Object *obj) object_initialize_child(obj, "end_source", &xive->end_source, sizeof(xive->end_source), TYPE_XIVE_END_SOURCE, &error_abort, NULL); + + /* Not connected to the KVM XIVE device */ + xive->fd = -1; } static void spapr_xive_realize(DeviceState *dev, Error **errp) @@ -259,6 +262,7 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp) XiveSource *xsrc = &xive->source; XiveENDSource *end_xsrc = &xive->end_source; Error *local_err = NULL; + MachineState *machine = MACHINE(qdev_get_machine()); if (!xive->nr_irqs) { error_setg(errp, "Number of interrupt needs to be greater 0"); @@ -305,6 +309,32 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp) xive->eat = g_new0(XiveEAS, xive->nr_irqs); xive->endt = g_new0(XiveEND, xive->nr_ends); + xive->nodename = g_strdup_printf("interrupt-controller@%" PRIx64, + xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT)); + + qemu_register_reset(spapr_xive_reset, dev); + + if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { + kvmppc_xive_connect(xive, &local_err); + if (local_err && machine_kernel_irqchip_required(machine)) { + error_prepend(&local_err, + "kernel_irqchip requested but unavailable: "); + error_propagate(errp, local_err); + return; + } + + if (!local_err) { + return; + } + + /* + * We failed to initialize the XIVE KVM device, fallback to + * emulated mode + */ + error_prepend(&local_err, "kernel_irqchip allowed but unavailable: "); + warn_report_err(local_err); + } + /* TIMA initialization */ memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive, "xive.tima", 4ull << TM_SHIFT); @@ -316,11 +346,6 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp) /* Map all regions */ spapr_xive_map_mmio(xive); - - xive->nodename = g_strdup_printf("interrupt-controller@%" PRIx64, - xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT)); - - qemu_register_reset(spapr_xive_reset, dev); } static int spapr_xive_get_eas(XiveRouter *xrtr, uint8_t eas_blk, @@ -495,6 +520,17 @@ bool spapr_xive_irq_claim(SpaprXive *xive, uint32_t lisn, bool lsi) if (lsi) { xive_source_irq_set_lsi(xsrc, lisn); } + + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_source_reset_one(xsrc, lisn, &local_err); + if (local_err) { + error_report_err(local_err); + return false; + } + } + return true; } diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c new file mode 100644 index 0000000000..7d9e771e8a --- /dev/null +++ b/hw/intc/spapr_xive_kvm.c @@ -0,0 +1,237 @@ +/* + * QEMU PowerPC sPAPR XIVE interrupt controller model + * + * Copyright (c) 2017-2019, IBM Corporation. + * + * This code is licensed under the GPL version 2 or later. See the + * COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "target/ppc/cpu.h" +#include "sysemu/cpus.h" +#include "sysemu/kvm.h" +#include "hw/ppc/spapr.h" +#include "hw/ppc/spapr_xive.h" +#include "hw/ppc/xive.h" +#include "kvm_ppc.h" + +#include + +/* + * Helpers for CPU hotplug + * + * TODO: make a common KVMEnabledCPU layer for XICS and XIVE + */ +typedef struct KVMEnabledCPU { + unsigned long vcpu_id; + QLIST_ENTRY(KVMEnabledCPU) node; +} KVMEnabledCPU; + +static QLIST_HEAD(, KVMEnabledCPU) + kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus); + +static bool kvm_cpu_is_enabled(CPUState *cs) +{ + KVMEnabledCPU *enabled_cpu; + unsigned long vcpu_id = kvm_arch_vcpu_id(cs); + + QLIST_FOREACH(enabled_cpu, &kvm_enabled_cpus, node) { + if (enabled_cpu->vcpu_id == vcpu_id) { + return true; + } + } + return false; +} + +static void kvm_cpu_enable(CPUState *cs) +{ + KVMEnabledCPU *enabled_cpu; + unsigned long vcpu_id = kvm_arch_vcpu_id(cs); + + enabled_cpu = g_malloc(sizeof(*enabled_cpu)); + enabled_cpu->vcpu_id = vcpu_id; + QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node); +} + +/* + * XIVE Thread Interrupt Management context (KVM) + */ + +void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) +{ + SpaprXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive; + unsigned long vcpu_id; + int ret; + + /* Check if CPU was hot unplugged and replugged. */ + if (kvm_cpu_is_enabled(tctx->cs)) { + return; + } + + vcpu_id = kvm_arch_vcpu_id(tctx->cs); + + ret = kvm_vcpu_enable_cap(tctx->cs, KVM_CAP_PPC_IRQ_XIVE, 0, xive->fd, + vcpu_id, 0); + if (ret < 0) { + error_setg(errp, "XIVE: unable to connect CPU%ld to KVM device: %s", + vcpu_id, strerror(errno)); + return; + } + + kvm_cpu_enable(tctx->cs); +} + +/* + * XIVE Interrupt Source (KVM) + */ + +/* + * At reset, the interrupt sources are simply created and MASKED. We + * only need to inform the KVM XIVE device about their type: LSI or + * MSI. + */ +void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) +{ + SpaprXive *xive = SPAPR_XIVE(xsrc->xive); + uint64_t state = 0; + + if (xive_source_irq_is_lsi(xsrc, srcno)) { + state |= KVM_XIVE_LEVEL_SENSITIVE; + if (xsrc->status[srcno] & XIVE_STATUS_ASSERTED) { + state |= KVM_XIVE_LEVEL_ASSERTED; + } + } + + kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCE, srcno, &state, + true, errp); +} + +void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp) +{ + int i; + + for (i = 0; i < xsrc->nr_irqs; i++) { + Error *local_err = NULL; + + kvmppc_xive_source_reset_one(xsrc, i, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } +} + +void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val) +{ + XiveSource *xsrc = opaque; + struct kvm_irq_level args; + int rc; + + args.irq = srcno; + if (!xive_source_irq_is_lsi(xsrc, srcno)) { + if (!val) { + return; + } + args.level = KVM_INTERRUPT_SET; + } else { + if (val) { + xsrc->status[srcno] |= XIVE_STATUS_ASSERTED; + args.level = KVM_INTERRUPT_SET_LEVEL; + } else { + xsrc->status[srcno] &= ~XIVE_STATUS_ASSERTED; + args.level = KVM_INTERRUPT_UNSET; + } + } + rc = kvm_vm_ioctl(kvm_state, KVM_IRQ_LINE, &args); + if (rc < 0) { + error_report("XIVE: kvm_irq_line() failed : %s", strerror(errno)); + } +} + +/* + * sPAPR XIVE interrupt controller (KVM) + */ + +static void *kvmppc_xive_mmap(SpaprXive *xive, int pgoff, size_t len, + Error **errp) +{ + void *addr; + uint32_t page_shift = 16; /* TODO: fix page_shift */ + + addr = mmap(NULL, len, PROT_WRITE | PROT_READ, MAP_SHARED, xive->fd, + pgoff << page_shift); + if (addr == MAP_FAILED) { + error_setg_errno(errp, errno, "XIVE: unable to set memory mapping"); + return NULL; + } + + return addr; +} + +/* + * All the XIVE memory regions are now backed by mappings from the KVM + * XIVE device. + */ +void kvmppc_xive_connect(SpaprXive *xive, Error **errp) +{ + XiveSource *xsrc = &xive->source; + XiveENDSource *end_xsrc = &xive->end_source; + Error *local_err = NULL; + size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs; + size_t tima_len = 4ull << TM_SHIFT; + + if (!kvmppc_has_cap_xive()) { + error_setg(errp, "IRQ_XIVE capability must be present for KVM"); + return; + } + + /* First, create the KVM XIVE device */ + xive->fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_XIVE, false); + if (xive->fd < 0) { + error_setg_errno(errp, -xive->fd, "XIVE: error creating KVM device"); + return; + } + + /* + * 1. Source ESB pages - KVM mapping + */ + xsrc->esb_mmap = kvmppc_xive_mmap(xive, KVM_XIVE_ESB_PAGE_OFFSET, esb_len, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + memory_region_init_ram_device_ptr(&xsrc->esb_mmio, OBJECT(xsrc), + "xive.esb", esb_len, xsrc->esb_mmap); + sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio); + + /* + * 2. END ESB pages (No KVM support yet) + */ + sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio); + + /* + * 3. TIMA pages - KVM mapping + */ + xive->tm_mmap = kvmppc_xive_mmap(xive, KVM_XIVE_TIMA_PAGE_OFFSET, tima_len, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + memory_region_init_ram_device_ptr(&xive->tm_mmio, OBJECT(xive), + "xive.tima", tima_len, xive->tm_mmap); + sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio); + + kvm_kernel_irqchip = true; + kvm_msi_via_irqfd_allowed = true; + kvm_gsi_direct_mapping = true; + + /* Map all regions */ + spapr_xive_map_mmio(xive); +} diff --git a/hw/intc/xive.c b/hw/intc/xive.c index dcf2fcd108..78047adb11 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -555,6 +555,15 @@ static void xive_tctx_realize(DeviceState *dev, Error **errp) return; } + /* Connect the presenter to the VCPU (required for CPU hotplug) */ + if (kvm_irqchip_in_kernel()) { + kvmppc_xive_cpu_connect(tctx, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + qemu_register_reset(xive_tctx_reset, dev); } @@ -957,6 +966,10 @@ static void xive_source_reset(void *dev) /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */ memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs); + + if (kvm_irqchip_in_kernel()) { + kvmppc_xive_source_reset(xsrc, &error_fatal); + } } static void xive_source_realize(DeviceState *dev, Error **errp) @@ -990,9 +1003,11 @@ static void xive_source_realize(DeviceState *dev, Error **errp) xsrc->status = g_malloc0(xsrc->nr_irqs); xsrc->lsi_map = bitmap_new(xsrc->nr_irqs); - memory_region_init_io(&xsrc->esb_mmio, OBJECT(xsrc), - &xive_source_esb_ops, xsrc, "xive.esb", - (1ull << xsrc->esb_shift) * xsrc->nr_irqs); + if (!kvm_irqchip_in_kernel()) { + memory_region_init_io(&xsrc->esb_mmio, OBJECT(xsrc), + &xive_source_esb_ops, xsrc, "xive.esb", + (1ull << xsrc->esb_shift) * xsrc->nr_irqs); + } qemu_register_reset(xive_source_reset, dev); } diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig index a3465155f0..f927ec9c74 100644 --- a/hw/ppc/Kconfig +++ b/hw/ppc/Kconfig @@ -122,3 +122,8 @@ config XIVE_SPAPR default y depends on PSERIES select XIVE + +config XIVE_KVM + bool + default y + depends on XIVE_SPAPR && KVM diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index b1f79ea9de..5c4a44855d 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -372,7 +372,11 @@ static void spapr_irq_set_irq_xive(void *opaque, int srcno, int val) { SpaprMachineState *spapr = opaque; - xive_source_set_irq(&spapr->xive->source, srcno, val); + if (kvm_irqchip_in_kernel()) { + kvmppc_xive_source_set_irq(&spapr->xive->source, srcno, val); + } else { + xive_source_set_irq(&spapr->xive->source, srcno, val); + } } static const char *spapr_irq_get_nodename_xive(SpaprMachineState *spapr) diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index fc3e9652f9..0edcc762de 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -38,6 +38,10 @@ typedef struct SpaprXive { /* TIMA mapping address */ hwaddr tm_base; MemoryRegion tm_mmio; + + /* KVM support */ + int fd; + void *tm_mmap; } SpaprXive; bool spapr_xive_irq_claim(SpaprXive *xive, uint32_t lisn, bool lsi); @@ -49,5 +53,11 @@ void spapr_dt_xive(SpaprMachineState *spapr, uint32_t nr_servers, void *fdt, uint32_t phandle); void spapr_xive_set_tctx_os_cam(XiveTCTX *tctx); void spapr_xive_mmio_set_enabled(SpaprXive *xive, bool enable); +void spapr_xive_map_mmio(SpaprXive *xive); + +/* + * KVM XIVE device helpers + */ +void kvmppc_xive_connect(SpaprXive *xive, Error **errp); #endif /* PPC_SPAPR_XIVE_H */ diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index c4f27742ca..dd115da30e 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -140,6 +140,7 @@ #ifndef PPC_XIVE_H #define PPC_XIVE_H +#include "sysemu/kvm.h" #include "hw/qdev-core.h" #include "hw/sysbus.h" #include "hw/ppc/xive_regs.h" @@ -194,6 +195,9 @@ typedef struct XiveSource { uint32_t esb_shift; MemoryRegion esb_mmio; + /* KVM support */ + void *esb_mmap; + XiveNotifier *xive; } XiveSource; @@ -423,4 +427,13 @@ static inline uint32_t xive_nvt_cam_line(uint8_t nvt_blk, uint32_t nvt_idx) return (nvt_blk << 19) | nvt_idx; } +/* + * KVM XIVE device helpers + */ + +void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp); +void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp); +void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val); +void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp); + #endif /* PPC_XIVE_H */ diff --git a/target/ppc/kvm.c b/target/ppc/kvm.c index 1a9caf8f40..3bf0a46c33 100644 --- a/target/ppc/kvm.c +++ b/target/ppc/kvm.c @@ -75,6 +75,7 @@ static int cap_fixup_hcalls; static int cap_htm; /* Hardware transactional memory support */ static int cap_mmu_radix; static int cap_mmu_hash_v3; +static int cap_xive; static int cap_resize_hpt; static int cap_ppc_pvr_compat; static int cap_ppc_safe_cache; @@ -146,6 +147,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) cap_htm = kvm_vm_check_extension(s, KVM_CAP_PPC_HTM); cap_mmu_radix = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_RADIX); cap_mmu_hash_v3 = kvm_vm_check_extension(s, KVM_CAP_PPC_MMU_HASH_V3); + cap_xive = kvm_vm_check_extension(s, KVM_CAP_PPC_IRQ_XIVE); cap_resize_hpt = kvm_vm_check_extension(s, KVM_CAP_SPAPR_RESIZE_HPT); kvmppc_get_cpu_characteristics(s); cap_ppc_nested_kvm_hv = kvm_vm_check_extension(s, KVM_CAP_PPC_NESTED_HV); @@ -2478,6 +2480,11 @@ static int parse_cap_ppc_count_cache_flush_assist(struct kvm_ppc_cpu_char c) return 0; } +bool kvmppc_has_cap_xive(void) +{ + return cap_xive; +} + static void kvmppc_get_cpu_characteristics(KVMState *s) { struct kvm_ppc_cpu_char c; diff --git a/target/ppc/kvm_ppc.h b/target/ppc/kvm_ppc.h index 22385134b4..45776cad79 100644 --- a/target/ppc/kvm_ppc.h +++ b/target/ppc/kvm_ppc.h @@ -60,6 +60,7 @@ bool kvmppc_has_cap_fixup_hcalls(void); bool kvmppc_has_cap_htm(void); bool kvmppc_has_cap_mmu_radix(void); bool kvmppc_has_cap_mmu_hash_v3(void); +bool kvmppc_has_cap_xive(void); int kvmppc_get_cap_safe_cache(void); int kvmppc_get_cap_safe_bounds_check(void); int kvmppc_get_cap_safe_indirect_branch(void); @@ -316,6 +317,11 @@ static inline bool kvmppc_has_cap_mmu_hash_v3(void) return false; } +static inline bool kvmppc_has_cap_xive(void) +{ + return false; +} + static inline int kvmppc_get_cap_safe_cache(void) { return 0; From 0c575703e487b6e36d226b67e0c8d08c004ce998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:34 +0200 Subject: [PATCH 27/44] spapr/xive: add hcall support when under KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XIVE hcalls are all redirected to QEMU as none are on a fast path. When necessary, QEMU invokes KVM through specific ioctls to perform host operations. QEMU should have done the necessary checks before calling KVM and, in case of failure, H_HARDWARE is simply returned. H_INT_ESB is a special case that could have been handled under KVM but the impact on performance was low when under QEMU. Here are some figures : kernel irqchip OFF ON H_INT_ESB KVM QEMU rtl8139 (LSI ) 1.19 1.24 1.23 Gbits/sec virtio 31.80 42.30 -- Gbits/sec Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-4-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 90 ++++++++++++++-- hw/intc/spapr_xive_kvm.c | 197 ++++++++++++++++++++++++++++++++++++ include/hw/ppc/spapr_xive.h | 15 +++ 3 files changed, 294 insertions(+), 8 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 27632683e6..03f92c3e65 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -86,6 +86,22 @@ static int spapr_xive_target_to_nvt(uint32_t target, * sPAPR END indexing uses a simple mapping of the CPU vcpu_id, 8 * priorities per CPU */ +int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx, + uint32_t *out_server, uint8_t *out_prio) +{ + + assert(end_blk == SPAPR_XIVE_BLOCK_ID); + + if (out_server) { + *out_server = end_idx >> 3; + } + + if (out_prio) { + *out_prio = end_idx & 0x7; + } + return 0; +} + static void spapr_xive_cpu_to_end(PowerPCCPU *cpu, uint8_t prio, uint8_t *out_end_blk, uint32_t *out_end_idx) { @@ -792,6 +808,16 @@ static target_ulong h_int_set_source_config(PowerPCCPU *cpu, new_eas.w = xive_set_field64(EAS_END_DATA, new_eas.w, eisn); } + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_set_source_config(xive, lisn, &new_eas, &local_err); + if (local_err) { + error_report_err(local_err); + return H_HARDWARE; + } + } + out: xive->eat[lisn] = new_eas; return H_SUCCESS; @@ -1103,6 +1129,16 @@ static target_ulong h_int_set_queue_config(PowerPCCPU *cpu, */ out: + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_set_queue_config(xive, end_blk, end_idx, &end, &local_err); + if (local_err) { + error_report_err(local_err); + return H_HARDWARE; + } + } + /* Update END */ memcpy(&xive->endt[end_idx], &end, sizeof(XiveEND)); return H_SUCCESS; @@ -1194,6 +1230,16 @@ static target_ulong h_int_get_queue_config(PowerPCCPU *cpu, args[2] = 0; } + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_get_queue_config(xive, end_blk, end_idx, end, &local_err); + if (local_err) { + error_report_err(local_err); + return H_HARDWARE; + } + } + /* TODO: do we need any locking on the END ? */ if (flags & SPAPR_XIVE_END_DEBUG) { /* Load the event queue generation number into the return flags */ @@ -1346,15 +1392,20 @@ static target_ulong h_int_esb(PowerPCCPU *cpu, return H_P3; } - mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset; + if (kvm_irqchip_in_kernel()) { + args[0] = kvmppc_xive_esb_rw(xsrc, lisn, offset, data, + flags & SPAPR_XIVE_ESB_STORE); + } else { + mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset; - if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8, - (flags & SPAPR_XIVE_ESB_STORE))) { - qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%" - HWADDR_PRIx "\n", mmio_addr); - return H_HARDWARE; + if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8, + (flags & SPAPR_XIVE_ESB_STORE))) { + qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%" + HWADDR_PRIx "\n", mmio_addr); + return H_HARDWARE; + } + args[0] = (flags & SPAPR_XIVE_ESB_STORE) ? -1 : data; } - args[0] = (flags & SPAPR_XIVE_ESB_STORE) ? -1 : data; return H_SUCCESS; } @@ -1411,7 +1462,20 @@ static target_ulong h_int_sync(PowerPCCPU *cpu, * This is not needed when running the emulation under QEMU */ - /* This is not real hardware. Nothing to be done */ + /* + * This is not real hardware. Nothing to be done unless when + * under KVM + */ + + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_sync_source(xive, lisn, &local_err); + if (local_err) { + error_report_err(local_err); + return H_HARDWARE; + } + } return H_SUCCESS; } @@ -1446,6 +1510,16 @@ static target_ulong h_int_reset(PowerPCCPU *cpu, } device_reset(DEVICE(xive)); + + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_reset(xive, &local_err); + if (local_err) { + error_report_err(local_err); + return H_HARDWARE; + } + } return H_SUCCESS; } diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 7d9e771e8a..964bad0c23 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -89,6 +89,50 @@ void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) * XIVE Interrupt Source (KVM) */ +void kvmppc_xive_set_source_config(SpaprXive *xive, uint32_t lisn, XiveEAS *eas, + Error **errp) +{ + uint32_t end_idx; + uint32_t end_blk; + uint8_t priority; + uint32_t server; + bool masked; + uint32_t eisn; + uint64_t kvm_src; + Error *local_err = NULL; + + assert(xive_eas_is_valid(eas)); + + end_idx = xive_get_field64(EAS_END_INDEX, eas->w); + end_blk = xive_get_field64(EAS_END_BLOCK, eas->w); + eisn = xive_get_field64(EAS_END_DATA, eas->w); + masked = xive_eas_is_masked(eas); + + spapr_xive_end_to_target(end_blk, end_idx, &server, &priority); + + kvm_src = priority << KVM_XIVE_SOURCE_PRIORITY_SHIFT & + KVM_XIVE_SOURCE_PRIORITY_MASK; + kvm_src |= server << KVM_XIVE_SOURCE_SERVER_SHIFT & + KVM_XIVE_SOURCE_SERVER_MASK; + kvm_src |= ((uint64_t) masked << KVM_XIVE_SOURCE_MASKED_SHIFT) & + KVM_XIVE_SOURCE_MASKED_MASK; + kvm_src |= ((uint64_t)eisn << KVM_XIVE_SOURCE_EISN_SHIFT) & + KVM_XIVE_SOURCE_EISN_MASK; + + kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCE_CONFIG, lisn, + &kvm_src, true, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } +} + +void kvmppc_xive_sync_source(SpaprXive *xive, uint32_t lisn, Error **errp) +{ + kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_SOURCE_SYNC, lisn, + NULL, true, errp); +} + /* * At reset, the interrupt sources are simply created and MASKED. We * only need to inform the KVM XIVE device about their type: LSI or @@ -125,6 +169,64 @@ void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp) } } +/* + * This is used to perform the magic loads on the ESB pages, described + * in xive.h. + * + * Memory barriers should not be needed for loads (no store for now). + */ +static uint64_t xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset, + uint64_t data, bool write) +{ + uint64_t *addr = xsrc->esb_mmap + xive_source_esb_mgmt(xsrc, srcno) + + offset; + + if (write) { + *addr = cpu_to_be64(data); + return -1; + } else { + /* Prevent the compiler from optimizing away the load */ + volatile uint64_t value = be64_to_cpu(*addr); + return value; + } +} + +static uint8_t xive_esb_read(XiveSource *xsrc, int srcno, uint32_t offset) +{ + return xive_esb_rw(xsrc, srcno, offset, 0, 0) & 0x3; +} + +static void xive_esb_trigger(XiveSource *xsrc, int srcno) +{ + uint64_t *addr = xsrc->esb_mmap + xive_source_esb_page(xsrc, srcno); + + *addr = 0x0; +} + +uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset, + uint64_t data, bool write) +{ + if (write) { + return xive_esb_rw(xsrc, srcno, offset, data, 1); + } + + /* + * Special Load EOI handling for LSI sources. Q bit is never set + * and the interrupt should be re-triggered if the level is still + * asserted. + */ + if (xive_source_irq_is_lsi(xsrc, srcno) && + offset == XIVE_ESB_LOAD_EOI) { + xive_esb_read(xsrc, srcno, XIVE_ESB_SET_PQ_00); + if (xsrc->status[srcno] & XIVE_STATUS_ASSERTED) { + xive_esb_trigger(xsrc, srcno); + } + return 0; + } else { + return xive_esb_rw(xsrc, srcno, offset, 0, 0); + } +} + void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val) { XiveSource *xsrc = opaque; @@ -155,6 +257,101 @@ void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val) /* * sPAPR XIVE interrupt controller (KVM) */ +void kvmppc_xive_get_queue_config(SpaprXive *xive, uint8_t end_blk, + uint32_t end_idx, XiveEND *end, + Error **errp) +{ + struct kvm_ppc_xive_eq kvm_eq = { 0 }; + uint64_t kvm_eq_idx; + uint8_t priority; + uint32_t server; + Error *local_err = NULL; + + assert(xive_end_is_valid(end)); + + /* Encode the tuple (server, prio) as a KVM EQ index */ + spapr_xive_end_to_target(end_blk, end_idx, &server, &priority); + + kvm_eq_idx = priority << KVM_XIVE_EQ_PRIORITY_SHIFT & + KVM_XIVE_EQ_PRIORITY_MASK; + kvm_eq_idx |= server << KVM_XIVE_EQ_SERVER_SHIFT & + KVM_XIVE_EQ_SERVER_MASK; + + kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ_CONFIG, kvm_eq_idx, + &kvm_eq, false, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + /* + * The EQ index and toggle bit are updated by HW. These are the + * only fields from KVM we want to update QEMU with. The other END + * fields should already be in the QEMU END table. + */ + end->w1 = xive_set_field32(END_W1_GENERATION, 0ul, kvm_eq.qtoggle) | + xive_set_field32(END_W1_PAGE_OFF, 0ul, kvm_eq.qindex); +} + +void kvmppc_xive_set_queue_config(SpaprXive *xive, uint8_t end_blk, + uint32_t end_idx, XiveEND *end, + Error **errp) +{ + struct kvm_ppc_xive_eq kvm_eq = { 0 }; + uint64_t kvm_eq_idx; + uint8_t priority; + uint32_t server; + Error *local_err = NULL; + + /* + * Build the KVM state from the local END structure. + */ + + kvm_eq.flags = 0; + if (xive_get_field32(END_W0_UCOND_NOTIFY, end->w0)) { + kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY; + } + + /* + * If the hcall is disabling the EQ, set the size and page address + * to zero. When migrating, only valid ENDs are taken into + * account. + */ + if (xive_end_is_valid(end)) { + kvm_eq.qshift = xive_get_field32(END_W0_QSIZE, end->w0) + 12; + kvm_eq.qaddr = xive_end_qaddr(end); + /* + * The EQ toggle bit and index should only be relevant when + * restoring the EQ state + */ + kvm_eq.qtoggle = xive_get_field32(END_W1_GENERATION, end->w1); + kvm_eq.qindex = xive_get_field32(END_W1_PAGE_OFF, end->w1); + } else { + kvm_eq.qshift = 0; + kvm_eq.qaddr = 0; + } + + /* Encode the tuple (server, prio) as a KVM EQ index */ + spapr_xive_end_to_target(end_blk, end_idx, &server, &priority); + + kvm_eq_idx = priority << KVM_XIVE_EQ_PRIORITY_SHIFT & + KVM_XIVE_EQ_PRIORITY_MASK; + kvm_eq_idx |= server << KVM_XIVE_EQ_SERVER_SHIFT & + KVM_XIVE_EQ_SERVER_MASK; + + kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_EQ_CONFIG, kvm_eq_idx, + &kvm_eq, true, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } +} + +void kvmppc_xive_reset(SpaprXive *xive, Error **errp) +{ + kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, KVM_DEV_XIVE_RESET, + NULL, true, errp); +} static void *kvmppc_xive_mmap(SpaprXive *xive, int pgoff, size_t len, Error **errp) diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index 0edcc762de..03685910e7 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -55,9 +55,24 @@ void spapr_xive_set_tctx_os_cam(XiveTCTX *tctx); void spapr_xive_mmio_set_enabled(SpaprXive *xive, bool enable); void spapr_xive_map_mmio(SpaprXive *xive); +int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx, + uint32_t *out_server, uint8_t *out_prio); + /* * KVM XIVE device helpers */ void kvmppc_xive_connect(SpaprXive *xive, Error **errp); +void kvmppc_xive_reset(SpaprXive *xive, Error **errp); +void kvmppc_xive_set_source_config(SpaprXive *xive, uint32_t lisn, XiveEAS *eas, + Error **errp); +void kvmppc_xive_sync_source(SpaprXive *xive, uint32_t lisn, Error **errp); +uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset, + uint64_t data, bool write); +void kvmppc_xive_set_queue_config(SpaprXive *xive, uint8_t end_blk, + uint32_t end_idx, XiveEND *end, + Error **errp); +void kvmppc_xive_get_queue_config(SpaprXive *xive, uint8_t end_blk, + uint32_t end_idx, XiveEND *end, + Error **errp); #endif /* PPC_SPAPR_XIVE_H */ From 7bfc759c02b8d19fd76d70d138deea0025f6f461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:35 +0200 Subject: [PATCH 28/44] spapr/xive: add state synchronization with KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This extends the KVM XIVE device backend with 'synchronize_state' methods used to retrieve the state from KVM. The HW state of the sources, the KVM device and the thread interrupt contexts are collected for the monitor usage and also migration. These get operations rely on their KVM counterpart in the host kernel which acts as a proxy for OPAL, the host firmware. The set operations will be added for migration support later. Signed-off-by: Cédric Le Goater Message-Id: <20190513084245.25755-5-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 17 ++++--- hw/intc/spapr_xive_kvm.c | 90 +++++++++++++++++++++++++++++++++++++ hw/intc/xive.c | 10 +++++ include/hw/ppc/spapr_xive.h | 8 ++++ include/hw/ppc/xive.h | 1 + 5 files changed, 119 insertions(+), 7 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 03f92c3e65..e771db5fd0 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -40,13 +40,6 @@ #define SPAPR_XIVE_NVT_BASE 0x400 -/* - * The sPAPR machine has a unique XIVE IC device. Assign a fixed value - * to the controller block id value. It can nevertheless be changed - * for testing purpose. - */ -#define SPAPR_XIVE_BLOCK_ID 0x0 - /* * sPAPR NVT and END indexing helpers */ @@ -157,6 +150,16 @@ void spapr_xive_pic_print_info(SpaprXive *xive, Monitor *mon) XiveSource *xsrc = &xive->source; int i; + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_synchronize_state(xive, &local_err); + if (local_err) { + error_report_err(local_err); + return; + } + } + monitor_printf(mon, " LISN PQ EISN CPU/PRIO EQ\n"); for (i = 0; i < xive->nr_irqs; i++) { diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 964bad0c23..8dd4f96e0b 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -60,6 +60,54 @@ static void kvm_cpu_enable(CPUState *cs) /* * XIVE Thread Interrupt Management context (KVM) */ +static void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp) +{ + uint64_t state[2] = { 0 }; + int ret; + + ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_VP_STATE, state); + if (ret != 0) { + error_setg_errno(errp, errno, + "XIVE: could not capture KVM state of CPU %ld", + kvm_arch_vcpu_id(tctx->cs)); + return; + } + + /* word0 and word1 of the OS ring. */ + *((uint64_t *) &tctx->regs[TM_QW1_OS]) = state[0]; +} + +typedef struct { + XiveTCTX *tctx; + Error *err; +} XiveCpuGetState; + +static void kvmppc_xive_cpu_do_synchronize_state(CPUState *cpu, + run_on_cpu_data arg) +{ + XiveCpuGetState *s = arg.host_ptr; + + kvmppc_xive_cpu_get_state(s->tctx, &s->err); +} + +void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp) +{ + XiveCpuGetState s = { + .tctx = tctx, + .err = NULL, + }; + + /* + * Kick the vCPU to make sure they are available for the KVM ioctl. + */ + run_on_cpu(tctx->cs, kvmppc_xive_cpu_do_synchronize_state, + RUN_ON_CPU_HOST_PTR(&s)); + + if (s.err) { + error_propagate(errp, s.err); + return; + } +} void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) { @@ -227,6 +275,19 @@ uint64_t kvmppc_xive_esb_rw(XiveSource *xsrc, int srcno, uint32_t offset, } } +static void kvmppc_xive_source_get_state(XiveSource *xsrc) +{ + int i; + + for (i = 0; i < xsrc->nr_irqs; i++) { + /* Perform a load without side effect to retrieve the PQ bits */ + uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET); + + /* and save PQ locally */ + xive_source_esb_set(xsrc, i, pq); + } +} + void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val) { XiveSource *xsrc = opaque; @@ -353,6 +414,35 @@ void kvmppc_xive_reset(SpaprXive *xive, Error **errp) NULL, true, errp); } +static void kvmppc_xive_get_queues(SpaprXive *xive, Error **errp) +{ + Error *local_err = NULL; + int i; + + for (i = 0; i < xive->nr_ends; i++) { + if (!xive_end_is_valid(&xive->endt[i])) { + continue; + } + + kvmppc_xive_get_queue_config(xive, SPAPR_XIVE_BLOCK_ID, i, + &xive->endt[i], &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } +} + +void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp) +{ + kvmppc_xive_source_get_state(&xive->source); + + /* EAT: there is no extra state to query from KVM */ + + /* ENDT */ + kvmppc_xive_get_queues(xive, errp); +} + static void *kvmppc_xive_mmap(SpaprXive *xive, int pgoff, size_t len, Error **errp) { diff --git a/hw/intc/xive.c b/hw/intc/xive.c index 78047adb11..7f1c54a7b5 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -493,6 +493,16 @@ void xive_tctx_pic_print_info(XiveTCTX *tctx, Monitor *mon) int cpu_index = tctx->cs ? tctx->cs->cpu_index : -1; int i; + if (kvm_irqchip_in_kernel()) { + Error *local_err = NULL; + + kvmppc_xive_cpu_synchronize_state(tctx, &local_err); + if (local_err) { + error_report_err(local_err); + return; + } + } + monitor_printf(mon, "CPU[%04x]: QW NSR CPPR IPB LSMFB ACK# INC AGE PIPR" " W2\n", cpu_index); diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index 03685910e7..7e49badd8c 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -44,6 +44,13 @@ typedef struct SpaprXive { void *tm_mmap; } SpaprXive; +/* + * The sPAPR machine has a unique XIVE IC device. Assign a fixed value + * to the controller block id value. It can nevertheless be changed + * for testing purpose. + */ +#define SPAPR_XIVE_BLOCK_ID 0x0 + bool spapr_xive_irq_claim(SpaprXive *xive, uint32_t lisn, bool lsi); bool spapr_xive_irq_free(SpaprXive *xive, uint32_t lisn); void spapr_xive_pic_print_info(SpaprXive *xive, Monitor *mon); @@ -74,5 +81,6 @@ void kvmppc_xive_set_queue_config(SpaprXive *xive, uint8_t end_blk, void kvmppc_xive_get_queue_config(SpaprXive *xive, uint8_t end_blk, uint32_t end_idx, XiveEND *end, Error **errp); +void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp); #endif /* PPC_SPAPR_XIVE_H */ diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index dd115da30e..78c919c4a5 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -435,5 +435,6 @@ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp); void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp); void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val); void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp); +void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp); #endif /* PPC_XIVE_H */ From 9b88cd7673dddf9336f50540e5735eb6f190200a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:36 +0200 Subject: [PATCH 29/44] spapr/xive: introduce a VM state change handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This handler is in charge of stabilizing the flow of event notifications in the XIVE controller before migrating a guest. This is a requirement before transferring the guest EQ pages to a destination. When the VM is stopped, the handler sets the source PQs to PENDING to stop the flow of events and to possibly catch a triggered interrupt occuring while the VM is stopped. Their previous state is saved. The XIVE controller is then synced through KVM to flush any in-flight event notification and to stabilize the EQs. At this stage, the EQ pages are marked dirty to make sure the EQ pages are transferred if a migration sequence is in progress. The previous configuration of the sources is restored when the VM resumes, after a migration or a stop. If an interrupt was queued while the VM was stopped, the handler simply generates the missing trigger. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-6-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 96 ++++++++++++++++++++++++++++++++++++- include/hw/ppc/spapr_xive.h | 1 + 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 8dd4f96e0b..735577a6f8 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -433,9 +433,100 @@ static void kvmppc_xive_get_queues(SpaprXive *xive, Error **errp) } } +/* + * The primary goal of the XIVE VM change handler is to mark the EQ + * pages dirty when all XIVE event notifications have stopped. + * + * Whenever the VM is stopped, the VM change handler sets the source + * PQs to PENDING to stop the flow of events and to possibly catch a + * triggered interrupt occuring while the VM is stopped. The previous + * state is saved in anticipation of a migration. The XIVE controller + * is then synced through KVM to flush any in-flight event + * notification and stabilize the EQs. + * + * At this stage, we can mark the EQ page dirty and let a migration + * sequence transfer the EQ pages to the destination, which is done + * just after the stop state. + * + * The previous configuration of the sources is restored when the VM + * runs again. If an interrupt was queued while the VM was stopped, + * simply generate a trigger. + */ +static void kvmppc_xive_change_state_handler(void *opaque, int running, + RunState state) +{ + SpaprXive *xive = opaque; + XiveSource *xsrc = &xive->source; + Error *local_err = NULL; + int i; + + /* + * Restore the sources to their initial state. This is called when + * the VM resumes after a stop or a migration. + */ + if (running) { + for (i = 0; i < xsrc->nr_irqs; i++) { + uint8_t pq = xive_source_esb_get(xsrc, i); + uint8_t old_pq; + + old_pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_00 + (pq << 8)); + + /* + * An interrupt was queued while the VM was stopped, + * generate a trigger. + */ + if (pq == XIVE_ESB_RESET && old_pq == XIVE_ESB_QUEUED) { + xive_esb_trigger(xsrc, i); + } + } + + return; + } + + /* + * Mask the sources, to stop the flow of event notifications, and + * save the PQs locally in the XiveSource object. The XiveSource + * state will be collected later on by its vmstate handler if a + * migration is in progress. + */ + for (i = 0; i < xsrc->nr_irqs; i++) { + uint8_t pq = xive_esb_read(xsrc, i, XIVE_ESB_GET); + + /* + * PQ is set to PENDING to possibly catch a triggered + * interrupt occuring while the VM is stopped (hotplug event + * for instance) . + */ + if (pq != XIVE_ESB_OFF) { + pq = xive_esb_read(xsrc, i, XIVE_ESB_SET_PQ_10); + } + xive_source_esb_set(xsrc, i, pq); + } + + /* + * Sync the XIVE controller in KVM, to flush in-flight event + * notification that should be enqueued in the EQs and mark the + * XIVE EQ pages dirty to collect all updates. + */ + kvm_device_access(xive->fd, KVM_DEV_XIVE_GRP_CTRL, + KVM_DEV_XIVE_EQ_SYNC, NULL, true, &local_err); + if (local_err) { + error_report_err(local_err); + return; + } +} + void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp) { - kvmppc_xive_source_get_state(&xive->source); + /* + * When the VM is stopped, the sources are masked and the previous + * state is saved in anticipation of a migration. We should not + * synchronize the source state in that case else we will override + * the saved state. + */ + if (runstate_is_running()) { + kvmppc_xive_source_get_state(&xive->source); + } /* EAT: there is no extra state to query from KVM */ @@ -515,6 +606,9 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp) "xive.tima", tima_len, xive->tm_mmap); sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio); + xive->change = qemu_add_vm_change_state_handler( + kvmppc_xive_change_state_handler, xive); + kvm_kernel_irqchip = true; kvm_msi_via_irqfd_allowed = true; kvm_gsi_direct_mapping = true; diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index 7e49badd8c..734662c12a 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -42,6 +42,7 @@ typedef struct SpaprXive { /* KVM support */ int fd; void *tm_mmap; + VMChangeStateEntry *change; } SpaprXive; /* From 277dd3d7712ae056d2614ea164f6560afd5d71d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:37 +0200 Subject: [PATCH 30/44] spapr/xive: add migration support for KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the VM is stopped, the VM state handler stabilizes the XIVE IC and marks the EQ pages dirty. These are then transferred to destination before the transfer of the device vmstates starts. The SpaprXive interrupt controller model captures the XIVE internal tables, EAT and ENDT and the XiveTCTX model does the same for the thread interrupt context registers. At restart, the SpaprXive 'post_load' method restores all the XIVE states. It is called by the sPAPR machine 'post_load' method, when all XIVE states have been transferred and loaded. Finally, the source states are restored in the VM change state handler when the machine reaches the running state. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-7-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 24 ++++++++++ hw/intc/spapr_xive_kvm.c | 95 ++++++++++++++++++++++++++++++++++++- hw/intc/xive.c | 17 +++++++ hw/ppc/spapr_irq.c | 2 +- include/hw/ppc/spapr_xive.h | 3 ++ include/hw/ppc/xive.h | 1 + 6 files changed, 140 insertions(+), 2 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index e771db5fd0..0aa5d8a55e 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -472,10 +472,34 @@ static const VMStateDescription vmstate_spapr_xive_eas = { }, }; +static int vmstate_spapr_xive_pre_save(void *opaque) +{ + if (kvm_irqchip_in_kernel()) { + return kvmppc_xive_pre_save(SPAPR_XIVE(opaque)); + } + + return 0; +} + +/* + * Called by the sPAPR IRQ backend 'post_load' method at the machine + * level. + */ +int spapr_xive_post_load(SpaprXive *xive, int version_id) +{ + if (kvm_irqchip_in_kernel()) { + return kvmppc_xive_post_load(xive, version_id); + } + + return 0; +} + static const VMStateDescription vmstate_spapr_xive = { .name = TYPE_SPAPR_XIVE, .version_id = 1, .minimum_version_id = 1, + .pre_save = vmstate_spapr_xive_pre_save, + .post_load = NULL, /* handled at the machine level */ .fields = (VMStateField[]) { VMSTATE_UINT32_EQUAL(nr_irqs, SpaprXive, NULL), VMSTATE_STRUCT_VARRAY_POINTER_UINT32(eat, SpaprXive, nr_irqs, diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 735577a6f8..3999e4b7ed 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -15,6 +15,7 @@ #include "sysemu/cpus.h" #include "sysemu/kvm.h" #include "hw/ppc/spapr.h" +#include "hw/ppc/spapr_cpu_core.h" #include "hw/ppc/spapr_xive.h" #include "hw/ppc/xive.h" #include "kvm_ppc.h" @@ -60,7 +61,24 @@ static void kvm_cpu_enable(CPUState *cs) /* * XIVE Thread Interrupt Management context (KVM) */ -static void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp) + +static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp) +{ + uint64_t state[2]; + int ret; + + /* word0 and word1 of the OS ring. */ + state[0] = *((uint64_t *) &tctx->regs[TM_QW1_OS]); + + ret = kvm_set_one_reg(tctx->cs, KVM_REG_PPC_VP_STATE, state); + if (ret != 0) { + error_setg_errno(errp, errno, + "XIVE: could not restore KVM state of CPU %ld", + kvm_arch_vcpu_id(tctx->cs)); + } +} + +void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp) { uint64_t state[2] = { 0 }; int ret; @@ -534,6 +552,81 @@ void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp) kvmppc_xive_get_queues(xive, errp); } +/* + * The SpaprXive 'pre_save' method is called by the vmstate handler of + * the SpaprXive model, after the XIVE controller is synced in the VM + * change handler. + */ +int kvmppc_xive_pre_save(SpaprXive *xive) +{ + Error *local_err = NULL; + + /* EAT: there is no extra state to query from KVM */ + + /* ENDT */ + kvmppc_xive_get_queues(xive, &local_err); + if (local_err) { + error_report_err(local_err); + return -1; + } + + return 0; +} + +/* + * The SpaprXive 'post_load' method is not called by a vmstate + * handler. It is called at the sPAPR machine level at the end of the + * migration sequence by the sPAPR IRQ backend 'post_load' method, + * when all XIVE states have been transferred and loaded. + */ +int kvmppc_xive_post_load(SpaprXive *xive, int version_id) +{ + Error *local_err = NULL; + CPUState *cs; + int i; + + /* Restore the ENDT first. The targetting depends on it. */ + for (i = 0; i < xive->nr_ends; i++) { + if (!xive_end_is_valid(&xive->endt[i])) { + continue; + } + + kvmppc_xive_set_queue_config(xive, SPAPR_XIVE_BLOCK_ID, i, + &xive->endt[i], &local_err); + if (local_err) { + error_report_err(local_err); + return -1; + } + } + + /* Restore the EAT */ + for (i = 0; i < xive->nr_irqs; i++) { + if (!xive_eas_is_valid(&xive->eat[i])) { + continue; + } + + kvmppc_xive_set_source_config(xive, i, &xive->eat[i], &local_err); + if (local_err) { + error_report_err(local_err); + return -1; + } + } + + /* Restore the thread interrupt contexts */ + CPU_FOREACH(cs) { + PowerPCCPU *cpu = POWERPC_CPU(cs); + + kvmppc_xive_cpu_set_state(spapr_cpu_state(cpu)->tctx, &local_err); + if (local_err) { + error_report_err(local_err); + return -1; + } + } + + /* The source states will be restored when the machine starts running */ + return 0; +} + static void *kvmppc_xive_mmap(SpaprXive *xive, int pgoff, size_t len, Error **errp) { diff --git a/hw/intc/xive.c b/hw/intc/xive.c index 7f1c54a7b5..b5ebb33527 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -582,10 +582,27 @@ static void xive_tctx_unrealize(DeviceState *dev, Error **errp) qemu_unregister_reset(xive_tctx_reset, dev); } +static int vmstate_xive_tctx_pre_save(void *opaque) +{ + Error *local_err = NULL; + + if (kvm_irqchip_in_kernel()) { + kvmppc_xive_cpu_get_state(XIVE_TCTX(opaque), &local_err); + if (local_err) { + error_report_err(local_err); + return -1; + } + } + + return 0; +} + static const VMStateDescription vmstate_xive_tctx = { .name = TYPE_XIVE_TCTX, .version_id = 1, .minimum_version_id = 1, + .pre_save = vmstate_xive_tctx_pre_save, + .post_load = NULL, /* handled by the sPAPRxive model */ .fields = (VMStateField[]) { VMSTATE_BUFFER(regs, XiveTCTX), VMSTATE_END_OF_LIST() diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index 5c4a44855d..8d371523e6 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -350,7 +350,7 @@ static void spapr_irq_cpu_intc_create_xive(SpaprMachineState *spapr, static int spapr_irq_post_load_xive(SpaprMachineState *spapr, int version_id) { - return 0; + return spapr_xive_post_load(spapr->xive, version_id); } static void spapr_irq_reset_xive(SpaprMachineState *spapr, Error **errp) diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index 734662c12a..04294b0ca2 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -55,6 +55,7 @@ typedef struct SpaprXive { bool spapr_xive_irq_claim(SpaprXive *xive, uint32_t lisn, bool lsi); bool spapr_xive_irq_free(SpaprXive *xive, uint32_t lisn); void spapr_xive_pic_print_info(SpaprXive *xive, Monitor *mon); +int spapr_xive_post_load(SpaprXive *xive, int version_id); void spapr_xive_hcall_init(SpaprMachineState *spapr); void spapr_dt_xive(SpaprMachineState *spapr, uint32_t nr_servers, void *fdt, @@ -83,5 +84,7 @@ void kvmppc_xive_get_queue_config(SpaprXive *xive, uint8_t end_blk, uint32_t end_idx, XiveEND *end, Error **errp); void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp); +int kvmppc_xive_pre_save(SpaprXive *xive); +int kvmppc_xive_post_load(SpaprXive *xive, int version_id); #endif /* PPC_SPAPR_XIVE_H */ diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index 78c919c4a5..edb8937f17 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -436,5 +436,6 @@ void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp); void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val); void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp); void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp); +void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp); #endif /* PPC_XIVE_H */ From 0dc9f5f8496a23da375a3b19556e265ba22478e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:38 +0200 Subject: [PATCH 31/44] spapr/xive: activate KVM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All is in place for KVM now. State synchronization and migration will come next. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-8-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/spapr_irq.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index 8d371523e6..e969683f5c 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -248,19 +248,10 @@ SpaprIrq spapr_irq_xics = { static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs, Error **errp) { - MachineState *machine = MACHINE(spapr); uint32_t nr_servers = spapr_max_server_number(spapr); DeviceState *dev; int i; - /* KVM XIVE device not yet available */ - if (kvm_enabled()) { - if (machine_kernel_irqchip_required(machine)) { - error_setg(errp, "kernel_irqchip requested. no KVM XIVE support"); - return; - } - } - dev = qdev_create(NULL, TYPE_SPAPR_XIVE); qdev_prop_set_uint32(dev, "nr-irqs", nr_irqs); /* From 90c20e1e2ce0fb52975ff4d6590620ef0129790e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:39 +0200 Subject: [PATCH 32/44] sysbus: add a sysbus_mmio_unmap() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will be used to remove the MMIO regions of the POWER9 XIVE interrupt controller when the sPAPR machine is reseted. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-9-clg@kaod.org> Signed-off-by: David Gibson --- hw/core/sysbus.c | 10 ++++++++++ include/hw/sysbus.h | 1 + 2 files changed, 11 insertions(+) diff --git a/hw/core/sysbus.c b/hw/core/sysbus.c index 307cf90a51..689a867a22 100644 --- a/hw/core/sysbus.c +++ b/hw/core/sysbus.c @@ -153,6 +153,16 @@ static void sysbus_mmio_map_common(SysBusDevice *dev, int n, hwaddr addr, } } +void sysbus_mmio_unmap(SysBusDevice *dev, int n) +{ + assert(n >= 0 && n < dev->num_mmio); + + if (dev->mmio[n].addr != (hwaddr)-1) { + memory_region_del_subregion(get_system_memory(), dev->mmio[n].memory); + dev->mmio[n].addr = (hwaddr)-1; + } +} + void sysbus_mmio_map(SysBusDevice *dev, int n, hwaddr addr) { sysbus_mmio_map_common(dev, n, addr, false, 0); diff --git a/include/hw/sysbus.h b/include/hw/sysbus.h index 1aedcf05c9..4c668fbbdc 100644 --- a/include/hw/sysbus.h +++ b/include/hw/sysbus.h @@ -89,6 +89,7 @@ qemu_irq sysbus_get_connected_irq(SysBusDevice *dev, int n); void sysbus_mmio_map(SysBusDevice *dev, int n, hwaddr addr); void sysbus_mmio_map_overlap(SysBusDevice *dev, int n, hwaddr addr, int priority); +void sysbus_mmio_unmap(SysBusDevice *dev, int n); void sysbus_add_io(SysBusDevice *dev, hwaddr addr, MemoryRegion *mem); MemoryRegion *sysbus_address_space(SysBusDevice *dev); From 56b11587dfc79dc6453d857159801dcc38f24d0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:40 +0200 Subject: [PATCH 33/44] spapr: introduce routines to delete the KVM IRQ device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a new interrupt mode is chosen by CAS, the machine generates a reset to reconfigure. At this point, the connection with the previous KVM device needs to be closed and a new connection needs to opened with the KVM device operating the chosen interrupt mode. New routines are introduced to destroy the XICS and the XIVE KVM devices. They make use of a new KVM device ioctl which destroys the device and also disconnects the IRQ presenters from the vCPUs. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-10-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 56 +++++++++++++++++++++++++++++++++++++ hw/intc/xics_kvm.c | 51 +++++++++++++++++++++++++++++++++ include/hw/ppc/spapr_xive.h | 1 + include/hw/ppc/xics_spapr.h | 1 + 4 files changed, 109 insertions(+) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 3999e4b7ed..259cd1db95 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -58,6 +58,16 @@ static void kvm_cpu_enable(CPUState *cs) QLIST_INSERT_HEAD(&kvm_enabled_cpus, enabled_cpu, node); } +static void kvm_cpu_disable_all(void) +{ + KVMEnabledCPU *enabled_cpu, *next; + + QLIST_FOREACH_SAFE(enabled_cpu, &kvm_enabled_cpus, node, next) { + QLIST_REMOVE(enabled_cpu, node); + g_free(enabled_cpu); + } +} + /* * XIVE Thread Interrupt Management context (KVM) */ @@ -709,3 +719,49 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp) /* Map all regions */ spapr_xive_map_mmio(xive); } + +void kvmppc_xive_disconnect(SpaprXive *xive, Error **errp) +{ + XiveSource *xsrc; + size_t esb_len; + + /* The KVM XIVE device is not in use */ + if (!xive || xive->fd == -1) { + return; + } + + if (!kvmppc_has_cap_xive()) { + error_setg(errp, "IRQ_XIVE capability must be present for KVM"); + return; + } + + /* Clear the KVM mapping */ + xsrc = &xive->source; + esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs; + + sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 0); + munmap(xsrc->esb_mmap, esb_len); + + sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 1); + + sysbus_mmio_unmap(SYS_BUS_DEVICE(xive), 2); + munmap(xive->tm_mmap, 4ull << TM_SHIFT); + + /* + * When the KVM device fd is closed, the KVM device is destroyed + * and removed from the list of devices of the VM. The VCPU + * presenters are also detached from the device. + */ + close(xive->fd); + xive->fd = -1; + + kvm_kernel_irqchip = false; + kvm_msi_via_irqfd_allowed = false; + kvm_gsi_direct_mapping = false; + + /* Clear the local list of presenter (hotplug) */ + kvm_cpu_disable_all(); + + /* VM Change state handler is not needed anymore */ + qemu_del_vm_change_state_handler(xive->change); +} diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c index 78a252e6df..1185846ff1 100644 --- a/hw/intc/xics_kvm.c +++ b/hw/intc/xics_kvm.c @@ -51,6 +51,16 @@ typedef struct KVMEnabledICP { static QLIST_HEAD(, KVMEnabledICP) kvm_enabled_icps = QLIST_HEAD_INITIALIZER(&kvm_enabled_icps); +static void kvm_disable_icps(void) +{ + KVMEnabledICP *enabled_icp, *next; + + QLIST_FOREACH_SAFE(enabled_icp, &kvm_enabled_icps, node, next) { + QLIST_REMOVE(enabled_icp, node); + g_free(enabled_icp); + } +} + /* * ICP-KVM */ @@ -360,3 +370,44 @@ fail: kvmppc_define_rtas_kernel_token(0, "ibm,int-off"); return -1; } + +void xics_kvm_disconnect(SpaprMachineState *spapr, Error **errp) +{ + /* The KVM XICS device is not in use */ + if (kernel_xics_fd == -1) { + return; + } + + if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) { + error_setg(errp, + "KVM and IRQ_XICS capability must be present for KVM XICS device"); + return; + } + + /* + * Only on P9 using the XICS-on XIVE KVM device: + * + * When the KVM device fd is closed, the device is destroyed and + * removed from the list of devices of the VM. The VCPU presenters + * are also detached from the device. + */ + close(kernel_xics_fd); + kernel_xics_fd = -1; + + spapr_rtas_unregister(RTAS_IBM_SET_XIVE); + spapr_rtas_unregister(RTAS_IBM_GET_XIVE); + spapr_rtas_unregister(RTAS_IBM_INT_OFF); + spapr_rtas_unregister(RTAS_IBM_INT_ON); + + kvmppc_define_rtas_kernel_token(0, "ibm,set-xive"); + kvmppc_define_rtas_kernel_token(0, "ibm,get-xive"); + kvmppc_define_rtas_kernel_token(0, "ibm,int-on"); + kvmppc_define_rtas_kernel_token(0, "ibm,int-off"); + + kvm_kernel_irqchip = false; + kvm_msi_via_irqfd_allowed = false; + kvm_gsi_direct_mapping = false; + + /* Clear the presenter from the VCPUs */ + kvm_disable_icps(); +} diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index 04294b0ca2..0b5e972d52 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -71,6 +71,7 @@ int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx, * KVM XIVE device helpers */ void kvmppc_xive_connect(SpaprXive *xive, Error **errp); +void kvmppc_xive_disconnect(SpaprXive *xive, Error **errp); void kvmppc_xive_reset(SpaprXive *xive, Error **errp); void kvmppc_xive_set_source_config(SpaprXive *xive, uint32_t lisn, XiveEAS *eas, Error **errp); diff --git a/include/hw/ppc/xics_spapr.h b/include/hw/ppc/xics_spapr.h index 15a8dcff66..2476b540ed 100644 --- a/include/hw/ppc/xics_spapr.h +++ b/include/hw/ppc/xics_spapr.h @@ -34,6 +34,7 @@ void spapr_dt_xics(SpaprMachineState *spapr, uint32_t nr_servers, void *fdt, uint32_t phandle); int xics_kvm_init(SpaprMachineState *spapr, Error **errp); +void xics_kvm_disconnect(SpaprMachineState *spapr, Error **errp); void xics_spapr_init(SpaprMachineState *spapr); #endif /* XICS_SPAPR_H */ From 3bf84e99c823987704d1324cf0e34e7597e737a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:41 +0200 Subject: [PATCH 34/44] spapr: check for the activation of the KVM IRQ device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The activation of the KVM IRQ device depends on the interrupt mode chosen at CAS time by the machine and some methods used at reset or by the migration need to be protected. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Signed-off-by: Cédric Le Goater Message-Id: <20190513084245.25755-11-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 33 +++++++++++++++++++++++++++++++++ hw/intc/xics_kvm.c | 31 ++++++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 259cd1db95..078d18d775 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -90,9 +90,15 @@ static void kvmppc_xive_cpu_set_state(XiveTCTX *tctx, Error **errp) void kvmppc_xive_cpu_get_state(XiveTCTX *tctx, Error **errp) { + SpaprXive *xive = SPAPR_MACHINE(qdev_get_machine())->xive; uint64_t state[2] = { 0 }; int ret; + /* The KVM XIVE device is not in use */ + if (xive->fd == -1) { + return; + } + ret = kvm_get_one_reg(tctx->cs, KVM_REG_PPC_VP_STATE, state); if (ret != 0) { error_setg_errno(errp, errno, @@ -143,6 +149,11 @@ void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) unsigned long vcpu_id; int ret; + /* The KVM XIVE device is not in use */ + if (xive->fd == -1) { + return; + } + /* Check if CPU was hot unplugged and replugged. */ if (kvm_cpu_is_enabled(tctx->cs)) { return; @@ -219,6 +230,11 @@ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) SpaprXive *xive = SPAPR_XIVE(xsrc->xive); uint64_t state = 0; + /* The KVM XIVE device is not in use */ + if (xive->fd == -1) { + return; + } + if (xive_source_irq_is_lsi(xsrc, srcno)) { state |= KVM_XIVE_LEVEL_SENSITIVE; if (xsrc->status[srcno] & XIVE_STATUS_ASSERTED) { @@ -319,9 +335,13 @@ static void kvmppc_xive_source_get_state(XiveSource *xsrc) void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val) { XiveSource *xsrc = opaque; + SpaprXive *xive = SPAPR_XIVE(xsrc->xive); struct kvm_irq_level args; int rc; + /* The KVM XIVE device should be in use */ + assert(xive->fd != -1); + args.irq = srcno; if (!xive_source_irq_is_lsi(xsrc, srcno)) { if (!val) { @@ -546,6 +566,11 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running, void kvmppc_xive_synchronize_state(SpaprXive *xive, Error **errp) { + /* The KVM XIVE device is not in use */ + if (xive->fd == -1) { + return; + } + /* * When the VM is stopped, the sources are masked and the previous * state is saved in anticipation of a migration. We should not @@ -571,6 +596,11 @@ int kvmppc_xive_pre_save(SpaprXive *xive) { Error *local_err = NULL; + /* The KVM XIVE device is not in use */ + if (xive->fd == -1) { + return 0; + } + /* EAT: there is no extra state to query from KVM */ /* ENDT */ @@ -595,6 +625,9 @@ int kvmppc_xive_post_load(SpaprXive *xive, int version_id) CPUState *cs; int i; + /* The KVM XIVE device should be in use */ + assert(xive->fd != -1); + /* Restore the ENDT first. The targetting depends on it. */ for (i = 0; i < xive->nr_ends; i++) { if (!xive_end_is_valid(&xive->endt[i])) { diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c index 1185846ff1..12bd5190cf 100644 --- a/hw/intc/xics_kvm.c +++ b/hw/intc/xics_kvm.c @@ -69,6 +69,11 @@ void icp_get_kvm_state(ICPState *icp) uint64_t state; int ret; + /* The KVM XICS device is not in use */ + if (kernel_xics_fd == -1) { + return; + } + /* ICP for this CPU thread is not in use, exiting */ if (!icp->cs) { return; @@ -105,6 +110,11 @@ int icp_set_kvm_state(ICPState *icp) uint64_t state; int ret; + /* The KVM XICS device is not in use */ + if (kernel_xics_fd == -1) { + return 0; + } + /* ICP for this CPU thread is not in use, exiting */ if (!icp->cs) { return 0; @@ -133,8 +143,9 @@ void icp_kvm_realize(DeviceState *dev, Error **errp) unsigned long vcpu_id; int ret; + /* The KVM XICS device is not in use */ if (kernel_xics_fd == -1) { - abort(); + return; } cs = icp->cs; @@ -170,6 +181,11 @@ void ics_get_kvm_state(ICSState *ics) uint64_t state; int i; + /* The KVM XICS device is not in use */ + if (kernel_xics_fd == -1) { + return; + } + for (i = 0; i < ics->nr_irqs; i++) { ICSIRQState *irq = &ics->irqs[i]; @@ -230,6 +246,11 @@ int ics_set_kvm_state_one(ICSState *ics, int srcno) ICSIRQState *irq = &ics->irqs[srcno]; int ret; + /* The KVM XICS device is not in use */ + if (kernel_xics_fd == -1) { + return 0; + } + state = irq->server; state |= (uint64_t)(irq->saved_priority & KVM_XICS_PRIORITY_MASK) << KVM_XICS_PRIORITY_SHIFT; @@ -269,6 +290,11 @@ int ics_set_kvm_state(ICSState *ics) { int i; + /* The KVM XICS device is not in use */ + if (kernel_xics_fd == -1) { + return 0; + } + for (i = 0; i < ics->nr_irqs; i++) { int ret; @@ -286,6 +312,9 @@ void ics_kvm_set_irq(ICSState *ics, int srcno, int val) struct kvm_irq_level args; int rc; + /* The KVM XICS device should be in use */ + assert(kernel_xics_fd != -1); + args.irq = srcno + ics->offset; if (ics->irqs[srcno].flags & XICS_FLAGS_IRQ_MSI) { if (!val) { From ae805ea9073bb97363d867ef081be27e2c63d782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:42 +0200 Subject: [PATCH 35/44] spapr/irq: introduce a spapr_irq_init_device() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The way the XICS and the XIVE devices are initialized follows the same pattern. First, try to connect to the KVM device and if not possible fallback on the emulated device, unless a kernel_irqchip is required. The spapr_irq_init_device() routine implements this sequence in generic way using new sPAPR IRQ handlers ->init_emu() and ->init_kvm(). The XIVE init sequence is moved under the associated sPAPR IRQ ->init() handler. This will change again when KVM support is added for the dual interrupt mode. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-12-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 26 +++-------- hw/ppc/spapr_irq.c | 89 +++++++++++++++++++++++++++++-------- include/hw/ppc/spapr_irq.h | 2 + include/hw/ppc/spapr_xive.h | 1 + 4 files changed, 78 insertions(+), 40 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 0aa5d8a55e..a79574b23c 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -281,7 +281,6 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp) XiveSource *xsrc = &xive->source; XiveENDSource *end_xsrc = &xive->end_source; Error *local_err = NULL; - MachineState *machine = MACHINE(qdev_get_machine()); if (!xive->nr_irqs) { error_setg(errp, "Number of interrupt needs to be greater 0"); @@ -332,27 +331,12 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp) xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT)); qemu_register_reset(spapr_xive_reset, dev); +} - if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { - kvmppc_xive_connect(xive, &local_err); - if (local_err && machine_kernel_irqchip_required(machine)) { - error_prepend(&local_err, - "kernel_irqchip requested but unavailable: "); - error_propagate(errp, local_err); - return; - } - - if (!local_err) { - return; - } - - /* - * We failed to initialize the XIVE KVM device, fallback to - * emulated mode - */ - error_prepend(&local_err, "kernel_irqchip allowed but unavailable: "); - warn_report_err(local_err); - } +void spapr_xive_init(SpaprXive *xive, Error **errp) +{ + XiveSource *xsrc = &xive->source; + XiveENDSource *end_xsrc = &xive->end_source; /* TIMA initialization */ memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive, diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index e969683f5c..d1e87577fb 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -62,6 +62,35 @@ void spapr_irq_msi_reset(SpaprMachineState *spapr) bitmap_clear(spapr->irq_map, 0, spapr->irq_map_nr); } +static void spapr_irq_init_device(SpaprMachineState *spapr, + SpaprIrq *irq, Error **errp) +{ + MachineState *machine = MACHINE(spapr); + Error *local_err = NULL; + + if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { + irq->init_kvm(spapr, &local_err); + if (local_err && machine_kernel_irqchip_required(machine)) { + error_prepend(&local_err, + "kernel_irqchip requested but unavailable: "); + error_propagate(errp, local_err); + return; + } + + if (!local_err) { + return; + } + + /* + * We failed to initialize the KVM device, fallback to + * emulated mode + */ + error_prepend(&local_err, "kernel_irqchip allowed but unavailable: "); + warn_report_err(local_err); + } + + irq->init_emu(spapr, errp); +} /* * XICS IRQ backend. @@ -70,28 +99,13 @@ void spapr_irq_msi_reset(SpaprMachineState *spapr) static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs, Error **errp) { - MachineState *machine = MACHINE(spapr); Object *obj; Error *local_err = NULL; - bool xics_kvm = false; - if (kvm_enabled()) { - if (machine_kernel_irqchip_allowed(machine) && - !xics_kvm_init(spapr, &local_err)) { - xics_kvm = true; - } - if (machine_kernel_irqchip_required(machine) && !xics_kvm) { - error_prepend(&local_err, - "kernel_irqchip requested but unavailable: "); - error_propagate(errp, local_err); - return; - } - error_free(local_err); - local_err = NULL; - } - - if (!xics_kvm) { - xics_spapr_init(spapr); + spapr_irq_init_device(spapr, &spapr_irq_xics, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; } obj = object_new(TYPE_ICS_SIMPLE); @@ -220,6 +234,18 @@ static const char *spapr_irq_get_nodename_xics(SpaprMachineState *spapr) return XICS_NODENAME; } +static void spapr_irq_init_emu_xics(SpaprMachineState *spapr, Error **errp) +{ + xics_spapr_init(spapr); +} + +static void spapr_irq_init_kvm_xics(SpaprMachineState *spapr, Error **errp) +{ + if (kvm_enabled()) { + xics_kvm_init(spapr, errp); + } +} + #define SPAPR_IRQ_XICS_NR_IRQS 0x1000 #define SPAPR_IRQ_XICS_NR_MSIS \ (XICS_IRQ_BASE + SPAPR_IRQ_XICS_NR_IRQS - SPAPR_IRQ_MSI) @@ -240,6 +266,8 @@ SpaprIrq spapr_irq_xics = { .reset = spapr_irq_reset_xics, .set_irq = spapr_irq_set_irq_xics, .get_nodename = spapr_irq_get_nodename_xics, + .init_emu = spapr_irq_init_emu_xics, + .init_kvm = spapr_irq_init_kvm_xics, }; /* @@ -251,6 +279,7 @@ static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs, uint32_t nr_servers = spapr_max_server_number(spapr); DeviceState *dev; int i; + Error *local_err = NULL; dev = qdev_create(NULL, TYPE_SPAPR_XIVE); qdev_prop_set_uint32(dev, "nr-irqs", nr_irqs); @@ -268,6 +297,12 @@ static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs, } spapr_xive_hcall_init(spapr); + + spapr_irq_init_device(spapr, &spapr_irq_xive, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } } static int spapr_irq_claim_xive(SpaprMachineState *spapr, int irq, bool lsi, @@ -375,6 +410,18 @@ static const char *spapr_irq_get_nodename_xive(SpaprMachineState *spapr) return spapr->xive->nodename; } +static void spapr_irq_init_emu_xive(SpaprMachineState *spapr, Error **errp) +{ + spapr_xive_init(spapr->xive, errp); +} + +static void spapr_irq_init_kvm_xive(SpaprMachineState *spapr, Error **errp) +{ + if (kvm_enabled()) { + kvmppc_xive_connect(spapr->xive, errp); + } +} + /* * XIVE uses the full IRQ number space. Set it to 8K to be compatible * with XICS. @@ -399,6 +446,8 @@ SpaprIrq spapr_irq_xive = { .reset = spapr_irq_reset_xive, .set_irq = spapr_irq_set_irq_xive, .get_nodename = spapr_irq_get_nodename_xive, + .init_emu = spapr_irq_init_emu_xive, + .init_kvm = spapr_irq_init_kvm_xive, }; /* @@ -560,6 +609,8 @@ SpaprIrq spapr_irq_dual = { .reset = spapr_irq_reset_dual, .set_irq = spapr_irq_set_irq_dual, .get_nodename = spapr_irq_get_nodename_dual, + .init_emu = NULL, /* should not be used */ + .init_kvm = NULL, /* should not be used */ }; diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h index b855f74e44..14cab73c9c 100644 --- a/include/hw/ppc/spapr_irq.h +++ b/include/hw/ppc/spapr_irq.h @@ -48,6 +48,8 @@ typedef struct SpaprIrq { void (*reset)(SpaprMachineState *spapr, Error **errp); void (*set_irq)(void *opaque, int srcno, int val); const char *(*get_nodename)(SpaprMachineState *spapr); + void (*init_emu)(SpaprMachineState *spapr, Error **errp); + void (*init_kvm)(SpaprMachineState *spapr, Error **errp); } SpaprIrq; extern SpaprIrq spapr_irq_xics; diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index 0b5e972d52..b26befcf6b 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -66,6 +66,7 @@ void spapr_xive_map_mmio(SpaprXive *xive); int spapr_xive_end_to_target(uint8_t end_blk, uint32_t end_idx, uint32_t *out_server, uint8_t *out_prio); +void spapr_xive_init(SpaprXive *xive, Error **errp); /* * KVM XIVE device helpers From cf435df697e50db5ed1ec60e5efe639123a03154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:43 +0200 Subject: [PATCH 36/44] spapr/irq: initialize the IRQ device only once MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a check to make sure that the routine initializing the emulated IRQ device is called once. We don't have much to test on the XICS side, so we introduce a 'init' boolean under ICSState. Signed-off-by: Cédric Le Goater Message-Id: <20190513084245.25755-13-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 9 +++++++++ hw/intc/xics_spapr.c | 7 +++++++ include/hw/ppc/xics.h | 1 + 3 files changed, 17 insertions(+) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index a79574b23c..f6f6c29d6a 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -338,6 +338,15 @@ void spapr_xive_init(SpaprXive *xive, Error **errp) XiveSource *xsrc = &xive->source; XiveENDSource *end_xsrc = &xive->end_source; + /* + * The emulated XIVE device can only be initialized once. If the + * ESB memory region has been already mapped, it means we have been + * through there. + */ + if (memory_region_is_mapped(&xsrc->esb_mmio)) { + return; + } + /* TIMA initialization */ memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive, "xive.tima", 4ull << TM_SHIFT); diff --git a/hw/intc/xics_spapr.c b/hw/intc/xics_spapr.c index 9d2b8adef7..5a1835e8b1 100644 --- a/hw/intc/xics_spapr.c +++ b/hw/intc/xics_spapr.c @@ -239,6 +239,13 @@ static void rtas_int_on(PowerPCCPU *cpu, SpaprMachineState *spapr, void xics_spapr_init(SpaprMachineState *spapr) { + /* Emulated mode can only be initialized once. */ + if (spapr->ics->init) { + return; + } + + spapr->ics->init = true; + /* Registration of global state belongs into realize */ spapr_rtas_register(RTAS_IBM_SET_XIVE, "ibm,set-xive", rtas_set_xive); spapr_rtas_register(RTAS_IBM_GET_XIVE, "ibm,get-xive", rtas_get_xive); diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h index eb65ad7e43..d6f8e4c4c2 100644 --- a/include/hw/ppc/xics.h +++ b/include/hw/ppc/xics.h @@ -119,6 +119,7 @@ struct ICSState { uint32_t offset; ICSIRQState *irqs; XICSFabric *xics; + bool init; /* sPAPR ICS device initialized */ }; #define ICS_PROP_XICS "xics" From 83629419a52f393d67317b14a861d3062e37c5c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:44 +0200 Subject: [PATCH 37/44] ppc/xics: fix irq priority in ics_set_irq_type() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent commits changed the behavior of ics_set_irq_type() to initialize correctly LSIs at the KVM level. ics_set_irq_type() is also called by the realize routine of the different devices of the machine when initial interrupts are claimed, before the ICSState device is reseted. In the case, the ICSIRQState priority is 0x0 and the call to ics_set_irq_type() results in configuring the target of the interrupt. On P9, when using the KVM XICS-on-XIVE device, the target is configured to be server 0, priority 0 and the event queue 0 is created automatically by KVM. With the dual interrupt mode creating the KVM device at reset, it leads to unexpected effects on the guest, mostly blocking IPIs. This is wrong, fix it by reseting the ICSIRQState structure when ics_set_irq_type() is called. Fixes: commit 6cead90c5c9c ("xics: Write source state to KVM at claim time") Signed-off-by: Greg Kurz Signed-off-by: Cédric Le Goater Message-Id: <20190513084245.25755-14-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/xics.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hw/intc/xics.c b/hw/intc/xics.c index af7dc709ab..79f5a8a916 100644 --- a/hw/intc/xics.c +++ b/hw/intc/xics.c @@ -610,6 +610,12 @@ static const TypeInfo ics_simple_info = { .class_size = sizeof(ICSStateClass), }; +static void ics_reset_irq(ICSIRQState *irq) +{ + irq->priority = 0xff; + irq->saved_priority = 0xff; +} + static void ics_base_reset(DeviceState *dev) { ICSState *ics = ICS_BASE(dev); @@ -623,8 +629,7 @@ static void ics_base_reset(DeviceState *dev) memset(ics->irqs, 0, sizeof(ICSIRQState) * ics->nr_irqs); for (i = 0; i < ics->nr_irqs; i++) { - ics->irqs[i].priority = 0xff; - ics->irqs[i].saved_priority = 0xff; + ics_reset_irq(ics->irqs + i); ics->irqs[i].flags = flags[i]; } } @@ -760,6 +765,7 @@ void ics_set_irq_type(ICSState *ics, int srcno, bool lsi) lsi ? XICS_FLAGS_IRQ_LSI : XICS_FLAGS_IRQ_MSI; if (kvm_irqchip_in_kernel()) { + ics_reset_irq(ics->irqs + srcno); ics_set_kvm_state_one(ics, srcno); } } From 3f777abc7107a27a2c19ee0eb823054fc9ecc902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 13 May 2019 10:42:45 +0200 Subject: [PATCH 38/44] spapr/irq: add KVM support to the 'dual' machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interrupt mode is chosen by the CAS negotiation process and activated after a reset to take into account the required changes in the machine. This brings new constraints on how the associated KVM IRQ device is initialized. Currently, each model takes care of the initialization of the KVM device in their realize method but this is not possible anymore as the initialization needs to be done globaly when the interrupt mode is known, i.e. when machine is reseted. It also means that we need a way to delete a KVM device when another mode is chosen. Also, to support migration, the QEMU objects holding the state to transfer should always be available but not necessarily activated. The overall approach of this proposal is to initialize both interrupt mode at the QEMU level to keep the IRQ number space in sync and to allow switching from one mode to another. For the KVM side of things, the whole initialization of the KVM device, sources and presenters, is grouped in a single routine. The XICS and XIVE sPAPR IRQ reset handlers are modified accordingly to handle the init and the delete sequences of the KVM device. Signed-off-by: Cédric Le Goater Reviewed-by: David Gibson Message-Id: <20190513084245.25755-15-clg@kaod.org> Signed-off-by: David Gibson --- hw/intc/spapr_xive_kvm.c | 29 +++++++++++++++++++- hw/intc/xics_kvm.c | 31 +++++++++++++++++++++ hw/intc/xive.c | 4 --- hw/ppc/spapr_irq.c | 58 ++++++++++++++++++++++++++-------------- include/hw/ppc/xive.h | 1 - 5 files changed, 97 insertions(+), 26 deletions(-) diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index 078d18d775..ec170b3045 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -246,7 +246,7 @@ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) true, errp); } -void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp) +static void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp) { int i; @@ -697,6 +697,15 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp) Error *local_err = NULL; size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs; size_t tima_len = 4ull << TM_SHIFT; + CPUState *cs; + + /* + * The KVM XIVE device already in use. This is the case when + * rebooting under the XIVE-only interrupt mode. + */ + if (xive->fd != -1) { + return; + } if (!kvmppc_has_cap_xive()) { error_setg(errp, "IRQ_XIVE capability must be present for KVM"); @@ -745,6 +754,24 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp) xive->change = qemu_add_vm_change_state_handler( kvmppc_xive_change_state_handler, xive); + /* Connect the presenters to the initial VCPUs of the machine */ + CPU_FOREACH(cs) { + PowerPCCPU *cpu = POWERPC_CPU(cs); + + kvmppc_xive_cpu_connect(spapr_cpu_state(cpu)->tctx, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } + + /* Update the KVM sources */ + kvmppc_xive_source_reset(xsrc, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + kvm_kernel_irqchip = true; kvm_msi_via_irqfd_allowed = true; kvm_gsi_direct_mapping = true; diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c index 12bd5190cf..5ba5b77561 100644 --- a/hw/intc/xics_kvm.c +++ b/hw/intc/xics_kvm.c @@ -33,6 +33,7 @@ #include "trace.h" #include "sysemu/kvm.h" #include "hw/ppc/spapr.h" +#include "hw/ppc/spapr_cpu_core.h" #include "hw/ppc/xics.h" #include "hw/ppc/xics_spapr.h" #include "kvm_ppc.h" @@ -342,6 +343,16 @@ static void rtas_dummy(PowerPCCPU *cpu, SpaprMachineState *spapr, int xics_kvm_init(SpaprMachineState *spapr, Error **errp) { int rc; + CPUState *cs; + Error *local_err = NULL; + + /* + * The KVM XICS device already in use. This is the case when + * rebooting under the XICS-only interrupt mode. + */ + if (kernel_xics_fd != -1) { + return 0; + } if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_IRQ_XICS)) { error_setg(errp, @@ -390,6 +401,26 @@ int xics_kvm_init(SpaprMachineState *spapr, Error **errp) kvm_msi_via_irqfd_allowed = true; kvm_gsi_direct_mapping = true; + /* Create the presenters */ + CPU_FOREACH(cs) { + PowerPCCPU *cpu = POWERPC_CPU(cs); + + icp_kvm_realize(DEVICE(spapr_cpu_state(cpu)->icp), &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto fail; + } + } + + /* Update the KVM sources */ + ics_set_kvm_state(spapr->ics); + + /* Connect the presenters to the initial VCPUs of the machine */ + CPU_FOREACH(cs) { + PowerPCCPU *cpu = POWERPC_CPU(cs); + icp_set_kvm_state(spapr_cpu_state(cpu)->icp); + } + return 0; fail: diff --git a/hw/intc/xive.c b/hw/intc/xive.c index b5ebb33527..0c74e47aa4 100644 --- a/hw/intc/xive.c +++ b/hw/intc/xive.c @@ -993,10 +993,6 @@ static void xive_source_reset(void *dev) /* PQs are initialized to 0b01 (Q=1) which corresponds to "ints off" */ memset(xsrc->status, XIVE_ESB_OFF, xsrc->nr_irqs); - - if (kvm_irqchip_in_kernel()) { - kvmppc_xive_source_reset(xsrc, &error_fatal); - } } static void xive_source_realize(DeviceState *dev, Error **errp) diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index d1e87577fb..3156daf093 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -102,12 +102,6 @@ static void spapr_irq_init_xics(SpaprMachineState *spapr, int nr_irqs, Object *obj; Error *local_err = NULL; - spapr_irq_init_device(spapr, &spapr_irq_xics, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - obj = object_new(TYPE_ICS_SIMPLE); object_property_add_child(OBJECT(spapr), "ics", obj, &error_abort); object_property_add_const_link(obj, ICS_PROP_XICS, OBJECT(spapr), @@ -226,7 +220,13 @@ static void spapr_irq_set_irq_xics(void *opaque, int srcno, int val) static void spapr_irq_reset_xics(SpaprMachineState *spapr, Error **errp) { - /* TODO: create the KVM XICS device */ + Error *local_err = NULL; + + spapr_irq_init_device(spapr, &spapr_irq_xics, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } } static const char *spapr_irq_get_nodename_xics(SpaprMachineState *spapr) @@ -279,7 +279,6 @@ static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs, uint32_t nr_servers = spapr_max_server_number(spapr); DeviceState *dev; int i; - Error *local_err = NULL; dev = qdev_create(NULL, TYPE_SPAPR_XIVE); qdev_prop_set_uint32(dev, "nr-irqs", nr_irqs); @@ -297,12 +296,6 @@ static void spapr_irq_init_xive(SpaprMachineState *spapr, int nr_irqs, } spapr_xive_hcall_init(spapr); - - spapr_irq_init_device(spapr, &spapr_irq_xive, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } } static int spapr_irq_claim_xive(SpaprMachineState *spapr, int irq, bool lsi, @@ -382,6 +375,7 @@ static int spapr_irq_post_load_xive(SpaprMachineState *spapr, int version_id) static void spapr_irq_reset_xive(SpaprMachineState *spapr, Error **errp) { CPUState *cs; + Error *local_err = NULL; CPU_FOREACH(cs) { PowerPCCPU *cpu = POWERPC_CPU(cs); @@ -390,6 +384,12 @@ static void spapr_irq_reset_xive(SpaprMachineState *spapr, Error **errp) spapr_xive_set_tctx_os_cam(spapr_cpu_state(cpu)->tctx); } + spapr_irq_init_device(spapr, &spapr_irq_xive, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + /* Activate the XIVE MMIOs */ spapr_xive_mmio_set_enabled(spapr->xive, true); } @@ -472,14 +472,8 @@ static SpaprIrq *spapr_irq_current(SpaprMachineState *spapr) static void spapr_irq_init_dual(SpaprMachineState *spapr, int nr_irqs, Error **errp) { - MachineState *machine = MACHINE(spapr); Error *local_err = NULL; - if (kvm_enabled() && machine_kernel_irqchip_allowed(machine)) { - error_setg(errp, "No KVM support for the 'dual' machine"); - return; - } - spapr_irq_xics.init(spapr, spapr_irq_xics.nr_irqs, &local_err); if (local_err) { error_propagate(errp, local_err); @@ -558,6 +552,9 @@ static int spapr_irq_post_load_dual(SpaprMachineState *spapr, int version_id) * defaults to XICS at startup. */ if (spapr_ovec_test(spapr->ov5_cas, OV5_XIVE_EXPLOIT)) { + if (kvm_irqchip_in_kernel()) { + xics_kvm_disconnect(spapr, &error_fatal); + } spapr_irq_xive.reset(spapr, &error_fatal); } @@ -566,12 +563,30 @@ static int spapr_irq_post_load_dual(SpaprMachineState *spapr, int version_id) static void spapr_irq_reset_dual(SpaprMachineState *spapr, Error **errp) { + Error *local_err = NULL; + /* * Deactivate the XIVE MMIOs. The XIVE backend will reenable them * if selected. */ spapr_xive_mmio_set_enabled(spapr->xive, false); + /* Destroy all KVM devices */ + if (kvm_irqchip_in_kernel()) { + xics_kvm_disconnect(spapr, &local_err); + if (local_err) { + error_propagate(errp, local_err); + error_prepend(errp, "KVM XICS disconnect failed: "); + return; + } + kvmppc_xive_disconnect(spapr->xive, &local_err); + if (local_err) { + error_propagate(errp, local_err); + error_prepend(errp, "KVM XIVE disconnect failed: "); + return; + } + } + spapr_irq_current(spapr)->reset(spapr, errp); } @@ -809,6 +824,9 @@ SpaprIrq spapr_irq_xics_legacy = { .dt_populate = spapr_dt_xics, .cpu_intc_create = spapr_irq_cpu_intc_create_xics, .post_load = spapr_irq_post_load_xics, + .reset = spapr_irq_reset_xics, .set_irq = spapr_irq_set_irq_xics, .get_nodename = spapr_irq_get_nodename_xics, + .init_emu = spapr_irq_init_emu_xics, + .init_kvm = spapr_irq_init_kvm_xics, }; diff --git a/include/hw/ppc/xive.h b/include/hw/ppc/xive.h index edb8937f17..d872f96d1a 100644 --- a/include/hw/ppc/xive.h +++ b/include/hw/ppc/xive.h @@ -432,7 +432,6 @@ static inline uint32_t xive_nvt_cam_line(uint8_t nvt_blk, uint32_t nvt_idx) */ void kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp); -void kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp); void kvmppc_xive_source_set_irq(void *opaque, int srcno, int val); void kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp); void kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp); From 24563a587f964ff16477feab54686ccf617d3570 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 21 May 2019 10:24:11 +0200 Subject: [PATCH 39/44] docs: provide documentation on the POWER9 XIVE interrupt controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This documents the overall XIVE architecture and the XIVE support for sPAPR guest machines (pseries). It also provides documentation on the 'info pic' command. Signed-off-by: Cédric Le Goater Message-Id: <20190521082411.24719-1-clg@kaod.org> Reviewed-by: Satheesh Rajendran Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- MAINTAINERS | 1 + docs/index.rst | 1 + docs/specs/index.rst | 13 +++ docs/specs/ppc-spapr-xive.rst | 174 +++++++++++++++++++++++++++++ docs/specs/ppc-xive.rst | 199 ++++++++++++++++++++++++++++++++++ 5 files changed, 388 insertions(+) create mode 100644 docs/specs/index.rst create mode 100644 docs/specs/ppc-spapr-xive.rst create mode 100644 docs/specs/ppc-xive.rst diff --git a/MAINTAINERS b/MAINTAINERS index 1f5f8b7a2c..6f0609d61b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1712,6 +1712,7 @@ L: qemu-ppc@nongnu.org S: Supported F: hw/*/*xive* F: include/hw/*/*xive* +F: docs/*/*xive* Subsystems ---------- diff --git a/docs/index.rst b/docs/index.rst index 3690955dd1..baa5791c17 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,4 +12,5 @@ Welcome to QEMU's documentation! interop/index devel/index + specs/index diff --git a/docs/specs/index.rst b/docs/specs/index.rst new file mode 100644 index 0000000000..2e927519c2 --- /dev/null +++ b/docs/specs/index.rst @@ -0,0 +1,13 @@ +. This is the top level page for the 'specs' manual + + +QEMU full-system emulation guest hardware specifications +======================================================== + + +Contents: + +.. toctree:: + :maxdepth: 2 + + xive diff --git a/docs/specs/ppc-spapr-xive.rst b/docs/specs/ppc-spapr-xive.rst new file mode 100644 index 0000000000..539ce7ca4e --- /dev/null +++ b/docs/specs/ppc-spapr-xive.rst @@ -0,0 +1,174 @@ +XIVE for sPAPR (pseries machines) +================================= + +The POWER9 processor comes with a new interrupt controller +architecture, called XIVE as "eXternal Interrupt Virtualization +Engine". It supports a larger number of interrupt sources and offers +virtualization features which enables the HW to deliver interrupts +directly to virtual processors without hypervisor assistance. + +A QEMU ``pseries`` machine (which is PAPR compliant) using POWER9 +processors can run under two interrupt modes: + +- *Legacy Compatibility Mode* + + the hypervisor provides identical interfaces and similar + functionality to PAPR+ Version 2.7. This is the default mode + + It is also referred as *XICS* in QEMU. + +- *XIVE native exploitation mode* + + the hypervisor provides new interfaces to manage the XIVE control + structures, and provides direct control for interrupt management + through MMIO pages. + +Which interrupt modes can be used by the machine is negotiated with +the guest O/S during the Client Architecture Support negotiation +sequence. The two modes are mutually exclusive. + +Both interrupt mode share the same IRQ number space. See below for the +layout. + +CAS Negotiation +--------------- + +QEMU advertises the supported interrupt modes in the device tree +property "ibm,arch-vec-5-platform-support" in byte 23 and the OS +Selection for XIVE is indicated in the "ibm,architecture-vec-5" +property byte 23. + +The interrupt modes supported by the machine depend on the CPU type +(POWER9 is required for XIVE) but also on the machine property +``ic-mode`` which can be set on the command line. It can take the +following values: ``xics``, ``xive``, ``dual`` and currently ``xics`` +is the default but it may change in the future. + +The choosen interrupt mode is activated after a reconfiguration done +in a machine reset. + +XIVE Device tree properties +--------------------------- + +The properties for the PAPR interrupt controller node when the *XIVE +native exploitation mode* is selected shoud contain: + +- ``device_type`` + + value should be "power-ivpe". + +- ``compatible`` + + value should be "ibm,power-ivpe". + +- ``reg`` + + contains the base address and size of the thread interrupt + managnement areas (TIMA), for the User level and for the Guest OS + level. Only the Guest OS level is taken into account today. + +- ``ibm,xive-eq-sizes`` + + the size of the event queues. One cell per size supported, contains + log2 of size, in ascending order. + +- ``ibm,xive-lisn-ranges`` + + the IRQ interrupt number ranges assigned to the guest for the IPIs. + +The root node also exports : + +- ``ibm,plat-res-int-priorities`` + + contains a list of priorities that the hypervisor has reserved for + its own use. + +IRQ number space +---------------- + +IRQ Number space of the ``pseries`` machine is 8K wide and is the same +for both interrupt mode. The different ranges are defined as follow : + +- ``0x0000 .. 0x0FFF`` 4K CPU IPIs (only used under XIVE) +- ``0x1000 .. 0x1000`` 1 EPOW +- ``0x1001 .. 0x1001`` 1 HOTPLUG +- ``0x1100 .. 0x11FF`` 256 VIO devices +- ``0x1200 .. 0x127F`` 32 PHBs devices +- ``0x1280 .. 0x12FF`` unused +- ``0x1300 .. 0x1FFF`` PHB MSIs + +Monitoring XIVE +--------------- + +The state of the XIVE interrupt controller can be queried through the +monitor commands ``info pic``. The output comes in two parts. + +First, the state of the thread interrupt context registers is dumped +for each CPU : + +:: + + (qemu) info pic + CPU[0000]: QW NSR CPPR IPB LSMFB ACK# INC AGE PIPR W2 + CPU[0000]: USER 00 00 00 00 00 00 00 00 00000000 + CPU[0000]: OS 00 ff 00 00 ff 00 ff ff 80000400 + CPU[0000]: POOL 00 00 00 00 00 00 00 00 00000000 + CPU[0000]: PHYS 00 00 00 00 00 00 00 ff 00000000 + ... + +In the case of a ``pseries`` machine, QEMU acts as the hypervisor and only +the O/S and USER register rings make sense. ``W2`` contains the vCPU CAM +line which is set to the VP identifier. + +Then comes the routing information which aggregates the EAS and the +END configuration: + +:: + + ... + LISN PQ EISN CPU/PRIO EQ + 00000000 MSI -- 00000010 0/6 380/16384 @1fe3e0000 ^1 [ 80000010 ... ] + 00000001 MSI -- 00000010 1/6 305/16384 @1fc230000 ^1 [ 80000010 ... ] + 00000002 MSI -- 00000010 2/6 220/16384 @1fc2f0000 ^1 [ 80000010 ... ] + 00000003 MSI -- 00000010 3/6 201/16384 @1fc390000 ^1 [ 80000010 ... ] + 00000004 MSI -Q M 00000000 + 00000005 MSI -Q M 00000000 + 00000006 MSI -Q M 00000000 + 00000007 MSI -Q M 00000000 + 00001000 MSI -- 00000012 0/6 380/16384 @1fe3e0000 ^1 [ 80000010 ... ] + 00001001 MSI -- 00000013 0/6 380/16384 @1fe3e0000 ^1 [ 80000010 ... ] + 00001100 MSI -- 00000100 1/6 305/16384 @1fc230000 ^1 [ 80000010 ... ] + 00001101 MSI -Q M 00000000 + 00001200 LSI -Q M 00000000 + 00001201 LSI -Q M 00000000 + 00001202 LSI -Q M 00000000 + 00001203 LSI -Q M 00000000 + 00001300 MSI -- 00000102 1/6 305/16384 @1fc230000 ^1 [ 80000010 ... ] + 00001301 MSI -- 00000103 2/6 220/16384 @1fc2f0000 ^1 [ 80000010 ... ] + 00001302 MSI -- 00000104 3/6 201/16384 @1fc390000 ^1 [ 80000010 ... ] + +The source information and configuration: + +- The ``LISN`` column outputs the interrupt number of the source in + range ``[ 0x0 ... 0x1FFF ]`` and its type : ``MSI`` or ``LSI`` +- The ``PQ`` column reflects the state of the PQ bits of the source : + + - ``--`` source is ready to take events + - ``P-`` an event was sent and an EOI is PENDING + - ``PQ`` an event was QUEUED + - ``-Q`` source is OFF + + a ``M`` indicates that source is *MASKED* at the EAS level, + +The targeting configuration : + +- The ``EISN`` column is the event data that will be queued in the event + queue of the O/S. +- The ``CPU/PRIO`` column is the tuple defining the CPU number and + priority queue serving the source. +- The ``EQ`` column outputs : + + - the current index of the event queue/ the max number of entries + - the O/S event queue address + - the toggle bit + - the last entries that were pushed in the event queue. diff --git a/docs/specs/ppc-xive.rst b/docs/specs/ppc-xive.rst new file mode 100644 index 0000000000..b997dc0629 --- /dev/null +++ b/docs/specs/ppc-xive.rst @@ -0,0 +1,199 @@ +================================ +POWER9 XIVE interrupt controller +================================ + +The POWER9 processor comes with a new interrupt controller +architecture, called XIVE as "eXternal Interrupt Virtualization +Engine". + +Compared to the previous architecture, the main characteristics of +XIVE are to support a larger number of interrupt sources and to +deliver interrupts directly to virtual processors without hypervisor +assistance. This removes the context switches required for the +delivery process. + + +XIVE architecture +================= + +The XIVE IC is composed of three sub-engines, each taking care of a +processing layer of external interrupts: + +- Interrupt Virtualization Source Engine (IVSE), or Source Controller + (SC). These are found in PCI PHBs, in the PSI host bridge + controller, but also inside the main controller for the core IPIs + and other sub-chips (NX, CAP, NPU) of the chip/processor. They are + configured to feed the IVRE with events. +- Interrupt Virtualization Routing Engine (IVRE) or Virtualization + Controller (VC). It handles event coalescing and perform interrupt + routing by matching an event source number with an Event + Notification Descriptor (END). +- Interrupt Virtualization Presentation Engine (IVPE) or Presentation + Controller (PC). It maintains the interrupt context state of each + thread and handles the delivery of the external interrupt to the + thread. + +:: + + XIVE Interrupt Controller + +------------------------------------+ IPIs + | +---------+ +---------+ +--------+ | +-------+ + | |IVRE | |Common Q | |IVPE |----> | CORES | + | | esb | | | | |----> | | + | | eas | | Bridge | | tctx |----> | | + | |SC end | | | | nvt | | | | + +------+ | +---------+ +----+----+ +--------+ | +-+-+-+-+ + | RAM | +------------------|-----------------+ | | | + | | | | | | + | | | | | | + | | +--------------------v------------------------v-v-v--+ other + | <--+ Power Bus +--> chips + | esb | +---------+-----------------------+------------------+ + | eas | | | + | end | +--|------+ | + | nvt | +----+----+ | +----+----+ + +------+ |IVSE | | |IVSE | + | | | | | + | PQ-bits | | | PQ-bits | + | local |-+ | in VC | + +---------+ +---------+ + PCIe NX,NPU,CAPI + + + PQ-bits: 2 bits source state machine (P:pending Q:queued) + esb: Event State Buffer (Array of PQ bits in an IVSE) + eas: Event Assignment Structure + end: Event Notification Descriptor + nvt: Notification Virtual Target + tctx: Thread interrupt Context registers + + + +XIVE internal tables +-------------------- + +Each of the sub-engines uses a set of tables to redirect interrupts +from event sources to CPU threads. + +:: + + +-------+ + User or O/S | EQ | + or +------>|entries| + Hypervisor | | .. | + Memory | +-------+ + | ^ + | | + +-------------------------------------------------+ + | | + Hypervisor +------+ +---+--+ +---+--+ +------+ + Memory | ESB | | EAT | | ENDT | | NVTT | + (skiboot) +----+-+ +----+-+ +----+-+ +------+ + ^ | ^ | ^ | ^ + | | | | | | | + +-------------------------------------------------+ + | | | | | | | + | | | | | | | + +----|--|--------|--|--------|--|-+ +-|-----+ +------+ + | | | | | | | | | | tctx| |Thread| + IPI or ---+ + v + v + v |---| + .. |-----> | + HW events | | | | | | + | IVRE | | IVPE | +------+ + +---------------------------------+ +-------+ + + +The IVSE have a 2-bits state machine, P for pending and Q for queued, +for each source that allows events to be triggered. They are stored in +an Event State Buffer (ESB) array and can be controlled by MMIOs. + +If the event is let through, the IVRE looks up in the Event Assignment +Structure (EAS) table for an Event Notification Descriptor (END) +configured for the source. Each Event Notification Descriptor defines +a notification path to a CPU and an in-memory Event Queue, in which +will be enqueued an EQ data for the O/S to pull. + +The IVPE determines if a Notification Virtual Target (NVT) can handle +the event by scanning the thread contexts of the VCPUs dispatched on +the processor HW threads. It maintains the interrupt context state of +each thread in a NVT table. + +XIVE thread interrupt context +----------------------------- + +The XIVE presenter can generate four different exceptions to its +HW threads: + +- hypervisor exception +- O/S exception +- Event-Based Branch (user level) +- msgsnd (doorbell) + +Each exception has a state independent from the others called a Thread +Interrupt Management context. This context is a set of registers which +lets the thread handle priority management and interrupt +acknowledgment among other things. The most important ones being : + +- Interrupt Priority Register (PIPR) +- Interrupt Pending Buffer (IPB) +- Current Processor Priority (CPPR) +- Notification Source Register (NSR) + +TIMA +~~~~ + +The Thread Interrupt Management registers are accessible through a +specific MMIO region, called the Thread Interrupt Management Area +(TIMA), four aligned pages, each exposing a different view of the +registers. First page (page address ending in ``0b00``) gives access +to the entire context and is reserved for the ring 0 view for the +physical thread context. The second (page address ending in ``0b01``) +is for the hypervisor, ring 1 view. The third (page address ending in +``0b10``) is for the operating system, ring 2 view. The fourth (page +address ending in ``0b11``) is for user level, ring 3 view. + +Interrupt flow from an O/S perspective +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +After an event data has been enqueued in the O/S Event Queue, the IVPE +raises the bit corresponding to the priority of the pending interrupt +in the register IBP (Interrupt Pending Buffer) to indicate that an +event is pending in one of the 8 priority queues. The Pending +Interrupt Priority Register (PIPR) is also updated using the IPB. This +register represent the priority of the most favored pending +notification. + +The PIPR is then compared to the the Current Processor Priority +Register (CPPR). If it is more favored (numerically less than), the +CPU interrupt line is raised and the EO bit of the Notification Source +Register (NSR) is updated to notify the presence of an exception for +the O/S. The O/S acknowledges the interrupt with a special load in the +Thread Interrupt Management Area. + +The O/S handles the interrupt and when done, performs an EOI using a +MMIO operation on the ESB management page of the associate source. + +Overview of the QEMU models for XIVE +==================================== + +The XiveSource models the IVSE in general, internal and external. It +handles the source ESBs and the MMIO interface to control them. + +The XiveNotifier is a small helper interface interconnecting the +XiveSource to the XiveRouter. + +The XiveRouter is an abstract model acting as a combined IVRE and +IVPE. It routes event notifications using the EAS and END tables to +the IVPE sub-engine which does a CAM scan to find a CPU to deliver the +exception. Storage should be provided by the inheriting classes. + +XiveEnDSource is a special source object. It exposes the END ESB MMIOs +of the Event Queues which are used for coalescing event notifications +and for escalation. Not used on the field, only to sync the EQ cache +in OPAL. + +Finally, the XiveTCTX contains the interrupt state context of a thread, +four sets of registers, one for each exception that can be delivered +to a CPU. These contexts are scanned by the IVPE to find a matching VP +when a notification is triggered. It also models the Thread Interrupt +Management Area (TIMA), which exposes the thread context registers to +the CPU for interrupt management. From cdd71c8e9dffc97c5e0569b6ea14dfb65fc74851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 22 May 2019 09:40:15 +0200 Subject: [PATCH 40/44] spapr/xive: fix multiple resets when using the 'dual' interrupt mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today, when a reset occurs on a pseries machine using the 'dual' interrupt mode, the KVM devices are released and recreated depending on the interrupt mode selected by CAS. If XIVE is selected, the SysBus memory regions of the SpaprXive model are initialized by the KVM backend initialization routine each time a reset occurs. This leads to a crash after a couple of resets because the machine reaches the QDEV_MAX_MMIO limit of SysBusDevice : qemu-system-ppc64: hw/core/sysbus.c:193: sysbus_init_mmio: Assertion `dev->num_mmio < QDEV_MAX_MMIO' failed. To fix, initialize the SysBus memory regions in spapr_xive_realize() called only once and remove the same inits from the QEMU and KVM backend initialization routines which are called at each reset. Reported-by: Satheesh Rajendran Signed-off-by: Cédric Le Goater Message-Id: <20190522074016.10521-2-clg@kaod.org> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/intc/spapr_xive.c | 11 +++++------ hw/intc/spapr_xive_kvm.c | 4 ---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index f6f6c29d6a..62e0ef8fa5 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -331,12 +331,16 @@ static void spapr_xive_realize(DeviceState *dev, Error **errp) xive->tm_base + XIVE_TM_USER_PAGE * (1 << TM_SHIFT)); qemu_register_reset(spapr_xive_reset, dev); + + /* Define all XIVE MMIO regions on SysBus */ + sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio); + sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio); + sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio); } void spapr_xive_init(SpaprXive *xive, Error **errp) { XiveSource *xsrc = &xive->source; - XiveENDSource *end_xsrc = &xive->end_source; /* * The emulated XIVE device can only be initialized once. If the @@ -351,11 +355,6 @@ void spapr_xive_init(SpaprXive *xive, Error **errp) memory_region_init_io(&xive->tm_mmio, OBJECT(xive), &xive_tm_ops, xive, "xive.tima", 4ull << TM_SHIFT); - /* Define all XIVE MMIO regions on SysBus */ - sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio); - sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio); - sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio); - /* Map all regions */ spapr_xive_map_mmio(xive); } diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index ec170b3045..b48f135838 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -693,7 +693,6 @@ static void *kvmppc_xive_mmap(SpaprXive *xive, int pgoff, size_t len, void kvmppc_xive_connect(SpaprXive *xive, Error **errp) { XiveSource *xsrc = &xive->source; - XiveENDSource *end_xsrc = &xive->end_source; Error *local_err = NULL; size_t esb_len = (1ull << xsrc->esb_shift) * xsrc->nr_irqs; size_t tima_len = 4ull << TM_SHIFT; @@ -731,12 +730,10 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp) memory_region_init_ram_device_ptr(&xsrc->esb_mmio, OBJECT(xsrc), "xive.esb", esb_len, xsrc->esb_mmap); - sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xsrc->esb_mmio); /* * 2. END ESB pages (No KVM support yet) */ - sysbus_init_mmio(SYS_BUS_DEVICE(xive), &end_xsrc->esb_mmio); /* * 3. TIMA pages - KVM mapping @@ -749,7 +746,6 @@ void kvmppc_xive_connect(SpaprXive *xive, Error **errp) } memory_region_init_ram_device_ptr(&xive->tm_mmio, OBJECT(xive), "xive.tima", tima_len, xive->tm_mmap); - sysbus_init_mmio(SYS_BUS_DEVICE(xive), &xive->tm_mmio); xive->change = qemu_add_vm_change_state_handler( kvmppc_xive_change_state_handler, xive); From bd94bc06479ad3be5e2d5db70fde9e8098b77d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Wed, 22 May 2019 09:40:16 +0200 Subject: [PATCH 41/44] spapr: change default interrupt mode to 'dual' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that XIVE support is complete (QEMU emulated and KVM devices), change the pseries machine to advertise both interrupt modes: XICS (P7/P8) and XIVE (P9). The machine default interrupt modes depends on the version. Current settings are: pseries default interrupt mode 4.1 dual 4.0 xics 3.1 xics 3.0 legacy xics (different IRQ number space layout) Signed-off-by: Cédric Le Goater Message-Id: <20190522074016.10521-3-clg@kaod.org> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 39e698e9b0..4fd16b43f0 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -4352,7 +4352,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON; smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_OFF; spapr_caps_add_properties(smc, &error_abort); - smc->irq = &spapr_irq_xics; + smc->irq = &spapr_irq_dual; smc->dr_phb_enabled = true; } @@ -4430,6 +4430,7 @@ static void spapr_machine_4_0_class_options(MachineClass *mc) spapr_machine_4_1_class_options(mc); compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); smc->phb_placement = phb_placement_4_0; + smc->irq = &spapr_irq_xics; } DEFINE_SPAPR_MACHINE(4_0, "4.0", false); From 3725ef1a944bbe1173b55fdabe76fb17876f1d9e Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 22 May 2019 15:43:46 +0200 Subject: [PATCH 42/44] spapr: Don't migrate the hpt_maxpagesize cap to older machine types Commit 0b8c89be7f7b added the hpt_maxpagesize capability to the migration stream. This is okay for new machine types but it breaks backward migration to older QEMUs, which don't expect the extra subsection. Add a compatibility boolean flag to the sPAPR machine class and use it to skip migration of the capability for machine types 4.0 and older. This fixes migration to an older QEMU. Note that the destination will emit a warning: qemu-system-ppc64: warning: cap-hpt-max-page-size lower level (16) in incoming stream than on destination (24) This is expected and harmless though. It is okay to migrate from a lower HPT maximum page size (64k) to a greater one (16M). Fixes: 0b8c89be7f7b "spapr: Add forgotten capability to migration stream" Based-on: <20190522074016.10521-3-clg@kaod.org> Signed-off-by: Greg Kurz Message-Id: <155853262675.1158324.17301777846476373459.stgit@bahia.lan> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 1 + hw/ppc/spapr_caps.c | 12 +++++++++++- include/hw/ppc/spapr.h | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 4fd16b43f0..e2b33e5890 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -4431,6 +4431,7 @@ static void spapr_machine_4_0_class_options(MachineClass *mc) compat_props_add(mc->compat_props, hw_compat_4_0, hw_compat_4_0_len); smc->phb_placement = phb_placement_4_0; smc->irq = &spapr_irq_xics; + smc->pre_4_1_migration = true; } DEFINE_SPAPR_MACHINE(4_0, "4.0", false); diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index 658eb15a14..31b4661399 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -64,6 +64,7 @@ typedef struct SpaprCapabilityInfo { void (*apply)(SpaprMachineState *spapr, uint8_t val, Error **errp); void (*cpu_apply)(SpaprMachineState *spapr, PowerPCCPU *cpu, uint8_t val, Error **errp); + bool (*migrate_needed)(void *opaque); } SpaprCapabilityInfo; static void spapr_cap_get_bool(Object *obj, Visitor *v, const char *name, @@ -350,6 +351,11 @@ static void cap_hpt_maxpagesize_apply(SpaprMachineState *spapr, spapr_check_pagesize(spapr, qemu_minrampagesize(), errp); } +static bool cap_hpt_maxpagesize_migrate_needed(void *opaque) +{ + return !SPAPR_MACHINE_GET_CLASS(opaque)->pre_4_1_migration; +} + static bool spapr_pagesize_cb(void *opaque, uint32_t seg_pshift, uint32_t pshift) { @@ -542,6 +548,7 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = { .type = "int", .apply = cap_hpt_maxpagesize_apply, .cpu_apply = cap_hpt_maxpagesize_cpu_apply, + .migrate_needed = cap_hpt_maxpagesize_migrate_needed, }, [SPAPR_CAP_NESTED_KVM_HV] = { .name = "nested-hv", @@ -679,8 +686,11 @@ int spapr_caps_post_migration(SpaprMachineState *spapr) static bool spapr_cap_##sname##_needed(void *opaque) \ { \ SpaprMachineState *spapr = opaque; \ + bool (*needed)(void *opaque) = \ + capability_table[cap].migrate_needed; \ \ - return spapr->cmd_line_caps[cap] && \ + return needed ? needed(opaque) : true && \ + spapr->cmd_line_caps[cap] && \ (spapr->eff.caps[cap] != \ spapr->def.caps[cap]); \ } \ diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 9fc91c8f5e..4f5becf1f3 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -119,6 +119,7 @@ struct SpaprMachineClass { bool pre_2_10_has_unused_icps; bool legacy_irq_allocation; bool broken_host_serial_model; /* present real host info to the guest */ + bool pre_4_1_migration; /* don't migrate hpt-max-page-size */ void (*phb_placement)(SpaprMachineState *spapr, uint32_t index, uint64_t *buid, hwaddr *pio, From 83b90bf026767df5faefe3daca9f1649a5591418 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 27 May 2019 09:17:49 +0200 Subject: [PATCH 43/44] ppc/pnv: introduce new skiboot platform properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer skiboots (after 6.3) support QEMU platforms that have characteristics closer to real OpenPOWER systems. The CPU type is used to define the BMC drivers: Aspeed AST2400 for POWER8 processors and AST2500 for POWER9s. Advertise the new platform property names, "qemu,powernv8" and "qemu,powernv9", using the CPU type chosen for the QEMU PowerNV machine. Also, advertise the original platform name "qemu,powernv" in case of POWER8 processors for compatibility with older skiboots. Signed-off-by: Cédric Le Goater Message-Id: <20190527071749.31499-1-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/pnv.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c index 31aa20ee25..046f0a83c8 100644 --- a/hw/ppc/pnv.c +++ b/hw/ppc/pnv.c @@ -450,7 +450,8 @@ static void pnv_dt_power_mgt(void *fdt) static void *pnv_dt_create(MachineState *machine) { - const char plat_compat[] = "qemu,powernv\0ibm,powernv"; + const char plat_compat8[] = "qemu,powernv8\0qemu,powernv\0ibm,powernv"; + const char plat_compat9[] = "qemu,powernv9\0ibm,powernv"; PnvMachineState *pnv = PNV_MACHINE(machine); void *fdt; char *buf; @@ -465,8 +466,14 @@ static void *pnv_dt_create(MachineState *machine) _FDT((fdt_setprop_cell(fdt, 0, "#size-cells", 0x2))); _FDT((fdt_setprop_string(fdt, 0, "model", "IBM PowerNV (emulated by qemu)"))); - _FDT((fdt_setprop(fdt, 0, "compatible", plat_compat, - sizeof(plat_compat)))); + if (pnv_is_power9(pnv)) { + _FDT((fdt_setprop(fdt, 0, "compatible", plat_compat9, + sizeof(plat_compat9)))); + } else { + _FDT((fdt_setprop(fdt, 0, "compatible", plat_compat8, + sizeof(plat_compat8)))); + } + buf = qemu_uuid_unparse_strdup(&qemu_uuid); _FDT((fdt_setprop_string(fdt, 0, "vm,uuid", buf))); From ce4b1b56852ea741170ae85d3b8c0771c1ca7c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Mon, 27 May 2019 09:17:22 +0200 Subject: [PATCH 44/44] ppc/pnv: add dummy XSCOM registers for PRD initialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PRD (Processor recovery diagnostics) is a service available on OpenPower systems. The opal-prd daemon initializes the PowerPC Processor through the XSCOM bus and then waits for hardware diagnostic events. Signed-off-by: Cédric Le Goater Message-Id: <20190527071722.31424-1-clg@kaod.org> Signed-off-by: David Gibson --- hw/ppc/pnv_xscom.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/hw/ppc/pnv_xscom.c b/hw/ppc/pnv_xscom.c index c285ef514e..f53a6d7a94 100644 --- a/hw/ppc/pnv_xscom.c +++ b/hw/ppc/pnv_xscom.c @@ -29,6 +29,12 @@ #include +/* PRD registers */ +#define PRD_P8_IPOLL_REG_MASK 0x01020013 +#define PRD_P8_IPOLL_REG_STATUS 0x01020014 +#define PRD_P9_IPOLL_REG_MASK 0x000F0033 +#define PRD_P9_IPOLL_REG_STATUS 0x000F0034 + static void xscom_complete(CPUState *cs, uint64_t hmer_bits) { /* @@ -70,6 +76,12 @@ static uint64_t xscom_read_default(PnvChip *chip, uint32_t pcba) case 0x1010c00: /* PIBAM FIR */ case 0x1010c03: /* PIBAM FIR MASK */ + /* PRD registers */ + case PRD_P8_IPOLL_REG_MASK: + case PRD_P8_IPOLL_REG_STATUS: + case PRD_P9_IPOLL_REG_MASK: + case PRD_P9_IPOLL_REG_STATUS: + /* P9 xscom reset */ case 0x0090018: /* Receive status reg */ case 0x0090012: /* log register */ @@ -124,6 +136,12 @@ static bool xscom_write_default(PnvChip *chip, uint32_t pcba, uint64_t val) case 0x201302a: /* CAPP stuff */ case 0x2013801: /* CAPP stuff */ case 0x2013802: /* CAPP stuff */ + + /* P8 PRD registers */ + case PRD_P8_IPOLL_REG_MASK: + case PRD_P8_IPOLL_REG_STATUS: + case PRD_P9_IPOLL_REG_MASK: + case PRD_P9_IPOLL_REG_STATUS: return true; default: return false;