diff --git a/exec.c b/exec.c index 7683afb6a8..e34b602bdf 100644 --- a/exec.c +++ b/exec.c @@ -899,6 +899,7 @@ Property cpu_common_props[] = { DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION, MemoryRegion *), #endif + DEFINE_PROP_BOOL("start-powered-off", CPUState, start_powered_off, false), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/core/cpu.c b/hw/core/cpu.c index 22bc3f974a..8f65383ffb 100644 --- a/hw/core/cpu.c +++ b/hw/core/cpu.c @@ -258,7 +258,7 @@ static void cpu_common_reset(DeviceState *dev) } cpu->interrupt_request = 0; - cpu->halted = 0; + cpu->halted = cpu->start_powered_off; cpu->mem_io_pc = 0; cpu->icount_extra = 0; atomic_set(&cpu->icount_decr_ptr->u32, 0); diff --git a/hw/input/adb.c b/hw/input/adb.c index 013fcc9c54..84331b9fce 100644 --- a/hw/input/adb.c +++ b/hw/input/adb.c @@ -309,6 +309,7 @@ static void adb_device_class_init(ObjectClass *oc, void *data) static const TypeInfo adb_device_type_info = { .name = TYPE_ADB_DEVICE, .parent = TYPE_DEVICE, + .class_size = sizeof(ADBDeviceClass), .instance_size = sizeof(ADBDevice), .abstract = true, .class_init = adb_device_class_init, diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c index 4bd0d606ba..1fa09f287a 100644 --- a/hw/intc/spapr_xive.c +++ b/hw/intc/spapr_xive.c @@ -595,6 +595,7 @@ static Property spapr_xive_properties[] = { DEFINE_PROP_UINT32("nr-ends", SpaprXive, nr_ends, 0), DEFINE_PROP_UINT64("vc-base", SpaprXive, vc_base, SPAPR_XIVE_VC_BASE), DEFINE_PROP_UINT64("tm-base", SpaprXive, tm_base, SPAPR_XIVE_TM_BASE), + DEFINE_PROP_UINT8("hv-prio", SpaprXive, hv_prio, 7), DEFINE_PROP_END_OF_LIST(), }; @@ -692,12 +693,13 @@ static void spapr_xive_dt(SpaprInterruptController *intc, uint32_t nr_servers, cpu_to_be32(16), /* 64K */ }; /* - * The following array is in sync with the reserved priorities - * defined by the 'spapr_xive_priority_is_reserved' routine. + * QEMU/KVM only needs to define a single range to reserve the + * escalation priority. A priority bitmask would have been more + * appropriate. */ uint32_t plat_res_int_priorities[] = { - cpu_to_be32(7), /* start */ - cpu_to_be32(0xf8), /* count */ + cpu_to_be32(xive->hv_prio), /* start */ + cpu_to_be32(0xff - xive->hv_prio), /* count */ }; /* Thread Interrupt Management Area : User (ring 3) and OS (ring 2) */ @@ -844,19 +846,12 @@ type_init(spapr_xive_register_types) */ /* - * Linux hosts under OPAL reserve priority 7 for their own escalation - * interrupts (DD2.X POWER9). So we only allow the guest to use - * priorities [0..6]. + * On POWER9, the KVM XIVE device uses priority 7 for the escalation + * interrupts. So we only allow the guest to use priorities [0..6]. */ -static bool spapr_xive_priority_is_reserved(uint8_t priority) +static bool spapr_xive_priority_is_reserved(SpaprXive *xive, uint8_t priority) { - switch (priority) { - case 0 ... 
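/*
 * Illustrative, standalone sketch (not part of the patch): with the new
 * "hv-prio" property added above (default 7), the "ibm,plat-res-int-priorities"
 * pair reserves every priority from hv_prio upwards, which is the same test
 * the reworked spapr_xive_priority_is_reserved() performs.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool priority_is_reserved(uint8_t hv_prio, uint8_t priority)
{
    return priority >= hv_prio;
}

int main(void)
{
    uint8_t hv_prio = 7;                /* default value of "hv-prio"    */
    unsigned start = hv_prio;           /* first reserved priority       */
    unsigned count = 0xff - hv_prio;    /* number of reserved priorities */
    unsigned prio;

    printf("plat-res-int-priorities: start=%u count=%u\n", start, count);
    for (prio = 0; prio < 8; prio++) {
        printf("priority %u: %s\n", prio,
               priority_is_reserved(hv_prio, prio) ? "reserved" : "usable");
    }
    return 0;
}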
6: - return false; - case 7: /* OPAL escalation queue */ - default: - return true; - } + return priority >= xive->hv_prio; } /* @@ -1053,7 +1048,7 @@ static target_ulong h_int_set_source_config(PowerPCCPU *cpu, new_eas.w = eas.w & cpu_to_be64(~EAS_MASKED); } - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P4; @@ -1212,7 +1207,7 @@ static target_ulong h_int_get_queue_info(PowerPCCPU *cpu, * This is not needed when running the emulation under QEMU */ - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P3; @@ -1299,7 +1294,7 @@ static target_ulong h_int_set_queue_config(PowerPCCPU *cpu, * This is not needed when running the emulation under QEMU */ - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P3; @@ -1466,7 +1461,7 @@ static target_ulong h_int_get_queue_config(PowerPCCPU *cpu, * This is not needed when running the emulation under QEMU */ - if (spapr_xive_priority_is_reserved(priority)) { + if (spapr_xive_priority_is_reserved(xive, priority)) { qemu_log_mask(LOG_GUEST_ERROR, "XIVE: priority " TARGET_FMT_ld " is reserved\n", priority); return H_P3; diff --git a/hw/intc/spapr_xive_kvm.c b/hw/intc/spapr_xive_kvm.c index e8667ce5f6..66bf4c06fe 100644 --- a/hw/intc/spapr_xive_kvm.c +++ b/hw/intc/spapr_xive_kvm.c @@ -36,10 +36,9 @@ typedef struct KVMEnabledCPU { static QLIST_HEAD(, KVMEnabledCPU) kvm_enabled_cpus = QLIST_HEAD_INITIALIZER(&kvm_enabled_cpus); -static bool kvm_cpu_is_enabled(CPUState *cs) +static bool kvm_cpu_is_enabled(unsigned long vcpu_id) { KVMEnabledCPU *enabled_cpu; - unsigned long vcpu_id = kvm_arch_vcpu_id(cs); QLIST_FOREACH(enabled_cpu, &kvm_enabled_cpus, node) { if (enabled_cpu->vcpu_id == vcpu_id) { @@ -147,6 +146,45 @@ int kvmppc_xive_cpu_synchronize_state(XiveTCTX *tctx, Error **errp) return s.ret; } +/* + * Allocate the vCPU IPIs from the vCPU context. This will allocate + * the XIVE IPI interrupt on the chip on which the vCPU is running. + * This gives a better distribution of IPIs when the guest has a lot + * of vCPUs. When the vCPUs are pinned, this will make the IPI local + * to the chip of the vCPU. It will reduce rerouting between interrupt + * controllers and gives better performance. + */ +typedef struct { + SpaprXive *xive; + Error *err; + int rc; +} XiveInitIPI; + +static void kvmppc_xive_reset_ipi_on_cpu(CPUState *cs, run_on_cpu_data arg) +{ + unsigned long ipi = kvm_arch_vcpu_id(cs); + XiveInitIPI *s = arg.host_ptr; + uint64_t state = 0; + + s->rc = kvm_device_access(s->xive->fd, KVM_DEV_XIVE_GRP_SOURCE, ipi, + &state, true, &s->err); +} + +static int kvmppc_xive_reset_ipi(SpaprXive *xive, CPUState *cs, Error **errp) +{ + XiveInitIPI s = { + .xive = xive, + .err = NULL, + .rc = 0, + }; + + run_on_cpu(cs, kvmppc_xive_reset_ipi_on_cpu, RUN_ON_CPU_HOST_PTR(&s)); + if (s.err) { + error_propagate(errp, s.err); + } + return s.rc; +} + int kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) { ERRP_GUARD(); @@ -157,7 +195,7 @@ int kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) assert(xive->fd != -1); /* Check if CPU was hot unplugged and replugged. 
*/ - if (kvm_cpu_is_enabled(tctx->cs)) { + if (kvm_cpu_is_enabled(kvm_arch_vcpu_id(tctx->cs))) { return 0; } @@ -176,6 +214,12 @@ int kvmppc_xive_cpu_connect(XiveTCTX *tctx, Error **errp) return ret; } + /* Create/reset the vCPU IPI */ + ret = kvmppc_xive_reset_ipi(xive, tctx->cs, errp); + if (ret < 0) { + return ret; + } + kvm_cpu_enable(tctx->cs); return 0; } @@ -235,6 +279,12 @@ int kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) assert(xive->fd != -1); + /* + * The vCPU IPIs are now allocated in kvmppc_xive_cpu_connect() + * and not with all sources in kvmppc_xive_source_reset() + */ + assert(srcno >= SPAPR_XIRQ_BASE); + if (xive_source_irq_is_lsi(xsrc, srcno)) { state |= KVM_XIVE_LEVEL_SENSITIVE; if (xsrc->status[srcno] & XIVE_STATUS_ASSERTED) { @@ -246,12 +296,28 @@ int kvmppc_xive_source_reset_one(XiveSource *xsrc, int srcno, Error **errp) true, errp); } +/* + * To be valid, a source must have been claimed by the machine (valid + * entry in the EAS table) and if it is a vCPU IPI, the vCPU should + * have been enabled, which means the IPI has been allocated in + * kvmppc_xive_cpu_connect(). + */ +static bool xive_source_is_valid(SpaprXive *xive, int i) +{ + return xive_eas_is_valid(&xive->eat[i]) && + (i >= SPAPR_XIRQ_BASE || kvm_cpu_is_enabled(i)); +} + static int kvmppc_xive_source_reset(XiveSource *xsrc, Error **errp) { SpaprXive *xive = SPAPR_XIVE(xsrc->xive); int i; - for (i = 0; i < xsrc->nr_irqs; i++) { + /* + * Skip the vCPU IPIs. These are created/reset when the vCPUs are + * connected in kvmppc_xive_cpu_connect() + */ + for (i = SPAPR_XIRQ_BASE; i < xsrc->nr_irqs; i++) { int ret; if (!xive_eas_is_valid(&xive->eat[i])) { @@ -333,7 +399,7 @@ static void kvmppc_xive_source_get_state(XiveSource *xsrc) for (i = 0; i < xsrc->nr_irqs; i++) { uint8_t pq; - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } @@ -516,7 +582,7 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running, uint8_t pq; uint8_t old_pq; - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } @@ -544,7 +610,7 @@ static void kvmppc_xive_change_state_handler(void *opaque, int running, for (i = 0; i < xsrc->nr_irqs; i++) { uint8_t pq; - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } @@ -647,22 +713,22 @@ int kvmppc_xive_post_load(SpaprXive *xive, int version_id) } } + /* + * We can only restore the source config if the source has been + * previously set in KVM. Since we don't do that at reset time + * when restoring a VM, let's do it now. + */ + ret = kvmppc_xive_source_reset(&xive->source, &local_err); + if (ret < 0) { + goto fail; + } + /* Restore the EAT */ for (i = 0; i < xive->nr_irqs; i++) { - if (!xive_eas_is_valid(&xive->eat[i])) { + if (!xive_source_is_valid(xive, i)) { continue; } - /* - * We can only restore the source config if the source has been - * previously set in KVM. Since we don't do that for all interrupts - * at reset time anymore, let's do it now. - */ - ret = kvmppc_xive_source_reset_one(&xive->source, i, &local_err); - if (ret < 0) { - goto fail; - } - ret = kvmppc_xive_set_source_config(xive, i, &xive->eat[i], &local_err); if (ret < 0) { goto fail; diff --git a/hw/mips/cps.c b/hw/mips/cps.c index 615e1a1ad2..23c0f87e41 100644 --- a/hw/mips/cps.c +++ b/hw/mips/cps.c @@ -52,9 +52,6 @@ static void main_cpu_reset(void *opaque) CPUState *cs = CPU(cpu); cpu_reset(cs); - - /* All VPs are halted on reset. Leave powering up to CPC. 
*/ - cs->halted = 1; } static bool cpu_mips_itu_supported(CPUMIPSState *env) @@ -76,7 +73,17 @@ static void mips_cps_realize(DeviceState *dev, Error **errp) bool saar_present = false; for (i = 0; i < s->num_vp; i++) { - cpu = MIPS_CPU(cpu_create(s->cpu_type)); + cpu = MIPS_CPU(object_new(s->cpu_type)); + + /* All VPs are halted on reset. Leave powering up to CPC. */ + if (!object_property_set_bool(OBJECT(cpu), "start-powered-off", true, + errp)) { + return; + } + + if (!qdev_realize_and_unref(DEVICE(cpu), NULL, errp)) { + return; + } /* Init internal devices */ cpu_mips_irq_init_cpu(cpu); diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index ab9884e315..ae39b9358e 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -704,9 +704,6 @@ static void ppce500_cpu_reset_sec(void *opaque) cpu_reset(cs); - /* Secondary CPU starts in halted state for now. Needs to change when - implementing non-kernel boot. */ - cs->halted = 1; cs->exception_index = EXCP_HLT; } @@ -865,7 +862,7 @@ void ppce500_init(MachineState *machine) CPUState *cs; qemu_irq *input; - cpu = POWERPC_CPU(cpu_create(machine->cpu_type)); + cpu = POWERPC_CPU(object_new(machine->cpu_type)); env = &cpu->env; cs = CPU(cpu); @@ -875,6 +872,14 @@ void ppce500_init(MachineState *machine) exit(1); } + /* + * Secondary CPU starts in halted state for now. Needs to change + * when implementing non-kernel boot. + */ + object_property_set_bool(OBJECT(cs), "start-powered-off", i != 0, + &error_fatal); + qdev_realize_and_unref(DEVICE(cs), NULL, &error_fatal); + if (!firstenv) { firstenv = env; } diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build index 918969b320..ffa2ec37fa 100644 --- a/hw/ppc/meson.build +++ b/hw/ppc/meson.build @@ -25,7 +25,8 @@ ppc_ss.add(when: 'CONFIG_PSERIES', if_true: files( 'spapr_irq.c', 'spapr_tpm_proxy.c', 'spapr_nvdimm.c', - 'spapr_rtas_ddw.c' + 'spapr_rtas_ddw.c', + 'spapr_numa.c', )) ppc_ss.add(when: 'CONFIG_SPAPR_RNG', if_true: files('spapr_rng.c')) ppc_ss.add(when: ['CONFIG_PSERIES', 'CONFIG_LINUX'], if_true: files( diff --git a/hw/ppc/pnv_bmc.c b/hw/ppc/pnv_bmc.c index 2e1a03daa4..67ebb16c4d 100644 --- a/hw/ppc/pnv_bmc.c +++ b/hw/ppc/pnv_bmc.c @@ -140,6 +140,27 @@ static uint16_t bytes_to_blocks(uint32_t bytes) return bytes >> BLOCK_SHIFT; } +static uint32_t blocks_to_bytes(uint16_t blocks) +{ + return blocks << BLOCK_SHIFT; +} + +static int hiomap_erase(PnvPnor *pnor, uint32_t offset, uint32_t size) +{ + MemTxResult result; + int i; + + for (i = 0; i < size / 4; i++) { + result = memory_region_dispatch_write(&pnor->mmio, offset + i * 4, + 0xFFFFFFFF, MO_32, + MEMTXATTRS_UNSPECIFIED); + if (result != MEMTX_OK) { + return -1; + } + } + return 0; +} + static void hiomap_cmd(IPMIBmcSim *ibs, uint8_t *cmd, unsigned int cmd_len, RspBuffer *rsp) { @@ -155,10 +176,16 @@ static void hiomap_cmd(IPMIBmcSim *ibs, uint8_t *cmd, unsigned int cmd_len, switch (cmd[2]) { case HIOMAP_C_MARK_DIRTY: case HIOMAP_C_FLUSH: - case HIOMAP_C_ERASE: case HIOMAP_C_ACK: break; + case HIOMAP_C_ERASE: + if (hiomap_erase(pnor, blocks_to_bytes(cmd[5] << 8 | cmd[4]), + blocks_to_bytes(cmd[7] << 8 | cmd[6]))) { + rsp_buffer_set_error(rsp, IPMI_CC_UNSPECIFIED); + } + break; + case HIOMAP_C_GET_INFO: rsp_buffer_push(rsp, 2); /* Version 2 */ rsp_buffer_push(rsp, BLOCK_SHIFT); /* block size */ diff --git a/hw/ppc/pnv_lpc.c b/hw/ppc/pnv_lpc.c index b5ffa48dac..23f1e09492 100644 --- a/hw/ppc/pnv_lpc.c +++ b/hw/ppc/pnv_lpc.c @@ -646,7 +646,6 @@ static void pnv_lpc_power8_class_init(ObjectClass *klass, void *data) static const TypeInfo pnv_lpc_power8_info = { .name 
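/*
 * Illustrative, standalone sketch (not part of the patch): how the new
 * HIOMAP_C_ERASE handling in hw/ppc/pnv_bmc.c above decodes the IPMI request
 * into a PNOR byte range.  cmd[4]/cmd[5] carry the little-endian 16-bit start
 * block and cmd[6]/cmd[7] the block count; a BLOCK_SHIFT of 12 (4 KiB blocks)
 * is assumed here.  The model then overwrites that range with 0xFFFFFFFF,
 * one 32-bit word at a time, to emulate a flash erase.
 */
#include <stdio.h>
#include <stdint.h>

#define BLOCK_SHIFT 12 /* assumed block size: 4 KiB */

static uint32_t blocks_to_bytes(uint16_t blocks)
{
    return (uint32_t)blocks << BLOCK_SHIFT;
}

int main(void)
{
    /* leading header bytes (not relevant here), start block 16, count 2 */
    uint8_t cmd[] = { 0, 0, 0, 0, 0x10, 0x00, 0x02, 0x00 };
    unsigned offset = blocks_to_bytes(cmd[5] << 8 | cmd[4]);
    unsigned size = blocks_to_bytes(cmd[7] << 8 | cmd[6]);

    printf("erase offset=0x%x size=0x%x words=%u\n", offset, size, size / 4);
    return 0;
}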
= TYPE_PNV8_LPC, .parent = TYPE_PNV_LPC, - .instance_size = sizeof(PnvLpcController), .class_init = pnv_lpc_power8_class_init, .interfaces = (InterfaceInfo[]) { { TYPE_PNV_XSCOM_INTERFACE }, @@ -687,7 +686,6 @@ static void pnv_lpc_power9_class_init(ObjectClass *klass, void *data) static const TypeInfo pnv_lpc_power9_info = { .name = TYPE_PNV9_LPC, .parent = TYPE_PNV_LPC, - .instance_size = sizeof(PnvLpcController), .class_init = pnv_lpc_power9_class_init, }; @@ -768,6 +766,7 @@ static void pnv_lpc_class_init(ObjectClass *klass, void *data) static const TypeInfo pnv_lpc_info = { .name = TYPE_PNV_LPC, .parent = TYPE_DEVICE, + .instance_size = sizeof(PnvLpcController), .class_init = pnv_lpc_class_init, .class_size = sizeof(PnvLpcClass), .abstract = true, diff --git a/hw/ppc/ppc4xx_pci.c b/hw/ppc/ppc4xx_pci.c index 3ea47df71f..503ef46b39 100644 --- a/hw/ppc/ppc4xx_pci.c +++ b/hw/ppc/ppc4xx_pci.c @@ -256,10 +256,7 @@ static void ppc4xx_pci_set_irq(void *opaque, int irq_num, int level) qemu_irq *pci_irqs = opaque; trace_ppc4xx_pci_set_irq(irq_num); - if (irq_num < 0) { - fprintf(stderr, "%s: PCI irq %d\n", __func__, irq_num); - return; - } + assert(irq_num >= 0); qemu_set_irq(pci_irqs[irq_num], level); } @@ -320,7 +317,8 @@ static void ppc4xx_pcihost_realize(DeviceState *dev, Error **errp) b = pci_register_root_bus(dev, NULL, ppc4xx_pci_set_irq, ppc4xx_pci_map_irq, s->irq, get_system_memory(), - get_system_io(), 0, 4, TYPE_PCI_BUS); + get_system_io(), 0, ARRAY_SIZE(s->irq), + TYPE_PCI_BUS); h->bus = b; pci_create_simple(b, 0, "ppc4xx-host-bridge"); diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index dd2fa4826b..9bce1892b5 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -81,6 +81,7 @@ #include "hw/mem/memory-device.h" #include "hw/ppc/spapr_tpm_proxy.h" #include "hw/ppc/spapr_nvdimm.h" +#include "hw/ppc/spapr_numa.h" #include "monitor/monitor.h" @@ -201,21 +202,6 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu, return ret; } -static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu) -{ - int index = spapr_get_vcpu_id(cpu); - uint32_t associativity[] = {cpu_to_be32(0x5), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(cpu->node_id), - cpu_to_be32(index)}; - - /* Advertise NUMA via ibm,associativity */ - return fdt_setprop(fdt, offset, "ibm,associativity", associativity, - sizeof(associativity)); -} - static void spapr_dt_pa_features(SpaprMachineState *spapr, PowerPCCPU *cpu, void *fdt, int offset) @@ -313,14 +299,9 @@ static void add_str(GString *s, const gchar *s1) g_string_append_len(s, s1, strlen(s1) + 1); } -static int spapr_dt_memory_node(void *fdt, int nodeid, hwaddr start, - hwaddr size) +static int spapr_dt_memory_node(SpaprMachineState *spapr, void *fdt, int nodeid, + hwaddr start, hwaddr size) { - uint32_t associativity[] = { - cpu_to_be32(0x4), /* length */ - cpu_to_be32(0x0), cpu_to_be32(0x0), - cpu_to_be32(0x0), cpu_to_be32(nodeid) - }; char mem_name[32]; uint64_t mem_reg_property[2]; int off; @@ -334,8 +315,7 @@ static int spapr_dt_memory_node(void *fdt, int nodeid, hwaddr start, _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); _FDT((fdt_setprop(fdt, off, "reg", mem_reg_property, sizeof(mem_reg_property)))); - _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity, - sizeof(associativity)))); + spapr_numa_write_associativity_dt(spapr, fdt, off, nodeid); return off; } @@ -555,13 +535,10 @@ static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr, void *fdt) { MachineState 
*machine = MACHINE(spapr); - int nb_numa_nodes = machine->numa_state->num_nodes; - int ret, i, offset; + int ret, offset; uint64_t lmb_size = SPAPR_MEMORY_BLOCK_SIZE; uint32_t prop_lmb_size[] = {cpu_to_be32(lmb_size >> 32), cpu_to_be32(lmb_size & 0xffffffff)}; - uint32_t *int_buf, *cur_index, buf_len; - int nr_nodes = nb_numa_nodes ? nb_numa_nodes : 1; MemoryDeviceInfoList *dimms = NULL; /* @@ -602,25 +579,7 @@ static int spapr_dt_dynamic_reconfiguration_memory(SpaprMachineState *spapr, return ret; } - /* ibm,associativity-lookup-arrays */ - buf_len = (nr_nodes * 4 + 2) * sizeof(uint32_t); - cur_index = int_buf = g_malloc0(buf_len); - int_buf[0] = cpu_to_be32(nr_nodes); - int_buf[1] = cpu_to_be32(4); /* Number of entries per associativity list */ - cur_index += 2; - for (i = 0; i < nr_nodes; i++) { - uint32_t associativity[] = { - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(i) - }; - memcpy(cur_index, associativity, sizeof(associativity)); - cur_index += 4; - } - ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf, - (cur_index - int_buf) * sizeof(uint32_t)); - g_free(int_buf); + ret = spapr_numa_write_assoc_lookup_arrays(spapr, fdt, offset); return ret; } @@ -648,7 +607,7 @@ static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt) if (!mem_start) { /* spapr_machine_init() checks for rma_size <= node0_size * already */ - spapr_dt_memory_node(fdt, i, 0, spapr->rma_size); + spapr_dt_memory_node(spapr, fdt, i, 0, spapr->rma_size); mem_start += spapr->rma_size; node_size -= spapr->rma_size; } @@ -660,7 +619,7 @@ static int spapr_dt_memory(SpaprMachineState *spapr, void *fdt) sizetmp = 1ULL << ctzl(mem_start); } - spapr_dt_memory_node(fdt, i, mem_start, sizetmp); + spapr_dt_memory_node(spapr, fdt, i, mem_start, sizetmp); node_size -= sizetmp; mem_start += sizetmp; } @@ -790,7 +749,7 @@ static void spapr_dt_cpu(CPUState *cs, void *fdt, int offset, pft_size_prop, sizeof(pft_size_prop)))); if (ms->numa_state->num_nodes > 1) { - _FDT(spapr_fixup_cpu_numa_dt(fdt, offset, cpu)); + _FDT(spapr_numa_fixup_cpu_dt(spapr, fdt, offset, cpu)); } _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt)); @@ -891,16 +850,9 @@ static int spapr_dt_rng(void *fdt) static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) { MachineState *ms = MACHINE(spapr); - SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(ms); int rtas; GString *hypertas = g_string_sized_new(256); GString *qemu_hypertas = g_string_sized_new(256); - uint32_t refpoints[] = { - cpu_to_be32(0x4), - cpu_to_be32(0x4), - cpu_to_be32(0x2), - }; - uint32_t nr_refpoints = ARRAY_SIZE(refpoints); uint64_t max_device_addr = MACHINE(spapr)->device_memory->base + memory_region_size(&MACHINE(spapr)->device_memory->mr); uint32_t lrdr_capacity[] = { @@ -910,14 +862,6 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE & 0xffffffff), cpu_to_be32(ms->smp.max_cpus / ms->smp.threads), }; - uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 
1 : 0); - uint32_t maxdomains[] = { - cpu_to_be32(4), - maxdomain, - maxdomain, - maxdomain, - cpu_to_be32(spapr->gpu_numa_id), - }; _FDT(rtas = fdt_add_subnode(fdt, 0, "rtas")); @@ -953,15 +897,7 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) qemu_hypertas->str, qemu_hypertas->len)); g_string_free(qemu_hypertas, TRUE); - if (smc->pre_5_1_assoc_refpoints) { - nr_refpoints = 2; - } - - _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", - refpoints, nr_refpoints * sizeof(refpoints[0]))); - - _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", - maxdomains, sizeof(maxdomains))); + spapr_numa_write_rtas_dt(spapr, fdt, rtas); /* * FWNMI reserves RTAS_ERROR_LOG_MAX for the machine check error log, @@ -1297,7 +1233,7 @@ void *spapr_build_fdt(SpaprMachineState *spapr, bool reset, size_t space) /* NVDIMM devices */ if (mc->nvdimm_supported) { - spapr_dt_persistent_memory(fdt); + spapr_dt_persistent_memory(spapr, fdt); } return fdt; @@ -2832,6 +2768,9 @@ static void spapr_machine_init(MachineState *machine) */ spapr->gpu_numa_id = MAX(1, machine->numa_state->num_nodes); + /* Init numa_assoc_array */ + spapr_numa_associativity_init(spapr, machine); + if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) && ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0, spapr->max_compat_pvr)) { @@ -3416,7 +3355,7 @@ int spapr_lmb_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, addr = spapr_drc_index(drc) * SPAPR_MEMORY_BLOCK_SIZE; node = object_property_get_uint(OBJECT(drc->dev), PC_DIMM_NODE_PROP, &error_abort); - *fdt_start_offset = spapr_dt_memory_node(fdt, node, addr, + *fdt_start_offset = spapr_dt_memory_node(spapr, fdt, node, addr, SPAPR_MEMORY_BLOCK_SIZE); return 0; } @@ -3520,7 +3459,6 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, { const SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(hotplug_dev); SpaprMachineState *spapr = SPAPR_MACHINE(hotplug_dev); - const MachineClass *mc = MACHINE_CLASS(smc); bool is_nvdimm = object_dynamic_cast(OBJECT(dev), TYPE_NVDIMM); PCDIMMDevice *dimm = PC_DIMM(dev); Error *local_err = NULL; @@ -3533,27 +3471,22 @@ static void spapr_memory_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - if (is_nvdimm && !mc->nvdimm_supported) { - error_setg(errp, "NVDIMM hotplug not supported for this machine"); - return; - } - size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &local_err); if (local_err) { error_propagate(errp, local_err); return; } - if (!is_nvdimm && size % SPAPR_MEMORY_BLOCK_SIZE) { - error_setg(errp, "Hotplugged memory size must be a multiple of " - "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB); - return; - } else if (is_nvdimm) { - spapr_nvdimm_validate_opts(NVDIMM(dev), size, &local_err); + if (is_nvdimm) { + spapr_nvdimm_validate(hotplug_dev, NVDIMM(dev), size, &local_err); if (local_err) { error_propagate(errp, local_err); return; } + } else if (size % SPAPR_MEMORY_BLOCK_SIZE) { + error_setg(errp, "Hotplugged memory size must be a multiple of " + "%" PRIu64 " MB", SPAPR_MEMORY_BLOCK_SIZE / MiB); + return; } memdev = object_property_get_link(OBJECT(dimm), PC_DIMM_MEMDEV_PROP, diff --git a/hw/ppc/spapr_cpu_core.c b/hw/ppc/spapr_cpu_core.c index c4f47dcc04..2125fdac34 100644 --- a/hw/ppc/spapr_cpu_core.c +++ b/hw/ppc/spapr_cpu_core.c @@ -36,11 +36,6 @@ static void spapr_reset_vcpu(PowerPCCPU *cpu) cpu_reset(cs); - /* All CPUs start halted. 
CPU0 is unhalted from the machine level - * reset code and the rest are explicitly started up by the guest - * using an RTAS call */ - cs->halted = 1; - env->spr[SPR_HIOR] = 0; lpcr = env->spr[SPR_LPCR]; @@ -274,6 +269,11 @@ static PowerPCCPU *spapr_create_vcpu(SpaprCpuCore *sc, int i, Error **errp) cs = CPU(obj); cpu = POWERPC_CPU(obj); + /* + * All CPUs start halted. CPU0 is unhalted from the machine level reset code + * and the rest are explicitly started up by the guest using an RTAS call. + */ + cs->start_powered_off = true; cs->cpu_index = cc->core_id + i; spapr_set_vcpu_id(cpu, cs->cpu_index, &local_err); if (local_err) { diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index c1d01228c6..c2776b6a7d 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1873,42 +1873,6 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, return ret; } -static target_ulong h_home_node_associativity(PowerPCCPU *cpu, - SpaprMachineState *spapr, - target_ulong opcode, - target_ulong *args) -{ - target_ulong flags = args[0]; - target_ulong procno = args[1]; - PowerPCCPU *tcpu; - int idx; - - /* only support procno from H_REGISTER_VPA */ - if (flags != 0x1) { - return H_FUNCTION; - } - - tcpu = spapr_find_cpu(procno); - if (tcpu == NULL) { - return H_P2; - } - - /* sequence is the same as in the "ibm,associativity" property */ - - idx = 0; -#define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \ - ((uint64_t)(b) & 0xffffffff)) - args[idx++] = ASSOCIATIVITY(0, 0); - args[idx++] = ASSOCIATIVITY(0, tcpu->node_id); - args[idx++] = ASSOCIATIVITY(procno, -1); - for ( ; idx < 6; idx++) { - args[idx] = -1; - } -#undef ASSOCIATIVITY - - return H_SUCCESS; -} - static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, @@ -2139,10 +2103,6 @@ static void hypercall_register_types(void) spapr_register_hypercall(KVMPPC_H_CAS, h_client_architecture_support); spapr_register_hypercall(KVMPPC_H_UPDATE_DT, h_update_dt); - - /* Virtual Processor Home Node */ - spapr_register_hypercall(H_HOME_NODE_ASSOCIATIVITY, - h_home_node_associativity); } type_init(hypercall_register_types) diff --git a/hw/ppc/spapr_irq.c b/hw/ppc/spapr_irq.c index 72bb938375..f59960339e 100644 --- a/hw/ppc/spapr_irq.c +++ b/hw/ppc/spapr_irq.c @@ -172,7 +172,7 @@ static int spapr_irq_check(SpaprMachineState *spapr, Error **errp) * To cover both and not confuse the OS, add an early failure in * QEMU. */ - if (spapr->irq == &spapr_irq_xive) { + if (!spapr->irq->xics) { error_setg(errp, "XIVE-only machines require a POWER9 CPU"); return -1; } diff --git a/hw/ppc/spapr_numa.c b/hw/ppc/spapr_numa.c new file mode 100644 index 0000000000..64fe567f5d --- /dev/null +++ b/hw/ppc/spapr_numa.c @@ -0,0 +1,242 @@ +/* + * QEMU PowerPC pSeries Logical Partition NUMA associativity handling + * + * Copyright IBM Corp. 2020 + * + * Authors: + * Daniel Henrique Barboza + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "hw/ppc/spapr_numa.h" +#include "hw/pci-host/spapr.h" +#include "hw/ppc/fdt.h" + +/* Moved from hw/ppc/spapr_pci_nvlink2.c */ +#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) + +void spapr_numa_associativity_init(SpaprMachineState *spapr, + MachineState *machine) +{ + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + int nb_numa_nodes = machine->numa_state->num_nodes; + int i, j, max_nodes_with_gpus; + + /* + * For all associativity arrays: first position is the size, + * position MAX_DISTANCE_REF_POINTS is always the numa_id, + * represented by the index 'i'. + * + * This will break on sparse NUMA setups, when/if QEMU starts + * to support it, because there will be no more guarantee that + * 'i' will be a valid node_id set by the user. + */ + for (i = 0; i < nb_numa_nodes; i++) { + spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + } + + /* + * Initialize NVLink GPU associativity arrays. We know that + * the first GPU will take the first available NUMA id, and + * we'll have a maximum of NVGPU_MAX_NUM GPUs in the machine. + * At this point we're not sure if there are GPUs or not, but + * let's initialize the associativity arrays and allow NVLink + * GPUs to be handled like regular NUMA nodes later on. + */ + max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM; + + for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) { + spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + + for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) { + uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ? + SPAPR_GPU_NUMA_ID : cpu_to_be32(i); + spapr->numa_assoc_array[i][j] = gpu_assoc; + } + + spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i); + } +} + +void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, + int offset, int nodeid) +{ + _FDT((fdt_setprop(fdt, offset, "ibm,associativity", + spapr->numa_assoc_array[nodeid], + sizeof(spapr->numa_assoc_array[nodeid])))); +} + +static uint32_t *spapr_numa_get_vcpu_assoc(SpaprMachineState *spapr, + PowerPCCPU *cpu) +{ + uint32_t *vcpu_assoc = g_new(uint32_t, VCPU_ASSOC_SIZE); + int index = spapr_get_vcpu_id(cpu); + + /* + * VCPUs have an extra 'cpu_id' value in ibm,associativity + * compared to other resources. Increment the size at index + * 0, put cpu_id last, then copy the remaining associativity + * domains. + */ + vcpu_assoc[0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS + 1); + vcpu_assoc[VCPU_ASSOC_SIZE - 1] = cpu_to_be32(index); + memcpy(vcpu_assoc + 1, spapr->numa_assoc_array[cpu->node_id] + 1, + (VCPU_ASSOC_SIZE - 2) * sizeof(uint32_t)); + + return vcpu_assoc; +} + +int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, + int offset, PowerPCCPU *cpu) +{ + g_autofree uint32_t *vcpu_assoc = NULL; + + vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, cpu); + + /* Advertise NUMA via ibm,associativity */ + return fdt_setprop(fdt, offset, "ibm,associativity", vcpu_assoc, + VCPU_ASSOC_SIZE * sizeof(uint32_t)); +} + + +int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, + int offset) +{ + MachineState *machine = MACHINE(spapr); + int nb_numa_nodes = machine->numa_state->num_nodes; + int nr_nodes = nb_numa_nodes ? 
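/*
 * Illustrative, standalone sketch (not part of the patch): the layout of the
 * per-node arrays built by spapr_numa_associativity_init() above and of the
 * vCPU variant returned by spapr_numa_get_vcpu_assoc().  The real code stores
 * every cell with cpu_to_be32(); host-endian values are used here so the
 * example stays self-contained.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>

#define MAX_DISTANCE_REF_POINTS 4
#define NUMA_ASSOC_SIZE         (MAX_DISTANCE_REF_POINTS + 1)
#define VCPU_ASSOC_SIZE         (NUMA_ASSOC_SIZE + 1)

int main(void)
{
    uint32_t node_assoc[NUMA_ASSOC_SIZE] = { 0 };
    uint32_t vcpu_assoc[VCPU_ASSOC_SIZE] = { 0 };
    uint32_t node_id = 1, vcpu_id = 3;
    int i;

    /* non-CPU resource on node 1: { size, dom, dom, dom, node_id } */
    node_assoc[0] = MAX_DISTANCE_REF_POINTS;
    node_assoc[MAX_DISTANCE_REF_POINTS] = node_id;

    /* vCPU 3 on node 1: the size grows by one and vcpu_id goes last */
    vcpu_assoc[0] = MAX_DISTANCE_REF_POINTS + 1;
    vcpu_assoc[VCPU_ASSOC_SIZE - 1] = vcpu_id;
    memcpy(&vcpu_assoc[1], &node_assoc[1],
           (VCPU_ASSOC_SIZE - 2) * sizeof(uint32_t));

    for (i = 0; i < NUMA_ASSOC_SIZE; i++) {
        printf("node_assoc[%d] = %" PRIu32 "\n", i, node_assoc[i]);
    }
    for (i = 0; i < VCPU_ASSOC_SIZE; i++) {
        printf("vcpu_assoc[%d] = %" PRIu32 "\n", i, vcpu_assoc[i]);
    }
    return 0;
}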
nb_numa_nodes : 1; + uint32_t *int_buf, *cur_index, buf_len; + int ret, i; + + /* ibm,associativity-lookup-arrays */ + buf_len = (nr_nodes * MAX_DISTANCE_REF_POINTS + 2) * sizeof(uint32_t); + cur_index = int_buf = g_malloc0(buf_len); + int_buf[0] = cpu_to_be32(nr_nodes); + /* Number of entries per associativity list */ + int_buf[1] = cpu_to_be32(MAX_DISTANCE_REF_POINTS); + cur_index += 2; + for (i = 0; i < nr_nodes; i++) { + /* + * For the lookup-array we use the ibm,associativity array, + * from numa_assoc_array. without the first element (size). + */ + uint32_t *associativity = spapr->numa_assoc_array[i]; + memcpy(cur_index, ++associativity, + sizeof(uint32_t) * MAX_DISTANCE_REF_POINTS); + cur_index += MAX_DISTANCE_REF_POINTS; + } + ret = fdt_setprop(fdt, offset, "ibm,associativity-lookup-arrays", int_buf, + (cur_index - int_buf) * sizeof(uint32_t)); + g_free(int_buf); + + return ret; +} + +/* + * Helper that writes ibm,associativity-reference-points and + * max-associativity-domains in the RTAS pointed by @rtas + * in the DT @fdt. + */ +void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas) +{ + SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr); + uint32_t refpoints[] = { + cpu_to_be32(0x4), + cpu_to_be32(0x4), + cpu_to_be32(0x2), + }; + uint32_t nr_refpoints = ARRAY_SIZE(refpoints); + uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0); + uint32_t maxdomains[] = { + cpu_to_be32(4), + maxdomain, + maxdomain, + maxdomain, + cpu_to_be32(spapr->gpu_numa_id), + }; + + if (smc->pre_5_1_assoc_refpoints) { + nr_refpoints = 2; + } + + _FDT(fdt_setprop(fdt, rtas, "ibm,associativity-reference-points", + refpoints, nr_refpoints * sizeof(refpoints[0]))); + + _FDT(fdt_setprop(fdt, rtas, "ibm,max-associativity-domains", + maxdomains, sizeof(maxdomains))); +} + +static target_ulong h_home_node_associativity(PowerPCCPU *cpu, + SpaprMachineState *spapr, + target_ulong opcode, + target_ulong *args) +{ + g_autofree uint32_t *vcpu_assoc = NULL; + target_ulong flags = args[0]; + target_ulong procno = args[1]; + PowerPCCPU *tcpu; + int idx, assoc_idx; + + /* only support procno from H_REGISTER_VPA */ + if (flags != 0x1) { + return H_FUNCTION; + } + + tcpu = spapr_find_cpu(procno); + if (tcpu == NULL) { + return H_P2; + } + + /* + * Given that we want to be flexible with the sizes and indexes, + * we must consider that there is a hard limit of how many + * associativities domain we can fit in R4 up to R9, which would be + * 12 associativity domains for vcpus. Assert and bail if that's + * not the case. + */ + G_STATIC_ASSERT((VCPU_ASSOC_SIZE - 1) <= 12); + + vcpu_assoc = spapr_numa_get_vcpu_assoc(spapr, tcpu); + /* assoc_idx starts at 1 to skip associativity size */ + assoc_idx = 1; + +#define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | \ + ((uint64_t)(b) & 0xffffffff)) + + for (idx = 0; idx < 6; idx++) { + int32_t a, b; + + /* + * vcpu_assoc[] will contain the associativity domains for tcpu, + * including tcpu->node_id and procno, meaning that we don't + * need to use these variables here. + * + * We'll read 2 values at a time to fill up the ASSOCIATIVITY() + * macro. The ternary will fill the remaining registers with -1 + * after we went through vcpu_assoc[]. + */ + a = assoc_idx < VCPU_ASSOC_SIZE ? + be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; + b = assoc_idx < VCPU_ASSOC_SIZE ? 
+ be32_to_cpu(vcpu_assoc[assoc_idx++]) : -1; + + args[idx] = ASSOCIATIVITY(a, b); + } +#undef ASSOCIATIVITY + + return H_SUCCESS; +} + +static void spapr_numa_register_types(void) +{ + /* Virtual Processor Home Node */ + spapr_register_hypercall(H_HOME_NODE_ASSOCIATIVITY, + h_home_node_associativity); +} + +type_init(spapr_numa_register_types) diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c index 81410aa63f..63872054f3 100644 --- a/hw/ppc/spapr_nvdimm.c +++ b/hw/ppc/spapr_nvdimm.c @@ -27,16 +27,41 @@ #include "hw/ppc/spapr_nvdimm.h" #include "hw/mem/nvdimm.h" #include "qemu/nvdimm-utils.h" +#include "qemu/option.h" #include "hw/ppc/fdt.h" #include "qemu/range.h" +#include "sysemu/sysemu.h" +#include "hw/ppc/spapr_numa.h" -void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, - Error **errp) +void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, + uint64_t size, Error **errp) { - char *uuidstr = NULL; + const MachineClass *mc = MACHINE_GET_CLASS(hotplug_dev); + const MachineState *ms = MACHINE(hotplug_dev); + const char *nvdimm_opt = qemu_opt_get(qemu_get_machine_opts(), "nvdimm"); + g_autofree char *uuidstr = NULL; QemuUUID uuid; int ret; + if (!mc->nvdimm_supported) { + error_setg(errp, "NVDIMM hotplug not supported for this machine"); + return; + } + + /* + * NVDIMM support went live in 5.1 without considering that, in + * other archs, the user needs to enable NVDIMM support with the + * 'nvdimm' machine option and the default behavior is NVDIMM + * support disabled. It is too late to roll back to the standard + * behavior without breaking 5.1 guests. What we can do is to + * ensure that, if the user sets nvdimm=off, we error out + * regardless of being 5.1 or newer. + */ + if (!ms->nvdimms_state->is_enabled && nvdimm_opt) { + error_setg(errp, "nvdimm device found but 'nvdimm=off' was set"); + return; + } + if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP, &error_abort) == 0) { error_setg(errp, "PAPR requires NVDIMM devices to have label-size set"); @@ -54,7 +79,6 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, &error_abort); ret = qemu_uuid_parse(uuidstr, &uuid); g_assert(!ret); - g_free(uuidstr); if (qemu_uuid_is_null(&uuid)) { error_setg(errp, "NVDIMM device requires the uuid to be set"); @@ -83,16 +107,6 @@ void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp) } } -int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, - void *fdt, int *fdt_start_offset, Error **errp) -{ - NVDIMMDevice *nvdimm = NVDIMM(drc->dev); - - *fdt_start_offset = spapr_dt_nvdimm(fdt, 0, nvdimm); - - return 0; -} - void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr) { MachineState *machine = MACHINE(spapr); @@ -104,8 +118,8 @@ void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr) } -int spapr_dt_nvdimm(void *fdt, int parent_offset, - NVDIMMDevice *nvdimm) +static int spapr_dt_nvdimm(SpaprMachineState *spapr, void *fdt, + int parent_offset, NVDIMMDevice *nvdimm) { int child_offset; char *buf; @@ -115,11 +129,6 @@ int spapr_dt_nvdimm(void *fdt, int parent_offset, &error_abort); uint64_t slot = object_property_get_uint(OBJECT(nvdimm), PC_DIMM_SLOT_PROP, &error_abort); - uint32_t associativity[] = { - cpu_to_be32(0x4), /* length */ - cpu_to_be32(0x0), cpu_to_be32(0x0), - cpu_to_be32(0x0), cpu_to_be32(node) - }; uint64_t lsize = nvdimm->label_size; uint64_t size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP, NULL); @@ -139,8 +148,7 @@ int spapr_dt_nvdimm(void 
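/*
 * Illustrative, standalone sketch (not part of the patch): how
 * h_home_node_associativity() above packs the vcpu_assoc[] entries (minus the
 * leading size cell) two at a time into the six 64-bit return registers
 * R4..R9, padding with -1 once the array runs out.  be32_to_cpu() is omitted
 * to keep the example self-contained.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define VCPU_ASSOC_SIZE 6
#define ASSOCIATIVITY(a, b) (((uint64_t)(a) << 32) | ((uint64_t)(b) & 0xffffffff))

int main(void)
{
    /* { size, dom, dom, dom, node_id, vcpu_id } for vCPU 3 on node 1 */
    uint32_t vcpu_assoc[VCPU_ASSOC_SIZE] = { 5, 0, 0, 0, 1, 3 };
    uint64_t args[6];
    int idx, assoc_idx = 1; /* start at 1 to skip the size cell */

    for (idx = 0; idx < 6; idx++) {
        int32_t a = assoc_idx < VCPU_ASSOC_SIZE ?
                    (int32_t)vcpu_assoc[assoc_idx++] : -1;
        int32_t b = assoc_idx < VCPU_ASSOC_SIZE ?
                    (int32_t)vcpu_assoc[assoc_idx++] : -1;

        args[idx] = ASSOCIATIVITY(a, b);
        printf("R%d = 0x%016" PRIx64 "\n", idx + 4, args[idx]);
    }
    return 0;
}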
*fdt, int parent_offset, _FDT((fdt_setprop_string(fdt, child_offset, "compatible", "ibm,pmemory"))); _FDT((fdt_setprop_string(fdt, child_offset, "device_type", "ibm,pmemory"))); - _FDT((fdt_setprop(fdt, child_offset, "ibm,associativity", associativity, - sizeof(associativity)))); + spapr_numa_write_associativity_dt(spapr, fdt, child_offset, node); buf = qemu_uuid_unparse_strdup(&nvdimm->uuid); _FDT((fdt_setprop_string(fdt, child_offset, "ibm,unit-guid", buf))); @@ -161,7 +169,17 @@ int spapr_dt_nvdimm(void *fdt, int parent_offset, return child_offset; } -void spapr_dt_persistent_memory(void *fdt) +int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, + void *fdt, int *fdt_start_offset, Error **errp) +{ + NVDIMMDevice *nvdimm = NVDIMM(drc->dev); + + *fdt_start_offset = spapr_dt_nvdimm(spapr, fdt, 0, nvdimm); + + return 0; +} + +void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt) { int offset = fdt_subnode_offset(fdt, 0, "persistent-memory"); GSList *iter, *nvdimms = nvdimm_get_device_list(); @@ -179,7 +197,7 @@ void spapr_dt_persistent_memory(void *fdt) for (iter = nvdimms; iter; iter = iter->next) { NVDIMMDevice *nvdimm = iter->data; - spapr_dt_nvdimm(fdt, offset, nvdimm); + spapr_dt_nvdimm(spapr, fdt, offset, nvdimm); } g_slist_free(nvdimms); diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index 0a418f1e67..4d97ff6c70 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -52,6 +52,7 @@ #include "sysemu/kvm.h" #include "sysemu/hostmem.h" #include "sysemu/numa.h" +#include "hw/ppc/spapr_numa.h" /* Copied from the kernel arch/powerpc/platforms/pseries/msi.c */ #define RTAS_QUERY_FN 0 @@ -2321,11 +2322,6 @@ int spapr_dt_phb(SpaprMachineState *spapr, SpaprPhbState *phb, cpu_to_be32(1), cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) }; - uint32_t associativity[] = {cpu_to_be32(0x4), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(phb->numa_node)}; SpaprTceTable *tcet; SpaprDrc *drc; Error *err = NULL; @@ -2358,8 +2354,7 @@ int spapr_dt_phb(SpaprMachineState *spapr, SpaprPhbState *phb, /* Advertise NUMA via ibm,associativity */ if (phb->numa_node != -1) { - _FDT(fdt_setprop(fdt, bus_off, "ibm,associativity", associativity, - sizeof(associativity))); + spapr_numa_write_associativity_dt(spapr, fdt, bus_off, phb->numa_node); } /* Build the interrupt-map, this must matches what is done diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c index 76ae77ebc8..8ef9b40a18 100644 --- a/hw/ppc/spapr_pci_nvlink2.c +++ b/hw/ppc/spapr_pci_nvlink2.c @@ -26,6 +26,7 @@ #include "qemu-common.h" #include "hw/pci/pci.h" #include "hw/pci-host/spapr.h" +#include "hw/ppc/spapr_numa.h" #include "qemu/error-report.h" #include "hw/ppc/fdt.h" #include "hw/pci/pci_bridge.h" @@ -37,8 +38,6 @@ #define PHANDLE_NVLINK(phb, gn, nn) (0x00130000 | (((phb)->index) << 8) | \ ((gn) << 4) | (nn)) -#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1)) - typedef struct SpaprPhbPciNvGpuSlot { uint64_t tgt; uint64_t gpa; @@ -360,13 +359,6 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev), "nvlink2-mr[0]", &error_abort); - uint32_t associativity[] = { - cpu_to_be32(0x4), - cpu_to_be32(nvslot->numa_id), - cpu_to_be32(nvslot->numa_id), - cpu_to_be32(nvslot->numa_id), - cpu_to_be32(nvslot->numa_id) - }; uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL); uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) }; char *mem_name = g_strdup_printf("memory@%"PRIx64, 
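/*
 * Usage note (not part of the patch): spapr_nvdimm_validate() above keeps the
 * 5.1 behaviour of accepting NVDIMMs on pseries without any machine option,
 * but an explicit "nvdimm=off" now makes the device fail validation.
 * Roughly:
 *
 *   -machine pseries             ... -device nvdimm,...   -> accepted
 *   -machine pseries,nvdimm=off  ... -device nvdimm,...   -> error
 */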
nvslot->gpa); @@ -376,14 +368,8 @@ void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt) _FDT((fdt_setprop_string(fdt, off, "device_type", "memory"))); _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg)))); - if (sphb->pre_5_1_assoc) { - associativity[1] = SPAPR_GPU_NUMA_ID; - associativity[2] = SPAPR_GPU_NUMA_ID; - associativity[3] = SPAPR_GPU_NUMA_ID; - } - - _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity, - sizeof(associativity)))); + spapr_numa_write_associativity_dt(SPAPR_MACHINE(qdev_get_machine()), + fdt, off, nvslot->numa_id); _FDT((fdt_setprop_string(fdt, off, "compatible", "ibm,coherent-device-memory"))); diff --git a/hw/scsi/spapr_vscsi.c b/hw/scsi/spapr_vscsi.c index d17dc03c73..57f0a1336f 100644 --- a/hw/scsi/spapr_vscsi.c +++ b/hw/scsi/spapr_vscsi.c @@ -1219,6 +1219,9 @@ static void spapr_vscsi_realize(SpaprVioDevice *dev, Error **errp) scsi_bus_new(&s->bus, sizeof(s->bus), DEVICE(dev), &vscsi_scsi_info, NULL); + + /* ibmvscsi SCSI bus does not allow hotplug. */ + qbus_set_hotplug_handler(BUS(&s->bus), NULL); } void spapr_vscsi_create(SpaprVioBus *bus) diff --git a/hw/sparc/sun4m.c b/hw/sparc/sun4m.c index cf7dfa4af5..6bf9d27d8a 100644 --- a/hw/sparc/sun4m.c +++ b/hw/sparc/sun4m.c @@ -218,22 +218,12 @@ static void dummy_cpu_set_irq(void *opaque, int irq, int level) { } -static void main_cpu_reset(void *opaque) +static void sun4m_cpu_reset(void *opaque) { SPARCCPU *cpu = opaque; CPUState *cs = CPU(cpu); cpu_reset(cs); - cs->halted = 0; -} - -static void secondary_cpu_reset(void *opaque) -{ - SPARCCPU *cpu = opaque; - CPUState *cs = CPU(cpu); - - cpu_reset(cs); - cs->halted = 1; } static void cpu_halt_signal(void *opaque, int irq, int level) @@ -819,21 +809,17 @@ static const TypeInfo ram_info = { static void cpu_devinit(const char *cpu_type, unsigned int id, uint64_t prom_addr, qemu_irq **cpu_irqs) { - CPUState *cs; SPARCCPU *cpu; CPUSPARCState *env; - cpu = SPARC_CPU(cpu_create(cpu_type)); + cpu = SPARC_CPU(object_new(cpu_type)); env = &cpu->env; cpu_sparc_set_id(env, id); - if (id == 0) { - qemu_register_reset(main_cpu_reset, cpu); - } else { - qemu_register_reset(secondary_cpu_reset, cpu); - cs = CPU(cpu); - cs->halted = 1; - } + qemu_register_reset(sun4m_cpu_reset, cpu); + object_property_set_bool(OBJECT(cpu), "start-powered-off", id != 0, + &error_fatal); + qdev_realize_and_unref(DEVICE(cpu), NULL, &error_fatal); *cpu_irqs = qemu_allocate_irqs(cpu_set_irq, cpu, MAX_PILS); env->prom_addr = prom_addr; } diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 8f145733ce..9fc2696db5 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -374,6 +374,10 @@ struct CPUState { bool created; bool stop; bool stopped; + + /* Should CPU start in powered-off state? */ + bool start_powered_off; + bool unplug; bool crash_occurred; bool exit_request; diff --git a/include/hw/ipmi/ipmi.h b/include/hw/ipmi/ipmi.h index 8a99d958bb..c1efdaa4cb 100644 --- a/include/hw/ipmi/ipmi.h +++ b/include/hw/ipmi/ipmi.h @@ -53,6 +53,7 @@ enum ipmi_op { #define IPMI_CC_INVALID_DATA_FIELD 0xcc #define IPMI_CC_BMC_INIT_IN_PROGRESS 0xd2 #define IPMI_CC_COMMAND_NOT_SUPPORTED 0xd5 +#define IPMI_CC_UNSPECIFIED 0xff #define IPMI_NETFN_APP 0x06 #define IPMI_NETFN_OEM 0x3a diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index a1e230ad39..e50a2672e3 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -105,6 +105,21 @@ typedef enum { #define FDT_MAX_SIZE 0x100000 +/* + * NUMA related macros. 
MAX_DISTANCE_REF_POINTS was taken + * from Linux kernel arch/powerpc/mm/numa.h. It represents the + * amount of associativity domains for non-CPU resources. + * + * NUMA_ASSOC_SIZE is the base array size of an ibm,associativity + * array for any non-CPU resource. + * + * VCPU_ASSOC_SIZE represents the size of ibm,associativity array + * for CPUs, which has an extra element (vcpu_id) in the end. + */ +#define MAX_DISTANCE_REF_POINTS 4 +#define NUMA_ASSOC_SIZE (MAX_DISTANCE_REF_POINTS + 1) +#define VCPU_ASSOC_SIZE (NUMA_ASSOC_SIZE + 1) + typedef struct SpaprCapabilities SpaprCapabilities; struct SpaprCapabilities { uint8_t caps[SPAPR_CAP_NUM]; @@ -231,6 +246,8 @@ struct SpaprMachineState { unsigned gpu_numa_id; SpaprTpmProxy *tpm_proxy; + uint32_t numa_assoc_array[MAX_NODES][NUMA_ASSOC_SIZE]; + Error *fwnmi_migration_blocker; }; diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h index 21af8deac1..f270860769 100644 --- a/include/hw/ppc/spapr_drc.h +++ b/include/hw/ppc/spapr_drc.h @@ -29,62 +29,21 @@ TYPE_SPAPR_DR_CONNECTOR) #define TYPE_SPAPR_DRC_PHYSICAL "spapr-drc-physical" -#define SPAPR_DRC_PHYSICAL_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PHYSICAL) -#define SPAPR_DRC_PHYSICAL_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, \ - TYPE_SPAPR_DRC_PHYSICAL) #define SPAPR_DRC_PHYSICAL(obj) OBJECT_CHECK(SpaprDrcPhysical, (obj), \ TYPE_SPAPR_DRC_PHYSICAL) #define TYPE_SPAPR_DRC_LOGICAL "spapr-drc-logical" -#define SPAPR_DRC_LOGICAL_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_LOGICAL) -#define SPAPR_DRC_LOGICAL_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, \ - TYPE_SPAPR_DRC_LOGICAL) -#define SPAPR_DRC_LOGICAL(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_LOGICAL) #define TYPE_SPAPR_DRC_CPU "spapr-drc-cpu" -#define SPAPR_DRC_CPU_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_CPU) -#define SPAPR_DRC_CPU_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_CPU) -#define SPAPR_DRC_CPU(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_CPU) #define TYPE_SPAPR_DRC_PCI "spapr-drc-pci" -#define SPAPR_DRC_PCI_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PCI) -#define SPAPR_DRC_PCI_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_PCI) -#define SPAPR_DRC_PCI(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_PCI) #define TYPE_SPAPR_DRC_LMB "spapr-drc-lmb" -#define SPAPR_DRC_LMB_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_LMB) -#define SPAPR_DRC_LMB_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_LMB) -#define SPAPR_DRC_LMB(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_LMB) #define TYPE_SPAPR_DRC_PHB "spapr-drc-phb" -#define SPAPR_DRC_PHB_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PHB) -#define SPAPR_DRC_PHB_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_PHB) -#define SPAPR_DRC_PHB(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_PHB) #define TYPE_SPAPR_DRC_PMEM "spapr-drc-pmem" -#define SPAPR_DRC_PMEM_GET_CLASS(obj) \ - OBJECT_GET_CLASS(SpaprDrcClass, obj, TYPE_SPAPR_DRC_PMEM) -#define SPAPR_DRC_PMEM_CLASS(klass) \ - OBJECT_CLASS_CHECK(SpaprDrcClass, klass, TYPE_SPAPR_DRC_PMEM) -#define SPAPR_DRC_PMEM(obj) OBJECT_CHECK(SpaprDrc, (obj), \ - TYPE_SPAPR_DRC_PMEM) + /* * Various hotplug types managed by SpaprDrc * diff --git a/include/hw/ppc/spapr_numa.h b/include/hw/ppc/spapr_numa.h new file 
mode 100644 index 0000000000..b3fd950634 --- /dev/null +++ b/include/hw/ppc/spapr_numa.h @@ -0,0 +1,35 @@ +/* + * QEMU PowerPC pSeries Logical Partition NUMA associativity handling + * + * Copyright IBM Corp. 2020 + * + * Authors: + * Daniel Henrique Barboza + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef HW_SPAPR_NUMA_H +#define HW_SPAPR_NUMA_H + +#include "hw/boards.h" +#include "hw/ppc/spapr.h" + +/* + * Having both SpaprMachineState and MachineState as arguments + * feels odd, but it will spare a MACHINE() call inside the + * function. spapr_machine_init() is the only caller for it, and + * it has both pointers resolved already. + */ +void spapr_numa_associativity_init(SpaprMachineState *spapr, + MachineState *machine); +void spapr_numa_write_rtas_dt(SpaprMachineState *spapr, void *fdt, int rtas); +void spapr_numa_write_associativity_dt(SpaprMachineState *spapr, void *fdt, + int offset, int nodeid); +int spapr_numa_fixup_cpu_dt(SpaprMachineState *spapr, void *fdt, + int offset, PowerPCCPU *cpu); +int spapr_numa_write_assoc_lookup_arrays(SpaprMachineState *spapr, void *fdt, + int offset); + +#endif /* HW_SPAPR_NUMA_H */ diff --git a/include/hw/ppc/spapr_nvdimm.h b/include/hw/ppc/spapr_nvdimm.h index b3330cc485..3eb344e8e9 100644 --- a/include/hw/ppc/spapr_nvdimm.h +++ b/include/hw/ppc/spapr_nvdimm.h @@ -27,10 +27,9 @@ QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE); int spapr_pmem_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp); -int spapr_dt_nvdimm(void *fdt, int parent_offset, NVDIMMDevice *nvdimm); -void spapr_dt_persistent_memory(void *fdt); -void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, uint64_t size, - Error **errp); +void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt); +void spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm, + uint64_t size, Error **errp); void spapr_add_nvdimm(DeviceState *dev, uint64_t slot, Error **errp); void spapr_create_nvdimm_dr_connectors(SpaprMachineState *spapr); diff --git a/include/hw/ppc/spapr_xive.h b/include/hw/ppc/spapr_xive.h index a1c8540ab4..26c8d90d71 100644 --- a/include/hw/ppc/spapr_xive.h +++ b/include/hw/ppc/spapr_xive.h @@ -49,6 +49,8 @@ typedef struct SpaprXive { void *tm_mmap; MemoryRegion tm_mmio_kvm; VMChangeStateEntry *change; + + uint8_t hv_prio; } SpaprXive; typedef struct SpaprXiveClass { diff --git a/target/arm/cpu.c b/target/arm/cpu.c index c179e0752d..6b4e708c08 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -174,8 +174,7 @@ static void arm_cpu_reset(DeviceState *dev) env->vfp.xregs[ARM_VFP_MVFR1] = cpu->isar.mvfr1; env->vfp.xregs[ARM_VFP_MVFR2] = cpu->isar.mvfr2; - cpu->power_state = cpu->start_powered_off ? PSCI_OFF : PSCI_ON; - s->halted = cpu->start_powered_off; + cpu->power_state = s->start_powered_off ? 
PSCI_OFF : PSCI_ON; if (arm_feature(env, ARM_FEATURE_IWMMXT)) { env->iwmmxt.cregs[ARM_IWMMXT_wCID] = 0x69051000 | 'Q'; @@ -2186,7 +2185,6 @@ static const ARMCPUInfo arm_cpus[] = { }; static Property arm_cpu_properties[] = { - DEFINE_PROP_BOOL("start-powered-off", ARMCPU, start_powered_off, false), DEFINE_PROP_UINT32("psci-conduit", ARMCPU, psci_conduit, 0), DEFINE_PROP_UINT64("midr", ARMCPU, midr, 0), DEFINE_PROP_UINT64("mp-affinity", ARMCPU, diff --git a/target/arm/cpu.h b/target/arm/cpu.h index a1c7d8ebae..6036f61d60 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -817,9 +817,6 @@ struct ARMCPU { */ uint32_t psci_version; - /* Should CPU start in PSCI powered-off state? */ - bool start_powered_off; - /* Current power state, access guarded by BQL */ ARMPSCIState power_state; diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c index 0af46b41c8..1f2b8f8b7a 100644 --- a/target/arm/kvm32.c +++ b/target/arm/kvm32.c @@ -218,7 +218,7 @@ int kvm_arch_init_vcpu(CPUState *cs) /* Determine init features for this CPU */ memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); - if (cpu->start_powered_off) { + if (cs->start_powered_off) { cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF; } if (kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PSCI_0_2)) { diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c index ef1e960285..987b35e33f 100644 --- a/target/arm/kvm64.c +++ b/target/arm/kvm64.c @@ -774,7 +774,7 @@ int kvm_arch_init_vcpu(CPUState *cs) /* Determine init features for this CPU */ memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features)); - if (cpu->start_powered_off) { + if (cs->start_powered_off) { cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF; } if (kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PSCI_0_2)) { diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 08eb674d22..749cd548f0 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -291,9 +291,9 @@ static void s390_cpu_initfn(Object *obj) S390CPU *cpu = S390_CPU(obj); cpu_set_cpustate_pointers(cpu); - cs->halted = 1; cs->exception_index = EXCP_HLT; #if !defined(CONFIG_USER_ONLY) + cs->start_powered_off = true; object_property_add(obj, "crash-information", "GuestPanicInformation", s390_cpu_get_crash_info_qom, NULL, NULL, NULL); cpu->env.tod_timer =