30499fdd98
Running a guest with 128 NUMA nodes crashes QEMU:
../../util/error.c:59: error_setv: Assertion `*errp == NULL' failed.
The crash happens when setting the FWNMI migration blocker:
2861     if (spapr_get_cap(spapr, SPAPR_CAP_FWNMI) == SPAPR_CAP_ON) {
2862         /* Create the error string for live migration blocker */
2863         error_setg(&spapr->fwnmi_migration_blocker,
2864             "A machine check is being handled during migration. The handler"
2865             "may run and log hardware error on the destination");
2866     }
Inspection reveals that spapr->fwnmi_migration_blocker isn't NULL, which
is what trips the `*errp == NULL' assertion in error_setg():
(gdb) p spapr->fwnmi_migration_blocker
$1 = (Error *) 0x8000000004000000
Since this is the only place where spapr->fwnmi_migration_blocker is
set, this means something wrote there behind our back. Further analysis
points to spapr_numa_associativity_init(), especially the part
that initializes the associativity arrays for NVLink GPUs:
    max_nodes_with_gpus = nb_numa_nodes + NVGPU_MAX_NUM;

i.e. max_nodes_with_gpus = 128 + 6, but the array isn't sized to
accommodate the 6 extra nodes:
struct SpaprMachineState {
    .
    .
    .
    uint32_t numa_assoc_array[MAX_NODES][NUMA_ASSOC_SIZE];

    Error *fwnmi_migration_blocker;
};
and the following loops happily overwrite spapr->fwnmi_migration_blocker,
and probably more:
    for (i = nb_numa_nodes; i < max_nodes_with_gpus; i++) {
        spapr->numa_assoc_array[i][0] = cpu_to_be32(MAX_DISTANCE_REF_POINTS);

        for (j = 1; j < MAX_DISTANCE_REF_POINTS; j++) {
            uint32_t gpu_assoc = smc->pre_5_1_assoc_refpoints ?
                                 SPAPR_GPU_NUMA_ID : cpu_to_be32(i);
            spapr->numa_assoc_array[i][j] = gpu_assoc;
        }

        spapr->numa_assoc_array[i][MAX_DISTANCE_REF_POINTS] = cpu_to_be32(i);
    }
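For illustration only (assuming MAX_DISTANCE_REF_POINTS is 4, so
NUMA_ASSOC_SIZE is 5, and that the smc->pre_5_1_assoc_refpoints branch is
not taken), the rows written past the end of the array land directly on the
next field:

    /* Hypothetical sketch of the relevant layout with MAX_NODES = 128;
     * not the actual QEMU declarations. */
    uint32_t numa_assoc_array[128][5];  /* rows 0..127 are valid          */
    Error *fwnmi_migration_blocker;     /* laid out right after the array */

    /* The loop above stores into rows 128..133, i.e. 6 * 5 uint32_t values
     * past the end of the array. The first two values for row 128,
     * cpu_to_be32(4) and cpu_to_be32(128), occupy the 8 bytes of
     * fwnmi_migration_blocker; on a little-endian host this is consistent
     * with the bogus 0x8000000004000000 pointer seen in gdb above. */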
Fix the size of the array. This requires "hw/ppc/spapr.h" to see
NVGPU_MAX_NUM. Including "hw/pci-host/spapr.h" introduces a
circular dependency that breaks the build, so this moves the
definition of NVGPU_MAX_NUM to "hw/ppc/spapr.h" instead.
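Roughly, the resulting declarations in "hw/ppc/spapr.h" look like this
(a sketch of the shape of the fix, with unrelated fields elided):

    /* Moved here from "hw/pci-host/spapr.h" to break the circular include */
    #define NVGPU_MAX_NUM 6

    struct SpaprMachineState {
        ...
        uint32_t numa_assoc_array[MAX_NODES + NVGPU_MAX_NUM][NUMA_ASSOC_SIZE];

        Error *fwnmi_migration_blocker;
        ...
    };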
Reported-by: Min Deng <mdeng@redhat.com>
BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1908693
Fixes: dd7e1d7ae4 ("spapr_numa: move NVLink2 associativity handling to spapr_numa.c")
Cc: danielhb413@gmail.com
Signed-off-by: Greg Kurz <groug@kaod.org>
Message-Id: <160829960428.734871.12634150161215429514.stgit@bahia.lan>
Reviewed-by: Daniel Henrique Barboza <danielhb413@gmail.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
hw/pci-host/spapr.h (214 lines, 6.7 KiB):
/*
 * QEMU SPAPR PCI BUS definitions
 *
 * Copyright (c) 2011 Alexey Kardashevskiy <aik@au1.ibm.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#ifndef PCI_HOST_SPAPR_H
#define PCI_HOST_SPAPR_H

#include "hw/ppc/spapr.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_host.h"
#include "hw/ppc/xics.h"
#include "qom/object.h"

#define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"

OBJECT_DECLARE_SIMPLE_TYPE(SpaprPhbState, SPAPR_PCI_HOST_BRIDGE)

#define SPAPR_PCI_DMA_MAX_WINDOWS    2

typedef struct SpaprPciMsi {
    uint32_t first_irq;
    uint32_t num;
} SpaprPciMsi;

typedef struct SpaprPciMsiMig {
    uint32_t key;
    SpaprPciMsi value;
} SpaprPciMsiMig;

typedef struct SpaprPciLsi {
    uint32_t irq;
} SpaprPciLsi;

typedef struct SpaprPhbPciNvGpuConfig SpaprPhbPciNvGpuConfig;

struct SpaprPhbState {
    PCIHostState parent_obj;

    uint32_t index;
    uint64_t buid;
    char *dtbusname;
    bool dr_enabled;

    MemoryRegion memspace, iospace;
    hwaddr mem_win_addr, mem_win_size, mem64_win_addr, mem64_win_size;
    uint64_t mem64_win_pciaddr;
    hwaddr io_win_addr, io_win_size;
    MemoryRegion mem32window, mem64window, iowindow, msiwindow;

    uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS];
    hwaddr dma_win_addr, dma_win_size;
    AddressSpace iommu_as;
    MemoryRegion iommu_root;

    SpaprPciLsi lsi_table[PCI_NUM_PINS];

    GHashTable *msi;
    /* Temporary cache for migration purposes */
    int32_t msi_devs_num;
    SpaprPciMsiMig *msi_devs;

    QLIST_ENTRY(SpaprPhbState) list;

    bool ddw_enabled;
    uint64_t page_size_mask;
    uint64_t dma64_win_addr;

    uint32_t numa_node;

    bool pcie_ecs; /* Allow access to PCIe extended config space? */

    /* Fields for migration compatibility hacks */
    bool pre_2_8_migration;
    uint32_t mig_liobn;
    hwaddr mig_mem_win_addr, mig_mem_win_size;
    hwaddr mig_io_win_addr, mig_io_win_size;
    hwaddr nv2_gpa_win_addr;
    hwaddr nv2_atsd_win_addr;
    SpaprPhbPciNvGpuConfig *nvgpus;
    bool pre_5_1_assoc;
};

#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
#define SPAPR_PCI_MEM32_WIN_SIZE     \
    ((1ULL << 32) - SPAPR_PCI_MEM_WIN_BUS_OFFSET)
#define SPAPR_PCI_MEM64_WIN_SIZE     0x10000000000ULL /* 1 TiB */

/* All PCI outbound windows will be within this range */
#define SPAPR_PCI_BASE               (1ULL << 45) /* 32 TiB */
#define SPAPR_PCI_LIMIT              (1ULL << 46) /* 64 TiB */

#define SPAPR_MAX_PHBS ((SPAPR_PCI_LIMIT - SPAPR_PCI_BASE) / \
                        SPAPR_PCI_MEM64_WIN_SIZE - 1)

#define SPAPR_PCI_IO_WIN_SIZE        0x10000

#define SPAPR_PCI_MSI_WINDOW         0x40000000000ULL

#define SPAPR_PCI_NV2RAM64_WIN_BASE  SPAPR_PCI_LIMIT
#define SPAPR_PCI_NV2RAM64_WIN_SIZE  (2 * TiB) /* For up to 6 GPUs 256GB each */

/* Max number of NVLinks per GPU in any physical box */
#define NVGPU_MAX_LINKS              3

/*
 * GPU RAM starts at 64TiB so huge DMA window to cover it all ends at 128TiB
 * which is enough. We do not need DMA for ATSD so we put them at 128TiB.
 */
#define SPAPR_PCI_NV2ATSD_WIN_BASE   (128 * TiB)
#define SPAPR_PCI_NV2ATSD_WIN_SIZE   (NVGPU_MAX_NUM * NVGPU_MAX_LINKS * \
                                      64 * KiB)

int spapr_dt_phb(SpaprMachineState *spapr, SpaprPhbState *phb,
                 uint32_t intc_phandle, void *fdt, int *node_offset);

void spapr_pci_rtas_init(void);

SpaprPhbState *spapr_pci_find_phb(SpaprMachineState *spapr, uint64_t buid);
PCIDevice *spapr_pci_find_dev(SpaprMachineState *spapr, uint64_t buid,
                              uint32_t config_addr);

/* DRC callbacks */
void spapr_phb_remove_pci_device_cb(DeviceState *dev);
int spapr_pci_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
                          void *fdt, int *fdt_start_offset, Error **errp);

/* VFIO EEH hooks */
#ifdef CONFIG_LINUX
bool spapr_phb_eeh_available(SpaprPhbState *sphb);
int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
                                  unsigned int addr, int option);
int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state);
int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option);
int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb);
void spapr_phb_vfio_reset(DeviceState *qdev);
void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp);
void spapr_phb_nvgpu_free(SpaprPhbState *sphb);
void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
                                 Error **errp);
void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt);
void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
                                        SpaprPhbState *sphb);
#else
static inline bool spapr_phb_eeh_available(SpaprPhbState *sphb)
{
    return false;
}
static inline int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
                                                unsigned int addr, int option)
{
    return RTAS_OUT_HW_ERROR;
}
static inline int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb,
                                               int *state)
{
    return RTAS_OUT_HW_ERROR;
}
static inline int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option)
{
    return RTAS_OUT_HW_ERROR;
}
static inline int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
{
    return RTAS_OUT_HW_ERROR;
}
static inline void spapr_phb_vfio_reset(DeviceState *qdev)
{
}
static inline void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
{
}
static inline void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
{
}
static inline void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt,
                                               int bus_off, Error **errp)
{
}
static inline void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb,
                                                   void *fdt)
{
}
static inline void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt,
                                                      int offset,
                                                      SpaprPhbState *sphb)
{
}
#endif

void spapr_phb_dma_reset(SpaprPhbState *sphb);

static inline unsigned spapr_phb_windows_supported(SpaprPhbState *sphb)
{
    return sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1;
}

#endif /* PCI_HOST_SPAPR_H */