2011-10-30 21:16:46 +04:00
|
|
|
/*
|
|
|
|
* QEMU SPAPR PCI BUS definitions
|
|
|
|
*
|
|
|
|
* Copyright (c) 2011 Alexey Kardashevskiy <aik@au1.ibm.com>
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
2020-10-23 15:44:24 +03:00
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
2011-10-30 21:16:46 +04:00
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2016-06-29 11:12:57 +03:00
|
|
|
#ifndef PCI_HOST_SPAPR_H
|
|
|
|
#define PCI_HOST_SPAPR_H
|
2011-10-30 21:16:46 +04:00
|
|
|
|
2016-06-29 12:31:09 +03:00
|
|
|
#include "hw/ppc/spapr.h"
|
2012-12-12 16:24:50 +04:00
|
|
|
#include "hw/pci/pci.h"
|
|
|
|
#include "hw/pci/pci_host.h"
|
2013-02-05 20:06:20 +04:00
|
|
|
#include "hw/ppc/xics.h"
|
2020-09-03 23:43:22 +03:00
|
|
|
#include "qom/object.h"
|
2011-10-30 21:16:46 +04:00
|
|
|
|
2012-08-20 21:08:05 +04:00
|
|
|
#define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
|
|
|
|
|
2020-09-16 21:25:19 +03:00
|
|
|
OBJECT_DECLARE_SIMPLE_TYPE(SpaprPhbState, SPAPR_PCI_HOST_BRIDGE)
|
2012-08-20 21:08:05 +04:00
|
|
|
|
2016-07-04 06:33:07 +03:00
|
|
|
#define SPAPR_PCI_DMA_MAX_WINDOWS 2
|
|
|
|
|
2014-05-27 09:36:31 +04:00
|
|
|
|
2019-08-28 21:20:44 +03:00
|
|
|
typedef struct SpaprPciMsi {
|
2014-05-30 13:34:20 +04:00
|
|
|
uint32_t first_irq;
|
|
|
|
uint32_t num;
|
2019-08-28 21:20:44 +03:00
|
|
|
} SpaprPciMsi;
|
2014-05-30 13:34:20 +04:00
|
|
|
|
2019-08-28 21:20:44 +03:00
|
|
|
typedef struct SpaprPciMsiMig {
|
2014-05-30 13:34:20 +04:00
|
|
|
uint32_t key;
|
2019-08-28 21:20:44 +03:00
|
|
|
SpaprPciMsi value;
|
|
|
|
} SpaprPciMsiMig;
|
|
|
|
|
|
|
|
typedef struct SpaprPciLsi {
|
|
|
|
uint32_t irq;
|
|
|
|
} SpaprPciLsi;
|
|
|
|
|
|
|
|
typedef struct SpaprPhbPciNvGpuConfig SpaprPhbPciNvGpuConfig;
|
2014-05-30 13:34:20 +04:00
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
struct SpaprPhbState {
|
2012-08-20 21:08:09 +04:00
|
|
|
PCIHostState parent_obj;
|
2011-10-30 21:16:46 +04:00
|
|
|
|
2015-01-14 05:33:39 +03:00
|
|
|
uint32_t index;
|
2011-10-30 21:16:46 +04:00
|
|
|
uint64_t buid;
|
2012-03-12 21:50:24 +04:00
|
|
|
char *dtbusname;
|
2015-05-07 08:33:52 +03:00
|
|
|
bool dr_enabled;
|
2011-10-30 21:16:46 +04:00
|
|
|
|
|
|
|
MemoryRegion memspace, iospace;
|
spapr_pci: Add a 64-bit MMIO window
On real hardware, and under pHyp, the PCI host bridges on Power machines
typically advertise two outbound MMIO windows from the guest's physical
memory space to PCI memory space:
- A 32-bit window which maps onto 2GiB..4GiB in the PCI address space
- A 64-bit window which maps onto a large region somewhere high in PCI
address space (traditionally this used an identity mapping from guest
physical address to PCI address, but that's not always the case)
The qemu implementation in spapr-pci-host-bridge, however, only supports a
single outbound MMIO window, however. At least some Linux versions expect
the two windows however, so we arranged this window to map onto the PCI
memory space from 2 GiB..~64 GiB, then advertised it as two contiguous
windows, the "32-bit" window from 2G..4G and the "64-bit" window from
4G..~64G.
This approach means, however, that the 64G window is not naturally aligned.
In turn this limits the size of the largest BAR we can map (which does have
to be naturally aligned) to roughly half of the total window. With some
large nVidia GPGPU cards which have huge memory BARs, this is starting to
be a problem.
This patch adds true support for separate 32-bit and 64-bit outbound MMIO
windows to the spapr-pci-host-bridge implementation, each of which can
be independently configured. The 32-bit window always maps to 2G.. in PCI
space, but the PCI address of the 64-bit window can be configured (it
defaults to the same as the guest physical address).
So as not to break possible existing configurations, as long as a 64-bit
window is not specified, a large single window can be specified. This
will appear the same way to the guest as the old approach, although it's
now implemented by two contiguous memory regions rather than a single one.
For now, this only adds the possibility of 64-bit windows. The default
configuration still uses the legacy mode.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
2016-10-11 06:23:33 +03:00
|
|
|
hwaddr mem_win_addr, mem_win_size, mem64_win_addr, mem64_win_size;
|
|
|
|
uint64_t mem64_win_pciaddr;
|
|
|
|
hwaddr io_win_addr, io_win_size;
|
|
|
|
MemoryRegion mem32window, mem64window, iowindow, msiwindow;
|
2012-08-07 20:10:37 +04:00
|
|
|
|
2016-07-04 06:33:07 +03:00
|
|
|
uint32_t dma_liobn[SPAPR_PCI_DMA_MAX_WINDOWS];
|
2015-09-24 02:56:44 +03:00
|
|
|
hwaddr dma_win_addr, dma_win_size;
|
2012-10-30 15:47:48 +04:00
|
|
|
AddressSpace iommu_as;
|
2014-05-27 09:36:32 +04:00
|
|
|
MemoryRegion iommu_root;
|
2011-10-30 21:16:46 +04:00
|
|
|
|
2019-08-28 21:20:44 +03:00
|
|
|
SpaprPciLsi lsi_table[PCI_NUM_PINS];
|
2011-10-30 21:16:46 +04:00
|
|
|
|
2014-05-30 13:34:20 +04:00
|
|
|
GHashTable *msi;
|
|
|
|
/* Temporary cache for migration purposes */
|
|
|
|
int32_t msi_devs_num;
|
2019-08-28 21:20:44 +03:00
|
|
|
SpaprPciMsiMig *msi_devs;
|
2012-08-07 20:10:37 +04:00
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
QLIST_ENTRY(SpaprPhbState) list;
|
2016-07-04 06:33:07 +03:00
|
|
|
|
|
|
|
bool ddw_enabled;
|
|
|
|
uint64_t page_size_mask;
|
|
|
|
uint64_t dma64_win_addr;
|
2016-07-27 11:03:38 +03:00
|
|
|
|
|
|
|
uint32_t numa_node;
|
2016-11-23 02:26:38 +03:00
|
|
|
|
2017-03-14 03:54:17 +03:00
|
|
|
bool pcie_ecs; /* Allow access to PCIe extended config space? */
|
|
|
|
|
2016-11-23 02:26:38 +03:00
|
|
|
/* Fields for migration compatibility hacks */
|
|
|
|
bool pre_2_8_migration;
|
|
|
|
uint32_t mig_liobn;
|
|
|
|
hwaddr mig_mem_win_addr, mig_mem_win_size;
|
|
|
|
hwaddr mig_io_win_addr, mig_io_win_size;
|
spapr: Support NVIDIA V100 GPU with NVLink2
NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory
space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver
implements special regions for such GPUs and emulates an NVLink bridge.
NVLink2-enabled POWER9 CPUs also provide address translation services
which includes an ATS shootdown (ATSD) register exported via the NVLink
bridge device.
This adds a quirk to VFIO to map the GPU memory and create an MR;
the new MR is stored in a PCI device as a QOM link. The sPAPR PCI uses
this to get the MR and map it to the system address space.
Another quirk does the same for ATSD.
This adds additional steps to sPAPR PHB setup:
1. Search for specific GPUs and NPUs, collect findings in
sPAPRPHBState::nvgpus, manage system address space mappings;
2. Add device-specific properties such as "ibm,npu", "ibm,gpu",
"memory-block", "link-speed" to advertise the NVLink2 function to
the guest;
3. Add "mmio-atsd" to vPHB to advertise the ATSD capability;
4. Add new memory blocks (with extra "linux,memory-usable" to prevent
the guest OS from accessing the new memory until it is onlined) and
npuphb# nodes representing an NPU unit for every vPHB as the GPU driver
uses it for link discovery.
This allocates space for GPU RAM and ATSD like we do for MMIOs by
adding 2 new parameters to the phb_placement() hook. Older machine types
set these to zero.
This puts new memory nodes in a separate NUMA node to as the GPU RAM
needs to be configured equally distant from any other node in the system.
Unlike the host setup which assigns numa ids from 255 downwards, this
adds new NUMA nodes after the user configures nodes or from 1 if none
were configured.
This adds requirement similar to EEH - one IOMMU group per vPHB.
The reason for this is that ATSD registers belong to a physical NPU
so they cannot invalidate translations on GPUs attached to another NPU.
It is guaranteed by the host platform as it does not mix NVLink bridges
or GPUs from different NPU in the same IOMMU group. If more than one
IOMMU group is detected on a vPHB, this disables ATSD support for that
vPHB and prints a warning.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for vfio portions]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20190312082103.130561-1-aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-12 11:21:03 +03:00
|
|
|
hwaddr nv2_gpa_win_addr;
|
|
|
|
hwaddr nv2_atsd_win_addr;
|
2019-08-28 21:20:44 +03:00
|
|
|
SpaprPhbPciNvGpuConfig *nvgpus;
|
2020-07-17 01:56:55 +03:00
|
|
|
bool pre_5_1_assoc;
|
2014-05-27 09:36:31 +04:00
|
|
|
};
|
2011-10-30 21:16:46 +04:00
|
|
|
|
2015-01-30 04:53:19 +03:00
|
|
|
#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
|
spapr_pci: Add a 64-bit MMIO window
On real hardware, and under pHyp, the PCI host bridges on Power machines
typically advertise two outbound MMIO windows from the guest's physical
memory space to PCI memory space:
- A 32-bit window which maps onto 2GiB..4GiB in the PCI address space
- A 64-bit window which maps onto a large region somewhere high in PCI
address space (traditionally this used an identity mapping from guest
physical address to PCI address, but that's not always the case)
The qemu implementation in spapr-pci-host-bridge, however, only supports a
single outbound MMIO window, however. At least some Linux versions expect
the two windows however, so we arranged this window to map onto the PCI
memory space from 2 GiB..~64 GiB, then advertised it as two contiguous
windows, the "32-bit" window from 2G..4G and the "64-bit" window from
4G..~64G.
This approach means, however, that the 64G window is not naturally aligned.
In turn this limits the size of the largest BAR we can map (which does have
to be naturally aligned) to roughly half of the total window. With some
large nVidia GPGPU cards which have huge memory BARs, this is starting to
be a problem.
This patch adds true support for separate 32-bit and 64-bit outbound MMIO
windows to the spapr-pci-host-bridge implementation, each of which can
be independently configured. The 32-bit window always maps to 2G.. in PCI
space, but the PCI address of the 64-bit window can be configured (it
defaults to the same as the guest physical address).
So as not to break possible existing configurations, as long as a 64-bit
window is not specified, a large single window can be specified. This
will appear the same way to the guest as the old approach, although it's
now implemented by two contiguous memory regions rather than a single one.
For now, this only adds the possibility of 64-bit windows. The default
configuration still uses the legacy mode.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
2016-10-11 06:23:33 +03:00
|
|
|
#define SPAPR_PCI_MEM32_WIN_SIZE \
|
|
|
|
((1ULL << 32) - SPAPR_PCI_MEM_WIN_BUS_OFFSET)
|
2016-10-16 04:04:15 +03:00
|
|
|
#define SPAPR_PCI_MEM64_WIN_SIZE 0x10000000000ULL /* 1 TiB */
|
2015-01-30 04:53:19 +03:00
|
|
|
|
2018-12-21 03:36:53 +03:00
|
|
|
/* All PCI outbound windows will be within this range */
|
2016-10-16 04:04:15 +03:00
|
|
|
#define SPAPR_PCI_BASE (1ULL << 45) /* 32 TiB */
|
|
|
|
#define SPAPR_PCI_LIMIT (1ULL << 46) /* 64 TiB */
|
|
|
|
|
2018-12-21 03:36:53 +03:00
|
|
|
#define SPAPR_MAX_PHBS ((SPAPR_PCI_LIMIT - SPAPR_PCI_BASE) / \
|
|
|
|
SPAPR_PCI_MEM64_WIN_SIZE - 1)
|
|
|
|
|
2013-01-23 21:20:39 +04:00
|
|
|
#define SPAPR_PCI_IO_WIN_SIZE 0x10000
|
2013-07-12 11:38:24 +04:00
|
|
|
|
|
|
|
#define SPAPR_PCI_MSI_WINDOW 0x40000000000ULL
|
2013-01-23 21:20:39 +04:00
|
|
|
|
spapr: Support NVIDIA V100 GPU with NVLink2
NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory
space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver
implements special regions for such GPUs and emulates an NVLink bridge.
NVLink2-enabled POWER9 CPUs also provide address translation services
which includes an ATS shootdown (ATSD) register exported via the NVLink
bridge device.
This adds a quirk to VFIO to map the GPU memory and create an MR;
the new MR is stored in a PCI device as a QOM link. The sPAPR PCI uses
this to get the MR and map it to the system address space.
Another quirk does the same for ATSD.
This adds additional steps to sPAPR PHB setup:
1. Search for specific GPUs and NPUs, collect findings in
sPAPRPHBState::nvgpus, manage system address space mappings;
2. Add device-specific properties such as "ibm,npu", "ibm,gpu",
"memory-block", "link-speed" to advertise the NVLink2 function to
the guest;
3. Add "mmio-atsd" to vPHB to advertise the ATSD capability;
4. Add new memory blocks (with extra "linux,memory-usable" to prevent
the guest OS from accessing the new memory until it is onlined) and
npuphb# nodes representing an NPU unit for every vPHB as the GPU driver
uses it for link discovery.
This allocates space for GPU RAM and ATSD like we do for MMIOs by
adding 2 new parameters to the phb_placement() hook. Older machine types
set these to zero.
This puts new memory nodes in a separate NUMA node to as the GPU RAM
needs to be configured equally distant from any other node in the system.
Unlike the host setup which assigns numa ids from 255 downwards, this
adds new NUMA nodes after the user configures nodes or from 1 if none
were configured.
This adds requirement similar to EEH - one IOMMU group per vPHB.
The reason for this is that ATSD registers belong to a physical NPU
so they cannot invalidate translations on GPUs attached to another NPU.
It is guaranteed by the host platform as it does not mix NVLink bridges
or GPUs from different NPU in the same IOMMU group. If more than one
IOMMU group is detected on a vPHB, this disables ATSD support for that
vPHB and prints a warning.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for vfio portions]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20190312082103.130561-1-aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-12 11:21:03 +03:00
|
|
|
#define SPAPR_PCI_NV2RAM64_WIN_BASE SPAPR_PCI_LIMIT
|
|
|
|
#define SPAPR_PCI_NV2RAM64_WIN_SIZE (2 * TiB) /* For up to 6 GPUs 256GB each */
|
|
|
|
|
|
|
|
/* Max number of NVLinks per GPU in any physical box */
|
|
|
|
#define NVGPU_MAX_LINKS 3
|
|
|
|
|
|
|
|
/*
|
|
|
|
* GPU RAM starts at 64TiB so huge DMA window to cover it all ends at 128TiB
|
|
|
|
* which is enough. We do not need DMA for ATSD so we put them at 128TiB.
|
|
|
|
*/
|
|
|
|
#define SPAPR_PCI_NV2ATSD_WIN_BASE (128 * TiB)
|
|
|
|
#define SPAPR_PCI_NV2ATSD_WIN_SIZE (NVGPU_MAX_NUM * NVGPU_MAX_LINKS * \
|
|
|
|
64 * KiB)
|
|
|
|
|
2019-09-27 06:44:58 +03:00
|
|
|
int spapr_dt_phb(SpaprMachineState *spapr, SpaprPhbState *phb,
|
|
|
|
uint32_t intc_phandle, void *fdt, int *node_offset);
|
2011-10-30 21:16:46 +04:00
|
|
|
|
2012-08-07 20:10:33 +04:00
|
|
|
void spapr_pci_rtas_init(void);
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
SpaprPhbState *spapr_pci_find_phb(SpaprMachineState *spapr, uint64_t buid);
|
|
|
|
PCIDevice *spapr_pci_find_dev(SpaprMachineState *spapr, uint64_t buid,
|
2015-05-07 08:33:34 +03:00
|
|
|
uint32_t config_addr);
|
|
|
|
|
2019-02-19 20:17:53 +03:00
|
|
|
/* DRC callbacks */
|
2017-05-22 22:35:48 +03:00
|
|
|
void spapr_phb_remove_pci_device_cb(DeviceState *dev);
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
int spapr_pci_dt_populate(SpaprDrc *drc, SpaprMachineState *spapr,
|
2019-02-19 20:17:53 +03:00
|
|
|
void *fdt, int *fdt_start_offset, Error **errp);
|
2017-05-22 22:35:48 +03:00
|
|
|
|
2016-02-29 09:45:05 +03:00
|
|
|
/* VFIO EEH hooks */
|
|
|
|
#ifdef CONFIG_LINUX
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
bool spapr_phb_eeh_available(SpaprPhbState *sphb);
|
|
|
|
int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
|
2016-02-29 09:45:05 +03:00
|
|
|
unsigned int addr, int option);
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb, int *state);
|
|
|
|
int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option);
|
|
|
|
int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb);
|
2016-02-29 09:45:05 +03:00
|
|
|
void spapr_phb_vfio_reset(DeviceState *qdev);
|
spapr: Support NVIDIA V100 GPU with NVLink2
NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory
space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver
implements special regions for such GPUs and emulates an NVLink bridge.
NVLink2-enabled POWER9 CPUs also provide address translation services
which includes an ATS shootdown (ATSD) register exported via the NVLink
bridge device.
This adds a quirk to VFIO to map the GPU memory and create an MR;
the new MR is stored in a PCI device as a QOM link. The sPAPR PCI uses
this to get the MR and map it to the system address space.
Another quirk does the same for ATSD.
This adds additional steps to sPAPR PHB setup:
1. Search for specific GPUs and NPUs, collect findings in
sPAPRPHBState::nvgpus, manage system address space mappings;
2. Add device-specific properties such as "ibm,npu", "ibm,gpu",
"memory-block", "link-speed" to advertise the NVLink2 function to
the guest;
3. Add "mmio-atsd" to vPHB to advertise the ATSD capability;
4. Add new memory blocks (with extra "linux,memory-usable" to prevent
the guest OS from accessing the new memory until it is onlined) and
npuphb# nodes representing an NPU unit for every vPHB as the GPU driver
uses it for link discovery.
This allocates space for GPU RAM and ATSD like we do for MMIOs by
adding 2 new parameters to the phb_placement() hook. Older machine types
set these to zero.
This puts new memory nodes in a separate NUMA node to as the GPU RAM
needs to be configured equally distant from any other node in the system.
Unlike the host setup which assigns numa ids from 255 downwards, this
adds new NUMA nodes after the user configures nodes or from 1 if none
were configured.
This adds requirement similar to EEH - one IOMMU group per vPHB.
The reason for this is that ATSD registers belong to a physical NPU
so they cannot invalidate translations on GPUs attached to another NPU.
It is guaranteed by the host platform as it does not mix NVLink bridges
or GPUs from different NPU in the same IOMMU group. If more than one
IOMMU group is detected on a vPHB, this disables ATSD support for that
vPHB and prints a warning.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for vfio portions]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20190312082103.130561-1-aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-12 11:21:03 +03:00
|
|
|
void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp);
|
|
|
|
void spapr_phb_nvgpu_free(SpaprPhbState *sphb);
|
|
|
|
void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt, int bus_off,
|
|
|
|
Error **errp);
|
|
|
|
void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb, void *fdt);
|
|
|
|
void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
|
|
|
|
SpaprPhbState *sphb);
|
2016-02-29 09:45:05 +03:00
|
|
|
#else
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static inline bool spapr_phb_eeh_available(SpaprPhbState *sphb)
|
2016-02-29 09:19:42 +03:00
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static inline int spapr_phb_vfio_eeh_set_option(SpaprPhbState *sphb,
|
2016-02-29 09:45:05 +03:00
|
|
|
unsigned int addr, int option)
|
|
|
|
{
|
|
|
|
return RTAS_OUT_HW_ERROR;
|
|
|
|
}
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static inline int spapr_phb_vfio_eeh_get_state(SpaprPhbState *sphb,
|
2016-02-29 09:45:05 +03:00
|
|
|
int *state)
|
|
|
|
{
|
|
|
|
return RTAS_OUT_HW_ERROR;
|
|
|
|
}
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static inline int spapr_phb_vfio_eeh_reset(SpaprPhbState *sphb, int option)
|
2016-02-29 09:45:05 +03:00
|
|
|
{
|
|
|
|
return RTAS_OUT_HW_ERROR;
|
|
|
|
}
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static inline int spapr_phb_vfio_eeh_configure(SpaprPhbState *sphb)
|
2016-02-29 09:45:05 +03:00
|
|
|
{
|
|
|
|
return RTAS_OUT_HW_ERROR;
|
|
|
|
}
|
|
|
|
static inline void spapr_phb_vfio_reset(DeviceState *qdev)
|
|
|
|
{
|
|
|
|
}
|
spapr: Support NVIDIA V100 GPU with NVLink2
NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory
space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver
implements special regions for such GPUs and emulates an NVLink bridge.
NVLink2-enabled POWER9 CPUs also provide address translation services
which includes an ATS shootdown (ATSD) register exported via the NVLink
bridge device.
This adds a quirk to VFIO to map the GPU memory and create an MR;
the new MR is stored in a PCI device as a QOM link. The sPAPR PCI uses
this to get the MR and map it to the system address space.
Another quirk does the same for ATSD.
This adds additional steps to sPAPR PHB setup:
1. Search for specific GPUs and NPUs, collect findings in
sPAPRPHBState::nvgpus, manage system address space mappings;
2. Add device-specific properties such as "ibm,npu", "ibm,gpu",
"memory-block", "link-speed" to advertise the NVLink2 function to
the guest;
3. Add "mmio-atsd" to vPHB to advertise the ATSD capability;
4. Add new memory blocks (with extra "linux,memory-usable" to prevent
the guest OS from accessing the new memory until it is onlined) and
npuphb# nodes representing an NPU unit for every vPHB as the GPU driver
uses it for link discovery.
This allocates space for GPU RAM and ATSD like we do for MMIOs by
adding 2 new parameters to the phb_placement() hook. Older machine types
set these to zero.
This puts new memory nodes in a separate NUMA node to as the GPU RAM
needs to be configured equally distant from any other node in the system.
Unlike the host setup which assigns numa ids from 255 downwards, this
adds new NUMA nodes after the user configures nodes or from 1 if none
were configured.
This adds requirement similar to EEH - one IOMMU group per vPHB.
The reason for this is that ATSD registers belong to a physical NPU
so they cannot invalidate translations on GPUs attached to another NPU.
It is guaranteed by the host platform as it does not mix NVLink bridges
or GPUs from different NPU in the same IOMMU group. If more than one
IOMMU group is detected on a vPHB, this disables ATSD support for that
vPHB and prints a warning.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for vfio portions]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
Message-Id: <20190312082103.130561-1-aik@ozlabs.ru>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-12 11:21:03 +03:00
|
|
|
static inline void spapr_phb_nvgpu_setup(SpaprPhbState *sphb, Error **errp)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void spapr_phb_nvgpu_free(SpaprPhbState *sphb)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void spapr_phb_nvgpu_populate_dt(SpaprPhbState *sphb, void *fdt,
|
|
|
|
int bus_off, Error **errp)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void spapr_phb_nvgpu_ram_populate_dt(SpaprPhbState *sphb,
|
|
|
|
void *fdt)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
static inline void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt,
|
|
|
|
int offset,
|
|
|
|
SpaprPhbState *sphb)
|
|
|
|
{
|
|
|
|
}
|
2016-02-29 09:45:05 +03:00
|
|
|
#endif
|
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
void spapr_phb_dma_reset(SpaprPhbState *sphb);
|
2016-06-01 11:57:39 +03:00
|
|
|
|
spapr: Use CamelCase properly
The qemu coding standard is to use CamelCase for type and structure names,
and the pseries code follows that... sort of. There are quite a lot of
places where we bend the rules in order to preserve the capitalization of
internal acronyms like "PHB", "TCE", "DIMM" and most commonly "sPAPR".
That was a bad idea - it frequently leads to names ending up with hard to
read clusters of capital letters, and means they don't catch the eye as
type identifiers, which is kind of the point of the CamelCase convention in
the first place.
In short, keeping type identifiers look like CamelCase is more important
than preserving standard capitalization of internal "words". So, this
patch renames a heap of spapr internal type names to a more standard
CamelCase.
In addition to case changes, we also make some other identifier renames:
VIOsPAPR* -> SpaprVio*
The reverse word ordering was only ever used to mitigate the capital
cluster, so revert to the natural ordering.
VIOsPAPRVTYDevice -> SpaprVioVty
VIOsPAPRVLANDevice -> SpaprVioVlan
Brevity, since the "Device" didn't add useful information
sPAPRDRConnector -> SpaprDrc
sPAPRDRConnectorClass -> SpaprDrcClass
Brevity, and makes it clearer this is the same thing as a "DRC"
mentioned in many other places in the code
This is 100% a mechanical search-and-replace patch. It will, however,
conflict with essentially any and all outstanding patches touching the
spapr code.
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
2019-03-06 07:35:37 +03:00
|
|
|
static inline unsigned spapr_phb_windows_supported(SpaprPhbState *sphb)
|
2019-02-19 20:18:18 +03:00
|
|
|
{
|
|
|
|
return sphb->ddw_enabled ? SPAPR_PCI_DMA_MAX_WINDOWS : 1;
|
|
|
|
}
|
|
|
|
|
2016-06-29 11:12:57 +03:00
|
|
|
#endif /* PCI_HOST_SPAPR_H */
|