442 lines
13 KiB
C
442 lines
13 KiB
C
/*
 * SPDX-FileCopyrightText: Copyright (c) 2017-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
|
|
|
|
/*
 * nv-ibmnpu.c - interface with the ibmnpu (IBM NVLink Processing Unit) "module"
 */
|
|
#include "nv-linux.h"
|
|
|
|
#if defined(NVCPU_PPC64LE)
|
|
#include "nv-ibmnpu.h"
|
|
#include "nv-rsync.h"
|
|
|
|
/*
|
|
* Temporary query to get the L1D cache block size directly from the device
|
|
* tree for the offline cache flush workaround, since the ppc64_caches symbol
|
|
* is unavailable to us.
|
|
*/
|
|
const NvU32 P9_L1D_CACHE_DEFAULT_BLOCK_SIZE = 0x80;
|
|
|
|
static NvU32 nv_ibm_get_cpu_l1d_cache_block_size(void)
|
|
{
|
|
const __be32 *block_size_prop;
|
|
|
|
/*
|
|
* Attempt to look up the block size from device tree. If unavailable, just
|
|
* return the default that we see on these systems.
|
|
*/
|
|
struct device_node *cpu = of_find_node_by_type(NULL, "cpu");
|
|
if (!cpu)
|
|
{
|
|
return P9_L1D_CACHE_DEFAULT_BLOCK_SIZE;
|
|
}
|
|
|
|
block_size_prop = of_get_property(cpu, "d-cache-block-size", NULL);
|
|
if (!block_size_prop)
|
|
{
|
|
return P9_L1D_CACHE_DEFAULT_BLOCK_SIZE;
|
|
}
|
|
|
|
return be32_to_cpu(*block_size_prop);
|
|
}
|
|
|
|
/*
|
|
* GPU device memory can be exposed to the kernel as NUMA node memory via the
|
|
* IBMNPU devices associated with the GPU. The platform firmware will specify
|
|
* the parameters of where the memory lives in the system address space via
|
|
* firmware properties on the IBMNPU devices. These properties specify what
|
|
* memory can be accessed through the IBMNPU device, and the driver can online
|
|
* a GPU device's memory into the range accessible by its associated IBMNPU
|
|
* devices.
|
|
*
|
|
* This function calls over to the IBMNPU driver to query the parameters from
|
|
* firmware, and validates that the resulting parameters are acceptable.
|
|
*/
|
|
static void nv_init_ibmnpu_numa_info(nv_state_t *nv)
|
|
{
|
|
nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
|
|
nv_npu_numa_info_t *npu_numa_info = &nvl->npu->numa_info;
|
|
struct pci_dev *npu_dev = nvl->npu->devs[0];
|
|
NvU64 spa, gpa, aper_size;
|
|
|
|
/*
|
|
* Terminology:
|
|
* - system physical address (spa): 47-bit NVIDIA physical address, which
|
|
* is the CPU real address with the NVLink address compression scheme
|
|
* already applied in firmware.
|
|
* - guest physical address (gpa): 56-bit physical address as seen by the
|
|
* operating system. This is the base address that we should use for
|
|
* onlining device memory.
|
|
*/
|
|
nvl->numa_info.node_id = ibmnpu_device_get_memory_config(npu_dev, &spa, &gpa,
|
|
&aper_size);
|
|
if (nvl->numa_info.node_id == NUMA_NO_NODE)
|
|
{
|
|
NV_DEV_PRINTF(NV_DBG_SETUP, nv, "No NUMA memory aperture found\n");
|
|
return;
|
|
}
|
|
|
|
/* Validate that the compressed system physical address is not too wide */
|
|
if (spa & (~(BIT_ULL(nv_volta_dma_addr_size) - 1)))
|
|
{
|
|
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
|
|
"Invalid NUMA memory system pa 0x%llx"
|
|
" on IBM-NPU device %04x:%02x:%02x.%u\n",
|
|
spa, NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
|
|
NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
|
|
goto invalid_numa_config;
|
|
}
|
|
|
|
/*
|
|
* Validate that the guest physical address is aligned to 128GB.
|
|
* This alignment requirement comes from the Volta address space
|
|
* size on POWER9.
|
|
*/
|
|
if (!IS_ALIGNED(gpa, BIT_ULL(nv_volta_addr_space_width)))
|
|
{
|
|
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
|
|
"Invalid alignment in NUMA memory guest pa 0x%llx"
|
|
" on IBM-NPU device %04x:%02x:%02x.%u\n",
|
|
gpa, NV_PCI_DOMAIN_NUMBER(npu_dev), NV_PCI_BUS_NUMBER(npu_dev),
|
|
NV_PCI_SLOT_NUMBER(npu_dev), PCI_FUNC(npu_dev->devfn));
|
|
goto invalid_numa_config;
|
|
}
|
|
|
|
/* Validate that the aperture can map all of the device's framebuffer */
|
|
if (aper_size < nv->fb->size)
|
|
{
|
|
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
|
|
"Insufficient NUMA memory aperture size 0x%llx"
|
|
" on IBM-NPU device %04x:%02x:%02x.%u (0x%llx required)\n",
|
|
aper_size, NV_PCI_DOMAIN_NUMBER(npu_dev),
|
|
NV_PCI_BUS_NUMBER(npu_dev), NV_PCI_SLOT_NUMBER(npu_dev),
|
|
PCI_FUNC(npu_dev->devfn), nv->fb->size);
|
|
goto invalid_numa_config;
|
|
}
|
|
|
|
npu_numa_info->compr_sys_phys_addr = spa;
|
|
npu_numa_info->guest_phys_addr = gpa;
|
|
|
|
if (NVreg_EnableUserNUMAManagement)
|
|
{
|
|
NV_ATOMIC_SET(nvl->numa_info.status, NV_IOCTL_NUMA_STATUS_OFFLINE);
|
|
}
|
|
else
|
|
{
|
|
NV_DEV_PRINTF(NV_DBG_SETUP, nv, "User-mode NUMA onlining disabled.\n");
|
|
nvl->numa_info.node_id = NUMA_NO_NODE;
|
|
}
|
|
|
|
NV_DEV_PRINTF(NV_DBG_SETUP, nv, "NUMA memory aperture: "
|
|
"[spa = 0x%llx, gpa = 0x%llx, aper_size = 0x%llx]\n",
|
|
spa, gpa, aper_size);
|
|
|
|
/* Get the CPU's L1D cache block size for offlining cache flush */
|
|
npu_numa_info->l1d_cache_block_size = nv_ibm_get_cpu_l1d_cache_block_size();
|
|
|
|
return;
|
|
|
|
invalid_numa_config:
|
|
NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
|
|
"NUMA memory aperture disabled due to invalid firmware configuration\n");
|
|
nvl->numa_info.node_id = NUMA_NO_NODE;
|
|
}
|
|
|
|
/*
 * Discover the IBMNPU devices attached to this GPU and record them in a newly
 * allocated nvl->npu structure, then query the generation register space and
 * the NUMA memory configuration.
 *
 * Does nothing when NV_PNV_PCI_GET_NPU_DEV_PRESENT is not defined, when the
 * GPU has no IBMNPU device at index 0, or when the allocation fails.
 */
void nv_init_ibmnpu_info(nv_state_t *nv)
{
#if defined(NV_PNV_PCI_GET_NPU_DEV_PRESENT)
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    struct pci_dev *first_npu = pnv_pci_get_npu_dev(nvl->pci_dev, 0);
    NvU8 found;

    if (first_npu == NULL)
    {
        return;
    }

    if (os_alloc_mem((void **)&nvl->npu, sizeof(nv_ibmnpu_info_t)) != NV_OK)
    {
        return;
    }

    os_mem_set(nvl->npu, 0, sizeof(nv_ibmnpu_info_t));

    /* Slot 0 is already known; probe the remaining slots until one is empty */
    nvl->npu->devs[0] = first_npu;
    found = 1;

    while (found < NV_MAX_ATTACHED_IBMNPUS)
    {
        struct pci_dev *next_npu = pnv_pci_get_npu_dev(nvl->pci_dev, found);

        nvl->npu->devs[found] = next_npu;
        if (next_npu == NULL)
        {
            break;
        }

        found++;
    }

    nvl->npu->dev_count = found;

    /*
     * If the table filled up and yet another IBMNPU device exists,
     * NV_MAX_ATTACHED_IBMNPUS will need to be bumped.
     */
    WARN_ON((found == NV_MAX_ATTACHED_IBMNPUS) &&
            pnv_pci_get_npu_dev(nvl->pci_dev, found));

    ibmnpu_device_get_genregs_info(first_npu, &nvl->npu->genregs);

    if (nvl->npu->genregs.size == 0)
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "IBM-NPU device %04x:%02x:%02x.%u associated with GPU "
            "does not support generation registers\n",
            NV_PCI_DOMAIN_NUMBER(first_npu), NV_PCI_BUS_NUMBER(first_npu),
            NV_PCI_SLOT_NUMBER(first_npu), PCI_FUNC(first_npu->devfn));
    }
    else
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "IBM-NPU device %04x:%02x:%02x.%u associated with GPU "
            " has a generation register space 0x%llx-0x%llx\n",
            NV_PCI_DOMAIN_NUMBER(first_npu), NV_PCI_BUS_NUMBER(first_npu),
            NV_PCI_SLOT_NUMBER(first_npu), PCI_FUNC(first_npu->devfn),
            nvl->npu->genregs.start_addr,
            nvl->npu->genregs.start_addr + nvl->npu->genregs.size - 1);
    }

    nv_init_ibmnpu_numa_info(nv);
#endif
}
|
|
|
|
/*
 * Free the IBMNPU bookkeeping structure attached to this device, if there is
 * one, and clear the pointer so that later calls see no NPU info.
 */
void nv_destroy_ibmnpu_info(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu == NULL)
    {
        return;
    }

    os_free_mem(nvl->npu);
    nvl->npu = NULL;
}
|
|
|
|
/*
 * Initialize every IBMNPU device recorded for this GPU, in index order.
 *
 * Returns 0 when there is no NPU info or when every device initialized.
 * On the first failure, the devices initialized so far are unregistered and
 * -EIO is returned.
 */
int nv_init_ibmnpu_devices(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU8 idx = 0;

    if (nvl->npu == NULL)
    {
        return 0;
    }

    while (idx < nvl->npu->dev_count)
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "Initializing IBM-NPU device %04x:%02x:%02x.%u\n",
            NV_PCI_DOMAIN_NUMBER(nvl->npu->devs[idx]),
            NV_PCI_BUS_NUMBER(nvl->npu->devs[idx]),
            NV_PCI_SLOT_NUMBER(nvl->npu->devs[idx]),
            PCI_FUNC(nvl->npu->devs[idx]->devfn));

        if (ibmnpu_init_device(nvl->npu->devs[idx]) != NVL_SUCCESS)
        {
            /* Roll back whatever was brought up before this device */
            nv_unregister_ibmnpu_devices(nv);
            return -EIO;
        }

        nvl->npu->initialized_dev_count++;
        idx++;
    }

    return 0;
}
|
|
|
|
/*
 * Unregister the IBMNPU devices that were successfully initialized (the first
 * initialized_dev_count entries) and reset that counter to zero. Does nothing
 * when there is no NPU info.
 */
void nv_unregister_ibmnpu_devices(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU8 idx = 0;

    if (nvl->npu == NULL)
    {
        return;
    }

    while (idx < nvl->npu->initialized_dev_count)
    {
        NV_DEV_PRINTF(NV_DBG_SETUP, nv,
            "Unregistering IBM-NPU device %04x:%02x:%02x.%u\n",
            NV_PCI_DOMAIN_NUMBER(nvl->npu->devs[idx]),
            NV_PCI_BUS_NUMBER(nvl->npu->devs[idx]),
            NV_PCI_SLOT_NUMBER(nvl->npu->devs[idx]),
            PCI_FUNC(nvl->npu->devs[idx]->devfn));

        ibmnpu_unregister_device(nvl->npu->devs[idx]);
        idx++;
    }

    nvl->npu->initialized_dev_count = 0;
}
|
|
|
|
/*
 * Report the generation register space of the GPU's first IBMNPU device.
 *
 * Each of addr, size and device is optional; only non-NULL outputs are
 * written. Returns NV_ERR_NOT_SUPPORTED when there is no NPU info or the
 * recorded generation register size is zero, NV_OK otherwise.
 */
NV_STATUS NV_API_CALL nv_get_ibmnpu_genreg_info(nv_state_t *nv, NvU64 *addr,
                                                NvU64 *size, void **device)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu == NULL)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    if (nvl->npu->genregs.size == 0)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    if (addr != NULL)
    {
        *addr = nvl->npu->genregs.start_addr;
    }

    if (size != NULL)
    {
        *size = nvl->npu->genregs.size;
    }

    if (device != NULL)
    {
        *device = (void*)nvl->npu->devs[0];
    }

    return NV_OK;
}
|
|
|
|
/*
 * Store the current rsync relaxed ordering mode in *mode.
 *
 * Returns NV_ERR_NOT_SUPPORTED (leaving *mode untouched) when there is no NPU
 * info or the recorded generation register size is zero, NV_OK otherwise.
 */
NV_STATUS NV_API_CALL nv_get_ibmnpu_relaxed_ordering_mode(nv_state_t *nv,
                                                          NvBool *mode)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu == NULL)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    if (nvl->npu->genregs.size == 0)
    {
        return NV_ERR_NOT_SUPPORTED;
    }

    *mode = nv_get_rsync_relaxed_ordering_mode(nv);

    return NV_OK;
}
|
|
|
|
/*
 * Wait for rsync on this device, but only when NPU info exists and a
 * non-empty generation register space was recorded for it.
 */
void NV_API_CALL nv_wait_for_ibmnpu_rsync(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if ((nvl->npu != NULL) && (nvl->npu->genregs.size != 0))
    {
        nv_wait_for_rsync(nv);
    }
}
|
|
|
|
/*
 * Return the chip ID reported by the IBMNPU driver for the GPU's first
 * IBMNPU device, or -1 when there is no NPU info for this device.
 */
int nv_get_ibmnpu_chip_id(nv_state_t *nv)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);

    if (nvl->npu != NULL)
    {
        return ibmnpu_device_get_chip_id(nvl->npu->devs[0]);
    }

    return -1;
}
|
|
|
|
/*
 * Flush (dcbf) every CPU data cache block that overlaps the virtual range
 * [cpu_virtual, cpu_virtual + size).
 *
 * nv          - device whose recorded L1D cache block size is used as stride
 * cpu_virtual - CPU virtual address of the first byte to flush
 * size        - length of the range in bytes; zero flushes nothing
 *
 * The stride is the block size recorded in the NPU NUMA info. When there is
 * no NPU info, or no block size was recorded (NUMA setup returned early or
 * failed validation), P9_L1D_CACHE_DEFAULT_BLOCK_SIZE is used instead so the
 * loop always advances.
 */
void NV_API_CALL nv_ibmnpu_cache_flush_range(nv_state_t *nv, NvU64 cpu_virtual, NvU64 size)
{
    nv_linux_state_t *nvl = NV_GET_NVL_FROM_NV_STATE(nv);
    NvU64 cbsize = P9_L1D_CACHE_DEFAULT_BLOCK_SIZE;
    NvU64 addr, end;

    /*
     * The range is commonly an ioremap()ed mapping of the GPU's ATS range and
     * needs to be compared against the created mappings. Alternatively, kernel
     * page tables can be dumped through sysfs if CONFIG_PPC_PTDUMP is enabled.
     */
    NV_DEV_PRINTF(NV_DBG_INFO, nv,
        "Flushing CPU virtual range [0x%llx, 0x%llx)\n",
        cpu_virtual, cpu_virtual + size);

    /* A zero stride would spin forever below; keep the default in that case */
    if ((nvl->npu != NULL) &&
        (nvl->npu->numa_info.l1d_cache_block_size != 0))
    {
        cbsize = nvl->npu->numa_info.l1d_cache_block_size;
    }

    asm volatile("sync; isync" ::: "memory");

    /* Force eviction of any cache lines from the NUMA-onlined region. */
    if (size != 0)
    {
        end = cpu_virtual + size;

        /*
         * Start from the beginning of the block containing cpu_virtual so
         * that the block holding the last byte is reached even when the
         * range does not start on a block boundary.
         */
        for (addr = cpu_virtual - (cpu_virtual % cbsize);
             addr < end;
             addr += cbsize)
        {
            /*
             * Use the "0,RB" form: with an "r" constraint on the RA operand
             * the compiler may pick r0, which the instruction treats as the
             * literal value 0 rather than the register contents.
             */
            asm volatile("dcbf 0,%0" :: "r" (addr) : "memory");

            /* Reschedule if necessary to avoid lockup warnings */
            cond_resched();
        }
    }

    asm volatile("sync; isync" ::: "memory");
}
|
|
|
|
#else
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: there is nothing to set up. */
void nv_init_ibmnpu_info(nv_state_t *nv)
{
    (void)nv;
}
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: there is nothing to free. */
void nv_destroy_ibmnpu_info(nv_state_t *nv)
{
    (void)nv;
}
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: always reports success (0). */
int nv_init_ibmnpu_devices(nv_state_t *nv)
{
    (void)nv;

    return 0;
}
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: nothing to unregister. */
void nv_unregister_ibmnpu_devices(nv_state_t *nv)
{
    (void)nv;
}
|
|
|
|
/*
 * Stub used when NVCPU_PPC64LE is not defined: no output is written and
 * NV_ERR_NOT_SUPPORTED is always returned.
 */
NV_STATUS NV_API_CALL nv_get_ibmnpu_genreg_info(nv_state_t *nv, NvU64 *addr,
                                                NvU64 *size, void **device)
{
    (void)nv;
    (void)addr;
    (void)size;
    (void)device;

    return NV_ERR_NOT_SUPPORTED;
}
|
|
|
|
/*
 * Stub used when NVCPU_PPC64LE is not defined: *mode is not written and
 * NV_ERR_NOT_SUPPORTED is always returned.
 */
NV_STATUS NV_API_CALL nv_get_ibmnpu_relaxed_ordering_mode(nv_state_t *nv,
                                                          NvBool *mode)
{
    (void)nv;
    (void)mode;

    return NV_ERR_NOT_SUPPORTED;
}
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: returns immediately. */
void NV_API_CALL nv_wait_for_ibmnpu_rsync(nv_state_t *nv)
{
    (void)nv;
}
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: always returns -1. */
int nv_get_ibmnpu_chip_id(nv_state_t *nv)
{
    (void)nv;

    return -1;
}
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: no cache flush is performed. */
void NV_API_CALL nv_ibmnpu_cache_flush_range(nv_state_t *nv, NvU64 virtual, NvU64 size)
{
    (void)nv;
    (void)virtual;
    (void)size;
}
|
|
|
|
/* Stub used when NVCPU_PPC64LE is not defined: no cache flush is performed. */
void nv_ibmnpu_cache_flush_numa_region(nv_state_t *nv)
{
    (void)nv;
}
|
|
|
|
#endif
|