pcie_sriov: Allow user to create SR-IOV device

A user can create an SR-IOV device by specifying the PF with the
sriov-pf property of each VF. The VFs must be added before the PF.

A user-creatable VF must have PCIDeviceClass::sriov_vf_user_creatable
set. Such a VF cannot refer to the PF because it is created before the
PF.

A PF to which user-creatable VFs can be attached calls
pcie_sriov_pf_init_from_user_created_vfs() during realization and
pcie_sriov_pf_exit() when exiting.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Message-Id: <20240715-sriov-v5-5-3f5539093ffc@daynix.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This commit is contained in:
Akihiko Odaki 2024-07-15 14:19:11 +09:00 committed by Michael S. Tsirkin
parent 47cc753e50
commit 122173a583
4 changed files with 293 additions and 83 deletions

View File

@ -85,6 +85,7 @@ static Property pci_props[] = {
QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
DEFINE_PROP_STRING("sriov-pf", PCIDevice, sriov_pf),
DEFINE_PROP_END_OF_LIST()
};
@ -959,13 +960,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp)
dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
}
/*
* With SR/IOV and ARI, a device at function 0 need not be a multifunction
* device, as it may just be a VF that ended up with function 0 in
* the legacy PCI interpretation. Avoid failing in such cases:
*/
if (pci_is_vf(dev) &&
dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
/* SR/IOV is not handled here. */
if (pci_is_vf(dev)) {
return;
}
@ -998,7 +994,8 @@ static void pci_init_multifunction(PCIBus *bus, PCIDevice *dev, Error **errp)
}
/* function 0 indicates single function, so function > 0 must be NULL */
for (func = 1; func < PCI_FUNC_MAX; ++func) {
if (bus->devices[PCI_DEVFN(slot, func)]) {
PCIDevice *device = bus->devices[PCI_DEVFN(slot, func)];
if (device && !pci_is_vf(device)) {
error_setg(errp, "PCI: %x.0 indicates single function, "
"but %x.%x is already populated.",
slot, slot, func);
@ -1283,6 +1280,7 @@ static void pci_qdev_unrealize(DeviceState *dev)
pci_unregister_io_regions(pci_dev);
pci_del_option_rom(pci_dev);
pcie_sriov_unregister_device(pci_dev);
if (pc->exit) {
pc->exit(pci_dev);
@ -1314,7 +1312,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
pcibus_t size = memory_region_size(memory);
uint8_t hdr_type;
assert(!pci_is_vf(pci_dev)); /* VFs must use pcie_sriov_vf_register_bar */
assert(region_num >= 0);
assert(region_num < PCI_NUM_REGIONS);
assert(is_power_of_2(size));
@ -1325,7 +1322,6 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
assert(hdr_type != PCI_HEADER_TYPE_BRIDGE || region_num < 2);
r = &pci_dev->io_regions[region_num];
r->addr = PCI_BAR_UNMAPPED;
r->size = size;
r->type = type;
r->memory = memory;
@ -1333,6 +1329,18 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
? pci_get_bus(pci_dev)->address_space_io
: pci_get_bus(pci_dev)->address_space_mem;
if (pci_is_vf(pci_dev)) {
PCIDevice *pf = pci_dev->exp.sriov_vf.pf;
assert(!pf || type == pf->exp.sriov_pf.vf_bar_type[region_num]);
r->addr = pci_bar_address(pci_dev, region_num, r->type, r->size);
if (r->addr != PCI_BAR_UNMAPPED) {
memory_region_add_subregion_overlap(r->address_space,
r->addr, r->memory, 1);
}
} else {
r->addr = PCI_BAR_UNMAPPED;
wmask = ~(size - 1);
if (region_num == PCI_ROM_SLOT) {
/* ROM enable bit is writable */
@ -1350,6 +1358,7 @@ void pci_register_bar(PCIDevice *pci_dev, int region_num,
pci_set_long(pci_dev->wmask + addr, wmask & 0xffffffff);
pci_set_long(pci_dev->cmask + addr, 0xffffffff);
}
}
}
static void pci_update_vga(PCIDevice *pci_dev)
@ -2109,6 +2118,11 @@ static void pci_qdev_realize(DeviceState *qdev, Error **errp)
}
}
if (!pcie_sriov_register_device(pci_dev, errp)) {
pci_qdev_unrealize(DEVICE(pci_dev));
return;
}
/*
* A PCIe Downstream Port that do not have ARI Forwarding enabled must
* associate only Device 0 with the device attached to the bus

View File

@ -20,6 +20,8 @@
#include "qapi/error.h"
#include "trace.h"
static GHashTable *pfs;
static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs)
{
for (uint16_t i = 0; i < total_vfs; i++) {
@ -31,14 +33,49 @@ static void unparent_vfs(PCIDevice *dev, uint16_t total_vfs)
dev->exp.sriov_pf.vf = NULL;
}
bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
const char *vfname, uint16_t vf_dev_id,
uint16_t init_vfs, uint16_t total_vfs,
uint16_t vf_offset, uint16_t vf_stride,
Error **errp)
/*
 * Clear the PCI_SRIOV_CTRL_VFE bit in the SR-IOV control register of @dev,
 * leaving every other bit of that register untouched.
 */
static void clear_ctrl_vfe(PCIDevice *dev)
{
    uint8_t *sriov_ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
    uint16_t value = pci_get_word(sriov_ctrl);

    value &= ~PCI_SRIOV_CTRL_VFE;
    pci_set_word(sriov_ctrl, value);
}
/*
 * Enable the first NumVFs virtual functions of PF @dev.
 *
 * If the NumVFs register holds more than TotalVFs, nothing is enabled and
 * the PCI_SRIOV_CTRL_VFE bit is cleared again instead.
 */
static void register_vfs(PCIDevice *dev)
{
    uint16_t cap_offset = dev->exp.sriov_cap;
    uint8_t *cap;
    uint16_t requested;

    assert(cap_offset > 0);

    cap = dev->config + cap_offset;
    requested = pci_get_word(cap + PCI_SRIOV_NUM_VF);
    if (requested > pci_get_word(cap + PCI_SRIOV_TOTAL_VF)) {
        /* Out-of-range request: undo the enable bit and give up */
        clear_ctrl_vfe(dev);
        return;
    }

    trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
                             PCI_FUNC(dev->devfn), requested);

    for (uint16_t n = 0; n < requested; n++) {
        pci_set_enabled(dev->exp.sriov_pf.vf[n], true);
    }
}
/*
 * Disable every virtual function of PF @dev, i.e. all TotalVFs entries of
 * dev->exp.sriov_pf.vf, regardless of how many were enabled.
 */
static void unregister_vfs(PCIDevice *dev)
{
    uint8_t *sriov = dev->config + dev->exp.sriov_cap;
    uint16_t idx = 0;

    trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
                               PCI_FUNC(dev->devfn));

    /* The TotalVFs register is deliberately re-read on every pass */
    while (idx < pci_get_word(sriov + PCI_SRIOV_TOTAL_VF)) {
        pci_set_enabled(dev->exp.sriov_pf.vf[idx], false);
        idx++;
    }
}
static bool pcie_sriov_pf_init_common(PCIDevice *dev, uint16_t offset,
uint16_t vf_dev_id, uint16_t init_vfs,
uint16_t total_vfs, uint16_t vf_offset,
uint16_t vf_stride, Error **errp)
{
BusState *bus = qdev_get_parent_bus(&dev->qdev);
int32_t devfn = dev->devfn + vf_offset;
uint8_t *cfg = dev->config + offset;
uint8_t *wmask;
@ -100,6 +137,28 @@ bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
qdev_prop_set_bit(&dev->qdev, "multifunction", true);
return true;
}
bool pcie_sriov_pf_init(PCIDevice *dev, uint16_t offset,
const char *vfname, uint16_t vf_dev_id,
uint16_t init_vfs, uint16_t total_vfs,
uint16_t vf_offset, uint16_t vf_stride,
Error **errp)
{
BusState *bus = qdev_get_parent_bus(&dev->qdev);
int32_t devfn = dev->devfn + vf_offset;
if (pfs && g_hash_table_contains(pfs, dev->qdev.id)) {
error_setg(errp, "attaching user-created SR-IOV VF unsupported");
return false;
}
if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, init_vfs,
total_vfs, vf_offset, vf_stride, errp)) {
return false;
}
dev->exp.sriov_pf.vf = g_new(PCIDevice *, total_vfs);
for (uint16_t i = 0; i < total_vfs; i++) {
@ -129,7 +188,24 @@ void pcie_sriov_pf_exit(PCIDevice *dev)
{
    /*
     * Tear down the SR-IOV state of PF @dev.
     *
     * For user-created VFs the VFs outlive the PF: they are disabled,
     * detached from the PF, and given back a vendor/device ID (the PF's
     * vendor ID and the VF device ID advertised by the PF), since both were
     * overwritten with 0xffff when the PF adopted them.
     *
     * Otherwise the VFs were created by the PF itself and are unparented.
     */
    uint8_t *cfg = dev->config + dev->exp.sriov_cap;

    if (dev->exp.sriov_pf.vf_user_created) {
        uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID);
        /*
         * PCI_SRIOV_* offsets are relative to the SR-IOV capability, not to
         * the start of config space, so they must be applied to cfg.
         */
        uint16_t total_vfs = pci_get_word(cfg + PCI_SRIOV_TOTAL_VF);
        uint16_t vf_dev_id = pci_get_word(cfg + PCI_SRIOV_VF_DID);

        unregister_vfs(dev);

        for (uint16_t i = 0; i < total_vfs; i++) {
            PCIDevice *vf = dev->exp.sriov_pf.vf[i];

            vf->exp.sriov_vf.pf = NULL;

            pci_config_set_vendor_id(vf->config, ven_id);
            pci_config_set_device_id(vf->config, vf_dev_id);
        }
    } else {
        unparent_vfs(dev, pci_get_word(cfg + PCI_SRIOV_TOTAL_VF));
    }
}
void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
@ -162,74 +238,172 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
MemoryRegion *memory)
{
PCIIORegion *r;
PCIBus *bus = pci_get_bus(dev);
uint8_t type;
pcibus_t size = memory_region_size(memory);
assert(pci_is_vf(dev)); /* PFs must use pci_register_bar */
assert(region_num >= 0);
assert(region_num < PCI_NUM_REGIONS);
assert(dev->exp.sriov_vf.pf);
type = dev->exp.sriov_vf.pf->exp.sriov_pf.vf_bar_type[region_num];
if (!is_power_of_2(size)) {
error_report("%s: PCI region size must be a power"
" of two - type=0x%x, size=0x%"FMT_PCIBUS,
__func__, type, size);
exit(1);
}
r = &dev->io_regions[region_num];
r->memory = memory;
r->address_space =
type & PCI_BASE_ADDRESS_SPACE_IO
? bus->address_space_io
: bus->address_space_mem;
r->size = size;
r->type = type;
r->addr = pci_bar_address(dev, region_num, r->type, r->size);
if (r->addr != PCI_BAR_UNMAPPED) {
memory_region_add_subregion_overlap(r->address_space,
r->addr, r->memory, 1);
}
return pci_register_bar(dev, region_num, type, memory);
}
static void clear_ctrl_vfe(PCIDevice *dev)
static gint compare_vf_devfns(gconstpointer a, gconstpointer b)
{
uint8_t *ctrl = dev->config + dev->exp.sriov_cap + PCI_SRIOV_CTRL;
pci_set_word(ctrl, pci_get_word(ctrl) & ~PCI_SRIOV_CTRL_VFE);
return (*(PCIDevice **)a)->devfn - (*(PCIDevice **)b)->devfn;
}
static void register_vfs(PCIDevice *dev)
int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev,
uint16_t offset,
Error **errp)
{
uint16_t num_vfs;
GPtrArray *pf;
PCIDevice **vfs;
BusState *bus = qdev_get_parent_bus(DEVICE(dev));
uint16_t ven_id = pci_get_word(dev->config + PCI_VENDOR_ID);
uint16_t vf_dev_id;
uint16_t vf_offset;
uint16_t vf_stride;
uint16_t i;
uint16_t sriov_cap = dev->exp.sriov_cap;
assert(sriov_cap > 0);
num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
if (num_vfs > pci_get_word(dev->config + sriov_cap + PCI_SRIOV_TOTAL_VF)) {
clear_ctrl_vfe(dev);
return;
if (!pfs || !dev->qdev.id) {
return 0;
}
trace_sriov_register_vfs(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn), num_vfs);
for (i = 0; i < num_vfs; i++) {
pci_set_enabled(dev->exp.sriov_pf.vf[i], true);
pf = g_hash_table_lookup(pfs, dev->qdev.id);
if (!pf) {
return 0;
}
if (pf->len > UINT16_MAX) {
error_setg(errp, "too many VFs");
return -1;
}
g_ptr_array_sort(pf, compare_vf_devfns);
vfs = (void *)pf->pdata;
if (vfs[0]->devfn <= dev->devfn) {
error_setg(errp, "a VF function number is less than the PF function number");
return -1;
}
vf_dev_id = pci_get_word(vfs[0]->config + PCI_DEVICE_ID);
vf_offset = vfs[0]->devfn - dev->devfn;
vf_stride = pf->len < 2 ? 0 : vfs[1]->devfn - vfs[0]->devfn;
for (i = 0; i < pf->len; i++) {
if (bus != qdev_get_parent_bus(&vfs[i]->qdev)) {
error_setg(errp, "SR-IOV VF parent bus mismatches with PF");
return -1;
}
if (ven_id != pci_get_word(vfs[i]->config + PCI_VENDOR_ID)) {
error_setg(errp, "SR-IOV VF vendor ID mismatches with PF");
return -1;
}
if (vf_dev_id != pci_get_word(vfs[i]->config + PCI_DEVICE_ID)) {
error_setg(errp, "inconsistent SR-IOV VF device IDs");
return -1;
}
for (size_t j = 0; j < PCI_NUM_REGIONS; j++) {
if (vfs[i]->io_regions[j].size != vfs[0]->io_regions[j].size ||
vfs[i]->io_regions[j].type != vfs[0]->io_regions[j].type) {
error_setg(errp, "inconsistent SR-IOV BARs");
return -1;
}
}
if (vfs[i]->devfn - vfs[0]->devfn != vf_stride * i) {
error_setg(errp, "inconsistent SR-IOV stride");
return -1;
}
}
if (!pcie_sriov_pf_init_common(dev, offset, vf_dev_id, pf->len,
pf->len, vf_offset, vf_stride, errp)) {
return -1;
}
for (i = 0; i < pf->len; i++) {
vfs[i]->exp.sriov_vf.pf = dev;
vfs[i]->exp.sriov_vf.vf_number = i;
/* set vid/did according to sr/iov spec - they are not used */
pci_config_set_vendor_id(vfs[i]->config, 0xffff);
pci_config_set_device_id(vfs[i]->config, 0xffff);
}
dev->exp.sriov_pf.vf = vfs;
dev->exp.sriov_pf.vf_user_created = true;
for (i = 0; i < PCI_NUM_REGIONS; i++) {
PCIIORegion *region = &vfs[0]->io_regions[i];
if (region->size) {
pcie_sriov_pf_init_vf_bar(dev, i, region->type, region->size);
}
}
return PCI_EXT_CAP_SRIOV_SIZEOF;
}
static void unregister_vfs(PCIDevice *dev)
bool pcie_sriov_register_device(PCIDevice *dev, Error **errp)
{
uint16_t i;
uint8_t *cfg = dev->config + dev->exp.sriov_cap;
if (!dev->exp.sriov_pf.vf && dev->qdev.id &&
pfs && g_hash_table_contains(pfs, dev->qdev.id)) {
error_setg(errp, "attaching user-created SR-IOV VF unsupported");
return false;
}
trace_sriov_unregister_vfs(dev->name, PCI_SLOT(dev->devfn),
PCI_FUNC(dev->devfn));
for (i = 0; i < pci_get_word(cfg + PCI_SRIOV_TOTAL_VF); i++) {
pci_set_enabled(dev->exp.sriov_pf.vf[i], false);
if (dev->sriov_pf) {
PCIDevice *pci_pf;
GPtrArray *pf;
if (!PCI_DEVICE_GET_CLASS(dev)->sriov_vf_user_creatable) {
error_setg(errp, "user cannot create SR-IOV VF with this device type");
return false;
}
if (!pci_is_express(dev)) {
error_setg(errp, "PCI Express is required for SR-IOV VF");
return false;
}
if (!pci_qdev_find_device(dev->sriov_pf, &pci_pf)) {
error_setg(errp, "PCI device specified as SR-IOV PF already exists");
return false;
}
if (!pfs) {
pfs = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
}
pf = g_hash_table_lookup(pfs, dev->sriov_pf);
if (!pf) {
pf = g_ptr_array_new();
g_hash_table_insert(pfs, g_strdup(dev->sriov_pf), pf);
}
g_ptr_array_add(pf, dev);
}
return true;
}
/*
 * Drop @dev from the list of user-created VFs recorded for the PF named by
 * its "sriov-pf" property. When the list becomes empty, the PF's entry is
 * removed from the table and the list wrapper is released.
 *
 * Does nothing if @dev has no sriov_pf, if no table exists, or if the PF has
 * no entry.
 */
void pcie_sriov_unregister_device(PCIDevice *dev)
{
    GPtrArray *siblings;

    if (!dev->sriov_pf || !pfs) {
        return;
    }

    siblings = g_hash_table_lookup(pfs, dev->sriov_pf);
    if (!siblings) {
        return;
    }

    g_ptr_array_remove_fast(siblings, dev);
    if (siblings->len) {
        return;
    }

    g_hash_table_remove(pfs, dev->sriov_pf);
    /*
     * NOTE(review): free_segment is FALSE, so the element storage is not
     * freed here; presumably because a PF may still point at it through
     * exp.sriov_pf.vf - confirm who releases it.
     */
    g_ptr_array_free(siblings, FALSE);
}
@ -316,7 +490,7 @@ void pcie_sriov_pf_add_sup_pgsize(PCIDevice *dev, uint16_t opt_sup_pgsize)
/*
 * Return the VF number stored in @dev->exp.sriov_vf.vf_number.
 *
 * @dev must be a VF that is currently attached to a PF; both conditions are
 * asserted before the value is read.
 */
uint16_t pcie_sriov_vf_number(PCIDevice *dev)
{
assert(pci_is_vf(dev));
/* A VF without a PF pointer has no meaningful vf_number */
assert(dev->exp.sriov_vf.pf);
return dev->exp.sriov_vf.vf_number;
}

View File

@ -37,6 +37,8 @@ struct PCIDeviceClass {
uint16_t subsystem_id; /* only for header type = 0 */
const char *romfile; /* rom bar */
bool sriov_vf_user_creatable;
};
enum PCIReqIDType {
@ -160,6 +162,8 @@ struct PCIDevice {
/* ID of standby device in net_failover pair */
char *failover_pair_id;
uint32_t acpi_index;
char *sriov_pf;
};
static inline int pci_intx(PCIDevice *pci_dev)
@ -192,7 +196,7 @@ static inline int pci_is_express_downstream_port(const PCIDevice *d)
static inline int pci_is_vf(const PCIDevice *d)
{
return d->exp.sriov_vf.pf != NULL;
return d->sriov_pf || d->exp.sriov_vf.pf != NULL;
}
static inline uint32_t pci_config_size(const PCIDevice *d)

View File

@ -18,6 +18,7 @@
/* Per-device SR-IOV state kept by a physical function (PF). */
typedef struct PCIESriovPF {
/* BAR type of each VF BAR, indexed by region number */
uint8_t vf_bar_type[PCI_NUM_REGIONS]; /* Store type for each VF bar */
/*
 * Array of VF device pointers. pcie_sriov_pf_init() allocates it with
 * total_vfs entries; for user-created VFs it instead points at the
 * pointer storage of the per-PF VF list.
 */
PCIDevice **vf; /* Pointer to an array of VF devices */
/* Set by pcie_sriov_pf_init_from_user_created_vfs() */
bool vf_user_created; /* If VFs are created by user */
} PCIESriovPF;
typedef struct PCIESriovVF {
@ -40,6 +41,23 @@ void pcie_sriov_pf_init_vf_bar(PCIDevice *dev, int region_num,
/*
 * Register @memory as BAR @region_num of the VF @dev.
 *
 * No BAR type is passed: the implementation takes it from the PF's
 * vf_bar_type[@region_num], so @dev must already be attached to a PF.
 */
void pcie_sriov_vf_register_bar(PCIDevice *dev, int region_num,
MemoryRegion *memory);
/**
 * pcie_sriov_pf_init_from_user_created_vfs() - Initialize PF with user-created
 *                                              VFs.
 * @dev: A PCIe device being realized.
 * @offset: The offset of the SR-IOV capability.
 * @errp: pointer to Error*, to store an error if it happens.
 *
 * Looks up the VFs that named @dev's id in their "sriov-pf" property and,
 * if there are any, sets up the SR-IOV capability of @dev from them. The VFs
 * must sit on the same bus as @dev, share its vendor ID, agree with each
 * other on device ID and BARs, and be placed at a constant function-number
 * stride above @dev.
 *
 * Return: The size of added capability (PCI_EXT_CAP_SRIOV_SIZEOF).
 *         0 if the user did not create VFs for @dev, in which case nothing
 *         is added. -1 if failed, with @errp set.
 */
int16_t pcie_sriov_pf_init_from_user_created_vfs(PCIDevice *dev,
uint16_t offset,
Error **errp);
/**
 * pcie_sriov_register_device() - Record a device for user-created SR-IOV.
 * @dev: A PCI device being realized.
 * @errp: pointer to Error*, to store an error if it happens.
 *
 * If @dev has its "sriov-pf" property set, it is validated as a
 * user-creatable PCIe VF and appended to the list of VFs waiting for the PF
 * with that id. Devices without the property are accepted unchanged, unless
 * VFs are already waiting for @dev's id and @dev did not adopt them.
 *
 * Return: true on success, false with @errp set on failure.
 */
bool pcie_sriov_register_device(PCIDevice *dev, Error **errp);
/**
 * pcie_sriov_unregister_device() - Undo pcie_sriov_register_device().
 * @dev: A PCI device being unrealized.
 *
 * Removes @dev from the list of user-created VFs kept for its "sriov-pf"
 * id. A device that has no "sriov-pf" property is left untouched.
 */
void pcie_sriov_unregister_device(PCIDevice *dev);
/*
* Default (minimal) page size support values
* as required by the SR/IOV standard: